using System.Reflection.Metadata;
using System.Text.RegularExpressions;
using DBConnection.Models;
using HtmlAgilityPack;

namespace WebNovelPortalAPI.Scrapers;

/// <summary>
/// <see cref="IScraper"/> implementation for work pages hosted under
/// <c>https://kakuyomu.jp/works/{id}</c>. Extracts title, author, chapter list,
/// tags and publication dates from the page markup via XPath queries.
/// </summary>
public class KakuyomuScraper : IScraper
{
    // NOTE(review): the pattern is not anchored, so it matches when a Kakuyomu
    // work URL appears anywhere inside the input string. Left unchanged because
    // callers may rely on deeper URLs (e.g. episode links) matching.
    private const string UrlPattern = @"https?:\/\/kakuyomu\.jp\/works\/\d+\/?";
    private const string BaseUrl = "https://kakuyomu.jp";

    // Built once and reused; constructing a Regex on every call is wasted work.
    private static readonly Regex UrlRegex =
        new(UrlPattern, RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Reports whether <paramref name="url"/> contains a Kakuyomu work URL.
    /// </summary>
    /// <param name="url">The URL to test.</param>
    /// <returns><c>true</c> when the URL pattern is found in <paramref name="url"/>.</returns>
    public bool MatchesUrl(string url)
    {
        return UrlRegex.IsMatch(url);
    }

    /// <summary>
    /// Selects the single node addressed by <paramref name="xpath"/>, failing with a
    /// descriptive error instead of a <see cref="NullReferenceException"/> when the
    /// page does not contain it.
    /// </summary>
    /// <exception cref="InvalidOperationException">No node matches the XPath.</exception>
    private static HtmlNode RequireNode(HtmlDocument document, string xpath, string description)
    {
        // SelectSingleNode returns null when nothing matches.
        var node = document.DocumentNode.SelectSingleNode(xpath);
        if (node is null)
        {
            throw new InvalidOperationException(
                $"Could not find the {description} in the document (XPath: {xpath}).");
        }

        return node;
    }

    /// <summary>Reads the work title from the link inside the <c>workTitle</c> element.</summary>
    private static string GetNovelTitle(HtmlDocument document)
    {
        return RequireNode(document, @"//*[@id='workTitle']/a", "novel title").InnerText;
    }

    /// <summary>
    /// Reads the author name and profile link from the anchor inside the
    /// <c>workAuthor-activityName</c> element. The link's <c>href</c> is appended to
    /// the site base URL.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The author link is missing or has no <c>href</c> attribute.
    /// </exception>
    private static Author GetAuthor(HtmlDocument document)
    {
        // Name and URL come from the same anchor, so query it once.
        var authorLink = RequireNode(document, @"//*[@id='workAuthor-activityName']/a", "author link");

        var href = authorLink.Attributes["href"]?.Value
            ?? throw new InvalidOperationException("The author link has no href attribute.");

        return new Author
        {
            Name = authorLink.InnerText,
            Url = $"{BaseUrl}{href}"
        };
    }

    /// <summary>
    /// Builds the chapter list from every anchor with class
    /// <c>widget-toc-episode-episodeTitle</c>, numbering chapters from 1 in document order.
    /// </summary>
    /// <returns>The chapters found; an empty list when the page lists none.</returns>
    private static List<Chapter> GetChapters(HtmlDocument document)
    {
        // SelectNodes returns null (not an empty collection) when nothing matches.
        var episodeLinks = document.DocumentNode.SelectNodes(@"//a[@class='widget-toc-episode-episodeTitle']");
        if (episodeLinks is null)
        {
            return new List<Chapter>();
        }

        return episodeLinks
            // A chapter without a link cannot be fetched later, so skip it.
            .Where(link => link.Attributes["href"] is not null)
            .Select((link, index) => new Chapter
            {
                ChapterNumber = index + 1,
                Url = $"{BaseUrl}{link.Attributes["href"].Value}",
                // Prefer the nested <span>; fall back to the anchor text if it is absent.
                Name = link.SelectSingleNode(@"span")?.InnerText ?? link.InnerText
            })
            .ToList();
    }

    /// <summary>
    /// Collects one tag per anchor inside the <c>itemprop='keywords'</c> span.
    /// </summary>
    /// <returns>The tags found; an empty list when the work has none.</returns>
    private static List<Tag> GetTags(HtmlDocument document)
    {
        // SelectNodes returns null (not an empty collection) when nothing matches.
        var tagLinks = document.DocumentNode.SelectNodes(@"//span[@itemprop='keywords']/a");
        if (tagLinks is null)
        {
            return new List<Tag>();
        }

        return tagLinks.Select(link => new Tag { TagValue = link.InnerText }).ToList();
    }

    /// <summary>
    /// Parses the text of the <c>time[itemprop='datePublished']</c> element.
    /// </summary>
    /// <exception cref="InvalidOperationException">The element is missing.</exception>
    /// <exception cref="FormatException">The element text is not a recognizable date.</exception>
    private static DateTime GetPostedDate(HtmlDocument document)
    {
        // NOTE(review): parsing uses the current culture, as before — confirm the
        // hosting environment's culture can read the site's date text.
        var dateNode = RequireNode(document, @"//time[@itemprop='datePublished']", "published date");
        return DateTime.Parse(dateNode.InnerText);
    }

    /// <summary>
    /// Parses the text of the <c>time[itemprop='dateModified']</c> element.
    /// </summary>
    /// <exception cref="InvalidOperationException">The element is missing.</exception>
    /// <exception cref="FormatException">The element text is not a recognizable date.</exception>
    private static DateTime GetLastUpdatedDate(HtmlDocument document)
    {
        var dateNode = RequireNode(document, @"//time[@itemprop='dateModified']", "last-modified date");
        return DateTime.Parse(dateNode.InnerText);
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and assembles a <see cref="Novel"/>
    /// from its title, author, chapters, tags and dates.
    /// </summary>
    /// <param name="url">Address of the work page to scrape.</param>
    /// <returns>The populated novel, with <c>Url</c> set to <paramref name="url"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    /// <exception cref="InvalidOperationException">
    /// The document could not be loaded or a required element is missing from it.
    /// </exception>
    public Novel ScrapeNovel(string url)
    {
        ArgumentNullException.ThrowIfNull(url);

        var web = new HtmlWeb();
        var doc = web.Load(url);
        if (doc is null)
        {
            throw new InvalidOperationException("Error parsing document");
        }

        return new Novel
        {
            Author = GetAuthor(doc),
            Chapters = GetChapters(doc),
            DatePosted = GetPostedDate(doc),
            LastUpdated = GetLastUpdatedDate(doc),
            Tags = GetTags(doc),
            Title = GetNovelTitle(doc),
            Url = url
        };
    }

    /// <summary>Not implemented for this scraper yet.</summary>
    /// <param name="chapterUrl">Address of the chapter page.</param>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    public string? ScrapeChapterContent(string chapterUrl)
    {
        throw new NotImplementedException();
    }
}