using System.Globalization;
using System.Net;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
using Common.Models.DBDomain;
using Common.Models.Enums;

namespace WebNovelPortalAPI.Scrapers;

/// <summary>
/// Scraper for syosetu.com ("Shousetsuka ni Narou") novels, including the
/// age-gated novel18 subdomain. Chapter data comes from the novel index page;
/// posted/updated dates and tags come from the /novelview/infotop/ page.
/// </summary>
public class SyosetuScraper : AbstractScraper
{
    // Group 1 of both URL patterns captures the subdomain (e.g. "ncode", "novel18").
    protected override string UrlMatchPattern => @"https?:\/\/(\w+)\.syosetu\.com\/\w+\/?";
    protected override string BaseUrlPattern => @"https?:\/\/(\w+)\.syosetu\.com";
    protected override string? WorkTitlePattern => @"//p[@class='novel_title']";
    // The writer name may or may not be a link; fall back to the bare div text.
    protected override string? AuthorNamePattern => @"//div[@class='novel_writername']/a | //div[@class='novel_writername']";
    protected override string? AuthorLinkPattern => @"//div[@class='novel_writername']/a";
    protected override string? ChapterUrlPattern => @"//dl[@class='novel_sublist2']//a";
    // Relative XPath, evaluated from a chapter link's parent node.
    protected override string? ChapterPostedPattern => @"following-sibling::dt[@class='long_update']";
    // Relative XPath, evaluated from the posted-date <dt>; the <span>'s title
    // attribute carries the revision timestamp and only exists for revised chapters.
    protected override string? ChapterUpdatedPattern => @"span";
    protected override string? TagPattern => @"//th[text()='キーワード']/following-sibling::td";
    protected override string? DatePostedPattern => @"//th[text()='掲載日']/following-sibling::td";
    // contains() deliberately matches both the 掲載日 (posted) and 最終掲載日
    // (last updated) rows; GetLastUpdatedDate picks the second match.
    protected override string? DateUpdatedPattern => @"//th[contains(text(),'掲載日')]/following-sibling::td";

    // Extracts "yyyy/MM/dd HH:mm" stamps; compiled once instead of per chapter row.
    private static readonly Regex DateStampRegex =
        new(@"\d\d\d\d\/\d\d\/\d\d \d\d:\d\d", RegexOptions.Compiled);

    // Extracts the novel code (e.g. "n1234ab") from a novel URL.
    private static readonly Regex NovelCodeRegex =
        new(@"https?:\/\/\w+\.syosetu\.com\/(\w+)\/?", RegexOptions.Compiled);

    /// <summary>
    /// Fetches the metadata ("infotop") page for the novel identified by
    /// <paramref name="novelUrl"/> on <paramref name="baseUrl"/>.
    /// </summary>
    private async Task<HtmlDocument> GetInfoPage(string baseUrl, string novelUrl)
    {
        const string novelInfoBase = "/novelview/infotop/ncode/";
        string novelCode = NovelCodeRegex.Match(novelUrl).Groups[1].Value;
        return await GetPage($"{baseUrl}{novelInfoBase}{novelCode}");
    }

    /// <summary>
    /// Builds the chapter list from the novel index page. Chapter numbers are
    /// assigned by document order (1-based); dates are converted to UTC.
    /// </summary>
    protected override List<Chapter> GetChapters(HtmlDocument document, string baseUrl, string novelUrl)
    {
        var nodes = document.DocumentNode.SelectNodes(ChapterUrlPattern);
        return nodes.Select((node, i) =>
        {
            var datePostedNode = node.ParentNode.SelectSingleNode(ChapterPostedPattern);
            // The stamp format is fixed (yyyy/MM/dd HH:mm), so parse invariantly
            // rather than with the host machine's culture.
            var datePosted = DateTime.Parse(
                DateStampRegex.Match(datePostedNode.InnerText).Value,
                CultureInfo.InvariantCulture);

            // Revised chapters carry a nested <span title="yyyy/MM/dd HH:mm 改稿">;
            // when absent, the chapter was never updated after posting.
            var dateUpdatedNode = datePostedNode.SelectSingleNode(ChapterUpdatedPattern);
            var dateUpdated = dateUpdatedNode == null
                ? datePosted
                : DateTime.Parse(
                    DateStampRegex.Match(dateUpdatedNode.Attributes["title"].Value).Value,
                    CultureInfo.InvariantCulture);

            return new Chapter
            {
                Name = node.InnerText,
                Url = baseUrl + node.Attributes["href"].Value,
                ChapterNumber = i + 1,
                DatePosted = datePosted.ToUniversalTime(),
                DateUpdated = dateUpdated.ToUniversalTime()
            };
        }).ToList();
    }

    /// <summary>
    /// Returns the author, or null when the writer name is plain text with no
    /// profile link (common for removed/anonymous accounts).
    /// </summary>
    protected override Author? GetAuthor(HtmlDocument document, string baseUrl, string novelUrl)
    {
        // Null-propagate through the attribute lookup too: a matched <a> with no
        // href previously threw a NullReferenceException here.
        var authorLink = document.DocumentNode
            .SelectSingleNode(AuthorLinkPattern)?.Attributes["href"]?.Value;
        if (string.IsNullOrEmpty(authorLink))
        {
            return null;
        }
        var authorName = document.DocumentNode
            .SelectSingleNode(AuthorNamePattern).InnerText.Replace("\n", "");
        return new Author { Name = authorName, Url = authorLink };
    }

    /// <summary>Reads the 掲載日 (posted) date from the infotop page, as UTC.</summary>
    protected override DateTime GetPostedDate(HtmlDocument document, string baseUrl, string novelUrl)
    {
        var node = document.DocumentNode.SelectSingleNode(DatePostedPattern);
        // NOTE(review): culture-default parse kept as-is — the infotop page's raw
        // date format isn't visible here; confirm before pinning a culture.
        return DateTime.Parse(node.InnerText).ToUniversalTime();
    }

    /// <summary>Reads the 最終掲載日 (last updated) date from the infotop page, as UTC.</summary>
    protected override DateTime GetLastUpdatedDate(HtmlDocument document, string baseUrl, string novelUrl)
    {
        // Index 1: DateUpdatedPattern matches both 掲載日 and 最終掲載日 rows;
        // the second one is the last-updated date.
        var node = document.DocumentNode.SelectNodes(DateUpdatedPattern)[1];
        // NOTE(review): culture-default parse kept as-is — see GetPostedDate.
        return DateTime.Parse(node.InnerText).ToUniversalTime();
    }

    /// <summary>
    /// Splits the キーワード (keywords) cell into tags and merges in the
    /// site/origin/NSFW metadata tags.
    /// </summary>
    protected override List<Tag> GetTags(HtmlDocument document, string baseUrl, string novelUrl)
    {
        // The second Replace normalises an alternate space character to an ASCII
        // space before splitting (the two literals differ even if they render alike).
        var keywords = document.DocumentNode.SelectSingleNode(TagPattern).InnerText
            .Replace("\n", "")
            .Replace(" ", " ")
            .Split(' ', StringSplitOptions.RemoveEmptyEntries); // avoid empty Tag values
        return keywords
            .Select(k => new Tag { TagValue = k })
            .Union(GetMetadataTags(document, baseUrl, novelUrl))
            .ToList();
    }

    /// <summary>The over18 cookie bypasses the age gate on the novel18 subdomain.</summary>
    protected override List<Cookie> RequestCookies()
    {
        return new List<Cookie>
        {
            new Cookie { Domain = ".syosetu.com", Name = "over18", Value = "yes" }
        };
    }

    /// <summary>Site/origin tags for every novel, plus an NSFW tag for novel18 URLs.</summary>
    protected override IEnumerable<Tag> GetMetadataTags(HtmlDocument document, string baseUrl, string novelUrl)
    {
        // The novel18 subdomain hosts the adult catalogue.
        bool nsfw = Regex.Match(baseUrl, BaseUrlPattern).Groups[1].Value == "novel18";
        var tags = new List<Tag> { Tag.GetSiteTag(baseUrl), Tag.GetOriginalWorkTag() };
        if (nsfw)
        {
            tags.Add(Tag.GetNsfwTag());
        }
        return tags;
    }

    /// <summary>
    /// Scrapes a full novel: title/author/chapters from the index page,
    /// dates and tags from the infotop page.
    /// </summary>
    public override async Task<Novel> ScrapeNovel(string url)
    {
        var baseUrl = Regex.Match(url, BaseUrlPattern).Value;
        var novelUrl = Regex.Match(url, UrlMatchPattern).Value;
        HtmlDocument baseDoc;
        HtmlDocument novelInfoPage;
        try
        {
            baseDoc = await GetPage(novelUrl);
            novelInfoPage = await GetInfoPage(baseUrl, novelUrl);
        }
        catch (Exception e)
        {
            // Preserve the original failure as the inner exception; it was
            // previously discarded, destroying the root cause and stack trace.
            throw new Exception("Error parsing document", e);
        }
        return new Novel
        {
            Title = GetNovelTitle(baseDoc, baseUrl, novelUrl),
            Author = GetAuthor(baseDoc, baseUrl, novelUrl),
            Chapters = GetChapters(baseDoc, baseUrl, novelUrl),
            LastUpdated = GetLastUpdatedDate(novelInfoPage, baseUrl, novelUrl),
            Tags = GetTags(novelInfoPage, baseUrl, novelUrl),
            DatePosted = GetPostedDate(novelInfoPage, baseUrl, novelUrl),
            Url = novelUrl
        };
    }
}