using System.Net;
using System.Net.Http.Headers;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
using Common.Models.DBDomain;
using Common.Models.Enums;

namespace WebNovelPortalAPI.Scrapers;

/// <summary>
/// Template-method base class for site-specific novel scrapers. Subclasses
/// supply regex/XPath patterns (and may override the extraction hooks); this
/// class drives the HTTP fetch and assembles the <see cref="Novel"/> graph.
/// NOTE(review): generic type arguments had been stripped from this file
/// (extraction artifact); they are restored here from usage.
/// </summary>
public abstract class AbstractScraper : IScraper
{
    protected AbstractScraper()
    {
        // Cookie-aware client so RequestCookies() (e.g. age gates, consent
        // banners) is sent with every request made by this scraper instance.
        var cookieContainer = new CookieContainer();
        var handler = new HttpClientHandler { CookieContainer = cookieContainer };
        var client = new HttpClient(handler);
        client.DefaultRequestHeaders.UserAgent.Add(
            new ProductInfoHeaderValue("Chrome", "96.0.4664.110"));
        foreach (var cookie in RequestCookies())
        {
            cookieContainer.Add(cookie);
        }
        HttpClient = client;
    }

    /// <summary>Shared client for all requests; created once per scraper instance.</summary>
    protected HttpClient HttpClient { get; }

    /// <summary>Regex that both recognizes and extracts the canonical novel URL.</summary>
    protected abstract string UrlMatchPattern { get; }

    /// <summary>Regex extracting the site root used to absolutize relative links.</summary>
    protected abstract string BaseUrlPattern { get; }

    // XPath patterns consumed by the default Get* implementations below.
    // A null pattern means the subclass must override the matching method.
    protected virtual string? WorkTitlePattern { get; }
    protected virtual string? AuthorNamePattern { get; }
    protected virtual string? AuthorLinkPattern { get; }
    protected virtual string? ChapterUrlPattern { get; }
    protected virtual string? ChapterNamePattern { get; }
    protected virtual string? ChapterPostedPattern { get; }
    protected virtual string? ChapterUpdatedPattern { get; }
    protected virtual string? TagPattern { get; }
    protected virtual string? DatePostedPattern { get; }
    protected virtual string? DateUpdatedPattern { get; }

    /// <summary>Status assigned to scraped novels when the site exposes none.</summary>
    protected virtual NovelStatus DefaultStatus => NovelStatus.Unknown;

    /// <summary>Fetches <paramref name="url"/> and parses the response body as HTML.</summary>
    protected async Task<HtmlDocument> GetPage(string url)
    {
        var response = await HttpClient.GetAsync(url);
        var doc = new HtmlDocument();
        doc.LoadHtml(await response.Content.ReadAsStringAsync());
        return doc;
    }

    /// <summary>
    /// Per-chapter posted/updated timestamps. Default is unknown (null, null);
    /// subclasses override when the chapter list exposes dates.
    /// </summary>
    protected virtual (DateTime? Posted, DateTime? Updated) GetDateTimeForChapter(
        HtmlNode linkNode, HtmlNode baseNode, string baseUrl, string novelUrl)
    {
        return (null, null);
    }

    /// <summary>True when this scraper handles <paramref name="url"/>.</summary>
    public virtual bool MatchesUrl(string url)
    {
        return Regex.IsMatch(url, UrlMatchPattern, RegexOptions.IgnoreCase);
    }

    /// <summary>Extracts the novel title via <see cref="WorkTitlePattern"/>.</summary>
    /// <exception cref="InvalidOperationException">Title node not found.</exception>
    protected virtual string GetNovelTitle(HtmlDocument document, string baseUrl, string novelUrl)
    {
        var node = document.DocumentNode.SelectSingleNode(WorkTitlePattern)
            ?? throw new InvalidOperationException($"Title node not found for {novelUrl}");
        return node.InnerText;
    }

    /// <summary>
    /// Extracts the author name and profile link. Best-effort: returns null when
    /// the patterns are unset or the page lacks either node (preserves the
    /// original catch-and-return-null contract without swallowing all exceptions).
    /// </summary>
    protected virtual Author? GetAuthor(HtmlDocument document, string baseUrl, string novelUrl)
    {
        if (AuthorNamePattern is null || AuthorLinkPattern is null)
        {
            return null;
        }
        var nameNode = document.DocumentNode.SelectSingleNode(AuthorNamePattern);
        var href = document.DocumentNode.SelectSingleNode(AuthorLinkPattern)
            ?.Attributes["href"]?.Value;
        if (nameNode is null || href is null)
        {
            return null;
        }
        return new Author
        {
            Name = nameNode.InnerText,
            Url = baseUrl + href
        };
    }

    /// <summary>
    /// Builds the chapter list from <see cref="ChapterUrlPattern"/>, numbering
    /// chapters 1-based in document order. Dates are normalized to UTC.
    /// </summary>
    /// <exception cref="InvalidOperationException">No chapter links matched.</exception>
    protected virtual List<Chapter> GetChapters(HtmlDocument document, string baseUrl, string novelUrl)
    {
        // HtmlAgilityPack returns null (not an empty list) when nothing matches.
        var linkNodes = document.DocumentNode.SelectNodes(ChapterUrlPattern)
            ?? throw new InvalidOperationException($"No chapter links found for {novelUrl}");
        return linkNodes.Select((node, i) =>
        {
            var (posted, updated) = GetDateTimeForChapter(node, document.DocumentNode, baseUrl, novelUrl);
            return new Chapter
            {
                ChapterNumber = i + 1,
                Url = $"{baseUrl}{node.Attributes["href"].Value}",
                Name = node.SelectSingleNode(ChapterNamePattern).InnerText,
                DatePosted = posted?.ToUniversalTime(),
                DateUpdated = updated?.ToUniversalTime()
            };
        }).ToList();
    }

    /// <summary>
    /// Collects page tags and merges in subclass metadata tags.
    /// NOTE(review): Union de-duplicates via Tag equality — verify Tag overrides
    /// Equals/GetHashCode, otherwise this is reference equality.
    /// </summary>
    /// <exception cref="InvalidOperationException">No tag nodes matched.</exception>
    protected virtual List<Tag> GetTags(HtmlDocument document, string baseUrl, string novelUrl)
    {
        var nodes = document.DocumentNode.SelectNodes(TagPattern)
            ?? throw new InvalidOperationException($"No tags found for {novelUrl}");
        return nodes
            .Select(node => new Tag { TagValue = node.InnerText })
            .Union(GetMetadataTags(document, baseUrl, novelUrl))
            .ToList();
    }

    /// <summary>Parses the novel's original publication date (UTC).</summary>
    protected virtual DateTime GetPostedDate(HtmlDocument document, string baseUrl, string novelUrl)
    {
        // NOTE(review): culture-sensitive parse — site date formats may break on
        // non-default server cultures; consider an explicit IFormatProvider.
        return DateTime.Parse(document.DocumentNode.SelectSingleNode(DatePostedPattern).InnerText)
            .ToUniversalTime();
    }

    /// <summary>Parses the novel's last-updated date (UTC).</summary>
    protected virtual DateTime GetLastUpdatedDate(HtmlDocument document, string baseUrl, string novelUrl)
    {
        return DateTime.Parse(document.DocumentNode.SelectSingleNode(DateUpdatedPattern).InnerText)
            .ToUniversalTime();
    }

    /// <summary>Cookies to pre-load into the request container; none by default.</summary>
    protected virtual List<Cookie> RequestCookies()
    {
        return new List<Cookie>();
    }

    /// <summary>Site-specific tags derived from page metadata rather than the tag list.</summary>
    protected abstract IEnumerable<Tag> GetMetadataTags(HtmlDocument document, string baseUrl, string novelUrl);

    /// <summary>
    /// Scrapes a complete <see cref="Novel"/> from <paramref name="url"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException">The page could not be parsed.</exception>
    public virtual async Task<Novel> ScrapeNovel(string url)
    {
        var baseUrl = Regex.Match(url, BaseUrlPattern).Value;
        var novelUrl = Regex.Match(url, UrlMatchPattern).Value;
        var doc = await GetPage(novelUrl);
        if (string.IsNullOrEmpty(doc.Text))
        {
            throw new InvalidOperationException("Error parsing document");
        }
        return new Novel
        {
            Author = GetAuthor(doc, baseUrl, novelUrl),
            Chapters = GetChapters(doc, baseUrl, novelUrl),
            DatePosted = GetPostedDate(doc, baseUrl, novelUrl),
            LastUpdated = GetLastUpdatedDate(doc, baseUrl, novelUrl),
            Tags = GetTags(doc, baseUrl, novelUrl),
            Title = GetNovelTitle(doc, baseUrl, novelUrl),
            Url = novelUrl,
            Status = DefaultStatus
        };
    }

    /// <summary>Not supported by the base scraper; subclasses may implement.</summary>
    public Task ScrapeChapterContent(string chapterUrl)
    {
        throw new NotImplementedException();
    }
}