167 lines
5.7 KiB
C#
167 lines
5.7 KiB
C#
using System.Net;
|
|
using System.Net.Http.Headers;
|
|
using System.Text.RegularExpressions;
|
|
using HtmlAgilityPack;
|
|
using Common.Models.DBDomain;
|
|
|
|
namespace WebNovelPortalAPI.Scrapers;
|
|
|
|
/// <summary>
/// Base implementation of <see cref="IScraper"/>. Derived classes supply regular-expression
/// patterns that recognise a site's URLs and XPath patterns that locate the novel's metadata
/// in the downloaded page; any individual extraction step can also be overridden.
/// </summary>
public abstract class AbstractScraper : IScraper
{
    /// <summary>
    /// Builds the <see cref="System.Net.Http.HttpClient"/> used for every request made by this
    /// scraper: it sends a Chrome user-agent header and carries the cookies returned by
    /// <see cref="RequestCookies"/>.
    /// </summary>
    /// <remarks>
    /// <see cref="RequestCookies"/> is virtual and is invoked from this constructor, i.e. before
    /// the body of any derived constructor has run. Overrides must therefore not rely on state
    /// that is initialised in a derived constructor.
    /// </remarks>
    protected AbstractScraper()
    {
        var cookieContainer = new CookieContainer();
        var handler = new HttpClientHandler
        {
            CookieContainer = cookieContainer
        };

        var client = new HttpClient(handler);
        client.DefaultRequestHeaders.UserAgent.Add(new ProductInfoHeaderValue("Chrome", "96.0.4664.110"));

        foreach (var cookie in RequestCookies())
        {
            cookieContainer.Add(cookie);
        }

        HttpClient = client;
    }

    /// <summary>HTTP client shared by all requests issued through this scraper instance.</summary>
    protected HttpClient HttpClient { get; }

    /// <summary>Regular expression matching a novel URL handled by this scraper.</summary>
    protected abstract string UrlMatchPattern { get; }

    /// <summary>Regular expression extracting the base part of the URL that is prefixed to scraped links.</summary>
    protected abstract string BaseUrlPattern { get; }

    /// <summary>XPath of the node whose inner text is the novel title.</summary>
    protected virtual string? WorkTitlePattern { get; }

    /// <summary>XPath of the node whose inner text is the author name.</summary>
    protected virtual string? AuthorNamePattern { get; }

    /// <summary>XPath of the node whose <c>href</c> attribute links to the author.</summary>
    protected virtual string? AuthorLinkPattern { get; }

    /// <summary>XPath selecting the chapter link nodes (each must carry an <c>href</c> attribute).</summary>
    protected virtual string? ChapterUrlPattern { get; }

    /// <summary>XPath, evaluated from each chapter link node, of the node holding the chapter name.</summary>
    protected virtual string? ChapterNamePattern { get; }

    /// <summary>Not read by this base class; presumably for derived implementations of <see cref="GetDateTimeForChapter"/>.</summary>
    protected virtual string? ChapterPostedPattern { get; }

    /// <summary>Not read by this base class; presumably for derived implementations of <see cref="GetDateTimeForChapter"/>.</summary>
    protected virtual string? ChapterUpdatedPattern { get; }

    /// <summary>XPath selecting the nodes whose inner text are the novel's tags.</summary>
    protected virtual string? TagPattern { get; }

    /// <summary>XPath of the node whose inner text is the date the novel was posted.</summary>
    protected virtual string? DatePostedPattern { get; }

    /// <summary>XPath of the node whose inner text is the date the novel was last updated.</summary>
    protected virtual string? DateUpdatedPattern { get; }

    /// <summary>Downloads <paramref name="url"/> and parses the response body as HTML.</summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <returns>The parsed document. The HTTP status code is not inspected.</returns>
    protected async Task<HtmlDocument> GetPage(string url)
    {
        // Dispose the response once the body has been read so the connection is released promptly.
        using var response = await HttpClient.GetAsync(url);
        var html = await response.Content.ReadAsStringAsync();

        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        return doc;
    }

    /// <summary>
    /// Returns the posted/updated timestamps of a single chapter. The base implementation
    /// knows nothing about the page layout and returns <c>(null, null)</c>.
    /// </summary>
    /// <param name="linkNode">The chapter's link node.</param>
    /// <param name="baseNode">The root node of the novel page.</param>
    /// <param name="baseUrl">Base URL extracted with <see cref="BaseUrlPattern"/>.</param>
    /// <param name="novelUrl">Novel URL extracted with <see cref="UrlMatchPattern"/>.</param>
    protected virtual (DateTime? Posted, DateTime? Updated) GetDateTimeForChapter(HtmlNode linkNode, HtmlNode baseNode,
        string baseUrl, string novelUrl)
    {
        return (null, null);
    }

    /// <summary>Tells whether <paramref name="url"/> matches <see cref="UrlMatchPattern"/> (case-insensitively).</summary>
    public virtual bool MatchesUrl(string url)
    {
        // The static overload uses the framework's regex cache instead of building a new Regex per call.
        return Regex.IsMatch(url, UrlMatchPattern, RegexOptions.IgnoreCase);
    }

    /// <summary>Reads the novel title from the node selected by <see cref="WorkTitlePattern"/>.</summary>
    /// <exception cref="InvalidOperationException">The pattern is not configured or matches no node.</exception>
    protected virtual string GetNovelTitle(HtmlDocument document, string baseUrl, string novelUrl)
    {
        return RequireNode(document.DocumentNode, WorkTitlePattern, "novel title").InnerText;
    }

    /// <summary>
    /// Reads the author name and link from the page. This is best-effort: when the author
    /// cannot be determined for any reason the method returns <c>null</c> instead of throwing.
    /// </summary>
    /// <returns>The author, or <c>null</c> when it could not be extracted.</returns>
    protected virtual Author GetAuthor(HtmlDocument document, string baseUrl, string novelUrl)
    {
        try
        {
            var nameXPath = AuthorNamePattern;
            var urlXPath = AuthorLinkPattern;
            if (nameXPath is null || urlXPath is null)
            {
                return null!;
            }

            var nameNode = document.DocumentNode.SelectSingleNode(nameXPath);
            var authorUrl = document.DocumentNode.SelectSingleNode(urlXPath)?.Attributes["href"]?.Value;
            if (nameNode is null || authorUrl is null)
            {
                // The common "page has no author block" case is handled without raising an exception.
                return null!;
            }

            return new Author
            {
                Name = nameNode.InnerText,
                Url = $"{baseUrl}{authorUrl}"
            };
        }
        catch (Exception)
        {
            // Deliberate best-effort: anything unexpected (e.g. a malformed XPath) still yields "no author".
            return null!;
        }
    }

    /// <summary>
    /// Builds the chapter list from the link nodes selected by <see cref="ChapterUrlPattern"/>.
    /// Chapters are numbered from 1 in document order.
    /// </summary>
    /// <returns>The chapters, or an empty list when the page contains no chapter links.</returns>
    /// <exception cref="InvalidOperationException">A chapter link has no <c>href</c> or no name node.</exception>
    protected virtual List<Chapter> GetChapters(HtmlDocument document, string baseUrl, string novelUrl)
    {
        var nameXPath = ChapterNamePattern;

        // SelectNodes returns null (not an empty collection) when nothing matches.
        var linkNodes = document.DocumentNode.SelectNodes(ChapterUrlPattern);
        if (linkNodes is null)
        {
            return new List<Chapter>();
        }

        return linkNodes.Select((node, index) =>
        {
            var dates = GetDateTimeForChapter(node, document.DocumentNode, baseUrl, novelUrl);
            var href = node.Attributes["href"]?.Value
                ?? throw new InvalidOperationException($"Chapter link #{index + 1} has no href attribute.");
            var nameNode = RequireNode(node, nameXPath, "chapter name");

            return new Chapter
            {
                ChapterNumber = index + 1,
                Url = $"{baseUrl}{href}",
                Name = nameNode.InnerText,
                DatePosted = dates.Posted?.ToUniversalTime(),
                DateUpdated = dates.Updated?.ToUniversalTime()
            };
        }).ToList();
    }

    /// <summary>
    /// Collects the tags found via <see cref="TagPattern"/> and unions them with the tags
    /// returned by <see cref="GetMetadataTags"/>.
    /// </summary>
    protected virtual List<Tag> GetTags(HtmlDocument document, string baseUrl, string novelUrl)
    {
        // SelectNodes returns null when nothing matches; treat that as "no tags on the page".
        var tagNodes = document.DocumentNode.SelectNodes(TagPattern);
        var pageTags = tagNodes is null
            ? Enumerable.Empty<Tag>()
            : tagNodes.Select(node => new Tag
            {
                TagValue = node.InnerText
            });

        return pageTags.Union(GetMetadataTags(document, baseUrl, novelUrl)).ToList();
    }

    /// <summary>Parses the posted date from the node selected by <see cref="DatePostedPattern"/> and converts it to UTC.</summary>
    /// <exception cref="InvalidOperationException">The pattern is not configured or matches no node.</exception>
    protected virtual DateTime GetPostedDate(HtmlDocument document, string baseUrl, string novelUrl)
    {
        return ParseUtcDate(document, DatePostedPattern, "posted date");
    }

    /// <summary>Parses the last-updated date from the node selected by <see cref="DateUpdatedPattern"/> and converts it to UTC.</summary>
    /// <exception cref="InvalidOperationException">The pattern is not configured or matches no node.</exception>
    protected virtual DateTime GetLastUpdatedDate(HtmlDocument document, string baseUrl, string novelUrl)
    {
        return ParseUtcDate(document, DateUpdatedPattern, "last-updated date");
    }

    /// <summary>
    /// Cookies to send with every request. Called from the constructor; the base
    /// implementation returns an empty list.
    /// </summary>
    protected virtual List<Cookie> RequestCookies()
    {
        return new List<Cookie>();
    }

    /// <summary>Supplies additional tags that <see cref="GetTags"/> merges with the ones found via <see cref="TagPattern"/>.</summary>
    protected abstract IEnumerable<Tag> GetMetadataTags(HtmlDocument document, string baseUrl, string novelUrl);

    /// <summary>Downloads the novel page behind <paramref name="url"/> and extracts its metadata.</summary>
    /// <param name="url">A URL accepted by <see cref="MatchesUrl"/>.</param>
    /// <exception cref="ArgumentException"><paramref name="url"/> does not match <see cref="UrlMatchPattern"/>.</exception>
    /// <exception cref="InvalidOperationException">The downloaded document is empty or a required node is missing.</exception>
    public virtual async Task<Novel> ScrapeNovel(string url)
    {
        // Match case-insensitively, exactly like MatchesUrl, so every URL it accepts can be scraped.
        var baseUrl = Regex.Match(url, BaseUrlPattern, RegexOptions.IgnoreCase).Value;
        var novelUrl = Regex.Match(url, UrlMatchPattern, RegexOptions.IgnoreCase).Value;
        if (string.IsNullOrEmpty(novelUrl))
        {
            throw new ArgumentException("The URL is not supported by this scraper.", nameof(url));
        }

        var doc = await GetPage(novelUrl);
        if (string.IsNullOrEmpty(doc.Text))
        {
            throw new InvalidOperationException("Error parsing document");
        }

        return new Novel
        {
            Author = GetAuthor(doc, baseUrl, novelUrl),
            Chapters = GetChapters(doc, baseUrl, novelUrl),
            DatePosted = GetPostedDate(doc, baseUrl, novelUrl),
            LastUpdated = GetLastUpdatedDate(doc, baseUrl, novelUrl),
            Tags = GetTags(doc, baseUrl, novelUrl),
            Title = GetNovelTitle(doc, baseUrl, novelUrl),
            Url = novelUrl
        };
    }

    /// <summary>Not implemented yet; always throws.</summary>
    /// <exception cref="NotImplementedException">Always.</exception>
    public Task<string?> ScrapeChapterContent(string chapterUrl)
    {
        throw new NotImplementedException();
    }

    /// <summary>Selects the single node at <paramref name="xpath"/> below <paramref name="root"/>, failing with a descriptive error instead of a null dereference.</summary>
    private static HtmlNode RequireNode(HtmlNode root, string? xpath, string description)
    {
        if (xpath is null)
        {
            throw new InvalidOperationException($"No XPath pattern is configured for the {description}.");
        }

        return root.SelectSingleNode(xpath)
            ?? throw new InvalidOperationException($"The {description} was not found using XPath '{xpath}'.");
    }

    /// <summary>Parses the inner text of the node at <paramref name="xpath"/> as a date and converts it to UTC.</summary>
    private static DateTime ParseUtcDate(HtmlDocument document, string? xpath, string description)
    {
        var node = RequireNode(document.DocumentNode, xpath, description);

        // NOTE(review): DateTime.Parse uses the current culture, and a value without an offset is
        // treated as local time by ToUniversalTime — confirm this suits the scraped sites.
        return DateTime.Parse(node.InnerText).ToUniversalTime();
    }
}