Refactor and novel18 support (added cookie support in general to AbstractScraper.cs)
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
This commit is contained in:
@@ -1,11 +1,31 @@
|
||||
using System.Net;
|
||||
using System.Net.Http.Headers;
|
||||
using System.Text.RegularExpressions;
|
||||
using HtmlAgilityPack;
|
||||
using Treestar.Shared.Models.DBDomain;
|
||||
using Common.Models.DBDomain;
|
||||
|
||||
namespace WebNovelPortalAPI.Scrapers;
|
||||
|
||||
public abstract class AbstractScraper : IScraper
|
||||
{
|
||||
/// <summary>
/// Shared setup for all scrapers: builds one <see cref="HttpClient"/> whose handler
/// carries a <see cref="CookieContainer"/> pre-populated from <c>RequestCookies()</c>,
/// and sets a Chrome User-Agent so scraped sites serve normal markup.
/// </summary>
/// <remarks>
/// NOTE(review): <c>RequestCookies()</c> is virtual and is invoked here, before any
/// derived-class constructor has run (CA2214). Overrides must not depend on state
/// initialised in their own constructors.
/// </remarks>
protected AbstractScraper()
{
    var cookieContainer = new CookieContainer();
    var handler = new HttpClientHandler
    {
        CookieContainer = cookieContainer
    };

    var client = new HttpClient(handler);
    client.DefaultRequestHeaders.UserAgent.Add(
        new ProductInfoHeaderValue("Chrome", "96.0.4664.110"));

    // Guard against an override returning null: the base implementation returns
    // an empty list, but nothing forces overrides to do the same.
    var cookies = RequestCookies();
    if (cookies is not null)
    {
        foreach (var cookie in cookies)
        {
            cookieContainer.Add(cookie);
        }
    }

    HttpClient = client;
}
|
||||
|
||||
protected HttpClient HttpClient { get; }
|
||||
protected abstract string UrlMatchPattern { get; }
|
||||
protected abstract string BaseUrlPattern { get; }
|
||||
protected virtual string? WorkTitlePattern { get; }
|
||||
@@ -19,6 +39,14 @@ public abstract class AbstractScraper : IScraper
|
||||
protected virtual string? DatePostedPattern { get; }
|
||||
protected virtual string? DateUpdatedPattern { get; }
|
||||
|
||||
/// <summary>
/// Downloads <paramref name="url"/> with the scraper's shared <see cref="HttpClient"/>
/// (cookies and User-Agent applied) and parses the body into an <see cref="HtmlDocument"/>.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>The parsed HTML document.</returns>
/// <exception cref="HttpRequestException">
/// Thrown for non-success status codes. Previously the error page was silently
/// parsed and handed to callers, which failed later with confusing XPath misses.
/// </exception>
protected async Task<HtmlDocument> GetPage(string url)
{
    // Dispose the response once the body has been read (the original leaked it).
    using var response = await HttpClient.GetAsync(url);
    response.EnsureSuccessStatusCode();

    var doc = new HtmlDocument();
    doc.LoadHtml(await response.Content.ReadAsStringAsync());
    return doc;
}
|
||||
|
||||
protected virtual (DateTime? Posted, DateTime? Updated) GetDateTimeForChapter(HtmlNode linkNode, HtmlNode baseNode,
|
||||
string baseUrl, string novelUrl)
|
||||
{
|
||||
@@ -87,7 +115,7 @@ public abstract class AbstractScraper : IScraper
|
||||
return nodes.Select(node => new Tag
|
||||
{
|
||||
TagValue = node.InnerText
|
||||
}).ToList();
|
||||
}).Union(GetMetadataTags(document, baseUrl, novelUrl)).ToList();
|
||||
}
|
||||
|
||||
protected virtual DateTime GetPostedDate(HtmlDocument document, string baseUrl, string novelUrl)
|
||||
@@ -101,18 +129,25 @@ public abstract class AbstractScraper : IScraper
|
||||
var xpath = DateUpdatedPattern;
|
||||
return DateTime.Parse(document.DocumentNode.SelectSingleNode(xpath).InnerText).ToUniversalTime();
|
||||
}
|
||||
|
||||
public Novel ScrapeNovel(string url)
|
||||
|
||||
/// <summary>
/// Cookies used to seed the shared cookie container before any request is made.
/// The base scraper supplies none; site-specific scrapers (e.g. ones that need an
/// age-verification cookie) override this.
/// </summary>
/// <returns>The cookies to add; empty by default.</returns>
protected virtual List<Cookie> RequestCookies() => new();
|
||||
|
||||
protected abstract IEnumerable<Tag> GetMetadataTags(HtmlDocument document, string baseUrl, string novelUrl);
|
||||
|
||||
public virtual async Task<Novel> ScrapeNovel(string url)
|
||||
{
|
||||
var web = new HtmlWeb();
|
||||
var doc = web.Load(url);
|
||||
if (doc == null)
|
||||
{
|
||||
throw new Exception("Error parsing document");
|
||||
}
|
||||
|
||||
var baseUrl = new Regex(BaseUrlPattern).Match(url).Value;
|
||||
var novelUrl = new Regex(UrlMatchPattern).Match(url).Value;
|
||||
var doc = await GetPage(novelUrl);
|
||||
if (string.IsNullOrEmpty(doc.Text))
|
||||
{
|
||||
throw new Exception("Error parsing document");
|
||||
}
|
||||
|
||||
return new Novel
|
||||
{
|
||||
Author = GetAuthor(doc, baseUrl, novelUrl),
|
||||
@@ -125,7 +160,7 @@ public abstract class AbstractScraper : IScraper
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
/// Fetches the body text of a single chapter. Not implemented on the base
/// scraper; concrete scrapers are expected to provide this.
/// </summary>
/// <param name="chapterUrl">Absolute URL of the chapter page.</param>
/// <returns>The chapter content, or null when unavailable.</returns>
/// <exception cref="NotImplementedException">Always thrown by this base implementation.</exception>
public Task<string?> ScrapeChapterContent(string chapterUrl)
{
    // The diff residue stacked the superseded synchronous signature above this
    // one; only the current Task-returning version is kept.
    throw new NotImplementedException();
}
|
||||
|
||||
Reference in New Issue
Block a user