Updated lots of stuff: got multi-scrape working (still need to test non-nullable chapter novel IDs with our current model); now supports SQLite and Postgres concurrently (and it is easy to add more). Still need to get it deployed and do auth.
Some checks failed
continuous-integration/drone/push Build is failing

This commit is contained in:
2022-07-16 17:17:43 -04:00
parent eab3399268
commit d98324c11e
73 changed files with 1591 additions and 680 deletions

View File

@@ -1,6 +1,6 @@
using System.Text.RegularExpressions;
using DBConnection.Models;
using HtmlAgilityPack;
using Treestar.Shared.Models.DBDomain;
namespace WebNovelPortalAPI.Scrapers;
@@ -18,6 +18,12 @@ public abstract class AbstractScraper : IScraper
protected virtual string? TagPattern { get; }
protected virtual string? DatePostedPattern { get; }
protected virtual string? DateUpdatedPattern { get; }
/// <summary>
/// Supplies the posted/updated dates for a single chapter. This base version
/// supplies none; the member is virtual so a derived scraper can override it.
/// </summary>
/// <param name="linkNode">The chapter node currently being converted (GetChapters passes each selected node).</param>
/// <param name="baseNode">The document's root node (GetChapters passes <c>document.DocumentNode</c>).</param>
/// <param name="baseUrl">Base URL forwarded from the caller; not read here.</param>
/// <param name="novelUrl">Novel URL forwarded from the caller; not read here.</param>
/// <returns>A (Posted, Updated) tuple whose elements are both <c>null</c>.</returns>
protected virtual (DateTime? Posted, DateTime? Updated) GetDateTimeForChapter(
    HtmlNode linkNode,
    HtmlNode baseNode,
    string baseUrl,
    string novelUrl) => (null, null);
public virtual bool MatchesUrl(string url)
{
@@ -25,43 +31,56 @@ public abstract class AbstractScraper : IScraper
return regex.IsMatch(url);
}
// Returns the InnerText of the single node selected from the document root
// by the WorkTitlePattern XPath.
// NOTE(review): this is a diff rendering whose +/- markers were lost. The first
// signature line appears to be the removed form and the second (with baseUrl and
// novelUrl) the added one; confirm against the repository. Neither new parameter
// is read in the body shown here.
// NOTE(review): the result of SelectSingleNode is dereferenced without a null
// check; presumably it is null when the XPath matches nothing (confirm against
// the HtmlAgilityPack docs).
protected virtual string GetNovelTitle(HtmlDocument document)
protected virtual string GetNovelTitle(HtmlDocument document, string baseUrl, string novelUrl)
{
var xpath = WorkTitlePattern;
return document.DocumentNode.SelectSingleNode(xpath).InnerText;
}
// Builds an Author from the document: Name is the InnerText of the node selected
// by AuthorNamePattern, and Url is baseUrl concatenated with the "href" attribute
// of the node selected by AuthorLinkPattern.
// NOTE(review): this is a diff rendering whose +/- markers were lost. The first
// signature line and the statements that sit outside the try block appear to be
// the removed version; the try/catch form appears to be the added one. Confirm
// against the repository. novelUrl is not read in the body shown here.
protected virtual Author GetAuthor(HtmlDocument document, string baseUrl)
protected virtual Author GetAuthor(HtmlDocument document, string baseUrl, string novelUrl)
{
var nameXPath = AuthorNamePattern;
var urlXPath = AuthorLinkPattern;
var authorName = document.DocumentNode.SelectSingleNode(nameXPath).InnerText;
var authorUrl = document.DocumentNode.SelectSingleNode(urlXPath).Attributes["href"].Value;
Author author = new Author
try
{
Name = authorName,
Url = $"{baseUrl + authorUrl}"
};
return author;
var authorName = document.DocumentNode.SelectSingleNode(nameXPath).InnerText;
var authorUrl = document.DocumentNode.SelectSingleNode(urlXPath).Attributes["href"].Value;
Author author = new Author
{
Name = authorName,
Url = $"{baseUrl + authorUrl}"
};
return author;
}
catch (Exception e)
{
// NOTE(review): every exception type is caught and discarded (e is unused), and
// null is returned even though the declared return type is the non-nullable
// Author. Callers must be prepared for a null author.
return null;
}
}
// Produces one Chapter per node selected from the document root by
// ChapterUrlPattern. For each node: ChapterNumber is the 1-based position in the
// selection, Url is baseUrl followed by the node's "href" attribute, Name is the
// InnerText of the node selected from it with ChapterNamePattern, and
// DatePosted/DateUpdated come from GetDateTimeForChapter.
// NOTE(review): this is a diff rendering whose +/- markers were lost. The first
// signature line and the first `urlnodes.Select(... => new Chapter` projection
// (without dates) appear to be the removed version; the block-bodied lambda that
// calls GetDateTimeForChapter appears to be the added one. Confirm against the
// repository.
// NOTE(review): the SelectNodes and SelectSingleNode results are used without
// null checks; presumably both are null when nothing matches (confirm against
// the HtmlAgilityPack docs).
protected virtual List<Chapter> GetChapters(HtmlDocument document, string baseUrl)
protected virtual List<Chapter> GetChapters(HtmlDocument document, string baseUrl, string novelUrl)
{
var urlxpath = ChapterUrlPattern;
var namexpath = ChapterNamePattern;
var urlnodes = document.DocumentNode.SelectNodes(urlxpath);
var chapters = urlnodes.Select((node, i) => new Chapter
var chapters = urlnodes.Select((node, i) =>
{
ChapterNumber = i + 1,
Url = $"{baseUrl}{node.Attributes["href"].Value}",
Name = node.SelectSingleNode(namexpath).InnerText
var dates = GetDateTimeForChapter(node, document.DocumentNode, baseUrl, novelUrl);
return new Chapter
{
ChapterNumber = i + 1,
Url = $"{baseUrl}{node.Attributes["href"].Value}",
Name = node.SelectSingleNode(namexpath).InnerText,
DatePosted = dates.Posted,
DateUpdated = dates.Updated
};
});
return chapters.ToList();
}
protected virtual List<Tag> GetTags(HtmlDocument document)
protected virtual List<Tag> GetTags(HtmlDocument document, string baseUrl, string novelUrl)
{
var xpath = TagPattern;
var nodes = document.DocumentNode.SelectNodes(xpath);
@@ -71,13 +90,13 @@ public abstract class AbstractScraper : IScraper
}).ToList();
}
// Returns the novel's posted date: the InnerText of the node selected from the
// document root by DatePostedPattern, converted with DateTime.Parse.
// NOTE(review): this is a diff rendering whose +/- markers were lost. The first
// signature line appears to be the removed form and the second (with baseUrl and
// novelUrl) the added one; confirm against the repository. Neither new parameter
// is read in the body shown here.
// NOTE(review): DateTime.Parse is called without an explicit culture, so parsing
// presumably follows the current culture of the running process; confirm this is
// intended for scraped text. The SelectSingleNode result is also dereferenced
// without a null check.
protected virtual DateTime GetPostedDate(HtmlDocument document)
protected virtual DateTime GetPostedDate(HtmlDocument document, string baseUrl, string novelUrl)
{
var xpath = DatePostedPattern;
return DateTime.Parse(document.DocumentNode.SelectSingleNode(xpath).InnerText);
}
protected virtual DateTime GetLastUpdatedDate(HtmlDocument document)
protected virtual DateTime GetLastUpdatedDate(HtmlDocument document, string baseUrl, string novelUrl)
{
var xpath = DateUpdatedPattern;
return DateTime.Parse(document.DocumentNode.SelectSingleNode(xpath).InnerText);
@@ -96,12 +115,12 @@ public abstract class AbstractScraper : IScraper
var novelUrl = new Regex(UrlMatchPattern).Match(url).Value;
return new Novel
{
Author = GetAuthor(doc, baseUrl),
Chapters = GetChapters(doc, baseUrl),
DatePosted = GetPostedDate(doc),
LastUpdated = GetLastUpdatedDate(doc),
Tags = GetTags(doc),
Title = GetNovelTitle(doc),
Author = GetAuthor(doc, baseUrl, novelUrl),
Chapters = GetChapters(doc, baseUrl, novelUrl),
DatePosted = GetPostedDate(doc, baseUrl, novelUrl),
LastUpdated = GetLastUpdatedDate(doc, baseUrl, novelUrl),
Tags = GetTags(doc, baseUrl, novelUrl),
Title = GetNovelTitle(doc, baseUrl, novelUrl),
Url = novelUrl
};
}