WebNovelPortal/WebNovelPortalAPI/Scrapers/AbstractScraper.cs

using System.Text.RegularExpressions;
using DBConnection.Models;
using HtmlAgilityPack;

namespace WebNovelPortalAPI.Scrapers;

public abstract class AbstractScraper : IScraper
{
    // Regex used to decide whether this scraper handles a given URL.
    protected abstract string UrlMatchPattern { get; }

    // Regex that extracts the site's base URL from a full novel URL.
    protected abstract string BaseUrlPattern { get; }

    // XPath selectors for the individual pieces of novel metadata.
    // Concrete scrapers override the ones their site supports.
    protected virtual string? WorkTitlePattern { get; }
    protected virtual string? AuthorNamePattern { get; }
    protected virtual string? AuthorLinkPattern { get; }
    protected virtual string? ChapterUrlPattern { get; }
    protected virtual string? ChapterNamePattern { get; }
    protected virtual string? ChapterPostedPattern { get; }
    protected virtual string? ChapterUpdatedPattern { get; }
    protected virtual string? TagPattern { get; }
    protected virtual string? DatePostedPattern { get; }
    protected virtual string? DateUpdatedPattern { get; }

    public virtual bool MatchesUrl(string url)
    {
        var regex = new Regex(UrlMatchPattern, RegexOptions.IgnoreCase);
        return regex.IsMatch(url);
    }

    protected virtual string GetNovelTitle(HtmlDocument document)
    {
        var xpath = WorkTitlePattern;
        return document.DocumentNode.SelectSingleNode(xpath).InnerText;
    }

    protected virtual Author GetAuthor(HtmlDocument document, string baseUrl)
    {
        var nameXPath = AuthorNamePattern;
        var urlXPath = AuthorLinkPattern;
        var authorName = document.DocumentNode.SelectSingleNode(nameXPath).InnerText;
        var authorUrl = document.DocumentNode.SelectSingleNode(urlXPath).Attributes["href"].Value;
        Author author = new Author
        {
            Name = authorName,
            Url = $"{baseUrl}{authorUrl}"
        };
        return author;
    }

    protected virtual List<Chapter> GetChapters(HtmlDocument document, string baseUrl)
    {
        var urlXPath = ChapterUrlPattern;
        var nameXPath = ChapterNamePattern;
        var urlNodes = document.DocumentNode.SelectNodes(urlXPath);
        // Chapter numbers are assigned from document order, starting at 1.
        var chapters = urlNodes.Select((node, i) => new Chapter
        {
            ChapterNumber = i + 1,
            Url = $"{baseUrl}{node.Attributes["href"].Value}",
            Name = node.SelectSingleNode(nameXPath).InnerText
        });
        return chapters.ToList();
    }

    protected virtual List<Tag> GetTags(HtmlDocument document)
    {
        var xpath = TagPattern;
        var nodes = document.DocumentNode.SelectNodes(xpath);
        return nodes.Select(node => new Tag
        {
            TagValue = node.InnerText
        }).ToList();
    }

    protected virtual DateTime GetPostedDate(HtmlDocument document)
    {
        var xpath = DatePostedPattern;
        return DateTime.Parse(document.DocumentNode.SelectSingleNode(xpath).InnerText);
    }

    protected virtual DateTime GetLastUpdatedDate(HtmlDocument document)
    {
        var xpath = DateUpdatedPattern;
        return DateTime.Parse(document.DocumentNode.SelectSingleNode(xpath).InnerText);
    }

    public Novel ScrapeNovel(string url)
    {
        // Load the page, then assemble the Novel from the patterns supplied by the concrete scraper.
        var web = new HtmlWeb();
        var doc = web.Load(url);
        if (doc == null)
        {
            throw new Exception("Error parsing document");
        }
        var baseUrl = new Regex(BaseUrlPattern).Match(url).Value;
        var novelUrl = new Regex(UrlMatchPattern).Match(url).Value;
        return new Novel
        {
            Author = GetAuthor(doc, baseUrl),
            Chapters = GetChapters(doc, baseUrl),
            DatePosted = GetPostedDate(doc),
            LastUpdated = GetLastUpdatedDate(doc),
            Tags = GetTags(doc),
            Title = GetNovelTitle(doc),
            Url = novelUrl
        };
    }

    public string? ScrapeChapterContent(string chapterUrl)
    {
        throw new NotImplementedException();
    }
}
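
For illustration, a concrete scraper only needs to supply the URL regexes and the XPath selectors; everything else comes from AbstractScraper. The sketch below is hypothetical: the class name, the site "example-novels.com", and every pattern are placeholders invented for this example, not selectors taken from the repository.

// Hypothetical sketch of a concrete scraper. The URL regexes and XPath
// selectors below are illustrative placeholders for an imaginary site.
namespace WebNovelPortalAPI.Scrapers;

public class ExampleNovelsScraper : AbstractScraper
{
    protected override string UrlMatchPattern => @"https?://(www\.)?example-novels\.com/novel/[^/?#]+";
    protected override string BaseUrlPattern => @"https?://(www\.)?example-novels\.com";

    protected override string? WorkTitlePattern => "//h1[@class='novel-title']";
    protected override string? AuthorNamePattern => "//a[@class='author-name']";
    protected override string? AuthorLinkPattern => "//a[@class='author-name']";
    protected override string? ChapterUrlPattern => "//ul[@class='chapter-list']//a";
    protected override string? ChapterNamePattern => ".";
    protected override string? TagPattern => "//div[@class='tags']//a";
    protected override string? DatePostedPattern => "//time[@class='posted']";
    protected override string? DateUpdatedPattern => "//time[@class='updated']";
}

A caller would then check MatchesUrl(url) to pick a scraper and call ScrapeNovel(url) to get a populated Novel.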