db changes and build pipeline
Some checks failed
continuous-integration/drone/push Build is failing

This commit is contained in:
2022-07-15 12:21:37 -04:00
parent 5337e7ccb8
commit e6d6b629db
30 changed files with 1332 additions and 132 deletions

View File

@@ -5,95 +5,29 @@ using HtmlAgilityPack;
namespace WebNovelPortalAPI.Scrapers;
public class KakuyomuScraper : IScraper
public class KakuyomuScraper : AbstractScraper
{
private const string UrlPattern = @"https?:\/\/kakuyomu\.jp\/works\/\d+\/?";
private const string BaseUrl = "https://kakuyomu.jp";
/// <summary>
/// Reports whether <paramref name="url"/> contains a Kakuyomu work URL,
/// tested case-insensitively against <c>UrlPattern</c>.
/// </summary>
/// <param name="url">Candidate URL to test.</param>
/// <returns><c>true</c> when the pattern is found anywhere in <paramref name="url"/>.</returns>
public bool MatchesUrl(string url) =>
    Regex.IsMatch(url, UrlPattern, RegexOptions.IgnoreCase);
/// <summary>
/// Regular expression for a Kakuyomu work page URL
/// (<c>kakuyomu.jp/works/</c> followed by digits and an optional trailing slash).
/// </summary>
protected override string UrlMatchPattern
{
    get { return @"https?:\/\/kakuyomu\.jp\/works\/\d+\/?"; }
}
/// <summary>
/// Reads the work title from the anchor inside the element whose id is <c>workTitle</c>.
/// </summary>
/// <param name="document">HTML document to query.</param>
/// <returns>The inner text of the matched anchor.</returns>
private string GetNovelTitle(HtmlDocument document)
{
    const string titleXPath = @"//*[@id='workTitle']/a";
    var titleNode = document.DocumentNode.SelectSingleNode(titleXPath);
    return titleNode.InnerText;
}
/// <summary>
/// Regular expression matching the scheme and host of the Kakuyomu site
/// (<c>http</c> or <c>https</c>, then <c>kakuyomu.jp</c>).
/// </summary>
protected override string BaseUrlPattern
{
    get { return @"https?:\/\/kakuyomu\.jp"; }
}
// Builds an Author from the anchor under the element with id 'workAuthor-activityName':
// the anchor's inner text becomes Name and its href, appended to BaseUrl, becomes Url.
private Author GetAuthor(HtmlDocument document)
{
// Name and link are read from the same anchor, so both XPath expressions are identical.
var nameXPath = @"//*[@id='workAuthor-activityName']/a";
var urlXPath = @"//*[@id='workAuthor-activityName']/a";
var authorName = document.DocumentNode.SelectSingleNode(nameXPath).InnerText;
var authorUrl = document.DocumentNode.SelectSingleNode(urlXPath).Attributes["href"].Value;
Author author = new Author
{
Name = authorName,
// presumably the href is site-relative, hence the BaseUrl prefix — TODO confirm
Url = $"{BaseUrl + authorUrl}"
};
return author;
// NOTE(review): the three overrides below are property declarations that appear inside
// this method body as the text stands; they repeat the XPath strings used above.
// Confirm their intended placement at class level.
protected override string? WorkTitlePattern => @"//*[@id='workTitle']/a";
protected override string? AuthorNamePattern => @"//*[@id='workAuthor-activityName']/a";
protected override string? AuthorLinkPattern => @"//*[@id='workAuthor-activityName']/a";
}
/// <summary>
/// XPath selecting the chapter link anchors
/// (<c>a</c> elements with class <c>widget-toc-episode-episodeTitle</c>).
/// </summary>
protected override string? ChapterUrlPattern
{
    get { return @"//a[@class='widget-toc-episode-episodeTitle']"; }
}
// Builds the chapter list from every anchor with class 'widget-toc-episode-episodeTitle'.
// Each chapter gets a 1-based ChapterNumber from its position in the node list, a Url made
// of BaseUrl plus the anchor's href, and a Name taken from the anchor's child <span>.
private List<Chapter> GetChapters(HtmlDocument document)
{
var urlxpath = @"//a[@class='widget-toc-episode-episodeTitle']";
// Relative XPath: evaluated against each chapter anchor, not the whole document.
var namexpath = @"span";
// NOTE(review): the SelectNodes result is used without a null check — confirm what
// happens for a work page that lists no chapters.
var urlnodes = document.DocumentNode.SelectNodes(urlxpath);
var chapters = urlnodes.Select((node, i) => new Chapter
{
ChapterNumber = i + 1,
Url = $"{BaseUrl}{node.Attributes["href"].Value}",
Name = node.SelectSingleNode(namexpath).InnerText
});
// NOTE(review): the override below is a property declaration that appears inside this
// method body as the text stands; it repeats the 'span' XPath used above.
// Confirm its intended placement at class level.
protected override string? ChapterNamePattern => @"span";
return chapters.ToList();
}
/// <summary>
/// Chapter posted-date pattern; this override supplies nothing of its own
/// and simply returns the base class value.
/// </summary>
protected override string? ChapterPostedPattern
{
    get { return base.ChapterPostedPattern; }
}
/// <summary>
/// Collects the work's tags from the anchors under
/// <c>span[itemprop='keywords']</c>, one <see cref="Tag"/> per anchor.
/// </summary>
/// <param name="document">HTML document to query.</param>
/// <returns>
/// A list whose <c>TagValue</c> entries are the anchors' inner text,
/// or an empty list when the page has no matching anchors.
/// </returns>
private List<Tag> GetTags(HtmlDocument document)
{
    var xpath = @"//span[@itemprop='keywords']/a";
    var nodes = document.DocumentNode.SelectNodes(xpath);

    // HtmlAgilityPack returns null, not an empty collection, when nothing matches;
    // a work without tags should produce an empty list rather than throw.
    if (nodes is null)
    {
        return new List<Tag>();
    }

    return nodes.Select(node => new Tag
    {
        TagValue = node.InnerText
    }).ToList();
}
/// <summary>
/// Chapter updated-date pattern; this override supplies nothing of its own
/// and simply returns the base class value.
/// </summary>
protected override string? ChapterUpdatedPattern
{
    get { return base.ChapterUpdatedPattern; }
}
/// <summary>
/// Parses the publication date from the inner text of the
/// <c>time[itemprop='datePublished']</c> element.
/// </summary>
/// <param name="document">HTML document to query.</param>
/// <returns>The value produced by <see cref="DateTime.Parse(string)"/> for that text.</returns>
private DateTime GetPostedDate(HtmlDocument document)
{
    const string publishedXPath = @"//time[@itemprop='datePublished']";
    var publishedNode = document.DocumentNode.SelectSingleNode(publishedXPath);
    var publishedText = publishedNode.InnerText;
    return DateTime.Parse(publishedText);
}
/// <summary>
/// XPath selecting the tag anchors (<c>a</c> elements under
/// <c>span[itemprop='keywords']</c>).
/// </summary>
protected override string? TagPattern
{
    get { return @"//span[@itemprop='keywords']/a"; }
}
/// <summary>
/// Parses the last-modified date from the inner text of the
/// <c>time[itemprop='dateModified']</c> element.
/// </summary>
/// <param name="document">HTML document to query.</param>
/// <returns>The value produced by <see cref="DateTime.Parse(string)"/> for that text.</returns>
private DateTime GetLastUpdatedDate(HtmlDocument document)
{
    const string modifiedXPath = @"//time[@itemprop='dateModified']";
    var modifiedNode = document.DocumentNode.SelectSingleNode(modifiedXPath);
    var modifiedText = modifiedNode.InnerText;
    return DateTime.Parse(modifiedText);
}
/// <summary>
/// XPath selecting the publication date element
/// (<c>time</c> with <c>itemprop='datePublished'</c>).
/// </summary>
protected override string? DatePostedPattern
{
    get { return @"//time[@itemprop='datePublished']"; }
}
/// <summary>
/// Loads the page at <paramref name="url"/> with <c>HtmlWeb</c> and assembles a
/// <see cref="Novel"/> from it: author, chapters, posted and last-updated dates,
/// tags and title, plus the URL it was loaded from.
/// </summary>
/// <param name="url">Address of the work page to load.</param>
/// <returns>A new <see cref="Novel"/> populated from the loaded document.</returns>
/// <exception cref="Exception">Thrown when the loader returns no document.</exception>
public Novel ScrapeNovel(string url)
{
    // The original also allocated an unused Novel up front; only the one returned below is needed.
    var web = new HtmlWeb();
    var doc = web.Load(url);
    if (doc == null)
    {
        throw new Exception("Error parsing document");
    }

    return new Novel
    {
        Author = GetAuthor(doc),
        Chapters = GetChapters(doc),
        DatePosted = GetPostedDate(doc),
        LastUpdated = GetLastUpdatedDate(doc),
        Tags = GetTags(doc),
        Title = GetNovelTitle(doc),
        Url = url
    };
}
/// <summary>
/// XPath selecting the last-modified date element
/// (<c>time</c> with <c>itemprop='dateModified'</c>).
/// </summary>
protected override string? DateUpdatedPattern
{
    get { return @"//time[@itemprop='dateModified']"; }
}
public string? ScrapeChapterContent(string chapterUrl)
{