Updated lots of stuff: got multi-site scraping working. Still need to test non-nullable chapter/novel IDs against the current model. The DB layer now supports SQLite and Postgres concurrently (and makes it easy to add more providers). Still to do: deployment and auth.
Some checks failed: continuous-integration/drone/push (build is failing)
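The SQLite/Postgres support mentioned in the message is not part of this diff. Purely as a hedged illustration of what "supports SQLite and Postgres concurrently (and easy to add more)" could look like, here is a minimal sketch assuming EF Core; NovelContext, the provider keys, and the factory are hypothetical names, not code from this repository:

    // Hypothetical sketch only, assuming EF Core; not code from this commit.
    using System;
    using Microsoft.EntityFrameworkCore;

    public class NovelContext : DbContext
    {
        public NovelContext(DbContextOptions<NovelContext> options) : base(options) { }
    }

    public static class DbContextFactory
    {
        // One context type, many providers: adding another backend is one more case.
        public static NovelContext Create(string provider, string connectionString)
        {
            var builder = new DbContextOptionsBuilder<NovelContext>();
            switch (provider)
            {
                case "sqlite":
                    builder.UseSqlite(connectionString);   // Microsoft.EntityFrameworkCore.Sqlite
                    break;
                case "postgres":
                    builder.UseNpgsql(connectionString);   // Npgsql.EntityFrameworkCore.PostgreSQL
                    break;
                default:
                    throw new ArgumentException($"Unknown provider: {provider}");
            }
            return new NovelContext(builder.Options);
        }
    }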
@@ -1,6 +1,6 @@
 using System.Text.RegularExpressions;
-using DBConnection.Models;
 using HtmlAgilityPack;
+using Treestar.Shared.Models.DBDomain;
 
 namespace WebNovelPortalAPI.Scrapers;
 
@@ -18,6 +18,12 @@ public abstract class AbstractScraper : IScraper
     protected virtual string? TagPattern { get; }
     protected virtual string? DatePostedPattern { get; }
     protected virtual string? DateUpdatedPattern { get; }
 
+    protected virtual (DateTime? Posted, DateTime? Updated) GetDateTimeForChapter(HtmlNode linkNode, HtmlNode baseNode,
+        string baseUrl, string novelUrl)
+    {
+        return (null, null);
+    }
+
     public virtual bool MatchesUrl(string url)
     {
@@ -25,43 +31,56 @@ public abstract class AbstractScraper : IScraper
         return regex.IsMatch(url);
     }
 
-    protected virtual string GetNovelTitle(HtmlDocument document)
+    protected virtual string GetNovelTitle(HtmlDocument document, string baseUrl, string novelUrl)
     {
         var xpath = WorkTitlePattern;
         return document.DocumentNode.SelectSingleNode(xpath).InnerText;
     }
 
-    protected virtual Author GetAuthor(HtmlDocument document, string baseUrl)
+    protected virtual Author GetAuthor(HtmlDocument document, string baseUrl, string novelUrl)
     {
         var nameXPath = AuthorNamePattern;
         var urlXPath = AuthorLinkPattern;
-        var authorName = document.DocumentNode.SelectSingleNode(nameXPath).InnerText;
-        var authorUrl = document.DocumentNode.SelectSingleNode(urlXPath).Attributes["href"].Value;
-        Author author = new Author
-        {
-            Name = authorName,
-            Url = $"{baseUrl + authorUrl}"
-        };
-        return author;
+        try
+        {
+            var authorName = document.DocumentNode.SelectSingleNode(nameXPath).InnerText;
+            var authorUrl = document.DocumentNode.SelectSingleNode(urlXPath).Attributes["href"].Value;
+            Author author = new Author
+            {
+                Name = authorName,
+                Url = $"{baseUrl + authorUrl}"
+            };
+            return author;
+        }
+        catch (Exception e)
+        {
+            return null;
+        }
+
     }
 
-    protected virtual List<Chapter> GetChapters(HtmlDocument document, string baseUrl)
+    protected virtual List<Chapter> GetChapters(HtmlDocument document, string baseUrl, string novelUrl)
     {
         var urlxpath = ChapterUrlPattern;
         var namexpath = ChapterNamePattern;
         var urlnodes = document.DocumentNode.SelectNodes(urlxpath);
-        var chapters = urlnodes.Select((node, i) => new Chapter
+        var chapters = urlnodes.Select((node, i) =>
         {
-            ChapterNumber = i + 1,
-            Url = $"{baseUrl}{node.Attributes["href"].Value}",
-            Name = node.SelectSingleNode(namexpath).InnerText
+            var dates = GetDateTimeForChapter(node, document.DocumentNode, baseUrl, novelUrl);
+            return new Chapter
+            {
+                ChapterNumber = i + 1,
+                Url = $"{baseUrl}{node.Attributes["href"].Value}",
+                Name = node.SelectSingleNode(namexpath).InnerText,
+                DatePosted = dates.Posted,
+                DateUpdated = dates.Updated
+            };
         });
 
         return chapters.ToList();
     }
 
-    protected virtual List<Tag> GetTags(HtmlDocument document)
+    protected virtual List<Tag> GetTags(HtmlDocument document, string baseUrl, string novelUrl)
    {
         var xpath = TagPattern;
         var nodes = document.DocumentNode.SelectNodes(xpath);
@@ -71,13 +90,13 @@ public abstract class AbstractScraper : IScraper
         }).ToList();
     }
 
-    protected virtual DateTime GetPostedDate(HtmlDocument document)
+    protected virtual DateTime GetPostedDate(HtmlDocument document, string baseUrl, string novelUrl)
     {
         var xpath = DatePostedPattern;
         return DateTime.Parse(document.DocumentNode.SelectSingleNode(xpath).InnerText);
     }
 
-    protected virtual DateTime GetLastUpdatedDate(HtmlDocument document)
+    protected virtual DateTime GetLastUpdatedDate(HtmlDocument document, string baseUrl, string novelUrl)
     {
         var xpath = DateUpdatedPattern;
         return DateTime.Parse(document.DocumentNode.SelectSingleNode(xpath).InnerText);
@@ -96,12 +115,12 @@ public abstract class AbstractScraper : IScraper
         var novelUrl = new Regex(UrlMatchPattern).Match(url).Value;
         return new Novel
         {
-            Author = GetAuthor(doc, baseUrl),
-            Chapters = GetChapters(doc, baseUrl),
-            DatePosted = GetPostedDate(doc),
-            LastUpdated = GetLastUpdatedDate(doc),
-            Tags = GetTags(doc),
-            Title = GetNovelTitle(doc),
+            Author = GetAuthor(doc, baseUrl, novelUrl),
+            Chapters = GetChapters(doc, baseUrl, novelUrl),
+            DatePosted = GetPostedDate(doc, baseUrl, novelUrl),
+            LastUpdated = GetLastUpdatedDate(doc, baseUrl, novelUrl),
+            Tags = GetTags(doc, baseUrl, novelUrl),
+            Title = GetNovelTitle(doc, baseUrl, novelUrl),
             Url = novelUrl
         };
     }
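Note on the "need to test non-nullable chapter/novel IDs" item in the commit message: in the hunk above, Chapter objects are built before their parent Novel has been saved, which is exactly the case a non-nullable foreign key has to survive. A minimal sketch of the shape in question follows; the real Treestar.Shared.Models.DBDomain types are not shown in this diff, so the property names here are assumptions:

    // Hypothetical model sketch; the actual DBDomain types are not in this diff.
    public class Novel
    {
        public int Id { get; set; }
        public List<Chapter> Chapters { get; set; } = new();
    }

    public class Chapter
    {
        public int Id { get; set; }
        public int NovelId { get; set; }    // non-nullable FK to the parent Novel
        public string? Url { get; set; }
    }

If the ORM is EF Core, inserting a Novel with its Chapters attached lets SaveChanges fix up NovelId automatically; inserting chapters on their own would require setting the FK explicitly, which is the path worth testing.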
@@ -1,4 +1,4 @@
-using DBConnection.Models;
+using Treestar.Shared.Models.DBDomain;
 
 namespace WebNovelPortalAPI.Scrapers;
 
@@ -1,6 +1,5 @@
-using System.Reflection.Metadata;
 using System.Text.RegularExpressions;
 using DBConnection.Models;
 using HtmlAgilityPack;
 
 namespace WebNovelPortalAPI.Scrapers;
@@ -19,7 +18,7 @@ public class KakuyomuScraper : AbstractScraper
 
     protected override string? ChapterNamePattern => @"span";
 
-    protected override string? ChapterPostedPattern => base.ChapterPostedPattern;
+    protected override string? ChapterPostedPattern => @"time";
 
     protected override string? ChapterUpdatedPattern => base.ChapterUpdatedPattern;
 
@@ -29,8 +28,10 @@ public class KakuyomuScraper : AbstractScraper
 
     protected override string? DateUpdatedPattern => @"//time[@itemprop='dateModified']";
 
-    public string? ScrapeChapterContent(string chapterUrl)
+    protected override (DateTime? Posted, DateTime? Updated) GetDateTimeForChapter(HtmlNode linkNode, HtmlNode baseNode, string baseUrl,
+        string novelUrl)
     {
-        throw new NotImplementedException();
+        var datePosted = linkNode.SelectSingleNode(ChapterPostedPattern).Attributes["datetime"].Value;
+        return (DateTime.Parse(datePosted), null);
     }
 }
@@ -1,10 +1,15 @@
 using System.Text.RegularExpressions;
 using HtmlAgilityPack;
+using Treestar.Shared.Models.DBDomain;
 
 namespace WebNovelPortalAPI.Scrapers;
 
 public class SyosetuScraper : AbstractScraper
 {
 
     protected override string UrlMatchPattern => @"https?:\/\/\w+\.syosetu\.com\/\w+\/?";
 
-    protected override string BaseUrlPattern => @"https?:\/\/\w+\.syosetu\.com\/?";
+    protected override string BaseUrlPattern => @"https?:\/\/\w+\.syosetu\.com";
 
     protected override string? WorkTitlePattern => @"//p[@class='novel_title']";
 
@@ -14,15 +19,101 @@ public class SyosetuScraper : AbstractScraper
 
     protected override string? ChapterUrlPattern => @"//dl[@class='novel_sublist2']//a";
 
-    protected override string? ChapterNamePattern => @"//dl[@class='novel_sublist2']//a";
+    protected override string? ChapterPostedPattern => @"following-sibling::dt[@class='long_update']";
 
-    protected override string? ChapterPostedPattern => base.ChapterPostedPattern;
+    protected override string? ChapterUpdatedPattern => @"span";
 
-    protected override string? ChapterUpdatedPattern => base.ChapterUpdatedPattern;
+    protected override string? TagPattern => @"//th[text()='キーワード']/following-sibling::td";
 
-    protected override string? TagPattern => base.TagPattern;
+    protected override string? DatePostedPattern => @"//th[text()='掲載日']/following-sibling::td";
 
-    protected override string? DatePostedPattern => base.DatePostedPattern;
+    protected override string? DateUpdatedPattern => @"//th[contains(text(),'掲載日')]/following-sibling::td";
 
-    protected override string? DateUpdatedPattern => base.DateUpdatedPattern;
+    private HtmlDocument? GetInfoPage(string baseUrl, string novelUrl)
+    {
+        string novelInfoBase = $"/novelview/infotop/ncode/";
+        string novelRegex = @"https?:\/\/\w+\.syosetu\.com\/(\w+)\/?";
+        string novelCode = new Regex(novelRegex).Match(novelUrl).Groups[1].Value;
+        string novelInfoPage = $"{baseUrl}{novelInfoBase}{novelCode}";
+        var web = new HtmlWeb();
+        return web.Load(novelInfoPage);
+    }
+
+    protected override List<Chapter> GetChapters(HtmlDocument document, string baseUrl, string novelUrl)
+    {
+        string dateUpdatedRegex = @"\d\d\d\d\/\d\d\/\d\d \d\d:\d\d";
+        var nodes = document.DocumentNode.SelectNodes(ChapterUrlPattern);
+        return nodes.Select((node, i) =>
+        {
+            var datePostedNode = node.ParentNode.SelectSingleNode(ChapterPostedPattern);
+            var datePosted = DateTime.Parse(new Regex(dateUpdatedRegex).Match(datePostedNode.InnerText).Value);
+            var dateUpdatedNode = datePostedNode.SelectSingleNode(ChapterUpdatedPattern);
+            DateTime dateUpdated;
+            if (dateUpdatedNode == null)
+            {
+                dateUpdated = datePosted;
+            }
+            else
+            {
+                dateUpdated = DateTime.Parse(new Regex(dateUpdatedRegex).Match(dateUpdatedNode.Attributes["title"].Value).Value);
+            }
+            return new Chapter
+            {
+                Name = node.InnerText,
+                Url = baseUrl + node.Attributes["href"].Value,
+                ChapterNumber = i + 1,
+                DatePosted = datePosted,
+                DateUpdated = dateUpdated
+            };
+        }).ToList();
+    }
+
+    protected override Author GetAuthor(HtmlDocument document, string baseUrl, string novelUrl)
+    {
+        var authorLink = document.DocumentNode.SelectSingleNode(AuthorLinkPattern)?.Attributes["href"].Value ?? null;
+        if (string.IsNullOrEmpty(authorLink))
+        {
+            return null;
+        }
+        var authorName = document.DocumentNode.SelectSingleNode(AuthorNamePattern).InnerText.Replace("\n", "");
+        return new Author
+        {
+            Name = authorName,
+            Url = authorLink
+        };
+    }
+
+    protected override DateTime GetPostedDate(HtmlDocument document, string baseUrl, string novelUrl)
+    {
+        var doc = GetInfoPage(baseUrl, novelUrl);
+        if (doc == null)
+        {
+            return DateTime.MinValue;
+        }
+
+        var node = doc.DocumentNode.SelectSingleNode(DatePostedPattern);
+        return DateTime.Parse(node.InnerText);
+    }
+
+    protected override DateTime GetLastUpdatedDate(HtmlDocument document, string baseUrl, string novelUrl)
+    {
+        var doc = GetInfoPage(baseUrl, novelUrl);
+        if (doc == null)
+        {
+            return DateTime.MinValue;
+        }
+        return DateTime.Parse(doc.DocumentNode.SelectNodes(DateUpdatedPattern)[1].InnerText);
+    }
+
+    protected override List<Tag> GetTags(HtmlDocument document, string baseUrl, string novelUrl)
+    {
+        var doc = GetInfoPage(baseUrl, novelUrl);
+        if (doc == null)
+        {
+            return new List<Tag>();
+        }
+
+        var tags = doc.DocumentNode.SelectSingleNode(TagPattern).InnerText.Replace("\n", "").Replace(" ", " ").Split(' ');
+        return tags.Select(i => new Tag { TagValue = i }).ToList();
+    }
 }
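For context on "got multi-site scraping working": the diff shows each scraper exposing MatchesUrl, so multi-site scraping presumably means handing each URL to the first scraper that claims it. A hedged sketch of that dispatch follows; it assumes IScraper exposes MatchesUrl and a ScrapeNovel entry point, since only the method body appears in the AbstractScraper hunks above:

    // Hypothetical dispatch sketch built on the MatchesUrl API shown in this diff.
    // The ScrapeNovel name on IScraper is an assumption.
    using System.Collections.Generic;
    using System.Linq;

    public class ScraperRegistry
    {
        private readonly List<IScraper> scrapers = new()
        {
            new SyosetuScraper(),
            new KakuyomuScraper(),
        };

        // First scraper whose UrlMatchPattern accepts the URL handles it.
        public Novel? Scrape(string url) =>
            scrapers.FirstOrDefault(s => s.MatchesUrl(url))?.ScrapeNovel(url);
    }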