Files
WebNovelPortal/WebNovelPortalAPI/Scrapers/SyosetuScraper.cs
littlefoot 12a1f48fbd
All checks were successful
continuous-integration/drone/push Build is passing
Fix up times and remove extraneous api inject from NovelList.razor
2022-07-17 22:26:22 -04:00

119 lines
4.6 KiB
C#

using System.Text.RegularExpressions;
using HtmlAgilityPack;
using Treestar.Shared.Models.DBDomain;
namespace WebNovelPortalAPI.Scrapers;
/// <summary>
/// Scraper for novels hosted on <c>*.syosetu.com</c>. Supplies the URL patterns and XPath
/// selectors consumed by <see cref="AbstractScraper"/> and overrides the extraction steps
/// that need site-specific handling (chapter list, author, dates and tags).
/// </summary>
public class SyosetuScraper : AbstractScraper
{
    /// <summary>Exact layout of the chapter timestamps matched by <see cref="SyosetuChapterDateRegex"/>.</summary>
    private const string SyosetuChapterDateFormat = "yyyy/MM/dd HH:mm";

    /// <summary>Path (relative to the site base URL) of the novel info page, to be followed by the novel code.</summary>
    private const string SyosetuNovelInfoBase = "/novelview/infotop/ncode/";

    /// <summary>Finds a <c>yyyy/MM/dd HH:mm</c> timestamp inside a larger piece of text.</summary>
    private static readonly Regex SyosetuChapterDateRegex =
        new(@"\d\d\d\d\/\d\d\/\d\d \d\d:\d\d", RegexOptions.Compiled);

    /// <summary>Captures the novel code (first path segment) of a novel URL in group 1.</summary>
    private static readonly Regex SyosetuNovelCodeRegex =
        new(@"https?:\/\/\w+\.syosetu\.com\/(\w+)\/?", RegexOptions.Compiled);

    /// <summary>Regex describing novel URLs on any syosetu.com subdomain.</summary>
    protected override string UrlMatchPattern => @"https?:\/\/\w+\.syosetu\.com\/\w+\/?";

    /// <summary>Regex describing the scheme + host portion of a syosetu.com URL.</summary>
    protected override string BaseUrlPattern => @"https?:\/\/\w+\.syosetu\.com";

    /// <summary>XPath of the element holding the work title.</summary>
    protected override string? WorkTitlePattern => @"//p[@class='novel_title']";

    /// <summary>XPath matching the author anchor as well as its enclosing writer-name div.</summary>
    protected override string? AuthorNamePattern => @"//div[@class='novel_writername']/a | //div[@class='novel_writername']";

    /// <summary>XPath of the anchor whose <c>href</c> is used as the author URL.</summary>
    protected override string? AuthorLinkPattern => @"//div[@class='novel_writername']/a";

    /// <summary>XPath of the chapter anchors in the chapter list.</summary>
    protected override string? ChapterUrlPattern => @"//dl[@class='novel_sublist2']//a";

    /// <summary>XPath, evaluated relative to a chapter anchor's parent, of the element holding the posted timestamp.</summary>
    protected override string? ChapterPostedPattern => @"following-sibling::dt[@class='long_update']";

    /// <summary>XPath, evaluated relative to the posted-timestamp element, of the element whose <c>title</c> holds the update timestamp.</summary>
    protected override string? ChapterUpdatedPattern => @"span";

    /// <summary>XPath of the info-page cell following the "キーワード" (keywords) header.</summary>
    protected override string? TagPattern => @"//th[text()='キーワード']/following-sibling::td";

    /// <summary>XPath of the info-page cell following the header that is exactly "掲載日" (posting date).</summary>
    protected override string? DatePostedPattern => @"//th[text()='掲載日']/following-sibling::td";

    /// <summary>XPath of the info-page cells following every header that contains "掲載日".</summary>
    protected override string? DateUpdatedPattern => @"//th[contains(text(),'掲載日')]/following-sibling::td";

    /// <summary>
    /// Downloads the info page that belongs to <paramref name="novelUrl"/>.
    /// </summary>
    /// <param name="baseUrl">Scheme + host prefix the info-page path is appended to.</param>
    /// <param name="novelUrl">URL of the novel; its first path segment is used as the novel code.</param>
    /// <returns>
    /// The loaded document, or <c>null</c> when no novel code can be extracted from
    /// <paramref name="novelUrl"/> (no request is made in that case).
    /// </returns>
    private HtmlDocument? GetInfoPage(string baseUrl, string novelUrl)
    {
        var codeMatch = SyosetuNovelCodeRegex.Match(novelUrl);
        if (!codeMatch.Success)
        {
            // Without a novel code the info URL would be meaningless, so skip the request.
            return null;
        }

        string novelInfoPage = $"{baseUrl}{SyosetuNovelInfoBase}{codeMatch.Groups[1].Value}";
        return new HtmlWeb().Load(novelInfoPage);
    }

    /// <summary>
    /// Extracts the first <c>yyyy/MM/dd HH:mm</c> timestamp found in <paramref name="text"/>.
    /// </summary>
    /// <param name="text">Text that may contain a timestamp; may be <c>null</c>.</param>
    /// <returns>The parsed value (unspecified kind), or <c>null</c> when no valid timestamp is present.</returns>
    private static DateTime? ParseChapterDate(string? text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return null;
        }

        var match = SyosetuChapterDateRegex.Match(text);
        if (!match.Success)
        {
            return null;
        }

        // The layout is fixed by the regex, so parse it culture-independently.
        return DateTime.TryParseExact(
            match.Value,
            SyosetuChapterDateFormat,
            System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.None,
            out var parsed)
            ? parsed
            : (DateTime?)null;
    }

    /// <summary>
    /// Parses the text of an info-page date cell and converts it with <see cref="DateTime.ToUniversalTime"/>.
    /// </summary>
    /// <param name="node">The cell to read; may be <c>null</c>.</param>
    /// <returns>The converted date, or <see cref="DateTime.MinValue"/> when the cell is missing or unparsable.</returns>
    private static DateTime ParseInfoDate(HtmlNode? node)
    {
        // NOTE(review): the info-page date layout is not visible from this file, so parsing
        // keeps using the current culture exactly as DateTime.Parse did before.
        if (node == null || !DateTime.TryParse(node.InnerText, out var parsed))
        {
            return DateTime.MinValue;
        }

        return parsed.ToUniversalTime();
    }

    /// <summary>
    /// Builds the chapter list from the anchors matched by <see cref="ChapterUrlPattern"/>.
    /// </summary>
    /// <param name="document">The novel's table-of-contents page.</param>
    /// <param name="baseUrl">Prefix prepended to each chapter's <c>href</c>.</param>
    /// <param name="novelUrl">URL of the novel (unused here).</param>
    /// <returns>
    /// One <see cref="Chapter"/> per anchor, numbered from 1 in document order; an empty list
    /// when the page has no matching anchors.
    /// </returns>
    protected override List<Chapter> GetChapters(HtmlDocument document, string baseUrl, string novelUrl)
    {
        // SelectNodes yields null (not an empty collection) when nothing matches.
        var nodes = document.DocumentNode.SelectNodes(ChapterUrlPattern);
        if (nodes == null)
        {
            return new List<Chapter>();
        }

        return nodes.Select((node, i) =>
        {
            var datePostedNode = node.ParentNode.SelectSingleNode(ChapterPostedPattern);
            var datePosted = ParseChapterDate(datePostedNode?.InnerText) ?? DateTime.MinValue;

            // The update timestamp lives in the title attribute of the child element; when it
            // is absent or unparsable the chapter counts as never updated.
            var updatedTitle = datePostedNode?.SelectSingleNode(ChapterUpdatedPattern)?.Attributes["title"]?.Value;
            var dateUpdated = ParseChapterDate(updatedTitle) ?? datePosted;

            // NOTE(review): ToUniversalTime treats these unspecified-kind values as the
            // machine's local time; confirm that matches the timezone the site publishes in.
            return new Chapter
            {
                Name = node.InnerText,
                Url = baseUrl + node.Attributes["href"].Value,
                ChapterNumber = i + 1,
                DatePosted = datePosted.ToUniversalTime(),
                DateUpdated = dateUpdated.ToUniversalTime()
            };
        }).ToList();
    }

    /// <summary>
    /// Reads the author's name and profile link from the page.
    /// </summary>
    /// <param name="document">The novel's table-of-contents page.</param>
    /// <param name="baseUrl">Base URL of the site (unused here).</param>
    /// <param name="novelUrl">URL of the novel (unused here).</param>
    /// <returns>The author, or <c>null</c> when the page has no author link.</returns>
    protected override Author GetAuthor(HtmlDocument document, string baseUrl, string novelUrl)
    {
        var authorLink = document.DocumentNode.SelectSingleNode(AuthorLinkPattern)?.Attributes["href"]?.Value;
        if (string.IsNullOrEmpty(authorLink))
        {
            return null;
        }

        var authorName = document.DocumentNode.SelectSingleNode(AuthorNamePattern)?.InnerText.Replace("\n", "")
                         ?? string.Empty;
        return new Author
        {
            Name = authorName,
            Url = authorLink
        };
    }

    /// <summary>
    /// Reads the novel's posting date from its info page (this performs a web request).
    /// </summary>
    /// <param name="document">The novel's table-of-contents page (unused here).</param>
    /// <param name="baseUrl">Base URL used to build the info-page URL.</param>
    /// <param name="novelUrl">URL of the novel.</param>
    /// <returns>The posting date, or <see cref="DateTime.MinValue"/> when it cannot be determined.</returns>
    protected override DateTime GetPostedDate(HtmlDocument document, string baseUrl, string novelUrl)
    {
        var doc = GetInfoPage(baseUrl, novelUrl);
        if (doc == null)
        {
            return DateTime.MinValue;
        }

        return ParseInfoDate(doc.DocumentNode.SelectSingleNode(DatePostedPattern));
    }

    /// <summary>
    /// Reads the novel's last-updated date from its info page (this performs a web request).
    /// </summary>
    /// <param name="document">The novel's table-of-contents page (unused here).</param>
    /// <param name="baseUrl">Base URL used to build the info-page URL.</param>
    /// <param name="novelUrl">URL of the novel.</param>
    /// <returns>
    /// The date in the second cell matched by <see cref="DateUpdatedPattern"/>; the only
    /// matched cell when there is just one; <see cref="DateTime.MinValue"/> when none match.
    /// </returns>
    protected override DateTime GetLastUpdatedDate(HtmlDocument document, string baseUrl, string novelUrl)
    {
        var doc = GetInfoPage(baseUrl, novelUrl);
        if (doc == null)
        {
            return DateTime.MinValue;
        }

        var cells = doc.DocumentNode.SelectNodes(DateUpdatedPattern);
        if (cells == null || cells.Count == 0)
        {
            return DateTime.MinValue;
        }

        // The pattern also matches the plain posting-date row, which comes first; with a
        // single match that row is the best information available.
        return ParseInfoDate(cells.Count > 1 ? cells[1] : cells[0]);
    }

    /// <summary>
    /// Reads the novel's keywords from its info page (this performs a web request).
    /// </summary>
    /// <param name="document">The novel's table-of-contents page (unused here).</param>
    /// <param name="baseUrl">Base URL used to build the info-page URL.</param>
    /// <param name="novelUrl">URL of the novel.</param>
    /// <returns>One <see cref="Tag"/> per space-separated keyword; empty when none are found.</returns>
    protected override List<Tag> GetTags(HtmlDocument document, string baseUrl, string novelUrl)
    {
        var doc = GetInfoPage(baseUrl, novelUrl);
        if (doc == null)
        {
            return new List<Tag>();
        }

        var tagNode = doc.DocumentNode.SelectSingleNode(TagPattern);
        if (tagNode == null)
        {
            return new List<Tag>();
        }

        // Drop the empty entries produced by leading, trailing or repeated separators.
        return tagNode.InnerText
            .Replace("\n", "")
            .Replace("&nbsp;", " ")
            .Split(' ', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
            .Select(tag => new Tag { TagValue = tag })
            .ToList();
    }
}