Refactor and novel18 support (added general cookie support to AbstractScraper.cs)
All checks were successful
continuous-integration/drone/push Build is passing
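AbstractScraper.cs itself is not part of this diff, but the commit message and the new RequestCookies() override imply a cookie hook on the base class. Below is a minimal sketch of how that hook could be wired into GetPage, assuming the base scraper fetches pages with HttpClient and parses them with HtmlAgilityPack; everything here except the RequestCookies() and GetPage() signatures is an assumption.

using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Threading.Tasks;
using HtmlAgilityPack;

public abstract class AbstractScraper
{
    // Hypothetical sketch: only RequestCookies() and GetPage() are confirmed by this diff.
    // Scrapers that need cookies (e.g. Syosetu's over18 age gate) override this.
    protected virtual List<Cookie> RequestCookies() => new List<Cookie>();

    // Loads a page with the scraper's cookies attached.
    protected async Task<HtmlDocument> GetPage(string url)
    {
        var container = new CookieContainer();
        foreach (var cookie in RequestCookies())
        {
            container.Add(cookie);
        }

        // A real implementation would reuse the HttpClient rather than create one per call.
        using var client = new HttpClient(new HttpClientHandler { CookieContainer = container });
        var html = await client.GetStringAsync(url);

        var document = new HtmlDocument();
        document.LoadHtml(html);
        return document;
    }
}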
@@ -1,15 +1,17 @@
+using System.Net;
 using System.Text.RegularExpressions;
 using HtmlAgilityPack;
-using Treestar.Shared.Models.DBDomain;
+using Common.Models.DBDomain;
+using Common.Models.Enums;
 
 namespace WebNovelPortalAPI.Scrapers;
 
 public class SyosetuScraper : AbstractScraper
 {
 
-    protected override string UrlMatchPattern => @"https?:\/\/\w+\.syosetu\.com\/\w+\/?";
+    protected override string UrlMatchPattern => @"https?:\/\/(\w+)\.syosetu\.com\/\w+\/?";
 
-    protected override string BaseUrlPattern => @"https?:\/\/\w+\.syosetu\.com";
+    protected override string BaseUrlPattern => @"https?:\/\/(\w+)\.syosetu\.com";
 
     protected override string? WorkTitlePattern => @"//p[@class='novel_title']";
 
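The only change to the two URL patterns is the new capture group around the subdomain; GetMetadataTags (added further down) compares that group against "novel18" to decide whether to add the NSFW tag. A quick illustration with a made-up URL:

using System;
using System.Text.RegularExpressions;

// Illustrative only; the ncode in the URL is made up.
var match = Regex.Match("https://novel18.syosetu.com/n0000x/", @"https?:\/\/(\w+)\.syosetu\.com");
Console.WriteLine(match.Groups[1].Value);               // novel18
Console.WriteLine(match.Groups[1].Value == "novel18");  // True -> NSFW tag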
@@ -29,14 +31,14 @@ public class SyosetuScraper : AbstractScraper
 
     protected override string? DateUpdatedPattern => @"//th[contains(text(),'掲載日')]/following-sibling::td";
 
-    private HtmlDocument? GetInfoPage(string baseUrl, string novelUrl)
+    private async Task<HtmlDocument> GetInfoPage(string baseUrl, string novelUrl)
     {
         string novelInfoBase = $"/novelview/infotop/ncode/";
         string novelRegex = @"https?:\/\/\w+\.syosetu\.com\/(\w+)\/?";
         string novelCode = new Regex(novelRegex).Match(novelUrl).Groups[1].Value;
         string novelInfoPage = $"{baseUrl}{novelInfoBase}{novelCode}";
-        var web = new HtmlWeb();
-        return web.Load(novelInfoPage);
+        return await GetPage(novelInfoPage);
+
     }
 
     protected override List<Chapter> GetChapters(HtmlDocument document, string baseUrl, string novelUrl)
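GetInfoPage now goes through the async, cookie-aware GetPage instead of a bare HtmlWeb. The URL it requests is composed from the base URL, the fixed infotop path, and the ncode captured from the novel URL; with a made-up ncode the pieces combine like this:

using System.Text.RegularExpressions;

// Hypothetical values; only the URL shapes come from GetInfoPage above.
string baseUrl  = "https://ncode.syosetu.com";
string novelUrl = "https://ncode.syosetu.com/n0000x/";

string novelCode = new Regex(@"https?:\/\/\w+\.syosetu\.com\/(\w+)\/?").Match(novelUrl).Groups[1].Value; // n0000x
string infoPage  = $"{baseUrl}/novelview/infotop/ncode/{novelCode}";
// -> https://ncode.syosetu.com/novelview/infotop/ncode/n0000x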
@@ -85,35 +87,86 @@ public class SyosetuScraper : AbstractScraper
 
     protected override DateTime GetPostedDate(HtmlDocument document, string baseUrl, string novelUrl)
     {
-        var doc = GetInfoPage(baseUrl, novelUrl);
-        if (doc == null)
-        {
-            return DateTime.MinValue;
-        }
-
-        var node = doc.DocumentNode.SelectSingleNode(DatePostedPattern);
+        var node = document.DocumentNode.SelectSingleNode(DatePostedPattern);
         return DateTime.Parse(node.InnerText).ToUniversalTime();
     }
 
     protected override DateTime GetLastUpdatedDate(HtmlDocument document, string baseUrl, string novelUrl)
     {
-        var doc = GetInfoPage(baseUrl, novelUrl);
-        if (doc == null)
-        {
-            return DateTime.MinValue;
-        }
-        return DateTime.Parse(doc.DocumentNode.SelectNodes(DateUpdatedPattern)[1].InnerText).ToUniversalTime();
+        return DateTime.Parse(document.DocumentNode.SelectNodes(DateUpdatedPattern)[1].InnerText).ToUniversalTime();
     }
 
     protected override List<Tag> GetTags(HtmlDocument document, string baseUrl, string novelUrl)
     {
-        var doc = GetInfoPage(baseUrl, novelUrl);
-        if (doc == null)
-        {
-            return new List<Tag>();
-        }
-
-        var tags = doc.DocumentNode.SelectSingleNode(TagPattern).InnerText.Replace("\n", "").Replace(" ", " ").Split(' ');
-        return tags.Select(i => new Tag {TagValue = i}).ToList();
+        var tags = document.DocumentNode.SelectSingleNode(TagPattern).InnerText.Replace("\n", "").Replace(" ", " ").Split(' ');
+        return tags.Select(i => new Tag {TagValue = i}).Union(GetMetadataTags(document, baseUrl, novelUrl)).ToList();
     }
+
+    protected override List<Cookie> RequestCookies()
+    {
+        var domain = ".syosetu.com";
+        return new List<Cookie>
+        {
+            new Cookie
+            {
+                Domain = domain,
+                Name = "over18",
+                Value = "yes"
+            }
+        };
+    }
+
+    protected override IEnumerable<Tag> GetMetadataTags(HtmlDocument document, string baseUrl, string novelUrl)
+    {
+        bool nsfw = Regex.Match(baseUrl, BaseUrlPattern).Groups[1].Value == "novel18";
+        var tags = new List<Tag>
+        {
+            Tag.GetSiteTag(baseUrl),
+            Tag.GetOriginalWorkTag()
+        };
+        if (nsfw)
+        {
+            tags.Add(Tag.GetNsfwTag());
+        }
+
+        return tags;
+    }
+
+    public override async Task<Novel> ScrapeNovel(string url)
+    {
+        var baseUrl = new Regex(BaseUrlPattern).Match(url).Value;
+        var novelUrl = new Regex(UrlMatchPattern).Match(url).Value;
+        HtmlDocument baseDoc;
+        HtmlDocument novelInfoPage;
+        try
+        {
+            baseDoc = await GetPage(novelUrl);
+            novelInfoPage = await GetInfoPage(baseUrl, novelUrl);
+        }
+        catch (Exception e)
+        {
+            throw new Exception("Error parsing document");
+        }
+
+        return new Novel
+        {
+            Title = GetNovelTitle(baseDoc,
+                baseUrl,
+                novelUrl),
+            Author = GetAuthor(baseDoc,
+                baseUrl,
+                novelUrl),
+            Chapters = GetChapters(baseDoc,
+                baseUrl,
+                novelUrl),
+            LastUpdated = GetLastUpdatedDate(novelInfoPage, baseUrl, novelUrl),
+            Tags = GetTags(novelInfoPage,
+                baseUrl,
+                novelUrl),
+            DatePosted = GetPostedDate(novelInfoPage,
+                baseUrl,
+                novelUrl),
+            Url = novelUrl
+        };
+    }
 }
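Taken together, a caller would drive the scraper roughly as below. The URL and the parameterless constructor are assumptions; only ScrapeNovel and the Novel fields set above appear in this diff.

// Hypothetical call site.
var scraper = new SyosetuScraper();
var novel = await scraper.ScrapeNovel("https://novel18.syosetu.com/n0000x/");

// The over18 cookie returned by RequestCookies() lets GetPage fetch the
// age-gated novel18 pages; GetMetadataTags adds the site and NSFW tags.
Console.WriteLine(novel.Title);
Console.WriteLine(novel.Chapters.Count);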