Files
WebNovelPortal/WebNovelPortalAPI/Scrapers/SyosetuScraper.cs
littlefoot 13b7306ca2
All checks were successful
continuous-integration/drone/push Build is passing
Syosetu single chapter novel support, closes #1
2022-08-03 14:33:08 -04:00

193 lines
6.9 KiB
C#

using System.Globalization;
using System.Net;
using System.Text.RegularExpressions;
using Common.Models.DBDomain;
using Common.Models.Enums;
using HtmlAgilityPack;
namespace WebNovelPortalAPI.Scrapers;
/// <summary>
/// Scraper for syosetu.com novels, covering both the all-ages subdomain and the
/// "novel18" adult subdomain. The chapter list is read from the novel index page;
/// posted/updated dates and tags come from the separate "infotop" metadata page.
/// Supports single-chapter novels that have no table of contents (issue #1).
/// </summary>
public class SyosetuScraper : AbstractScraper
{
    // URL patterns. Group 1 of BaseUrlPattern captures the subdomain
    // (e.g. "ncode" or "novel18"), used below to detect NSFW works.
    protected override string UrlMatchPattern => @"https?:\/\/(\w+)\.syosetu\.com\/n\w+";
    protected override string BaseUrlPattern => @"https?:\/\/(\w+)\.syosetu\.com";

    // XPath selectors for the novel index page.
    protected override string? WorkTitlePattern => @"//p[@class='novel_title']";
    protected override string? AuthorNamePattern => @"//div[@class='novel_writername']/a";
    protected override string? AuthorLinkPattern => @"//div[@class='novel_writername']/a";
    protected override string? ChapterUrlPattern => @"//dl[@class='novel_sublist2']//a";
    // Relative XPath (from a chapter link's parent): the <dt> holding the posted date.
    protected override string? ChapterPostedPattern => @"following-sibling::dt[@class='long_update']";
    // Relative XPath (from the posted-date node): a <span> present only when revised.
    protected override string? ChapterUpdatedPattern => @"span";

    // XPath selectors for the infotop page. The <th> labels are the literal
    // Japanese headings for "keywords", "date posted" and "last updated".
    protected override string? TagPattern => @"//th[text()='キーワード']/following-sibling::td";
    protected override string? DatePostedPattern => @"//th[text()='掲載日']/following-sibling::td";
    protected override string? DateUpdatedPattern => @"//th[contains(text(),'掲載日')]/following-sibling::td | //th[contains(text(),'最終更新日')]/following-sibling::td";

    // Cached, compiled regexes. The originals were re-created on every call
    // (and twice per chapter inside the Select lambda in GetChapters).
    private static readonly Regex NovelCodeRegex =
        new(@"https?:\/\/\w+\.syosetu\.com\/(\w+)\/?", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex ChapterDateRegex =
        new(@"\d\d\d\d\/\d\d\/\d\d \d\d:\d\d", RegexOptions.Compiled);

    // Exact shape of the text ChapterDateRegex extracts; parsed invariantly so
    // results do not depend on the host machine's locale.
    private const string ChapterDateFormat = "yyyy/MM/dd HH:mm";

    /// <summary>
    /// Fetches the infotop (metadata) page for the given novel.
    /// URL shape: {base}/novelview/infotop/ncode/{ncode}.
    /// </summary>
    private async Task<HtmlDocument> GetInfoPage(string baseUrl, string novelUrl)
    {
        string novelCode = NovelCodeRegex.Match(novelUrl).Groups[1].Value;
        return await GetPage($"{baseUrl}/novelview/infotop/ncode/{novelCode}");
    }

    // Extracts the "yyyy/MM/dd HH:mm" timestamp embedded in a chapter date cell
    // and parses it culture-invariantly (the regex guarantees the exact format).
    private static DateTime ParseChapterDate(string text) =>
        DateTime.ParseExact(ChapterDateRegex.Match(text).Value, ChapterDateFormat,
            CultureInfo.InvariantCulture);

    /// <summary>
    /// Builds the chapter list from the novel index page. A novel with no
    /// chapter sublist is a single-chapter work: one synthetic chapter is
    /// returned pointing at the novel URL itself, carrying the novel's dates.
    /// </summary>
    protected List<Chapter> GetChapters(HtmlDocument document, string baseUrl, string novelUrl, string novelName, DateTime novelPostedDate, DateTime novelUpdatedDate)
    {
        var nodes = document.DocumentNode.SelectNodes(ChapterUrlPattern);
        if (nodes == null)
        {
            // Single-chapter syosetu novel: the novel page itself is the chapter.
            return new List<Chapter>
            {
                new Chapter
                {
                    ChapterNumber = 1,
                    Name = novelName,
                    Url = novelUrl,
                    DatePosted = novelPostedDate,
                    DateUpdated = novelUpdatedDate
                }
            };
        }
        return nodes.Select((node, i) =>
        {
            var datePostedNode = node.ParentNode.SelectSingleNode(ChapterPostedPattern);
            var datePosted = ParseChapterDate(datePostedNode.InnerText);
            // A revision timestamp lives in a nested <span title="...">; the span
            // only exists when the chapter was edited after posting.
            var dateUpdatedNode = datePostedNode.SelectSingleNode(ChapterUpdatedPattern);
            var dateUpdated = dateUpdatedNode == null
                ? datePosted
                : ParseChapterDate(dateUpdatedNode.Attributes["title"].Value);
            return new Chapter
            {
                Name = node.InnerText,
                Url = baseUrl + node.Attributes["href"].Value,
                ChapterNumber = i + 1,
                DatePosted = datePosted.ToUniversalTime(),
                DateUpdated = dateUpdated.ToUniversalTime()
            };
        }).ToList();
    }

    /// <summary>
    /// Reads the author name and profile link from the novel index page.
    /// Returns null when no author link is present.
    /// </summary>
    protected override Author GetAuthor(HtmlDocument document, string baseUrl, string novelUrl)
    {
        // Attributes["href"] can itself be null on a malformed anchor, so both
        // dereferences are null-conditional (the original could NRE here; the
        // trailing "?? null" it carried was a no-op and is dropped).
        var authorLink = document.DocumentNode.SelectSingleNode(AuthorLinkPattern)?.Attributes["href"]?.Value;
        if (string.IsNullOrEmpty(authorLink))
        {
            return null;
        }
        var authorName = document.DocumentNode.SelectSingleNode(AuthorNamePattern).InnerText.Replace("\n", "");
        return new Author
        {
            Name = authorName,
            Url = authorLink
        };
    }

    /// <summary>Reads the novel's posted date (UTC) from the infotop page.</summary>
    protected override DateTime GetPostedDate(HtmlDocument document, string baseUrl, string novelUrl)
    {
        // NOTE(review): the cell text format on the infotop page is not visible
        // here; parsing is left culture-dependent as originally written — confirm
        // the format before pinning a culture.
        var node = document.DocumentNode.SelectSingleNode(DatePostedPattern);
        return DateTime.Parse(node.InnerText).ToUniversalTime();
    }

    /// <summary>
    /// Reads the novel's last-updated date (UTC) from the infotop page.
    /// The XPath matches both the posted and last-updated rows; .Last() picks
    /// the last-updated one when present, otherwise falls back to the posted date.
    /// </summary>
    protected override DateTime GetLastUpdatedDate(HtmlDocument document, string baseUrl, string novelUrl)
    {
        return DateTime.Parse(document.DocumentNode.SelectNodes(DateUpdatedPattern).Last().InnerText).ToUniversalTime();
    }

    /// <summary>
    /// Reads the keyword tags from the infotop page (space-separated cell text)
    /// and merges in the site/original/NSFW metadata tags.
    /// </summary>
    protected override List<Tag> GetTags(HtmlDocument document, string baseUrl, string novelUrl)
    {
        var tags = document.DocumentNode.SelectSingleNode(TagPattern).InnerText.Replace("\n", "").Replace("&nbsp;", " ").Split(' ');
        return tags.Select(i => new Tag {TagValue = i}).Union(GetMetadataTags(document, baseUrl, novelUrl)).ToList();
    }

    /// <summary>
    /// Cookie sent with every request: the "over18=yes" age-gate consent,
    /// required to fetch novel18.syosetu.com pages.
    /// </summary>
    protected override List<Cookie> RequestCookies()
    {
        var domain = ".syosetu.com";
        return new List<Cookie>
        {
            new Cookie
            {
                Domain = domain,
                Name = "over18",
                Value = "yes"
            }
        };
    }

    /// <summary>
    /// Site-level tags: the site tag, the original-work tag, and — when the
    /// novel lives on the "novel18" subdomain — the NSFW tag.
    /// </summary>
    protected override IEnumerable<Tag> GetMetadataTags(HtmlDocument document, string baseUrl, string novelUrl)
    {
        bool nsfw = Regex.Match(baseUrl, BaseUrlPattern).Groups[1].Value == "novel18";
        var tags = new List<Tag>
        {
            Tag.GetSiteTag(baseUrl),
            Tag.GetOriginalWorkTag()
        };
        if (nsfw)
        {
            tags.Add(Tag.GetNsfwTag());
        }
        return tags;
    }

    /// <summary>
    /// Scrapes a full novel: fetches the index and infotop pages, then assembles
    /// title, author, chapters, tags and dates into a <see cref="Novel"/>.
    /// </summary>
    /// <exception cref="Exception">When either page cannot be fetched/parsed.</exception>
    public override async Task<Novel> ScrapeNovel(string url)
    {
        // Normalize the incoming URL down to the base and canonical novel URLs.
        // The static Regex.Match overload uses the framework's internal cache.
        var baseUrl = Regex.Match(url, BaseUrlPattern, RegexOptions.IgnoreCase).Value;
        var novelUrl = Regex.Match(url, UrlMatchPattern, RegexOptions.IgnoreCase).Value;
        HtmlDocument baseDoc;
        HtmlDocument novelInfoPage;
        try
        {
            baseDoc = await GetPage(novelUrl);
            novelInfoPage = await GetInfoPage(baseUrl, novelUrl);
        }
        catch (Exception e)
        {
            // Preserve the root cause as the inner exception (the original
            // discarded it, losing the stack trace of the real failure).
            throw new Exception("Error parsing document", e);
        }
        var novelName = GetNovelTitle(baseDoc, baseUrl, novelUrl);
        var lastUpdated = GetLastUpdatedDate(novelInfoPage, baseUrl, novelUrl);
        var datePosted = GetPostedDate(novelInfoPage, baseUrl, novelUrl);
        return new Novel
        {
            Title = novelName,
            Author = GetAuthor(baseDoc, baseUrl, novelUrl),
            Chapters = GetChapters(baseDoc, baseUrl, novelUrl, novelName, datePosted, lastUpdated),
            LastUpdated = lastUpdated,
            Tags = GetTags(novelInfoPage, baseUrl, novelUrl),
            DatePosted = datePosted,
            Url = novelUrl
        };
    }
}