using System.Net;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
using Common.Models.DBDomain;
using Common.Models.Enums;

namespace WebNovelPortalAPI.Scrapers;
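/// <summary>
/// Scraper for Shōsetsuka ni Narō (syosetu.com). The novel index page supplies the
/// title, author and chapter list; the separate info (infotop) page supplies the
/// keyword tags and the posted/last-updated dates.
/// </summary>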
public class SyosetuScraper : AbstractScraper
{
    // URL regexes: capture group 1 is the subdomain (e.g. "ncode" or "novel18").
    protected override string UrlMatchPattern => @"https?:\/\/(\w+)\.syosetu\.com\/\w+\/?";

    protected override string BaseUrlPattern => @"https?:\/\/(\w+)\.syosetu\.com";

    // XPath selectors for the novel index page.
    protected override string? WorkTitlePattern => @"//p[@class='novel_title']";

    protected override string? AuthorNamePattern => @"//div[@class='novel_writername']/a";

    protected override string? AuthorLinkPattern => @"//div[@class='novel_writername']/a";

    protected override string? ChapterUrlPattern => @"//dl[@class='novel_sublist2']//a";

    // Relative selectors evaluated per chapter entry (see GetChapters).
    protected override string? ChapterPostedPattern => @"following-sibling::dt[@class='long_update']";

    protected override string? ChapterUpdatedPattern => @"span";

    // XPath selectors for the novel info (infotop) page.
    protected override string? TagPattern => @"//th[text()='キーワード']/following-sibling::td";

    protected override string? DatePostedPattern => @"//th[text()='掲載日']/following-sibling::td";

    protected override string? DateUpdatedPattern => @"//th[contains(text(),'掲載日')]/following-sibling::td";
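    /// <summary>
    /// Fetches the novel's info (infotop) page, which holds the keyword and date metadata,
    /// by extracting the ncode from the novel URL.
    /// </summary>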
    private async Task<HtmlDocument> GetInfoPage(string baseUrl, string novelUrl)
    {
        string novelInfoBase = "/novelview/infotop/ncode/";
        string novelRegex = @"https?:\/\/\w+\.syosetu\.com\/(\w+)\/?";
        string novelCode = new Regex(novelRegex).Match(novelUrl).Groups[1].Value;
        string novelInfoPage = $"{baseUrl}{novelInfoBase}{novelCode}";
        return await GetPage(novelInfoPage);
    }
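    /// <summary>
    /// Builds the chapter list from the index page. The posted date comes from the
    /// dt.long_update sibling of each chapter link's parent; when that node contains a span,
    /// the span's title attribute carries the revision date, otherwise the posted date is reused.
    /// </summary>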
    protected override List<Chapter> GetChapters(HtmlDocument document, string baseUrl, string novelUrl)
    {
        string dateUpdatedRegex = @"\d\d\d\d\/\d\d\/\d\d \d\d:\d\d";
        // SelectNodes returns null when nothing matches (e.g. a novel with no chapter list yet).
        var nodes = document.DocumentNode.SelectNodes(ChapterUrlPattern);
        if (nodes == null)
        {
            return new List<Chapter>();
        }
        return nodes.Select((node, i) =>
        {
            var datePostedNode = node.ParentNode.SelectSingleNode(ChapterPostedPattern);
            var datePosted = DateTime.Parse(new Regex(dateUpdatedRegex).Match(datePostedNode.InnerText).Value);
            var dateUpdatedNode = datePostedNode.SelectSingleNode(ChapterUpdatedPattern);
            DateTime dateUpdated;
            if (dateUpdatedNode == null)
            {
                // Chapter has never been revised; fall back to the posted date.
                dateUpdated = datePosted;
            }
            else
            {
                dateUpdated = DateTime.Parse(new Regex(dateUpdatedRegex).Match(dateUpdatedNode.Attributes["title"].Value).Value);
            }
            return new Chapter
            {
                Name = node.InnerText,
                Url = baseUrl + node.Attributes["href"].Value,
                ChapterNumber = i + 1,
                DatePosted = datePosted.ToUniversalTime(),
                DateUpdated = dateUpdated.ToUniversalTime()
            };
        }).ToList();
    }
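    /// <summary>
    /// Reads the author name and profile link from the index page; returns null when
    /// no author link is present.
    /// </summary>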
    protected override Author GetAuthor(HtmlDocument document, string baseUrl, string novelUrl)
    {
        var authorLink = document.DocumentNode.SelectSingleNode(AuthorLinkPattern)?.Attributes["href"]?.Value;
        if (string.IsNullOrEmpty(authorLink))
        {
            return null;
        }
        var authorName = document.DocumentNode.SelectSingleNode(AuthorNamePattern).InnerText.Replace("\n", "");
        return new Author
        {
            Name = authorName,
            Url = authorLink
        };
    }
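    /// <summary>
    /// Reads the publication date from the 掲載日 row of the info page.
    /// </summary>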
    protected override DateTime GetPostedDate(HtmlDocument document, string baseUrl, string novelUrl)
    {
        var node = document.DocumentNode.SelectSingleNode(DatePostedPattern);
        return DateTime.Parse(node.InnerText).ToUniversalTime();
    }
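    /// <summary>
    /// Reads the last-updated date from the info page. DateUpdatedPattern matches both the
    /// 掲載日 and 最終掲載日 rows, so the second match (index 1) is the last-updated value.
    /// </summary>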
    protected override DateTime GetLastUpdatedDate(HtmlDocument document, string baseUrl, string novelUrl)
    {
        return DateTime.Parse(document.DocumentNode.SelectNodes(DateUpdatedPattern)[1].InnerText).ToUniversalTime();
    }
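    /// <summary>
    /// Splits the space-separated キーワード cell into tags and merges in the site metadata tags.
    /// </summary>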
    protected override List<Tag> GetTags(HtmlDocument document, string baseUrl, string novelUrl)
    {
        var tags = document.DocumentNode.SelectSingleNode(TagPattern).InnerText.Replace("\n", "").Replace(" ", " ").Split(' ');
        return tags.Select(i => new Tag { TagValue = i }).Union(GetMetadataTags(document, baseUrl, novelUrl)).ToList();
    }
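    /// <summary>
    /// Supplies the over18=yes cookie so requests to the age-gated novel18 subdomain are not
    /// blocked by the age-confirmation page.
    /// </summary>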
    protected override List<Cookie> RequestCookies()
    {
        var domain = ".syosetu.com";
        return new List<Cookie>
        {
            new Cookie
            {
                Domain = domain,
                Name = "over18",
                Value = "yes"
            }
        };
    }
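    /// <summary>
    /// Builds the site-level tags: the site tag, the original-work tag and, when the URL points
    /// at the novel18 subdomain, the NSFW tag.
    /// </summary>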
    protected override IEnumerable<Tag> GetMetadataTags(HtmlDocument document, string baseUrl, string novelUrl)
    {
        bool nsfw = Regex.Match(baseUrl, BaseUrlPattern).Groups[1].Value == "novel18";
        var tags = new List<Tag>
        {
            Tag.GetSiteTag(baseUrl),
            Tag.GetOriginalWorkTag()
        };
        if (nsfw)
        {
            tags.Add(Tag.GetNsfwTag());
        }

        return tags;
    }
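    /// <summary>
    /// Scrapes a full novel: the index page for the title, author and chapters, and the
    /// info page for the tags and posted/last-updated dates.
    /// </summary>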
    public override async Task<Novel> ScrapeNovel(string url)
    {
        var baseUrl = new Regex(BaseUrlPattern).Match(url).Value;
        var novelUrl = new Regex(UrlMatchPattern).Match(url).Value;
        HtmlDocument baseDoc;
        HtmlDocument novelInfoPage;
        try
        {
            baseDoc = await GetPage(novelUrl);
            novelInfoPage = await GetInfoPage(baseUrl, novelUrl);
        }
        catch (Exception e)
        {
            // Preserve the original failure as the inner exception.
            throw new Exception("Error parsing document", e);
        }

        return new Novel
        {
            Title = GetNovelTitle(baseDoc, baseUrl, novelUrl),
            Author = GetAuthor(baseDoc, baseUrl, novelUrl),
            Chapters = GetChapters(baseDoc, baseUrl, novelUrl),
            LastUpdated = GetLastUpdatedDate(novelInfoPage, baseUrl, novelUrl),
            Tags = GetTags(novelInfoPage, baseUrl, novelUrl),
            DatePosted = GetPostedDate(novelInfoPage, baseUrl, novelUrl),
            Url = novelUrl
        };
    }
}