102 lines
3.1 KiB
C#
102 lines
3.1 KiB
C#
using System.Reflection.Metadata;
|
|
using System.Text.RegularExpressions;
|
|
using DBConnection.Models;
|
|
using HtmlAgilityPack;
|
|
|
|
namespace WebNovelPortalAPI.Scrapers;
|
|
|
|
/// <summary>
/// <see cref="IScraper"/> implementation for work pages hosted on kakuyomu.jp.
/// Extracts the title, author, chapter list, tags and publication dates from
/// the HTML of a work's table-of-contents page.
/// </summary>
public class KakuyomuScraper : IScraper
{
    private const string UrlPattern = @"https?:\/\/kakuyomu\.jp\/works\/\d+\/?";
    private const string BaseUrl = "https://kakuyomu.jp";

    // Built once and reused: constructing a Regex on every MatchesUrl call
    // re-parses the pattern each time for no benefit.
    private static readonly Regex UrlRegex =
        new(UrlPattern, RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Reports whether <paramref name="url"/> contains a Kakuyomu work URL.
    /// </summary>
    /// <param name="url">The URL to test.</param>
    /// <returns>
    /// <c>true</c> when the pattern is found anywhere in <paramref name="url"/>
    /// (the pattern is not anchored); otherwise <c>false</c>.
    /// </returns>
    public bool MatchesUrl(string url)
    {
        return UrlRegex.IsMatch(url);
    }

    /// <summary>
    /// Selects the single node matching <paramref name="xpath"/>, failing with a
    /// descriptive error instead of a NullReferenceException when it is absent.
    /// </summary>
    /// <exception cref="InvalidOperationException">No node matches the XPath.</exception>
    private static HtmlNode SelectRequiredNode(HtmlDocument document, string xpath)
    {
        return document.DocumentNode.SelectSingleNode(xpath)
            ?? throw new InvalidOperationException(
                $"Expected element was not found in the document (XPath: {xpath}).");
    }

    /// <summary>
    /// Builds an absolute URL by prefixing the node's <c>href</c> value with the site base URL.
    /// </summary>
    /// <exception cref="InvalidOperationException">The node has no usable <c>href</c> attribute.</exception>
    private static string GetAbsoluteHref(HtmlNode anchor)
    {
        var href = anchor.GetAttributeValue("href", string.Empty);
        if (string.IsNullOrEmpty(href))
        {
            throw new InvalidOperationException(
                $"Element <{anchor.Name}> has no href attribute to build a URL from.");
        }

        return $"{BaseUrl}{href}";
    }

    /// <summary>Reads the novel title from the link inside the <c>workTitle</c> element.</summary>
    private static string GetNovelTitle(HtmlDocument document)
    {
        return SelectRequiredNode(document, @"//*[@id='workTitle']/a").InnerText;
    }

    /// <summary>
    /// Reads the author's display name and profile URL from the link inside the
    /// <c>workAuthor-activityName</c> element.
    /// </summary>
    private static Author GetAuthor(HtmlDocument document)
    {
        // Name and URL both come from the same anchor, so look it up once.
        var authorLink = SelectRequiredNode(document, @"//*[@id='workAuthor-activityName']/a");

        return new Author
        {
            Name = authorLink.InnerText,
            Url = GetAbsoluteHref(authorLink)
        };
    }

    /// <summary>
    /// Collects the chapters listed in the table of contents, numbered from 1 in document order.
    /// </summary>
    /// <returns>The chapters found; an empty list when the page lists none.</returns>
    private static List<Chapter> GetChapters(HtmlDocument document)
    {
        var episodeLinks = document.DocumentNode
            .SelectNodes(@"//a[@class='widget-toc-episode-episodeTitle']");

        // HtmlAgilityPack returns null (not an empty collection) when nothing matches.
        if (episodeLinks is null)
        {
            return new List<Chapter>();
        }

        return episodeLinks
            .Select((link, index) => new Chapter
            {
                ChapterNumber = index + 1,
                Url = GetAbsoluteHref(link),
                // The title normally sits in a child <span>; fall back to the
                // link's own text if that span is missing.
                Name = link.SelectSingleNode(@"span")?.InnerText ?? link.InnerText
            })
            .ToList();
    }

    /// <summary>Collects the tags from the links inside the <c>keywords</c> span.</summary>
    /// <returns>The tags found; an empty list when the work has none.</returns>
    private static List<Tag> GetTags(HtmlDocument document)
    {
        var tagLinks = document.DocumentNode.SelectNodes(@"//span[@itemprop='keywords']/a");

        // HtmlAgilityPack returns null (not an empty collection) when nothing matches.
        if (tagLinks is null)
        {
            return new List<Tag>();
        }

        return tagLinks
            .Select(link => new Tag { TagValue = link.InnerText })
            .ToList();
    }

    /// <summary>Parses the text of the <c>datePublished</c> time element.</summary>
    private static DateTime GetPostedDate(HtmlDocument document)
    {
        var timeNode = SelectRequiredNode(document, @"//time[@itemprop='datePublished']");
        return DateTime.Parse(timeNode.InnerText);
    }

    /// <summary>Parses the text of the <c>dateModified</c> time element.</summary>
    private static DateTime GetLastUpdatedDate(HtmlDocument document)
    {
        var timeNode = SelectRequiredNode(document, @"//time[@itemprop='dateModified']");
        return DateTime.Parse(timeNode.InnerText);
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and builds a <see cref="Novel"/> from it.
    /// </summary>
    /// <param name="url">URL of the work page to scrape.</param>
    /// <returns>A novel populated with author, chapters, dates, tags, title and source URL.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">
    /// The document could not be loaded, or a required element is missing from the page.
    /// </exception>
    public Novel ScrapeNovel(string url)
    {
        ArgumentNullException.ThrowIfNull(url);

        var web = new HtmlWeb();
        var doc = web.Load(url);
        if (doc is null)
        {
            throw new InvalidOperationException("Error parsing document");
        }

        return new Novel
        {
            Author = GetAuthor(doc),
            Chapters = GetChapters(doc),
            DatePosted = GetPostedDate(doc),
            LastUpdated = GetLastUpdatedDate(doc),
            Tags = GetTags(doc),
            Title = GetNovelTitle(doc),
            Url = url
        };
    }

    /// <summary>Not implemented for this scraper.</summary>
    /// <param name="chapterUrl">URL of the chapter page.</param>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    public string? ScrapeChapterContent(string chapterUrl)
    {
        throw new NotImplementedException();
    }
}