WebNovelPortalAPI/Scrapers/AbstractScraper.cs (new file, 113 lines)
@@ -0,0 +1,113 @@
using System.Text.RegularExpressions;
using DBConnection.Models;
using HtmlAgilityPack;

namespace WebNovelPortalAPI.Scrapers;

public abstract class AbstractScraper : IScraper
{
    protected abstract string UrlMatchPattern { get; }
    protected abstract string BaseUrlPattern { get; }
    protected virtual string? WorkTitlePattern { get; }
    protected virtual string? AuthorNamePattern { get; }
    protected virtual string? AuthorLinkPattern { get; }
    protected virtual string? ChapterUrlPattern { get; }
    protected virtual string? ChapterNamePattern { get; }
    protected virtual string? ChapterPostedPattern { get; }
    protected virtual string? ChapterUpdatedPattern { get; }
    protected virtual string? TagPattern { get; }
    protected virtual string? DatePostedPattern { get; }
    protected virtual string? DateUpdatedPattern { get; }

    public virtual bool MatchesUrl(string url)
    {
        var regex = new Regex(UrlMatchPattern, RegexOptions.IgnoreCase);
        return regex.IsMatch(url);
    }

    protected virtual string GetNovelTitle(HtmlDocument document)
    {
        var xpath = WorkTitlePattern;
        return document.DocumentNode.SelectSingleNode(xpath).InnerText;
    }

    protected virtual Author GetAuthor(HtmlDocument document, string baseUrl)
    {
        var nameXPath = AuthorNamePattern;
        var urlXPath = AuthorLinkPattern;
        var authorName = document.DocumentNode.SelectSingleNode(nameXPath).InnerText;
        var authorUrl = document.DocumentNode.SelectSingleNode(urlXPath).Attributes["href"].Value;
        Author author = new Author
        {
            Name = authorName,
            Url = $"{baseUrl + authorUrl}"
        };
        return author;
    }

    protected virtual List<Chapter> GetChapters(HtmlDocument document, string baseUrl)
    {
        var urlxpath = ChapterUrlPattern;
        var namexpath = ChapterNamePattern;
        var urlnodes = document.DocumentNode.SelectNodes(urlxpath);
        var chapters = urlnodes.Select((node, i) => new Chapter
        {
            ChapterNumber = i + 1,
            Url = $"{baseUrl}{node.Attributes["href"].Value}",
            Name = node.SelectSingleNode(namexpath).InnerText
        });

        return chapters.ToList();
    }

    protected virtual List<Tag> GetTags(HtmlDocument document)
    {
        var xpath = TagPattern;
        var nodes = document.DocumentNode.SelectNodes(xpath);
        return nodes.Select(node => new Tag
        {
            TagValue = node.InnerText
        }).ToList();
    }

    protected virtual DateTime GetPostedDate(HtmlDocument document)
    {
        var xpath = DatePostedPattern;
        return DateTime.Parse(document.DocumentNode.SelectSingleNode(xpath).InnerText);
    }

    protected virtual DateTime GetLastUpdatedDate(HtmlDocument document)
    {
        var xpath = DateUpdatedPattern;
        return DateTime.Parse(document.DocumentNode.SelectSingleNode(xpath).InnerText);
    }

    public Novel ScrapeNovel(string url)
    {
        var web = new HtmlWeb();
        var doc = web.Load(url);
        if (doc == null)
        {
            throw new Exception("Error parsing document");
        }

        var baseUrl = new Regex(BaseUrlPattern).Match(url).Value;
        var novelUrl = new Regex(UrlMatchPattern).Match(url).Value;
        return new Novel
        {
            Author = GetAuthor(doc, baseUrl),
            Chapters = GetChapters(doc, baseUrl),
            DatePosted = GetPostedDate(doc),
            LastUpdated = GetLastUpdatedDate(doc),
            Tags = GetTags(doc),
            Title = GetNovelTitle(doc),
            Url = novelUrl
        };
    }

    public string? ScrapeChapterContent(string chapterUrl)
    {
        throw new NotImplementedException();
    }
}
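Editorial note, not part of the commit: AbstractScraper is a template-method base class, so a concrete scraper only supplies regex and XPath patterns. A minimal sketch of how a caller could route a URL to the right scraper, assuming IScraper declares MatchesUrl and ScrapeNovel (as both implementations in this commit suggest); the ScraperSelector helper below is hypothetical.

using System.Collections.Generic;
using System.Linq;
using DBConnection.Models;

namespace WebNovelPortalAPI.Scrapers;

// Hypothetical helper, not in this commit: routes a URL to the first scraper that claims it.
public static class ScraperSelector
{
    private static readonly List<IScraper> Scrapers = new()
    {
        new KakuyomuScraper(),   // added in this commit
        new SyosetuScraper()     // added in this commit
    };

    public static Novel? TryScrape(string url)
    {
        // MatchesUrl tests the scraper's UrlMatchPattern against the URL (case-insensitive).
        var scraper = Scrapers.FirstOrDefault(s => s.MatchesUrl(url));
        return scraper?.ScrapeNovel(url);
    }
}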
WebNovelPortalAPI/Scrapers/KakuyomuScraper.cs (modified)
@@ -5,95 +5,29 @@ using HtmlAgilityPack;
 namespace WebNovelPortalAPI.Scrapers;

-public class KakuyomuScraper : IScraper
+public class KakuyomuScraper : AbstractScraper
 {
-    private const string UrlPattern = @"https?:\/\/kakuyomu\.jp\/works\/\d+\/?";
-    private const string BaseUrl = "https://kakuyomu.jp";
-    public bool MatchesUrl(string url)
-    {
-        var regex = new Regex(UrlPattern, RegexOptions.IgnoreCase);
-        return regex.IsMatch(url);
-    }
+    protected override string UrlMatchPattern => @"https?:\/\/kakuyomu\.jp\/works\/\d+\/?";

-    private string GetNovelTitle(HtmlDocument document)
-    {
-        var xpath = @"//*[@id='workTitle']/a";
-        return document.DocumentNode.SelectSingleNode(xpath).InnerText;
-    }
+    protected override string BaseUrlPattern => @"https?:\/\/kakuyomu\.jp";

-    private Author GetAuthor(HtmlDocument document)
-    {
-        var nameXPath = @"//*[@id='workAuthor-activityName']/a";
-        var urlXPath = @"//*[@id='workAuthor-activityName']/a";
-        var authorName = document.DocumentNode.SelectSingleNode(nameXPath).InnerText;
-        var authorUrl = document.DocumentNode.SelectSingleNode(urlXPath).Attributes["href"].Value;
-        Author author = new Author
-        {
-            Name = authorName,
-            Url = $"{BaseUrl + authorUrl}"
-        };
-        return author;
-    }
+    protected override string? WorkTitlePattern => @"//*[@id='workTitle']/a";
+    protected override string? AuthorNamePattern => @"//*[@id='workAuthor-activityName']/a";
+    protected override string? AuthorLinkPattern => @"//*[@id='workAuthor-activityName']/a";

-    private List<Chapter> GetChapters(HtmlDocument document)
-    {
-        var urlxpath = @"//a[@class='widget-toc-episode-episodeTitle']";
-        var namexpath = @"span";
-        var urlnodes = document.DocumentNode.SelectNodes(urlxpath);
-        var chapters = urlnodes.Select((node, i) => new Chapter
-        {
-            ChapterNumber = i + 1,
-            Url = $"{BaseUrl}{node.Attributes["href"].Value}",
-            Name = node.SelectSingleNode(namexpath).InnerText
-        });
-
-        return chapters.ToList();
-    }
+    protected override string? ChapterUrlPattern => @"//a[@class='widget-toc-episode-episodeTitle']";
+    protected override string? ChapterNamePattern => @"span";
+    protected override string? ChapterPostedPattern => base.ChapterPostedPattern;
+    protected override string? ChapterUpdatedPattern => base.ChapterUpdatedPattern;

-    private List<Tag> GetTags(HtmlDocument document)
-    {
-        var xpath = @"//span[@itemprop='keywords']/a";
-        var nodes = document.DocumentNode.SelectNodes(xpath);
-        return nodes.Select(node => new Tag
-        {
-            TagValue = node.InnerText
-        }).ToList();
-    }
+    protected override string? TagPattern => @"//span[@itemprop='keywords']/a";

-    private DateTime GetPostedDate(HtmlDocument document)
-    {
-        var xpath = @"//time[@itemprop='datePublished']";
-        return DateTime.Parse(document.DocumentNode.SelectSingleNode(xpath).InnerText);
-    }
+    protected override string? DatePostedPattern => @"//time[@itemprop='datePublished']";

-    private DateTime GetLastUpdatedDate(HtmlDocument document)
-    {
-        var xpath = @"//time[@itemprop='dateModified']";
-        return DateTime.Parse(document.DocumentNode.SelectSingleNode(xpath).InnerText);
-    }
+    protected override string? DateUpdatedPattern => @"//time[@itemprop='dateModified']";

-    public Novel ScrapeNovel(string url)
-    {
-        Novel novel = new Novel();
-        var web = new HtmlWeb();
-        var doc = web.Load(url);
-        if (doc == null)
-        {
-            throw new Exception("Error parsing document");
-        }
-
-        return new Novel
-        {
-            Author = GetAuthor(doc),
-            Chapters = GetChapters(doc),
-            DatePosted = GetPostedDate(doc),
-            LastUpdated = GetLastUpdatedDate(doc),
-            Tags = GetTags(doc),
-            Title = GetNovelTitle(doc),
-            Url = url
-        };
-    }
-
-    public string? ScrapeChapterContent(string chapterUrl)
-    {
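Editorial note on the two regexes ScrapeNovel applies to the input URL: BaseUrlPattern recovers the site root (chapter and author hrefs are prefixed with it in GetChapters and GetAuthor, so they are assumed to be relative), while UrlMatchPattern recovers the canonical work URL stored on the Novel. A hypothetical example with a made-up Kakuyomu work id:

using System.Text.RegularExpressions;

// Hypothetical values, not from the commit; the work id below is invented.
var url = "https://kakuyomu.jp/works/1234567890123456";
var baseUrl = Regex.Match(url, @"https?:\/\/kakuyomu\.jp").Value;                  // "https://kakuyomu.jp"
var novelUrl = Regex.Match(url, @"https?:\/\/kakuyomu\.jp\/works\/\d+\/?").Value;  // the full work URL
// Relative chapter hrefs scraped from the page are then prefixed with baseUrl (see GetChapters above).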
WebNovelPortalAPI/Scrapers/SyosetuScraper.cs (new file, 28 lines)
@@ -0,0 +1,28 @@
namespace WebNovelPortalAPI.Scrapers;

public class SyosetuScraper : AbstractScraper
{
    protected override string UrlMatchPattern => @"https?:\/\/\w+\.syosetu\.com\/\w+\/?";

    protected override string BaseUrlPattern => @"https?:\/\/\w+\.syosetu\.com\/?";

    protected override string? WorkTitlePattern => @"//p[@class='novel_title']";

    protected override string? AuthorNamePattern => @"//div[@class='novel_writername']/a | //div[@class='novel_writername']";

    protected override string? AuthorLinkPattern => @"//div[@class='novel_writername']/a";

    protected override string? ChapterUrlPattern => @"//dl[@class='novel_sublist2']//a";

    protected override string? ChapterNamePattern => @"//dl[@class='novel_sublist2']//a";

    protected override string? ChapterPostedPattern => base.ChapterPostedPattern;

    protected override string? ChapterUpdatedPattern => base.ChapterUpdatedPattern;

    protected override string? TagPattern => base.TagPattern;

    protected override string? DatePostedPattern => base.DatePostedPattern;

    protected override string? DateUpdatedPattern => base.DateUpdatedPattern;
}
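Editorial note, not part of the commit: with the two abstract patterns required and the XPath patterns left as nullable virtual properties, supporting another site only needs the URL regexes up front, with XPath overrides added as features are wired in. A hypothetical skeleton (class name, domain, and XPath below are invented):

namespace WebNovelPortalAPI.Scrapers;

// Hypothetical skeleton, not in this commit; "example.com" and the XPath are made up.
public class ExampleSiteScraper : AbstractScraper
{
    // The only required members: the two abstract patterns.
    protected override string UrlMatchPattern => @"https?:\/\/example\.com\/novels\/\d+\/?";
    protected override string BaseUrlPattern => @"https?:\/\/example\.com";

    // Optional: the virtual XPath patterns default to null, so the base helpers that
    // consume them (GetNovelTitle, GetChapters, ...) will likely throw until these are supplied.
    protected override string? WorkTitlePattern => @"//h1[@class='novel-title']";
}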