Initial EF Core migration and updates to make sure upserting novels (mostly) works. Still need to do chapter handling.
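The upsert logic the message refers to is not part of this diff. Purely as orientation, an upsert keyed on the novel URL (which the scraper below always fills in) might look something like this sketch; the PortalContext type and its Novels set are placeholder assumptions, not code from this commit.

using DBConnection.Models;
using Microsoft.EntityFrameworkCore;

public static class NovelUpserter
{
    // Sketch only: PortalContext stands in for the application's actual DbContext.
    public static async Task UpsertNovelAsync(PortalContext db, Novel scraped)
    {
        // Look the novel up by its source URL.
        var existing = await db.Novels.FirstOrDefaultAsync(n => n.Url == scraped.Url);

        if (existing == null)
        {
            // Unknown novel: insert the scraped object graph as-is.
            db.Novels.Add(scraped);
        }
        else
        {
            // Known novel: refresh the metadata the scraper provides.
            existing.Title = scraped.Title;
            existing.LastUpdated = scraped.LastUpdated;
            existing.Tags = scraped.Tags;
            // Chapter reconciliation is the part the commit message says is still pending.
        }

        await db.SaveChangesAsync();
    }
}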
WebNovelPortalAPI/Scrapers/KakuyomuScraper.cs (new file, 102 lines)
@@ -0,0 +1,102 @@
using System.Text.RegularExpressions;
using DBConnection.Models;
using HtmlAgilityPack;

namespace WebNovelPortalAPI.Scrapers;

public class KakuyomuScraper : IScraper
{
    private const string UrlPattern = @"https?:\/\/kakuyomu\.jp\/works\/\d+\/?";
    private const string BaseUrl = "https://kakuyomu.jp";

    public bool MatchesUrl(string url)
    {
        var regex = new Regex(UrlPattern, RegexOptions.IgnoreCase);
        return regex.IsMatch(url);
    }

    private string GetNovelTitle(HtmlDocument document)
    {
        // The work title is the anchor inside the #workTitle element.
        var xpath = @"//*[@id='workTitle']/a";
        return document.DocumentNode.SelectSingleNode(xpath).InnerText;
    }

    private Author GetAuthor(HtmlDocument document)
    {
        // The same anchor carries both the author's display name and profile link.
        var authorXPath = @"//*[@id='workAuthor-activityName']/a";
        var authorNode = document.DocumentNode.SelectSingleNode(authorXPath);
        return new Author
        {
            Name = authorNode.InnerText,
            Url = $"{BaseUrl}{authorNode.Attributes["href"].Value}"
        };
    }

    private List<Chapter> GetChapters(HtmlDocument document)
    {
        // Each table-of-contents entry links to an episode; the title sits in a child <span>.
        var urlXPath = @"//a[@class='widget-toc-episode-episodeTitle']";
        var nameXPath = @"span";
        var urlNodes = document.DocumentNode.SelectNodes(urlXPath);
        var chapters = urlNodes.Select((node, i) => new Chapter
        {
            ChapterNumber = i + 1,
            Url = $"{BaseUrl}{node.Attributes["href"].Value}",
            Name = node.SelectSingleNode(nameXPath).InnerText
        });

        return chapters.ToList();
    }

    private List<Tag> GetTags(HtmlDocument document)
    {
        var xpath = @"//span[@itemprop='keywords']/a";
        var nodes = document.DocumentNode.SelectNodes(xpath);
        return nodes.Select(node => new Tag
        {
            TagValue = node.InnerText
        }).ToList();
    }

    private DateTime GetPostedDate(HtmlDocument document)
    {
        var xpath = @"//time[@itemprop='datePublished']";
        return DateTime.Parse(document.DocumentNode.SelectSingleNode(xpath).InnerText);
    }

    private DateTime GetLastUpdatedDate(HtmlDocument document)
    {
        var xpath = @"//time[@itemprop='dateModified']";
        return DateTime.Parse(document.DocumentNode.SelectSingleNode(xpath).InnerText);
    }

    public Novel ScrapeNovel(string url)
    {
        var web = new HtmlWeb();
        var doc = web.Load(url);
        if (doc == null)
        {
            throw new Exception("Error parsing document");
        }

        return new Novel
        {
            Author = GetAuthor(doc),
            Chapters = GetChapters(doc),
            DatePosted = GetPostedDate(doc),
            LastUpdated = GetLastUpdatedDate(doc),
            Tags = GetTags(doc),
            Title = GetNovelTitle(doc),
            Url = url
        };
    }

    public string? ScrapeChapterContent(string chapterUrl)
    {
        throw new NotImplementedException();
    }
}
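ScrapeChapterContent is still a stub, matching the commit message's note that chapter handling is yet to be done. A minimal sketch of how it might be filled in with HtmlAgilityPack follows; the widget-episodeBody selector is an assumption about Kakuyomu's episode markup, not something taken from this commit.

public string? ScrapeChapterContent(string chapterUrl)
{
    var web = new HtmlWeb();
    var doc = web.Load(chapterUrl);

    // Assumed container for the episode text; adjust once the real markup is confirmed.
    var bodyNode = doc.DocumentNode.SelectSingleNode("//div[contains(@class, 'widget-episodeBody')]");

    // Return null rather than throwing when the chapter body cannot be located.
    return bodyNode?.InnerText;
}

For reference, the interface as written would be driven along these lines:

IScraper scraper = new KakuyomuScraper();
var workUrl = "https://kakuyomu.jp/works/1234567890"; // placeholder work URL
if (scraper.MatchesUrl(workUrl))
{
    var novel = scraper.ScrapeNovel(workUrl); // title, author, tags, chapter list
}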