Updated lots of stuff: got multi-scrape working; still need to test non-nullable chapter NovelIds against our current model. Now supports SQLite and Postgres concurrently (and it's easy to add more providers); still need to deploy and set up auth.
Some checks failed
continuous-integration/drone/push Build is failing

2022-07-16 17:17:43 -04:00
parent eab3399268
commit d98324c11e
73 changed files with 1591 additions and 680 deletions

View File

@@ -3,12 +3,15 @@ using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using DBConnection;
-using DBConnection.Models;
using DBConnection.Repositories;
using DBConnection.Repositories.Interfaces;
using Microsoft.AspNetCore.Http;
using Microsoft.AspNetCore.Mvc;
-using Shared.Models.DTO;
+using Treestar.Shared.Models.DBDomain;
+using Treestar.Shared.Models.DTO;
+using Treestar.Shared.Models.DTO.Requests;
+using Treestar.Shared.Models.DTO.Responses;
+using WebNovelPortalAPI.Exceptions;
using WebNovelPortalAPI.Scrapers;

namespace WebNovelPortalAPI.Controllers
@@ -26,6 +29,17 @@ namespace WebNovelPortalAPI.Controllers
    _novelRepository = novelRepository;
}

+private async Task<Novel?> ScrapeNovel(string url)
+{
+    var scraper = MatchScraper(url);
+    if (scraper == null)
+    {
+        throw new NoMatchingScraperException(url);
+    }
+    var novel = scraper.ScrapeNovel(url);
+    return novel;
+}
+
private IScraper? MatchScraper(string novelUrl)
{
    return _scrapers.FirstOrDefault(i => i.MatchesUrl(novelUrl));
@@ -45,27 +59,57 @@ namespace WebNovelPortalAPI.Controllers
}

[HttpPost]
-[Route("scrapeNovel")]
-public async Task<IActionResult> ScrapeNovel(ScrapeNovelRequest request)
+[Route("scrapeNovels")]
+public async Task<IActionResult> ScrapeNovels(ScrapeNovelsRequest request)
{
-    var scraper = MatchScraper(request.NovelUrl);
-    if (scraper == null)
+    var successfulScrapes = new List<Novel>();
+    var failures = new Dictionary<string, Exception>();
+    foreach (var novelUrl in request.NovelUrls)
    {
-        return BadRequest("Invalid url, no valid scraper configured");
+        try
+        {
+            successfulScrapes.Add(await ScrapeNovel(novelUrl));
+        }
+        catch (Exception e)
+        {
+            failures[novelUrl] = e;
+        }
    }
-    Novel novel;
+    IEnumerable<Novel> successfulUploads;
    try
    {
-        novel = scraper.ScrapeNovel(request.NovelUrl);
+        successfulUploads = await _novelRepository.UpsertMany(successfulScrapes);
    }
    catch (Exception e)
    {
        return StatusCode(500, e);
    }
+    return Ok(new ScrapeNovelsResponse
+    {
+        Failures = failures,
+        SuccessfulNovels = successfulScrapes
+    });
+}
-    var novelUpload = await _novelRepository.Upsert(novel);
-    return Ok(novelUpload);
+
+[HttpPost]
+[Route("scrapeNovel")]
+public async Task<IActionResult> ScrapeNovel(ScrapeNovelRequest request)
+{
+    try
+    {
+        var novel = await ScrapeNovel(request.NovelUrl);
+        var dbNovel = await _novelRepository.Upsert(novel);
+        return Ok(dbNovel);
+    }
+    catch (NoMatchingScraperException e)
+    {
+        return BadRequest("Invalid url, no valid scraper configured");
+    }
+    catch (Exception e)
+    {
+        return StatusCode(500, e);
+    }
}
}
}
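For reference, a minimal sketch of calling the new batch endpoint from an HttpClient, assuming ScrapeNovelsRequest exposes the NovelUrls collection used in the loop above; the base address, the api/novels route prefix, and the novel URL are placeholders (the controller-level route is not shown in this diff):

using System.Net.Http;
using System.Net.Http.Json;

// POST a batch of URLs to scrapeNovels; the response mirrors ScrapeNovelsResponse,
// with SuccessfulNovels plus a Failures dictionary keyed by the URLs that threw.
using var client = new HttpClient { BaseAddress = new Uri("http://localhost:5000/") };
var response = await client.PostAsJsonAsync("api/novels/scrapeNovels", new
{
    NovelUrls = new[] { "https://ncode.syosetu.com/n0000aa/" }
});
Console.WriteLine(await response.Content.ReadAsStringAsync());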

View File

@@ -0,0 +1,10 @@
+namespace WebNovelPortalAPI.Exceptions;
+
+public class NoMatchingScraperException : Exception
+{
+    public NoMatchingScraperException(string novelUrl) : base($"Novel URL {novelUrl} did not match any registered web scraper.")
+    {
+    }
+}

View File

@@ -1,4 +1,5 @@
-using DBConnection;
+using DBConnection.Contexts;
+using DBConnection.Extensions;
using Microsoft.EntityFrameworkCore;
using Newtonsoft.Json;

View File

@@ -1,6 +1,6 @@
using System.Text.RegularExpressions;
-using DBConnection.Models;
using HtmlAgilityPack;
+using Treestar.Shared.Models.DBDomain;
namespace WebNovelPortalAPI.Scrapers;
@@ -18,6 +18,12 @@ public abstract class AbstractScraper : IScraper
protected virtual string? TagPattern { get; }
protected virtual string? DatePostedPattern { get; }
protected virtual string? DateUpdatedPattern { get; }
+protected virtual (DateTime? Posted, DateTime? Updated) GetDateTimeForChapter(HtmlNode linkNode, HtmlNode baseNode,
+    string baseUrl, string novelUrl)
+{
+    return (null, null);
+}
public virtual bool MatchesUrl(string url)
{
@@ -25,43 +31,56 @@ public abstract class AbstractScraper : IScraper
    return regex.IsMatch(url);
}

-protected virtual string GetNovelTitle(HtmlDocument document)
+protected virtual string GetNovelTitle(HtmlDocument document, string baseUrl, string novelUrl)
{
    var xpath = WorkTitlePattern;
    return document.DocumentNode.SelectSingleNode(xpath).InnerText;
}

-protected virtual Author GetAuthor(HtmlDocument document, string baseUrl)
+protected virtual Author GetAuthor(HtmlDocument document, string baseUrl, string novelUrl)
{
    var nameXPath = AuthorNamePattern;
    var urlXPath = AuthorLinkPattern;
-    var authorName = document.DocumentNode.SelectSingleNode(nameXPath).InnerText;
-    var authorUrl = document.DocumentNode.SelectSingleNode(urlXPath).Attributes["href"].Value;
-    Author author = new Author
+    try
    {
-        Name = authorName,
-        Url = $"{baseUrl + authorUrl}"
-    };
-    return author;
+        var authorName = document.DocumentNode.SelectSingleNode(nameXPath).InnerText;
+        var authorUrl = document.DocumentNode.SelectSingleNode(urlXPath).Attributes["href"].Value;
+        Author author = new Author
+        {
+            Name = authorName,
+            Url = $"{baseUrl + authorUrl}"
+        };
+        return author;
+    }
+    catch (Exception e)
+    {
+        return null;
+    }
}

-protected virtual List<Chapter> GetChapters(HtmlDocument document, string baseUrl)
+protected virtual List<Chapter> GetChapters(HtmlDocument document, string baseUrl, string novelUrl)
{
    var urlxpath = ChapterUrlPattern;
    var namexpath = ChapterNamePattern;
    var urlnodes = document.DocumentNode.SelectNodes(urlxpath);
-    var chapters = urlnodes.Select((node, i) => new Chapter
+    var chapters = urlnodes.Select((node, i) =>
    {
-        ChapterNumber = i + 1,
-        Url = $"{baseUrl}{node.Attributes["href"].Value}",
-        Name = node.SelectSingleNode(namexpath).InnerText
+        var dates = GetDateTimeForChapter(node, document.DocumentNode, baseUrl, novelUrl);
+        return new Chapter
+        {
+            ChapterNumber = i + 1,
+            Url = $"{baseUrl}{node.Attributes["href"].Value}",
+            Name = node.SelectSingleNode(namexpath).InnerText,
+            DatePosted = dates.Posted,
+            DateUpdated = dates.Updated
+        };
    });
    return chapters.ToList();
}

-protected virtual List<Tag> GetTags(HtmlDocument document)
+protected virtual List<Tag> GetTags(HtmlDocument document, string baseUrl, string novelUrl)
{
    var xpath = TagPattern;
    var nodes = document.DocumentNode.SelectNodes(xpath);
@@ -71,13 +90,13 @@ public abstract class AbstractScraper : IScraper
    }).ToList();
}

-protected virtual DateTime GetPostedDate(HtmlDocument document)
+protected virtual DateTime GetPostedDate(HtmlDocument document, string baseUrl, string novelUrl)
{
    var xpath = DatePostedPattern;
    return DateTime.Parse(document.DocumentNode.SelectSingleNode(xpath).InnerText);
}

-protected virtual DateTime GetLastUpdatedDate(HtmlDocument document)
+protected virtual DateTime GetLastUpdatedDate(HtmlDocument document, string baseUrl, string novelUrl)
{
    var xpath = DateUpdatedPattern;
    return DateTime.Parse(document.DocumentNode.SelectSingleNode(xpath).InnerText);
@@ -96,12 +115,12 @@ public abstract class AbstractScraper : IScraper
    var novelUrl = new Regex(UrlMatchPattern).Match(url).Value;
    return new Novel
    {
-        Author = GetAuthor(doc, baseUrl),
-        Chapters = GetChapters(doc, baseUrl),
-        DatePosted = GetPostedDate(doc),
-        LastUpdated = GetLastUpdatedDate(doc),
-        Tags = GetTags(doc),
-        Title = GetNovelTitle(doc),
+        Author = GetAuthor(doc, baseUrl, novelUrl),
+        Chapters = GetChapters(doc, baseUrl, novelUrl),
+        DatePosted = GetPostedDate(doc, baseUrl, novelUrl),
+        LastUpdated = GetLastUpdatedDate(doc, baseUrl, novelUrl),
+        Tags = GetTags(doc, baseUrl, novelUrl),
+        Title = GetNovelTitle(doc, baseUrl, novelUrl),
        Url = novelUrl
    };
}

View File

@@ -1,4 +1,4 @@
-using DBConnection.Models;
+using Treestar.Shared.Models.DBDomain;
namespace WebNovelPortalAPI.Scrapers;

View File

@@ -1,6 +1,5 @@
using System.Reflection.Metadata;
using System.Text.RegularExpressions;
-using DBConnection.Models;
using HtmlAgilityPack;
namespace WebNovelPortalAPI.Scrapers;
@@ -19,7 +18,7 @@ public class KakuyomuScraper : AbstractScraper
protected override string? ChapterNamePattern => @"span";
-protected override string? ChapterPostedPattern => base.ChapterPostedPattern;
+protected override string? ChapterPostedPattern => @"time";
protected override string? ChapterUpdatedPattern => base.ChapterUpdatedPattern;
@@ -29,8 +28,10 @@ public class KakuyomuScraper : AbstractScraper
protected override string? DateUpdatedPattern => @"//time[@itemprop='dateModified']";
-public string? ScrapeChapterContent(string chapterUrl)
+protected override (DateTime? Posted, DateTime? Updated) GetDateTimeForChapter(HtmlNode linkNode, HtmlNode baseNode, string baseUrl,
+    string novelUrl)
{
-    throw new NotImplementedException();
+    var datePosted = linkNode.SelectSingleNode(ChapterPostedPattern).Attributes["datetime"].Value;
+    return (DateTime.Parse(datePosted), null);
}
}

View File

@@ -1,10 +1,15 @@
using System.Text.RegularExpressions;
+using HtmlAgilityPack;
+using Treestar.Shared.Models.DBDomain;

namespace WebNovelPortalAPI.Scrapers;

public class SyosetuScraper : AbstractScraper
{
protected override string UrlMatchPattern => @"https?:\/\/\w+\.syosetu\.com\/\w+\/?";

-protected override string BaseUrlPattern => @"https?:\/\/\w+\.syosetu\.com\/?";
+protected override string BaseUrlPattern => @"https?:\/\/\w+\.syosetu\.com";
protected override string? WorkTitlePattern => @"//p[@class='novel_title']";
@@ -14,15 +19,101 @@ public class SyosetuScraper : AbstractScraper
protected override string? ChapterUrlPattern => @"//dl[@class='novel_sublist2']//a";
protected override string? ChapterNamePattern => @"//dl[@class='novel_sublist2']//a";

-protected override string? ChapterPostedPattern => base.ChapterPostedPattern;
+protected override string? ChapterPostedPattern => @"following-sibling::dt[@class='long_update']";

-protected override string? ChapterUpdatedPattern => base.ChapterUpdatedPattern;
+protected override string? ChapterUpdatedPattern => @"span";

-protected override string? TagPattern => base.TagPattern;
+protected override string? TagPattern => @"//th[text()='キーワード']/following-sibling::td";

-protected override string? DatePostedPattern => base.DatePostedPattern;
+protected override string? DatePostedPattern => @"//th[text()='掲載日']/following-sibling::td";

-protected override string? DateUpdatedPattern => base.DateUpdatedPattern;
+protected override string? DateUpdatedPattern => @"//th[contains(text(),'掲載日')]/following-sibling::td";

+private HtmlDocument? GetInfoPage(string baseUrl, string novelUrl)
+{
+    string novelInfoBase = $"/novelview/infotop/ncode/";
+    string novelRegex = @"https?:\/\/\w+\.syosetu\.com\/(\w+)\/?";
+    string novelCode = new Regex(novelRegex).Match(novelUrl).Groups[1].Value;
+    string novelInfoPage = $"{baseUrl}{novelInfoBase}{novelCode}";
+    var web = new HtmlWeb();
+    return web.Load(novelInfoPage);
+}

+protected override List<Chapter> GetChapters(HtmlDocument document, string baseUrl, string novelUrl)
+{
+    string dateUpdatedRegex = @"\d\d\d\d\/\d\d\/\d\d \d\d:\d\d";
+    var nodes = document.DocumentNode.SelectNodes(ChapterUrlPattern);
+    return nodes.Select((node, i) =>
+    {
+        var datePostedNode = node.ParentNode.SelectSingleNode(ChapterPostedPattern);
+        var datePosted = DateTime.Parse(new Regex(dateUpdatedRegex).Match(datePostedNode.InnerText).Value);
+        var dateUpdatedNode = datePostedNode.SelectSingleNode(ChapterUpdatedPattern);
+        DateTime dateUpdated;
+        if (dateUpdatedNode == null)
+        {
+            dateUpdated = datePosted;
+        }
+        else
+        {
+            dateUpdated = DateTime.Parse(new Regex(dateUpdatedRegex).Match(dateUpdatedNode.Attributes["title"].Value).Value);
+        }
+        return new Chapter
+        {
+            Name = node.InnerText,
+            Url = baseUrl + node.Attributes["href"].Value,
+            ChapterNumber = i + 1,
+            DatePosted = datePosted,
+            DateUpdated = dateUpdated
+        };
+    }).ToList();
+}

+protected override Author GetAuthor(HtmlDocument document, string baseUrl, string novelUrl)
+{
+    var authorLink = document.DocumentNode.SelectSingleNode(AuthorLinkPattern)?.Attributes["href"].Value ?? null;
+    if (string.IsNullOrEmpty(authorLink))
+    {
+        return null;
+    }
+    var authorName = document.DocumentNode.SelectSingleNode(AuthorNamePattern).InnerText.Replace("\n", "");
+    return new Author
+    {
+        Name = authorName,
+        Url = authorLink
+    };
+}

+protected override DateTime GetPostedDate(HtmlDocument document, string baseUrl, string novelUrl)
+{
+    var doc = GetInfoPage(baseUrl, novelUrl);
+    if (doc == null)
+    {
+        return DateTime.MinValue;
+    }
+    var node = doc.DocumentNode.SelectSingleNode(DatePostedPattern);
+    return DateTime.Parse(node.InnerText);
+}

+protected override DateTime GetLastUpdatedDate(HtmlDocument document, string baseUrl, string novelUrl)
+{
+    var doc = GetInfoPage(baseUrl, novelUrl);
+    if (doc == null)
+    {
+        return DateTime.MinValue;
+    }
+    return DateTime.Parse(doc.DocumentNode.SelectNodes(DateUpdatedPattern)[1].InnerText);
+}

+protected override List<Tag> GetTags(HtmlDocument document, string baseUrl, string novelUrl)
+{
+    var doc = GetInfoPage(baseUrl, novelUrl);
+    if (doc == null)
+    {
+        return new List<Tag>();
+    }
+    var tags = doc.DocumentNode.SelectSingleNode(TagPattern).InnerText.Replace("\n", "").Replace("&nbsp;", " ").Split(' ');
+    return tags.Select(i => new Tag { TagValue = i }).ToList();
+}
}
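To make the info-page lookup concrete, a worked example with a placeholder ncode: for a novel URL like https://ncode.syosetu.com/n0000aa/, the regex's first capture group yields n0000aa, so GetInfoPage requests https://ncode.syosetu.com/novelview/infotop/ncode/n0000aa, and the tag and date overrides then scrape that infotop table instead of the chapter index page.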

View File

@@ -22,7 +22,7 @@
<ItemGroup>
<ProjectReference Include="..\DBConnection\DBConnection.csproj" />
<ProjectReference Include="..\Shared\Shared.csproj" />
<ProjectReference Include="..\Treestar.Shared\Treestar.Shared.csproj" />
</ItemGroup>
</Project>

View File

@@ -6,7 +6,9 @@
    }
  },
  "ConnectionStrings": {
-    "DefaultConnection": "Data Source=test_db"
+    "Sqlite": "Data Source=test_db",
+    "PostgresSql": "placeholder"
  },
+  "DatabaseProvider": "Sqlite",
  "AllowedHosts": "*"
}
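The new DatabaseProvider key is what selects SQLite or Postgres per the commit message. A minimal sketch of how startup could branch on it, assuming an EF Core context named NovelContext and the Npgsql EF Core provider package; the actual wiring lives in DBConnection.Extensions, which this diff does not show:

var builder = WebApplication.CreateBuilder(args);
var provider = builder.Configuration["DatabaseProvider"];

// Register one DbContext, choosing the EF Core provider named in configuration.
builder.Services.AddDbContext<NovelContext>(options =>
{
    switch (provider)
    {
        case "Sqlite":
            options.UseSqlite(builder.Configuration.GetConnectionString("Sqlite"));
            break;
        case "PostgresSql":
            options.UseNpgsql(builder.Configuration.GetConnectionString("PostgresSql"));
            break;
        default:
            throw new InvalidOperationException($"Unknown DatabaseProvider: {provider}");
    }
});

Adding a third provider then amounts to one more case plus its connection string.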