Refactor and novel18 support (added general cookie support to AbstractScraper.cs)

2022-07-20 22:04:13 -04:00
parent 12a1f48fbd
commit ceb8a0db8e
59 changed files with 353 additions and 240 deletions

View File

@@ -5,7 +5,7 @@ using System.Threading.Tasks;
 using Microsoft.AspNetCore.Authorization;
 using Microsoft.AspNetCore.Http;
 using Microsoft.AspNetCore.Mvc;
-using Treestar.Shared.Models.DBDomain;
+using Common.Models.DBDomain;
 using WebNovelPortalAPI.Middleware;
 namespace WebNovelPortalAPI.Controllers

View File

@@ -8,10 +8,10 @@ using DBConnection.Repositories.Interfaces;
 using Microsoft.AspNetCore.Authorization;
 using Microsoft.AspNetCore.Http;
 using Microsoft.AspNetCore.Mvc;
-using Treestar.Shared.Models.DBDomain;
-using Treestar.Shared.Models.DTO;
-using Treestar.Shared.Models.DTO.Requests;
-using Treestar.Shared.Models.DTO.Responses;
+using Common.Models.DBDomain;
+using Common.Models.DTO;
+using Common.Models.DTO.Requests;
+using Common.Models.DTO.Responses;
 using WebNovelPortalAPI.Exceptions;
 using WebNovelPortalAPI.Scrapers;
@@ -40,7 +40,7 @@ namespace WebNovelPortalAPI.Controllers
             {
                 throw new NoMatchingScraperException(url);
             }
-            var novel = scraper.ScrapeNovel(url);
+            var novel = await scraper.ScrapeNovel(url);
             return novel;
         }

View File: Dockerfile

@@ -7,7 +7,7 @@ FROM mcr.microsoft.com/dotnet/sdk:6.0 AS build
 WORKDIR /src
 COPY ["WebNovelPortalAPI/WebNovelPortalAPI.csproj", "WebNovelPortalAPI/"]
 COPY ["DBConnection/DBConnection.csproj", "DBConnection/"]
-COPY ["Treestar.Shared/Treestar.Shared.csproj", "Treestar.Shared/"]
+COPY ["Common/Common.csproj", "Common/"]
 RUN dotnet restore "WebNovelPortalAPI/WebNovelPortalAPI.csproj"
 COPY . .
 WORKDIR "/src/WebNovelPortalAPI"

View File

@@ -1,7 +1,7 @@
 using System.Security.Claims;
 using DBConnection.Repositories.Interfaces;
 using Microsoft.AspNetCore.Mvc.Filters;
-using Treestar.Shared.Models.DBDomain;
+using Common.Models.DBDomain;
 namespace WebNovelPortalAPI.Middleware;

View File

@@ -4,7 +4,7 @@ using DBConnection.Extensions;
 using Microsoft.EntityFrameworkCore;
 using Microsoft.OpenApi.Models;
 using Newtonsoft.Json;
-using Treestar.Shared.Authentication.JwtBearer;
+using Common.Authentication.JwtBearer;
 using WebNovelPortalAPI.Extensions;
 using WebNovelPortalAPI.Middleware;
 using WebNovelPortalAPI.Scrapers;

View File: AbstractScraper.cs

@@ -1,11 +1,31 @@
+using System.Net;
+using System.Net.Http.Headers;
 using System.Text.RegularExpressions;
 using HtmlAgilityPack;
-using Treestar.Shared.Models.DBDomain;
+using Common.Models.DBDomain;
 namespace WebNovelPortalAPI.Scrapers;
 public abstract class AbstractScraper : IScraper
 {
+    protected AbstractScraper()
+    {
+        var cookieContainer = new CookieContainer();
+        var handler = new HttpClientHandler
+        {
+            CookieContainer = cookieContainer
+        };
+        HttpClient client = new HttpClient(handler);
+        client.DefaultRequestHeaders.UserAgent.Add(new ProductInfoHeaderValue("Chrome","96.0.4664.110"));
+        foreach (var cookie in RequestCookies())
+        {
+            cookieContainer.Add(cookie);
+        }
+        HttpClient = client;
+    }
+    protected HttpClient HttpClient { get; }
     protected abstract string UrlMatchPattern { get; }
     protected abstract string BaseUrlPattern { get; }
     protected virtual string? WorkTitlePattern { get; }
@@ -19,6 +39,14 @@ public abstract class AbstractScraper : IScraper
     protected virtual string? DatePostedPattern { get; }
     protected virtual string? DateUpdatedPattern { get; }
+    protected async Task<HtmlDocument> GetPage(string url)
+    {
+        var response = await HttpClient.GetAsync(url);
+        var doc = new HtmlDocument();
+        doc.LoadHtml(await response.Content.ReadAsStringAsync());
+        return doc;
+    }
     protected virtual (DateTime? Posted, DateTime? Updated) GetDateTimeForChapter(HtmlNode linkNode, HtmlNode baseNode,
         string baseUrl, string novelUrl)
     {
@@ -87,7 +115,7 @@ public abstract class AbstractScraper : IScraper
         return nodes.Select(node => new Tag
         {
             TagValue = node.InnerText
-        }).ToList();
+        }).Union(GetMetadataTags(document, baseUrl, novelUrl)).ToList();
     }
     protected virtual DateTime GetPostedDate(HtmlDocument document, string baseUrl, string novelUrl)
@@ -101,18 +129,25 @@ public abstract class AbstractScraper : IScraper
         var xpath = DateUpdatedPattern;
         return DateTime.Parse(document.DocumentNode.SelectSingleNode(xpath).InnerText).ToUniversalTime();
     }
-    public Novel ScrapeNovel(string url)
+    protected virtual List<Cookie> RequestCookies()
+    {
+        return new List<Cookie>();
+    }
+    protected abstract IEnumerable<Tag> GetMetadataTags(HtmlDocument document, string baseUrl, string novelUrl);
+    public virtual async Task<Novel> ScrapeNovel(string url)
     {
-        var web = new HtmlWeb();
-        var doc = web.Load(url);
-        if (doc == null)
-        {
-            throw new Exception("Error parsing document");
-        }
         var baseUrl = new Regex(BaseUrlPattern).Match(url).Value;
         var novelUrl = new Regex(UrlMatchPattern).Match(url).Value;
+        var doc = await GetPage(novelUrl);
+        if (string.IsNullOrEmpty(doc.Text))
+        {
+            throw new Exception("Error parsing document");
+        }
         return new Novel
         {
             Author = GetAuthor(doc, baseUrl, novelUrl),
@@ -125,7 +160,7 @@ public abstract class AbstractScraper : IScraper
         };
     }
-    public string? ScrapeChapterContent(string chapterUrl)
+    public Task<string?> ScrapeChapterContent(string chapterUrl)
     {
         throw new NotImplementedException();
     }
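
The new constructor wires every scraper to a single HttpClient whose CookieContainer is seeded once from the overridable RequestCookies() hook, so site cookies are sent with every GetPage request. Below is a standalone sketch of that pattern as a .NET 6 top-level program; the over18 cookie values come from the SyosetuScraper diff further down, and the request URL is an illustrative placeholder:

using System;
using System.Net;
using System.Net.Http;
using System.Net.Http.Headers;

// Standalone sketch of the constructor's cookie plumbing: seed a
// CookieContainer, hand it to an HttpClientHandler, and reuse one client
// so the cookies ride along on every request.
var cookieContainer = new CookieContainer();
cookieContainer.Add(new Cookie { Domain = ".syosetu.com", Name = "over18", Value = "yes" });

var handler = new HttpClientHandler { CookieContainer = cookieContainer };
using var client = new HttpClient(handler);
client.DefaultRequestHeaders.UserAgent.Add(new ProductInfoHeaderValue("Chrome", "96.0.4664.110"));

// Requests to a matching domain now carry the seeded cookie automatically.
var html = await client.GetStringAsync("https://novel18.syosetu.com/n0000aa/"); // illustrative URL
Console.WriteLine(html.Length);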

View File: IScraper.cs

@@ -1,11 +1,11 @@
-using Treestar.Shared.Models.DBDomain;
+using Common.Models.DBDomain;
 namespace WebNovelPortalAPI.Scrapers;
 public interface IScraper
 {
     public bool MatchesUrl(string url);
-    public Novel ScrapeNovel(string url);
-    public string? ScrapeChapterContent(string chapterUrl);
+    public Task<Novel> ScrapeNovel(string url);
+    public Task<string?> ScrapeChapterContent(string chapterUrl);
 }
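
With both interface methods now returning Task, every call site has to await them, as the controller change near the top shows. A minimal sketch of a consumer; ScraperRunner and its null-return behavior are illustrative assumptions, not part of the commit:

using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Common.Models.DBDomain;
using WebNovelPortalAPI.Scrapers;

// Hypothetical helper, for illustration only.
public static class ScraperRunner
{
    // Pick the first scraper whose pattern matches, then await the now-async
    // scrape. The real controller throws NoMatchingScraperException instead
    // of returning null when nothing matches.
    public static async Task<Novel?> ScrapeFirstMatchAsync(IEnumerable<IScraper> scrapers, string url)
    {
        var scraper = scrapers.FirstOrDefault(s => s.MatchesUrl(url));
        return scraper == null ? null : await scraper.ScrapeNovel(url);
    }
}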

View File: KakuyomuScraper.cs

@@ -1,5 +1,6 @@
 using System.Reflection.Metadata;
 using System.Text.RegularExpressions;
+using Common.Models.DBDomain;
 using HtmlAgilityPack;
 namespace WebNovelPortalAPI.Scrapers;
@@ -34,4 +35,13 @@ public class KakuyomuScraper : AbstractScraper
         var datePosted = linkNode.SelectSingleNode(ChapterPostedPattern).Attributes["datetime"].Value;
         return (DateTime.Parse(datePosted).ToUniversalTime(), null);
     }
+    protected override IEnumerable<Tag> GetMetadataTags(HtmlDocument document, string baseUrl, string novelUrl)
+    {
+        return new List<Tag>
+        {
+            Tag.GetSiteTag(baseUrl),
+            Tag.GetOriginalWorkTag()
+        };
+    }
 }

View File: SyosetuScraper.cs

@@ -1,15 +1,17 @@
+using System.Net;
 using System.Text.RegularExpressions;
 using HtmlAgilityPack;
-using Treestar.Shared.Models.DBDomain;
+using Common.Models.DBDomain;
+using Common.Models.Enums;
 namespace WebNovelPortalAPI.Scrapers;
 public class SyosetuScraper : AbstractScraper
 {
-    protected override string UrlMatchPattern => @"https?:\/\/\w+\.syosetu\.com\/\w+\/?";
+    protected override string UrlMatchPattern => @"https?:\/\/(\w+)\.syosetu\.com\/\w+\/?";
-    protected override string BaseUrlPattern => @"https?:\/\/\w+\.syosetu\.com";
+    protected override string BaseUrlPattern => @"https?:\/\/(\w+)\.syosetu\.com";
     protected override string? WorkTitlePattern => @"//p[@class='novel_title']";
@@ -29,14 +31,14 @@ public class SyosetuScraper : AbstractScraper
     protected override string? DateUpdatedPattern => @"//th[contains(text(),'掲載日')]/following-sibling::td";
-    private HtmlDocument? GetInfoPage(string baseUrl, string novelUrl)
+    private async Task<HtmlDocument> GetInfoPage(string baseUrl, string novelUrl)
     {
         string novelInfoBase = $"/novelview/infotop/ncode/";
         string novelRegex = @"https?:\/\/\w+\.syosetu\.com\/(\w+)\/?";
         string novelCode = new Regex(novelRegex).Match(novelUrl).Groups[1].Value;
         string novelInfoPage = $"{baseUrl}{novelInfoBase}{novelCode}";
-        var web = new HtmlWeb();
-        return web.Load(novelInfoPage);
+        return await GetPage(novelInfoPage);
     }
     protected override List<Chapter> GetChapters(HtmlDocument document, string baseUrl, string novelUrl)
@@ -85,35 +87,86 @@ public class SyosetuScraper : AbstractScraper
     protected override DateTime GetPostedDate(HtmlDocument document, string baseUrl, string novelUrl)
     {
-        var doc = GetInfoPage(baseUrl, novelUrl);
-        if (doc == null)
-        {
-            return DateTime.MinValue;
-        }
-        var node = doc.DocumentNode.SelectSingleNode(DatePostedPattern);
+        var node = document.DocumentNode.SelectSingleNode(DatePostedPattern);
         return DateTime.Parse(node.InnerText).ToUniversalTime();
     }
     protected override DateTime GetLastUpdatedDate(HtmlDocument document, string baseUrl, string novelUrl)
     {
-        var doc = GetInfoPage(baseUrl, novelUrl);
-        if (doc == null)
-        {
-            return DateTime.MinValue;
-        }
-        return DateTime.Parse(doc.DocumentNode.SelectNodes(DateUpdatedPattern)[1].InnerText).ToUniversalTime();
+        return DateTime.Parse(document.DocumentNode.SelectNodes(DateUpdatedPattern)[1].InnerText).ToUniversalTime();
     }
     protected override List<Tag> GetTags(HtmlDocument document, string baseUrl, string novelUrl)
     {
-        var doc = GetInfoPage(baseUrl, novelUrl);
-        if (doc == null)
-        {
-            return new List<Tag>();
-        }
-        var tags = doc.DocumentNode.SelectSingleNode(TagPattern).InnerText.Replace("\n", "").Replace("&nbsp;", " ").Split(' ');
-        return tags.Select(i => new Tag {TagValue = i}).ToList();
+        var tags = document.DocumentNode.SelectSingleNode(TagPattern).InnerText.Replace("\n", "").Replace("&nbsp;", " ").Split(' ');
+        return tags.Select(i => new Tag {TagValue = i}).Union(GetMetadataTags(document, baseUrl, novelUrl)).ToList();
     }
+    protected override List<Cookie> RequestCookies()
+    {
+        var domain = ".syosetu.com";
+        return new List<Cookie>
+        {
+            new Cookie
+            {
+                Domain = domain,
+                Name = "over18",
+                Value = "yes"
+            }
+        };
+    }
+    protected override IEnumerable<Tag> GetMetadataTags(HtmlDocument document, string baseUrl, string novelUrl)
+    {
+        bool nsfw = Regex.Match(baseUrl, BaseUrlPattern).Groups[1].Value == "novel18";
+        var tags = new List<Tag>
+        {
+            Tag.GetSiteTag(baseUrl),
+            Tag.GetOriginalWorkTag()
+        };
+        if (nsfw)
+        {
+            tags.Add(Tag.GetNsfwTag());
+        }
+        return tags;
+    }
+    public override async Task<Novel> ScrapeNovel(string url)
+    {
+        var baseUrl = new Regex(BaseUrlPattern).Match(url).Value;
+        var novelUrl = new Regex(UrlMatchPattern).Match(url).Value;
+        HtmlDocument baseDoc;
+        HtmlDocument novelInfoPage;
+        try
+        {
+            baseDoc = await GetPage(novelUrl);
+            novelInfoPage = await GetInfoPage(baseUrl, novelUrl);
+        }
+        catch (Exception e)
+        {
+            throw new Exception("Error parsing document");
+        }
+        return new Novel
+        {
+            Title = GetNovelTitle(baseDoc, baseUrl, novelUrl),
+            Author = GetAuthor(baseDoc, baseUrl, novelUrl),
+            Chapters = GetChapters(baseDoc, baseUrl, novelUrl),
+            LastUpdated = GetLastUpdatedDate(novelInfoPage, baseUrl, novelUrl),
+            Tags = GetTags(novelInfoPage, baseUrl, novelUrl),
+            DatePosted = GetPostedDate(novelInfoPage, baseUrl, novelUrl),
+            Url = novelUrl
+        };
+    }
 }
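
The novel18 support hinges on the new capture group in BaseUrlPattern: group 1 captures the subdomain, and GetMetadataTags maps "novel18" to the NSFW tag while RequestCookies presents the over18 consent cookie. A standalone sketch exercising just that regex; the ncode URLs are illustrative placeholders:

using System;
using System.Text.RegularExpressions;

// Group 1 is the subdomain; "novel18" marks syosetu's R18 mirror, which is
// what drives the Tag.GetNsfwTag() branch in GetMetadataTags above.
const string BaseUrlPattern = @"https?:\/\/(\w+)\.syosetu\.com";

foreach (var url in new[] { "https://ncode.syosetu.com/n0000aa/", "https://novel18.syosetu.com/n0000ab/" })
{
    var subdomain = Regex.Match(url, BaseUrlPattern).Groups[1].Value;
    bool nsfw = subdomain == "novel18";
    Console.WriteLine($"{url} -> subdomain '{subdomain}', nsfw: {nsfw}");
}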

View File: WebNovelPortalAPI.csproj

@@ -24,7 +24,7 @@
     <ItemGroup>
         <ProjectReference Include="..\DBConnection\DBConnection.csproj" />
-        <ProjectReference Include="..\Treestar.Shared\Treestar.Shared.csproj" />
+        <ProjectReference Include="..\Common\Common.csproj" />
     </ItemGroup>
 </Project>