Commit 905bb2c: Added some tests and some refactors
Marcel0024 committed Jul 8, 2024 (parent: 93cee4c)

Showing 29 changed files with 750 additions and 234 deletions.
14 changes: 7 additions & 7 deletions CocoCrawler.sln
@@ -13,10 +13,10 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "BackgroundServiceExtractLis
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{E0343CD2-205D-4169-A077-818B0B6602CE}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CocoCrawler.Tests", "Tests\CocoCrawler.Tests.csproj", "{FE2D3737-4B8F-4FB2-88FF-E3627D996432}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "BackgroundServiceOpenLinksAndExtractObjects", "Examples\BackgroundServiceOpenLinksAndExtractObject\BackgroundServiceOpenLinksAndExtractObjects.csproj", "{9BBA41BF-10D9-4030-A523-8539734E208F}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CocoCrawler.IntegrationTests", "Tests\CocoCrawler.IntegrationTests\CocoCrawler.IntegrationTests.csproj", "{904CA6C2-FC83-49AA-A5AE-2019D8EE54BE}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -31,23 +31,23 @@ Global
{0B926C23-31CD-4208-9773-61EC3CA9BD23}.Debug|Any CPU.Build.0 = Debug|Any CPU
{0B926C23-31CD-4208-9773-61EC3CA9BD23}.Release|Any CPU.ActiveCfg = Release|Any CPU
{0B926C23-31CD-4208-9773-61EC3CA9BD23}.Release|Any CPU.Build.0 = Release|Any CPU
{FE2D3737-4B8F-4FB2-88FF-E3627D996432}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{FE2D3737-4B8F-4FB2-88FF-E3627D996432}.Debug|Any CPU.Build.0 = Debug|Any CPU
{FE2D3737-4B8F-4FB2-88FF-E3627D996432}.Release|Any CPU.ActiveCfg = Release|Any CPU
{FE2D3737-4B8F-4FB2-88FF-E3627D996432}.Release|Any CPU.Build.0 = Release|Any CPU
{9BBA41BF-10D9-4030-A523-8539734E208F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{9BBA41BF-10D9-4030-A523-8539734E208F}.Debug|Any CPU.Build.0 = Debug|Any CPU
{9BBA41BF-10D9-4030-A523-8539734E208F}.Release|Any CPU.ActiveCfg = Release|Any CPU
{9BBA41BF-10D9-4030-A523-8539734E208F}.Release|Any CPU.Build.0 = Release|Any CPU
{904CA6C2-FC83-49AA-A5AE-2019D8EE54BE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{904CA6C2-FC83-49AA-A5AE-2019D8EE54BE}.Debug|Any CPU.Build.0 = Debug|Any CPU
{904CA6C2-FC83-49AA-A5AE-2019D8EE54BE}.Release|Any CPU.ActiveCfg = Release|Any CPU
{904CA6C2-FC83-49AA-A5AE-2019D8EE54BE}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(NestedProjects) = preSolution
{04FDE2B4-C800-41B9-BC71-C39503871F13} = {E8E77F73-2709-4B42-9FDC-9475BD4306E2}
{0B926C23-31CD-4208-9773-61EC3CA9BD23} = {9EDB9EF4-A58C-44D4-87C9-A8960C007837}
{FE2D3737-4B8F-4FB2-88FF-E3627D996432} = {E0343CD2-205D-4169-A077-818B0B6602CE}
{9BBA41BF-10D9-4030-A523-8539734E208F} = {9EDB9EF4-A58C-44D4-87C9-A8960C007837}
{904CA6C2-FC83-49AA-A5AE-2019D8EE54BE} = {E0343CD2-205D-4169-A077-818B0B6602CE}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {992F3EF0-1D78-465D-914D-294A906EDF2C}
4 changes: 1 addition & 3 deletions CocoCrawler/Builders/CrawlerEngineBuilder.cs
@@ -76,11 +76,9 @@ public async Task<CrawlerEngine> BuildAsync(CancellationToken cancellationToken
var engineSettings = EngineSettingsBuilder.Build();
var jobs = CrawlPages.Select(cp => cp.Build()).ToImmutableArray();

await engineSettings.Scheduler.AddAsync(jobs, cancellationToken);

await InitializeOutputs(jobs, cancellationToken);

return new CrawlerEngine(engineSettings);
return new CrawlerEngine(engineSettings, jobs);
}

private static async Task InitializeOutputs(ImmutableArray<PageCrawlJob> jobs, CancellationToken token)
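The refactor above moves scheduler seeding out of BuildAsync: the built jobs now travel into the CrawlerEngine constructor instead. A speculative sketch of what that implies, assuming the engine enqueues the initial jobs itself when it runs; the engine internals are not part of this diff, and RunAsync is an illustrative name:

```csharp
using System.Collections.Immutable;

// Hypothetical sketch only: the diff shows CrawlerEngine now receiving the
// initial jobs, so presumably it seeds its scheduler on startup.
public class CrawlerEngine(EngineSettings settings, ImmutableArray<PageCrawlJob> jobs)
{
    public async Task RunAsync(CancellationToken token = default)
    {
        // Seed the scheduler with the jobs built by CrawlerEngineBuilder;
        // before this commit, BuildAsync did this itself.
        await settings.Scheduler.AddAsync(jobs, token);

        // ... crawl loop elided ...
    }
}
```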
31 changes: 17 additions & 14 deletions CocoCrawler/Builders/EngineSettingsBuilder.cs
@@ -1,10 +1,10 @@

using CocoCrawler.Crawler;
using CocoCrawler.CrawlJob;
using CocoCrawler.Exceptions;
using CocoCrawler.Parser;
using CocoCrawler.Scheduler;
using Microsoft.Extensions.Logging;
using System.Net;

namespace CocoCrawler.Builders;

@@ -17,10 +17,10 @@ public class EngineSettingsBuilder
private bool ParallelismDisabled { get; set; } = false;
private string? UserAgent { get; set; } = null;
private ILoggerFactory? LoggerFactory { get; set; } = null;
private IScheduler Scheduler { get; set; } = new InMemoryScheduler();
private IParser Parser { get; set; } = new AngleSharpParser();
private IScheduler Scheduler { get; set; } = new MemoryScheduler();
private ICrawler Crawler { get; set; } = new PuppeteerCrawler();
private Cookie[] Cookies { get; set; } = [];
private Cookie[]? Cookies { get; set; } = null;

/// <summary>
/// Sets the headless mode for the browser.
@@ -85,25 +85,25 @@ public EngineSettingsBuilder DisableParallelism()
}

/// <summary>
/// Sets the parser to be used by the crawler engine.
/// Sets the scheduler to be used by the crawler engine.
/// </summary>
/// <param name="parser">The parser implementation.</param>
/// <param name="scheduler">The scheduler implementation.</param>
/// <returns>The CrawlerEngineBuilder instance.</returns>
public EngineSettingsBuilder WithParser(IParser parser)
public EngineSettingsBuilder WithScheduler(IScheduler scheduler)
{
Parser = parser;
Scheduler = scheduler;

return this;
}

/// <summary>
/// Sets the scheduler to be used by the crawler engine.
/// Sets the parser to be used by the crawler.
/// </summary>
/// <param name="scheduler">The scheduler implementation.</param>
/// <param name="parser">The crawler implementation.</param>
/// <returns>The CrawlerEngineBuilder instance.</returns>
public EngineSettingsBuilder WithScheduler(IScheduler scheduler)
public EngineSettingsBuilder WithParser(IParser parser)
{
Scheduler = scheduler;
Parser = parser;

return this;
}
@@ -168,20 +168,23 @@ internal EngineSettings Build()
throw new CocoCrawlerBuilderException("The total pages to crawl must be greater than or equal to 1.");
}

if (LoggerFactory is not null)
{
Crawler.WithLoggerFactory(LoggerFactory);
}

Crawler.WithParser(Parser);
Crawler.WithLoggerFactory(LoggerFactory);

return new EngineSettings(Headless,
ParallelismDisabled,
IgnoreUrls,
ParallelismDegree,
MaxPagesToCrawl,
UserAgent,
Parser,
Crawler,
Scheduler,
LoggerFactory,
Cookies,
Cookies is null ? [] : [.. Cookies],
new HistoryTracker());
}
}
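In sum, the builder changes rename InMemoryScheduler to MemoryScheduler, swap the order of the WithScheduler and WithParser methods, make Cookies nullable with a normalization to empty in Build(), and only forward the logger factory when one was configured (matching the tightened ICrawler signature further down). A usage sketch with illustrative choices, written as if from inside the library or its test project since Build() is internal:

```csharp
// Illustrative only; assumes the rest of the fluent API is unchanged by
// this commit. AngleSharpParser is the default parser per the field above.
var settings = new EngineSettingsBuilder()
    .WithScheduler(new MemoryScheduler())   // renamed from InMemoryScheduler
    .WithParser(new AngleSharpParser())
    .DisableParallelism()
    .Build();
```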
27 changes: 12 additions & 15 deletions CocoCrawler/Builders/PageCrawlJobBuilder.cs
@@ -1,4 +1,6 @@
using CocoCrawler.CrawlOutputs;
using AngleSharp.Css;
using AngleSharp.Css.Parser;
using CocoCrawler.CrawlOutputs;
using CocoCrawler.Exceptions;
using CocoCrawler.Job;
using CocoCrawler.Job.PageBrowserActions;
@@ -27,6 +29,11 @@ public PageCrawlJobBuilder(string url)
{
ArgumentException.ThrowIfNullOrWhiteSpace(url, nameof(url));

if (!Uri.TryCreate(url, UriKind.Absolute, out _))
{
throw new CocoCrawlerBuilderException($"{url} is not a valid URI.");
}

Url = url;
}

@@ -83,23 +90,13 @@ public PageCrawlJobBuilder OpenLinks(string linksSelector, Action<PageCrawlJobBu
}

/// <summary>
/// Adds a task to paginate through a list of elements on the page with additional page actions.
/// Adds a task to paginate through.
/// </summary>
/// <param name="paginationSelector">The CSS selector to select the pagination element.</param>
/// <param name="options">The action to configure the page actions for the pagination task.</param>
/// <returns>The updated <see cref="PageCrawlJobBuilder"/> instance.</returns>
public PageCrawlJobBuilder AddPagination(string paginationSelector, Action<PageActionsBuilder>? options = null)
public PageCrawlJobBuilder AddPagination(string paginationSelector)
{
PageActionsBuilder? pageActionsBuilder = null;

if (options is not null)
{
pageActionsBuilder = new PageActionsBuilder();

options(pageActionsBuilder);
}

Tasks.Add(new CrawlPagePaginateTask(paginationSelector, pageActionsBuilder?.Build()));
Tasks.Add(new CrawlPagePaginateTask(paginationSelector));

return this;
}
@@ -210,7 +207,7 @@ internal PageCrawlJob Build()
{
if (Tasks.Count == 0)
{
throw new CocoCrawlerBuilderException($"A Page requires a purpose, try calling .{nameof(OpenLinks)}() or .{nameof(ExtractObject)}() or {nameof(AddPagination)}()");
throw new CocoCrawlerBuilderException($"A Page requires a purpose, try calling .{nameof(OpenLinks)}() or .{nameof(ExtractObject)}() or {nameof(AddPagination)}().");
}

if (Url is null)
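Two behavioral changes stand out in this file: the constructor now rejects non-absolute URIs up front, and AddPagination no longer accepts a page-actions callback. A sketch with made-up URL and selector values:

```csharp
// The simplified pagination API: no per-pagination PageActions anymore.
var job = new PageCrawlJobBuilder("https://example.com/list")
    .AddPagination("a.next");   // was: AddPagination("a.next", options => ...)

// Now fails fast with CocoCrawlerBuilderException: "not-a-url is not a valid URI."
var invalid = new PageCrawlJobBuilder("not-a-url");
```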
3 changes: 3 additions & 0 deletions CocoCrawler/CrawlJob/Cookie.cs
@@ -0,0 +1,3 @@
namespace CocoCrawler.CrawlJob;

public record Cookie(string Name, string Value, string Domain, string Path = "/");
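Illustrative usage of the new record (values made up); a positional record gives value equality for free, and Path defaults to the site root:

```csharp
var session = new Cookie("session_id", "abc123", "example.com");        // Path = "/"
var scoped  = new Cookie("prefs", "dark", "example.com", "/settings");  // explicit path
```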
7 changes: 2 additions & 5 deletions CocoCrawler/CrawlJob/PageTasks/CrawlPagePaginateTask.cs
@@ -1,10 +1,7 @@
using CocoCrawler.Job.PageBrowserActions;
namespace CocoCrawler.Job.PageTasks;

namespace CocoCrawler.Job.PageTasks;

public class CrawlPagePaginateTask(string paginationSelector, PageActions? pageActions = null)
public class CrawlPagePaginateTask(string paginationSelector)
: IPageCrawlTask
{
public string PaginationSelector { get; init; } = paginationSelector;
public PageActions? PageActions { get; init; } = pageActions;
}
6 changes: 6 additions & 0 deletions CocoCrawler/CrawlOutputs/CsvFileCrawlOutput.cs
@@ -26,6 +26,12 @@ public virtual async Task Initiaize(CancellationToken cancellationToken)
{
File.Delete(filePath);
}

var dir = Path.GetDirectoryName(filePath);
if (!string.IsNullOrWhiteSpace(dir))
{
Directory.CreateDirectory(dir);
}
}
finally
{
4 changes: 2 additions & 2 deletions CocoCrawler/Crawler/ICrawler.cs
@@ -7,7 +7,7 @@ namespace CocoCrawler.Crawler;

public interface ICrawler
{
Task<CrawlResult> Crawl(IPage browserTab, PageCrawlJob job);
void WithParser(IParser parser);
void WithLoggerFactory(ILoggerFactory? loggerFactory);
void WithLoggerFactory(ILoggerFactory loggerFactory);
Task<CrawlResult> Crawl(IPage browserTab, PageCrawlJob job);
}
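WithLoggerFactory now takes a non-nullable ILoggerFactory, which is why EngineSettingsBuilder.Build() above guards the call instead of every implementation null-checking its argument. A minimal conforming stub; IPage is assumed to be PuppeteerSharp's page type, and Crawl is left unimplemented since CrawlResult's shape is not shown in this diff:

```csharp
using Microsoft.Extensions.Logging;
using PuppeteerSharp;

// Minimal stub against the tightened interface, for illustration only.
public class NoOpCrawler : ICrawler
{
    private IParser? _parser;
    private ILogger? _logger;

    public void WithParser(IParser parser) => _parser = parser;

    // No null check needed anymore: the parameter is non-nullable.
    public void WithLoggerFactory(ILoggerFactory loggerFactory)
        => _logger = loggerFactory.CreateLogger<NoOpCrawler>();

    public Task<CrawlResult> Crawl(IPage browserTab, PageCrawlJob job)
        => throw new NotImplementedException();
}
```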
106 changes: 55 additions & 51 deletions CocoCrawler/Crawler/PuppeteerCrawler.cs
@@ -1,6 +1,4 @@
using AngleSharp;
using AngleSharp.Dom;
using CocoCrawler.Job;
using CocoCrawler.Job;
using CocoCrawler.Job.PageBrowserActions;
using CocoCrawler.Job.PageTasks;
using CocoCrawler.Parser;
@@ -12,8 +10,8 @@ namespace CocoCrawler.Crawler;

public class PuppeteerCrawler : ICrawler
{
private IParser? Parser { get; set; }
private ILogger? Logger { get; set; }
private IParser? Parser { get; set; }

public virtual async Task<CrawlResult> Crawl(IPage browserTab, PageCrawlJob currentPageJob)
{
@@ -56,84 +54,90 @@ protected virtual async Task ExecutePageActions(IPage page, PageActions? browser

protected virtual async Task Parse(PageCrawlJob job, string html, List<PageCrawlJob> newJobs, JArray jArray)
{
ArgumentNullException.ThrowIfNull(Parser, nameof(Parser));
ArgumentNullException.ThrowIfNull(Parser);

var doc = await GetDocument(html);
await Parser.Init(html);

foreach (var task in job.Tasks)
{
switch (task)
{
case CrawlPageOpenLinksTask openLinks:
{
var urls = Parser.GetUrlsFromSelector(doc, openLinks.OpenLinksSelector);

Logger?.LogDebug("OpenLinks selector returned {Count} Urls found in openLinks task.", urls.Length);

foreach (var url in urls)
{
var newPageBuilder = openLinks.JobBuilder;

newPageBuilder.WithUrl(url);
newPageBuilder.AddOutput([.. job.Outputs]);
newPageBuilder.WithTasks(job.Tasks.Where(t => t is CrawlPageExtractObjectTask).ToArray());

var newPage = openLinks.JobBuilder.Build();

newJobs.Add(newPage);
}
}
HandleOpenLinksTask(openLinks, job, newJobs);
break;
case CrawlPagePaginateTask paginate:
{
var urls = Parser.GetUrlsFromSelector(doc, paginate.PaginationSelector);

Logger?.LogDebug("Paginate selector {Count} Urls found in paginate task.", urls.Length);

var newPages = urls.Select(url => new PageCrawlJob(url, [.. job.Tasks], [.. job.Outputs], paginate.PageActions));

newJobs.AddRange(newPages);
}
HandlePaginateTask(paginate, job, newJobs);
break;
case CrawlPageExtractObjectTask scrape:
{
jArray.Add(Parser.ExtractObject(doc, scrape));
}
HandleExtractObject(scrape, job.Url, jArray);
break;
case CrawlPageExtractListTask scrapeList:
{
var jArrayResult = Parser.ExtractList(doc, scrapeList);

foreach (var obj in jArrayResult.Cast<JObject>())
{
jArray.Add(obj);
}
}
HandleExtractList(scrapeList, jArray);
break;
default:
throw new NotImplementedException("Task not implemented");
}
}
}

protected virtual async Task<IDocument> GetDocument(string html)
private void HandleExtractList(CrawlPageExtractListTask scrapeList, JArray jArray)
{
var jArrayResult = Parser!.ExtractList(scrapeList);

foreach (var obj in jArrayResult.Cast<JObject>())
{
jArray.Add(obj);
}
}

private void HandleExtractObject(CrawlPageExtractObjectTask scrape, string url, JArray jArray)
{
var config = Configuration.Default;
var context = BrowsingContext.New(config);
var parsedObject = Parser!.ExtractObject(scrape);

var document = await context.OpenAsync(req => req.Content(html));
parsedObject.AddFirst(new JProperty("Url", url));

return document;
jArray.Add(parsedObject);
}

protected virtual void HandlePaginateTask(CrawlPagePaginateTask paginate, PageCrawlJob job, List<PageCrawlJob> newJobs)
{
var urls = Parser!.ParseForLinks(paginate.PaginationSelector);

Logger?.LogDebug("Paginate selector {Count} Urls found in paginate task.", urls.Length);

var newPages = urls.Select(url => new PageCrawlJob(url, [.. job.Tasks], [.. job.Outputs], job.BrowserActions));

newJobs.AddRange(newPages);
}

protected virtual void HandleOpenLinksTask(CrawlPageOpenLinksTask openLinks, PageCrawlJob job, List<PageCrawlJob> newJobs)
{
var urls = Parser!.ParseForLinks(openLinks.OpenLinksSelector);

Logger?.LogDebug("OpenLinks selector returned {Count} Urls found in openLinks task.", urls.Length);

foreach (var url in urls)
{
var newPageBuilder = openLinks.JobBuilder;

newPageBuilder.WithUrl(url);
newPageBuilder.AddOutput([.. job.Outputs]);
newPageBuilder.WithTasks(job.Tasks.Where(t => t is CrawlPageExtractObjectTask).ToArray());

var newPage = openLinks.JobBuilder.Build();

newJobs.Add(newPage);
}
}

public void WithParser(IParser parser)
{
Parser = parser;
}

public void WithLoggerFactory(ILoggerFactory? loggerFactory)
public void WithLoggerFactory(ILoggerFactory loggerFactory)
{
Logger = loggerFactory?.CreateLogger<PuppeteerCrawler>();
Logger = loggerFactory.CreateLogger<PuppeteerCrawler>();
}
}
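The biggest shift in this file is that the parser becomes stateful: the AngleSharp GetDocument plumbing is gone, Init(html) runs once per crawled page, and each handler queries the parser without passing a document around. Note also that HandleExtractObject now prepends a "Url" property to every extracted object. The IParser surface implied by these call sites, reconstructed from this diff rather than copied from the real interface file (which is not shown):

```csharp
using Newtonsoft.Json.Linq;

// Reconstructed from the call sites above, not from the actual IParser file:
// Init(html) loads the document once; the other members query that document.
public interface IParser
{
    Task Init(string html);
    string[] ParseForLinks(string cssSelector);
    JObject ExtractObject(CrawlPageExtractObjectTask task);
    JArray ExtractList(CrawlPageExtractListTask task);
}
```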

(Diffs for the remaining 20 changed files are not shown.)