From 905bb2ce942cfc35dfe49d7080d38ed7700f57ec Mon Sep 17 00:00:00 2001 From: Marcel <14852157+Marcel0024@users.noreply.github.com> Date: Mon, 8 Jul 2024 17:26:33 +0200 Subject: [PATCH] Added some tests and some refactors --- CocoCrawler.sln | 14 +- CocoCrawler/Builders/CrawlerEngineBuilder.cs | 4 +- CocoCrawler/Builders/EngineSettingsBuilder.cs | 31 ++-- CocoCrawler/Builders/PageCrawlJobBuilder.cs | 27 ++-- CocoCrawler/CrawlJob/Cookie.cs | 3 + .../PageTasks/CrawlPagePaginateTask.cs | 7 +- .../CrawlOutputs/CsvFileCrawlOutput.cs | 6 + CocoCrawler/Crawler/ICrawler.cs | 4 +- CocoCrawler/Crawler/PuppeteerCrawler.cs | 106 +++++++------ CocoCrawler/CrawlerEngine.cs | 55 ++++--- CocoCrawler/EngineSettings.cs | 7 +- CocoCrawler/Parser/AngleSharpParser.cs | 25 ++- CocoCrawler/Parser/IParser.cs | 10 +- CocoCrawler/Scheduler/IScheduler.cs | 7 +- CocoCrawler/Scheduler/InMemoryScheduler.cs | 63 ++++++++ CocoCrawler/Scheduler/MemoryScheduler.cs | 52 ------- .../RedditListingBackgroundService.cs | 2 +- .../RedditPostsBackgroundService.cs | 4 +- README.md | 18 ++- .../CocoCrawler.IntegrationTests.csproj | 44 ++++++ .../ConfigureEngine/CookiesTest.cs | 45 ++++++ .../ThrowOnBuilderExceptions.cs | 92 +++++++++++ .../ConfigureEngine/UserAgentTests.cs | 34 ++++ .../ExtractListAndPaginateTests.cs | 59 +++++++ .../Responses/main-page.html | 43 ++++++ .../Responses/page-2.html | 43 ++++++ .../OpenLinksExtractObjectAndPaginate.cs | 146 ++++++++++++++++++ Tests/CocoCrawler.Tests.csproj | 23 --- Tests/UnitTest1.cs | 10 -- 29 files changed, 750 insertions(+), 234 deletions(-) create mode 100644 CocoCrawler/CrawlJob/Cookie.cs create mode 100644 CocoCrawler/Scheduler/InMemoryScheduler.cs delete mode 100644 CocoCrawler/Scheduler/MemoryScheduler.cs create mode 100644 Tests/CocoCrawler.IntegrationTests/CocoCrawler.IntegrationTests.csproj create mode 100644 Tests/CocoCrawler.IntegrationTests/ConfigureEngine/CookiesTest.cs create mode 100644 Tests/CocoCrawler.IntegrationTests/ConfigureEngine/ThrowOnBuilderExceptions.cs create mode 100644 Tests/CocoCrawler.IntegrationTests/ConfigureEngine/UserAgentTests.cs create mode 100644 Tests/CocoCrawler.IntegrationTests/ExtractListAndPaginate/ExtractListAndPaginateTests.cs create mode 100644 Tests/CocoCrawler.IntegrationTests/ExtractListAndPaginate/Responses/main-page.html create mode 100644 Tests/CocoCrawler.IntegrationTests/ExtractListAndPaginate/Responses/page-2.html create mode 100644 Tests/CocoCrawler.IntegrationTests/OpenLinksExtractObjectAndPaginate/OpenLinksExtractObjectAndPaginate.cs delete mode 100644 Tests/CocoCrawler.Tests.csproj delete mode 100644 Tests/UnitTest1.cs diff --git a/CocoCrawler.sln b/CocoCrawler.sln index 6ae70bb..67695df 100644 --- a/CocoCrawler.sln +++ b/CocoCrawler.sln @@ -13,10 +13,10 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "BackgroundServiceExtractLis EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{E0343CD2-205D-4169-A077-818B0B6602CE}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CocoCrawler.Tests", "Tests\CocoCrawler.Tests.csproj", "{FE2D3737-4B8F-4FB2-88FF-E3627D996432}" -EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "BackgroundServiceOpenLinksAndExtractObjects", "Examples\BackgroundServiceOpenLinksAndExtractObject\BackgroundServiceOpenLinksAndExtractObjects.csproj", "{9BBA41BF-10D9-4030-A523-8539734E208F}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CocoCrawler.IntegrationTests", 
"Tests\CocoCrawler.IntegrationTests\CocoCrawler.IntegrationTests.csproj", "{904CA6C2-FC83-49AA-A5AE-2019D8EE54BE}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -31,14 +31,14 @@ Global {0B926C23-31CD-4208-9773-61EC3CA9BD23}.Debug|Any CPU.Build.0 = Debug|Any CPU {0B926C23-31CD-4208-9773-61EC3CA9BD23}.Release|Any CPU.ActiveCfg = Release|Any CPU {0B926C23-31CD-4208-9773-61EC3CA9BD23}.Release|Any CPU.Build.0 = Release|Any CPU - {FE2D3737-4B8F-4FB2-88FF-E3627D996432}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {FE2D3737-4B8F-4FB2-88FF-E3627D996432}.Debug|Any CPU.Build.0 = Debug|Any CPU - {FE2D3737-4B8F-4FB2-88FF-E3627D996432}.Release|Any CPU.ActiveCfg = Release|Any CPU - {FE2D3737-4B8F-4FB2-88FF-E3627D996432}.Release|Any CPU.Build.0 = Release|Any CPU {9BBA41BF-10D9-4030-A523-8539734E208F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {9BBA41BF-10D9-4030-A523-8539734E208F}.Debug|Any CPU.Build.0 = Debug|Any CPU {9BBA41BF-10D9-4030-A523-8539734E208F}.Release|Any CPU.ActiveCfg = Release|Any CPU {9BBA41BF-10D9-4030-A523-8539734E208F}.Release|Any CPU.Build.0 = Release|Any CPU + {904CA6C2-FC83-49AA-A5AE-2019D8EE54BE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {904CA6C2-FC83-49AA-A5AE-2019D8EE54BE}.Debug|Any CPU.Build.0 = Debug|Any CPU + {904CA6C2-FC83-49AA-A5AE-2019D8EE54BE}.Release|Any CPU.ActiveCfg = Release|Any CPU + {904CA6C2-FC83-49AA-A5AE-2019D8EE54BE}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -46,8 +46,8 @@ Global GlobalSection(NestedProjects) = preSolution {04FDE2B4-C800-41B9-BC71-C39503871F13} = {E8E77F73-2709-4B42-9FDC-9475BD4306E2} {0B926C23-31CD-4208-9773-61EC3CA9BD23} = {9EDB9EF4-A58C-44D4-87C9-A8960C007837} - {FE2D3737-4B8F-4FB2-88FF-E3627D996432} = {E0343CD2-205D-4169-A077-818B0B6602CE} {9BBA41BF-10D9-4030-A523-8539734E208F} = {9EDB9EF4-A58C-44D4-87C9-A8960C007837} + {904CA6C2-FC83-49AA-A5AE-2019D8EE54BE} = {E0343CD2-205D-4169-A077-818B0B6602CE} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {992F3EF0-1D78-465D-914D-294A906EDF2C} diff --git a/CocoCrawler/Builders/CrawlerEngineBuilder.cs b/CocoCrawler/Builders/CrawlerEngineBuilder.cs index b3183c6..7f85271 100644 --- a/CocoCrawler/Builders/CrawlerEngineBuilder.cs +++ b/CocoCrawler/Builders/CrawlerEngineBuilder.cs @@ -76,11 +76,9 @@ public async Task BuildAsync(CancellationToken cancellationToken var engineSettings = EngineSettingsBuilder.Build(); var jobs = CrawlPages.Select(cp => cp.Build()).ToImmutableArray(); - await engineSettings.Scheduler.AddAsync(jobs, cancellationToken); - await InitializeOutputs(jobs, cancellationToken); - return new CrawlerEngine(engineSettings); + return new CrawlerEngine(engineSettings, jobs); } private static async Task InitializeOutputs(ImmutableArray jobs, CancellationToken token) diff --git a/CocoCrawler/Builders/EngineSettingsBuilder.cs b/CocoCrawler/Builders/EngineSettingsBuilder.cs index fd60cca..3d89ccb 100644 --- a/CocoCrawler/Builders/EngineSettingsBuilder.cs +++ b/CocoCrawler/Builders/EngineSettingsBuilder.cs @@ -1,10 +1,10 @@  using CocoCrawler.Crawler; +using CocoCrawler.CrawlJob; using CocoCrawler.Exceptions; using CocoCrawler.Parser; using CocoCrawler.Scheduler; using Microsoft.Extensions.Logging; -using System.Net; namespace CocoCrawler.Builders; @@ -17,10 +17,10 @@ public class EngineSettingsBuilder private bool ParallelismDisabled { get; set; } = false; private string? 
UserAgent { get; set; } = null; private ILoggerFactory? LoggerFactory { get; set; } = null; + private IScheduler Scheduler { get; set; } = new InMemoryScheduler(); private IParser Parser { get; set; } = new AngleSharpParser(); - private IScheduler Scheduler { get; set; } = new MemoryScheduler(); private ICrawler Crawler { get; set; } = new PuppeteerCrawler(); - private Cookie[] Cookies { get; set; } = []; + private Cookie[]? Cookies { get; set; } = null; /// /// Sets the headless mode for the browser. @@ -85,25 +85,25 @@ public EngineSettingsBuilder DisableParallelism() } /// - /// Sets the parser to be used by the crawler engine. + /// Sets the scheduler to be used by the crawler engine. /// - /// The parser implementation. + /// The scheduler implementation. /// The CrawlerEngineBuilder instance. - public EngineSettingsBuilder WithParser(IParser parser) + public EngineSettingsBuilder WithScheduler(IScheduler scheduler) { - Parser = parser; + Scheduler = scheduler; return this; } /// - /// Sets the scheduler to be used by the crawler engine. + /// Sets the parser to be used by the crawler. /// - /// The scheduler implementation. + /// The crawler implementation. /// The CrawlerEngineBuilder instance. - public EngineSettingsBuilder WithScheduler(IScheduler scheduler) + public EngineSettingsBuilder WithParser(IParser parser) { - Scheduler = scheduler; + Parser = parser; return this; } @@ -168,8 +168,12 @@ internal EngineSettings Build() throw new CocoCrawlerBuilderException("The total pages to crawl must be greater than or equal to 1."); } + if (LoggerFactory is not null) + { + Crawler.WithLoggerFactory(LoggerFactory); + } + Crawler.WithParser(Parser); - Crawler.WithLoggerFactory(LoggerFactory); return new EngineSettings(Headless, ParallelismDisabled, @@ -177,11 +181,10 @@ internal EngineSettings Build() ParallelismDegree, MaxPagesToCrawl, UserAgent, - Parser, Crawler, Scheduler, LoggerFactory, - Cookies, + Cookies is null ? [] : [.. Cookies], new HistoryTracker()); } } diff --git a/CocoCrawler/Builders/PageCrawlJobBuilder.cs b/CocoCrawler/Builders/PageCrawlJobBuilder.cs index cd6825b..23f8d28 100644 --- a/CocoCrawler/Builders/PageCrawlJobBuilder.cs +++ b/CocoCrawler/Builders/PageCrawlJobBuilder.cs @@ -1,4 +1,6 @@ -using CocoCrawler.CrawlOutputs; +using AngleSharp.Css; +using AngleSharp.Css.Parser; +using CocoCrawler.CrawlOutputs; using CocoCrawler.Exceptions; using CocoCrawler.Job; using CocoCrawler.Job.PageBrowserActions; @@ -27,6 +29,11 @@ public PageCrawlJobBuilder(string url) { ArgumentException.ThrowIfNullOrWhiteSpace(url, nameof(url)); + if (!Uri.TryCreate(url, UriKind.Absolute, out _)) + { + throw new CocoCrawlerBuilderException($"{url} is not a valid URI."); + } + Url = url; } @@ -83,23 +90,13 @@ public PageCrawlJobBuilder OpenLinks(string linksSelector, Action - /// Adds a task to paginate through a list of elements on the page with additional page actions. + /// Adds a task to paginate through. /// /// The CSS selector to select the pagination element. - /// The action to configure the page actions for the pagination task. /// The updated instance. - public PageCrawlJobBuilder AddPagination(string paginationSelector, Action? options = null) + public PageCrawlJobBuilder AddPagination(string paginationSelector) { - PageActionsBuilder? 
pageActionsBuilder = null; - - if (options is not null) - { - pageActionsBuilder = new PageActionsBuilder(); - - options(pageActionsBuilder); - } - - Tasks.Add(new CrawlPagePaginateTask(paginationSelector, pageActionsBuilder?.Build())); + Tasks.Add(new CrawlPagePaginateTask(paginationSelector)); return this; } @@ -210,7 +207,7 @@ internal PageCrawlJob Build() { if (Tasks.Count == 0) { - throw new CocoCrawlerBuilderException($"A Page requires a purpose, try calling .{nameof(OpenLinks)}() or .{nameof(ExtractObject)}() or {nameof(AddPagination)}()"); + throw new CocoCrawlerBuilderException($"A Page requires a purpose, try calling .{nameof(OpenLinks)}() or .{nameof(ExtractObject)}() or {nameof(AddPagination)}()."); } if (Url is null) diff --git a/CocoCrawler/CrawlJob/Cookie.cs b/CocoCrawler/CrawlJob/Cookie.cs new file mode 100644 index 0000000..d48b5db --- /dev/null +++ b/CocoCrawler/CrawlJob/Cookie.cs @@ -0,0 +1,3 @@ +namespace CocoCrawler.CrawlJob; + +public record Cookie(string Name, string Value, string Domain, string Path = "/"); diff --git a/CocoCrawler/CrawlJob/PageTasks/CrawlPagePaginateTask.cs b/CocoCrawler/CrawlJob/PageTasks/CrawlPagePaginateTask.cs index 5b7de02..470c050 100644 --- a/CocoCrawler/CrawlJob/PageTasks/CrawlPagePaginateTask.cs +++ b/CocoCrawler/CrawlJob/PageTasks/CrawlPagePaginateTask.cs @@ -1,10 +1,7 @@ -using CocoCrawler.Job.PageBrowserActions; +namespace CocoCrawler.Job.PageTasks; -namespace CocoCrawler.Job.PageTasks; - -public class CrawlPagePaginateTask(string paginationSelector, PageActions? pageActions = null) +public class CrawlPagePaginateTask(string paginationSelector) : IPageCrawlTask { public string PaginationSelector { get; init; } = paginationSelector; - public PageActions? PageActions { get; init; } = pageActions; } diff --git a/CocoCrawler/CrawlOutputs/CsvFileCrawlOutput.cs b/CocoCrawler/CrawlOutputs/CsvFileCrawlOutput.cs index 705f3ab..230e1c6 100644 --- a/CocoCrawler/CrawlOutputs/CsvFileCrawlOutput.cs +++ b/CocoCrawler/CrawlOutputs/CsvFileCrawlOutput.cs @@ -26,6 +26,12 @@ public virtual async Task Initiaize(CancellationToken cancellationToken) { File.Delete(filePath); } + + var dir = Path.GetDirectoryName(filePath); + if (!string.IsNullOrWhiteSpace(dir)) + { + Directory.CreateDirectory(dir); + } } finally { diff --git a/CocoCrawler/Crawler/ICrawler.cs b/CocoCrawler/Crawler/ICrawler.cs index c26497a..901c8bb 100644 --- a/CocoCrawler/Crawler/ICrawler.cs +++ b/CocoCrawler/Crawler/ICrawler.cs @@ -7,7 +7,7 @@ namespace CocoCrawler.Crawler; public interface ICrawler { - Task Crawl(IPage browserTab, PageCrawlJob job); void WithParser(IParser parser); - void WithLoggerFactory(ILoggerFactory? loggerFactory); + void WithLoggerFactory(ILoggerFactory loggerFactory); + Task Crawl(IPage browserTab, PageCrawlJob job); } diff --git a/CocoCrawler/Crawler/PuppeteerCrawler.cs b/CocoCrawler/Crawler/PuppeteerCrawler.cs index 39ed940..39aa9a8 100644 --- a/CocoCrawler/Crawler/PuppeteerCrawler.cs +++ b/CocoCrawler/Crawler/PuppeteerCrawler.cs @@ -1,6 +1,4 @@ -using AngleSharp; -using AngleSharp.Dom; -using CocoCrawler.Job; +using CocoCrawler.Job; using CocoCrawler.Job.PageBrowserActions; using CocoCrawler.Job.PageTasks; using CocoCrawler.Parser; @@ -12,8 +10,8 @@ namespace CocoCrawler.Crawler; public class PuppeteerCrawler : ICrawler { - private IParser? Parser { get; set; } private ILogger? Logger { get; set; } + private IParser? 
Parser { get; set; } public virtual async Task Crawl(IPage browserTab, PageCrawlJob currentPageJob) { @@ -56,59 +54,25 @@ protected virtual async Task ExecutePageActions(IPage page, PageActions? browser protected virtual async Task Parse(PageCrawlJob job, string html, List newJobs, JArray jArray) { - ArgumentNullException.ThrowIfNull(Parser, nameof(Parser)); + ArgumentNullException.ThrowIfNull(Parser); - var doc = await GetDocument(html); + await Parser.Init(html); foreach (var task in job.Tasks) { switch (task) { case CrawlPageOpenLinksTask openLinks: - { - var urls = Parser.GetUrlsFromSelector(doc, openLinks.OpenLinksSelector); - - Logger?.LogDebug("OpenLinks selector returned {Count} Urls found in openLinks task.", urls.Length); - - foreach (var url in urls) - { - var newPageBuilder = openLinks.JobBuilder; - - newPageBuilder.WithUrl(url); - newPageBuilder.AddOutput([.. job.Outputs]); - newPageBuilder.WithTasks(job.Tasks.Where(t => t is CrawlPageExtractObjectTask).ToArray()); - - var newPage = openLinks.JobBuilder.Build(); - - newJobs.Add(newPage); - } - } + HandleOpenLinksTask(openLinks, job, newJobs); break; case CrawlPagePaginateTask paginate: - { - var urls = Parser.GetUrlsFromSelector(doc, paginate.PaginationSelector); - - Logger?.LogDebug("Paginate selector {Count} Urls found in paginate task.", urls.Length); - - var newPages = urls.Select(url => new PageCrawlJob(url, [.. job.Tasks], [.. job.Outputs], paginate.PageActions)); - - newJobs.AddRange(newPages); - } + HandlePaginateTask(paginate, job, newJobs); break; case CrawlPageExtractObjectTask scrape: - { - jArray.Add(Parser.ExtractObject(doc, scrape)); - } + HandleExtractObject(scrape, job.Url, jArray); break; case CrawlPageExtractListTask scrapeList: - { - var jArrayResult = Parser.ExtractList(doc, scrapeList); - - foreach (var obj in jArrayResult.Cast()) - { - jArray.Add(obj); - } - } + HandleExtractList(scrapeList, jArray); break; default: throw new NotImplementedException("Task not implemented"); @@ -116,14 +80,54 @@ protected virtual async Task Parse(PageCrawlJob job, string html, List GetDocument(string html) + private void HandleExtractList(CrawlPageExtractListTask scrapeList, JArray jArray) + { + var jArrayResult = Parser!.ExtractList(scrapeList); + + foreach (var obj in jArrayResult.Cast()) + { + jArray.Add(obj); + } + } + + private void HandleExtractObject(CrawlPageExtractObjectTask scrape, string url, JArray jArray) { - var config = Configuration.Default; - var context = BrowsingContext.New(config); + var parsedObject = Parser!.ExtractObject(scrape); - var document = await context.OpenAsync(req => req.Content(html)); + parsedObject.AddFirst(new JProperty("Url", url)); - return document; + jArray.Add(parsedObject); + } + + protected virtual void HandlePaginateTask(CrawlPagePaginateTask paginate, PageCrawlJob job, List newJobs) + { + var urls = Parser!.ParseForLinks(paginate.PaginationSelector); + + Logger?.LogDebug("Paginate selector {Count} Urls found in paginate task.", urls.Length); + + var newPages = urls.Select(url => new PageCrawlJob(url, [.. job.Tasks], [.. 
job.Outputs], job.BrowserActions)); + + newJobs.AddRange(newPages); + } + + protected virtual void HandleOpenLinksTask(CrawlPageOpenLinksTask openLinks, PageCrawlJob job, List newJobs) + { + var urls = Parser!.ParseForLinks(openLinks.OpenLinksSelector); + + Logger?.LogDebug("OpenLinks selector returned {Count} Urls found in openLinks task.", urls.Length); + + foreach (var url in urls) + { + var newPageBuilder = openLinks.JobBuilder; + + newPageBuilder.WithUrl(url); + newPageBuilder.AddOutput([.. job.Outputs]); + newPageBuilder.WithTasks(job.Tasks.Where(t => t is CrawlPageExtractObjectTask).ToArray()); + + var newPage = openLinks.JobBuilder.Build(); + + newJobs.Add(newPage); + } } public void WithParser(IParser parser) @@ -131,9 +135,9 @@ public void WithParser(IParser parser) Parser = parser; } - public void WithLoggerFactory(ILoggerFactory? loggerFactory) + public void WithLoggerFactory(ILoggerFactory loggerFactory) { - Logger = loggerFactory?.CreateLogger(); + Logger = loggerFactory.CreateLogger(); } } diff --git a/CocoCrawler/CrawlerEngine.cs b/CocoCrawler/CrawlerEngine.cs index 8ac7276..0b7a0b4 100644 --- a/CocoCrawler/CrawlerEngine.cs +++ b/CocoCrawler/CrawlerEngine.cs @@ -1,4 +1,5 @@ using CocoCrawler.Crawler; +using CocoCrawler.CrawlJob; using CocoCrawler.Exceptions; using CocoCrawler.Job; using CocoCrawler.Outputs; @@ -6,43 +7,42 @@ using Newtonsoft.Json.Linq; using PuppeteerSharp; using System.Collections.Immutable; -using System.Net; namespace CocoCrawler; -public class CrawlerEngine(EngineSettings settings) +public class CrawlerEngine(EngineSettings settings, ImmutableArray jobs) { private readonly ILogger? _logger = settings.LoggerFactory?.CreateLogger(); public virtual async Task RunAsync(CancellationToken cancellationToken = default) { - var parallelOptions = new ParallelOptions - { - CancellationToken = cancellationToken, - MaxDegreeOfParallelism = settings.ParallelismDegree - }; - - await DownloadBrowser(); - await using var browser = await LaunchBrowser(settings); + await using var browser = await DownloadAndLaunchBrowser(settings); + await settings.Scheduler.Init(jobs, cancellationToken); try { if (settings.DisableParallelism) { - // Use 1 browser with 1 tab for all jobs + // 1 browser with 1 tab for all jobs using var page = await browser.NewPageAsync(); await AddUserAgent(page, settings.UserAgent); await AddCookies(page, settings.Cookies); - await foreach (var job in settings.Scheduler.GetAllAsync(cancellationToken)) + await foreach (var job in settings.Scheduler.GetAll(cancellationToken)) { await CrawlPage(page, job, settings, cancellationToken); } } else { - // Use 1 browser with a tab for each job in parallel - await Parallel.ForEachAsync(settings.Scheduler.GetAllAsync(cancellationToken), parallelOptions, async (job, token) => + var parallelOptions = new ParallelOptions + { + CancellationToken = cancellationToken, + MaxDegreeOfParallelism = settings.ParallelismDegree + }; + + // 1 browser with a tab for each job in parallel + await Parallel.ForEachAsync(settings.Scheduler.GetAll(cancellationToken), parallelOptions, async (job, token) => { using var page = await browser.NewPageAsync(); await AddUserAgent(page, settings.UserAgent); @@ -54,12 +54,12 @@ await Parallel.ForEachAsync(settings.Scheduler.GetAllAsync(cancellationToken), p } catch (CocoCrawlerPageLimitReachedException ex) { - _logger?.LogInformation("Crawl Finished. {ex}. To Increase the limit call .ConfigureEngine(o => o.TotalPagesToCrawl(...))", ex.Message); + _logger?.LogInformation("Crawl Finished. {ex}. 
To Increase the crawl limit call .ConfigureEngine(o => o.TotalPagesToCrawl(...))", ex.Message); return; } catch (OperationCanceledException) { - _logger?.LogWarning("CancellationToken is cancelled, stopping engine."); + _logger?.LogWarning("Cancelled task. Stopping engine."); return; } catch (Exception ex) @@ -77,7 +77,7 @@ protected virtual async Task AddUserAgent(IPage page, string? userAgent) } } - protected virtual async Task AddCookies(IPage page, Cookie[] cookies) + protected virtual async Task AddCookies(IPage page, ImmutableArray cookies) { if (cookies.Length > 0) { @@ -105,7 +105,7 @@ protected virtual async Task HandleNewJobs(IList newJobs, EngineSe { var jobs = newJobs.Where(ncj => !engine.IgnoreUrls.Any(iu => iu == ncj.Url)); - await engine.Scheduler.AddAsync(jobs.ToImmutableArray(), token); + await engine.Scheduler.Add(jobs.ToImmutableArray(), token); } protected virtual async Task HandleParsedResults(JArray jArray, ImmutableArray outputs, CancellationToken token) @@ -119,16 +119,6 @@ protected virtual async Task HandleParsedResults(JArray jArray, ImmutableArray LaunchBrowser(EngineSettings engineSettings) - { - var launchOptions = new LaunchOptions() - { - Headless = engineSettings.IsHeadless - }; - - return await Puppeteer.LaunchAsync(launchOptions); - } - protected virtual void AddUrlToHistoryAndCheckLimit(string url, HistoryTracker historyTracker, int maxPagesToCrawl) { if (historyTracker.GetVisitedLinksCount() >= maxPagesToCrawl) @@ -139,9 +129,16 @@ protected virtual void AddUrlToHistoryAndCheckLimit(string url, HistoryTracker h historyTracker.AddUrl(url); } - protected virtual async Task DownloadBrowser() + protected virtual async Task DownloadAndLaunchBrowser(EngineSettings settings) { var browserFetcher = new BrowserFetcher(); await browserFetcher.DownloadAsync(); + + var launchOptions = new LaunchOptions() + { + Headless = settings.IsHeadless + }; + + return await Puppeteer.LaunchAsync(launchOptions); } } \ No newline at end of file diff --git a/CocoCrawler/EngineSettings.cs b/CocoCrawler/EngineSettings.cs index fed1153..8d98809 100644 --- a/CocoCrawler/EngineSettings.cs +++ b/CocoCrawler/EngineSettings.cs @@ -1,8 +1,8 @@ using CocoCrawler.Crawler; -using CocoCrawler.Parser; +using CocoCrawler.CrawlJob; using CocoCrawler.Scheduler; using Microsoft.Extensions.Logging; -using System.Net; +using System.Collections.Immutable; namespace CocoCrawler; @@ -13,9 +13,8 @@ public record EngineSettings( int ParallelismDegree, int MaxPagesToCrawl, string? UserAgent, - IParser Parser, ICrawler Crawler, IScheduler Scheduler, ILoggerFactory? LoggerFactory, - Cookie[] Cookies, + ImmutableArray Cookies, HistoryTracker HistoryTracker); \ No newline at end of file diff --git a/CocoCrawler/Parser/AngleSharpParser.cs b/CocoCrawler/Parser/AngleSharpParser.cs index 2cb1795..84403cb 100644 --- a/CocoCrawler/Parser/AngleSharpParser.cs +++ b/CocoCrawler/Parser/AngleSharpParser.cs @@ -1,4 +1,5 @@ -using AngleSharp.Dom; +using AngleSharp; +using AngleSharp.Dom; using CocoCrawler.Job.PageTasks; using Newtonsoft.Json.Linq; @@ -6,25 +7,35 @@ namespace CocoCrawler.Parser; public class AngleSharpParser : IParser { - public virtual string[] GetUrlsFromSelector(IDocument doc, string selector) + private IDocument? 
_document; + + public virtual async Task Init(string html) + { + var config = Configuration.Default; + var context = BrowsingContext.New(config); + + _document = await context.OpenAsync(req => req.Content(html)); + } + + public virtual string[] ParseForLinks(string linksSelector) { - return doc.QuerySelectorAll(selector) + return _document!.QuerySelectorAll(linksSelector) .Select(link => link.GetAttribute("href")) .Where(link => link is not null) .Select(link => link!) .ToArray(); } - public virtual JObject ExtractObject(IDocument doc, CrawlPageExtractObjectTask task) + public virtual JObject ExtractObject(CrawlPageExtractObjectTask task) { - return ParseObject(doc.DocumentElement!, task.Selectors); + return ParseObject(_document!.DocumentElement, task.Selectors); } - public virtual JArray ExtractList(IDocument doc, CrawlPageExtractListTask scrapeList) + public virtual JArray ExtractList(CrawlPageExtractListTask scrapeList) { var jArray = new JArray(); - var containers = doc.QuerySelectorAll(scrapeList.ContentContainersSelector); + var containers = _document!.QuerySelectorAll(scrapeList.ContentContainersSelector); if (containers is null || containers.Length == 0) { diff --git a/CocoCrawler/Parser/IParser.cs b/CocoCrawler/Parser/IParser.cs index 2f365ef..033d5aa 100644 --- a/CocoCrawler/Parser/IParser.cs +++ b/CocoCrawler/Parser/IParser.cs @@ -1,12 +1,12 @@ -using AngleSharp.Dom; -using CocoCrawler.Job.PageTasks; +using CocoCrawler.Job.PageTasks; using Newtonsoft.Json.Linq; namespace CocoCrawler.Parser; public interface IParser { - string[] GetUrlsFromSelector(IDocument doc, string selector); - JArray ExtractList(IDocument doc, CrawlPageExtractListTask scrapeList); - JObject ExtractObject(IDocument doc, CrawlPageExtractObjectTask task); + Task Init(string html); + string[] ParseForLinks(string linksSelector); + JArray ExtractList(CrawlPageExtractListTask scrapeList); + JObject ExtractObject(CrawlPageExtractObjectTask task); } diff --git a/CocoCrawler/Scheduler/IScheduler.cs b/CocoCrawler/Scheduler/IScheduler.cs index 3531c82..5a24eeb 100644 --- a/CocoCrawler/Scheduler/IScheduler.cs +++ b/CocoCrawler/Scheduler/IScheduler.cs @@ -5,7 +5,8 @@ namespace CocoCrawler.Scheduler; public interface IScheduler { - IAsyncEnumerable GetAllAsync(CancellationToken cancellationToken); - Task AddAsync(PageCrawlJob job, CancellationToken cancellationToken); - Task AddAsync(ImmutableArray jobs, CancellationToken cancellationToken); + IAsyncEnumerable GetAll(CancellationToken cancellationToken); + Task Add(PageCrawlJob job, CancellationToken cancellationToken); + Task Add(ImmutableArray jobs, CancellationToken cancellationToken); + Task Init(ImmutableArray jobs, CancellationToken cancellationToken); } diff --git a/CocoCrawler/Scheduler/InMemoryScheduler.cs b/CocoCrawler/Scheduler/InMemoryScheduler.cs new file mode 100644 index 0000000..ed87d51 --- /dev/null +++ b/CocoCrawler/Scheduler/InMemoryScheduler.cs @@ -0,0 +1,63 @@ +using CocoCrawler.Job; +using System.Collections.Immutable; +using System.Threading.Channels; + +namespace CocoCrawler.Scheduler; + +public class InMemoryScheduler : IScheduler +{ + private Channel _jobChannel = Channel.CreateUnbounded(); + private readonly Timer _timer; + private readonly TimeSpan _completeAfterLastJob; + + public InMemoryScheduler(int totalSecondsTimeoutAfterJob = 120) + { + _completeAfterLastJob = TimeSpan.FromSeconds(totalSecondsTimeoutAfterJob); + _timer = new Timer(CompleteIfNoJobReceived, null, _completeAfterLastJob, _completeAfterLastJob); + } + + public virtual 
IAsyncEnumerable GetAll(CancellationToken cancellationToken) + { + return _jobChannel.Reader.ReadAllAsync(cancellationToken); + } + + public virtual async Task Add(PageCrawlJob job, CancellationToken cancellationToken) + { + await _jobChannel.Writer.WriteAsync(job, cancellationToken); + ResetTimer(); + } + + public virtual async Task Add(ImmutableArray jobs, CancellationToken cancellationToken) + { + foreach (var job in jobs) + { + await Add(job, cancellationToken); + } + } + + public virtual async Task Init(ImmutableArray jobs, CancellationToken cancellationToken) + { + _jobChannel = Channel.CreateUnbounded(); + + foreach (var job in jobs) + { + await Add(job, cancellationToken); + } + } + + private void ResetTimer() + { + _timer.Change(_completeAfterLastJob, _completeAfterLastJob); + } + + private void CompleteIfNoJobReceived(object? state) + { + Complete(); + } + + private void Complete() + { + _jobChannel.Writer.Complete(); + _timer.Change(Timeout.Infinite, Timeout.Infinite); + } +} diff --git a/CocoCrawler/Scheduler/MemoryScheduler.cs b/CocoCrawler/Scheduler/MemoryScheduler.cs deleted file mode 100644 index fc0a2f1..0000000 --- a/CocoCrawler/Scheduler/MemoryScheduler.cs +++ /dev/null @@ -1,52 +0,0 @@ -using CocoCrawler.Job; -using System.Collections.Immutable; -using System.Threading.Channels; - -namespace CocoCrawler.Scheduler; - -public class MemoryScheduler : IScheduler -{ - private readonly Channel _jobChannel = Channel.CreateUnbounded(); - private readonly Timer _timer; - private readonly TimeSpan _completeAfterLastJob = TimeSpan.FromMinutes(2); - - public MemoryScheduler() - { - _timer = new Timer(CompleteIfNoJobReceived, null, _completeAfterLastJob, _completeAfterLastJob); - } - - public IAsyncEnumerable GetAllAsync(CancellationToken cancellationToken) - { - return _jobChannel.Reader.ReadAllAsync(cancellationToken); - } - - public async Task AddAsync(PageCrawlJob job, CancellationToken cancellationToken) - { - await _jobChannel.Writer.WriteAsync(job, cancellationToken); - ResetTimer(); - } - - public async Task AddAsync(ImmutableArray jobs, CancellationToken cancellationToken) - { - foreach (var job in jobs) - { - await AddAsync(job, cancellationToken); - } - } - - private void ResetTimer() - { - _timer.Change(_completeAfterLastJob, _completeAfterLastJob); - } - - private void CompleteIfNoJobReceived(object? 
state) - { - Complete(); - } - - private void Complete() - { - _jobChannel.Writer.Complete(); - _timer.Change(Timeout.Infinite, Timeout.Infinite); - } -} diff --git a/Examples/BackgroundServiceExtractList/RedditListingBackgroundService.cs b/Examples/BackgroundServiceExtractList/RedditListingBackgroundService.cs index 0c8fa4d..b7f8268 100644 --- a/Examples/BackgroundServiceExtractList/RedditListingBackgroundService.cs +++ b/Examples/BackgroundServiceExtractList/RedditListingBackgroundService.cs @@ -24,7 +24,7 @@ public override async Task StartAsync(CancellationToken cancellationToken) new("Total Comments","a.comments"), new("Url","a.title", "href") ]) - .AddPagination("span.next-button > a.not-exist", newPage => newPage.ScrollToEnd()) + .AddPagination("span.next-button > a") .AddOutputToConsole() .AddOutputToCsvFile("results.csv") ) diff --git a/Examples/BackgroundServiceOpenLinksAndExtractObject/RedditPostsBackgroundService.cs b/Examples/BackgroundServiceOpenLinksAndExtractObject/RedditPostsBackgroundService.cs index cd3d70f..aae2941 100644 --- a/Examples/BackgroundServiceOpenLinksAndExtractObject/RedditPostsBackgroundService.cs +++ b/Examples/BackgroundServiceOpenLinksAndExtractObject/RedditPostsBackgroundService.cs @@ -18,10 +18,10 @@ public override async Task StartAsync(CancellationToken cancellationToken) { _crawlerEngine = await new CrawlerEngineBuilder() .AddPage("https://old.reddit.com/r/csharp", pageOptions => pageOptions - .OpenLinks(linksSelector: "div.thing.link.self a.byselector: link.comments", subPageOptions => subPageOptions + .OpenLinks(linksSelector: "div.thing ul.buttons li:first-of-type a.comments", subPageOptions => subPageOptions .ExtractObject([ new("Title","div.sitetable.linklisting a.title"), - new("Url","div.sitetable.linklisting a.title", "href"), + new("Body", "usertext-body"), new("Upvotes", "div.sitetable.linklisting div.score.unvoted"), new("Top comment", "div.commentarea div.entry.unvoted div.md"), ])) diff --git a/README.md b/README.md index 84e10dd..4eb7fee 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,9 @@ It's possible to add multiple pages to scrape with the same Tasks. await crawlerEngine.RunAsync(cancellationToken); ``` -This example starts at `https://old.reddit.com/r/csharp` and `https://old.reddit.com/r/dotnet` and opens each post and scrapes the title, url, upvotes and top comment. It also scrolls to the end of the page and waits 4 seconds before scraping the page. And it continues to the next page. +This example starts at `https://old.reddit.com/r/csharp` and `https://old.reddit.com/r/dotnet` and opens each post and scrapes the title, url, upvotes and top comment. It also scrolls to the end of the page and waits 4 seconds before scraping the page. And then it continues with the next pagination page. 
+ + ## Configuring the Engine @@ -94,6 +96,20 @@ The engine can be configured with the following options: * `TotalPagesToCrawl(int total)`: The total number of pages to crawl * `WithParallelismDegree(int parallelismDegree)` : The number of pages to crawl in parallel +## Cookies + +It's possible to add cookies to all request + +```csharp +.ConfigureEngine(options => +{ + options.WithCookies([ + new("auth-cookie", "l;alqpekcoizmdfugnvkjgvsaaprufc", "thedomain.com"), + new("Cookie2", "def", "localhost") + ]); +}) +``` + ### Stopping the engine The engine stops when the diff --git a/Tests/CocoCrawler.IntegrationTests/CocoCrawler.IntegrationTests.csproj b/Tests/CocoCrawler.IntegrationTests/CocoCrawler.IntegrationTests.csproj new file mode 100644 index 0000000..cbf362b --- /dev/null +++ b/Tests/CocoCrawler.IntegrationTests/CocoCrawler.IntegrationTests.csproj @@ -0,0 +1,44 @@ + + + + net8.0 + enable + enable + + false + true + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + + + + + + + Always + + + Always + + + + diff --git a/Tests/CocoCrawler.IntegrationTests/ConfigureEngine/CookiesTest.cs b/Tests/CocoCrawler.IntegrationTests/ConfigureEngine/CookiesTest.cs new file mode 100644 index 0000000..034bb08 --- /dev/null +++ b/Tests/CocoCrawler.IntegrationTests/ConfigureEngine/CookiesTest.cs @@ -0,0 +1,45 @@ +using CocoCrawler.Builders; +using CocoCrawler.Scheduler; +using FluentAssertions; +using WireMock.RequestBuilders; +using WireMock.ResponseBuilders; +using WireMock.Server; + +namespace CocoCrawler.IntegrationTests.Engine; + +public class CookiesTest +{ + private readonly WireMockServer _wireMockServer = WireMockServer.Start(); + + [Fact] + public async Task Cookies_Should_Be_Send_To_The_Client() + { + // Arange + _wireMockServer.Given(Request.Create().WithUrl($"{_wireMockServer.Url}/cookies")) + .RespondWith(Response.Create().WithSuccess()); + + var crawlerEngine = await new CrawlerEngineBuilder() + .AddPage($"{_wireMockServer.Url}/cookies", options => options.ExtractObject([new("No exist", "div.test")])) + .ConfigureEngine(ops => + { + ops.WithScheduler(new InMemoryScheduler(totalSecondsTimeoutAfterJob: 2)); + ops.WithCookies([ + new("Cookie1","abc", "localhost"), + new("Cookie2","def", "localhost") + ]); + }) + .BuildAsync(); + + // Act + await crawlerEngine.RunAsync(); + + // Assert + var cookiesSent = _wireMockServer.LogEntries.First()?.RequestMessage.Cookies; + + cookiesSent.Should().BeEquivalentTo(new Dictionary() + { + { "Cookie1", "abc" }, + { "Cookie2", "def" } + }); + } +} diff --git a/Tests/CocoCrawler.IntegrationTests/ConfigureEngine/ThrowOnBuilderExceptions.cs b/Tests/CocoCrawler.IntegrationTests/ConfigureEngine/ThrowOnBuilderExceptions.cs new file mode 100644 index 0000000..6410647 --- /dev/null +++ b/Tests/CocoCrawler.IntegrationTests/ConfigureEngine/ThrowOnBuilderExceptions.cs @@ -0,0 +1,92 @@ +using CocoCrawler.Builders; +using CocoCrawler.Exceptions; +using CocoCrawler.Scheduler; +using FluentAssertions; +using WireMock.Server; + +namespace CocoCrawler.IntegrationTests.Engine; + +public class ThrowOnBuilderExceptions +{ + private readonly WireMockServer _wireMockServer = WireMockServer.Start(); + + [Fact] + public async Task ShouldThrow_When_NoPagesAdded() + { + // Arange + var crawlerEngine = new CrawlerEngineBuilder(); + + // Act + async Task act() => await crawlerEngine.BuildAsync(); + + // Assert + var ex = await Assert.ThrowsAsync((Func>)act); 
+ + ex.Message.Should().Be("At least one Page is required to build the engine. Try calling .AddPage() to add pages."); + } + + [Fact] + public void ShouldThrow_When_NotValidUri() + { + // Arange + var crawlerEngine = new CrawlerEngineBuilder(); + + // Act + CrawlerEngineBuilder act() => crawlerEngine.AddPage("notvalid", _ => { }); + + // Assert + var ex = Assert.Throws(act); + + ex.Message.Should().Be("notvalid is not a valid URI."); + } + + [Fact] + public async Task ShouldThrow_When_PagesDontHaveTask() + { + // Arange + var crawlerEngine = new CrawlerEngineBuilder() + .AddPage("https://localhost:5000", _ => { }); + + // Act + async Task act() => await crawlerEngine.BuildAsync(); + + // Assert + var ex = await Assert.ThrowsAsync((Func>)act); + + ex.Message.Should().Be("A Page requires a purpose, try calling .OpenLinks() or .ExtractObject() or AddPagination()."); + } + + [Fact] + public async Task ShouldThrow_When_InvalidParallelismDegree() + { + // Arange + var crawlerEngine = new CrawlerEngineBuilder() + .AddPage("https://localhost:5000", o => o.ExtractObject([new("","")])) + .ConfigureEngine(x => x.WithParallelismDegree(-1)); + + // Act + async Task act() => await crawlerEngine.BuildAsync(); + + // Assert + var ex = await Assert.ThrowsAsync((Func>)act); + + ex.Message.Should().Be("The parallelism degree must be greater than or equal to 1."); + } + + [Fact] + public async Task ShouldThrow_When_InvalidTotalPagesToCrawl() + { + // Arange + var crawlerEngine = new CrawlerEngineBuilder() + .AddPage("https://localhost:5000", o => o.ExtractObject([new("", "")])) + .ConfigureEngine(x => x.TotalPagesToCrawl(-1)); + + // Act + async Task act() => await crawlerEngine.BuildAsync(); + + // Assert + var ex = await Assert.ThrowsAsync((Func>)act); + + ex.Message.Should().Be("The total pages to crawl must be greater than or equal to 1."); + } +} diff --git a/Tests/CocoCrawler.IntegrationTests/ConfigureEngine/UserAgentTests.cs b/Tests/CocoCrawler.IntegrationTests/ConfigureEngine/UserAgentTests.cs new file mode 100644 index 0000000..2d46b22 --- /dev/null +++ b/Tests/CocoCrawler.IntegrationTests/ConfigureEngine/UserAgentTests.cs @@ -0,0 +1,34 @@ +using CocoCrawler.Builders; +using CocoCrawler.Scheduler; +using FluentAssertions; +using WireMock.RequestBuilders; +using WireMock.ResponseBuilders; +using WireMock.Server; + +namespace CocoCrawler.IntegrationTests.Engine; + +public class UserAgentTests +{ + private readonly WireMockServer _wireMockServer = WireMockServer.Start(); + + [Fact] + public async Task UserAgent_Should_Be_Overwritten() + { + // Arange + _wireMockServer.Given(Request.Create().WithUrl($"{_wireMockServer.Url}/useragent")) + .RespondWith(Response.Create().WithSuccess()); + + var crawlerEngine = await new CrawlerEngineBuilder() + .AddPage($"{_wireMockServer.Url}/useragent", options => options.ExtractObject([new("No exist", "div.test")])) + .ConfigureEngine(ops => ops.WithUserAgent("mock user agent aka not chrome").WithScheduler(new InMemoryScheduler(totalSecondsTimeoutAfterJob: 2))) + .BuildAsync(); + + // Act + await crawlerEngine.RunAsync(); + + // Assert + var userAgentUsed = _wireMockServer.LogEntries.First()?.RequestMessage.Headers?["User-Agent"]; + + userAgentUsed.Should().BeEquivalentTo("mock user agent aka not chrome"); + } +} diff --git a/Tests/CocoCrawler.IntegrationTests/ExtractListAndPaginate/ExtractListAndPaginateTests.cs b/Tests/CocoCrawler.IntegrationTests/ExtractListAndPaginate/ExtractListAndPaginateTests.cs new file mode 100644 index 0000000..e10717c --- /dev/null +++ 
b/Tests/CocoCrawler.IntegrationTests/ExtractListAndPaginate/ExtractListAndPaginateTests.cs @@ -0,0 +1,59 @@ +using CocoCrawler.Builders; +using FluentAssertions; +using WireMock.RequestBuilders; +using WireMock.ResponseBuilders; +using WireMock.Server; + +namespace CocoCrawler.IntegrationTests.ExtractListAndPaginate; + +public class ExtractObjectAndPaginateTests +{ + private readonly WireMockServer _wireMockServer = WireMockServer.Start(port: 9090); + + [Fact] + public async Task ExtractListAndPaginate_ShouldHaveDetailsInFile_OnHappyFlow() + { + // Arange + _wireMockServer.Given(Request.Create().WithUrl("http://localhost:9090/main-page")) + .RespondWith(Response.Create() + .WithHeader("Content-Type", "text/xml; charset=utf-8") + .WithBodyFromFile("ExtractListAndPaginate\\Responses\\main-page.html")); + + _wireMockServer.Given(Request.Create().WithUrl("http://localhost:9090/page-2")) + .RespondWith(Response.Create() + .WithHeader("Content-Type", "text/xml; charset=utf-8") + .WithBodyFromFile("ExtractListAndPaginate\\Responses\\page-2.html")); + + var crawlerEngine = await new CrawlerEngineBuilder() + .AddPage("http://localhost:9090/main-page", options => options + .ExtractList("div.content.test > div.listing", [ + new("Title", "div.title"), + new("Description", "div.description"), + new("Amount", "span.amount"), + new("Link", "a", "href") + ]) + .AddPagination("div.pagination a:nth-last-child(1)") + .AddOutputToCsvFile("ExtractListAndPaginate\\Results\\resultstest1.csv")) + .ConfigureEngine(ops => ops.TotalPagesToCrawl(2)) + .BuildAsync(); + + // Act + await crawlerEngine.RunAsync(); + + // Assert + var outputContents = File.ReadAllText("ExtractListAndPaginate\\Results\\resultstest1.csv"); + + var expect = @"Title,Description,Amount,Link +Title One,Description1,10,/linkone +Title Two,Description2,20,/linktwo +Title Three,Description3,30,/linkthree +Title Four,Description4,40,/linkfour +Title Five,Description5,5,/linkfive +Title Six,Description6,6,/linksix +Title Seven,Description7,7,/linkseven +Title Eight,Description8,8,/linkeight +"; + + outputContents.Should().BeEquivalentTo(expect); + } +} \ No newline at end of file diff --git a/Tests/CocoCrawler.IntegrationTests/ExtractListAndPaginate/Responses/main-page.html b/Tests/CocoCrawler.IntegrationTests/ExtractListAndPaginate/Responses/main-page.html new file mode 100644 index 0000000..752590b --- /dev/null +++ b/Tests/CocoCrawler.IntegrationTests/ExtractListAndPaginate/Responses/main-page.html @@ -0,0 +1,43 @@ + + + + + + Test A + + +
+<body>
+    <div class="content test">
+        <div class="listing">
+            <div class="title">Title One</div>
+            <div class="description">Description1</div>
+            <span class="amount">10</span>
+            <a href="/linkone">Link</a>
+        </div>
+        <div class="listing">
+            <div class="title">Title Two</div>
+            <div class="description">Description2</div>
+            <span class="amount">20</span>
+            <a href="/linktwo">Link</a>
+        </div>
+        <div class="listing">
+            <div class="title">Title Three</div>
+            <div class="description">Description3</div>
+            <span class="amount">30</span>
+            <a href="/linkthree">Link</a>
+        </div>
+        <div class="listing">
+            <div class="title">Title Four</div>
+            <div class="description">Description4</div>
+            <span class="amount">40</span>
+            <a href="/linkfour">Link</a>
+        </div>
+        <div class="pagination">
+            <a href="http://localhost:9090/page-2">Next</a>
+        </div>
+    </div>
+</body>
+</html>
diff --git a/Tests/CocoCrawler.IntegrationTests/ExtractListAndPaginate/Responses/page-2.html b/Tests/CocoCrawler.IntegrationTests/ExtractListAndPaginate/Responses/page-2.html
new file mode 100644
index 0000000..b6457c4
--- /dev/null
+++ b/Tests/CocoCrawler.IntegrationTests/ExtractListAndPaginate/Responses/page-2.html
@@ -0,0 +1,43 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8" />
+    <title>Test A</title>
+</head>
+<body>
+    <div class="content test">
+        <div class="listing">
+            <div class="title">Title Five</div>
+            <div class="description">Description5</div>
+            <span class="amount">5</span>
+            <a href="/linkfive">Link</a>
+        </div>
+        <div class="listing">
+            <div class="title">Title Six</div>
+            <div class="description">Description6</div>
+            <span class="amount">6</span>
+            <a href="/linksix">Link</a>
+        </div>
+        <div class="listing">
+            <div class="title">Title Seven</div>
+            <div class="description">Description7</div>
+            <span class="amount">7</span>
+            <a href="/linkseven">Link</a>
+        </div>
+        <div class="listing">
+            <div class="title">Title Eight</div>
+            <div class="description">Description8</div>
+            <span class="amount">8</span>
+            <a href="/linkeight">Link</a>
+        </div>
+    </div>
+ + + diff --git a/Tests/CocoCrawler.IntegrationTests/OpenLinksExtractObjectAndPaginate/OpenLinksExtractObjectAndPaginate.cs b/Tests/CocoCrawler.IntegrationTests/OpenLinksExtractObjectAndPaginate/OpenLinksExtractObjectAndPaginate.cs new file mode 100644 index 0000000..bc75b5a --- /dev/null +++ b/Tests/CocoCrawler.IntegrationTests/OpenLinksExtractObjectAndPaginate/OpenLinksExtractObjectAndPaginate.cs @@ -0,0 +1,146 @@ +using CocoCrawler.Builders; +using CocoCrawler.Scheduler; +using FluentAssertions; +using WireMock.RequestBuilders; +using WireMock.ResponseBuilders; +using WireMock.Server; + +namespace CocoCrawler.IntegrationTests.ExtractListAndPaginate; + +public class OpenLinksExtractObjectAndPaginate +{ + private readonly WireMockServer _wireMockServer = WireMockServer.Start(port: 9010); + + [Fact] + public async Task ExtractListAndPaginate_ShouldHaveDetailsInFile_OnHappyFlow() + { + // Arange + foreach (var index in Enumerable.Range(1, 10)) + { + _wireMockServer.Given(Request.Create().WithUrl($"http://localhost:9010/main-page-{index}")) + .RespondWith(Response.Create() + .WithHeader("Content-Type", "text/xml; charset=utf-8") + .WithBody(GetListingHtmlPages(index))); + } + + foreach (var index in Enumerable.Range(1, 10 * 3)) + { + _wireMockServer.Given(Request.Create().WithUrl($"http://localhost:9010/content-page-{index}")) + .RespondWith(Response.Create() + .WithHeader("Content-Type", "text/xml; charset=utf-8") + .WithBody(GetContentPage(index))); + } + + var crawlerEngine = await new CrawlerEngineBuilder() + .AddPage("http://localhost:9010/main-page-1", options => options + .OpenLinks("div.content.test a.link", newPage => newPage + .ExtractObject([ + new("Title", "div.content.test div.title"), + new("Description", "div.content.test div.description"), + new("Amount", "span.amount"), + new("Link", "a", "href") + ])) + .AddPagination("div.pagination a:nth-last-child(1)") + .AddOutputToCsvFile("OpenLinksExtractObjectAndPaginate\\Results\\resultstest1.csv")) + .ConfigureEngine(e => e + .WithScheduler(new InMemoryScheduler(totalSecondsTimeoutAfterJob: 5)) + .WithIgnoreUrls(["http://localhost:9010/content-page-6"]) + .DisableParallelism()) + .BuildAsync(); + + // Act + await crawlerEngine.RunAsync(); + + // Assert + var outputContents = File.ReadAllText("OpenLinksExtractObjectAndPaginate\\Results\\resultstest1.csv"); + + var expect = @"Url,Title,Description,Amount,Link +http://localhost:9010/content-page-1,Title 1,Description1,Amount 10,link1 +http://localhost:9010/content-page-2,Title 2,Description2,Amount 20,link2 +http://localhost:9010/content-page-3,Title 3,Description3,Amount 30,link3 +http://localhost:9010/content-page-4,Title 4,Description4,Amount 40,link4 +http://localhost:9010/content-page-5,Title 5,Description5,Amount 50,link5 +http://localhost:9010/content-page-7,Title 7,Description7,Amount 70,link7 +http://localhost:9010/content-page-8,Title 8,Description8,Amount 80,link8 +http://localhost:9010/content-page-9,Title 9,Description9,Amount 90,link9 +http://localhost:9010/content-page-10,Title 10,Description10,Amount 100,link10 +http://localhost:9010/content-page-11,Title 11,Description11,Amount 110,link11 +http://localhost:9010/content-page-12,Title 12,Description12,Amount 120,link12 +http://localhost:9010/content-page-13,Title 13,Description13,Amount 130,link13 +http://localhost:9010/content-page-14,Title 14,Description14,Amount 140,link14 +http://localhost:9010/content-page-15,Title 15,Description15,Amount 150,link15 +http://localhost:9010/content-page-16,Title 
16,Description16,Amount 160,link16 +http://localhost:9010/content-page-17,Title 17,Description17,Amount 170,link17 +http://localhost:9010/content-page-18,Title 18,Description18,Amount 180,link18 +http://localhost:9010/content-page-19,Title 19,Description19,Amount 190,link19 +http://localhost:9010/content-page-20,Title 20,Description20,Amount 200,link20 +http://localhost:9010/content-page-21,Title 21,Description21,Amount 210,link21 +http://localhost:9010/content-page-22,Title 22,Description22,Amount 220,link22 +http://localhost:9010/content-page-23,Title 23,Description23,Amount 230,link23 +http://localhost:9010/content-page-24,Title 24,Description24,Amount 240,link24 +http://localhost:9010/content-page-25,Title 25,Description25,Amount 250,link25 +http://localhost:9010/content-page-26,Title 26,Description26,Amount 260,link26 +http://localhost:9010/content-page-27,Title 27,Description27,Amount 270,link27 +http://localhost:9010/content-page-28,Title 28,Description28,Amount 280,link28 +http://localhost:9010/content-page-29,Title 29,Description29,Amount 290,link29 +http://localhost:9010/content-page-30,Title 30,Description30,Amount 300,link30 +"; + + outputContents.Should().BeEquivalentTo(expect); + } + + private static string GetContentPage(int index) + { + return $@" + + + + Test A + + +
+<body>
+    <div class=""content test"">
+        <div class=""title"">Title {index}</div>
+        <div class=""description"">Description{index}</div>
+        <span class=""amount"">Amount {index * 10}</span>
+        <a href=""link{index}"">Link</a>
+    </div>
+</body>
+</html>";
+    }
+
+    private static string GetListingHtmlPages(int index)
+    {
+        int start = (index - 1) * 3;
+
+        return $@"<!DOCTYPE html>
+<html>
+<head>
+    <meta charset=""utf-8"" />
+    <title>Test A</title>
+</head>
+<body>
+    <div class=""content test"">
+        <div class=""listing"">
+            <div class=""title"">Title {index}</div>
+            <a class=""link"" href=""http://localhost:9010/content-page-{start + 1}"">Link</a>
+        </div>
+        <div class=""listing"">
+            <div class=""title"">Title Two</div>
+            <a class=""link"" href=""http://localhost:9010/content-page-{start + 2}"">Link</a>
+        </div>
+        <div class=""listing"">
+            <div class=""title"">Title Three</div>
+            <a class=""link"" href=""http://localhost:9010/content-page-{start + 3}"">Link</a>
+        </div>
+        <div class=""pagination"">
+            {(index == 10 ? "" : $"<a href=\"http://localhost:9010/main-page-{index + 1}\">Next</a>")}
+        </div>
+    </div>
+ + +"; + } +} \ No newline at end of file diff --git a/Tests/CocoCrawler.Tests.csproj b/Tests/CocoCrawler.Tests.csproj deleted file mode 100644 index 9c5b30a..0000000 --- a/Tests/CocoCrawler.Tests.csproj +++ /dev/null @@ -1,23 +0,0 @@ - - - - net8.0 - enable - enable - - false - true - - - - - - - - - - - - - - diff --git a/Tests/UnitTest1.cs b/Tests/UnitTest1.cs deleted file mode 100644 index d92e0ad..0000000 --- a/Tests/UnitTest1.cs +++ /dev/null @@ -1,10 +0,0 @@ -namespace CucaCrawlTests; - -public class UnitTest1 -{ - [Fact] - public void Test1() - { - Assert.True(true); - } -} \ No newline at end of file