Async streaming of sitemaps
drmathias committed Sep 2, 2023
1 parent 981423b commit 6318842
Showing 17 changed files with 428 additions and 323 deletions.
16 changes: 10 additions & 6 deletions README.md
@@ -2,7 +2,7 @@ Table of Contents
===

- [Overview](#overview)
- [Why Build Yet Another Parser?](#why-build-yet-another-parser)
- [Design Considerations](#design-considerations)
- [Features](#features)
- [Usage](#usage)
- [Minimal Example](#minimal-example)
@@ -29,12 +29,12 @@ Supports the proposed [RFC9309](https://datatracker.ietf.org/doc/html/rfc9309) s
- Host
- Crawl-delay

# Why Build Yet Another Parser?

There are several _robots.txt_ and _sitemap_ parsers that already exist; however, they all suffer from a lack of flexibility.
## Design Considerations

This library is based upon `HttpClient`, making it very familiar, easy to use and adaptable to your needs. Since you have full control over the `HttpClient`, you are able to configure custom message handlers to intercept outgoing requests and responses. For example, you may want to add custom headers on a request, configure additional logging or set up a retry policy.
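
As a rough sketch of the kind of customisation this enables (the `LoggingHandler` below is hypothetical, and `GitHubWebsite` is the example website type used later in this README):

```csharp
using System;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;

// Hand the robot web client an HttpClient wired with a custom message handler
var httpClient = new HttpClient(new LoggingHandler(new HttpClientHandler()));
var robotWebClient = new RobotWebClient<GitHubWebsite>(httpClient);

// Hypothetical delegating handler that logs every outgoing request
// before passing it along the handler chain
class LoggingHandler : DelegatingHandler
{
    public LoggingHandler(HttpMessageHandler inner) : base(inner) { }

    protected override Task<HttpResponseMessage> SendAsync(
        HttpRequestMessage request, CancellationToken cancellationToken)
    {
        Console.WriteLine($"{request.Method} {request.RequestUri}");
        return base.SendAsync(request, cancellationToken);
    }
}
```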

Some websites can have very large sitemaps. For this reason, async streaming is supported as the preferred way of parsing sitemaps.

There is also the possibility to extend this library to support protocols other than HTTP, such as FTP.

# Features
@@ -53,6 +53,8 @@ There is also the possibility to extend this library to support protocols other
| Atom 0.3/1.0 feeds | ❌ | 0.8 |
| Sitemaps XML format | ✔️ | |
| Simple text sitemaps | ✔️ | |
| Async streaming of sitemaps | ✔️ | |
| Cancellation token support | ✔️ | |
| Memory management | ✔️ | |

# Usage
@@ -136,9 +138,11 @@ var robotWebClient = new RobotWebClient<GitHubWebsite>(httpClient);
var robotsTxt = await robotWebClient.LoadRobotsTxtAsync();
// providing a datetime only retrieves sitemap items modified since this datetime
var modifiedSince = new DateTime(2023, 01, 01);
// sitemaps are scanned recursively and combined into a single Sitemap object
// sitemaps are iterated asynchronously
// even if robots.txt does not contain a sitemap directive, this looks for a sitemap at {TWebsite.BaseAddress}/sitemap.xml
var sitemap = await robotsTxt.LoadSitemapAsync(modifiedSince);
await foreach(var item in robotsTxt.LoadSitemapAsync(modifiedSince))
{
}
```
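
Because the sitemap is now exposed as an `IAsyncEnumerable<UrlSetItem>`, a cancellation token supplied to `LoadSitemapAsync` flows through the whole enumeration, including the underlying HTTP requests. A minimal sketch, reusing `robotsTxt` and `modifiedSince` from the example above:

```csharp
// stop enumerating (and abort any in-flight request) after 30 seconds
using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));

await foreach (var item in robotsTxt.LoadSitemapAsync(modifiedSince, cts.Token))
{
    Console.WriteLine(item);
}
```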

## Checking a Rule
74 changes: 37 additions & 37 deletions src/Robots.Txt.Parser/Http/RobotWebClient.cs
@@ -2,6 +2,7 @@
using System.Collections.Generic;
using System.Net.Http;
using System.Net.Mime;
using System.Runtime.CompilerServices;
using System.Threading;
using System.Threading.Tasks;

@@ -76,49 +77,48 @@ the 500-599 range.
return new RobotsTxt(this, userAgentRules, new Dictionary<ProductToken, int>(), null, new HashSet<Uri>());
}

var stream = await response.Content.ReadAsStreamAsync(cancellationToken);
await using var stream = await response.Content.ReadAsStreamAsync(cancellationToken);
return await new RobotsTxtParser(this).ReadFromStreamAsync(stream, cancellationToken);
}

async Task<Sitemap?> IRobotClient.LoadSitemapsAsync(IEnumerable<Uri> uris, DateTime? modifiedSince, CancellationToken cancellationToken)
async IAsyncEnumerable<UrlSetItem> IRobotClient.LoadSitemapsAsync(Uri uri, DateTime? modifiedSince, [EnumeratorCancellation] CancellationToken cancellationToken)
{
Sitemap? sitemap = null;
var request = new HttpRequestMessage(HttpMethod.Get, uri);
request.Headers.Add("Accept", "application/xml,text/plain,text/xml,*/*");
var response = await _httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, cancellationToken);
if (!response.IsSuccessStatusCode) yield break;

foreach (var uri in uris)
{
var request = new HttpRequestMessage(HttpMethod.Get, uri);
request.Headers.Add("Accept", "application/xml,text/plain,text/xml,*/*");
var response = await _httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, cancellationToken);
if (!response.IsSuccessStatusCode) return null;
using var stream = await response.Content.ReadAsStreamAsync(cancellationToken);

var parsedSitemap = response.Content.Headers.ContentType?.MediaType switch
{
MediaTypeNames.Text.Plain => await SimpleTextSitemapParser.ReadFromStreamAsync(stream, cancellationToken),
MediaTypeNames.Text.Xml or MediaTypeNames.Application.Xml or _
=> await SitemapParser.ReadFromStreamAsync(stream, modifiedSince, cancellationToken)
};

if (parsedSitemap is null)
{
continue;
}
await using var stream = await response.Content.ReadAsStreamAsync(cancellationToken);

if (sitemap is null)
{
sitemap = parsedSitemap;
continue;
}

if (parsedSitemap is SitemapIndex sitemapRoot)
{
var sitemaps = await (this as IRobotWebClient).LoadSitemapsAsync(sitemapRoot.SitemapUris, modifiedSince, cancellationToken);
if (sitemaps is not null) sitemap = sitemaps.Combine(sitemaps);
}

sitemap = sitemap.Combine(parsedSitemap);
switch (response.Content.Headers.ContentType?.MediaType)
{
case MediaTypeNames.Text.Plain:
await foreach (var urlSet in SimpleTextSitemapParser.ReadFromStreamAsync(stream, cancellationToken))
{
yield return urlSet;
}
yield break;
case MediaTypeNames.Text.Xml or MediaTypeNames.Application.Xml:
default:
var sitemap = await SitemapParser.ReadFromStreamAsync(stream, modifiedSince, cancellationToken);
if (sitemap is SitemapIndex index)
{
await foreach (var location in index.SitemapUris)
{
await foreach (var item in (this as IRobotClient).LoadSitemapsAsync(location, modifiedSince, cancellationToken))
{
yield return item;
}
}
}
else
{
await foreach (var item in sitemap.UrlSet)
{
yield return item;
}
}
yield break;
}

return sitemap;
}
}
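
The `[EnumeratorCancellation]` attribute that appears in the new signature is what lets a token passed at enumeration time, via `WithCancellation`, reach the iterator body. A self-contained sketch of the mechanism, separate from this library:

```csharp
using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Threading;
using System.Threading.Tasks;

static class Demo
{
    // A token passed directly, or supplied later via WithCancellation(),
    // is merged into this parameter thanks to [EnumeratorCancellation]
    public static async IAsyncEnumerable<int> CountAsync(
        int count, [EnumeratorCancellation] CancellationToken cancellationToken = default)
    {
        for (var i = 0; i < count; i++)
        {
            await Task.Delay(100, cancellationToken);
            yield return i;
        }
    }

    public static async Task RunAsync()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(1));
        // throws OperationCanceledException once the token fires
        await foreach (var n in CountAsync(100).WithCancellation(cts.Token))
        {
            Console.WriteLine(n);
        }
    }
}
```
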
2 changes: 1 addition & 1 deletion src/Robots.Txt.Parser/IRobotClient.cs
@@ -19,5 +19,5 @@ public interface IRobotClient
/// <exception cref="HttpRequestException">Thrown if a status code that cannot be handled is returned.</exception>
Task<IRobotsTxt> LoadRobotsTxtAsync(CancellationToken cancellationToken = default);

protected internal Task<Sitemap?> LoadSitemapsAsync(IEnumerable<Uri> uris, DateTime? modifiedSince, CancellationToken cancellationToken);
protected internal IAsyncEnumerable<UrlSetItem> LoadSitemapsAsync(Uri uri, DateTime? modifiedSince = null, CancellationToken cancellationToken = default);
}
23 changes: 12 additions & 11 deletions src/Robots.Txt.Parser/ISitemap.cs
@@ -11,35 +11,36 @@ public interface ISitemap
/// <summary>
/// Url set included in the Sitemap
/// </summary>
HashSet<UrlSetItem> UrlSet { get; }
IAsyncEnumerable<UrlSetItem> UrlSet { get; }
}

/// <summary>
/// Describes a Sitemap
/// </summary>
public class Sitemap : ISitemap
{
public Sitemap(HashSet<UrlSetItem> urlSet)
public Sitemap(IAsyncEnumerable<UrlSetItem> urlSet)
{
UrlSet = urlSet;
}

/// <inheritdoc />
public HashSet<UrlSetItem> UrlSet { get; }

internal Sitemap Combine(Sitemap other)
{
UrlSet.UnionWith(other.UrlSet);
return this;
}
public IAsyncEnumerable<UrlSetItem> UrlSet { get; }
}

internal class SitemapIndex : Sitemap
{
public SitemapIndex(HashSet<Uri> sitemapUris) : base(new HashSet<UrlSetItem>())
public SitemapIndex(IAsyncEnumerable<Uri> sitemapUris) : base(Empty<UrlSetItem>())
{
SitemapUris = sitemapUris;
}

public HashSet<Uri> SitemapUris { get; }
public IAsyncEnumerable<Uri> SitemapUris { get; }

#pragma warning disable CS1998
private static async IAsyncEnumerable<T> Empty<T>()
#pragma warning restore CS1998
{
yield break;
}
}
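
The `Empty<T>()` helper above exists because an async iterator containing no `await` raises warning CS1998, hence the pragma. If taking a dependency on the System.Linq.Async package is acceptable, the same thing is available off the shelf (an alternative, not what this commit uses):

```csharp
using System.Linq;

// AsyncEnumerable.Empty<T>() from the System.Linq.Async package: an empty
// IAsyncEnumerable<T> without a hand-rolled iterator or pragma
var none = AsyncEnumerable.Empty<UrlSetItem>();
```
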
19 changes: 13 additions & 6 deletions src/Robots.Txt.Parser/RobotsTxt.cs
@@ -1,8 +1,8 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Threading;
using System.Threading.Tasks;

namespace Robots.Txt.Parser;

@@ -17,7 +17,7 @@ public interface IRobotsTxt
/// <param name="modifiedSince">Filter to retrieve site maps modified after this date</param>
/// <param name="cancellationToken">Cancellation token</param>
/// <returns>A sitemap, or null if no sitemap is found</returns>
ValueTask<ISitemap?> LoadSitemapAsync(DateTime? modifiedSince = default, CancellationToken cancellationToken = default);
IAsyncEnumerable<UrlSetItem> LoadSitemapAsync(DateTime? modifiedSince = default, CancellationToken cancellationToken = default);

/// <summary>
/// Retrieves the crawl delay specified for a User-Agent
@@ -71,10 +71,17 @@ internal RobotsTxt(IRobotClient client,
}

/// <inheritdoc />
public async ValueTask<ISitemap?> LoadSitemapAsync(DateTime? modifiedSince = default, CancellationToken cancellationToken = default)
=> _sitemapUrls.Count != 0
? await _client.LoadSitemapsAsync(_sitemapUrls, modifiedSince, cancellationToken)
: await _client.LoadSitemapsAsync(new[] { new Uri(_client.BaseAddress, "/sitemap.xml") }, modifiedSince, cancellationToken);
public async IAsyncEnumerable<UrlSetItem> LoadSitemapAsync(DateTime? modifiedSince = default, [EnumeratorCancellation] CancellationToken cancellationToken = default)
{
var urls = _sitemapUrls.Count != 0 ? _sitemapUrls.AsEnumerable() : new[] { new Uri(_client.BaseAddress, "/sitemap.xml") };
foreach (var url in urls)
{
await foreach (var item in _client.LoadSitemapsAsync(url, modifiedSince, cancellationToken))
{
yield return item;
}
}
}

/// <inheritdoc />
public bool TryGetCrawlDelay(ProductToken userAgent, out int crawlDelay)
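
The trailing context lines show `TryGetCrawlDelay`, which follows the standard try-pattern. A usage sketch; `ProductToken.Parse` is an assumption about how tokens are constructed, not something this diff confirms:

```csharp
// ProductToken.Parse is assumed here for illustration; the delay is
// treated as seconds, per the usual crawl-delay convention
var userAgent = ProductToken.Parse("MyCrawler");
if (robotsTxt.TryGetCrawlDelay(userAgent, out var crawlDelay))
{
    await Task.Delay(TimeSpan.FromSeconds(crawlDelay));
}
```
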
3 changes: 3 additions & 0 deletions src/Robots.Txt.Parser/RobotsTxtException.cs
@@ -1,11 +1,13 @@
using System;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.Serialization;

namespace Robots.Txt.Parser;

/// <summary>
/// Exception raised when parsing a robots.txt file
/// </summary>
[Serializable]
public class RobotsTxtException : Exception
{
internal RobotsTxtException()
@@ -20,6 +22,7 @@ internal RobotsTxtException(string? message, Exception? innerException) : base(m
{
}

[ExcludeFromCodeCoverage]
protected RobotsTxtException(SerializationInfo info, StreamingContext context) : base(info, context)
{
}
88 changes: 44 additions & 44 deletions src/Robots.Txt.Parser/SimpleTextSitemapParser.cs
@@ -1,8 +1,8 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Runtime.CompilerServices;
using System.Threading;
using System.Threading.Tasks;

namespace Robots.Txt.Parser;

@@ -11,55 +11,55 @@ namespace Robots.Txt.Parser;
/// </summary>
public static class SimpleTextSitemapParser
{
private const int MaxLines = 50000;
private const int ByteCount50MiB = 52_428_800;
private const int MaxLines = 50000;
private const int ByteCount50MiB = 52_428_800;

/// <summary>
/// Parses a <see cref="Sitemap"/> from a <see cref="Stream"/>
/// </summary>
/// <param name="stream">Sitemap document stream</param>
/// <param name="cancellationToken">Cancellation token</param>
/// <returns>The parsed <see cref="Sitemap"/></returns>
/// <exception cref="SitemapException">Raised when there is an error parsing the Sitemap</exception>
public static async Task<Sitemap> ReadFromStreamAsync(Stream stream, CancellationToken cancellationToken = default)
/// <summary>
/// Parses a <see cref="Sitemap"/> from a <see cref="Stream"/>
/// </summary>
/// <param name="stream">Sitemap document stream</param>
/// <param name="cancellationToken">Cancellation token</param>
/// <returns>The parsed <see cref="Sitemap"/></returns>
/// <exception cref="SitemapException">Raised when there is an error parsing the Sitemap</exception>
public static async IAsyncEnumerable<UrlSetItem> ReadFromStreamAsync(Stream stream, [EnumeratorCancellation] CancellationToken cancellationToken = default)
{
using var streamReader = new StreamReader(stream);
string? line;
var lineCount = 0;
while (((line = await streamReader.ReadLineAsync(cancellationToken)) is not null) && !cancellationToken.IsCancellationRequested)
{
var urlSet = new HashSet<UrlSetItem>();
try
{
using var streamReader = new StreamReader(stream);
string? line;
var lineCount = 0;
while (((line = await streamReader.ReadLineAsync(cancellationToken)) is not null) && !cancellationToken.IsCancellationRequested)
{
/*
Each text file ... and must be no larger than 50MiB (52,428,800 bytes)
*/
if (stream.Position > ByteCount50MiB) throw new SitemapException("Reached parsing limit");
/*
Each text file ... and must be no larger than 50MiB (52,428,800 bytes)
*/
if (stream.Position > ByteCount50MiB) throw new SitemapException("Reached parsing limit");

if (string.IsNullOrWhiteSpace(line)) continue;
if (string.IsNullOrWhiteSpace(line)) continue;

lineCount++;
lineCount++;

/*
Each text file can contain a maximum of 50,000 URLs
*/
if (lineCount > MaxLines) throw new SitemapException("Reached line limit");
/*
Each text file can contain a maximum of 50,000 URLs
*/
if (lineCount > MaxLines) throw new SitemapException("Reached line limit");

/*
The text file must have one URL per line. The URLs cannot contain embedded new lines.
You must fully specify URLs, including the http.
The text file must use UTF-8 encoding.
The text file should contain no information other than the list of URLs.
The text file should contain no header or footer information.
*/
urlSet.Add(new UrlSetItem(new Uri(line), null, null, null));
}
/*
The text file must have one URL per line. The URLs cannot contain embedded new lines.
You must fully specify URLs, including the http.
The text file must use UTF-8 encoding.
The text file should contain no information other than the list of URLs.
The text file should contain no header or footer information.
*/
Uri location;
try
{
location = new Uri(line);
}
catch (Exception e)
{
throw new SitemapException("Unable to parse sitemap item", e);
}

return new Sitemap(urlSet);
}
catch (Exception e) when (e is not SitemapException)
{
throw new SitemapException("Unable to parse sitemap", e);
}
yield return new UrlSetItem(location, null, null, null);
}
}
}
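
Since `ReadFromStreamAsync` is public, the text parser can be driven directly; items are yielded as lines are read, so the 50,000-URL and 50 MiB limits are enforced incrementally rather than after buffering the whole file. A minimal consumption sketch (the local file path is illustrative):

```csharp
using System;
using System.IO;

await using var file = File.OpenRead("sitemap.txt");
// each UrlSetItem is yielded as soon as its line is parsed
await foreach (var item in SimpleTextSitemapParser.ReadFromStreamAsync(file))
{
    Console.WriteLine(item);
}
```
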
5 changes: 4 additions & 1 deletion src/Robots.Txt.Parser/SitemapException.cs
@@ -1,11 +1,13 @@
using System;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.Serialization;

namespace Robots.Txt.Parser;

/// <summary>
/// Exception raised when parsing a Sitemap
/// Exception raised when parsing a sitemap
/// </summary>
[Serializable]
public class SitemapException : Exception
{
internal SitemapException()
@@ -20,6 +22,7 @@ internal SitemapException(string? message, Exception? innerException) : base(mes
{
}

[ExcludeFromCodeCoverage]
protected SitemapException(SerializationInfo info, StreamingContext context) : base(info, context)
{
}