From 56441e5919683cecda42533ada532797ea2da938 Mon Sep 17 00:00:00 2001
From: drmathias
Date: Sat, 26 Aug 2023 19:55:40 +0100
Subject: [PATCH] Support wildcard (*) and end-of-match ($) paths

---
 src/Robots.Txt.Parser/RobotRuleChecker.cs     |   4 +-
 src/Robots.Txt.Parser/UrlRule.cs              |  69 +++++--
 .../RobotTxtRuleCheckerTests.cs               | 172 ++++++++++++++++++
 .../UrlRuleTests.cs                           |  34 ++--
 4 files changed, 242 insertions(+), 37 deletions(-)

diff --git a/src/Robots.Txt.Parser/RobotRuleChecker.cs b/src/Robots.Txt.Parser/RobotRuleChecker.cs
index 7dcff9c..306436a 100644
--- a/src/Robots.Txt.Parser/RobotRuleChecker.cs
+++ b/src/Robots.Txt.Parser/RobotRuleChecker.cs
@@ -40,8 +40,8 @@ public bool IsAllowed(string path)
             "The /robots.txt URL is always allowed"
         */
         if (_rules.Count == 0 || path == "/robots.txt") return true;
-        var ruleMatch = _rules.Where(rule => rule.Matches(path))
-                              .OrderByDescending(rule => rule.Path.Length)
+        var ruleMatch = _rules.Where(rule => rule.Pattern.Matches(path))
+                              .OrderByDescending(rule => rule.Pattern.Length)
                               .ThenBy(rule => rule.Type, new RuleTypeComparer())
                               .FirstOrDefault();
         return ruleMatch is null || ruleMatch.Type == RuleType.Allow;
diff --git a/src/Robots.Txt.Parser/UrlRule.cs b/src/Robots.Txt.Parser/UrlRule.cs
index d8e8a34..dbc0178 100644
--- a/src/Robots.Txt.Parser/UrlRule.cs
+++ b/src/Robots.Txt.Parser/UrlRule.cs
@@ -1,3 +1,4 @@
+using System.Linq;
 using System.Web;
 
 namespace Robots.Txt.Parser;
@@ -6,44 +7,76 @@ namespace Robots.Txt.Parser;
 /// Describes a robots.txt rule for a URL
 /// </summary>
 /// <param name="Type">Rule type; either <see cref="RuleType.Allow"/> or <see cref="RuleType.Disallow"/></param>
-/// <param name="Path">URL path</param>
-public record UrlRule(RuleType Type, UrlPathPattern Path)
+/// <param name="Pattern">URL path pattern</param>
+public record UrlRule(RuleType Type, UrlPathPattern Pattern);
+
+public class UrlPathPattern
 {
+    private readonly bool _matchSubPaths;
+    private readonly string[] _patternParts;
+
+    private UrlPathPattern(string value)
+    {
+        Length = value.Length;
+        if (value.EndsWith('$')) value = value[..^1];
+        else _matchSubPaths = true;
+        _patternParts = value.Split('*', System.StringSplitOptions.None)
+                             .Select(part => HttpUtility.UrlDecode(part.Replace("%2F", "%252F")))
+                             .ToArray();
+    }
+
+    public int Length { get; }
+
     /// <summary>
     /// Checks if a path matches the URL rule
     /// </summary>
     /// <param name="path">The URL path</param>
     /// <returns>True if the path matches or is a sub-path; otherwise false</returns>
-    public bool Matches(UrlPath path) => !Path.IsEmpty && path.StartsWith(Path);
-}
-
-public class UrlPathPattern : UrlPath
-{
-    private UrlPathPattern(string value, bool exactMatch) : base(value)
+    public bool Matches(UrlPath path)
     {
-        ExactPattern = exactMatch;
+        if (Length == 0) return false;
+
+        var value = path._value;
+        // Ordinal comparisons throughout: URL paths are not linguistic text, so culture rules must not apply
+        if (!value.StartsWith(_patternParts[0], System.StringComparison.Ordinal)) return false;
+
+        var currentIndex = _patternParts[0].Length;
+        var lastPart = _patternParts.Length - 1;
+        for (var x = 1; x <= lastPart; x++)
+        {
+            var part = _patternParts[x];
+            int matchIndex;
+            if (x == lastPart && !_matchSubPaths)
+            {
+                // End-of-match ($): the final literal has to sit at the very end of the path, so the
+                // preceding wildcard is allowed to absorb any earlier occurrence of that literal
+                matchIndex = value.Length - part.Length;
+                if (matchIndex < currentIndex || !value.EndsWith(part, System.StringComparison.Ordinal)) return false;
+            }
+            else
+            {
+                matchIndex = value.IndexOf(part, currentIndex, System.StringComparison.Ordinal);
+                if (matchIndex == -1) return false;
+            }
+            currentIndex = matchIndex + part.Length;
+        }
+        return _matchSubPaths || currentIndex == value.Length;
     }
 
-    public bool ExactPattern { get; }
-
-    public static implicit operator UrlPathPattern(string value) => !value.EndsWith('$') ? new(value, false) : new(value[..^1], true);
+    public static implicit operator UrlPathPattern(string value) => new(value);
 }
 
 public class UrlPath
 {
-    private readonly string _value;
+    internal readonly string _value;
 
-    protected UrlPath(string value)
+    private UrlPath(string value)
     {
         _value = HttpUtility.UrlDecode(value.Replace("%2F", "%252F"));
     }
 
     public int Length => _value.Length;
 
-    public bool IsEmpty => _value == "";
-
-    public bool StartsWith(UrlPath path) => _value.StartsWith(path._value);
-
     public static implicit operator UrlPath(string value) => new(value);
 }
diff --git a/tests/Robots.Txt.Parser.Tests.Unit/RobotTxtRuleCheckerTests.cs b/tests/Robots.Txt.Parser.Tests.Unit/RobotTxtRuleCheckerTests.cs
index f71c0af..85395b5 100644
--- a/tests/Robots.Txt.Parser.Tests.Unit/RobotTxtRuleCheckerTests.cs
+++ b/tests/Robots.Txt.Parser.Tests.Unit/RobotTxtRuleCheckerTests.cs
@@ -71,6 +71,158 @@ public async Task UserAgentWildcard_DisallowPath_DisallowOnMatch()
         ruleChecker.IsAllowed("/some/path").Should().Be(false);
     }
 
+    [Fact]
+    public async Task UserAgentWildcard_DisallowWildcardPath_DisallowOnMatch()
+    {
+        // Arrange
+        var file =
+@"User-agent: *
+Disallow: /some/*/path
+";
+        var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
+
+        // Act
+        var robotsTxt = await _parser.ReadFromStreamAsync(stream);
+
+        // Assert
+        robotsTxt.TryGetRules("SomeBot", out var ruleChecker);
+        robotsTxt.Should().NotBe(null);
+        ruleChecker.IsAllowed("/some/other/sub/path").Should().Be(false);
+    }
+
+    [Fact]
+    public async Task UserAgentWildcard_DisallowDoubleWildcardPath_DisallowOnMatch()
+    {
+        // Arrange
+        var file =
+@"User-agent: *
+Disallow: /some/**/path
+";
+        var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
+
+        // Act
+        var robotsTxt = await _parser.ReadFromStreamAsync(stream);
+
+        // Assert
+        robotsTxt.TryGetRules("SomeBot", out var ruleChecker);
+        robotsTxt.Should().NotBe(null);
+        ruleChecker.IsAllowed("/some/other/sub/path").Should().Be(false);
+    }
+
+    [Fact]
+    public async Task UserAgentWildcard_TwoPartWildcardPath_DisallowOnMatch()
+    {
+        // Arrange
+        var file =
+@"User-agent: *
+Disallow: /some/*/*/path
+";
+        var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
+
+        // Act
+        var robotsTxt = await _parser.ReadFromStreamAsync(stream);
+
+        // Assert
+        robotsTxt.TryGetRules("SomeBot", out var ruleChecker);
+        robotsTxt.Should().NotBe(null);
+        ruleChecker.IsAllowed("/some/other/sub/path").Should().Be(false);
+    }
+
+    [Fact]
+    public async Task UserAgentWildcard_TwoPartWildcardPath_DisallowSubpathMatch()
+    {
+        // Arrange
+        var file =
+@"User-agent: *
+Disallow: /some/*/*/path
+";
+        var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
+
+        // Act
+        var robotsTxt = await _parser.ReadFromStreamAsync(stream);
+
+        // Assert
+        robotsTxt.TryGetRules("SomeBot", out var ruleChecker);
+        robotsTxt.Should().NotBe(null);
+        ruleChecker.IsAllowed("/some/other/sub/path/end").Should().Be(false);
+    }
+
+    [Fact]
+    public async Task UserAgentWildcard_WildcardPathWithEndOfMatch_AllowSubpathMatch()
+    {
+        // Arrange
+        var file =
+@"User-agent: *
+Disallow: /some/*/*/path$
+";
+        var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
+
+        // Act
+        var robotsTxt = await _parser.ReadFromStreamAsync(stream);
+
+        // Assert
+        robotsTxt.TryGetRules("SomeBot", out var ruleChecker);
+        robotsTxt.Should().NotBe(null);
+        ruleChecker.IsAllowed("/some/other/sub/path/end").Should().Be(true);
+    }
+
+    [Fact]
+    public async Task UserAgentWildcard_WildcardPathWithEndOfMatch_DisallowWhenFinalSegmentRepeats()
+    {
+        // Arrange
+        var file =
+@"User-agent: *
+Disallow: /some/*/path$
+";
+        var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
+
+        // Act
+        var robotsTxt = await _parser.ReadFromStreamAsync(stream);
+
+        // Assert
+        robotsTxt.TryGetRules("SomeBot", out var ruleChecker);
+        robotsTxt.Should().NotBe(null);
+        ruleChecker.IsAllowed("/some/a/path/b/path").Should().Be(false);
+    }
+
+    [Fact]
+    public async Task UserAgentWildcard_DisallowEndOfMatchPath_DisallowOnExactMatch()
+    {
+        // Arrange
+        var file =
+@"User-agent: *
+Disallow: /some/path$
+";
+        var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
+
+        // Act
+        var robotsTxt = await _parser.ReadFromStreamAsync(stream);
+
+        // Assert
+        robotsTxt.TryGetRules("SomeBot", out var ruleChecker);
+        robotsTxt.Should().NotBe(null);
+        ruleChecker.IsAllowed("/some/path").Should().Be(false);
+    }
+
+    [Fact]
+    public async Task UserAgentWildcard_DisallowEndOfMatchPath_AllowOnSubPathMatch()
+    {
+        // Arrange
+        var file =
+@"User-agent: *
+Disallow: /some/path$
+";
+        var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
+
+        // Act
+        var robotsTxt = await _parser.ReadFromStreamAsync(stream);
+
+        // Assert
+        robotsTxt.TryGetRules("SomeBot", out var ruleChecker);
+        robotsTxt.Should().NotBe(null);
+        ruleChecker.IsAllowed("/some/path/subdirectory").Should().Be(true);
+    }
+
     [Fact]
     public async Task UserAgentWildcard_DisallowPath_DisallowOnSubpath()
     {
@@ -406,6 +558,26 @@ public async Task WildcardUserAgent_DisallowAllAndAllowPath_AllowPathMatch()
         ruleChecker.IsAllowed("/some/path").Should().Be(true);
     }
 
+    [Fact]
+    public async Task UserAgentWildcard_DisallowAllAndAllowWildcardPath_AllowWildcardPathMatch()
+    {
+        // Arrange
+        var file =
+@"User-agent: *
+Disallow: /
+Allow: /some/*/path
+";
+        var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
+
+        // Act
+        var robotsTxt = await _parser.ReadFromStreamAsync(stream);
+
+        // Assert
+        robotsTxt.TryGetRules("SomeBot", out var ruleChecker);
+        robotsTxt.Should().NotBe(null);
+        ruleChecker.IsAllowed("/some/other/sub/path").Should().Be(true);
+    }
+
     [Fact]
     public async Task WildcardUserAgentRuleMatch_DisallowAllAndAllowPath_AllowSubpathMatch()
     {
diff --git a/tests/Robots.Txt.Parser.Tests.Unit/UrlRuleTests.cs b/tests/Robots.Txt.Parser.Tests.Unit/UrlRuleTests.cs
index 9e0f1cc..d4d7e20 100644
--- a/tests/Robots.Txt.Parser.Tests.Unit/UrlRuleTests.cs
+++ b/tests/Robots.Txt.Parser.Tests.Unit/UrlRuleTests.cs
@@ -12,7 +12,7 @@ public void Matches_EmptyRulePath_ReturnFalse()
         var urlRule = new UrlRule(RuleType.Disallow, "");
 
         // Act
-        var matches = urlRule.Matches("/some/path");
+        var matches = urlRule.Pattern.Matches("/some/path");
 
         // Assert
         matches.Should().Be(false);
@@ -25,7 +25,7 @@ public void Matches_DifferentPath_ReturnFalse()
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path");
 
         // Act
-        var matches = urlRule.Matches("/some/other/path");
+        var matches = urlRule.Pattern.Matches("/some/other/path");
 
         // Assert
         matches.Should().Be(false);
@@ -38,7 +38,7 @@ public void Matches_DirectoryQualifier_ReturnFalse()
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path/");
 
         // Act
-        var matches = urlRule.Matches("/some/path");
+        var matches = urlRule.Pattern.Matches("/some/path");
 
         // Assert
         matches.Should().Be(false);
@@ -51,7 +51,7 @@ public void Matches_ExactMatch_ReturnTrue()
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path");
 
         // Act
-        var matches = urlRule.Matches("/some/path");
+        var matches = urlRule.Pattern.Matches("/some/path");
 
         // Assert
         matches.Should().Be(true);
@@ -64,7 +64,7 @@ public void Matches_FileMatch_ReturnTrue()
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path");
 
         // Act
-        var matches = urlRule.Matches("/some/path.html");
+        var matches = urlRule.Pattern.Matches("/some/path.html");
 
         // Assert
         matches.Should().Be(true);
@@ -77,7 +77,7 @@ public void Matches_SubdirectoryMatch_ReturnTrue()
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path");
 
         // Act
-        var matches = urlRule.Matches("/some/path/subdirectory");
+        var matches = urlRule.Pattern.Matches("/some/path/subdirectory");
 
         // Assert
         matches.Should().Be(true);
@@ -90,7 +90,7 @@ public void Matches_OctectBothLowercase_ReturnTrue()
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path%3c");
 
         // Act
-        var matches = urlRule.Matches("/some/path%3c");
+        var matches = urlRule.Pattern.Matches("/some/path%3c");
 
         // Assert
         matches.Should().Be(true);
@@ -103,7 +103,7 @@ public void Matches_OctectBothUppercase_ReturnTrue()
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path%3C");
 
         // Act
-        var matches = urlRule.Matches("/some/path%3C");
+        var matches = urlRule.Pattern.Matches("/some/path%3C");
 
         // Assert
         matches.Should().Be(true);
@@ -116,7 +116,7 @@ public void Matches_OctectRuleLowercasePathUppercase_ReturnTrue()
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path%3c");
 
         // Act
-        var matches = urlRule.Matches("/some/path%3C");
+        var matches = urlRule.Pattern.Matches("/some/path%3C");
 
         // Assert
         matches.Should().Be(true);
@@ -129,7 +129,7 @@ public void Matches_OctectRuleUppercasePathLowercase_ReturnTrue()
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path%3C");
 
         // Act
-        var matches = urlRule.Matches("/some/path%3c");
+        var matches = urlRule.Pattern.Matches("/some/path%3c");
 
         // Assert
         matches.Should().Be(true);
@@ -142,7 +142,7 @@ public void Matches_OctectForwardSlashBothUrl_ReturnTrue()
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path%2F");
 
         // Act
-        var matches = urlRule.Matches("/some/path%2F");
+        var matches = urlRule.Pattern.Matches("/some/path%2F");
 
         // Assert
         matches.Should().Be(true);
@@ -155,7 +155,7 @@ public void Matches_OctectForwardSlashOnlyInRule_ReturnFalse()
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path%2F");
 
         // Act
-        var matches = urlRule.Matches("/some/path/");
+        var matches = urlRule.Pattern.Matches("/some/path/");
 
         // Assert
         matches.Should().Be(false);
@@ -168,7 +168,7 @@ public void Matches_OctectForwardSlashOnlyInPath_ReturnFalse()
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path/");
 
         // Act
-        var matches = urlRule.Matches("/some/path%2F");
+        var matches = urlRule.Pattern.Matches("/some/path%2F");
 
         // Assert
         matches.Should().Be(false);
@@ -181,7 +181,7 @@ public void Matches_OctectNotForwardSlashLowercaseOnlyInRule_ReturnTrue()
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path%7e");
 
         // Act
-        var matches = urlRule.Matches("/some/path~");
+        var matches = urlRule.Pattern.Matches("/some/path~");
 
         // Assert
         matches.Should().Be(true);
@@ -194,7 +194,7 @@ public void Matches_OctectNotForwardSlashLowercaseOnlyInPath_ReturnTrue()
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path~");
 
         // Act
-        var matches = urlRule.Matches("/some/path%7e");
+        var matches = urlRule.Pattern.Matches("/some/path%7e");
 
         // Assert
         matches.Should().Be(true);
@@ -207,7 +207,7 @@ public void Matches_OctectNotForwardSlashUppercaseOnlyInRule_ReturnTrue()
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path%7E");
 
         // Act
-        var matches = urlRule.Matches("/some/path~");
+        var matches = urlRule.Pattern.Matches("/some/path~");
 
         // Assert
         matches.Should().Be(true);
@@ -220,7 +220,7 @@ public void Matches_OctectNotForwardSlashUppercaseOnlyInPath_ReturnTrue()
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path~");
 
         // Act
-        var matches = urlRule.Matches("/some/path%7E");
+        var matches = urlRule.Pattern.Matches("/some/path%7E");
 
         // Assert
         matches.Should().Be(true);