Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize percent-encoded UTF8 processing in Uri #32552

Merged
merged 17 commits into from
Dec 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions src/libraries/System.Private.Uri/src/System.Private.Uri.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,11 @@
<Compile Include="System\DomainNameHelper.cs" />
<Compile Include="System\GenericUriParser.cs" />
<Compile Include="System\IPv4AddressHelper.cs" />
<Compile Include="$(CommonPath)System\Net\IPv4AddressHelper.Common.cs"
Link="System\IPv4AddressHelper.Common.cs" />
<Compile Include="$(CommonPath)System\Net\IPv4AddressHelper.Common.cs" Link="System\IPv4AddressHelper.Common.cs" />
<Compile Include="System\IPv6AddressHelper.cs" />
<Compile Include="$(CommonPath)System\Net\IPv6AddressHelper.Common.cs"
Link="System\IPv6AddressHelper.Common.cs" />
<Compile Include="$(CommonPath)System\Net\IPv6AddressHelper.Common.cs" Link="System\IPv6AddressHelper.Common.cs" />
<Compile Include="System\IriHelper.cs" />
<Compile Include="System\PercentEncodingHelper.cs" />
<Compile Include="System\UncNameHelper.cs" />
<Compile Include="System\Uri.cs" />
<Compile Include="System\UriBuilder.cs" />
Expand All @@ -34,8 +33,8 @@
<Compile Include="System\UriPartial.cs" />
<Compile Include="System\UriScheme.cs" />
<Compile Include="System\UriSyntax.cs" />
<Compile Include="$(CommonPath)System\Text\ValueStringBuilder.cs"
Link="Common\System\Text\ValueStringBuilder.cs" />
<Compile Include="$(CommonPath)System\Text\ValueStringBuilder.cs" Link="Common\System\Text\ValueStringBuilder.cs" />
<Compile Include="System\ValueStringBuilderExtensions.cs" />
</ItemGroup>
<ItemGroup Condition="'$(TargetsWindows)' == 'true'">
<Compile Include="System\Uri.Windows.cs" />
Expand Down
144 changes: 53 additions & 91 deletions src/libraries/System.Private.Uri/src/System/IriHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// The .NET Foundation licenses this file to you under the MIT license.

using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Text;

namespace System
Expand All @@ -14,10 +15,10 @@ internal static class IriHelper
//
// Returns true when the UTF-16 code unit falls in one of three ranges that are accepted
// unconditionally (U+00A0..U+D7FF, U+F900..U+FDCF, U+FDF0..U+FFEF), or in U+E000..U+F8FF
// when isQuery is true.
internal static bool CheckIriUnicodeRange(char unicode, bool isQuery)
{
// NOTE(review): two return statements appear back to back in this listing. Both test the
// same four ranges, so the second is unreachable as written; presumably they are the
// removed and added sides of a diff - confirm against the actual file.
return ((unicode >= '\u00A0' && unicode <= '\uD7FF') ||
(unicode >= '\uF900' && unicode <= '\uFDCF') ||
(unicode >= '\uFDF0' && unicode <= '\uFFEF') ||
(isQuery && unicode >= '\uE000' && unicode <= '\uF8FF'));
// Same ranges expressed through the IsInInclusiveRange helper.
return IsInInclusiveRange(unicode, '\u00A0', '\uD7FF')
|| IsInInclusiveRange(unicode, '\uF900', '\uFDCF')
|| IsInInclusiveRange(unicode, '\uFDF0', '\uFFEF')
|| (isQuery && IsInInclusiveRange(unicode, '\uE000', '\uF8FF'));
}

//
Expand Down Expand Up @@ -47,6 +48,27 @@ internal static bool CheckIriUnicodeRange(char highSurr, char lowSurr, out bool
return false;
}

//
// Range check for a code point supplied as a single 32-bit value.
// Values up to 0xFFFF are tested against the same four ranges as the char overload;
// larger values go through the supplementary-plane rules below.
//
internal static bool CheckIriUnicodeRange(uint value, bool isQuery)
{
    if (value > 0xFFFF)
    {
        // The last two code points of every plane (xxFFFE, xxFFFF) are rejected.
        if ((value & 0xFFFF) >= 0xFFFE)
        {
            return false;
        }

        // 0xE0000..0xE0FFF is rejected regardless of component.
        if (IsInInclusiveRange(value, 0xE0000, 0xE0FFF))
        {
            return false;
        }

        // 0xF0000 and above is accepted only for the query component.
        return isQuery || value < 0xF0000;
    }

    bool alwaysAllowed =
        IsInInclusiveRange(value, '\u00A0', '\uD7FF') ||
        IsInInclusiveRange(value, '\uF900', '\uFDCF') ||
        IsInInclusiveRange(value, '\uFDF0', '\uFFEF');

    if (alwaysAllowed)
    {
        return true;
    }

    // U+E000..U+F8FF is accepted only for the query component.
    return isQuery && IsInInclusiveRange(value, '\uE000', '\uF8FF');
}

// Tests min <= value <= max (assuming min <= max) with a single unsigned comparison:
// when value is below min the subtraction wraps around to a large number, which then
// fails the "<= width" check.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsInInclusiveRange(uint value, uint min, uint max)
{
    uint offset = value - min;
    uint width = max - min;
    return offset <= width;
}

//
// Check reserved chars according to RFC 3987 in a specific component
//
Expand All @@ -67,114 +89,55 @@ internal static bool CheckIsReserved(char ch, UriComponents component)
internal static unsafe string EscapeUnescapeIri(char* pInput, int start, int end, UriComponents component)
{
int size = end - start;
ValueStringBuilder dest = new ValueStringBuilder(size);
byte[]? bytes = null;

int next = start;
char ch;
ValueStringBuilder dest = size <= 256
? new ValueStringBuilder(stackalloc char[256])
lpereira marked this conversation as resolved.
Show resolved Hide resolved
: new ValueStringBuilder(size);

Span<byte> maxUtf8EncodedSpan = stackalloc byte[4];

for (; next < end; ++next)
for (int i = start; i < end; ++i)
{
if ((ch = pInput[next]) == '%')
char ch = pInput[i];
if (ch == '%')
{
if (next + 2 < end)
if (end - i > 2)
{
ch = UriHelper.DecodeHexChars(pInput[next + 1], pInput[next + 2]);
ch = UriHelper.DecodeHexChars(pInput[i + 1], pInput[i + 2]);

// Do not unescape a reserved char
if (ch == Uri.c_DummyChar || ch == '%' || CheckIsReserved(ch, component) || UriHelper.IsNotSafeForUnescape(ch))
{
// keep as is
dest.Append(pInput[next++]);
dest.Append(pInput[next++]);
dest.Append(pInput[next]);
dest.Append(pInput[i++]);
dest.Append(pInput[i++]);
dest.Append(pInput[i]);
Comment on lines +111 to +113
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Each Append will perform a bounds check. Can you use the Append(char *, int) overload here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That overload will do a Span slice as well. I was thinking of adding Append(char, char) and Append(char, char, char) overloads as they can have a measurable perf impact (that would be part of a separate PR addressing #22903).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Append(char, char) and Append(char, char, char) seem quite awkward API choices, IMHO. Maybe make the Append(char *, int) overload not create a Span slice?

continue;
}
else if (ch <= '\x7F')
{
Debug.Assert(ch < 0xFF, "Expecting ASCII character.");
//ASCII
dest.Append(ch);
next += 2;
i += 2;
continue;
}
else
{
// possibly utf8 encoded sequence of unicode

// check if safe to unescape according to Iri rules

Debug.Assert(ch < 0xFF, "Expecting ASCII character.");

int startSeq = next;
int byteCount = 1;
// lazy initialization of max size, will reuse the array for next sequences
if (bytes is null)
bytes = new byte[end - next];

bytes[0] = (byte)ch;
next += 3;
while (next < end)
{
// Check on exit criterion
if ((ch = pInput[next]) != '%' || next + 2 >= end)
break;

// already made sure we have 3 characters in str
ch = UriHelper.DecodeHexChars(pInput[next + 1], pInput[next + 2]);

//invalid hex sequence ?
if (ch == Uri.c_DummyChar)
break;
// character is not part of a UTF-8 sequence ?
else if (ch < '\x80')
break;
else
{
//a UTF-8 sequence
bytes[byteCount++] = (byte)ch;
next += 3;
}

Debug.Assert(ch < 0xFF, "Expecting ASCII character.");
}
next--; // for loop will increment


// Using encoder with no replacement fall-back will skip all invalid UTF-8 sequences.
Encoding noFallbackCharUTF8 = Encoding.GetEncoding(
Encoding.UTF8.CodePage,
new EncoderReplacementFallback(""),
new DecoderReplacementFallback(""));

char[] unescapedChars = new char[bytes.Length];
int charCount = noFallbackCharUTF8.GetChars(bytes, 0, byteCount, unescapedChars, 0);


if (charCount != 0)
{
// If invalid sequences were present in the original escaped string, we need to
// copy the escaped versions of those sequences.
// Decoded Unicode values will be kept only when they are allowed by the URI/IRI RFC
// rules.
UriHelper.MatchUTF8Sequence(ref dest, unescapedChars, charCount, bytes,
byteCount, component == UriComponents.Query, true);
}
else
{
// copy escaped sequence as is
for (int i = startSeq; i <= next; ++i)
{
dest.Append(pInput[i]);
}
}
int charactersRead = PercentEncodingHelper.UnescapePercentEncodedUTF8Sequence(
pInput + i,
end - i,
ref dest,
component == UriComponents.Query,
iriParsing: true);

Debug.Assert(charactersRead > 0);
i += charactersRead - 1; // -1 as i will be incremented in the loop
}
}
else
{
dest.Append(pInput[next]);
dest.Append(pInput[i]);
}
}
else if (ch > '\x7f')
Expand All @@ -186,9 +149,9 @@ internal static unsafe string EscapeUnescapeIri(char* pInput, int start, int end

char ch2 = '\0';

if ((char.IsHighSurrogate(ch)) && (next + 1 < end))
if ((char.IsHighSurrogate(ch)) && (i + 1 < end))
{
ch2 = pInput[next + 1];
ch2 = pInput[i + 1];
isInIriUnicodeRange = CheckIriUnicodeRange(ch, ch2, out surrogatePair, component == UriComponents.Query);
}
else
Expand Down Expand Up @@ -227,18 +190,17 @@ internal static unsafe string EscapeUnescapeIri(char* pInput, int start, int end

if (surrogatePair)
{
next++;
i++;
}
}
else
{
// just copy the character
dest.Append(pInput[next]);
dest.Append(pInput[i]);
}
}

string result = dest.ToString();
return result;
return dest.ToString();
}
}
}
Loading