Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extra tests for assembly name parser. #64022

Merged
merged 16 commits into from
Jan 22, 2022
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions src/coreclr/binder/inc/assemblyidentity.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,9 @@ namespace BINDER_SPACE
IDENTITY_FLAG_PUBLIC_KEY_TOKEN = 0x004,
IDENTITY_FLAG_PUBLIC_KEY = 0x008,
IDENTITY_FLAG_CULTURE = 0x010,
IDENTITY_FLAG_LANGUAGE = 0x020,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IDENTITY_FLAG_LANGUAGE was unused

IDENTITY_FLAG_PROCESSOR_ARCHITECTURE = 0x040,
IDENTITY_FLAG_RETARGETABLE = 0x080,
IDENTITY_FLAG_PUBLIC_KEY_TOKEN_NULL = 0x100,
IDENTITY_FLAG_CUSTOM = 0x200,
IDENTITY_FLAG_CUSTOM_NULL = 0x400,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IDENTITY_FLAG_CUSTOM existed to support a "custom" attribute whose value would be a binary blob. We would not use the blob for anything, and the managed AssemblyName would have no knowledge that "custom" existed.

Apart from validating that "custom" is not duplicated and that the blob content is valid hex, this is no different from the treatment of unrecognized attributes.

IDENTITY_FLAG_CONTENT_TYPE = 0x800,
IDENTITY_FLAG_FULL_NAME = (IDENTITY_FLAG_SIMPLE_NAME |
IDENTITY_FLAG_VERSION)
Expand All @@ -50,7 +47,6 @@ namespace BINDER_SPACE
// Need to pre-populate SBuffers because of bogus asserts
static const BYTE byteArr[] = { 0 };
m_publicKeyOrTokenBLOB.SetImmutable(byteArr, sizeof(byteArr));
m_customBLOB.SetImmutable(byteArr, sizeof(byteArr));
}
~AssemblyIdentity()
{
Expand Down Expand Up @@ -83,7 +79,6 @@ namespace BINDER_SPACE
SBuffer m_publicKeyOrTokenBLOB;
PEKIND m_kProcessorArchitecture;
AssemblyContentType m_kContentType;
SBuffer m_customBLOB;
DWORD m_dwIdentityFlags;
};

Expand Down
16 changes: 5 additions & 11 deletions src/coreclr/binder/inc/stringlexer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,32 +55,28 @@ namespace BINDER_SPACE
inline StringLexer();
inline ~StringLexer();

inline void Init(SString &inputString, BOOL fSupportEscaping);
inline void Init(SString &inputString);

static inline BOOL IsWhitespace(WCHAR wcChar);
static inline BOOL IsEOS(WCHAR wcChar);
static inline BOOL IsQuoteCharacter(WCHAR wcChar);

virtual BOOL IsSeparatorChar(WCHAR wcChar) = NULL;
virtual LEXEME_TYPE GetLexemeType(WCHAR wcChar) = NULL;
BOOL IsSeparatorChar(WCHAR wcChar);
LEXEME_TYPE GetLexemeType(WCHAR wcChar);

protected:
static const WCHAR INVALID_CHARACTER = -1;

LEXEME_TYPE GetNextLexeme(SString &currentString, BOOL fPermitUnescapedQuotes = FALSE);
LEXEME_TYPE GetNextLexeme(SString &currentString);

inline WCHAR PopCharacter(BOOL *pfIsEscaped);
inline void PushCharacter(WCHAR wcCurrentChar,
BOOL fIsEscaped);

inline WCHAR GetRawCharacter();
inline void PushRawCharacter();
inline WCHAR DecodeUTF16Character();
inline WCHAR GetNextCharacter(BOOL *pfIsEscaped);

inline WCHAR ParseUnicode();
LEXEME_TYPE ParseString(SString &currentString,
BOOL fPermitUnescapeQuotes);
LEXEME_TYPE ParseString(SString &currentString);

void TrimTrailingWhiteSpaces(SString &currentString);

Expand All @@ -89,8 +85,6 @@ namespace BINDER_SPACE

WCHAR m_wcCurrentChar;
BOOL m_fCurrentCharIsEscaped;
BOOL m_fSupportEscaping;
BOOL m_fReadRawCharacter;
};

#include "stringlexer.inl"
Expand Down
164 changes: 27 additions & 137 deletions src/coreclr/binder/inc/stringlexer.inl
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,10 @@ StringLexer::~StringLexer()
// Nothing to do here
}

void StringLexer::Init(SString &inputString, BOOL fSupportEscaping)
void StringLexer::Init(SString &inputString)
{
m_cursor = inputString.Begin();
m_end = inputString.End();
m_fSupportEscaping = fSupportEscaping;
m_fReadRawCharacter = FALSE;
}

BOOL StringLexer::IsWhitespace(WCHAR wcChar)
Expand All @@ -55,6 +53,7 @@ WCHAR StringLexer::PopCharacter(BOOL *pfIsEscaped)
{
m_wcCurrentChar = INVALID_CHARACTER;
*pfIsEscaped = m_fCurrentCharIsEscaped;
m_cursor++;
}
else
{
Expand All @@ -71,172 +70,63 @@ void StringLexer::PushCharacter(WCHAR wcCurrentChar,

m_wcCurrentChar = wcCurrentChar;
m_fCurrentCharIsEscaped = fIsEscaped;
m_cursor--;
}

WCHAR StringLexer::GetRawCharacter()
{
WCHAR wcCurrentChar = 0;

if (m_cursor <= m_end)
if (m_cursor < m_end)
{
wcCurrentChar = m_cursor[0];
m_fReadRawCharacter = TRUE;
m_cursor++;
}
else
{
m_fReadRawCharacter = FALSE;
}

return wcCurrentChar;
}

void StringLexer::PushRawCharacter()
{
if (m_fReadRawCharacter)
{
m_cursor--;
m_fReadRawCharacter = FALSE;
}
}

WCHAR StringLexer::DecodeUTF16Character()
Copy link
Member Author

@VSadov VSadov Jan 21, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was only used when lexing the numbers in escaped Unicode characters like \u12345.
Since we are dropping \u, the whole support for UTF-16 surrogate pairs is dead code.

{
// See http://www.ietf.org/rfc/rfc2781.txt for details on UTF-16 encoding.

WCHAR wcCurrentChar = 0;
SCOUNT_T nCharacters = m_end - m_cursor + 1;
WCHAR wcChar1 = GetRawCharacter();

if (wcChar1 < 0xd800)
{
wcCurrentChar = wcChar1;
// do not allow \0 anywhere in the string.
if (wcCurrentChar == 0)
{
wcCurrentChar = INVALID_CHARACTER;
}
}
else
{
// StringLexer is not designed to handle UTF-16 characters beyond the Basic Multilingual Plane,
// since it stores all characters in 16-bit WCHARs.
// However, since the vast majority of the time, we (Microsoft) produce the manifests,
// this is likely a non-scenario, as the other Unicode planes would never be used in practice.

if (wcChar1 <= 0xdbff) // 0xd800 - 0xdbff indicates the first WCHAR of a surrogate pair
{
if (nCharacters >= 2)
{
GetRawCharacter(); // Skip the second WCHAR of the surrogate pair
}
}
// Otherwise, the character is either in the 0xdc00 - 0xdfff range, indicating the second WCHAR of a surrogate pair,
// or in the 0xE000 - 0xFFFF range, which has within it ranges of invalid characters, and which we conservatively treat
// as invalid.

wcCurrentChar = INVALID_CHARACTER;
// EOS
wcCurrentChar = 0;
}

return wcCurrentChar;
}


WCHAR StringLexer::GetNextCharacter(BOOL *pfIsEscaped)
{
*pfIsEscaped = FALSE;

WCHAR wcCurrentChar = GetRawCharacter(); // DecodeUTF16Character()
WCHAR wcCurrentChar = GetRawCharacter();
if (wcCurrentChar == L'\\')
{
WCHAR wcTempChar = GetRawCharacter(); // DecodeUTF16Character()
WCHAR wcTempChar = GetRawCharacter();

if (m_fSupportEscaping)
{
// Handle standard escapes
switch (wcTempChar)
{
case L'"':
case L'\'':
case L',':
case L'\\':
case L'/':
case L'=':
break;
case L't':
wcTempChar = 9;
break;
case L'n':
wcTempChar = 10;
break;
case L'r':
wcTempChar = 13;
break;
case L'u':
wcTempChar = ParseUnicode();
break;
default:
return INVALID_CHARACTER;
}

*pfIsEscaped = TRUE;
wcCurrentChar = wcTempChar;
}
else
{
// Do not handle escapes except for quotes
switch (wcTempChar)
{
case L'"':
case L'\'':
*pfIsEscaped = TRUE;
wcCurrentChar = wcTempChar;
break;
default:
PushRawCharacter();
break;
}
}
}

return wcCurrentChar;
}

WCHAR StringLexer::ParseUnicode()
{
int nCharacters = 0;
WCHAR wcUnicodeChar = 0;

for(;;)
{
WCHAR wcCurrentChar = DecodeUTF16Character();
nCharacters++;

if (wcCurrentChar == L';')
// Handle standard escapes
switch (wcTempChar)
{
case L'"':
case L'\'':
case L',':
case L'\\':
case L'=':
case L't':
case L'n':
case L'r':
break;
}
else if ((wcCurrentChar == INVALID_CHARACTER) || (nCharacters >= 9))
{
default:
return INVALID_CHARACTER;
}

wcUnicodeChar <<= 4;

if ((wcCurrentChar >= L'0') && (wcCurrentChar <= L'9'))
{
wcUnicodeChar += (wcCurrentChar - L'0');
}
else if ((wcCurrentChar >= L'a') && (wcCurrentChar <= L'f'))
{
wcUnicodeChar += (wcCurrentChar - L'a') + 10;
}
else if ((wcCurrentChar >= L'A') && (wcCurrentChar <= L'F'))
{
wcUnicodeChar += (wcCurrentChar - L'A') + 10;
}
else
{
return INVALID_CHARACTER;
}
*pfIsEscaped = TRUE;
wcCurrentChar = wcTempChar;
}

return wcUnicodeChar;
return wcCurrentChar;
}

#endif
11 changes: 4 additions & 7 deletions src/coreclr/binder/inc/textualidentityparser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,9 @@ namespace BINDER_SPACE
TextualIdentityParser(AssemblyIdentity *pAssemblyIdentity);
~TextualIdentityParser();

virtual BOOL IsSeparatorChar(WCHAR wcChar);
virtual StringLexer::LEXEME_TYPE GetLexemeType(WCHAR wcChar);

static HRESULT Parse(/* in */ SString &textualIdentity,
/* out */ AssemblyIdentity *pAssemblyIdentity,
/* in */ BOOL fPermitUnescapedQuotes = FALSE);
/* out */ AssemblyIdentity *pAssemblyIdentity);

static HRESULT ToString(/* in */ AssemblyIdentity *pAssemblyIdentity,
/* in */ DWORD dwIdentityFlags,
/* out */ SString &textualIdentity);
Expand All @@ -45,15 +42,15 @@ namespace BINDER_SPACE
/* in */ BOOL fValidateHex,
/* in */ BOOL fIsToken,
/* out */ SBuffer &publicKeyOrTokenBLOB);

static void BlobToHex(/* in */ SBuffer &publicKeyOrTokenBLOB,
/* out */ SString &publicKeyOrToken);

BOOL ParseString(/* in */ SString &textualString,
/* out */ SString &contentString);

protected:
BOOL Parse(/* in */ SString &textualIdentity,
/* in */ BOOL fPermitUnescapedQuotes = FALSE);
BOOL Parse(/* in */ SString &textualIdentity);

BOOL PopulateAssemblyIdentity(/* in */ SString &attributeString,
/* in */ SString &valueString);
Expand Down
28 changes: 24 additions & 4 deletions src/coreclr/binder/stringlexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
namespace BINDER_SPACE
{
StringLexer::LEXEME_TYPE
StringLexer::GetNextLexeme(SString &currentString, BOOL fPermitUnescapedQuotes)
StringLexer::GetNextLexeme(SString &currentString)
{
BOOL fIsEscaped = FALSE;
WCHAR wcCurrentChar = INVALID_CHARACTER;
Expand All @@ -43,11 +43,11 @@ namespace BINDER_SPACE

// First character of string lexeme; push it back
PushCharacter(wcCurrentChar, fIsEscaped);
return ParseString(currentString, fPermitUnescapedQuotes);
return ParseString(currentString);
}

StringLexer::LEXEME_TYPE
StringLexer::ParseString(SString &currentString, BOOL fPermitUnescapedQuotes)
StringLexer::ParseString(SString &currentString)
{
BOOL fIsFirstCharacter = TRUE;
WCHAR wcCurrentChar = INVALID_CHARACTER;
Expand Down Expand Up @@ -99,7 +99,7 @@ namespace BINDER_SPACE
break;
}

if (!fPermitUnescapedQuotes && !fIsEscaped && IsQuoteCharacter(wcCurrentChar) && !IsQuoteCharacter(wcOpeningQuote))
if (!fIsEscaped && IsQuoteCharacter(wcCurrentChar) && !IsQuoteCharacter(wcOpeningQuote))
{
// Unescaped quotes in the middle of the string are an error
return LEXEME_TYPE_INVALID;
Expand Down Expand Up @@ -147,4 +147,24 @@ namespace BINDER_SPACE
currentString.Truncate(cursor + 1);
}
}

// Reports whether wcChar is one of the two separator characters
// recognized by the lexer: a comma or an equals sign.
//
// Returns TRUE for ',' and '=', FALSE for every other character.
BOOL StringLexer::IsSeparatorChar(WCHAR wcChar)
{
    switch (wcChar)
    {
        case W(','):
        case W('='):
            return TRUE;
        default:
            return FALSE;
    }
}

// Classifies a single character into the lexeme type it introduces.
//
//   '='            -> LEXEME_TYPE_EQUALS
//   ','            -> LEXEME_TYPE_COMMA
//   0 (terminator) -> LEXEME_TYPE_END_OF_STREAM
//   anything else  -> LEXEME_TYPE_STRING
StringLexer::LEXEME_TYPE StringLexer::GetLexemeType(WCHAR wcChar)
{
    if (wcChar == W('='))
    {
        return LEXEME_TYPE_EQUALS;
    }

    if (wcChar == W(','))
    {
        return LEXEME_TYPE_COMMA;
    }

    if (wcChar == 0)
    {
        return LEXEME_TYPE_END_OF_STREAM;
    }

    // Every remaining character is treated as part of a string lexeme.
    return LEXEME_TYPE_STRING;
}
};
Loading