Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix JSON serialization for UTF-32 characters. #998

Merged
merged 2 commits into from
Nov 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions YamlDotNet.Test/Serialization/SerializationTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -886,6 +886,15 @@ public void SerializationOfAnchorWorksInJson()
.BeEquivalentTo(@"{""x"": {""z"": {""v"": ""1""}}, ""y"": {""k"": {""z"": {""v"": ""1""}}}}");
}

[Fact]
public void SerializationOfUtf32WorksInJson()
{
var obj = new { TestProperty = "Sea life \U0001F99E" };

SerializerBuilder.JsonCompatible().Build().Serialize(obj).Trim().Should()
.Be(@"{""TestProperty"": ""Sea life \uD83E\uDD9E""}");
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apparently, lobster in UTF32 is different from lobster in UTF16. 🤷

https://www.unicodecharacter.org/U+1F99E

}

[Fact]
// Todo: this is actually roundtrip
public void DeserializationOfDefaultsWorkInJson()
Expand Down
18 changes: 16 additions & 2 deletions YamlDotNet/Core/Emitter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ public class Emitter : IEmitter
private bool isWhitespace;
private bool isIndentation;
private readonly bool forceIndentLess;
private readonly bool useUtf16SurrogatePair;
private readonly string newLine;

private bool isDocumentEndWritten;
Expand Down Expand Up @@ -148,6 +149,7 @@ public Emitter(TextWriter output, EmitterSettings settings)
this.maxSimpleKeyLength = settings.MaxSimpleKeyLength;
this.skipAnchorName = settings.SkipAnchorName;
this.forceIndentLess = !settings.IndentSequences;
this.useUtf16SurrogatePair = settings.UseUtf16SurrogatePairs;
this.newLine = settings.NewLine;

this.output = output;
Expand Down Expand Up @@ -1189,8 +1191,20 @@ private void WriteDoubleQuotedScalar(string value, bool allowBreaks)
{
if (index + 1 < value.Length && IsLowSurrogate(value[index + 1]))
{
Write('U');
Write(char.ConvertToUtf32(character, value[index + 1]).ToString("X08", CultureInfo.InvariantCulture));
if (useUtf16SurrogatePair)
{
Write('u');
Write(code.ToString("X04", CultureInfo.InvariantCulture));
Write('\\');
Write('u');
Write(((ushort)value[index + 1]).ToString("X04", CultureInfo.InvariantCulture));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should you check to make sure this won’t go out of bounds?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure but do you need to advance the index by one to handle the case when utf32 character is in the middle of the string?

Copy link
Contributor Author

@nahk-ivanov nahk-ivanov Oct 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't all of this already done on lines 1192 and 1208? The previous code worked somehow 🤷 we are just rendering it differently, not reading more or less than what was read before.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’m not at a computer, just a phone, so reviewing prs can be a bit tricky. If it’s already done then great. Ignore this comment.

}
else
{
Write('U');
Write(char.ConvertToUtf32(character, value[index + 1]).ToString("X08", CultureInfo.InvariantCulture));
}

index++;
}
else
Expand Down
44 changes: 37 additions & 7 deletions YamlDotNet/Core/EmitterSettings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,22 @@ public sealed class EmitterSettings
/// </summary>
public bool IndentSequences { get; }

/// <summary>
/// If true, then 4-byte UTF-32 characters are broken into two 2-byte code-points.
/// </summary>
/// <remarks>
/// This ensures compatibility with JSON format, as it does not allow '\Uxxxxxxxxx'
/// and instead expects two escaped 2-byte character '\uxxxx\uxxxx'.
/// </remarks>
public bool UseUtf16SurrogatePairs { get; }

public static readonly EmitterSettings Default = new EmitterSettings();

public EmitterSettings()
{
}

public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxSimpleKeyLength, bool skipAnchorName = false, bool indentSequences = false, string? newLine = null)
public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxSimpleKeyLength, bool skipAnchorName = false, bool indentSequences = false, string? newLine = null, bool useUtf16SurrogatePairs = false)
{
if (bestIndent < 2 || bestIndent > 9)
{
Expand All @@ -93,6 +102,7 @@ public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxS
SkipAnchorName = skipAnchorName;
IndentSequences = indentSequences;
NewLine = newLine ?? Environment.NewLine;
UseUtf16SurrogatePairs = useUtf16SurrogatePairs;
}

public EmitterSettings WithBestIndent(int bestIndent)
Expand All @@ -104,7 +114,8 @@ public EmitterSettings WithBestIndent(int bestIndent)
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
NewLine
NewLine,
UseUtf16SurrogatePairs
);
}

Expand All @@ -117,7 +128,8 @@ public EmitterSettings WithBestWidth(int bestWidth)
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
NewLine
NewLine,
UseUtf16SurrogatePairs
);
}

Expand All @@ -130,7 +142,8 @@ public EmitterSettings WithMaxSimpleKeyLength(int maxSimpleKeyLength)
maxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
NewLine
NewLine,
UseUtf16SurrogatePairs
);
}

Expand All @@ -143,7 +156,8 @@ public EmitterSettings WithNewLine(string newLine)
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
newLine
newLine,
UseUtf16SurrogatePairs
);
}

Expand All @@ -167,7 +181,8 @@ public EmitterSettings WithoutAnchorName()
MaxSimpleKeyLength,
true,
IndentSequences,
NewLine
NewLine,
UseUtf16SurrogatePairs
);
}

Expand All @@ -180,7 +195,22 @@ public EmitterSettings WithIndentedSequences()
MaxSimpleKeyLength,
SkipAnchorName,
true,
NewLine
NewLine,
UseUtf16SurrogatePairs
);
}

public EmitterSettings WithUtf16SurrogatePairs()
{
return new EmitterSettings(
BestIndent,
BestWidth,
IsCanonical,
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
NewLine,
true
);
}
}
Expand Down
3 changes: 2 additions & 1 deletion YamlDotNet/Serialization/SerializerBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,8 @@ public SerializerBuilder JsonCompatible()
{
this.emitterSettings = this.emitterSettings
.WithMaxSimpleKeyLength(int.MaxValue)
.WithoutAnchorName();
.WithoutAnchorName()
.WithUtf16SurrogatePairs();

return this
.WithTypeConverter(new GuidConverter(true), w => w.InsteadOf<GuidConverter>())
Expand Down
3 changes: 2 additions & 1 deletion YamlDotNet/Serialization/StaticSerializerBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,8 @@ public StaticSerializerBuilder JsonCompatible()
{
this.emitterSettings = this.emitterSettings
.WithMaxSimpleKeyLength(int.MaxValue)
.WithoutAnchorName();
.WithoutAnchorName()
.WithUtf16SurrogatePairs();

return this
.WithTypeConverter(new GuidConverter(true), w => w.InsteadOf<GuidConverter>())
Expand Down