Skip to content

Commit

Permalink
Add support for NFKD and the decomposed counterpart of UTS 46 without…
Browse files Browse the repository at this point in the history
… ignored and disallowed (#1967)
  • Loading branch information
hsivonen authored Jun 2, 2022
1 parent 269b807 commit a3ba544
Show file tree
Hide file tree
Showing 32 changed files with 34,956 additions and 2,697 deletions.
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 4 additions & 2 deletions experimental/collator/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,12 @@ icu_provider = { version = "0.6", path = "../../provider/core", features = ["mac
icu_locid = { version = "0.6", path = "../../components/locid" }
icu_normalizer = { version = "0.6", path = "../../experimental/normalizer" }
icu_properties = { version = "0.6", path = "../../components/properties" }
icu_uniset = { version = "0.5", path = "../../utils/uniset" }
serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true }
zerovec = { version = "0.7", path = "../../utils/zerovec", features = ["serde"] }
utf8_iter = "1.0"
crabbake = { version = "0.4", path = "../../experimental/crabbake", optional = true, features = ["derive"] }
zerofrom = { version = "0.1.0", path = "../../utils/zerofrom" }

[dev-dependencies]
icu_testdata = { version = "0.6", path = "../../provider/testdata" }
Expand All @@ -56,5 +58,5 @@ bench = false # This option is required for Benchmark CI

[features]
default = []
serde = ["dep:serde", "zerovec/serde", "icu_char16trie/serde", "icu_properties/serde", "icu_normalizer/serde", "icu_codepointtrie/serde"]
datagen = ["serde", "crabbake", "zerovec/crabbake", "icu_char16trie/crabbake", "icu_properties/crabbake", "icu_normalizer/crabbake", "icu_codepointtrie/crabbake"]
serde = ["dep:serde", "zerovec/serde", "icu_char16trie/serde", "icu_properties/serde", "icu_normalizer/serde", "icu_uniset/serde", "icu_codepointtrie/serde"]
datagen = ["serde", "crabbake", "zerovec/crabbake", "icu_char16trie/crabbake", "icu_properties/crabbake", "icu_normalizer/crabbake", "icu_uniset/crabbake", "icu_codepointtrie/crabbake"]
22 changes: 22 additions & 0 deletions experimental/collator/src/comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ use core::cmp::Ordering;
use core::convert::TryFrom;
use icu_locid::Locale;
use icu_normalizer::provider::CanonicalDecompositionDataV1Marker;
use icu_normalizer::provider::CanonicalDecompositionTablesV1Marker;
use icu_normalizer::Decomposition;
use icu_properties::provider::CanonicalCombiningClassV1Marker;
use icu_provider::DataPayload;
Expand Down Expand Up @@ -72,6 +73,7 @@ pub struct Collator {
options: CollatorOptions,
reordering: Option<DataPayload<CollationReorderingV1Marker>>,
decompositions: DataPayload<CanonicalDecompositionDataV1Marker>,
tables: DataPayload<CanonicalDecompositionTablesV1Marker>,
ccc: DataPayload<CanonicalCombiningClassV1Marker>,
lithuanian_dot_above: bool,
}
Expand All @@ -90,6 +92,7 @@ impl Collator {
+ ResourceProvider<CollationMetadataV1Marker>
+ ResourceProvider<CollationReorderingV1Marker>
+ ResourceProvider<CanonicalDecompositionDataV1Marker>
+ ResourceProvider<CanonicalDecompositionTablesV1Marker>
+ ResourceProvider<CanonicalCombiningClassV1Marker>
+ ?Sized,
{
Expand Down Expand Up @@ -203,6 +206,10 @@ impl Collator {
.load_resource(&DataRequest::default())?
.take_payload()?;

let tables: DataPayload<CanonicalDecompositionTablesV1Marker> = data_provider
.load_resource(&DataRequest::default())?
.take_payload()?;

let ccc: DataPayload<CanonicalCombiningClassV1Marker> =
icu_properties::maps::get_canonical_combining_class(data_provider)?;

Expand Down Expand Up @@ -246,6 +253,7 @@ impl Collator {
options: merged_options,
reordering,
decompositions,
tables,
ccc,
lithuanian_dot_above: metadata.lithuanian_dot_above(),
})
Expand All @@ -272,11 +280,15 @@ impl Collator {
return Decomposition::new(
decode_utf16(left.iter().copied()).map(utf16_error_to_replacement),
self.decompositions.get(),
self.tables.get(),
None,
&self.ccc.get().code_point_trie,
)
.cmp(Decomposition::new(
decode_utf16(right.iter().copied()).map(utf16_error_to_replacement),
self.decompositions.get(),
self.tables.get(),
None,
&self.ccc.get().code_point_trie,
));
}
Expand All @@ -301,11 +313,15 @@ impl Collator {
return Decomposition::new(
left.chars(),
self.decompositions.get(),
self.tables.get(),
None,
&self.ccc.get().code_point_trie,
)
.cmp(Decomposition::new(
right.chars(),
self.decompositions.get(),
self.tables.get(),
None,
&self.ccc.get().code_point_trie,
));
}
Expand All @@ -330,11 +346,15 @@ impl Collator {
return Decomposition::new(
left.chars(),
self.decompositions.get(),
self.tables.get(),
None,
&self.ccc.get().code_point_trie,
)
.cmp(Decomposition::new(
right.chars(),
self.decompositions.get(),
self.tables.get(),
None,
&self.ccc.get().code_point_trie,
));
}
Expand Down Expand Up @@ -411,6 +431,7 @@ impl Collator {
)
.unwrap(), // length already validated
self.decompositions.get(),
self.tables.get(),
&self.ccc.get().code_point_trie,
numeric_primary,
self.lithuanian_dot_above,
Expand All @@ -426,6 +447,7 @@ impl Collator {
)
.unwrap(), // length already validated
self.decompositions.get(),
self.tables.get(),
&self.ccc.get().code_point_trie,
numeric_primary,
self.lithuanian_dot_above,
Expand Down
Loading

2 comments on commit a3ba544

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark.
The benchmark result of this commit is worse than the previous benchmark result, exceeding the threshold of 1.

Benchmark suite Current: a3ba544 Previous: 269b807 Ratio
provider/testdata/data/testdata.postcard 2301197 bytes 2221790 bytes 1.04

This comment was automatically generated by workflow using github-action-benchmark.

CC: @gnrunge @sffc @zbraniecki @echeran

@echeran
Copy link
Contributor

@echeran echeran commented on a3ba544 Jun 2, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The increase in data is expected because this PR adds normalization-related data to testdata. This occurred in the datasize CI job.

Another non-issue (as Henri predicted) is that one of the CI runs during development of the PR failed due to flakiness. That failure occurred in the binsize CI job and was caused by internal GitHub Actions infrastructure issues. The job passed on the CI run triggered by the merge to main.

Please sign in to comment.