Provide raw canonical composition and decomposition (#2099)
Closes #2073.
hsivonen authored Jul 18, 2022
1 parent f187f24 commit 321350b
Showing 17 changed files with 2,180 additions and 55 deletions.
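The tests added to `components/normalizer/src/tests.rs` below exercise the new raw API. As a minimal usage sketch (the `icu_normalizer::` paths are assumed here, since the in-crate tests import `CanonicalComposition`, `CanonicalDecomposition`, and `Decomposed` via `crate::`; the `icu_testdata` provider setup is taken directly from those tests):

```rust
use icu_normalizer::{CanonicalComposition, CanonicalDecomposition, Decomposed};

fn main() {
    // Provider setup mirrors the tests added in this commit.
    let provider = icu_testdata::get_provider();

    // Compose one starter with one combining mark into a precomposed character.
    let comp = CanonicalComposition::try_new(&provider).expect("data should load");
    assert_eq!(comp.compose('a', '\u{0308}'), Some('ä'));
    assert_eq!(comp.compose('a', 'b'), None); // two starters do not compose

    // Undo a single (non-recursive) canonical decomposition step.
    let decomp = CanonicalDecomposition::try_new(&provider).expect("data should load");
    assert_eq!(decomp.decompose('ä'), Decomposed::Expansion('a', '\u{0308}'));
    assert_eq!(decomp.decompose('\u{212B}'), Decomposed::Singleton('Å')); // ANGSTROM SIGN
}
```

The `Decomposed` result distinguishes singleton decompositions (such as U+212B ANGSTROM SIGN) from two-character expansions, as the tests below show.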
2 changes: 1 addition & 1 deletion components/collator/README.md
@@ -20,7 +20,7 @@ Create a directory `$PROJECTS/localicu`

Create a directory `$PROJECTS/icu-build` and `cd` into it.

-Run `../icu/icu4c/source/runConfigureICU --enable-debug Linux --prefix /opt/Projects/localicu --enable-static`
+Run `../icu/icu4c/source/runConfigureICU --enable-debug Linux --prefix $PROJECTS/localicu --enable-static`

Run `make`

2 changes: 1 addition & 1 deletion components/collator/src/lib.rs
@@ -39,7 +39,7 @@
//!
//! Create a directory `$PROJECTS/icu-build` and `cd` into it.
//!
-//! Run `../icu/icu4c/source/runConfigureICU --enable-debug Linux --prefix /opt/Projects/localicu --enable-static`
+//! Run `../icu/icu4c/source/runConfigureICU --enable-debug Linux --prefix $PROJECTS/localicu --enable-static`
//!
//! Run `make`
//!
390 changes: 337 additions & 53 deletions components/normalizer/src/lib.rs

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions components/normalizer/src/provider.rs
@@ -124,3 +124,19 @@ pub struct CompositionPassthroughV1<'data> {
#[cfg_attr(feature = "serde", serde(borrow))]
pub potential_passthrough_and_not_backward_combining: UnicodeSet<'data>,
}

/// Non-recursive canonical decompositions that differ from
/// `DecompositionDataV1`.
#[icu_provider::data_struct(NonRecursiveDecompositionSupplementV1Marker = "normalizer/decomp@1")]
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct NonRecursiveDecompositionSupplementV1<'data> {
/// Trie for the supplementary non-recursive decompositions
#[cfg_attr(feature = "serde", serde(borrow))]
pub trie: CodePointTrie<'data, u32>,
/// Decompositions with at least one character outside
/// the BMP
#[cfg_attr(feature = "serde", serde(borrow))]
pub scalars24: ZeroVec<'data, U24>,
}
67 changes: 67 additions & 0 deletions components/normalizer/src/tests.rs
@@ -2,7 +2,10 @@
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use crate::CanonicalComposition;
use crate::CanonicalDecomposition;
use crate::ComposingNormalizer;
use crate::Decomposed;
use crate::DecomposingNormalizer;

#[test]
@@ -340,3 +343,67 @@ fn test_hangul() {
assert!(norm_iter.eq("A\u{AC1B}".chars()));
}
}

#[test]
fn test_canonical_composition() {
let data_provider = icu_testdata::get_provider();
let comp = CanonicalComposition::try_new(&data_provider).unwrap();

assert_eq!(comp.compose('a', 'b'), None); // Just two starters

assert_eq!(comp.compose('a', '\u{0308}'), Some('ä'));
assert_eq!(comp.compose('A', '\u{0308}'), Some('Ä'));
assert_eq!(comp.compose('ẹ', '\u{0302}'), Some('ệ'));
assert_eq!(comp.compose('Ẹ', '\u{0302}'), Some('Ệ'));
assert_eq!(comp.compose('\u{1D157}', '\u{1D165}'), None); // Composition exclusion

assert_eq!(comp.compose('ে', 'া'), Some('ো')); // Second is starter; BMP
assert_eq!(comp.compose('𑄱', '𑄧'), Some('𑄮')); // Second is starter; non-BMP

assert_eq!(comp.compose('ᄀ', 'ᅡ'), Some('가')); // Hangul LV
assert_eq!(comp.compose('가', 'ᆨ'), Some('각')); // Hangul LVT
}

#[test]
fn test_canonical_decomposition() {
let data_provider = icu_testdata::get_provider();
let decomp = CanonicalDecomposition::try_new(&data_provider).unwrap();

assert_eq!(
decomp.decompose('ä'),
Decomposed::Expansion('a', '\u{0308}')
);
assert_eq!(
decomp.decompose('Ä'),
Decomposed::Expansion('A', '\u{0308}')
);
assert_eq!(
decomp.decompose('ệ'),
Decomposed::Expansion('ẹ', '\u{0302}')
);
assert_eq!(
decomp.decompose('Ệ'),
Decomposed::Expansion('Ẹ', '\u{0302}')
);
assert_eq!(
decomp.decompose('\u{1D15E}'),
Decomposed::Expansion('\u{1D157}', '\u{1D165}')
);
assert_eq!(decomp.decompose('ো'), Decomposed::Expansion('ে', 'া'));
assert_eq!(decomp.decompose('𑄮'), Decomposed::Expansion('𑄱', '𑄧'));
assert_eq!(decomp.decompose('가'), Decomposed::Expansion('ᄀ', 'ᅡ'));
assert_eq!(decomp.decompose('각'), Decomposed::Expansion('가', 'ᆨ'));

assert_eq!(decomp.decompose('\u{212B}'), Decomposed::Singleton('Å')); // ANGSTROM SIGN
assert_eq!(decomp.decompose('\u{2126}'), Decomposed::Singleton('Ω')); // OHM SIGN

assert_eq!(decomp.decompose('\u{1F71}'), Decomposed::Singleton('ά')); // oxia
assert_eq!(
decomp.decompose('\u{1F72}'),
Decomposed::Expansion('ε', '\u{0300}')
); // not oxia but in the oxia range
assert_eq!(
decomp.decompose('ά'),
Decomposed::Expansion('α', '\u{0301}')
); // tonos
}
3 changes: 3 additions & 0 deletions provider/datagen/src/registry.rs
@@ -44,6 +44,8 @@ pub fn all_keys() -> Vec<ResourceKey> {
icu_normalizer::provider::CompatibilityCompositionPassthroughV1Marker::KEY,
#[cfg(feature = "experimental")]
icu_normalizer::provider::Uts46CompositionPassthroughV1Marker::KEY,
#[cfg(feature = "experimental")]
icu_normalizer::provider::NonRecursiveDecompositionSupplementV1Marker::KEY,
];
v.extend(icu_properties::provider::ALL_KEYS);
#[cfg(feature = "experimental")]
@@ -190,6 +192,7 @@ macro_rules! create_datagen_provider {
$crate::transform::icuexport::normalizer::CanonicalCompositionPassthroughProvider,
$crate::transform::icuexport::normalizer::CompatibilityCompositionPassthroughProvider,
$crate::transform::icuexport::normalizer::Uts46CompositionPassthroughProvider,
$crate::transform::icuexport::normalizer::NonRecursiveDecompositionSupplementProvider,
$crate::transform::icuexport::ucase::CaseMappingDataProvider,
$crate::transform::icuexport::uprops::EnumeratedPropertyCodePointTrieProvider,
$crate::transform::icuexport::uprops::ScriptWithExtensionsPropertyProvider,
39 changes: 39 additions & 0 deletions provider/datagen/src/transform/icuexport/normalizer/mod.rs
@@ -191,6 +191,39 @@ macro_rules! normalization_canonical_compositions_provider {
};
}

macro_rules! normalization_non_recursive_decomposition_supplement_provider {
($marker:ident, $provider:ident, $file_name:literal) => {
normalization_provider!(
$marker,
$provider,
NonRecursiveDecompositionSupplement,
$file_name,
{
let trie = CodePointTrie::<u32>::try_from(&toml_data.trie)
.map_err(|e| DataError::custom("trie conversion").with_display_context(&e))?;
let mut scalars24: Vec<U24> = Vec::new();
for &u in toml_data.scalars32.iter() {
scalars24.push(
u.try_into()
.map_err(|_| DataError::custom("scalars24 conversion"))?,
);
}

Ok(DataResponse {
metadata: DataResponseMetadata::default(),
payload: Some(DataPayload::from_owned(
NonRecursiveDecompositionSupplementV1 {
trie,
scalars24: ZeroVec::alloc_from_slice(&scalars24),
},
)),
})
},
toml_data // simply matches the identifier in the above block
);
};
}

normalization_data_provider!(
CanonicalDecompositionDataV1Marker,
CanonicalDecompositionDataProvider,
@@ -257,3 +290,9 @@ normalization_canonical_compositions_provider!(
CanonicalCompositionsProvider,
"compositions"
);

normalization_non_recursive_decomposition_supplement_provider!(
NonRecursiveDecompositionSupplementV1Marker,
NonRecursiveDecompositionSupplementProvider,
"decompositionex"
);
@@ -31,3 +31,9 @@ pub struct CompositionPassthrough {
pub struct CanonicalCompositions {
pub compositions: Vec<u16>,
}

#[derive(serde::Deserialize)]
pub struct NonRecursiveDecompositionSupplement {
pub trie: CodePointTrieToml,
pub scalars32: Vec<u32>,
}
5 changes: 5 additions & 0 deletions provider/testdata/data/baked/any.rs
@@ -49,6 +49,8 @@ impl AnyProvider for BakedDataProvider {
::icu_normalizer::provider::CompatibilityDecompositionSupplementV1Marker::KEY.get_hash();
const COMPATIBILITYDECOMPOSITIONTABLESV1MARKER: ::icu_provider::ResourceKeyHash =
::icu_normalizer::provider::CompatibilityDecompositionTablesV1Marker::KEY.get_hash();
const NONRECURSIVEDECOMPOSITIONSUPPLEMENTV1MARKER: ::icu_provider::ResourceKeyHash =
::icu_normalizer::provider::NonRecursiveDecompositionSupplementV1Marker::KEY.get_hash();
const UTS46COMPOSITIONPASSTHROUGHV1MARKER: ::icu_provider::ResourceKeyHash =
::icu_normalizer::provider::Uts46CompositionPassthroughV1Marker::KEY.get_hash();
const UTS46DECOMPOSITIONSUPPLEMENTV1MARKER: ::icu_provider::ResourceKeyHash =
@@ -252,6 +254,9 @@ impl AnyProvider for BakedDataProvider {
COMPATIBILITYDECOMPOSITIONTABLESV1MARKER => AnyPayload::from_static_ref::<
<::icu_normalizer::provider::CompatibilityDecompositionTablesV1Marker as DataMarker>::Yokeable,
>(litemap_slice_get(normalizer::nfkdex_v1::DATA, key, req)?),
NONRECURSIVEDECOMPOSITIONSUPPLEMENTV1MARKER => AnyPayload::from_static_ref::<
<::icu_normalizer::provider::NonRecursiveDecompositionSupplementV1Marker as DataMarker>::Yokeable,
>(litemap_slice_get(normalizer::decomp_v1::DATA, key, req)?),
UTS46COMPOSITIONPASSTHROUGHV1MARKER => AnyPayload::from_static_ref::<
<::icu_normalizer::provider::Uts46CompositionPassthroughV1Marker as DataMarker>::Yokeable,
>(litemap_slice_get(normalizer::uts46_v1::DATA, key, req)?),
13 changes: 13 additions & 0 deletions provider/testdata/data/baked/mod.rs
@@ -497,6 +497,19 @@ impl ResourceProvider<::icu_normalizer::provider::CompatibilityDecompositionTabl
Ok (DataResponse { metadata : Default :: default () , payload : Some (DataPayload :: from_owned (zerofrom :: ZeroFrom :: zero_from (litemap_slice_get (normalizer :: nfkdex_v1 :: DATA , < :: icu_normalizer :: provider :: CompatibilityDecompositionTablesV1Marker as ResourceMarker > :: KEY , req) ? ,))) , })
}
}
impl ResourceProvider<::icu_normalizer::provider::NonRecursiveDecompositionSupplementV1Marker>
for BakedDataProvider
{
fn load_resource(
&self,
req: &DataRequest,
) -> Result<
DataResponse<::icu_normalizer::provider::NonRecursiveDecompositionSupplementV1Marker>,
DataError,
> {
Ok (DataResponse { metadata : Default :: default () , payload : Some (DataPayload :: from_owned (zerofrom :: ZeroFrom :: zero_from (litemap_slice_get (normalizer :: decomp_v1 :: DATA , < :: icu_normalizer :: provider :: NonRecursiveDecompositionSupplementV1Marker as ResourceMarker > :: KEY , req) ? ,))) , })
}
}
impl ResourceProvider<::icu_normalizer::provider::Uts46CompositionPassthroughV1Marker>
for BakedDataProvider
{

1 comment on commit 321350b

@github-actions

⚠️ Performance Alert ⚠️

A possible performance regression was detected for the benchmark.
The result for this commit is worse than the previous result, exceeding the ratio threshold of 1: the testdata file grew by 4,176 bytes, from 3,958,887 to 3,963,063 bytes.

| Benchmark suite | Current: 321350b | Previous: f187f24 | Ratio |
| --- | --- | --- | --- |
| provider/testdata/data/testdata.postcard | 3963063 bytes | 3958887 bytes | 1.00 |

This comment was automatically generated by a workflow using github-action-benchmark.

CC: @gnrunge @sffc @zbraniecki @echeran
