Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Store only secondary weight in diacritic table and remove jamo tailoring bit #1978

Merged
merged 15 commits into from
Jun 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 16 additions & 12 deletions experimental/collator/src/comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
//! the comparison of collation element sequences.

use crate::elements::{
CollationElement, CollationElements, NonPrimary, COMBINING_DIACRITICS_COUNT, JAMO_COUNT, NO_CE,
NO_CE_PRIMARY, NO_CE_SECONDARY, NO_CE_TERTIARY, QUATERNARY_MASK,
CollationElement, CollationElements, NonPrimary, JAMO_COUNT, NO_CE, NO_CE_PRIMARY,
NO_CE_SECONDARY, NO_CE_TERTIARY, OPTIMIZED_DIACRITICS_MAX_COUNT, QUATERNARY_MASK,
};
use crate::error::CollatorError;
use crate::provider::CollationDataV1Marker;
Expand Down Expand Up @@ -179,9 +179,10 @@ impl Collator {
.load_resource(&DataRequest::default())?
.take_payload()?;

let tailored_diacritics = metadata.tailored_diacritics();
let diacritics: DataPayload<CollationDiacriticsV1Marker> = data_provider
.load_resource(&DataRequest {
options: if metadata.tailored_diacritics() {
options: if tailored_diacritics {
resource_options
} else {
ResourceOptions::default()
Expand All @@ -190,7 +191,16 @@ impl Collator {
})?
.take_payload()?;

if diacritics.get().ce32s.len() != COMBINING_DIACRITICS_COUNT {
if tailored_diacritics {
// In the tailored case we accept a shorter table in which case the tailoring is
// responsible for supplying the missing values in the trie.
// As of June 2022, none of the collations actually use a shortened table.
// Vietnamese and Ewe load a full-length alternative table and the rest use
// the default one.
if diacritics.get().secondaries.len() > OPTIMIZED_DIACRITICS_MAX_COUNT {
return Err(CollatorError::MalformedData);
}
} else if diacritics.get().secondaries.len() != OPTIMIZED_DIACRITICS_MAX_COUNT {
return Err(CollatorError::MalformedData);
}

Expand Down Expand Up @@ -426,10 +436,7 @@ impl Collator {
tailoring.get(),
<&[<u32 as AsULE>::ULE; JAMO_COUNT]>::try_from(self.jamo.get().ce32s.as_ule_slice())
.unwrap(), // length already validated
<&[<u32 as AsULE>::ULE; COMBINING_DIACRITICS_COUNT]>::try_from(
self.diacritics.get().ce32s.as_ule_slice(),
)
.unwrap(), // length already validated
&self.diacritics.get().secondaries,
self.decompositions.get(),
self.tables.get(),
&self.ccc.get().code_point_trie,
Expand All @@ -442,10 +449,7 @@ impl Collator {
tailoring.get(),
<&[<u32 as AsULE>::ULE; JAMO_COUNT]>::try_from(self.jamo.get().ce32s.as_ule_slice())
.unwrap(), // length already validated
<&[<u32 as AsULE>::ULE; COMBINING_DIACRITICS_COUNT]>::try_from(
self.diacritics.get().ce32s.as_ule_slice(),
)
.unwrap(), // length already validated
&self.diacritics.get().secondaries,
self.decompositions.get(),
self.tables.get(),
&self.ccc.get().code_point_trie,
Expand Down
42 changes: 18 additions & 24 deletions experimental/collator/src/elements.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ const HANGUL_S_COUNT: u32 = 11172;
pub(crate) const JAMO_COUNT: usize = 256; // 0x1200 - 0x1100

const COMBINING_DIACRITICS_BASE: usize = 0x0300;
const COMBINING_DIACRITICS_LIMIT: usize = 0x0370;
pub(crate) const COMBINING_DIACRITICS_COUNT: usize =
COMBINING_DIACRITICS_LIMIT - COMBINING_DIACRITICS_BASE;
const OPTIMIZED_DIACRITICS_LIMIT: usize = 0x034F;
pub(crate) const OPTIMIZED_DIACRITICS_MAX_COUNT: usize =
OPTIMIZED_DIACRITICS_LIMIT - COMBINING_DIACRITICS_BASE;

pub(crate) const CASE_MASK: u16 = 0xC000;
pub(crate) const TERTIARY_MASK: u16 = 0x3F3F; // ONLY_TERTIARY_MASK in ICU4C
Expand Down Expand Up @@ -467,6 +467,11 @@ impl CollationElement {
CollationElement((u64::from(primary) << 32) | COMMON_SEC_AND_TER_CE)
}

#[inline(always)]
pub fn new_from_secondary(secondary: u16) -> Self {
CollationElement((u64::from(secondary) << 16) | COMMON_TERTIARY_CE)
}

#[inline(always)]
pub fn new_implicit_from_char(c: char) -> Self {
// Collation::unassignedPrimaryFromCodePoint
Expand Down Expand Up @@ -700,7 +705,7 @@ where
/// Note: in ICU4C the jamo table contains only modern jamo. Here, the jamo table contains the whole Unicode block.
jamo: &'data [<u32 as AsULE>::ULE; JAMO_COUNT],
/// The `CollationElement32` mapping for the Combining Diacritical Marks block.
diacritics: &'data [<u32 as AsULE>::ULE; COMBINING_DIACRITICS_COUNT],
diacritics: &'data ZeroSlice<u16>,
/// NFD main trie.
trie: &'data CodePointTrie<'data, u32>,
/// NFD helper set
Expand Down Expand Up @@ -731,7 +736,7 @@ where
root: &'data CollationDataV1,
tailoring: &'data CollationDataV1,
jamo: &'data [<u32 as AsULE>::ULE; JAMO_COUNT],
diacritics: &'data [<u32 as AsULE>::ULE; COMBINING_DIACRITICS_COUNT],
diacritics: &'data ZeroSlice<u16>,
decompositions: &'data DecompositionDataV1,
tables: &'data DecompositionTablesV1,
ccc: &'data CodePointTrie<'data, CanonicalCombiningClass>,
Expand Down Expand Up @@ -1183,15 +1188,11 @@ where
if self.is_next_decomposition_starts_with_starter() {
let diacritic_index =
(low as usize).wrapping_sub(COMBINING_DIACRITICS_BASE);
if diacritic_index < self.diacritics.len() {
if let Some(secondary) = self.diacritics.get(diacritic_index) {
debug_assert!(low != 0x0344, "Should never have COMBINING GREEK DIALYTIKA TONOS here, since it should have decomposed further.");
if let Some(ce) = ce32.to_ce_simple_or_long_primary() {
// Inner unwrap: already checked len()
// Outer unwrap: expectation of data integrity.
let ce_for_combining = CollationElement32::new_from_ule(
self.diacritics[diacritic_index],
)
.to_ce_self_contained_or_gigo();
let ce_for_combining =
CollationElement::new_from_secondary(secondary);
self.pending.push(ce_for_combining);
self.mark_prefix_unmatchable();
return ce;
Expand All @@ -1205,13 +1206,8 @@ where
TrieResult::NoMatch | TrieResult::NoValue => {
if let Some(ce) = default.to_ce_simple_or_long_primary()
{
// Inner unwrap: already checked len()
// Outer unwrap: expectation of data integrity.
let ce_for_combining =
CollationElement32::new_from_ule(
self.diacritics[diacritic_index],
)
.to_ce_self_contained_or_gigo();
CollationElement::new_from_secondary(secondary);
self.pending.push(ce_for_combining);
self.mark_prefix_unmatchable();
return ce;
Expand Down Expand Up @@ -1854,7 +1850,7 @@ where
'combining: while i < combining_characters.len() {
c = combining_characters[i].character();
let diacritic_index = (c as usize).wrapping_sub(COMBINING_DIACRITICS_BASE);
if let Some(&diacritic) = self.diacritics.get(diacritic_index) {
if let Some(secondary) = self.diacritics.get(diacritic_index) {
// TODO: unlikely annotation for the first two conditions here:
if c == '\u{0307}'
&& self.lithuanian_dot_above
Expand All @@ -1873,15 +1869,13 @@ where
continue 'combining;
}
}
self.pending.push(
CollationElement32::new_from_ule(diacritic)
.to_ce_self_contained_or_gigo(),
);
self.pending
.push(CollationElement::new_from_secondary(secondary));
self.mark_prefix_unmatchable();
i += 1;
continue 'combining;
}
// `c` is not from the Combining Diacritical Marks block.
// `c` is not a table-optimized diacritic.
// Not bothering to micro optimize away the move of the remaining
// part of `combining_characters`.
let _ = combining_characters.drain(..=i);
Expand Down
24 changes: 7 additions & 17 deletions experimental/collator/src/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ impl<'data> CollationDataV1<'data> {
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct CollationDiacriticsV1<'data> {
#[cfg_attr(feature = "serde", serde(borrow))]
pub ce32s: ZeroVec<'data, u32>,
pub secondaries: ZeroVec<'data, u16>,
}

#[icu_provider::data_struct(CollationJamoV1Marker = "collator/jamo@1")]
Expand Down Expand Up @@ -232,14 +232,12 @@ impl CollationMetadataV1 {
const MAX_VARIABLE_MASK: u32 = 0b11;
const TAILORED_MASK: u32 = 1 << 3;
const TAILORED_DIACRITICS_MASK: u32 = 1 << 4;
#[allow(dead_code)]
const TAILORED_JAMO_MASK: u32 = 1 << 5;
const REORDERING_MASK: u32 = 1 << 6;
const LITHUANIAN_DOT_ABOVE_MASK: u32 = 1 << 7;
const BACWARD_SECOND_LEVEL_MASK: u32 = 1 << 8;
const ALTERNATE_SHIFTED_MASK: u32 = 1 << 9;
const CASE_FIRST_MASK: u32 = 1 << 10;
const UPPER_FIRST_MASK: u32 = 1 << 11;
const REORDERING_MASK: u32 = 1 << 5;
const LITHUANIAN_DOT_ABOVE_MASK: u32 = 1 << 6;
const BACWARD_SECOND_LEVEL_MASK: u32 = 1 << 7;
const ALTERNATE_SHIFTED_MASK: u32 = 1 << 8;
const CASE_FIRST_MASK: u32 = 1 << 9;
const UPPER_FIRST_MASK: u32 = 1 << 10;

#[inline(always)]
pub(crate) fn max_variable(&self) -> MaxVariable {
Expand All @@ -259,14 +257,6 @@ impl CollationMetadataV1 {
self.bits & CollationMetadataV1::TAILORED_DIACRITICS_MASK != 0
}

/// Korean search
/// XXX: Currently ignored and being redesigned
#[allow(dead_code)]
#[inline(always)]
pub(crate) fn tailored_jamo(&self) -> bool {
self.bits & CollationMetadataV1::TAILORED_JAMO_MASK != 0
}

/// Lithuanian
#[inline(always)]
pub(crate) fn lithuanian_dot_above(&self) -> bool {
Expand Down
4 changes: 2 additions & 2 deletions provider/datagen/src/transform/collator/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ struct CollationData {
/// Serde counterpart for `CollationDiacriticsV1`.
#[derive(serde::Deserialize)]
struct CollationDiacritics {
pub ce32s: Vec<u32>,
pub secondaries: Vec<u16>,
}

/// Serde counterpart for `CollationJamoV1`.
Expand Down Expand Up @@ -221,7 +221,7 @@ collation_provider!(
CollationDiacritics,
"_dia",
icu_collator::provider::CollationDiacriticsV1 {
ce32s: ZeroVec::alloc_from_slice(&toml_data.ce32s),
secondaries: ZeroVec::alloc_from_slice(&toml_data.secondaries),
}
),
(
Expand Down
Loading