diff --git a/Cargo.lock b/Cargo.lock index f499fc64377..48ca866dc1e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1026,7 +1026,9 @@ dependencies = [ "criterion", "icu_benchmark_macros", "icu_provider", + "litemap", "serde", + "tinystr", ] [[package]] diff --git a/components/provider_ppucd/Cargo.toml b/components/provider_ppucd/Cargo.toml index 9808c06d94b..c05e02dcbaf 100644 --- a/components/provider_ppucd/Cargo.toml +++ b/components/provider_ppucd/Cargo.toml @@ -26,5 +26,5 @@ skip_optional_dependencies = true icu_locid = { version = "0.1", path = "../locid" } icu_provider = { version = "0.1", path = "../provider" } icu_locid_macros = { version = "0.1", path = "../locid/macros" } -tinystr = "0.4" icu_uniset = { version = "0.1", path = "../uniset" } +tinystr = "0.4" \ No newline at end of file diff --git a/components/provider_ppucd/src/enum_prop_mapping.rs b/components/provider_ppucd/src/enum_prop_mapping.rs new file mode 100644 index 00000000000..2e137d8726f --- /dev/null +++ b/components/provider_ppucd/src/enum_prop_mapping.rs @@ -0,0 +1,862 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use icu_uniset::enum_props::*; +use std::str::FromStr; +use tinystr::TinyStr16; + +// +// Single getter function for enumerated property name: +// Enum prop name string -> Rust enum +// + +fn get_enum_property_enum(name: &str) -> Option { + Some(match name { + "bc" => EnumeratedProperty::BidiClass, + "bpt" => EnumeratedProperty::BidiPairedBracketType, + "ccc" => EnumeratedProperty::CanonicalCombiningClass, + "dt" => EnumeratedProperty::DecompositionType, + "ea" => EnumeratedProperty::EastAsianWidth, + "gc" => EnumeratedProperty::GeneralCategory, + "GCB" => EnumeratedProperty::GraphemeClusterBreak, + "hst" => EnumeratedProperty::HangulSyllableType, + "InPC" => EnumeratedProperty::IndicPositionalCategory, + "InSC" => EnumeratedProperty::IndicSyllabicCategory, + "jg" => EnumeratedProperty::JoiningGroup, + "jt" => EnumeratedProperty::JoiningType, + "lb" => EnumeratedProperty::LineBreak, + "lccc" => EnumeratedProperty::LeadCanonicalCombiningClass, + "NFC_QC" => EnumeratedProperty::NFCQuickCheck, + "NFD_QC" => EnumeratedProperty::NFDQuickCheck, + "NFKC_QC" => EnumeratedProperty::NFKCQuickCheck, + "NFKD_QC" => EnumeratedProperty::NFKDQuickCheck, + "nt" => EnumeratedProperty::NumericType, + "SB" => EnumeratedProperty::SentenceBreak, + "tccc" => EnumeratedProperty::TrailCanonicalCombiningClass, + "vo" => EnumeratedProperty::VerticalOrientation, + "WB" => EnumeratedProperty::WordBreak, + _ => return None, + }) +} + +// +// Getter function per enumerated property: +// Enum prop val string -> Rust enum +// + +fn get_bidi_class_enum(name: &str) -> Option { + Some(match name { + "AL" => BidiClass::ArabicLetter, + "AN" => BidiClass::ArabicNumber, + "B" => BidiClass::ParagraphSeparator, + "BN" => BidiClass::BoundaryNeutral, + "CS" => BidiClass::CommonSeparator, + "EN" => BidiClass::EuropeanNumber, + "ES" => BidiClass::EuropeanSeparator, + "ET" => BidiClass::EuropeanTerminator, + "FSI" => BidiClass::FirstStrongIsolate, + "L" => BidiClass::LeftToRight, + "LRE" => 
BidiClass::LeftToRightEmbedding, + "LRI" => BidiClass::LeftToRightIsolate, + "LRO" => BidiClass::LeftToRightOverride, + "NSM" => BidiClass::NonspacingMark, + "ON" => BidiClass::OtherNeutral, + "PDF" => BidiClass::PopDirectionalFormat, + "PDI" => BidiClass::PopDirectionalIsolate, + "R" => BidiClass::RightToLeft, + "RLE" => BidiClass::RightToLeftEmbedding, + "RLI" => BidiClass::RightToLeftIsolate, + "RLO" => BidiClass::RightToLeftOverride, + "S" => BidiClass::SegmentSeparator, + "WS" => BidiClass::WhiteSpace, + _ => return None, + }) +} + +fn get_bidi_paired_bracket_type_enum(name: &str) -> Option { + Some(match name { + "c" => BidiPairedBracketType::Close, + "n" => BidiPairedBracketType::None, + "o" => BidiPairedBracketType::Open, + _ => return None, + }) +} + +fn get_canonical_combining_class_enum(name: &str) -> Option { + Some(match name { + "0" => CanonicalCombiningClass::NotReordered, + "1" => CanonicalCombiningClass::Overlay, + "10" => CanonicalCombiningClass::CCC10, + "103" => CanonicalCombiningClass::CCC103, + "107" => CanonicalCombiningClass::CCC107, + "11" => CanonicalCombiningClass::CCC11, + "118" => CanonicalCombiningClass::CCC118, + "12" => CanonicalCombiningClass::CCC12, + "122" => CanonicalCombiningClass::CCC122, + "129" => CanonicalCombiningClass::CCC129, + "13" => CanonicalCombiningClass::CCC13, + "130" => CanonicalCombiningClass::CCC130, + "132" => CanonicalCombiningClass::CCC132, + "133" => CanonicalCombiningClass::CCC133, + "14" => CanonicalCombiningClass::CCC14, + "15" => CanonicalCombiningClass::CCC15, + "16" => CanonicalCombiningClass::CCC16, + "17" => CanonicalCombiningClass::CCC17, + "18" => CanonicalCombiningClass::CCC18, + "19" => CanonicalCombiningClass::CCC19, + "20" => CanonicalCombiningClass::CCC20, + "200" => CanonicalCombiningClass::AttachedBelowLeft, + "202" => CanonicalCombiningClass::AttachedBelow, + "21" => CanonicalCombiningClass::CCC21, + "214" => CanonicalCombiningClass::AttachedAbove, + "216" => 
CanonicalCombiningClass::AttachedAboveRight, + "218" => CanonicalCombiningClass::BelowLeft, + "22" => CanonicalCombiningClass::CCC22, + "220" => CanonicalCombiningClass::Below, + "222" => CanonicalCombiningClass::BelowRight, + "224" => CanonicalCombiningClass::Left, + "226" => CanonicalCombiningClass::Right, + "228" => CanonicalCombiningClass::AboveLeft, + "23" => CanonicalCombiningClass::CCC23, + "230" => CanonicalCombiningClass::Above, + "232" => CanonicalCombiningClass::AboveRight, + "233" => CanonicalCombiningClass::DoubleBelow, + "234" => CanonicalCombiningClass::DoubleAbove, + "24" => CanonicalCombiningClass::CCC24, + "240" => CanonicalCombiningClass::IotaSubscript, + "25" => CanonicalCombiningClass::CCC25, + "26" => CanonicalCombiningClass::CCC26, + "27" => CanonicalCombiningClass::CCC27, + "28" => CanonicalCombiningClass::CCC28, + "29" => CanonicalCombiningClass::CCC29, + "30" => CanonicalCombiningClass::CCC30, + "31" => CanonicalCombiningClass::CCC31, + "32" => CanonicalCombiningClass::CCC32, + "33" => CanonicalCombiningClass::CCC33, + "34" => CanonicalCombiningClass::CCC34, + "35" => CanonicalCombiningClass::CCC35, + "36" => CanonicalCombiningClass::CCC36, + "6" => CanonicalCombiningClass::HanReading, + "7" => CanonicalCombiningClass::Nukta, + "8" => CanonicalCombiningClass::KanaVoicing, + "84" => CanonicalCombiningClass::CCC84, + "9" => CanonicalCombiningClass::Virama, + "91" => CanonicalCombiningClass::CCC91, + _ => return None, + }) +} + +fn get_decomposition_type_enum(name: &str) -> Option { + Some(match name { + "Can" => DecompositionType::Can, + "Com" => DecompositionType::Com, + "Enc" => DecompositionType::Enc, + "Fin" => DecompositionType::Fin, + "Font" => DecompositionType::Font, + "Fra" => DecompositionType::Fra, + "Init" => DecompositionType::Init, + "Iso" => DecompositionType::Iso, + "Med" => DecompositionType::Med, + "Nar" => DecompositionType::Nar, + "Nb" => DecompositionType::Nb, + "None" => DecompositionType::None, + "Sml" => 
DecompositionType::Sml, + "Sqr" => DecompositionType::Sqr, + "Sub" => DecompositionType::Sub, + "Sup" => DecompositionType::Sup, + "Vert" => DecompositionType::Vert, + "Wide" => DecompositionType::Wide, + _ => return None, + }) +} + +fn get_east_asian_width_enum(name: &str) -> Option { + Some(match name { + "A" => EastAsianWidth::Ambiguous, + "F" => EastAsianWidth::Fullwidth, + "H" => EastAsianWidth::Halfwidth, + "N" => EastAsianWidth::Neutral, + "Na" => EastAsianWidth::Narrow, + "W" => EastAsianWidth::Wide, + _ => return None, + }) +} + +fn get_general_category_enum(name: &str) -> Option { + Some(match name { + "C" => GeneralCategory::Other, + "Cc" => GeneralCategory::Cntrl, + "Cf" => GeneralCategory::Format, + "Cn" => GeneralCategory::Unassigned, + "Co" => GeneralCategory::PrivateUse, + "Cs" => GeneralCategory::Surrogate, + "L" => GeneralCategory::Letter, + "LC" => GeneralCategory::CasedLetter, + "Ll" => GeneralCategory::LowercaseLetter, + "Lm" => GeneralCategory::ModifierLetter, + "Lo" => GeneralCategory::OtherLetter, + "Lt" => GeneralCategory::TitlecaseLetter, + "Lu" => GeneralCategory::UppercaseLetter, + "M" => GeneralCategory::CombiningMark, + "Mc" => GeneralCategory::SpacingMark, + "Me" => GeneralCategory::EnclosingMark, + "Mn" => GeneralCategory::NonspacingMark, + "N" => GeneralCategory::Number, + "Nd" => GeneralCategory::Digit, + "Nl" => GeneralCategory::LetterNumber, + "No" => GeneralCategory::OtherNumber, + "P" => GeneralCategory::Punct, + "Pc" => GeneralCategory::ConnectorPunctuation, + "Pd" => GeneralCategory::DashPunctuation, + "Pe" => GeneralCategory::ClosePunctuation, + "Pf" => GeneralCategory::FinalPunctuation, + "Pi" => GeneralCategory::InitialPunctuation, + "Po" => GeneralCategory::OtherPunctuation, + "Ps" => GeneralCategory::OpenPunctuation, + "S" => GeneralCategory::Symbol, + "Sc" => GeneralCategory::CurrencySymbol, + "Sk" => GeneralCategory::ModifierSymbol, + "Sm" => GeneralCategory::MathSymbol, + "So" => GeneralCategory::OtherSymbol, + "Z" => 
GeneralCategory::Separator, + "Zl" => GeneralCategory::LineSeparator, + "Zp" => GeneralCategory::ParagraphSeparator, + "Zs" => GeneralCategory::SpaceSeparator, + _ => return None, + }) +} + +fn get_grapheme_cluster_break_enum(name: &str) -> Option { + Some(match name { + "CN" => GraphemeClusterBreak::Control, + "CR" => GraphemeClusterBreak::CR, + "EB" => GraphemeClusterBreak::EBase, + "EBG" => GraphemeClusterBreak::EBaseGAZ, + "EM" => GraphemeClusterBreak::EModifier, + "EX" => GraphemeClusterBreak::Extend, + "GAZ" => GraphemeClusterBreak::GlueAfterZwj, + "L" => GraphemeClusterBreak::L, + "LF" => GraphemeClusterBreak::LF, + "LV" => GraphemeClusterBreak::LV, + "LVT" => GraphemeClusterBreak::LVT, + "PP" => GraphemeClusterBreak::Prepend, + "RI" => GraphemeClusterBreak::RegionalIndicator, + "SM" => GraphemeClusterBreak::SpacingMark, + "T" => GraphemeClusterBreak::T, + "V" => GraphemeClusterBreak::V, + "XX" => GraphemeClusterBreak::Other, + "ZWJ" => GraphemeClusterBreak::ZWJ, + _ => return None, + }) +} + +fn get_hangul_syllable_type_enum(name: &str) -> Option { + Some(match name { + "L" => HangulSyllableType::LeadingJamo, + "LV" => HangulSyllableType::LVSyllable, + "LVT" => HangulSyllableType::LVTSyllable, + "NA" => HangulSyllableType::NotApplicable, + "T" => HangulSyllableType::TrailingJamo, + "V" => HangulSyllableType::VowelJamo, + _ => return None, + }) +} + +fn get_indic_positional_category_enum(name: &str) -> Option { + Some(match name { + "Bottom" => IndicPositionalCategory::Bottom, + "Bottom_And_Left" => IndicPositionalCategory::BottomAndLeft, + "Bottom_And_Right" => IndicPositionalCategory::BottomAndRight, + "Left" => IndicPositionalCategory::Left, + "Left_And_Right" => IndicPositionalCategory::LeftAndRight, + "NA" => IndicPositionalCategory::NA, + "Overstruck" => IndicPositionalCategory::Overstruck, + "Right" => IndicPositionalCategory::Right, + "Top" => IndicPositionalCategory::Top, + "Top_And_Bottom" => IndicPositionalCategory::TopAndBottom, + 
"Top_And_Bottom_And_Left" => IndicPositionalCategory::TopAndBottomAndLeft, + "Top_And_Bottom_And_Right" => IndicPositionalCategory::TopAndBottomAndRight, + "Top_And_Left" => IndicPositionalCategory::TopAndLeft, + "Top_And_Left_And_Right" => IndicPositionalCategory::TopAndLeftAndRight, + "Top_And_Right" => IndicPositionalCategory::TopAndRight, + "Visual_Order_Left" => IndicPositionalCategory::VisualOrderLeft, + _ => return None, + }) +} + +fn get_indic_syllabic_category_enum(name: &str) -> Option { + Some(match name { + "Avagraha" => IndicSyllabicCategory::Avagraha, + "Bindu" => IndicSyllabicCategory::Bindu, + "Brahmi_Joining_Number" => IndicSyllabicCategory::BrahmiJoiningNumber, + "Cantillation_Mark" => IndicSyllabicCategory::CantillationMark, + "Consonant" => IndicSyllabicCategory::Consonant, + "Consonant_Dead" => IndicSyllabicCategory::ConsonantDead, + "Consonant_Final" => IndicSyllabicCategory::ConsonantFinal, + "Consonant_Head_Letter" => IndicSyllabicCategory::ConsonantHeadLetter, + "Consonant_Initial_Postfixed" => IndicSyllabicCategory::ConsonantInitialPostfixed, + "Consonant_Killer" => IndicSyllabicCategory::ConsonantKiller, + "Consonant_Medial" => IndicSyllabicCategory::ConsonantMedial, + "Consonant_Placeholder" => IndicSyllabicCategory::ConsonantPlaceholder, + "Consonant_Preceding_Repha" => IndicSyllabicCategory::ConsonantPrecedingRepha, + "Consonant_Prefixed" => IndicSyllabicCategory::ConsonantPrefixed, + "Consonant_Subjoined" => IndicSyllabicCategory::ConsonantSubjoined, + "Consonant_Succeeding_Repha" => IndicSyllabicCategory::ConsonantSucceedingRepha, + "Consonant_With_Stacker" => IndicSyllabicCategory::ConsonantWithStacker, + "Gemination_Mark" => IndicSyllabicCategory::GeminationMark, + "Invisible_Stacker" => IndicSyllabicCategory::InvisibleStacker, + "Joiner" => IndicSyllabicCategory::Joiner, + "Modifying_Letter" => IndicSyllabicCategory::ModifyingLetter, + "Non_Joiner" => IndicSyllabicCategory::NonJoiner, + "Nukta" => IndicSyllabicCategory::Nukta, + 
"Number" => IndicSyllabicCategory::Number, + "Number_Joiner" => IndicSyllabicCategory::NumberJoiner, + "Other" => IndicSyllabicCategory::Other, + "Pure_Killer" => IndicSyllabicCategory::PureKiller, + "Register_Shifter" => IndicSyllabicCategory::RegisterShifter, + "Syllable_Modifier" => IndicSyllabicCategory::SyllableModifier, + "Tone_Letter" => IndicSyllabicCategory::ToneLetter, + "Tone_Mark" => IndicSyllabicCategory::ToneMark, + "Virama" => IndicSyllabicCategory::Virama, + "Visarga" => IndicSyllabicCategory::Visarga, + "Vowel" => IndicSyllabicCategory::Vowel, + "Vowel_Dependent" => IndicSyllabicCategory::VowelDependent, + "Vowel_Independent" => IndicSyllabicCategory::VowelIndependent, + _ => return None, + }) +} + +fn get_joining_group_enum(name: &str) -> Option { + Some(match name { + "African_Feh" => JoiningGroup::AfricanFeh, + "African_Noon" => JoiningGroup::AfricanNoon, + "African_Qaf" => JoiningGroup::AfricanQaf, + "Ain" => JoiningGroup::Ain, + "Alaph" => JoiningGroup::Alaph, + "Alef" => JoiningGroup::Alef, + "Beh" => JoiningGroup::Beh, + "Beth" => JoiningGroup::Beth, + "Burushaski_Yeh_Barree" => JoiningGroup::BurushaskiYehBarree, + "Dal" => JoiningGroup::Dal, + "Dalath_Rish" => JoiningGroup::DalathRish, + "E" => JoiningGroup::E, + "Farsi_Yeh" => JoiningGroup::FarsiYeh, + "Fe" => JoiningGroup::Fe, + "Feh" => JoiningGroup::Feh, + "Final_Semkath" => JoiningGroup::FinalSemkath, + "Gaf" => JoiningGroup::Gaf, + "Gamal" => JoiningGroup::Gamal, + "Hah" => JoiningGroup::Hah, + "Hanifi_Rohingya_Kinna_Ya" => JoiningGroup::HanifiRohingyaKinnaYa, + "Hanifi_Rohingya_Pa" => JoiningGroup::HanifiRohingyaPa, + "He" => JoiningGroup::He, + "Heh" => JoiningGroup::Heh, + "Heh_Goal" => JoiningGroup::HehGoal, + "Heth" => JoiningGroup::Heth, + "Kaf" => JoiningGroup::Kaf, + "Kaph" => JoiningGroup::Kaph, + "Khaph" => JoiningGroup::Khaph, + "Knotted_Heh" => JoiningGroup::KnottedHeh, + "Lam" => JoiningGroup::Lam, + "Lamadh" => JoiningGroup::Lamadh, + "Malayalam_Bha" => 
JoiningGroup::MalayalamBha, + "Malayalam_Ja" => JoiningGroup::MalayalamJa, + "Malayalam_Lla" => JoiningGroup::MalayalamLla, + "Malayalam_Llla" => JoiningGroup::MalayalamLlla, + "Malayalam_Nga" => JoiningGroup::MalayalamNga, + "Malayalam_Nna" => JoiningGroup::MalayalamNna, + "Malayalam_Nnna" => JoiningGroup::MalayalamNnna, + "Malayalam_Nya" => JoiningGroup::MalayalamNya, + "Malayalam_Ra" => JoiningGroup::MalayalamRa, + "Malayalam_Ssa" => JoiningGroup::MalayalamSsa, + "Malayalam_Tta" => JoiningGroup::MalayalamTta, + "Manichaean_Aleph" => JoiningGroup::ManichaeanAleph, + "Manichaean_Ayin" => JoiningGroup::ManichaeanAyin, + "Manichaean_Beth" => JoiningGroup::ManichaeanBeth, + "Manichaean_Daleth" => JoiningGroup::ManichaeanDaleth, + "Manichaean_Dhamedh" => JoiningGroup::ManichaeanDhamedh, + "Manichaean_Five" => JoiningGroup::ManichaeanFive, + "Manichaean_Gimel" => JoiningGroup::ManichaeanGimel, + "Manichaean_Heth" => JoiningGroup::ManichaeanHeth, + "Manichaean_Hundred" => JoiningGroup::ManichaeanHundred, + "Manichaean_Kaph" => JoiningGroup::ManichaeanKaph, + "Manichaean_Lamedh" => JoiningGroup::ManichaeanLamedh, + "Manichaean_Mem" => JoiningGroup::ManichaeanMem, + "Manichaean_Nun" => JoiningGroup::ManichaeanNun, + "Manichaean_One" => JoiningGroup::ManichaeanOne, + "Manichaean_Pe" => JoiningGroup::ManichaeanPe, + "Manichaean_Qoph" => JoiningGroup::ManichaeanQoph, + "Manichaean_Resh" => JoiningGroup::ManichaeanResh, + "Manichaean_Sadhe" => JoiningGroup::ManichaeanSadhe, + "Manichaean_Samekh" => JoiningGroup::ManichaeanSamekh, + "Manichaean_Taw" => JoiningGroup::ManichaeanTaw, + "Manichaean_Ten" => JoiningGroup::ManichaeanTen, + "Manichaean_Teth" => JoiningGroup::ManichaeanTeth, + "Manichaean_Thamedh" => JoiningGroup::ManichaeanThamedh, + "Manichaean_Twenty" => JoiningGroup::ManichaeanTwenty, + "Manichaean_Waw" => JoiningGroup::ManichaeanWaw, + "Manichaean_Yodh" => JoiningGroup::ManichaeanYodh, + "Manichaean_Zayin" => JoiningGroup::ManichaeanZayin, + "Meem" => 
JoiningGroup::Meem, + "Mim" => JoiningGroup::Mim, + "No_Joining_Group" => JoiningGroup::NoJoiningGroup, + "Noon" => JoiningGroup::Noon, + "Nun" => JoiningGroup::Nun, + "Nya" => JoiningGroup::Nya, + "Pe" => JoiningGroup::Pe, + "Qaf" => JoiningGroup::Qaf, + "Qaph" => JoiningGroup::Qaph, + "Reh" => JoiningGroup::Reh, + "Reversed_Pe" => JoiningGroup::ReversedPe, + "Rohingya_Yeh" => JoiningGroup::RohingyaYeh, + "Sad" => JoiningGroup::Sad, + "Sadhe" => JoiningGroup::Sadhe, + "Seen" => JoiningGroup::Seen, + "Semkath" => JoiningGroup::Semkath, + "Shin" => JoiningGroup::Shin, + "Straight_Waw" => JoiningGroup::StraightWaw, + "Swash_Kaf" => JoiningGroup::SwashKaf, + "Syriac_Waw" => JoiningGroup::SyriacWaw, + "Tah" => JoiningGroup::Tah, + "Taw" => JoiningGroup::Taw, + "Teh_Marbuta" => JoiningGroup::TehMarbuta, + "Teh_Marbuta_Goal" => JoiningGroup::TehMarbutaGoal, + "Teth" => JoiningGroup::Teth, + "Waw" => JoiningGroup::Waw, + "Yeh" => JoiningGroup::Yeh, + "Yeh_Barree" => JoiningGroup::YehBarree, + "Yeh_With_Tail" => JoiningGroup::YehWithTail, + "Yudh" => JoiningGroup::Yudh, + "Yudh_He" => JoiningGroup::YudhHe, + "Zain" => JoiningGroup::Zain, + "Zhain" => JoiningGroup::Zhain, + _ => return None, + }) +} + +fn get_joining_type_enum(name: &str) -> Option { + Some(match name { + "C" => JoiningType::JoinCausing, + "D" => JoiningType::DualJoining, + "L" => JoiningType::LeftJoining, + "R" => JoiningType::RightJoining, + "T" => JoiningType::Transparent, + "U" => JoiningType::NonJoining, + _ => return None, + }) +} + +fn get_line_break_enum(name: &str) -> Option { + Some(match name { + "AI" => LineBreak::Ambiguous, + "AL" => LineBreak::Alphabetic, + "B2" => LineBreak::BreakBoth, + "BA" => LineBreak::BreakAfter, + "BB" => LineBreak::BreakBefore, + "BK" => LineBreak::MandatoryBreak, + "CB" => LineBreak::ContingentBreak, + "CJ" => LineBreak::ConditionalJapaneseStarter, + "CL" => LineBreak::ClosePunctuation, + "CM" => LineBreak::CombiningMark, + "CP" => LineBreak::CloseParenthesis, + "CR" 
=> LineBreak::CarriageReturn, + "EB" => LineBreak::EBase, + "EM" => LineBreak::EModifier, + "EX" => LineBreak::Exclamation, + "GL" => LineBreak::Glue, + "H2" => LineBreak::H2, + "H3" => LineBreak::H3, + "HL" => LineBreak::HebrewLetter, + "HY" => LineBreak::Hyphen, + "ID" => LineBreak::Ideographic, + "IN" => LineBreak::Inseperable, + "IS" => LineBreak::InfixNumeric, + "JL" => LineBreak::JL, + "JT" => LineBreak::JT, + "JV" => LineBreak::JV, + "LF" => LineBreak::LineFeed, + "NL" => LineBreak::NextLine, + "NS" => LineBreak::Nonstarter, + "NU" => LineBreak::Numeric, + "OP" => LineBreak::OpenPunctuation, + "PO" => LineBreak::PostfixNumeric, + "PR" => LineBreak::PrefixNumeric, + "QU" => LineBreak::Quotation, + "RI" => LineBreak::RegionalIndicator, + "SA" => LineBreak::ComplexContext, + "SG" => LineBreak::Surrogate, + "SP" => LineBreak::Space, + "SY" => LineBreak::BreakSymbols, + "WJ" => LineBreak::WordJoiner, + "XX" => LineBreak::Unknown, + "ZW" => LineBreak::ZWSpace, + "ZWJ" => LineBreak::ZWJ, + _ => return None, + }) +} + +fn get_lead_canonical_combining_class_enum(name: &str) -> Option { + Some(match name { + "0" => LeadCanonicalCombiningClass::NotReordered, + "1" => LeadCanonicalCombiningClass::Overlay, + "10" => LeadCanonicalCombiningClass::CCC10, + "103" => LeadCanonicalCombiningClass::CCC103, + "107" => LeadCanonicalCombiningClass::CCC107, + "11" => LeadCanonicalCombiningClass::CCC11, + "118" => LeadCanonicalCombiningClass::CCC118, + "12" => LeadCanonicalCombiningClass::CCC12, + "122" => LeadCanonicalCombiningClass::CCC122, + "129" => LeadCanonicalCombiningClass::CCC129, + "13" => LeadCanonicalCombiningClass::CCC13, + "130" => LeadCanonicalCombiningClass::CCC130, + "132" => LeadCanonicalCombiningClass::CCC132, + "133" => LeadCanonicalCombiningClass::CCC133, + "14" => LeadCanonicalCombiningClass::CCC14, + "15" => LeadCanonicalCombiningClass::CCC15, + "16" => LeadCanonicalCombiningClass::CCC16, + "17" => LeadCanonicalCombiningClass::CCC17, + "18" => 
LeadCanonicalCombiningClass::CCC18, + "19" => LeadCanonicalCombiningClass::CCC19, + "20" => LeadCanonicalCombiningClass::CCC20, + "200" => LeadCanonicalCombiningClass::AttachedBelowLeft, + "202" => LeadCanonicalCombiningClass::AttachedBelow, + "21" => LeadCanonicalCombiningClass::CCC21, + "214" => LeadCanonicalCombiningClass::AttachedAbove, + "216" => LeadCanonicalCombiningClass::AttachedAboveRight, + "218" => LeadCanonicalCombiningClass::BelowLeft, + "22" => LeadCanonicalCombiningClass::CCC22, + "220" => LeadCanonicalCombiningClass::Below, + "222" => LeadCanonicalCombiningClass::BelowRight, + "224" => LeadCanonicalCombiningClass::Left, + "226" => LeadCanonicalCombiningClass::Right, + "228" => LeadCanonicalCombiningClass::AboveLeft, + "23" => LeadCanonicalCombiningClass::CCC23, + "230" => LeadCanonicalCombiningClass::Above, + "232" => LeadCanonicalCombiningClass::AboveRight, + "233" => LeadCanonicalCombiningClass::DoubleBelow, + "234" => LeadCanonicalCombiningClass::DoubleAbove, + "24" => LeadCanonicalCombiningClass::CCC24, + "240" => LeadCanonicalCombiningClass::IotaSubscript, + "25" => LeadCanonicalCombiningClass::CCC25, + "26" => LeadCanonicalCombiningClass::CCC26, + "27" => LeadCanonicalCombiningClass::CCC27, + "28" => LeadCanonicalCombiningClass::CCC28, + "29" => LeadCanonicalCombiningClass::CCC29, + "30" => LeadCanonicalCombiningClass::CCC30, + "31" => LeadCanonicalCombiningClass::CCC31, + "32" => LeadCanonicalCombiningClass::CCC32, + "33" => LeadCanonicalCombiningClass::CCC33, + "34" => LeadCanonicalCombiningClass::CCC34, + "35" => LeadCanonicalCombiningClass::CCC35, + "36" => LeadCanonicalCombiningClass::CCC36, + "6" => LeadCanonicalCombiningClass::HanReading, + "7" => LeadCanonicalCombiningClass::Nukta, + "8" => LeadCanonicalCombiningClass::KanaVoicing, + "84" => LeadCanonicalCombiningClass::CCC84, + "9" => LeadCanonicalCombiningClass::Virama, + "91" => LeadCanonicalCombiningClass::CCC91, + _ => return None, + }) +} + +fn get_nfc_quick_check_enum(name: 
&str) -> Option { + Some(match name { + "M" => NFCQuickCheck::Maybe, + "N" => NFCQuickCheck::No, + "Y" => NFCQuickCheck::Yes, + _ => return None, + }) +} + +fn get_nfd_quick_check_enum(name: &str) -> Option { + Some(match name { + "N" => NFDQuickCheck::No, + "Y" => NFDQuickCheck::Yes, + _ => return None, + }) +} + +fn get_nfkc_quick_check_enum(name: &str) -> Option { + Some(match name { + "M" => NFKCQuickCheck::Maybe, + "N" => NFKCQuickCheck::No, + "Y" => NFKCQuickCheck::Yes, + _ => return None, + }) +} + +fn get_nfkd_quick_check_enum(name: &str) -> Option { + Some(match name { + "N" => NFKDQuickCheck::No, + "Y" => NFKDQuickCheck::Yes, + _ => return None, + }) +} + +fn get_numeric_type_enum(name: &str) -> Option { + Some(match name { + "De" => NumericType::Decimal, + "Di" => NumericType::Digit, + "None" => NumericType::None, + "Nu" => NumericType::Numeric, + _ => return None, + }) +} + +fn get_sentence_break_enum(name: &str) -> Option { + Some(match name { + "AT" => SentenceBreak::ATerm, + "CL" => SentenceBreak::Close, + "CR" => SentenceBreak::CR, + "EX" => SentenceBreak::Extend, + "FO" => SentenceBreak::Format, + "LE" => SentenceBreak::OLetter, + "LF" => SentenceBreak::LF, + "LO" => SentenceBreak::Lower, + "NU" => SentenceBreak::Numeric, + "SC" => SentenceBreak::SContinue, + "SE" => SentenceBreak::Sep, + "SP" => SentenceBreak::Sp, + "ST" => SentenceBreak::STerm, + "UP" => SentenceBreak::Upper, + "XX" => SentenceBreak::Other, + _ => return None, + }) +} + +fn get_trail_canonical_combining_class_enum(name: &str) -> Option { + Some(match name { + "0" => TrailCanonicalCombiningClass::NotReordered, + "1" => TrailCanonicalCombiningClass::Overlay, + "10" => TrailCanonicalCombiningClass::CCC10, + "103" => TrailCanonicalCombiningClass::CCC103, + "107" => TrailCanonicalCombiningClass::CCC107, + "11" => TrailCanonicalCombiningClass::CCC11, + "118" => TrailCanonicalCombiningClass::CCC118, + "12" => TrailCanonicalCombiningClass::CCC12, + "122" => 
TrailCanonicalCombiningClass::CCC122, + "129" => TrailCanonicalCombiningClass::CCC129, + "13" => TrailCanonicalCombiningClass::CCC13, + "130" => TrailCanonicalCombiningClass::CCC130, + "132" => TrailCanonicalCombiningClass::CCC132, + "133" => TrailCanonicalCombiningClass::CCC133, + "14" => TrailCanonicalCombiningClass::CCC14, + "15" => TrailCanonicalCombiningClass::CCC15, + "16" => TrailCanonicalCombiningClass::CCC16, + "17" => TrailCanonicalCombiningClass::CCC17, + "18" => TrailCanonicalCombiningClass::CCC18, + "19" => TrailCanonicalCombiningClass::CCC19, + "20" => TrailCanonicalCombiningClass::CCC20, + "200" => TrailCanonicalCombiningClass::AttachedBelowLeft, + "202" => TrailCanonicalCombiningClass::AttachedBelow, + "21" => TrailCanonicalCombiningClass::CCC21, + "214" => TrailCanonicalCombiningClass::AttachedAbove, + "216" => TrailCanonicalCombiningClass::AttachedAboveRight, + "218" => TrailCanonicalCombiningClass::BelowLeft, + "22" => TrailCanonicalCombiningClass::CCC22, + "220" => TrailCanonicalCombiningClass::Below, + "222" => TrailCanonicalCombiningClass::BelowRight, + "224" => TrailCanonicalCombiningClass::Left, + "226" => TrailCanonicalCombiningClass::Right, + "228" => TrailCanonicalCombiningClass::AboveLeft, + "23" => TrailCanonicalCombiningClass::CCC23, + "230" => TrailCanonicalCombiningClass::Above, + "232" => TrailCanonicalCombiningClass::AboveRight, + "233" => TrailCanonicalCombiningClass::DoubleBelow, + "234" => TrailCanonicalCombiningClass::DoubleAbove, + "24" => TrailCanonicalCombiningClass::CCC24, + "240" => TrailCanonicalCombiningClass::IotaSubscript, + "25" => TrailCanonicalCombiningClass::CCC25, + "26" => TrailCanonicalCombiningClass::CCC26, + "27" => TrailCanonicalCombiningClass::CCC27, + "28" => TrailCanonicalCombiningClass::CCC28, + "29" => TrailCanonicalCombiningClass::CCC29, + "30" => TrailCanonicalCombiningClass::CCC30, + "31" => TrailCanonicalCombiningClass::CCC31, + "32" => TrailCanonicalCombiningClass::CCC32, + "33" => 
TrailCanonicalCombiningClass::CCC33, + "34" => TrailCanonicalCombiningClass::CCC34, + "35" => TrailCanonicalCombiningClass::CCC35, + "36" => TrailCanonicalCombiningClass::CCC36, + "6" => TrailCanonicalCombiningClass::HanReading, + "7" => TrailCanonicalCombiningClass::Nukta, + "8" => TrailCanonicalCombiningClass::KanaVoicing, + "84" => TrailCanonicalCombiningClass::CCC84, + "9" => TrailCanonicalCombiningClass::Virama, + "91" => TrailCanonicalCombiningClass::CCC91, + _ => return None, + }) +} + +fn get_vertical_orientation_enum(name: &str) -> Option { + Some(match name { + "R" => VerticalOrientation::Rotated, + "Tr" => VerticalOrientation::TransformedRotated, + "Tu" => VerticalOrientation::TransformedUpright, + "U" => VerticalOrientation::Upright, + _ => return None, + }) +} + +fn get_word_break_enum(name: &str) -> Option { + Some(match name { + "CR" => WordBreak::CR, + "DQ" => WordBreak::DoubleQuote, + "EB" => WordBreak::EBase, + "EBG" => WordBreak::EBaseGAZ, + "EM" => WordBreak::EModifier, + "EX" => WordBreak::ExtendNumLet, + "Extend" => WordBreak::Extend, + "FO" => WordBreak::Format, + "GAZ" => WordBreak::GlueAfterZwj, + "HL" => WordBreak::HebrewLetter, + "KA" => WordBreak::Katakana, + "LE" => WordBreak::ALetter, + "LF" => WordBreak::LF, + "MB" => WordBreak::MidNumLet, + "ML" => WordBreak::MidLetter, + "MN" => WordBreak::MidNum, + "NL" => WordBreak::Newline, + "NU" => WordBreak::Numeric, + "RI" => WordBreak::RegionalIndicator, + "SQ" => WordBreak::SingleQuote, + "WSegSpace" => WordBreak::WSegSpace, + "XX" => WordBreak::Other, + "ZWJ" => WordBreak::ZWJ, + _ => return None, + }) +} + +// +// Helper fn to help generate identifier for the prop_name=prop_val `UnicodeProperty` +// + +fn get_prop_name_val_as_i32(prop_name: &str, prop_val: &str) -> Option<(i32, i32)> { + let name_enum_opt = get_enum_property_enum(prop_name); + let val_enum_i32_opt = match name_enum_opt { + Some(EnumeratedProperty::BidiClass) => get_bidi_class_enum(prop_val).map(|x| x as i32), + 
Some(EnumeratedProperty::BidiPairedBracketType) => { + get_bidi_paired_bracket_type_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::CanonicalCombiningClass) => { + get_canonical_combining_class_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::DecompositionType) => { + get_decomposition_type_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::EastAsianWidth) => { + get_east_asian_width_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::GeneralCategory) => { + get_general_category_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::GraphemeClusterBreak) => { + get_grapheme_cluster_break_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::HangulSyllableType) => { + get_hangul_syllable_type_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::IndicPositionalCategory) => { + get_indic_positional_category_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::IndicSyllabicCategory) => { + get_indic_syllabic_category_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::JoiningGroup) => { + get_joining_group_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::JoiningType) => get_joining_type_enum(prop_val).map(|x| x as i32), + Some(EnumeratedProperty::LineBreak) => get_line_break_enum(prop_val).map(|x| x as i32), + Some(EnumeratedProperty::LeadCanonicalCombiningClass) => { + get_lead_canonical_combining_class_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::NFCQuickCheck) => { + get_nfc_quick_check_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::NFDQuickCheck) => { + get_nfd_quick_check_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::NFKCQuickCheck) => { + get_nfkc_quick_check_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::NFKDQuickCheck) => { + get_nfkd_quick_check_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::NumericType) => get_numeric_type_enum(prop_val).map(|x| x as 
i32), + Some(EnumeratedProperty::SentenceBreak) => { + get_sentence_break_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::TrailCanonicalCombiningClass) => { + get_trail_canonical_combining_class_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::VerticalOrientation) => { + get_vertical_orientation_enum(prop_val).map(|x| x as i32) + } + Some(EnumeratedProperty::WordBreak) => get_word_break_enum(prop_val).map(|x| x as i32), + _ => None, + }; + let name_enum_i32_opt = name_enum_opt.map(|x| x as i32); + match (name_enum_i32_opt, val_enum_i32_opt) { + (Some(name_i32), Some(val_i32)) => Some((name_i32, val_i32)), + _ => None, + } +} + +pub fn get_prop_name_identifier(prop_name: &str, prop_val: &str) -> Option { + let name_val_i32_opt = get_prop_name_val_as_i32(prop_name, prop_val); + let name_val_string_opt = match name_val_i32_opt { + Some((name_i32, val_i32)) => Some(format!("{}={}", name_i32, val_i32)), + _ => None, + }; + match name_val_string_opt { + Some(id_str) => TinyStr16::from_str(&id_str).ok(), + _ => None, + } +} + +#[cfg(test)] +mod enum_tests { + use super::*; + + #[test] + fn prop_name_str_to_enum_fn_test() { + assert_eq!(get_line_break_enum("LF"), Some(LineBreak::LineFeed)); + assert_eq!(get_line_break_enum("cheezburger"), None); + } + + #[test] + fn prop_value_str_to_enum_fn_test() { + assert_eq!( + get_canonical_combining_class_enum("21"), + Some(CanonicalCombiningClass::CCC21) + ); + assert_eq!(get_canonical_combining_class_enum("cheezburger"), None); + } + + #[test] + fn get_prop_name_val_as_i32_test() { + let act_prop_i32_tuple_opt_1 = get_prop_name_val_as_i32("lb", "LF"); + let exp_prop_i32_tuple_opt_1 = Some(( + EnumeratedProperty::LineBreak as i32, + LineBreak::LineFeed as i32, + )); + assert_eq!(act_prop_i32_tuple_opt_1, exp_prop_i32_tuple_opt_1); + + assert_eq!(get_prop_name_val_as_i32("lb", "cheezburger"), None); + assert_eq!(get_prop_name_val_as_i32("cheezburger", "LF"), None); + 
assert_eq!(get_prop_name_val_as_i32("cheez", "cheez"), None); + } + + #[test] + fn get_prop_name_identifier_test() { + assert_eq!( + get_prop_name_identifier("lb", "LF"), + TinyStr16::from_str("12=26").ok() + ); + assert_eq!( + get_prop_name_identifier("ccc", "230"), + TinyStr16::from_str("2=230").ok() + ); + } +} diff --git a/components/provider_ppucd/src/lib.rs b/components/provider_ppucd/src/lib.rs index d267ba2b90b..164e1faeb4a 100644 --- a/components/provider_ppucd/src/lib.rs +++ b/components/provider_ppucd/src/lib.rs @@ -2,6 +2,7 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). +pub mod enum_prop_mapping; mod error; pub mod parse_ppucd; pub mod support; diff --git a/components/provider_ppucd/src/parse_ppucd.rs b/components/provider_ppucd/src/parse_ppucd.rs index 1914ea5ec5b..8db8d19da45 100644 --- a/components/provider_ppucd/src/parse_ppucd.rs +++ b/components/provider_ppucd/src/parse_ppucd.rs @@ -2,13 +2,20 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). +use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::iter::Iterator; use std::u32; +use crate::enum_prop_mapping::get_prop_name_identifier; use crate::support::UnicodeProperties; use icu_uniset::provider::UnicodeProperty; use icu_uniset::{UnicodeSet, UnicodeSetBuilder}; +use tinystr::TinyStr16; + +// +// Provider-related structs and impl functions +// fn split_line(line: &str) -> Vec<&str> { line.split(';').collect::>() @@ -35,6 +42,9 @@ fn get_data_line_prop_vals<'s>(data_line_parts: &[&'s str]) -> HashMap<&'s str, } else { // For properties that don't take values, let their value in the map be the prop name itself // This applies to binary properties. 
/// Returns `true` when `line` is an enumerated property value definition,
/// i.e. when it starts with the `value;` record tag.
fn is_enum_val_line(line: &str) -> bool {
    line.starts_with("value;")
}

/// For a property definition or enumerated property value line, update the aliases map.
///
/// The first two `;`-separated fields (for a property line: the `property`
/// tag and the property type) are skipped. The first remaining field becomes
/// the canonical name, which is the key inserted into `prop_aliases`; the
/// value is the set of all remaining names, canonical name included.
///
/// A line that carries no names after the skipped fields is ignored instead
/// of panicking.
fn update_aliases<'s>(prop_aliases: &mut HashMap<&'s str, HashSet<&'s str>>, line: &'s str) {
    let mut names = line.split(';').skip(2).peekable();

    // Properties defined in UTS 18 but not in UCD may have an empty line part in PPUCD
    // because of the non-existence in UCD. See Compatibility Properties in UTS 18 Annex C
    // that provide backwards compatibility for POSIX-style properties:
    // https://unicode.org/reports/tr18/#Compatibility_Properties
    if names.peek().map_or(false, |name| name.is_empty()) {
        names.next();
    }

    // `peek` leaves the canonical name in the iterator so that it is also
    // collected into the set of all names below.
    let canonical_name = match names.peek() {
        Some(name) => *name,
        None => return,
    };
    prop_aliases.insert(canonical_name, names.collect());
}

/// Parse property definition line for binary and enumerated properties.
///
/// Lines whose property type is `Binary` update `binary_prop_aliases`, lines
/// whose type is `Enumerated` update `enum_prop_aliases`, and every other
/// property type is ignored.
///
/// # Panics
///
/// Panics if the first field of `line` is not `property`.
fn update_property_aliases<'s>(
    binary_prop_aliases: &mut HashMap<&'s str, HashSet<&'s str>>,
    enum_prop_aliases: &mut HashMap<&'s str, HashSet<&'s str>>,
    line: &'s str,
) {
    let mut fields = line.split(';');
    assert_eq!(Some("property"), fields.next());
    match fields.next() {
        Some("Binary") => update_aliases(binary_prop_aliases, line),
        Some("Enumerated") => update_aliases(enum_prop_aliases, line),
        _ => {}
    }
}
/// Parse enum property value definition line.
///
/// The line is expected to look like `value;<property>;<value>;<alias>;...`.
/// The value name and every alias are added to the alias set stored under
/// `enum_val_aliases[<property>][<value>]`; sets from earlier lines for the
/// same property and value are extended rather than replaced.
///
/// A line that lacks the property or value field is ignored instead of
/// panicking.
///
/// # Panics
///
/// Panics if the first field of `line` is not `value`.
fn update_enum_val_aliases<'s>(
    enum_val_aliases: &mut HashMap<&'s str, HashMap<&'s str, HashSet<&'s str>>>,
    line: &'s str,
) {
    let mut fields = line.split(';');
    assert_eq!(Some("value"), fields.next());
    let (enum_prop_name, enum_prop_val) = match (fields.next(), fields.next()) {
        (Some(prop_name), Some(prop_val)) => (prop_name, prop_val),
        _ => return,
    };

    let val_names = enum_val_aliases
        .entry(enum_prop_name)
        .or_default()
        .entry(enum_prop_val)
        .or_default();
    val_names.insert(enum_prop_val);
    // What remains of the fields are all of the remaining aliases for this
    // enumerated property's value.
    val_names.extend(fields);
}

/// Mutate the map so any binary exclusion values
/// (in other words, values for binary properties that are prefixed with a
/// minus sign, as described in the PPUCD documentation, ex: "-Gr_Base")
/// are not included and any existing include values for the binary property
/// (ex: "Gr_Base") are accordingly also removed from the map.
fn apply_exclude_vals_for_binary_props<'s>(prop_vals: &mut HashMap<&'s str, &'s str>) {
    // Collect the excluded names first: the map cannot be mutated while its
    // keys are being iterated.
    let excluded: Vec<&'s str> = prop_vals
        .keys()
        .copied()
        .filter_map(|prop_name| prop_name.strip_prefix('-'))
        .collect();

    // If we see "-Gr_Base", then remove both "Gr_Base" and "-Gr_Base".
    prop_vals.retain(|prop_name, _| !prop_name.starts_with('-'));
    for prop_name in excluded {
        prop_vals.remove(prop_name);
    }
}
+fn get_enum_prop_unisets<'s>( + enum_prop_aliases: &HashMap<&'s str, HashSet<&'s str>>, + enum_val_aliases: &HashMap<&'s str, HashMap<&'s str, HashSet<&'s str>>>, + code_points: &HashMap>, +) -> HashMap, UnicodeSet> { + let mut m: HashMap<&str, HashMap<&str, UnicodeSetBuilder>> = HashMap::new(); + + let enum_val_mappings: HashMap<&str, HashMap<&str, &str>> = + get_enum_val_canonical_mapping(enum_val_aliases); + + for (canonical_prop_name, all_prop_name_aliases) in enum_prop_aliases { + for (code_point, code_point_prop_key_vals) in code_points { + let code_point_prop_names: HashSet<&str> = + code_point_prop_key_vals.keys().copied().collect(); + if !all_prop_name_aliases.is_disjoint(&code_point_prop_names) { + for prop_name in all_prop_name_aliases.intersection(&code_point_prop_names) { + let val_name: &str = code_point_prop_key_vals.get(prop_name).unwrap(); + let canonicalized_val_name: &str = enum_val_mappings + .get(canonical_prop_name) + .unwrap() + .get(val_name) + .unwrap(); + + if !m.contains_key(canonical_prop_name) { + m.insert(canonical_prop_name, HashMap::new()); + } + + if !m + .get(canonical_prop_name) + .unwrap() + .contains_key(canonicalized_val_name) + { + let result_prop_val_builder_map: &mut HashMap<&str, UnicodeSetBuilder> = + m.get_mut(canonical_prop_name).unwrap(); + result_prop_val_builder_map + .insert(canonicalized_val_name, UnicodeSetBuilder::new()); + } + + let enum_val_uniset_builder: &mut UnicodeSetBuilder = m + .get_mut(canonical_prop_name) + .unwrap() + .get_mut(canonicalized_val_name) + .unwrap(); + enum_val_uniset_builder.add_char(std::char::from_u32(*code_point).unwrap()); + } + } + } + } + + let mut result: HashMap, UnicodeSet> = HashMap::new(); + + // Insert UnicodeSets into `result`, with a key like `"5=10"` that + // is the integer representation of the Rust enums for the Unicode + // enumerated property name (`gc` or `General_Category`) and the Unicode + // enumerated property value (`Lo` or `Other_letter`). 
+ for (canonical_prop_name, prop_val_builder_map) in m { + for (canonical_val_name, uniset_builder) in prop_val_builder_map { + let enum_val_uniset_name = + get_prop_name_identifier(canonical_prop_name, canonical_val_name); + if let Some(name_str) = enum_val_uniset_name { + let enum_val_uniset_name: Cow<'s, TinyStr16> = Cow::Owned(name_str); + let uniset = uniset_builder.build(); + result.insert(enum_val_uniset_name, uniset); + } + } + } + + result +} + +fn aliases_as_canonical_mappings<'s>( + aliases_map: &HashMap<&'s str, HashSet<&'s str>>, +) -> HashMap<&'s str, &'s str> { + let mut result: HashMap<&str, &str> = HashMap::new(); + for (canonical_name, aliases) in aliases_map { + result.insert(<&str>::clone(canonical_name), <&str>::clone(canonical_name)); + for alias in aliases { + result.insert(<&str>::clone(alias), <&str>::clone(canonical_name)); + } + } + + result +} + +fn get_enum_val_canonical_mapping<'s>( + enum_val_aliases: &HashMap<&'s str, HashMap<&'s str, HashSet<&'s str>>>, +) -> HashMap<&'s str, HashMap<&'s str, &'s str>> { + let mut result: HashMap<&str, HashMap<&str, &str>> = HashMap::new(); + for (enum_prop_canon_name, enum_val_aliases) in enum_val_aliases { + let enum_val_canonical_mapping = aliases_as_canonical_mappings(enum_val_aliases); + result.insert( + <&str>::clone(enum_prop_canon_name), + enum_val_canonical_mapping, + ); + } + + result +} + +/// Parse a whole PPUCD file that was loaded into a string slice and return a /// struct of the binary and enumerated property inversion lists. /// Note: even though `UnicodeProperties` stores a sequential data structure of /// the `UnicodeProperty` struct, there is no inherent ordering of the entries. 
@@ -229,7 +408,11 @@ pub fn parse<'s>(s: &'s str) -> UnicodeProperties<'s> { let parseable_lines = lines.filter(|line| !is_skip_ppucd_line(line)); - let mut prop_aliases: HashMap<&'s str, HashSet<&'s str>> = HashMap::new(); + let mut binary_prop_aliases: HashMap<&'s str, HashSet<&'s str>> = HashMap::new(); + + let mut enum_prop_aliases: HashMap<&'s str, HashSet<&'s str>> = HashMap::new(); + + let mut enum_val_aliases: HashMap<&'s str, HashMap<&'s str, HashSet<&'s str>>> = HashMap::new(); let mut defaults: HashMap<&'s str, &'s str> = HashMap::new(); @@ -241,11 +424,17 @@ pub fn parse<'s>(s: &'s str) -> UnicodeProperties<'s> { // in a PPUCD `cp` line, according to the PPUCD file format spec. let mut code_point_overrides: HashMap> = HashMap::new(); + // current implementation uses this data structure to pull all of the code + // point info into memory let mut code_points: HashMap> = HashMap::new(); + // parse PPUCD to fill out data structures for info of property name aliases + // and overrides at defaults/blocks/cp levels for line in parseable_lines { if is_property_line(&line) { - update_aliases(&mut prop_aliases, &line); + update_property_aliases(&mut binary_prop_aliases, &mut enum_prop_aliases, &line); + } else if is_enum_val_line(&line) { + update_enum_val_aliases(&mut enum_val_aliases, &line); } else if is_defaults_line(&line) { defaults = get_defaults_prop_vals(&line); } else if is_block_line(&line) { @@ -270,13 +459,30 @@ pub fn parse<'s>(s: &'s str) -> UnicodeProperties<'s> { } } + // This vector becomes the return value for the fn. Push each new + // `UnicodeProperty` constructed from each UnicodeSet + name for all of the + // binary properties and enumerated properties parsed from the input. 
+ let mut props: Vec = vec![]; + let binary_prop_unisets: HashMap<&'s str, UnicodeSet> = - get_binary_prop_unisets(&prop_aliases, &code_points); + get_binary_prop_unisets(&binary_prop_aliases, &code_points); - let mut props: Vec = vec![]; + let enum_prop_unisets: HashMap, UnicodeSet> = + get_enum_prop_unisets(&enum_prop_aliases, &enum_val_aliases, &code_points); for (canonical_name, uniset) in binary_prop_unisets { - let ppucd_prop: UnicodeProperty = UnicodeProperty::from_uniset(&uniset, canonical_name); + let ppucd_prop: UnicodeProperty = + UnicodeProperty::from_uniset(&uniset, Cow::Borrowed(canonical_name)); + props.push(ppucd_prop); + } + + for (key_val_tuple_name, uniset) in enum_prop_unisets { + let key_val_tuple_name_str: Cow<'s, str> = match key_val_tuple_name { + Cow::Borrowed(tiny_str) => Cow::Borrowed(tiny_str.as_str()), + Cow::Owned(tiny_str) => Cow::Owned(tiny_str.to_string()), + }; + let ppucd_prop: UnicodeProperty = + UnicodeProperty::from_uniset(&uniset, key_val_tuple_name_str); props.push(ppucd_prop); } @@ -289,28 +495,28 @@ mod gen_properties_test { #[test] fn skip_ppucd_line_test() { - assert_eq!(true, is_skip_ppucd_line(&String::from("ucd;13.0.0"))); + assert_eq!(true, is_skip_ppucd_line("ucd;13.0.0")); assert_eq!( false, - is_skip_ppucd_line(&String::from("value;InSC;Gemination_Mark;Gemination_Mark")) + is_skip_ppucd_line("value;InSC;Gemination_Mark;Gemination_Mark") ); } #[test] fn split_line_test() { - let line = String::from("cp;0020;bc=WS;gc=Zs;lb=SP;na=SPACE;Name_Alias=abbreviation=SP;Pat_WS;SB=SP;WB=WSegSpace;WSpace"); + let line = "cp;0020;bc=WS;gc=Zs;lb=SP;na=SPACE;Name_Alias=abbreviation=SP;Pat_WS;SB=SP;WB=WSegSpace;WSpace"; let exp_parts = vec![ - String::from("cp"), - String::from("0020"), - String::from("bc=WS"), - String::from("gc=Zs"), - String::from("lb=SP"), - String::from("na=SPACE"), - String::from("Name_Alias=abbreviation=SP"), - String::from("Pat_WS"), - String::from("SB=SP"), - String::from("WB=WSegSpace"), - 
String::from("WSpace"), + "cp", + "0020", + "bc=WS", + "gc=Zs", + "lb=SP", + "na=SPACE", + "Name_Alias=abbreviation=SP", + "Pat_WS", + "SB=SP", + "WB=WSegSpace", + "WSpace", ]; let line_parts = split_line(&line); assert_eq!(exp_parts, line_parts); @@ -377,10 +583,9 @@ mod gen_properties_test { #[test] fn code_point_overrides_test() { - let defaults_line = String::from("defaults;0000..10FFFF;age=NA;bc=L;blk=NB;bpt=n;cf=;dm=;dt=None;ea=N;FC_NFKC=;gc=Cn;GCB=XX;gcm=Cn;hst=NA;InPC=NA;InSC=Other;jg=No_Joining_Group;jt=U;lb=XX;lc=;NFC_QC=Y;NFD_QC=Y;NFKC_CF=;NFKC_QC=Y;NFKD_QC=Y;nt=None;SB=XX;sc=Zzzz;scf=;scx=