From 6db4862fc54951249c6897c4fde64eb597b17cf7 Mon Sep 17 00:00:00 2001 From: Greg Tatum Date: Thu, 8 Apr 2021 16:55:06 -0500 Subject: [PATCH] Components bag support with only skeleton matching (#587) --- components/datetime/src/error.rs | 3 + components/datetime/src/fields/mod.rs | 15 + components/datetime/src/fields/symbols.rs | 68 +++ components/datetime/src/options/components.rs | 275 +++++++++- .../datetime/src/options/preferences.rs | 10 + components/datetime/src/provider/helpers.rs | 34 +- components/datetime/src/skeleton.rs | 357 ++++++++++++- components/datetime/tests/datetime.rs | 35 +- components/datetime/tests/fixtures/structs.rs | 4 + .../tests/components-combine-date-time.json | 24 + .../tests/components-exact-matches.json | 485 ++++++++++++++++++ .../tests/components-width-differences.json | 17 + .../tests/fixtures/tests/components.json | 23 - 13 files changed, 1305 insertions(+), 45 deletions(-) create mode 100644 components/datetime/tests/fixtures/tests/components-combine-date-time.json create mode 100644 components/datetime/tests/fixtures/tests/components-exact-matches.json create mode 100644 components/datetime/tests/fixtures/tests/components-width-differences.json delete mode 100644 components/datetime/tests/fixtures/tests/components.json diff --git a/components/datetime/src/error.rs b/components/datetime/src/error.rs index e974b4094f2..e8732a6d805 100644 --- a/components/datetime/src/error.rs +++ b/components/datetime/src/error.rs @@ -3,6 +3,7 @@ // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). use crate::pattern; +use crate::skeleton::SkeletonError; use icu_provider::prelude::DataError; /// A list of possible error outcomes for the [`DateTimeFormat`](crate::DateTimeFormat) struct. @@ -18,6 +19,8 @@ pub enum DateTimeFormatError { /// Missing field in date time input /// TODO: How can we return which field was missing? 
MissingInputField, + /// An error from skeleton matching, + Skeleton(SkeletonError), } impl From for DateTimeFormatError { diff --git a/components/datetime/src/fields/mod.rs b/components/datetime/src/fields/mod.rs index fa6c56d208c..5d3f6590076 100644 --- a/components/datetime/src/fields/mod.rs +++ b/components/datetime/src/fields/mod.rs @@ -37,6 +37,21 @@ pub struct Field { pub length: FieldLength, } +impl Field { + pub fn get_length_type(&self) -> TextOrNumeric { + match self.symbol { + FieldSymbol::Year(year) => year.get_length_type(&self.length), + FieldSymbol::Month(month) => month.get_length_type(&self.length), + FieldSymbol::Day(day) => day.get_length_type(&self.length), + FieldSymbol::Weekday(weekday) => weekday.get_length_type(&self.length), + FieldSymbol::DayPeriod(day_period) => day_period.get_length_type(&self.length), + FieldSymbol::Hour(hour) => hour.get_length_type(&self.length), + FieldSymbol::Minute => TextOrNumeric::Numeric, + FieldSymbol::Second(second) => second.get_length_type(&self.length), + } + } +} + impl From<(FieldSymbol, FieldLength)> for Field { fn from(input: (FieldSymbol, FieldLength)) -> Self { Self { diff --git a/components/datetime/src/fields/symbols.rs b/components/datetime/src/fields/symbols.rs index dae218a29c0..93379bee4e9 100644 --- a/components/datetime/src/fields/symbols.rs +++ b/components/datetime/src/fields/symbols.rs @@ -2,6 +2,7 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). +use crate::fields::FieldLength; use std::{cmp::Ordering, convert::TryFrom}; #[derive(Debug, PartialEq)] @@ -28,6 +29,18 @@ pub enum FieldSymbol { Second(Second), } +#[derive(Debug, Eq, PartialEq, Clone, Copy)] +pub enum TextOrNumeric { + Text, + Numeric, +} + +/// FieldSymbols can be either text or numeric. This categorization is important when matching +/// skeletons with a components::Bag. 
+pub trait LengthType { + fn get_length_type(&self, length: &FieldLength) -> TextOrNumeric; +} + impl FieldSymbol { /// Skeletons are a Vec, and represent the Fields that can be used to match to a /// specific pattern. The order of the Vec does not affect the Pattern that is output. @@ -158,6 +171,12 @@ pub enum Year { WeekOf, } +impl LengthType for Year { + fn get_length_type(&self, _length: &FieldLength) -> TextOrNumeric { + TextOrNumeric::Numeric + } +} + impl TryFrom for Year { type Error = SymbolError; fn try_from(b: u8) -> Result { @@ -185,6 +204,19 @@ pub enum Month { StandAlone, } +impl LengthType for Month { + fn get_length_type(&self, length: &FieldLength) -> TextOrNumeric { + match length { + FieldLength::One => TextOrNumeric::Numeric, + FieldLength::TwoDigit => TextOrNumeric::Numeric, + FieldLength::Abbreviated => TextOrNumeric::Text, + FieldLength::Wide => TextOrNumeric::Text, + FieldLength::Narrow => TextOrNumeric::Text, + FieldLength::Six => TextOrNumeric::Text, + } + } +} + impl TryFrom for Month { type Error = SymbolError; fn try_from(b: u8) -> Result { @@ -214,6 +246,12 @@ pub enum Day { ModifiedJulianDay, } +impl LengthType for Day { + fn get_length_type(&self, _length: &FieldLength) -> TextOrNumeric { + TextOrNumeric::Numeric + } +} + impl TryFrom for Day { type Error = SymbolError; fn try_from(b: u8) -> Result { @@ -245,6 +283,12 @@ pub enum Hour { H24, } +impl LengthType for Hour { + fn get_length_type(&self, _length: &FieldLength) -> TextOrNumeric { + TextOrNumeric::Numeric + } +} + impl TryFrom for Hour { type Error = SymbolError; fn try_from(b: u8) -> Result { @@ -275,6 +319,12 @@ pub enum Second { Millisecond, } +impl LengthType for Second { + fn get_length_type(&self, _length: &FieldLength) -> TextOrNumeric { + TextOrNumeric::Numeric + } +} + impl TryFrom for Second { type Error = SymbolError; fn try_from(b: u8) -> Result { @@ -304,6 +354,18 @@ pub enum Weekday { StandAlone, } +impl LengthType for Weekday { + fn get_length_type(&self, 
length: &FieldLength) -> TextOrNumeric { + match self { + Weekday::Format => TextOrNumeric::Text, + Weekday::Local | Weekday::StandAlone => match length { + FieldLength::One | FieldLength::TwoDigit => TextOrNumeric::Text, + _ => TextOrNumeric::Numeric, + }, + } + } +} + impl TryFrom for Weekday { type Error = SymbolError; fn try_from(b: u8) -> Result { @@ -332,6 +394,12 @@ pub enum DayPeriod { NoonMidnight, } +impl LengthType for DayPeriod { + fn get_length_type(&self, _length: &FieldLength) -> TextOrNumeric { + TextOrNumeric::Text + } +} + impl TryFrom for DayPeriod { type Error = SymbolError; fn try_from(b: u8) -> Result { diff --git a/components/datetime/src/options/components.rs b/components/datetime/src/options/components.rs index 2bf881283c8..b95db49f393 100644 --- a/components/datetime/src/options/components.rs +++ b/components/datetime/src/options/components.rs @@ -2,23 +2,41 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -//! Components is a model of encoding information on how to format date and time by specifying a list of components -//! the user wants to be visible in the formatted string and how each field should be displayed. +//! # Implementation status //! -//! This model closely corresponds to `ECMA402` API and allows for high level of customization compared to `Length` model. +//! This is currently only a partial implementation of the UTS 35 skeleton matching algorithm. //! -//! Additionally, the bag contains an optional set of `Preferences` which represent user preferred adjustments -//! that can be applied onto the pattern right before formatting. +//! | Algorithm step | Status | +//! |----------------|--------| +//! | Match skeleton fields according to a ranking | Implemented | +//! | Adjust the matched pattern to have certain widths | Not yet implemented. See [issue #584](https://github.com/unicode-org/icu4x/issues/584) | +//! 
| Match date and times separately, and then combine them | Not yet implemented. See [issue #585](https://github.com/unicode-org/icu4x/issues/585) | +//! | Use appendItems to fill in a pattern with missing fields | Not yet, and may not be fully implemented. See [issue #586](https://github.com/unicode-org/icu4x/issues/586) | //! -//! # Pattern Selection +//! # Description //! -//! It is important to understand that the components bag is a human-friendly way to describe a skeleton, not a pattern. -//! That means that the components and their lengths provided by the user will be matched against available patterns for -//! a given locale and the closest available pattern will be used for formatting. +//! A [`components::Bag`](struct.Bag.html) is a model of encoding information on how to format date +//! and time by specifying a list of components the user wants to be visible in the formatted string +//! and how each field should be displayed. +//! +//! This model closely corresponds to `ECMA402` API and allows for high level of customization +//! compared to `Length` model. +//! +//! Additionally, the bag contains an optional set of `Preferences` which represent user +//! preferred adjustments that can be applied onto the pattern right before formatting. +//! +//! ## Pattern Selection +//! +//! The [`components::Bag`](struct.Bag.html) is a way for the developer to describe which components +//! should be included in a date time, and how they should be displayed. There is not a strict +//! guarantee in how the final date will be displayed to the end user. The user's preferences and +//! locale information can override the developer preferences. +//! +//! The fields in the [`components::Bag`](struct.Bag.html) are matched against available patterns in +//! the `CLDR` locale data. A best fit is found, and presented to the user. This means that in +//! certain situations, and component combinations, fields will not have a match, or the match will +//! 
have a different type of presentation for a given locale. //! -//! That means, that it is possible that if the user asks for a combination of fields or lengths that `CLDR` has no -//! data associated with, the selected pattern may be different than the selection in the `Components` bag. -//! Such scenarios should be rare. //! //! # Examples //! @@ -54,10 +72,13 @@ //! *Note*: The exact result returned from [`DateTimeFormat`](crate::DateTimeFormat) is a subject to change over //! time. Formatted result should be treated as opaque and displayed to the user as-is, //! and it is strongly recommended to never write tests that expect a particular formatted output. +use crate::fields::{self, Field, FieldLength, FieldSymbol}; + use super::preferences; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +/// See the [module-level](./index.html) docs for more information. #[derive(Debug, Clone, PartialEq)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct Bag { @@ -73,10 +94,189 @@ pub struct Bag { pub time_zone_name: Option, - #[cfg_attr(feature = "serde", serde(skip_serializing, skip_deserializing))] pub preferences: Option, } +impl Bag { + /// Converts the components::Bag into a Vec. The fields will be ordered in from most + /// significant field to least significant. This is the order the fields are listed in + /// the UTS 35 table - https://unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table + pub(crate) fn to_vec_fields(&self) -> Vec { + let mut fields = Vec::new(); + if let Some(_era) = self.era { + unimplemented!("FieldSymbol::Era is needed. See issue #486.") + } + + if let Some(year) = self.year { + // Unimplemented year fields: + // Y - Week of Year + // u - Extended year + // U - Cyclic year name + // r - Related Gregorian year + fields.push(Field { + symbol: FieldSymbol::Year(fields::Year::Calendar), + length: match year { + // Calendar year (numeric). 
+ // y 2, 20, 201, 2017, 20173 + // yy 02, 20, 01, 17, 73 + // yyy 002, 020, 201, 2017, 20173 (not implemented) + // yyyy 0002, 0020, 0201, 2017, 20173 (not implemented) + // yyyyy+ ... (not implemented) + Numeric::Numeric => FieldLength::One, + Numeric::TwoDigit => FieldLength::TwoDigit, + }, + }); + } + + // TODO(#501) - Unimplemented quarter fields: + // Q - Quarter number/name + // q - Stand-alone quarter + + if let Some(month) = self.month { + fields.push(Field { + // Always choose Month::Format as Month::StandAlone is not used in skeletons. + symbol: FieldSymbol::Month(fields::Month::Format), + length: match month { + // (intended to be used in conjunction with ‘d’ for day number). + // M 9, 12 Numeric: minimum digits + // MM 09, 12 Numeric: 2 digits, zero pad if needed + // MMM Sep Abbreviated + // MMMM September Wide + // MMMMM S Narrow + Month::Numeric => FieldLength::One, + Month::TwoDigit => FieldLength::TwoDigit, + Month::Long => FieldLength::Wide, + Month::Short => FieldLength::Abbreviated, + Month::Narrow => FieldLength::Narrow, + }, + }); + } + + // TODO(#502) - Unimplemented week fields: + // w - Week of year + // W - Week of month + + if let Some(day) = self.day { + // TODO(#591,#592) Unimplemented day fields: + // D - Day of year + // F - Day of week in month + // g - Modified Julian day. + fields.push(Field { + symbol: FieldSymbol::Day(fields::Day::DayOfMonth), + length: match day { + // Day of month (numeric). + // d 1 Numeric: minimum digits + // dd 01 Numeric: 2 digits, zero pad if needed + Numeric::Numeric => FieldLength::One, + Numeric::TwoDigit => FieldLength::TwoDigit, + }, + }); + } + + if let Some(weekday) = self.weekday { + // TODO(#593) Unimplemented fields + // e - Local day of week. + // c - Stand-alone local day of week. + fields.push(Field { + symbol: FieldSymbol::Weekday(fields::Weekday::Format), + length: match weekday { + // Day of week name, format length. 
+ // + // E..EEE Tue Abbreviated + // EEEE Tuesday Wide + // EEEEE T Narrow + // EEEEEE Tu Short + Text::Long => FieldLength::Wide, + Text::Short => FieldLength::One, + Text::Narrow => FieldLength::Narrow, + }, + }); + } + + // The period fields are not included in skeletons: + // a - AM, PM + // b - am, pm, noon, midnight + // c - flexible day periods + + if let Some(hour) = self.hour { + // fields::Hour::H11 + // fields::Hour::H12 + // fields::Hour::H23 + // fields::Hour::H24 + + // When used in skeleton data or in a skeleton passed in an API for flexible date + // pattern generation, it should match the 12-hour-cycle format preferred by the + // locale (h or K); it should not match a 24-hour-cycle format (H or k). + fields.push(Field { + symbol: FieldSymbol::Hour(match self.preferences { + Some(preferences::Bag { + hour_cycle: Some(hour_cycle), + }) => match hour_cycle { + // k - symbol + preferences::HourCycle::H24 => fields::Hour::H24, + // H - symbol + preferences::HourCycle::H23 => fields::Hour::H23, + // h - symbol + preferences::HourCycle::H12 => fields::Hour::H12, + // K - symbol + preferences::HourCycle::H11 => fields::Hour::H11, + }, + // TODO(#594) - This should default should be the locale default, which is + // region-based (h12 for US, h23 for GB, etc). This is in CLDR, but we need + // to load it as well as think about the best architecture for where that + // data loading code should reside. 
+ _ => fields::Hour::H24, + }), + length: match hour { + // Example for h: (note that this is the same for k, K, and H) + // h 1, 12 Numeric: minimum digits + // hh 01, 12 Numeric: 2 digits, zero pad if needed + Numeric::Numeric => FieldLength::One, + Numeric::TwoDigit => FieldLength::TwoDigit, + }, + }); + } + + if let Some(minute) = self.minute { + // m 8, 59 Numeric: minimum digits + // mm 08, 59 Numeric: 2 digits, zero pad if needed + fields.push(Field { + symbol: FieldSymbol::Minute, + length: match minute { + Numeric::Numeric => FieldLength::One, + Numeric::TwoDigit => FieldLength::TwoDigit, + }, + }); + } + + if let Some(second) = self.second { + // s 8, 12 Numeric: minimum digits + // ss 08, 12 Numeric: 2 digits, zero pad if needed + fields.push(Field { + symbol: FieldSymbol::Second(fields::Second::Second), + length: match second { + Numeric::Numeric => FieldLength::One, + Numeric::TwoDigit => FieldLength::TwoDigit, + }, + }); + // S - Not used in skeletons. + // A - Milliseconds in day. Not used in skeletons. + } + + // TODO(#583) - Implement: + // if self.time_zone_name.is_some() { + // unimplemented!(); + // } + + debug_assert!( + fields.windows(2).all(|f| f[0] < f[1]), + "The fields are sorted and unique." + ); + + fields + } +} + impl Default for Bag { fn default() -> Self { Self { @@ -140,3 +340,52 @@ pub enum TimeZoneName { #[cfg_attr(feature = "serde", serde(rename = "short"))] Short, } + +#[cfg(test)] +mod test { + use super::*; + + // Shorten these for terser tests. 
+ type Symbol = FieldSymbol; + type Length = FieldLength; + + #[test] + fn test_component_bag_to_vec_field() { + let bag = Bag::default(); + assert_eq!( + bag.to_vec_fields(), + vec![ + (Symbol::Year(fields::Year::Calendar), Length::One).into(), + (Symbol::Month(fields::Month::Format), Length::Wide).into(), + (Symbol::Day(fields::Day::DayOfMonth), Length::One).into(), + (Symbol::Hour(fields::Hour::H24), Length::One).into(), + (Symbol::Minute, Length::One).into(), + (Symbol::Second(fields::Second::Second), Length::One).into(), + ] + ); + } + + #[test] + fn test_component_bag_to_vec_field2() { + let bag = Bag { + era: None, + year: Some(Numeric::Numeric), + month: Some(Month::TwoDigit), + day: Some(Numeric::Numeric), + weekday: None, + hour: None, + minute: None, + second: None, + time_zone_name: None, + preferences: None, + }; + assert_eq!( + bag.to_vec_fields(), + vec![ + (Symbol::Year(fields::Year::Calendar), Length::One).into(), + (Symbol::Month(fields::Month::Format), Length::TwoDigit).into(), + (Symbol::Day(fields::Day::DayOfMonth), Length::One).into(), + ] + ); + } +} diff --git a/components/datetime/src/options/preferences.rs b/components/datetime/src/options/preferences.rs index 75f2267e43d..b4f981ab24e 100644 --- a/components/datetime/src/options/preferences.rs +++ b/components/datetime/src/options/preferences.rs @@ -21,6 +21,9 @@ //! ``` use crate::fields; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + /// Bag of preferences stores user preferences which may affect the result of date and time formatting. /// /// # Examples @@ -33,12 +36,15 @@ use crate::fields; /// }; /// ``` #[derive(Debug, Clone, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct Bag { + #[cfg_attr(feature = "serde", serde(rename = "hourCycle"))] pub hour_cycle: Option, } /// User Preference for adjusting how hour component is displayed. 
#[derive(Debug, Clone, Copy, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum HourCycle { /// Hour is formatted to be in range 1-24 /// @@ -50,6 +56,7 @@ pub enum HourCycle { /// "19:00"; /// "23:21"; /// ``` + #[cfg_attr(feature = "serde", serde(rename = "h24"))] H24, /// Hour is formatted to be in range 0-23 /// @@ -61,6 +68,7 @@ pub enum HourCycle { /// "19:00"; /// "23:21"; /// ``` + #[cfg_attr(feature = "serde", serde(rename = "h23"))] H23, /// Hour is formatted to be in range 1-12 /// @@ -72,6 +80,7 @@ pub enum HourCycle { /// "7:00"; /// "11:21"; /// ``` + #[cfg_attr(feature = "serde", serde(rename = "h12"))] H12, /// Hour is formatted to be in range 0-11 /// @@ -83,6 +92,7 @@ pub enum HourCycle { /// "7:00"; /// "11:21"; /// ``` + #[cfg_attr(feature = "serde", serde(rename = "h11"))] H11, } diff --git a/components/datetime/src/provider/helpers.rs b/components/datetime/src/provider/helpers.rs index 73ead0d7839..48bc7695775 100644 --- a/components/datetime/src/provider/helpers.rs +++ b/components/datetime/src/provider/helpers.rs @@ -5,15 +5,20 @@ use crate::date; use crate::error::DateTimeFormatError; use crate::fields; -use crate::options::{length, DateTimeFormatOptions}; +use crate::options::{components, length, DateTimeFormatOptions}; use crate::pattern::Pattern; use crate::provider; +use crate::skeleton; use std::borrow::Cow; type Result = std::result::Result; pub trait DateTimePatterns { fn get_pattern_for_options(&self, options: &DateTimeFormatOptions) -> Result>; + fn get_pattern_for_components_bag( + &self, + components: &components::Bag, + ) -> Result>; fn get_pattern_for_length_bag(&self, length: &length::Bag) -> Result>; fn get_pattern_for_date_length(&self, length: length::Date) -> Result; fn get_pattern_for_time_length(&self, length: length::Time) -> Result; @@ -51,10 +56,35 @@ impl DateTimePatterns for provider::gregory::PatternsV1 { fn get_pattern_for_options(&self, options: &DateTimeFormatOptions) -> Result> 
{ match options { DateTimeFormatOptions::Length(bag) => self.get_pattern_for_length_bag(bag), - DateTimeFormatOptions::Components(_) => unimplemented!(), + DateTimeFormatOptions::Components(bag) => self.get_pattern_for_components_bag(bag), } } + fn get_pattern_for_components_bag( + &self, + components: &components::Bag, + ) -> Result> { + // Not all skeletons are currently supported. + let available_format_patterns = + skeleton::get_available_format_patterns(&self.date_time.skeletons); + let requested_fields = components.to_vec_fields(); + Ok( + match skeleton::get_best_available_format_pattern( + available_format_patterns, + &requested_fields, + ) { + skeleton::BestSkeleton::AllFieldsMatch(available_format_pattern) + | skeleton::BestSkeleton::MissingOrExtraFields(available_format_pattern) => { + // In the short-term future, patterns can be dynamically generated to provide + // a better match than what is literally in the CLDR. For now, just clone the + // pattern. + Some(available_format_pattern.pattern.clone()) + } + skeleton::BestSkeleton::NoMatch => None, + }, + ) + } + fn get_pattern_for_length_bag(&self, length: &length::Bag) -> Result> { match (length.date, length.time) { (None, None) => Ok(None), diff --git a/components/datetime/src/skeleton.rs b/components/datetime/src/skeleton.rs index b5b507fac47..b9b76ba56a0 100644 --- a/components/datetime/src/skeleton.rs +++ b/components/datetime/src/skeleton.rs @@ -7,7 +7,11 @@ use smallvec::SmallVec; use std::{convert::TryFrom, fmt}; -use crate::fields::{self, Field, FieldLength, FieldSymbol}; +use crate::{ + fields::{self, Field, FieldLength, FieldSymbol}, + pattern::Pattern, + provider::gregory::patterns::{PatternV1, SkeletonV1, SkeletonsV1}, +}; #[cfg(feature = "provider_serde")] use serde::{ @@ -188,6 +192,26 @@ impl TryFrom<&str> for Skeleton { } } +/// The `AvailableFormatPattern` represents a specific pattern that is available for a given locale. 
+/// A [`Skeleton`] is used to match against to find the best pattern. +#[derive(Debug, PartialEq, Clone)] +pub struct AvailableFormatPattern<'a> { + /// The skeleton is used to match against. + skeleton: &'a Skeleton, + pub pattern: &'a Pattern, +} + +impl<'a> From<(&'a SkeletonV1, &'a PatternV1)> for AvailableFormatPattern<'a> { + fn from(tuple: (&'a SkeletonV1, &'a PatternV1)) -> Self { + let (skeleton_v1, pattern_v1) = tuple; + + AvailableFormatPattern { + skeleton: &skeleton_v1.0, + pattern: &pattern_v1.0, + } + } +} + #[derive(Debug)] pub enum SkeletonError { FieldLengthTooLong, @@ -260,10 +284,339 @@ impl From for SkeletonError { } } +// The following scalar values are for testing the suitability of a skeleton's field for the +// given input. Per UTS 35, the better the fit of a pattern, the "lower the distance". In this +// implementation each distance type is separated by an order of magnitude. This magnitude needs +// to be at minimum a multiple of the max length of fields. As of CLDR 38 (2021-01), the max length +// of a skeleton in the "availableFormats" contained a total of 4 fields. The scores use a multiple +// of 10, as a number that will contain the range, and be easy to reason with. +// +// The only exception is on the largest magnitude of values (MISSING_OR_SKELETON_EXTRA_SYMBOL). The +// missing or extra count BOTH the requested fields and skeleton fields. This is fine since there +// is no higher magnitude. + +const MAX_SKELETON_FIELDS: u32 = 10; + +// Per the skeleton matching algorithm: +// https://unicode.org/reports/tr35/tr35-dates.html#Matching_Skeletons + +// > 1. "Input skeleton symbols" are replaced with the best match for a given locale. +// > - Hour: j → {H, k, h, K} + {a, b, B} +// > J → {H, k, h, K} +// > C → j + day period + +// The components::Bag does not support step 1 + +// > 2. For fields with symbols representing the same type (year, month, day, etc): +// > A. Most symbols have a small distance from each other. 
+// > - Months: M ≅ L (9 ≅ 9) conjunction, vs stand-alone +// > Week: E ≅ c (Tue ≅ 2) +// > Period: a ≅ b ≅ B (am. ≅ mid. ≅ at night) +// > Hour: H ≅ k ≅ h ≅ K (23, 24, 12, 11) + +// For step 2, the components::Bag will not produce "stand-alone" months, as no skeletons +// contain stand-alone months. + +const NO_DISTANCE: u32 = 0; + +// B. Width differences among fields, other than those marking text vs numeric, are given small +// distance from each other. +// - MMM ≅ MMMM (Sep ≅ September) +// MM ≅ M (09 ≅ 9) +const WIDTH_MISMATCH_DISTANCE: u32 = 1; + +// C. Numeric and text fields are given a larger distance from each other. +// - MMM ≈ MM (Sep ≈ 09) +// MMM +const TEXT_VS_NUMERIC_DISTANCE: u32 = 10; + +// D. Symbols representing substantial differences (week of year vs week of month) are given a +// much larger distance from each other. +// - d ≋ D; (12 ≋ 345) Day of month vs Day of year +const SUBSTANTIAL_DIFFERENCES_DISTANCE: u32 = 100; + +// A skeleton had more symbols than what was requested. +const SKELETON_EXTRA_SYMBOL: u32 = 1000; + +// A requested symbol is missing in the skeleton. Note that this final value can be more than +// MAX_SKELETON_FIELDS, as it's counting the missing requested fields, which can be longer than +// the stored skeletons. There cannot be any cases higher than this one. +const REQUESTED_SYMBOL_MISSING: u32 = 10000; + +/// According to the [UTS 35 skeleton matching algorithm](https://unicode.org/reports/tr35/tr35-dates.html#Matching_Skeletons) +/// there will be a guaranteed match for a skeleton. However, with this initial implementation, +/// there is no attempt to add on missing fields. This enum encodes the variants for the current +/// search for a best skeleton. 
+#[derive(Debug, PartialEq, Clone)] +pub enum BestSkeleton<'a> { + AllFieldsMatch(AvailableFormatPattern<'a>), + MissingOrExtraFields(AvailableFormatPattern<'a>), + NoMatch, +} + +/// A partial implementation of the [UTS 35 skeleton matching algorithm](https://unicode.org/reports/tr35/tr35-dates.html#Matching_Skeletons). +/// +/// The following is implemented: +/// +/// * Compute a distance (lower is a better fit) between each available skeleton and the given fields. +/// * Select the skeleton with the lowest distance. On a tie the first skeleton found is kept. +/// +/// Returns `BestSkeleton::NoMatch` when `available_format_patterns` is empty, or when the closest +/// skeleton is missing every requested field (which includes an empty `fields` slice). +/// +/// The following is not implemented: +/// +/// * 2.6.2.1 Matching Skeletons +/// - TODO(#584) - Modify the resulting pattern to have fields of the same length. For example requesting +/// a skeleton "yMMMMd" can have a best match of ["yMMMd", "d MMM y"]. This pattern should +/// then be modified to use the requested length to produce a pattern "d MMMM y". +/// However, fields should not be changed from numeric to text. +/// * 2.6.2.2 Missing Skeleton Fields +/// - TODO(#585) - The mechanism to combine a date pattern and a time pattern. +/// - TODO(#586) - Using the CLDR appendItems field. Note: There is not agreement yet on how +/// much of this step to implement. See the issue for more information. +pub fn get_best_available_format_pattern<'a>( + available_format_patterns: impl Iterator<Item = AvailableFormatPattern<'a>> + 'a, + fields: &[Field], +) -> BestSkeleton<'a> { + let mut closest_format_pattern = None; + let mut closest_distance: u32 = u32::MAX; + let mut closest_missing_fields = 0; + + for available_format_pattern in available_format_patterns { + let skeleton = &available_format_pattern.skeleton; + debug_assert!( + skeleton.fields_len() <= MAX_SKELETON_FIELDS as usize, + "The distance mechanism assumes skeletons are less than MAX_SKELETON_FIELDS in length." + ); + let mut missing_fields = 0; + let mut distance: u32 = 0; + // The distance should fit into a u32. 
+ + let mut requested_fields = fields.iter().peekable(); + let mut skeleton_fields = skeleton.fields_iter().peekable(); + loop { + let next = (requested_fields.peek(), skeleton_fields.peek()); + + // Try to find matching symbols. + match next { + (Some(requested_field), Some(skeleton_field)) => { + debug_assert!( + // As of the time of this writing, stand-alone months are not in the CLDR + // skeleton data. The components::Bag could produce stand-alone month fields, + // but since the CLDR does not have them, only Month::Format symbols are + // used for matching. + skeleton_field.symbol != FieldSymbol::Month(fields::Month::StandAlone) + ); + + if skeleton_field.symbol > requested_field.symbol { + // Keep searching for a matching skeleton field. + skeleton_fields.next(); + distance += SKELETON_EXTRA_SYMBOL; + continue; + } + + if skeleton_field.symbol < requested_field.symbol { + // The requested field symbol is missing from the skeleton. + distance += REQUESTED_SYMBOL_MISSING; + missing_fields += 1; + requested_fields.next(); + continue; + } + + distance += if requested_field == skeleton_field { + NO_DISTANCE + } else if requested_field.symbol != skeleton_field.symbol { + SUBSTANTIAL_DIFFERENCES_DISTANCE + } else if requested_field.get_length_type() != skeleton_field.get_length_type() + { + TEXT_VS_NUMERIC_DISTANCE + } else { + WIDTH_MISMATCH_DISTANCE + }; + + requested_fields.next(); + skeleton_fields.next(); + } + (None, Some(_)) => { + // The skeleton has additional fields that we are not matching. + distance += SKELETON_EXTRA_SYMBOL; + skeleton_fields.next(); + } + (Some(_), None) => { + // The skeleton is missing requested fields. 
+ distance += REQUESTED_SYMBOL_MISSING; + requested_fields.next(); + missing_fields += 1; + } + (None, None) => { + break; + } + } + } + + if distance < closest_distance { + closest_format_pattern = Some(available_format_pattern); + closest_distance = distance; + closest_missing_fields = missing_fields; + } + } + + // An empty `available_format_patterns` iterator leaves nothing to select. Report that as + // `NoMatch` instead of panicking when no skeletons were supplied. + let closest_format_pattern = match closest_format_pattern { + Some(closest_format_pattern) => closest_format_pattern, + None => return BestSkeleton::NoMatch, + }; + + if closest_missing_fields == fields.len() { + return BestSkeleton::NoMatch; + } + if closest_distance >= SKELETON_EXTRA_SYMBOL { + return BestSkeleton::MissingOrExtraFields(closest_format_pattern); + } + BestSkeleton::AllFieldsMatch(closest_format_pattern) +} + +pub fn get_available_format_patterns<'a>( + skeletons: &'a SkeletonsV1, +) -> impl Iterator<Item = AvailableFormatPattern> + 'a { + skeletons.0.iter().map(AvailableFormatPattern::from) +} + #[cfg(all(test, feature = "provider_serde"))] mod test { use super::*; - use crate::fields::{Day, Field, FieldLength, Month, Weekday}; + + use icu_locid_macros::langid; + use icu_provider::{DataProvider, DataRequest, ResourceOptions, ResourcePath}; + use std::borrow::Cow; + + use crate::{ + fields::{Day, Field, FieldLength, Month, Weekday}, + options::components, + provider::{gregory::DatesV1, key::GREGORY_V1}, + }; + + fn get_data_provider() -> Cow<'static, DatesV1> { + let provider = icu_testdata::get_provider(); + let langid = langid!("en"); + provider + .load_payload(&DataRequest { + resource_path: ResourcePath { + key: GREGORY_V1, + options: ResourceOptions { + variant: None, + langid: Some(langid), + }, + }, + }) + .unwrap() + .payload + .take() + .unwrap() + } + + /// This is an initial smoke test to verify the skeleton machinery is working. 
For more in-depth + /// testing see components/datetime/tests/fixtures/tests/components-*.json + #[test] + fn test_skeleton_matching() { + let components = components::Bag::default(); + let requested_fields = components.to_vec_fields(); + let data_provider = get_data_provider(); + let available_format_patterns = + get_available_format_patterns(&data_provider.patterns.date_time.skeletons); + + match get_best_available_format_pattern(available_format_patterns, &requested_fields) { + BestSkeleton::AllFieldsMatch(available_format_pattern) + | BestSkeleton::MissingOrExtraFields(available_format_pattern) => { + assert_eq!( + available_format_pattern.pattern.to_string(), + String::from("MMM d, y") + ) + } + BestSkeleton::NoMatch => { + panic!("No skeleton was found.") + } + }; + } + + #[test] + fn test_skeleton_matching_missing_fields() { + let components = components::Bag { + era: None, + year: None, + month: Some(components::Month::Numeric), + day: None, + weekday: Some(components::Text::Short), + hour: None, + minute: None, + second: None, + time_zone_name: None, + preferences: None, + }; + let requested_fields = components.to_vec_fields(); + let data_provider = get_data_provider(); + let available_format_patterns = + get_available_format_patterns(&data_provider.patterns.date_time.skeletons); + + match get_best_available_format_pattern(available_format_patterns, &requested_fields) { + BestSkeleton::MissingOrExtraFields(available_format_pattern) => { + assert_eq!( + available_format_pattern.pattern.to_string(), + String::from("L") + ) + } + best => panic!("Unexpected {:?}", best), + }; + } + + #[test] + fn test_skeleton_empty_bag() { + let components = components::Bag { + era: None, + year: None, + month: None, + day: None, + weekday: None, + hour: None, + minute: None, + second: None, + time_zone_name: None, + preferences: None, + }; + let requested_fields = components.to_vec_fields(); + let data_provider = get_data_provider(); + let available_format_patterns = + 
get_available_format_patterns(&data_provider.patterns.date_time.skeletons); + + assert_eq!( + get_best_available_format_pattern(available_format_patterns, &requested_fields), + BestSkeleton::NoMatch, + "No match was found" + ); + } + + /// There are no skeletons that match just the timezone. They all rely on the appendItems + /// data from the CLDR. + #[test] + fn test_skeleton_no_match() { + let components = components::Bag { + era: None, + year: None, + month: None, + day: None, + weekday: None, + hour: None, + minute: None, + second: None, + time_zone_name: Some(components::TimeZoneName::Long), + preferences: None, + }; + let requested_fields = components.to_vec_fields(); + let data_provider = get_data_provider(); + let available_format_patterns = + get_available_format_patterns(&data_provider.patterns.date_time.skeletons); + + assert_eq!( + get_best_available_format_pattern(available_format_patterns, &requested_fields), + BestSkeleton::NoMatch, + "No match was found" + ); + } // These were all of the skeletons from the "available formats" in the CLDR as of 2021-01 // Generated with: diff --git a/components/datetime/tests/datetime.rs b/components/datetime/tests/datetime.rs index 1163cef89cf..20ba597b08e 100644 --- a/components/datetime/tests/datetime.rs +++ b/components/datetime/tests/datetime.rs @@ -32,7 +32,14 @@ fn test_fixture(fixture_name: &str) { let value: MockDateTime = fx.input.value.parse().unwrap(); let result = dtf.format_to_string(&value); - assert_eq!(result, fx.output.value); + match fx.description { + Some(description) => assert_eq!( + result, fx.output.value, + "expected {:?} to equal {:?} – {}", + result, fx.output.value, description + ), + None => assert_eq!(result, fx.output.value), + } let mut s = String::new(); dtf.format_to_write(&mut s, &value).unwrap(); @@ -108,13 +115,31 @@ fn test_dayperiod_patterns() { #[test] fn test_length_fixtures() { + // components/datetime/tests/fixtures/tests/lengths.json test_fixture("lengths"); } -// Expected 
panic: 'not implemented', components/datetime/src/provider.rs:49:53 -// https://github.com/unicode-org/icu4x/issues/272 +/// Tests components::Bag configurations that have exact matches to CLDR skeletons. +#[test] +fn test_components_exact_matches() { + // components/datetime/tests/fixtures/tests/components-exact-matches.json + test_fixture("components-exact-matches"); +} + +/// Tests that components::Bags can adjust for width differences in the final pattern. +/// TODO(584) - This is unimplemented and will panic. +#[test] +#[should_panic] +fn test_components_width_differences() { + // components/datetime/tests/fixtures/tests/components-width-differences.json + test_fixture("components-width-differences"); +} + +/// Tests that components::Bags can combine a date skeleton, and a time skeleton. +/// TODO(585) - This is unimplemented and will panic. #[test] #[should_panic] -fn test_components_fixtures() { - test_fixture("components"); +fn test_components_combine_date_time() { + // components/datetime/tests/fixtures/tests/components-combine-date-time.json + test_fixture("components-combine-date-time"); } diff --git a/components/datetime/tests/fixtures/structs.rs b/components/datetime/tests/fixtures/structs.rs index 87556e49ce6..8b6d472017a 100644 --- a/components/datetime/tests/fixtures/structs.rs +++ b/components/datetime/tests/fixtures/structs.rs @@ -4,6 +4,9 @@ #![cfg(feature = "serde")] +//! This file contains the serde representations of the JSON files located in +//! 
components/datetime/tests/fixtures/tests + use icu_datetime::options::{components, length}; use serde::{Deserialize, Serialize}; @@ -12,6 +15,7 @@ pub struct Fixture(pub Vec<Test>); #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Test { + pub description: Option<String>, pub input: TestInput, pub output: TestOutput, } diff --git a/components/datetime/tests/fixtures/tests/components-combine-date-time.json b/components/datetime/tests/fixtures/tests/components-combine-date-time.json new file mode 100644 index 00000000000..7bf687174ea --- /dev/null +++ b/components/datetime/tests/fixtures/tests/components-combine-date-time.json @@ -0,0 +1,24 @@ +[ + { + "description": "Combine date and time: yMMMd + Ehm", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "year": "numeric", + "month": "short", + "day": "numeric", + + "weekday": "short", + "hour": "numeric", + "minute": "numeric", + "preferences": { "hourCycle": "h12" } + } + } + }, + "output": { + "value": "Tue, Jan 7, 2020, 8:25 AM" + } + } +] diff --git a/components/datetime/tests/fixtures/tests/components-exact-matches.json b/components/datetime/tests/fixtures/tests/components-exact-matches.json new file mode 100644 index 00000000000..9ed2d6229fe --- /dev/null +++ b/components/datetime/tests/fixtures/tests/components-exact-matches.json @@ -0,0 +1,485 @@ +[ + { + "description": "Exact match for: y => y", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": 
{ + "components": { + "year": "numeric", + "month": "numeric", + "day": "numeric" + } + } + }, + "output": { + "value": "1/7/2020" + } + }, + { + "description": "Exact match for: yMdE => E, M/d/y", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "year": "numeric", + "month": "numeric", + "day": "numeric", + "weekday": "short" + } + } + }, + "output": { + "value": "Tue, 1/7/2020" + } + }, + { + "description": "Exact match for: yMMM => MMM y", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "year": "numeric", + "month": "short" + } + } + }, + "output": { + "value": "Jan 2020" + } + }, + { + "description": "Exact match for: yMMMd => MMM d, y", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "year": "numeric", + "month": "short", + "day": "numeric" + } + } + }, + "output": { + "value": "Jan 7, 2020" + } + }, + { + "description": "Exact match for: yMMMdE => E, MMM d, y", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "year": "numeric", + "month": "short", + "day": "numeric", + "weekday": "short" + } + } + }, + "output": { + "value": "Tue, Jan 7, 2020" + } + }, + { + "description": "Exact match for: yMMMM => MMMM y", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "year": "numeric", + "month": "long" + } + } + }, + "output": { + "value": "January 2020" + } + }, + { + "description": "Exact match for: M => M", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "month": "short" + } + } + }, + "output": { + "value": "Jan" + } + }, + { + "description": "Exact match for: Md => M/d", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "month": "numeric", + "day": "numeric" + } + } + }, + "output": { 
+ "value": "1/7" + } + }, + { + "description": "Exact match for: MdE => E, M/d", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "month": "numeric", + "day": "numeric", + "weekday": "short" + } + } + }, + "output": { + "value": "Tue, 1/7" + } + }, + { + "description": "Exact match for: MMM => LLL", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "month": "short" + } + } + }, + "output": { + "value": "Jan" + } + }, + { + "description": "Exact match for: MMMd => MMM d", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "month": "short", + "day": "numeric" + } + } + }, + "output": { + "value": "Jan 7" + } + }, + { + "description": "Exact match for: MMMdE => E, MMM d", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "month": "short", + "day": "numeric", + "weekday": "short" + } + } + }, + "output": { + "value": "Tue, Jan 7" + } + }, + { + "description": "Exact match for: MMMMd => MMMM d", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "month": "long", + "day": "numeric" + } + } + }, + "output": { + "value": "January 7" + } + }, + { + "description": "Exact match for: d => d", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "day": "numeric" + } + } + }, + "output": { + "value": "7" + } + }, + { + "description": "Exact match for: dE => d E", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "day": "numeric", + "weekday": "short" + } + } + }, + "output": { + "value": "7 Tue" + } + }, + { + "description": "Exact match for: E => ccc", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "weekday": "short" + } + } + }, + "output": { + "value": 
"Tue" + } + }, + { + "description": "Exact match for: Ehm => E h:mm a", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "weekday": "short", + "hour": "numeric", + "minute": "numeric", + "preferences": { "hourCycle": "h12" } + } + } + }, + "output": { + "value": "Tue 8:25 AM" + } + }, + { + "description": "Exact match for: Ehms => E h:mm:ss a", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "weekday": "short", + "hour": "numeric", + "minute": "numeric", + "second": "numeric", + "preferences": { "hourCycle": "h12" } + } + } + }, + "output": { + "value": "Tue 8:25:07 AM" + } + }, + { + "description": "Exact match for: EHm => E HH:mm", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "weekday": "short", + "hour": "numeric", + "minute": "numeric", + "preferences": { "hourCycle": "h23" } + } + } + }, + "output": { + "value": "Tue 08:25" + } + }, + { + "description": "Exact match for: EHms => E HH:mm:ss", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "weekday": "short", + "hour": "numeric", + "minute": "numeric", + "second": "numeric", + "preferences": { "hourCycle": "h23" } + } + } + }, + "output": { + "value": "Tue 08:25:07" + } + }, + { + "description": "Exact match for: h => h a", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "hour": "numeric", + "preferences": { "hourCycle": "h12" } + } + } + }, + "output": { + "value": "8 AM" + } + }, + { + "description": "Exact match for: hm => h:mm a", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "hour": "numeric", + "minute": "numeric", + "preferences": { "hourCycle": "h12" } + } + } + }, + "output": { + "value": "8:25 AM" + } + }, + { + "description": "Exact match for: hms => h:mm:ss a", + 
"input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "hour": "numeric", + "minute": "numeric", + "second": "numeric", + "preferences": { "hourCycle": "h12" } + } + } + }, + "output": { + "value": "8:25:07 AM" + } + }, + { + "description": "Exact match for: H => HH", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "hour": "numeric", + "preferences": { "hourCycle": "h23" } + } + } + }, + "output": { + "value": "08" + } + }, + { + "description": "Exact match for: Hm => HH:mm", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "hour": "numeric", + "minute": "numeric", + "preferences": { "hourCycle": "h23" } + } + } + }, + "output": { + "value": "08:25" + } + }, + { + "description": "Exact match for: Hms => HH:mm:ss", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "hour": "numeric", + "minute": "numeric", + "second": "numeric", + "preferences": { "hourCycle": "h23" } + } + } + }, + "output": { + "value": "08:25:07" + } + }, + { + "description": "Exact match for: ms => mm:ss", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "minute": "numeric", + "second": "numeric" + } + } + }, + "output": { + "value": "25:07" + } + } +] diff --git a/components/datetime/tests/fixtures/tests/components-width-differences.json b/components/datetime/tests/fixtures/tests/components-width-differences.json new file mode 100644 index 00000000000..cf1a801a1e1 --- /dev/null +++ b/components/datetime/tests/fixtures/tests/components-width-differences.json @@ -0,0 +1,17 @@ +[ + { + "description": "Width difference: y vs yy", + "input": { + "locale": "en", + "value": "2020-01-07T08:25:07.000", + "options": { + "components": { + "year": "two-digit" + } + } + }, + "output": { + "value": "20" + } + } +] diff --git 
a/components/datetime/tests/fixtures/tests/components.json b/components/datetime/tests/fixtures/tests/components.json deleted file mode 100644 index 07bbc2ffe38..00000000000 --- a/components/datetime/tests/fixtures/tests/components.json +++ /dev/null @@ -1,23 +0,0 @@ -[ - { - "input": { - "locale": "en", - "value": "2020-01-21T08:25:07.000", - "options": { - "components": { - "weekday": "long", - "month": "long", - "day": "two-digit", - "year": "numeric", - "hour": "numeric", - "minute": "numeric", - "second": "numeric", - "time_zone_name": "long" - } - } - }, - "output": { - "value": "Tuesday, January 21, 2020 at 8:25:07 AM zzzz" - } - } -]