Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin committed Jun 28, 2022
2 parents 409ce0e + 7599633 commit e6236ff
Show file tree
Hide file tree
Showing 23 changed files with 409 additions and 83 deletions.
2 changes: 1 addition & 1 deletion CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ experimental/codepointtrie/ @echeran
experimental/collator/ @hsivonen @echeran
experimental/normalizer/ @hsivonen @echeran
experimental/provider_ppucd/ @echeran
experimental/segmenter/ @aethanyc @makotokato
experimental/segmenter/ @aethanyc @makotokato @sffc
ffi/capi/ @Manishearth
ffi/cpp/ @Manishearth
ffi/ecma402/ @filmil
Expand Down
1 change: 1 addition & 0 deletions components/icu/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@
clippy::panic
)
)]
#![warn(missing_docs)]

pub mod calendar {
//! Contains the core types used by ICU4X for dealing
Expand Down
3 changes: 1 addition & 2 deletions components/list/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

#![warn(missing_docs)]

//! [`icu_list`](crate) provides the [`ListFormatter`] which renders sequences of [`Writeable`](
//! writeable::Writeable)s as lists in a locale-sensitive way.
//!
Expand Down Expand Up @@ -60,6 +58,7 @@
clippy::exhaustive_enums
)
)]
#![warn(missing_docs)]

extern crate alloc;

Expand Down
10 changes: 7 additions & 3 deletions components/locid/src/extensions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,7 @@ pub enum ExtensionType {
}

impl ExtensionType {
#[allow(missing_docs)] // TODO(#1028) - Add missing docs.
pub fn from_byte(key: u8) -> Result<Self, ParserError> {
pub(crate) fn from_byte(key: u8) -> Result<Self, ParserError> {
let key = key.to_ascii_lowercase();
match key {
b'u' => Ok(Self::Unicode),
Expand All @@ -88,12 +87,17 @@ impl ExtensionType {

/// A map of extensions associated with a given [`Locale`](crate::Locale).
#[derive(Debug, Default, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)]
#[allow(missing_docs)] // TODO(#1028) - Add missing docs.
#[non_exhaustive]
pub struct Extensions {
    /// A representation of the data for a Unicode extension, when present in the locale identifier.
    pub unicode: Unicode,
    /// A representation of the data for a transform extension, when present in the locale identifier.
    pub transform: Transform,
    /// A representation of the data for a private-use extension, when present in the locale identifier.
    pub private: Private,
    /// A sequence of any other extensions that are present in the locale identifier
    /// but do not belong to one of the formally [defined](https://unicode.org/reports/tr35/)
    /// kinds represented explicitly above as [`Unicode`], [`Transform`], and [`Private`].
    pub other: Vec<Other>,
}

Expand Down
2 changes: 1 addition & 1 deletion components/locid/src/extensions/other/key.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ const KEY_LENGTH: RangeInclusive<usize> = 2..=8;

impl Key {
#[allow(missing_docs)] // TODO(#1028) - Add missing docs.
pub fn valid_key(v: &[u8]) -> bool {
pub(crate) fn valid_key(v: &[u8]) -> bool {
KEY_LENGTH.contains(&v.len())
}

Expand Down
4 changes: 3 additions & 1 deletion components/locid/src/extensions/transform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,12 @@ use litemap::LiteMap;
/// [`RFC 6497`]: https://www.ietf.org/rfc/rfc6497.txt
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/#Unicode_locale_identifier
#[derive(Clone, PartialEq, Eq, Debug, Default, Hash, PartialOrd, Ord)]
#[allow(missing_docs)] // TODO(#1028) - Add missing docs.
#[allow(clippy::exhaustive_structs)] // spec-backed stable datastructure
pub struct Transform {
    /// The [`LanguageIdentifier`] specified with this locale extension, or `None` if not present.
    pub lang: Option<LanguageIdentifier>,
    /// The key-value pairs present in this locale extension, with each extension key subtag
    /// associated with its provided value subtag.
    pub fields: Fields,
}

Expand Down
13 changes: 6 additions & 7 deletions components/locid/src/extensions/transform/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,6 @@ use core::ops::RangeInclusive;
use core::str::FromStr;
use tinystr::TinyAsciiStr;

#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)]
#[allow(missing_docs)] // TODO(#1028) - Add missing docs.
pub struct Value(Vec<TinyAsciiStr<{ *TYPE_LENGTH.end() }>>);

const TYPE_LENGTH: RangeInclusive<usize> = 3..=8;
const TRUE_TVALUE: TinyAsciiStr<8> = tinystr::tinystr!(8, "true");

/// A value used in a list of [`Fields`](super::Fields).
///
/// The value has to be a sequence of one or more alphanumerical strings
Expand All @@ -35,6 +28,12 @@ const TRUE_TVALUE: TinyAsciiStr<8> = tinystr::tinystr!(8, "true");
/// assert_eq!(&value1.to_string(), "hybrid");
/// assert_eq!(&value2.to_string(), "hybrid-foobar");
/// ```
#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)]
pub struct Value(Vec<TinyAsciiStr<{ *TYPE_LENGTH.end() }>>);

// Each subtag of a transform value must be 3 to 8 characters long.
const TYPE_LENGTH: RangeInclusive<usize> = 3..=8;
// The special tvalue "true" — presumably the implicit/default value that is
// treated specially during parsing/serialization; confirm against the impl below.
const TRUE_TVALUE: TinyAsciiStr<8> = tinystr::tinystr!(8, "true");

impl Value {
/// A constructor which takes a utf8 slice, parses it and
/// produces a well-formed [`Value`].
Expand Down
92 changes: 92 additions & 0 deletions components/locid/src/extensions/unicode/keywords.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use core::borrow::Borrow;
use core::cmp::Ordering;
use core::iter::FromIterator;
use litemap::LiteMap;

use super::Key;
use super::Value;
use crate::ordering::SubtagOrderingResult;

/// A list of [`Key`]-[`Value`] pairs representing functional information
/// about a locale's internationalization preferences.
Expand Down Expand Up @@ -270,6 +272,96 @@ impl Keywords {
self.0.retain(|k, _| predicate(k))
}

/// Compare this [`Keywords`] with BCP-47 bytes.
///
/// Produces the same result as first serializing this [`Keywords`] to its
/// BCP-47 string form and then byte-comparing that string against `other`.
///
/// The comparison is case-sensitive and forms a *total order*, so it is safe
/// to use for binary search; `self.to_string()` is the only argument for
/// which [`Ordering::Equal`] is returned.
///
/// # Examples
///
/// ```
/// use icu::locid::Locale;
/// use icu::locid::extensions::unicode::Keywords;
/// use std::cmp::Ordering;
///
/// let bcp47_strings: &[&str] = &[
///     "ca-hebrew",
///     "ca-japanese",
///     "ca-japanese-nu-latn",
///     "nu-latn",
/// ];
///
/// for ab in bcp47_strings.windows(2) {
///     let a = ab[0];
///     let b = ab[1];
///     assert!(a.cmp(b) == Ordering::Less);
///     let a_kwds = format!("und-u-{}", a).parse::<Locale>().unwrap().extensions.unicode.keywords;
///     assert_eq!(a, a_kwds.to_string());
///     assert!(a_kwds.strict_cmp(a.as_bytes()) == Ordering::Equal);
///     assert!(a_kwds.strict_cmp(b.as_bytes()) == Ordering::Less);
/// }
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
    // Break `other` into its hyphen-delimited subtags, then resolve the
    // subtag-by-subtag comparison into a definite ordering.
    let other_subtags = other.split(|b| *b == b'-');
    self.strict_cmp_iter(other_subtags).end()
}

/// Compare this [`Keywords`] with an iterator of BCP-47 subtags.
///
/// Equality semantics are the same as for [`Keywords::strict_cmp`]; this is
/// the more modular building block, allowing multiple subtag iterators to be
/// chained together.
///
/// For an additional example, see [`SubtagOrderingResult`].
///
/// # Examples
///
/// ```
/// use icu::locid::Locale;
/// use icu::locid::extensions::unicode::Keywords;
/// use std::cmp::Ordering;
///
/// let subtags: &[&[u8]] = &[&*b"ca", &*b"buddhist"];
///
/// let kwds = "und-u-ca-buddhist".parse::<Locale>().unwrap().extensions.unicode.keywords;
/// assert_eq!(
///     Ordering::Equal,
///     kwds.strict_cmp_iter(subtags.iter().copied()).end()
/// );
///
/// let kwds = "und".parse::<Locale>().unwrap().extensions.unicode.keywords;
/// assert_eq!(
///     Ordering::Less,
///     kwds.strict_cmp_iter(subtags.iter().copied()).end()
/// );
///
/// let kwds = "und-u-nu-latn".parse::<Locale>().unwrap().extensions.unicode.keywords;
/// assert_eq!(
///     Ordering::Greater,
///     kwds.strict_cmp_iter(subtags.iter().copied()).end()
/// );
/// ```
pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
where
    I: Iterator<Item = &'l [u8]>,
{
    // Walk our own subtags in canonical order, pairing each one with the
    // next subtag drawn from the caller's iterator.
    let outcome = self.for_each_subtag_str(&mut |subtag| match subtags.next() {
        Some(expected) => match subtag.as_bytes().cmp(expected) {
            // Subtags agree: keep scanning.
            Ordering::Equal => Ok(()),
            // First mismatch decides the ordering; short-circuit with it.
            unequal => Err(unequal),
        },
        // The caller's iterator ran out first, so `self` sorts after it.
        None => Err(Ordering::Greater),
    });
    match outcome {
        // Every one of our subtags matched; hand the caller's remainder back
        // so further iterators can be chained.
        Ok(()) => SubtagOrderingResult::Subtags(subtags),
        // A decisive ordering was found along the way.
        Err(ordering) => SubtagOrderingResult::Ordering(ordering),
    }
}

pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
Expand Down
4 changes: 3 additions & 1 deletion components/locid/src/extensions/unicode/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,12 @@ use litemap::LiteMap;
/// assert_eq!(loc.extensions.unicode.keywords.get(&key), Some(&value));
/// ```
#[derive(Clone, PartialEq, Eq, Debug, Default, Hash, PartialOrd, Ord)]
#[allow(missing_docs)] // TODO(#1028) - Add missing docs.
#[allow(clippy::exhaustive_structs)] // spec-backed stable datastructure
pub struct Unicode {
    /// The key-value pairs present in this locale extension, with each extension key subtag
    /// associated with its provided value subtag.
    pub keywords: Keywords,
    /// A canonically ordered sequence of single standalone subtags for this locale extension.
    pub attributes: Attributes,
}

Expand Down
80 changes: 60 additions & 20 deletions components/locid/src/langid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
use core::cmp::Ordering;
use core::str::FromStr;

use crate::ordering::SubtagOrderingResult;
use crate::parser::{
get_subtag_iterator, parse_language_identifier, parse_language_identifier_without_variants,
ParserError, ParserMode,
Expand Down Expand Up @@ -159,10 +160,10 @@ impl LanguageIdentifier {
Ok(lang_id.to_string())
}

/// Compare this `LanguageIdentifier` with BCP-47 bytes.
/// Compare this [`LanguageIdentifier`] with BCP-47 bytes.
///
/// The return value is equivalent to what would happen if you first converted this
/// `LanguageIdentifier` to a BCP-47 string and then performed a byte comparison.
/// [`LanguageIdentifier`] to a BCP-47 string and then performed a byte comparison.
///
/// This function is case-sensitive and results in a *total order*, so it is appropriate for
/// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
Expand All @@ -173,28 +174,69 @@ impl LanguageIdentifier {
/// use icu::locid::LanguageIdentifier;
/// use std::cmp::Ordering;
///
/// let bcp47_strings: &[&[u8]] = &[
/// b"pl-Latn-PL",
/// b"und",
/// b"und-Adlm",
/// b"und-GB",
/// b"und-ZA",
/// b"und-fonipa",
/// b"zh",
/// let bcp47_strings: &[&str] = &[
/// "pl-Latn-PL",
/// "und",
/// "und-Adlm",
/// "und-GB",
/// "und-ZA",
/// "und-fonipa",
/// "zh",
/// ];
///
/// for ab in bcp47_strings.windows(2) {
/// let a = ab[0];
/// let b = ab[1];
/// assert!(a.cmp(b) == Ordering::Less);
/// let a_langid = LanguageIdentifier::from_bytes(a).unwrap();
/// assert!(a_langid.strict_cmp(b) == Ordering::Less);
/// let a_langid = a.parse::<LanguageIdentifier>().unwrap();
/// assert_eq!(a, a_langid.to_string());
/// assert!(a_langid.strict_cmp(a.as_bytes()) == Ordering::Equal);
/// assert!(a_langid.strict_cmp(b.as_bytes()) == Ordering::Less);
/// }
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
let mut other_iter = other.split(|b| *b == b'-');
self.strict_cmp_iter(other.split(|b| *b == b'-')).end()
}

/// Compare this [`LanguageIdentifier`] with an iterator of BCP-47 subtags.
///
/// This function has the same equality semantics as [`LanguageIdentifier::strict_cmp`]. It is intended as
/// a more modular version that allows multiple subtag iterators to be chained together.
///
/// For an additional example, see [`SubtagOrderingResult`].
///
/// # Examples
///
/// ```
/// use icu::locid::LanguageIdentifier;
/// use std::cmp::Ordering;
///
/// let subtags: &[&[u8]] = &[&*b"ca", &*b"ES", &*b"valencia"];
///
/// let loc = "ca-ES-valencia".parse::<LanguageIdentifier>().unwrap();
/// assert_eq!(
/// Ordering::Equal,
/// loc.strict_cmp_iter(subtags.iter().copied()).end()
/// );
///
/// let loc = "ca-ES".parse::<LanguageIdentifier>().unwrap();
/// assert_eq!(
/// Ordering::Less,
/// loc.strict_cmp_iter(subtags.iter().copied()).end()
/// );
///
/// let loc = "ca-ZA".parse::<LanguageIdentifier>().unwrap();
/// assert_eq!(
/// Ordering::Greater,
/// loc.strict_cmp_iter(subtags.iter().copied()).end()
/// );
/// ```
pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
where
I: Iterator<Item = &'l [u8]>,
{
let r = self.for_each_subtag_str(&mut |subtag| {
if let Some(other) = other_iter.next() {
if let Some(other) = subtags.next() {
match subtag.as_bytes().cmp(other) {
Ordering::Equal => Ok(()),
not_equal => Err(not_equal),
Expand All @@ -203,14 +245,12 @@ impl LanguageIdentifier {
Err(Ordering::Greater)
}
});
if let Err(o) = r {
return o;
}
if other_iter.next().is_some() {
return Ordering::Less;
match r {
Ok(_) => SubtagOrderingResult::Subtags(subtags),
Err(o) => SubtagOrderingResult::Ordering(o),
}
Ordering::Equal
}

/// Compare this `LanguageIdentifier` with a potentially unnormalized BCP-47 string.
///
/// The return value is equivalent to what would happen if you first parsed the
Expand Down
Loading

0 comments on commit e6236ff

Please sign in to comment.