From 3742586f8411102a3293c774f9fda6571e37534c Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Mon, 20 May 2024 01:10:09 -0400 Subject: [PATCH] Mark more `Prepended_Concatenation_Mark`s as non-advancing --- scripts/unicode.py | 11 +++++++++-- src/lib.rs | 9 +++++++-- src/tables.rs | 6 +++--- tests/tests.rs | 22 +++++++++++++++------- 4 files changed, 34 insertions(+), 14 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index f748711..80affa5 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -241,10 +241,17 @@ def load_zero_widths() -> "list[bool]": # width 2. Therefore, we treat it as having width 2. zw_map[0x115F] = False - # Syriac abbreviation mark - # This is a `Prepended_Concatenation_Mark`, but unlike the others it's zero-width + # Syriac abbreviation mark: + # Zero-width `Prepended_Concatenation_Mark` zw_map[0x070F] = True + # Some Arabic `Prepended_Concatenation_Mark`s + # https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G27820 + zw_map[0x0605] = True + zw_map[0x0890] = True + zw_map[0x0891] = True + zw_map[0x08E2] = True + # U+A8FA DEVANAGARI CARET # https://www.unicode.org/versions/Unicode15.0.0/ch12.pdf#G667447 zw_map[0xA8FA] = True diff --git a/src/lib.rs b/src/lib.rs index eb3900e..8e3c4ab 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -57,7 +57,12 @@ //! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43). //! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D) //! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`). -//! - `'\u{070F}'` [SYRIAC] ABBREVIATION MARK. +//! - The following [`Prepended_Concatenation_Mark`]s: +//! - [`'\u{0605}'` NUMBER MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0605), +//! - [`'\u{070F}'` SYRIAC ABBREVIATION MARK](https://util.unicode.org/UnicodeJsps/character.jsp?a=070F), +//! 
- [`'\u{0890}'` POUND MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0890), +//! - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and +//! - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2). //! - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA). //! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D) //! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2. @@ -70,6 +75,7 @@ //! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation //! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443 //! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593 +//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G37908 //! //! [`Fullwidth`]: https://www.unicode.org/reports/tr11/#ED2 //! [`Wide`]: https://www.unicode.org/reports/tr11/#ED4 @@ -80,7 +86,6 @@ //! //! [Enclosed Ideographic Supplement]: https://unicode.org/charts/PDF/U1F200.pdf //! -//! [Syriac]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G13006 //! [Lisu tone letter]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078 //! //! 
## Canonical equivalence diff --git a/src/tables.rs b/src/tables.rs index f4464f7..521bd8e 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -320,7 +320,7 @@ pub mod charwidth { 0x00, 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x41, 0x10, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, - 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x00, 0x00, 0x40, 0x54, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x55, 0x55, 0x55, 0x51, 0x55, 0x55, 0x00, 0x00, 0x40, 0x54, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x54, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x00, 0x14, 0x00, 0x14, 0x04, 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, @@ -330,8 +330,8 @@ pub mod charwidth { 0x55, 0x55, 0x55, 0x15, 0x00, 0x00, 0x55, 0x55, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x10, 0x00, 0x00, 0x01, 0x01, 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x01, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, - 0x55, 0x55, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x55, + 0x50, 0x55, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x45, 0x54, 0x01, 0x00, 0x54, 0x51, 0x01, 0x00, 0x55, 0x55, 0x05, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, diff --git a/tests/tests.rs b/tests/tests.rs index 97aa497..44c2aa9 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -99,14 +99,22 @@ fn 
test_jamo() { #[test] fn test_prepended_concatenation_marks() { - assert_eq!('\u{0600}'.width(), Some(1)); - assert_eq!('\u{08E2}'.width(), Some(1)); - assert_eq!('\u{110BD}'.width(), Some(1)); -} + for c in [ + '\u{0600}', + '\u{0601}', + '\u{0602}', + '\u{0603}', + '\u{0604}', + '\u{06DD}', + '\u{110BD}', + '\u{110CD}', + ] { + assert_eq!(c.width(), Some(1), "{c:?} should have width 1"); + } -#[test] -fn test_syriac_abbreviation_mark() { - assert_eq!('\u{070F}'.width(), Some(0)); + for c in ['\u{0605}', '\u{070F}', '\u{0890}', '\u{0891}', '\u{08E2}'] { + assert_eq!(c.width(), Some(0), "{c:?} should have width 0"); + } } #[test]