Skip to content

Commit

Permalink
Feat/support more languages in handles (#2163)
Browse files Browse the repository at this point in the history
# Goal
The goal of this PR is to:
1. support more languages in handles, 
2. close some gaps in language support for existing ones
3. disallow a '/' in handles.

Closes #2162 
Co-authored-by: Wil Wade <wil.wade@amplica.io>
  • Loading branch information
shannonwells authored Sep 27, 2024
1 parent 3a59277 commit c8f184f
Show file tree
Hide file tree
Showing 5 changed files with 276 additions and 43 deletions.
187 changes: 164 additions & 23 deletions pallets/handles/src/handles-utils/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,168 @@

use core::ops::RangeInclusive;

/// Character that are allowed.
pub const ALLOWED_UNICODE_CHARACTER_RANGES: [RangeInclusive<u16>; 21] = [
0x0020..=0x007F, // Basic Latin
0x0080..=0x00FF, // Latin-1 Supplement
0x0100..=0x017F, // Latin Extended-A
0x0370..=0x03FF, // Greek and Coptic
0x0400..=0x04FF, // Cyrillic
0x0500..=0x052F, // Cyrillic Supplementary
0x0590..=0x05FF, // Hebrew
0x0600..=0x06FF, // Arabic
0x0900..=0x097F, // Devanagari
0x0980..=0x09FF, // Bengali
0x0E00..=0x0E7F, // Thai
0x1100..=0x11FF, // Hangul Jamo
0x1E00..=0x1EFF, // Latin Extended Additional
0x1F00..=0x1FFF, // Greek Extended
0x3040..=0x309F, // Hiragana
0x30A0..=0x30FF, // Katakana
0x3400..=0x4DBF, // CJK Unified Ideographs Extension A
0x4E00..=0x9FFF, // CJK Unified Ideographs
0xAC00..=0xD7AF, // Hangul Syllables
0xF900..=0xFAFF, // CJK Compatibility Ideographs
0xFB50..=0xFDFF, // Arabic Presentation Forms-A
/// Builds a compacted copy of `ALLOWED_UNICODE_CHARACTER_RANGES` in which every run of
/// directly adjacent ranges (`previous.end + 1 == next.start`) is merged into a single range.
///
/// The input list must already be sorted by start value; ranges are only compared with
/// their immediate predecessor. Each decision is printed so that the merge can be reviewed
/// when the table is regenerated.
#[cfg(test)]
pub fn build_allowed_char_ranges() -> Vec<RangeInclusive<u16>> {
    let mut compacted: Vec<RangeInclusive<u16>> = Vec::new();
    // The range currently being grown; `None` until the first input range has been seen.
    let mut pending: Option<RangeInclusive<u16>> = None;

    // assumes the list is sorted!
    for allowed in ALLOWED_UNICODE_CHARACTER_RANGES {
        let allowed_start = *allowed.start();
        let allowed_end = *allowed.end();
        pending = match pending {
            // `checked_add` keeps a range ending at 0xFFFF from overflowing the comparison.
            Some(previous) if previous.end().checked_add(1) == Some(allowed_start) => {
                let previous_start = *previous.start();
                let previous_end = *previous.end();
                println!(
                    "joining {previous_start:#X}..{previous_end:#X} with {allowed_start:#X}..{allowed_end:#X}"
                );
                Some(RangeInclusive::new(previous_start, allowed_end))
            },
            Some(previous) => {
                let previous_start = *previous.start();
                let previous_end = *previous.end();
                println!("adding {previous_start:#X}..{previous_end:#X}");
                compacted.push(previous);
                Some(allowed)
            },
            None => Some(allowed),
        };
    }

    // Flush the range that was still being grown when the input ran out; without this the
    // final entry of the list would be lost.
    if let Some(previous) = pending {
        let previous_start = *previous.start();
        let previous_end = *previous.end();
        println!("adding {previous_start:#X}..{previous_end:#X}");
        compacted.push(previous);
    }
    compacted
}

/// Characters that are allowed.
///
/// Inclusive code point ranges, sorted ascending, with directly adjacent script blocks
/// merged into one entry. This is generated using test_build_allowed_char_ranges from the
/// per-script reference list; the final entry (Arabic Presentation Forms-A) is part of that
/// reference list and must not be lost when the table is regenerated.
#[rustfmt::skip]
pub const ALLOWED_UNICODE_CHARACTER_RANGES: [RangeInclusive<u16>; 55] = [
    0x0020..=0x007A,
    0x0080..=0x024F,
    0x02B0..=0x04FF,
    0x0531..=0x058A,
    0x0591..=0x05F4,
    0x0600..=0x07B1,
    0x07C0..=0x07FA,
    0x0900..=0x097F,
    0x0981..=0x09FB,
    0x0A01..=0x0A75,
    0x0A81..=0x0AF1,
    0x0B01..=0x0B77,
    0x0B82..=0x0BFA,
    0x0C01..=0x0C7F,
    0x0C82..=0x0CF2,
    0x0D02..=0x0D7F,
    0x0D82..=0x0DF4,
    0x0E01..=0x0E5B,
    0x0E81..=0x0EDD,
    0x0F00..=0x0FDA,
    0x1000..=0x10FC,
    0x1100..=0x137C,
    0x1380..=0x1399,
    0x13A0..=0x13F4,
    0x1400..=0x167F,
    0x1700..=0x1714,
    0x1720..=0x1736,
    0x1740..=0x1753,
    0x1760..=0x1773,
    0x1780..=0x17F9,
    0x1800..=0x18AA,
    0x18B0..=0x18F5,
    0x1900..=0x1974,
    0x1980..=0x1AAD,
    0x1B00..=0x1B7C,
    0x1B80..=0x1BB9,
    0x1BC0..=0x1C7F,
    0x1E00..=0x1FFF,
    0x200C..=0x206F,
    0x2C80..=0x2CFF,
    0x2D30..=0x2D7F,
    0x3040..=0x30FF,
    0x3400..=0x4DBF,
    0x4E00..=0x9FFF,
    0xA500..=0xA62B,
    0xA880..=0xA8D9,
    0xA8E0..=0xA8FB,
    0xA900..=0xA95F,
    0xA980..=0xA9DF,
    0xAA00..=0xAA7B,
    0xAA80..=0xAADF,
    0xABC0..=0xABF9,
    0xAC00..=0xD7AF,
    0xF900..=0xFAFF,
    0xFB50..=0xFDFF,
];

// Keep this to show what languages are supported and to generate a new compact
// list whenever the list is updated.
// pub const ALLOWED_UNICODE_CHARACTER_RANGES: [RangeInclusive<u16>; 75] = [
// 0x0020..=0x007A, // BasicLatin
// 0x0080..=0x00FF, // Latin-1 Supplement
// 0x0100..=0x017F, // Latin Extended-A
// 0x0180..=0x024F, // Latin Extended-B
// 0x02B0..=0x02FF, // Spacing Modifier Letters
// 0x0300..=0x036F, // Combining diacritical marks
// 0x0370..=0x03FF, // Greek and Coptic
// 0x0400..=0x04FF, // Cyrillic
// 0x0531..=0x058A, // Armenian
// 0x0591..=0x05F4, // Hebrew
// 0x0600..=0x06FF, // Arabic
// 0x0700..=0x074F, // Syriac
// 0x0750..=0x077F, // ArabicSupplement
// 0x0780..=0x07B1, // Thaana
// 0x07C0..=0x07FA, // N'Ko
// 0x0900..=0x097F, // Devanagari
// 0x0981..=0x09FB, // Bengali
// 0x0A01..=0x0A75, // Gurmukhi
// 0x0A81..=0x0AF1, // Gujarati
// 0x0B01..=0x0B77, // Oriya
// 0x0B82..=0x0BFA, // Tamil
// 0x0C01..=0x0C7F, // Telugu
// 0x0C82..=0x0CF2, // Kannada
// 0x0D02..=0x0D7F, // Malayalam
// 0x0D82..=0x0DF4, // Sinhala
// 0x0E01..=0x0E5B, // Thai
// 0x0E81..=0x0EDD, // Lao
// 0x0F00..=0x0FDA, // Tibetan
// 0x1000..=0x109F, // Myanmar
// 0x10A0..=0x10FC, // Georgian
// 0x1100..=0x11FF, // HangulJamo
// 0x1200..=0x137C, // Ethiopic
// 0x1380..=0x1399, // EthiopicSupplement
// 0x13A0..=0x13F4, // Cherokee
// 0x1400..=0x167F, // UnifiedCanadianAboriginalSyllabics
// 0x1700..=0x1714, // Tagalog
// 0x1720..=0x1736, // Hanunoo
// 0x1740..=0x1753, // Buhid
// 0x1760..=0x1773, // Tagbanwa
// 0x1780..=0x17F9, // Khmer
// 0x1800..=0x18AA, // Mongolian
// 0x18B0..=0x18F5, // Unified Canadian Aboriginal Syllabics Extended
// 0x1900..=0x194F, // Limbu
// 0x1950..=0x1974, // Tai Le
// 0x1980..=0x19DF, // New Tai Le
// 0x19E0..=0x19FF, // Khmer Symbols
// 0x1A00..=0x1A1F, // Buginese
// 0x1A20..=0x1AAD, // Tai Tham
// 0x1B00..=0x1B7C, // Balinese
// 0x1B80..=0x1BB9, // Sundanese
// 0x1BC0..=0x1BFF, // Batak
// 0x1C00..=0x1C4F, // Lepcha
// 0x1C50..=0x1C7F, // Ol Chiki
// 0x1E00..=0x1EFF, // Latin Extended Additional
// 0x1F00..=0x1FFF, // Greek Extended
// 0x200C..=0x206F, // General punctuation, used in some languages to indicate syllables such as glottal stops
// 0x2C80..=0x2CFF, // Coptic
// 0x2D30..=0x2D7F, // Tifinagh
// 0x3040..=0x309F, // Hiragana
// 0x30A0..=0x30FF, // Katakana
// 0x3400..=0x4DBF, // CJK Unified Ideographs Extension A
// 0x4E00..=0x9FFF, // CJK Unified Ideographs
// 0xA500..=0xA62B, // Vai
// 0xA880..=0xA8D9, // Saurashtra
// 0xA8E0..=0xA8FB, // Devanagari Extended
// 0xA900..=0xA92F, // Kayah Li
// 0xA930..=0xA95F, // Rejang
// 0xA980..=0xA9DF, // Javanese
// 0xAA00..=0xAA5F, // Cham
// 0xAA60..=0xAA7B, // Myanmar Extended-A
// 0xAA80..=0xAADF, // Tai Viet
// 0xABC0..=0xABF9, // Meetei Mayek
// 0xAC00..=0xD7AF, // Hangul Syllables
// 0xF900..=0xFAFF, // CJK Compatibility Ideographs
// 0xFB50..=0xFDFF, // Arabic Presentation Forms-A
// ];
17 changes: 17 additions & 0 deletions pallets/handles/src/handles-utils/src/tests/constants_tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#[path = "../../constants.rs"]
mod constants;
use constants::*;

// Regeneration helper for the compacted ALLOWED_UNICODE_CHARACTER_RANGES table.
// To regenerate: in constants.rs, comment out the compacted constant, uncomment the
// per-script reference list, run this test with `--ignored --nocapture`, and paste the
// printed ranges back into the compacted constant.
#[test]
#[ignore = "use only to regenerate compacted ALLOWED_UNICODE_CHARACTER_RANGES"]
fn test_build_allowed_char_ranges() {
    let res = build_allowed_char_ranges();

    // Structural checks instead of a hard-coded count, so the test stays valid whenever
    // the source list changes.
    assert!(!res.is_empty(), "compaction produced no ranges");
    assert!(
        res.len() <= ALLOWED_UNICODE_CHARACTER_RANGES.len(),
        "compaction must never produce more ranges than it was given"
    );
    for pair in res.windows(2) {
        // Widen to u32 so that `end + 1` cannot overflow.
        let previous_end = u32::from(*pair[0].end());
        let next_start = u32::from(*pair[1].start());
        assert!(
            previous_end + 1 < next_start,
            "ranges ending at {previous_end:#06X} and starting at {next_start:#06X} should have been merged or are out of order"
        );
    }

    // Zero-padded so the output can be pasted into the constant as-is.
    for range in res {
        let start = range.start();
        let end = range.end();
        println!("{start:#06X}..={end:#06X},")
    }
}
1 change: 1 addition & 0 deletions pallets/handles/src/handles-utils/src/tests/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
mod constants_tests;
mod converter_tests;
mod suffix_tests;
mod validator_tests;
110 changes: 92 additions & 18 deletions pallets/handles/src/handles-utils/src/tests/validator_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ fn test_is_reserved_canonical_handle_negative() {
// Every handle below contains at least one blocked character and must be reported as such.
#[test]
fn test_contains_blocked_characters_happy_path() {
    let handles: Vec<&str> = vec![
        "@lbert",
        "coca:cola",
        "#freemont",
        "charles.darwin",
        "`String`",
        ":(){ :|:& };:/",
        // '/' is the only blocked character here, so this case fails if '/' is ever
        // dropped from the blocked list (the entry above is blocked by ':' alone).
        "and/or",
    ];
    for handle in handles {
        assert!(contains_blocked_characters(handle), "expected a blocked character in {handle}");
    }
}
Expand All @@ -39,37 +39,111 @@ fn test_contains_blocked_characters_negative() {
}
}

// To validate a new language, add a string/sentence in that language below and run the test.
// A whole sentence is preferable to a single name: it can reveal additional character ranges
// (punctuation, combining marks, joiners) that the language needs for rendering.
// Unicode groups: https://www.unicodepedia.com/groups/ for character ranges
// If you don't know why a test is failing, decode the string here to check the range:
// https://unicodedecode.com/
// Translations of "I can eat glass" from https://www.kermitproject.org/utf8.html
// Some translations: https://translate.glosbe.com/
// Others from Wikipedia
// Many are (supposed to be) common names or greetings, or translations of "beautiful flower"
#[rustfmt::skip]
#[test]
fn test_consists_of_supported_unicode_character_sets_happy_path() {
    let strings_containing_characters_in_supported_unicode_character_sets = Vec::from([
        "John", // Basic Latin
        "Álvaro", // Latin-1 Supplement
        "가영", // Hangul Syllables
        "가나다", // Hangul Syllables
        "アキラ", // Katakana
        "あいこ", // Hiragana
        "李明", // CJK Unified Ideographs
        "严勇", // CJK Unified Ideographs
        "龍", // CJK Unified Ideographs
        "অমিত", // Bengali
        "आरव", // Devanagari
        "Александр", // Cyrillic
        "私はガラスを食べられますそれは私を傷つけません", // Japanese
        "Αλέξανδρος", // Greek and Coptic
        "Ἀναξαγόρας", // Greek Extended
        "กัญญา", // Thai
        "ابجدهوزحطيكلمنسعفصقرشتثخذضظغءعمر", // Arabic
        "דָּנִיֵּאלאבּבגּגדּדהווּוֹזחטי ִיכּךּכךלמםנןסעפּףּפףצץקרשׁשׂתּת", // Hebrew
        "AaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóRrSsŚśYyZzŹźŻż", // Polish
        "ÄäÖöÜüẞß", // German
        "AÁBCČDĎEÉĚFGHChIÍJKLMNŇOÓPQRŘSŠTŤUÚŮVWXYÝZŽaábcčdďeéěfghchiíjklmnňoópqrřsštťuúůvwxyýzž", // Czech
        "αιαιαιᾳειειηιῃοιοιυιυιωιῳαυαυᾹυᾱυευευηυηυουουωυωυγγγγγκγκγξγξγχγχμπμπντντΖζΤΖτζ", // Greek
        "ÅåÄäÖö", // Swedish
        "ÅåÄäÖöŠšŽž", // Finnish
        "ÆæØøÅå", // Danish
        "Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։", // Armenian
        "ܐܠܦ ܒܝܬ ܣܘܪܝܝܐ", // Syriac
        "ދިވެހިބަސް", // Thaana
        "ߒߞߏ ߞߊ߲ߜߍ", // N'Ko
        "शक्नोम्यत्तुम्", // Devanagari
        "म काँच खान सक्छू र मलाई केहि नी हुन्‍न् ।", // Nepali
        "আমিকাঁচখেতেপারিতাতেআমারকোনোক্ষতিহয়না।", // Bengali
        "मीकाचखाऊशकतोमलातेदुखतनाही", // Marathi
        "ನನಗೆಹಾನಿಆಗದೆ,ನಾನುಗಜನ್ನುತಿನಬಹುದು", // Kannada
        "मैंकाँचखासकतीहूँऔरमुझेउससेकोईचोटनहींपहुंचती", // Hindi
        "நான்கண்ணாடிசாப்பிடுவேன்,அதனால்எனக்குஒருகேடும்வராது", // Tamil
        "నేనుగాజుతినగలనుమరియుఅలాచేసినానాకుఏమిఇబ్బందిలేదు", // Telugu
        " මටවීදුරුකෑමටහැකියි.එයින්මටකිසිහානියක්සිදුනොවේ", // Sinhalese
        " 我能吞下玻璃而不伤身体", // Chinese
        " 我能吞下玻璃而不傷身體", // Chinese (Traditional)
        "ฉันกินกระจกได้แต่มันไม่ทำให้ฉันเจ็บ", // Thai
        "ຂອ້ຍກິນແກ້ວໄດ້ໂດຍທີ່ມັນບໍ່ໄດ້ເຮັດໃຫ້ຂອ້ຍເຈັບ", // Lao
        " ཤེལ་སྒོ་ཟ་ནས་ང་ན་གི་མ་རེད།", // Tibetan
        "က္ယ္ဝန္‌တော္‌၊က္ယ္ဝန္‌မ မ္ယက္‌စားနုိင္‌သည္‌။ ၎က္ရောင္‌့ ထိခုိက္‌မ္ဟု မရ္ဟိပာ။", // Burmese (Unicode 4.0)
        "ကျွန်တော် ကျွန်မ မှန်စားနိုင်တယ်။ ၎င်းကြောင့် ထိခိုက်မှုမရှိပါ။", // Burmese (Unicode 5.0)
        "თამარი მადლობა", // Georgian
        "እናመሰግናለን አቢታ መልካም ቀን", // Ethiopian
        "ᜀᜆᜇ᜔ ᜇᜃᜓ", // Hanunoo
        "ᝊᝓᝑᝒᝇ ᝌᝃ ᝈᝅᝋ ", // Buhid
        "ᝐᝓᝆᝎᝓ ᝐᝆᝓ", // Tagbanwa
        "Би шил идэй чадна, надад хортой биш", // Mongolian (Cyrillic)
        "ᠪᠢ ᠰᠢᠯᠢ ᠢᠳᠡᠶᠦ ᠴᠢᠳᠠᠨᠠ ᠂ ᠨᠠᠳᠤᠷ ᠬᠣᠤᠷᠠᠳᠠᠢ ᠪᠢᠰᠢ", // Mongolian (Classic) (5)
        " ᐊᓕᒍᖅ ᓂᕆᔭᕌᖓᒃᑯ ᓱᕋᙱᑦᑐᓐᓇᖅᑐᖓ", // Inuktitut
        "ᤋᤠᤱᤛᤠ ᤕᤠᤰᤁᤢ ", // Limbu
        "ᥕᥤᥒᥱ ᥘᥦᥝᥲ", // Tai Le
        "ᦉᦱᧃ ᦃᦺᦟᦹ", // New Tai Le
        "ᨆᨗᨕᨚ ᨅᨔᨒᨀ", // Buginese
        "ᨠᩯᩬ ᨴᩱᨶᩣ ᨧᩥᨶᩬᩁᩣ", // Tai Tham
        "ᬳᬸᬜ ᬳᬶᬦ ᬳᬸᬢ᭄ᬤᬸᬳ᭄ᬯᬸᬭ᭄", // Balinese
        "ᮞᮀᮛᮥᮔ᮪ ᮞᮩᮞᮤ ᮊᮔ᮪ᮓᮥ", // Sundanese
        "ᯀᯩᯖ᯲ᯔ ᯂᯞᯒ ᯊᯭᯉᯮ ᯂᯪᯒᯖᯮ ᯘᯮ", // Batak
        "ᰗᰱᰠ ᰛᰥᰧ ᰛᰣᰵ ᰔᰠᰯ", // Lepcha
        "ᱪᱮᱫᱮ ᱨᱩᱜ ᱢᱟᱦᱟᱭ ᱚᱲᱤᱠ", // Ol Chiki
        "ⲙⲁⲣⲓⲁ ⲟⲩⲁⲣⲉⲟⲩ ⲡⲉⲗⲓⲛⲟⲛ", // Coptic
        "ⴰⵎⵎⵉⵙⵏⴰ ⴰⵎⵍⵓⵍ ⵉⵎⴰⵍⵉⵏ", // Tifinagh http://tifinaghtools.eazypo.ca/
        "ꕉꕜꕮ ꔔꘋ ꖸ ꔰ ꗋꘋ ꕮꕨ ꔔꘋ ꖸ ꕎ ꕉꖸꕊ ꕴꖃ ꕃꔤꘂ ꗱ, ꕉꖷ ꗪꗡ ꔻꔤ ꗏꗒꗡ ꕎ ꗪ ꕉꖸꕊ ꖏꕎ", // Vai
        "ꢪꢶꢥꢳ ꢥ꣄ꢳꢯꢳ", // Saurashtra
        "ꤊꤢꤛꤢ꤭ ꤜꤟꤤ꤬ ꤞ꤮ꤣ ꤟꤢꤨ꤭ ꤊꤢ", // Kayah Li
        " ꤰꥍꤲꥒ ꤿꥍꥎꥂ ꥆꤰ꥓ꤼꤽ ꤽꥍꤺꥏ ", // Rejang
        "ꦲꦏ꧀ꦱꦫ ꦮꦾꦚ꧀ꦗꦤ ꦩꦒꦢꦁ ꦧꦸꦭꦏ꧀ꦭꦏ꧀", // Javanese
        "ꨀꨇꩉ ꨌꩌ ꨤꨨꨪꩀ ꨎꨳꨯꨮꩆ ꨕꨴꨭꩅ ꨕꨴꨭꩈ ꨨꨕꨯꩌ ꨨꨣꨬ", // Cham
        "ꪎꪳ ꪼꪕ ꪣꪱ꫁ꪙ ꪕꪴ", // Tai Viet
        "ꯁꯤꯗꯤ ꯑꯩꯁꯨ ꯃꯩꯇꯩ ꯃꯌꯦꯛ ꯏꯕ ꯍꯩꯔꯅꯤ ꯕꯨ", // Meetei Mayek https://abhisanoujam.github.io/meitei_mayek/
        "ᏌᏃᏂ ᎣᏏᏲ ᏙᎯᏧ ᏣᎳᎩ ᎦᏬᏂᎯᏍᏗ ᏓᎾᏁᎵᏗᎲᎢ", // Cherokee https://language.cherokee.org/word-list/ and https://chren.cs.unc.edu/
        "Tsésǫʼ yishą́ągo bííníshghah dóó doo shił neezgai da.", // Navajo
        "ᜋᜄᜇᜅ᜔ᜇᜅ᜔ ᜊᜓᜎᜃ᜔ᜃᜎᜃ᜔", // Tagalog
        "میں کانچکھاسکتاہوںورمجھےتکلیفنہیںہوتی", // Urdu
        "شيشهخوړلېشمهغه ما نه خوږوي", // Pashto
        " .من می توانم بدونِ احساس درد شيشه بخورم", // Farsi / Persian(3)
        "أنا قادر على أكل الزجاج و هذا لا يؤلمني. ", // Arabic
        " إِنا إِىَ تَونَر غِلَاشِ كُمَ إِن غَمَا لَافِىَا", // Hausa
        "Tôi có thể ăn thủy tinh mà không hại gì.", // Vietnamese (quốc ngữ)
        " ខ្ញុំអាចញុំកញ្ចក់បាន ដោយគ្មានបញ្ហារ ", // Khmer
        "Góa ē-tàng chia̍h po-lê mā bē tio̍h-siong", // Taiwanese
        " 나는 유리를 먹을 수 있어요. 그래도 아프지 않아요", // Korean
        "mi kakne le nu citka le blaci .iku'i le se go'i na xrani mi", // Lojban
        " Ljœr ye caudran créneþ ý jor cẃran.", // Nórdicg
        " Ég get etið gler án þess að meiða mig.", // Íslenska / Icelandic
        " Mogę jeść szkło, i mi nie szkodzi.", // Polish
        " Pot să mănânc sticlă și ea nu mă rănește.", // Romanian
        " Я можу їсти шкло, й воно мені не пошкодить.", // Ukrainian
    ]);

    for string in strings_containing_characters_in_supported_unicode_character_sets {
        // Single assertion with a message, so a failure names the offending sample.
        assert!(consists_of_supported_unicode_character_sets(string), "failed at {string}");
    }
}

Expand Down
4 changes: 2 additions & 2 deletions pallets/handles/src/handles-utils/src/validator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ fn ensure_reserved_words_canonical() {
}

/// Characters that cannot be used in the handle.
///
/// The entries are listed in ascending code point order; keep new entries in order so the
/// table remains usable with `binary_search`.
const BLOCKED_CHARACTERS: [char; 17] =
    ['"', '#', '%', '(', ')', ',', '.', '/', ':', ';', '<', '>', '@', '\\', '`', '{', '}'];

// We MUST have the BLOCKED_CHARACTERS constant sorted or we cannot use the faster `binary_search` function.
// Cannot easily be sorted at compile time currently
Expand Down

0 comments on commit c8f184f

Please sign in to comment.