Skip to content

Commit

Permalink
update tests
Browse files Browse the repository at this point in the history
  • Loading branch information
shannonwells committed Sep 24, 2024
1 parent 6f13964 commit f730d78
Show file tree
Hide file tree
Showing 2 changed files with 132 additions and 80 deletions.
162 changes: 84 additions & 78 deletions pallets/handles/src/handles-utils/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,82 +2,88 @@

use core::ops::RangeInclusive;

/// Character that are allowed.
pub const ALLOWED_UNICODE_CHARACTER_RANGES: [RangeInclusive<u16>; 76] = [
0x0000..=0x007F, // BasicLatin
0x0080..=0x00FF, // Latin-1 Supplement
0x0100..=0x017F, // Latin Extended
0x0370..=0x03FF, // Greek and Coptic
0x0400..=0x04FF, // Cyrillic
0x0531..=0x058A, // Armenian
0x0591..=0x05F4, // Hebrew
0x0600..=0x06FF, // Arabic
0x0700..=0x074F, // Syriac
0x0750..=0x077F, // ArabicSupplement
0x0780..=0x07B1, // Thaana
0x07C0..=0x07FA, // NKo
0x0800..=0x083E, // Samaritan
0x0840..=0x085E, // Mandaic
0x0900..=0x097F, // Devanagari
0x0981..=0x09FB, // Bengali
0x0A01..=0x0A75, // Gurmukhi
0x0A81..=0x0AF1, // Gujarati
0x0B01..=0x0B77, // Oriya
0x0B82..=0x0BFA, // Tamil
0x0C01..=0x0C7F, // Telugu
0x0C82..=0x0CF2, // Kannada
0x0D02..=0x0D7F, // Malayalam
0x0D82..=0x0DF4, // Sinhala
0x0E01..=0x0E5B, // Thai
0x0E81..=0x0EDD, // Lao
0x0F00..=0x0FDA, // Tibetan
0x1000..=0x109F, // Myanmar
0x10A0..=0x10FC, // Georgian
0x1100..=0x11FF, // HangulJamo
0x1200..=0x137C, // Ethiopic
0x1380..=0x1399, // EthiopicSupplement
0x13A0..=0x13F4, // Cherokee
0x1400..=0x167F, // UnifiedCanadianAboriginalSyllabics
0x1680..=0x169C, // Ogham
0x16A0..=0x16F0, // Runic
0x1700..=0x1714, // Tagalog
0x1720..=0x1736, // Hanunoo
0x1740..=0x1753, // Buhid
0x1760..=0x1773, // Tagbanwa
0x1780..=0x17F9, // Khmer
0x1800..=0x18AA, // Mongolian
0x18B0..=0x18F5, // Unified Canadian Aboriginal Syllabics Extended
0x1900..=0x194F, // Limbu
0x1950..=0x1974, // TaiLe
0x1980..=0x19DF, // NewTaiLue
0x19E0..=0x19FF, // KhmerSymbols
0x1A00..=0x1A1F, // Buginese
0x1A20..=0x1AAD, // TaiTham
0x1B00..=0x1B7C, // Balinese
0x1B80..=0x1BB9, // Sundanese
0x1BC0..=0x1BFF, // Batak
0x1C00..=0x1C4F, // Lepcha
0x1C50..=0x1C7F, // Ol Chiki
0x1E00..=0x1EFF, // Latin Extended Additional
0x1F00..=0x1FFF, // Greek Extended
0x2C80..=0x2CFF, // Coptic
0x2D30..=0x2D7F, // Tifinagh
0x3040..=0x309F, // Hiragana
0x30A0..=0x30FF, // Katakana
0x3400..=0x4DBF, // CJK Unified Ideographs Extension A
0x4E00..=0x9FFF, // CJK Unified Ideographs
0xA4D0..=0xA4FF, // Lisu
0xA500..=0xA62B, // Vai
0xA840..=0xA877, // Phags-pa
0xA880..=0xA8D9, // Saurashtra
0xA8E0..=0xA8FB, // Devanagari Extended
0xA900..=0xA92F, // KayahLi
0xA930..=0xA95F, // Rejang
0xA980..=0xA9DF, // Javanese
0xAA00..=0xAA5F, // Cham
0xAA80..=0xAADF, // Tai Viet
0xABC0..=0xABF9, // Meetei Mayek
0xAC00..=0xD7AF, // Hangul Syllables
0xF900..=0xFAFF, // CJK Compatibility Ideographs
0xFB50..=0xFDFF, // Arabic Presentation Forms-A
/// Unicode code point ranges whose characters are allowed.
///
/// Every entry is an inclusive range of 16-bit code point values, so only
/// characters in the Basic Multilingual Plane can be expressed here.
///
/// Entries are kept in ascending, non-overlapping order. When adding a block,
/// insert it at its sorted position and bump the array length accordingly.
/// The trailing comment on each line names the Unicode block the range is
/// taken from.
#[rustfmt::skip]
pub const ALLOWED_UNICODE_CHARACTER_RANGES: [RangeInclusive<u16>; 81] = [
    0x0020..=0x007A, // BasicLatin (subset: space through 'z')
    0x0080..=0x00FF, // Latin-1 Supplement
    0x0100..=0x017F, // Latin Extended-A
    0x0180..=0x024F, // Latin Extended-B
    0x02B0..=0x02FF, // Spacing Modifier Letters
    0x0300..=0x036F, // Combining diacritical marks
    0x0370..=0x03FF, // Greek and Coptic
    0x0400..=0x04FF, // Cyrillic
    0x0531..=0x058A, // Armenian
    0x0591..=0x05F4, // Hebrew
    0x0600..=0x06FF, // Arabic
    0x0700..=0x074F, // Syriac
    0x0750..=0x077F, // ArabicSupplement
    0x0780..=0x07B1, // Thaana
    0x07C0..=0x07FA, // NKo
    0x0800..=0x083E, // Samaritan
    0x0840..=0x085E, // Mandaic
    0x0900..=0x097F, // Devanagari
    0x0981..=0x09FB, // Bengali
    0x0A01..=0x0A75, // Gurmukhi
    0x0A81..=0x0AF1, // Gujarati
    0x0B01..=0x0B77, // Oriya
    0x0B82..=0x0BFA, // Tamil
    0x0C01..=0x0C7F, // Telugu
    0x0C82..=0x0CF2, // Kannada
    0x0D02..=0x0D7F, // Malayalam
    0x0D82..=0x0DF4, // Sinhala
    0x0E01..=0x0E5B, // Thai
    0x0E81..=0x0EDD, // Lao
    0x0F00..=0x0FDA, // Tibetan
    0x1000..=0x109F, // Myanmar
    0x10A0..=0x10FC, // Georgian
    0x1100..=0x11FF, // HangulJamo
    0x1200..=0x137C, // Ethiopic
    0x1380..=0x1399, // EthiopicSupplement
    0x13A0..=0x13F4, // Cherokee
    0x1400..=0x167F, // UnifiedCanadianAboriginalSyllabics
    0x1680..=0x169C, // Ogham
    0x16A0..=0x16F0, // Runic
    0x1700..=0x1714, // Tagalog
    0x1720..=0x1736, // Hanunoo
    0x1740..=0x1753, // Buhid
    0x1760..=0x1773, // Tagbanwa
    0x1780..=0x17F9, // Khmer
    0x1800..=0x18AA, // Mongolian
    0x18B0..=0x18F5, // Unified Canadian Aboriginal Syllabics Extended
    0x1900..=0x194F, // Limbu
    0x1950..=0x1974, // TaiLe
    0x1980..=0x19DF, // NewTaiLue
    0x19E0..=0x19FF, // KhmerSymbols
    0x1A00..=0x1A1F, // Buginese
    0x1A20..=0x1AAD, // TaiTham
    0x1B00..=0x1B7C, // Balinese
    0x1B80..=0x1BB9, // Sundanese
    0x1BC0..=0x1BFF, // Batak
    0x1C00..=0x1C4F, // Lepcha
    0x1C50..=0x1C7F, // Ol Chiki
    0x1E00..=0x1EFF, // Latin Extended Additional
    0x1F00..=0x1FFF, // Greek Extended
    0x200C..=0x206F, // General punctuation, used in some languages to indicate syllables such as glottal stops
    0x2C80..=0x2CFF, // Coptic
    0x2D30..=0x2D7F, // Tifinagh
    0x3040..=0x309F, // Hiragana
    0x30A0..=0x30FF, // Katakana
    0x3400..=0x4DBF, // CJK Unified Ideographs Extension A
    0x4E00..=0x9FFF, // CJK Unified Ideographs
    0xA4D0..=0xA4FF, // Lisu
    0xA500..=0xA62B, // Vai
    0xA840..=0xA877, // Phags-pa
    0xA880..=0xA8D9, // Saurashtra
    0xA8E0..=0xA8FB, // Devanagari Extended
    0xA900..=0xA92F, // KayahLi
    0xA930..=0xA95F, // Rejang
    0xA980..=0xA9DF, // Javanese
    0xAA00..=0xAA5F, // Cham
    0xAA60..=0xAA7B, // Myanmar Extended-A
    0xAA80..=0xAADF, // Tai Viet
    0xABC0..=0xABF9, // Meetei Mayek
    0xAC00..=0xD7AF, // Hangul Syllables
    0xF900..=0xFAFF, // CJK Compatibility Ideographs
    0xFB50..=0xFDFF, // Arabic Presentation Forms-A
];
50 changes: 48 additions & 2 deletions pallets/handles/src/handles-utils/src/tests/validator_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ fn test_is_reserved_canonical_handle_negative() {
#[test]
fn test_contains_blocked_characters_happy_path() {
let handles: Vec<&str> =
vec!["@lbert", "coca:cola", "#freemont", "charles.darwin", "`String`", ":(){ :|:& };:"];
vec!["@lbert", "coca:cola", "#freemont", "charles.darwin", "`String`", ":(){ :|:& };:/"];
for handle in handles {
assert!(contains_blocked_characters(handle));
}
Expand All @@ -39,6 +39,13 @@ fn test_contains_blocked_characters_negative() {
}
}

// To validate a new test case, add a string or sentence in the new language and run the test.
// Testing a whole sentence can reveal additional character ranges needed to render the language.
// For character ranges, see the Unicode groups at https://www.unicodepedia.com/groups/
// If you don't know why a test is failing, decode the string here to check which range it falls in:
// https://unicodedecode.com/
// Translations of "I can eat glass" are taken from https://www.kermitproject.org/utf8.html
#[rustfmt::skip]
#[test]
fn test_consists_of_supported_unicode_character_sets_happy_path() {
let strings_containing_characters_in_supported_unicode_character_sets = Vec::from([
Expand All @@ -56,7 +63,7 @@ fn test_consists_of_supported_unicode_character_sets_happy_path() {
"Александр", // Cyrillic
"Αλέξανδρος", // Greek and Coptic
"Ἀναξαγόρας", // Greek Extended
"กัญญา", // Thai
"ฉันกินกระจกได้แต่มันไม่ทำให้ฉันเจ็บ", // Thai
"ابجدهوزحطيكلمنسعفصقرشتثخذضظغءعمر", // Arabic
"דָּנִיֵּאלאבּבגּגדּדהווּוֹזחטי ִיכּךּכךלמםנןסעפּףּפףצץקרשׁשׂתּת", // Hewbrew
"AaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóRrSsŚśYyZzŹźŻż", // Polish
Expand All @@ -66,6 +73,45 @@ fn test_consists_of_supported_unicode_character_sets_happy_path() {
"ÅåÄäÖö", // Swedish
"ÅåÄäÖöŠšŽž", // Finnish
"ÆæØøÅå", // Danish
"ᏌᏃᏂ ᎠᏁᏴ", // Cherokee
"Կրնամ", // Armenian
"शक्नोम्यत्तुम्", // Devanagari
"მინას", // Georgian
"আমিকাঁচখেতেপারিতাতেআমারকোনোক্ষতিহয়না।", // Bengali
"मीकाचखाऊशकतोमलातेदुखतनाही", // Marathi
"ನನಗೆಹಾನಿಆಗದೆ,ನಾನುಗಜನ್ನುತಿನಬಹುದು", // Kannada
"मैंकाँचखासकतीहूँऔरमुझेउससेकोईचोटनहींपहुंचती", // Hindi
"நான்கண்ணாடிசாப்பிடுவேன்,அதனால்எனக்குஒருகேடும்வராது", // Tamil
"నేనుగాజుతినగలనుమరియుఅలాచేసినానాకుఏమిఇబ్బందిలేదు", // Telugu
" මටවීදුරුකෑමටහැකියි.එයින්මටකිසිහානියක්සිදුනොවේ", // Sinhalese
"میں کانچکھاسکتاہوںورمجھےتکلیفنہیںہوتی", // Urdu
"شيشهخوړلېشمهغه ما نه خوږوي", // Pashto
" .من می توانم بدونِ احساس درد شيشه بخورم", // Farsi / Persian(3)
"أنا قادر على أكل الزجاج و هذا لا يؤلمني. ", // Arabic
" إِنا إِىَ تَونَر غِلَاشِ كُمَ إِن غَمَا لَافِىَا", // Hausa
"က္ယ္ဝန္‌တော္‌၊က္ယ္ဝန္‌မ မ္ယက္‌စားနုိင္‌သည္‌။ ၎က္ရောင္‌့ ထိခုိက္‌မ္ဟု မရ္ဟိပာ။", // Burmese (Unicode 4.0):
"ကျွန်တော် ကျွန်မ မှန်စားနိုင်တယ်။ ၎င်းကြောင့် ထိခိုက်မှုမရှိပါ။", // Burmese (Unicode 5.0):
"Tôi có thể ăn thủy tinh mà không hại gì.", // Vietnamese (quốc ngữ)
" ខ្ញុំអាចញុំកញ្ចក់បាន ដោយគ្មានបញ្ហារ ", // Khmer:
"ຂອ້ຍກິນແກ້ວໄດ້ໂດຍທີ່ມັນບໍ່ໄດ້ເຮັດໃຫ້ຂອ້ຍເຈັບ", // Lao:
"Би шил идэй чадна, надад хортой биш", // Mongolian (Cyrillic):
"ᠪᠢ ᠰᠢᠯᠢ ᠢᠳᠡᠶᠦ ᠴᠢᠳᠠᠨᠠ ᠂ ᠨᠠᠳᠤᠷ ᠬᠣᠤᠷᠠᠳᠠᠢ ᠪᠢᠰᠢ", // Mongolian (Classic) (5):
"म काँच खान सक्छू र मलाई केहि नी हुन्‍न् ।", // Nepali:
" ཤེལ་སྒོ་ཟ་ནས་ང་ན་གི་མ་རེད།", // Tibetan:
" 我能吞下玻璃而不伤身体", // Chinese:
" 我能吞下玻璃而不傷身體", // Chinese (Traditional):
"Góa ē-tàng chia̍h po-lê mā bē tio̍h-siong", // Taiwanese
" 私はガラスを食べられますそれは私を傷つけません", // Japanese:
" 나는 유리를 먹을 수 있어요. 그래도 아프지 않아요", // Korean:
" ᐊᓕᒍᖅ ᓂᕆᔭᕌᖓᒃᑯ ᓱᕋᙱᑦᑐᓐᓇᖅᑐᖓ", // Inuktitut
" Tsésǫʼ yishą́ągo bííníshghah dóó doo shił neezgai da.", // Navajo:
" mi kakne le nu citka le blaci .iku'i le se go'i na xrani mi", // Lojban:
" Ljœr ye caudran créneþ ý jor cẃran.", // Nórdicg:
" Ég get etið gler án þess að meiða mig.", // Íslenska / Icelandic
" Mogę jeść szkło, i mi nie szkodzi.", // Polish:
" Pot să mănânc sticlă și ea nu mă rănește.", // Romanian:
" Я можу їсти шкло, й воно мені не пошкодить.", // Ukrainian:
" Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։", // Armenian:
]);

for string in strings_containing_characters_in_supported_unicode_character_sets {
Expand Down

0 comments on commit f730d78

Please sign in to comment.