From b68172e9faa4d101faffb7bee1bc3a0b466b151d Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Thu, 12 Dec 2024 20:40:41 +0000 Subject: [PATCH] Add character set support for: Arabic, Greek, Hebrew, Japanese, Thai, and Korean --- encoding/src/text.rs | 154 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 147 insertions(+), 7 deletions(-) diff --git a/encoding/src/text.rs b/encoding/src/text.rs index 919b6b8f..49b57f0e 100644 --- a/encoding/src/text.rs +++ b/encoding/src/text.rs @@ -6,23 +6,30 @@ //! | Character Set | decoding support | encoding support | //! |-------------------------------|------------------|------------------| //! | ISO-IR 6 (default) | ✓ | ✓ | +//! | ISO-IR 13 (WINDOWS_31J): The JIS X 0201-1976 character set (Japanese single-byte) | ✓ | ✓ | +//! | ISO-IR 87 (ISO_2022_JP): The JIS X 0208-1990 character set (Japanese multi-byte) | ✓ | ✓ | //! | ISO-IR 100 (ISO-8859-1): Right-hand part of the Latin alphabet no. 1, the Western Europe character set | ✓ | ✓ | //! | ISO-IR 101 (ISO-8859-2): Right-hand part of the Latin alphabet no. 2, the Central/Eastern Europe character set | ✓ | ✓ | //! | ISO-IR 109 (ISO-8859-3): Right-hand part of the Latin alphabet no. 3, the South Europe character set | ✓ | ✓ | //! | ISO-IR 110 (ISO-8859-4): Right-hand part of the Latin alphabet no. 4, the North Europe character set | ✓ | ✓ | +//! | ISO-IR 126 (ISO-8859-7): The Latin/Greek character set | ✓ | ✓ | +//! | ISO-IR 127 (ISO-8859-6): The Latin/Arabic character set | ✓ | ✓ | +//! | ISO-IR 138 (ISO-8859-8): The Latin/Hebrew character set | ✓ | ✓ | //! | ISO-IR 144 (ISO-8859-5): The Latin/Cyrillic character set | ✓ | ✓ | +//! | ISO-IR 148 (ISO-8859-9): Latin no. 5, the Turkish character set | x | x | +//! | ISO-IR 149 (WINDOWS_949): The KS X 1001 character set (Korean) | ✓ | ✓ | +//! | ISO-IR 159: The JIS X 0212-1990 character set (supplementary Japanese characters) | x | x | +//! | ISO-IR 166 (WINDOWS_874): The TIS 620-2533 character set (Thai) | ✓ | ✓ | //! 
| ISO-IR 192: The Unicode character set based on the UTF-8 encoding | ✓ | ✓ | //! | GB18030: The Simplified Chinese character set | ✓ | ✓ | -//! | JIS X 0201-1976: Code for Information Interchange | x | x | -//! | JIS X 0208-1990: Code for the Japanese Graphic Character set for information interchange | x | x | -//! | JIS X 0212-1990: Code of the supplementary Japanese Graphic Character set for information interchange | x | x | -//! | KS X 1001 (registered as ISO-IR 149) for Korean Language | x | x | -//! | TIS 620-2533 (1990) Thai Characters Code for Information Interchange | x | x | //! | GB2312: Simplified Chinese character set | x | x | //! //! These capabilities are available through [`SpecificCharacterSet`]. -use encoding::all::{GB18030, ISO_8859_1, ISO_8859_2, ISO_8859_3, ISO_8859_4, ISO_8859_5, UTF_8}; +use encoding::all::{ + GB18030, ISO_2022_JP, ISO_8859_1, ISO_8859_2, ISO_8859_3, ISO_8859_4, ISO_8859_5, ISO_8859_6, + ISO_8859_7, ISO_8859_8, UTF_8, WINDOWS_31J, WINDOWS_874, WINDOWS_949, +}; use encoding::{DecoderTrap, EncoderTrap, Encoding, RawDecoder, StringWriter}; use snafu::{Backtrace, Snafu}; use std::borrow::Cow; @@ -188,6 +195,10 @@ enum CharsetImpl { /// **ISO-IR 6**: the default character set. #[default] Default, + /// **ISO-IR 13**: The JIS X 0201-1976 character set (Japanese single-byte). + IsoIr13, + /// **ISO-IR 87**: The JIS X 0208-1990 character set (Japanese multi-byte). + IsoIr87, /// **ISO-IR 100** (ISO-8859-1): Right-hand part of the Latin alphabet no. 1, /// the Western Europe character set. IsoIr100, @@ -200,8 +211,18 @@ enum CharsetImpl { /// **ISO-IR 110** (ISO-8859-4): Right-hand part of the Latin alphabet no. 4, /// the North Europe character set. IsoIr110, + /// **ISO-IR 126** (ISO-8859-7): The Greek character set. + IsoIr126, + /// **ISO-IR 127** (ISO-8859-6): The Arabic character set. + IsoIr127, + /// **ISO-IR 138** (ISO-8859-8): The Hebrew character set. + IsoIr138, /// **ISO-IR 144** (ISO-8859-5): The Latin/Cyrillic character set. 
IsoIr144, + /// **ISO-IR 149**: The Korean character set. + IsoIr149, + /// **ISO-IR 166**: The Thai character set. + IsoIr166, /// **ISO-IR 192**: The Unicode character set based on the UTF-8 encoding. IsoIr192, /// **GB18030**: The Simplified Chinese character set. @@ -218,11 +239,18 @@ impl CharsetImpl { use self::CharsetImpl::*; match uid.trim_end() { "Default" | "ISO_IR_6" | "ISO_IR 6" | "ISO 2022 IR 6" => Some(Default), + "ISO_IR_13" | "ISO_IR 13" | "ISO 2022 IR 13" => Some(IsoIr13), + "ISO_IR_87" | "ISO_IR 87" | "ISO 2022 IR 87" => Some(IsoIr87), "ISO_IR_100" | "ISO_IR 100" | "ISO 2022 IR 100" => Some(IsoIr100), "ISO_IR_101" | "ISO_IR 101" | "ISO 2022 IR 101" => Some(IsoIr101), "ISO_IR_109" | "ISO_IR 109" | "ISO 2022 IR 109" => Some(IsoIr109), "ISO_IR_110" | "ISO_IR 110" | "ISO 2022 IR 110" => Some(IsoIr110), + "ISO_IR_126" | "ISO_IR 126" | "ISO 2022 IR 126" => Some(IsoIr126), + "ISO_IR_127" | "ISO_IR 127" | "ISO 2022 IR 127" => Some(IsoIr127), + "ISO_IR_138" | "ISO_IR 138" | "ISO 2022 IR 138" => Some(IsoIr138), "ISO_IR_144" | "ISO_IR 144" | "ISO 2022 IR 144" => Some(IsoIr144), + "ISO_IR_149" | "ISO_IR 149" | "ISO 2022 IR 149" => Some(IsoIr149), + "ISO_IR_166" | "ISO_IR 166" | "ISO 2022 IR 166" => Some(IsoIr166), "ISO_IR_192" | "ISO_IR 192" => Some(IsoIr192), "GB18030" => Some(Gb18030), _ => None, @@ -234,11 +262,18 @@ impl TextCodec for CharsetImpl { fn name(&self) -> Cow<'static, str> { Cow::Borrowed(match self { CharsetImpl::Default => "ISO_IR 6", + CharsetImpl::IsoIr13 => "ISO_IR 13", + CharsetImpl::IsoIr87 => "ISO_IR 87", CharsetImpl::IsoIr100 => "ISO_IR 100", CharsetImpl::IsoIr101 => "ISO_IR 101", CharsetImpl::IsoIr109 => "ISO_IR 109", CharsetImpl::IsoIr110 => "ISO_IR 110", + CharsetImpl::IsoIr126 => "ISO_IR 126", + CharsetImpl::IsoIr127 => "ISO_IR 127", + CharsetImpl::IsoIr138 => "ISO_IR 138", CharsetImpl::IsoIr144 => "ISO_IR 144", + CharsetImpl::IsoIr149 => "ISO_IR 149", + CharsetImpl::IsoIr166 => "ISO_IR 166", CharsetImpl::IsoIr192 => "ISO_IR 192", 
CharsetImpl::Gb18030 => "GB18030", }) @@ -247,11 +282,18 @@ impl TextCodec for CharsetImpl { fn decode(&self, text: &[u8]) -> DecodeResult { match self { CharsetImpl::Default => DefaultCharacterSetCodec.decode(text), + CharsetImpl::IsoIr13 => IsoIr13CharacterSetCodec.decode(text), + CharsetImpl::IsoIr87 => IsoIr87CharacterSetCodec.decode(text), CharsetImpl::IsoIr100 => IsoIr100CharacterSetCodec.decode(text), CharsetImpl::IsoIr101 => IsoIr101CharacterSetCodec.decode(text), CharsetImpl::IsoIr109 => IsoIr109CharacterSetCodec.decode(text), CharsetImpl::IsoIr110 => IsoIr110CharacterSetCodec.decode(text), + CharsetImpl::IsoIr126 => IsoIr126CharacterSetCodec.decode(text), + CharsetImpl::IsoIr127 => IsoIr127CharacterSetCodec.decode(text), + CharsetImpl::IsoIr138 => IsoIr138CharacterSetCodec.decode(text), CharsetImpl::IsoIr144 => IsoIr144CharacterSetCodec.decode(text), + CharsetImpl::IsoIr149 => IsoIr149CharacterSetCodec.decode(text), + CharsetImpl::IsoIr166 => IsoIr166CharacterSetCodec.decode(text), CharsetImpl::IsoIr192 => Utf8CharacterSetCodec.decode(text), CharsetImpl::Gb18030 => Gb18030CharacterSetCodec.decode(text), } @@ -260,11 +302,18 @@ impl TextCodec for CharsetImpl { fn encode(&self, text: &str) -> EncodeResult> { match self { CharsetImpl::Default => DefaultCharacterSetCodec.encode(text), + CharsetImpl::IsoIr13 => IsoIr13CharacterSetCodec.encode(text), + CharsetImpl::IsoIr87 => IsoIr87CharacterSetCodec.encode(text), CharsetImpl::IsoIr100 => IsoIr100CharacterSetCodec.encode(text), CharsetImpl::IsoIr101 => IsoIr101CharacterSetCodec.encode(text), CharsetImpl::IsoIr109 => IsoIr109CharacterSetCodec.encode(text), CharsetImpl::IsoIr110 => IsoIr110CharacterSetCodec.encode(text), + CharsetImpl::IsoIr126 => IsoIr126CharacterSetCodec.encode(text), + CharsetImpl::IsoIr127 => IsoIr127CharacterSetCodec.encode(text), + CharsetImpl::IsoIr138 => IsoIr138CharacterSetCodec.encode(text), CharsetImpl::IsoIr144 => IsoIr144CharacterSetCodec.encode(text), + CharsetImpl::IsoIr149 => 
IsoIr149CharacterSetCodec.encode(text), + CharsetImpl::IsoIr166 => IsoIr166CharacterSetCodec.encode(text), CharsetImpl::IsoIr192 => Utf8CharacterSetCodec.encode(text), CharsetImpl::Gb18030 => Gb18030CharacterSetCodec.encode(text), } @@ -338,11 +387,18 @@ impl TextCodec for DefaultCharacterSetCodec { } } +decl_character_set!(IsoIr13CharacterSetCodec, "ISO_IR 13", WINDOWS_31J); +decl_character_set!(IsoIr87CharacterSetCodec, "ISO_IR 87", ISO_2022_JP); decl_character_set!(IsoIr100CharacterSetCodec, "ISO_IR 100", ISO_8859_1); decl_character_set!(IsoIr101CharacterSetCodec, "ISO_IR 101", ISO_8859_2); decl_character_set!(IsoIr109CharacterSetCodec, "ISO_IR 109", ISO_8859_3); decl_character_set!(IsoIr110CharacterSetCodec, "ISO_IR 110", ISO_8859_4); +decl_character_set!(IsoIr126CharacterSetCodec, "ISO_IR 126", ISO_8859_7); +decl_character_set!(IsoIr127CharacterSetCodec, "ISO_IR 127", ISO_8859_6); +decl_character_set!(IsoIr138CharacterSetCodec, "ISO_IR 138", ISO_8859_8); decl_character_set!(IsoIr144CharacterSetCodec, "ISO_IR 144", ISO_8859_5); +decl_character_set!(IsoIr149CharacterSetCodec, "ISO_IR 149", WINDOWS_949); +decl_character_set!(IsoIr166CharacterSetCodec, "ISO_IR 166", WINDOWS_874); decl_character_set!(Utf8CharacterSetCodec, "ISO_IR 192", UTF_8); decl_character_set!(Gb18030CharacterSetCodec, "GB18030", GB18030); @@ -436,6 +492,19 @@ mod tests { test_codec(codec, "Smith^John", b"Smith^John"); } + #[test] + fn iso_ir_13_baseline() { + let codec = SpecificCharacterSet(CharsetImpl::IsoIr13); + test_codec(codec, "ヤマダ^タロウ", b"\xd4\xcf\xc0\xde^\xc0\xdb\xb3"); + } + + #[test] + fn iso_ir_87_baseline() { + let codec = SpecificCharacterSet(CharsetImpl::IsoIr87); + test_codec(&codec, "山田^太郎", b"\x1b$B;3ED\x1b(B^\x1b$BB@O:"); + test_codec(&codec, "やまだ^たろう", b"\x1b$B$d$^$@\x1b(B^\x1b$B$?$m$&"); + } + #[test] fn iso_ir_192_baseline() { let codec = SpecificCharacterSet::ISO_IR_192; @@ -456,13 +525,84 @@ mod tests { test_codec(codec, "Günther^Hans", b"G\xfcnther^Hans"); } + #[test] 
+ fn iso_ir_110_baseline() { + let codec = SpecificCharacterSet(CharsetImpl::IsoIr110); + test_codec(codec, "ĄĸŖĨĻ§ŠĒĢŦŽĀÁÂÃÄÅÆĮČÉ^ĘËĖÍÎĪĐŅŌĶÔÕÖ×ØŲÚÛÜŨŪß", b"\xA1\xA2\xA3\xA5\xA6\xA7\xA9\xAA\xAB\xAC\xAE\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9^\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF"); + } + + #[test] + fn iso_ir_126_baseline() { + let codec = SpecificCharacterSet(CharsetImpl::IsoIr126); + test_codec(codec, "Διονυσιος", b"\xC4\xE9\xEF\xED\xF5\xF3\xE9\xEF\xF2"); + } + + #[test] + fn iso_ir_127_baseline() { + let codec = SpecificCharacterSet(CharsetImpl::IsoIr127); + test_codec( + codec, + "قباني^لنزار", + b"\xE2\xC8\xC7\xE6\xEA^\xE4\xE6\xD2\xC7\xD1", + ); + } + + #[test] + fn iso_ir_138_baseline() { + let codec = SpecificCharacterSet(CharsetImpl::IsoIr138); + test_codec( + &codec, + "מקור השם עברית", + b"\xEE\xF7\xE5\xF8\x20\xE4\xF9\xED\x20\xF2\xE1\xF8\xE9\xFA", + ); + test_codec( + codec, + "שרון^דבורה", + b"\xF9\xF8\xE5\xEF^\xE3\xE1\xE5\xF8\xE4", + ); + } + #[test] fn iso_ir_144_baseline() { let codec = SpecificCharacterSet(CharsetImpl::IsoIr144); test_codec( - codec, + &codec, "Иванков^Андрей", b"\xb8\xd2\xd0\xdd\xda\xde\xd2^\xb0\xdd\xd4\xe0\xd5\xd9", ); + test_codec( + &codec, + "Гол. 
мозг стандарт", + b"\xB3\xDE\xDB.\x20\xDC\xDE\xD7\xD3\x20\xE1\xE2\xD0\xDD\xD4\xD0\xE0\xE2", + ); + test_codec(&codec, "мозг 2мм", b"\xDC\xDE\xD7\xD3\x202\xDC\xDC"); + } + + #[test] + fn iso_ir_149_baseline() { + let codec = SpecificCharacterSet(CharsetImpl::IsoIr149); + test_codec(&codec, "김희중", b"\xB1\xE8\xC8\xF1\xC1\xDF"); + test_codec( + codec, + "Hong^Gildong=洪^吉洞=홍^길동", + b"Hong^Gildong=\xFB\xF3^\xD1\xCE\xD4\xD7=\xC8\xAB^\xB1\xE6\xB5\xBF", + ); + } + + #[test] + fn iso_ir_166_baseline() { + let codec = SpecificCharacterSet(CharsetImpl::IsoIr166); + test_codec(&codec, "ประเทศไทย", b"\xBB\xC3\xD0\xE0\xB7\xC8\xE4\xB7\xC2"); + test_codec(codec, "รหัสสำหรับอักขระไทยที่ใช้กับคอมพิวเตอร์", b"\xC3\xCB\xD1\xCA\xCA\xD3\xCB\xC3\xD1\xBA\xCD\xD1\xA1\xA2\xC3\xD0\xE4\xB7\xC2\xB7\xD5\xE8\xE3\xAA\xE9\xA1\xD1\xBA\xA4\xCD\xC1\xBE\xD4\xC7\xE0\xB5\xCD\xC3\xEC"); + } + + #[test] + fn gb_18030_baseline() { + let codec = SpecificCharacterSet(CharsetImpl::Gb18030); + test_codec( + &codec, + "Wang^XiaoDong=王^小东", + b"Wang^XiaoDong=\xCD\xF5^\xD0\xA1\xB6\xAB", + ); } }