diff --git a/Cargo.lock b/Cargo.lock
index de36e3e3e..fe136e149 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -505,6 +505,7 @@ dependencies = [
  "dicom-core",
  "dicom-dictionary-std",
  "dicom-encoding",
+ "encoding_rs",
  "smallvec",
  "snafu",
  "tracing",
@@ -730,6 +731,15 @@ version = "0.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
 
+[[package]]
+name = "encoding_rs"
+version = "0.8.33"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1"
+dependencies = [
+ "cfg-if",
+]
+
 [[package]]
 name = "equivalent"
 version = "1.0.1"
diff --git a/parser/Cargo.toml b/parser/Cargo.toml
index ba022804d..e9df450a5 100644
--- a/parser/Cargo.toml
+++ b/parser/Cargo.toml
@@ -18,3 +18,4 @@ dicom-dictionary-std = { path = "../dictionary-std/", version = "0.6.1" }
 smallvec = "1.6.1"
 snafu = "0.7.3"
 tracing = "0.1.34"
+encoding_rs = "0.8.33"
diff --git a/parser/src/stateful/decode.rs b/parser/src/stateful/decode.rs
index bef304dbf..caeb9e426 100644
--- a/parser/src/stateful/decode.rs
+++ b/parser/src/stateful/decode.rs
@@ -230,6 +230,9 @@ pub struct StatefulDecoder<D, S, BD = BasicDecoder, TC = SpecificCharacterSet> {
     decoder: D,
     basic: BD,
     text: TC,
+    /// the character sets recorded through `set_character_set`, in order,
+    /// used to decode the component groups of person names
+    cs: Vec<TC>,
     dt_utc_offset: FixedOffset,
     buffer: Vec<u8>,
     /// the assumed position of the reader source
@@ -291,6 +294,7 @@ where
             basic: LittleEndianBasicDecoder,
             decoder: ExplicitVRLittleEndianDecoder::default(),
             text: DefaultCharacterSetCodec,
+            cs: Vec::new(),
             dt_utc_offset: FixedOffset::east_opt(0).unwrap(),
             buffer: Vec::with_capacity(PARSER_BUFFER_CAPACITY),
             position: 0,
@@ -322,6 +326,7 @@ where
             basic,
             decoder,
             text,
+            cs: Vec::new(),
             dt_utc_offset: FixedOffset::east_opt(0).unwrap(),
             buffer: Vec::with_capacity(PARSER_BUFFER_CAPACITY),
             position,
@@ -370,6 +375,62 @@ where
         })
     }
 
+    /// Read the value data of the given element into the internal buffer
+    /// and return its length in bytes.
+    ///
+    /// The reading position is not advanced here;
+    /// callers do so once the data is decoded.
+    fn read_element_data(&mut self, header: &DataElementHeader) -> Result<usize> {
+        let len = self.require_known_length(header)?;
+        self.buffer.resize_with(len, Default::default);
+        self.from
+            .read_exact(&mut self.buffer)
+            .context(ReadValueDataSnafu {
+                position: self.position,
+            })?;
+
+        Ok(len)
+    }
+
+    /// Decode the buffered value data as a sequence of
+    /// backslash-separated strings,
+    /// then advance the reading position by `len` bytes.
+    ///
+    /// Values of VR AE, CS and AS are always decoded
+    /// with the default character set.
+    fn read_value_strs_impl(
+        &mut self,
+        header: &DataElementHeader,
+        len: usize,
+    ) -> Result<Vec<String>> {
+        let parts: Result<Vec<String>> = match header.vr() {
+            VR::AE | VR::CS | VR::AS => self
+                .buffer
+                .split(|v| *v == b'\\')
+                .map(|slice| {
+                    DefaultCharacterSetCodec
+                        .decode(slice)
+                        .context(DecodeTextSnafu {
+                            position: self.position,
+                        })
+                })
+                .collect(),
+            _ => self
+                .buffer
+                .split(|v| *v == b'\\')
+                .map(|slice| {
+                    self.text.decode(slice).context(DecodeTextSnafu {
+                        position: self.position,
+                    })
+                })
+                .collect(),
+        };
+
+        self.position += len as u64;
+
+        parts
+    }
+
     fn read_value_tag(&mut self, header: &DataElementHeader) -> Result<PrimitiveValue> {
         let len = self.require_known_length(header)?;
 
@@ -403,40 +464,79 @@ where
     }
 
     fn read_value_strs(&mut self, header: &DataElementHeader) -> Result<PrimitiveValue> {
-        let len = self.require_known_length(header)?;
         // sequence of strings
-        self.buffer.resize_with(len, Default::default);
-        self.from
-            .read_exact(&mut self.buffer)
-            .context(ReadValueDataSnafu {
-                position: self.position,
-            })?;
+        let len = self.read_element_data(header)?;
 
-        let parts: Result<_> = match header.vr() {
-            VR::AE | VR::CS | VR::AS => self
-                .buffer
-                .split(|v| *v == b'\\')
-                .map(|slice| {
-                    DefaultCharacterSetCodec
-                        .decode(slice)
-                        .context(DecodeTextSnafu {
-                            position: self.position,
-                        })
-                })
-                .collect(),
-            _ => self
-                .buffer
-                .split(|v| *v == b'\\')
-                .map(|slice| {
-                    self.text.decode(slice).context(DecodeTextSnafu {
-                        position: self.position,
-                    })
-                })
-                .collect(),
-        };
-
+        let parts = self.read_value_strs_impl(header, len)?;
+
+        Ok(PrimitiveValue::Strs(parts.into()))
+    }
+
+    /// Read a person name value as a single string,
+    /// with its decoded parts joined back by the delimiter
+    /// on which they were split.
+    fn read_value_pn(&mut self, header: &DataElementHeader) -> Result<PrimitiveValue> {
+        let (parts, delimiter) = self.read_value_pn_parts(header)?;
+
+        Ok(PrimitiveValue::Str(parts.join(delimiter)))
+    }
+
+    /// Read a person name value as a sequence of strings,
+    /// one per decoded part.
+    fn read_value_pns(&mut self, header: &DataElementHeader) -> Result<PrimitiveValue> {
+        let (parts, _) = self.read_value_pn_parts(header)?;
+
+        Ok(PrimitiveValue::Strs(parts.into()))
+    }
+
+    /// Read and decode the parts of a person name value,
+    /// returning them alongside the delimiter which separated them.
+    ///
+    /// A value without `=` is decoded like any other sequence of strings,
+    /// split on backslashes.
+    /// Otherwise, the value is split into component groups on `=`.
+    /// With at least two character sets recorded,
+    /// the first group is decoded with the first one
+    /// and every other group with the second one.
+    /// With a single character set recorded,
+    /// the first group is decoded with the current text codec instead;
+    /// with none, the current text codec decodes all groups.
+    fn read_value_pn_parts(
+        &mut self,
+        header: &DataElementHeader,
+    ) -> Result<(Vec<String>, &'static str)> {
+        let len = self.read_element_data(header)?;
+
+        if !self.buffer.contains(&b'=') {
+            let parts = self.read_value_strs_impl(header, len)?;
+            return Ok((parts, "\\"));
+        }
+
+        let (cs1, cs2) = match self.cs.as_slice() {
+            [] => (&self.text, &self.text),
+            [only] => (&self.text, only),
+            [first, second, ..] => (first, second),
+        };
+
+        // NOTE(review): the raw bytes are split on `=` before decoding,
+        // so a backslash separating multiple names stays inside a group,
+        // and this assumes that `=` never occurs within a multi-byte sequence
+        let position = self.position;
+        let parts: Result<Vec<String>> = self
+            .buffer
+            .split(|b| *b == b'=')
+            .enumerate()
+            .map(|(i, part)| {
+                let charset = if i == 0 { cs1 } else { cs2 };
+                charset.decode(part).context(DecodeTextSnafu { position })
+            })
+            .collect();
+
+        // the whole value was read from the source,
+        // no matter how many groups it was split into
         self.position += len as u64;
-        Ok(PrimitiveValue::Strs(parts?))
+
+        Ok((parts?, "="))
     }
 
     fn read_value_str(&mut self, header: &DataElementHeader) -> Result<PrimitiveValue> {
@@ -780,6 +880,7 @@ where
 {
     fn set_character_set(&mut self, charset: SpecificCharacterSet) -> Result<()> {
         self.text = charset;
+        self.cs.push(charset);
         Ok(())
     }
 
@@ -799,14 +900,23 @@ where
             // Edge case handling strategies for
             // unsupported specific character sets should probably be considered
             // in the future. See #40 for discussion.
-            if let Some(charset) = parts.first().map(|x| x.as_ref()).and_then(|name| {
-                SpecificCharacterSet::from_code(name).or_else(|| {
-                    tracing::warn!("Unsupported character set `{}`, ignoring", name);
-                    None
-                })
-            }) {
-                self.set_character_set(charset)?;
-            }
+            // the values of this element replace
+            // any character sets recorded before it
+            self.cs.clear();
+            for name in parts.iter() {
+                match SpecificCharacterSet::from_code(name) {
+                    Some(charset) => self.set_character_set(charset)?,
+                    None => {
+                        tracing::warn!("Unsupported character set `{}`, ignoring", name);
+                    }
+                }
+            }
+            // the first supported character set remains the one
+            // used to decode text outside of person name component groups
+            if let Some(&first) = self.cs.first() {
+                self.text = first;
+            }
         }
 
         Ok(out)
@@ -920,9 +1030,8 @@ where
                 .fail()
             }
             VR::AT => self.read_value_tag(header),
-            VR::AE | VR::AS | VR::PN | VR::SH | VR::LO | VR::UC | VR::UI => {
-                self.read_value_strs(header)
-            }
+            VR::AE | VR::AS | VR::SH | VR::LO | VR::UC | VR::UI => self.read_value_strs(header),
+            VR::PN => self.read_value_pns(header),
             VR::CS => self.read_value_cs(header),
             VR::UT | VR::ST | VR::UR | VR::LT => self.read_value_str(header),
             VR::UN | VR::OB => self.read_value_ob(header),
@@ -958,7 +1067,6 @@ where
             VR::AT => self.read_value_tag(header),
             VR::AE
             | VR::AS
-            | VR::PN
             | VR::SH
             | VR::LO
             | VR::UC
@@ -968,6 +1076,7 @@ where
             | VR::DA
             | VR::TM
             | VR::DT => self.read_value_strs(header),
+            VR::PN => self.read_value_pn(header),
             VR::CS => self.read_value_cs(header),
             VR::UT | VR::ST | VR::UR | VR::LT => self.read_value_str(header),
             VR::UN | VR::OB => self.read_value_ob(header),
@@ -1065,13 +1174,16 @@ fn trim_trail_empty_bytes(mut x: &[u8]) -> &[u8] {
 mod tests {
     use super::{StatefulDecode, StatefulDecoder};
     use dicom_core::header::{DataElementHeader, HasLength, Header, Length, SequenceItemHeader};
-    use dicom_core::{Tag, VR};
+    use dicom_core::{PrimitiveValue, Tag, VR};
     use dicom_encoding::decode::basic::LittleEndianBasicDecoder;
     use dicom_encoding::decode::{
         explicit_le::ExplicitVRLittleEndianDecoder, implicit_le::ImplicitVRLittleEndianDecoder,
     };
     use dicom_encoding::text::{SpecificCharacterSet, TextCodec};
+    use encoding_rs::{ISO_2022_JP, SHIFT_JIS, UTF_8};
+    use smallvec::SmallVec;
     use std::io::{Cursor, Seek, SeekFrom};
+    use std::str;
 
     // manually crafting some DICOM data elements
     //  Tag: (0002,0002) Media Storage SOP Class UID
@@ -1479,4 +1591,192 @@ mod tests {
 
         assert_eq!(decoder.position(), 138);
     }
+
+    #[test]
+    fn test_read_value_pn_ps_3_5_h_3_2_multistr() {
+        let yamada_taro_sjis = SHIFT_JIS.encode("ヤマダ^タロウ").0;
+        let equal_ascii = b"=";
+        let yamada_taro_iso2022jp = ISO_2022_JP.encode("山田^太郎").0;
+        let equal_ascii_2 = b"=";
+        let yamada_taro_kana_iso2022jp = ISO_2022_JP.encode("ヤマダ^タロウ").0;
+
+        let mut combined = Vec::new();
+        combined.extend_from_slice(&yamada_taro_sjis);
+        combined.extend_from_slice(equal_ascii);
+        combined.extend_from_slice(&yamada_taro_iso2022jp);
+        combined.extend_from_slice(equal_ascii_2);
+        combined.extend_from_slice(&yamada_taro_kana_iso2022jp);
+
+        let mut cursor = Cursor::new(&combined);
+
+        let mut decoder = StatefulDecoder::new(
+            &mut cursor,
+            ImplicitVRLittleEndianDecoder::default(),
+            LittleEndianBasicDecoder,
+            SpecificCharacterSet::Default,
+        );
+
+        decoder
+            .set_character_set(SpecificCharacterSet::IsoIr13)
+            .unwrap();
+        decoder
+            .set_character_set(SpecificCharacterSet::IsoIr87)
+            .unwrap();
+
+        let header =
+            DataElementHeader::new(Tag(0x0010, 0x0010), VR::PN, Length(combined.len() as u32));
+
+        let result = decoder.read_value(&header);
+        assert!(result.is_ok());
+        assert_eq!(decoder.position(), combined.len() as u64);
+
+        let value = result.unwrap();
+        let expected: SmallVec<[&str; 3]> =
+            vec!["ヤマタ\u{ff9e}^タロウ", "山田^太郎", "ヤマダ^タロウ"].into();
+        match value {
+            PrimitiveValue::Strs(s) => {
+                assert_eq!(s, expected);
+            }
+            _ => panic!("Unexpected value type"),
+        }
+    }
+
+    #[test]
+    fn test_read_value_pn_ps_3_5_h_3_2() {
+        let yamada_taro_sjis = SHIFT_JIS.encode("ヤマダ^タロウ").0;
+        let equal_ascii = b"=";
+        let yamada_taro_iso2022jp = ISO_2022_JP.encode("山田^太郎").0;
+        let equal_ascii_2 = b"=";
+        let yamada_taro_kana_iso2022jp = ISO_2022_JP.encode("ヤマダ^タロウ").0;
+
+        let mut combined = Vec::new();
+        combined.extend_from_slice(&yamada_taro_sjis);
+        combined.extend_from_slice(equal_ascii);
+        combined.extend_from_slice(&yamada_taro_iso2022jp);
+        combined.extend_from_slice(equal_ascii_2);
+        combined.extend_from_slice(&yamada_taro_kana_iso2022jp);
+
+        let mut cursor = Cursor::new(&combined);
+
+        let mut decoder = StatefulDecoder::new(
+            &mut cursor,
+            ImplicitVRLittleEndianDecoder::default(),
+            LittleEndianBasicDecoder,
+            SpecificCharacterSet::Default,
+        );
+
+        decoder
+            .set_character_set(SpecificCharacterSet::IsoIr13)
+            .unwrap();
+        decoder
+            .set_character_set(SpecificCharacterSet::IsoIr87)
+            .unwrap();
+
+        let header =
+            DataElementHeader::new(Tag(0x0010, 0x0010), VR::PN, Length(combined.len() as u32));
+
+        let result = decoder.read_value_preserved(&header);
+        assert!(result.is_ok());
+        assert_eq!(decoder.position(), combined.len() as u64);
+
+        let value = result.unwrap();
+        let expected = "ヤマタ\u{ff9e}^タロウ=山田^太郎=ヤマダ^タロウ";
+        match value {
+            PrimitiveValue::Str(s) => {
+                assert_eq!(s, expected);
+            }
+            _ => panic!("Unexpected value type"),
+        }
+    }
+
+    #[test]
+    fn test_read_value_pn_ps_3_5_h_3_1_multistr() {
+        let yamada_taro_ascii = UTF_8.encode("Yamada^Taro").0;
+        let equal_ascii = b"=";
+        let yamada_taro_iso2022jp = ISO_2022_JP.encode("山田^太郎").0;
+        let equal_ascii_2 = b"=";
+        let yamada_taro_kana_iso2022jp = ISO_2022_JP.encode("ヤマダ^タロウ").0;
+
+        let mut combined = Vec::new();
+        combined.extend_from_slice(&yamada_taro_ascii);
+        combined.extend_from_slice(equal_ascii);
+        combined.extend_from_slice(&yamada_taro_iso2022jp);
+        combined.extend_from_slice(equal_ascii_2);
+        combined.extend_from_slice(&yamada_taro_kana_iso2022jp);
+
+        let mut cursor = Cursor::new(&combined);
+
+        let mut decoder = StatefulDecoder::new(
+            &mut cursor,
+            ImplicitVRLittleEndianDecoder::default(),
+            LittleEndianBasicDecoder,
+            SpecificCharacterSet::Default,
+        );
+
+        decoder
+            .set_character_set(SpecificCharacterSet::IsoIr87)
+            .unwrap();
+
+        let header =
+            DataElementHeader::new(Tag(0x0010, 0x0010), VR::PN, Length(combined.len() as u32));
+
+        let result = decoder.read_value(&header);
+        assert!(result.is_ok());
+        assert_eq!(decoder.position(), combined.len() as u64);
+
+        let value = result.unwrap();
+        let expected: SmallVec<[&str; 3]> =
+            vec!["Yamada^Taro", "山田^太郎", "ヤマダ^タロウ"].into();
+        match value {
+            PrimitiveValue::Strs(s) => {
+                assert_eq!(s, expected);
+            }
+            _ => panic!("Unexpected value type"),
+        }
+    }
+
+    #[test]
+    fn test_read_value_pn_ps_3_5_h_3_1() {
+        let yamada_taro_ascii = UTF_8.encode("Yamada^Taro").0;
+        let equal_ascii = b"=";
+        let yamada_taro_iso2022jp = ISO_2022_JP.encode("山田^太郎").0;
+        let equal_ascii_2 = b"=";
+        let yamada_taro_kana_iso2022jp = ISO_2022_JP.encode("ヤマダ^タロウ").0;
+
+        let mut combined = Vec::new();
+        combined.extend_from_slice(&yamada_taro_ascii);
+        combined.extend_from_slice(equal_ascii);
+        combined.extend_from_slice(&yamada_taro_iso2022jp);
+        combined.extend_from_slice(equal_ascii_2);
+        combined.extend_from_slice(&yamada_taro_kana_iso2022jp);
+
+        let mut cursor = Cursor::new(&combined);
+
+        let mut decoder = StatefulDecoder::new(
+            &mut cursor,
+            ImplicitVRLittleEndianDecoder::default(),
+            LittleEndianBasicDecoder,
+            SpecificCharacterSet::Default,
+        );
+
+        decoder
+            .set_character_set(SpecificCharacterSet::IsoIr87)
+            .unwrap();
+
+        let header =
+            DataElementHeader::new(Tag(0x0010, 0x0010), VR::PN, Length(combined.len() as u32));
+
+        let result = decoder.read_value_preserved(&header);
+        assert!(result.is_ok());
+        assert_eq!(decoder.position(), combined.len() as u64);
+
+        let value = result.unwrap();
+        let expected = "Yamada^Taro=山田^太郎=ヤマダ^タロウ";
+        match value {
+            PrimitiveValue::Str(s) => {
+                assert_eq!(s, expected);
+            }
+            _ => panic!("Unexpected value type"),
+        }
+    }
 }