diff --git a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs index eea11ecf..dbb03444 100644 --- a/sudachi-cli/src/build.rs +++ b/sudachi-cli/src/build.rs @@ -27,6 +27,8 @@ use sudachi::dic::build::report::DictPartReport; use sudachi::dic::build::DictBuilder; use sudachi::dic::dictionary::JapaneseDictionary; use sudachi::dic::grammar::Grammar; +use sudachi::dic::header::HeaderVersion; +use sudachi::dic::lexicon::word_infos::WordInfo; use sudachi::dic::lexicon_set::LexiconSet; use sudachi::dic::word_id::WordId; use sudachi::dic::DictionaryLoader; @@ -76,9 +78,17 @@ pub(crate) enum BuildCli { #[command(name = "dump")] Dump { - dict: PathBuf, + /// target dictionary to dump + dictionary: PathBuf, + /// dump target (matrix, pos, winfo) part: String, + /// output file output: PathBuf, + + /// reference system dictionary. + /// required to dump winfo of an user dictionary + #[arg(short = 's', long = "system")] + system: Option, }, } @@ -101,7 +111,12 @@ pub fn build_main(subcommand: BuildCli) { match subcommand { BuildCli::System { common, matrix } => build_system(common, matrix), BuildCli::User { common, dictionary } => build_user(common, dictionary), - BuildCli::Dump { dict, part, output } => dump_part(dict, part, output), + BuildCli::Dump { + dictionary, + part, + output, + system, + } => dump_part(dictionary, system, part, output), } } @@ -172,31 +187,36 @@ fn output_file(p: &Path) -> File { OpenOptions::new() .write(true) .create_new(true) - .open(&p) + .open(p) .unwrap_or_else(|e| panic!("failed to open {:?} for writing:\n{:?}", p, e)) } -fn dump_part(dict: PathBuf, part: String, output: PathBuf) { - let file = File::open(&dict).expect("open failed"); - let data = unsafe { Mmap::map(&file) }.expect("mmap failed"); +fn dump_part(dict: PathBuf, system: Option, part: String, output: PathBuf) { + let file = File::open(dict).expect("open dict failed"); + let data = unsafe { Mmap::map(&file) }.expect("mmap dict failed"); let loader = unsafe { DictionaryLoader::read_any_dictionary(&data) }.expect("failed to load dictionary"); - let dict = loader.to_loaded().expect("should contain grammar"); let outf = output_file(&output); let mut writer = BufWriter::new(outf); match part.as_str() { - "pos" => dump_pos(dict.grammar(), &mut writer), - "matrix" => dump_matrix(dict.grammar(), &mut writer), - "winfo" => dump_word_info(dict.lexicon(), &mut writer).unwrap(), + "pos" => dump_pos(loader, &mut writer), + "matrix" => dump_matrix(loader, &mut writer), + "winfo" => dump_word_info(loader, system, &mut writer).unwrap(), _ => unimplemented!(), } writer.flush().unwrap(); } -fn dump_pos(grammar: &Grammar, w: &mut W) { - for p in grammar.pos_list.iter() { +fn dump_pos(dict: DictionaryLoader, w: &mut W) { + let dict = dict + .to_loaded() + .expect("target dict should contain grammar"); + let grammar = dict.grammar(); + + for (id, p) in grammar.pos_list.iter().enumerate() { + write!(w, "{},", id).unwrap(); for (i, e) in p.iter().enumerate() { w.write_all(e.as_bytes()).unwrap(); if (i + 1) == p.len() { @@ -208,35 +228,86 @@ fn dump_pos(grammar: &Grammar, w: &mut W) { } } -fn dump_matrix(grammar: &Grammar, w: &mut W) { +fn dump_matrix(dict: DictionaryLoader, w: &mut W) { + if let HeaderVersion::UserDict(_) = dict.header.version { + panic!("user dictionary does not have connection matrix.") + } + + let dict = dict + .to_loaded() + .expect("target dict should contain grammar"); + let grammar = dict.grammar(); let conn = grammar.conn_matrix(); - write!(w, "{} {}", conn.num_left(), conn.num_right()).unwrap(); + writeln!(w, "{} {}", conn.num_left(), conn.num_right()).unwrap(); for left in 0..conn.num_left() { for right in 0..conn.num_right() { let cost = conn.cost(left as _, right as _); - write!(w, "{} {} {}\n", left, right, cost).unwrap(); + writeln!(w, "{} {} {}", left, right, cost).unwrap(); } } } -fn dump_word_info(lex: &LexiconSet, w: &mut W) -> SudachiResult<()> { - let size = lex.size(); +fn dump_word_info( + dict: DictionaryLoader, + system: Option, + w: &mut W, +) -> SudachiResult<()> { + let is_user = match dict.header.version { + HeaderVersion::UserDict(_) => true, + HeaderVersion::SystemDict(_) => false, + }; + let did = if is_user { 1 } else { 0 }; + let size = dict.lexicon.size(); + + let data = system.map(|system_path| { + let file = File::open(system_path).expect("open system failed"); + unsafe { Mmap::map(&file) }.expect("mmap system failed") + }); + let system = data.as_ref().map(|data| { + let loader = DictionaryLoader::read_system_dictionary(data) + .expect("failed to load system dictionary"); + loader + .to_loaded() + .expect("failed to load system dictionary") + }); + + let (base, user) = if is_user { + ( + system.expect("system dictionary is required to dump user dictionary lexicon"), + Some(dict), + ) + } else { + (dict.to_loaded().expect("failed to load dictionary"), None) + }; + + let mut lex = base.lexicon_set; + let mut grammar = base.grammar; + if let Some(udic) = user { + lex.append(udic.lexicon, grammar.pos_list.len())?; + if let Some(g) = udic.grammar { + grammar.merge(g) + } + } + for i in 0..size { - let wid = WordId::checked(0, i)?; + let wid = WordId::checked(did, i)?; let (left, right, cost) = lex.get_word_param(wid); let winfo = lex.get_word_info(wid)?; + write!(w, "{},", unicode_escape(winfo.surface()))?; write!(w, "{},{},{},", left, right, cost)?; - write!(w, "{},", winfo.surface())?; - write!(w, "{},", winfo.head_word_length())?; - write!(w, "{},", winfo.normalized_form())?; - write!(w, "{},", winfo.dictionary_form_word_id())?; - write!(w, "{},", winfo.reading_form())?; - dump_wids(w, winfo.a_unit_split())?; + write!(w, "{},", unicode_escape(winfo.surface()))?; // writing + write!(w, "{},", pos_string(&grammar, winfo.pos_id()))?; + write!(w, "{},", unicode_escape(winfo.reading_form()))?; + write!(w, "{},", unicode_escape(winfo.normalized_form()))?; + let dict_form = dictionary_form_string(&grammar, &lex, winfo.dictionary_form_word_id()); + write!(w, "{},", dict_form)?; + write!(w, "{},", split_mode(&winfo))?; + dump_wids(w, &grammar, &lex, winfo.a_unit_split())?; w.write_all(b",")?; - dump_wids(w, winfo.b_unit_split())?; + dump_wids(w, &grammar, &lex, winfo.b_unit_split())?; w.write_all(b",")?; - dump_wids(w, winfo.word_structure())?; + dump_wids(w, &grammar, &lex, winfo.word_structure())?; w.write_all(b",")?; dump_gids(w, winfo.synonym_group_ids())?; w.write_all(b"\n")?; @@ -244,23 +315,76 @@ fn dump_word_info(lex: &LexiconSet, w: &mut W) -> SudachiResult<()> { Ok(()) } -fn dump_wids(w: &mut W, data: &[WordId]) -> SudachiResult<()> { +fn unicode_escape(raw: &str) -> String { + // replace '"' and ',' + raw.to_string() + .replace('"', "\\u0022") + .replace(',', "\\u002c") +} + +fn split_mode(winfo: &WordInfo) -> &str { + let asplits = winfo.a_unit_split(); + if asplits.is_empty() { + return "A"; + } + let bsplits = winfo.b_unit_split(); + if bsplits.is_empty() { + return "B"; + } + "C" +} + +fn pos_string(grammar: &Grammar, posid: u16) -> String { + let pos_parts = grammar.pos_components(posid); + pos_parts.join(",") +} + +fn dictionary_form_string(grammar: &Grammar, lex: &LexiconSet, wid: i32) -> String { + if wid < 0 { + return "*".to_string(); + } + let wid_with_dic = WordId::checked(0, wid as u32).expect("invalid wordid"); + format!("\"{}\"", wordref_string(grammar, lex, &wid_with_dic)) +} + +fn wordref_string(grammar: &Grammar, lex: &LexiconSet, wid: &WordId) -> String { + let winfo = lex.get_word_info(*wid).expect("failed to get wordinfo"); + format!( + "{},{},{}", + unicode_escape(winfo.surface()), + pos_string(grammar, winfo.pos_id()), + unicode_escape(winfo.reading_form()), + ) +} + +fn dump_wids( + w: &mut W, + grammar: &Grammar, + lex: &LexiconSet, + data: &[WordId], +) -> SudachiResult<()> { + if data.is_empty() { + write!(w, "*")?; + return Ok(()); + } + w.write_all(b"\"")?; for (i, e) in data.iter().enumerate() { - let prefix = match e.dic() { - 0 => "", - _ => "U", - }; - write!(w, "{}{}", prefix, e.word())?; + write!(w, "{}", wordref_string(grammar, lex, e))?; if i + 1 != data.len() { w.write_all(b"/")?; } } + w.write_all(b"\"")?; Ok(()) } fn dump_gids(w: &mut W, data: &[u32]) -> SudachiResult<()> { + if data.is_empty() { + write!(w, "*")?; + return Ok(()); + } for (i, e) in data.iter().enumerate() { - write!(w, "{}", e)?; + write!(w, "{:06}", e)?; if i + 1 != data.len() { w.write_all(b"/")?; }