Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sync dump dict feature with Java version #277

Merged
merged 5 commits into from
Nov 11, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
190 changes: 157 additions & 33 deletions sudachi-cli/src/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ use sudachi::dic::build::report::DictPartReport;
use sudachi::dic::build::DictBuilder;
use sudachi::dic::dictionary::JapaneseDictionary;
use sudachi::dic::grammar::Grammar;
use sudachi::dic::header::HeaderVersion;
use sudachi::dic::lexicon::word_infos::WordInfo;
use sudachi::dic::lexicon_set::LexiconSet;
use sudachi::dic::word_id::WordId;
use sudachi::dic::DictionaryLoader;
Expand Down Expand Up @@ -76,9 +78,17 @@ pub(crate) enum BuildCli {

#[command(name = "dump")]
Dump {
dict: PathBuf,
/// target dictionary to dump
dictionary: PathBuf,
/// dump target (matrix, pos, winfo)
part: String,
/// output file
output: PathBuf,

/// reference system dictionary.
/// required to dump winfo of a user dictionary
#[arg(short = 's', long = "system")]
system: Option<PathBuf>,
},
}

Expand All @@ -101,7 +111,12 @@ pub fn build_main(subcommand: BuildCli) {
match subcommand {
BuildCli::System { common, matrix } => build_system(common, matrix),
BuildCli::User { common, dictionary } => build_user(common, dictionary),
BuildCli::Dump { dict, part, output } => dump_part(dict, part, output),
BuildCli::Dump {
dictionary,
part,
output,
system,
} => dump_part(dictionary, system, part, output),
}
}

Expand Down Expand Up @@ -172,31 +187,36 @@ fn output_file(p: &Path) -> File {
OpenOptions::new()
.write(true)
.create_new(true)
.open(&p)
.open(p)
.unwrap_or_else(|e| panic!("failed to open {:?} for writing:\n{:?}", p, e))
}

fn dump_part(dict: PathBuf, part: String, output: PathBuf) {
let file = File::open(&dict).expect("open failed");
let data = unsafe { Mmap::map(&file) }.expect("mmap failed");
fn dump_part(dict: PathBuf, system: Option<PathBuf>, part: String, output: PathBuf) {
let file = File::open(dict).expect("open dict failed");
let data = unsafe { Mmap::map(&file) }.expect("mmap dict failed");
let loader =
unsafe { DictionaryLoader::read_any_dictionary(&data) }.expect("failed to load dictionary");
let dict = loader.to_loaded().expect("should contain grammar");

let outf = output_file(&output);
let mut writer = BufWriter::new(outf);

match part.as_str() {
"pos" => dump_pos(dict.grammar(), &mut writer),
"matrix" => dump_matrix(dict.grammar(), &mut writer),
"winfo" => dump_word_info(dict.lexicon(), &mut writer).unwrap(),
"pos" => dump_pos(loader, &mut writer),
"matrix" => dump_matrix(loader, &mut writer),
"winfo" => dump_word_info(loader, system, &mut writer).unwrap(),
_ => unimplemented!(),
}
writer.flush().unwrap();
}

fn dump_pos<W: Write>(grammar: &Grammar, w: &mut W) {
for p in grammar.pos_list.iter() {
fn dump_pos<W: Write>(dict: DictionaryLoader, w: &mut W) {
let dict = dict
.to_loaded()
.expect("target dict should contain grammar");
let grammar = dict.grammar();

for (id, p) in grammar.pos_list.iter().enumerate() {
write!(w, "{},", id).unwrap();
for (i, e) in p.iter().enumerate() {
w.write_all(e.as_bytes()).unwrap();
if (i + 1) == p.len() {
Expand All @@ -208,59 +228,163 @@ fn dump_pos<W: Write>(grammar: &Grammar, w: &mut W) {
}
}

fn dump_matrix<W: Write>(grammar: &Grammar, w: &mut W) {
fn dump_matrix<W: Write>(dict: DictionaryLoader, w: &mut W) {
if let HeaderVersion::UserDict(_) = dict.header.version {
panic!("user dictionary does not have connection matrix.")
}

let dict = dict
.to_loaded()
.expect("target dict should contain grammar");
let grammar = dict.grammar();
let conn = grammar.conn_matrix();
write!(w, "{} {}", conn.num_left(), conn.num_right()).unwrap();

writeln!(w, "{} {}", conn.num_left(), conn.num_right()).unwrap();
for left in 0..conn.num_left() {
for right in 0..conn.num_right() {
let cost = conn.cost(left as _, right as _);
write!(w, "{} {} {}\n", left, right, cost).unwrap();
writeln!(w, "{} {} {}", left, right, cost).unwrap();
}
}
}

fn dump_word_info<W: Write>(lex: &LexiconSet, w: &mut W) -> SudachiResult<()> {
let size = lex.size();
fn dump_word_info<W: Write>(
dict: DictionaryLoader,
system: Option<PathBuf>,
w: &mut W,
) -> SudachiResult<()> {
let is_user = match dict.header.version {
HeaderVersion::UserDict(_) => true,
HeaderVersion::SystemDict(_) => false,
};
let did = if is_user { 1 } else { 0 };
let size = dict.lexicon.size();

let data = system.map(|system_path| {
let file = File::open(system_path).expect("open system failed");
unsafe { Mmap::map(&file) }.expect("mmap system failed")
});
let system = data.as_ref().map(|data| {
let loader = DictionaryLoader::read_system_dictionary(data)
.expect("failed to load system dictionary");
loader
.to_loaded()
.expect("failed to load system dictionary")
});

let (base, user) = if is_user {
(
system.expect("system dictionary is required to dump user dictionary lexicon"),
Some(dict),
)
} else {
(dict.to_loaded().expect("failed to load dictionary"), None)
};

let mut lex = base.lexicon_set;
let mut grammar = base.grammar;
if let Some(udic) = user {
lex.append(udic.lexicon, grammar.pos_list.len())?;
if let Some(g) = udic.grammar {
grammar.merge(g)
}
}

for i in 0..size {
let wid = WordId::checked(0, i)?;
let wid = WordId::checked(did, i)?;
let (left, right, cost) = lex.get_word_param(wid);
let winfo = lex.get_word_info(wid)?;
write!(w, "{},", unicode_escape(winfo.surface()))?;
write!(w, "{},{},{},", left, right, cost)?;
write!(w, "{},", winfo.surface())?;
write!(w, "{},", winfo.head_word_length())?;
write!(w, "{},", winfo.normalized_form())?;
write!(w, "{},", winfo.dictionary_form_word_id())?;
write!(w, "{},", winfo.reading_form())?;
dump_wids(w, winfo.a_unit_split())?;
write!(w, "{},", unicode_escape(winfo.surface()))?; // writing
write!(w, "{},", pos_string(&grammar, winfo.pos_id()))?;
write!(w, "{},", unicode_escape(winfo.reading_form()))?;
write!(w, "{},", unicode_escape(winfo.normalized_form()))?;
let dict_form = dictionary_form_string(&grammar, &lex, winfo.dictionary_form_word_id());
write!(w, "{},", dict_form)?;
write!(w, "{},", split_mode(&winfo))?;
dump_wids(w, &grammar, &lex, winfo.a_unit_split())?;
w.write_all(b",")?;
dump_wids(w, winfo.b_unit_split())?;
dump_wids(w, &grammar, &lex, winfo.b_unit_split())?;
w.write_all(b",")?;
dump_wids(w, winfo.word_structure())?;
dump_wids(w, &grammar, &lex, winfo.word_structure())?;
w.write_all(b",")?;
dump_gids(w, winfo.synonym_group_ids())?;
w.write_all(b"\n")?;
}
Ok(())
}

fn dump_wids<W: Write>(w: &mut W, data: &[WordId]) -> SudachiResult<()> {
/// Escapes the two characters that are rewritten before a text field is dumped.
///
/// Every `"` is replaced by the literal text `\u0022` and every `,` by the
/// literal text `\u002c`; all other characters are copied through unchanged.
fn unicode_escape(raw: &str) -> String {
    // Build the result in one pass into a single buffer instead of copying the
    // input and then re-allocating once per `replace` call.
    let mut escaped = String::with_capacity(raw.len());
    for ch in raw.chars() {
        match ch {
            '"' => escaped.push_str("\\u0022"),
            ',' => escaped.push_str("\\u002c"),
            other => escaped.push(other),
        }
    }
    escaped
}

/// Returns the split-mode label of a word, derived from its split lists.
///
/// * `"A"` when the word has no A-unit split,
/// * `"B"` when it has an A-unit split but no B-unit split,
/// * `"C"` otherwise.
///
/// The labels are string literals, so the result is `'static` and does not
/// keep `winfo` borrowed.
fn split_mode(winfo: &WordInfo) -> &'static str {
    if winfo.a_unit_split().is_empty() {
        "A"
    } else if winfo.b_unit_split().is_empty() {
        "B"
    } else {
        "C"
    }
}

/// Renders the part-of-speech entry `posid` of `grammar` as one string, with
/// its components joined by `,`.
fn pos_string(grammar: &Grammar, posid: u16) -> String {
    grammar.pos_components(posid).join(",")
}

/// Formats the dictionary-form column for a word.
///
/// A negative `wid` yields `*`. Otherwise the referenced word is rendered via
/// `wordref_string` and wrapped in double quotes.
///
/// # Panics
/// Panics with "invalid wordid" if `WordId::checked` rejects the id.
fn dictionary_form_string(grammar: &Grammar, lex: &LexiconSet, wid: i32) -> String {
    if wid >= 0 {
        // NOTE(review): the id is always resolved against dictionary 0 —
        // confirm this is intended when the dumped entry comes from a user dictionary.
        let target = WordId::checked(0, wid as u32).expect("invalid wordid");
        let reference = wordref_string(grammar, lex, &target);
        format!("\"{}\"", reference)
    } else {
        "*".to_string()
    }
}

/// Formats a reference to the word `wid` as `surface,pos,reading`.
///
/// Surface and reading form are passed through `unicode_escape`; the part of
/// speech is rendered by `pos_string`.
///
/// # Panics
/// Panics with "failed to get wordinfo" if `lex` cannot return the word info.
fn wordref_string(grammar: &Grammar, lex: &LexiconSet, wid: &WordId) -> String {
    let info = lex.get_word_info(*wid).expect("failed to get wordinfo");
    let surface = unicode_escape(info.surface());
    let pos = pos_string(grammar, info.pos_id());
    let reading = unicode_escape(info.reading_form());
    format!("{},{},{}", surface, pos, reading)
}

fn dump_wids<W: Write>(
w: &mut W,
grammar: &Grammar,
lex: &LexiconSet,
data: &[WordId],
) -> SudachiResult<()> {
if data.is_empty() {
write!(w, "*")?;
return Ok(());
}
w.write_all(b"\"")?;
for (i, e) in data.iter().enumerate() {
let prefix = match e.dic() {
0 => "",
_ => "U",
};
write!(w, "{}{}", prefix, e.word())?;
write!(w, "{}", wordref_string(grammar, lex, e))?;
if i + 1 != data.len() {
w.write_all(b"/")?;
}
}
w.write_all(b"\"")?;
Ok(())
}

fn dump_gids<W: Write>(w: &mut W, data: &[u32]) -> SudachiResult<()> {
if data.is_empty() {
write!(w, "*")?;
return Ok(());
}
for (i, e) in data.iter().enumerate() {
write!(w, "{}", e)?;
write!(w, "{:06}", e)?;
if i + 1 != data.len() {
w.write_all(b"/")?;
}
Expand Down