diff --git a/fuzz/fuzz_targets/group_events.rs b/fuzz/fuzz_targets/group_events.rs
index bcb6499..a19cf28 100644
--- a/fuzz/fuzz_targets/group_events.rs
+++ b/fuzz/fuzz_targets/group_events.rs
@@ -9,7 +9,7 @@ fuzz_target!(|text: String| {
     let flattened_groups = group_events(&events)
         .into_iter()
         .flat_map(|group| match group {
-            Group::Translate(events) | Group::Skip(events) => events,
+            Group::Translate { events, .. } | Group::Skip(events) => events,
         })
         .collect::<Vec<_>>();

diff --git a/i18n-helpers/src/directives.rs b/i18n-helpers/src/directives.rs
index f0e9f01..93549b6 100644
--- a/i18n-helpers/src/directives.rs
+++ b/i18n-helpers/src/directives.rs
@@ -4,7 +4,7 @@ use std::sync::OnceLock;
 #[derive(Debug, PartialEq)]
 pub enum Directive {
     Skip,
-    TranslatorComment(String),
+    Comment(String),
 }

 pub fn find(html: &str) -> Option<Directive> {
@@ -30,7 +30,7 @@ pub fn find(html: &str) -> Option<Directive> {
                 command.find("comment").unwrap() + "comment".len() + 1,
                 command.len(),
             );
-            Some(Directive::TranslatorComment(
+            Some(Directive::Comment(
                 command[start_of_comment_offset..].trim().into(),
             ))
         }
@@ -83,9 +83,9 @@ mod tests {
     }

     #[test]
-    fn test_translator_comment() {
+    fn test_comment() {
         assert!(match find("<!-- i18n:comment hello world! -->") {
-            Some(Directive::TranslatorComment(s)) => {
+            Some(Directive::Comment(s)) => {
                 s == "hello world!"
             }
             _ => false,
@@ -93,9 +93,9 @@ mod tests {
     }

     #[test]
-    fn test_translator_empty_comment_does_nothing() {
+    fn test_empty_comment_does_nothing() {
         assert!(match find("<!-- i18n:comment -->") {
-            Some(Directive::TranslatorComment(s)) => {
+            Some(Directive::Comment(s)) => {
                 s.is_empty()
             }
             _ => false,
diff --git a/i18n-helpers/src/lib.rs b/i18n-helpers/src/lib.rs
index fe50fce..fd4919e 100644
--- a/i18n-helpers/src/lib.rs
+++ b/i18n-helpers/src/lib.rs
@@ -151,7 +151,11 @@ pub enum Group<'a> {
     ///
     /// This includes `[Text("foo")]` as well as sequences with text
     /// such as `[Start(Emphasis), Text("foo") End(Emphasis)]`.
-    Translate(Vec<(usize, Event<'a>)>),
+    Translate {
+        events: Vec<(usize, Event<'a>)>,
+        /// A comment that may be associated with the translation text.
+        comment: String,
+    },

     /// Markdown events which should be skipped when translating.
     ///
@@ -160,6 +164,21 @@ pub enum Group<'a> {
     Skip(Vec<(usize, Event<'a>)>),
 }

+#[derive(Debug, Default)]
+struct GroupingContext {
+    skip_next_group: bool,
+    comments: Vec<String>,
+}
+
+impl GroupingContext {
+    fn clear_skip_next_group(self) -> Self {
+        Self {
+            skip_next_group: false,
+            ..self
+        }
+    }
+}
+
 /// Group Markdown events into translatable and skipped events.
 ///
 /// This function will partition the input events into groups of
@@ -192,9 +211,10 @@ pub enum Group<'a> {
 ///         (1, Event::Start(Tag::List(None))),
 ///         (1, Event::Start(Tag::Item)),
 ///     ]),
-///     Group::Translate(vec![
-///         (1, Event::Text("A list item.".into())),
-///     ]),
+///     Group::Translate {
+///         events: vec![
+///             (1, Event::Text("A list item.".into())),
+///         ], comment: "".into()},
 ///     Group::Skip(vec![
 ///         (1, Event::End(Tag::Item)),
 ///         (1, Event::End(Tag::List(None))),
@@ -203,20 +223,6 @@ pub enum Group<'a> {
 /// );
 /// ```
 pub fn group_events<'a>(events: &'a [(usize, Event<'a>)]) -> Vec<Group<'a>> {
-    #[derive(Debug)]
-    struct GroupingContext {
-        skip_next_group: bool,
-        // TODO: this struct is planned to expand with translator
-        // comments and message contexts.
-    }
-    impl GroupingContext {
-        fn clear_skip_next_group(self) -> Self {
-            Self {
-                skip_next_group: false,
-            }
-        }
-    }
-
     #[derive(Debug)]
     enum State {
         Translate(usize),
@@ -229,7 +235,7 @@ pub fn group_events<'a>(events: &'a [(usize, Event<'a>)]) -> Vec<Group<'a>> {
             self,
             idx: usize,
             events: &'a [(usize, Event<'a>)],
-            ctx: GroupingContext,
+            mut ctx: GroupingContext,
         ) -> (Vec<Group<'a>>, GroupingContext) {
             match self {
                 State::Translate(start) => {
@@ -239,9 +245,15 @@ pub fn group_events<'a>(events: &'a [(usize, Event<'a>)]) -> Vec<Group<'a>> {
                             ctx.clear_skip_next_group(),
                         )
                     } else if is_codeblock_group(&events[start..idx]) {
-                        (parse_codeblock(&events[start..idx]), ctx)
+                        parse_codeblock(&events[start..idx], ctx)
                     } else {
-                        (vec![Group::Translate(events[start..idx].into())], ctx)
+                        (
+                            vec![Group::Translate {
+                                events: events[start..idx].into(),
+                                comment: std::mem::take(&mut ctx.comments).join(" "),
+                            }],
+                            ctx,
+                        )
                     }
                 }
                 State::Skip(start) => (vec![Group::Skip(events[start..idx].into())], ctx),
@@ -251,9 +263,7 @@ pub fn group_events<'a>(events: &'a [(usize, Event<'a>)]) -> Vec<Group<'a>> {

     let mut groups = Vec::new();
     let mut state = State::Skip(0);
-    let mut ctx = GroupingContext {
-        skip_next_group: false,
-    };
+    let mut ctx = GroupingContext::default();

     for (idx, (_, event)) in events.iter().enumerate() {
         match event {
@@ -318,6 +328,22 @@ pub fn group_events<'a>(events: &'a [(usize, Event<'a>)]) -> Vec<Group<'a>> {
                         ctx.skip_next_group = true;
                     }
+
+                    Some(directives::Directive::Comment(comment)) => {
+                        // If in the middle of a translation, finish it.
+                        if let State::Translate(_) = state {
+                            let mut next_groups;
+                            (next_groups, ctx) = state.into_groups(idx, events, ctx);
+                            groups.append(&mut next_groups);
+
+                            // Restart translation: subtle, but needed to handle
+                            // skipping of the rest of the inlined content.
+                            state = State::Translate(idx);
+                        }
+
+                        ctx.comments.push(comment);
+                    }
                     // Otherwise, treat as a skipping group.
                     _ => {
                         if let State::Translate(_) = state {
@@ -346,7 +372,10 @@ pub fn group_events<'a>(events: &'a [(usize, Event<'a>)]) -> Vec<Group<'a>> {
     }

     match state {
-        State::Translate(start) => groups.push(Group::Translate(events[start..].into())),
+        State::Translate(start) => groups.push(Group::Translate {
+            events: events[start..].into(),
+            comment: "".into(),
+        }),
         State::Skip(start) => groups.push(Group::Skip(events[start..].into())),
     }

@@ -376,7 +405,10 @@ fn is_translate_scope(x: Scope) -> bool {
 }

 /// Creates groups by checking codeblock with heuristic way.
-fn heuristic_codeblock<'a>(events: &'a [(usize, Event)]) -> Vec<Group<'a>> {
+fn heuristic_codeblock<'a>(
+    events: &'a [(usize, Event)],
+    mut ctx: GroupingContext,
+) -> (Vec<Group<'a>>, GroupingContext) {
     let is_translate = match events {
         [(_, Event::Start(Tag::CodeBlock(_))), .., (_, Event::End(Tag::CodeBlock(_)))] => {
             let (codeblock_text, _) = reconstruct_markdown(events, None);
@@ -389,14 +421,23 @@ fn heuristic_codeblock<'a>(events: &'a [(usize, Event)]) -> Vec<Group<'a>> {
     };

     if is_translate {
-        vec![Group::Translate(events.into())]
+        (
+            vec![Group::Translate {
+                events: events.into(),
+                comment: std::mem::take(&mut ctx.comments).join(" "),
+            }],
+            ctx,
+        )
     } else {
-        vec![Group::Skip(events.into())]
+        (vec![Group::Skip(events.into())], ctx)
     }
 }

 /// Creates groups by parsing codeblock.
-fn parse_codeblock<'a>(events: &'a [(usize, Event)]) -> Vec<Group<'a>> {
+fn parse_codeblock<'a>(
+    events: &'a [(usize, Event)],
+    mut ctx: GroupingContext,
+) -> (Vec<Group<'a>>, GroupingContext) {
     // Language detection from language identifier of codeblock.
     static SYNTAX_SET: OnceLock<SyntaxSet> = OnceLock::new();
     let ss = SYNTAX_SET.get_or_init(SyntaxSet::load_defaults_newlines);
@@ -409,7 +450,7 @@ fn parse_codeblock<'a>(events: &'a [(usize, Event)]) -> Vec<Group<'a>> {

     let Some(syntax) = syntax else {
         // If there is no language specifier, falling back to heuristic way.
-        return heuristic_codeblock(events);
+        return heuristic_codeblock(events, ctx);
     };

     let mut ps = ParseState::new(syntax);
@@ -423,7 +464,10 @@ fn parse_codeblock<'a>(events: &'a [(usize, Event)]) -> Vec<Group<'a>> {

         let Ok(ops) = ps.parse_line(text, ss) else {
             // If parse is failed, the text event should be translated.
-            ret.push(Group::Translate(events[idx..idx + 1].into()));
+            ret.push(Group::Translate {
+                events: events[idx..idx + 1].into(),
+                comment: std::mem::take(&mut ctx.comments).join(" "),
+            });
             continue;
         };
@@ -461,19 +505,36 @@ fn parse_codeblock<'a>(events: &'a [(usize, Event)]) -> Vec<Group<'a>> {
                     translate_events.push((range_line, Event::Text(text.into())));
                 } else {
                     let whitespace_events = extract_trailing_whitespaces(&mut translate_events);
-                    groups.push(Group::Translate(std::mem::take(&mut translate_events)));
-                    groups.push(Group::Skip(whitespace_events));
+                    if !translate_events.is_empty() {
+                        groups.push(Group::Translate {
+                            events: std::mem::take(&mut translate_events),
+                            comment: std::mem::take(&mut ctx.comments).join(" "),
+                        });
+                    }
+                    if !whitespace_events.is_empty() {
+                        groups.push(Group::Skip(whitespace_events));
+                    }
                     groups.push(Group::Skip(vec![(range_line, Event::Text(text.into()))]));
                 }
             }

             let whitespace_events = extract_trailing_whitespaces(&mut translate_events);
-            groups.push(Group::Translate(std::mem::take(&mut translate_events)));
-            groups.push(Group::Skip(whitespace_events));
+            if !translate_events.is_empty() {
+                groups.push(Group::Translate {
+                    events: std::mem::take(&mut translate_events),
+                    comment: std::mem::take(&mut ctx.comments).join(" "),
+                });
+            }
+            if !whitespace_events.is_empty() {
+                groups.push(Group::Skip(whitespace_events));
+            }

             if stack_failure {
                 // If stack operation is failed, the text event should be translated.
-                ret.push(Group::Translate(events[idx..idx + 1].into()));
+                ret.push(Group::Translate {
+                    events: events[idx..idx + 1].into(),
+                    comment: std::mem::take(&mut ctx.comments).join(" "),
+                });
             } else {
                 ret.append(&mut groups);
             }
@@ -483,7 +544,7 @@ fn parse_codeblock<'a>(events: &'a [(usize, Event)]) -> Vec<Group<'a>> {
             }
         }
     }
-    ret
+    (ret, ctx)
 }

 /// Extract trailing events which have whitespace only.
@@ -562,6 +623,20 @@ pub fn reconstruct_markdown(
     (String::from(markdown.trim_start_matches('\n')), new_state)
 }

+#[derive(Debug, PartialEq)]
+pub struct ExtractedMessage {
+    pub message: String,
+    pub comment: String,
+}
+impl From<&str> for ExtractedMessage {
+    fn from(s: &str) -> Self {
+        ExtractedMessage {
+            message: s.to_owned(),
+            comment: "".into(),
+        }
+    }
+}
+
 /// Extract translatable strings from `document`.
 ///
 /// # Examples
@@ -609,20 +684,26 @@ pub fn reconstruct_markdown(
 ///     ],
 /// );
 /// ```
-pub fn extract_messages(document: &str) -> Vec<(usize, String)> {
+pub fn extract_messages(document: &str) -> Vec<(usize, ExtractedMessage)> {
     let events = extract_events(document, None);
     let mut messages = Vec::new();
     let mut state = None;
     for group in group_events(&events) {
         match group {
-            Group::Translate(events) => {
+            Group::Translate { events, comment } => {
                 if let Some((lineno, _)) = events.first() {
                     let (text, new_state) = reconstruct_markdown(&events, state);
                     // Skip empty messages since they are special:
                     // they contains the PO file metadata.
                     if !text.trim().is_empty() {
-                        messages.push((*lineno, text));
+                        messages.push((
+                            *lineno,
+                            ExtractedMessage {
+                                message: text,
+                                comment,
+                            },
+                        ));
                     }
                     state = Some(new_state);
                 }
@@ -690,7 +771,7 @@ pub fn translate_events<'a>(
     for group in group_events(events) {
         match group {
-            Group::Translate(events) => {
+            Group::Translate { events, .. } => {
                 // Reconstruct the message.
                 let (msgid, new_state) = reconstruct_markdown(&events, state.clone());
                 let translated = catalog
@@ -739,7 +820,7 @@ mod tests {
         assert_eq!(
             extract_messages(document)
                 .iter()
-                .map(|(lineno, msg)| (*lineno, &msg[..]))
+                .map(|(lineno, msg)| (*lineno, &msg.message[..]))
                 .collect::<Vec<_>>(),
             expected,
         );
@@ -1503,4 +1584,114 @@ def g(x):
             &[(2, "````\n```\n// codeblock in codeblock\n```\n````")],
         );
     }
+
+    #[test]
+    fn extract_message_comments() {
+        assert_eq!(
+            extract_messages(
+                "
+<!-- i18n:comment first comment! -->
+Hello world!
+"
+            ),
+            vec![(
+                3,
+                ExtractedMessage {
+                    message: "Hello world!".into(),
+                    comment: "first comment!".into(),
+                }
+            )]
+        );
+    }
+
+    #[test]
+    fn extract_message_comments_multiple_joined() {
+        assert_eq!(
+            extract_messages(
+                "
+<!-- i18n:comment this is a test -->
+<!-- i18n:comment of a comment that spans. -->
+Greetings!
+"
+            ),
+            vec![(
+                4,
+                ExtractedMessage {
+                    message: "Greetings!".into(),
+                    comment: "this is a test of a comment that spans.".into(),
+                }
+            )]
+        );
+    }
+
+    #[test]
+    fn extract_message_multiple_comments() {
+        assert_eq!(
+            extract_messages(
+                "
+before-no-comment
+
+<!-- i18n:comment another -->
+Hello again, this is some text
+with a comment on it.
+
+<!-- i18n:comment one more comment. -->
+after
+
+after-no-comment
+"
+            ),
+            vec![
+                (
+                    2,
+                    ExtractedMessage {
+                        message: "before-no-comment".into(),
+                        comment: "".into(),
+                    }
+                ),
+                (
+                    5,
+                    ExtractedMessage {
+                        message: "Hello again, this is some text with a comment on it.".into(),
+                        comment: "another".into(),
+                    }
+                ),
+                (
+                    9,
+                    ExtractedMessage {
+                        message: "after".into(),
+                        comment: "one more comment.".into(),
+                    }
+                ),
+                (
+                    11,
+                    ExtractedMessage {
+                        message: "after-no-comment".into(),
+                        comment: "".into(),
+                    }
+                ),
+            ]
+        );
+    }
+
+    #[test]
+    fn extract_message_comments_on_codeblock() {
+        assert_eq!(
+            extract_messages(
+                r#"
+<!-- i18n:comment greetings! -->
+```python
+print("Hello world")
+```
+"#
+            ),
+            vec![(
+                4,
+                ExtractedMessage {
+                    message: "\"Hello world\"".into(),
+                    comment: "greetings!".into(),
+                }
+            ),]
+        );
+    }
+}
diff --git a/i18n-helpers/src/normalize.rs b/i18n-helpers/src/normalize.rs
index 7fed94d..776d13c 100644
--- a/i18n-helpers/src/normalize.rs
+++ b/i18n-helpers/src/normalize.rs
@@ -16,6 +16,14 @@ fn parse_source(source: &str) -> Option<(&str, usize)> {
     Some((path, lineno.parse().ok()?))
 }

+/// Use only the portion of `extract_messages` output that the normalizer cares about.
+fn extract_document_messages(doc: &str) -> Vec<(usize, String)> {
+    extract_messages(doc)
+        .into_iter()
+        .map(|(idx, extracted)| (idx, extracted.message))
+        .collect()
+}
+
 fn compute_source(source: &str, delta: usize) -> String {
     let mut new_source = String::with_capacity(source.len());
@@ -110,7 +118,7 @@ impl<'a> SourceMap<'a> {
         // link should be defined.
         let document = field.project(message.msgid(), message.msgstr()?);
         if !has_broken_link(document) {
-            return Ok(extract_messages(document));
+            return Ok(extract_document_messages(document));
         }

         // If `parse_source` fails, then `message` has more than one
         // case since it is unclear which link definition to use.
         let path = match parse_source(message.source()) {
             Some((path, _)) => path,
-            None => return Ok(extract_messages(document)),
+            None => return Ok(extract_document_messages(document)),
         };

         // First, we try constructing a document using other messages
@@ -149,7 +157,7 @@ impl<'a> SourceMap<'a> {
             let _ = file.read_to_string(&mut full_document);
         }

-        let mut messages = extract_messages(&full_document);
+        let mut messages = extract_document_messages(&full_document);
         // Truncate away the messages from `full_document` which start
         // after `document`.
         let line_count = document.lines().count();
diff --git a/i18n-helpers/src/xgettext.rs b/i18n-helpers/src/xgettext.rs
index a7b8059..f421ae7 100644
--- a/i18n-helpers/src/xgettext.rs
+++ b/i18n-helpers/src/xgettext.rs
@@ -39,7 +39,7 @@ fn strip_link(text: &str) -> String {
     without_link
 }

-fn add_message(catalog: &mut Catalog, msgid: &str, source: &str) {
+fn add_message(catalog: &mut Catalog, msgid: &str, source: &str, comment: &str) {
     let sources = match catalog.find_message(None, msgid, None) {
         Some(msg) => format!("{}\n{}", msg.source(), source),
         None => String::from(source),
@@ -47,6 +47,7 @@ fn strip_link(text: &str) -> String {
     let message = Message::build_singular()
         .with_source(sources)
         .with_msgid(String::from(msgid))
+        .with_comments(String::from(comment))
         .done();
     catalog.append_or_update(message);
 }
@@ -130,14 +131,20 @@ where
     let summary_path = ctx.config.book.src.join("SUMMARY.md");
     let summary = summary_reader(ctx.root.join(&summary_path))
         .with_context(|| anyhow!("Failed to read {}", summary_path.display()))?;
-    for (lineno, msgid) in extract_messages(&summary) {
+    for (lineno, extracted_msg) in extract_messages(&summary) {
+        let msgid = extracted_msg.message;
         let source = build_source(&summary_path, lineno, granularity);
         // The summary is mostly links like "[Foo *Bar*](foo-bar.md)".
         // We strip away the link to get "Foo *Bar*". The formatting
         // is stripped away by mdbook when it sends the book to
         // mdbook-gettext -- we keep the formatting here in case the
         // same text is used for the page title.
-        add_message(&mut catalog, &strip_link(&msgid), &source);
+        add_message(
+            &mut catalog,
+            &strip_link(&msgid),
+            &source,
+            &extracted_msg.comment,
+        );
     }

     // Next, we add the chapter contents.
@@ -147,9 +154,10 @@ where
             Some(path) => ctx.config.book.src.join(path),
             None => continue,
         };
-        for (lineno, msgid) in extract_messages(&chapter.content) {
+        for (lineno, extracted) in extract_messages(&chapter.content) {
+            let msgid = extracted.message;
             let source = build_source(&path, lineno, granularity);
-            add_message(&mut catalog, &msgid, &source);
+            add_message(&mut catalog, &msgid, &source, &extracted.comment);
         }
     }
 }
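
Usage sketch (not part of the patch above): a minimal example of how a caller might consume the new ExtractedMessage type returned by extract_messages. It assumes the function is exported at the crate root as mdbook_i18n_helpers::extract_messages and that <!-- i18n:comment ... --> is the directive form accepted by directives::find; the exact directive prefix is parsed by a regex outside this diff, so treat it as an assumption.

use mdbook_i18n_helpers::extract_messages;

fn main() {
    // One comment directive followed by a translatable paragraph.
    // The comment text here is purely illustrative.
    let document = "\
<!-- i18n:comment Keep the greeting informal. -->
Hello world!
";
    for (lineno, extracted) in extract_messages(document) {
        // Each entry now carries the comments collected by GroupingContext
        // (joined with spaces) alongside the message text.
        println!("{lineno}: {} [comment: {}]", extracted.message, extracted.comment);
    }
}

When the same document goes through mdbook-xgettext, the comment is forwarded to add_message and attached to the catalog entry via .with_comments(...) as shown in the diff, so it should surface as a comment line on the corresponding PO message.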