Skip to content

Commit

Permalink
Merge pull request #10 from zyk-mjzs/master
Browse files Browse the repository at this point in the history
Add ColorTable support, Unicode and \\ulnone
  • Loading branch information
d0rianb authored Apr 7, 2024
2 parents 05260fd + c61dbc6 commit aa125af
Show file tree
Hide file tree
Showing 7 changed files with 177 additions and 11 deletions.
5 changes: 4 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ repository = "https://github.com/d0rianb/rtf-parser"
version = "0.2.1"
edition = "2021"
license = "MIT"
keywords = ["rtf", "rich" ,"text", "format", "parser"]
keywords = ["rtf", "rich", "text", "format", "parser"]
categories = ["parsing", "parser-implementations"]
exclude = ["*.rtf", ".idea"]

Expand All @@ -15,4 +15,7 @@ opt-level = 3

[dependencies]
derivative = "2.2.0"
serde = { version = "1.0", optional = true, features = ["derive"] }

[features]
serde_support = ["serde"]
4 changes: 4 additions & 0 deletions src/document.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
use crate::header::RtfHeader;
use crate::parser::StyleBlock;

#[cfg(feature="serde_support")]
use serde::{Deserialize, Serialize};

#[cfg_attr(feature = "serde_support", derive(Deserialize, Serialize))]
#[derive(Debug, Default, Clone, PartialEq)]
pub struct RtfDocument {
pub header: RtfHeader,
Expand Down
20 changes: 20 additions & 0 deletions src/header.rs
Original file line number Diff line number Diff line change
@@ -1,37 +1,56 @@
use std::collections::HashMap;

#[cfg(feature="serde_support")]
use serde::{Deserialize, Serialize};

use crate::paragraph::Paragraph;
use crate::parser::Painter;
use crate::tokens::{ControlWord, Token};

pub type ColorRef = u16;
pub type ColorTable = HashMap<ColorRef, Color>;

pub type FontRef = u16;
pub type FontTable = HashMap<FontRef, Font>;

pub type StyleRef = u16;
pub type StyleSheet = HashMap<StyleRef, Style>;

/// One entry of the [`StyleSheet`], which maps a [`StyleRef`] to a `Style`.
///
/// A style bundles character-level formatting (`Painter`) with
/// paragraph-level formatting (`Paragraph`). Both fields are private to this module.
#[cfg_attr(feature = "serde_support", derive(Deserialize, Serialize))]
#[derive(Hash, Default, Debug, Clone, PartialEq)]
pub struct Style {
/// Character formatting carried by this style.
painter: Painter,
/// Paragraph formatting carried by this style.
paragraph: Paragraph,
}

/// Information about the document, including references to fonts, colors & styles.
///
/// The tables are hash maps keyed by the numeric references
/// ([`FontRef`], [`ColorRef`], [`StyleRef`]) declared at the top of this module.
#[cfg_attr(feature = "serde_support", derive(Deserialize, Serialize))]
#[derive(Default, Debug, Clone, PartialEq)]
pub struct RtfHeader {
/// Character set declared for the document.
pub character_set: CharacterSet,
/// Fonts of the document, keyed by [`FontRef`].
pub font_table: FontTable,
/// Colors of the document, keyed by [`ColorRef`].
pub color_table: ColorTable,
/// Styles of the document, keyed by [`StyleRef`].
pub stylesheet: StyleSheet,
}

/// One entry of the [`FontTable`].
#[cfg_attr(feature = "serde_support", derive(Deserialize, Serialize))]
#[derive(Hash, Default, Clone, Debug, PartialEq)]
pub struct Font {
/// Name of the font.
pub name: String,
/// Numeric character set of the font.
// NOTE(review): presumably the value of the `\fcharset` control word — confirm against the font table parser.
pub character_set: u8,
/// Family the font belongs to.
pub font_family: FontFamily,
}

/// One entry of the [`ColorTable`]: a color described by its red, green and blue components.
///
/// The default value has all three components set to 0.
// NOTE(review): components are stored as `u16`; RTF values are presumably in 0..=255 — confirm.
#[cfg_attr(feature = "serde_support", derive(Deserialize, Serialize))]
#[derive(Hash, Default, Clone, Debug, PartialEq)]
pub struct Color {
/// Red component.
pub red: u16,
/// Green component.
pub green: u16,
/// Blue component.
pub blue: u16,
}

#[cfg_attr(feature = "serde_support", derive(Deserialize, Serialize))]
#[allow(dead_code)]
#[derive(Debug, PartialEq, Default, Clone)]
pub enum CharacterSet {
Expand All @@ -53,6 +72,7 @@ impl CharacterSet {
}
}

#[cfg_attr(feature = "serde_support", derive(Deserialize, Serialize))]
#[allow(dead_code)]
#[derive(Debug, PartialEq, Hash, Clone, Default)]
pub enum FontFamily {
Expand Down
14 changes: 10 additions & 4 deletions src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,12 @@ impl Lexer {
let control_word = ControlWord::from(ident)?;
let mut ret = vec![Token::ControlSymbol(control_word)];
recursive_tokenize!(tail, ret);

// \u1234 \u1234 is ok, but \u1234 \u1234 is lost a space, \u1234 \u1234 lost two spaces, and so on
if control_word.0 == ControlWord::Unicode && tail.len() > 0 {
ret.push(Token::PlainText(tail));
}

return Ok(ret);
}
'*' => Ok(vec![Token::IgnorableDestination]),
Expand Down Expand Up @@ -148,7 +154,7 @@ impl Lexer {
#[cfg(test)]
pub(crate) mod tests {
use crate::lexer::Lexer;
use crate::tokens::ControlWord::{Ansi, Bold, FontNumber, FontSize, FontTable, Italic, Par, Pard, Rtf, Underline, Unknown};
use crate::tokens::ControlWord::{Ansi, Bold, FontNumber, ColorNumber, FontSize, FontTable, Italic, Par, Pard, Rtf, Underline, ColorRed, ColorGreen, ColorBlue, Unknown};
use crate::tokens::Property::*;
use crate::tokens::Token::*;

Expand Down Expand Up @@ -202,7 +208,7 @@ if (a == b) \{\
vec![
ControlSymbol((FontNumber, Value(0))),
ControlSymbol((FontSize, Value(24))),
ControlSymbol((Unknown("\\cf"), Value(0))),
ControlSymbol((ColorNumber, Value(0))),
PlainText("test de code "),
CRLF,
PlainText("if (a == b) "),
Expand Down Expand Up @@ -237,7 +243,7 @@ if (a == b) \{\
let tokens = Lexer::scan(text);
assert_eq!(
tokens.unwrap(),
vec![OpeningBracket, ControlSymbol((Unknown(r"\red"), Value(255))), ControlSymbol((Unknown(r"\blue"), Value(255))), ClosingBracket]
vec![OpeningBracket, ControlSymbol((ColorRed, Value(255))), ControlSymbol((ColorBlue, Value(255))), ClosingBracket]
);
}

Expand Down Expand Up @@ -266,7 +272,7 @@ if (a == b) \{\
OpeningBracket,
ControlSymbol((Unknown("\\partightenfactor"), Value(0))),
ControlSymbol((FontSize, Value(24))),
ControlSymbol((Unknown("\\cf"), Value(0))),
ControlSymbol((ColorNumber, Value(0))),
PlainText("Font size 12,"),
ControlSymbol((FontNumber, Value(0))),
ControlSymbol((Bold, None)),
Expand Down
7 changes: 7 additions & 0 deletions src/paragraph.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
// Define the paragraph related structs and enums

#[cfg(feature="serde_support")]
use serde::{Deserialize, Serialize};
use crate::tokens::ControlWord;

#[cfg_attr(feature = "serde_support", derive(Deserialize, Serialize))]
#[derive(Debug, Default, Clone, PartialEq, Hash)]
pub struct Paragraph {
pub alignment: Alignment,
Expand All @@ -11,6 +14,7 @@ pub struct Paragraph {
}

/// Alignment of a paragraph (left, right, center, justify)
#[cfg_attr(feature = "serde_support", derive(Deserialize, Serialize))]
#[derive(Debug, Default, Clone, Copy, PartialEq, Hash)]
pub enum Alignment {
#[default]
Expand All @@ -33,6 +37,7 @@ impl From<&ControlWord<'_>> for Alignment {
}

/// The vertical margin before / after a block of text
#[cfg_attr(feature = "serde_support", derive(Deserialize, Serialize))]
#[derive(Debug, Default, Clone, PartialEq, Hash)]
pub struct Spacing {
pub before: i32,
Expand All @@ -41,6 +46,7 @@ pub struct Spacing {
pub line_multiplier: i32,
}

#[cfg_attr(feature = "serde_support", derive(Deserialize, Serialize))]
#[derive(Default, Debug, Clone, PartialEq, Hash)]
pub enum SpaceBetweenLine {
Value(i32),
Expand All @@ -64,6 +70,7 @@ impl From<i32> for SpaceBetweenLine {
}

// This struct cannot be an enum because left-indent and right-indent can both be defined at the same time
#[cfg_attr(feature = "serde_support", derive(Deserialize, Serialize))]
#[derive(Default, Debug, Clone, PartialEq, Hash)]
pub struct Indentation {
pub left: i32,
Expand Down
119 changes: 116 additions & 3 deletions src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@ use std::{fmt, mem};

use derivative::Derivative;

#[cfg(feature="serde_support")]
use serde::{Deserialize, Serialize};

use crate::document::RtfDocument;
use crate::header::{CharacterSet, Font, FontFamily, FontRef, FontTable, RtfHeader, StyleSheet};
use crate::header::{CharacterSet, Color, ColorRef, ColorTable, Font, FontFamily, FontRef, FontTable, RtfHeader, StyleSheet};
use crate::paragraph::{Alignment, Paragraph, SpaceBetweenLine};
use crate::tokens::{ControlWord, Property, Token};

Expand All @@ -18,16 +21,19 @@ macro_rules! header_control_word {
};
}

/// A piece of text together with the formatting that applies to it.
#[cfg_attr(feature = "serde_support", derive(Deserialize, Serialize))]
#[derive(Debug, Default, PartialEq, Clone)]
pub struct StyleBlock {
/// Character formatting applied to `text`.
pub painter: Painter,
/// Paragraph formatting applied to `text`.
pub paragraph: Paragraph,
/// The text content of the block.
pub text: String,
}

#[cfg_attr(feature = "serde_support", derive(Deserialize, Serialize))]
#[derive(Derivative, Debug, Clone, PartialEq, Hash)]
#[derivative(Default)]
pub struct Painter {
pub color_ref: ColorRef,
pub font_ref: FontRef,
#[derivative(Default(value = "12"))]
pub font_size: u16,
Expand All @@ -46,6 +52,7 @@ pub enum ParserError {
IgnorableDestinationParsingError,
MalformedPainterStack,
InvalidFontIdentifier(Property),
InvalidColorIdentifier(Property),
NoMoreToken,
}

Expand All @@ -59,6 +66,7 @@ impl fmt::Display for ParserError {
ParserError::IgnorableDestinationParsingError => write!(f, "No ignorable destination should be left"),
ParserError::MalformedPainterStack => write!(f, "Malformed painter stack : Unbalanced number of brackets"),
ParserError::InvalidFontIdentifier(property) => write!(f, "Invalid font identifier : {:?}", property),
ParserError::InvalidColorIdentifier(property) => write!(f, "Invalid color identifier : {:?}", property),
ParserError::NoMoreToken => write!(f, "No more token to parse"),
};
return Ok(());
Expand Down Expand Up @@ -120,11 +128,13 @@ impl<'a> Parser<'a> {
};
#[rustfmt::skip] // For now, rustfmt does not support this kind of alignement
match control_word {
ControlWord::ColorNumber => current_painter.color_ref = property.get_value() as ColorRef,
ControlWord::FontNumber => current_painter.font_ref = property.get_value() as FontRef,
ControlWord::FontSize => current_painter.font_size = property.get_value() as u16,
ControlWord::Bold => current_painter.bold = property.as_bool(),
ControlWord::Italic => current_painter.italic = property.as_bool(),
ControlWord::Underline => current_painter.underline = property.as_bool(),
ControlWord::UnderlineNone => current_painter.underline = false,
ControlWord::Superscript => current_painter.superscript = property.as_bool(),
ControlWord::Subscript => current_painter.subscript = property.as_bool(),
ControlWord::Smallcaps => current_painter.smallcaps = property.as_bool(),
Expand All @@ -141,6 +151,11 @@ impl<'a> Parser<'a> {
ControlWord::SpaceAfter => paragraph.spacing.after = property.get_value(),
ControlWord::SpaceBetweenLine => paragraph.spacing.between_line = SpaceBetweenLine::from(property.get_value()),
ControlWord::SpaceLineMul => paragraph.spacing.line_multiplier = property.get_value(),
ControlWord::Unicode => {
let unicode = property.get_value() as u16;
let str = String::from_utf16(&vec![unicode]).unwrap();
Self::add_text_to_document(&str, &painter_stack, &paragraph, &mut document)?
}
// Others
_ => {}
};
Expand Down Expand Up @@ -246,6 +261,14 @@ impl<'a> Parser<'a> {
break;
}
}
(Some(Token::OpeningBracket), Some(&header_control_word!(ColorTable, None))) => {
let color_table_tokens = self.consume_tokens_until_matching_bracket();
header.color_table = Self::parse_color_table(&color_table_tokens)?;
// After the color table, check if next token is plain text without consuming it. If so, break
if let Some(&Token::PlainText(_text)) = self.get_next_token() {
break;
}
}
(Some(Token::OpeningBracket), Some(&header_control_word!(StyleSheet, None))) => {
let stylesheet_tokens = self.consume_tokens_until_matching_bracket();
header.stylesheet = Self::parse_stylesheet(&stylesheet_tokens)?;
Expand Down Expand Up @@ -322,6 +345,34 @@ impl<'a> Parser<'a> {
return Ok(table);
}

fn parse_color_table(color_table_tokens: &Vec<Token<'a>>) -> Result<ColorTable, ParserError> {
let Some(color_table_first_token) = color_table_tokens.get(0) else {
return Err(ParserError::NoMoreToken);
};
if color_table_first_token != &header_control_word!(ColorTable, None) {
return Err(ParserError::InvalidToken(format!("ParserError: {:?} is not a ColorTable token", color_table_first_token)));
}
let mut table = HashMap::new();
let mut current_key = 1;
let mut current_color = Color::default();
for token in color_table_tokens.iter() {
match token {
Token::ControlSymbol((control_word, property)) => match control_word {
ControlWord::ColorRed => current_color.red = property.get_value() as u16,
ControlWord::ColorGreen => current_color.green = property.get_value() as u16,
ControlWord::ColorBlue => {
current_color.blue = property.get_value() as u16;
table.insert(current_key, current_color.clone());
current_key += 1;
}
_ => {}
},
_ => {}
}
}
return Ok(table);
}

fn parse_stylesheet(stylesheet_tokens: &Vec<Token<'a>>) -> Result<StyleSheet, ParserError> {
// TODO
return Ok(StyleSheet::from([]));
Expand Down Expand Up @@ -449,6 +500,9 @@ pub mod tests {
}
)
]),
color_table: ColorTable::from([
(1, Color { red: 255, green: 255, blue: 255 }),
]),
..RtfHeader::default()
}
);
Expand Down Expand Up @@ -514,7 +568,7 @@ pub mod tests {
\f1\b0\fs21 \cf0 \
\pard\pardeftab709\fi-432\ri-1\sb240\sa120\partightenfactor0
\ls1\ilvl0
\f0\b\fs36\u\cf2\plain Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. \
\f0\b\fs36\cf2\plain Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. \
\pard\pardeftab709\sl288\slmult1\sa225\qj\partightenfactor0
}"#;
let tokens = Lexer::scan(rtf).unwrap();
Expand Down Expand Up @@ -556,6 +610,65 @@ pub mod tests {
let rtf = r"{\rtf1{\fonttbl {\f0 Times;}}\f0\b\fs36\u\cf2\plain Plain text}";
let tokens = Lexer::scan(rtf).unwrap();
let document = Parser::new(tokens).parse().unwrap();
assert_eq!(document.body[0].painter, Painter::default());
let mut painter = Painter::default();
painter.bold = true;
painter.font_size = 36;
assert_eq!(document.body[0].painter, painter);
}

#[test]
fn parse_color_table() {
// Color references used by the fixture:
// `\cf0` is the unset color,
// `\cf1` is the first color declared in `\colortbl`,
// `\cf2` is the second one, and so on.
let rtf = r#"{\rtf1\ansi\ansicpg936\cocoartf2761
\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;\f1\fnil\fcharset134 PingFangSC-Regular;}
{\colortbl;\red255\green255\blue255;\red251\green2\blue7;\red114\green44\blue253;}
{\*\expandedcolortbl;;\cssrgb\c100000\c14913\c0;\cssrgb\c52799\c30710\c99498;}
\paperw11900\paperh16840\margl1440\margr1440\vieww11520\viewh8400\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
\f0\fs24 \cf2 A
\f1 \cf3 B}"#;
let tokens = Lexer::scan(rtf).unwrap();
let document = Parser::new(tokens).parse().unwrap();
// The first body block is expected to resolve, through its `color_ref`,
// to the second declared color (251, 2, 7).
assert_eq!(document.header.color_table.get(&document.body[0].painter.color_ref).unwrap(), &Color { red: 251, green: 2, blue: 7 });
}

#[test]
fn parse_underline() {
// `\ul` turns underlining on and `\ulnone` turns it off again.
let rtf = r#"{\rtf1\ansi\ansicpg936\cocoartf2761
\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\paperw11900\paperh16840\margl1440\margr1440\vieww11520\viewh8400\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
\f0\fs24 \cf0 \ul \ulc0 a\ulnone A}"#;
let tokens = Lexer::scan(rtf).unwrap();
let document = Parser::new(tokens).parse().unwrap();
// The first block must be underlined.
assert_eq!(&document.body[0].painter.underline, &true);
// The second block, after `\ulnone`, must not be.
assert_eq!(&document.body[1].painter.underline, &false);
}

#[test]
fn parse_unicode() {
// The text of the fixture starts with `\uc0` and then contains `\u21834` twice,
// separated by a space. 21834 (U+554A) is the code point of 啊.
let rtf = r#"{\rtf1\ansi\ansicpg936\cocoartf2761
\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\paperw11900\paperh16840\margl1440\margr1440\vieww11520\viewh8400\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
\f0\fs24 \cf0 \uc0\u21834 \u21834 }"#;
// NOTE(review): the next line is a commented-out variant of the fixture's last line — confirm whether it is still needed.
// \f0\fs24 \cf0 \uc0\u21834 \u21834 }"#;
let tokens = Lexer::scan(rtf).unwrap();
let document = Parser::new(tokens).parse().unwrap();
// Both characters and the space between them must end up in the first block's text.
assert_eq!(&document.body[0].text, "啊 啊");
}
}
Loading

0 comments on commit aa125af

Please sign in to comment.