diff --git a/novelwriter/formats/tokenizer.py b/novelwriter/formats/tokenizer.py index b7eba0336..cdea7a857 100644 --- a/novelwriter/formats/tokenizer.py +++ b/novelwriter/formats/tokenizer.py @@ -90,9 +90,10 @@ def __init__(self, project: NWProject) -> None: self._project = project # Data Variables - self._text = "" # The raw text to be tokenized - self._handle = None # The item handle currently being processed - self._keepRaw = False # Whether to keep the raw text, used by ToRaw + self._text = "" # The raw text to be tokenized + self._handle = None # The item handle currently being processed + self._keepRaw = False # Whether to keep the raw text, used by ToRaw + self._noTokens = False # Disable tokenization if they're not needed # Blocks and Meta Data (Per Document) self._blocks: list[T_Block] = [] @@ -522,22 +523,24 @@ def tokenizeText(self) -> None: 4: The internal formatting map of the text, TxtFmt.* 5: The formats of the block, BlockFmt.* """ + if self._keepRaw: + self._raw.append(f"{self._text.rstrip()}\n\n") + if self._noTokens: + return + if self._isNovel: + self._hFormatter.setHandle(self._handle) + # Cache Flags isNovel = self._isNovel - keepRaw = self._keepRaw doJustify = self._doJustify keepBreaks = self._keepBreaks indentFirst = self._indentFirst firstIndent = self._firstIndent - if self._isNovel: - self._hFormatter.setHandle(self._handle) - # Replace all instances of [br] with a placeholder character text = REGEX_PATTERNS.lineBreak.sub("\uffff", self._text) nHead = 0 - rawText = [] tHandle = self._handle or "" tBlocks: list[T_Block] = [B_EMPTY] for bLine in text.splitlines(): @@ -547,8 +550,6 @@ def tokenizeText(self) -> None: # Check for blank lines if not sLine: tBlocks.append(B_EMPTY) - if keepRaw: - rawText.append("\n") continue if self._breakNext: @@ -613,14 +614,10 @@ def tokenizeText(self) -> None: tBlocks.append(( BlockTyp.COMMENT, "", tLine, tFmt, tStyle )) - if keepRaw: - rawText.append(f"{aLine}\n") elif cStyle == nwComment.FOOTNOTE: tLine, tFmt = self._extractFormats(cText, skip=TextFmt.FNOTE) self._footnotes[f"{tHandle}:{cKey}"] = (tLine, tFmt) - if keepRaw: - rawText.append(f"{aLine}\n") elif aLine.startswith("@"): # Keywords @@ -634,8 +631,6 @@ def tokenizeText(self) -> None: tBlocks.append(( BlockTyp.KEYWORD, tTag[1:], tLine, tFmt, tStyle )) - if keepRaw: - rawText.append(f"{aLine}\n") elif aLine.startswith(("# ", "#! ")): # Title or Partition Headings @@ -670,8 +665,6 @@ def tokenizeText(self) -> None: tBlocks.append(( tType, f"{tHandle}:T{nHead:04d}", tText, [], tStyle )) - if keepRaw: - rawText.append(f"{aLine}\n") elif aLine.startswith(("## ", "##! ")): # (Unnumbered) Chapter Headings @@ -704,8 +697,6 @@ def tokenizeText(self) -> None: tBlocks.append(( tType, f"{tHandle}:T{nHead:04d}", tText, [], tStyle )) - if keepRaw: - rawText.append(f"{aLine}\n") elif aLine.startswith(("### ", "###! ")): # (Alternative) Scene Headings @@ -744,8 +735,6 @@ def tokenizeText(self) -> None: tBlocks.append(( tType, f"{tHandle}:T{nHead:04d}", tText, [], tStyle )) - if keepRaw: - rawText.append(f"{aLine}\n") elif aLine.startswith("#### "): # Section Headings @@ -773,8 +762,6 @@ def tokenizeText(self) -> None: tBlocks.append(( tType, f"{tHandle}:T{nHead:04d}", tText, [], tStyle )) - if keepRaw: - rawText.append(f"{aLine}\n") else: # Text Lines @@ -821,8 +808,6 @@ def tokenizeText(self) -> None: tBlocks.append(( BlockTyp.TEXT, "", tLine, tFmt, tStyle )) - if keepRaw: - rawText.append(f"{aLine}\n") # If we have content, turn off the first page flag if self._isFirst and tBlocks: @@ -840,9 +825,6 @@ def tokenizeText(self) -> None: # Always add an empty line at the end of the file tBlocks.append(B_EMPTY) - if keepRaw: - rawText.append("\n") - self._raw.append("".join(rawText)) # Second Pass # =========== diff --git a/novelwriter/formats/toraw.py b/novelwriter/formats/toraw.py index 9a410f2f6..f316c4846 100644 --- a/novelwriter/formats/toraw.py +++ b/novelwriter/formats/toraw.py @@ -46,6 +46,7 @@ class ToRaw(Tokenizer): def __init__(self, project: NWProject) -> None: super().__init__(project) self._keepRaw = True + self._noTokens = True return def doConvert(self) -> None: diff --git a/tests/reference/mBuildDocBuild_NWD_Lorem_Ipsum.json b/tests/reference/mBuildDocBuild_NWD_Lorem_Ipsum.json index e3b1ccee5..f1fd3415e 100644 --- a/tests/reference/mBuildDocBuild_NWD_Lorem_Ipsum.json +++ b/tests/reference/mBuildDocBuild_NWD_Lorem_Ipsum.json @@ -2,8 +2,8 @@ "meta": { "projectName": "Lorem Ipsum", "novelAuthor": "lipsum.com", - "buildTime": 1730136328, - "buildTimeStr": "2024-10-28 18:25:28" + "buildTime": 1731001720, + "buildTimeStr": "2024-11-07 18:48:40" }, "text": { "nwd": [ @@ -21,6 +21,7 @@ ">> \u201cThere is no one who loves pain itself, who seeks after it and wants to have it, simply because it is pain\u2026\u201d <<" ], [ + "[NEW PAGE]", "", "% Exctracted from the lipsum.com website.", "", diff --git a/tests/reference/mBuildDocBuild_NWD_Lorem_Ipsum.txt b/tests/reference/mBuildDocBuild_NWD_Lorem_Ipsum.txt index 73d5d3d5e..219351889 100644 --- a/tests/reference/mBuildDocBuild_NWD_Lorem_Ipsum.txt +++ b/tests/reference/mBuildDocBuild_NWD_Lorem_Ipsum.txt @@ -10,6 +10,7 @@ >> “There is no one who loves pain itself, who seeks after it and wants to have it, simply because it is pain…” << +[NEW PAGE] % Exctracted from the lipsum.com website. diff --git a/tests/test_core/test_core_docbuild.py b/tests/test_core/test_core_docbuild.py index cbb6c52fa..166b281a0 100644 --- a/tests/test_core/test_core_docbuild.py +++ b/tests/test_core/test_core_docbuild.py @@ -467,8 +467,8 @@ def testCoreDocBuild_Custom(mockGUI, fncPath: Path): assert docFile.read_text(encoding="utf-8") == ( "#! New Novel\n\n" ">> By Jane Doe <<\n\n" - "## New Chapter\n\n\n" - "### New Scene\n\n\n" + "## New Chapter\n\n" + "### New Scene\n\n" ) docFile.unlink() @@ -497,8 +497,8 @@ def testCoreDocBuild_Custom(mockGUI, fncPath: Path): assert docFile.read_text(encoding="utf-8") == ( "#! New Novel\n\n" ">> By Jane Doe <<\n\n" - "## New Chapter\n\n\n" - "### New Scene\n\n\n" + "## New Chapter\n\n" + "### New Scene\n\n" ) docFile.unlink() @@ -621,8 +621,8 @@ def testCoreDocBuild_IterBuild(mockGUI, fncPath: Path, mockRnd): assert docFile.read_text(encoding="utf-8") == ( "#! New Novel\n\n" ">> By Jane Doe <<\n\n" - "## New Chapter\n\n\n" - "### New Scene\n\n\n" + "## New Chapter\n\n" + "### New Scene\n\n" "#! Notes: Plot\n\n" "# Main Plot\n" "**Text**\n\n" diff --git a/tests/test_formats/test_fmt_tokenizer.py b/tests/test_formats/test_fmt_tokenizer.py index 1daf7eeb0..bab57bfca 100644 --- a/tests/test_formats/test_fmt_tokenizer.py +++ b/tests/test_formats/test_fmt_tokenizer.py @@ -31,7 +31,6 @@ from novelwriter.formats.shared import BlockFmt, BlockTyp, TextFmt, stripEscape from novelwriter.formats.tokenizer import COMMENT_STYLE, HeadingFormatter, Tokenizer from novelwriter.formats.tomarkdown import ToMarkdown -from novelwriter.formats.toraw import ToRaw from tests.tools import C, buildTestProject @@ -186,7 +185,8 @@ def testFmtToken_TextOps(monkeypatch, mockGUI, mockRnd, fncPath): project.data.setLanguage("en") project._loadProjectLocalisation() - tokens = ToRaw(project) + tokens = BareTokenizer(project) + tokens._keepRaw = True # Set some content to work with docText = ( @@ -255,7 +255,7 @@ def testFmtToken_StripEscape(): def testFmtToken_HeaderFormat(mockGUI): """Test the tokenization of header formats in the Tokenizer class.""" project = NWProject() - tokens = ToRaw(project) + tokens = BareTokenizer(project) tokens._handle = TMH # Title @@ -270,7 +270,6 @@ def testFmtToken_HeaderFormat(mockGUI): assert tokens._blocks == [ (BlockTyp.TITLE, TM1, "Novel Title", [], BlockFmt.CENTRE), ] - assert tokens._raw[-1] == "#! Novel Title\n\n" # Note File tokens._isNovel = False @@ -281,7 +280,6 @@ def testFmtToken_HeaderFormat(mockGUI): assert tokens._blocks == [ (BlockTyp.TITLE, TM1, "Note Title", [], BlockFmt.CENTRE), ] - assert tokens._raw[-1] == "#! Note Title\n\n" # Header 1 # ======== @@ -295,7 +293,6 @@ def testFmtToken_HeaderFormat(mockGUI): assert tokens._blocks == [ (BlockTyp.HEAD1, TM1, "Novel Title", [], BlockFmt.CENTRE), ] - assert tokens._raw[-1] == "# Novel Title\n\n" # Note File tokens._isNovel = False @@ -306,7 +303,6 @@ def testFmtToken_HeaderFormat(mockGUI): assert tokens._blocks == [ (BlockTyp.HEAD1, TM1, "Note Title", [], BlockFmt.NONE), ] - assert tokens._raw[-1] == "# Note Title\n\n" # Header 2 # ======== @@ -319,7 +315,6 @@ def testFmtToken_HeaderFormat(mockGUI): assert tokens._blocks == [ (BlockTyp.HEAD2, TM1, "Chapter One", [], BlockFmt.PBB), ] - assert tokens._raw[-1] == "## Chapter One\n\n" # Note File tokens._isNovel = False @@ -329,7 +324,6 @@ def testFmtToken_HeaderFormat(mockGUI): assert tokens._blocks == [ (BlockTyp.HEAD2, TM1, "Heading 2", [], BlockFmt.NONE), ] - assert tokens._raw[-1] == "## Heading 2\n\n" # Header 3 # ======== @@ -342,7 +336,6 @@ def testFmtToken_HeaderFormat(mockGUI): assert tokens._blocks == [ (BlockTyp.HEAD3, TM1, "Scene One", [], BlockFmt.NONE), ] - assert tokens._raw[-1] == "### Scene One\n\n" # Note File tokens._isNovel = False @@ -352,7 +345,6 @@ def testFmtToken_HeaderFormat(mockGUI): assert tokens._blocks == [ (BlockTyp.HEAD3, TM1, "Heading 3", [], BlockFmt.NONE), ] - assert tokens._raw[-1] == "### Heading 3\n\n" # Header 4 # ======== @@ -365,7 +357,6 @@ def testFmtToken_HeaderFormat(mockGUI): assert tokens._blocks == [ (BlockTyp.HEAD4, TM1, "A Section", [], BlockFmt.NONE), ] - assert tokens._raw[-1] == "#### A Section\n\n" # Note File tokens._isNovel = False @@ -375,7 +366,6 @@ def testFmtToken_HeaderFormat(mockGUI): assert tokens._blocks == [ (BlockTyp.HEAD4, TM1, "Heading 4", [], BlockFmt.NONE), ] - assert tokens._raw[-1] == "#### Heading 4\n\n" # Title # ===== @@ -389,7 +379,6 @@ def testFmtToken_HeaderFormat(mockGUI): assert tokens._blocks == [ (BlockTyp.TITLE, TM1, "Title", [], BlockFmt.PBB | BlockFmt.CENTRE), ] - assert tokens._raw[-1] == "#! Title\n\n" # Note File tokens._isNovel = False @@ -400,7 +389,6 @@ def testFmtToken_HeaderFormat(mockGUI): assert tokens._blocks == [ (BlockTyp.TITLE, TM1, "Title", [], BlockFmt.PBB | BlockFmt.CENTRE), ] - assert tokens._raw[-1] == "#! Title\n\n" # Unnumbered # ========== @@ -413,7 +401,6 @@ def testFmtToken_HeaderFormat(mockGUI): assert tokens._blocks == [ (BlockTyp.HEAD2, TM1, "Prologue", [], BlockFmt.PBB), ] - assert tokens._raw[-1] == "##! Prologue\n\n" # Note File tokens._isNovel = False @@ -423,7 +410,6 @@ def testFmtToken_HeaderFormat(mockGUI): assert tokens._blocks == [ (BlockTyp.HEAD2, TM1, "Prologue", [], BlockFmt.NONE), ] - assert tokens._raw[-1] == "##! Prologue\n\n" @pytest.mark.core @@ -761,21 +747,19 @@ def processStyle(text: str, first: bool) -> BlockFmt: def testFmtToken_MetaFormat(mockGUI): """Test the tokenization of meta formats in the Tokenizer class.""" project = NWProject() - tokens = ToRaw(project) + tokens = BareTokenizer(project) tokens._handle = TMH # Ignore Text tokens._text = "%~ Some text\n" tokens.tokenizeText() assert tokens._blocks == [] - assert tokens._raw[-1] == "\n" # Comment tokens.setComments(False) tokens._text = "% A comment\n" tokens.tokenizeText() assert tokens._blocks == [] - assert tokens._raw[-1] == "\n" tokens.setComments(True) tokens._text = "% A comment\n" @@ -787,14 +771,12 @@ def testFmtToken_MetaFormat(mockGUI): (9, TextFmt.COL_B, "comment"), (18, TextFmt.COL_E, ""), ], BlockFmt.NONE )] - assert tokens._raw[-1] == "% A comment\n\n" # Synopsis tokens.setSynopsis(False) tokens._text = "%synopsis: The synopsis\n" tokens.tokenizeText() assert tokens._blocks == [] - assert tokens._raw[-1] == "\n" tokens.setSynopsis(True) tokens._text = "% synopsis: The synopsis\n" @@ -806,14 +788,12 @@ def testFmtToken_MetaFormat(mockGUI): (10, TextFmt.COL_B, "synopsis"), (22, TextFmt.COL_E, "") ], BlockFmt.NONE )] - assert tokens._raw[-1] == "% synopsis: The synopsis\n\n" # Short tokens.setSynopsis(False) tokens._text = "% short: A short description\n" tokens.tokenizeText() assert tokens._blocks == [] - assert tokens._raw[-1] == "\n" tokens.setSynopsis(True) tokens._text = "% short: A short description\n" @@ -825,14 +805,12 @@ def testFmtToken_MetaFormat(mockGUI): (19, TextFmt.COL_B, "synopsis"), (38, TextFmt.COL_E, ""), ], BlockFmt.NONE )] - assert tokens._raw[-1] == "% short: A short description\n\n" # Keyword tokens.setKeywords(False) tokens._text = "@char: Bod\n" tokens.tokenizeText() assert tokens._blocks == [] - assert tokens._raw[-1] == "\n" tokens.setKeywords(True) tokens.tokenizeText() @@ -844,7 +822,6 @@ def testFmtToken_MetaFormat(mockGUI): (15, TextFmt.ARF_E, ""), (15, TextFmt.COL_E, ""), ], BlockFmt.NONE )] - assert tokens._raw[-1] == "@char: Bod\n\n" tokens._text = "@pov: Bod\n@plot: Main\n@location: Europe\n" tokens.tokenizeText() @@ -870,7 +847,6 @@ def testFmtToken_MetaFormat(mockGUI): (17, TextFmt.ARF_E, ""), (17, TextFmt.COL_E, ""), ], BlockFmt.Z_TOP )] - assert tokens._raw[-1] == "@pov: Bod\n@plot: Main\n@location: Europe\n\n" # Ignored keywords tokens._text = "@pov: Bod\n@plot: Main\n@location: Europe\n" @@ -890,7 +866,7 @@ def testFmtToken_MetaFormat(mockGUI): def testFmtToken_MarginFormat(mockGUI): """Test the tokenization of margin formats in the Tokenizer class.""" project = NWProject() - tokens = ToRaw(project) + tokens = BareTokenizer(project) # Alignment and Indentation dblIndent = BlockFmt.IND_L | BlockFmt.IND_R @@ -916,16 +892,6 @@ def testFmtToken_MarginFormat(mockGUI): (BlockTyp.TEXT, "", "Double-indented block", [], dblIndent), (BlockTyp.TEXT, "", "Right-indent, right-aligned", [], rIndAlign), ] - assert tokens._raw[-1] == ( - "Some regular text\n\n" - "Some left-aligned text <<\n\n" - ">> Some right-aligned text\n\n" - ">> Some centered text <<\n\n" - "> Left-indented block\n\n" - "Right-indented block <\n\n" - "> Double-indented block <\n\n" - ">> Right-indent, right-aligned <\n\n\n" - ) @pytest.mark.core @@ -1111,7 +1077,7 @@ def testFmtToken_Paragraphs(mockGUI): def testFmtToken_TextFormat(mockGUI): """Test the tokenization of text formats in the Tokenizer class.""" project = NWProject() - tokens = ToRaw(project) + tokens = BareTokenizer(project) tokens._handle = TMH # Text @@ -1120,12 +1086,10 @@ def testFmtToken_TextFormat(mockGUI): assert tokens._blocks == [ (BlockTyp.TEXT, "", "Some plain text\non two lines", [], BlockFmt.NONE), ] - assert tokens._raw[-1] == "Some plain text\non two lines\n\n\n\n" tokens.setBodyText(False) tokens.tokenizeText() assert tokens._blocks == [] - assert tokens._raw[-1] == "\n\n\n" tokens.setBodyText(True) # Text Emphasis @@ -1137,7 +1101,6 @@ def testFmtToken_TextFormat(mockGUI): (16, TextFmt.B_E, ""), ], BlockFmt.NONE )] - assert tokens._raw[-1] == "Some **bolded text** on this lines\n\n" tokens._text = "Some _italic text_ on this lines\n" tokens.tokenizeText() @@ -1147,7 +1110,6 @@ def testFmtToken_TextFormat(mockGUI): (16, TextFmt.I_E, ""), ], BlockFmt.NONE )] - assert tokens._raw[-1] == "Some _italic text_ on this lines\n\n" tokens._text = "Some **_bold italic text_** on this lines\n" tokens.tokenizeText() @@ -1159,7 +1121,6 @@ def testFmtToken_TextFormat(mockGUI): (21, TextFmt.B_E, ""), ], BlockFmt.NONE )] - assert tokens._raw[-1] == "Some **_bold italic text_** on this lines\n\n" tokens._text = "Some ~~strikethrough text~~ on this lines\n" tokens.tokenizeText() @@ -1169,7 +1130,6 @@ def testFmtToken_TextFormat(mockGUI): (23, TextFmt.D_E, ""), ], BlockFmt.NONE )] - assert tokens._raw[-1] == "Some ~~strikethrough text~~ on this lines\n\n" tokens._text = "Some **nested bold and _italic_ and ~~strikethrough~~ text** here\n" tokens.tokenizeText() @@ -1183,9 +1143,6 @@ def testFmtToken_TextFormat(mockGUI): (50, TextFmt.B_E, ""), ], BlockFmt.NONE )] - assert tokens._raw[-1] == ( - "Some **nested bold and _italic_ and ~~strikethrough~~ text** here\n\n" - ) @pytest.mark.core