Simplify collection of raw text (#2087)

vkbo · Nov 7, 2024 · 4479277 · 4479277
2 parents 30118ec + 0a7bd68
commit 4479277
Show file tree

Hide file tree

Showing 6 changed files with 28 additions and 86 deletions.
diff --git a/novelwriter/formats/tokenizer.py b/novelwriter/formats/tokenizer.py
@@ -90,9 +90,10 @@ def __init__(self, project: NWProject) -> None:
         self._project = project
 
         # Data Variables
-        self._text    = ""     # The raw text to be tokenized
-        self._handle  = None   # The item handle currently being processed
-        self._keepRaw = False  # Whether to keep the raw text, used by ToRaw
+        self._text     = ""     # The raw text to be tokenized
+        self._handle   = None   # The item handle currently being processed
+        self._keepRaw  = False  # Whether to keep the raw text, used by ToRaw
+        self._noTokens = False  # Disable tokenization when tokens are not needed
 
         # Blocks and Meta Data (Per Document)
         self._blocks: list[T_Block] = []
@@ -522,22 +523,24 @@ def tokenizeText(self) -> None:
           4: The internal formatting map of the text, TxtFmt.*
           5: The formats of the block, BlockFmt.*
         """
+        if self._keepRaw:
+            self._raw.append(f"{self._text.rstrip()}\n\n")
+        if self._noTokens:
+            return
+        if self._isNovel:
+            self._hFormatter.setHandle(self._handle)
+
         # Cache Flags
         isNovel = self._isNovel
-        keepRaw = self._keepRaw
         doJustify = self._doJustify
         keepBreaks = self._keepBreaks
         indentFirst = self._indentFirst
         firstIndent = self._firstIndent
 
-        if self._isNovel:
-            self._hFormatter.setHandle(self._handle)
-
         # Replace all instances of [br] with a placeholder character
         text = REGEX_PATTERNS.lineBreak.sub("\uffff", self._text)
 
         nHead = 0
-        rawText = []
         tHandle = self._handle or ""
         tBlocks: list[T_Block] = [B_EMPTY]
         for bLine in text.splitlines():
@@ -547,8 +550,6 @@ def tokenizeText(self) -> None:
             # Check for blank lines
             if not sLine:
                 tBlocks.append(B_EMPTY)
-                if keepRaw:
-                    rawText.append("\n")
                 continue
 
             if self._breakNext:
@@ -613,14 +614,10 @@ def tokenizeText(self) -> None:
                     tBlocks.append((
                         BlockTyp.COMMENT, "", tLine, tFmt, tStyle
                     ))
-                    if keepRaw:
-                        rawText.append(f"{aLine}\n")
 
                 elif cStyle == nwComment.FOOTNOTE:
                     tLine, tFmt = self._extractFormats(cText, skip=TextFmt.FNOTE)
                     self._footnotes[f"{tHandle}:{cKey}"] = (tLine, tFmt)
-                    if keepRaw:
-                        rawText.append(f"{aLine}\n")
 
             elif aLine.startswith("@"):
                 # Keywords
@@ -634,8 +631,6 @@ def tokenizeText(self) -> None:
                         tBlocks.append((
                             BlockTyp.KEYWORD, tTag[1:], tLine, tFmt, tStyle
                         ))
-                        if keepRaw:
-                            rawText.append(f"{aLine}\n")
 
             elif aLine.startswith(("# ", "#! ")):
                 # Title or Partition Headings
@@ -670,8 +665,6 @@ def tokenizeText(self) -> None:
                 tBlocks.append((
                     tType, f"{tHandle}:T{nHead:04d}", tText, [], tStyle
                 ))
-                if keepRaw:
-                    rawText.append(f"{aLine}\n")
 
             elif aLine.startswith(("## ", "##! ")):
                 # (Unnumbered) Chapter Headings
@@ -704,8 +697,6 @@ def tokenizeText(self) -> None:
                 tBlocks.append((
                     tType, f"{tHandle}:T{nHead:04d}", tText, [], tStyle
                 ))
-                if keepRaw:
-                    rawText.append(f"{aLine}\n")
 
             elif aLine.startswith(("### ", "###! ")):
                 # (Alternative) Scene Headings
@@ -744,8 +735,6 @@ def tokenizeText(self) -> None:
                 tBlocks.append((
                     tType, f"{tHandle}:T{nHead:04d}", tText, [], tStyle
                 ))
-                if keepRaw:
-                    rawText.append(f"{aLine}\n")
 
             elif aLine.startswith("#### "):
                 # Section Headings
@@ -773,8 +762,6 @@ def tokenizeText(self) -> None:
                 tBlocks.append((
                     tType, f"{tHandle}:T{nHead:04d}", tText, [], tStyle
                 ))
-                if keepRaw:
-                    rawText.append(f"{aLine}\n")
 
             else:
                 # Text Lines
@@ -821,8 +808,6 @@ def tokenizeText(self) -> None:
                 tBlocks.append((
                     BlockTyp.TEXT, "", tLine, tFmt, tStyle
                 ))
-                if keepRaw:
-                    rawText.append(f"{aLine}\n")
 
         # If we have content, turn off the first page flag
         if self._isFirst and tBlocks:
@@ -840,9 +825,6 @@ def tokenizeText(self) -> None:
 
         # Always add an empty line at the end of the file
         tBlocks.append(B_EMPTY)
-        if keepRaw:
-            rawText.append("\n")
-            self._raw.append("".join(rawText))
 
         # Second Pass
         # ===========

diff --git a/novelwriter/formats/toraw.py b/novelwriter/formats/toraw.py
@@ -46,6 +46,7 @@ class ToRaw(Tokenizer):
     def __init__(self, project: NWProject) -> None:
         super().__init__(project)
         self._keepRaw = True
+        self._noTokens = True
         return
 
     def doConvert(self) -> None:

diff --git a/tests/reference/mBuildDocBuild_NWD_Lorem_Ipsum.json b/tests/reference/mBuildDocBuild_NWD_Lorem_Ipsum.json
@@ -2,8 +2,8 @@
   "meta": {
     "projectName": "Lorem Ipsum",
     "novelAuthor": "lipsum.com",
-    "buildTime": 1730136328,
-    "buildTimeStr": "2024-10-28 18:25:28"
+    "buildTime": 1731001720,
+    "buildTimeStr": "2024-11-07 18:48:40"
   },
   "text": {
     "nwd": [
@@ -21,6 +21,7 @@
         ">> \u201cThere is no one who loves pain itself, who seeks after it and wants to have it, simply because it is pain\u2026\u201d <<"
       ],
       [
+        "[NEW PAGE]",
         "",
         "% Exctracted from the lipsum.com website.",
         "",

diff --git a/tests/reference/mBuildDocBuild_NWD_Lorem_Ipsum.txt b/tests/reference/mBuildDocBuild_NWD_Lorem_Ipsum.txt
@@ -10,6 +10,7 @@
 
 >> “There is no one who loves pain itself, who seeks after it and wants to have it, simply because it is pain…” <<
 
+[NEW PAGE]
 
 % Exctracted from the lipsum.com website.
 

diff --git a/tests/test_core/test_core_docbuild.py b/tests/test_core/test_core_docbuild.py
@@ -467,8 +467,8 @@ def testCoreDocBuild_Custom(mockGUI, fncPath: Path):
     assert docFile.read_text(encoding="utf-8") == (
         "#! New Novel\n\n"
         ">> By Jane Doe <<\n\n"
-        "## New Chapter\n\n\n"
-        "### New Scene\n\n\n"
+        "## New Chapter\n\n"
+        "### New Scene\n\n"
     )
     docFile.unlink()
 
@@ -497,8 +497,8 @@ def testCoreDocBuild_Custom(mockGUI, fncPath: Path):
     assert docFile.read_text(encoding="utf-8") == (
         "#! New Novel\n\n"
         ">> By Jane Doe <<\n\n"
-        "## New Chapter\n\n\n"
-        "### New Scene\n\n\n"
+        "## New Chapter\n\n"
+        "### New Scene\n\n"
     )
     docFile.unlink()
 
@@ -621,8 +621,8 @@ def testCoreDocBuild_IterBuild(mockGUI, fncPath: Path, mockRnd):
     assert docFile.read_text(encoding="utf-8") == (
         "#! New Novel\n\n"
         ">> By Jane Doe <<\n\n"
-        "## New Chapter\n\n\n"
-        "### New Scene\n\n\n"
+        "## New Chapter\n\n"
+        "### New Scene\n\n"
         "#! Notes: Plot\n\n"
         "# Main Plot\n"
         "**Text**\n\n"
Original file line number	Diff line number	Diff line change
Expand Up		@@ -10,6 +10,7 @@

		>> “There is no one who loves pain itself, who seeks after it and wants to have it, simply because it is pain…” <<

		[NEW PAGE]

		% Exctracted from the lipsum.com website.

Expand Down