Skip to content

Commit

Permalink
Simplify collection of raw text (#2087)
Browse files (browse the repository at this point in the history)
  • Loading branch information
vkbo authored Nov 7, 2024
2 parents 30118ec + 0a7bd68 commit 4479277
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 86 deletions.
40 changes: 11 additions & 29 deletions novelwriter/formats/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,10 @@ def __init__(self, project: NWProject) -> None:
self._project = project

# Data Variables
self._text = "" # The raw text to be tokenized
self._handle = None # The item handle currently being processed
self._keepRaw = False # Whether to keep the raw text, used by ToRaw
self._text = "" # The raw text to be tokenized
self._handle = None # The item handle currently being processed
self._keepRaw = False # Whether to keep the raw text, used by ToRaw
self._noTokens = False # Disable tokenization if they're not needed

# Blocks and Meta Data (Per Document)
self._blocks: list[T_Block] = []
Expand Down Expand Up @@ -522,22 +523,24 @@ def tokenizeText(self) -> None:
4: The internal formatting map of the text, TxtFmt.*
5: The formats of the block, BlockFmt.*
"""
if self._keepRaw:
self._raw.append(f"{self._text.rstrip()}\n\n")
if self._noTokens:
return
if self._isNovel:
self._hFormatter.setHandle(self._handle)

# Cache Flags
isNovel = self._isNovel
keepRaw = self._keepRaw
doJustify = self._doJustify
keepBreaks = self._keepBreaks
indentFirst = self._indentFirst
firstIndent = self._firstIndent

if self._isNovel:
self._hFormatter.setHandle(self._handle)

# Replace all instances of [br] with a placeholder character
text = REGEX_PATTERNS.lineBreak.sub("\uffff", self._text)

nHead = 0
rawText = []
tHandle = self._handle or ""
tBlocks: list[T_Block] = [B_EMPTY]
for bLine in text.splitlines():
Expand All @@ -547,8 +550,6 @@ def tokenizeText(self) -> None:
# Check for blank lines
if not sLine:
tBlocks.append(B_EMPTY)
if keepRaw:
rawText.append("\n")
continue

if self._breakNext:
Expand Down Expand Up @@ -613,14 +614,10 @@ def tokenizeText(self) -> None:
tBlocks.append((
BlockTyp.COMMENT, "", tLine, tFmt, tStyle
))
if keepRaw:
rawText.append(f"{aLine}\n")

elif cStyle == nwComment.FOOTNOTE:
tLine, tFmt = self._extractFormats(cText, skip=TextFmt.FNOTE)
self._footnotes[f"{tHandle}:{cKey}"] = (tLine, tFmt)
if keepRaw:
rawText.append(f"{aLine}\n")

elif aLine.startswith("@"):
# Keywords
Expand All @@ -634,8 +631,6 @@ def tokenizeText(self) -> None:
tBlocks.append((
BlockTyp.KEYWORD, tTag[1:], tLine, tFmt, tStyle
))
if keepRaw:
rawText.append(f"{aLine}\n")

elif aLine.startswith(("# ", "#! ")):
# Title or Partition Headings
Expand Down Expand Up @@ -670,8 +665,6 @@ def tokenizeText(self) -> None:
tBlocks.append((
tType, f"{tHandle}:T{nHead:04d}", tText, [], tStyle
))
if keepRaw:
rawText.append(f"{aLine}\n")

elif aLine.startswith(("## ", "##! ")):
# (Unnumbered) Chapter Headings
Expand Down Expand Up @@ -704,8 +697,6 @@ def tokenizeText(self) -> None:
tBlocks.append((
tType, f"{tHandle}:T{nHead:04d}", tText, [], tStyle
))
if keepRaw:
rawText.append(f"{aLine}\n")

elif aLine.startswith(("### ", "###! ")):
# (Alternative) Scene Headings
Expand Down Expand Up @@ -744,8 +735,6 @@ def tokenizeText(self) -> None:
tBlocks.append((
tType, f"{tHandle}:T{nHead:04d}", tText, [], tStyle
))
if keepRaw:
rawText.append(f"{aLine}\n")

elif aLine.startswith("#### "):
# Section Headings
Expand Down Expand Up @@ -773,8 +762,6 @@ def tokenizeText(self) -> None:
tBlocks.append((
tType, f"{tHandle}:T{nHead:04d}", tText, [], tStyle
))
if keepRaw:
rawText.append(f"{aLine}\n")

else:
# Text Lines
Expand Down Expand Up @@ -821,8 +808,6 @@ def tokenizeText(self) -> None:
tBlocks.append((
BlockTyp.TEXT, "", tLine, tFmt, tStyle
))
if keepRaw:
rawText.append(f"{aLine}\n")

# If we have content, turn off the first page flag
if self._isFirst and tBlocks:
Expand All @@ -840,9 +825,6 @@ def tokenizeText(self) -> None:

# Always add an empty line at the end of the file
tBlocks.append(B_EMPTY)
if keepRaw:
rawText.append("\n")
self._raw.append("".join(rawText))

# Second Pass
# ===========
Expand Down
1 change: 1 addition & 0 deletions novelwriter/formats/toraw.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class ToRaw(Tokenizer):
def __init__(self, project: NWProject) -> None:
    """Set up a tokenizer that only collects raw document text.

    The base class is initialised with the given project, after which
    the two flags controlling raw text collection are switched on.
    """
    super().__init__(project)

    # Raw output does not need the tokenized blocks, so tokenization
    # is disabled; tokenizeText returns right after storing the text.
    self._noTokens = True

    # Make tokenizeText append each document's raw text to the
    # collected raw output.
    self._keepRaw = True

    return

def doConvert(self) -> None:
Expand Down
5 changes: 3 additions & 2 deletions tests/reference/mBuildDocBuild_NWD_Lorem_Ipsum.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
"meta": {
"projectName": "Lorem Ipsum",
"novelAuthor": "lipsum.com",
"buildTime": 1730136328,
"buildTimeStr": "2024-10-28 18:25:28"
"buildTime": 1731001720,
"buildTimeStr": "2024-11-07 18:48:40"
},
"text": {
"nwd": [
Expand All @@ -21,6 +21,7 @@
">> \u201cThere is no one who loves pain itself, who seeks after it and wants to have it, simply because it is pain\u2026\u201d <<"
],
[
"[NEW PAGE]",
"",
"% Exctracted from the lipsum.com website.",
"",
Expand Down
1 change: 1 addition & 0 deletions tests/reference/mBuildDocBuild_NWD_Lorem_Ipsum.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

>> “There is no one who loves pain itself, who seeks after it and wants to have it, simply because it is pain…” <<

[NEW PAGE]

% Exctracted from the lipsum.com website.

Expand Down
12 changes: 6 additions & 6 deletions tests/test_core/test_core_docbuild.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,8 +467,8 @@ def testCoreDocBuild_Custom(mockGUI, fncPath: Path):
assert docFile.read_text(encoding="utf-8") == (
"#! New Novel\n\n"
">> By Jane Doe <<\n\n"
"## New Chapter\n\n\n"
"### New Scene\n\n\n"
"## New Chapter\n\n"
"### New Scene\n\n"
)
docFile.unlink()

Expand Down Expand Up @@ -497,8 +497,8 @@ def testCoreDocBuild_Custom(mockGUI, fncPath: Path):
assert docFile.read_text(encoding="utf-8") == (
"#! New Novel\n\n"
">> By Jane Doe <<\n\n"
"## New Chapter\n\n\n"
"### New Scene\n\n\n"
"## New Chapter\n\n"
"### New Scene\n\n"
)
docFile.unlink()

Expand Down Expand Up @@ -621,8 +621,8 @@ def testCoreDocBuild_IterBuild(mockGUI, fncPath: Path, mockRnd):
assert docFile.read_text(encoding="utf-8") == (
"#! New Novel\n\n"
">> By Jane Doe <<\n\n"
"## New Chapter\n\n\n"
"### New Scene\n\n\n"
"## New Chapter\n\n"
"### New Scene\n\n"
"#! Notes: Plot\n\n"
"# Main Plot\n"
"**Text**\n\n"
Expand Down
Loading

0 comments on commit 4479277

Please sign in to comment.