/**
 * Splits `text` into overlapping chunks.
 *
 * Thin wrapper around {@link recursiveSplitString} that uses no custom
 * separators, so the text is cut purely by length.
 *
 * @param text - The string to split.
 * @param options - `size` is the maximum chunk length (measured with
 *   `String.prototype.length`), `overlap` is how much of the previous chunk
 *   may be repeated at the start of the next one, and `trim` (default
 *   `false`) trims every chunk and drops chunks that become empty.
 * @returns A generator yielding the chunks in order.
 * @throws Error when `size` is not positive, `overlap` is negative, either
 *   is `NaN`, or `overlap >= size`. The generator is lazy, so the error is
 *   raised on the first iteration step rather than at call time.
 */
export function* splitString(
  text: string,
  options: { size: number; overlap: number; trim?: boolean },
): Generator<string> {
  yield* recursiveSplitString(text, { ...options, trim: options.trim ?? false, separators: [] });
}

/**
 * Greedily joins consecutive `chunks` with `sep` into documents whose joined
 * length does not exceed `options.size`, carrying trailing pieces over into
 * the next document as overlap.
 *
 * @param chunks - Pieces to merge, in order.
 * @param sep - Separator inserted between pieces when they are joined.
 * @param options - `size` / `overlap` limits; when `trim` is truthy each
 *   document is trimmed. Empty documents are never yielded.
 * @returns A generator yielding the merged documents.
 */
export function* mergeStrings(
  chunks: string[],
  sep: string,
  options: { size: number; overlap: number; trim?: boolean },
): Generator<string> {
  // Pieces that make up the document currently being assembled.
  const pending: string[] = [];
  // Sum of the piece lengths in `pending`; separators are NOT included.
  let pendingLength = 0;

  const toDoc = (parts: string[]): string => {
    const joined = parts.join(sep);
    return options.trim ? joined.trim() : joined;
  };

  // Length of the document that would result from appending `next`.
  const lengthWith = (next: string): number =>
    pendingLength + next.length + pending.length * sep.length;

  for (const chunk of chunks) {
    if (lengthWith(chunk) > options.size && pending.length > 0) {
      const doc = toDoc(pending);
      if (doc) {
        yield doc;
      }

      // Drop leading pieces until what remains is within the allowed overlap
      // and leaves room for `chunk`.
      while (
        pendingLength > options.overlap ||
        (lengthWith(chunk) > options.size && pendingLength > 0)
      ) {
        const dropped = pending.shift();
        if (dropped === undefined) {
          // Only reachable with a negative `overlap` passed by a direct caller:
          // there is nothing left to drop, so stop instead of crashing.
          break;
        }
        pendingLength -= dropped.length;
      }
    }
    pending.push(chunk);
    pendingLength += chunk.length;
  }

  const lastDoc = toDoc(pending);
  if (lastDoc) {
    yield lastDoc;
  }
}

/**
 * Splits `text` on the first separator, merges the small pieces back into
 * chunks via {@link mergeStrings}, and recursively re-splits pieces that are
 * still too long using the remaining separators.
 *
 * The empty separator `""` is always appended as the final fallback. At that
 * level the text is divided into Unicode code points (not UTF-16 code units),
 * so a surrogate pair is never torn apart between two chunks.
 *
 * A piece that cannot be split any further is emitted on its own, even if it
 * is not shorter than `size`.
 *
 * @param text - The string to split.
 * @param options - `size` / `overlap` limits, ordered `separators` to try,
 *   and optional `trim` (trim chunks and drop the ones that become empty).
 * @returns A generator yielding the chunks in order.
 * @throws Error when `size` is not positive, `overlap` is negative, either
 *   is `NaN`, or `overlap >= size`.
 */
export function* recursiveSplitString(
  text: string,
  options: { size: number; overlap: number; separators: string[]; trim?: boolean },
): Generator<string> {
  // Negated comparisons on purpose: they also reject NaN, which would pass
  // `size <= 0 || overlap < 0` and silently produce one-character chunks.
  if (!(options.size > 0) || !(options.overlap >= 0)) {
    throw new Error("size must be positive and overlap must be non-negative");
  }
  if (options.overlap >= options.size) {
    throw new Error("overlap must be less than size");
  }

  // De-duplicate while keeping first-occurrence order; "" is the last resort.
  const [separator = "", ...remainingSeparators] = [
    ...new Set([...(options.separators ?? []), ""]),
  ];

  const pieces =
    separator === ""
      ? Array.from(text) // iterates by code point, keeping surrogate pairs intact
      : text.split(separator).filter(Boolean);

  const smallPieces: string[] = [];

  for (const piece of pieces) {
    if (piece.length < options.size) {
      smallPieces.push(piece);
      continue;
    }

    // Flush what has been collected so far so that output order is preserved.
    if (smallPieces.length > 0) {
      yield* mergeStrings(smallPieces, separator, options);
      smallPieces.length = 0;
    }

    if (remainingSeparators.length === 0) {
      // Nothing left to split on: emit as is, with the same trim / non-empty
      // handling that merged chunks receive.
      const doc = options.trim ? piece.trim() : piece;
      if (doc) {
        yield doc;
      }
    } else {
      yield* recursiveSplitString(piece, { ...options, separators: remainingSeparators });
    }
  }

  if (smallPieces.length > 0) {
    yield* mergeStrings(smallPieces, separator, options);
  }
}