Skip to content

Commit

Permalink
feat(internals): add recursiveSplitString and mergeStrings utility functions
Browse files Browse the repository at this point in the history

Signed-off-by: Tomas Dvorak <toomas2d@gmail.com>
  • Loading branch information
Tomas2D committed Nov 19, 2024
1 parent 23656bd commit d82be6e
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 22 deletions.
42 changes: 25 additions & 17 deletions src/internals/helpers/string.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,31 @@ import { findFirstPair, splitString } from "./string.js";
import { ValueError } from "@/errors.js";

describe("String Utilities", () => {
it("splitString", () => {
const text =
"00 01 02 03 04 05 06 07 08 09 " +
"10 11 12 13 14 15 16 17 18 19 " +
"20 21 22 23 24 25 26 27 28 29 " +
"30 31 32 33 34 35 36 37 38 39 ";
const chunks = [...splitString(text, { size: 30, overlap: 15 })];
expect(chunks).toEqual([
"00 01 02 03 04 05 06 07 08 09 ",
"05 06 07 08 09 10 11 12 13 14 ",
"10 11 12 13 14 15 16 17 18 19 ",
"15 16 17 18 19 20 21 22 23 24 ",
"20 21 22 23 24 25 26 27 28 29 ",
"25 26 27 28 29 30 31 32 33 34 ",
"30 31 32 33 34 35 36 37 38 39 ",
"35 36 37 38 39 ",
]);
// Exercises splitString on a fixed-width token grid and on free-form prose.
describe("splitString", () => {
  it("Works", () => {
    // Four rows of ten two-digit tokens; every row is exactly 30 characters long.
    const rows = [
      "00 01 02 03 04 05 06 07 08 09 ",
      "10 11 12 13 14 15 16 17 18 19 ",
      "20 21 22 23 24 25 26 27 28 29 ",
      "30 31 32 33 34 35 36 37 38 39 ",
    ];
    const text = rows.join("");

    // Each expected chunk is 30 characters and starts 15 characters after the previous one.
    const expected = [
      "00 01 02 03 04 05 06 07 08 09 ",
      "05 06 07 08 09 10 11 12 13 14 ",
      "10 11 12 13 14 15 16 17 18 19 ",
      "15 16 17 18 19 20 21 22 23 24 ",
      "20 21 22 23 24 25 26 27 28 29 ",
      "25 26 27 28 29 30 31 32 33 34 ",
      "30 31 32 33 34 35 36 37 38 39 ",
    ];

    const chunks = Array.from(splitString(text, { size: 30, overlap: 15, trim: false }));
    expect(chunks).toEqual(expected);
  });

  it("Handles edge-cases", () => {
    const text = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur ac viverra dolor, eu fringilla magna.`;
    // Chunks are joined with "|" so the boundaries are visible in the snapshot.
    const pieces = [...splitString(text, { size: 35, overlap: 10 })];
    const output = pieces.join("|");
    expect(output).toMatchInlineSnapshot(
      `"Lorem ipsum dolor sit amet, consect|t, consectetur adipiscing elit. Cur| elit. Curabitur ac viverra dolor, |ra dolor, eu fringilla magna."`,
    );
  });
});

describe("findFirstPair", () => {
Expand Down
84 changes: 79 additions & 5 deletions src/internals/helpers/string.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,90 @@
import { ValueOf } from "@/internals/types.js";
import * as R from "remeda";
import { ValueError } from "@/errors.js";
import { unique } from "remeda";

// NOTE(review): the next three lines look like the pre-commit signature kept by the diff view; the
// signature after them is the one paired with the body below — confirm against the repository.
export function* splitString<T extends string>(
text: T,
options: { size: number; overlap: number },
/**
 * Yields chunks of `text` by delegating to `recursiveSplitString` with an empty `separators` list.
 *
 * @param text - The string to split.
 * @param options - `size` and `overlap` are forwarded unchanged; `trim` is forwarded as `false`
 *   when the caller leaves it unset.
 */
export function* splitString(
text: string,
options: { size: number; overlap: number; trim?: boolean },
) {
// The explicit `trim` entry comes after the spread, so it overrides an `undefined` value from `options`.
yield* recursiveSplitString(text, { ...options, trim: options?.trim ?? false, separators: [] });
}

/**
 * Greedily packs consecutive `chunks` into documents joined by `sep`.
 *
 * A document is emitted once adding the next chunk would push the buffered text past
 * `options.size`. After emitting, chunks are dropped from the front of the buffer until the
 * retained character count is at most `options.overlap` and the next chunk fits (or the buffer
 * holds no characters), so the tail of one document can reappear at the head of the next.
 *
 * @param chunks - Pieces to merge, in order.
 * @param sep - Separator inserted between pieces when a document is assembled.
 * @param options - `size` is the length budget per document, `overlap` the number of characters
 *   that may be carried over, and `trim` whether each document is trimmed before being yielded.
 * @returns A generator of non-empty documents; a single chunk longer than `size` is yielded as is.
 */
export function* mergeStrings(
  chunks: string[],
  sep: string,
  options: { size: number; overlap: number; trim?: boolean },
) {
  const pending: string[] = [];
  // Sum of the lengths of the buffered pieces, separators excluded.
  let pendingLength = 0;

  // Assembles the buffered pieces into one document, trimming it when requested.
  const render = (): string => {
    const joined = pending.join(sep);
    if (options.trim) {
      return joined.trim();
    }
    return joined;
  };

  // True when appending `next` (plus the separators counted so far) would exceed the budget.
  const overflowsWith = (next: string): boolean =>
    pendingLength + next.length + pending.length * sep.length > options.size;

  for (const piece of chunks) {
    if (overflowsWith(piece) && pending.length > 0) {
      const ready = render();
      if (ready) {
        yield ready;
      }

      // Shrink the buffer from the front: first down to the allowed overlap, then further
      // if the incoming piece still would not fit.
      for (;;) {
        const aboveOverlap = pendingLength > options.overlap;
        const stillTooLong = pendingLength > 0 && overflowsWith(piece);
        if (!aboveOverlap && !stillTooLong) {
          break;
        }
        const dropped = pending.shift()!;
        pendingLength -= dropped.length;
      }
    }

    pending.push(piece);
    pendingLength += piece.length;
  }

  // Whatever is left in the buffer forms the final document.
  const tail = render();
  if (tail) {
    yield tail;
  }
}

/**
 * Splits `text` into chunks using the first entry of `options.separators`, falling back to the
 * remaining separators for pieces that are still too long.
 *
 * An empty-string separator is always appended to the separator list, and duplicates are removed
 * with remeda's `unique`.
 *
 * @param text - The string to split.
 * @param options - `size` and `overlap` are validated below and passed on to `mergeStrings`
 *   together with `trim`; `separators` is the ordered list of separators to try.
 * @throws Error when `size <= 0`, `overlap < 0`, or `overlap >= size`.
 */
export function* recursiveSplitString(
text: string,
options: { size: number; overlap: number; separators: string[]; trim?: boolean },
): Generator<string> {
if (options.size <= 0 || options.overlap < 0) {
throw new Error("size must be positive and overlap must be non-negative");
}
if (options.overlap >= options.size) {
throw new Error("overlap must be less than size");
}
// NOTE(review): the next two lines open a fixed-step `for` loop that is never closed in this view;
// they look like removed lines from a diff rendering — confirm against the repository before
// relying on them.
for (let i = 0; i < text.length; i += options.size - options.overlap) {
yield text.slice(i, i + options.size);

// Pieces shorter than `size`, buffered until they are merged by `mergeStrings`.
const goodSplits: string[] = [];
// Take the first separator for this level; the rest are kept for recursive calls.
const [separator, ...remainingSeparators] = unique([...(options.separators ?? []), ""]);

// `filter(Boolean)` drops the empty strings that `split` can produce.
for (const chunk of text.split(separator).filter(Boolean)) {
if (chunk.length < options.size) {
goodSplits.push(chunk);
continue;
}

// This piece is at least `size` long: flush the buffered pieces first so output order is kept.
if (goodSplits.length > 0) {
yield* mergeStrings(goodSplits, separator, options);
goodSplits.length = 0;
}

if (remainingSeparators.length === 0) {
// No separator left to try, so the piece is yielded as is.
yield chunk;
} else {
yield* recursiveSplitString(chunk, { ...options, separators: remainingSeparators });
}
}

// Flush whatever is still buffered after the last piece.
if (goodSplits.length > 0) {
yield* mergeStrings(goodSplits, separator, options);
}
}

Expand Down

0 comments on commit d82be6e

Please sign in to comment.