Skip to content

Commit

Permalink
fix the tests
Browse files Browse the repository at this point in the history
  • Loading branch information
davidkyle committed Dec 11, 2024
1 parent 78ea3e7 commit 71c1012
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,6 @@ List<ChunkPosition> chunkPositions(String input, int chunkSize, int overlap) {
throw new IllegalArgumentException("Invalid chunking parameters, overlap [" + overlap + "] must be >= 0");
}

if (input.isEmpty()) {
return List.of();
}

var chunkPositions = new ArrayList<ChunkPosition>();

// This position in the chunk is where the next overlapping chunk will start
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,37 +44,37 @@ private List<String> textChunks(
}

// Checks that chunking the empty string yields exactly one chunk and that this chunk is "".
public void testEmptyString() {
// NOTE(review): this text is a rendered diff whose +/- markers were lost, which is why
// `chunks` is assigned twice. Presumably the first assignment (direct chunk() call) is the
// removed line and the second (via the textChunks helper) is its replacement - confirm
// against the commit.
var chunks = new SentenceBoundaryChunker().chunk("", 100, randomBoolean());
var chunks = textChunks(new SentenceBoundaryChunker(), "", 100, randomBoolean());
assertThat(chunks, hasSize(1));
assertThat(chunks.get(0), Matchers.is(""));
}

// Checks that a whitespace-only input yields exactly one chunk equal to the input itself.
public void testBlankString() {
// NOTE(review): rendered diff with the +/- markers lost; presumably the first assignment is
// the removed line and the second (via textChunks) is its replacement - confirm.
var chunks = new SentenceBoundaryChunker().chunk(" ", 100, randomBoolean());
var chunks = textChunks(new SentenceBoundaryChunker(), " ", 100, randomBoolean());
assertThat(chunks, hasSize(1));
assertThat(chunks.get(0), Matchers.is(" "));
}

// Checks a series of very short inputs (a single letter or punctuation mark, with or without
// surrounding whitespace): each must come back as exactly one chunk identical to the input.
public void testSingleChar() {
// NOTE(review): rendered diff with the +/- markers lost. Each pair of adjacent assignments
// below is presumably "removed line, then replacement line"; the replacement goes through
// the textChunks helper instead of calling chunk() directly - confirm against the commit.
// The third argument is randomized on every call.
var chunks = new SentenceBoundaryChunker().chunk(" b", 100, randomBoolean());
var chunks = textChunks(new SentenceBoundaryChunker(), " b", 100, randomBoolean());
// Matchers.contains passes only when the list holds exactly the given item(s), in order.
assertThat(chunks, Matchers.contains(" b"));

chunks = new SentenceBoundaryChunker().chunk("b", 100, randomBoolean());
chunks = textChunks(new SentenceBoundaryChunker(), "b", 100, randomBoolean());
assertThat(chunks, Matchers.contains("b"));

chunks = new SentenceBoundaryChunker().chunk(". ", 100, randomBoolean());
chunks = textChunks(new SentenceBoundaryChunker(), ". ", 100, randomBoolean());
assertThat(chunks, Matchers.contains(". "));

chunks = new SentenceBoundaryChunker().chunk(" , ", 100, randomBoolean());
chunks = textChunks(new SentenceBoundaryChunker(), " , ", 100, randomBoolean());
assertThat(chunks, Matchers.contains(" , "));

chunks = new SentenceBoundaryChunker().chunk(" ,", 100, randomBoolean());
chunks = textChunks(new SentenceBoundaryChunker(), " ,", 100, randomBoolean());
assertThat(chunks, Matchers.contains(" ,"));
}

// Checks that a 32,000-character input made of one repeated letter is returned as a single
// chunk equal to the whole input, even though 100 is passed as the size argument.
public void testSingleCharRepeated() {
// The input contains no whitespace or punctuation; presumably that leaves the chunker no
// boundary to split on - verify against SentenceBoundaryChunker.
var input = "a".repeat(32_000);
// NOTE(review): rendered diff with the +/- markers lost; presumably the first assignment is
// the removed line and the second (via textChunks) is its replacement - confirm.
var chunks = new SentenceBoundaryChunker().chunk(input, 100, randomBoolean());
var chunks = textChunks(new SentenceBoundaryChunker(), input, 100, randomBoolean());
assertThat(chunks, Matchers.contains(input));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,6 @@ public class WordBoundaryChunkerTests extends ESTestCase {
* Use the chunk functions that return offsets where possible
*/
List<String> textChunks(WordBoundaryChunker chunker, String input, int chunkSize, int overlap) {
// Special case: an empty input is reported as a single empty-string chunk without calling
// the chunker at all.
// NOTE(review): the hunk header (-72,10 +72,6) suggests these four lines are the ones this
// commit deletes; the diff markers were lost in extraction - confirm.
if (input.isEmpty()) {
return List.of("");
}

// The chunker returns positions (start/end offsets into the input), not text.
var chunkPositions = chunker.chunk(input, chunkSize, overlap);
// Turn each position back into text by slicing the original input with its offsets.
return chunkPositions.stream().map(p -> input.substring(p.start(), p.end())).collect(Collectors.toList());
}
Expand Down Expand Up @@ -242,31 +238,31 @@ public void testWhitespace() {
}

// Checks that a whitespace-only input, chunked with chunk size 100 and overlap 10, yields
// exactly one chunk equal to the input itself.
public void testBlankString() {
// NOTE(review): rendered diff with the +/- markers lost; presumably the first assignment is
// the removed line and the second (via textChunks) is its replacement - confirm.
var chunks = new WordBoundaryChunker().chunk(" ", 100, 10);
var chunks = textChunks(new WordBoundaryChunker(), " ", 100, 10);
assertThat(chunks, hasSize(1));
assertThat(chunks.get(0), Matchers.is(" "));
}

// Checks a series of very short inputs (a single letter or punctuation mark, with or without
// surrounding whitespace) using chunk size 100 and overlap 10: each must come back as exactly
// one chunk identical to the input.
public void testSingleChar() {
// NOTE(review): rendered diff with the +/- markers lost. Each pair of adjacent assignments
// below is presumably "removed line, then replacement line"; the replacement goes through
// the textChunks helper instead of calling chunk() directly - confirm against the commit.
var chunks = new WordBoundaryChunker().chunk(" b", 100, 10);
var chunks = textChunks(new WordBoundaryChunker(), " b", 100, 10);
// Matchers.contains passes only when the list holds exactly the given item(s), in order.
assertThat(chunks, Matchers.contains(" b"));

chunks = new WordBoundaryChunker().chunk("b", 100, 10);
chunks = textChunks(new WordBoundaryChunker(), "b", 100, 10);
assertThat(chunks, Matchers.contains("b"));

chunks = new WordBoundaryChunker().chunk(". ", 100, 10);
chunks = textChunks(new WordBoundaryChunker(), ". ", 100, 10);
assertThat(chunks, Matchers.contains(". "));

chunks = new WordBoundaryChunker().chunk(" , ", 100, 10);
chunks = textChunks(new WordBoundaryChunker(), " , ", 100, 10);
assertThat(chunks, Matchers.contains(" , "));

chunks = new WordBoundaryChunker().chunk(" ,", 100, 10);
chunks = textChunks(new WordBoundaryChunker(), " ,", 100, 10);
assertThat(chunks, Matchers.contains(" ,"));
}

// Checks that a 32,000-character input made of one repeated letter is returned as a single
// chunk equal to the whole input, even with chunk size 100 and overlap 10.
public void testSingleCharRepeated() {
// The input contains no whitespace or punctuation; presumably that leaves the chunker no
// word boundary to split on - verify against WordBoundaryChunker.
var input = "a".repeat(32_000);
// NOTE(review): rendered diff with the +/- markers lost; presumably the first assignment is
// the removed line and the second (via textChunks) is its replacement - confirm.
var chunks = new WordBoundaryChunker().chunk(input, 100, 10);
var chunks = textChunks(new WordBoundaryChunker(), input, 100, 10);
assertThat(chunks, Matchers.contains(input));
}

Expand Down

0 comments on commit 71c1012

Please sign in to comment.