main.go

package chunker

import (
	"fmt"
	"regexp"
	"sort"
	"strings"
)

// Chunker splits text into chunks of bounded size, cutting at separators
// where possible and overlapping consecutive chunks. NewChunker fills in
// defaults for unusable settings.
type Chunker struct {
	// ChunkSize is the maximum size of a chunk. Chunk uses it as a length
	// when slicing the input string, so it is measured in bytes.
	ChunkSize            int
	// Overlap is how far Chunk steps back from the end of one chunk before
	// starting the next one.
	Overlap              int
	// Separators are the strings Chunk searches for when choosing where a
	// chunk may start and end.
	Separators           []string
	// OutputWithoutNewline makes addChunk pass every chunk through
	// removeNewlineInChunk before storing it.
	OutputWithoutNewline bool
	// Debug makes Chunk print its current position and chunk count to
	// standard output on every loop iteration.
	Debug                bool
	// chunks holds the chunks collected so far; it is reset by ClearChunks
	// and exposed through GetChunks.
	chunks []string
}

var (
	// DefaultSeparators is the separator list NewChunker uses when it is
	// called with an empty separators slice.
	DefaultSeparators = []string{"\n\n", " ", "\n"}
)

var (
	// sentenceBoundaryRe matches a sentence terminator (".", "?" or "!")
	// followed by at least one whitespace character. It is compiled once at
	// package initialisation instead of on every ChunkSentences call.
	sentenceBoundaryRe = regexp.MustCompile(`([.?!])\s+`)

	// sentenceAbbreviations lists common abbreviations whose trailing period
	// must not be treated as the end of a sentence.
	sentenceAbbreviations = []string{"Mr.", "Mrs.", "Dr.", "Ms.", "Jr.", "Sr.", "Prof.", "St."}
)

// endsWithAbbreviation reports whether text ends with one of the known
// abbreviations in sentenceAbbreviations.
func endsWithAbbreviation(text string) bool {
	for _, abbr := range sentenceAbbreviations {
		if strings.HasSuffix(text, abbr) {
			return true
		}
	}
	return false
}

// ChunkSentences splits the input text into individual sentences.
//
// A sentence ends at a period, exclamation mark or question mark that is
// followed by whitespace. A period that belongs to a common abbreviation
// (such as "Mr." or "Mrs.") does not end a sentence. Text after the last
// boundary is returned as a final sentence.
//
// The data parameter is the text to be split into sentences.
// Returns a slice of strings, where each string is one sentence with the
// surrounding whitespace removed. Empty sentences are never returned, so
// empty or whitespace-only input yields an empty result.
func ChunkSentences(data string) []string {
	var sentences []string

	start := 0
	for _, match := range sentenceBoundaryRe.FindAllStringIndex(data, -1) {
		// match[0] is the index of the punctuation mark and match[1] is the
		// index just past the whitespace that follows it. The candidate
		// sentence runs from start up to and including the punctuation mark.
		text := strings.TrimSpace(data[start : match[0]+1])

		// Keep accumulating when the period belongs to an abbreviation: the
		// sentence continues until the next real boundary.
		if endsWithAbbreviation(text) {
			continue
		}

		sentences = append(sentences, text)
		start = match[1]
	}

	// Add any remaining text as the last sentence, unless it is only
	// whitespace.
	if rest := strings.TrimSpace(data[start:]); rest != "" {
		sentences = append(sentences, rest)
	}

	return sentences
}

// NewChunker builds a Chunker from the given settings, replacing unusable
// values with defaults.
//
//   - chunkSize is the maximum size of each chunk; values <= 0 become 150.
//   - overlap is the overlap between consecutive chunks; values <= 0 become
//     30, and a value that is not smaller than the effective chunk size
//     becomes a quarter of that chunk size (integer division).
//   - separators are the strings used to split the text; an empty slice
//     selects DefaultSeparators.
//   - outputWithoutNewline is stored as OutputWithoutNewline.
//   - debug is stored as Debug.
//
// It returns a pointer to the newly created Chunker.
func NewChunker(chunkSize, overlap int, separators []string, outputWithoutNewline, debug bool) *Chunker {
	const (
		fallbackChunkSize = 150
		fallbackOverlap   = 30
	)

	size := chunkSize
	if size < 1 {
		size = fallbackChunkSize
	}

	lap := overlap
	if lap < 1 {
		lap = fallbackOverlap
	}
	// The overlap must stay below the chunk size.
	if lap >= size {
		lap = size / 4
	}

	seps := separators
	if len(seps) == 0 {
		seps = DefaultSeparators
	}

	chunker := new(Chunker)
	chunker.ChunkSize = size
	chunker.Overlap = lap
	chunker.Separators = seps
	chunker.OutputWithoutNewline = outputWithoutNewline
	chunker.Debug = debug
	return chunker
}

// Chunk splits data into chunks of at most ChunkSize bytes, preferring to cut
// at the configured separators, and lets consecutive chunks overlap by up to
// Overlap bytes.
//
// Any chunks from a previous call are discarded first. Each chunk is stored
// through addChunk, so chunks that are empty after trimming are dropped.
//
// The exported fields may hold values NewChunker would never produce, so two
// guards apply: a ChunkSize <= 0 returns data as a single chunk, and a
// negative Overlap is treated as 0.
//
// The data parameter is the input string to be chunked.
// Returns a slice of strings representing the chunked data.
func (c *Chunker) Chunk(data string) []string {
	c.ClearChunks()

	// Without a positive chunk size there is no window to slide over the
	// data; slicing with it would panic or never advance.
	if c.ChunkSize <= 0 {
		c.addChunk(data)
		return c.GetChunks()
	}

	// A negative overlap would push the position forward past unread data.
	overlap := c.Overlap
	if overlap < 0 {
		overlap = 0
	}

	i := 0
	// firstChunk marks the first iteration explicitly. Using i == 0 for this
	// is wrong because the position after the first chunk can legitimately
	// be computed as 0, which would repeat the first chunk forever.
	firstChunk := true
	for {
		if c.Debug {
			fmt.Println("i: ", i, "len(data): ", len(data))
			fmt.Println("chunks: ", len(c.chunks))
		}

		if firstChunk {
			firstChunk = false

			// Everything fits into a single chunk.
			if len(data) < c.ChunkSize {
				c.addChunk(data)
				break
			}

			// Cut the first window at its last separator (or at the window
			// end when it contains none).
			window := data[:c.ChunkSize]
			lastSeparator, ss := findLastSeparator(window, c.Separators, 0)
			c.addChunk(window[:lastSeparator])

			// Step back by the overlap for the next chunk, but never to a
			// position <= 0: that would hang the loop (0) or slice with a
			// negative index (< 0). In that case continue right after the
			// separator without overlap.
			next := lastSeparator + ss - overlap
			if next <= 0 {
				next = lastSeparator + ss
			}
			i = next
			continue
		}

		// Less than a full window is left: emit the tail and stop.
		if len(data)-i < c.ChunkSize {
			tail := data[i:]
			firstSeparator := findFirstSeparator(tail, c.Separators)
			// Only skip ahead to the first separator when it lies inside the
			// overlap region; otherwise keep the tail from its first byte.
			if firstSeparator >= overlap {
				firstSeparator = 0
			}

			c.addChunk(tail[firstSeparator:])
			break
		}

		window := data[i : i+c.ChunkSize]
		firstSeparator := findFirstSeparator(window, c.Separators)
		if firstSeparator >= overlap {
			firstSeparator = 0
		}
		lastSeparator, ss := findLastSeparator(window, c.Separators, firstSeparator)

		chunk := window[firstSeparator:lastSeparator]
		c.addChunk(chunk)

		// Advance past the emitted text and its separator. The overlap is
		// only subtracted when the chunk is longer than the overlap, which
		// keeps the step positive: findLastSeparator returns either a
		// position greater than firstSeparator or the window length, and
		// both are at least 1 here.
		step := lastSeparator + ss
		if len(chunk) > overlap {
			step -= overlap
		}
		i += step
	}

	return c.GetChunks()
}

// ClearChunks discards every chunk currently held by the Chunker and leaves
// it with an empty, non-nil chunk list.
//
// It takes no parameters and returns nothing.
func (c *Chunker) ClearChunks() {
	c.chunks = []string{}
}

// GetChunkSize returns the configured ChunkSize, i.e. the size limit applied
// to each chunk. It does not return the number of chunks produced.
//
// No parameters.
// Return value: int
func (c *Chunker) GetChunkSize() int {
	return c.ChunkSize
}

// GetChunks returns the chunks of text currently held by the Chunker.
//
// No parameters.
// Returns a slice of strings representing the chunks of text. The returned
// slice is the Chunker's internal slice, not a copy.
func (c *Chunker) GetChunks() []string {
	return c.chunks
}

// addChunk normalises chunk and appends it to the collected chunks.
//
// When OutputWithoutNewline is set, the chunk first goes through
// removeNewlineInChunk. Surrounding whitespace is then trimmed, and a chunk
// that ends up empty is dropped instead of stored.
func (c *Chunker) addChunk(chunk string) {
	cleaned := chunk
	if c.OutputWithoutNewline {
		cleaned = removeNewlineInChunk(cleaned)
	}

	if cleaned = strings.TrimSpace(cleaned); cleaned != "" {
		c.chunks = append(c.chunks, cleaned)
	}
}

// findFirstSeparator returns the offset just past the earliest separator in
// chunk, i.e. the index at which the text following that separator starts.
//
// Only the first occurrence of each separator is considered. When several
// separators start at the same position, the one listed first in separators
// wins; the sort is stable so the result is deterministic. It returns 0 when
// none of the separators occurs in chunk.
func findFirstSeparator(chunk string, separators []string) (offset int) {
	type hit struct{ pos, size int }

	var hits []hit
	for _, sp := range separators {
		// strings.Index already reports -1 for separators longer than chunk.
		if idx := strings.Index(chunk, sp); idx != -1 {
			hits = append(hits, hit{pos: idx, size: len(sp)})
		}
	}
	if len(hits) == 0 {
		return 0
	}

	// Earliest position first; ties keep the order of the separators list.
	sort.SliceStable(hits, func(i, j int) bool {
		return hits[i].pos < hits[j].pos
	})

	return hits[0].pos + hits[0].size
}

// findLastSeparator locates the last separator in chunk that starts after
// index from.
//
// Only the last occurrence of each separator is considered, and it counts
// only when its start index is strictly greater than from. It returns the
// start index of the right-most such separator together with that
// separator's length. When several separators share that position, the one
// listed first in separators wins; the sort is stable so the result is
// deterministic. When no separator qualifies it returns len(chunk) and a
// separator size of 0.
func findLastSeparator(chunk string, separators []string, from int) (offset, separatorSize int) {
	type hit struct{ pos, size int }

	var hits []hit
	for _, sp := range separators {
		// strings.LastIndex already reports -1 for separators longer than
		// chunk. The explicit -1 test matters when from is negative.
		if idx := strings.LastIndex(chunk, sp); idx != -1 && idx > from {
			hits = append(hits, hit{pos: idx, size: len(sp)})
		}
	}

	if len(hits) == 0 {
		return len(chunk), 0
	}

	// Right-most position first; ties keep the order of the separators list.
	sort.SliceStable(hits, func(i, j int) bool {
		return hits[i].pos > hits[j].pos
	})

	return hits[0].pos, hits[0].size
}

// removeNewlineInChunk returns chunk with its newlines removed or replaced
// by spaces.
//
// One leading and one trailing "\n" are dropped. Every remaining newline is
// turned into a space, and a newline directly next to a space is removed
// rather than replaced so that it does not add a second space. The
// replacements run one after another, in the order written below.
func removeNewlineInChunk(chunk string) string {
	// Remove a single \n from the beginning and from the end of the chunk.
	// TrimPrefix/TrimSuffix are safe on short input: a chunk consisting of
	// just "\n" becomes empty instead of indexing out of range.
	chunk = strings.TrimPrefix(chunk, "\n")
	chunk = strings.TrimSuffix(chunk, "\n")

	// Remove \n in the middle of the chunk: drop it when a space is adjacent,
	// otherwise replace it with a space.
	chunk = strings.ReplaceAll(chunk, "\n ", " ")
	chunk = strings.ReplaceAll(chunk, " \n", " ")
	chunk = strings.ReplaceAll(chunk, "\n", " ")

	return chunk
}