-
Notifications
You must be signed in to change notification settings - Fork 1
/
text-splitter.lisp
142 lines (124 loc) · 6.47 KB
/
text-splitter.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
;;; text-splitter.lisp
;;;
;;; SPDX-License-Identifier: MIT
;;;
;;; Copyright (C) 2024 Anthony Green <green@moxielogic.com>
;;;
;;; Permission is hereby granted, free of charge, to any person obtaining a copy
;;; of this software and associated documentation files (the "Software"), to deal
;;; in the Software without restriction, including without limitation the rights
;;; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
;;; copies of the Software, and to permit persons to whom the Software is
;;; furnished to do so, subject to the following conditions:
;;;
;;; The above copyright notice and this permission notice shall be included in all
;;; copies or substantial portions of the Software.
;;;
;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
;;; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
;;; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
;;; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
;;; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
;;; SOFTWARE.
;;;
(in-package :text-splitter)
(defparameter +default-size+ 5000)
(defparameter +default-overlap+ 200)
(defclass document ()
((text :initarg :text)))
(defclass plaintext-document (document)
())
(defclass markdown-document (plaintext-document)
())
(defclass html-document (plaintext-document)
())
(defclass org-mode-document (plaintext-document)
())
(defclass pdf-document (plaintext-document)
())
(defun detect-document-type (filename)
"Detects the document type based on the file extension."
(let ((extension (pathname-type filename)))
(cond
((string-equal extension "txt") 'plaintext-document)
((string-equal extension "md") 'markdown-document)
((string-equal extension "html") 'html-document)
((string-equal extension "org") 'org-mode-document)
((string-equal extension "pdf") 'pdf-document)
(t 'document))))
(defun make-document-from-file (filename)
"Creates an instance of a document class based on the file's type."
(let* ((document-type (detect-document-type filename)))
(cond
((eq document-type 'pdf-document)
(let ((text-content (uiop:run-program `("pdftotext" "-layout" "-enc" "UTF-8" ,filename "-")
:output '(:string))))
(make-instance 'pdf-document :text text-content)))
(t
(let ((text-content (with-open-file (stream filename)
(let ((content (make-string (file-length stream))))
(read-sequence content stream)
content))))
(make-instance document-type :text text-content))))))
(defun merge-adjacent-strings (strings x)
"Merges adjacent strings in STRINGS if their combined length is <= X,
aiming for the shortest list."
(loop with result = (list (first strings))
for s in (cdr strings)
do (if (<= (+ (length (car result)) (length s)) x)
(setf (car result) (concatenate 'string (car result) s))
(push s result))
finally (return (nreverse result))))
(defun add-overlaps (strings overlap)
(let* ((count (length strings))
(sa (make-array count :initial-contents strings)))
(loop for i from 0 below count
collect (format nil "~A~A~A"
(if (> i 0)
(subseq (aref sa (1- i)) (max (- (length (aref sa (1- i))) overlap) 0))
"")
(aref sa i)
(if (< i (1- count))
(subseq (aref sa (1+ i)) 0 (min (length (aref sa (1+ i))) overlap))
"")))))
(defmethod split-internal (doc delimiters size overlap)
"Split a DOC up into a list of strings around SIZE big and
overlapping by OVERLAP characters on either end."
(let ((usize (- size (* overlap 2))))
(labels ((%split (text delimiters)
(if (> (length text) usize)
(if delimiters
(let* ((matches (cl-ppcre:all-matches-as-strings (car delimiters) text))
(splits (cl-ppcre:split (car delimiters) text)))
(if matches
(mapcar (lambda (txt)
(%split txt (cdr delimiters)))
(mapcar (lambda (a b) (concatenate 'string a b)) splits matches))
(%split text (cdr delimiters))))
(loop for i from 0 below (length text) by usize
collect (subseq text i (min (+ i usize) (length text)))))
text)))
(let ((small-chunks
(alexandria:flatten
(%split (slot-value doc 'text) delimiters))))
(let ((strings (merge-adjacent-strings small-chunks usize)))
(add-overlaps strings overlap))))))
(defmethod split ((doc document) &key (size +default-size+) (overlap +default-overlap+))
nil)
(defmethod split ((doc plaintext-document) &key (size +default-size+) (overlap +default-overlap+))
"Split a plaintext DOC up into a list of strings around SIZE big and
overlapping by OVERLAP characters on either end."
(split-internal doc `(,(format nil "~A" #\Page) "\\n\\n" "[.!]" "\\n" ",:=" "[ \\t]") size overlap))
(defmethod split ((doc markdown-document) &key (size +default-size+) (overlap +default-overlap+))
"Split a markdown DOC up into a list of strings around SIZE big and
overlapping by OVERLAP characters on either end."
(split-internal doc '("^# " "^## " "^### " "^#### " "^##### " "^###### " "\\n\\n" "[.!]" "\\n" ",:=" "[ \\t]") size overlap))
(defmethod split ((doc org-mode-document) &key (size +default-size+) (overlap +default-overlap+))
"Split an org-mode DOC up into a list of strings around SIZE big and
overlapping by OVERLAP characters on either end."
(split-internal doc '("^\* " "^\*\* " "^\*\*\* " "^\*\*\*\* " "^\*\*\*\*\* " "^\*\*\*\*\*\* " "\\n\\n" "[.!]" "\\n" ",:=" "[ \\t]") size overlap))
(defmethod split ((doc html-document) &key (size +default-size+) (overlap +default-overlap+))
"Split an HTML DOC up into a list of strings around SIZE big and
overlapping by OVERLAP characters on either end."
(split-internal doc '("<h1" "<h2" "<h3" "<h4" "<h5" "<h6" "<div" "<p" "<table" "<[uo]l" "[.!]" "\\n" ",:=" "[ \\t]") size overlap))