tokenizers.pas
unit Tokenizers;
{$mode Delphi}
interface
uses Classes, SysUtils, CharReaders, Recognizers;
type
TRowCol = record
Row: Word;
Col: Word;
end;
TPosition = record
Start: TRowCol;
Finish: TRowCol;
end;
TToken = record
Text: String;
// < 0: the token was not matched by any recognizer
// >= 0: index of the recognizer that matched it
Kind: Integer;
Position: TPosition;
end;
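
{ Reads characters through a TPushBackCharReader, feeds the growing buffer to
  every recognizer, and emits the longest positive match, pushing any unused
  characters back into the reader. }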
TTokenizer = class
private
FReader: TPushBackCharReader;
FRecognizers: TTokenRecognizers;
FRowCol: TRowCol;
public
constructor Create(Reader: TPushBackCharReader; Recognizers: TTokenRecognizers);
destructor Destroy; override;
function Read: TToken;
end;
PTokenNode = ^TTokenNode;
TTokenNode = record
Data: TToken;
Next: PTokenNode;
end;
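
{ Singly linked list of tokens, used as a LIFO stack by TUndoTokenizer. }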
TTokenLinkedList = class
private
FHead: PTokenNode;
public
constructor Create;
destructor Destroy; override;
function IsEmpty: Boolean;
function Pop: TToken;
procedure Push(Token: TToken);
procedure Append(List: TTokenLinkedList);
end;
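
{ Wraps a TTokenizer and keeps a stack of tokens that were "undone",
  so they are returned again by subsequent calls to Read. }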
TUndoTokenizer = class
private
FTokenizer: TTokenizer;
FBuffer: TTokenLinkedList;
public
constructor Create(Tokenizer: TTokenizer);
destructor Destroy; override;
function Read: TToken;
procedure Undo(Token: TToken); overload;
procedure Undo(var TokenList: TTokenLinkedList); overload;
end;
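
{ Convenience factory: wires up the push-back char reader, the tokenizer and
  the undo wrapper in one call. Minimal usage sketch (MyRecognizers is a
  hypothetical TTokenRecognizers value built elsewhere from the Recognizers
  unit):

    Tokenizer := CreateUndoTokenizer(Stream, MyRecognizers);
    repeat
      Token := Tokenizer.Read;
      // ... handle Token ...
    until Token.Kind < 0;
    Tokenizer.Free; // also frees the inner tokenizer, reader and recognizers
}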
function CreateUndoTokenizer(Stream: TStream; Recognizers: TTokenRecognizers): TUndoTokenizer;
implementation
procedure IncRow(var RowCol: TRowCol);
begin
RowCol.Row := RowCol.Row + 1;
RowCol.Col := 1;
end;
procedure IncCol(var RowCol: TRowCol);
begin
RowCol.Col := RowCol.Col + 1;
end;
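
{ Tracks the most recent TRecognition returned by each recognizer for the
  buffer currently being built up in TTokenizer.Read. }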
type
TRecognizerResponses = class
private
FResponses: array of TRecognition;
public
constructor Create(Length: Integer);
function GetLastResponse(Index: Integer): TRecognition;
procedure SetLastResponse(Index: Integer; Recognition: TRecognition);
{ How many recognizers last responded with the given recognition value }
function CountByRecognition(Recognition: TRecognition): Integer;
end;
constructor TRecognizerResponses.Create(Length: Integer);
var
i: Integer;
begin
SetLength(FResponses, Length);
for i := 0 to Length - 1 do
FResponses[i] := rPartial;
end;
function TRecognizerResponses.GetLastResponse(Index: Integer): TRecognition;
begin
Result := FResponses[Index];
end;
procedure TRecognizerResponses.SetLastResponse(Index: Integer; Recognition: TRecognition);
begin
FResponses[Index] := Recognition;
end;
function TRecognizerResponses.CountByRecognition(Recognition: TRecognition): Integer;
var
i: Integer;
sum: Integer;
begin
sum := 0;
for i := 0 to Length(FResponses) - 1 do
if FResponses[i] = Recognition then Inc(sum);
Result := sum;
end;

(* Tokenizer *)

constructor TTokenizer.Create(Reader: TPushBackCharReader; Recognizers: TTokenRecognizers);
begin
FReader := Reader;
FRecognizers := Recognizers;
with FRowCol do
begin
Row := 1;
Col := 1;
end;
end;
destructor TTokenizer.Destroy;
var
i: Integer;
begin
FReader.Free;
for i := Low(FRecognizers) to High(FRecognizers) do
FRecognizers[i].Free;
inherited Destroy;
end;
function TTokenizer.Read: TToken;
var
Buffer: String;
Next: TOptChar;
NoMatchOrEof: Boolean;
i: Integer;
RecognizerResponses: TRecognizerResponses;
LastResponse, Recognition: TRecognition;
Sizes: array of Integer;
MaxPositiveSize: Integer;
MaxPositiveIndex: Integer;
begin
Buffer := '';
NoMatchOrEof := False;
// initialize recognizer responses
// prime all with "partial"
RecognizerResponses := TRecognizerResponses.Create(Length(FRecognizers));
SetLength(Sizes, Length(FRecognizers));
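// grow the buffer one character at a time until every recognizer has
// answered negative or we reach the end of the input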
repeat
Next := FReader.Read;
if Next.HasValue then
begin
{ add the character we just read to the buffer }
Buffer := Buffer + Next.Value;
{ find which recognizer can work with the buffer, if any }
for i := Low(FRecognizers) to High(FRecognizers) do
begin
LastResponse := RecognizerResponses.GetLastResponse(i);
if LastResponse <> rNegative then
begin
Recognition := FRecognizers[i].Recognize(Buffer);
RecognizerResponses.SetLastResponse(i, Recognition);
if Recognition = rPositive then
begin
// remember the Buffer size at this point for this guy
Sizes[i] := Length(Buffer);
end
else if (Recognition = rNegative) and (LastResponse = rPartial) then
begin
// this recognizer got disqualified without ever reaching the goal
Sizes[i] := 0;
end
end
end;
// exit loop if everyone is done
NoMatchOrEof := RecognizerResponses.CountByRecognition(rNegative) = Length(FRecognizers);
end
else
begin
// EOF
NoMatchOrEof := True;
end
until NoMatchOrEof;
// out of all the positive responses, which one was the longest?
// this lets tokens such as '>=' and '>' coexist, with '>=' winning
MaxPositiveSize := 0;
MaxPositiveIndex := -1;
for i := Low(FRecognizers) to High(FRecognizers) do
begin
if Sizes[i] > MaxPositiveSize then
begin
MaxPositiveSize := Sizes[i];
MaxPositiveIndex := i;
end;
end;
// unread any extra characters back into the reader
while Length(Buffer) > MaxPositiveSize do begin
FReader.UnRead(Buffer[Length(Buffer)]);
// delete last character
Delete(Buffer, Length(Buffer), 1);
end;
Result.Kind := MaxPositiveIndex;
if MaxPositiveIndex >= 0 then
begin
// fill-in the result based on the buffer
Result.Text := Buffer;
Result.Position.Start := FRowCol;
for i := 1 to Length(Buffer) do
begin
if Buffer[i] = #13 then
IncRow(FRowCol)
else if Buffer[i] = #10 then
begin
// was it preceded by a \r (#13)?
if (i > 1) and (Buffer[i - 1] = #13) then
// do nothing: this is the \n of a \r\n pair, the row was already advanced
else
IncRow(FRowCol)
end
else
IncCol(FRowCol)
end;
Result.Position.Finish := FRowCol;
end;
// free the per-call response tracker so it is not leaked on every Read
RecognizerResponses.Free;
end;

(* Undo Tokenizer *)

constructor TUndoTokenizer.Create(Tokenizer: TTokenizer);
begin
FTokenizer := Tokenizer;
FBuffer := TTokenLinkedList.Create;
end;
destructor TUndoTokenizer.Destroy;
begin
FTokenizer.Free;
FBuffer.Free;
inherited Destroy;
end;
function TUndoTokenizer.Read: TToken;
begin
if FBuffer.IsEmpty then
Result := FTokenizer.Read
else
Result := FBuffer.Pop
end;
procedure TUndoTokenizer.Undo(Token: TToken);
begin
FBuffer.Push(Token);
end;
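
{ Pushes every token from TokenList back into the undo buffer, then frees
  the list; TokenList is nil on return. }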
procedure TUndoTokenizer.Undo(var TokenList: TTokenLinkedList);
begin
while not TokenList.IsEmpty do
Undo(TokenList.Pop);
FreeAndNil(TokenList);
end;
constructor TTokenLinkedList.Create;
begin
FHead := nil;
end;
destructor TTokenLinkedList.Destroy;
var
temp: PTokenNode;
begin
while not IsEmpty do
begin
temp := FHead;
FHead := FHead^.Next;
Dispose(temp);
end;
inherited Destroy;
end;
function TTokenLinkedList.IsEmpty: Boolean;
begin
Result := not Assigned(FHead);
end;
function TTokenLinkedList.Pop: TToken;
var
temp: PTokenNode;
begin
if IsEmpty then
raise Exception.Create('Buffer underflow')
else
begin
Result := FHead^.Data;
temp := FHead;
FHead := FHead^.Next;
Dispose(temp);
end
end;
procedure TTokenLinkedList.Push(Token: TToken);
var
temp: PTokenNode;
begin
New(temp);
temp^.Data := Token;
temp^.Next := FHead;
FHead := temp;
end;
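
{ Moves all nodes of List to the end of this list, leaving List empty.
  Raises if this list itself is empty. }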
procedure TTokenLinkedList.Append(List: TTokenLinkedList);
var
temp: PTokenNode;
begin
temp := FHead;
while (temp <> nil) and (temp^.Next <> nil) do
temp := temp^.Next;
if temp <> nil then
begin
temp^.Next := List.FHead;
List.FHead := nil;
end
else
raise Exception.Create('Cannot append to empty list')
end;
function CreateUndoTokenizer(Stream: TStream; Recognizers: TTokenRecognizers): TUndoTokenizer;
begin
Result := TUndoTokenizer.Create(
TTokenizer.Create(CreatePushBackCharReader(Stream), Recognizers)
);
end;
end.