tokenizer.py
from __future__ import annotations

from dataclasses import dataclass
from enum import Enum, auto
from typing import Callable

from computer_simulator.translator.errors import InvalidSymbolsError

# Non-alphabetic characters that may also appear in identifiers.
IDENTIFIER_NON_ALPHA_CHARS = {"_"}
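
# The tokenizer is a chain of small handler functions. Each handler receives
# the current index into the program text, consumes at most one token
# (appending it to `tokens`), and returns the index just past whatever it
# consumed; a handler that does not match returns the index unchanged.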

@dataclass
class Token:
    class Type(Enum):
        IF = auto()
        OPEN_BRACKET = auto()
        CLOSE_BRACKET = auto()
        BINOP = auto()
        BOOLEAN = auto()
        INT = auto()
        STRING = auto()
        SETQ = auto()
        IDENTIFIER = auto()
        DEFUN = auto()
        PRINT_CHAR = auto()
        PRINT_STRING = auto()
        PROGN = auto()
        READ_STRING = auto()
        WHILE = auto()
        READ_CHAR = auto()

    token_type: Type
    value: str

def process_whitespace(idx: int, chars: str) -> int:
    # Skip spaces, newlines and tabs; return the first non-whitespace index.
    while idx < len(chars) and chars[idx] in (" ", "\n", "\t"):
        idx += 1
    return idx

def process_brackets(tokens: list[Token], idx: int, chars: str) -> int:
    # Emit a token for a single "(" or ")".
    if idx >= len(chars):
        return idx
    if chars[idx] == "(":
        tokens.append(Token(Token.Type.OPEN_BRACKET, chars[idx]))
        idx += 1
    elif chars[idx] == ")":
        tokens.append(Token(Token.Type.CLOSE_BRACKET, chars[idx]))
        idx += 1
    return idx

def process_binop(tokens: list[Token], idx: int, chars: str) -> int:
    # Emit a token for a single-character binary operator.
    if idx >= len(chars):
        return idx
    if chars[idx] in ("+", "-", "*", "/", "=", "<", ">", "%"):
        tokens.append(Token(Token.Type.BINOP, chars[idx]))
        idx += 1
    return idx

def process_number_literal(tokens: list[Token], idx: int, chars: str) -> int:
    # Consume a run of digits, optionally preceded by "-" when a digit follows
    # directly, and emit an INT token. A "-" that is not followed by a digit is
    # left for process_binop to consume as an operator.
    if idx >= len(chars):
        return idx
    start_idx = idx
    if chars[idx] == "-" and idx + 1 < len(chars) and chars[idx + 1].isdigit():
        idx += 1
    while idx < len(chars) and chars[idx].isdigit():
        idx += 1
    if idx > start_idx:
        tokens.append(Token(Token.Type.INT, chars[start_idx:idx]))
    return idx

def char_allowed_in_identifier(c: str) -> bool:
    return c.isalpha() or c in IDENTIFIER_NON_ALPHA_CHARS

def process_identifier(tokens: list[Token], idx: int, chars: str) -> int:
    # Consume a run of identifier characters and emit an IDENTIFIER token.
    if idx >= len(chars):
        return idx
    start_idx = idx
    while idx < len(chars) and char_allowed_in_identifier(chars[idx]):
        idx += 1
    if idx > start_idx:
        tokens.append(Token(Token.Type.IDENTIFIER, chars[start_idx:idx]))
    return idx

def process_string_literal(tokens: list[Token], idx: int, chars: str) -> int:
    # Consume a double-quoted string and emit a STRING token holding its
    # contents without the quotes. Raises if the input ends before the
    # closing quote.
    if idx >= len(chars):
        return idx
    if chars[idx] == '"':
        value = ""
        idx += 1
        while idx < len(chars) and chars[idx] != '"':
            value += chars[idx]
            idx += 1
        if idx >= len(chars):
            raise InvalidSymbolsError(got="end of input", expected="closing quote")
        tokens.append(Token(Token.Type.STRING, value))
        idx += 1
    return idx

def process_booleans(tokens: list[Token], idx: int, chars: str) -> int:
    # Booleans are the single characters "T" (true) and "F" (false).
    if idx >= len(chars):
        return idx
    if chars[idx] in ("T", "F"):
        tokens.append(Token(Token.Type.BOOLEAN, chars[idx]))
        idx += 1
    return idx

def process_keyword(tokens: list[Token], idx: int, chars: str, token_type: Token.Type, keyword: str) -> int:
    # Read a full identifier-shaped word and emit a token only if it equals
    # the keyword exactly, so e.g. "if" does not match the prefix of "iffy".
    if idx >= len(chars):
        return idx
    value = ""
    end_idx = idx
    while end_idx < len(chars) and char_allowed_in_identifier(chars[end_idx]):
        value += chars[end_idx]
        end_idx += 1
    if value == keyword:
        tokens.append(Token(token_type, keyword))
        return end_idx
    return idx

# Handlers are tried in order; the first one that advances the index produced
# the token. process_number_literal runs before process_binop so that "-"
# directly followed by a digit is read as a negative INT literal, while a
# bare "-" still becomes a BINOP token.
TOKEN_HANDLERS: list[Callable[[list[Token], int, str], int]] = [
    lambda tokens, idx, chars: process_whitespace(idx, chars),
    process_booleans,
    process_brackets,
    process_number_literal,
    process_binop,
    lambda tokens, idx, chars: process_keyword(tokens, idx, chars, Token.Type.IF, "if"),
    lambda tokens, idx, chars: process_keyword(tokens, idx, chars, Token.Type.SETQ, "setq"),
    lambda tokens, idx, chars: process_keyword(tokens, idx, chars, Token.Type.DEFUN, "defun"),
    lambda tokens, idx, chars: process_keyword(tokens, idx, chars, Token.Type.PRINT_CHAR, "print_char"),
    lambda tokens, idx, chars: process_keyword(tokens, idx, chars, Token.Type.PRINT_STRING, "print_string"),
    lambda tokens, idx, chars: process_keyword(tokens, idx, chars, Token.Type.PROGN, "progn"),
    lambda tokens, idx, chars: process_keyword(tokens, idx, chars, Token.Type.READ_STRING, "read_string"),
    lambda tokens, idx, chars: process_keyword(tokens, idx, chars, Token.Type.WHILE, "while"),
    lambda tokens, idx, chars: process_keyword(tokens, idx, chars, Token.Type.READ_CHAR, "read_char"),
    process_string_literal,
    process_identifier,
]

def tokenize(program_chars: str) -> list[Token]:
    # Run the handler chain over the whole program. If no handler can advance
    # past the current character, it is not part of any known token.
    tokens: list[Token] = []
    idx: int = 0
    while idx < len(program_chars):
        start_idx = idx
        for handler in TOKEN_HANDLERS:
            idx = handler(tokens, idx, program_chars)
            if idx > start_idx:
                break
        if idx == start_idx:
            raise InvalidSymbolsError(got=program_chars[idx], expected="any known token")
    return tokens

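
# Hypothetical usage sketch: the sample program below is an assumed example of
# the language's surface syntax and exists only to exercise tokenize().
if __name__ == "__main__":
    sample = '(setq x (- 5 3)) (if (= x 2) (print_string "ok") (print_string "no"))'
    for token in tokenize(sample):
        print(token.token_type.name, repr(token.value))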