# Categorize the lexemes into the categories below:
# Keywords
# Operators
# Identifier = (letter)(letter | digit)*
# Constants / Literals
# Punctuations / Separators
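#
# For an illustrative source line such as `int count = 10 ;` (a made-up
# example, not necessarily the bundled input file), the scanner writes:
#     < Keyword, int >
#     < Identifier, count >
#     < Operator, = >
#     < Constant, 10 >
#     < Punctuation, ; >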
from typing import List
import re


class Tokenizer:

    def __init__(self, filepath: str) -> None:
        self.programlines = self.readInput(filepath)

    def readInput(self, filepath: str) -> List[List[str]]:
        '''
        Read the input source program from a text file
        '''
        with open(filepath, 'r') as f:
            programlines = f.readlines()
        programlines = [line.strip() for line in programlines]
        programlines = [line.split() for line in programlines]
        return programlines
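
    # Example of the structure readInput produces (illustrative input, not
    # the bundled test file): a file whose only line is `int x = 1 ;`
    # yields [['int', 'x', '=', '1', ';']] -- one inner list per source
    # line, one string per whitespace-separated lexeme.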

    def start_scanning(self, outputFile: str) -> None:
        '''
        Scan the lexemes and write the output to a file
        '''
        # Clear any previous output; writeOutput() appends one token per call
        open(outputFile, 'w').close()
        for lines in self.programlines:
            for word in lines:
                token = self.scan(word)
                line = f'< {token[1]}, {token[0]} > \n'
                self.writeOutput(filepath=outputFile, line=line)

    def scan(self, lex: str) -> List[str]:
        '''
        The actual lexical analysis happens here
        '''
        keywords: List[str] = ["main", "int", "float", "double", "long",
                               "short", "string", "char", "if", "else",
                               "while", "do", "break", "continue"]
        operators: List[str] = ["+", "-", "*", "/", "<", ">", "=", "|", "&"]
        punctuations: List[str] = ["{", "}", "(", ")", ";", "[", "]", "."]
        identifiers = r'[A-Za-z_][A-Za-z0-9_]*'
        constants = r'[0-9]+'
        # Fixed categories are checked first so that keywords such as `int`
        # are never reported as identifiers
        if lex in keywords:
            return [lex, 'Keyword']
        elif lex in operators:
            return [lex, 'Operator']
        elif lex in punctuations:
            return [lex, 'Punctuation']
        elif re.fullmatch(identifiers, lex):
            return [lex, 'Identifier']
        elif re.fullmatch(constants, lex):
            return [lex, 'Constant']
        # Fall back so the caller never receives None for an unknown lexeme
        return [lex, 'Unknown']

    def writeOutput(self, filepath: str, line: str) -> None:
        '''
        To write the output to a file
        '''
        with open(filepath, 'a') as f:
            f.write(line)


if __name__ == '__main__':
    scanner = Tokenizer(filepath='input/1-token-separation.txt')
    scanner.start_scanning('output/1-token-separation.txt')
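
# Usage sketch (the paths `demo.txt` and `demo-tokens.txt` are hypothetical,
# shown only to illustrate the expected output; the lab's real files live
# under input/ and output/ as wired up above):
#
#     demo.txt contains : while ( x < 10 ) { x = x + 1 ; }
#     Tokenizer(filepath='demo.txt').start_scanning('demo-tokens.txt')
#     demo-tokens.txt   : < Keyword, while >
#                         < Punctuation, ( >
#                         < Identifier, x >
#                         < Operator, < >
#                         < Constant, 10 >
#                         ...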