Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add operator lexing #35

Merged
merged 10 commits into from
Jun 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 92 additions & 8 deletions src/lexer/lex.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,66 @@ int in_string(char c, char s[]) {
return 0;
}

// We will need to add more of these later, for sure
char single_char_tokens[] = "(){}[];";
// Set of single-character tokens; each character is listed exactly once.
char single_char_tokens[] = "(){}[];~#,.:?";

// Every string that counts as an operator. The list ends with a NULL
// sentinel so it can be walked without knowing its length.
char *operator_strings[] = {
    // basic arithmetic, assignment, colon, modulo
    "-", "+", "*", "/", "=", ":", "%",
    // bitwise / logical and, or
    "&", "&&", "|", "||",
    // compound assignment, increment, decrement
    "-=", "+=", "++", "--", "/=", "*=", "%=", "&=", "|=", "&&=", "||=",
    // comparison and shifts
    ">", "<", "<=", ">=", "<<", ">>",
    // negation and (in)equality
    "!", "==", "!=",
    // xor, member-through-pointer, shift-assign
    "^", "^=", "->", "<<=", ">>=",
    NULL, // sentinel: marks the end of the list
};

// Report whether `c` is one of the characters treated as the first character
// of an operator. Returns 1 if it is, 0 otherwise.
// NOTE(review): '~' is accepted here even though "~" is not an entry in
// operator_strings -- confirm that this is intended.
int starts_operator(char c) {
    static const char leading_chars[] = {
        '-', '+', '*', '/', '=', ':', '%',
        '&', '|', '<', '>', '!', '~', '^',
    };

    for (unsigned int i = 0; i < sizeof leading_chars; ++i) {
        if (leading_chars[i] == c) {
            return 1;
        }
    }
    return 0;
}

// Scan operator_strings from the start and return 1 as soon as an entry
// compares equal to `op` under STREQ; return 0 if the terminating NULL entry
// is reached without a match.
int valid_operator_sequence(char* op) {
    int idx = 0;

    while (operator_strings[idx]) {
        if (STREQ(operator_strings[idx], op)) {
            return 1;
        }
        idx++;
    }
    return 0;
}

int is_valid_numeric_or_id_char(char c) {
return isalnum(c) || (c == '_') || (c == '.');
Expand Down Expand Up @@ -143,6 +201,22 @@ int real_lex(Lexer *l, Token *t) {
return 0;
}

// Lex an operator. We do this by lexing characters from the buffer until
// the resulting string is no longer an operator, then we cut our losses and
// return.
if (starts_operator(init)) {
while (valid_operator_sequence(t->contents)) {
t->contents[pos++] = (c = getc(l->fp));
}
// We've ended!
// Can we reduce this code duplication from above in a smart way?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A few lines is okay imo

ungetc(c, l->fp);
t->contents[pos - 1] = '\0';
t->type = ttype_from_string(t->contents);
t->length = pos;
return 0;
}

// TODO - parse character or string literal

return 0;
Expand Down Expand Up @@ -251,9 +325,14 @@ TokenType ttype_one_char(char c) {
return TT_BNOT; // ~
case '^':
return TT_XOR; // ^
case '#':
return TT_POUND;
case '?':
return TT_QMARK;
default:
PRINT_ERROR("Token type for token '%c' not recognized", c);
return TT_NO_TOKEN;
}

return TT_NO_TOKEN;
}

TokenType ttype_many_chars(const char *contents) {
Expand Down Expand Up @@ -316,9 +395,9 @@ TokenType ttype_many_chars(const char *contents) {
} else if (STREQ(contents, "unsigned")) {
return TT_UNSIGNED;
} else if (STREQ(contents, "void")) {
return TT_SIZEOF;
} else if (STREQ(contents, "volitile")) {
return TT_SIZEOF;
return TT_VOID;
} else if (STREQ(contents, "volatile")) {
return TT_VOLATILE;
} else if (STREQ(contents, "while")) {
return TT_WHILE;
} else if (STREQ(contents, "&&")) {
Expand Down Expand Up @@ -365,6 +444,8 @@ TokenType ttype_many_chars(const char *contents) {
return TT_LEFTSHIFTEQUALS;
} else if (STREQ(contents, ">>=")) {
return TT_RIGHTSHIFTEQUALS;
} else if (STREQ(contents, "!=")) {
return TT_NOTEQ;
}

// Includes only numbers
Expand Down Expand Up @@ -453,8 +534,10 @@ static const char *ttype_names[] = {
"no token", // Not a token
"end of file", // End-of-file, lex until we hit the end of the file
"newline", // Newline, used in preprocessing
"pound",
".",
",",
"?",
"-",
"+",
"*",
Expand Down Expand Up @@ -486,6 +569,7 @@ static const char *ttype_names[] = {
"!",
"~",
"==",
"!=",
"^",
"^=",
"->",
Expand Down Expand Up @@ -522,7 +606,7 @@ static const char *ttype_names[] = {
"unsigned",
"union",
"void",
"volitile",
"volatile",
"while",
};

Expand Down
3 changes: 3 additions & 0 deletions src/lexer/token.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@ typedef enum {
TT_NO_TOKEN, // Not a token
TT_EOF, // End-of-file, so we can lex until we hit the end of the file
TT_NEWLINE, // Newline, used in preprocessing
TT_POUND, // # for preprocessing

TT_PERIOD, // .
TT_COMMA, // ,
TT_QMARK, // ?
TT_MINUS, // -
TT_PLUS, // +
TT_STAR, // *
Expand Down Expand Up @@ -53,6 +55,7 @@ typedef enum {
TT_LNOT, // !
TT_BNOT, // ~
TT_EQUALS, // ==
TT_NOTEQ, // !=
TT_XOR, // ^
TT_XOREQ, // ^=
TT_POINT, // ->
Expand Down
12 changes: 12 additions & 0 deletions tests/optest.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/*
 * Credit - djb2
 *
 * Hash a NUL-terminated string: start from 5381 and, for every byte,
 * multiply the running value by 33 and add the byte.
 *
 * key: NUL-terminated string to hash; must not be NULL.
 * Returns the resulting hash value (5381 for the empty string).
 */
static unsigned long hash(const char *key) {

    unsigned long h = 5381;
    int c;

    /*
     * Read each byte as unsigned char so bytes >= 0x80 contribute 128..255
     * regardless of whether plain char is signed on this platform.
     */
    while ((c = (unsigned char)*key++)) {
        h = ((h << 5) + h) + (unsigned long)c; /* h * 33 + c */
    }

    return h;

}
Loading