diff --git a/src/lexer/lex.c b/src/lexer/lex.c
index 423939f..0411cde 100644
--- a/src/lexer/lex.c
+++ b/src/lexer/lex.c
@@ -18,8 +18,66 @@ int in_string(char c, char s[]) {
     return 0;
 }
 
-// We will need to add more of these later, for sure
-char single_char_tokens[] = "(){}[];";
+char single_char_tokens[] = "(){}[];~#,.:?~"; // NOTE(review): '~' is listed twice
+
+// All strings which represent operators; the list ends with a NULL sentinel.
+char* operator_strings[] = {
+    "-",
+    "+",
+    "*",
+    "/",
+    "=",
+    ":",
+    "%",
+    "&",
+    "&&",
+    "|",
+    "||",
+    "-=",
+    "+=",
+    "++",
+    "--",
+    "/=",
+    "*=",
+    "%=",
+    "&=",
+    "|=",
+    "&&=",
+    "||=",
+    ">",
+    "<",
+    "<=",
+    ">=",
+    "<<",
+    ">>",
+    "!",
+    "==",
+    "!=",
+    "^",
+    "^=",
+    "->",
+    "<<=",
+    ">>=",
+    NULL, // sentinel: lets callers iterate without knowing the length
+};
+
+int starts_operator(char c) { // 1 if c is one of the characters listed below, else 0
+    switch (c) {
+        case '-': case '+': case '*': case '/': case '=': case ':': case '%':
+        case '&': case '|': case '<': case '>': case '!': case '~': case '^':
+            return 1; // NOTE(review): '~' is accepted here but "~" is not in operator_strings - confirm
+        default:
+            return 0;
+    }
+}
+
+int valid_operator_sequence(char* op) { // 1 if op matches (per STREQ) an entry of operator_strings, else 0
+    for (char** top = operator_strings; *top; ++top) { // stops at the NULL sentinel
+        if (STREQ(*top, op))
+            return 1;
+    }
+    return 0;
+}
 
 int is_valid_numeric_or_id_char(char c) {
     return isalnum(c) || (c == '_') || (c == '.');
@@ -143,6 +201,22 @@ int real_lex(Lexer *l, Token *t) {
         return 0;
     }
 
+    // Lex an operator: keep appending characters read from the file while the
+    // accumulated string is still a known operator, then push the last
+    // character read back onto the stream and cut it off the token.
+    if (starts_operator(init)) {
+        while (valid_operator_sequence(t->contents)) { // presumably t->contents is NUL-terminated past pos - verify
+            t->contents[pos++] = (c = getc(l->fp));
+        }
+        // The character just read is not part of the operator: unread and drop it.
+        // Can we reduce this code duplication from above in a smart way?
+        ungetc(c, l->fp); // NOTE(review): if the loop body never ran, c is whatever it held before - confirm
+        t->contents[pos - 1] = '\0';
+        t->type = ttype_from_string(t->contents);
+        t->length = pos; // NOTE(review): set after truncating at pos - 1; confirm length is meant to be pos
+        return 0;
+    }
+
     // TODO - parse character or string literal
 
     return 0;
@@ -252,9 +326,14 @@ TokenType _ttype_one_char(char c) {
         return TT_BNOT; // ~
     case '^':
         return TT_XOR; // ^
+    case '#':
+        return TT_POUND; // #
+    case '?':
+        return TT_QMARK; // ?
+    default: // any other character: PRINT_ERROR, then TT_NO_TOKEN
+        PRINT_ERROR("Token type for token '%c' not recognized", c);
+        return TT_NO_TOKEN;
     }
-
-    return TT_NO_TOKEN;
 }
 
 // This is a function for parsing exclusively tokens with more than one char
@@ -318,9 +397,9 @@ TokenType _ttype_many_chars(const char *contents) {
     } else if (STREQ(contents, "unsigned")) {
         return TT_UNSIGNED;
     } else if (STREQ(contents, "void")) {
-        return TT_SIZEOF;
-    } else if (STREQ(contents, "volitile")) {
-        return TT_SIZEOF;
+        return TT_VOID;
+    } else if (STREQ(contents, "volatile")) {
+        return TT_VOLATILE;
     } else if (STREQ(contents, "while")) {
         return TT_WHILE;
     } else if (STREQ(contents, "&&")) {
@@ -367,6 +446,8 @@ TokenType _ttype_many_chars(const char *contents) {
         return TT_LEFTSHIFTEQUALS;
     } else if (STREQ(contents, ">>=")) {
         return TT_RIGHTSHIFTEQUALS;
+    } else if (STREQ(contents, "!=")) {
+        return TT_NOTEQ;
     }
 
     // Includes only numbers
@@ -456,8 +537,10 @@ static const char *ttype_names[] = {
     "no token", // Not a token
     "end of file", // End-of-file, lex until we hit the end of the file
     "newline", // Newline, used in preprocessing
+    "pound", // #
     ".",
     ",",
+    "?",
     "-",
     "+",
     "*",
@@ -489,6 +572,7 @@ static const char *ttype_names[] = {
     "!",
     "~",
     "==",
+    "!=",
     "^",
     "^=",
     "->",
@@ -525,7 +609,7 @@ static const char *ttype_names[] = {
     "unsigned",
     "union",
     "void",
-    "volitile",
+    "volatile",
     "while",
 };
 
diff --git a/src/lexer/token.h b/src/lexer/token.h
index 7409706..4713561 100644
--- a/src/lexer/token.h
+++ b/src/lexer/token.h
@@ -19,9 +19,11 @@ typedef enum {
     TT_NO_TOKEN, // Not a token
     TT_EOF, // End-of-file, so we can lex until we hit the end of the file
     TT_NEWLINE, // Newline, used in preprocessing
+    TT_POUND, // # for preprocessing
 
     TT_PERIOD, // .
     TT_COMMA, // ,
+    TT_QMARK, // ?
     TT_MINUS, // -
     TT_PLUS, // +
     TT_STAR, // *
@@ -53,6 +55,7 @@ typedef enum {
     TT_LNOT, // !
     TT_BNOT, // ~
     TT_EQUALS, // ==
+    TT_NOTEQ, // !=
     TT_XOR, // ^
     TT_XOREQ, // ^=
     TT_POINT, // ->
diff --git a/tests/optest.c b/tests/optest.c
new file mode 100644
index 0000000..d4bf38c
--- /dev/null
+++ b/tests/optest.c
@@ -0,0 +1,12 @@
+/* Credit - djb2 */
+static unsigned long hash(const char * key) {
+
+    unsigned long hash = 5381;
+    int c;
+
+    while ((c = *key++))
+        hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
+
+    return hash;
+
+}
\ No newline at end of file