From 7f61eab11dd94da4d9b5b92a8221c32c6227d802 Mon Sep 17 00:00:00 2001 From: Zffu Date: Thu, 12 Dec 2024 22:07:21 +0100 Subject: [PATCH 1/9] refactor: removed useless incremented enum value --- src/lexer/tokens.h | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/lexer/tokens.h b/src/lexer/tokens.h index e1cfaa9..1f38558 100644 --- a/src/lexer/tokens.h +++ b/src/lexer/tokens.h @@ -5,26 +5,26 @@ #define smallestKeywordSize 4 enum TokenType { - FUNCTION = 1, - RETURN = 2, - VAR = 3, - BRACKETS_OPEN = 4, - BRACKETS_CLOSE = 5, - PAREN_OPEN = 6, - PAREN_CLOSE = 7, - ARRAY_OPEN = 8, - ARRAY_CLOSE = 9, - NUMBER = 10, - STRING = 11, - BOOLEAN_VALUE = 12, - NU = 13, - KEYWORD = 14, - SEMICOLON = 15, - COMMA = 16, - DECLARE = 17, - USE = 18, - NONE = 19, - MATH_OP = 20 + FUNCTION, + RETURN, + VAR, + BRACKETS_OPEN, + BRACKETS_CLOSE, + PAREN_OPEN, + PAREN_CLOSE, + ARRAY_OPEN, + ARRAY_CLOSE, + NUMBER, + STRING, + BOOLEAN_VALUE, + NU, + KEYWORD, + SEMICOLON, + COMMA, + DECLARE, + USE, + NONE, + MATH_OP }; struct KeywordResult { From 96a0bc75ecc184209bf089d4d89f5c3108a0aca6 Mon Sep 17 00:00:00 2001 From: Zffu Date: Thu, 12 Dec 2024 22:08:06 +0100 Subject: [PATCH 2/9] feat: removed unused keyword search stuff --- src/lexer/tokens.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/lexer/tokens.h b/src/lexer/tokens.h index 1f38558..8aca2e9 100644 --- a/src/lexer/tokens.h +++ b/src/lexer/tokens.h @@ -27,13 +27,4 @@ enum TokenType { MATH_OP }; -struct KeywordResult { - int count; - char* keywords[10]; - enum TokenType types[10]; -}; - -void initKeywords(); -struct KeywordResult getKeywords(char start); - #endif From a4abf87f2e16f123a30fc8a375ab0e78f8f2889c Mon Sep 17 00:00:00 2001 From: Zffu Date: Thu, 12 Dec 2024 22:09:15 +0100 Subject: [PATCH 3/9] feat: removed unused defines --- src/lexer/tokens.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/lexer/tokens.h b/src/lexer/tokens.h index 8aca2e9..38b7328 100644 --- a/src/lexer/tokens.h +++ b/src/lexer/tokens.h @@ -1,9 +1,6 @@ #ifndef TOKENS_H #define TOKENS_H -#define longestKeywordSize 32 -#define smallestKeywordSize 4 - enum TokenType { FUNCTION, RETURN, From 1815aba5e8741f8eeb4864b7dd8474b5944153a2 Mon Sep 17 00:00:00 2001 From: Zffu Date: Thu, 12 Dec 2024 22:10:10 +0100 Subject: [PATCH 4/9] feat: changed arrays to pointers --- src/lexer/lexer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lexer/lexer.h b/src/lexer/lexer.h index d4e29bf..6955d4a 100644 --- a/src/lexer/lexer.h +++ b/src/lexer/lexer.h @@ -9,7 +9,7 @@ */ struct Token { enum TokenType type; - char value[32]; // Using 32 as longestKeywordSize + char* value; }; /** @@ -17,7 +17,7 @@ struct Token { */ struct LexerResult { int size; - struct Token tokens[1024]; + struct Token* tokens; }; /** From 0e7dfc5a13753f87baa55de376ee29d66d2513bb Mon Sep 17 00:00:00 2001 From: Zffu Date: Thu, 12 Dec 2024 22:14:30 +0100 Subject: [PATCH 5/9] feat: moved token struct to tokens header --- src/lexer/lexer.c | 20 +++++++++----------- src/lexer/tokens.h | 9 +++++++++ 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c index b9035b6..32fc5d8 100644 --- a/src/lexer/lexer.c +++ b/src/lexer/lexer.c @@ -3,19 +3,13 @@ */ #include +#include #include #include + #include "./tokens.h" #include "../utils/hashes.h" -/** - * A token that was parsed by the Lexer. - */ -struct Token { - int type; - char value[longestKeywordSize]; // Increased size to handle longer values like numbers -}; - /** * The result of the lexer execution. */ @@ -38,6 +32,8 @@ void pushToken(struct LexerResult* result, enum TokenType type) { struct LexerResult runLexer(char string[]) { struct LexerResult result; result.size = 0; + + result->tokens = malloc(sizeof(struct Token) * 1024); const int len = strlen(string); for(int i = 0; i < len; ++i) { @@ -56,7 +52,8 @@ struct LexerResult runLexer(char string[]) { struct Token token; token.type = NUMBER; - strncpy(token.value, numStr, sizeof(token.value) - 1); + token.value = numStr; + result.tokens[result.size++] = token; continue; } else if (c == '"') { @@ -70,7 +67,8 @@ struct LexerResult runLexer(char string[]) { struct Token token; token.type = STRING; - strncpy(token.value, strValue, sizeof(token.value) - 1); + token.value = strValue; + result.tokens[result.size++] = token; continue; } else if (isalpha(c)) { @@ -99,7 +97,7 @@ struct LexerResult runLexer(char string[]) { token.type = KEYWORD; } - strncpy(token.value, word, sizeof(token.value) - 1); + token.value = word; result.tokens[result.size++] = token; continue; } diff --git a/src/lexer/tokens.h b/src/lexer/tokens.h index 38b7328..a5b7ece 100644 --- a/src/lexer/tokens.h +++ b/src/lexer/tokens.h @@ -24,4 +24,13 @@ enum TokenType { MATH_OP }; +/** + * An lexer token generated by the Lexer. + */ +struct Token { + enum TokenType type; + char* value; +}; + + #endif From 98f319fc21ee2d2391f38696078d28d5f642a2c8 Mon Sep 17 00:00:00 2001 From: Zffu Date: Thu, 12 Dec 2024 22:29:10 +0100 Subject: [PATCH 6/9] feat: added 'infinite' char stack for lexer --- src/lexer/lexer.c | 39 ++++++++++++++++++++++++++++----------- src/lexer/lexer.h | 8 -------- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c index 32fc5d8..83d1eb0 100644 --- a/src/lexer/lexer.c +++ b/src/lexer/lexer.c @@ -7,16 +7,10 @@ #include #include +#include "./lexer.h" #include "./tokens.h" -#include "../utils/hashes.h" -/** - * The result of the lexer execution. - */ -struct LexerResult { - int size; - struct Token tokens[1024]; -}; +#include "../utils/hashes.h" /** * Sets the token type of the currently selected token in the LexerResult with the provided token type. @@ -43,10 +37,18 @@ struct LexerResult runLexer(char string[]) { continue; } else if (isdigit(c)) { int numLen = 0; - char numStr[32] = {0}; + int maxLen = 32; + + char* numStr = malloc(maxLen); while (i < len && (isdigit(string[i]) || string[i] == '.')) { numStr[numLen++] = string[i++]; + + if(numLen >= maxLen) { + maxLen = maxLen * 1.25; + numStr = realloc(numStr, maxLen); + } + } i--; @@ -58,11 +60,18 @@ struct LexerResult runLexer(char string[]) { continue; } else if (c == '"') { int numLen = 0; - char strValue[longestKeywordSize] = {0}; + int maxLen = 32; + + char* strValue = malloc(maxLen); i++; while (i < len && string[i] != '"') { strValue[numLen++] = string[i++]; + + if(numLen > maxLen) { + maxLen = maxLen * 1.25; + strValue = realloc(strValue, maxLen); + } } struct Token token; @@ -73,10 +82,18 @@ struct LexerResult runLexer(char string[]) { continue; } else if (isalpha(c)) { int wordLen = 0; - char word[longestKeywordSize] = {0}; + int maxLen = 32; + + char* word = malloc(maxLen); while (i < len && (isalnum(string[i]) || string[i] == '_')) { word[wordLen++] = string[i++]; + + if(wordLen >= maxLen) { + maxLen = maxLen * 1.25; + word = realloc(word, maxLen); + } + } i--; diff --git a/src/lexer/lexer.h b/src/lexer/lexer.h index 6955d4a..e66d7c0 100644 --- a/src/lexer/lexer.h +++ b/src/lexer/lexer.h @@ -4,14 +4,6 @@ #include #include "./tokens.h" -/** - * Represents a single token from lexical analysis - */ -struct Token { - enum TokenType type; - char* value; -}; - /** * Contains the results of lexical analysis */ From c739d88cb75883d9657800ba9c97d8deab298737 Mon Sep 17 00:00:00 2001 From: Zffu Date: Thu, 12 Dec 2024 22:40:30 +0100 Subject: [PATCH 7/9] feat: started reworking a more optimized lexer --- src/lexer/lexer.c | 37 +++++++++++++++++++++++++++++++++++-- src/lexer/lexer.h | 2 +- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c index 83d1eb0..23db0b6 100644 --- a/src/lexer/lexer.c +++ b/src/lexer/lexer.c @@ -20,14 +20,47 @@ void pushToken(struct LexerResult* result, enum TokenType type) { result->size++; } +struct LexerResult runLexer(char* string) { + struct LexerResult result; + result.size = 0; + + result.tokens = malloc(sizeof(struct Token) * 1024); + + int i = 0; + char c; + + while(c = *string++) { + + int buffLen = 32; + char* buff = malloc(buffLen); + + if(c == ' ' || c == '\t' || c == '\n') { + continue; + } else if (isdigit(c)) { + int numLen = 0; + + while(isdigit(c)) { + buff[numLen] = c; + numLen++; + + c = *string++; + } + } + + ++i; + } + + return result; +} + /** * Runs the lexer on the provided string and returns the parsed tokens. */ -struct LexerResult runLexer(char string[]) { +struct LexerResult runLexer0(char string[]) { struct LexerResult result; result.size = 0; - result->tokens = malloc(sizeof(struct Token) * 1024); + result.tokens = malloc(sizeof(struct Token) * 1024); const int len = strlen(string); for(int i = 0; i < len; ++i) { diff --git a/src/lexer/lexer.h b/src/lexer/lexer.h index e66d7c0..f3a9072 100644 --- a/src/lexer/lexer.h +++ b/src/lexer/lexer.h @@ -16,7 +16,7 @@ struct LexerResult { * Performs lexical analysis on an input string * Returns a LexerResult containing the tokens */ -struct LexerResult runLexer(const char* input); +struct LexerResult runLexer(char* input); /** * Adds a token to the LexerResult From de89e090dc620efbaf8e842389eb9ba07fde3fbc Mon Sep 17 00:00:00 2001 From: Zffu Date: Thu, 12 Dec 2024 22:50:21 +0100 Subject: [PATCH 8/9] feat: added wip lexer --- src/lexer/lexer.c | 50 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c index 23db0b6..2a92c21 100644 --- a/src/lexer/lexer.c +++ b/src/lexer/lexer.c @@ -26,7 +26,6 @@ struct LexerResult runLexer(char* string) { result.tokens = malloc(sizeof(struct Token) * 1024); - int i = 0; char c; while(c = *string++) { @@ -45,9 +44,54 @@ struct LexerResult runLexer(char* string) { c = *string++; } - } - ++i; + pushToken(&result, NUMBER); + result.tokens[result.size].value = buff; + + } else if (c == '\"') { + int strLen = 0; + + while(c != '\"') { + buff[strLen] = c; + strLen++; + + c = *string++; + } + + pushToken(&result, STRING); + result.tokens[result.size].value = buff; + + } else if(isalpha(c)) { + int keywordLen = 0; + + while(isalpha(c)) { + buff[keywordLen] = c; + keywordLen++; + + c = *string++; + } + + if(strcmp(word, "func") == 0) { + pushToken(&result, FUNCTION); + } + else if(strcmp(word, "true") == 0 || strcmp(word, "false") == 0) { + pushToken(&result, BOOLEAN_VALUE); + result.tokens[result.size].value = word; + } + else if(strcmp(word, "null") == 0) { + pushToken(&result, NU); + } + else if(strcmp(word, "use") == 0) { + pushToken(&result, USE); + } + else if(strcmp(word, "var") == 0) { + pushToken(&result, VAR); + } + else { + pushToken(&result, KEYWORD); + result.tokens[result.size].value = buff; + } + } } return result; From 64839fade1cf51b7a8166a327e2582ffbbc269bb Mon Sep 17 00:00:00 2001 From: Zffu Date: Thu, 12 Dec 2024 22:57:41 +0100 Subject: [PATCH 9/9] refactor: improved lexer performance by ditching strlen --- src/lexer/lexer.c | 168 +++++++++------------------------------------- 1 file changed, 31 insertions(+), 137 deletions(-) diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c index 2a92c21..f0d8e21 100644 --- a/src/lexer/lexer.c +++ b/src/lexer/lexer.c @@ -46,7 +46,7 @@ struct LexerResult runLexer(char* string) { } pushToken(&result, NUMBER); - result.tokens[result.size].value = buff; + result.tokens[result.size - 1].value = buff; } else if (c == '\"') { int strLen = 0; @@ -59,7 +59,7 @@ struct LexerResult runLexer(char* string) { } pushToken(&result, STRING); - result.tokens[result.size].value = buff; + result.tokens[result.size - 1].value = buff; } else if(isalpha(c)) { int keywordLen = 0; @@ -71,25 +71,47 @@ struct LexerResult runLexer(char* string) { c = *string++; } - if(strcmp(word, "func") == 0) { + if(strcmp(buff, "func") == 0) { pushToken(&result, FUNCTION); } - else if(strcmp(word, "true") == 0 || strcmp(word, "false") == 0) { + else if(strcmp(buff, "true") == 0 || strcmp(buff, "false") == 0) { pushToken(&result, BOOLEAN_VALUE); - result.tokens[result.size].value = word; + result.tokens[result.size - 1].value = buff; } - else if(strcmp(word, "null") == 0) { + else if(strcmp(buff, "null") == 0) { pushToken(&result, NU); } - else if(strcmp(word, "use") == 0) { + else if(strcmp(buff, "use") == 0) { pushToken(&result, USE); } - else if(strcmp(word, "var") == 0) { + else if(strcmp(buff, "var") == 0) { pushToken(&result, VAR); } else { pushToken(&result, KEYWORD); - result.tokens[result.size].value = buff; + result.tokens[result.size - 1].value = buff; + } + } + else { + switch(c) { + case '{': pushToken(&result, BRACKETS_OPEN); break; + case '}': pushToken(&result, BRACKETS_CLOSE); break; + case '(': pushToken(&result, PAREN_OPEN); break; + case ')': pushToken(&result, PAREN_CLOSE); break; + case '[': pushToken(&result, ARRAY_OPEN); break; + case ']': pushToken(&result, ARRAY_CLOSE); break; + case ';': pushToken(&result, SEMICOLON); break; + case ',': pushToken(&result, COMMA); break; + case '=': pushToken(&result, DECLARE); break; + case '?': pushToken(&result, NONE); break; + + case '+': + case '-': + case '*': + case '/': + case '^': + pushToken(&result, MATH_OP); + result.tokens[result.size - 1].value[0] = c; } } } @@ -97,132 +119,4 @@ struct LexerResult runLexer(char* string) { return result; } -/** - * Runs the lexer on the provided string and returns the parsed tokens. - */ -struct LexerResult runLexer0(char string[]) { - struct LexerResult result; - result.size = 0; - - result.tokens = malloc(sizeof(struct Token) * 1024); - - const int len = strlen(string); - for(int i = 0; i < len; ++i) { - const char c = string[i]; - - if (c == ' ' || c == '\t' || c == '\n') { - continue; - } else if (isdigit(c)) { - int numLen = 0; - int maxLen = 32; - - char* numStr = malloc(maxLen); - - while (i < len && (isdigit(string[i]) || string[i] == '.')) { - numStr[numLen++] = string[i++]; - - if(numLen >= maxLen) { - maxLen = maxLen * 1.25; - numStr = realloc(numStr, maxLen); - } - } - i--; - - struct Token token; - token.type = NUMBER; - token.value = numStr; - - result.tokens[result.size++] = token; - continue; - } else if (c == '"') { - int numLen = 0; - int maxLen = 32; - - char* strValue = malloc(maxLen); - i++; - - while (i < len && string[i] != '"') { - strValue[numLen++] = string[i++]; - - if(numLen > maxLen) { - maxLen = maxLen * 1.25; - strValue = realloc(strValue, maxLen); - } - } - - struct Token token; - token.type = STRING; - token.value = strValue; - - result.tokens[result.size++] = token; - continue; - } else if (isalpha(c)) { - int wordLen = 0; - int maxLen = 32; - - char* word = malloc(maxLen); - - while (i < len && (isalnum(string[i]) || string[i] == '_')) { - word[wordLen++] = string[i++]; - - if(wordLen >= maxLen) { - maxLen = maxLen * 1.25; - word = realloc(word, maxLen); - } - - } - i--; - - struct Token token; - - if (strcmp(word, "func") == 0) { - token.type = FUNCTION; - } else if (strcmp(word, "true") == 0 || strcmp(word, "false") == 0) { - token.type = BOOLEAN_VALUE; - } else if (strcmp(word, "null") == 0) { - token.type = NU; - } else if(strcmp(word, "use") == 0) { - token.type = USE; - } else if(strcmp(word, "var") == 0) { - token.type = VAR; - } - else { - token.type = KEYWORD; - } - - token.value = word; - result.tokens[result.size++] = token; - continue; - } - - switch(c) { - case '{': pushToken(&result, BRACKETS_OPEN); break; - case '}': pushToken(&result, BRACKETS_CLOSE); break; - case '(': pushToken(&result, PAREN_OPEN); break; - case ')': pushToken(&result, PAREN_CLOSE); break; - case '[': pushToken(&result, ARRAY_OPEN); break; - case ']': pushToken(&result, ARRAY_CLOSE); break; - case ';': pushToken(&result, SEMICOLON); break; - case ',': pushToken(&result, COMMA); break; - case '=': pushToken(&result, DECLARE); break; - case '?': - pushToken(&result, NONE); - result.tokens[result.size - 1].value[0] = '?'; - break; - case '+': - case '-': - case '/': - case '*': - pushToken(&result, MATH_OP); - result.tokens[result.size - 1].value[0] = c; - break; - } - } - - if (result.size > 0 && strlen(result.tokens[result.size - 1].value) == 0) { - result.size--; - } - - return result; -}