Skip to content
This repository was archived by the owner on Jul 19, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
220 changes: 103 additions & 117 deletions src/lexer/lexer.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,14 @@
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "./tokens.h"
#include "../utils/hashes.h"

/**
* A token that was parsed by the Lexer.
*/
struct Token {
int type;
char value[longestKeywordSize]; // Increased size to handle longer values like numbers
};
#include "./lexer.h"
#include "./tokens.h"

/**
* The result of the lexer execution.
*/
struct LexerResult {
int size;
struct Token tokens[1024];
};
#include "../utils/hashes.h"

/**
* Sets the token type of the currently selected token in the LexerResult with the provided token type.
Expand All @@ -32,105 +20,103 @@ void pushToken(struct LexerResult* result, enum TokenType type) {
result->size++;
}

/**
* Runs the lexer on the provided string and returns the parsed tokens.
*/
struct LexerResult runLexer(char string[]) {
struct LexerResult result;
result.size = 0;

const int len = strlen(string);
for(int i = 0; i < len; ++i) {
const char c = string[i];

if (c == ' ' || c == '\t' || c == '\n') {
continue;
} else if (isdigit(c)) {
int numLen = 0;
char numStr[32] = {0};

while (i < len && (isdigit(string[i]) || string[i] == '.')) {
numStr[numLen++] = string[i++];
}
i--;

struct Token token;
token.type = NUMBER;
strncpy(token.value, numStr, sizeof(token.value) - 1);
result.tokens[result.size++] = token;
continue;
} else if (c == '"') {
int numLen = 0;
char strValue[longestKeywordSize] = {0};
i++;

while (i < len && string[i] != '"') {
strValue[numLen++] = string[i++];
}

struct Token token;
token.type = STRING;
strncpy(token.value, strValue, sizeof(token.value) - 1);
result.tokens[result.size++] = token;
continue;
} else if (isalpha(c)) {
int wordLen = 0;
char word[longestKeywordSize] = {0};

while (i < len && (isalnum(string[i]) || string[i] == '_')) {
word[wordLen++] = string[i++];
}
i--;

struct Token token;

if (strcmp(word, "func") == 0) {
token.type = FUNCTION;
} else if (strcmp(word, "true") == 0 || strcmp(word, "false") == 0) {
token.type = BOOLEAN_VALUE;
} else if (strcmp(word, "null") == 0) {
token.type = NU;
} else if(strcmp(word, "use") == 0) {
token.type = USE;
} else if(strcmp(word, "var") == 0) {
token.type = VAR;
}
else {
token.type = KEYWORD;
}

strncpy(token.value, word, sizeof(token.value) - 1);
result.tokens[result.size++] = token;
continue;
}

switch(c) {
case '{': pushToken(&result, BRACKETS_OPEN); break;
case '}': pushToken(&result, BRACKETS_CLOSE); break;
case '(': pushToken(&result, PAREN_OPEN); break;
case ')': pushToken(&result, PAREN_CLOSE); break;
case '[': pushToken(&result, ARRAY_OPEN); break;
case ']': pushToken(&result, ARRAY_CLOSE); break;
case ';': pushToken(&result, SEMICOLON); break;
case ',': pushToken(&result, COMMA); break;
case '=': pushToken(&result, DECLARE); break;
case '?':
pushToken(&result, NONE);
result.tokens[result.size - 1].value[0] = '?';
break;
case '+':
case '-':
case '/':
case '*':
pushToken(&result, MATH_OP);
result.tokens[result.size - 1].value[0] = c;
break;
}
}

if (result.size > 0 && strlen(result.tokens[result.size - 1].value) == 0) {
result.size--;
}

return result;
struct LexerResult runLexer(char* string) {
struct LexerResult result;
result.size = 0;

result.tokens = malloc(sizeof(struct Token) * 1024);

Comment on lines +27 to +28
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Fixed-size token array may cause overflow

Allocating result.tokens with a fixed size of 1024 tokens could lead to overflow if processing inputs with more tokens.

Implement dynamic resizing for result.tokens to handle inputs with more than 1024 tokens.

27     int tokensCapacity = 1024;
27     result.tokens = malloc(sizeof(struct Token) * tokensCapacity);
      ...
+     // Before pushing a new token
+     if (result.size >= tokensCapacity) {
+         tokensCapacity *= 2;
+         result.tokens = realloc(result.tokens, sizeof(struct Token) * tokensCapacity);
+         if (result.tokens == NULL) {
+             // Handle allocation failure
+         }
+     }

Committable suggestion skipped: line range outside the PR's diff.

char c;

while(c = *string++) {

int buffLen = 32;
char* buff = malloc(buffLen);

Comment on lines +33 to +35
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Memory leak: buff is not freed after use

The buffer buff is allocated with malloc but is not freed after it's no longer needed, leading to memory leaks.

Ensure that buff is freed after assigning it to result.tokens[result.size - 1].value or if it's no longer needed. For tokens that don't store buff, free it before moving to the next iteration.

+     // After using buff
+     if (buff_needed) {
+         result.tokens[result.size - 1].value = buff;
+     } else {
+         free(buff);
+     }

Also applies to: 52-52, 64-64

if(c == ' ' || c == '\t' || c == '\n') {
continue;
} else if (isdigit(c)) {
int numLen = 0;

while(isdigit(c)) {
buff[numLen] = c;
numLen++;

c = *string++;
}
Comment on lines +33 to +46
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Potential buffer overflow due to fixed buffer size

The buffers buff allocated with a fixed size of 32 bytes may overflow if the tokens (numbers, strings, keywords) are longer than 31 characters (reserving one byte for the null terminator). There is no check to prevent writing beyond the allocated memory, which can lead to undefined behavior or security vulnerabilities.

Apply this diff to implement dynamic resizing of buff:

33     int buffLen = 32;
34     char* buff = malloc(buffLen);
+     int buffIndex = 0;
      ...
42         buff[numLen] = c;
43         numLen++;
+         buffIndex++;
+         if (buffIndex >= buffLen - 1) { // Resize if needed
+             buffLen *= 2;
+             buff = realloc(buff, buffLen);
+             if (buff == NULL) {
+                 // Handle allocation failure
+             }
+         }
45         c = *string++;

Repeat similar changes for string parsing (lines 54-59) and keyword parsing (lines 67-72).

Also applies to: 54-59, 67-72

Comment on lines +41 to +46
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Missing null terminators in token values

When collecting characters for numbers and keywords, the buffers are not null-terminated, which can lead to undefined behavior when used as strings.

Add a null terminator at the end of the buffers after the loops.

46     }
+     buff[numLen] = '\0';

Apply similar changes for keyword parsing.

Also applies to: 67-72


pushToken(&result, NUMBER);
result.tokens[result.size - 1].value = buff;

} else if (c == '\"') {
int strLen = 0;

while(c != '\"') {
buff[strLen] = c;
strLen++;

c = *string++;
}
Comment on lines +54 to +59
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Infinite loop risk when parsing strings

The loop while(c != '\"') doesn't check for the end of the string. If a closing quote is missing, this will cause an infinite loop.

Modify the loop condition to check for the null terminator to prevent infinite loops.

54     while(c != '\"' && c != '\0') {
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
while(c != '\"') {
buff[strLen] = c;
strLen++;
c = *string++;
}
while(c != '\"' && c != '\0') {
buff[strLen] = c;
strLen++;
c = *string++;
}


pushToken(&result, STRING);
result.tokens[result.size - 1].value = buff;

} else if(isalpha(c)) {
int keywordLen = 0;

while(isalpha(c)) {
buff[keywordLen] = c;
keywordLen++;

c = *string++;
}

if(strcmp(buff, "func") == 0) {
pushToken(&result, FUNCTION);
}
else if(strcmp(buff, "true") == 0 || strcmp(buff, "false") == 0) {
pushToken(&result, BOOLEAN_VALUE);
result.tokens[result.size - 1].value = buff;
}
else if(strcmp(buff, "null") == 0) {
pushToken(&result, NU);
}
else if(strcmp(buff, "use") == 0) {
pushToken(&result, USE);
}
else if(strcmp(buff, "var") == 0) {
pushToken(&result, VAR);
}
else {
pushToken(&result, KEYWORD);
result.tokens[result.size - 1].value = buff;
}
}
else {
switch(c) {
case '{': pushToken(&result, BRACKETS_OPEN); break;
case '}': pushToken(&result, BRACKETS_CLOSE); break;
case '(': pushToken(&result, PAREN_OPEN); break;
case ')': pushToken(&result, PAREN_CLOSE); break;
case '[': pushToken(&result, ARRAY_OPEN); break;
case ']': pushToken(&result, ARRAY_CLOSE); break;
case ';': pushToken(&result, SEMICOLON); break;
case ',': pushToken(&result, COMMA); break;
case '=': pushToken(&result, DECLARE); break;
case '?': pushToken(&result, NONE); break;

case '+':
case '-':
case '*':
case '/':
case '^':
pushToken(&result, MATH_OP);
result.tokens[result.size - 1].value[0] = c;
}
Comment on lines +114 to +115
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Uninitialized value field in MATH_OP tokens

For math operator tokens, you're assigning c to result.tokens[result.size - 1].value[0], but value is not allocated, which may cause undefined behavior.

Allocate memory for value before assigning to it.

114     pushToken(&result, MATH_OP);
+       result.tokens[result.size - 1].value = malloc(2);
115     result.tokens[result.size - 1].value[0] = c;
+       result.tokens[result.size - 1].value[1] = '\0';
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
result.tokens[result.size - 1].value[0] = c;
}
result.tokens[result.size - 1].value = malloc(2);
result.tokens[result.size - 1].value[0] = c;
result.tokens[result.size - 1].value[1] = '\0';
}

}
}

return result;
}


12 changes: 2 additions & 10 deletions src/lexer/lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,19 @@
#include <stdio.h>
#include "./tokens.h"

/**
* Represents a single token from lexical analysis
*/
struct Token {
enum TokenType type;
char value[32]; // Using 32 as longestKeywordSize
};

/**
 * Contains the results of lexical analysis.
 *
 * The tokens array is heap-allocated by runLexer; the caller is
 * responsible for freeing it (and each token's value) when done.
 */
struct LexerResult {
    int size;             /* number of tokens stored in the array */
    struct Token* tokens; /* heap-allocated array of parsed tokens */
};

/**
 * Performs lexical analysis on an input string.
 *
 * The parameter is char* to match the definition in lexer.c; the input is
 * read only. Returns a LexerResult containing the parsed tokens.
 */
struct LexerResult runLexer(char* input);
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Changing input parameter to char* may lead to unintended modifications

The runLexer function now takes a char* instead of a const char*, which allows modification of the input string. This could cause unexpected side effects if the input string is used elsewhere.

If the function does not need to modify the input string, change the parameter back to const char*.


/**
* Adds a token to the LexerResult
Expand Down
55 changes: 26 additions & 29 deletions src/lexer/tokens.h
Original file line number Diff line number Diff line change
@@ -1,39 +1,36 @@
#ifndef TOKENS_H
#define TOKENS_H

#define longestKeywordSize 32
#define smallestKeywordSize 4

/**
 * The kinds of tokens the lexer can produce.
 *
 * Values are implicitly numbered from 0 in declaration order.
 */
enum TokenType {
    FUNCTION,
    RETURN,
    VAR,
    BRACKETS_OPEN,
    BRACKETS_CLOSE,
    PAREN_OPEN,
    PAREN_CLOSE,
    ARRAY_OPEN,
    ARRAY_CLOSE,
    NUMBER,
    STRING,
    BOOLEAN_VALUE,
    NU,        /* the "null" literal */
    KEYWORD,   /* any identifier that is not a reserved word */
    SEMICOLON,
    COMMA,
    DECLARE,   /* '=' */
    USE,
    NONE,      /* '?' */
    MATH_OP    /* '+', '-', '*', '/', '^' */
};

struct KeywordResult {
int count;
char* keywords[10];
enum TokenType types[10];
/**
 * A lexer token generated by the Lexer.
 *
 * For NUMBER, STRING, BOOLEAN_VALUE and KEYWORD tokens, value points to a
 * heap-allocated copy of the token's source text (allocated in runLexer).
 * NOTE(review): for other token kinds value may be left unset — confirm
 * pushToken initializes it before relying on it.
 */
struct Token {
enum TokenType type;
char* value;
};

void initKeywords();
struct KeywordResult getKeywords(char start);

#endif
Loading