diff options
author | Thomas Bracht Laumann Jespersen <t@laumann.xyz> | 2025-02-02 08:59:12 +0100 |
---|---|---|
committer | Thomas Bracht Laumann Jespersen <t@laumann.xyz> | 2025-02-02 08:59:12 +0100 |
commit | d8321c1947b644888f92795a75645d0c91a6a92d (patch) | |
tree | 058d5e9e66463b9b173e513c3244a4da93c0a28c | |
parent | 7b72404f5ec31a29aafa6faaf9c284487c819175 (diff) |
Number parsing is still missing, but otherwise the lexer looks
functional. It provides location information for each token (something
the OCaml implementation does not do).
-rw-r--r-- | c/lex.c | 82 | ||||
-rw-r--r-- | c/lex.h | 16 | ||||
-rw-r--r-- | c/main.c | 6 | ||||
-rw-r--r-- | c/makefile | 4 | ||||
-rw-r--r-- | c/token.c | 13 | ||||
-rw-r--r-- | c/token.h | 14 |
6 files changed, 91 insertions, 44 deletions
@@ -1,4 +1,6 @@ +#include <stdlib.h> #include <stddef.h> +#include <stdbool.h> #include <stdint.h> #include <string.h> #include <ctype.h> @@ -15,6 +17,8 @@ lexgetc(struct lexstate *lexer) lexer->column++; return lexer->buf[lexer->pos++]; } + /* TODO we could check if the last character is 0 and handle + * it here... */ return UINT32_MAX; } @@ -57,12 +61,15 @@ lexsingle(struct lexstate *lexer) skipwhitespace(lexer); + ret.token.loc.lineno = lexer->lineno; + ret.token.loc.column = lexer->column-1; + d = lexgetc(lexer); - if (d == UINT32_MAX) + if (d == UINT32_MAX) { + ret.res = Lmore; return ret; + } c = (u8)d; - ret.token.loc.lineno = lexer->lineno; - ret.token.loc.column = lexer->column; ret.res = Lok; switch (c) { case ';': @@ -103,27 +110,58 @@ lexsingle(struct lexstate *lexer) } usize len = lexer->pos-start; memcpy(str, &lexer->buf[start], len); - printf("token range from %zu to %zu\n", start, lexer->pos); - printf("token value = %s\n", str); enum tokentype t = findkeyword((char *)str); if (t == Txxx) { t = Tident; - printf("identifier! %s\n", str); - } else { - printf("keyword! %s\n", tokname(t)); + /* TODO introduce own string type? smallstr */ + ret.token.str = strdup((char *)str); } ret.token.type = t; return ret; } + /* An unexpected character... */ + ret.res = Lerror; return ret; } +static void +printfileloc(char *filename, struct location loc) +{ + printf("%s:%zu:%zu: ", filename, loc.lineno, loc.column); +} + +static void +printtoken(struct token tok, char *filename) +{ + printfileloc(filename, tok.loc); + if (tok.type < Tident) { + /* keyword */ + printf("%s\n", tokname(tok.type)); + } else if (tok.type == Tident) { + char *s = tok.str ? 
tok.str : "<meh>"; + printf("ident - %s\n", s); + } else if (tok.type == Tconstant) { + printf("constant...\n"); + } else { + /* syntax/operators */ + printf("%s\n", tokname(tok.type)); + } +} + +void +freetoken(struct token tok) +{ + if (tok.type == Tident) + free(tok.str); +} + struct lexresult lex(struct lexstate *lexer) { struct lexresult ret = {0}; + char filename[] = "../tests/return_2.c"; if (lexer->respos < lexer->nres) { ret = lexer->results[lexer->respos]; @@ -136,16 +174,24 @@ lex(struct lexstate *lexer) } /* lex everything we got in the buffer */ - ret = lexsingle(lexer); - ret = lexsingle(lexer); - /* usize i = 0; */ - /* while (true) { */ - /* ret = lexsingle(lexer); */ - /* if (ret.type == Txxx) */ - /* break; */ - /* lexer->results[i++] = ret; */ - /* } */ - /* something non-whitespace */ + while (true) { + ret = lexsingle(lexer); + switch (ret.res) { + case Lok: + /* TODO assign token to lexer buffer */ + printtoken(ret.token, filename); + freetoken(ret.token); + break; + case Lmore: + printf("no more tokens to lex in current input!\n"); + goto lexout; + case Lerror: + printf("error: unexpected character in input\n"); + goto lexout; + } + } +lexout: + /* TODO return first token from input (if available) */ return ret; } @@ -1,22 +1,8 @@ #ifndef LEX_H #define LEX_H -struct location { - usize lineno; - usize column; -}; - -struct token { - /* location */ - struct location loc; - /* token type */ - enum tokentype type; - /* string value, if any */ -}; - enum lexcode { - Lnone, - Lok, + Lok = 1, Lerror, /* Returned when lexer could not produce any tokens from * current input, but no error was detected either */ @@ -124,13 +124,11 @@ main(int argc, char *argv[]) struct lexstate lexer; lexreset(&lexer); - u8 code[] = " \n\n int main(void)\n{\n\treturn 2;\n}\n"; - lexfeed(&lexer, sizeof(code), code); + u8 code[] = " \n\n int\nmain(void)\n{\n\treturn 2;\n}\n"; + lexfeed(&lexer, sizeof(code)-1, code); lex(&lexer); - printf("line = %zu column = %zu\n", 
lexer.lineno, lexer.column); - /* call preprocessor */ /* call main compiler */ @@ -4,8 +4,8 @@ OBJ = main.o panic.o lex.o token.o CC = gcc -CFLAGS = -std=c17 -g -fsanitize=address,undefined -Wall -LDFLAGS = -std=c17 -g -fsanitize=address,undefined +CFLAGS = -std=c17 -g3 -fsanitize=address,undefined -Wall -Wdouble-promotion -Wconversion -D_POSIX_C_SOURCE=200809L +LDFLAGS = -std=c17 -g3 -fsanitize=address,undefined -D_POSIX_C_SOURCE=200809L lc: $(OBJ) $(CC) $(LDFLAGS) $(OBJ) -o $@ @@ -1,5 +1,8 @@ #include <assert.h> #include <string.h> +#include <stddef.h> +#include <stdint.h> +#include "types.h" #include "token.h" static char *tokennames[] = { @@ -8,11 +11,11 @@ static char *tokennames[] = { [Treturn] = "return", [Tident] = "identifier", [Tconstant] = "constant", - [Tlparen] = "lparen", - [Trparen] = "rparen", - [Tlbrace] = "lbrace", - [Trbrace] = "rbrace", - [Tsemicolon] = "semicolon", + [Tlparen] = "(", + [Trparen] = ")", + [Tlbrace] = "{", + [Trbrace] = "}", + [Tsemicolon] = ";", }; static_assert(sizeof(tokennames) / sizeof(char *) == Ntok, "token name map out of sync with tokens"); @@ -23,6 +23,20 @@ enum tokentype { Ntok, }; +struct location { + usize lineno; + usize column; +}; + +struct token { + /* location */ + struct location loc; + /* token type */ + enum tokentype type; + /* string value, if any */ + char *str; +}; + char* tokname(enum tokentype); enum tokentype findkeyword(char *name); |