summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Thomas Bracht Laumann Jespersen <t@laumann.xyz> 2025-02-02 08:59:12 +0100
committer: Thomas Bracht Laumann Jespersen <t@laumann.xyz> 2025-02-02 08:59:12 +0100
commit: d8321c1947b644888f92795a75645d0c91a6a92d (patch)
tree: 058d5e9e66463b9b173e513c3244a4da93c0a28c
parent: 7b72404f5ec31a29aafa6faaf9c284487c819175 (diff)
c/lex: mostly functional lexer [HEAD, master]
Still need to parse numbers, but otherwise the lexer looks functional. It provides location information for each token (something the OCaml impl does not do).
-rw-r--r-- c/lex.c 82
-rw-r--r-- c/lex.h 16
-rw-r--r-- c/main.c 6
-rw-r--r-- c/makefile 4
-rw-r--r-- c/token.c 13
-rw-r--r-- c/token.h 14
6 files changed, 91 insertions, 44 deletions
diff --git a/c/lex.c b/c/lex.c
index d41562c..c2506a8 100644
--- a/c/lex.c
+++ b/c/lex.c
@@ -1,4 +1,6 @@
+#include <stdlib.h>
#include <stddef.h>
+#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <ctype.h>
@@ -15,6 +17,8 @@ lexgetc(struct lexstate *lexer)
lexer->column++;
return lexer->buf[lexer->pos++];
}
+ /* TODO we could check if the last character is 0 and handle
+ * it here... */
return UINT32_MAX;
}
@@ -57,12 +61,15 @@ lexsingle(struct lexstate *lexer)
skipwhitespace(lexer);
+ ret.token.loc.lineno = lexer->lineno;
+ ret.token.loc.column = lexer->column-1;
+
d = lexgetc(lexer);
- if (d == UINT32_MAX)
+ if (d == UINT32_MAX) {
+ ret.res = Lmore;
return ret;
+ }
c = (u8)d;
- ret.token.loc.lineno = lexer->lineno;
- ret.token.loc.column = lexer->column;
ret.res = Lok;
switch (c) {
case ';':
@@ -103,27 +110,58 @@ lexsingle(struct lexstate *lexer)
}
usize len = lexer->pos-start;
memcpy(str, &lexer->buf[start], len);
- printf("token range from %zu to %zu\n", start, lexer->pos);
- printf("token value = %s\n", str);
enum tokentype t = findkeyword((char *)str);
if (t == Txxx) {
t = Tident;
- printf("identifier! %s\n", str);
- } else {
- printf("keyword! %s\n", tokname(t));
+ /* TODO introduce own string type? smallstr */
+ ret.token.str = strdup((char *)str);
}
ret.token.type = t;
return ret;
}
+ /* An unexpected character... */
+ ret.res = Lerror;
return ret;
}
+static void /* Print a "<filename>:<lineno>:<column>: " prefix for loc to stdout. */
+printfileloc(char *filename, struct location loc)
+{
+ printf("%s:%zu:%zu: ", filename, loc.lineno, loc.column); /* no newline: printtoken completes the line */
+}
+
+static void /* Debug helper: print one token to stdout as "<file>:<line>:<col>: <description>". */
+printtoken(struct token tok, char *filename)
+{
+ printfileloc(filename, tok.loc);
+ if (tok.type < Tident) {
+ /* keyword: token types that compare below Tident are printed by name */
+ printf("%s\n", tokname(tok.type));
+ } else if (tok.type == Tident) {
+ char *s = tok.str ? tok.str : "<meh>"; /* placeholder text when str is NULL */
+ printf("ident - %s\n", s);
+ } else if (tok.type == Tconstant) {
+ printf("constant...\n"); /* fixed marker only; no constant value is printed */
+ } else {
+ /* syntax/operators: every remaining token type is printed via tokname() */
+ printf("%s\n", tokname(tok.type));
+ }
+}
+
+void /* Free the string held by an identifier token; does nothing for other token types. */
+freetoken(struct token tok)
+{
+ if (tok.type == Tident) /* in the visible part of lexsingle, str is strdup()ed only for Tident */
+ free(tok.str); /* tok is a by-value copy, so the caller's copy still holds the freed pointer */
+}
+
struct lexresult
lex(struct lexstate *lexer)
{
struct lexresult ret = {0};
+ char filename[] = "../tests/return_2.c";
if (lexer->respos < lexer->nres) {
ret = lexer->results[lexer->respos];
@@ -136,16 +174,24 @@ lex(struct lexstate *lexer)
}
/* lex everything we got in the buffer */
- ret = lexsingle(lexer);
- ret = lexsingle(lexer);
- /* usize i = 0; */
- /* while (true) { */
- /* ret = lexsingle(lexer); */
- /* if (ret.type == Txxx) */
- /* break; */
- /* lexer->results[i++] = ret; */
- /* } */
- /* something non-whitespace */
+ while (true) {
+ ret = lexsingle(lexer);
+ switch (ret.res) {
+ case Lok:
+ /* TODO assign token to lexer buffer */
+ printtoken(ret.token, filename);
+ freetoken(ret.token);
+ break;
+ case Lmore:
+ printf("no more tokens to lex in current input!\n");
+ goto lexout;
+ case Lerror:
+ printf("error: unexpected character in input\n");
+ goto lexout;
+ }
+ }
+lexout:
+ /* TODO return first token from input (if available) */
return ret;
}
diff --git a/c/lex.h b/c/lex.h
index 48f6cba..1e10d3a 100644
--- a/c/lex.h
+++ b/c/lex.h
@@ -1,22 +1,8 @@
#ifndef LEX_H
#define LEX_H
-struct location {
- usize lineno;
- usize column;
-};
-
-struct token {
- /* location */
- struct location loc;
- /* token type */
- enum tokentype type;
- /* string value, if any */
-};
-
enum lexcode {
- Lnone,
- Lok,
+ Lok = 1,
Lerror,
/* Returned when lexer could not produce any tokens from
* current input, but no error was detected either */
diff --git a/c/main.c b/c/main.c
index bb3a3d4..b5db33d 100644
--- a/c/main.c
+++ b/c/main.c
@@ -124,13 +124,11 @@ main(int argc, char *argv[])
struct lexstate lexer;
lexreset(&lexer);
- u8 code[] = " \n\n int main(void)\n{\n\treturn 2;\n}\n";
- lexfeed(&lexer, sizeof(code), code);
+ u8 code[] = " \n\n int\nmain(void)\n{\n\treturn 2;\n}\n";
+ lexfeed(&lexer, sizeof(code)-1, code);
lex(&lexer);
- printf("line = %zu column = %zu\n", lexer.lineno, lexer.column);
-
/* call preprocessor */
/* call main compiler */
diff --git a/c/makefile b/c/makefile
index af85d9e..65977b2 100644
--- a/c/makefile
+++ b/c/makefile
@@ -4,8 +4,8 @@
OBJ = main.o panic.o lex.o token.o
CC = gcc
-CFLAGS = -std=c17 -g -fsanitize=address,undefined -Wall
-LDFLAGS = -std=c17 -g -fsanitize=address,undefined
+CFLAGS = -std=c17 -g3 -fsanitize=address,undefined -Wall -Wdouble-promotion -Wconversion -D_POSIX_C_SOURCE=200809L
+LDFLAGS = -std=c17 -g3 -fsanitize=address,undefined -D_POSIX_C_SOURCE=200809L
lc: $(OBJ)
$(CC) $(LDFLAGS) $(OBJ) -o $@
diff --git a/c/token.c b/c/token.c
index 85e8e43..6acfe3a 100644
--- a/c/token.c
+++ b/c/token.c
@@ -1,5 +1,8 @@
#include <assert.h>
#include <string.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "types.h"
#include "token.h"
static char *tokennames[] = {
@@ -8,11 +11,11 @@ static char *tokennames[] = {
[Treturn] = "return",
[Tident] = "identifier",
[Tconstant] = "constant",
- [Tlparen] = "lparen",
- [Trparen] = "rparen",
- [Tlbrace] = "lbrace",
- [Trbrace] = "rbrace",
- [Tsemicolon] = "semicolon",
+ [Tlparen] = "(",
+ [Trparen] = ")",
+ [Tlbrace] = "{",
+ [Trbrace] = "}",
+ [Tsemicolon] = ";",
};
static_assert(sizeof(tokennames) / sizeof(char *) == Ntok, "token name map out of sync with tokens");
diff --git a/c/token.h b/c/token.h
index 36b3657..71183d1 100644
--- a/c/token.h
+++ b/c/token.h
@@ -23,6 +23,20 @@ enum tokentype {
Ntok,
};
+struct location { /* source position recorded for a token */
+ usize lineno; /* line number, copied from the lexer state when the token starts */
+ usize column; /* column, copied from the lexer state when the token starts */
+};
+
+struct token { /* one lexed token: its position, its kind, and optional text */
+ /* location of the token in the input (line and column) */
+ struct location loc;
+ /* token type */
+ enum tokentype type;
+ /* string value, if any; for Tident the lexer stores a strdup()ed copy that freetoken() releases */
+ char *str;
+};
+
char* tokname(enum tokentype);
enum tokentype findkeyword(char *name);