summary | refs | log | tree | commit | diff
path: root/c
diff options
context:
space:
mode:
author Thomas Bracht Laumann Jespersen <t@laumann.xyz> 2025-02-02 08:59:12 +0100
committer Thomas Bracht Laumann Jespersen <t@laumann.xyz> 2025-02-02 08:59:12 +0100
commit d8321c1947b644888f92795a75645d0c91a6a92d (patch)
tree 058d5e9e66463b9b173e513c3244a4da93c0a28c /c
parent 7b72404f5ec31a29aafa6faaf9c284487c819175 (diff)
c/lex: mostly functional lexer (HEAD, master)
Still need to parse numbers, but otherwise the lexer looks functional. It provides location information for each token (something the OCaml impl does not do).
Diffstat (limited to 'c')
-rw-r--r-- c/lex.c 82
-rw-r--r-- c/lex.h 16
-rw-r--r-- c/main.c 6
-rw-r--r-- c/makefile 4
-rw-r--r-- c/token.c 13
-rw-r--r-- c/token.h 14
6 files changed, 91 insertions, 44 deletions
diff --git a/c/lex.c b/c/lex.c
index d41562c..c2506a8 100644
--- a/c/lex.c
+++ b/c/lex.c
@@ -1,4 +1,6 @@
+#include <stdlib.h>
#include <stddef.h>
+#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <ctype.h>
@@ -15,6 +17,8 @@ lexgetc(struct lexstate *lexer)
lexer->column++;
return lexer->buf[lexer->pos++];
}
+ /* TODO we could check if the last character is 0 and handle
+ * it here... */
return UINT32_MAX;
}
@@ -57,12 +61,15 @@ lexsingle(struct lexstate *lexer)
skipwhitespace(lexer);
+ ret.token.loc.lineno = lexer->lineno;
+ ret.token.loc.column = lexer->column-1;
+
d = lexgetc(lexer);
- if (d == UINT32_MAX)
+ if (d == UINT32_MAX) {
+ ret.res = Lmore;
return ret;
+ }
c = (u8)d;
- ret.token.loc.lineno = lexer->lineno;
- ret.token.loc.column = lexer->column;
ret.res = Lok;
switch (c) {
case ';':
@@ -103,27 +110,58 @@ lexsingle(struct lexstate *lexer)
}
usize len = lexer->pos-start;
memcpy(str, &lexer->buf[start], len);
- printf("token range from %zu to %zu\n", start, lexer->pos);
- printf("token value = %s\n", str);
enum tokentype t = findkeyword((char *)str);
if (t == Txxx) {
t = Tident;
- printf("identifier! %s\n", str);
- } else {
- printf("keyword! %s\n", tokname(t));
+ /* TODO introduce own string type? smallstr */
+ ret.token.str = strdup((char *)str);
}
ret.token.type = t;
return ret;
}
+ /* An unexpected character... */
+ ret.res = Lerror;
return ret;
}
+static void
+printfileloc(char *filename, struct location loc)
+{
+ printf("%s:%zu:%zu: ", filename, loc.lineno, loc.column);
+}
+
+static void
+printtoken(struct token tok, char *filename)
+{
+ printfileloc(filename, tok.loc);
+ if (tok.type < Tident) {
+ /* keyword */
+ printf("%s\n", tokname(tok.type));
+ } else if (tok.type == Tident) {
+ char *s = tok.str ? tok.str : "<meh>";
+ printf("ident - %s\n", s);
+ } else if (tok.type == Tconstant) {
+ printf("constant...\n");
+ } else {
+ /* syntax/operators */
+ printf("%s\n", tokname(tok.type));
+ }
+}
+
+void
+freetoken(struct token tok)
+{
+ if (tok.type == Tident)
+ free(tok.str);
+}
+
struct lexresult
lex(struct lexstate *lexer)
{
struct lexresult ret = {0};
+ char filename[] = "../tests/return_2.c";
if (lexer->respos < lexer->nres) {
ret = lexer->results[lexer->respos];
@@ -136,16 +174,24 @@ lex(struct lexstate *lexer)
}
/* lex everything we got in the buffer */
- ret = lexsingle(lexer);
- ret = lexsingle(lexer);
- /* usize i = 0; */
- /* while (true) { */
- /* ret = lexsingle(lexer); */
- /* if (ret.type == Txxx) */
- /* break; */
- /* lexer->results[i++] = ret; */
- /* } */
- /* something non-whitespace */
+ while (true) {
+ ret = lexsingle(lexer);
+ switch (ret.res) {
+ case Lok:
+ /* TODO assign token to lexer buffer */
+ printtoken(ret.token, filename);
+ freetoken(ret.token);
+ break;
+ case Lmore:
+ printf("no more tokens to lex in current input!\n");
+ goto lexout;
+ case Lerror:
+ printf("error: unexpected character in input\n");
+ goto lexout;
+ }
+ }
+lexout:
+ /* TODO return first token from input (if available) */
return ret;
}
diff --git a/c/lex.h b/c/lex.h
index 48f6cba..1e10d3a 100644
--- a/c/lex.h
+++ b/c/lex.h
@@ -1,22 +1,8 @@
#ifndef LEX_H
#define LEX_H
-struct location {
- usize lineno;
- usize column;
-};
-
-struct token {
- /* location */
- struct location loc;
- /* token type */
- enum tokentype type;
- /* string value, if any */
-};
-
enum lexcode {
- Lnone,
- Lok,
+ Lok = 1,
Lerror,
/* Returned when lexer could not produce any tokens from
* current input, but no error was detected either */
diff --git a/c/main.c b/c/main.c
index bb3a3d4..b5db33d 100644
--- a/c/main.c
+++ b/c/main.c
@@ -124,13 +124,11 @@ main(int argc, char *argv[])
struct lexstate lexer;
lexreset(&lexer);
- u8 code[] = " \n\n int main(void)\n{\n\treturn 2;\n}\n";
- lexfeed(&lexer, sizeof(code), code);
+ u8 code[] = " \n\n int\nmain(void)\n{\n\treturn 2;\n}\n";
+ lexfeed(&lexer, sizeof(code)-1, code);
lex(&lexer);
- printf("line = %zu column = %zu\n", lexer.lineno, lexer.column);
-
/* call preprocessor */
/* call main compiler */
diff --git a/c/makefile b/c/makefile
index af85d9e..65977b2 100644
--- a/c/makefile
+++ b/c/makefile
@@ -4,8 +4,8 @@
OBJ = main.o panic.o lex.o token.o
CC = gcc
-CFLAGS = -std=c17 -g -fsanitize=address,undefined -Wall
-LDFLAGS = -std=c17 -g -fsanitize=address,undefined
+CFLAGS = -std=c17 -g3 -fsanitize=address,undefined -Wall -Wdouble-promotion -Wconversion -D_POSIX_C_SOURCE=200809L
+LDFLAGS = -std=c17 -g3 -fsanitize=address,undefined -D_POSIX_C_SOURCE=200809L
lc: $(OBJ)
$(CC) $(LDFLAGS) $(OBJ) -o $@
diff --git a/c/token.c b/c/token.c
index 85e8e43..6acfe3a 100644
--- a/c/token.c
+++ b/c/token.c
@@ -1,5 +1,8 @@
#include <assert.h>
#include <string.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "types.h"
#include "token.h"
static char *tokennames[] = {
@@ -8,11 +11,11 @@ static char *tokennames[] = {
[Treturn] = "return",
[Tident] = "identifier",
[Tconstant] = "constant",
- [Tlparen] = "lparen",
- [Trparen] = "rparen",
- [Tlbrace] = "lbrace",
- [Trbrace] = "rbrace",
- [Tsemicolon] = "semicolon",
+ [Tlparen] = "(",
+ [Trparen] = ")",
+ [Tlbrace] = "{",
+ [Trbrace] = "}",
+ [Tsemicolon] = ";",
};
static_assert(sizeof(tokennames) / sizeof(char *) == Ntok, "token name map out of sync with tokens");
diff --git a/c/token.h b/c/token.h
index 36b3657..71183d1 100644
--- a/c/token.h
+++ b/c/token.h
@@ -23,6 +23,20 @@ enum tokentype {
Ntok,
};
+struct location {
+ usize lineno;
+ usize column;
+};
+
+struct token {
+ /* location */
+ struct location loc;
+ /* token type */
+ enum tokentype type;
+ /* string value, if any */
+ char *str;
+};
+
char* tokname(enum tokentype);
enum tokentype findkeyword(char *name);