summary | refs | log | tree | commit | diff
path: root/c/lex.c
diff options
context:
space:
mode:
author: Thomas Bracht Laumann Jespersen <t@laumann.xyz>, 2025-02-02 08:59:12 +0100
committer: Thomas Bracht Laumann Jespersen <t@laumann.xyz>, 2025-02-02 08:59:12 +0100
commit: d8321c1947b644888f92795a75645d0c91a6a92d (patch)
tree: 058d5e9e66463b9b173e513c3244a4da93c0a28c /c/lex.c
parent: 7b72404f5ec31a29aafa6faaf9c284487c819175 (diff)
c/lex: mostly functional lexer [HEAD, master]
Still need to parse numbers, but otherwise the lexer looks functional. It provides location information for each token (something the OCaml impl does not do).
Diffstat (limited to 'c/lex.c')
-rw-r--r-- c/lex.c 82
1 files changed, 64 insertions, 18 deletions
diff --git a/c/lex.c b/c/lex.c
index d41562c..c2506a8 100644
--- a/c/lex.c
+++ b/c/lex.c
@@ -1,4 +1,6 @@
+#include <stdlib.h>
#include <stddef.h>
+#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <ctype.h>
@@ -15,6 +17,8 @@ lexgetc(struct lexstate *lexer)
lexer->column++;
return lexer->buf[lexer->pos++];
}
+ /* TODO we could check if the last character is 0 and handle
+ * it here... */
return UINT32_MAX;
}
@@ -57,12 +61,15 @@ lexsingle(struct lexstate *lexer)
skipwhitespace(lexer);
+ ret.token.loc.lineno = lexer->lineno;
+ ret.token.loc.column = lexer->column-1;
+
d = lexgetc(lexer);
- if (d == UINT32_MAX)
+ if (d == UINT32_MAX) {
+ ret.res = Lmore;
return ret;
+ }
c = (u8)d;
- ret.token.loc.lineno = lexer->lineno;
- ret.token.loc.column = lexer->column;
ret.res = Lok;
switch (c) {
case ';':
@@ -103,27 +110,58 @@ lexsingle(struct lexstate *lexer)
}
usize len = lexer->pos-start;
memcpy(str, &lexer->buf[start], len);
- printf("token range from %zu to %zu\n", start, lexer->pos);
- printf("token value = %s\n", str);
enum tokentype t = findkeyword((char *)str);
if (t == Txxx) {
t = Tident;
- printf("identifier! %s\n", str);
- } else {
- printf("keyword! %s\n", tokname(t));
+ /* TODO introduce own string type? smallstr */
+ ret.token.str = strdup((char *)str);
}
ret.token.type = t;
return ret;
}
+ /* An unexpected character... */
+ ret.res = Lerror;
return ret;
}
+static void
+printfileloc(char *filename, struct location loc)
+{
+ printf("%s:%zu:%zu: ", filename, loc.lineno, loc.column);
+}
+
+static void
+printtoken(struct token tok, char *filename)
+{
+ printfileloc(filename, tok.loc);
+ if (tok.type < Tident) {
+ /* keyword */
+ printf("%s\n", tokname(tok.type));
+ } else if (tok.type == Tident) {
+ char *s = tok.str ? tok.str : "<meh>";
+ printf("ident - %s\n", s);
+ } else if (tok.type == Tconstant) {
+ printf("constant...\n");
+ } else {
+ /* syntax/operators */
+ printf("%s\n", tokname(tok.type));
+ }
+}
+
+void
+freetoken(struct token tok)
+{
+ if (tok.type == Tident)
+ free(tok.str);
+}
+
struct lexresult
lex(struct lexstate *lexer)
{
struct lexresult ret = {0};
+ char filename[] = "../tests/return_2.c";
if (lexer->respos < lexer->nres) {
ret = lexer->results[lexer->respos];
@@ -136,16 +174,24 @@ lex(struct lexstate *lexer)
}
/* lex everything we got in the buffer */
- ret = lexsingle(lexer);
- ret = lexsingle(lexer);
- /* usize i = 0; */
- /* while (true) { */
- /* ret = lexsingle(lexer); */
- /* if (ret.type == Txxx) */
- /* break; */
- /* lexer->results[i++] = ret; */
- /* } */
- /* something non-whitespace */
+ while (true) {
+ ret = lexsingle(lexer);
+ switch (ret.res) {
+ case Lok:
+ /* TODO assign token to lexer buffer */
+ printtoken(ret.token, filename);
+ freetoken(ret.token);
+ break;
+ case Lmore:
+ printf("no more tokens to lex in current input!\n");
+ goto lexout;
+ case Lerror:
+ printf("error: unexpected character in input\n");
+ goto lexout;
+ }
+ }
+lexout:
+ /* TODO return first token from input (if available) */
return ret;
}