Author | Jakob Wakeling <[email protected]> |
Date | 2021-09-13 06:13:31 |
Commit | f9cfead4e2e097b074e715d0d7e2a2631f1228be |
Parent | 760381011b2ab87cd09106506a7305fcea141486 |
lex: Rebuild and flesh out lexer
Diffstat
M | CMakeLists.txt | | | 2 | +- |
M | README.md | | | 2 | +- |
M | src/g.h | | | 25 | ------------------------- |
M | src/lex.c | | | 206 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------- |
A | src/lex.h | | | 48 | ++++++++++++++++++++++++++++++++++++++++++++++++ |
M | src/parse.c | | | 16 | ++++++++-------- |
6 files changed, 238 insertions, 61 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index dde9eb5..74214ac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ CMAKE_MINIMUM_REQUIRED(VERSION 3.12) -PROJECT(G VERSION 0.0.0 LANGUAGES C) +PROJECT(G VERSION 0.1.0 LANGUAGES C) SET(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/bin) ADD_COMPILE_DEFINITIONS(PROJECT_VERSION="${PROJECT_VERSION}") diff --git a/README.md b/README.md index 7d9c0b5..e699f66 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ command. The second command will output an executable file, *a.out* by default. - [ ] Implement procedure definitions - [ ] Implement procedure calls - [ ] Implement variable assignments -- [ ] Implement integers +- [x] Implement integers - [ ] Implement reals - [ ] Implement arrays - [ ] Implement the *type* type diff --git a/src/g.h b/src/g.h index ade650e..012b849 100644 --- a/src/g.h +++ b/src/g.h @@ -12,19 +12,7 @@ #include "util/stack.h" #include "value.h" #include "util/util.h" - -typedef enum { - LK_NIL, LK_EOF, LK_IDN, - LK_INT, LK_FLT, LK_CHR, LK_STR, - - LK_LPAREN, LK_RPAREN, LK_LBRACK, LK_RBRACK, - LK_LBRACE, LK_RBRACE, LK_COLON, LK_SCOLON, - LK_PERIOD, LK_COMMA, - - LK_EQUALS, LK_RARROW, - - LK_PROC, LK_RETURN, -} tok_k; +#include "lex.h" typedef enum { AK_NIL, @@ -37,28 +25,17 @@ typedef enum { } ast_k; typedef struct { usize ln, cl; } pos; -typedef struct { tok_k k; pos p; u8 *s; u64 h; } tok; - -typedef struct { u8 *s; usize sp, sl, ln, cl; tok t; } lex; typedef struct ast { ast_k k; pos p; type *t; val v; u8 *s; struct ast *c, *cl, *cr; stack *cs; } ast; -extern char *tok_ks[]; extern char *ast_ks[]; -extern lex lex_init(u8 *src, usize len); -extern tok lex_next(lex *l); -extern tok lex_kind(lex *l, tok_k k); -extern tok lex_peek(lex *l); - extern ast *ast_init(void); extern void ast_free(ast *a); extern ast *parse(lex *l); -extern void lex_debug(lex *l); - #endif // G_G_H_PHZNJHJ7 diff --git a/src/lex.c b/src/lex.c index ff6f7dd..0341f3d 100644 --- a/src/lex.c +++ b/src/lex.c @@ -20,112 +20,197 @@ #include <string.h> #include <stdio.h> -#define C (l->s[l->sp]) -#define D (l->s[l->sp + 1]) - -#define ISIDNCA(c) (isalpha(c) || c == '_') -#define ISIDNCB(c) (isalpha(c) || isdigit(c) || c == '_') - char *tok_ks[] = { - "LK_NIL", "LK_EOF", "LK_IDN", - "LK_INT", "LK_FLT", "LK_CHR", "LK_STR", + "LK_NULL", "LK_EOF", "LK_ID", "LK_INT", "LK_FLT", "LK_STR", + + "LK_RETURN", "LK_FOR", "LK_IF", "LK_ELSE", "LK_PROC", - "LK_LPAREN", "LK_RPAREN", "LK_LBRACK", "LK_RBRACK", - "LK_LBRACE", "LK_RBRACE", "LK_COLON", "LK_SCOLON", - "LK_PERIOD", "LK_COMMA", + "LK_LPAREN", "LK_RPAREN", "LK_LBRACK", "LK_RBRACK", "LK_LBRACE", "LK_RBRACE", + "LK_COLON", "LK_SCOLON", "LK_COMMA", "LK_PERIOD", "LK_RARROW", "LK_QMARK", - "LK_EQUALS", "LK_RARROW", + "LK_OP_ADD", "LK_OP_SUB", "LK_OP_MUL", "LK_OP_DIV", "LK_OP_MOD", + "LK_OP_EQ", "LK_OP_NEQ", "LK_OP_GT", "LK_OP_LT", "LK_OP_GTE", "LK_OP_LTE", + "LK_LO_NOT", "LK_LO_AND", "LK_LO_OR", + "LK_BW_NOT", "LK_BW_AND", "LK_BW_OR", "LK_BW_XOR", "LK_BW_SHL", "LK_BW_SHR", - "LK_PROC", "LK_RETURN", + "LK_ASSIGN", "LK_AS_ADD", "LK_AS_SUB", "LK_AS_MUL", "LK_AS_DIV", "LK_AS_MOD", + "LK_AS_NOT", "LK_AS_AND", "LK_AS_OR", "LK_AS_XOR", "LK_AS_SHL", "LK_AS_SHR", }; /* Initialise a lexer. */ lex lex_init(u8 *src, usize len) { - lex l = { src, 0, len, 0, 0 }; lex_next(&l); return l; + lex l = { src, src, src + len, 0, 0, 0 }; lex_next(&l); return l; } +#define P (l->p) // Pointer to the Current Character +#define Q (l->q) // Pointer to EOF +#define C (l->p[0]) // Current Character +#define D (l->p[1]) // Next Character +#define LN (l->ln) // Line Index +#define CL (l->cl) // Column Index +#define T (l->t) // Current Token + +#define ISIDCHA(c) (isalpha(c) || c == '_') +#define ISIDCHB(c) (isalpha(c) || isdigit(c) || c == '_') + /* Lex the next token, and return the current one. */ tok lex_next(lex *l) { - /* Short circuit if the current token is EOF */ - if (l->t.k == LK_EOF) { return l->t; } - - tok t = l->t, n; u8 *ss = NULL; usize sl = 0; + if (T.k == LK_EOF) { return T; } + tok t = T; T = (tok){ LK_NULL, 0, 0, 0 }; skip:; /* Skip null characters and whitespace */ - for (; l->sp + 1 != l->sl && (!C || isspace(C)); ++l->sp) { - if (C == '\n') { ++l->ln; l->cl = 0; } else { ++l->cl; } + for (; P != Q && (!C || isspace(C)); P += 1) switch (C) { + case '\0': { /* TODO warn user of null character */ } break; + case '\n': { LN += 1; CL = 0; } break; + default: { CL += 1; } break; } - /* Immediately return the current token if the next is EOF */ - if (l->sp + 1 == l->sl) { n.k = LK_EOF; goto ret; } + /* Return the current token immediately if EOF is reached */ + if (P == Q) { T = (tok){ LK_EOF, LN, CL, 0 }; return t; } - /* TODO handle nested multi-line comments */ - /* FIXME multi-line comments do not count lines or columns */ - if (C == '/') switch (D) { /* Comments */ - case '/': { for (l->sp += 2; C != '\n'; ++l->sp) {} goto skip; } - case '*': { for (l->sp += 2; C != '*' && D != '/'; ++l->sp) {} goto skip; } + /* Skip single-line and (potentially nested) multi-line comments */ + if (C == '/') switch (D) { + case '/': { for (P += 2; P != Q && C != '\n'; P += 1); } goto skip; + case '*': { + usize d = 1; for (P += 2, CL += 2; P != Q && d; P += 1) { + if (C == '/' && D == '*') { P += 2; CL += 2; d += 1; continue; } + if (C == '*' && D == '/') { P += 2; CL += 2; d -= 1; continue; } + if (C == '\n') { LN += 1; CL = 0; } else { CL += 1; } + } + } goto skip; } - n.p.ln = l->ln; n.p.cl = l->cl; if (l->sp + 1 != l->sl) { - if (ISIDNCA(C)) { /* Identifiers and Keywords */ - for (ss = &C, ++sl, ++l->sp; ISIDNCB(C); ++sl, ++l->sp); - - /* Check for a keyword, and store its hash */ - u64 h = fnv1a64(ss, sl); kwd *k = kwd_find(h); n.h = h; - if (k) { n.k = k->k; } else { n.k = LK_IDN; } goto end; - } - else if (isdigit(C)) { /* Numbers */ - for (ss = &C, ++sl, ++l->sp; isalnum(C); ++sl, ++l->sp); - - n.k = LK_INT; goto end; - } + /* + FIXME beyond this point EOF isn't checked properly so if a file does not + have a trailing newline it **may** cause a segfault + */ + + T.ln = LN; T.cl = CL; + + /* Handle identifiers and keywords */ + if (ISIDCHA(C)) { + u8 *s = P; usize sl; - else switch (C) { - case '(': { n.k = LK_LPAREN; } goto esc_1; - case ')': { n.k = LK_RPAREN; } goto esc_1; - case '[': { n.k = LK_LBRACK; } goto esc_1; - case ']': { n.k = LK_RBRACK; } goto esc_1; - case '{': { n.k = LK_LBRACE; } goto esc_1; - case '}': { n.k = LK_RBRACE; } goto esc_1; - case ':': { n.k = LK_COLON; } goto esc_1; - case ';': { n.k = LK_SCOLON; } goto esc_1; - case '.': { n.k = LK_PERIOD; } goto esc_1; - case ',': { n.k = LK_COMMA; } goto esc_1; - case '=': { n.k = LK_EQUALS; } goto esc_1; - case '-': switch (D) { - case '>': { n.k = LK_RARROW; } goto esc_2; - } - esc_1: { ss = &C; sl += 1; l->sp += 1; } goto end; - esc_2: { ss = &C; sl += 2; l->sp += 2; } goto end; - esc_3: { ss = &C; sl += 3; l->sp += 3; } goto end; - default: { error(1, "%zu:%zu: Unknown: \'%d\'", n.p.ln, n.p.cl, C); } - } + for (P += 1; ISIDCHB(C); P += 1); + sl = P - s; CL += sl; + + T.v_str = (u8 *)strndup((char *)s, sl); + T.h = fnv1a64(s, sl); kwd *k = kwd_find(T.h); + if (k) { T.k = k->k; } else { T.k = LK_ID; } } -end:; - n.s = (u8 *)strndup((char *)ss, sl); l->cl += sl; + /* Handle number literals */ + else if (isdigit(C)) { + u8 *s = P; usize sl; + + for (P += 1; isalnum(C); P += 1); + sl = P - s; CL += sl; + + T.v_str = (u8 *)strndup((char *)s, sl); + T.k = LK_INT; + } + + /* Handle punctuators and operators */ + else switch (C) { + case '(': { T.k = LK_LPAREN; P += 1; CL += 1; } break; + case ')': { T.k = LK_RPAREN; P += 1; CL += 1; } break; + case '[': { T.k = LK_LBRACK; P += 1; CL += 1; } break; + case ']': { T.k = LK_RBRACK; P += 1; CL += 1; } break; + case '{': { T.k = LK_LBRACE; P += 1; CL += 1; } break; + case '}': { T.k = LK_RBRACE; P += 1; CL += 1; } break; + case ':': { T.k = LK_COLON; P += 1; CL += 1; } break; + case ';': { T.k = LK_SCOLON; P += 1; CL += 1; } break; + case ',': { T.k = LK_COMMA; P += 1; CL += 1; } break; + case '.': { T.k = LK_PERIOD; P += 1; CL += 1; } break; + case '?': { T.k = LK_QMARK; P += 1; CL += 1; } break; + case '+': switch (D) { + default: { T.k = LK_OP_ADD; P += 1; CL += 1; } break; + case '=': { T.k = LK_AS_ADD; P += 2; CL += 2; } break; + } break; + case '-': switch (D) { + default: { T.k = LK_OP_SUB; P += 1; CL += 1; } break; + case '>': { T.k = LK_RARROW; P += 2; CL += 2; } break; + case '=': { T.k = LK_AS_SUB; P += 2; CL += 2; } break; + } break; + case '*': switch (D) { + default: { T.k = LK_OP_MUL; P += 1; CL += 1; } break; + case '=': { T.k = LK_AS_MUL; P += 2; CL += 2; } break; + } break; + case '/': switch (D) { + default: { T.k = LK_OP_DIV; P += 1; CL += 1; } break; + case '=': { T.k = LK_AS_DIV; P += 2; CL += 2; } break; + } break; + case '%': switch (D) { + default: { T.k = LK_OP_MOD; P += 1; CL += 1; } break; + case '=': { T.k = LK_AS_MOD; P += 2; CL += 2; } break; + } break; + case '=': switch (D) { + default: { T.k = LK_ASSIGN; P += 1; CL += 1; } break; + case '=': { T.k = LK_OP_EQ; P += 2; CL += 2; } break; + } break; + case '<': switch (D) { + default: { T.k = LK_OP_LT; P += 1; CL += 1; } break; + case '=': { T.k = LK_OP_LTE; P += 2; CL += 2; } break; + } break; + case '>': switch (D) { + default: { T.k = LK_OP_GT; P += 1; CL += 1; } break; + case '=': { T.k = LK_OP_GTE; P += 2; CL += 2; } break; + } break; + case '!': switch (D) { + default: { T.k = LK_LO_NOT; P += 1; CL += 1; } break; + case '=': { T.k = LK_OP_NEQ; P += 2; CL += 2; } break; + } break; + case '&': switch (D) { + default: { T.k = LK_BW_AND; P += 1; CL += 1; } break; + case '&': { T.k = LK_LO_AND; P += 2; CL += 2; } break; + case '=': { T.k = LK_AS_AND; P += 2; CL += 2; } break; + } break; + case '|': switch (D) { + default: { T.k = LK_BW_OR; P += 1; CL += 1; } break; + case '|': { T.k = LK_LO_OR; P += 2; CL += 2; } break; + case '=': { T.k = LK_AS_OR; P += 2; CL += 2; } break; + } break; + case '~': switch (D) { + default: { T.k = LK_BW_NOT; P += 1; CL += 1; } break; + case '=': { T.k = LK_AS_NOT; P += 2; CL += 2; } break; + } break; + case '^': switch (D) { + default: { T.k = LK_BW_XOR; P += 1; CL += 1; } break; + case '=': { T.k = LK_AS_XOR; P += 2; CL += 2; } break; + } break; + + case '\'': { /* TODO */ } break; + case '\"': { /* TODO */ } break; + + /* Handle unknown characters */ + default: { + warn("%zu:%zu: Unknown character: %X '%c'", LN, CL, C, C); + P += 1; CL += 1; + } break; + } -ret:; - l->t = n; return t; + return t; +} + +/* Return the next token. */ +tok lex_peek(lex *l) { + return T; } /* Lex the next token if the current is of a specific type. */ tok lex_kind(lex *l, tok_k k) { - if (l->t.k != k) { error( + if (T.k != k) { error( 1, "%zu:%zu: Unexpected: \"%s\", was expecting: \"%s\"", - l->t.p.ln, l->t.p.cl, tok_ks[l->t.k], tok_ks[k] - );} + T.ln + 1, T.cl + 1, tok_ks[T.k], tok_ks[k] + ); } return lex_next(l); } -/* Return the current token. */ -tok lex_peek(lex *l) { return l->t; } - /* Print lexer debug output and exit. */ void lex_debug(lex *l) { - for (tok t = lex_next(l); t.k != LK_EOF; free(t.s), t = lex_next(l)) { - printf("%zu:%zu: %s \"%s\"\n", t.p.ln, t.p.cl + 1, tok_ks[t.k], t.s); + for (tok t = lex_next(l); t.k != LK_EOF; free(t.v_str), t = lex_next(l)) { + printf("%zu:%zu: %s \"%s\"\n", t.ln + 1, t.cl + 1, tok_ks[t.k], t.v_str); } } diff --git a/src/lex.h b/src/lex.h new file mode 100644 index 0000000..64393d8 --- /dev/null +++ b/src/lex.h @@ -0,0 +1,48 @@ +// lex.h +// Lexer header file for G +// Copyright (C) 2021, Jakob Wakeling +// All rights reserved. + + + +#ifndef G_LEX_H_H356P5AM +#define G_LEX_H_H356P5AM + +#include "util/util.h" + +typedef enum { + LK_NULL, LK_EOF, LK_ID, LK_INT, LK_FLT, LK_STR, + + LK_RETURN, LK_FOR, LK_IF, LK_ELSE, LK_PROC, + + LK_LPAREN, LK_RPAREN, LK_LBRACK, LK_RBRACK, LK_LBRACE, LK_RBRACE, + LK_COLON, LK_SCOLON, LK_COMMA, LK_PERIOD, LK_RARROW, LK_QMARK, + + LK_OP_ADD, LK_OP_SUB, LK_OP_MUL, LK_OP_DIV, LK_OP_MOD, + LK_OP_EQ, LK_OP_NEQ, LK_OP_GT, LK_OP_LT, LK_OP_GTE, LK_OP_LTE, + LK_LO_NOT, LK_LO_AND, LK_LO_OR, + LK_BW_NOT, LK_BW_AND, LK_BW_OR, LK_BW_XOR, LK_BW_SHL, LK_BW_SHR, + + LK_ASSIGN, LK_AS_ADD, LK_AS_SUB, LK_AS_MUL, LK_AS_DIV, LK_AS_MOD, + LK_AS_NOT, LK_AS_AND, LK_AS_OR, LK_AS_XOR, LK_AS_SHL, LK_AS_SHR, +} tok_k; + +typedef struct { + tok_k k; usize ln, cl; u64 h; + union { u64 v_u64; s64 v_s64; f64 v_f64; u8 *v_str; }; +} tok; + +typedef struct { + u8 *s, *p, *q; usize ln, cl; tok t; +} lex; + +extern char *tok_ks[]; + +extern lex lex_init(u8 *src, usize len); +extern tok lex_next(lex *l); +extern tok lex_peek(lex *l); +extern tok lex_kind(lex *l, tok_k k); + +extern void lex_debug(lex *l); + +#endif // G_LEX_H_H356P5AM diff --git a/src/parse.c b/src/parse.c index 8f32b12..d963360 100644 --- a/src/parse.c +++ b/src/parse.c @@ -50,17 +50,17 @@ ast *parse(lex *l) { static ast *parse_decl(lex *l) { ast *a = ast_init(); - a->s = lex_kind(l, LK_IDN).s; lex_kind(l, LK_COLON); + a->s = lex_kind(l, LK_ID).v_str; lex_kind(l, LK_COLON); - if (lex_peek(l).k == LK_IDN) { /* TODO lookup and store type */ } + if (lex_peek(l).k == LK_ID) { /* TODO lookup and store type */ } switch (lex_peek(l).k) { case LK_COLON: { lex_kind(l, LK_COLON); a->k = AK_DECL; } goto decl_expr; - case LK_EQUALS: { lex_kind(l, LK_EQUALS); a->k = AK_DECL; } goto decl_expr; + case LK_ASSIGN: { lex_kind(l, LK_ASSIGN); a->k = AK_DECL; } goto decl_expr; decl_expr: { a->c = parse_expr(l); } break; default: { error( 1, "%zu:%zu: Unexpected: \"%s\" (parse_decl)", - lex_peek(l).p.ln, lex_peek(l).p.cl, tok_ks[lex_peek(l).k] + lex_peek(l).ln + 1, lex_peek(l).cl + 1, tok_ks[lex_peek(l).k] ); } break; } @@ -77,7 +77,7 @@ static ast *parse_stmt(lex *l) { case LK_RETURN: { lex_kind(l, LK_RETURN); a->k = AK_RETURN; a->c = parse_expr(l); } break; default: { error( 1, "%zu:%zu: Unexpected: \"%s\" (parse_stmt)", - lex_peek(l).p.ln, lex_peek(l).p.cl, tok_ks[lex_peek(l).k] + lex_peek(l).ln + 1, lex_peek(l).cl + 1, tok_ks[lex_peek(l).k] ); } break; } @@ -108,7 +108,7 @@ static ast *parse_expr(lex *l) { case LK_INT: { return parse_int(l); } break; default: { error( 1, "%zu:%zu: Unexpected: \"%s\" (parse_expr)", - lex_peek(l).p.ln, lex_peek(l).p.cl, tok_ks[lex_peek(l).k] + lex_peek(l).ln + 1, lex_peek(l).cl + 1, tok_ks[lex_peek(l).k] ); } break; } @@ -136,14 +136,14 @@ static ast *parse_proc(lex *l) { lex_kind(l, LK_RARROW); /* TODO dont hardcode return type */ - lex_kind(l, LK_IDN); a->t = t_s64; + lex_kind(l, LK_ID); a->t = t_s64; } a->c = parse_stmt_compound(l); return a; } static ast *parse_int(lex *l) { - val v = val_strint(lex_kind(l, LK_INT).s); + val v = val_strint(lex_kind(l, LK_INT).v_str); ast *a = ast_init(); a->k = AK_INT; a->v = v; return a; }