Author | Jakob Wakeling <[email protected]> |
Date | 2023-05-30 05:29:17 |
Commit | d44ca567693a048a42551b84af16eeaf09101aa0 |
Parent | af77441bb0a4dd397f9bcad216f5f892557b938b |
Implement string lexing
Diffstat
M | README.md | | | 1 | + |
M | doc/g.ebnf | | | 5 | +++-- |
M | src/init.c | | | 2 | +- |
M | src/lex.c | | | 99 | +++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------- |
M | src/lex.h | | | 2 | +- |
M | src/llvm.c | | | 3 | ++- |
M | src/parse.c | | | 20 | ++++++++++---------- |
M | src/parse.h | | | 2 | +- |
8 files changed, 83 insertions, 51 deletions
diff --git a/README.md b/README.md index ca1b7f5..eb424cf 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,7 @@ command. The second command will output an executable file, *a.out* by default. - [ ] Implement booleans - [x] Implement integers - [x] Implement reals +- [ ] Implement pointers - [ ] Implement arrays - [x] Implement expressions - [x] Implement type casting diff --git a/doc/g.ebnf b/doc/g.ebnf index faae4e0..75d223f 100644 --- a/doc/g.ebnf +++ b/doc/g.ebnf @@ -30,6 +30,7 @@ expr = iden | literal | "(", expr, ")" | type, "(", expr, ")" (* Type cast *) | iden, "(", [ expr, { ",", expr } ], ")" (* Procedure call *) + | "#", iden, [ "(", [ expr, { ",", expr } ], ")" ] (* Hash expression *) | "+", expr | "-", expr (* Unary POS and NEG *) | "!", expr | "~", expr (* Logical and bitwise NOT *) @@ -83,8 +84,8 @@ literal = literal_null | literal_bool | literal_int | literal_flt | literal_chr (* General *) escape = escape_std | escape_oct | escape_hex | escape_utf ; escape_std = "\0" | "\a" | "\b" | "\t" | "\n" | "\v" | "\f" | "\r" | '\"' | "\'" | "\\" ; -escape_oct = "\o", digit_oct, [ digit_oct, [ digit_oct ] ] ; -escape_hex = "\x", digit_hex, [ digit_hex ] ; +escape_oct = "\o", digit_oct, digit_oct, digit_oct ; +escape_hex = "\x", digit_hex, digit_hex ; escape_utf = "\u", quadd_hex, [ quadd_hex ] ; digit = digit_dec ; diff --git a/src/init.c b/src/init.c index 2d7e6f6..b78f831 100644 --- a/src/init.c +++ b/src/init.c @@ -44,7 +44,7 @@ static ast kwds[] = { { AK_TYPE, 0, 0, 0, "char", &TYPE(TY_CHAR), { 0 }, NULL }, { AK_TYPE, 0, 0, 0, "rune", &TYPE(TY_RUNE), { 0 }, NULL }, - { AK_NULL, 0, 0, 0, NULL, NULL, { 0 }, NULL } + { AK_ZERO, 0, 0, 0, NULL, NULL, { 0 }, NULL } }; void initialise(void) { diff --git a/src/lex.c b/src/lex.c index 81843f0..5914950 100644 --- a/src/lex.c +++ b/src/lex.c @@ -10,14 +10,13 @@ #include "util/error.h" #include "util/util.h" -#include <ctype.h> #include <stdint.h> #include <stdlib.h> #include <string.h> #include <stdio.h> char *tok_ks[] = { - "NULL_TK", "TK_EOF", "TK_ID", "TK_NUM", "TK_STR", "TK_HASH", + "TK_ZERO", "TK_EOF", "TK_ID", "TK_NUM", "TK_CHR", "TK_STR", "TK_HASH", "TK_NULL", "TK_TRUE", "TK_FALSE", "TK_RETURN", "TK_IF", "TK_ELSE", "TK_FOR", "TK_PROC", @@ -33,18 +32,26 @@ char *tok_ks[] = { "TK_AS_NOT", "TK_AS_AND", "TK_AS_OR", "TK_AS_XOR", "TK_AS_SHL", "TK_AS_SHR", }; +#define is_space(c) (c == ' ' || c == '\f' || c == '\n' || c == '\r' || c == '\t' || c == '\v') +#define is_alpha(c) ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) +#define is_digit_bin(c) (c == '0' || c == '1') +#define is_digit_oct(c) (c >= '0' && c <= '7') +#define is_digit_dec(c) (c >= '0' && c <= '9') +#define is_digit_doz(c) ((c >= '0' && c <= '9') || (c == 'A' || c == 'B')) +#define is_digit_hex(c) ((c >= '0' && c <= '9') || (c >= 'A' || c == 'F')) + /* Push a token to a token array. */ void tok_a_push(tok_a *a, tok t) { tok *ta = realloc(a->a, (a->al += 1) * sizeof (tok)); if (!ta) { error(1, SERR); } else { a->a = ta; a->a[a->al - 1] = t; } } -/* Pop a token from a token array. */ +/* Pop a token from a token array. A zero token is returned when empty. */ tok tok_a_pop(tok_a *a) { return (a->al ? a->a[a->al -= 1] : (tok){ 0 }); } -/* Pop a token from a token array. */ +/* Pop a token from a token array. A zero token is returned when empty. */ tok tok_a_peek(tok_a *a) { return (a->al ? a->a[a->al - 1] : (tok){ 0 }); } @@ -72,7 +79,7 @@ tok lex_next(lex *l) { tok t = T; T = (tok){ 0 }; /* Skip null characters and whitespace */ - skip:; for (; P != Q && (!C || isspace(C)); P += 1) switch (C) { + skip:; for (; P != Q && (!C || is_space(C)); P += 1) switch (C) { case '\0': { note(l->n, l->ln, l->cl, 1, "Null character ignored"); } break; case '\n': { LN += 1; CL = 0; } break; default: { CL += 1; } break; @@ -93,18 +100,13 @@ tok lex_next(lex *l) { } goto skip; } - /* - FIXME beyond this point EOF isn't checked properly so if a file does not - have a trailing newline it **may** cause a segfault (?) - */ - T.ln = LN; T.cl = CL; /* Handle identifiers and keywords */ - if (isalpha(C) || C == '_') { + if (is_alpha(C) || C == '_') { char *s = P; UINT sl; - for (P += 1; isalpha(C) || isdigit(C) || C == '_'; P += 1); + for (P += 1; is_alpha(C) || is_digit_dec(C) || C == '_'; P += 1); sl = P - s; CL += sl; T.h = syt_hash(s, sl); if (strncmp(s, "null", sl) == 0) { T.k = TK_NULL; } @@ -119,11 +121,11 @@ tok lex_next(lex *l) { } /* Handle number literals */ - else if (isdigit(C)) { + else if (is_digit_dec(C)) { char *s = P; UINT sl; - for (P += 1; isalnum(C); P += 1); - if (C == '.') { P += 1; for (P += 1; isdigit(C); P += 1); } + for (P += 1; is_alpha(C) || is_digit_dec(C); P += 1); + if (C == '.') { P += 1; for (P += 1; is_digit_dec(C); P += 1); } sl = P - s; CL += sl; @@ -134,7 +136,7 @@ tok lex_next(lex *l) { else if (C == '#') { char *s = P; UINT sl; - for (P += 1; isalpha(C) || isdigit(C) || C == '_'; P += 1); + for (P += 1; is_alpha(C) || is_digit_dec(C) || C == '_'; P += 1); sl = P - s; CL += sl; if (sl <= 1) { note(l->n, T.ln, T.cl, 0, "A hash must be followed by an identifier"); goto redo; } @@ -211,26 +213,53 @@ tok lex_next(lex *l) { case '=': { T.k = TK_AS_XOR; P += 2; CL += 2; } break; } break; - /* TODO implement character escapes */ - /* TODO implement multi line strings */ - // case '\'': { - // char *s = P; UINT sl; - - // for (P += 1; C != '\''; P += 1); - // sl = P - s; CL += sl; - - // T.k = TK_NUM; - // } break; - case '\"': { - char *s = P += 1; UINT sl; + /* Handle character and string literals */ + case '\'': case '\"': { + char quote = C, *s = P += 1; CL += 1; register char *head = s; - for (; C != '\"' && C != '\n'; P += 1); - sl = P - s; CL += sl; T.h = syt_hash(s, sl); + for (; C != quote && C != '\n' && P != Q;) { + /* Non escape characters are not altered */ + if (C != '\\') { *head = C; head += 1; } + + /* Escape characters are processed and re-written to head */ + else switch (D) { + case '0': { *head = 0x00; P += 2; head += 1; } break; + case 'a': { *head = '\a'; P += 2; head += 1; } break; + case 'b': { *head = '\b'; P += 2; head += 1; } break; + case 'f': { *head = '\f'; P += 2; head += 1; } break; + case 'n': { *head = '\n'; P += 2; head += 1; } break; + case 'r': { *head = '\r'; P += 2; head += 1; } break; + case 't': { *head = '\t'; P += 2; head += 1; } break; + case 'v': { *head = '\v'; P += 2; head += 1; } break; + case '\'': { *head = '\''; P += 2; head += 1; } break; + case '\"': { *head = '\"'; P += 2; head += 1; } break; + case '\\': { *head = '\\'; P += 2; head += 1; } break; + case '\n': { P += 2; } break; + // case 'o': {} break; + // case 'x': {} break; + // case 'u': {} break; + default: { + note(l->n, l->ln, l->cl, 1, "Unknown escape sequence: \"\\%c\"", D); + *head = D; P += 2; head += 1; + } break; + } + } - if (C != '\"') { note(l->n, T.ln, T.cl, 0, "Missing closing quote"); } - else { P += 1; } + UINT sl = head - s; CL += sl; + if (C != quote) { note(l->n, T.ln, T.cl, 0, "Missing closing quote"); } + else if (P != Q) { P += 1; CL += 1; } - T.k = TK_STR; if (!(T.s = strndup(s, sl))) { error(1, SERR); } + if (quote == '\'') { + T.k = TK_NUM; + if (!(T.s = strndup(s, sl))) { error(1, SERR); } + + note(l->n, T.ln, T.cl, 1, "Characters are not yet fully implemented"); + /* Lex as TK_NUM, but how to distinguish at parsing stage? */ + } + else if (quote == '\"') { + T.k = TK_STR; T.h = syt_hash(s, sl); + if (!(T.s = strndup(s, sl))) { error(1, SERR); } + } } break; /* Handle unknown characters */ diff --git a/src/lex.h b/src/lex.h index 2ce4303..ff94340 100644 --- a/src/lex.h +++ b/src/lex.h @@ -10,7 +10,7 @@ /* Remember to update tok_ks in lex.c */ typedef enum { - NULL_TK, TK_EOF, TK_ID, TK_NUM, TK_STR, TK_HASH, + TK_ZERO, TK_EOF, TK_ID, TK_NUM, TK_CHR, TK_STR, TK_HASH, TK_NULL, TK_TRUE, TK_FALSE, TK_RETURN, TK_IF, TK_ELSE, TK_FOR, TK_PROC, diff --git a/src/llvm.c b/src/llvm.c index db97832..a8fedb7 100644 --- a/src/llvm.c +++ b/src/llvm.c @@ -222,7 +222,7 @@ static LLVMValueRef llvm_stmt_expr(ast *a, syt *st) { /* Generate IR for a return statement. */ static LLVMValueRef llvm_stmt_return(ast *a, syt *st) { - return LLVMBuildRet(llvm_builder, llvm_expr(C[0], st)); + return LLVMBuildRet(llvm_builder, a->c.al > 0 ? llvm_expr(C[0], st) : NULL); } /* Generate IR for an if statement. */ @@ -385,6 +385,7 @@ static inline void llvm_free(void) { /* Return the appropriate LLVMTypeRef for a G type. */ static LLVMTypeRef llvm_type(type *t) { switch (t->k) { + case TY_NULL: { return LLVMVoidType(); } break; case TY_B8: { return LLVMIntType(8); } break; case TY_B16: { return LLVMIntType(16); } break; case TY_B32: { return LLVMIntType(32); } break; diff --git a/src/parse.c b/src/parse.c index df43318..8325461 100644 --- a/src/parse.c +++ b/src/parse.c @@ -20,7 +20,7 @@ typedef struct { tok_k tk; ast_k ak; s32 o; bool as; } op; char *ast_ks[] = { - "AK_NULL", "AK_PROG", "AK_PROC", "AK_TYPE", "AK_CAST", + "AK_ZERO", "AK_PROG", "AK_PROC", "AK_TYPE", "AK_CAST", "AK_STMT", "AK_COMP", "AK_DECL", "AK_RETURN", "AK_IF", "AK_FOR", @@ -226,7 +226,7 @@ static ast *parse_stmt_return(lex *l, syt *st) { lex_kind(l, TK_RETURN); ast *a = ast_init(); a->k = AK_RETURN; - ast_push(a, parse_expr(l, st, false)); + if (T.k != TK_SCOLON) { ast_push(a, parse_expr(l, st, false)); } lex_kind(l, TK_SCOLON); return a; } @@ -307,7 +307,7 @@ static ast *parse_expr(lex *l, syt *st, bool arg) { case TK_LPAREN: { tok_a_push(&ts, lex_next(l)); } break; case TK_RPAREN: { for (tok t = tok_a_pop(&ts);; t = tok_a_pop(&ts)) { - if (t.k == NULL_TK) { + if (t.k == TK_ZERO) /* Stack is empty */ { if (arg) { goto eox; } note(l->n, T.ln, T.cl, -1, "expected left parenthesis"); } @@ -343,7 +343,7 @@ static ast *parse_expr(lex *l, syt *st, bool arg) { ast_a_push(&as, a); } break; default: /* Handle operators */ { - op o1, o2; if ((o1 = op_lookup(T.k, false)).tk == NULL_TK) { goto eox; } + op o1, o2; if ((o1 = op_lookup(T.k, false)).tk == TK_ZERO) { goto eox; } /* If there is an operator at the top of the operator stack that is not @@ -351,7 +351,7 @@ static ast *parse_expr(lex *l, syt *st, bool arg) { same precedence as o1 and o1 is left-associative, then pop it from the stack onto the output. */ - for (o2 = op_lookup(tok_a_peek(&ts).k, false); o2.tk != NULL_TK; o2 = op_lookup(tok_a_peek(&ts).k, false)) { + for (o2 = op_lookup(tok_a_peek(&ts).k, false); o2.tk != TK_ZERO; o2 = op_lookup(tok_a_peek(&ts).k, false)) { if (o2.tk == TK_LPAREN || (o1.o < o2.o && (o1.o != o2.o || o1.as == true))) { break; } shunt(&as, tok_a_pop(&ts), o2); @@ -362,7 +362,7 @@ static ast *parse_expr(lex *l, syt *st, bool arg) { } eox:; /* Pop any remaining operators from the operator stack */ - for (tok t = tok_a_pop(&ts); t.k != NULL_TK; t = tok_a_pop(&ts)) { + for (tok t = tok_a_pop(&ts); t.k != TK_ZERO; t = tok_a_pop(&ts)) { if (t.k == TK_LPAREN) { error(1, "LPAREN: TODO"); } if (t.k == TK_RPAREN) { error(1, "RPAREN: TODO"); } @@ -381,7 +381,6 @@ static ast *parse_expr_proc(lex *l, syt *st) { /* TODO */ lex_kind(l, TK_RPAREN); /* Parse optional procedure return type(s) */ - /* TODO parse more than one return type */ if (T.k == TK_RARROW) { lex_next(l); tok t = lex_kind(l, TK_ID); ast *s = syt_search_h(st, t.h, t.s); @@ -397,6 +396,7 @@ static ast *parse_expr_proc(lex *l, syt *st) { a->t = s->t; } + else { a->t = &TYPE(TY_NULL); } ast_push(a, parse_stmt_compound(l, st)); return a; } diff --git a/src/parse.h b/src/parse.h index ad88305..7781e50 100644 --- a/src/parse.h +++ b/src/parse.h @@ -16,7 +16,7 @@ /* Remember to update ast_ks in parse.c */ typedef enum { - AK_NULL, AK_PROG, AK_PROC, AK_TYPE, AK_CAST, + AK_ZERO, AK_PROG, AK_PROC, AK_TYPE, AK_CAST, AK_STMT, AK_COMP, AK_DECL, AK_RETURN, AK_IF, AK_FOR,