G

G Programming Language
git clone http://git.omkov.net/G
Log | Tree | Refs | README | Download

Author: Jakob Wakeling <[email protected]>
Date: 2021-09-13 06:13:31
Commit: f9cfead4e2e097b074e715d0d7e2a2631f1228be
Parent: 760381011b2ab87cd09106506a7305fcea141486

lex: Rebuild and flesh out lexer

Diffstat

M CMakeLists.txt | 2 +-
M README.md | 2 +-
M src/g.h | 25 -------------------------
M src/lex.c | 206 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
A src/lex.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
M src/parse.c | 16 ++++++++--------

6 files changed, 238 insertions, 61 deletions

diff --git a/CMakeLists.txt b/CMakeLists.txt
index dde9eb5..74214ac 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 CMAKE_MINIMUM_REQUIRED(VERSION 3.12)
-PROJECT(G VERSION 0.0.0 LANGUAGES C)
+PROJECT(G VERSION 0.1.0 LANGUAGES C)
 
 SET(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/bin)
 ADD_COMPILE_DEFINITIONS(PROJECT_VERSION="${PROJECT_VERSION}")
diff --git a/README.md b/README.md
index 7d9c0b5..e699f66 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,7 @@ command. The second command will output an executable file, *a.out* by default.
 - [ ] Implement procedure definitions
 - [ ] Implement procedure calls
 - [ ] Implement variable assignments
-- [ ] Implement integers
+- [x] Implement integers
 - [ ] Implement reals
 - [ ] Implement arrays
 - [ ] Implement the *type* type
diff --git a/src/g.h b/src/g.h
index ade650e..012b849 100644
--- a/src/g.h
+++ b/src/g.h
@@ -12,19 +12,7 @@
 #include "util/stack.h"
 #include "value.h"
 #include "util/util.h"
-
-typedef enum {
-	LK_NIL, LK_EOF, LK_IDN,
-	LK_INT, LK_FLT, LK_CHR, LK_STR,
-	
-	LK_LPAREN, LK_RPAREN, LK_LBRACK, LK_RBRACK,
-	LK_LBRACE, LK_RBRACE, LK_COLON,  LK_SCOLON,
-	LK_PERIOD, LK_COMMA,
-	
-	LK_EQUALS, LK_RARROW,
-	
-	LK_PROC, LK_RETURN,
-} tok_k;
+#include "lex.h"
 
 typedef enum {
 	AK_NIL,
@@ -37,28 +25,17 @@ typedef enum {
 } ast_k;
 
 typedef struct { usize ln, cl; } pos;
-typedef struct { tok_k k; pos p; u8 *s; u64 h; } tok;
-
-typedef struct { u8 *s; usize sp, sl, ln, cl; tok t; } lex;
 
 typedef struct ast {
 	ast_k k; pos p; type *t; val v; u8 *s;
 	struct ast *c, *cl, *cr; stack *cs;
 } ast;
 
-extern char *tok_ks[];
 extern char *ast_ks[];
 
-extern lex lex_init(u8 *src, usize len);
-extern tok lex_next(lex *l);
-extern tok lex_kind(lex *l, tok_k k);
-extern tok lex_peek(lex *l);
-
 extern ast *ast_init(void);
 extern void ast_free(ast *a);
 
 extern ast *parse(lex *l);
 
-extern void lex_debug(lex *l);
-
 #endif // G_G_H_PHZNJHJ7
diff --git a/src/lex.c b/src/lex.c
index ff6f7dd..0341f3d 100644
--- a/src/lex.c
+++ b/src/lex.c
@@ -20,112 +20,197 @@
 #include <string.h>
 #include <stdio.h>
 
-#define C (l->s[l->sp])
-#define D (l->s[l->sp + 1])
-
-#define ISIDNCA(c) (isalpha(c) || c == '_')
-#define ISIDNCB(c) (isalpha(c) || isdigit(c) || c == '_')
-
 char *tok_ks[] = {
-	"LK_NIL", "LK_EOF", "LK_IDN",
-	"LK_INT", "LK_FLT", "LK_CHR", "LK_STR",
+	"LK_NULL", "LK_EOF", "LK_ID", "LK_INT", "LK_FLT", "LK_STR",
+	
+	"LK_RETURN", "LK_FOR", "LK_IF", "LK_ELSE", "LK_PROC",
 
-	"LK_LPAREN", "LK_RPAREN", "LK_LBRACK", "LK_RBRACK",
-	"LK_LBRACE", "LK_RBRACE", "LK_COLON",  "LK_SCOLON",
-	"LK_PERIOD", "LK_COMMA",
+	"LK_LPAREN", "LK_RPAREN", "LK_LBRACK", "LK_RBRACK", "LK_LBRACE", "LK_RBRACE",
+	"LK_COLON",  "LK_SCOLON", "LK_COMMA",  "LK_PERIOD", "LK_RARROW", "LK_QMARK",
 
-	"LK_EQUALS", "LK_RARROW",
+	"LK_OP_ADD", "LK_OP_SUB", "LK_OP_MUL", "LK_OP_DIV", "LK_OP_MOD",
+	"LK_OP_EQ",  "LK_OP_NEQ", "LK_OP_GT",  "LK_OP_LT",  "LK_OP_GTE", "LK_OP_LTE",
+	"LK_LO_NOT", "LK_LO_AND", "LK_LO_OR",
+	"LK_BW_NOT", "LK_BW_AND", "LK_BW_OR",  "LK_BW_XOR", "LK_BW_SHL", "LK_BW_SHR",
 
-	"LK_PROC", "LK_RETURN",
+	"LK_ASSIGN", "LK_AS_ADD", "LK_AS_SUB", "LK_AS_MUL", "LK_AS_DIV", "LK_AS_MOD",
+	"LK_AS_NOT", "LK_AS_AND", "LK_AS_OR",  "LK_AS_XOR", "LK_AS_SHL", "LK_AS_SHR",
 };
 
 /* Initialise a lexer. */
 lex lex_init(u8 *src, usize len) {
-	lex l = { src, 0, len, 0, 0 }; lex_next(&l); return l;
+	lex l = { src, src, src + len, 0, 0, 0 }; lex_next(&l); return l;
 }
 
+#define P  (l->p)    // Pointer to the Current Character
+#define Q  (l->q)    // Pointer to EOF
+#define C  (l->p[0]) // Current Character
+#define D  (l->p[1]) // Next Character
+#define LN (l->ln)   // Line Index
+#define CL (l->cl)   // Column Index
+#define T  (l->t)    // Current Token
+
+#define ISIDCHA(c) (isalpha(c) || c == '_')
+#define ISIDCHB(c) (isalpha(c) || isdigit(c) || c == '_')
+
 /* Lex the next token, and return the current one. */
 tok lex_next(lex *l) {
-	/* Short circuit if the current token is EOF */
-	if (l->t.k == LK_EOF) { return l->t; }
-	
-	tok t = l->t, n; u8 *ss = NULL; usize sl = 0;
+	if (T.k == LK_EOF) { return T; }
+	tok t = T; T = (tok){ LK_NULL, 0, 0, 0 };
 
 skip:;
 	/* Skip null characters and whitespace */
-	for (; l->sp + 1 != l->sl && (!C || isspace(C)); ++l->sp) {
-		if (C == '\n') { ++l->ln; l->cl = 0; } else { ++l->cl; }
+	for (; P != Q && (!C || isspace(C)); P += 1) switch (C) {
+		case '\0': { /* TODO warn user of null character */ } break;
+		case '\n': { LN += 1; CL = 0; } break;
+		default:   { CL += 1; } break;
 	}
 
-	/* Immediately return the current token if the next is EOF */
-	if (l->sp + 1 == l->sl) { n.k = LK_EOF; goto ret; }
+	/* Return the current token immediately if EOF is reached */
+	if (P == Q) { T = (tok){ LK_EOF, LN, CL, 0 }; return t; }
 
-	/* TODO handle nested multi-line comments */
-	/* FIXME multi-line comments do not count lines or columns */
-	if (C == '/') switch (D) { /* Comments */
-	case '/': { for (l->sp += 2; C != '\n'; ++l->sp) {} goto skip; }
-	case '*': { for (l->sp += 2; C != '*' && D != '/'; ++l->sp) {} goto skip; }
+	/* Skip single-line and (potentially nested) multi-line comments */
+	if (C == '/') switch (D) {
+		case '/': { for (P += 2; P != Q && C != '\n'; P += 1); } goto skip;
+		case '*': {
+			usize d = 1; for (P += 2, CL += 2; P != Q && d; P += 1) {
+				if (C == '/' && D == '*') { P += 2; CL += 2; d += 1; continue; }
+				if (C == '*' && D == '/') { P += 2; CL += 2; d -= 1; continue; }
+				if (C == '\n') { LN += 1; CL = 0; } else { CL += 1; }
+			}
+		} goto skip;
 	}
 
-	n.p.ln = l->ln; n.p.cl = l->cl; if (l->sp + 1 != l->sl) {
-		if (ISIDNCA(C)) { /* Identifiers and Keywords */
-			for (ss = &C, ++sl, ++l->sp; ISIDNCB(C); ++sl, ++l->sp);
-			
-			/* Check for a keyword, and store its hash */
-			u64 h = fnv1a64(ss, sl); kwd *k = kwd_find(h); n.h = h;
-			if (k) { n.k = k->k; } else { n.k = LK_IDN; } goto end;
-		}
-		else if (isdigit(C)) { /* Numbers */
-			for (ss = &C, ++sl, ++l->sp; isalnum(C); ++sl, ++l->sp);
-			
-			n.k = LK_INT; goto end;
-		}
+	/*
+		FIXME beyond this point EOF isn't checked properly so if a file does not
+		have a trailing newline it **may** cause a segfault
+	*/
+	
+	T.ln = LN; T.cl = CL;
+	
+	/* Handle identifiers and keywords */
+	if (ISIDCHA(C)) {
+		u8 *s = P; usize sl;
 
-		else switch (C) {
-		case '(': { n.k = LK_LPAREN; } goto esc_1;
-		case ')': { n.k = LK_RPAREN; } goto esc_1;
-		case '[': { n.k = LK_LBRACK; } goto esc_1;
-		case ']': { n.k = LK_RBRACK; } goto esc_1;
-		case '{': { n.k = LK_LBRACE; } goto esc_1;
-		case '}': { n.k = LK_RBRACE; } goto esc_1;
-		case ':': { n.k = LK_COLON;  } goto esc_1;
-		case ';': { n.k = LK_SCOLON; } goto esc_1;
-		case '.': { n.k = LK_PERIOD; } goto esc_1;
-		case ',': { n.k = LK_COMMA;  } goto esc_1;
-		case '=': { n.k = LK_EQUALS; } goto esc_1;
-		case '-': switch (D) {
-			case '>': { n.k = LK_RARROW; } goto esc_2;
-		}
-		esc_1: { ss = &C; sl += 1; l->sp += 1; } goto end;
-		esc_2: { ss = &C; sl += 2; l->sp += 2; } goto end;
-		esc_3: { ss = &C; sl += 3; l->sp += 3; } goto end;
-		default: { error(1, "%zu:%zu: Unknown: \'%d\'", n.p.ln, n.p.cl, C); }
-		}
+		for (P += 1; ISIDCHB(C); P += 1);
+		sl = P - s; CL += sl;
+		
+		T.v_str = (u8 *)strndup((char *)s, sl);
+		T.h = fnv1a64(s, sl); kwd *k = kwd_find(T.h);
+		if (k) { T.k = k->k; } else { T.k = LK_ID; }
 	}
 
-end:;
-	n.s = (u8 *)strndup((char *)ss, sl); l->cl += sl;
+	/* Handle number literals */
+	else if (isdigit(C)) {
+		u8 *s = P; usize sl;
+		
+		for (P += 1; isalnum(C); P += 1);
+		sl = P - s; CL += sl;
+		
+		T.v_str = (u8 *)strndup((char *)s, sl);
+		T.k = LK_INT;
+	}
+	
+	/* Handle punctuators and operators */
+	else switch (C) {
+		case '(': { T.k = LK_LPAREN; P += 1; CL += 1; } break;
+		case ')': { T.k = LK_RPAREN; P += 1; CL += 1; } break;
+		case '[': { T.k = LK_LBRACK; P += 1; CL += 1; } break;
+		case ']': { T.k = LK_RBRACK; P += 1; CL += 1; } break;
+		case '{': { T.k = LK_LBRACE; P += 1; CL += 1; } break;
+		case '}': { T.k = LK_RBRACE; P += 1; CL += 1; } break;
+		case ':': { T.k = LK_COLON;  P += 1; CL += 1; } break;
+		case ';': { T.k = LK_SCOLON; P += 1; CL += 1; } break;
+		case ',': { T.k = LK_COMMA;  P += 1; CL += 1; } break;
+		case '.': { T.k = LK_PERIOD; P += 1; CL += 1; } break;
+		case '?': { T.k = LK_QMARK;  P += 1; CL += 1; } break;
+		case '+': switch (D) {
+			default:  { T.k = LK_OP_ADD; P += 1; CL += 1; } break;
+			case '=': { T.k = LK_AS_ADD; P += 2; CL += 2; } break;
+		} break;
+		case '-': switch (D) {
+			default:  { T.k = LK_OP_SUB; P += 1; CL += 1; } break;
+			case '>': { T.k = LK_RARROW; P += 2; CL += 2; } break;
+			case '=': { T.k = LK_AS_SUB; P += 2; CL += 2; } break;
+		} break;
+		case '*': switch (D) {
+			default:  { T.k = LK_OP_MUL; P += 1; CL += 1; } break;
+			case '=': { T.k = LK_AS_MUL; P += 2; CL += 2; } break;
+		} break;
+		case '/': switch (D) {
+			default:  { T.k = LK_OP_DIV; P += 1; CL += 1; } break;
+			case '=': { T.k = LK_AS_DIV; P += 2; CL += 2; } break;
+		} break;
+		case '%': switch (D) {
+			default:  { T.k = LK_OP_MOD; P += 1; CL += 1; } break;
+			case '=': { T.k = LK_AS_MOD; P += 2; CL += 2; } break;
+		} break;
+		case '=': switch (D) {
+			default:  { T.k = LK_ASSIGN; P += 1; CL += 1; } break;
+			case '=': { T.k = LK_OP_EQ;  P += 2; CL += 2; } break;
+		} break;
+		case '<': switch (D) {
+			default:  { T.k = LK_OP_LT;  P += 1; CL += 1; } break;
+			case '=': { T.k = LK_OP_LTE; P += 2; CL += 2; } break;
+		} break;
+		case '>': switch (D) {
+			default:  { T.k = LK_OP_GT;  P += 1; CL += 1; } break;
+			case '=': { T.k = LK_OP_GTE; P += 2; CL += 2; } break;
+		} break;
+		case '!': switch (D) {
+			default:  { T.k = LK_LO_NOT; P += 1; CL += 1; } break;
+			case '=': { T.k = LK_OP_NEQ; P += 2; CL += 2; } break;
+		} break;
+		case '&': switch (D) {
+			default:  { T.k = LK_BW_AND; P += 1; CL += 1; } break;
+			case '&': { T.k = LK_LO_AND; P += 2; CL += 2; } break;
+			case '=': { T.k = LK_AS_AND; P += 2; CL += 2; } break;
+		} break;
+		case '|': switch (D) {
+			default:  { T.k = LK_BW_OR;  P += 1; CL += 1; } break;
+			case '|': { T.k = LK_LO_OR;  P += 2; CL += 2; } break;
+			case '=': { T.k = LK_AS_OR;  P += 2; CL += 2; } break;
+		} break;
+		case '~': switch (D) {
+			default:  { T.k = LK_BW_NOT; P += 1; CL += 1; } break;
+			case '=': { T.k = LK_AS_NOT; P += 2; CL += 2; } break;
+		} break;
+		case '^': switch (D) {
+			default:  { T.k = LK_BW_XOR; P += 1; CL += 1; } break;
+			case '=': { T.k = LK_AS_XOR; P += 2; CL += 2; } break;
+		} break;
+		
+		case '\'': { /* TODO */ } break;
+		case '\"': { /* TODO */ } break;
+		
+		/* Handle unknown characters */
+		default: {
+			warn("%zu:%zu: Unknown character: %X '%c'", LN, CL, C, C);
+			P += 1; CL += 1;
+		} break;
+	}
 
-ret:;
-	l->t = n; return t;
+	return t;
+}
+
+/* Return the next token. */
+tok lex_peek(lex *l) {
+	return T;
 }
 
 /* Lex the next token if the current is of a specific type. */
 tok lex_kind(lex *l, tok_k k) {
-	if (l->t.k != k) { error(
+	if (T.k != k) { error(
 		1, "%zu:%zu: Unexpected: \"%s\", was expecting: \"%s\"",
-		l->t.p.ln, l->t.p.cl, tok_ks[l->t.k], tok_ks[k]
-	);}
+		T.ln + 1, T.cl + 1, tok_ks[T.k], tok_ks[k]
+	); }
 
 	return lex_next(l);
 }
 
-/* Return the current token. */
-tok lex_peek(lex *l) { return l->t; }
-
 /* Print lexer debug output and exit. */
 void lex_debug(lex *l) {
-	for (tok t = lex_next(l); t.k != LK_EOF; free(t.s), t = lex_next(l)) {
-		printf("%zu:%zu: %s \"%s\"\n", t.p.ln, t.p.cl + 1, tok_ks[t.k], t.s);
+	for (tok t = lex_next(l); t.k != LK_EOF; free(t.v_str), t = lex_next(l)) {
+		printf("%zu:%zu: %s \"%s\"\n", t.ln + 1, t.cl + 1, tok_ks[t.k], t.v_str);
 	}
 }
diff --git a/src/lex.h b/src/lex.h
new file mode 100644
index 0000000..64393d8
--- /dev/null
+++ b/src/lex.h
@@ -0,0 +1,48 @@
+// lex.h
+// Lexer header file for G
+// Copyright (C) 2021, Jakob Wakeling
+// All rights reserved.
+
+
+
+#ifndef G_LEX_H_H356P5AM
+#define G_LEX_H_H356P5AM
+
+#include "util/util.h"
+
+typedef enum {
+	LK_NULL, LK_EOF, LK_ID, LK_INT, LK_FLT, LK_STR,
+	
+	LK_RETURN, LK_FOR, LK_IF, LK_ELSE, LK_PROC,
+	
+	LK_LPAREN, LK_RPAREN, LK_LBRACK, LK_RBRACK, LK_LBRACE, LK_RBRACE,
+	LK_COLON,  LK_SCOLON, LK_COMMA,  LK_PERIOD, LK_RARROW, LK_QMARK,
+	
+	LK_OP_ADD, LK_OP_SUB, LK_OP_MUL, LK_OP_DIV, LK_OP_MOD,
+	LK_OP_EQ,  LK_OP_NEQ, LK_OP_GT,  LK_OP_LT,  LK_OP_GTE, LK_OP_LTE,
+	LK_LO_NOT, LK_LO_AND, LK_LO_OR,
+	LK_BW_NOT, LK_BW_AND, LK_BW_OR,  LK_BW_XOR, LK_BW_SHL, LK_BW_SHR,
+	
+	LK_ASSIGN, LK_AS_ADD, LK_AS_SUB, LK_AS_MUL, LK_AS_DIV, LK_AS_MOD,
+	LK_AS_NOT, LK_AS_AND, LK_AS_OR,  LK_AS_XOR, LK_AS_SHL, LK_AS_SHR,
+} tok_k;
+
+typedef struct {
+	tok_k k; usize ln, cl; u64 h;
+	union { u64 v_u64; s64 v_s64; f64 v_f64; u8 *v_str; };
+} tok;
+
+typedef struct {
+	u8 *s, *p, *q; usize ln, cl; tok t;
+} lex;
+
+extern char *tok_ks[];
+
+extern lex lex_init(u8 *src, usize len);
+extern tok lex_next(lex *l);
+extern tok lex_peek(lex *l);
+extern tok lex_kind(lex *l, tok_k k);
+
+extern void lex_debug(lex *l);
+
+#endif // G_LEX_H_H356P5AM
diff --git a/src/parse.c b/src/parse.c
index 8f32b12..d963360 100644
--- a/src/parse.c
+++ b/src/parse.c
@@ -50,17 +50,17 @@ ast *parse(lex *l) {
 static ast *parse_decl(lex *l) {
 	ast *a = ast_init();
 
-	a->s = lex_kind(l, LK_IDN).s; lex_kind(l, LK_COLON);
+	a->s = lex_kind(l, LK_ID).v_str; lex_kind(l, LK_COLON);
 
-	if (lex_peek(l).k == LK_IDN) { /* TODO lookup and store type */ }
+	if (lex_peek(l).k == LK_ID) { /* TODO lookup and store type */ }
 
 	switch (lex_peek(l).k) {
 	case LK_COLON:  { lex_kind(l, LK_COLON);  a->k = AK_DECL; } goto decl_expr;
-	case LK_EQUALS: { lex_kind(l, LK_EQUALS); a->k = AK_DECL; } goto decl_expr;
+	case LK_ASSIGN: { lex_kind(l, LK_ASSIGN); a->k = AK_DECL; } goto decl_expr;
 	decl_expr: { a->c = parse_expr(l); } break;
 	default: { error(
 		1, "%zu:%zu: Unexpected: \"%s\" (parse_decl)",
-		lex_peek(l).p.ln, lex_peek(l).p.cl, tok_ks[lex_peek(l).k]
+		lex_peek(l).ln + 1, lex_peek(l).cl + 1, tok_ks[lex_peek(l).k]
 	); } break;
 	}
 
@@ -77,7 +77,7 @@ static ast *parse_stmt(lex *l) {
 	case LK_RETURN: { lex_kind(l, LK_RETURN); a->k = AK_RETURN; a->c = parse_expr(l); } break;
 	default: { error(
 		1, "%zu:%zu: Unexpected: \"%s\" (parse_stmt)",
-		lex_peek(l).p.ln, lex_peek(l).p.cl, tok_ks[lex_peek(l).k]
+		lex_peek(l).ln + 1, lex_peek(l).cl + 1, tok_ks[lex_peek(l).k]
 	); } break;
 	}
 
@@ -108,7 +108,7 @@ static ast *parse_expr(lex *l) {
 	case LK_INT: { return parse_int(l); } break;
 	default: { error(
 		1, "%zu:%zu: Unexpected: \"%s\" (parse_expr)",
-		lex_peek(l).p.ln, lex_peek(l).p.cl, tok_ks[lex_peek(l).k]
+		lex_peek(l).ln + 1, lex_peek(l).cl + 1, tok_ks[lex_peek(l).k]
 	); } break;
 	}
 
@@ -136,14 +136,14 @@ static ast *parse_proc(lex *l) {
 		lex_kind(l, LK_RARROW);
 
 		/* TODO dont hardcode return type */
-		lex_kind(l, LK_IDN); a->t = t_s64;
+		lex_kind(l, LK_ID); a->t = t_s64;
 	}
 
 	a->c = parse_stmt_compound(l); return a;
 }
 
 static ast *parse_int(lex *l) {
-	val v = val_strint(lex_kind(l, LK_INT).s);
+	val v = val_strint(lex_kind(l, LK_INT).v_str);
 	ast *a = ast_init(); a->k = AK_INT; a->v = v;
 	return a;
 }