G - Log

Author	Jakob Wakeling <[email protected]>
Date	2023-05-30 05:29:17
Commit	d44ca567693a048a42551b84af16eeaf09101aa0
Parent	af77441bb0a4dd397f9bcad216f5f892557b938b
Implement string lexing
Diffstat

M	README.md	\|	1	+
M	doc/g.ebnf	\|	5	+++--
M	src/init.c	\|	2	+-
M	src/lex.c	\|	99	+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
M	src/lex.h	\|	2	+-
M	src/llvm.c	\|	3	++-
M	src/parse.c	\|	20	++++++++++----------
M	src/parse.h	\|	2	+-
8 files changed, 83 insertions, 51 deletions
diff --git a/README.md b/README.md
index ca1b7f5..eb424cf 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,7 @@ command. The second command will output an executable file, *a.out* by default.
 - [ ] Implement booleans
 - [x] Implement integers
 - [x] Implement reals
+- [ ] Implement pointers
 - [ ] Implement arrays
 - [x] Implement expressions
 - [x] Implement type casting
diff --git a/doc/g.ebnf b/doc/g.ebnf
index faae4e0..75d223f 100644
--- a/doc/g.ebnf
+++ b/doc/g.ebnf
@@ -30,6 +30,7 @@ expr = iden | literal
      | "(", expr, ")"
      | type, "(", expr, ")" (* Type cast *)
      | iden, "(", [ expr, { ",", expr } ], ")" (* Procedure call *)
+     | "#", iden, [ "(", [ expr, { ",", expr } ], ")" ] (* Hash expression *)
      | "+", expr | "-", expr (* Unary POS and NEG *)
      | "!", expr | "~", expr (* Logical and bitwise NOT *)
 
@@ -83,8 +84,8 @@ literal = literal_null | literal_bool | literal_int | literal_flt | literal_chr
 (* General *)
 escape = escape_std | escape_oct | escape_hex | escape_utf ;
 escape_std = "\0" | "\a" | "\b" | "\t" | "\n" | "\v" | "\f" | "\r" | '\"' | "\'" | "\\" ;
-escape_oct = "\o", digit_oct, [ digit_oct, [ digit_oct ] ] ;
-escape_hex = "\x", digit_hex, [ digit_hex ] ;
+escape_oct = "\o", digit_oct, digit_oct, digit_oct ;
+escape_hex = "\x", digit_hex, digit_hex ;
 escape_utf = "\u", quadd_hex, [ quadd_hex ] ;
 
 digit = digit_dec ;
diff --git a/src/init.c b/src/init.c
index 2d7e6f6..b78f831 100644
--- a/src/init.c
+++ b/src/init.c
@@ -44,7 +44,7 @@ static ast kwds[] = {
 	{ AK_TYPE, 0, 0, 0, "char", &TYPE(TY_CHAR), { 0 }, NULL },
 	{ AK_TYPE, 0, 0, 0, "rune", &TYPE(TY_RUNE), { 0 }, NULL },
 
-	{ AK_NULL, 0, 0, 0, NULL, NULL, { 0 }, NULL }
+	{ AK_ZERO, 0, 0, 0, NULL, NULL, { 0 }, NULL }
 };
 
 void initialise(void) {
diff --git a/src/lex.c b/src/lex.c
index 81843f0..5914950 100644
--- a/src/lex.c
+++ b/src/lex.c
@@ -10,14 +10,13 @@
 #include "util/error.h"
 #include "util/util.h"
 
-#include <ctype.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 
 char *tok_ks[] = {
-	"NULL_TK", "TK_EOF", "TK_ID", "TK_NUM", "TK_STR", "TK_HASH",
+	"TK_ZERO", "TK_EOF", "TK_ID", "TK_NUM", "TK_CHR", "TK_STR", "TK_HASH",
 
 	"TK_NULL", "TK_TRUE", "TK_FALSE", "TK_RETURN", "TK_IF", "TK_ELSE", "TK_FOR", "TK_PROC",
 
@@ -33,18 +32,26 @@ char *tok_ks[] = {
 	"TK_AS_NOT", "TK_AS_AND", "TK_AS_OR",  "TK_AS_XOR", "TK_AS_SHL", "TK_AS_SHR",
 };
 
+#define is_space(c) ((c) == ' ' || (c) == '\f' || (c) == '\n' || (c) == '\r' || (c) == '\t' || (c) == '\v')
+#define is_alpha(c) (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z'))
+#define is_digit_bin(c) ((c) == '0' || (c) == '1') /* NOTE these macros evaluate their argument more than once */
+#define is_digit_oct(c) ((c) >= '0' && (c) <= '7')
+#define is_digit_dec(c) ((c) >= '0' && (c) <= '9')
+#define is_digit_doz(c) (((c) >= '0' && (c) <= '9') || ((c) == 'A' || (c) == 'B')) /* upper case only */
+#define is_digit_hex(c) (((c) >= '0' && (c) <= '9') || ((c) >= 'A' && (c) <= 'F')) /* upper case only */
+
 /* Push a token to a token array. */
 void tok_a_push(tok_a *a, tok t) {
 	tok *ta = realloc(a->a, (a->al += 1) * sizeof (tok));
 	if (!ta) { error(1, SERR); } else { a->a = ta; a->a[a->al - 1] = t; }
 }
 
-/* Pop a token from a token array. */
+/* Pop a token from a token array. A zero token is returned when empty. */
 tok tok_a_pop(tok_a *a) {
 	return (a->al ? a->a[a->al -= 1] : (tok){ 0 });
 }
 
-/* Pop a token from a token array. */
+/* Peek at the top token of a token array without removing it. A zero token is returned when empty. */
 tok tok_a_peek(tok_a *a) {
 	return (a->al ? a->a[a->al - 1] : (tok){ 0 });
 }
@@ -72,7 +79,7 @@ tok lex_next(lex *l) {
 	tok t = T; T = (tok){ 0 };
 
 	/* Skip null characters and whitespace */
-	skip:; for (; P != Q && (!C || isspace(C)); P += 1) switch (C) {
+	skip:; for (; P != Q && (!C || is_space(C)); P += 1) switch (C) {
 		case '\0': { note(l->n, l->ln, l->cl, 1, "Null character ignored"); } break;
 		case '\n': { LN += 1; CL = 0; } break;
 		default:   { CL += 1; } break;
@@ -93,18 +100,13 @@ tok lex_next(lex *l) {
 		} goto skip;
 	}
 
-	/*
-		FIXME beyond this point EOF isn't checked properly so if a file does not
-		have a trailing newline it **may** cause a segfault (?)
-	*/
-	
 	T.ln = LN; T.cl = CL;
 
 	/* Handle identifiers and keywords */
-	if (isalpha(C) || C == '_') {
+	if (is_alpha(C) || C == '_') {
 		char *s = P; UINT sl;
 
-		for (P += 1; isalpha(C) || isdigit(C) || C == '_'; P += 1);
+		for (P += 1; is_alpha(C) || is_digit_dec(C) || C == '_'; P += 1);
 		sl = P - s; CL += sl; T.h = syt_hash(s, sl);
 
 		if      (strncmp(s, "null",   sl) == 0) { T.k = TK_NULL;   }
@@ -119,11 +121,11 @@ tok lex_next(lex *l) {
 	}
 
 	/* Handle number literals */
-	else if (isdigit(C)) {
+	else if (is_digit_dec(C)) {
 		char *s = P; UINT sl;
 
-		for (P += 1; isalnum(C); P += 1);
-		if (C == '.') { P += 1; for (P += 1; isdigit(C); P += 1); }
+		for (P += 1; is_alpha(C) || is_digit_dec(C); P += 1);
+		if (C == '.') { P += 1; for (P += 1; is_digit_dec(C); P += 1); }
 
 		sl = P - s; CL += sl;
 
@@ -134,7 +136,7 @@ tok lex_next(lex *l) {
 	else if (C == '#') {
 		char *s = P; UINT sl;
 
-		for (P += 1; isalpha(C) || isdigit(C) || C == '_'; P += 1);
+		for (P += 1; is_alpha(C) || is_digit_dec(C) || C == '_'; P += 1);
 		sl = P - s; CL += sl;
 
 		if (sl <= 1) { note(l->n, T.ln, T.cl, 0, "A hash must be followed by an identifier"); goto redo; }
@@ -211,26 +213,53 @@ tok lex_next(lex *l) {
 			case '=': { T.k = TK_AS_XOR; P += 2; CL += 2; } break;
 		} break;
 
-		/* TODO implement character escapes */
-		/* TODO implement multi line strings */
-		// case '\'': {
-		// 	char *s = P; UINT sl;
-			
-		// 	for (P += 1; C != '\''; P += 1);
-		// 	sl = P - s; CL += sl;
-			
-		// 	T.k = TK_NUM;
-		// } break;
-		case '\"': {
-			char *s = P += 1; UINT sl;
+		/* Handle character and string literals, rewriting escapes in place */
+		case '\'': case '\"': {
+			char quote = C, *s = P += 1; CL += 1; register char *head = s;
 
-			for (; C != '\"' && C != '\n'; P += 1);
-			sl = P - s; CL += sl; T.h = syt_hash(s, sl);
+			for (; P != Q && C != quote && C != '\n';) {
+				/* Non escape characters are copied to head and the cursor advanced */
+				if (C != '\\') { *head = C; head += 1; P += 1; }
+				
+				/* Escape characters are processed and re-written to head */
+				else switch (D) {
+				case '0':  { *head = 0x00; P += 2; head += 1; } break;
+				case 'a':  { *head = '\a'; P += 2; head += 1; } break;
+				case 'b':  { *head = '\b'; P += 2; head += 1; } break;
+				case 'f':  { *head = '\f'; P += 2; head += 1; } break;
+				case 'n':  { *head = '\n'; P += 2; head += 1; } break;
+				case 'r':  { *head = '\r'; P += 2; head += 1; } break;
+				case 't':  { *head = '\t'; P += 2; head += 1; } break;
+				case 'v':  { *head = '\v'; P += 2; head += 1; } break;
+				case '\'': { *head = '\''; P += 2; head += 1; } break;
+				case '\"': { *head = '\"'; P += 2; head += 1; } break;
+				case '\\': { *head = '\\'; P += 2; head += 1; } break;
+				case '\n': { P += 2; } break;
+				// case 'o': {} break;
+				// case 'x': {} break;
+				// case 'u': {} break;
+				default: {
+					note(l->n, l->ln, l->cl, 1, "Unknown escape sequence: \"\\%c\"", D);
+					*head = D; P += 2; head += 1;
+				} break;
+				}
+			}
 
-			if (C != '\"') { note(l->n, T.ln, T.cl, 0, "Missing closing quote"); }
-			else { P += 1; }
+			UINT sl = head - s; CL += sl;
+			if (P == Q || C != quote) { note(l->n, T.ln, T.cl, 0, "Missing closing quote"); }
+			else { P += 1; CL += 1; }
 
-			T.k = TK_STR; if (!(T.s = strndup(s, sl))) { error(1, SERR); }
+			if (quote == '\'') {
+				T.k = TK_NUM;
+				if (!(T.s = strndup(s, sl))) { error(1, SERR); }
+				
+				note(l->n, T.ln, T.cl, 1, "Characters are not yet fully implemented");
+				/* Lex as TK_NUM, but how to distinguish at parsing stage? */
+			}
+			else if (quote == '\"') {
+				T.k = TK_STR; T.h = syt_hash(s, sl);
+				if (!(T.s = strndup(s, sl))) { error(1, SERR); }
+			}
+		} break;
 
 		/* Handle unknown characters */
diff --git a/src/lex.h b/src/lex.h
index 2ce4303..ff94340 100644
--- a/src/lex.h
+++ b/src/lex.h
@@ -10,7 +10,7 @@
 
 /* Remember to update tok_ks in lex.c */
 typedef enum {
-	NULL_TK, TK_EOF, TK_ID, TK_NUM, TK_STR, TK_HASH,
+	TK_ZERO, TK_EOF, TK_ID, TK_NUM, TK_CHR, TK_STR, TK_HASH,
 
 	TK_NULL, TK_TRUE, TK_FALSE, TK_RETURN, TK_IF, TK_ELSE, TK_FOR, TK_PROC,
 
diff --git a/src/llvm.c b/src/llvm.c
index db97832..a8fedb7 100644
--- a/src/llvm.c
+++ b/src/llvm.c
@@ -222,7 +222,7 @@ static LLVMValueRef llvm_stmt_expr(ast *a, syt *st) {
 
 /* Generate IR for a return statement. */
 static LLVMValueRef llvm_stmt_return(ast *a, syt *st) {
-	return LLVMBuildRet(llvm_builder, llvm_expr(C[0], st));
+	return a->c.al > 0 ? LLVMBuildRet(llvm_builder, llvm_expr(C[0], st)) : LLVMBuildRetVoid(llvm_builder);
 }
 
 /* Generate IR for an if statement. */
@@ -385,6 +385,7 @@ static inline void llvm_free(void) {
 /* Return the appropriate LLVMTypeRef for a G type. */
 static LLVMTypeRef llvm_type(type *t) {
 	switch (t->k) {
+	case TY_NULL: { return LLVMVoidType();  } break;
 	case TY_B8:   { return LLVMIntType(8);  } break;
 	case TY_B16:  { return LLVMIntType(16); } break;
 	case TY_B32:  { return LLVMIntType(32); } break;
diff --git a/src/parse.c b/src/parse.c
index df43318..8325461 100644
--- a/src/parse.c
+++ b/src/parse.c
@@ -20,7 +20,7 @@
 typedef struct { tok_k tk; ast_k ak; s32 o; bool as; } op;
 
 char *ast_ks[] = {
-	"AK_NULL", "AK_PROG", "AK_PROC", "AK_TYPE", "AK_CAST",
+	"AK_ZERO", "AK_PROG", "AK_PROC", "AK_TYPE", "AK_CAST",
 
 	"AK_STMT", "AK_COMP", "AK_DECL", "AK_RETURN", "AK_IF", "AK_FOR",
 
@@ -226,7 +226,7 @@ static ast *parse_stmt_return(lex *l, syt *st) {
 	lex_kind(l, TK_RETURN);
 
 	ast *a = ast_init(); a->k = AK_RETURN;
-	ast_push(a, parse_expr(l, st, false));
+	if (T.k != TK_SCOLON) { ast_push(a, parse_expr(l, st, false)); }
 
 	lex_kind(l, TK_SCOLON); return a;
 }
@@ -307,7 +307,7 @@ static ast *parse_expr(lex *l, syt *st, bool arg) {
 	case TK_LPAREN: { tok_a_push(&ts, lex_next(l));      } break;
 	case TK_RPAREN: {
 		for (tok t = tok_a_pop(&ts);; t = tok_a_pop(&ts)) {
-			if (t.k == NULL_TK) {
+			if (t.k == TK_ZERO) /* Stack is empty */ {
 				if (arg) { goto eox; }
 				note(l->n, T.ln, T.cl, -1, "expected left parenthesis");
 			}
@@ -343,7 +343,7 @@ static ast *parse_expr(lex *l, syt *st, bool arg) {
 		ast_a_push(&as, a);
 	} break;
 	default: /* Handle operators */ {
-		op o1, o2; if ((o1 = op_lookup(T.k, false)).tk == NULL_TK) { goto eox; }
+		op o1, o2; if ((o1 = op_lookup(T.k, false)).tk == TK_ZERO) { goto eox; }
 
 		/*
 			If there is an operator at the top of the operator stack that is not
@@ -351,7 +351,7 @@ static ast *parse_expr(lex *l, syt *st, bool arg) {
 			same precedence as o1 and o1 is left-associative, then pop it from
 			the stack onto the output.
 		*/
-		for (o2 = op_lookup(tok_a_peek(&ts).k, false); o2.tk != NULL_TK; o2 = op_lookup(tok_a_peek(&ts).k, false)) {
+		for (o2 = op_lookup(tok_a_peek(&ts).k, false); o2.tk != TK_ZERO; o2 = op_lookup(tok_a_peek(&ts).k, false)) {
 			if (o2.tk == TK_LPAREN || (o1.o < o2.o && (o1.o != o2.o || o1.as == true))) { break; }
 
 			shunt(&as, tok_a_pop(&ts), o2);
@@ -362,7 +362,7 @@ static ast *parse_expr(lex *l, syt *st, bool arg) {
 	} eox:;
 
 	/* Pop any remaining operators from the operator stack */
-	for (tok t = tok_a_pop(&ts); t.k != NULL_TK; t = tok_a_pop(&ts)) {
+	for (tok t = tok_a_pop(&ts); t.k != TK_ZERO; t = tok_a_pop(&ts)) {
 		if (t.k == TK_LPAREN) { error(1, "LPAREN: TODO"); }
 		if (t.k == TK_RPAREN) { error(1, "RPAREN: TODO"); }
 
@@ -381,7 +381,6 @@ static ast *parse_expr_proc(lex *l, syt *st) {
 	/* TODO */ lex_kind(l, TK_RPAREN);
 
 	/* Parse optional procedure return type(s) */
-	/* TODO parse more than one return type */
 	if (T.k == TK_RARROW) {
 		lex_next(l); tok t = lex_kind(l, TK_ID);
 		ast *s = syt_search_h(st, t.h, t.s);
@@ -397,6 +396,7 @@ static ast *parse_expr_proc(lex *l, syt *st) {
 
 		a->t = s->t;
 	}
+	else { a->t = &TYPE(TY_NULL); }
 
 	ast_push(a, parse_stmt_compound(l, st)); return a;
 }
diff --git a/src/parse.h b/src/parse.h
index ad88305..7781e50 100644
--- a/src/parse.h
+++ b/src/parse.h
@@ -16,7 +16,7 @@
 
 /* Remember to update ast_ks in parse.c */
 typedef enum {
-	AK_NULL, AK_PROG, AK_PROC, AK_TYPE, AK_CAST,
+	AK_ZERO, AK_PROG, AK_PROC, AK_TYPE, AK_CAST,
 
 	AK_STMT, AK_COMP, AK_DECL, AK_RETURN, AK_IF, AK_FOR,
	G
	G Programming Language
	git clone http://git.omkov.net/G
	Log \| Tree \| Refs \| README \| Download