G

G Programming Language
git clone http://git.omkov.net/G
Log | Tree | Refs | README | Download

Author: Jakob Wakeling <[email protected]>
Date: 2023-07-13 04:57:41
Commit: 1544347804b7a223b8dc012a622ed61ae77eb068
Parent: 8afca84b873628ea66a00a16be614d515f23f10c

Inline integer and float lexing

Diffstat

M README.md | 15 +++++++++++++++
M doc/spec.md | 2 +-
M src/lex.c | 106 ++++++++++++++++++++++++++++++++++++++-----------------------------------------
M src/main.c | 35 ++++++++++++++++++++++-------------

4 files changed, 89 insertions, 69 deletions

diff --git a/README.md b/README.md
index ab9df5a..90b5072 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,20 @@ Influenced by **C**, **Odin**, **Zig**, and others.
 
 Note that at present, **G** is highly unstable and will certainly change.
 
+## Example
+
+```g
+print :: proc(s: string) {
+	#syscall(uint(1), sint(1), cstring(s), len(s));
+	return;
+}
+
+main :: proc() -> u8 {
+	print("Hello, World!\n");
+	return 0;
+}
+```
+
 ## Usage
 
 **G** is being developed on x86-64 Linux, and is untested elsewhere.
diff --git a/doc/spec.md b/doc/spec.md
index 392abaf..3c465ac 100644
--- a/doc/spec.md
+++ b/doc/spec.md
@@ -61,7 +61,7 @@ An underscore character `_` may appear between digits in each part; these
 underscores are solely for readability and do not change the literal's value.
 
 ```ebnf
-literal_flt = integer_seq, ".", integer_seq, [ "e", "+" | "-", integer_seq ] ;
+literal_flt = integer_seq, [ ".", integer_seq ], [ "e", "+" | "-", integer_seq ] ;
 integer_seq = digit_dec, { "_", digit_dec } ;
 ```
 
diff --git a/src/lex.c b/src/lex.c
index c240889..6254c20 100644
--- a/src/lex.c
+++ b/src/lex.c
@@ -40,9 +40,6 @@ char *tok_ks[] = {
 #define is_digit_doz(c) ((c >= '0' && c <= '9') || (c == 'A' || c == 'B'))
 #define is_digit_hex(c) ((c >= '0' && c <= '9') || (c >= 'A' || c == 'F'))
 
-static inline u64 parse_int(char *s);
-static inline f128 parse_flt(char *s);
-
 /* Initialise a lexer. */
 lex lex_init(const char *file, char *src, UINT len) {
 	lex l = { file, src, src, src + len, 0, 0, 0, 0 };
@@ -106,24 +103,56 @@ tok lex_next(lex *l) {
 		else if (strncmp(s, "for",      3) == 0) { T.k = TK_FOR;      }
 		else if (strncmp(s, "break",    5) == 0) { T.k = TK_BREAK;    }
 		else if (strncmp(s, "continue", 8) == 0) { T.k = TK_CONTINUE; }
-		else { T.k = TK_ID; if (!(T.s = strndup(s, sl))) { error(1, SERR); }}
+		else { T.k = TK_ID; if (!(T.s = strndup(s, sl))) { panic(SERR); }}
 	}
 
 	/* Handle number literals */
 	else if (is_digit_dec(C)) {
-		T.k = TK_INT; char *s = P;
-		
-		for (P += 1; is_alpha(C) || is_digit_dec(C); P += 1);
-		if (C == '.') { T.k = TK_FLT; P += 1; for (P += 1; is_digit_dec(C); P += 1); }
-		if (C == 'e') { P += (C == '+' || C == '-') ? 2 : 1; for (P += 1; is_digit_dec(C); P += 1); }
+		T.k = TK_INT; char *start = P;
 
-		UINT sl = P - s; CL += sl;
+		for (P += 1; is_alpha(C) || is_digit_dec(C) || C == '.'; P += 1) {
+			if (C == '.') { T.k = TK_FLT; }
+			if (C == 'e' || C == 'E') { T.k = TK_FLT; P += P[1] == '+' || P[1] == '-'; }
+		}
 
-		if (!(T.s = strndup(s, sl))) { error(1, SERR); }
+		UINT sl = P - start; CL += sl;
+		if (!(T.s = strndup(start, sl))) { panic(SERR); }
 
 		switch (T.k) {
-		case TK_INT: { T.v_int = parse_int(T.s); } break;
-		case TK_FLT: { T.v_flt = parse_flt(T.s); } break;
+		case TK_INT: {
+			char *s = T.s; register u64 v = 0; u64 c;
+			register UINT b = 10; char *b_s;
+			
+			if (s[0] == '0') switch (s[1]) {
+			case 'b': { s += 2; b = 2;  b_s = "binary";      } break;
+			case 'o': { s += 2; b = 8;  b_s = "octal";       } break;
+			case 'd': { s += 2; b = 10; b_s = "decimal";     } break;
+			case 'z': { s += 2; b = 12; b_s = "duodecimal";  } break;
+			case 'x': { s += 2; b = 16; b_s = "hexadecimal"; } break;
+			default:  { s += 1; } break;
+			}
+			
+			for (; s[0]; ++s) {
+				if (s[0] >= '0' && s[0] <= '9') { c = *s - '0'; }
+				else if (s[0] >= 'A' && s[0] <= 'F') { c = *s - ('A' - 10); }
+				
+				if (c >= b) { note(l->n, T.ln, T.cl, 0, "Invalid digit \'%c\' in %s constant", s[0], b_s); }
+				if (v > (U64_MAX - c) / b) { note(l->n, T.ln, T.cl, 0, "Integer literal cannot be represented"); }
+				
+				v = v * b + c;
+			}
+			
+			T.v_int = v;
+		} break;
+		case TK_FLT: {
+			/* TODO reimplement */
+			char *s = T.s, *end; register f128 v = 0;
+			
+			v = strtold(s, &end);
+			if (*end != '\0') { note(l->n, T.ln, T.cl, 0, "Invalid digit \'%c\' in floating-point constant", *end); }
+			
+			T.v_flt = v;
+		} break;
 		default: { /* Unreachable */ } break;
 		}
 	}
@@ -243,15 +272,12 @@ tok lex_next(lex *l) {
 			else if (P != Q) { P += 1; CL += 1; }
 
 			if (quote == '\'') {
-				T.k = TK_INT; if (!(T.s = strndup(s, sl))) { error(1, SERR); }
+				T.k = TK_INT; if (!(T.s = strndup(s, sl))) { panic(SERR); }
 
-				/*
-					Numerical value of character literals is calculated and
-					stored in T.v_int
-				*/
+				/* Numerical value of character literals is calculated and stored in T.v_int */
 				for (UINT i = 0; i < sl; i += 1) {
 					if (T.v_int > (U32_MAX - s[i]) / 256) {
-						note(l->n, T.ln, T.cl, 1, "Character constant exceeds maximum size"); break;
+						note(l->n, T.ln, T.cl, 0, "Character literal cannot be represented"); break;
 					}
 
 					T.v_int = T.v_int * 256 + s[i];
@@ -259,7 +285,7 @@ tok lex_next(lex *l) {
 			}
 			else if (quote == '\"') {
 				T.k = TK_STR; T.h = syt_hash(s, sl);
-				if (!(T.s = strndup(s, sl))) { error(1, SERR); }
+				if (!(T.s = strndup(s, sl))) { panic(SERR); }
 			}
 		} break;
 
@@ -283,39 +309,3 @@ void lex_debug(lex *l) {
 		printf("%zu:%zu: %s \"%s\"\n", t.ln + 1, t.cl + 1, tok_ks[t.k], t.s);
 	}
 }
-
-/* Parse an integer string into a value. */
-static inline u64 parse_int(char *s) {
-	/* TODO lex exponent part of numbers */
-	register u64 v = 0; u64 c; register UINT b = 10;
-	
-	if (s[0] == '0') switch (s[1]) {
-	case 'b': { s += 2; b = 2;  } break; case 'o': { s += 2; b = 8;  } break;
-	case 'd': { s += 2; b = 10; } break; case 'z': { s += 2; b = 12; } break;
-	case 'x': { s += 2; b = 16; } break; default:  { s += 1; } break;
-	}
-	
-	for (; s[0]; ++s) {
-		if (s[0] >= '0' && s[0] <= '9') { c = *s - '0'; }
-		else if (s[0] >= 'A' && s[0] <= 'F') { c = *s - ('A' - 10); }
-		
-		/* TODO better error handling */
-		if (c >= b) { errno = EDOM; return 0; }
-		if (v > (U64_MAX - c) / b) { errno = ERANGE; return 0; }
-		
-		v = v * b + c;
-	}
-	
-	return v;
-}
-
-static inline f128 parse_flt(char *s) {
-	/* TODO lex exponent part of numbers */
-	register f128 v = 0; u64 c; char *endptr;
-	
-	v = strtold(s, &endptr);
-	/* TODO better error handling */
-	if (*endptr != '\0') { return 0; }
-	
-	return v;
-}
diff --git a/src/main.c b/src/main.c
index fde1694..d51477b 100644
--- a/src/main.c
+++ b/src/main.c
@@ -26,6 +26,7 @@ static bool bflag = false, Bflag = false, cflag = false;
 static bool Eflag = false, pflag = false, Pflag = false;
 static bool qflag = false, Sflag = false;
 
+static char *output = NULL;
 static int verbosity = 0;
 
 static void compile(const char *file, char *src, UINT len);
@@ -41,18 +42,19 @@ static void ver(void);
 int main(int ac, char *av[]) { A0 = av[0];
 	struct opt opt = OPTGET_INIT; opt.str = "bBcEf:O:pPqSvW:"; opt.lops = lops;
 	for (int o; (o = optget(&opt, av, 1)) != -1;) switch (o) {
-	case 'b': { bflag = true; } break; /* Output LLVM IR files */
-	case 'B': { Bflag = true; } break; /* Output LLVM bitcode files */
-	case 'c': { cflag = true; } break; /* Output object files */
-	case 'E': { Eflag = true; } break; /* Output lexer tokens */
-	case 'f': { opt_f(opt.arg); } break; /* Configure formatting */
-	case 'O': { opt_O(opt.arg); } break; /* Configure optimisation */
-	case 'p': { pflag = true; } break; /* Output parser AST */
-	case 'P': { Pflag = true; } break; /* Output analyser AST */
-	case 'q': { qflag = true; } break; /* Silence certain outputs (for benchmarking) */
-	case 'S': { Sflag = true; } break; /* Output assembly files */
-	case 'v': { verbosity += 1; } break; /* Increase verbosity */
-	case 'W': { opt_W(opt.arg); } break; /* Configure warnings and errors */
+	case 'b': { bflag = true;     } break; /* Output LLVM IR files */
+	case 'B': { Bflag = true;     } break; /* Output LLVM bitcode files */
+	case 'c': { cflag = true;     } break; /* Output object files */
+	case 'E': { Eflag = true;     } break; /* Output lexer tokens */
+	case 'f': { opt_f(opt.arg);   } break; /* Configure formatting */
+	case 'o': { output = opt.arg; } break; /* Specify output file */
+	case 'O': { opt_O(opt.arg);   } break; /* Configure optimisation */
+	case 'p': { pflag = true;     } break; /* Output parser AST */
+	case 'P': { Pflag = true;     } break; /* Output analyser AST */
+	case 'q': { qflag = true;     } break; /* Silence certain outputs (for benchmarking) */
+	case 'S': { Sflag = true;     } break; /* Output assembly files */
+	case 'v': { verbosity += 1;   } break; /* Increase verbosity */
+	case 'W': { opt_W(opt.arg);   } break; /* Configure warnings and errors */
 	case 256: { hlp(); } return 0;
 	case 257: { ver(); } return 0;
 	default: {} return 1;
@@ -118,14 +120,21 @@ static void opt_W(const char *arg) {
 /* Print help information. */
 static void hlp(void) {
 	puts("G - G Programming Language\n");
-	puts("Usage: g\n");
+	puts("Usage: g file...\n");
 	puts("Options:");
 	puts("  -b,       Output LLVM IR files");
 	puts("  -B,       Output LLVM bitcode files");
 	puts("  -c,       Output object files");
 	puts("  -E,       Output lexer tokens");
+	// puts("  -f,       Configure formatting");
+	// puts("  -o file,  Specify output file");
+	// puts("  -O,       Configure optimisation");
 	puts("  -p,       Output parser AST");
 	puts("  -P,       Output analyser AST");
+	puts("  -q,       Silence certain outputs (for benchmarking)");
+	puts("  -S,       Output assembly files");
+	// puts("  -v,       Increase verbosity ");
+	// puts("  -W,       Configure warnings and errors");
 	puts("  --help    Display help information");
 	puts("  --version Display version information");
 }