igen: Start work on a better lexer.

The other one I was about to write would've been harder to read and understand, so I've decided to write a more readable version.
author: mrb0nk500 <b0nk@b0nk.xyz> 2022-02-15 18:49:29 -0400
committer: mrb0nk500 <b0nk@b0nk.xyz> 2022-02-18 10:05:01 -0400
commit: 722c7f08e409d1f6f3a26bda666c15d7082f52e3 (patch)
tree: af0c77367a611398db7214b2bda90b499196b941
parent: f478e6c1223cc8370fa51d44b9244ec25be99788 (diff)
2 files changed, 92 insertions, 111 deletions
diff --git a/igen/lexer.c b/igen/lexer.c
index 275bcdd..03f7e87 100644
--- a/igen/lexer.c
+++ b/igen/lexer.c
@@ -5,87 +5,60 @@
 #include "lexer.h"
 #include "misc.h"
 
-atom get_token_id(const char c, const int dbg) {
-	switch (c) {
-		case '(': return ATOM_LBRACK;
-		case ')': return ATOM_RBRACK;
-		case '/': return ATOM_SLASH;
-		case '+': return ATOM_PLUS;
-		case '-': return ATOM_MINUS;
-		case '*': return ATOM_ASTR;
-		case '%': return ATOM_PRCNT;
-		case '&': return ATOM_AMPR;
-		case '|': return ATOM_PIPE;
-		case '^': return ATOM_CARROT;
-		case '#': return ATOM_HASH;
-		case ':': return ATOM_COL;
-		case ';': return ATOM_SCOL;
-		case ' ': return ATOM_SPACE;
-		case '_': return ATOM_USCORE;
-		case '=': return ATOM_EQUAL;
-		case '.': return ATOM_DOT;
-		case '?': return ATOM_QMARK;
-		case '!': return ATOM_BANG;
-		case '<': return ATOM_LT;
-		case '>': return ATOM_GT;
-		case '%': return ATOM_PERCENT;
-		case ',': return ATOM_COMMA;
-		case '\\': return ATOM_BSLASH;
-		case '\"': return ATOM_QUOTE;
-		case '\'': return ATOM_SQUOTE;
-		case '\t': return ATOM_TAB;
-		case '\n': return ATOM_NLINE;
-		default:
-			if (isalpha(c)) {
-				return ATOM_ALPHA;
-			} else if (isdigit(c)) {
-				return ATOM_NUM;
-			}
-			break;
+cond_stmt *lex_cond_stmt(char **str, int dbg) {
+
+}
+
+stmt *lex_comp_stmt(char **str, int dbg) {
+	char *tmp = *str;
+	if (*tmp++ == '{') {
+		stmt *s = lex_stmt(&tmp, dbg);
+		if (*tmp++ == '}') {
+			*str = tmp;
+			return s;
+		} else {
+			throw_error("Missing \'}\' in stmt.");
+		}
 	}
-	return ATOM_NONE;
+	return NULL;
 }
 
-int get_atom_span(const char *str, const atom *atoms, int inv, int dbg) {
-	int i;
-	for (i = 0; str[i] != '\0'; ++i) {
-		const enum atom atom = get_atom_id(str[i], dbg);
-		for (int j = 0; atoms[j] != ATOM_NONE; ++j) {
-			const int is_done = (inv) ? (atom == atoms[j]) : (atom != atoms[j]);
-			if (is_done) {
-				return i;
-			}
+stmt *lex_stmt(char **str, int dbg) {
+	const alt_stmt alts[] = {
+		{STMT_DIR, offsetof(stmt, dir), lex_dir},
+		{STMT_FUNC, offsetof(stmt, func), lex_func},
+		{STMT_EXPR, offsetof(stmt, expr), lex_exprs},
+		{STMT_COND, offsetof(stmt, cond_stmt), lex_cond_stmt},
+		{STMT_COMP, offsetof(stmt, down), lex_comp_stmt},
+	};
+	for (int i = 0; i < NUM_STMTS; ++i) {
+		char *tmp = *str;
+		void *data = alts[i].lex(&tmp, dbg);
+		if (data != NULL) {
+			stmt *s = calloc(1, sizeof(stmt));
+			void **member = (char **)s+alts[i].offset;
+			*member = data;
+			return s;
 		}
 	}
-	return i;
+
+	return NULL;
 }
 
+stmt *lex_library(char **str, stmt **end, int dbg) {
+	stmt *start = lex_stmt(str, dbg);
+	end = (end != NULL) ? end : &start;
+	for (stmt *s = start; s != NULL; s = lex_stmt(str, dbg)) {
+		(*end)->next = s;
+		*end = s;
+	}
+	return start;
+};
+
 int lex(char *str, int dbg) {
-	int in_inst_stmt = 0;
-	lexeme *lex_start = NULL;
-	lexeme *lex_end = NULL;
+	stmt *start = NULL;
+	stmt *end = NULL;
 
-	for (int i = 0; str[i] != '\0'; ++i) {
-		atom atom_id = get_atom_id(str[i], dbg);
-		switch (atom_id) {
-			case ATOM_PERCENT:
-				if (get_atom_id(str[++i]) == ATOM_PERCENT) {
-					in_inst_stmt = !in_inst_stmt;
-				}
-				break;
-			case ATOM_LBRACK:
-				break;
-			case ATOM_ALPHA:
-				do {
-					const int ident_len = get_atom_span(&str[i], (const atom []) {
-						ATOM_ALPHA,
-						ATOM_USCORE,
-						ATOM_NUM,
-						ATOM_NONE
-					}, 0, dbg);
-					char *ident = calloc(ident_len+1, sizeof(char));
-				} while(0);
-				break;
-		}
-	}
+	start = library(&str, &end, dbg);
+	return (start != NULL && end != NULL);
 }
diff --git a/igen/lexer.h b/igen/lexer.h
index ba81cb1..454d2e1 100644
--- a/igen/lexer.h
+++ b/igen/lexer.h
@@ -1,45 +1,53 @@
 #ifndef LEXER_H
 #define LEXER_H
 
-typedef enum atom atom;
-typedef struct lexeme lexeme;
-
-enum atom {
-	ATOM_LBRACK,
-	ATOM_RBRACK,
-	ATOM_SLASH,
-	ATOM_PLUS,
-	ATOM_MINUS,
-	ATOM_ASTR,
-	ATOM_PRCNT,
-	ATOM_AMPR,
-	ATOM_PIPE,
-	ATOM_CARROT,
-	ATOM_HASH,
-	ATOM_COL,
-	ATOM_SCOL,
-	ATOM_SPACE,
-	ATOM_USCORE,
-	ATOM_EQUAL,
-	ATOM_DOT,
-	ATOM_QMARK,
-	ATOM_BANG,
-	ATOM_LT,
-	ATOM_GT,
-	ATOM_PERCENT,
-	ATOM_COMMA,
-	ATOM_BSLASH,
-	ATOM_QUOTE,
-	ATOM_SQUOTE,
-	ATOM_TAB,
-	ATOM_NLINE,
-	ATOM_ALPHA,
-	ATOM_NUM,
-	ATOM_NONE,
-	NUM_ATOMS
+#include <stdlib.h>
+
+typedef enum stmt_type stmt_type;
+typedef enum cond_type cond_type;
+typedef struct alt_stmt alt_stmt;
+typedef struct cond_stmt cond_stmt;
+typedef struct stmt stmt;
+
+enum stmt_type {
+	STMT_DIR,
+	STMT_FUNC,
+	STMT_EXPR,
+	STMT_COND,
+	STMT_COMP,
+	NUM_STMTS
+};
+
+enum cond_type {
+	COND_IF,
+	COND_FOR,
+	COND_WHILE,
+	COND_DO_WHILE,
+	NUM_CONDS
+};
+
+struct alt_stmt {
+	int type;
+	size_t offset;
+	void *(*lex)(char **str, int dbg);
+};
+
+struct cond_stmt {
+	cond_type type;
+	expr *expr;
+	stmt *stmt;
 };
 
-struct lexeme {
+struct stmt {
+	stmt_type type;
+	union {
+		dir *dir;
+		func *func;
+		expr *expr;
+		cond_stmt *cond_stmt;
+		stmt *down;
+	};
+	stmt *next;
 };
 
 extern int lex(char *str, int dbg);
author	mrb0nk500 <b0nk@b0nk.xyz>	2022-02-15 18:49:29 -0400
committer	mrb0nk500 <b0nk@b0nk.xyz>	2022-02-18 10:05:01 -0400
commit	722c7f08e409d1f6f3a26bda666c15d7082f52e3 (patch)
tree	af0c77367a611398db7214b2bda90b499196b941
parent	f478e6c1223cc8370fa51d44b9244ec25be99788 (diff)