#include "lex.h" #include "util.h" #define TAB_WIDTH 4 typedef struct Pos { size_t ln, col; /* current position */ size_t m_ln, m_col; /* marked position */ } Pos; static void consume(Pos *p, char c); static void emit(TokList *toks, const Pos *p, Tok t); static void mark(Pos *p); static void mark_err(const Pos *p); static char get_esc_char(char c); static void consume(Pos *p, char c) { if (c == '\n') { p->ln++; p->col = 1; } else if (c == '\t') p->col += TAB_WIDTH; else p->col++; } static void emit(TokList *toks, const Pos *p, Tok t) { t.ln = p->m_ln; t.col = p->m_col; toklist_append(toks, t); } static void mark(Pos *p) { p->m_ln = p->ln; p->m_col = p->col; } static void mark_err(const Pos *p) { err_ln = p->m_ln; err_col = p->m_col; } static char get_esc_char(char c) { switch(c) { case 'a': return '\a'; case 'b': return '\b'; case 'e': return '\033'; case 'f': return '\f'; case 'n': return '\n'; case 'r': return '\r'; case 't': return '\t'; case 'v': return '\v'; case '\\': return '\\'; case '\'': return '\''; case '"': return '\"'; default: return 0; } } TokList lex(const char *s) { TokList toks; toklist_init(&toks); Pos pos = { .ln = 1, .col = 1 }; for (;;) { mark(&pos); mark_err(&pos); if (IS_ALPHA(s[0])) { size_t i = 1; const char *start = s; consume(&pos, *(s++)); while (IS_ALNUM(s[0])) { consume(&pos, *(s++)); i++; } if (streq_0_n("if", start, i)) emit(&toks, &pos, (Tok){ .kind = TokIf }); else if (streq_0_n("else", start, i)) emit(&toks, &pos, (Tok){ .kind = TokElse }); else if (streq_0_n("while", start, i)) emit(&toks, &pos, (Tok){ .kind = TokWhile }); else if (streq_0_n("true", start, i)) emit(&toks, &pos, (Tok){ .kind = TokVal, .Val = { .type = TypeBool, .Bool = true }}); else if (streq_0_n("false", start, i)) emit(&toks, &pos, (Tok){ .kind = TokVal, .Val = { .type = TypeBool, .Bool = false }}); else { emit(&toks, &pos, (Tok){ .kind = TokIdent, .Ident = { .kind = IdentName, .Name = psndup(toks.p, start, i), }, }); } continue; } if (IS_NUM(s[0]) || s[0] == '.') { const char *start = s; size_t base = 10; bool num_end = false; bool is_float = false; if (s[0] == '0') { consume(&pos, *(s++)); if (s[0] == 'x' || s[0] == 'X') { base = 16; consume(&pos, *(s++)); start = s; } else if (s[0] == 'b' || s[0] == 'B') { base = 2; consume(&pos, *(s++)); start = s; } else if (!IS_NUM(s[0]) && s[0] != '.') num_end = true; } if (!num_end) { for (;;) { if (s[0] == '.') { if (is_float) { mark(&pos); mark_err(&pos); set_err("Too many decimal points in number"); return toks; } if (base != 10) { set_err("Only decimal floats are supported"); return toks; } is_float = true; } else if (!IS_ALNUM(s[0])) break; consume(&pos, *(s++)); } } if (is_float) { ssize_t endpos; double num = stod(start, s - start, &endpos); if (endpos != -1) { err_col += endpos; set_err("Invalid decimal float character: '%c'", start[endpos]); return toks; } emit(&toks, &pos, (Tok){ .kind = TokVal, .Val = { .type = TypeFloat, .Float = num, }, }); } else { ssize_t endpos; intmax_t num = stoimax(start, s - start, base, &endpos); if (endpos != -1) { err_col += endpos; set_err("Invalid base %zu numerical character: '%c'", base, start[endpos]); return toks; } emit(&toks, &pos, (Tok){ .kind = TokVal, .Val = { .type = TypeInt, .Int = num, }, }); } continue; } switch (s[0]) { case 0: goto end_of_file; case ' ': case '\t': break; case '\n': emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpNewLn, }); break; case ':': consume(&pos, *(s++)); if (s[0] == '=') emit(&toks, &pos, (Tok){ .kind = TokDeclare }); else { set_err("Expected ':='"); return toks; } break; case '=': consume(&pos, *(s++)); if (s[0] == '=') emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpEq }); else { emit(&toks, &pos, (Tok){ .kind = TokAssign }); continue; } break; case '<': consume(&pos, *(s++)); if (s[0] == '=') emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpLe }); else { emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpLt }); continue; } break; case '>': consume(&pos, *(s++)); if (s[0] == '=') emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpGe }); else { emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpGt }); continue; } break; case '&': consume(&pos, *(s++)); if (s[0] == '&') emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpAnd }); else { emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpAddrOf }); continue; } break; case '!': consume(&pos, *(s++)); if (s[0] == '=') emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpNeq }); else { emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpNot }); continue; } break; case '|': consume(&pos, *(s++)); if (s[0] == '|') emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpOr }); else continue; break; case '{': case '}': case '(': case ')': case '[': case ']': case ',': case '+': case '-': case '*': emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = s[0], }); break; case '/': consume(&pos, *(s++)); if (s[0] == '/') { consume(&pos, *(s++)); while (s[0] != '\n') { if (s[0] == 0) goto end_of_file; consume(&pos, *(s++)); } } else if (s[0] == '*') { size_t depth = 1; while (depth) { consume(&pos, *(s++)); if (s[0] == '/') { consume(&pos, *(s++)); if (s[0] == '*') depth++; } else if (s[0] == '*') { consume(&pos, *(s++)); if (s[0] == '/') depth--; } else if (s[0] == 0) { set_err("Unclosed comment"); return toks; } } consume(&pos, *(s++)); } else { emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = '/', }); } continue; case '\'': { consume(&pos, *(s++)); char c = s[0]; if (c == '\\') { consume(&pos, *(s++)); c = get_esc_char(s[0]); if (!c) { set_err("Unrecognized escape sequence: '\\%c'", s[0]); return toks; } } consume(&pos, *(s++)); if (s[0] != '\'') { set_err("Unclosed char literal"); return toks; } emit(&toks, &pos, (Tok){ .kind = TokVal, .Val = { .type = TypeChar, .Char = c }}); break; } case '"': { consume(&pos, *(s++)); const char *start = s; Pos start_pos = pos; size_t size = 0; /* count the string size before allocating */ while (s[0] != '"') { if (!s[0]) { set_err("Unexpected EOF in string literal"); return toks; } else if (s[0] == '\\') consume(&pos, *(s++)); consume(&pos, *(s++)); size++; } /* go through the actual string */ s = start; pos = start_pos; char *str = xmalloc(size); for (size_t i = 0; i < size; i++) { char c = s[0]; if (c == '\\') { consume(&pos, *(s++)); c = get_esc_char(s[0]); if (!c) { set_err("Unrecognized escape sequence: '\\%c'", s[0]); free(str); return toks; } } consume(&pos, *(s++)); str[i] = c; } emit(&toks, &pos, (Tok){ .kind = TokVal, .Val = { .type = TypeArr, .Arr = { .is_string = true, .dynamically_allocated = false, .type = TypeChar, .vals = str, .len = size, .cap = size, }, },}); break; } default: set_err("Unrecognized character: '%c'", s[0]); return toks; } consume(&pos, *(s++)); } end_of_file: emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpEOF, }); return toks; }