lang/lex.c
r4 63af3e907b add assignment operator and unify memory pools
The unification of memory pools also fixed some memory leaks and
hopefully reduced the mallocs of identifier strings significantly by
giving them the same pool as the token stream.
2021-12-21 11:40:49 +01:00

233 lines
4.5 KiB
C

#include "lex.h"
#include "util.h"
/*
 * Cursor state of the lexer.
 *
 * Holds the position the lexer is currently at, plus a second "marked"
 * position that is stored separately from it.
 */
typedef struct Pos {
	/* current position */
	size_t ln;
	size_t col;
	/* marked position */
	size_t m_ln;
	size_t m_col;
} Pos;
/* Forward declarations of the file-local position/token helpers. */
/* Advance the current position over the character c. */
static void consume(Pos *p, char c);
/* Append t to toks, stamped with the marked position of p. */
static void emit(TokList *toks, const Pos *p, Tok t);
/* Copy the current position of p into its marked position. */
static void mark(Pos *p);
/* Copy the marked position of p into err_ln / err_col. */
static void mark_err(const Pos *p);
/*
 * Advance the current position in `p` over the character `c`.
 *
 * A newline moves to the next line and resets the column to 1; any other
 * character moves one column to the right. The marked position is left
 * untouched.
 */
static void consume(Pos *p, char c) {
	if (c != '\n') {
		p->col += 1;
		return;
	}
	p->ln += 1;
	p->col = 1;
}
/*
 * Append the token `t` to `toks`.
 *
 * The token's `ln`/`col` fields are overwritten with the marked position
 * of `p` before it is appended, so whatever the caller put there is ignored.
 */
static void emit(TokList *toks, const Pos *p, Tok t) {
	Tok stamped = t;
	stamped.ln = p->m_ln;
	stamped.col = p->m_col;
	toklist_append(toks, stamped);
}
/*
 * Remember the current position of `p` as its marked position.
 */
static void mark(Pos *p) {
	const size_t cur_ln = p->ln;
	const size_t cur_col = p->col;

	p->m_ln = cur_ln;
	p->m_col = cur_col;
}
/*
 * Publish the marked position of `p` as the error location by storing it
 * in `err_ln` and `err_col`.
 */
static void mark_err(const Pos *p) {
	err_col = p->m_col;
	err_ln = p->m_ln;
}
/*
 * Split the NUL-terminated source text `s` into tokens.
 *
 * Recognized input:
 *   - identifiers and the keywords "if" and "while";
 *   - integer literals in base 10, 16 ("0x"/"0X") and 2 ("0b"/"0B"), and
 *     decimal floating point literals;
 *   - ":=" (declare), "=" (assign) and the single-character operators
 *     { } ( ) , + - * /;
 *   - newlines, which are emitted as OpNewLn operator tokens;
 *   - "//" line comments and nestable block comments, which are skipped;
 *   - spaces and tabs, which are skipped.
 *
 * Returns the token list. On success the last token is an OpEOF operator.
 * On a lexical error, set_err() is called (with err_ln/err_col pointing at
 * the offending location) and the tokens collected so far are returned
 * without a trailing OpEOF token.
 */
TokList lex(const char *s) {
	TokList toks;
	toklist_init(&toks);
	Pos pos = { .ln = 1, .col = 1 };
	for (;;) {
		/* Tokens and errors are attributed to where the token starts. */
		mark(&pos);
		mark_err(&pos);

		/* Keywords and identifiers. */
		if (IS_ALPHA(s[0])) {
			size_t i = 1;
			const char *start = s;
			consume(&pos, *(s++));
			while (IS_ALNUM(s[0])) {
				consume(&pos, *(s++));
				i++;
			}
			if (streq_0_n("if", start, i))
				emit(&toks, &pos, (Tok){ .kind = TokIf });
			else if (streq_0_n("while", start, i))
				emit(&toks, &pos, (Tok){ .kind = TokWhile });
			else {
				/* The name is duplicated via psndup() using toks.p
				 * (presumably the token list's memory pool). */
				emit(&toks, &pos, (Tok){
					.kind = TokIdent,
					.Ident = {
						.kind = IdentName,
						.Name = psndup(toks.p, start, i),
					},
				});
			}
			continue;
		}

		/* Numeric literals. */
		if (IS_NUM(s[0]) || s[0] == '.') {
			const char *literal = s; /* first character of the literal */
			const char *start = s;   /* first digit, i.e. after any prefix */
			size_t base = 10;
			bool num_end = false;
			bool is_float = false;
			if (s[0] == '0') {
				consume(&pos, *(s++));
				if (s[0] == 'x' || s[0] == 'X') {
					base = 16;
					consume(&pos, *(s++));
					start = s;
				} else if (s[0] == 'b' || s[0] == 'B') {
					base = 2;
					consume(&pos, *(s++));
					start = s;
				} else if (!IS_NUM(s[0]) && s[0] != '.')
					num_end = true; /* a lone "0" */
			}
			if (!num_end) {
				/* Take every alphanumeric character and '.', valid or not;
				 * the conversion below reports the first invalid one. */
				for (;;) {
					if (s[0] == '.') {
						if (is_float) {
							/* Point the error at the second '.'. */
							mark(&pos);
							mark_err(&pos);
							set_err("Too many decimal points in number");
							return toks;
						}
						if (base != 10) {
							set_err("Only decimal floats are supported");
							return toks;
						}
						is_float = true;
					} else if (!IS_ALNUM(s[0]))
						break;
					consume(&pos, *(s++));
				}
			}
			if (is_float) {
				ssize_t endpos;
				double num = stod(start, s - start, &endpos);
				if (endpos != -1) {
					/* Floats never have a prefix, so endpos is already
					 * relative to the start of the literal. */
					err_col += endpos;
					set_err("Invalid decimal float character: '%c'", start[endpos]);
					return toks;
				}
				emit(&toks, &pos, (Tok){
					.kind = TokVal,
					.Val = {
						.type = {
							.kind = TypeFloat,
						},
						.Float = num,
					},
				});
			} else {
				ssize_t endpos;
				intmax_t num = stoimax(start, s - start, base, &endpos);
				if (endpos != -1) {
					/* endpos indexes from `start`, which sits after a
					 * "0x"/"0b" prefix; add the prefix length so the
					 * reported column is that of the offending character. */
					err_col += (start - literal) + endpos;
					set_err("Invalid base %zu numerical character: '%c'", base, start[endpos]);
					return toks;
				}
				emit(&toks, &pos, (Tok){
					.kind = TokVal,
					.Val = {
						.type = {
							.kind = TypeInt,
						},
						.Int = num,
					},
				});
			}
			continue;
		}

		/* Operators, whitespace and comments. Cases that `break` fall
		 * through to the single consume() after the switch. */
		switch (s[0]) {
		case 0:
			goto end_of_file;
		case ' ':
		case '\t':
			break;
		case '\n':
			emit(&toks, &pos, (Tok){
				.kind = TokOp,
				.Op = OpNewLn,
			});
			break;
		case ':':
			consume(&pos, *(s++));
			if (s[0] == '=') {
				emit(&toks, &pos, (Tok){ .kind = TokDeclare });
			} else {
				set_err("Expected ':='");
				return toks;
			}
			break;
		case '=':
			emit(&toks, &pos, (Tok){ .kind = TokAssign });
			break;
		case '{':
		case '}':
		case '(':
		case ')':
		case ',':
		case '+':
		case '-':
		case '*':
			/* The operator value is the character itself. */
			emit(&toks, &pos, (Tok){
				.kind = TokOp,
				.Op = s[0],
			});
			break;
		case '/':
			consume(&pos, *(s++));
			if (s[0] == '/') {
				/* Line comment: skip up to, but not including, the
				 * newline so that it is still emitted as OpNewLn. */
				consume(&pos, *(s++));
				while (s[0] != '\n') {
					if (s[0] == 0)
						goto end_of_file;
					consume(&pos, *(s++));
				}
			} else if (s[0] == '*') {
				/* Block comment; these nest. Consume the '*' of the
				 * opening delimiter first so that it cannot be taken
				 * for the start of a closing delimiter. */
				consume(&pos, *(s++));
				size_t depth = 1;
				while (depth) {
					if (s[0] == 0) {
						set_err("Unclosed comment");
						return toks;
					}
					/* s[0] is not NUL here, so reading s[1] stays
					 * inside the string (at worst it is the NUL). */
					if (s[0] == '/' && s[1] == '*') {
						consume(&pos, *(s++));
						consume(&pos, *(s++));
						depth++;
					} else if (s[0] == '*' && s[1] == '/') {
						consume(&pos, *(s++));
						consume(&pos, *(s++));
						depth--;
					} else
						consume(&pos, *(s++));
				}
			} else {
				emit(&toks, &pos, (Tok){
					.kind = TokOp,
					.Op = '/',
				});
			}
			/* Everything belonging to this case has been consumed. */
			continue;
		default:
			set_err("Unrecognized character: '%c'", s[0]);
			return toks;
		}
		consume(&pos, *(s++));
	}
end_of_file:
	emit(&toks, &pos, (Tok){
		.kind = TokOp,
		.Op = OpEOF,
	});
	return toks;
}