lang/lex.c

375 lines
8.0 KiB
C
Raw Permalink Normal View History

2021-12-21 01:18:22 +01:00
#include "lex.h"
#include "util.h"
2021-12-25 12:32:52 +01:00
/* Number of columns a tab character adds to the column counter (see consume()). */
#define TAB_WIDTH 4
2021-12-21 01:18:22 +01:00
/*
 * Lexer cursor. Tracks where the lexer currently is in the source text and
 * a separately stored "marked" position (set by mark()) that emitted tokens
 * and error reports are attributed to. lex() starts counting at line 1,
 * column 1.
 */
typedef struct Pos {
	/* current position */
	size_t ln;
	size_t col;
	/* marked position */
	size_t m_ln;
	size_t m_col;
} Pos;
/* Advance the current position of `p` past the character `c`. */
static void consume(Pos *p, char c);
/* Append `t` to `toks`, stamped with the marked position of `p`. */
static void emit(TokList *toks, const Pos *p, Tok t);
/* Remember the current position of `p` as its marked position. */
static void mark(Pos *p);
/* Copy the marked position of `p` into err_ln / err_col. */
static void mark_err(const Pos *p);
2021-12-23 21:26:53 +01:00
/* Map the character following a backslash to the character it denotes; returns 0 for an unknown escape. */
static char get_esc_char(char c);
2021-12-21 01:18:22 +01:00
/*
 * Advance the current position of `p` past the character `c`:
 * a newline moves to column 1 of the next line, a tab adds TAB_WIDTH
 * columns, and any other character adds one column.
 */
static void consume(Pos *p, char c) {
	switch (c) {
	case '\n':
		p->ln += 1;
		p->col = 1;
		break;
	case '\t':
		p->col += TAB_WIDTH;
		break;
	default:
		p->col += 1;
		break;
	}
}
/*
 * Append token `t` to `toks`. The token's line/column fields are overwritten
 * with the marked position of `p` (not the current one), so a token is
 * attributed to the place where mark() was last called.
 */
static void emit(TokList *toks, const Pos *p, Tok t) {
	Tok stamped = t;
	stamped.col = p->m_col;
	stamped.ln = p->m_ln;
	toklist_append(toks, stamped);
}
/* Store the current line/column of `p` as its marked position. */
static void mark(Pos *p) {
	p->m_col = p->col;
	p->m_ln = p->ln;
}
/* Publish the marked position of `p` through the err_ln / err_col variables. */
static void mark_err(const Pos *p) {
	err_col = p->m_col;
	err_ln = p->m_ln;
}
2021-12-23 21:26:53 +01:00
/*
 * Translate the character that follows a backslash in a char or string
 * literal into the character it stands for.
 *
 * Returns 0 when `c` is not a known escape letter; callers treat 0 as
 * "unrecognized escape sequence".
 */
static char get_esc_char(char c) {
	static const struct {
		char letter; /* character written after the backslash */
		char value;  /* character it denotes */
	} escapes[] = {
		{ 'a',  '\a'   },
		{ 'b',  '\b'   },
		{ 'e',  '\033' },
		{ 'f',  '\f'   },
		{ 'n',  '\n'   },
		{ 'r',  '\r'   },
		{ 't',  '\t'   },
		{ 'v',  '\v'   },
		{ '\\', '\\'   },
		{ '\'', '\''   },
		{ '"',  '\"'   },
	};
	for (size_t i = 0; i < sizeof escapes / sizeof escapes[0]; i++) {
		if (escapes[i].letter == c)
			return escapes[i].value;
	}
	return 0;
}
/*
 * Split the NUL-terminated source text `s` into tokens.
 *
 * Recognized input:
 *   - words: if / else / while / true / false; any other word becomes an
 *     identifier token
 *   - numbers: decimal integers and floats, 0x / 0b prefixed integers
 *   - operators and punctuation, ':=' and '='
 *   - line comments and nestable block comments (produce no tokens)
 *   - char literals and string literals with backslash escapes
 *   - '\n' outside comments and string literals produces an OpNewLn token
 *
 * Returns the token list. On success the last token is OpEOF. On a lexing
 * error set_err() is called and the tokens collected so far are returned
 * without an OpEOF token; err_ln / err_col refer to the start of the
 * offending token unless a branch below adjusts them.
 */
TokList lex(const char *s) {
	TokList toks;
	toklist_init(&toks);
	Pos pos = { .ln = 1, .col = 1 };
	for (;;) {
		/* tokens and errors are attributed to where the token starts */
		mark(&pos);
		mark_err(&pos);

		/* keywords, boolean literals and identifiers */
		if (IS_ALPHA(s[0])) {
			size_t i = 1;
			const char *start = s;
			consume(&pos, *(s++));
			while (IS_ALNUM(s[0])) {
				consume(&pos, *(s++));
				i++;
			}
			if (streq_0_n("if", start, i))
				emit(&toks, &pos, (Tok){ .kind = TokIf });
			else if (streq_0_n("else", start, i))
				emit(&toks, &pos, (Tok){ .kind = TokElse });
			else if (streq_0_n("while", start, i))
				emit(&toks, &pos, (Tok){ .kind = TokWhile });
			else if (streq_0_n("true", start, i))
				emit(&toks, &pos, (Tok){ .kind = TokVal, .Val = { .type = TypeBool, .Bool = true }});
			else if (streq_0_n("false", start, i))
				emit(&toks, &pos, (Tok){ .kind = TokVal, .Val = { .type = TypeBool, .Bool = false }});
			else {
				emit(&toks, &pos, (Tok){
					.kind = TokIdent,
					.Ident = {
						.kind = IdentName,
						.Name = psndup(toks.p, start, i),
					},
				});
			}
			continue;
		}

		/* numeric literals */
		if (IS_NUM(s[0]) || s[0] == '.') {
			const char *tok_begin = s; /* first character of the literal, including any prefix */
			const char *start = s;     /* first character handed to the conversion routine */
			size_t base = 10;
			bool num_end = false;
			bool is_float = false;
			if (s[0] == '0') {
				consume(&pos, *(s++));
				if (s[0] == 'x' || s[0] == 'X') {
					base = 16;
					consume(&pos, *(s++));
					start = s;
				} else if (s[0] == 'b' || s[0] == 'B') {
					base = 2;
					consume(&pos, *(s++));
					start = s;
				} else if (!IS_NUM(s[0]) && s[0] != '.')
					num_end = true; /* a lone "0" */
			}
			if (!num_end) {
				for (;;) {
					if (s[0] == '.') {
						if (is_float) {
							/* report the second decimal point itself */
							mark(&pos);
							mark_err(&pos);
							set_err("Too many decimal points in number");
							return toks;
						}
						if (base != 10) {
							set_err("Only decimal floats are supported");
							return toks;
						}
						is_float = true;
					} else if (!IS_ALNUM(s[0]))
						break;
					consume(&pos, *(s++));
				}
			}
			if (is_float) {
				ssize_t endpos;
				double num = stod(start, s - start, &endpos);
				if (endpos != -1) {
					err_col += endpos;
					set_err("Invalid decimal float character: '%c'", start[endpos]);
					return toks;
				}
				emit(&toks, &pos, (Tok){
					.kind = TokVal,
					.Val = {
						.type = TypeFloat,
						.Float = num,
					},
				});
			} else {
				ssize_t endpos;
				intmax_t num = stoimax(start, s - start, base, &endpos);
				if (endpos != -1) {
					/* endpos is relative to `start`, which sits after any
					 * 0x / 0b prefix; add the prefix length so the column
					 * points at the offending character */
					err_col += (start - tok_begin) + endpos;
					set_err("Invalid base %zu numerical character: '%c'", base, start[endpos]);
					return toks;
				}
				emit(&toks, &pos, (Tok){
					.kind = TokVal,
					.Val = {
						.type = TypeInt,
						.Int = num,
					},
				});
			}
			continue;
		}

		/*
		 * Everything else. A case that ends in `break` leaves exactly one
		 * character of the token unconsumed; it is consumed after the switch.
		 * A case that ends in `continue` has already consumed its whole token.
		 */
		switch (s[0]) {
		case 0:
			goto end_of_file;
		case ' ':
		case '\t':
			break;
		case '\n':
			emit(&toks, &pos, (Tok){
				.kind = TokOp,
				.Op = OpNewLn,
			});
			break;
		case ':':
			consume(&pos, *(s++));
			if (s[0] == '=')
				emit(&toks, &pos, (Tok){ .kind = TokDeclare });
			else {
				set_err("Expected ':='");
				return toks;
			}
			break;
		case '=':
			consume(&pos, *(s++));
			if (s[0] == '=')
				emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpEq });
			else {
				emit(&toks, &pos, (Tok){ .kind = TokAssign });
				continue;
			}
			break;
		case '<':
			consume(&pos, *(s++));
			if (s[0] == '=')
				emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpLe });
			else {
				emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpLt });
				continue;
			}
			break;
		case '>':
			consume(&pos, *(s++));
			if (s[0] == '=')
				emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpGe });
			else {
				emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpGt });
				continue;
			}
			break;
		case '&':
			consume(&pos, *(s++));
			if (s[0] == '&')
				emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpAnd });
			else {
				emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpAddrOf });
				continue;
			}
			break;
		case '!':
			consume(&pos, *(s++));
			if (s[0] == '=')
				emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpNeq });
			else {
				emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpNot });
				continue;
			}
			break;
		case '|':
			consume(&pos, *(s++));
			if (s[0] == '|')
				emit(&toks, &pos, (Tok){ .kind = TokOp, .Op = OpOr });
			else {
				/* there is no single-'|' operator; report it instead of
				 * silently dropping the character */
				set_err("Expected '||'");
				return toks;
			}
			break;
		case '{':
		case '}':
		case '(':
		case ')':
		case '[':
		case ']':
		case ',':
		case '+':
		case '-':
		case '*':
			/* the operator code is the character itself */
			emit(&toks, &pos, (Tok){
				.kind = TokOp,
				.Op = s[0],
			});
			break;
		case '/':
			consume(&pos, *(s++));
			if (s[0] == '/') {
				/* line comment: skip up to, but not including, the newline
				 * so that the newline still produces its OpNewLn token */
				consume(&pos, *(s++));
				while (s[0] != '\n') {
					if (s[0] == 0)
						goto end_of_file;
					consume(&pos, *(s++));
				}
			} else if (s[0] == '*') {
				/* block comment; nested block comments are balanced */
				size_t depth = 1;
				consume(&pos, *(s++)); /* the '*' of the opening delimiter */
				while (depth) {
					if (s[0] == 0) {
						set_err("Unclosed comment");
						return toks;
					}
					/* s[0] is not the terminator here, so s[1] is readable */
					if (s[0] == '/' && s[1] == '*') {
						consume(&pos, *(s++));
						consume(&pos, *(s++));
						depth++;
					} else if (s[0] == '*' && s[1] == '/') {
						consume(&pos, *(s++));
						consume(&pos, *(s++));
						depth--;
					} else
						consume(&pos, *(s++));
				}
			} else {
				emit(&toks, &pos, (Tok){
					.kind = TokOp,
					.Op = '/',
				});
			}
			continue;
		case '\'': {
			consume(&pos, *(s++));
			char c = s[0];
			if (c == 0) {
				/* input ends right after the opening quote; do not step
				 * over the terminator */
				set_err("Unclosed char literal");
				return toks;
			}
			if (c == '\\') {
				consume(&pos, *(s++));
				c = get_esc_char(s[0]);
				if (!c) {
					set_err("Unrecognized escape sequence: '\\%c'", s[0]);
					return toks;
				}
			}
			consume(&pos, *(s++));
			if (s[0] != '\'') {
				set_err("Unclosed char literal");
				return toks;
			}
			emit(&toks, &pos, (Tok){ .kind = TokVal, .Val = { .type = TypeChar, .Char = c }});
			break;
		}
		case '"': {
			consume(&pos, *(s++));
			const char *start = s;
			Pos start_pos = pos;
			size_t size = 0;
			/* count the string size before allocating */
			while (s[0] != '"') {
				if (!s[0]) {
					set_err("Unexpected EOF in string literal");
					return toks;
				}
				if (s[0] == '\\') {
					consume(&pos, *(s++));
					/* a backslash directly before end of input must not
					 * skip over the terminator */
					if (!s[0]) {
						set_err("Unexpected EOF in string literal");
						return toks;
					}
				}
				consume(&pos, *(s++));
				size++;
			}
			/* go through the actual string */
			s = start;
			pos = start_pos;
			char *str = xmalloc(size);
			for (size_t i = 0; i < size; i++) {
				char c = s[0];
				if (c == '\\') {
					consume(&pos, *(s++));
					c = get_esc_char(s[0]);
					if (!c) {
						set_err("Unrecognized escape sequence: '\\%c'", s[0]);
						free(str);
						return toks;
					}
				}
				consume(&pos, *(s++));
				str[i] = c;
			}
			/* the buffer holds exactly `size` characters and is not
			 * NUL-terminated; the closing quote is consumed after the switch */
			emit(&toks, &pos, (Tok){ .kind = TokVal, .Val = {
				.type = TypeArr,
				.Arr = {
					.is_string = true,
					.dynamically_allocated = false,
					.type = TypeChar,
					.vals = str,
					.len = size,
					.cap = size,
				},
			},});
			break;
		}
		default:
			set_err("Unrecognized character: '%c'", s[0]);
			return toks;
		}
		consume(&pos, *(s++));
	}
end_of_file:
	emit(&toks, &pos, (Tok){
		.kind = TokOp,
		.Op = OpEOF,
	});
	return toks;
}