9cc/token.c

507 lines
10 KiB
C

#include "9cc.h"
typedef struct Env {
char *path;
char *buf;
Vector *tokens;
struct Env *prev;
} Env;
static Env *env;
static Map *keywords;
static FILE *open_file(char *path) {
if (!strcmp(path, "-"))
return stdin;
FILE *fp = fopen(path, "r");
if (!fp) {
perror(path);
exit(1);
}
return fp;
}
static char *read_file(FILE *fp) {
StringBuilder *sb = new_sb();
char buf[4096];
for (;;) {
int nread = fread(buf, 1, sizeof(buf), fp);
if (nread == 0)
break;
sb_append_n(sb, buf, nread);
}
// We want to make sure that a souce file ends with a newline.
// Add not only one but two to protect against a backslash at EOF.
sb_append(sb, "\n\n");
return sb_get(sb);
}
static Env *new_env(Env *prev, char *path, char *buf) {
Env *env = calloc(1, sizeof(Env));
env->path = strcmp(path, "-") ? path : "(stdin)";
env->buf = buf;
env->tokens = new_vec();
env->prev = prev;
return env;
}
// Returns true if s1 starts with s2.
static bool startswith(char *s1, char *s2) {
return !strncmp(s1, s2, strlen(s2));
}
// Error reporting
// Finds a line pointed by a given pointer from the input file
// to print it out.
static void print_line(char *buf, char *path, char *pos) {
char *start = buf;
int line = 0;
int col = 0;
for (char *p = buf; p; p++) {
if (*p == '\n') {
start = p + 1;
line++;
col = 0;
continue;
}
if (p != pos) {
col++;
continue;
}
fprintf(stderr, "error at %s:%d:%d\n\n", path, line + 1, col + 1);
// Print out the line containing the error location.
int linelen = strchr(p, '\n') - start;
fprintf(stderr, "%.*s\n", linelen, start);
// Show tabs for tabs and spaces for other characters
// so that the column matches.
for (int i = 0; i < col; i++)
fprintf(stderr, (start[i] == '\t') ? "\t" : " ");
fprintf(stderr, "^\n\n");
return;
}
}
void warn_token(Token *t, char *msg) {
if (t->start)
print_line(t->buf, t->path, t->start);
fprintf(stderr, msg);
fprintf(stderr, "\n");
}
noreturn void bad_token(Token *t, char *msg) {
warn_token(t, msg);
exit(1);
}
noreturn static void bad_position(char *p, char *msg) {
print_line(env->buf, env->path, p);
error(msg);
}
int get_line_number(Token *t) {
int n = 0;
for (char *p = t->buf; p < t->end; p++)
if (*p == '\n')
n++;
return n;
}
// Returns true if Token t followed a space or a comment
// in an original source file.
static bool need_space(Token *t) {
char *s = t->start;
if (t->buf <= s - 1 && isspace(s[-1]))
return true;
return t->buf <= s - 2 && startswith(s - 2, "*/");
}
// For C preprocessor.
char *stringize(Vector *tokens) {
StringBuilder *sb = new_sb();
for (int i = 0; i < tokens->len; i++) {
Token *t = tokens->data[i];
if (t->ty == '\n')
continue;
if (i > 0 && need_space(t))
sb_add(sb, ' ');
assert(t->start && t->end);
sb_append_n(sb, t->start, t->end - t->start);
}
return sb_get(sb);
}
// Atomic unit in the grammar is called "token".
// For example, `123`, `"abc"` and `while` are tokens.
// The tokenizer splits an input string into tokens.
// Spaces and comments are removed by the tokenizer.
static Token *add(int ty, char *start) {
Token *t = calloc(1, sizeof(Token));
t->ty = ty;
t->start = start;
t->path = env->path;
t->buf = env->buf;
vec_push(env->tokens, t);
return t;
}
static struct {
char *name;
int ty;
} symbols[] = {
{"<<=", TK_SHL_EQ}, {">>=", TK_SHR_EQ}, {"!=", TK_NE},
{"&&", TK_LOGAND}, {"++", TK_INC}, {"--", TK_DEC},
{"->", TK_ARROW}, {"<<", TK_SHL}, {"<=", TK_LE},
{"==", TK_EQ}, {">=", TK_GE}, {">>", TK_SHR},
{"||", TK_LOGOR}, {"*=", TK_MUL_EQ}, {"/=", TK_DIV_EQ},
{"%=", TK_MOD_EQ}, {"+=", TK_ADD_EQ}, {"-=", TK_SUB_EQ},
{"&=", TK_AND_EQ}, {"^=", TK_XOR_EQ}, {"|=", TK_OR_EQ},
{NULL, 0},
};
static Map *keyword_map() {
Map *map = new_map();
map_puti(map, "_Alignof", TK_ALIGNOF);
map_puti(map, "_Bool", TK_BOOL);
map_puti(map, "break", TK_BREAK);
map_puti(map, "case", TK_CASE);
map_puti(map, "char", TK_CHAR);
map_puti(map, "continue", TK_CONTINUE);
map_puti(map, "do", TK_DO);
map_puti(map, "else", TK_ELSE);
map_puti(map, "extern", TK_EXTERN);
map_puti(map, "for", TK_FOR);
map_puti(map, "if", TK_IF);
map_puti(map, "int", TK_INT);
map_puti(map, "return", TK_RETURN);
map_puti(map, "sizeof", TK_SIZEOF);
map_puti(map, "struct", TK_STRUCT);
map_puti(map, "switch", TK_SWITCH);
map_puti(map, "typedef", TK_TYPEDEF);
map_puti(map, "typeof", TK_TYPEOF);
map_puti(map, "void", TK_VOID);
map_puti(map, "while", TK_WHILE);
return map;
}
static char *block_comment(char *pos) {
for (char *p = pos + 2; *p; p++)
if (startswith(p, "*/"))
return p + 2;
bad_position(pos, "unclosed comment");
}
static int isoctal(char c) {
return '0' <= c && c <= '7';
}
static int hex(char c) {
if ('0' <= c && c <= '9')
return c - '0';
if ('a' <= c && c <= 'f')
return c - 'a' + 10;
assert('A' <= c && c <= 'F');
return c - 'A' + 10;
}
// Read a single character in a char or string literal.
static char *c_char(int *res, char *p) {
// Nonescaped
if (*p != '\\') {
*res = *p;
return p + 1;
}
p++;
static char escaped[256] = {
['a'] = '\a', ['b'] = '\b', ['f'] = '\f',
['n'] = '\n', ['r'] = '\r', ['t'] = '\t',
['v'] = '\v', ['e'] = '\033', ['E'] = '\033',
};
// Simple (e.g. `\n` or `\a`)
int esc = escaped[(uint8_t)*p];
if (esc) {
*res = esc;
return p + 1;
}
// Hexadecimal
if (*p == 'x') {
*res = 0;
p++;
while (isxdigit(*p))
*res = *res * 16 + hex(*p++);
return p;
}
// Octal
if (isoctal(*p)) {
int i = *p++ - '0';
if (isoctal(*p))
i = i * 8 + *p++ - '0';
if (isoctal(*p))
i = i * 8 + *p++ - '0';
*res = i;
return p;
}
*res = *p;
return p + 1;
}
static char *char_literal(char *p) {
Token *t = add(TK_NUM, p++);
p = c_char(&t->val, p);
if (*p != '\'')
bad_token(t, "unclosed character literal");
t->end = p + 1;
return p + 1;
}
static char *string_literal(char *p) {
Token *t = add(TK_STR, p++);
StringBuilder *sb = new_sb();
while (*p != '"') {
if (!*p)
bad_token(t, "unclosed string literal");
int c;
p = c_char(&c, p);
sb_add(sb, c);
}
t->str = sb_get(sb);
t->len = sb->len;
t->end = p + 1;
return p + 1;
}
static char *ident(char *p) {
int len = 1;
while (isalpha(p[len]) || isdigit(p[len]) || p[len] == '_')
len++;
char *name = strndup(p, len);
int ty = map_geti(keywords, name, TK_IDENT);
Token *t = add(ty, p);
t->name = name;
t->end = p + len;
return p + len;
}
static char *hexadecimal(char *p) {
Token *t = add(TK_NUM, p);
p += 2;
if (!isxdigit(*p))
bad_token(t, "bad hexadecimal number");
while (isxdigit(*p))
t->val = t->val * 16 + hex(*p++);
t->end = p;
return p;
}
static char *octal(char *p) {
Token *t = add(TK_NUM, p++);
while (isoctal(*p))
t->val = t->val * 8 + *p++ - '0';
t->end = p;
return p;
}
static char *decimal(char *p) {
Token *t = add(TK_NUM, p);
while (isdigit(*p))
t->val = t->val * 10 + *p++ - '0';
t->end = p;
return p;
}
static char *number(char *p) {
if (startswith(p, "0x") || startswith(p, "0X"))
return hexadecimal(p);
if (*p == '0')
return octal(p);
return decimal(p);
}
static void scan() {
char *p = env->buf;
loop:
while (*p) {
// New line (preprocessor-only token)
if (*p == '\n') {
Token *t = add(*p, p);
p++;
t->end = p;
continue;
}
// Whitespace
if (isspace(*p)) {
p++;
continue;
}
// Line comment
if (startswith(p, "//")) {
while (*p && *p != '\n')
p++;
continue;
}
// Block comment
if (startswith(p, "/*")) {
p = block_comment(p);
continue;
}
// Character literal
if (*p == '\'') {
p = char_literal(p);
continue;
}
// String literal
if (*p == '"') {
p = string_literal(p);
continue;
}
// Multi-letter symbol
for (int i = 0; symbols[i].name; i++) {
char *name = symbols[i].name;
if (!startswith(p, name))
continue;
Token *t = add(symbols[i].ty, p);
p += strlen(name);
t->end = p;
goto loop;
}
// Single-letter symbol
if (strchr("+-*/;=(),{}<>[]&.!?:|^%~#", *p)) {
Token *t = add(*p, p);
p++;
t->end = p;
continue;
}
// Keyword or identifier
if (isalpha(*p) || *p == '_') {
p = ident(p);
continue;
}
// Number
if (isdigit(*p)) {
p = number(p);
continue;
}
bad_position(p, "cannot tokenize");
}
}
static void replace_crlf(char *p) {
for (char *q = p; *q;) {
if (startswith(q, "\r\n"))
q++;
*p++ = *q++;
}
*p = '\0';
}
// Concatenates continuation lines. We keep the total number of
// newline characters the same to keep the line counter sane.
static void remove_backslash_newline(char *p) {
int cnt = 0;
for (char *q = p; *q;) {
if (startswith(q, "\\\n")) {
cnt++;
q += 2;
continue;
}
if (*q == '\n') {
for (int i = 0; i < cnt + 1; i++)
*p++ = '\n';
q++;
cnt = 0;
continue;
}
*p++ = *q++;
}
*p = '\0';
}
static Vector *strip_newline_tokens(Vector *tokens) {
Vector *v = new_vec();
for (int i = 0; i < tokens->len; i++) {
Token *t = tokens->data[i];
if (t->ty != '\n')
vec_push(v, t);
}
return v;
}
static void append(Token *x, Token *y) {
StringBuilder *sb = new_sb();
sb_append_n(sb, x->str, x->len - 1);
sb_append_n(sb, y->str, y->len - 1);
x->str = sb_get(sb);
x->len = sb->len;
}
static Vector *join_string_literals(Vector *tokens) {
Vector *v = new_vec();
Token *last = NULL;
for (int i = 0; i < tokens->len; i++) {
Token *t = tokens->data[i];
if (last && last->ty == TK_STR && t->ty == TK_STR) {
append(last, t);
continue;
}
last = t;
vec_push(v, t);
}
return v;
}
Vector *tokenize(char *path, bool add_eof) {
if (!keywords)
keywords = keyword_map();
FILE *fp = open_file(path);
char *buf = read_file(fp);
replace_crlf(buf);
remove_backslash_newline(buf);
env = new_env(env, path, buf);
scan();
if (add_eof)
add(TK_EOF, NULL);
Vector *v = env->tokens;
env = env->prev;
v = preprocess(v);
v = strip_newline_tokens(v);
return join_string_literals(v);
}