diff --git a/tokenizer.c b/tokenizer.c index 314735e..58bc3a6 100644 --- a/tokenizer.c +++ b/tokenizer.c @@ -8,185 +8,247 @@ #include "os.h" #include "s8slice.h" +#if 0 +#define log_trace(...) fprintf(stderr, __VA_ARGS__) +#else +#define log_trace(...) +#endif + + typedef struct Parser { - S8Slice file; - int64_t cursor; + S8Slice file; + int64_t cursor; } Parser; typedef struct Token { - enum { - TOKEN_UNDEFINED, - TOKEN_INT, - TOKEN_IDENTIFIER, - TOKEN_STRING, - TOKEN_SYNTAX_ERROR, - TOKEN_EOF, - } kind; - union { - S8Slice identifier; - S8Slice string; - int integer; - }; + enum { + TOKEN_UNDEFINED, + TOKEN_INT, + TOKEN_ID, + TOKEN_STRING, + TOKEN_SYNTAX_ERROR, + TOKEN_EOS, + TOKEN_EOF, + } kind; + union { + S8Slice identifier; + S8Slice string; + int integer; + char undefined; + }; } Token; +const char* token_kind_str[] = { + [TOKEN_UNDEFINED] = "TOKEN_UNDEFINED", + [TOKEN_INT] = "TOKEN_INT", + [TOKEN_ID] = "TOKEN_ID", + [TOKEN_STRING] = "TOKEN_STRING", + [TOKEN_SYNTAX_ERROR] = "TOKEN_SYNTAX_ERROR", + [TOKEN_EOS] = "TOKEN_EOS", + [TOKEN_EOF] = "TOKEN_EOF", +}; + Parser parser_attach(S8Slice file) { - return (Parser){.file = file, .cursor = 0}; + return (Parser){.file = file, .cursor = 0}; } static int parser_peek(Parser* p) { - const int ch = p->cursor >= p->file.len - ? EOF - : (int)p->file.data[p->cursor]; - return ch; + const int ch = p->cursor >= p->file.len + ? EOF + : (int)p->file.data[p->cursor]; + log_trace("peeking:\t%c\n", isprint(ch) ? ch : '?'); + return ch; } static int parser_getch(Parser* p) { - const int ch = p->cursor >= p->file.len - ? EOF - : (int)p->file.data[p->cursor]; - p->cursor += 1; - if (ch == '\n' && parser_peek(p) == '\r') { - p->cursor += 1; - } - return ch; + const int ch = p->cursor >= p->file.len + ? EOF + : (int)p->file.data[p->cursor]; + p->cursor += 1; + log_trace(stderr, "getch:\t%c\n", isprint(ch) ? ch : '?'); + if (ch == '\n' && p->file.data[p->cursor] == '\r') { + p->cursor += 1; + } + return ch; +} + +static void parser_skip_char(Parser* p) +{ + const int ch = parser_getch(p); + log_trace(stderr, "skipping:\t%c\n", isprint(ch) ? ch : '?'); } static void parser_ungetch(Parser* p) { - if (p->cursor > 0) { - p->cursor -= 1; - } + log_trace(stderr, "ungetch:\n"); + if (p->cursor > 0) { + p->cursor -= 1; + } + if (p->file.data[p->cursor] == '\r') { + p->cursor -= 1; + } } -static void parser_discard(Parser* p, int (*f) (int)) +static inline void parser_discard(Parser* p, int (*f) (int)) { - int ch; - while (ch = parser_getch(p), f(ch)) - /* noop */; - if (ch != EOF) - parser_ungetch(p); + int ch; + while (ch = parser_getch(p), f(ch)) + /* noop */; + if (ch != EOF) + parser_ungetch(p); } static Token read_integer(Parser* p) { - /* FIXME: add support for 0x prefixes */ + /* FIXME: add support for 0x prefixes */ - /* the first char should be guaranteed to be isdigit */ - assert(isdigit(parser_peek(p))); + /* the first char should be guaranteed to be isdigit */ + assert(isdigit(parser_peek(p))); - int ch = EOF; - int n = 0; - while (ch = parser_getch(p), isdigit(ch)) { - n *= 10; - n += ch - '0'; - } - parser_ungetch(p); + int ch = EOF; + int n = 0; + while (ch = parser_getch(p), isdigit(ch)) { + n *= 10; + n += ch - '0'; + } + parser_ungetch(p); - Token t = {.kind = TOKEN_INT, .integer = n}; + Token t = {.kind = TOKEN_INT, .integer = n}; - return t; + return t; +} + +int is_identifier_tail(int ch) +{ + return isalnum(ch) || ch == '_'; } static Token read_identifier(Parser* p) { - int ch = EOF; + int ch = EOF; - /* the first char should be guaranteed to be isalpha */ - assert(isalpha(parser_peek(p))); + /* the should be checked by the caller */ + assert(isalpha(parser_peek(p))); - int64_t begin = p->cursor; - while (ch = parser_getch(p), isalnum(ch)) - /* NOOP */; - parser_ungetch(p); - int64_t end = p->cursor; + int64_t begin = p->cursor; + while (ch = parser_getch(p), is_identifier_tail(ch)) + /* NOOP */; + parser_ungetch(p); + int64_t end = p->cursor; - Token t = { - .kind = TOKEN_IDENTIFIER, - .identifier = s8slice(&p->file, begin, end) - }; + Token t = { + .kind = TOKEN_ID, + .identifier = s8slice(&p->file, begin, end) + }; - return t; + return t; } static Token read_string(Parser* p) { - int ch = EOF; + int ch = EOF; - /* the first char should be guaranteed to be '"' */ - assert(parser_peek(p) == '"'); + /* the first char should be guaranteed to be '"' */ + assert(parser_peek(p) == '"'); - (void)parser_getch(p); /* skip quote */ - int64_t begin = p->cursor; - while (ch = parser_getch(p), ch != '"' && ch != '\n' && ch != EOF) - /* NOOP */; - if (ch != '"') { - fprintf(stderr, "syntax error: expected \", found %c\n", ch); - exit(EXIT_FAILURE); - } - int64_t end = p->cursor - 1; /* subtract one to ignore end quote */ + (void)parser_getch(p); /* skip quote */ + int64_t begin = p->cursor; + while (ch = parser_getch(p), ch != '"' && ch != '\n' && ch != EOF) + /* NOOP */; + if (ch != '"') { + fprintf(stderr, "syntax error: expected \", found %c\n", ch); + exit(EXIT_FAILURE); + } + int64_t end = p->cursor - 1; /* subtract one to ignore end quote */ - Token t = { - .kind = TOKEN_STRING, - .identifier = s8slice(&p->file, begin, end) - }; + Token t = { + .kind = TOKEN_STRING, + .identifier = s8slice(&p->file, begin, end) + }; - return t; + return t; } static Token read_token(Parser* p) { - int ch = parser_peek(p); + int ch = parser_peek(p); - Token t; + Token t; - if (ch == EOF) { - t.kind = TOKEN_EOF; - } else if (isalpha(ch)) { - t = read_identifier(p); - } else if (isdigit(ch)) { - t = read_integer(p); - } else if (ch == '"') { - t = read_string(p); - } + if (ch == EOF) { + t.kind = TOKEN_EOF; + } + else if (ch == '\n') { + t.kind = TOKEN_EOS; + parser_skip_char(p); + parser_discard(p, isspace); + } + else if (isalpha(ch)) { + t = read_identifier(p); + } + else if (isdigit(ch)) { + t = read_integer(p); + } + else if (ch == '"') { + t = read_string(p); + } else { + t.kind = TOKEN_UNDEFINED; + t.undefined = ch; + parser_skip_char(p); + } - return t; + return t; +} + +int isspace_except_newline(int ch) +{ + return isspace(ch) && (ch != '\n'); } int main(int argc, char** argv) { - if (argc != 2) { - fprintf(stderr, "Usage: %s \n", argv[0] ? argv[0] : "program"); - exit(EXIT_FAILURE); - } + if (argc != 2) { + fprintf(stderr, "Usage: %s \n", argv[0] ? argv[0] : "program"); + exit(EXIT_FAILURE); + } - S8Slice path = s8slice_from_cstr(argv[1]); - const S8Slice f = os_open_file(path, OS_READ); + S8Slice path = s8slice_from_cstr(argv[1]); + const S8Slice f = os_open_file(path, OS_READ); - if (f.len == -1) { - fprintf(stderr, "could not open file %s: %s\n", path.data, strerror(errno)); - exit(EXIT_FAILURE); - } + if (f.len == -1) { + fprintf(stderr, "could not open file %s: %s\n", path.data, strerror(errno)); + exit(EXIT_FAILURE); + } - Parser p = parser_attach(f); - Token t = {0}; - while (true) { - parser_discard(&p, isspace); - Token t = read_token(&p); - if (t.kind == TOKEN_IDENTIFIER) { - S8Slice s = t.identifier; - printf("{.len = %lld, .data = %.*s}\n", s.len, (int)s.len, s.data); - } else if (t.kind == TOKEN_INT) { - printf("%d\n", t.integer); - } else if (t.kind == TOKEN_STRING) { - S8Slice s = t.identifier; - printf("{.len = %lld, .data = %.*s}\n", s.len, (int)s.len, s.data); - } else if (t.kind == TOKEN_EOF) { - break; - } - } + Parser p = parser_attach(f); + Token t = {0}; + while (true) { + parser_discard(&p, isspace_except_newline); + Token t = read_token(&p); + printf("%s\t", token_kind_str[t.kind]); + if (t.kind == TOKEN_ID) { + S8Slice s = t.identifier; + printf("<%.*s>\n", (int)s.len, s.data); + } + else if (t.kind == TOKEN_INT) { + printf("%d\n", t.integer); + } + else if (t.kind == TOKEN_STRING) { + S8Slice s = t.identifier; + printf("\"%.*s\"\n", (int)s.len, s.data); + } + else if (t.kind == TOKEN_EOF) { + break; + } else if (t.kind == TOKEN_UNDEFINED) { + printf("'%c'\n", t.undefined); + } + else { + printf("\n"); + } + } - return EXIT_SUCCESS; + return EXIT_SUCCESS; }