Compare commits

...

3 Commits

Author SHA1 Message Date
bbf58eff2f Add .gitignore 2025-11-11 03:12:42 +01:00
ae114de9b7 Add more tokens 2025-11-11 01:59:18 +01:00
98482768f2 Fix 2025-11-11 01:58:32 +01:00
3 changed files with 183 additions and 126 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
tokenizer

8
os.c
View File

@@ -7,13 +7,7 @@
#include "s8slice.h" #include "s8slice.h"
#include <stdio.h> #include <stdio.h>
#include "os.h"
enum {
OS_READ = 1<<0,
OS_WRITE = 1<<1,
OS_ALLFLAGS = (1<<2)-1
};
typedef unsigned int os_open_flags;
S8Slice os_open_file(S8Slice path, os_open_flags flags) S8Slice os_open_file(S8Slice path, os_open_flags flags)
{ {

View File

@@ -8,185 +8,247 @@
#include <limits.h>
#include "os.h"
#include "s8slice.h"
/*
 * Debug tracing for the tokenizer.
 *
 * Change `#if 0` to `#if 1` to route trace output to stderr. In the disabled
 * form log_trace() expands to nothing, so its arguments are neither evaluated
 * nor type-checked.
 *
 * The enabled form supplies stderr itself: call sites pass only the format
 * string and its arguments.
 */
#if 0
#define log_trace(...) fprintf(stderr, __VA_ARGS__)
#else
#define log_trace(...)
#endif
typedef struct Parser { typedef struct Parser {
S8Slice file; S8Slice file;
int64_t cursor; int64_t cursor;
} Parser; } Parser;
typedef struct Token { typedef struct Token {
enum { enum {
TOKEN_UNDEFINED, TOKEN_UNDEFINED,
TOKEN_INT, TOKEN_INT,
TOKEN_IDENTIFIER, TOKEN_ID,
TOKEN_STRING, TOKEN_STRING,
TOKEN_SYNTAX_ERROR, TOKEN_SYNTAX_ERROR,
TOKEN_EOF, TOKEN_EOS,
} kind; TOKEN_EOF,
union { } kind;
S8Slice identifier; union {
S8Slice string; S8Slice identifier;
int integer; S8Slice string;
}; int integer;
char undefined;
};
} Token; } Token;
/*
 * Printable name of each token kind, indexed by the kind's enumerator value.
 * Designated initializers keep every entry tied to its enumerator regardless
 * of the order in which the enumerators are declared.
 */
const char* token_kind_str[] = {
[TOKEN_UNDEFINED] = "TOKEN_UNDEFINED",
[TOKEN_INT] = "TOKEN_INT",
[TOKEN_ID] = "TOKEN_ID",
[TOKEN_STRING] = "TOKEN_STRING",
[TOKEN_SYNTAX_ERROR] = "TOKEN_SYNTAX_ERROR",
[TOKEN_EOS] = "TOKEN_EOS",
[TOKEN_EOF] = "TOKEN_EOF",
};
/*
 * Create a parser positioned at the first character of `file`.
 * The slice is stored by value in the returned Parser.
 */
Parser parser_attach(S8Slice file)
{
    return (Parser){.file = file, .cursor = 0};
}
/*
 * Return the next character without consuming it, or EOF when the cursor is
 * at or past the end of the file.
 */
static int parser_peek(Parser* p)
{
    int ch = EOF;
    if (p->cursor < p->file.len) {
        /*
         * Widen through unsigned char: whatever the element type of
         * file.data is, the result is then never negative, so it cannot be
         * mistaken for EOF and is always a valid <ctype.h> argument.
         */
        ch = (int)(unsigned char)p->file.data[p->cursor];
    }
    log_trace("peeking:\t%c\n", isprint(ch) ? ch : '?');
    return ch;
}
/*
 * Consume and return the next character, or EOF at end of file.
 *
 * The cursor advances even when EOF is returned, so that an unconditional
 * parser_ungetch() afterwards (as read_integer() and read_identifier() do)
 * puts it back where it was. A '\n' immediately followed by '\r' is consumed
 * as a single unit and reported as '\n'.
 */
static int parser_getch(Parser* p)
{
    int ch = EOF;
    if (p->cursor < p->file.len) {
        /* unsigned char keeps bytes >= 0x80 non-negative (distinct from EOF) */
        ch = (int)(unsigned char)p->file.data[p->cursor];
    }
    p->cursor += 1;
    /* log_trace() supplies the stream itself; pass only format + arguments */
    log_trace("getch:\t%c\n", isprint(ch) ? ch : '?');

    /* bounds-check before looking ahead: '\n' may be the last byte of the file */
    if (ch == '\n'
        && p->cursor < p->file.len
        && p->file.data[p->cursor] == '\r') {
        p->cursor += 1;
    }
    return ch;
}
/* Consume one character and throw it away (traced when tracing is enabled). */
static void parser_skip_char(Parser* p)
{
    const int ch = parser_getch(p);
    /* log_trace() supplies the stream itself; pass only format + arguments */
    log_trace("skipping:\t%c\n", isprint(ch) ? ch : '?');
    (void)ch; /* unused when log_trace() expands to nothing */
}
/*
 * Undo the most recent parser_getch().
 *
 * parser_getch() consumes "\n\r" as one unit, so when the cursor lands on a
 * '\r' that directly follows a '\n' it is moved back once more. A '\r' that
 * does not follow '\n' is an ordinary one-character read and is stepped over
 * only once; backing up twice there would re-read the preceding character
 * forever. All reads of file.data are bounds-checked, and the cursor never
 * goes below zero.
 */
static void parser_ungetch(Parser* p)
{
    log_trace("ungetch:\n");
    if (p->cursor > 0) {
        p->cursor -= 1;
    }
    if (p->cursor > 0
        && p->cursor < p->file.len
        && p->file.data[p->cursor] == '\r'
        && p->file.data[p->cursor - 1] == '\n') {
        p->cursor -= 1;
    }
}
/*
 * Skip characters for as long as the predicate `f` accepts them.
 *
 * `f` has the <ctype.h> classifier signature (e.g. isspace). The first
 * rejected character is pushed back so the next read sees it; nothing is
 * pushed back when the scan stopped on EOF.
 */
static inline void parser_discard(Parser* p, int (*f) (int))
{
    int ch;
    do {
        ch = parser_getch(p);
    } while (f(ch));

    if (ch != EOF) {
        parser_ungetch(p);
    }
}
/*
 * Read a run of decimal digits as a TOKEN_INT.
 *
 * The caller must have checked that the next character is a digit. If the
 * value does not fit in an int, the remaining digits are still consumed and a
 * TOKEN_SYNTAX_ERROR token is returned instead of overflowing.
 */
static Token read_integer(Parser* p)
{
    /* FIXME: add support for 0x prefixes */
    assert(isdigit(parser_peek(p)));

    int ch = EOF;
    int n = 0;
    bool overflow = false;
    while (ch = parser_getch(p), isdigit(ch)) {
        const int digit = ch - '0';
        /* n * 10 + digit fits in an int iff n <= (INT_MAX - digit) / 10 */
        if (overflow || n > (INT_MAX - digit) / 10) {
            overflow = true; /* keep consuming so the whole literal is skipped */
            continue;
        }
        n = n * 10 + digit;
    }
    /* push back the non-digit (or the EOF step) that ended the loop */
    parser_ungetch(p);

    if (overflow) {
        return (Token){.kind = TOKEN_SYNTAX_ERROR};
    }
    return (Token){.kind = TOKEN_INT, .integer = n};
}
/*
 * Return nonzero if `ch` may appear after the first character of an
 * identifier: an alphanumeric character or '_'.
 *
 * Negative inputs (EOF included) are rejected up front, because passing a
 * negative value other than EOF to isalnum() is undefined.
 */
int is_identifier_tail(int ch)
{
    if (ch < 0) {
        return 0;
    }
    return isalnum(ch) || ch == '_';
}
/*
 * Read an identifier as a TOKEN_ID: one letter followed by any number of
 * characters accepted by is_identifier_tail().
 *
 * The token's slice refers to the parser's file text. The caller must have
 * checked that the next character is a letter.
 */
static Token read_identifier(Parser* p)
{
    /* the first character must have been checked by the caller */
    assert(isalpha(parser_peek(p)));

    const int64_t begin = p->cursor;
    int ch = EOF;
    do {
        ch = parser_getch(p);
    } while (is_identifier_tail(ch));
    /* push back the character (or the EOF step) that ended the identifier */
    parser_ungetch(p);
    const int64_t end = p->cursor;

    Token t = {
        .kind = TOKEN_ID,
        .identifier = s8slice(&p->file, begin, end)
    };
    return t;
}
/*
 * Read a double-quoted string as a TOKEN_STRING whose slice covers the text
 * between the quotes (quotes excluded). No escape sequences are interpreted.
 *
 * The caller must have checked that the next character is '"'. A string that
 * is not closed before the end of the line or file is a fatal error: a
 * message is written to stderr and the process exits with EXIT_FAILURE.
 */
static Token read_string(Parser* p)
{
    assert(parser_peek(p) == '"');
    (void)parser_getch(p); /* skip the opening quote */

    const int64_t begin = p->cursor;
    int ch = EOF;
    do {
        ch = parser_getch(p);
    } while (ch != '"' && ch != '\n' && ch != EOF);

    if (ch != '"') {
        /* the loop only stops on '"', '\n' or EOF; neither of the latter two
         * prints usefully with %c, so name them instead */
        fprintf(stderr, "syntax error: expected \", found %s\n",
                ch == EOF ? "end of file" : "end of line");
        exit(EXIT_FAILURE);
    }

    const int64_t end = p->cursor - 1; /* exclude the closing quote */
    Token t = {
        .kind = TOKEN_STRING,
        .string = s8slice(&p->file, begin, end)
    };
    return t;
}
/*
 * Read the next token, dispatching on the first unread character:
 *   EOF     -> TOKEN_EOF
 *   '\n'    -> TOKEN_EOS; the newline and all whitespace after it (further
 *              newlines included) are consumed, so a run of blank lines
 *              yields a single TOKEN_EOS
 *   letter  -> read_identifier()
 *   digit   -> read_integer()
 *   '"'     -> read_string()
 *   other   -> TOKEN_UNDEFINED carrying that character, which is consumed
 *
 * Leading whitespace is not skipped here.
 */
static Token read_token(Parser* p)
{
    const int ch = parser_peek(p);
    Token t = {0}; /* no indeterminate bytes in kinds that carry no payload */

    if (ch == EOF) {
        t.kind = TOKEN_EOF;
    } else if (ch == '\n') {
        t.kind = TOKEN_EOS;
        parser_skip_char(p);
        parser_discard(p, isspace);
    } else if (isalpha(ch)) {
        t = read_identifier(p);
    } else if (isdigit(ch)) {
        t = read_integer(p);
    } else if (ch == '"') {
        t = read_string(p);
    } else {
        t.kind = TOKEN_UNDEFINED;
        t.undefined = (char)ch;
        parser_skip_char(p);
    }
    return t;
}
/*
 * Return nonzero if `ch` is whitespace other than '\n'.
 *
 * Negative inputs (EOF included) are rejected up front, because passing a
 * negative value other than EOF to isspace() is undefined.
 */
int isspace_except_newline(int ch)
{
    if (ch < 0 || ch == '\n') {
        return 0;
    }
    return isspace(ch) != 0;
}
int main(int argc, char** argv) int main(int argc, char** argv)
{ {
if (argc != 2) { if (argc != 2) {
fprintf(stderr, "Usage: %s <file>\n", argv[0] ? argv[0] : "program"); fprintf(stderr, "Usage: %s <file>\n", argv[0] ? argv[0] : "program");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
S8Slice path = s8slice_from_cstr(argv[1]); S8Slice path = s8slice_from_cstr(argv[1]);
const S8Slice f = os_open_file(path, OS_READ); const S8Slice f = os_open_file(path, OS_READ);
if (f.len == -1) { if (f.len == -1) {
fprintf(stderr, "could not open file %s: %s\n", path.data, strerror(errno)); fprintf(stderr, "could not open file %s: %s\n", path.data, strerror(errno));
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
Parser p = parser_attach(f); Parser p = parser_attach(f);
Token t = {0}; Token t = {0};
while (true) { while (true) {
parser_discard(&p, isspace); parser_discard(&p, isspace_except_newline);
Token t = read_token(&p); Token t = read_token(&p);
if (t.kind == TOKEN_IDENTIFIER) { printf("%s\t", token_kind_str[t.kind]);
S8Slice s = t.identifier; if (t.kind == TOKEN_ID) {
printf("{.len = %lld, .data = %.*s}\n", s.len, (int)s.len, s.data); S8Slice s = t.identifier;
} else if (t.kind == TOKEN_INT) { printf("<%.*s>\n", (int)s.len, s.data);
printf("%d\n", t.integer); }
} else if (t.kind == TOKEN_STRING) { else if (t.kind == TOKEN_INT) {
S8Slice s = t.identifier; printf("%d\n", t.integer);
printf("{.len = %lld, .data = %.*s}\n", s.len, (int)s.len, s.data); }
} else if (t.kind == TOKEN_EOF) { else if (t.kind == TOKEN_STRING) {
break; S8Slice s = t.identifier;
} printf("\"%.*s\"\n", (int)s.len, s.data);
} }
else if (t.kind == TOKEN_EOF) {
break;
} else if (t.kind == TOKEN_UNDEFINED) {
printf("'%c'\n", t.undefined);
}
else {
printf("\n");
}
}
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }