From 33c7349178085e9c02067083c30ae8d1c45c45d7 Mon Sep 17 00:00:00 2001
From: Ole Morud
Date: Thu, 6 Nov 2025 07:20:31 +0100
Subject: [PATCH] Add simple tokenizer

---
Note: the blob ids on the `index` lines are carried over unchanged and do
not match the revised file contents.

 dare.c    | 264 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 os.c      | 105 +++++++++++++++++++++
 os.h      |  20 ++++
 s8slice.h |  51 ++++++++++
 test.txt  |   4 +
 5 files changed, 444 insertions(+)
 create mode 100644 dare.c
 create mode 100644 os.c
 create mode 100644 os.h
 create mode 100644 s8slice.h
 create mode 100644 test.txt

diff --git a/dare.c b/dare.c
new file mode 100644
index 0000000..314735e
--- /dev/null
+++ b/dare.c
@@ -0,0 +1,264 @@
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "os.h"
+#include "s8slice.h"
+
+/* Read position inside an in-memory source file. */
+typedef struct Parser {
+    S8Slice file;   /* bytes being tokenized */
+    int64_t cursor; /* index of the next unread byte, at most file.len */
+} Parser;
+
+/* One lexical unit; `kind` selects which union member, if any, is valid. */
+typedef struct Token {
+    enum {
+        TOKEN_UNDEFINED,
+        TOKEN_INT,
+        TOKEN_IDENTIFIER,
+        TOKEN_STRING,
+        TOKEN_SYNTAX_ERROR,
+        TOKEN_EOF,
+    } kind;
+    union {
+        S8Slice identifier; /* TOKEN_IDENTIFIER: view into the parsed file */
+        S8Slice string;     /* TOKEN_STRING: contents without the quotes */
+        int integer;        /* TOKEN_INT */
+    };
+} Token;
+
+/* Create a parser positioned at the first byte of `file`. */
+Parser parser_attach(S8Slice file)
+{
+    return (Parser){.file = file, .cursor = 0};
+}
+
+/* Return the next byte without consuming it, or EOF at end of input. */
+static int parser_peek(const Parser* p)
+{
+    if (p->cursor >= p->file.len) {
+        return EOF;
+    }
+    return (int)p->file.data[p->cursor];
+}
+
+/*
+ * Consume and return the next byte, or EOF at end of input.
+ *
+ * A CRLF pair is consumed as a unit and reported as a single '\n'. The
+ * cursor is left untouched at EOF so it never moves past file.len.
+ */
+static int parser_getch(Parser* p)
+{
+    int ch = parser_peek(p);
+    if (ch == EOF) {
+        return EOF;
+    }
+    p->cursor += 1;
+    if (ch == '\r' && parser_peek(p) == '\n') {
+        p->cursor += 1;
+        ch = '\n';
+    }
+    return ch;
+}
+
+/*
+ * Undo the most recent successful parser_getch().
+ *
+ * Must not be called after parser_getch() returned EOF, because nothing
+ * was consumed in that case. Steps back over a whole CRLF pair so that it
+ * mirrors parser_getch().
+ */
+static void parser_ungetch(Parser* p)
+{
+    if (p->cursor <= 0) {
+        return;
+    }
+    p->cursor -= 1;
+    if (p->cursor > 0
+        && p->file.data[p->cursor] == '\n'
+        && p->file.data[p->cursor - 1] == '\r') {
+        p->cursor -= 1;
+    }
+}
+
+/* Skip bytes for which `f` returns non-zero (e.g. isspace). */
+static void parser_discard(Parser* p, int (*f) (int))
+{
+    int ch;
+    while (ch = parser_getch(p), ch != EOF && f(ch)) {
+        /* noop */
+    }
+    if (ch != EOF) {
+        parser_ungetch(p);
+    }
+}
+
+/*
+ * Read a decimal integer literal.
+ *
+ * Returns TOKEN_INT, or TOKEN_SYNTAX_ERROR when the value does not fit in
+ * an int. The whole run of digits is consumed in both cases.
+ */
+static Token read_integer(Parser* p)
+{
+    /* FIXME: add support for 0x prefixes */
+
+    /* the first char should be guaranteed to be isdigit */
+    assert(isdigit(parser_peek(p)));
+
+    int ch;
+    int n = 0;
+    bool overflow = false;
+    while (ch = parser_getch(p), isdigit(ch)) {
+        const int digit = ch - '0';
+        if (overflow || n > (INT_MAX - digit) / 10) {
+            /* signed overflow is undefined, so detect it before it happens */
+            overflow = true;
+        } else {
+            n = n * 10 + digit;
+        }
+    }
+    if (ch != EOF) {
+        parser_ungetch(p);
+    }
+
+    if (overflow) {
+        return (Token){.kind = TOKEN_SYNTAX_ERROR};
+    }
+    return (Token){.kind = TOKEN_INT, .integer = n};
+}
+
+/* Read an identifier: a letter followed by any number of letters/digits. */
+static Token read_identifier(Parser* p)
+{
+    /* the first char should be guaranteed to be isalpha */
+    assert(isalpha(parser_peek(p)));
+
+    const int64_t begin = p->cursor;
+    int ch;
+    while (ch = parser_getch(p), isalnum(ch)) {
+        /* noop */
+    }
+    if (ch != EOF) {
+        parser_ungetch(p);
+    }
+    const int64_t end = p->cursor;
+
+    return (Token){
+        .kind = TOKEN_IDENTIFIER,
+        .identifier = s8slice(&p->file, begin, end),
+    };
+}
+
+/*
+ * Read a double-quoted string literal that ends on the same line.
+ *
+ * An unterminated string is reported on stderr and terminates the program.
+ */
+static Token read_string(Parser* p)
+{
+    /* the first char should be guaranteed to be '"' */
+    assert(parser_peek(p) == '"');
+
+    (void)parser_getch(p); /* skip quote */
+    const int64_t begin = p->cursor;
+    int ch;
+    while (ch = parser_getch(p), ch != '"' && ch != '\n' && ch != EOF) {
+        /* noop */
+    }
+    if (ch != '"') {
+        /* ch is '\n' or EOF here; neither is printable with %c */
+        fprintf(stderr, "syntax error: expected \", found %s\n",
+                ch == EOF ? "end of file" : "end of line");
+        exit(EXIT_FAILURE);
+    }
+    const int64_t end = p->cursor - 1; /* subtract one to ignore end quote */
+
+    return (Token){
+        .kind = TOKEN_STRING,
+        .string = s8slice(&p->file, begin, end),
+    };
+}
+
+/*
+ * Read the token starting at the cursor.
+ *
+ * Leading whitespace is not skipped here. A byte that cannot start any
+ * token is consumed and reported as TOKEN_SYNTAX_ERROR, so repeated calls
+ * always make progress.
+ */
+static Token read_token(Parser* p)
+{
+    const int ch = parser_peek(p);
+
+    if (ch == EOF) {
+        return (Token){.kind = TOKEN_EOF};
+    }
+    if (isalpha(ch)) {
+        return read_identifier(p);
+    }
+    if (isdigit(ch)) {
+        return read_integer(p);
+    }
+    if (ch == '"') {
+        return read_string(p);
+    }
+
+    (void)parser_getch(p);
+    return (Token){.kind = TOKEN_SYNTAX_ERROR};
+}
+
+/* Print a slice in the `{.len = N, .data = ...}` debug format. */
+static void print_slice(S8Slice s)
+{
+    printf("{.len = %lld, .data = %.*s}\n",
+           (long long)s.len, (int)s.len, (const char*)s.data);
+}
+
+/* Tokenize the file named by argv[1] and print each token on its own line. */
+int main(int argc, char** argv)
+{
+    if (argc != 2) {
+        fprintf(stderr, "Usage: %s <file>\n", argv[0] ? argv[0] : "program");
+        exit(EXIT_FAILURE);
+    }
+
+    const S8Slice path = s8slice_from_cstr(argv[1]);
+    const S8Slice f = os_open_file(path, OS_READ);
+
+    if (f.len == -1) {
+        fprintf(stderr, "could not open file %s: %s\n", argv[1], strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+
+    Parser p = parser_attach(f);
+    int status = EXIT_SUCCESS;
+    for (;;) {
+        parser_discard(&p, isspace);
+        const Token t = read_token(&p);
+        if (t.kind == TOKEN_EOF) {
+            break;
+        }
+        if (t.kind == TOKEN_IDENTIFIER) {
+            print_slice(t.identifier);
+        } else if (t.kind == TOKEN_STRING) {
+            print_slice(t.string);
+        } else if (t.kind == TOKEN_INT) {
+            printf("%d\n", t.integer);
+        } else {
+            fprintf(stderr, "syntax error before byte offset %lld\n",
+                    (long long)p.cursor);
+            status = EXIT_FAILURE;
+        }
+    }
+
+    (void)os_close_file(f);
+    return status;
+}
diff --git a/os.c b/os.c
new file mode 100644
index 0000000..3f18cd4
--- /dev/null
+++ b/os.c
@@ -0,0 +1,105 @@
+#include <errno.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "os.h"
+#include "s8slice.h"
+
+/* Slice returned by every failing call in this file: len == -1. */
+static S8Slice os_error_slice(void)
+{
+    return (S8Slice){.len = -1, .data = (uint8_t*)""};
+}
+
+/* Slice used for zero-length files, which cannot be passed to mmap(). */
+static S8Slice os_empty_slice(void)
+{
+    return (S8Slice){.len = 0, .data = (uint8_t*)""};
+}
+
+/* Map the whole file behind `fd`; errno is set by the failing call. */
+static S8Slice os_map_fd(int fd, os_open_flags flags)
+{
+    struct stat st;
+    if (fstat(fd, &st) == -1) {
+        return os_error_slice();
+    }
+    if (st.st_size == 0) {
+        /* mmap() rejects a zero length with EINVAL */
+        return os_empty_slice();
+    }
+
+    int prot = 0;
+    if (flags & OS_READ) {
+        prot |= PROT_READ;
+    }
+    if (flags & OS_WRITE) {
+        prot |= PROT_WRITE;
+    }
+
+    void* mem = mmap(NULL, (size_t)st.st_size, prot, MAP_SHARED, fd, 0);
+    if (mem == MAP_FAILED) {
+        return os_error_slice();
+    }
+    return (S8Slice){.data = mem, .len = (int64_t)st.st_size};
+}
+
+/*
+ * Map the file at `path` into memory.
+ *
+ * `path.data` must be NUL-terminated because it is handed to open().
+ * `flags` must contain OS_READ, OS_WRITE or both.
+ *
+ * Returns a slice covering the file contents. On failure the slice has
+ * len == -1 and errno describes the error.
+ */
+S8Slice os_open_file(S8Slice path, os_open_flags flags)
+{
+    int oflag;
+    switch (flags & OS_ALLFLAGS) {
+    case OS_READ:
+        oflag = O_RDONLY;
+        break;
+    case OS_WRITE:
+    case OS_READ | OS_WRITE:
+        /* a writable MAP_SHARED mapping needs a read/write descriptor */
+        oflag = O_RDWR;
+        break;
+    default:
+        errno = EINVAL;
+        return os_error_slice();
+    }
+
+    const int fd = open((const char*)path.data, oflag);
+    if (fd == -1) {
+        return os_error_slice();
+    }
+
+    const S8Slice file = os_map_fd(fd, flags);
+
+    /* close() may overwrite errno; keep the value from the real failure */
+    const int saved_errno = errno;
+    close(fd);
+    errno = saved_errno;
+
+    return file;
+}
+
+/*
+ * Release a mapping returned by os_open_file().
+ *
+ * Returns an empty slice on success, or a slice with len == -1 if
+ * munmap() fails.
+ */
+S8Slice os_close_file(S8Slice file)
+{
+    if (file.len > 0 && munmap(file.data, (size_t)file.len) == -1) {
+        return os_error_slice();
+    }
+    return os_empty_slice();
+}
diff --git a/os.h b/os.h
new file mode 100644
index 0000000..860255f
--- /dev/null
+++ b/os.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "s8slice.h"
+
+/* Access modes accepted by os_open_file(); may be OR-ed together. */
+enum {
+    OS_READ = 1<<0,
+    OS_WRITE = 1<<1,
+    OS_ALLFLAGS = (1<<2)-1
+};
+typedef unsigned int os_open_flags;
+
+/*
+ * Map the file at `path` (NUL-terminated) into memory.
+ * On failure the returned slice has len == -1 and errno is set.
+ */
+S8Slice os_open_file(S8Slice path, os_open_flags flags);
+
+/* Release a mapping returned by os_open_file(); len == -1 on failure. */
+S8Slice os_close_file(S8Slice file);
diff --git a/s8slice.h b/s8slice.h
new file mode 100644
index 0000000..eb8ccfe
--- /dev/null
+++ b/s8slice.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <stdint.h>
+#include <string.h>
+
+/* View of `len` bytes starting at `data`; len == -1 marks an error. */
+typedef struct s8_slice {
+    uint8_t* data;
+    int64_t len;
+} S8Slice;
+
+/* Slice over a string literal, excluding the terminating NUL. */
+#define S8(str) (S8Slice) { \
+    .data = (uint8_t*)("" str), \
+    .len = sizeof(str)-1, \
+}
+
+/* Error slice: len is -1 and data points at the message literal. */
+#define S8_error(str) (S8Slice) { \
+    .data = (uint8_t*)("" str), \
+    .len = -1, \
+}
+
+/* View a NUL-terminated C string as a slice (the NUL is not counted). */
+static inline S8Slice s8slice_from_cstr(const char* cstr)
+{
+    return (S8Slice){
+        .len = (int64_t)strlen(cstr),
+        .data = (uint8_t*)cstr,
+    };
+}
+
+/*
+ * Return the sub-slice [begin, end) of `s`.
+ *
+ * `end` may equal s->len, so a slice can reach the last byte. An invalid
+ * range yields an error slice (len == -1).
+ */
+static inline S8Slice s8slice(const S8Slice* s, int64_t begin, int64_t end)
+{
+    if (begin < 0 || begin > end) {
+        return S8_error("s8slice: `begin` is negative or exceeds `end`");
+    }
+    if (end > s->len) {
+        return S8_error("s8slice: `end` surpasses length of `s`");
+    }
+    return (S8Slice) {
+        .len = end - begin,
+        .data = &s->data[begin]
+    };
+}
diff --git a/test.txt b/test.txt
new file mode 100644
index 0000000..da84427
--- /dev/null
+++ b/test.txt
@@ -0,0 +1,4 @@
+
+hello world 123 "this is a string"
+
+goodbye world 123