Compare commits

...

5 Commits

Author SHA1 Message Date
bbf58eff2f Add .gitignore 2025-11-11 03:12:42 +01:00
ae114de9b7 Add more tokens 2025-11-11 01:59:18 +01:00
98482768f2 Fix 2025-11-11 01:58:32 +01:00
4441a7eb21 Add license (unlicense) 2025-11-06 07:30:45 +01:00
de6a826a83 Add Makefile 2025-11-06 07:30:22 +01:00
6 changed files with 281 additions and 199 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
tokenizer

1
Makefile Normal file
View File

@@ -0,0 +1 @@
tokenizer: tokenizer.c os.c

192
dare.c
View File

@@ -1,192 +0,0 @@
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <ctype.h>
#include <assert.h>
#include <stdbool.h>
#include "os.h"
#include "s8slice.h"
/* Cursor-based reader over the contents of a file held in memory. */
typedef struct Parser {
S8Slice file; /* the bytes being scanned */
int64_t cursor; /* index into file.data of the next character to read */
} Parser;
/* A lexical token: a kind tag plus the payload that belongs to that kind. */
typedef struct Token {
enum {
TOKEN_UNDEFINED,
TOKEN_INT, /* payload in `integer` */
TOKEN_IDENTIFIER, /* payload in `identifier` */
TOKEN_STRING, /* read_string() stores its payload through `identifier` */
TOKEN_SYNTAX_ERROR,
TOKEN_EOF, /* end of input; carries no payload */
} kind;
/* Anonymous union: `identifier` and `string` share the same storage. */
union {
S8Slice identifier;
S8Slice string;
int integer;
};
} Token;
/* Build a parser over `file` with its cursor on the first byte. */
Parser parser_attach(S8Slice file)
{
    Parser fresh;

    fresh.file = file;
    fresh.cursor = 0;
    return fresh;
}
/*
 * Return the character under the cursor without consuming it, or EOF when
 * the cursor is at or past the end of the file.
 */
static int parser_peek(Parser* p)
{
    if (p->cursor >= p->file.len) {
        return EOF;
    }
    /* Widen through unsigned char: where plain char is signed, a byte >= 0x80
     * would otherwise come out negative, so 0xFF would be indistinguishable
     * from EOF and the value would not be valid input for <ctype.h>. */
    return (int)(unsigned char)p->file.data[p->cursor];
}
/*
 * Consume and return the character under the cursor, or EOF at end of file.
 * A '\n' immediately followed by '\r' is consumed as a single character.
 */
static int parser_getch(Parser* p)
{
    int ch = EOF;

    if (p->cursor < p->file.len) {
        /* Widen through unsigned char so bytes >= 0x80 never turn negative
         * (and 0xFF never looks like EOF) where plain char is signed. */
        ch = (int)(unsigned char)p->file.data[p->cursor];
    }
    /* The cursor moves even when EOF is returned: read_integer() and
     * read_identifier() call parser_ungetch() unconditionally after their
     * loops and rely on that step to land back on the end of the file. */
    p->cursor += 1;
    if (ch == '\n' && parser_peek(p) == '\r') {
        p->cursor += 1;
    }
    return ch;
}
/* Step the cursor back by one position; a cursor already at 0 stays put. */
static void parser_ungetch(Parser* p)
{
    if (p->cursor <= 0) {
        return;
    }
    p->cursor -= 1;
}
/*
 * Consume characters for as long as the predicate `f` accepts them. The first
 * rejected character is pushed back so the next read sees it, unless that
 * character was EOF.
 */
static void parser_discard(Parser* p, int (*f) (int))
{
    for (;;) {
        const int c = parser_getch(p);

        if (f(c)) {
            continue;
        }
        if (c != EOF) {
            parser_ungetch(p);
        }
        return;
    }
}
/*
 * Read a run of decimal digits starting at the cursor and return it as a
 * TOKEN_INT. The cursor is left on the first character after the digits.
 */
static Token read_integer(Parser* p)
{
    /* FIXME: add support for 0x prefixes */
    /* the first char should be guaranteed to be isdigit */
    assert(isdigit(parser_peek(p)));

    /* Accumulate in unsigned arithmetic: a long digit run would overflow a
     * signed int, which is undefined behaviour, whereas unsigned arithmetic
     * wraps with defined semantics. Values that fit in an int are unchanged. */
    unsigned int n = 0;
    int ch;
    while (ch = parser_getch(p), isdigit(ch)) {
        n = n * 10u + (unsigned int)(ch - '0');
    }
    /* Undo the read of the terminating character (or of EOF). */
    parser_ungetch(p);

    /* For an out-of-range value this conversion is implementation-defined
     * rather than undefined. */
    Token t = {.kind = TOKEN_INT, .integer = (int)n};
    return t;
}
/*
 * Read an identifier (a letter followed by letters and digits) starting at the
 * cursor. The returned TOKEN_IDENTIFIER refers to a slice of the parser's file.
 */
static Token read_identifier(Parser* p)
{
    /* the caller only dispatches here when the next char is a letter */
    assert(isalpha(parser_peek(p)));

    const int64_t first = p->cursor;
    for (;;) {
        const int c = parser_getch(p);
        if (!isalnum(c)) {
            break;
        }
    }
    /* give back the character that ended the identifier */
    parser_ungetch(p);
    const int64_t last = p->cursor;

    Token tok = {
        .kind = TOKEN_IDENTIFIER,
        .identifier = s8slice(&p->file, first, last)
    };
    return tok;
}
/*
 * Read a double-quoted string starting at the cursor and return it as a
 * TOKEN_STRING whose slice excludes both quotes. The string ends at the first
 * '"' encountered. An unterminated string (newline or end of file before the
 * closing quote) is reported on stderr and terminates the program.
 */
static Token read_string(Parser* p)
{
    /* the first char should be guaranteed to be '"' */
    assert(parser_peek(p) == '"');
    (void)parser_getch(p); /* skip opening quote */

    const int64_t begin = p->cursor;
    int ch;
    while (ch = parser_getch(p), ch != '"' && ch != '\n' && ch != EOF)
        /* NOOP */;
    if (ch != '"') {
        /* The loop stops only on '"', '\n' or EOF, so ch is one of the latter
         * two here. Name it instead of printing it with %c, which would emit
         * a raw newline or a garbage byte for EOF. */
        fprintf(stderr, "syntax error: expected \", found %s\n",
                ch == EOF ? "end of file" : "end of line");
        exit(EXIT_FAILURE);
    }
    const int64_t end = p->cursor - 1; /* subtract one to ignore end quote */

    Token t = {
        .kind = TOKEN_STRING,
        .string = s8slice(&p->file, begin, end)
    };
    return t;
}
/*
 * Read the next token at the cursor. The caller is expected to have skipped
 * leading whitespace. Returns TOKEN_EOF at end of input and TOKEN_UNDEFINED
 * for a character that starts no known token.
 */
static Token read_token(Parser* p)
{
    const int ch = parser_peek(p);

    if (ch == EOF) {
        return (Token){.kind = TOKEN_EOF};
    }
    if (isalpha(ch)) {
        return read_identifier(p);
    }
    if (isdigit(ch)) {
        return read_integer(p);
    }
    if (ch == '"') {
        return read_string(p);
    }
    /* Unrecognised character: consume it so the caller always makes progress,
     * and return a fully initialised token. Previously this path returned an
     * uninitialised Token without moving the cursor, so main() could spin
     * forever on the same character. */
    (void)parser_getch(p);
    return (Token){.kind = TOKEN_UNDEFINED};
}
/*
 * Entry point: tokenize the file named by the single command-line argument
 * and print every identifier, integer and string token to stdout.
 */
int main(int argc, char** argv)
{
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <file>\n", argv[0] ? argv[0] : "program");
        exit(EXIT_FAILURE);
    }
    S8Slice path = s8slice_from_cstr(argv[1]);
    const S8Slice f = os_open_file(path, OS_READ);
    if (f.len == -1) { /* a length of -1 is treated as "open failed" */
        /* Print argv[1] rather than path.data: %s needs a NUL-terminated
         * string, which argv[1] is guaranteed to be. */
        fprintf(stderr, "could not open file %s: %s\n", argv[1], strerror(errno));
        exit(EXIT_FAILURE);
    }

    Parser p = parser_attach(f);
    while (true) {
        parser_discard(&p, isspace);
        const Token t = read_token(&p);

        switch (t.kind) {
        case TOKEN_IDENTIFIER:
        case TOKEN_STRING: {
            /* identifier and string share storage in the Token union */
            const S8Slice s = t.identifier;
            /* %lld requires a long long argument, so cast explicitly. */
            printf("{.len = %lld, .data = %.*s}\n",
                   (long long)s.len, (int)s.len, s.data);
            break;
        }
        case TOKEN_INT:
            printf("%d\n", t.integer);
            break;
        case TOKEN_EOF:
            return EXIT_SUCCESS;
        default:
            /* other kinds are not printed */
            break;
        }
    }
}

24
license.txt Normal file
View File

@@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <https://unlicense.org/>

8
os.c
View File

@@ -7,13 +7,7 @@
#include "s8slice.h"
#include <stdio.h>
enum {
OS_READ = 1<<0,
OS_WRITE = 1<<1,
OS_ALLFLAGS = (1<<2)-1
};
typedef unsigned int os_open_flags;
#include "os.h"
S8Slice os_open_file(S8Slice path, os_open_flags flags)
{

254
tokenizer.c Normal file
View File

@@ -0,0 +1,254 @@
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "os.h"
#include "s8slice.h"
/*
 * Debug tracing switch. Change `#if 0` to `#if 1` to make log_trace() forward
 * its arguments to fprintf(stderr, ...); otherwise it expands to nothing.
 * The macro supplies the stream itself, so callers pass only the format
 * string and its arguments.
 */
#if 0
#define log_trace(...) fprintf(stderr, __VA_ARGS__)
#else
#define log_trace(...)
#endif
/* Cursor-based reader over the contents of a file held in memory. */
typedef struct Parser {
S8Slice file; /* the bytes being scanned */
int64_t cursor; /* index into file.data of the next character to read */
} Parser;
/* A lexical token: a kind tag plus the payload that belongs to that kind. */
typedef struct Token {
enum {
TOKEN_UNDEFINED, /* unrecognised character; payload in `undefined` */
TOKEN_INT, /* payload in `integer` */
TOKEN_ID, /* payload in `identifier` */
TOKEN_STRING, /* read_string() stores its payload through `identifier` */
TOKEN_SYNTAX_ERROR,
TOKEN_EOS, /* produced for a newline; presumably "end of statement" */
TOKEN_EOF, /* end of input; carries no payload */
} kind;
/* Anonymous union: `identifier` and `string` share the same storage. */
union {
S8Slice identifier;
S8Slice string;
int integer;
char undefined;
};
} Token;
/*
 * Printable name of each token kind, indexed by the kind's enum value.
 * main() uses it to label every token it prints.
 */
const char* token_kind_str[] = {
[TOKEN_UNDEFINED] = "TOKEN_UNDEFINED",
[TOKEN_INT] = "TOKEN_INT",
[TOKEN_ID] = "TOKEN_ID",
[TOKEN_STRING] = "TOKEN_STRING",
[TOKEN_SYNTAX_ERROR] = "TOKEN_SYNTAX_ERROR",
[TOKEN_EOS] = "TOKEN_EOS",
[TOKEN_EOF] = "TOKEN_EOF",
};
/* Build a parser over `file` with its cursor on the first byte. */
Parser parser_attach(S8Slice file)
{
    Parser fresh;

    fresh.file = file;
    fresh.cursor = 0;
    return fresh;
}
/*
 * Return the character under the cursor without consuming it, or EOF when
 * the cursor is at or past the end of the file.
 */
static int parser_peek(Parser* p)
{
    int ch = EOF;

    if (p->cursor < p->file.len) {
        /* Widen through unsigned char: where plain char is signed, a byte
         * >= 0x80 would otherwise come out negative, so 0xFF would be
         * indistinguishable from EOF and the value would not be valid input
         * for <ctype.h>. */
        ch = (int)(unsigned char)p->file.data[p->cursor];
    }
    log_trace("peeking:\t%c\n", isprint(ch) ? ch : '?');
    return ch;
}
/*
 * Consume and return the character under the cursor, or EOF at end of file.
 * A '\n' immediately followed by '\r' is consumed as a single character.
 */
static int parser_getch(Parser* p)
{
    int ch = EOF;

    if (p->cursor < p->file.len) {
        /* Widen through unsigned char so bytes >= 0x80 never turn negative
         * (and 0xFF never looks like EOF) where plain char is signed. */
        ch = (int)(unsigned char)p->file.data[p->cursor];
    }
    /* The cursor moves even when EOF is returned: read_integer() and
     * read_identifier() call parser_ungetch() unconditionally after their
     * loops and rely on that step to land back on the end of the file. */
    p->cursor += 1;
    /* log_trace() supplies stderr itself; only the format and args go here. */
    log_trace("getch:\t%c\n", isprint(ch) ? ch : '?');
    /* Bounds-check before looking ahead so a trailing '\n' never reads one
     * past the end of the buffer. */
    if (ch == '\n'
        && p->cursor < p->file.len
        && p->file.data[p->cursor] == '\r') {
        p->cursor += 1;
    }
    return ch;
}
/* Consume one character and throw it away (it is only reported to the trace). */
static void parser_skip_char(Parser* p)
{
    const int ch = parser_getch(p);

    (void)ch; /* only read by log_trace(), which may expand to nothing */
    /* log_trace() supplies stderr itself; only the format and args go here. */
    log_trace("skipping:\t%c\n", isprint(ch) ? ch : '?');
}
/*
 * Undo the most recent parser_getch(): step the cursor back by one position,
 * or by two when that read consumed a "\n\r" pair. A cursor already at the
 * start of the file is left untouched.
 */
static void parser_ungetch(Parser* p)
{
    /* log_trace() supplies stderr itself; only the format goes here. */
    log_trace("ungetch:\n");
    if (p->cursor <= 0) {
        return; /* nothing to undo; never let the cursor go negative */
    }
    p->cursor -= 1;
    /* Step over a '\r' only when it is the tail of a "\n\r" pair, which is the
     * one case where parser_getch() advances by two. Stepping over any '\r'
     * would rewind onto the previous character of a token followed by
     * "\r\n", truncating that token and stalling the tokenizer. The upper
     * bound check matters after an EOF read, when the cursor sits on len. */
    if (p->cursor > 0
        && p->cursor < p->file.len
        && p->file.data[p->cursor] == '\r'
        && p->file.data[p->cursor - 1] == '\n') {
        p->cursor -= 1;
    }
}
/*
 * Consume characters for as long as the predicate `f` accepts them. The first
 * rejected character is pushed back so the next read sees it, unless that
 * character was EOF.
 */
static inline void parser_discard(Parser* p, int (*f) (int))
{
    for (;;) {
        const int c = parser_getch(p);

        if (f(c)) {
            continue;
        }
        if (c != EOF) {
            parser_ungetch(p);
        }
        return;
    }
}
/*
 * Read a run of decimal digits starting at the cursor and return it as a
 * TOKEN_INT. The cursor is left on the first character after the digits.
 */
static Token read_integer(Parser* p)
{
    /* FIXME: add support for 0x prefixes */
    /* the first char should be guaranteed to be isdigit */
    assert(isdigit(parser_peek(p)));

    /* Accumulate in unsigned arithmetic: a long digit run would overflow a
     * signed int, which is undefined behaviour, whereas unsigned arithmetic
     * wraps with defined semantics. Values that fit in an int are unchanged. */
    unsigned int n = 0;
    int ch;
    while (ch = parser_getch(p), isdigit(ch)) {
        n = n * 10u + (unsigned int)(ch - '0');
    }
    /* Undo the read of the terminating character (or of EOF). */
    parser_ungetch(p);

    /* For an out-of-range value this conversion is implementation-defined
     * rather than undefined. */
    Token t = {.kind = TOKEN_INT, .integer = (int)n};
    return t;
}
/*
 * Predicate for characters allowed after the first character of an
 * identifier: letters, digits and underscore. Returns 1 or 0.
 */
int is_identifier_tail(int ch)
{
    if (ch == '_') {
        return 1;
    }
    return isalnum(ch) ? 1 : 0;
}
/*
 * Read an identifier starting at the cursor: a letter followed by any number
 * of characters accepted by is_identifier_tail(). The returned TOKEN_ID
 * refers to a slice of the parser's file.
 */
static Token read_identifier(Parser* p)
{
    /* the leading letter should have been checked by the caller */
    assert(isalpha(parser_peek(p)));

    const int64_t first = p->cursor;
    for (;;) {
        const int c = parser_getch(p);
        if (!is_identifier_tail(c)) {
            break;
        }
    }
    /* give back the character that ended the identifier */
    parser_ungetch(p);
    const int64_t last = p->cursor;

    Token tok = {
        .kind = TOKEN_ID,
        .identifier = s8slice(&p->file, first, last)
    };
    return tok;
}
/*
 * Read a double-quoted string starting at the cursor and return it as a
 * TOKEN_STRING whose slice excludes both quotes. The string ends at the first
 * '"' encountered. An unterminated string (newline or end of file before the
 * closing quote) is reported on stderr and terminates the program.
 */
static Token read_string(Parser* p)
{
    /* the first char should be guaranteed to be '"' */
    assert(parser_peek(p) == '"');
    (void)parser_getch(p); /* skip opening quote */

    const int64_t begin = p->cursor;
    int ch;
    while (ch = parser_getch(p), ch != '"' && ch != '\n' && ch != EOF)
        /* NOOP */;
    if (ch != '"') {
        /* The loop stops only on '"', '\n' or EOF, so ch is one of the latter
         * two here. Name it instead of printing it with %c, which would emit
         * a raw newline or a garbage byte for EOF. */
        fprintf(stderr, "syntax error: expected \", found %s\n",
                ch == EOF ? "end of file" : "end of line");
        exit(EXIT_FAILURE);
    }
    const int64_t end = p->cursor - 1; /* subtract one to ignore end quote */

    Token t = {
        .kind = TOKEN_STRING,
        .string = s8slice(&p->file, begin, end)
    };
    return t;
}
/*
 * Read the next token at the cursor. The caller is expected to have skipped
 * leading non-newline whitespace.
 *
 * Returns TOKEN_EOF at end of input, TOKEN_EOS for a newline (any whitespace
 * that follows it, further newlines included, is swallowed), an identifier,
 * integer or string token, or TOKEN_UNDEFINED carrying the offending
 * character when nothing else matches.
 */
static Token read_token(Parser* p)
{
    const int ch = parser_peek(p);
    /* Start from a fully initialised token so the payload is never
     * indeterminate on the paths that only set `kind`. */
    Token t = {.kind = TOKEN_UNDEFINED};

    if (ch == EOF) {
        t.kind = TOKEN_EOF;
    }
    else if (ch == '\n') {
        t.kind = TOKEN_EOS;
        parser_skip_char(p);
        parser_discard(p, isspace);
    }
    else if (isalpha(ch)) {
        t = read_identifier(p);
    }
    else if (isdigit(ch)) {
        t = read_integer(p);
    }
    else if (ch == '"') {
        t = read_string(p);
    }
    else {
        /* kind is already TOKEN_UNDEFINED; record the character and consume
         * it so the caller always makes progress. */
        t.undefined = (char)ch;
        parser_skip_char(p);
    }
    return t;
}
/*
 * Predicate for whitespace that does not end a line: true for every
 * character isspace() accepts except '\n'. Returns 1 or 0.
 */
int isspace_except_newline(int ch)
{
    if (ch == '\n') {
        return 0;
    }
    return isspace(ch) != 0;
}
int main(int argc, char** argv)
{
if (argc != 2) {
fprintf(stderr, "Usage: %s <file>\n", argv[0] ? argv[0] : "program");
exit(EXIT_FAILURE);
}
S8Slice path = s8slice_from_cstr(argv[1]);
const S8Slice f = os_open_file(path, OS_READ);
if (f.len == -1) {
fprintf(stderr, "could not open file %s: %s\n", path.data, strerror(errno));
exit(EXIT_FAILURE);
}
Parser p = parser_attach(f);
Token t = {0};
while (true) {
parser_discard(&p, isspace_except_newline);
Token t = read_token(&p);
printf("%s\t", token_kind_str[t.kind]);
if (t.kind == TOKEN_ID) {
S8Slice s = t.identifier;
printf("<%.*s>\n", (int)s.len, s.data);
}
else if (t.kind == TOKEN_INT) {
printf("%d\n", t.integer);
}
else if (t.kind == TOKEN_STRING) {
S8Slice s = t.identifier;
printf("\"%.*s\"\n", (int)s.len, s.data);
}
else if (t.kind == TOKEN_EOF) {
break;
} else if (t.kind == TOKEN_UNDEFINED) {
printf("'%c'\n", t.undefined);
}
else {
printf("\n");
}
}
return EXIT_SUCCESS;
}