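//! FFI smoke test for the tokeniser: each block below points the tokeniser at a
//! fixture (a `tests/tokens/*.l` file or an in-memory buffer), drains it with
//! `collect_tokens`, and compares the result against the expected lexeme sequence.
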
#[path = "shared/shared.rs"]
mod util;

use util::*;

#[derive(Debug)]
struct Lexeme(u8, &'static str);

impl PartialEq for Lexeme {
    fn eq(&self, other: &Self) -> bool {
        match self.0 {
            // Identifiers and numbers compare both token and lexeme
            30 | 31 => self.0 == other.0 && self.1 == other.1,
            _ => self.0 == other.0,
        }
    }
}

impl Eq for Lexeme {}

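// Fallible conversion from the raw FFI record into the Rust-side `Lexeme`.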
trait AsLexeme {
    fn as_lexeme(self) -> Option<Lexeme>;
}

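// Raw lexeme record returned by `find_lexeme` across the FFI boundary;
// `#[repr(C)]` keeps the field layout compatible with the foreign definition.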
#[repr(C)]
struct LexemeRaw {
    token: u8,
    lexeme: *const u8,
    len: usize,
}

impl AsLexeme for LexemeRaw {
    fn as_lexeme(self) -> Option<Lexeme> {
        let Self { token, lexeme, len } = self;
        let slice =
            unsafe { core::str::from_utf8_unchecked(core::slice::from_raw_parts(lexeme, len)) };
        match token {
            1.. => Some(Lexeme(token, slice)),
            _ => None,
        }
    }
}

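// Foreign functions and globals exposed by the tokeniser under test (C ABI).
// `MaybeFFISlice` and `FFISlice` come in through the shared `util` module.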
#[allow(dead_code)]
unsafe extern "C" {
    unsafe fn tokeniser_init(path: *const i8) -> ();
    unsafe fn tokeniser_init_buf(bytes: *const u8, len: usize) -> ();
    unsafe fn tokeniser_print() -> ();
    unsafe fn is_ident(len: usize) -> bool;
    unsafe fn is_number(len: usize) -> bool;
    unsafe fn skip_whitespace() -> ();

    unsafe fn find_lexeme() -> LexemeRaw;
    unsafe fn expect_token(token: u8) -> MaybeFFISlice;
    unsafe fn unwrap_token(token: u8) -> FFISlice;

    static mut LEXEMES: *const u8;
    static mut LEXEME_LENS: usize;
    static mut NUM_LEXEMES: usize;
    static mut TOKENS: u8;

    static mut input_file: u32;
    static mut buffer: *mut u8;
    static mut cursor: usize;
    static mut buffer_len: usize;

    unsafe fn exit(code: i32) -> !;
}

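/// Drains the tokeniser: keeps calling `find_lexeme` until it reports token 0
/// (end of input), collecting everything else as `Lexeme`s.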
fn collect_tokens() -> Vec<Lexeme> {
    let mut lexemes = Vec::new();
    unsafe {
        while let Some(lexeme) = find_lexeme().as_lexeme() {
            lexemes.push(lexeme);
        }
    }

    lexemes
}

fn main() {
    unsafe {
        use util::defs::*;

        // assert initial state
        assert_eq!((&raw const input_file).read(), 0);
        assert_eq!((&raw const buffer_len).read(), 0);
        assert_eq!((&raw const cursor).read(), 0);
        assert_eq!((&raw const buffer).read(), core::ptr::null_mut());

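        // Keyword fixture: only the token ids matter here, since `PartialEq for
        // Lexeme` ignores the text for everything except tokens 30 and 31.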
        eprint!("Initializing tokeniser.. ");
        tokeniser_init(c"tests/tokens/keywords.l".as_ptr());
        eprintln!("ok.");

        assert_eq!(
            &collect_tokens()[..],
            &[
                Lexeme(4, ""),
                Lexeme(1, ""),
                Lexeme(2, ""),
                Lexeme(3, ""),
                Lexeme(4, ""),
                Lexeme(8, ""),
                Lexeme(13, ""),
                Lexeme(11, ""),
                Lexeme(10, ""),
                Lexeme(9, ""),
                Lexeme(5, ""),
            ][..]
        );

        eprint!("Initializing tokeniser.. ");
        tokeniser_init(c"tests/tokens/delimiters.l".as_ptr());
        eprintln!("ok.");

        assert_eq!(
            &collect_tokens()[..],
            &[
                Lexeme(19, ""),
                Lexeme(18, ""),
                Lexeme(28, ""),
                Lexeme(29, ""),
                Lexeme(21, ""),
                Lexeme(20, ""),
                Lexeme(24, ""),
                Lexeme(12, ""),
                Lexeme(23, ""),
                Lexeme(22, ""),
                Lexeme(15, ""),
            ][..]
        );

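        // Identifier fixture: the identifier text is compared as well, per the
        // identifier/number arm of `PartialEq for Lexeme`.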
        eprint!("Initializing tokeniser.. ");
        tokeniser_init(c"tests/tokens/identifier.l".as_ptr());
        eprintln!("ok.");

        assert_eq!(
            &collect_tokens()[..],
            &[
                Lexeme(TOKEN_IDENT, "this-is-an-ident"),
                Lexeme(TOKEN_IDENT, "another_ident123"),
                Lexeme(TOKEN_IDENT, "_underscore_test"),
                Lexeme(TOKEN_IDENT, "mixedCASEIdent"),
                Lexeme(TOKEN_IDENT, "number12345"),
                Lexeme(TOKEN_IDENT, "____"),
                Lexeme(TOKEN_IDENT, "_"),
                Lexeme(17, ""),
                Lexeme(TOKEN_IDENT, "leading-minus"),
                Lexeme(TOKEN_IDENT, "trailing-minus-"),
            ]
        );

        eprint!("Initializing tokeniser.. ");
        tokeniser_init(c"tests/tokens/function.l".as_ptr());
        eprintln!("ok.");

        assert_eq!(
            &collect_tokens()[..],
            &[
                Lexeme(4, ""),
                Lexeme(TOKEN_IDENT, "my-function"),
                Lexeme(19, ""),
                Lexeme(18, ""),
                Lexeme(12, ""),
                Lexeme(11, ""),
                Lexeme(21, ""),
                Lexeme(5, ""),
                Lexeme(10, ""),
                Lexeme(23, ""),
                Lexeme(20, ""),
            ]
        );

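        // Re-initialise on the same fixture and step through it token by token
        // with `expect_token` / `unwrap_token` instead of `collect_tokens`.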
        eprint!("Initializing tokeniser.. ");
        tokeniser_init(c"tests/tokens/function.l".as_ptr());
        eprintln!("ok.");

        assert_eq!(expect_token(2).into_option(), None);
        assert_eq!(expect_token(4).into_option().unwrap().as_str(), "fn");
        assert_eq!(unwrap_token(TOKEN_IDENT).as_str(), "my-function");

        eprint!("Initializing tokeniser.. ");
        tokeniser_init(c"tests/tokens/comment.l".as_ptr());
        eprintln!("ok.");

        assert_eq!(
            &collect_tokens()[..],
            &[
                Lexeme(TOKEN_COMMENT, ""),
                Lexeme(4, ""),
                Lexeme(TOKEN_IDENT, "my-function"),
                Lexeme(19, ""),
                Lexeme(18, ""),
                Lexeme(12, ""),
                Lexeme(11, ""),
                Lexeme(21, ""),
                Lexeme(TOKEN_COMMENT, ""),
                Lexeme(5, ""),
                Lexeme(10, ""),
                Lexeme(23, ""),
                Lexeme(20, ""),
            ]
        );

        eprint!("Initializing tokeniser.. ");
        tokeniser_init(c"tests/tokens/number.l".as_ptr());
        eprintln!("ok.");

        assert_eq!(
            &collect_tokens()[..],
            &[
                Lexeme(TOKEN_NUMBER, "1234"),
                Lexeme(TOKEN_NUMBER, "123_345_"),
                Lexeme(TOKEN_NUMBER, "1234____56"),
                Lexeme(TOKEN_NUMBER, "1"),
                Lexeme(TOKEN_NUMBER, "0"),
            ]
        );

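        // String fixture: lexemes keep their surrounding quotes and escape
        // sequences verbatim.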
        eprint!("Initializing tokeniser.. ");
        tokeniser_init(c"tests/tokens/strings.l".as_ptr());
        eprintln!("ok.");

        assert_eq!(
            &collect_tokens()[..],
            &[
                Lexeme(TOKEN_STRING, "\"this is a string\""),
                Lexeme(
                    TOKEN_STRING,
                    "\"another\nstring\nspanning multiple\n lines\""
                ),
                Lexeme(TOKEN_STRING, "\"string with a \\\"quoted\\\" word\""),
                Lexeme(TOKEN_STRING, "\"a\""),
                Lexeme(TOKEN_STRING, "\"\"")
            ],
        );

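        // The remaining cases tokenise directly from an in-memory buffer via
        // `tokeniser_init_buf` instead of reading a fixture file.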
        eprint!("Initializing tokeniser.. ");
        let src = b"3 + 4";
        tokeniser_init_buf(src.as_ptr(), src.len());
        eprintln!("ok.");

        assert_eq!(
            &collect_tokens()[..],
            &[
                Lexeme(TOKEN_NUMBER, "3"),
                Lexeme(16, "+"),
                Lexeme(TOKEN_NUMBER, "4")
            ],
        );

        eprint!("Initializing tokeniser.. ");
        let src = b"fn main() -> void { return 1 + 2; }";
        tokeniser_init_buf(src.as_ptr(), src.len());
        eprintln!("ok.");

        assert_eq!(
            &collect_tokens()[..],
            &[
                Lexeme(4, "fn"),
                Lexeme(TOKEN_IDENT, "main"),
                Lexeme(19, "("),
                Lexeme(18, ")"),
                Lexeme(12, "->"),
                Lexeme(30, "void"),
                Lexeme(21, "{"),
                Lexeme(5, "return"),
                Lexeme(TOKEN_NUMBER, "1"),
                Lexeme(16, "+"),
                Lexeme(TOKEN_NUMBER, "2"),
                Lexeme(23, ";"),
                Lexeme(20, "}"),
            ],
        );

        eprint!("Initializing tokeniser.. ");
        let src = b"(b / d + c) * 42;";
        tokeniser_init_buf(src.as_ptr(), src.len());
        eprintln!("ok.");

        assert_eq!(
            &collect_tokens()[..],
            &[
                Lexeme(19, "("),
                Lexeme(33, "b"),
                Lexeme(31, "/"),
                Lexeme(33, "d"),
                Lexeme(16, "+"),
                Lexeme(33, "c"),
                Lexeme(18, ")"),
                Lexeme(32, "*"),
                Lexeme(34, "42"),
                Lexeme(23, ";")
            ],
        );

        eprintln!("Finished tokenising.");
    }
}