more tests for tokeniser
This commit is contained in:
parent
63fbca378b
commit
cfc1c8e3b3
|
|
@ -372,6 +372,8 @@ is_id_continue:
|
|||
jne .is_id_continue_ret
|
||||
cmp cl, '_'
|
||||
je .is_id_continue_ret
|
||||
cmp cl, '-'
|
||||
je .is_id_continue_ret
|
||||
xor rax, rax
|
||||
ret
|
||||
.is_id_continue_ret:
|
||||
|
|
|
|||
|
|
@ -114,98 +114,106 @@ global LEXEME_LENS
|
|||
global NUM_LEXEMES
|
||||
|
||||
align 8
|
||||
LEXEMES: dq \
|
||||
LEX_NOT_A_LEXEME, \
|
||||
LEX_LET, \
|
||||
LEX_IF, \
|
||||
LEX_ELSE, \
|
||||
LEX_FN, \
|
||||
LEX_RETURN, \
|
||||
LEX_LOOP, \
|
||||
LEX_BREAK, \
|
||||
LEX_CONTINUE, \
|
||||
LEX_TRUE, \
|
||||
LEX_FALSE, \
|
||||
LEX_BOOL, \
|
||||
LEX_ARROW, \
|
||||
LEX_I32, \
|
||||
LEX_U32, \
|
||||
LEX_EQUALS, \
|
||||
LEX_PLUS, \
|
||||
LEX_MINUS, \
|
||||
LEX_RPARENS, \
|
||||
LEX_LPARENS, \
|
||||
LEX_RBRACE, \
|
||||
LEX_LBRACE, \
|
||||
LEX_COLON, \
|
||||
LEX_SEMI, \
|
||||
LEX_COMMA, \
|
||||
LEX_PIPE, \
|
||||
LEX_AMP, \
|
||||
LEX_EQEQ
|
||||
align 8
|
||||
TOKENS: db \
|
||||
TOKEN_EOF, \
|
||||
TOKEN_LET, \
|
||||
TOKEN_IF, \
|
||||
TOKEN_ELSE, \
|
||||
TOKEN_FN, \
|
||||
TOKEN_RETURN, \
|
||||
TOKEN_LOOP, \
|
||||
TOKEN_BREAK, \
|
||||
TOKEN_CONTINUE, \
|
||||
TOKEN_TRUE, \
|
||||
TOKEN_FALSE, \
|
||||
TOKEN_BOOL, \
|
||||
TOKEN_ARROW, \
|
||||
TOKEN_I32, \
|
||||
TOKEN_U32, \
|
||||
TOKEN_EQUALS, \
|
||||
TOKEN_PLUS, \
|
||||
TOKEN_MINUS, \
|
||||
TOKEN_RPARENS, \
|
||||
TOKEN_LPARENS, \
|
||||
TOKEN_RBRACE, \
|
||||
TOKEN_LBRACE, \
|
||||
TOKEN_COLON, \
|
||||
TOKEN_SEMI, \
|
||||
TOKEN_COMMA, \
|
||||
TOKEN_PIPE, \
|
||||
TOKEN_AMP, \
|
||||
TOKEN_EQEQ
|
||||
align 8
|
||||
LEXEME_LENS: dq \
|
||||
0, \
|
||||
LEX_LET_len, \
|
||||
LEX_IF_len, \
|
||||
LEX_ELSE_len, \
|
||||
LEX_FN_len, \
|
||||
LEX_RETURN_len, \
|
||||
LEX_LOOP_len, \
|
||||
LEX_BREAK_len, \
|
||||
LEX_CONTINUE_len, \
|
||||
LEX_TRUE_len, \
|
||||
LEX_FALSE_len, \
|
||||
LEX_BOOL_len, \
|
||||
LEX_ARROW_len, \
|
||||
LEX_I32_len, \
|
||||
LEX_U32_len, \
|
||||
LEX_EQUALS_len, \
|
||||
LEX_PLUS_len, \
|
||||
LEX_MINUS_len, \
|
||||
LEX_RPARENS_len, \
|
||||
LEX_LPARENS_len, \
|
||||
LEX_RBRACE_len, \
|
||||
LEX_LBRACE_len, \
|
||||
LEX_COLON_len, \
|
||||
LEX_SEMI_len, \
|
||||
LEX_COMMA_len, \
|
||||
LEX_PIPE_len, \
|
||||
LEX_AMP_len, \
|
||||
LEX_EQEQ_len
|
||||
LEXEMES:
|
||||
dq LEX_NOT_A_LEXEME
|
||||
dq LEX_LET
|
||||
dq LEX_IF
|
||||
dq LEX_ELSE
|
||||
dq LEX_FN
|
||||
dq LEX_RETURN
|
||||
dq LEX_LOOP
|
||||
dq LEX_BREAK
|
||||
dq LEX_CONTINUE
|
||||
dq LEX_TRUE
|
||||
dq LEX_FALSE
|
||||
dq LEX_BOOL
|
||||
dq LEX_ARROW
|
||||
dq LEX_I32
|
||||
dq LEX_U32
|
||||
dq LEX_EQUALS
|
||||
dq LEX_PLUS
|
||||
dq LEX_MINUS
|
||||
dq LEX_RPARENS
|
||||
dq LEX_LPARENS
|
||||
dq LEX_RBRACE
|
||||
dq LEX_LBRACE
|
||||
dq LEX_COLON
|
||||
dq LEX_SEMI
|
||||
dq LEX_COMMA
|
||||
dq LEX_PIPE
|
||||
dq LEX_AMP
|
||||
dq LEX_EQEQ
|
||||
dq LEX_LBRACKET
|
||||
dq LEX_RBRACKET
|
||||
|
||||
align 8
|
||||
NUM_LEXEMES: dq 28
|
||||
TOKENS:
|
||||
db TOKEN_EOF ;; 0
|
||||
db TOKEN_LET ;; 1
|
||||
db TOKEN_IF ;; 2
|
||||
db TOKEN_ELSE ;; 3
|
||||
db TOKEN_FN ;; 4
|
||||
db TOKEN_RETURN ;; 5
|
||||
db TOKEN_LOOP ;; 6
|
||||
db TOKEN_BREAK ;; 7
|
||||
db TOKEN_CONTINUE ;; 8
|
||||
db TOKEN_TRUE ;; 9
|
||||
db TOKEN_FALSE ;; 10
|
||||
db TOKEN_BOOL ;; 11
|
||||
db TOKEN_ARROW ;; 12
|
||||
db TOKEN_I32 ;; 13
|
||||
db TOKEN_U32 ;; 14
|
||||
db TOKEN_EQUALS ;; 15
|
||||
db TOKEN_PLUS ;; 16
|
||||
db TOKEN_MINUS ;; 17
|
||||
db TOKEN_RPARENS ;; 18
|
||||
db TOKEN_LPARENS ;; 19
|
||||
db TOKEN_RBRACE ;; 20
|
||||
db TOKEN_LBRACE ;; 21
|
||||
db TOKEN_COLON ;; 22
|
||||
db TOKEN_SEMI ;; 23
|
||||
db TOKEN_COMMA ;; 24
|
||||
db TOKEN_PIPE ;; 25
|
||||
db TOKEN_AMP ;; 26
|
||||
db TOKEN_EQEQ ;; 27
|
||||
db TOKEN_LBRACKET ;; 28
|
||||
db TOKEN_RBRACKET ;; 29
|
||||
|
||||
align 8
|
||||
LEXEME_LENS:
|
||||
dq 0
|
||||
dq LEX_LET_len
|
||||
dq LEX_IF_len
|
||||
dq LEX_ELSE_len
|
||||
dq LEX_FN_len
|
||||
dq LEX_RETURN_len
|
||||
dq LEX_LOOP_len
|
||||
dq LEX_BREAK_len
|
||||
dq LEX_CONTINUE_len
|
||||
dq LEX_TRUE_len
|
||||
dq LEX_FALSE_len
|
||||
dq LEX_BOOL_len
|
||||
dq LEX_ARROW_len
|
||||
dq LEX_I32_len
|
||||
dq LEX_U32_len
|
||||
dq LEX_EQUALS_len
|
||||
dq LEX_PLUS_len
|
||||
dq LEX_MINUS_len
|
||||
dq LEX_RPARENS_len
|
||||
dq LEX_LPARENS_len
|
||||
dq LEX_RBRACE_len
|
||||
dq LEX_LBRACE_len
|
||||
dq LEX_COLON_len
|
||||
dq LEX_SEMI_len
|
||||
dq LEX_COMMA_len
|
||||
dq LEX_PIPE_len
|
||||
dq LEX_AMP_len
|
||||
dq LEX_EQEQ_len
|
||||
dq LEX_LBRACKET_len
|
||||
dq LEX_RBRACKET_len
|
||||
|
||||
align 8
|
||||
NUM_LEXEMES: dq 30
|
||||
|
||||
LEX_NOT_A_LEXEME db "<not a lexeme>", 0
|
||||
TOKEN_EOF equ 0
|
||||
|
|
@ -290,13 +298,20 @@ NUM_LEXEMES: dq 28
|
|||
TOKEN_EQEQ equ 27
|
||||
LEX_EQEQ db "=="
|
||||
LEX_EQEQ_len equ $ - LEX_EQEQ
|
||||
TOKEN_IDENT equ 28
|
||||
TOKEN_LBRACKET equ 28
|
||||
LEX_LBRACKET db "["
|
||||
LEX_LBRACKET_len equ $ - LEX_LBRACKET
|
||||
TOKEN_RBRACKET equ 29
|
||||
LEX_RBRACKET db "]"
|
||||
LEX_RBRACKET_len equ $ - LEX_RBRACKET
|
||||
TOKEN_IDENT equ 30
|
||||
LEX_IDENT db "<identifier>"
|
||||
LEX_IDENT_len equ $ - LEX_IDENT
|
||||
TOKEN_NUMBER equ 29
|
||||
TOKEN_NUMBER equ 31
|
||||
LEX_NUMBER db "<number>"
|
||||
LEX_NUMBER_len equ $ - LEX_NUMBER
|
||||
|
||||
|
||||
section .text
|
||||
;; rdi: length of matched lexeme
|
||||
is_ident:
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ extern "C" fn panic() -> ! {
|
|||
panic!("Called panic from external code.");
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
struct Lexeme(u8);
|
||||
|
||||
impl Lexeme {
|
||||
|
|
@ -27,7 +28,7 @@ trait AsLexeme {
|
|||
impl AsLexeme for u8 {
|
||||
fn as_lexeme(self) -> Option<Lexeme> {
|
||||
match self {
|
||||
1..=10 => Some(Lexeme(self)),
|
||||
1.. => Some(Lexeme(self)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
|
@ -55,51 +56,61 @@ unsafe extern "C" {
|
|||
unsafe fn exit(code: i32) -> !;
|
||||
}
|
||||
|
||||
// fn lexemes_raw() -> &'static [*const u8] {
|
||||
// unsafe {
|
||||
// core::slice::from_raw_parts(
|
||||
// (&raw const LEXEMES),
|
||||
// (&raw const NUM_LEXEMES).read(),
|
||||
// )
|
||||
// }
|
||||
// }
|
||||
fn collect_tokens() -> Vec<Lexeme> {
|
||||
let mut lexemes = Vec::new();
|
||||
unsafe {
|
||||
while let Some(lexeme) = find_lexeme().as_lexeme() {
|
||||
lexemes.push(lexeme);
|
||||
}
|
||||
}
|
||||
|
||||
// fn lexeme_lens() -> &'static [usize] {
|
||||
// unsafe {
|
||||
// core::slice::from_raw_parts(
|
||||
// (&raw const LEXEME_LENS),
|
||||
// (&raw const NUM_LEXEMES).read(),
|
||||
// )
|
||||
// }
|
||||
// }
|
||||
|
||||
// fn lexeme_iter() -> impl Iterator<Item = &'static str> {
|
||||
// lexemes_raw().iter().zip(lexeme_lens().iter()).map(|(&ptr, &len)| {
|
||||
// // SAFETY: lexemes_raw and lexeme_lens are guaranteed to contain valid
|
||||
// // UTF-8 data and correct lengths.
|
||||
// unsafe {
|
||||
// core::str::from_utf8_unchecked(core::slice::from_raw_parts(ptr, len))
|
||||
// }
|
||||
// })
|
||||
// }
|
||||
lexemes
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let path = c"tests/tokens.l";
|
||||
|
||||
unsafe {
|
||||
// assert initial state
|
||||
assert_eq!((&raw const input_file).read(), 0);
|
||||
assert_eq!((&raw const buffer_len).read(), 0);
|
||||
assert_eq!((&raw const cursor).read(), 0);
|
||||
assert_eq!((&raw const buffer).read(), core::ptr::null_mut());
|
||||
eprint!("Initializing tokeniser.. ");
|
||||
tokeniser_init(path.as_ptr());
|
||||
eprintln!("ok.");
|
||||
eprintln!("{}: {:?}[{}..{}]", (&raw const input_file).read(), (&raw const buffer).read(), (&raw const cursor).read(), (&raw const buffer_len).read());
|
||||
tokeniser_print();
|
||||
|
||||
while let Some(lexeme) = find_lexeme().as_lexeme() {
|
||||
eprintln!("Found lexeme: {}", lexeme.lex());
|
||||
}
|
||||
eprint!("Initializing tokeniser.. ");
|
||||
tokeniser_init(c"tests/tokens/keywords.l".as_ptr());
|
||||
eprintln!("ok.");
|
||||
|
||||
assert_eq!(&collect_tokens()[..], &[
|
||||
Lexeme(4),
|
||||
Lexeme(1),
|
||||
Lexeme(2),
|
||||
Lexeme(3),
|
||||
Lexeme(4),
|
||||
Lexeme(8),
|
||||
Lexeme(13),
|
||||
Lexeme(11),
|
||||
Lexeme(10),
|
||||
Lexeme(9),
|
||||
Lexeme(5),
|
||||
][..]);
|
||||
|
||||
eprint!("Initializing tokeniser.. ");
|
||||
tokeniser_init(c"tests/tokens/delimiters.l".as_ptr());
|
||||
eprintln!("ok.");
|
||||
|
||||
assert_eq!(&collect_tokens()[..], &[
|
||||
Lexeme(19),
|
||||
Lexeme(18),
|
||||
Lexeme(28),
|
||||
Lexeme(29),
|
||||
Lexeme(21),
|
||||
Lexeme(20),
|
||||
Lexeme(24),
|
||||
Lexeme(12),
|
||||
Lexeme(23),
|
||||
Lexeme(22),
|
||||
Lexeme(15),
|
||||
][..]);
|
||||
|
||||
eprintln!("Finished tokenising.");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
1
lang/tests/tokens/delimiters.l
Normal file
1
lang/tests/tokens/delimiters.l
Normal file
|
|
@ -0,0 +1 @@
|
|||
()[]{},->;:=
|
||||
3
lang/tests/tokens/function.l
Normal file
3
lang/tests/tokens/function.l
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
fn my-function() -> bool {
|
||||
return false;
|
||||
}
|
||||
7
lang/tests/tokens/identifier.l
Normal file
7
lang/tests/tokens/identifier.l
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
this-is-an-ident
|
||||
another_ident123
|
||||
_underscore_test
|
||||
mixedCASEIdent
|
||||
number12345
|
||||
____
|
||||
_
|
||||
3
lang/tests/tokens/keywords.l
Normal file
3
lang/tests/tokens/keywords.l
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
fn let if else fn continue
|
||||
i32 bool false true
|
||||
return
|
||||
Loading…
Reference in a new issue