; from-scratch/lang/src/tokeniser.asm

section .text
extern panic
extern strlen
extern strcmp
extern streq
extern memcpy
extern eprint_str
extern exit
extern error_to_str
extern eprint_error
extern alloc_pages
extern allocate
extern fopen_read

extern is_alpha
extern is_numeric
extern is_id_continue
extern is_id_start
extern is_whitespace

global tokeniser_init
global tokeniser_print
global find_lexeme

;; =============================
;; Tokeniser functions
;; =============================

;; tokeniser state
;; All four variables are exported and are written by tokeniser_init; the
;; scanner routines below read buffer/buffer_len and advance cursor.
section .data
global input_file
global buffer
global cursor
global buffer_len

    input_file dd 0     ; file descriptor of the source file (32-bit: only 4 bytes wide)
    buffer dq 0         ; pointer to the in-memory copy of the source text
    cursor dq 0         ; byte offset into buffer of the next unread character
    buffer_len dq 0     ; length in bytes of the text held in buffer

;; each buffer is chunk_size bytes large
;; buffer header structure:
;; +0 (8 bytes): pointer buffer
;; +8 (8 bytes): size of buffer

;; Tokens:
;; [let, if, else, fn, return, loop, break, continue, true, false, i32, u32, bool, =, +, -, *, /, %, ==, !=, <, <=, >, >=, &&, ||, !, (, ), {, }, [, ], ;, ',', ]

section .bss
statbuf: resb 144

section .text
;; Initialises the tokeniser: opens the source file, asks fstat for its size,
;; allocates a buffer of that size and reads the file into it.
;;
;; rdx: pointer to filename (null-terminated); passed through untouched to
;;      fopen_read.
;;      NOTE(review): other helpers called from this file take their first
;;      argument in rdi - confirm fopen_read really expects rdx.
;; Out: rax = number of bytes read
;; Side effects: sets input_file, buffer, buffer_len; resets cursor to 0.
;; On a failing fstat/read the error is printed and panic is called.
;; Preserves r15 (callee-saved); the push also keeps rsp 16-byte aligned at
;; every call below.
SYS_READ        equ 0               ; Linux x86-64 syscall numbers
SYS_FSTAT       equ 5
STAT_ST_SIZE    equ 48              ; offset of st_size inside struct stat
tokeniser_init:
    push r15
    ; open file for reading
    ; this panics if the file doesn't exist
    call fopen_read
    mov dword [rel input_file], eax     ; store file descriptor
    mov qword [rel cursor], 0
    mov qword [rel buffer_len], 0

    ; fstat(input_file, &statbuf)
    mov eax, SYS_FSTAT
    mov edi, [rel input_file]           ; fd is a dword: load exactly 4 bytes
    lea rsi, [rel statbuf]
    syscall
    test rax, rax
    js .report_error
    mov r15, [rel statbuf + STAT_ST_SIZE]   ; r15 = file size in bytes

    ; allocate buffer
    mov rdi, r15
    call allocate
    mov qword [rel buffer], rax
    mov qword [rel buffer_len], r15

    ; read(input_file, buffer, file size)
    mov eax, SYS_READ
    mov edi, [rel input_file]
    mov rsi, [rel buffer]
    mov rdx, r15
    syscall
    test rax, rax
    js .report_error
    ; read may deliver fewer bytes than requested; never let the tokeniser
    ; scan past what was actually read
    mov qword [rel buffer_len], rax
    pop r15
    ret
.report_error:
    ; rax holds the negative errno returned by the failed syscall
    mov rcx, rax
    call eprint_error
    call panic

section .rdata
    tokeniser_buffer db "Tokeniser buffer: ", 10
    tokeniser_buffer_len equ $ - tokeniser_buffer

section .text
;; Debug helper: prints a heading followed by the part of the input that has
;; not been consumed yet, i.e. buffer[cursor .. buffer_len].
;; Takes no arguments. Both pieces are written with eprint_str(rdi = pointer,
;; rsi = length).
tokeniser_print:
    lea rdi, [rel tokeniser_buffer]
    mov rsi, tokeniser_buffer_len
    call eprint_str

    mov rax, [rel cursor]
    mov rsi, [rel buffer_len]
    sub rsi, rax                    ; rsi = bytes remaining after cursor
    jbe .done                       ; nothing left (or cursor past the end)
    mov rdi, [rel buffer]
    add rdi, rax                    ; rdi = &buffer[cursor]
    call eprint_str
.done:
    ret

section .rdata
global LEXEMES
global TOKENS
global LEXEME_LENS
global NUM_LEXEMES

align 8
;; LEXEMES[i] (8 bytes each): pointer to the text of lexeme i.
;; Parallel to TOKENS and LEXEME_LENS: the same index i selects the matching
;; entry in all three tables, so entries must be added/reordered in all of
;; them together. Entry 0 is a placeholder; find_lexeme starts scanning at 1.
LEXEMES:
    dq LEX_NOT_A_LEXEME            ;; 0
    dq LEX_LET                     ;; 1
    dq LEX_IF                      ;; 2
    dq LEX_ELSE                    ;; 3
    dq LEX_FN                      ;; 4
    dq LEX_RETURN                  ;; 5
    dq LEX_LOOP                    ;; 6
    dq LEX_BREAK                   ;; 7
    dq LEX_CONTINUE                ;; 8
    dq LEX_TRUE                    ;; 9
    dq LEX_FALSE                   ;; 10
    dq LEX_BOOL                    ;; 11
    dq LEX_ARROW                   ;; 12
    dq LEX_I32                     ;; 13
    dq LEX_U32                     ;; 14
    dq LEX_EQUALS                  ;; 15
    dq LEX_PLUS                    ;; 16
    dq LEX_MINUS                   ;; 17
    dq LEX_RPARENS                 ;; 18
    dq LEX_LPARENS                 ;; 19
    dq LEX_RBRACE                  ;; 20
    dq LEX_LBRACE                  ;; 21
    dq LEX_COLON                   ;; 22
    dq LEX_SEMI                    ;; 23
    dq LEX_COMMA                   ;; 24
    dq LEX_PIPE                    ;; 25
    dq LEX_AMP                     ;; 26
    dq LEX_EQEQ                    ;; 27
    dq LEX_LBRACKET                ;; 28
    dq LEX_RBRACKET                ;; 29

align 8
;; TOKENS[i] (1 byte each): token id reported when lexeme i is recognised.
;; Indexed in step with LEXEMES and LEXEME_LENS. As written, every entry's
;; value equals its own index (see the TOKEN_* equ definitions).
TOKENS:
    db TOKEN_EOF                   ;; 0
    db TOKEN_LET                   ;; 1
    db TOKEN_IF                    ;; 2
    db TOKEN_ELSE                  ;; 3
    db TOKEN_FN                    ;; 4
    db TOKEN_RETURN                ;; 5
    db TOKEN_LOOP                  ;; 6
    db TOKEN_BREAK                 ;; 7
    db TOKEN_CONTINUE              ;; 8
    db TOKEN_TRUE                  ;; 9
    db TOKEN_FALSE                 ;; 10
    db TOKEN_BOOL                  ;; 11
    db TOKEN_ARROW                 ;; 12
    db TOKEN_I32                   ;; 13
    db TOKEN_U32                   ;; 14
    db TOKEN_EQUALS                ;; 15
    db TOKEN_PLUS                  ;; 16
    db TOKEN_MINUS                 ;; 17
    db TOKEN_RPARENS               ;; 18
    db TOKEN_LPARENS               ;; 19
    db TOKEN_RBRACE                ;; 20
    db TOKEN_LBRACE                ;; 21
    db TOKEN_COLON                 ;; 22
    db TOKEN_SEMI                  ;; 23
    db TOKEN_COMMA                 ;; 24
    db TOKEN_PIPE                  ;; 25
    db TOKEN_AMP                   ;; 26
    db TOKEN_EQEQ                  ;; 27
    db TOKEN_LBRACKET              ;; 28
    db TOKEN_RBRACKET              ;; 29

align 8
;; LEXEME_LENS[i] (8 bytes each): length in bytes of the text LEXEMES[i]
;; points at. Indexed in step with LEXEMES and TOKENS. The placeholder at
;; index 0 has length 0.
LEXEME_LENS:
    dq 0                           ;; 0
    dq LEX_LET_len                 ;; 1
    dq LEX_IF_len                  ;; 2
    dq LEX_ELSE_len                ;; 3
    dq LEX_FN_len                  ;; 4
    dq LEX_RETURN_len              ;; 5
    dq LEX_LOOP_len                ;; 6
    dq LEX_BREAK_len               ;; 7
    dq LEX_CONTINUE_len            ;; 8
    dq LEX_TRUE_len                ;; 9
    dq LEX_FALSE_len               ;; 10
    dq LEX_BOOL_len                ;; 11
    dq LEX_ARROW_len               ;; 12
    dq LEX_I32_len                 ;; 13
    dq LEX_U32_len                 ;; 14
    dq LEX_EQUALS_len              ;; 15
    dq LEX_PLUS_len                ;; 16
    dq LEX_MINUS_len               ;; 17
    dq LEX_RPARENS_len             ;; 18
    dq LEX_LPARENS_len             ;; 19
    dq LEX_RBRACE_len              ;; 20
    dq LEX_LBRACE_len              ;; 21
    dq LEX_COLON_len               ;; 22
    dq LEX_SEMI_len                ;; 23
    dq LEX_COMMA_len               ;; 24
    dq LEX_PIPE_len                ;; 25
    dq LEX_AMP_len                 ;; 26
    dq LEX_EQEQ_len                ;; 27
    dq LEX_LBRACKET_len            ;; 28
    dq LEX_RBRACKET_len            ;; 29

align 8
;; Number of entries in each of LEXEMES / TOKENS / LEXEME_LENS (indices 0..29).
;; Must be kept in sync by hand when the tables change.
NUM_LEXEMES: dq 30

;; Lexeme texts and token ids.
;; Each TOKEN_* is an assemble-time constant. Each LEX_* string is stored
;; WITHOUT a terminating NUL (only LEX_NOT_A_LEXEME has one); its length is
;; captured by the LEX_*_len constant computed from `$` right after it.
;; TOKEN_IDENT, TOKEN_NUMBER, TOKEN_STRING and TOKEN_COMMENT (30..33) have no
;; entry in the tables above; find_lexeme reports them for tokens recognised
;; by the is_* scanner routines.
    LEX_NOT_A_LEXEME db "<not a lexeme>", 0
    TOKEN_EOF       equ 0
    TOKEN_LET       equ 1
    LEX_LET db "let"
    LEX_LET_len equ $ - LEX_LET
    TOKEN_IF        equ 2
    LEX_IF db "if"
    LEX_IF_len equ $ - LEX_IF
    TOKEN_ELSE      equ 3
    LEX_ELSE db "else"
    LEX_ELSE_len equ $ - LEX_ELSE
    TOKEN_FN        equ 4
    LEX_FN db "fn"
    LEX_FN_len equ $ - LEX_FN
    TOKEN_RETURN    equ 5
    LEX_RETURN db "return"
    LEX_RETURN_len equ $ - LEX_RETURN
    TOKEN_LOOP      equ 6
    LEX_LOOP db "loop"
    LEX_LOOP_len equ $ - LEX_LOOP
    TOKEN_BREAK     equ 7
    LEX_BREAK db "break"
    LEX_BREAK_len equ $ - LEX_BREAK
    TOKEN_CONTINUE  equ 8
    LEX_CONTINUE db "continue"
    LEX_CONTINUE_len equ $ - LEX_CONTINUE
    TOKEN_TRUE      equ 9
    LEX_TRUE db "true"
    LEX_TRUE_len equ $ - LEX_TRUE
    TOKEN_FALSE     equ 10
    LEX_FALSE db "false"
    LEX_FALSE_len equ $ - LEX_FALSE
    TOKEN_BOOL      equ 11
    LEX_BOOL db "bool"
    LEX_BOOL_len equ $ - LEX_BOOL
    TOKEN_ARROW     equ 12
    LEX_ARROW db "->"
    LEX_ARROW_len equ $ - LEX_ARROW
    TOKEN_I32       equ 13
    LEX_I32 db "i32"
    LEX_I32_len equ $ - LEX_I32
    TOKEN_U32       equ 14
    LEX_U32 db "u32"
    LEX_U32_len equ $ - LEX_U32
    TOKEN_EQUALS    equ 15
    LEX_EQUALS db "="
    LEX_EQUALS_len equ $ - LEX_EQUALS
    TOKEN_PLUS      equ 16
    LEX_PLUS db "+"
    LEX_PLUS_len equ $ - LEX_PLUS
    TOKEN_MINUS     equ 17
    LEX_MINUS db "-"
    LEX_MINUS_len equ $ - LEX_MINUS
    TOKEN_RPARENS   equ 18
    LEX_RPARENS db ")"
    LEX_RPARENS_len equ $ - LEX_RPARENS
    TOKEN_LPARENS   equ 19
    LEX_LPARENS db "("
    LEX_LPARENS_len equ $ - LEX_LPARENS
    TOKEN_RBRACE    equ 20
    LEX_RBRACE db "}"
    LEX_RBRACE_len equ $ - LEX_RBRACE
    TOKEN_LBRACE    equ 21
    LEX_LBRACE db "{"
    LEX_LBRACE_len equ $ - LEX_LBRACE
    TOKEN_COLON     equ 22
    LEX_COLON db ":"
    LEX_COLON_len equ $ - LEX_COLON
    TOKEN_SEMI      equ 23
    LEX_SEMI db ";"
    LEX_SEMI_len equ $ - LEX_SEMI
    TOKEN_COMMA     equ 24
    LEX_COMMA db ","
    LEX_COMMA_len equ $ - LEX_COMMA
    TOKEN_PIPE      equ 25
    LEX_PIPE db "|"
    LEX_PIPE_len equ $ - LEX_PIPE
    TOKEN_AMP       equ 26
    LEX_AMP db "&"
    LEX_AMP_len equ $ - LEX_AMP
    TOKEN_EQEQ     equ 27
    LEX_EQEQ db "=="
    LEX_EQEQ_len equ $ - LEX_EQEQ
    TOKEN_LBRACKET  equ 28
    LEX_LBRACKET db "["
    LEX_LBRACKET_len equ $ - LEX_LBRACKET
    TOKEN_RBRACKET  equ 29
    LEX_RBRACKET db "]"
    LEX_RBRACKET_len equ $ - LEX_RBRACKET
    TOKEN_IDENT     equ 30
    LEX_IDENT db "<identifier>"
    LEX_IDENT_len equ $ - LEX_IDENT
    TOKEN_NUMBER    equ 31
    LEX_NUMBER db "<number>"
    LEX_NUMBER_len equ $ - LEX_NUMBER
    TOKEN_STRING    equ 32
    LEX_STRING db "<string>"
    LEX_STRING_len equ $ - LEX_STRING
    TOKEN_COMMENT   equ 33
    LEX_COMMENT db "<comment>"
    LEX_COMMENT_len equ $ - LEX_COMMENT


section .text
;; fn is_ident(lexeme_len: usize) -> usize
;; Measures the identifier that starts at buffer[cursor]: one character
;; accepted by is_id_start followed by characters accepted by is_id_continue.
;;
;; In:  rdi = length of the lexeme already matched at the cursor (0 if none)
;; Out: rax = identifier length, and cursor is advanced past it, when the
;;            identifier is strictly longer than that lexeme;
;;      rax = 0 and cursor untouched otherwise.
;; Preserves rbp, r12-r14 and rdi.
is_ident:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14
    push rdi                        ; [rbp - 32] = lexeme_len

    mov rax, [rel cursor]
    mov r12, [rel buffer]
    add r12, rax                    ; r12 = &buffer[cursor]
    mov r13, [rel buffer_len]
    sub r13, rax                    ; r13 = bytes left in the buffer

    ; the first character decides whether this can be an identifier at all
    mov dil, [r12]
    call is_id_start
    test rax, rax
    jz .reject

    mov r14, 1                      ; r14 = identifier length so far
    jmp .bounds
.extend:
    inc r14
.bounds:
    cmp r14, r13
    jge .measured
    mov dil, [r12 + r14]
    call is_id_continue
    test rax, rax
    jnz .extend
.measured:
    ; only an identifier if it extends beyond the matched lexeme
    cmp r14, [rbp - 32]
    jle .reject
    add [rel cursor], r14
    mov rax, r14
    jmp .restore
.reject:
    xor eax, eax
.restore:
    pop rdi
    pop r14
    pop r13
    pop r12
    pop rbp
    ret

;; fn is_comment() -> usize
;; Recognises a line comment: "//" at buffer[cursor], running up to (but not
;; including) the next newline or the end of the buffer.
;;
;; Out: rax = comment length with cursor advanced past it,
;;      or rax = 0 with cursor untouched when the input does not start
;;      with "//".
;; Preserves rbp and r12-r14.
is_comment:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14

    mov rax, [rel cursor]
    mov r13, [rel buffer_len]
    sub r13, rax                    ; r13 = bytes left in the buffer
    mov r12, [rel buffer]
    add r12, rax                    ; r12 = &buffer[cursor]

    cmp byte [r12], '/'
    jne .no_match

    mov r14, 1                      ; r14 = offset of the byte being examined
    cmp r14, r13
    jge .no_match                   ; a lone '/' at the very end of the input
    cmp byte [r12 + r14], '/'
    jne .no_match
.scan:
    inc r14
    cmp r14, r13
    jge .matched                    ; comment runs to the end of the buffer
    cmp byte [r12 + r14], 10        ; newline terminates the comment
    jne .scan
.matched:
    add [rel cursor], r14
    mov rax, r14
    jmp .restore

.no_match:
    xor eax, eax
.restore:
    pop r14
    pop r13
    pop r12
    pop rbp
    ret

;; Strings are sequences of characters enclosed in double quotes
;; Strings span multiple lines. A backslash causes the character after it to
;; be skipped, so an escaped quote does not end the string.
;;
;; fn is_string() -> usize
;; Out: rax = length of the string including both quotes, cursor advanced
;;            past it;
;;      rax = 0 and cursor untouched when buffer[cursor] is not '"'.
;;      For an unterminated string rax = number of bytes left in the buffer
;;      and the cursor is moved to the end of the buffer, so the caller still
;;      makes progress and reaches end of input on its next call.
;; Preserves rbp and r12-r14.
is_string:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14

    mov rax, [rel cursor]
    mov r12, [rel buffer]
    mov r13, [rel buffer_len]
    add r12, rax                    ; r12 = &buffer[cursor]
    sub r13, rax                    ; r13 = bytes left in the buffer

    cmp byte [r12], '"'
    jne .not_string

    mov r14, 1                      ; r14 = offset of the byte being examined
.loop:
    cmp r14, r13
    jge .unterminated
    mov al, [r12 + r14]
    cmp al, '"'
    je .string
    cmp al, 0x5c  ; backslash
    je .escape
    inc r14
    jmp .loop
.escape:
    inc r14                         ; step onto the escaped character
    cmp r14, r13
    jge .unterminated
    inc r14                         ; and past it
    jmp .loop
.string:
    inc r14                         ; include closing quote
    add [rel cursor], r14
    mov rax, r14
    jmp .epilogue
.unterminated:
;; TODO: report unterminated string error
    ; r14 == bytes remaining here: consume them, otherwise every later call
    ; would find the same opening quote again and never reach end of input
    add [rel cursor], r14
    mov rax, r14
    jmp .epilogue
.not_string:
    xor eax, eax
.epilogue:
    pop r14
    pop r13
    pop r12
    pop rbp
    ret

;; Numbers are sequences of numeric characters, interspersed with underscores
;; The leading character must be numeric
;; In the future, numbers may be prefixed with '0x' for hexadecimal or '0b' for binary.
;;
;; fn is_number() -> usize
;; Out: rax = length of the number with cursor advanced past it,
;;      or rax = 0 with cursor untouched when buffer[cursor] is not numeric.
;; The number ends at the first byte that is neither numeric nor '_' (or at
;; the end of the buffer), so "1;", "f(1)" and "1+2" all yield the number "1".
;; Preserves rbp and r12-r14.
is_number:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14

    mov rax, [rel cursor]
    mov r12, [rel buffer]
    mov r13, [rel buffer_len]
    add r12, rax                    ; r12 = &buffer[cursor]
    sub r13, rax                    ; r13 = bytes left in the buffer

    movzx edi, byte [r12]
    call is_numeric
    test rax, rax
    jz .not_number

    mov r14, 1                      ; r14 = length of the number so far
.loop:
    cmp r14, r13
    jge .number
    ; rdi is caller-saved, so reload the character on every iteration and
    ; test for '_' before calling out
    movzx edi, byte [r12 + r14]
    cmp dil, '_'
    je .loop_next
    call is_numeric
    test rax, rax
    jz .number                      ; any other character ends the number
.loop_next:
    inc r14
    jmp .loop
.number:
    add [rel cursor], r14
    mov rax, r14
    jmp .epilogue
.not_number:
    xor eax, eax
.epilogue:
    pop r14
    pop r13
    pop r12
    pop rbp
    ret

;; fn skip_whitespaces()
;; Moves cursor forward over every character accepted by is_whitespace,
;; stopping at the first other character or at buffer_len.
;; Preserves rbp and r12-r14. No meaningful return value.
skip_whitespaces:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14

    mov r14, [rel buffer]           ; r14 = base of the input text
    mov r13, [rel buffer_len]       ; r13 = end offset
    mov r12, [rel cursor]           ; r12 = offset being examined
    jmp .bounds
.advance:
    inc r12
.bounds:
    cmp r12, r13
    jge .store
    mov dil, [r14 + r12]
    call is_whitespace
    test rax, rax
    jnz .advance
.store:
    mov [rel cursor], r12

    pop r14
    pop r13
    pop r12
    pop rbp
    ret


;; rdi: pointer to out-struct
;; fn find_lexeme() -> (u8, *const u8, usize)
;;
;; Skips whitespace, then recognises the next token at the cursor and
;; advances the cursor past it.
;;
;; Out-struct layout (24 bytes, written through rdi):
;;   +0  (8 bytes): token id (TOKEN_*), zero-extended
;;   +8  (8 bytes): pointer to the first byte of the token inside buffer
;;   +16 (8 bytes): length of the token in bytes
;;
;; Recognition order: string, number, comment, then the LEXEMES table, then
;; identifier. The table scan picks the LONGEST lexeme that matches at the
;; cursor, so "==" is reported as TOKEN_EQEQ instead of two TOKEN_EQUALS.
;; A keyword followed by more identifier characters (e.g. "letter") is
;; reported as TOKEN_IDENT. Input that matches nothing is reported as
;; TOKEN_EOF with length 0 and the cursor is left where it is.
;;
;; Returns: rax = the out-struct pointer, except when the cursor is already
;;          at the end of the buffer, where rax = TOKEN_EOF (0).
;;          NOTE(review): the two return conventions are kept as they were;
;;          confirm with callers before unifying them.
;; Preserves rbp, r12-r14; rdi holds the out pointer again on return.
find_lexeme:
    push rbp
    mov rbp, rsp
    push rdi                        ; [rbp - 8] = out pointer
    push r12                        ; r12 = lexeme index being tried
    push r13                        ; r13 = index of best match (0 = none)
    push r14                        ; r14 = length of best match

    call skip_whitespaces

    ; out = (TOKEN_EOF, &buffer[cursor], 0)
    mov rdi, [rbp - 8]
    mov rax, [rel buffer]
    add rax, [rel cursor]
    mov qword [rdi], TOKEN_EOF
    mov [rdi + 8], rax
    mov qword [rdi + 16], 0

    ; if cursor >= buffer_len { return TOKEN_EOF; }
    mov rax, [rel cursor]
    cmp rax, [rel buffer_len]
    jge .eof

    ; tokens with their own scanners; each returns the token length in rax
    ; (and advances the cursor) or 0 when it does not apply
    call is_string
    mov ecx, TOKEN_STRING
    test rax, rax
    jnz .emit
    call is_number
    mov ecx, TOKEN_NUMBER
    test rax, rax
    jnz .emit
    call is_comment
    mov ecx, TOKEN_COMMENT
    test rax, rax
    jnz .emit

    ; for i in 1..NUM_LEXEMES: remember the longest lexeme matching at cursor
    xor r13d, r13d
    xor r14d, r14d
    mov r12, 1
.loop:
    cmp r12, [rel NUM_LEXEMES]
    jge .scan_done
    lea rax, [rel LEXEME_LENS]
    mov rsi, [rax + r12*8]          ; rsi = LEXEME_LENS[i]
    cmp rsi, r14
    jle .next                       ; cannot beat the current best match
    mov rax, [rel cursor]
    mov rcx, [rel buffer_len]
    sub rcx, rax                    ; rcx = bytes left in the buffer
    cmp rsi, rcx
    jg .next                        ; lexeme does not fit in what is left
    ; streq(LEXEMES[i], len, &buffer[cursor], len)
    lea rdi, [rel LEXEMES]
    mov rdi, [rdi + r12*8]
    mov rdx, [rel buffer]
    add rdx, rax
    mov rcx, rsi
    call streq
    test rax, rax
    jz .next
    ; rsi is caller-saved, so take the length from the table again
    mov r13, r12
    lea rax, [rel LEXEME_LENS]
    mov r14, [rax + r12*8]
.next:
    inc r12
    jmp .loop

.scan_done:
    ; an identifier longer than the matched lexeme wins over it; with no
    ; match r14 is 0, so any identifier at all is accepted
    mov rdi, r14
    call is_ident
    mov ecx, TOKEN_IDENT
    test rax, rax
    jnz .emit
    test r13, r13
    jz .epilogue                    ; nothing matched: out already says TOKEN_EOF

    ; plain keyword / punctuation: cursor += len; token = TOKENS[best]
    add [rel cursor], r14
    lea rax, [rel TOKENS]
    movzx ecx, byte [rax + r13]
    mov rax, r14
.emit:
    ; rcx = token id, rax = token length
    mov rdi, [rbp - 8]
    mov [rdi], rcx
    mov [rdi + 16], rax
.epilogue:
    mov rax, [rbp - 8]              ; return the out pointer
.restore:
    pop r14
    pop r13
    pop r12
    pop rdi
    pop rbp
    ret
.eof:
    mov eax, TOKEN_EOF
    jmp .restore