;; from-scratch/lang/src/tokeniser.asm

section .text
extern panic
extern strlen
extern strcmp
extern streq
extern memcpy
extern eprint_str
extern exit
extern error_to_str
extern eprint_error
extern alloc_pages
extern allocate
extern fopen_read

extern is_alpha
extern is_numeric
extern is_id_continue
extern is_id_start
extern is_whitespace

global tokeniser_init
global tokeniser_init_buf
global tokeniser_print
global find_lexeme
global expect_token
global unwrap_token
global peek_expect_token
global peek_lexeme

global tokeniser_get_cursor
global tokeniser_set_cursor

;; =============================
;; Tokeniser functions
;; =============================

;; tokeniser state
;; All four variables are declared global; the routines in this file read and
;; write them directly instead of passing a context pointer around.
section .data
global input_file
global buffer
global cursor
global buffer_len

    input_file dd 0     ; 32-bit file descriptor: set by tokeniser_init, zeroed by tokeniser_init_buf
    buffer dq 0         ; address of the first byte of the source text
    cursor dq 0         ; byte offset into buffer of the next unread byte
    buffer_len dq 0     ; size of the source text in bytes

;; each buffer is chunk_size bytes large
;; buffer header structure:
;; +0 (8 bytes): pointer buffer
;; +8 (8 bytes): size of buffer

;; Tokens:
;; [let, if, else, fn, return, loop, break, continue, true, false, i32, u32, bool, =, +, -, *, /, %, ==, !=, <, <=, >, >=, &&, ||, !, (, ), {, }, [, ], ;, ',', ]

section .bss
statbuf: resb 144

section .text

;; Points the tokeniser at an in-memory source text and rewinds it.
;; In:
;;   rdi = address of the source text
;;   rsi = size of the source text in bytes
;; Effects: buffer = rdi, buffer_len = rsi, cursor = 0, input_file = 0.
;; Leaf routine: no register is modified and no stack frame is needed.
tokeniser_init_buf:
    mov [rel buffer], rdi
    mov [rel buffer_len], rsi
    mov qword [rel cursor], 0
    mov dword [rel input_file], 0       ; no file backs this buffer
    ret


;; Initialises the tokeniser from a file: opens it, asks fstat for its size,
;; allocates a buffer of that size and reads the file into it.
;;
;; In:  the filename argument is handed to fopen_read untouched, in whatever
;;      register the caller placed it. The earlier note on this routine said
;;      "rdx: pointer to filename (null-terminated)" -- TODO confirm against
;;      fopen_read's actual calling convention.
;; Out: input_file = fd, buffer = allocation, buffer_len = st_size, cursor = 0.
;;      rax = result of the read syscall (bytes read).
;; Errors: a negative result from fstat or read is passed to eprint_error and
;;      then panic is called.
;; NOTE: a short read (fewer bytes than st_size) is not retried.
tokeniser_init:
    ; r15 is callee-saved and is used below to keep st_size across `allocate`.
    ; The push also brings rsp to a 16-byte boundary for the calls that follow.
    push r15

    ; open file for reading
    ; this panics if the file doesn't exist
    call fopen_read
    mov dword [rel input_file], eax     ; store file descriptor
    mov qword [rel cursor], 0
    mov qword [rel buffer_len], 0

    ; fstat(fd, &statbuf)
    mov eax, 5                          ; syscall: fstat
    mov edi, [rel input_file]           ; fd is a dword; a 64-bit load would also read the low half of `buffer`
    lea rsi, [rel statbuf]
    syscall
    test rax, rax
    js .report_error

    ; st_size sits at offset 48 of the x86-64 Linux struct stat
    mov r15, [rel statbuf + 48]

    ; allocate buffer
    mov rdi, r15
    call allocate
    mov [rel buffer], rax
    mov [rel buffer_len], r15

    ; read(fd, buffer, buffer_len)
    xor eax, eax                        ; syscall: read
    mov edi, [rel input_file]
    mov rsi, [rel buffer]
    mov rdx, r15
    syscall
    test rax, rax
    js .report_error

    pop r15
    ret
.report_error:
    ; The error code is handed over in rcx, exactly as before -- presumably
    ; eprint_error's convention; TODO confirm. r15 is not restored here because
    ; panic is not expected to return.
    mov rcx, rax
    call eprint_error
    call panic

section .rdata
    tokeniser_buffer db "Tokeniser buffer: ", 10
    tokeniser_buffer_len equ $ - tokeniser_buffer

section .text
;; Debug helper: writes the "Tokeniser buffer: " banner followed by the part of
;; the buffer that has not been consumed yet, both through eprint_str
;; (rdi = text, rsi = length). Takes no arguments.
tokeniser_print:
    push rbp                            ; also keeps rsp 16-byte aligned at the calls
    mov rbp, rsp

    lea rdi, [rel tokeniser_buffer]
    mov rsi, tokeniser_buffer_len
    call eprint_str

    mov rax, [rel cursor]
    mov rdi, [rel buffer]
    add rdi, rax                        ; rdi = &buffer[cursor]
    mov rsi, [rel buffer_len]
    sub rsi, rax                        ; only buffer_len - cursor bytes remain after the cursor
    jbe .done                           ; nothing left, or cursor is past the end: print nothing
    call eprint_str
.done:
    pop rbp
    ret

section .rdata
global LEXEMES
global TOKENS
global LEXEME_LENS
global NUM_LEXEMES

%include "src/tokeniser.inc"

section .text
;; Tries to read an identifier starting at the cursor.
;; In:  rdi = length of a lexeme that already matched at the cursor (0 if none)
;; Out: rax = identifier length when an identifier strictly longer than rdi
;;            starts at the cursor; the cursor is then advanced past it.
;;      rax = 0 otherwise; the cursor is left alone.
;; rdi is restored before returning.
;; fn is_ident(lexeme_len: usize) -> usize
is_ident:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14
    push rdi                        ; [rsp] = lexeme_len, read back after the calls

    ; r12 = &buffer[cursor], r13 = bytes remaining
    mov rcx, [rel cursor]
    mov r13, [rel buffer_len]
    sub r13, rcx
    mov r12, [rel buffer]
    add r12, rcx

    ; the first byte must be able to start an identifier
    mov dil, [r12]
    call is_id_start
    test rax, rax
    jz .reject

    ; r14 = number of identifier bytes seen so far
    mov r14, 1
    jmp .bounds
.more:
    inc r14
.bounds:
    cmp r14, r13
    jge .measured
    mov dil, [r12 + r14]
    call is_id_continue
    test rax, rax
    jnz .more
.measured:
    ; an identifier no longer than the matched lexeme loses to the lexeme
    cmp r14, [rsp]
    jle .reject
    add [rel cursor], r14
    mov rax, r14
    jmp .leave
.reject:
    xor eax, eax
.leave:
    pop rdi
    pop r14
    pop r13
    pop r12
    pop rbp
    ret

;; Recognises a `//` line comment at the cursor.
;; Takes no arguments.
;; Out: rax = number of bytes consumed: everything from the first '/' up to,
;;            but not including, the next newline (or the end of the buffer).
;;            The cursor is advanced by the same amount.
;;      rax = 0 if no comment starts at the cursor; the cursor is left alone.
is_comment:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14

    ; r12 = &buffer[cursor], r13 = bytes remaining
    mov rdx, [rel cursor]
    mov r13, [rel buffer_len]
    sub r13, rdx
    mov r12, [rel buffer]
    add r12, rdx

    ; need two slashes in a row
    cmp byte [r12], '/'
    jne .no
    mov r14, 1
    cmp r14, r13
    jge .no
    cmp byte [r12 + 1], '/'
    jne .no

    ; r14 = offset being examined; stop on newline or at the end of the buffer
.scan:
    inc r14
    cmp r14, r13
    jge .found
    cmp byte [r12 + r14], 10        ; newline
    jne .scan
.found:
    add [rel cursor], r14
    mov rax, r14
    jmp .out
.no:
    xor eax, eax
.out:
    pop r14
    pop r13
    pop r12
    pop rbp
    ret

;; Strings are sequences of characters enclosed in double quotes.
;; Strings may span multiple lines. A backslash makes the scanner skip the
;; byte that follows it, so an escaped quote (\") does not end the string.
;;
;; Takes no arguments.
;; Out: rax = number of bytes consumed, including both quotes; the cursor is
;;            advanced by the same amount.
;;      rax = 0 if the byte at the cursor is not '"'; the cursor is left alone.
;; Unterminated string: everything up to the end of the buffer is consumed and
;; its length returned. The cursor used to stay put in this case, so the
;; opening quote was lexed again on every later call.
is_string:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14

    ; r12 = &buffer[cursor], r13 = bytes remaining
    mov rax, [rel cursor]
    mov r12, [rel buffer]
    mov r13, [rel buffer_len]
    add r12, rax
    sub r13, rax

    cmp byte [r12], '"'
    jne .not_string

    mov r14, 1                      ; r14 = offset of the byte being examined
.loop:
    cmp r14, r13
    jge .unterminated
    mov al, [r12 + r14]
    cmp al, '"'
    je .closed
    cmp al, 0x5c                    ; backslash
    je .escape
    inc r14
    jmp .loop
.escape:
    inc r14                         ; step over the backslash
    cmp r14, r13
    jge .unterminated
    inc r14                         ; step over the escaped byte
    jmp .loop
.closed:
    inc r14                         ; include closing quote
    jmp .consume
.unterminated:
;; TODO: report unterminated string error
    ; r14 == bytes remaining here, so the cursor ends up at the end of the buffer
.consume:
    add [rel cursor], r14
    mov rax, r14
    jmp .epilogue
.not_string:
    xor eax, eax
.epilogue:
    pop r14
    pop r13
    pop r12
    pop rbp
    ret

;; Numbers are sequences of numeric characters, interspersed with underscores.
;; The leading character must be numeric; after it, any mix of digits and
;; underscores is accepted.
;; In the future, numbers may be prefixed with '0x' for hexadecimal or '0b' for binary.
;;
;; Takes no arguments.
;; Out: rax = length of the number; the cursor is advanced past it.
;;      rax = 0 if the byte at the cursor is not numeric; the cursor is left alone.
is_number:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14
    sub rsp, 8                      ; four pushes leave rsp at 8 mod 16; pad so the calls below are 16-byte aligned

    ; r12 = &buffer[cursor], r13 = bytes remaining
    mov rax, [rel cursor]
    mov r12, [rel buffer]
    mov r13, [rel buffer_len]
    add r12, rax
    sub r13, rax

    movzx edi, byte [r12]           ; zero-extend so the callee never sees stale upper bits
    call is_numeric
    test rax, rax
    jz .not_number

    mov r14, 1                      ; r14 = length accepted so far
.loop:
    cmp r14, r13
    jge .number
    movzx edi, byte [r12 + r14]
    cmp dil, '_'
    je .loop_next                   ; underscores are digit separators
    call is_numeric
    test rax, rax
    jz .number
.loop_next:
    inc r14
    jmp .loop
.number:
    add [rel cursor], r14
    mov rax, r14
    jmp .epilogue
.not_number:
    xor eax, eax
.epilogue:
    add rsp, 8
    pop r14
    pop r13
    pop r12
    pop rbp
    ret

;; Advances the cursor past every byte for which is_whitespace returns non-zero,
;; stopping at the first other byte or at the end of the buffer.
;; Takes no arguments; the only result is the updated cursor.
skip_whitespaces:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14

    mov r14, [rel buffer]           ; r14 = base of the text
    mov r13, [rel buffer_len]       ; r13 = one past the last valid offset
    mov r12, [rel cursor]           ; r12 = offset being examined
    jmp .check
.advance:
    inc r12
.check:
    cmp r12, r13
    jge .store
    mov dil, [r14 + r12]
    call is_whitespace
    test rax, rax
    jnz .advance
.store:
    mov [rel cursor], r12
    pop r14
    pop r13
    pop r12
    pop rbp
    ret


;; Reads the next lexeme at the cursor, after skipping leading whitespace.
;;
;; In:  rdi = pointer to a 24-byte out-struct
;;        +0  (8 bytes): token id
;;        +8  (8 bytes): pointer to the first byte of the lexeme
;;        +16 (8 bytes): length of the lexeme in bytes
;; Out: the out-struct is filled in and the cursor is advanced past the lexeme.
;;      rax = the out-struct pointer, except at end of input, where
;;      rax = TOKEN_EOF (kept as it was for existing callers).
;;      If nothing matches (unknown character) the token is TOKEN_EOF with
;;      length 0 and the cursor is not advanced.
;; Order of attempts: string, number, comment, the LEXEMES table (entries
;; 1..NUM_LEXEMES-1), identifier.
;; fn find_lexeme() -> (u8, *const u8, usize)
find_lexeme:
    push rbp
    mov rbp, rsp
    push rdi                        ; [rsp] = out-struct pointer
    ; skip whitespaces
    call skip_whitespaces

    ; out = (0, buffer + cursor, 0)
    mov rdi, [rsp]
    mov rax, [rel buffer]
    add rax, [rel cursor]
    mov qword [rdi], 0
    mov [rdi + 8], rax
    mov qword [rdi + 16], 0

    ; if cursor >= buffer_len { return TOKEN_EOF; }
    mov rax, [rel cursor]
    cmp rax, [rel buffer_len]
    jl .start
    ; Record EOF in the out-struct as well: callers such as expect_token read
    ; the token from there, and the zero written above only means EOF if
    ; TOKEN_EOF happens to be 0.
    mov qword [rdi], TOKEN_EOF
    mov rax, TOKEN_EOF
    pop rdi
    pop rbp
    ret

.start:
    push r12                        ; from here on: [rsp] = saved r12, [rsp + 8] = out pointer
    ; Special tokens first. Each helper returns the consumed length (non-zero)
    ; on a match, having already advanced the cursor.
    call is_string
    test rax, rax
    jnz .is_string
    call is_number
    test rax, rax
    jnz .is_number
    call is_comment
    test rax, rax
    jnz .is_comment

    ; for i in 1..NUM_LEXEMES {
    mov r12, 1
.loop:
    cmp r12, [rel NUM_LEXEMES]
    jge .not_found
    ;   streq(LEXEMES[i], LEXEME_LENS[i], &buffer[cursor], LEXEME_LENS[i])
    lea rdi, [rel LEXEMES]
    mov rdi, [rdi + r12*8]
    lea rdx, [rel LEXEME_LENS]
    mov rsi, [rdx + r12*8]
    mov rax, [rel cursor]
    mov rdx, [rel buffer]
    add rdx, rax
    mov rcx, [rel buffer_len]
    sub rcx, rax                    ; rcx = bytes remaining
    jo .not_found
    ;   if lexeme.len() > buffer.len() - cursor { continue; }
    cmp rsi, rcx
    jg .next
    mov rcx, rsi
    call streq
    test rax, rax
    jz .next

    ;   The lexeme matches. If a longer identifier starts here (e.g. `iffy`
    ;   vs `if`), the identifier wins. rsi is caller-saved, so the length is
    ;   reloaded from the table instead of being trusted to survive streq.
    lea rdi, [rel LEXEME_LENS]
    mov rdi, [rdi + r12*8]
    call is_ident
    test rax, rax
    jnz .is_ident

    ;   cursor += len; return TOKENS[i];
    lea rsi, [rel LEXEME_LENS]
    mov rsi, [rsi + r12*8]
    add [rel cursor], rsi
    lea rax, [rel TOKENS]
    movzx eax, byte [rax + r12]
    mov rdi, [rsp + 8]
    mov [rdi], al                   ; upper bytes of the token field were zeroed above
    mov [rdi + 16], rsi
    jmp .epilogue
.next:
    inc r12
    jmp .loop
    ; }
.not_found:
    ; no table entry matched: plain identifier, or nothing recognisable
    xor rdi, rdi
    call is_ident
    test rax, rax
    jnz .is_ident
    mov rdi, [rsp + 8]
    mov qword [rdi], TOKEN_EOF
.epilogue:
    pop r12
    pop rdi
    pop rbp
    mov rax, rdi                    ; return the out-struct pointer
    ret
.is_ident:
    ; rax = length; out.1 already points at the start of the lexeme
    mov rdi, [rsp + 8]
    mov qword [rdi], TOKEN_IDENT
    mov [rdi + 16], rax
    jmp .epilogue
.is_number:
    mov rdi, [rsp + 8]
    mov qword [rdi], TOKEN_NUMBER
    mov [rdi + 16], rax
    jmp .epilogue
.is_string:
    mov rdi, [rsp + 8]
    mov qword [rdi], TOKEN_STRING
    mov [rdi + 16], rax
    jmp .epilogue
.is_comment:
    mov rdi, [rsp + 8]
    mov qword [rdi], TOKEN_COMMENT
    mov [rdi + 16], rax
    jmp .epilogue

;; Consumes the next lexeme if its token id equals the expected one.
;; In:  dil = expected token id
;; Out: match:    rax = lexeme pointer, rdx = lexeme length, cursor advanced
;;      mismatch: rax = 0, rdx = 0, cursor put back to its value on entry
;;
;; Frame (rbp-relative):
;;   [rbp - 0x30]  expected token id (1 byte)
;;   [rbp - 0x28]  cursor on entry
;;   [rbp - 0x20]  out-struct for find_lexeme: token, pointer, length
expect_token:
    push rbp
    mov rbp, rsp
    sub rsp, 0x30

    mov [rbp - 0x30], dil
    mov rax, [rel cursor]
    mov [rbp - 0x28], rax

    lea rdi, [rbp - 0x20]
    call find_lexeme

    mov al, [rbp - 0x20]            ; token that was found
    cmp al, [rbp - 0x30]
    jne .mismatch

    mov rax, [rbp - 0x18]           ; lexeme pointer
    mov rdx, [rbp - 0x10]           ; lexeme length
    leave
    ret
.mismatch:
    mov rax, [rbp - 0x28]
    mov [rel cursor], rax           ; undo whatever find_lexeme consumed
    xor eax, eax
    xor edx, edx
    leave
    ret

;; Like expect_token, but a mismatch is fatal.
;; In:  dil = expected token id
;; Out: rax = lexeme pointer, rdx = lexeme length (as returned by expect_token).
;;      Calls panic when expect_token returns 0 in rax.
unwrap_token:
    push rbp
    mov rbp, rsp
    call expect_token
    test rax, rax
    je .no_match
    leave
    ret
.no_match:
    call panic

;; Checks whether the next token is the expected one without consuming it.
;; returns 0 if token not found, else returns lexeme (ptr, len)
;; In:  dil = expected token id
;; Out: rax = lexeme pointer (0 on mismatch), rdx = lexeme length (0 on mismatch).
;;      The cursor always ends up at its value on entry.
peek_expect_token:
    push rbp
    mov rbp, rsp
    ; One 8-byte slot for the cursor plus 8 bytes of padding: a lone push here
    ; would leave rsp at 8 mod 16 for the call to expect_token.
    sub rsp, 16
    mov rax, [rel cursor]
    mov [rsp], rax
    call expect_token               ; dil is passed straight through
    mov rdi, [rsp]
    mov [rel cursor], rdi           ; un-consume the token; rax/rdx stay as returned
    leave
    ret

;; Reads the next lexeme into the out-struct without consuming it.
;; In:  rdi = out-struct pointer (handed to find_lexeme)
;; Out: rax = the out-struct pointer; the cursor is put back to its value on entry.
peek_lexeme:
    push rbp
    mov rbp, rsp
    push rdi                        ; saved out-struct pointer, becomes the return value
    push qword [rel cursor]         ; cursor on entry
    call find_lexeme
    pop rdi
    mov [rel cursor], rdi           ; restore cursor
    pop rax
    pop rbp
    ret

;; Out: rax = current value of `cursor` (byte offset into the buffer).
tokeniser_get_cursor:
    mov rax, qword [rel cursor]
    ret

;; In: rdi = new value for `cursor`. Stored as-is; no bounds check is made.
tokeniser_set_cursor:
    mov qword [rel cursor], rdi
    ret