From e8934b8ccc13b2e0d81dd642131eb111cf80ac35 Mon Sep 17 00:00:00 2001
From: Janis
Date: Tue, 6 Aug 2024 20:57:19 +0200
Subject: [PATCH] initial commit

---
 .gitignore     |   2 +
 Cargo.toml     |  10 ++
 grammar.bnf    |  95 +++++++++++
 rust-toolchain |   1 +
 src/common.rs  | 160 ++++++++++++++++++
 src/lexer.rs   | 442 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/lib.rs     |   5 +
 src/tokens.rs  | 236 ++++++++++++++++++++++++++
 8 files changed, 951 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.toml
 create mode 100644 grammar.bnf
 create mode 100644 rust-toolchain
 create mode 100644 src/common.rs
 create mode 100644 src/lexer.rs
 create mode 100644 src/lib.rs
 create mode 100644 src/tokens.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4fffb2f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/target
+/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..1fb4d26
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "compiler"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+itertools = "0.13.0"
+log = "0.4.22"
+thiserror = "1.0.63"
+unicode-xid = "0.2.4"
diff --git a/grammar.bnf b/grammar.bnf
new file mode 100644
index 0000000..86296ff
--- /dev/null
+++ b/grammar.bnf
@@ -0,0 +1,95 @@
+# cool language called sea:
+
+<dec_digits> ::= ...
+<bin_digits> ::= ...
+<oct_digits> ::= ...
+<hex_digits> ::= ...
+<id_start> ::= <XID_Start> | '_'
+<id_continue> ::= <XID_Continue> | <digit> | '_'
+<ident> ::= <id_start>
+          | <ident> <id_continue>
+
+<program> ::= <declarations>
+
+<declarations> ::= <declaration> | <declarations> <declaration>
+
+<function_decl> ::= fn <ident> '(' <parameter_list>,? ')' (-> <type>)? <block>
+
+<parameter_list> ::= <parameter>
+                   | <parameter_list> , <parameter>
+<parameter> ::= <ident> : <type>
+
+<block> ::= <statement>*
+          | <expr>
+
+<statement> ::= <return_stmt>
+              | <var_decl_stmt>
+              | <expr_stmt>
+              | ';'
+<return_stmt> ::= return <expr>? ';'
+<var_decl_stmt> ::= <var_decl> ';'
+<expr_stmt> ::= <expr> ';'
+<assignment_op> ::= |= | &= | ^= | /= | *= | %= | <<= | >>= | += | -= | =
+
+<expr> ::= <logical_or_expr>
+
+<logical_or_expr> ::= <logical_and_expr>
+                    | <logical_or_expr> || <logical_and_expr>
+<logical_and_expr> ::= <bitwise_or_expr>
+                     | <logical_and_expr> && <bitwise_or_expr>
+<bitwise_or_expr> ::= <bitwise_xor_expr>
+                    | <bitwise_or_expr> '|' <bitwise_xor_expr>
+<bitwise_xor_expr> ::= <bitwise_and_expr>
+                     | <bitwise_xor_expr> ^ <bitwise_and_expr>
+<bitwise_and_expr> ::= <equality_expr>
+                     | <bitwise_and_expr> & <equality_expr>
+<equality_expr> ::= <relational_expr>
+                  | <equality_expr> (!= | ==) <relational_expr>
+<relational_expr> ::= <shift_expr>
+                    | <relational_expr> (< | > | <= | >=) <shift_expr>
+<shift_expr> ::= <additive_expr>
+               | <shift_expr> (<< | >>) <additive_expr>
+<additive_expr> ::= <multiplicative_expr>
+                  | <additive_expr> (+ | -) <multiplicative_expr>
+<multiplicative_expr> ::= <unary_expr>
+                        | <multiplicative_expr> (* | / | %) <unary_expr>
+
+<unary_expr> ::= <unary_op> <cast_expr>
+               | <cast_expr>
+<unary_op> ::= ! | - | + | & | *
+
+<cast_expr> ::= <primary_expr>
+              | <cast_expr> as <type>
+
+<primary_expr> ::= <ident>
+                 | <constant>
+                 | '(' <expr> ')'
+
+<var_decl> ::= (let | var) <ident> (':' <type>)? ( = <expr>)?
+
+<type> ::= <primitive_type>
+         | <pointer_type>
+         | <ident>
+<pointer_type> ::= '*' 'const'? <type>
+<primitive_type> ::= bool
+                   | <integral_type>
+                   | <floating_type>
+                   | void
+
+<integral_type> ::= ('u' | 'i') <dec_digits>
+
+<floating_type> ::= 'f'('32' | '64')
+
+<constant> ::= <integer_constant>
+             | <floating_constant>
+<integer_constant> ::= <dec_digits> <integral_type>?
+                     | '0x' <hex_digits> <integral_type>?
+                     | '0b' <bin_digits> <integral_type>?
+                     | '0o' <oct_digits> <integral_type>?
+
+<floating_constant> ::= <dec_digits> <floating_type>?
+                      | '.' <dec_digits> <exp_part>? <floating_type>?
+                      | <dec_digits> '.' <dec_digits>? <exp_part>? <floating_type>?
+
+<exp_part> ::= ('e' | 'E') ('-' | '+')? <dec_digits>
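+
+# An illustrative sample of sea against the grammar above (hand-written and
+# not validated; assumes blocks are brace-delimited, which the '{' / '}'
+# tokens in src/tokens.rs suggest):
+#
+#   fn scale(x: u32, factor: f32) -> f32 {
+#       let scaled: f32 = x as f32 * factor;
+#       return scaled;
+#   }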
diff --git a/rust-toolchain b/rust-toolchain
new file mode 100644
index 0000000..bf867e0
--- /dev/null
+++ b/rust-toolchain
@@ -0,0 +1 @@
+nightly
diff --git a/src/common.rs b/src/common.rs
new file mode 100644
index 0000000..21e486c
--- /dev/null
+++ b/src/common.rs
@@ -0,0 +1,160 @@
+#![allow(unused)]
+/// True if `c` is considered whitespace according to the Rust language
+/// definition. See the
+/// [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
+/// for definitions of these classes.
+pub fn is_whitespace(c: char) -> bool {
+    // This is Pattern_White_Space.
+    //
+    // Note that this set is stable (ie, it doesn't change with different
+    // Unicode versions), so it's ok to just hard-code the values.
+
+    matches!(
+        c,
+        // Usual ASCII suspects
+        '\u{0009}'   // \t
+        | '\u{000A}' // \n
+        | '\u{000B}' // vertical tab
+        | '\u{000C}' // form feed
+        | '\u{000D}' // \r
+        | '\u{0020}' // space
+
+        // NEXT LINE from latin1
+        | '\u{0085}'
+
+        // Bidi markers
+        | '\u{200E}' // LEFT-TO-RIGHT MARK
+        | '\u{200F}' // RIGHT-TO-LEFT MARK
+
+        // Dedicated whitespace characters from Unicode
+        | '\u{2028}' // LINE SEPARATOR
+        | '\u{2029}' // PARAGRAPH SEPARATOR
+    )
+}
+
+/// True if `c` is valid as a first character of an identifier.
+/// See the
+/// [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html)
+/// for a formal definition of valid identifier names.
+pub fn is_id_start(c: char) -> bool {
+    // This is XID_Start OR '_' (which formally is not a XID_Start).
+    c == '_' || unicode_xid::UnicodeXID::is_xid_start(c)
+}
+
+/// True if `c` is valid as a non-first character of an identifier.
+/// See the
+/// [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html)
+/// for a formal definition of valid identifier names.
+pub fn is_id_continue(c: char) -> bool {
+    unicode_xid::UnicodeXID::is_xid_continue(c)
+}
+
+/// True if the passed string is lexically an identifier.
+pub fn is_ident(string: &str) -> bool {
+    let mut chars = string.chars();
+    if let Some(start) = chars.next() {
+        is_id_start(start) && chars.all(is_id_continue)
+    } else {
+        false
+    }
+}
+
+pub fn is_digit(ch: char) -> bool {
+    ('0'..='9').contains(&ch)
+}
+
+pub fn is_bin_digit(ch: char) -> bool {
+    ch == '0' || ch == '1'
+}
+
+pub fn is_nonzero_digit(ch: char) -> bool {
+    ('1'..='9').contains(&ch)
+}
+
+pub fn is_oct_digit(ch: char) -> bool {
+    ('0'..='7').contains(&ch)
+}
+
+pub fn is_hex_digit(ch: char) -> bool {
+    ('0'..='9').contains(&ch) || ('a'..='f').contains(&ch) || ('A'..='F').contains(&ch)
+}
+
+/// Trait for only yielding the next item in the Iterator if it tests true
+/// for some predicate.
+pub trait NextIf: Iterator + Clone {
+    /// Yield the next item if `pred` returns `true`.
+    /// If `pred` returns `false` the Iterator is not advanced.
+    #[must_use]
+    fn next_if<F>(&mut self, pred: F) -> Option<Self::Item>
+    where
+        F: FnOnce(&Self::Item) -> bool,
+    {
+        let old = self.clone();
+        match self.next() {
+            Some(item) => {
+                if pred(&item) {
+                    Some(item)
+                } else {
+                    *self = old;
+                    None
+                }
+            }
+            None => None,
+        }
+    }
+    /// Yield the next item if `pred` returns `Some(T)`.
+    /// If `pred` returns `None` the Iterator is not advanced.
+    #[must_use]
+    fn next_if_map<T, F>(&mut self, pred: F) -> Option<T>
+    where
+        F: FnOnce(Self::Item) -> Option<T>,
+    {
+        let old = self.clone();
+        match self.next() {
+            Some(item) => match pred(item) {
+                None => {
+                    *self = old;
+                    None
+                }
+                some => some,
+            },
+            None => None,
+        }
+    }
+}
+
+impl<T> NextIf for T where T: Iterator + Clone {}
+
+pub trait FallibleParse: Iterator + Clone {
+    /// Consumes items from `self` if and only if `map` yields `Some`.
+    #[must_use]
+    fn try_parse<T, F>(&mut self, map: F) -> Option<T>
+    where
+        F: FnOnce(&mut Self) -> Option<T>,
+    {
+        // clone the iterator and keep the clone around
+        let old = self.clone();
+        match map(self) {
+            Some(result) => Some(result),
+            None => {
+                // the map function failed, restore the iterator and yield None.
+                *self = old;
+                None
+            }
+        }
+    }
+    /// Consumes items from `self` if and only if `map` yields `Ok`.
+    #[must_use]
+    fn try_parse_result<T, E, F>(&mut self, map: F) -> Result<T, E>
+    where
+        F: FnOnce(&mut Self) -> Result<T, E>,
+    {
+        // clone the iterator and keep the clone around
+        let old = self.clone();
+        match map(self) {
+            Ok(result) => Ok(result),
+            Err(e) => {
+                // the map function failed, restore the iterator and return the error.
+                *self = old;
+                Err(e)
+            }
+        }
+    }
+}
+
+impl<T> FallibleParse for T where T: Iterator + Clone {}
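+
+// Illustrative sketch (not part of the original API surface): how `NextIf`
+// and `FallibleParse` behave on a plain `str::Chars` iterator. Both traits
+// rely on `Clone` to snapshot the iterator and restore it on failure.
+#[cfg(test)]
+mod backtracking_examples {
+    use super::{FallibleParse, NextIf};
+
+    #[test]
+    fn next_if_only_advances_on_match() {
+        let mut chars = "a1".chars();
+        assert_eq!(chars.next_if(|&c| c.is_alphabetic()), Some('a'));
+        // predicate fails: the iterator is left untouched
+        assert_eq!(chars.next_if(|&c| c.is_alphabetic()), None);
+        assert_eq!(chars.next(), Some('1'));
+    }
+
+    #[test]
+    fn try_parse_restores_iterator_on_failure() {
+        let mut chars = "abc".chars();
+        // the closure consumes two items but then fails, so `chars` is restored
+        let failed: Option<()> = chars.try_parse(|it| {
+            it.next();
+            it.next();
+            None
+        });
+        assert_eq!(failed, None);
+        assert_eq!(chars.next(), Some('a'));
+    }
+}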
diff --git a/src/lexer.rs b/src/lexer.rs
new file mode 100644
index 0000000..0bfe059
--- /dev/null
+++ b/src/lexer.rs
@@ -0,0 +1,442 @@
+use crate::tokens::Token;
+use crate::tokens::TokenPos;
+use itertools::Itertools;
+
+use crate::common::FallibleParse;
+use crate::common::NextIf;
+
+#[derive(Debug, thiserror::Error)]
+pub enum LexerError {
+    #[error("{0}")]
+    StringError(String),
+    #[error("Exp part of floating constant had no digits.")]
+    FloatingConstantExpPartNoDigit,
+    #[error("Digits of a numerical constant may not start with an underscore.")]
+    NumericalConstantDigitLeadingUnderscore,
+    #[error("Numerical constant must have at least one digit.")]
+    NumericalConstantDigitNoDigit,
+    #[error("Integral type must be followed by at least one digit.")]
+    IntegralTypeExpectedDigit,
+    #[error("Floating constant has an invalid trailing type.")]
+    FloatingConstantInvalidTrailingType,
+    #[error("Invalid token.")]
+    InvalidToken,
+    #[error("Expected a valid identifier start character.")]
+    ExpectedIdStartForIdentifier,
+}
+
+pub type LexerResult<T> = core::result::Result<T, LexerError>;
+
+#[derive(Debug, Clone)]
+pub struct Chars<'a> {
+    bytes: &'a [u8],
+    offset: usize,
+}
+
+impl<'a> Chars<'a> {
+    pub fn as_str(&self) -> &str {
+        // SAFETY: `bytes` comes from a `&str` and `offset` always lies on a
+        // char boundary.
+        unsafe { core::str::from_utf8_unchecked(&self.bytes[self.offset..]) }
+    }
+    pub fn is_eof(&self) -> bool {
+        self.offset >= self.bytes.len()
+    }
+    pub fn peek(&self) -> Option<char> {
+        self.clone().next()
+    }
+
+    pub fn position(&self) -> u32 {
+        self.offset() as u32
+    }
+
+    pub fn offset(&self) -> usize {
+        self.offset
+    }
+
+    pub fn get_range(&self, start: u32, end: u32) -> &str {
+        // SAFETY: as above, positions handed out by this type always lie on
+        // char boundaries.
+        unsafe { core::str::from_utf8_unchecked(&self.bytes[start as usize..end as usize]) }
+    }
+
+    fn next_char(&mut self) -> Option<char> {
+        let ch = self.as_str().chars().next()?;
+        self.offset += ch.len_utf8();
+        Some(ch)
+    }
+}
+
+impl<'a> Iterator for Chars<'a> {
+    type Item = char;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.next_char()
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct Tokenizer<'a> {
+    source: Chars<'a>,
+    tokens: Vec<TokenPos>,
+}
+
+macro_rules! next_or_eof {
+    ($expr:expr) => {
+        match $expr.next() {
+            Some(c) => c,
+            None => {
+                return Ok(Token::Eof);
+            }
+        }
+    };
+    (?$expr:expr) => {
+        match $expr.peek() {
+            Some(c) => c,
+            None => {
+                return Ok(Token::Eof);
+            }
+        }
+    };
+}
+
+macro_rules! residual {
+    (ok: $expr:expr) => {
+        match $expr {
+            Ok(t) => t,
+            Err(e) => {
+                return Err(e);
+            }
+        }
+    };
+    (none: $expr:expr) => {
+        match $expr {
+            Ok(Some(t)) => {
+                return Ok(Some(t));
+            }
+            Ok(val) => val,
+            Err(e) => {
+                return Err(e);
+            }
+        }
+    };
+    (flatten: none: $expr:expr) => {
+        match $expr {
+            Ok(Some(t)) => {
+                return Ok(t);
+            }
+            Ok(val) => val,
+            Err(e) => {
+                return Err(e);
+            }
+        }
+    };
+    (some: $expr:expr) => {
+        match $expr {
+            Ok(Some(t)) => t,
+            Ok(None) => {
+                return Ok(None);
+            }
+            Err(e) => {
+                return Err(e);
+            }
+        }
+    };
+}
+
+impl<'a> Tokenizer<'a> {
+    fn push_token(&mut self, token: Token, start: u32, end: u32) -> LexerResult<()> {
+        self.tokens.push(TokenPos::new(token, start, end));
+
+        Ok(())
+    }
+
+    pub fn next_token(&mut self) -> LexerResult<()> {
+        // skip any leading whitespace
+        self.source
+            .take_while_ref(|&c| crate::common::is_whitespace(c))
+            .count();
+        let start = self.source.position();
+
+        let token = self.source.try_parse_result(|source| {
+            let a = try_parse_integral_type(source).map(|o| o.map(|_| Token::IntegralType));
+            residual!(none: a);
+
+            let mut peeking = source.clone();
+            match peeking.next() {
+                Some('0'..='9') => {
+                    return Ok(Some(parse_constant(source)?));
+                }
+                // a constant may also start with a `.`, as in `.5e-4`
+                Some('.') if peeking.next().map(|c| crate::common::is_digit(c)) == Some(true) => {
+                    return Ok(Some(parse_constant(source)?));
+                }
+                _ => {}
+            }
+
+            Ok(None)
+        });
+
+        if let Some(token) = token? {
+            return self.push_token(token, start, self.source.position());
+        }
+
+        // lexical tokens
+        let token = crate::tokens::LexemeParser::parse(self.source.clone());
+
+        if let Some(token) = token {
+            _ = self.source.advance_by(token.lexeme_len());
+
+            match token {
+                Token::SlashSlash | Token::SlashSlashSlash => {
+                    _ = self.push_token(token, start, self.source.position());
+                    let start = self.source.position();
+                    loop {
+                        // advance until either EOF or newline
+                        let Some(ch) = self.source.next() else {
+                            break;
+                        };
+                        if ch == '\n' {
+                            break;
+                        }
+                    }
+                    let end = self.source.position() - 1;
+                    return self.push_token(
+                        if token == Token::SlashSlash {
+                            Token::Comment
+                        } else {
+                            Token::DocComment
+                        },
+                        start,
+                        end,
+                    );
+                }
+                Token::SlashStar | Token::SlashStarStar => {
+                    let start = self.source.position();
+                    let mut end = self.source.position();
+
+                    let mut last = self.source.next();
+                    loop {
+                        // break out of the loop on EOF
+                        let Some(l) = last.replace(match self.source.next() {
+                            Some(ch) => ch,
+                            None => {
+                                break;
+                            }
+                        }) else {
+                            break;
+                        };
+
+                        // break out of the loop at the end of the comment
+                        if (l, last.unwrap()) == ('*', '/') {
+                            break;
+                        }
+                        end = self.source.position() - 1;
+                    }
+                    return self.push_token(
+                        if token == Token::SlashStar {
+                            Token::Comment
+                        } else {
+                            Token::DocComment
+                        },
+                        start,
+                        end,
+                    );
+                }
+                _ => {}
+            }
+
+            // a keyword immediately followed by identifier characters is an
+            // identifier, e.g. `constant`
+            if token.maybe_ident()
+                && self
+                    .source
+                    .take_while_ref(|&c| crate::common::is_id_continue(c))
+                    .count()
+                    > 0
+            {
+                return self.push_token(Token::Ident, start, self.source.position());
+            }
+
+            return self.push_token(token, start, self.source.position());
+        }
+
+        self.source
+            .next_if(|&c| crate::common::is_id_start(c))
+            .ok_or(LexerError::ExpectedIdStartForIdentifier)?;
+        self.source
+            .take_while_ref(|&c| crate::common::is_id_continue(c))
+            .count();
+
+        self.push_token(Token::Ident, start, self.source.position())
+    }
+}
+
+/// IntegralType <-
+///     ( 'u' | 'i' ) DIGITS+
+fn try_parse_integral_type(source: &mut Chars) -> LexerResult<Option<()>> {
+    if source.next_if(|&c| c == 'u' || c == 'i').is_none() {
+        return Ok(None);
+    }
+
+    if source
+        .take_while_ref(|&c| crate::common::is_digit(c))
+        .count()
+        == 0
+    {
+        return Err(LexerError::IntegralTypeExpectedDigit);
+    };
+
+    Ok(Some(()))
+}
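+
+// A quick sketch of the contract above (illustrative only): `u32` parses,
+// a bare `u` is an error, and anything else leaves the source untouched.
+#[cfg(test)]
+mod integral_type_tests {
+    use super::*;
+
+    fn chars(s: &str) -> Chars {
+        Chars { bytes: s.as_bytes(), offset: 0 }
+    }
+
+    #[test]
+    fn integral_type() {
+        assert_eq!(try_parse_integral_type(&mut chars("u32")).unwrap(), Some(()));
+        assert!(try_parse_integral_type(&mut chars("u")).is_err());
+        // not an integral type: nothing is consumed and `Ok(None)` is returned
+        let mut source = chars("foo");
+        assert_eq!(try_parse_integral_type(&mut source).unwrap(), None);
+        assert_eq!(source.position(), 0);
+    }
+}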
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum Radix {
+    Hex,
+    Bin,
+    Dec,
+    Oct,
+}
+
+impl Radix {
+    /// must be called with one of `['b','x','d','o']`
+    unsafe fn from_char_unchecked(c: char) -> Self {
+        match c.to_ascii_lowercase() {
+            'o' => Self::Oct,
+            'b' => Self::Bin,
+            'x' => Self::Hex,
+            'd' => Self::Dec,
+            _ => unreachable!(),
+        }
+    }
+    fn from_char(c: char) -> Option<Self> {
+        match c.to_ascii_lowercase() {
+            'o' => Some(Self::Oct),
+            'b' => Some(Self::Bin),
+            'x' => Some(Self::Hex),
+            'd' => Some(Self::Dec),
+            _ => None,
+        }
+    }
+    fn to_token(self) -> Token {
+        match self {
+            Radix::Hex => Token::IntegerHexConstant,
+            Radix::Bin => Token::IntegerBinConstant,
+            Radix::Oct => Token::IntegerOctConstant,
+            Radix::Dec => Token::IntegerConstant,
+        }
+    }
+    fn is_digit(self) -> fn(char) -> bool {
+        match self {
+            Radix::Hex => crate::common::is_hex_digit,
+            Radix::Bin => crate::common::is_bin_digit,
+            Radix::Oct => crate::common::is_oct_digit,
+            Radix::Dec => crate::common::is_digit,
+        }
+    }
+}
+
+/// where DIGIT is defined by radix:
+/// DIGITS <-
+///     if allow_leading_underscore: `_`* DIGIT (DIGIT|`_`)*
+///     else: DIGIT (DIGIT|`_`)*
+fn parse_digit_part(
+    source: &mut Chars,
+    allow_leading_underscore: bool,
+    radix: Radix,
+) -> LexerResult<()> {
+    let radix = radix.is_digit();
+
+    if allow_leading_underscore {
+        let _underscores = source.take_while_ref(|&c| c == '_').count();
+    }
+    let _need_digit = source.next_if(|&c| radix(c)).ok_or_else(|| {
+        if source.peek() == Some('_') {
+            LexerError::NumericalConstantDigitLeadingUnderscore
+        } else {
+            LexerError::NumericalConstantDigitNoDigit
+        }
+    })?;
+    let _rest = source.take_while_ref(|&c| radix(c) || c == '_').count();
+
+    Ok(())
+}
+
+/// returns `Err(e)` if it failed to parse.
+/// returns `Ok(None)` if no exp part was found.
+/// returns `Ok(Some(()))` if an exp part was found and parsed.
+///
+/// EXP_PART <-
+///     (`e`|`E`) (`-`|`+`)? DEC_DIGITS
+fn try_parse_exp_part(source: &mut Chars) -> LexerResult<Option<()>> {
+    if source.next_if(|&c| c.to_ascii_lowercase() == 'e').is_some() {
+        let _sign = source.next_if(|&c| c == '-' || c == '+');
+        if source
+            .take_while_ref(|&c| crate::common::is_digit(c))
+            .count()
+            == 0
+        {
+            // need digits following exp notation
+            Err(LexerError::FloatingConstantExpPartNoDigit)
+        } else {
+            Ok(Some(()))
+        }
+    } else {
+        Ok(None)
+    }
+}
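+
+// Sketch of the digit-part rules (illustrative): leading underscores are
+// only accepted when `allow_leading_underscore` is set, and the radix
+// decides which digits are valid.
+#[cfg(test)]
+mod digit_part_tests {
+    use super::*;
+
+    fn chars(s: &str) -> Chars {
+        Chars { bytes: s.as_bytes(), offset: 0 }
+    }
+
+    #[test]
+    fn digit_part() {
+        assert_eq!(Radix::from_char('x'), Some(Radix::Hex));
+        assert!(parse_digit_part(&mut chars("1_000"), false, Radix::Dec).is_ok());
+        assert!(parse_digit_part(&mut chars("_1"), false, Radix::Dec).is_err());
+        assert!(parse_digit_part(&mut chars("_1"), true, Radix::Dec).is_ok());
+        // `9` is not an octal digit
+        assert!(parse_digit_part(&mut chars("9"), false, Radix::Oct).is_err());
+    }
+}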
+
+/// CONSTANT <-
+///     DEC_DIGITS IntegralType?
+///     `0x` HEX_DIGITS IntegralType?
+///     `0b` BIN_DIGITS IntegralType?
+///     `0o` OCT_DIGITS IntegralType?
+///     DEC_DIGITS FloatingType?
+///     `.` DEC_DIGITS EXP_PART? FloatingType?
+///     DEC_DIGITS `.` DEC_DIGITS? EXP_PART? FloatingType?
+fn parse_constant(source: &mut Chars) -> LexerResult<Token> {
+    let zero = source.next_if(|&c| c == '0').is_some();
+    let radix = zero
+        .then(|| source.next_if_map(|c| Radix::from_char(c)))
+        .flatten();
+
+    if let Some(radix) = radix {
+        parse_digit_part(source, false, radix)?;
+        if source.peek().map(|c| c == 'u' || c == 'i') == Some(true) {
+            try_parse_integral_type(source)?;
+        }
+        return Ok(radix.to_token());
+    }
+
+    // if zero: the leading `0` already is a digit, so any further digits and
+    //          underscores are optional: (DIGIT|`_`)*
+    // else: DIGIT (DIGIT|`_`)*
+    if zero {
+        let _rest = source
+            .take_while_ref(|&c| crate::common::is_digit(c) || c == '_')
+            .count();
+    } else {
+        parse_digit_part(source, false, Radix::Dec)?;
+    }
+
+    if let Ok(Some(_)) = source.try_parse_result(|source| try_parse_integral_type(source)) {
+        return Ok(Token::IntegerConstant);
+    }
+
+    let dot = source.next_if(|&c| c == '.').is_some();
+
+    if dot {
+        parse_digit_part(source, false, Radix::Dec)?;
+    }
+
+    // parse exp notation
+    let exp = try_parse_exp_part(source)?.is_some();
+
+    // trailing FloatingType?
+    let floating = if source.next_if(|&c| c == 'f').is_some() {
+        let digits = source.next_tuple::<(char, char)>();
+        if !(digits == Some(('6', '4')) || digits == Some(('3', '2'))) {
+            // need either f64 or f32 here!
+            return Err(LexerError::FloatingConstantInvalidTrailingType);
+        }
+        true
+    } else {
+        false
+    };
+
+    let token = match (dot, exp, floating) {
+        (false, false, false) => Token::IntegerConstant,
+        (true, false, _) => Token::DotFloatingConstant,
+        (true, true, _) => Token::DotFloatingExpConstant,
+        (false, true, _) => Token::FloatingExpConstant,
+        (false, false, true) => Token::FloatingConstant,
+    };
+
+    Ok(token)
+}
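+
+// Illustrative check of `parse_constant` against the grammar comment above
+// (a sketch; marker token names as defined in src/tokens.rs):
+#[cfg(test)]
+mod constant_tests {
+    use super::*;
+
+    fn parse(s: &str) -> LexerResult<Token> {
+        parse_constant(&mut Chars { bytes: s.as_bytes(), offset: 0 })
+    }
+
+    #[test]
+    fn constants() {
+        assert_eq!(parse("123").unwrap(), Token::IntegerConstant);
+        assert_eq!(parse("0x1f").unwrap(), Token::IntegerHexConstant);
+        assert_eq!(parse("3.14").unwrap(), Token::DotFloatingConstant);
+        assert_eq!(parse("1e9").unwrap(), Token::FloatingExpConstant);
+        assert_eq!(parse("2.5e-3").unwrap(), Token::DotFloatingExpConstant);
+        assert_eq!(parse("1f32").unwrap(), Token::FloatingConstant);
+    }
+}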
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..1fdc1f2
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,5 @@
+#![feature(extract_if, iter_advance_by)]
+
+mod common;
+mod lexer;
+mod tokens;
diff --git a/src/tokens.rs b/src/tokens.rs
new file mode 100644
index 0000000..dad749c
--- /dev/null
+++ b/src/tokens.rs
@@ -0,0 +1,236 @@
+macro_rules! tokens {
+    ($vis:vis $ty_name:ident:
+     {
+         $($name2:ident),*
+     },
+     {
+         $($name:ident => $lexeme:literal),*
+     }) => {
+        #[allow(dead_code)]
+        #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
+        $vis enum $ty_name {
+            $($name,)*
+            $($name2,)*
+        }
+
+        impl std::fmt::Display for $ty_name {
+            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                match self {
+                    $(Self::$name => write!(f, "{}", $lexeme),)*
+                    $(Self::$name2 => write!(f, "<{}>", stringify!($name2))),*
+                }
+            }
+        }
+
+        #[allow(dead_code)]
+        impl $ty_name {
+            $vis fn lexeme(&self) -> Option<&'static str> {
+                match self {
+                    $(Self::$name => Some($lexeme),)*
+                    $(Self::$name2 => None),*
+                }
+            }
+
+            /// returns the number of chars in this lexeme
+            $vis fn lexeme_len(&self) -> usize {
+                self.lexeme().map(|lexeme| lexeme.chars().count()).unwrap_or(0)
+            }
+
+            $vis fn maybe_ident(&self) -> bool {
+                self.lexeme().map(|lexeme| crate::common::is_ident(lexeme)).unwrap_or(false)
+            }
+
+            $vis fn lexemes() -> &'static [(Self, &'static str)] {
+                &[
+                    $((Self::$name, $lexeme)),*
+                ]
+            }
+        }
+    };
+}
+
+tokens!(pub Token: {
+    Eof,
+    // Marker Token for any Comment
+    Comment,
+    DocComment,
+    // Marker Tokens for constants
+    CharConstant,
+    IntegerConstant,
+    IntegerHexConstant,
+    IntegerBinConstant,
+    IntegerOctConstant,
+    FloatingConstant,
+    FloatingExpConstant,
+    DotFloatingConstant,
+    DotFloatingExpConstant,
+    StringConstant,
+    IntegralType,
+    Ident
+},
+// Lexical Tokens:
+{
+    SlashSlash => "//",
+    SlashStar => "/*",
+    SlashStarStar => "/**",
+    StarSlash => "*/",
+    SlashSlashSlash => "///",
+    // Punctuation:
+    OpenParens => "(",
+    CloseParens => ")",
+    OpenBrace => "{",
+    CloseBrace => "}",
+    OpenSquareBracket => "[",
+    CloseSquareBracket => "]",
+    Semi => ";",
+    Comma => ",",
+    Elipsis3 => "...",
+    Elipsis2 => "..",
+    Colon => ":",
+    Equal => "=",
+    // Keywords:
+    Void => "void",
+    Bool => "bool",
+    F32 => "f32",
+    F64 => "f64",
+    Const => "const",
+    Fn => "fn",
+    Let => "let",
+    Var => "var",
+    If => "if",
+    As => "as",
+    Else => "else",
+    Return => "return",
+    // Operators
+    Dot => ".",
+    MinusGreater => "->",
+    Bang => "!",
+    Tilde => "~",
+    Plus => "+",
+    Minus => "-",
+    Star => "*",
+    Slash => "/",
+    Percent => "%",
+    Less => "<",
+    Greater => ">",
+    LessEqual => "<=",
+    GreaterEqual => ">=",
+    EqualEqual => "==",
+    BangEqual => "!=",
+    PipePipe => "||",
+    AmpersandAmpersand => "&&",
+    Ampersand => "&",
+    Caret => "^",
+    Pipe => "|",
+    LessLess => "<<",
+    GreaterGreater => ">>",
+    Question => "?",
+    PlusEqual => "+=",
+    MinusEqual => "-=",
+    StarEqual => "*=",
+    SlashEqual => "/=",
+    PercentEqual => "%=",
+    AmpersandEqual => "&=",
+    PipeEqual => "|=",
+    CaretEqual => "^=",
+    LessLessEqual => "<<=",
+    GreaterGreaterEqual => ">>="
+});
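+
+// Quick illustration of the API generated by the macro above (a sketch):
+#[cfg(test)]
+mod token_tests {
+    use super::Token;
+
+    #[test]
+    fn lexemes() {
+        assert_eq!(Token::Fn.lexeme(), Some("fn"));
+        assert_eq!(Token::Ident.lexeme(), None);
+        // keywords look like identifiers, operators do not
+        assert!(Token::Return.maybe_ident());
+        assert!(!Token::PlusEqual.maybe_ident());
+        assert_eq!(Token::LessLessEqual.lexeme_len(), 3);
+        assert_eq!(format!("{}", Token::Ident), "<Ident>");
+    }
+}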
+
+/// Helper type for parsing tokens that have a defined lexeme, such as `fn`,
+/// `f32`, `const`, etc. Tokens with variable lexemes, such as primitive
+/// integral types, constants or identifiers, are not parsed by this.
+pub struct LexemeParser {
+    lexemes: Vec<Token>,
+    candidates: Vec<Token>,
+    len: usize,
+}
+
+impl LexemeParser {
+    pub fn new() -> Self {
+        let lexemes = Token::lexemes()
+            .iter()
+            .map(|(tok, _)| tok.clone())
+            .collect::<Vec<_>>();
+
+        Self {
+            lexemes,
+            candidates: vec![],
+            len: 0,
+        }
+    }
+
+    pub fn finish(mut self) -> Option<Token> {
+        self.candidates.pop()
+    }
+
+    pub fn parse(mut tokens: impl Iterator<Item = char>) -> Option<Token> {
+        let mut this = Self::new();
+        loop {
+            let Some(ch) = tokens.next() else {
+                break;
+            };
+
+            if crate::common::is_whitespace(ch) {
+                break;
+            }
+
+            // `advance` yields `Some` once no further lexeme can match
+            if let Some(token) = this.advance(ch) {
+                return token;
+            }
+        }
+        this.finish()
+    }
+
+    /// accepts a char and returns `None` until it is done trying to parse the
+    /// longest `Token`. when finished, returns `Some(Some(Token))` if it
+    /// parsed one, or `Some(None)`.
+    pub fn advance(&mut self, ch: char) -> Option<Option<Token>> {
+        self.len += 1;
+
+        // advance match:
+        // keep tokens whose lexemes match the next char
+        self.lexemes.retain(|tok| {
+            // SAFETY: all of these tokens are lexical, and every character in
+            // them is represented by a single byte and we know they must be
+            // utf8/ascii.
+            unsafe {
+                char::from_u32_unchecked(tok.lexeme().unwrap().as_bytes()[self.len - 1] as u32)
+                    == ch
+            }
+        });
+
+        // A token has been successfully matched completely if it has not yet
+        // been removed from the lexeme list but the length of its lexeme is no
+        // greater than the number of chars we've received.
+        self.candidates.extend(self.lexemes.extract_if(|tok| {
+            // SAFETY: as above, all of the tokens in self.lexemes are
+            // lexical and consist only of single-byte characters.
+            tok.lexeme().unwrap().as_bytes().len() <= self.len
+        }));
+
+        // we prefer the longer match:
+        // that means that a+++++b doesn't parse, and a+++(++b) is a++ + ++b;
+        // `&&i` is also LogicalAnd i and not Ampersand Ampersand i.
+        // Somehow, this is also a gnu extension...
+
+        if self.lexemes.is_empty() {
+            // no further match is possible; return the longest candidate, if any
+            return Some(self.candidates.pop());
+        }
+
+        None
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct TokenPos {
+    pub token: Token,
+    pub start: u32,
+    pub end: u32,
+}
+
+impl TokenPos {
+    pub fn new(token: Token, start: u32, end: u32) -> Self {
+        Self { token, start, end }
+    }
+}
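+
+// Longest-match behaviour of `LexemeParser` in practice (illustrative sketch):
+#[cfg(test)]
+mod lexeme_parser_tests {
+    use super::{LexemeParser, Token};
+
+    #[test]
+    fn longest_match_wins() {
+        assert_eq!(LexemeParser::parse("+".chars()), Some(Token::Plus));
+        assert_eq!(LexemeParser::parse("+=".chars()), Some(Token::PlusEqual));
+        assert_eq!(LexemeParser::parse("<<=".chars()), Some(Token::LessLessEqual));
+        // `&&` is preferred over two `&`
+        assert_eq!(LexemeParser::parse("&&i".chars()), Some(Token::AmpersandAmpersand));
+        // not a lexical token at all
+        assert_eq!(LexemeParser::parse("identifier".chars()), None);
+    }
+}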