token iterator based on trie

This commit is contained in:
janis 2025-09-19 14:46:48 +02:00
parent 47e671f827
commit fafd4011e2
Signed by: janis
SSH key fingerprint: SHA256:bB1qbbqmDXZNT0KKD5c2Dfjg53JGhj7B3CFcLIzSqq8

View file

@ -445,120 +445,80 @@ impl LexemeParser {
}
}
// Alternative approach: match lexemes with a prefix tree (trie).
mod trie {
use super::Token;
use std::collections::BTreeMap;
use trie::{OnceAndIter, Tree};
/// One node of the hand-rolled prefix tree.
///
/// `keys`, `values` and `edges` are parallel vectors: `build_tree` inserts
/// into all three at the same index, so slot `i` of each belongs to `keys[i]`.
#[derive(Debug, Default)]
struct Node {
/// Characters that have a slot at this node; `build_tree` inserts at the
/// position reported by `binary_search`, which keeps them sorted.
keys: Vec<char>,
/// Token stored for a lexeme whose last character is `keys[i]`, if any.
values: Vec<Option<Token>>,
/// Child node reached by following `keys[i]`, if any.
edges: Vec<Option<Box<Node>>>,
}
/// Lexer that reads `char`s from `I` and yields `Token`s (see its `Iterator`
/// impl) by looking lexemes up in a trie.
pub struct LexemeIterator<I: Iterator<Item = char>> {
/// Trie filled in `new` with one entry per `(token, lexeme)` pair from
/// `Token::lexemes()`, keyed by the lexeme's characters.
trie: Tree<char, Token>,
/// Character source. NOTE(review): `OnceAndIter::set_once` appears to hand a
/// single already-consumed char back to the stream — confirm in the `trie` crate.
iter: OnceAndIter<I, char>,
}
/// Hand-rolled prefix tree over `Node`s.
// NOTE(review): this local `Tree` shares its name with the `Tree` imported from
// the `trie` crate above; in this diff view it looks like the removed definition.
#[derive(Debug)]
struct Tree {
/// Root node. `build_tree` always creates it as `Some`, and `search_tree`
/// unwraps it.
root: Option<Box<Node>>,
}
// NOTE(review): this span is a diff rendering without +/- markers. Lines of the
// `LexemeIterator::new` constructor are interleaved with lines of the free
// functions `build_tree` / `search_tree` (which appear to be the removed
// version). Read each commented group on its own; the span does not compile as-is.
impl<I: Iterator<Item = char>> LexemeIterator<I> {
/// Creates a lexer over `iter`, filling the trie from `Token::lexemes()`.
pub fn new(iter: I) -> Self {
let mut trie = Tree::new();
// --- `build_tree`: builds the hand-rolled `Tree` from every token's lexeme ---
fn build_tree() -> Tree {
// Clone each token out of the `(token, _)` pairs into a boxed collection.
let lexemes = Token::lexemes()
.iter()
.map(|(tok, _)| tok.clone())
.collect::<Box<_>>();
let mut tree = Tree {
root: Some(Box::new(Node::default())),
};
for tok in lexemes {
// Panics if a token listed in `Token::lexemes()` has no lexeme.
let lexeme = tok.lexeme().unwrap();
// Placeholder node; it is replaced on the first character below.
// NOTE(review): for an empty lexeme the loop never runs and the
// `current.values[p]` write below would index this empty node and panic.
let mut current = &mut Box::new(Node::default());
// Slot holding the node to descend into next, starting at the root.
let mut next = &mut tree.root;
let mut p = 0;
for c in lexeme.chars() {
// Create the node in the `next` slot if it does not exist yet.
current = next.get_or_insert(Box::new(Node::default()));
// Locate `c` among the sorted keys; on a miss, insert `c` together
// with empty value/edge slots at the same index.
p = match current.keys.binary_search(&c) {
Ok(p) => p,
Err(p) => {
current.keys.insert(p, c);
current.values.insert(p, None);
current.edges.insert(p, None);
p
}
};
// Descend through the edge slot that belongs to `c`.
next = current.edges.get_mut(p).unwrap();
}
// Store the token in the slot of the lexeme's last character.
current.values[p] = Some(tok);
// --- `new`: insert every (token, lexeme string) pair, keyed by its chars ---
for (token, token_str) in Token::lexemes() {
trie.insert(token_str.chars(), *token);
}
// --- `build_tree`: return value ---
tree
// --- `new`: `iter.into()` converts `I` into the type of the `iter` field ---
Self {
trie,
iter: iter.into(),
}
}
// --- `search_tree`: header and setup; its loop body appears further down ---
// Walks `tree` along the characters pulled from `tokens`.
fn search_tree(tree: &Tree, mut tokens: impl Iterator<Item = char>) -> Option<Token> {
// Panics if the tree has no root.
let mut current = tree.root.as_ref().unwrap();
// Index into `current.values` of the last matched key; `None` until a walk stops.
let mut p = None;
/// Matches the upcoming characters against the trie and returns the token
/// found, or `None` when `get_closest` reports no match.
///
/// `get_closest` may also return a key alongside the token; when it does,
/// that key is stored with `set_once` before the token is returned.
// NOTE(review): `set_once` is the same call `skip_whitespaces` uses to hand a
// character back after reading one too many, so the key is presumably a
// consumed-but-unmatched char — confirm against the `trie` crate.
fn parse(&mut self) -> Option<Token> {
    let (peeked, token) = self.trie.get_closest(&mut self.iter)?;
    if let Some(key) = peeked {
        self.iter.set_once(key);
    }
    Some(*token)
}
// NOTE(review): this span is a diff rendering without +/- markers. Lines of
// the method `skip_whitespaces` (using `self.iter` / `c`) are interleaved with
// the loop body of the free function `search_tree` (using `tokens` / `ch` /
// `current`, apparently the removed version). The span does not compile as-is.
/// Consumes characters from `self.iter` until a non-whitespace one is read,
/// then hands that character back with `set_once`; stops early at end of input.
fn skip_whitespaces(&mut self) {
loop {
// --- `search_tree`: pull the next char, stop at end of input ---
let Some(ch) = tokens.next() else {
break;
};
// --- `skip_whitespaces`: pull the next char, stop at end of input ---
let Some(c) = self.iter.next() else { break };
// --- `search_tree`: whitespace check (its `continue` is shared below) ---
if crate::is_things::is_whitespace(ch) {
// --- `skip_whitespaces`: drop whitespace, put anything else back ---
if is_things::is_whitespace(c) {
continue;
} else {
self.iter.set_once(c);
break;
}
// --- `search_tree`: find `ch` among this node's sorted keys; on a miss,
// log to stderr and give up ---
let n = match current.keys.binary_search(&ch) {
Ok(p) => p,
Err(p) => {
eprintln!("No match for {ch} in {:?} (p={p})", current.keys);
return None;
}
};
// Follow the edge for `ch`; if there is no child node, remember the slot
// index and stop walking.
current = match current.edges.get(n) {
Some(Some(node)) => node,
_ => {
p = Some(n);
break;
}
};
}
// --- `search_tree`: `p?` returns `None` when the walk never stopped on a
// slot (e.g. input ran out first); otherwise yield that slot's token ---
eprintln!("current: {:?}", current);
current.values.get(p?).copied().flatten()
}
#[cfg(test)]
mod tree_tests {
    use super::*;

    /// Walks a sample input through `search_tree`, skipping whitespace between
    /// lookups, and checks the tokens come out in order.
    #[test]
    fn test_tree() {
        use werkzeug::iter::AdvanceWhile;

        let tree = build_tree();
        eprintln!("Tree: {tree:?}");

        let mut tokens = "fn let void+++(++bool)".chars();
        let expected = [
            Token::Fn,
            Token::Let,
            Token::Void,
            Token::PlusPlus,
            Token::Plus,
            Token::OpenParens,
            Token::PlusPlus,
            Token::Bool,
            Token::CloseParens,
        ];

        for (idx, want) in expected.iter().enumerate() {
            // Whitespace is skipped between lookups, never before the first one.
            if idx > 0 {
                tokens.advance_while(|&c| crate::is_things::is_whitespace(c));
            }
            assert_eq!(search_tree(&tree, &mut tokens), Some(*want));
        }

        // Input exhausted: no whitespace skip, and no further token.
        assert_eq!(search_tree(&tree, &mut tokens), None);
    }
}
}
/// Yields one `Token` per call until `parse` finds nothing more to match.
impl<I> Iterator for LexemeIterator<I>
where
    I: Iterator<Item = char>,
{
    type Item = Token;

    /// Drops any leading whitespace, then matches the next token.
    fn next(&mut self) -> Option<Token> {
        self.skip_whitespaces();
        self.parse()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes a sample input and checks the exact token sequence, including
    /// the terminating `None`.
    #[test]
    fn test_iterator() {
        let mut source = "fn let void+++(++bool)".chars();
        let mut lexer = LexemeIterator::new(&mut source);

        let expected = [
            Token::Fn,
            Token::Let,
            Token::Void,
            Token::PlusPlus,
            Token::Plus,
            Token::OpenParens,
            Token::PlusPlus,
            Token::Bool,
            Token::CloseParens,
        ];

        for want in expected.iter() {
            assert_eq!(lexer.next(), Some(*want));
        }
        assert_eq!(lexer.next(), None);
    }
}