token iterator based on trie
This commit is contained in:
parent
47e671f827
commit
fafd4011e2
|
@ -445,120 +445,80 @@ impl LexemeParser {
|
|||
}
|
||||
}
|
||||
|
||||
// but what if...? prefix-tree!
|
||||
mod trie {
|
||||
use super::Token;
|
||||
use std::collections::BTreeMap;
|
||||
use trie::{OnceAndIter, Tree};
|
||||
|
||||
/// One node of the hand-rolled prefix tree, stored as three parallel vectors
/// that share an index.
///
/// `build_tree` keeps `keys` sorted (it inserts at the position reported by
/// `binary_search`) and inserts into `values` and `edges` at that same position.
#[derive(Debug, Default)]
|
||||
struct Node {
|
||||
// Characters that have an entry in this node, in sorted order.
keys: Vec<char>,
|
||||
// `values[i]` is the token whose lexeme ends at `keys[i]`, or `None`.
values: Vec<Option<Token>>,
|
||||
// `edges[i]` is the child node reached through `keys[i]`; it stays `None`
// until a longer lexeme continues past that character.
edges: Vec<Option<Box<Node>>>,
|
||||
}
|
||||
/// Iterator adapter that turns a stream of `char`s into `Token`s by matching
/// lexemes against a prefix tree.
pub struct LexemeIterator<I: Iterator<Item = char>> {
|
||||
// Prefix tree keyed by `char`; `new` inserts one entry per pair returned by
// `Token::lexemes()`.
trie: Tree<char, Token>,
|
||||
// The wrapped input.
// NOTE(review): `set_once` appears to store a single char that is yielded
// before the inner iterator resumes (it is used to hand back a consumed
// char) — confirm against `OnceAndIter`.
iter: OnceAndIter<I, char>,
|
||||
}
|
||||
|
||||
/// Hand-rolled prefix tree over lexeme characters.
#[derive(Debug)]
|
||||
struct Tree {
|
||||
// Entry node for the first character of every lexeme. `build_tree` always
// creates it as `Some`, and `search_tree` unwraps it.
root: Option<Box<Node>>,
|
||||
}
|
||||
impl<I: Iterator<Item = char>> LexemeIterator<I> {
|
||||
/// Creates a token iterator over `iter`.
///
/// Fills the trie by inserting every `(token, lexeme)` pair returned by
/// `Token::lexemes()`, keyed by the lexeme's characters, and converts `iter`
/// with `Into` to obtain the stored `OnceAndIter`.
pub fn new(iter: I) -> Self {
|
||||
let mut trie = Tree::new();
|
||||

||||
/// Builds the hand-rolled prefix tree from the tokens listed by
/// `Token::lexemes()`.
///
/// Panics if a listed token has no lexeme (`lexeme().unwrap()`).
fn build_tree() -> Tree {
|
||||
// Only the token half of each pair is kept; the lexeme text is read back
// from the token below.
let lexemes = Token::lexemes()
|
||||
.iter()
|
||||
.map(|(tok, _)| tok.clone())
|
||||
.collect::<Box<_>>();
|
||||

||||
let mut tree = Tree {
|
||||
root: Some(Box::new(Node::default())),
|
||||
};
|
||||

||||
for tok in lexemes {
|
||||
let lexeme = tok.lexeme().unwrap();
|
||||
// Placeholder only: `current` is reassigned on the first character of the
// lexeme.
let mut current = &mut Box::new(Node::default());
|
||||
// Slot that holds (or will hold) the node for the next character.
let mut next = &mut tree.root;
|
||||
// Index of the most recently visited character within `current`.
let mut p = 0;
|
||||
for c in lexeme.chars() {
|
||||
// Descend, creating the node on demand.
current = next.get_or_insert(Box::new(Node::default()));
|
||||
p = match current.keys.binary_search(&c) {
|
||||
Ok(p) => p,
|
||||
Err(p) => {
|
||||
// New character: insert into all three vectors at the same sorted
// position so they stay aligned.
current.keys.insert(p, c);
|
||||
current.values.insert(p, None);
|
||||
current.edges.insert(p, None);
|
||||
p
|
||||
}
|
||||
};
|
||||

||||
next = current.edges.get_mut(p).unwrap();
|
||||
}
|
||||
// Mark the entry of the lexeme's last character as a complete token.
// NOTE(review): for an empty lexeme the inner loop never runs, so `current`
// is still the empty placeholder node and this index would be out of
// bounds — confirm lexemes are never empty.
current.values[p] = Some(tok);
|
||||
for (token, token_str) in Token::lexemes() {
|
||||
trie.insert(token_str.chars(), *token);
|
||||
}
|
||||

||||
tree
|
||||
Self {
|
||||
trie,
|
||||
iter: iter.into(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Looks up one token by walking the tree along the characters pulled from
/// `tokens`.
///
/// Returns the value stored for the last matched character once the node has
/// no child for it. Returns `None` if the input ends first (`p` is still
/// `None`) or if a character has no key in the current node.
/// NOTE(review): the no-key case returns `None` even when an earlier
/// character already completed a shorter lexeme — verify this is intended.
///
/// Panics if `tree.root` is `None`.
fn search_tree(tree: &Tree, mut tokens: impl Iterator<Item = char>) -> Option<Token> {
|
||||
let mut current = tree.root.as_ref().unwrap();
|
||||
// Index of the matched character in `current`; set only when the walk stops
// on a character without a child node.
let mut p = None;
|
||||
/// Tries to read one token from the input using the trie.
///
/// Returns the token reported by `get_closest`, or `None` when it reports no
/// match.
/// NOTE(review): presumably `get_closest` yields the longest matching entry
/// plus, optionally, a char it consumed that is not part of that entry —
/// confirm against the `trie` crate.
fn parse(&mut self) -> Option<Token> {
|
||||
match self.trie.get_closest(&mut self.iter) {
|
||||
Some((Some(key), token)) => {
|
||||
// NOTE(review): `skip_whitespaces` uses `set_once` to put a consumed char
// back, so this likely re-queues `key` rather than skipping it — confirm
// and reword the comment below if so.
// skip the peeked item
|
||||
self.iter.set_once(key);
|
||||
Some(*token)
|
||||
}
|
||||
Some((None, token)) => Some(*token),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||

||||
/// Consumes chars from `self.iter` for as long as `is_things::is_whitespace`
/// accepts them.
///
/// The first non-whitespace char is handed back through `set_once`; the loop
/// also stops when the input is exhausted.
fn skip_whitespaces(&mut self) {
|
||||
loop {
|
||||
let Some(ch) = tokens.next() else {
|
||||
break;
|
||||
};
|
||||
let Some(c) = self.iter.next() else { break };
|
||||

||||
if crate::is_things::is_whitespace(ch) {
|
||||
if is_things::is_whitespace(c) {
|
||||
continue;
|
||||
} else {
|
||||
self.iter.set_once(c);
|
||||
break;
|
||||
}
|
||||

||||
// Locate `ch` among the sorted keys of the current node.
let n = match current.keys.binary_search(&ch) {
|
||||
Ok(p) => p,
|
||||
Err(p) => {
|
||||
// Debug trace for the failed lookup.
eprintln!("No match for {ch} in {:?} (p={p})", current.keys);
|
||||
return None;
|
||||
}
|
||||
};
|
||||

||||
// Descend if a child exists; otherwise remember the index and stop.
current = match current.edges.get(n) {
|
||||
Some(Some(node)) => node,
|
||||
_ => {
|
||||
p = Some(n);
|
||||
break;
|
||||
}
|
||||
};
|
||||
}
|
||||

||||
// Debug trace of the node the walk stopped on.
eprintln!("current: {:?}", current);
|
||||
// `p?` returns `None` early when no index was recorded.
current.values.get(p?).copied().flatten()
|
||||
}
|
||||
|
||||
// Tests for the hand-rolled prefix tree (`build_tree` / `search_tree`).
#[cfg(test)]
|
||||
mod tree_tests {
|
||||
use super::*;
|
||||

||||
// Tokenizes a fixed input by alternating `search_tree` with skipping
// whitespace, and checks the exact token sequence.
#[test]
|
||||
fn test_tree() {
|
||||
// NOTE(review): `advance_while` presumably consumes chars while the
// predicate holds — confirm against `werkzeug`.
use werkzeug::iter::AdvanceWhile;
|
||||
let tree = build_tree();
|
||||
eprintln!("Tree: {tree:?}");
|
||||
// "+++" is expected to split into `PlusPlus` followed by `Plus`.
let mut tokens = "fn let void+++(++bool)".chars();
|
||||
assert_eq!(search_tree(&tree, &mut tokens), Some(Token::Fn));
|
||||
tokens.advance_while(|&c| crate::is_things::is_whitespace(c));
|
||||
assert_eq!(search_tree(&tree, &mut tokens), Some(Token::Let));
|
||||
tokens.advance_while(|&c| crate::is_things::is_whitespace(c));
|
||||
assert_eq!(search_tree(&tree, &mut tokens), Some(Token::Void));
|
||||
tokens.advance_while(|&c| crate::is_things::is_whitespace(c));
|
||||
assert_eq!(search_tree(&tree, &mut tokens), Some(Token::PlusPlus));
|
||||
tokens.advance_while(|&c| crate::is_things::is_whitespace(c));
|
||||
assert_eq!(search_tree(&tree, &mut tokens), Some(Token::Plus));
|
||||
tokens.advance_while(|&c| crate::is_things::is_whitespace(c));
|
||||
assert_eq!(search_tree(&tree, &mut tokens), Some(Token::OpenParens));
|
||||
tokens.advance_while(|&c| crate::is_things::is_whitespace(c));
|
||||
assert_eq!(search_tree(&tree, &mut tokens), Some(Token::PlusPlus));
|
||||
tokens.advance_while(|&c| crate::is_things::is_whitespace(c));
|
||||
assert_eq!(search_tree(&tree, &mut tokens), Some(Token::Bool));
|
||||
tokens.advance_while(|&c| crate::is_things::is_whitespace(c));
|
||||
assert_eq!(search_tree(&tree, &mut tokens), Some(Token::CloseParens));
|
||||
// Input is exhausted here.
assert_eq!(search_tree(&tree, &mut tokens), None);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Yields tokens by first skipping whitespace and then parsing one token from
/// the trie.
impl<I: Iterator<Item = char>> Iterator for LexemeIterator<I> {
|
||||
type Item = Token;
|
||||

||||
/// Returns whatever `parse` returns after leading whitespace has been
/// consumed; `None` therefore means `parse` found no token.
fn next(&mut self) -> Option<Self::Item> {
|
||||
// skip whitespace
|
||||
self.skip_whitespaces();
|
||||

||||
self.parse()
|
||||
}
|
||||
}
|
||||
// Tests for `LexemeIterator`.
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||

||||
// Drives the iterator over a fixed input and checks the exact token sequence,
// including that "+++" splits into `PlusPlus` followed by `Plus`.
#[test]
|
||||
fn test_iterator() {
|
||||
let mut tokens = "fn let void+++(++bool)".chars();
|
||||
// The lexer borrows the char iterator mutably rather than taking ownership.
let mut lexer = LexemeIterator::new(&mut tokens);
|
||||
assert_eq!(lexer.next(), Some(Token::Fn));
|
||||
assert_eq!(lexer.next(), Some(Token::Let));
|
||||
assert_eq!(lexer.next(), Some(Token::Void));
|
||||
assert_eq!(lexer.next(), Some(Token::PlusPlus));
|
||||
assert_eq!(lexer.next(), Some(Token::Plus));
|
||||
assert_eq!(lexer.next(), Some(Token::OpenParens));
|
||||
assert_eq!(lexer.next(), Some(Token::PlusPlus));
|
||||
assert_eq!(lexer.next(), Some(Token::Bool));
|
||||
assert_eq!(lexer.next(), Some(Token::CloseParens));
|
||||
// Input is exhausted here.
assert_eq!(lexer.next(), None);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue