// We (at least initially) use logos and winnow
use logos::{Lexer, Logos};
use std::num::{ParseFloatError, ParseIntError};
// Use tiny winnow parsers to parse intra-element stuff
use winnow::{
ascii::{digit1, hex_digit1},
combinator::{alt, delimited, opt, preceded, repeat},
token::{any, one_of},
PResult, Parser,
};
#[derive(thiserror::Error, Clone, PartialEq, Default, Debug)]
pub enum LexError {
#[default]
#[error("unknown")]
Unknown,
}
fn symbol_element_parse(input: &mut &str) -> PResult<Vec<char>> {
repeat(
0..,
alt((
delimited(r"\x", hex_digit1, r";").try_map(|s: &str| {
Ok::<_, ParseIntError>(
char::from_u32(u32::from_str_radix(s, 16)?)
.unwrap_or(char::REPLACEMENT_CHARACTER),
)
}),
preceded(r"\", one_of(['a', 'b', 't', 'n', 'r'])).map(|c| match c {
'a' => 7 as char,
'b' => 8 as char,
't' => 9 as char,
'n' => 0xa as char,
'r' => 0xd as char,
_ => unreachable!(r"Scheme R7RS only handles \a, \b, \t, \n, and \r"),
}),
any,
)),
)
.parse_next(input)
}
/// Lexer callback for delimited identifiers: strips the first and last byte
/// of the matched slice (the surrounding delimiters) and decodes the escapes
/// in between with `symbol_element_parse`.
///
/// # Errors
///
/// Returns [`LexError::Unknown`] when the slice is too short to carry both
/// delimiters, or when the body cannot be parsed completely.
fn parse_to_symbol_elements(lex: &mut Lexer<Token>) -> Result<String, LexError> {
    let slice = lex.slice();
    // Checked slicing: a malformed slice becomes a lex error, not a panic.
    let target = slice
        .len()
        .checked_sub(1)
        .and_then(|end| slice.get(1..end))
        .ok_or(LexError::Unknown)?;
    symbol_element_parse
        .parse(target)
        .map(|chars| chars.into_iter().collect())
        // TODO A proper lex error (for ident symbols!)
        .map_err(|_| LexError::Unknown)
}
/// Decodes the body of a string literal (quotes already removed) into its
/// characters.
///
/// Each position is tried against, in order:
/// 1. an inline hex escape `\x<hex digits>;` (a value that is not a valid
///    scalar becomes `char::REPLACEMENT_CHARACTER`),
/// 2. a single-character escape: `\a`, `\b`, `\t`, `\n`, `\r`, `\"`, `\\`, `\|`,
/// 3. a line continuation: a backslash, optional blanks, a newline, optional
///    blanks — which contributes no characters at all,
/// 4. any other single character, taken verbatim.
///
/// The backslash is a required part of the line continuation. A raw newline
/// inside a string (and the blanks around it) is therefore preserved, and a
/// continuation does not leave its backslash behind in the result.
fn string_element_parse(input: &mut &str) -> PResult<Vec<char>> {
    repeat(
        0..,
        alt((
            delimited(r"\x", hex_digit1, r";")
                .try_map(|s: &str| {
                    Ok::<_, ParseIntError>(
                        char::from_u32(u32::from_str_radix(s, 16)?)
                            .unwrap_or(char::REPLACEMENT_CHARACTER),
                    )
                })
                .map(Some),
            preceded(r"\", one_of(['a', 'b', 't', 'n', 'r', '"', '\\', '|']))
                .map(|c| match c {
                    'a' => 7 as char,
                    'b' => 8 as char,
                    't' => 9 as char,
                    'n' => 0xa as char,
                    'r' => 0xd as char,
                    '"' => 0x22 as char,
                    '\\' => 0x5c as char,
                    '|' => 0x7c as char,
                    _ => unreachable!(
                        r#"Scheme R7RS strings only handle \a, \b, \t, \n, \r, \", \\, and \|"#
                    ),
                })
                .map(Some),
            // Line continuation; only valid when introduced by a backslash.
            preceded(
                r"\",
                delimited(
                    repeat::<_, _, (), _, _>(0.., one_of([' ', '\t', '\r'])),
                    '\n'.void(),
                    repeat::<_, _, (), _, _>(0.., one_of([' ', '\t', '\r'])),
                ),
            )
            .map(|_| None),
            any.map(Some),
        )),
    )
    // Drop the `None`s left behind by line continuations.
    .map(|vc: Vec<Option<char>>| vc.into_iter().flatten().collect())
    .parse_next(input)
}
/// Lexer callback for string literals: strips the first and last byte of the
/// matched slice (the surrounding quotes) and decodes the escapes in between
/// with `string_element_parse`.
///
/// # Errors
///
/// Returns [`LexError::Unknown`] when the slice is too short to carry both
/// quotes, or when the body cannot be parsed completely.
fn parse_to_string(lex: &mut Lexer<Token>) -> Result<String, LexError> {
    let slice = lex.slice();
    // Checked slicing: a malformed slice becomes a lex error, not a panic.
    let target = slice
        .len()
        .checked_sub(1)
        .and_then(|end| slice.get(1..end))
        .ok_or(LexError::Unknown)?;
    string_element_parse
        .parse(target)
        .map(|chars| chars.into_iter().collect())
        // TODO A proper lex error
        .map_err(|_| LexError::Unknown)
}
/// Parses an integer literal into `(base, exponent)`.
///
/// Accepted shape: an optional `+`/`-` sign, digits that may be interleaved
/// with `_` separators, and an optional `e` exponent that may carry a `+`
/// but never a `-`. Separators are removed before the numeric conversion;
/// when no exponent is written it is reported as `0`.
fn integer_number_parser(input: &mut &str) -> PResult<(i64, u32)> {
    // One or more runs of digits and `_`; only the digits are kept.
    let digit_run = || {
        repeat(1.., alt((digit1.map(Some), '_'.map(|_| None))))
            .map(|pieces: Vec<Option<&str>>| pieces.into_iter().flatten().collect::<String>())
    };
    // Sign and digits are re-joined into text that `str::parse` understands.
    let mantissa =
        (opt(one_of(['-', '+'])), digit_run()).map(|(sign, digits): (_, String)| match sign {
            Some(sign) => format!("{sign}{digits}"),
            None => digits,
        });
    let exponent = preceded("e", (opt('+').void(), digit_run()));
    (mantissa, opt(exponent))
        .try_map(|(base_text, maybe_exp)| {
            let base = base_text.parse::<i64>()?;
            let exponent = match maybe_exp {
                Some((_, digits)) => digits.parse::<u32>()?,
                None => 0,
            };
            Ok::<_, ParseIntError>((base, exponent))
        })
        .parse_next(input)
}
/// Lexer callback for integer literals: runs `integer_number_parser` over the
/// whole matched slice and returns its `(base, exponent)` pair.
///
/// # Errors
///
/// Returns [`LexError::Unknown`] when the slice is not fully consumed or a
/// numeric conversion fails (for example on overflow).
fn parse_integer(lexer: &mut Lexer<Token>) -> Result<(i64, u32), LexError> {
    integer_number_parser
        .parse(lexer.slice())
        // TODO A proper lex error
        .map_err(|_| LexError::Unknown)
}
#[derive(thiserror::Error, Debug)]
pub enum DecimalParseError {
#[error(transparent)]
FloatError(#[from] ParseFloatError),
#[error(transparent)]
IntError(#[from] ParseIntError),
}
/// Parses a decimal literal into `(base, exponent)`.
///
/// Accepted shape: an optional `+`/`-` sign; then either digits, an optional
/// integer part followed by `.` and a fractional part, all of which may be
/// interleaved with `_` separators; then an optional `e` exponent with its
/// own optional sign. Separators are removed before conversion; when no
/// exponent is written it is reported as `0`.
fn decimal_number_parser(input: &mut &str) -> PResult<(f64, i32)> {
    // One or more runs of digits and `_`; only the digits are kept.
    let digit_run = || {
        repeat(1.., alt((digit1.map(Some), '_'.map(|_| None))))
            .map(|pieces: Vec<Option<&str>>| pieces.into_iter().flatten().collect::<String>())
    };
    let optional_sign = || opt(one_of(['-', '+']));
    // `<digits>.<digits>` or `.<digits>`, rebuilt without separators.
    let with_point = || {
        (opt(digit_run()), '.', digit_run()).map(|(whole, _, fraction)| match whole {
            Some(whole) => format!("{whole}.{fraction}"),
            None => format!(".{fraction}"),
        })
    };
    // The pointed form is tried first so that `1.5` is not cut short at `1`.
    let mantissa = (optional_sign(), alt((with_point(), digit_run()))).map(
        |(sign, digits): (_, String)| match sign {
            Some(sign) => format!("{sign}{digits}"),
            None => digits,
        },
    );
    let exponent = preceded("e", (optional_sign(), digit_run()));
    (mantissa, opt(exponent))
        .try_map(|(base_text, maybe_exp)| {
            let base = base_text.parse::<f64>()?;
            let exponent = match maybe_exp {
                Some((Some(sign), digits)) => format!("{sign}{digits}").parse::<i32>()?,
                Some((None, digits)) => digits.parse::<i32>()?,
                None => 0,
            };
            Ok::<_, DecimalParseError>((base, exponent))
        })
        .parse_next(input)
}
/// Lexer callback for decimal literals: runs `decimal_number_parser` over the
/// whole matched slice and returns its `(base, exponent)` pair.
///
/// # Errors
///
/// Returns [`LexError::Unknown`] when the slice is not fully consumed or a
/// numeric conversion fails.
fn parse_float(lexer: &mut Lexer<Token>) -> Result<(f64, i32), LexError> {
    decimal_number_parser
        .parse(lexer.slice())
        // TODO A proper lex error
        .map_err(|_| LexError::Unknown)
}
/// Tokens produced by the lexer.
///
/// Lexing failures are reported as `LexError` (see the `#[logos(error = ...)]`
/// attribute below). Variants with a payload get it from the callback named in
/// their `#[regex]` attribute.
#[derive(Debug, Clone, PartialEq, Logos)]
#[logos(error = LexError)]
pub enum Token {
/// Opening parenthesis `(`.
#[token("(")]
LParen,
/// Closing parenthesis `)`.
#[token(")")]
RParen,
/// One or more spaces, tabs or carriage returns. Newlines are `LineEnd`.
#[regex(r"[ \t\r]+")]
Whitespace,
/// An identifier.
///
/// The first pattern matches bare identifiers and stores the matched text
/// verbatim. The second matches `|...|`-delimited identifiers, whose escapes
/// are decoded by `parse_to_symbol_elements`.
// NOTE(review): the delimited pattern has no `\|` alternative, so a
// backslash-escaped bar ends the identifier — confirm whether that is intended.
#[regex(r"[A-Za-z!$%&*+\-./:<=>?@^_-][A-Za-z0-9!$%&*+\-./:<=>?@^_-]*", |lex| lex.slice().to_string())]
#[regex(r"\|(?:\\x[0-9a-fA-F]+;|\\[abtnr]|[^|])*\|", parse_to_symbol_elements)]
Identifier(String),
/// A double-quoted string literal; the payload is the content with escapes
/// decoded by `parse_to_string`.
#[regex(
r#""(?:\\x[0-9a-fA-F]+;|\\[abtnr"\\|]|\\[ \t\r]*\n[ \t\r]*|[^"])*""#,
parse_to_string
)]
String(String),
// TODO lexers for num, complex, real
/// An integer literal as `(base, exponent)`, produced by `parse_integer`.
/// The exponent is `0` when the literal has no `e` part; a written exponent
/// may carry `+` but not `-` (a negative exponent lexes as `Decimal`).
#[regex(r"[+-]?[0-9][0-9_]*", priority = 3, callback = parse_integer)]
#[regex(r"[+-]?[0-9][0-9_]*e\+?[0-9_]+", callback = parse_integer)]
Integer((i64, u32)),
/// A decimal literal as `(base, exponent)`, produced by `parse_float`.
/// Covers forms with a `.` (with or without an integer part, with or without
/// an exponent) and digit-only forms with a negative exponent.
#[regex(r"[+-]?[0-9][0-9_]*\.[0-9][0-9_]*", parse_float)]
#[regex(r"[+-]?[0-9][0-9_]*\.[0-9][0-9_]*e[+-]?[0-9_]+", parse_float)]
#[regex(r"[+-]?\.[0-9][0-9_]*", parse_float)]
#[regex(r"[+-]?\.[0-9][0-9_]*e[+-]?[0-9_]+", parse_float)]
#[regex(r"[+-]?[0-9][0-9_]*e\-[0-9_]+", parse_float)] // Negative exponent
Decimal((f64, i32)),
/// The literal `+inf.0`.
#[token("+inf.0")]
InfinityPos,
/// The literal `-inf.0`.
// NOTE(review): the variant name is misspelled ("Inifinity"); renaming it
// would break existing users of this enum.
#[token("-inf.0")]
InifinityNeg,
/// The literal `+nan.0`.
#[token("+nan.0")]
NotANumberPos,
/// The literal `-nan.0`.
#[token("-nan.0")]
NotANumberNeg,
/// The literal `+i`.
#[token("+i")]
ImaginaryPos,
/// The literal `-i`.
#[token("-i")]
ImaginaryNeg,
/// A lone `.`; the explicit priority settles the overlap with the bare
/// identifier pattern, which can also match a single `.`.
#[token(".", priority = 3)]
Dot,
/// The `'` character.
#[token("'")]
QuoteSymbol,
/// The backtick character.
#[token("`")]
QuasiquoteSymbol,
/// The `,` character.
#[token(",")]
UnquoteSymbol,
/// The `,@` sequence.
#[token(",@")]
UnquoteSplicingSymbol,
/// The `;` character. Only the marker is lexed here; the rest of the line is
/// tokenized as usual.
#[token(";")]
LineCommentStart,
/// One or more consecutive newline characters.
#[regex(r"\n+")]
LineEnd,
/// The `#;` sequence.
#[token("#;")]
DatumCommentStart,
/// The `#|` sequence.
#[token("#|")]
BlockCommentStart,
/// The `|#` sequence.
#[token("|#")]
BlockCommentEnd,
}
pub fn lexer(input: &str) -> Lexer<'_, Token> {
Token::lexer(input)
}
#[cfg(test)]
mod tests {
    use super::Token;
    use assert2::assert;
    use logos::Logos;

    /// Every decimal form lexes to the expected value, span and slice.
    #[test]
    fn lex_decimal() {
        let source = r"0.0 -0.0e0 -.0 .0e0 0e-1";
        let data = [
            ((0.0, 0), (0..3), "0.0"),
            ((-0.0, 0), (4..10), "-0.0e0"),
            ((-0.0, 0), (11..14), "-.0"),
            ((0.0, 0), (15..19), ".0e0"),
            ((0.0, -1), (20..24), "0e-1"),
        ];
        let decimals: Vec<_> = Token::lexer(source)
            .spanned()
            .enumerate()
            // Skip all odd-indexed tokens (it's whitespace)
            .filter(|(idx, _)| idx % 2 == 0)
            .map(|(_, spanned)| spanned)
            .collect();
        // `zip` stops at the shorter side, so without this check a lexer that
        // yields too few (or too many) tokens would go unnoticed.
        assert!(decimals.len() == data.len());
        for ((tok, span), (parsed, tspan, tslice)) in decimals.into_iter().zip(data.into_iter()) {
            let slice = &source[span.clone()];
            assert!(tok == Ok(Token::Decimal(parsed)));
            assert!(span == tspan);
            assert!(slice == tslice);
        }
    }

    /// A mnemonic escape and the matching hex escape decode to the same
    /// identifier even though their source text differs.
    #[test]
    fn literal_identifier_equivalence() {
        let source = r"|\t\t||\x9;\x9;|";
        let mut lexer = Token::lexer(source);
        let tok1 = lexer.next();
        let tok1_slice = lexer.slice();
        let tok2 = lexer.next();
        let tok2_slice = lexer.slice();
        // Pin the decoded value: comparing the two tokens with each other
        // would also succeed if both were the same error.
        assert!(tok1 == Some(Ok(Token::Identifier("\t\t".to_string()))));
        assert!(tok2 == Some(Ok(Token::Identifier("\t\t".to_string()))));
        assert!(tok1_slice != tok2_slice);
    }

    /// Hex escapes inside a `|...|` identifier are decoded.
    #[test]
    fn lex_literal_identifier() {
        let source = r"|H\x65;llo|";
        let mut lexer = Token::lexer(source);
        assert!(lexer.next() == Some(Ok(Token::Identifier("Hello".to_string()))));
        assert!(lexer.span() == (0..source.len()));
        assert!(lexer.slice() == "|H\\x65;llo|");
    }

    /// A digit prefix splits off as an integer, and `-i` is its own token.
    #[test]
    fn lex_identifier_or_number() {
        let source = "3let -i";
        let mut lexer = Token::lexer(source);
        assert!(lexer.next() == Some(Ok(Token::Integer((3, 0)))));
        assert!(lexer.span() == (0..1));
        assert!(lexer.slice() == "3");
        assert!(lexer.next() == Some(Ok(Token::Identifier("let".to_string()))));
        assert!(lexer.span() == (1..4));
        assert!(lexer.slice() == "let");
        assert!(lexer.next() == Some(Ok(Token::Whitespace)));
        assert!(lexer.span() == (4..5));
        assert!(lexer.slice() == " ");
        assert!(lexer.next() == Some(Ok(Token::ImaginaryNeg)));
        assert!(lexer.span() == (5..7));
        assert!(lexer.slice() == "-i");
    }

    /// Bare identifiers may start with and contain special characters.
    #[test]
    fn lex_identifier() {
        let source = "+soup+ $?bama";
        let mut lexer = Token::lexer(source);
        assert!(lexer.next() == Some(Ok(Token::Identifier("+soup+".to_string()))));
        assert!(lexer.span() == (0..6));
        assert!(lexer.slice() == "+soup+");
        assert!(lexer.next() == Some(Ok(Token::Whitespace)));
        assert!(lexer.span() == (6..7));
        assert!(lexer.slice() == " ");
        assert!(lexer.next() == Some(Ok(Token::Identifier("$?bama".to_string()))));
        assert!(lexer.span() == (7..13));
        assert!(lexer.slice() == "$?bama");
    }
}