lexer.rs

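//! A lexer built with the `logos` derive macro: token rules are attached as
//! attributes on the `Token` enum below, and `tokens` adapts the raw `logos`
//! stream into spanned results.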
pub use crate::core::{FileRef, Loc, Span};
use logos::{Lexer, Logos};
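
/// A value of type `T` paired with the source location it came from.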
#[derive(Debug, Clone, Copy)]
pub struct Located<T> {
    pub item: T,
    pub loc: Loc,
}

impl<T> Located<T> {
    pub fn new(item: T, file: FileRef, span: Span) -> Located<T> {
        Located {
            loc: Loc { file, span },
            item,
        }
    }
}
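
// `map` clones the contained item before applying `func`, keeping the
// location intact; hence the extra `T: Clone` bound on this impl.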
impl<T: Clone> Located<T> {
    pub fn map<R>(&self, func: impl FnOnce(T) -> R) -> Located<R> {
        Located {
            item: func(self.item.clone()),
            loc: self.loc,
        }
    }
}
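
/// `logos` callback for `Num`: parse the matched digits as an `i64`,
/// returning `None` if the literal does not fit.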
fn parse_num<'a>(lex: &mut Lexer<'a, Token<'a>>) -> Option<i64> {
    let slice = lex.slice();
    slice.parse().ok()
}
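
/// `logos` callback for `Str`: strip the surrounding quotes and process
/// backslash escapes (`\n`, `\t`, `\r`; any other escaped character stands
/// for itself). A trailing lone backslash yields `None`.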
fn parse_str<'a>(lex: &mut Lexer<'a, Token<'a>>) -> Option<String> {
    let mut buf = String::new();
    let s = lex.slice();
    let mut src = s[1..s.len() - 1].chars();
    while let Some(c) = src.next() {
        if c == '\\' {
            match src.next() {
                Some('n') => buf.push('\n'),
                Some('t') => buf.push('\t'),
                Some('r') => buf.push('\r'),
                Some(c) => buf.push(c),
                None => return None,
            }
        } else {
            buf.push(c);
        }
    }
    Some(buf)
}
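
/// The tokens of the language. Whitespace and `(* ... *)` comments are
/// skipped rather than producing tokens; any input that matches no rule
/// lexes as `Error`.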
#[derive(Logos, Debug, PartialEq, Clone)]
pub enum Token<'a> {
    #[token("<")]
    LAngle,
    #[token(">")]
    RAngle,
    #[token("(")]
    LPar,
    #[token(")")]
    RPar,
    #[token("{")]
    LCurl,
    #[token("}")]
    RCurl,
    #[token("[")]
    LBrac,
    #[token("]")]
    RBrac,
    #[token("|")]
    Pipe,
    #[token(":")]
    Colon,
    #[token(",")]
    Comma,
    #[token(";")]
    Semi,
    #[token(".")]
    Dot,
    #[token("_")]
    Underscore,
    #[token("..")]
    DotDot,
    #[token("=>")]
    Arrow,
    #[token(":=")]
    Assn,
    #[token("::=")]
    LitAssn,
    #[token("puts")]
    Puts,
    #[token("case")]
    Case,
    #[token("let")]
    Let,
    #[token("in")]
    In,
    #[token("fix")]
    Fix,

    #[regex(r"\p{Ll}(\pL|[0-9_/-])*")]
    Var(&'a str),
    #[regex(r"\p{Lu}(\pL|[0-9_/-])*")]
    Atom(&'a str),
    #[regex(r"[0-9]+", parse_num)]
    Num(i64),
    #[regex(r"'([^'\\]|\\.)*'", parse_str)]
    #[regex("\"([^\"\\\\]|\\\\.)*\"", parse_str)]
    Str(String),

    #[error]
    #[regex(r"[ \t\n\f]+", logos::skip)]
    #[regex(r"\(\*([^*]|\*[^)])*\*\)", logos::skip)]
    Error,
}

impl<'a> Token<'a> {
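    /// A human-readable description of the token, suitable for error
    /// messages: literal tokens are quoted in backticks.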
    pub fn token_name(&self) -> String {
        match self {
            Token::Var(v) => format!("variable `{}`", v),
            Token::Atom(a) => format!("atom `{}`", a),
            Token::Num(n) => format!("number `{}`", n),
            Token::Str(s) => format!("string `{}`", s),
            Token::LAngle => "`<`".to_string(),
            Token::RAngle => "`>`".to_string(),
            Token::LPar => "`(`".to_string(),
            Token::RPar => "`)`".to_string(),
            Token::LCurl => "`{`".to_string(),
            Token::RCurl => "`}`".to_string(),
            Token::LBrac => "`[`".to_string(),
            Token::RBrac => "`]`".to_string(),
            Token::Pipe => "`|`".to_string(),
            Token::Colon => "`:`".to_string(),
            Token::Comma => "`,`".to_string(),
            Token::Semi => "`;`".to_string(),
            Token::Dot => "`.`".to_string(),
            Token::Underscore => "`_`".to_string(),
            Token::DotDot => "`..`".to_string(),
            Token::Arrow => "`=>`".to_string(),
            Token::Assn => "`:=`".to_string(),
            Token::LitAssn => "`::=`".to_string(),
            Token::Puts => "`puts`".to_string(),
            Token::Case => "`case`".to_string(),
            Token::Let => "`let`".to_string(),
            Token::In => "`in`".to_string(),
            Token::Fix => "`fix`".to_string(),
            Token::Error => "error".to_string(),
        }
    }
}
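
/// The error produced when the lexer hits a span it cannot tokenize.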
#[derive(Debug)]
pub struct LexerError {
    pub range: Span,
}

impl std::fmt::Display for LexerError {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "LexerError({}..{})", self.range.start, self.range.end)
    }
}
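
/// A token plus its start and end offsets, or a lexing failure. (This
/// `Result<(Loc, Tok, Loc), Error>` shape matches the convention LALRPOP's
/// external-lexer interface expects; that it is the consumer here is our
/// assumption.)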
pub type Spanned<Tok, Loc, Error> = Result<(Loc, Tok, Loc), Error>;
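
/// Lex `source`, mapping each `Token::Error` to a `LexerError` carrying its
/// span and every other token to an `Ok` spanned triple.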
pub fn tokens(source: &str) -> impl Iterator<Item = Spanned<Token<'_>, usize, LexerError>> {
    Token::lexer(source)
        .spanned()
        .map(move |(token, range)| match token {
            Token::Error => Err(LexerError {
                range: Span {
                    start: range.start as u32,
                    end: range.end as u32,
                },
            }),
            token => Ok((range.start, token, range.end)),
        })
}
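
// A usage sketch, not part of the original file: the name `debug_tokens` is
// ours, for illustration only. It walks the spanned stream produced by
// `tokens` and prints each token via `token_name`.
#[allow(dead_code)]
fn debug_tokens(source: &str) {
    for spanned in tokens(source) {
        match spanned {
            // `start..end` are byte offsets into `source`.
            Ok((start, token, end)) => println!("{}..{}: {}", start, end, token.token_name()),
            Err(err) => println!("{}", err),
        }
    }
}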

#[cfg(test)]
mod test {
    use logos::Logos;
    use super::Token;

    #[test]
    fn simple_lexer_test() {
        let mut lex = Token::lexer("x := Foo (* ignore *) | \"bar\";");
        assert_eq!(lex.next(), Some(Token::Var("x")));
        assert_eq!(lex.next(), Some(Token::Assn));
        assert_eq!(lex.next(), Some(Token::Atom("Foo")));
        assert_eq!(lex.next(), Some(Token::Pipe));
        assert_eq!(lex.next(), Some(Token::Str("bar".to_owned())));
        assert_eq!(lex.next(), Some(Token::Semi));
        assert_eq!(lex.next(), None)
    }
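
    // A hedged addition, not in the original file: these cases exercise the
    // escape handling in `parse_str` and the fallback `Error` token (`?`
    // matches none of the token rules above).
    #[test]
    fn escapes_and_error_test() {
        let mut lex = Token::lexer(r"'a\nb'");
        assert_eq!(lex.next(), Some(Token::Str("a\nb".to_owned())));
        assert_eq!(lex.next(), None);

        let mut lex = Token::lexer("?");
        assert_eq!(lex.next(), Some(Token::Error));
    }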
}