// lexer.rs (3.8 KB)
  use logos::{Lexer, Logos};
  /// Lightweight, copyable handle that names a source file by index.
  ///
  /// NOTE(review): presumably `idx` indexes a file table owned elsewhere in the
  /// project; nothing in this module reads it — confirm against callers.
  #[derive(Debug, Copy, Clone, PartialEq, Eq)]
  pub struct FileRef {
      /// Numeric index identifying the file.
      pub idx: usize,
  }
  /// A location in a source file, stored as a `start`/`end` pair of `u32` offsets.
  ///
  /// The value with both fields equal to `u32::MAX` is reserved as the
  /// "no location" sentinel; see [`Span::empty`] and [`Span::exists`].
  #[derive(Debug, Clone, Copy)]
  pub struct Span {
      /// Offset where the span begins (presumably a byte offset — confirm with callers).
      pub start: u32,
      /// Offset where the span ends.
      pub end: u32,
  }

  impl Span {
      /// Sentinel stored in both fields of a span that has no real location.
      const MISSING: u32 = u32::MAX;

      /// Returns the sentinel span that stands for "no location".
      pub fn empty() -> Span {
          Span {
              start: Self::MISSING,
              end: Self::MISSING,
          }
      }

      /// Returns `true` only when neither `start` nor `end` holds the sentinel value.
      pub fn exists(&self) -> bool {
          self.start != Self::MISSING && self.end != Self::MISSING
      }
  }
  23. #[derive(Debug, Clone, Copy)]
  24. pub struct Located<T> {
  25. pub item: T,
  26. pub span: Span,
  27. pub file: FileRef,
  28. }
  29. impl<T> Located<T> {
  30. pub fn new(item: T, file: FileRef, span: Span) -> Located<T> {
  31. Located { span, file, item }
  32. }
  33. }
  34. impl<T: Clone> Located<T> {
  35. pub fn map<R>(&self, func: impl FnOnce(T) -> R) -> Located<R> {
  36. Located {
  37. item: func(self.item.clone()),
  38. span: self.span,
  39. file: self.file,
  40. }
  41. }
  42. }
  43. fn parse_num<'a>(lex: &mut Lexer<'a, Token<'a>>) -> Option<i64> {
  44. let slice = lex.slice();
  45. slice.parse().ok()
  46. }
  47. fn parse_str<'a>(lex: &mut Lexer<'a, Token<'a>>) -> Option<String> {
  48. let mut buf = String::new();
  49. let s = lex.slice();
  50. let mut src = s[1..s.len() - 1].chars();
  51. while let Some(c) = src.next() {
  52. if c == '\\' {
  53. match src.next() {
  54. Some('n') => buf.push('\n'),
  55. Some('t') => buf.push('\t'),
  56. Some('r') => buf.push('\r'),
  57. Some(c) => buf.push(c),
  58. None => return None,
  59. }
  60. } else {
  61. buf.push(c);
  62. }
  63. }
  64. Some(buf)
  65. }
  66. #[derive(Logos, Debug, PartialEq, Clone)]
  67. pub enum Token<'a> {
  68. #[token("<")]
  69. LAngle,
  70. #[token(">")]
  71. RAngle,
  72. #[token("(")]
  73. LPar,
  74. #[token(")")]
  75. RPar,
  76. #[token("{")]
  77. LCurl,
  78. #[token("}")]
  79. RCurl,
  80. #[token("[")]
  81. LBrac,
  82. #[token("]")]
  83. RBrac,
  84. #[token("|")]
  85. Pipe,
  86. #[token(":")]
  87. Colon,
  88. #[token(",")]
  89. Comma,
  90. #[token(";")]
  91. Semi,
  92. #[token(".")]
  93. Dot,
  94. #[token("_")]
  95. Underscore,
  96. #[token("..")]
  97. DotDot,
  98. #[token("=>")]
  99. Arrow,
  100. #[token(":=")]
  101. Assn,
  102. #[token("::=")]
  103. LitAssn,
  104. #[token("puts")]
  105. Puts,
  106. #[token("case")]
  107. Case,
  108. #[token("let")]
  109. Let,
  110. #[token("in")]
  111. In,
  112. #[token("fix")]
  113. Fix,
  114. #[regex(r"\p{Ll}(\pL|[0-9_/-])*")]
  115. Var(&'a str),
  116. #[regex(r"\p{Lu}(\pL|[0-9_/-])*")]
  117. Atom(&'a str),
  118. #[regex(r"[0-9]+", parse_num)]
  119. Num(i64),
  120. #[regex(r"'([^'\\]|\\.)*'", parse_str)]
  121. #[regex("\"([^\"\\\\]|\\\\.)*\"", parse_str)]
  122. Str(String),
  123. #[error]
  124. #[regex(r"[ \t\n\f]+", logos::skip)]
  125. #[regex(r"\(\*([^*]|\*[^)])*\*\)", logos::skip)]
  126. Error,
  127. }
  /// Error yielded by [`tokens`] when the lexer produces `Token::Error`.
  ///
  /// It carries no detail; the position of the failure is not recorded in
  /// this type.
  #[derive(Debug)]
  pub struct LexerError;

  impl std::fmt::Display for LexerError {
      /// Writes the fixed text `LexerError`.
      fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
          f.write_str("LexerError")
      }
  }

  // Implementing the standard error trait lets callers box this value as
  // `Box<dyn std::error::Error>` and propagate it with `?`.
  impl std::error::Error for LexerError {}
  /// Result of lexing one token: `Ok((start, token, end))` on success, or the
  /// lexer's error type on failure.
  pub type Spanned<Tok, Loc, Error> = Result<(Loc, Tok, Loc), Error>;
  136. pub fn tokens(source: &str) -> impl Iterator<Item = Spanned<Token<'_>, usize, LexerError>> {
  137. Token::lexer(source)
  138. .spanned()
  139. .map(move |(token, range)| match token {
  140. Token::Error => Err(LexerError),
  141. token => Ok((range.start, token, range.end)),
  142. })
  143. }
  #[cfg(test)]
  mod test {
      use super::Token;
      use logos::Logos;

      /// A short statement lexes to the expected token sequence, with the
      /// block comment and whitespace skipped.
      #[test]
      fn simple_lexer_test() {
          let mut lex = Token::lexer("x := Foo (* ignore *) | \"bar\";");
          assert_eq!(lex.next(), Some(Token::Var("x")));
          assert_eq!(lex.next(), Some(Token::Assn));
          assert_eq!(lex.next(), Some(Token::Atom("Foo")));
          assert_eq!(lex.next(), Some(Token::Pipe));
          assert_eq!(lex.next(), Some(Token::Str("bar".to_owned())));
          assert_eq!(lex.next(), Some(Token::Semi));
          assert_eq!(lex.next(), None);
      }

      /// String escapes are decoded and digit runs become numbers.
      #[test]
      fn literal_lexer_test() {
          let mut lex = Token::lexer(r"'a\nb' 42");
          assert_eq!(lex.next(), Some(Token::Str("a\nb".to_owned())));
          assert_eq!(lex.next(), Some(Token::Num(42)));
          assert_eq!(lex.next(), None);
      }
  }