Browse Source

start adding logos lexer

Getty Ritter 2 years ago
parent
commit
5a4ff1b1af
4 changed files with 161 additions and 0 deletions
  1. 43 0
      Cargo.lock
  2. 1 0
      Cargo.toml
  3. 116 0
      src/lexer.rs
  4. 1 0
      src/lib.rs

+ 43 - 0
Cargo.lock

@@ -46,6 +46,12 @@ version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
 
+[[package]]
+name = "beef"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bed554bd50246729a1ec158d08aa3235d1b69d94ad120ebe187e28894787e736"
+
 [[package]]
 name = "bit-set"
 version = "0.5.2"
@@ -137,6 +143,12 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d"
 
+[[package]]
+name = "fnv"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+
 [[package]]
 name = "getrandom"
 version = "0.2.3"
@@ -253,12 +265,37 @@ dependencies = [
  "cfg-if",
 ]
 
+[[package]]
+name = "logos"
+version = "0.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "427e2abca5be13136da9afdbf874e6b34ad9001dd70f2b103b083a85daa7b345"
+dependencies = [
+ "logos-derive",
+]
+
+[[package]]
+name = "logos-derive"
+version = "0.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "56a7d287fd2ac3f75b11f19a1c8a874a7d55744bd91f7a1b3e7cf87d4343c36d"
+dependencies = [
+ "beef",
+ "fnv",
+ "proc-macro2",
+ "quote",
+ "regex-syntax",
+ "syn",
+ "utf8-ranges",
+]
+
 [[package]]
 name = "matzo"
 version = "0.1.0"
 dependencies = [
  "lalrpop",
  "lalrpop-util",
+ "logos",
  "pretty_assertions",
  "rand",
  "regex",
@@ -527,6 +564,12 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
 
+[[package]]
+name = "utf8-ranges"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4ae116fef2b7fea257ed6440d3cfcff7f190865f170cdad00bb6465bf18ecba"
+
 [[package]]
 name = "wasi"
 version = "0.10.2+wasi-snapshot-preview1"

+ 1 - 0
Cargo.toml

@@ -22,6 +22,7 @@ path = "tools/regenerate.rs"
 regex = "1"
 rand = "*"
 lalrpop-util = { version = "*", features = ["lexer"] }
+# Constrained to the 0.12 series: src/lexer.rs relies on the `#[error]`
+# variant attribute, which Logos 0.13 removed.
+logos = "0.12"
 
 [build-dependencies.lalrpop]
 version = "*"

+ 116 - 0
src/lexer.rs

@@ -0,0 +1,116 @@
+use logos::{Lexer, Logos};
+
+/// Logos callback for the `Num` token: parses the matched digits as an `i64`.
+///
+/// Returns `None` when the literal does not fit in an `i64`; Logos then
+/// reports the match as the lexer's error token instead of a `Num`.
+fn parse_num<'a>(lex: &mut Lexer<'a, Token<'a>>) -> Option<i64> {
+    lex.slice().parse().ok()
+}
+
+/// Logos callback for the `Str` token: strips the surrounding single quotes
+/// and decodes backslash escapes.
+///
+/// `\n`, `\t` and `\r` become the corresponding control characters; any other
+/// escaped character stands for itself (so `\'` and `\\` yield `'` and `\`).
+/// Returns `None` if the contents end in a lone backslash.
+fn parse_str<'a>(lex: &mut Lexer<'a, Token<'a>>) -> Option<String> {
+    let s = lex.slice();
+    // The `Str` regex requires an opening and a closing `'`, each one byte
+    // wide, so this slice is in bounds and falls on character boundaries.
+    let inner = &s[1..s.len() - 1];
+    let mut buf = String::with_capacity(inner.len());
+    let mut src = inner.chars();
+    while let Some(c) = src.next() {
+        if c == '\\' {
+            // `?` bails out with `None` when nothing follows the backslash.
+            buf.push(match src.next()? {
+                'n' => '\n',
+                't' => '\t',
+                'r' => '\r',
+                other => other,
+            });
+        } else {
+            buf.push(c);
+        }
+    }
+    Some(buf)
+}
+
+/// Tokens produced by the lexer.
+///
+/// Variants carrying a `&'a str` borrow their text from the input being lexed.
+#[derive(Logos, Debug, PartialEq)]
+enum Token<'a> {
+    // Bracketing punctuation.
+    #[token("<")]
+    LAngle,
+    #[token(">")]
+    RAngle,
+
+    #[token("(")]
+    LPar,
+    #[token(")")]
+    RPar,
+
+    #[token("{")]
+    LCurl,
+    #[token("}")]
+    RCurl,
+
+    // Separators and operators.
+    #[token("|")]
+    Pipe,
+
+    #[token(":")]
+    Colon,
+
+    #[token(",")]
+    Comma,
+
+    #[token(";")]
+    Semi,
+
+    #[token(".")]
+    Dot,
+
+    #[token("..")]
+    DotDot,
+
+    #[token(":=")]
+    Assn,
+
+    #[token("::=")]
+    LitAssn,
+
+    // Keywords.
+    #[token("puts")]
+    Puts,
+
+    #[token("case")]
+    Case,
+
+    #[token("let")]
+    Let,
+
+    #[token("in")]
+    In,
+
+    /// An identifier starting with a lowercase letter, followed by any mix of
+    /// letters, ASCII digits, `_` and `-`.
+    #[regex(r"\p{Ll}(\pL|[0-9_-])*")]
+    Var(&'a str),
+
+    /// An identifier starting with an uppercase letter, followed by any mix of
+    /// letters, ASCII digits, `_` and `-`.
+    #[regex(r"\p{Lu}(\pL|[0-9_-])*")]
+    Atom(&'a str),
+
+    /// An unsigned decimal integer literal, converted by `parse_num`.
+    #[regex(r"[0-9]+", parse_num)]
+    Num(i64),
+
+    /// A single-quoted string literal with its escapes decoded by `parse_str`.
+    #[regex(r"'([^'\\]|\\.)*'", parse_str)]
+    Str(String),
+
+    /// Produced for input that matches no other rule.
+    ///
+    /// The two skip rules below discard whitespace and `(* ... *)` comments.
+    /// Inside a comment body a run of stars may not be followed by `)`, and
+    /// the terminator accepts any number of stars before the `)`, so comments
+    /// such as `(* a **)` and `(***)` are recognised as a single comment.
+    #[error]
+    #[regex(r"[ \t\n\f]+", logos::skip)]
+    #[regex(r"\(\*([^*]|\*+[^*)])*\*+\)", logos::skip)]
+    Error,
+}
+
+#[cfg(test)]
+mod test {
+    use logos::Logos;
+
+    use super::Token;
+
+    /// Lexes a small rule and checks the complete token stream, including
+    /// that whitespace and the `(* ... *)` comment yield no tokens.
+    #[test]
+    fn simple_lexer_test() {
+        let tokens: Vec<_> = Token::lexer("x := Foo (* ignore *) | 'bar';").collect();
+        let expected = vec![
+            Token::Var("x"),
+            Token::Assn,
+            Token::Atom("Foo"),
+            Token::Pipe,
+            Token::Str(String::from("bar")),
+            Token::Semi,
+        ];
+        assert_eq!(tokens, expected);
+    }
+}

+ 1 - 0
src/lib.rs

@@ -1,6 +1,7 @@
 #[macro_use]
 extern crate lalrpop_util;
 
+pub mod lexer;
 pub mod ast;
 pub mod interp;