Add a regular expression engine (#222)

* Add a regular expression engine

* Fix off by one error

* Add + quantifier

* Add more tests

* Use assert_eq instead of assert in tests

* Rewrite tests with an array

* Fix bug in is_match_star

* Use the same do while equivalent in is_match_plus

* Add ? quantifier

* Refactor engine code

* Add backslash char

* Group ifs in match

* Add special escaped chars

* Add doc

* Add find command

* Add Match#find

* Show multiple matches in the same line

* Dry Regex

* Change matches color

* Add greedy version of matching by default

* Add MetaChar enum to fix matching escaped chars

* Change function signatures

* Remove macro_export

* Add TODO

* Find matching lines recursively

* Handle special patterns
This commit is contained in:
Vincent Ollivier 2021-08-01 11:35:24 +02:00 committed by GitHub
parent 72f9baae6c
commit de48c87e0c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 447 additions and 1 deletions

21
doc/regex.md Normal file
View File

@ -0,0 +1,21 @@
# MOROS Regular Expression Engine
MOROS includes a simplified regular expression engine with the following syntax:
- `\` escapes the following character to its literal meaning
- `^` matches the starting position within the string
- `$` matches the ending position within the string
- `*` matches the preceding element zero or more times
- `+` matches the preceding element one or more times
- `?` matches the preceding element zero or one time
- `.` matches any single character
- `\w` matches any alphanumeric character
- `\W` matches any non-alphanumeric character
- `\d` matches any numeric character
- `\D` matches any non-numeric character
- `\s` matches any whitespace character
- `\S` matches any non-whitespace character
The engine is UTF-8 aware, so for example the unicode character `é` will be
matched by `\w` even if it's not present in the ASCII table and has a size
of two bytes.

View File

@ -20,5 +20,7 @@ pub mod console;
pub mod font;
pub mod fs;
pub mod prompt;
pub mod regex;
pub mod syscall;
pub mod vga;
// TODO: add mod wildcard

284
src/api/regex.rs Normal file
View File

@ -0,0 +1,284 @@
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use core::convert::From;
use core::ops::RangeBounds;
// TODO: Remove this when tests are done
// Compile-time switch for the matcher's trace output: while it is `false`
// the `debug!` macro below prints nothing.
const DEBUG: bool = false;
// Takes `format!`-style arguments and prints them with `println!`, but only
// when `DEBUG` is `true`.
macro_rules! debug {
($($arg:tt)*) => ({
if DEBUG {
println!("{}", format_args!($($arg)*));
}
});
}
// See "A Regular Expression Matcher" by Rob Pike and Brian Kernighan (2007)
/// One pattern element that is tested against a single character of text.
#[derive(Debug)]
enum MetaChar {
/// Unescaped `.`: accepts every character.
Any,
/// `\d`: accepts characters for which `char::is_numeric` is true.
Numeric,
/// `\s`: accepts characters for which `char::is_whitespace` is true.
Whitespace,
/// `\w`: accepts characters for which `char::is_alphanumeric` is true.
Alphanumeric,
/// `\D`: negation of `Numeric`.
NonNumeric,
/// `\S`: negation of `Whitespace`.
NonWhitespace,
/// `\W`: negation of `Alphanumeric`.
NonAlphanumeric,
/// Accepts exactly the wrapped character.
Literal(char),
}
impl From<char> for MetaChar {
    /// Interprets an unescaped pattern character: `.` is the wildcard and
    /// every other character stands for itself.
    fn from(c: char) -> Self {
        if c == '.' {
            MetaChar::Any
        } else {
            MetaChar::Literal(c)
        }
    }
}
/// Extra operations on a pattern element, on top of the `From<char>`
/// conversion used for unescaped characters.
trait MetaCharExt {
/// Builds the element denoted by `c` when it follows a backslash in the
/// pattern.
fn from_escaped(c: char) -> Self;
/// Tells whether the text character `c` is accepted by this element.
fn contains(&self, c: char) -> bool;
}
impl MetaCharExt for MetaChar {
    /// Interprets the character following a backslash: `d`, `s`, `w` and
    /// their uppercase negations select a character class, anything else is
    /// taken literally (which is how `\.` or `\\` match themselves).
    fn from_escaped(c: char) -> Self {
        match c {
            'w' => MetaChar::Alphanumeric,
            'W' => MetaChar::NonAlphanumeric,
            'd' => MetaChar::Numeric,
            'D' => MetaChar::NonNumeric,
            's' => MetaChar::Whitespace,
            'S' => MetaChar::NonWhitespace,
            other => MetaChar::Literal(other),
        }
    }

    /// Returns `true` when the text character `c` is accepted by this
    /// element.
    fn contains(&self, c: char) -> bool {
        match *self {
            MetaChar::Literal(expected) => expected == c,
            MetaChar::Any => true,
            MetaChar::Alphanumeric => c.is_alphanumeric(),
            MetaChar::NonAlphanumeric => !c.is_alphanumeric(),
            MetaChar::Numeric => c.is_numeric(),
            MetaChar::NonNumeric => !c.is_numeric(),
            MetaChar::Whitespace => c.is_whitespace(),
            MetaChar::NonWhitespace => !c.is_whitespace(),
        }
    }
}
/// A regular expression, kept as its source pattern and interpreted on every
/// search.
#[derive(Debug)]
pub struct Regex(String);

impl Regex {
    /// Creates a regex from the pattern `re`; the pattern is only copied
    /// here, it is not parsed or validated.
    pub fn new(re: &str) -> Self {
        debug!("debug: Regex::new({:?})", re);
        let pattern = re.to_string();
        Regex(pattern)
    }

    /// Returns `true` when the pattern matches somewhere in `text`.
    pub fn is_match(&self, text: &str) -> bool {
        self.find(text).is_some()
    }

    /// Searches `text` and returns the `(start, end)` pair filled in by the
    /// matcher, or `None` when nothing matches. Both pattern and text are
    /// processed as sequences of `char`, so the positions count characters,
    /// not bytes.
    pub fn find(&self, text: &str) -> Option<(usize, usize)> {
        let pattern = self.0.chars().collect::<Vec<char>>();
        let haystack = text.chars().collect::<Vec<char>>();
        let (mut start, mut end) = (0, 0);
        is_match(&pattern, &haystack, &mut start, &mut end).then(|| (start, end))
    }
}
/// Searches `text` for the first position where the pattern `re` matches.
///
/// On success returns `true` with `*start` set to the index of the first
/// matched character and `*end` to the index one past the last matched
/// character (both are indices into `text`). On failure returns `false`; the
/// values left in `*start` and `*end` are then meaningless.
///
/// A leading `^` anchors the pattern: only position 0 is tried.
fn is_match(re: &[char], text: &[char], start: &mut usize, end: &mut usize) -> bool {
    debug!("debug: is_match({:?}, {:?})", re, text);
    if re.is_empty() {
        // The empty pattern matches the empty string at the very beginning.
        *start = 0;
        *end = 0;
        return true;
    }
    if re[0] == '^' {
        // `is_match_here` adds the number of consumed characters to `*end`,
        // so an anchored match has to start counting from 0. Seeding it with
        // 1 would report an end one past the real one and let callers slice
        // beyond the text.
        *start = 0;
        *end = 0;
        return is_match_here(&re[1..], text, end);
    }
    // Try every start offset, `text.len()` included, so that patterns able
    // to match the empty string (like "a*" or "$") can match at the end.
    for i in 0..=text.len() {
        *start = i;
        *end = i;
        if is_match_here(re, &text[i..], end) {
            return true;
        }
    }
    false
}
/// Tests whether the pattern `re` matches at the very beginning of `text`.
///
/// `*end` is incremented for each character consumed along the way; the
/// increments are not undone when the attempt finally fails.
fn is_match_here(re: &[char], text: &[char], end: &mut usize) -> bool {
    debug!("debug: is_match_here({:?}, {:?})", re, text);
    let head = match re.first() {
        // Pattern exhausted: everything matched.
        None => return true,
        // `$` only matches when no text is left.
        Some(&'$') => return text.is_empty(),
        Some(&c) => c,
    };

    // `width` is the number of pattern characters forming the element:
    // two for an escape sequence, one otherwise.
    let (mc, width) = if head == '\\' && re.len() > 1 {
        (MetaChar::from_escaped(re[1]), 2)
    } else {
        (MetaChar::from(head), 1)
    };

    if let Some(&quantifier) = re.get(width) {
        // A `?` right after the quantifier turns it into its lazy variant.
        let lazy = re.get(width + 1) == Some(&'?');
        let skip = if lazy { width + 2 } else { width + 1 };
        let rest = &re[skip..];
        match quantifier {
            '*' => return is_match_star(lazy, mc, rest, text, end),
            '+' => return is_match_plus(lazy, mc, rest, text, end),
            '?' => return is_match_ques(lazy, mc, rest, text, end),
            _ => {}
        }
    }

    // No quantifier: the element must match exactly one character.
    match text.split_first() {
        Some((&first, tail)) if mc.contains(first) => {
            *end += 1;
            is_match_here(&re[width..], tail, end)
        }
        _ => false,
    }
}
/// Handles the `*` quantifier: `mc` repeated any number of times (the
/// unbounded range `..`), followed by the rest of the pattern `re`.
fn is_match_star(lazy: bool, mc: MetaChar, re: &[char], text: &[char], end: &mut usize) -> bool {
debug!("debug: is_match_star({:?}, {:?}, {:?}", mc, re, text);
is_match_char(lazy, mc, re, text, .., end)
}
/// Handles the `+` quantifier: `mc` repeated at least once (the range
/// `1..`), followed by the rest of the pattern `re`.
fn is_match_plus(lazy: bool, mc: MetaChar, re: &[char], text: &[char], end: &mut usize) -> bool {
debug!("debug: is_match_plus({:?}, {:?}, {:?}", mc, re, text);
is_match_char(lazy, mc, re, text, 1.., end)
}
/// Handles the `?` quantifier: `mc` repeated zero or one time (the range
/// `..2`), followed by the rest of the pattern `re`.
fn is_match_ques(lazy: bool, mc: MetaChar, re: &[char], text: &[char], end: &mut usize) -> bool {
debug!("debug: is_match_ques({:?}, {:?}, {:?}", mc, re, text);
is_match_char(lazy, mc, re, text, ..2, end)
}
/// Matches a quantified element: `mc` repeated a number of times allowed by
/// `range`, followed by the rest of the pattern `re`, at the beginning of
/// `text`.
///
/// When `lazy` is `false` the longest run of characters accepted by `mc` is
/// taken first and then shortened one character at a time; when `lazy` is
/// `true` the run starts empty and grows one character at a time.
///
/// On success `*end` has been advanced by the total number of characters
/// consumed (the repetitions plus whatever `re` consumed). On failure
/// `*end` is left at the value it had on entry.
fn is_match_char<T: RangeBounds<usize>>(lazy: bool, mc: MetaChar, re: &[char], text: &[char], range: T, end: &mut usize) -> bool {
    debug!("debug: is_match_char({:?}, {:?}, {:?}", mc, re, text);
    let n = text.len();
    // `is_match_here` increments `*end` while it explores and does not undo
    // that when it fails, so every attempt has to restart from this value.
    let base = *end;
    let mut i = 0;
    if !lazy {
        while i < n && mc.contains(text[i]) {
            i += 1;
        }
    }
    loop {
        // Only try the rest of the pattern for an allowed repetition count;
        // an attempt at a forbidden count could never be accepted and would
        // only disturb `*end`.
        if range.contains(&i) {
            *end = base;
            if is_match_here(re, &text[i..], end) {
                *end += i;
                return true;
            }
        }
        if lazy {
            if i == n || !mc.contains(text[i]) {
                *end = base;
                return false;
            }
            i += 1;
        } else {
            if i == 0 {
                *end = base;
                return false;
            }
            i -= 1;
        }
    }
}
// Table-driven checks of `Regex::is_match`, followed by a few checks of the
// positions returned by `Regex::find`.
#[test_case]
fn test_regex() {
// Each entry is (pattern, text, expected result of `is_match`).
let tests = [
("", "aaa", true),
("", "", true),
("aaa", "aaa", true),
("aaa", "bbb", false),
("a.a", "aaa", true),
("a.a", "aba", true),
("a.a", "abb", false),
("a*", "aaa", true),
("a*b", "aab", true),
("a*b*", "aabb", true),
("a*b*", "bb", true),
("a.*", "abb", true),
(".*", "aaa", true),
("a.*", "a", true),
("a.+", "ab", true),
("a.+", "abb", true),
("a.+", "a", false),
("a.+b", "ab", false),
("a.+b", "abb", true),
(".+", "abb", true),
(".+", "b", true),
("a?b", "abb", true),
("a?b", "bb", true),
("a?b", "aabb", true),
("^a.*a$", "aaa", true),
("^#.*", "#aaa", true),
("^#.*", "a#aaa", false),
(".*;$", "aaa;", true),
(".*;$", "aaa;a", false),
("^.*$", "aaa", true),
("a.b", "abb", true),
("a.b", "a.b", true),
// Escaped dots and backslashes (each `\\` below is one backslash)
("a\\.b", "abb", false),
("a\\.b", "a.b", true),
("a\\\\.b", "abb", false),
("a\\\\.b", "a.b", false),
("a\\\\.b", "a\\bb", true),
("a\\\\.b", "a\\.b", true),
("a\\\\\\.b", "a\\bb", false),
("a\\\\\\.b", "a\\.b", true),
("a\\\\\\.b", "a\\\\bb", false),
("a\\\\\\.b", "a\\\\.b", false),
("a\\\\\\\\.b", "a\\bb", false),
("a\\\\\\\\.b", "a\\.b", false),
("a\\\\\\\\.b", "a\\\\bb", true),
("a\\\\\\\\.b", "a\\\\.b", true),
// Character classes
("a\\wb", "aéb", true),
("a\\wb", "awb", true),
("a\\wb", "abb", true),
("a\\wb", "a1b", true),
("a\\wb", "a.b", false),
("a\\Wb", "aWb", false),
("a\\Wb", "abb", false),
("a\\Wb", "a1b", false),
("a\\Wb", "a.b", true),
("a\\db", "abb", false),
("a\\db", "a1b", true),
("a\\Db", "abb", true),
("a\\Db", "a1b", false),
("a\\sb", "abb", false),
("a\\sb", "a b", true),
("a\\Sb", "abb", true),
("a\\Sb", "a b", false),
// Quantifiers applied to escaped elements
("a\\.*d", "a..d", true),
("a\\.*d", "a.cd", false),
("a\\w*d", "abcd", true),
];
for (re, text, is_match) in tests {
assert!(Regex::new(re).is_match(text) == is_match, "Regex::new(\"{}\").is_match(\"{}\") == {}", re, text, is_match);
}
// Greedy quantifiers take the longest match, lazy ones (`*?`) the shortest.
assert_eq!(Regex::new(".*").find("abcd"), Some((0, 4)));
assert_eq!(Regex::new("b.*c").find("aaabbbcccddd"), Some((3, 9)));
assert_eq!(Regex::new("b.*?c").find("aaabbbcccddd"), Some((3, 7)));
assert_eq!(Regex::new("a\\w*d").find("abcdabcd"), Some((0, 8)));
assert_eq!(Regex::new("a\\w*?d").find("abcdabcd"), Some((0, 4)));
}

View File

@ -91,6 +91,10 @@ impl File {
None
}
/// Returns an owned copy of the `name` field.
pub fn name(&self) -> String {
self.name.clone()
}
/// Returns the `size` field converted to `usize`.
// NOTE(review): the field's own type is not visible here; confirm that the
// `as` cast cannot truncate.
pub fn size(&self) -> usize {
self.size as usize
}

134
src/usr/find.rs Normal file
View File

@ -0,0 +1,134 @@
use crate::{sys, usr};
use crate::api::fs;
use crate::api::regex::Regex;
use crate::api::console::Style;
use alloc::format;
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use core::iter::FromIterator;
/// Output state carried across all the files visited by one search.
struct PrintingState {
    /// `true` until a first file header has been printed; used to put a
    /// blank line between the results of consecutive files.
    is_first_match: bool,
    /// Set once a directory has been opened; file names are only printed
    /// as headers in that case.
    is_recursive: bool,
}

impl PrintingState {
    /// Creates the initial state: nothing printed yet, no directory seen.
    fn new() -> Self {
        let is_first_match = true;
        let is_recursive = false;
        Self { is_first_match, is_recursive }
    }
}
/// Entry point of the `find` command.
///
/// Usage: `find [<path>] [--name|-n <pattern>] [--line|-l <pattern>]`
///
/// `<path>` defaults to the current directory of the process; when several
/// paths are given the last one wins. Only `--line` is implemented: it
/// prints the lines matching the pattern in the file at `<path>`, or in
/// every file below it when it is a directory.
///
/// Returns `CommandError` when an option is missing its value or when the
/// unimplemented `--name` option is used, `CommandSuccessful` otherwise.
// > find /tmp --name *.txt --line hello
pub fn main(args: &[&str]) -> usr::shell::ExitCode {
    let mut path: &str = &sys::process::dir();
    let mut name = None;
    let mut line = None;

    // The first argument is skipped (presumably the command name itself).
    let mut rest = args.iter().skip(1);
    while let Some(&arg) = rest.next() {
        match arg {
            "--name" | "-n" => match rest.next() {
                Some(&value) => name = Some(value),
                None => {
                    println!("Missing name");
                    return usr::shell::ExitCode::CommandError;
                }
            },
            "--line" | "-l" => match rest.next() {
                Some(&value) => line = Some(value),
                None => {
                    println!("Missing line");
                    return usr::shell::ExitCode::CommandError;
                }
            },
            _ => path = arg,
        }
    }

    if name.is_some() {
        // This option is reachable from user input, so report that it is
        // unsupported instead of panicking.
        println!("Searching by name is not implemented yet");
        return usr::shell::ExitCode::CommandError;
    }

    let mut state = PrintingState::new();
    if let Some(pattern) = line {
        print_matching_lines(path, pattern, &mut state);
    }
    usr::shell::ExitCode::CommandSuccessful
}
/// Prints the lines matching `pattern` found at `path`.
///
/// When `path` opens as a directory, every entry is visited recursively and
/// `state.is_recursive` is set so that file names get printed; when it
/// opens as a file, only that file is searched. Anything else is silently
/// ignored.
fn print_matching_lines(path: &str, pattern: &str, state: &mut PrintingState) {
    if let Some(dir) = sys::fs::Dir::open(path) {
        state.is_recursive = true;
        // Do not double the separator when `path` already ends with one,
        // which is always the case for the root directory "/".
        let separator = if path.ends_with('/') { "" } else { "/" };
        for file in dir.read() {
            let file_path = format!("{}{}{}", path, separator, file.name());
            if file.is_dir() {
                print_matching_lines(&file_path, pattern, state);
            } else {
                print_matching_lines_in_file(&file_path, pattern, state);
            }
        }
    } else if sys::fs::File::open(path).is_some() {
        print_matching_lines_in_file(path, pattern, state);
    }
}
/// Prints every line of the file at `path` that matches `pattern`, with its
/// 1-based line number and with each match highlighted.
///
/// When `state.is_recursive` is set, the results are preceded by the file
/// path, and separated from those of the previous file by a blank line.
/// Files that cannot be read as a string are silently skipped.
fn print_matching_lines_in_file(path: &str, pattern: &str, state: &mut PrintingState) {
    let name_color = Style::color("Cyan");
    let line_color = Style::color("Yellow");
    let match_color = Style::color("LightRed");
    let reset = Style::reset();

    let re = Regex::new(pattern);
    // A pattern anchored with `^` can only match once per line, at its
    // beginning. Searching the remainder of the line again would treat the
    // position after the first match as a new line start.
    let anchored = pattern.starts_with('^');

    if let Ok(lines) = fs::read_to_string(path) {
        // Rendered once, then appended around every match.
        let match_on = match_color.to_string();
        let match_off = reset.to_string();

        let mut matches = Vec::new();
        for (i, line) in lines.split('\n').enumerate() {
            // The regex reports character positions, so the line is handled
            // as a vector of chars rather than as a byte string.
            let line: Vec<char> = line.chars().collect();
            let mut l = String::new();
            let mut j = 0;
            while let Some((a, b)) = re.find(&String::from_iter(&line[j..])) {
                let m = j + a;
                let n = j + b;
                // Append in place instead of rebuilding the whole string for
                // every match.
                l.extend(&line[j..m]);
                l.push_str(&match_on);
                l.extend(&line[m..n]);
                l.push_str(&match_off);
                j = n;
                if anchored || m == n || n >= line.len() {
                    // Some patterns like "" or ".*?" would never move the
                    // cursor on the line and some like ".*" would match the
                    // whole line at once. In both cases we print the line,
                    // and we color it in the latter case.
                    break;
                }
            }
            if !l.is_empty() {
                l.extend(&line[j..]);
                matches.push((i + 1, l)); // 1-index line numbers
            }
        }
        if !matches.is_empty() {
            if state.is_recursive {
                if state.is_first_match {
                    state.is_first_match = false;
                } else {
                    println!();
                }
                println!("{}{}{}", name_color, path, reset);
            }
            // Right-align the line numbers on the width of the largest one,
            // which belongs to the last match.
            let width = matches[matches.len() - 1].0.to_string().len();
            for (i, line) in matches {
                println!("{}{:>width$}:{} {}", line_color, i, reset, line, width = width);
            }
        }
    }
}

View File

@ -8,6 +8,7 @@ pub mod dhcp;
pub mod disk;
pub mod editor;
pub mod env;
pub mod find;
pub mod geotime;
pub mod halt;
pub mod help;

View File

@ -132,7 +132,7 @@ pub fn exec(cmd: &str) -> ExitCode {
"c" | "copy" => usr::copy::main(&args),
"d" | "del" | "delete" => usr::delete::main(&args),
"e" | "edit" => usr::editor::main(&args),
"f" | "find" => ExitCode::CommandUnknown,
"f" | "find" => usr::find::main(&args),
"g" | "go" | "goto" => change_dir(&args),
"h" | "help" => usr::help::main(&args),
"i" => ExitCode::CommandUnknown,