mirror of https://github.com/vinc/moros.git
Add a regular expression engine (#222)
* Add a regular expression engine * Fix off by one error * Add + quantifier * Add more tests * Use assert_eq instead of assert in tests * Rewrite tests with an array * Fix bug in is_match_star * Use the same do while equivalent in is_match_plus * Add ? quantifier * Refactor engine code * Add backslash char * Group ifs in match * Add special escaped chars * Add doc * Add find command * Add Match#find * Show multiple matches in the same line * Dry Regex * Change matches color * Add greedy version of matching by default * Add MetaChar enum to fix matching escaped chars * Change function signatures * Remove macro_export * Add TODO * Find matching lines recursively * Handle special patterns
This commit is contained in:
parent
72f9baae6c
commit
de48c87e0c
|
@ -0,0 +1,21 @@
|
|||
# MOROS Regular Expression Engine
|
||||
|
||||
MOROS includes a simplified regular expression engine with the following syntax:
|
||||
|
||||
- `\` escapes the following character to its literal meaning
|
||||
- `^` matches the starting position within the string
|
||||
- `$` matches the ending position within the string
|
||||
- `*` matches the preceding element zero or more times
|
||||
- `+` matches the preceding element one or more times
|
||||
- `?` matches the preceding element zero or one time
|
||||
- `.` matches any single character
|
||||
- `\w` matches any alphanumeric character
|
||||
- `\W` matches any non-alphanumeric character
|
||||
- `\d` matches any numeric character
|
||||
- `\D` matches any non-numeric character
|
||||
- `\s` matches any whitespace character
|
||||
- `\S` matches any non-whitespace character
|
||||
|
||||
The engine is UTF-8 aware, so for example the unicode character `é` will be
|
||||
matched by `\w` even if it's not present in the ASCII table and has a size
|
||||
of two bytes.
|
|
@ -20,5 +20,7 @@ pub mod console;
|
|||
pub mod font;
|
||||
pub mod fs;
|
||||
pub mod prompt;
|
||||
pub mod regex;
|
||||
pub mod syscall;
|
||||
pub mod vga;
|
||||
// TODO: add mod wildcard
|
||||
|
|
|
@ -0,0 +1,284 @@
|
|||
use alloc::string::{String, ToString};
|
||||
use alloc::vec::Vec;
|
||||
use core::convert::From;
|
||||
use core::ops::RangeBounds;
|
||||
|
||||
// TODO: Remove this when tests are done
/// Switch for the trace output produced by `debug!`.
const DEBUG: bool = false;

/// Prints one formatted trace line when `DEBUG` is `true`, and does nothing
/// otherwise. Takes the same arguments as `format_args!`.
macro_rules! debug {
    ($($arg:tt)*) => {{
        if DEBUG {
            println!("{}", format_args!($($arg)*));
        }
    }};
}
|
||||
|
||||
// See "A Regular Expression Matcher" by Rob Pike and Brian Kernighan (2007)
|
||||
|
||||
/// One matchable unit of a pattern: a literal character or a character class.
#[derive(Debug)]
enum MetaChar {
    /// `.` — any character.
    Any,
    /// `\d` — characters for which `char::is_numeric` is true.
    Numeric,
    /// `\s` — characters for which `char::is_whitespace` is true.
    Whitespace,
    /// `\w` — characters for which `char::is_alphanumeric` is true.
    Alphanumeric,
    /// `\D` — negation of `Numeric`.
    NonNumeric,
    /// `\S` — negation of `Whitespace`.
    NonWhitespace,
    /// `\W` — negation of `Alphanumeric`.
    NonAlphanumeric,
    /// Exactly this character.
    Literal(char),
}

impl From<char> for MetaChar {
    /// Interprets an *unescaped* pattern character: `.` is the wildcard,
    /// anything else stands for itself.
    fn from(c: char) -> Self {
        if c == '.' {
            MetaChar::Any
        } else {
            MetaChar::Literal(c)
        }
    }
}

/// Operations on `MetaChar` used by the matcher.
trait MetaCharExt {
    /// Interprets the character that follows a backslash in a pattern.
    fn from_escaped(c: char) -> Self;

    /// Tells whether the text character `c` belongs to this class.
    fn contains(&self, c: char) -> bool;
}

impl MetaCharExt for MetaChar {
    fn from_escaped(c: char) -> Self {
        match c {
            'w' => MetaChar::Alphanumeric,
            'W' => MetaChar::NonAlphanumeric,
            'd' => MetaChar::Numeric,
            'D' => MetaChar::NonNumeric,
            's' => MetaChar::Whitespace,
            'S' => MetaChar::NonWhitespace,
            // Any other escaped character (e.g. `\.` or `\\`) is taken literally.
            other => MetaChar::Literal(other),
        }
    }

    fn contains(&self, c: char) -> bool {
        match *self {
            MetaChar::Any => true,
            MetaChar::Literal(expected) => expected == c,
            MetaChar::Numeric => c.is_numeric(),
            MetaChar::NonNumeric => !c.is_numeric(),
            MetaChar::Whitespace => c.is_whitespace(),
            MetaChar::NonWhitespace => !c.is_whitespace(),
            MetaChar::Alphanumeric => c.is_alphanumeric(),
            MetaChar::NonAlphanumeric => !c.is_alphanumeric(),
        }
    }
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Regex(String);
|
||||
|
||||
impl Regex {
|
||||
pub fn new(re: &str) -> Self {
|
||||
debug!("debug: Regex::new({:?})", re);
|
||||
Self(re.to_string())
|
||||
}
|
||||
pub fn is_match(&self, text: &str) -> bool {
|
||||
self.find(text).is_some()
|
||||
}
|
||||
pub fn find(&self, text: &str) -> Option<(usize, usize)> {
|
||||
let vec_re: Vec<char> = self.0.chars().collect();
|
||||
let vec_text: Vec<char> = text.chars().collect();
|
||||
let mut start = 0;
|
||||
let mut end = 0;
|
||||
if is_match(&vec_re[..], &vec_text[..], &mut start, &mut end) {
|
||||
Some((start, end))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn is_match(re: &[char], text: &[char], start: &mut usize, end: &mut usize) -> bool {
|
||||
debug!("debug: is_match({:?}, {:?})", re, text);
|
||||
if re.len() == 0 {
|
||||
return true;
|
||||
}
|
||||
if re[0] == '^' {
|
||||
*end = 1;
|
||||
return is_match_here(&re[1..], text, end);
|
||||
}
|
||||
let mut i = 0;
|
||||
let n = text.len();
|
||||
loop {
|
||||
*start = i;
|
||||
*end = i;
|
||||
if is_match_here(re, &text[i..], end) {
|
||||
return true;
|
||||
}
|
||||
if i == n {
|
||||
return false;
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
fn is_match_here(re: &[char], text: &[char], end: &mut usize) -> bool {
|
||||
debug!("debug: is_match_here({:?}, {:?})", re, text);
|
||||
if re.len() == 0 {
|
||||
return true;
|
||||
}
|
||||
if re[0] == '$' {
|
||||
return text.len() == 0;
|
||||
}
|
||||
let (mc, i) = if re.len() > 1 && re[0] == '\\' {
|
||||
(MetaChar::from_escaped(re[1]), 1)
|
||||
} else {
|
||||
(MetaChar::from(re[0]), 0)
|
||||
};
|
||||
if re.len() > i + 1 {
|
||||
let lazy = re.len() > i + 2 && re[i + 2] == '?';
|
||||
let j = if lazy { i + 3 } else { i + 2 };
|
||||
|
||||
match re[i + 1] {
|
||||
'*' => return is_match_star(lazy, mc, &re[j..], text, end),
|
||||
'+' => return is_match_plus(lazy, mc, &re[j..], text, end),
|
||||
'?' => return is_match_ques(lazy, mc, &re[j..], text, end),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
if text.len() != 0 && mc.contains(text[0]) {
|
||||
*end += 1;
|
||||
let j = i + 1;
|
||||
return is_match_here(&re[j..], &text[1..], end);
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn is_match_star(lazy: bool, mc: MetaChar, re: &[char], text: &[char], end: &mut usize) -> bool {
|
||||
debug!("debug: is_match_star({:?}, {:?}, {:?}", mc, re, text);
|
||||
is_match_char(lazy, mc, re, text, .., end)
|
||||
}
|
||||
|
||||
fn is_match_plus(lazy: bool, mc: MetaChar, re: &[char], text: &[char], end: &mut usize) -> bool {
|
||||
debug!("debug: is_match_plus({:?}, {:?}, {:?}", mc, re, text);
|
||||
is_match_char(lazy, mc, re, text, 1.., end)
|
||||
}
|
||||
|
||||
fn is_match_ques(lazy: bool, mc: MetaChar, re: &[char], text: &[char], end: &mut usize) -> bool {
|
||||
debug!("debug: is_match_ques({:?}, {:?}, {:?}", mc, re, text);
|
||||
is_match_char(lazy, mc, re, text, ..2, end)
|
||||
}
|
||||
|
||||
fn is_match_char<T: RangeBounds<usize>>(lazy: bool, mc: MetaChar, re: &[char], text: &[char], range: T, end: &mut usize) -> bool {
|
||||
debug!("debug: is_match_char({:?}, {:?}, {:?}", mc, re, text);
|
||||
let mut i = 0;
|
||||
let n = text.len();
|
||||
|
||||
if !lazy {
|
||||
loop {
|
||||
if i == n || !(mc.contains(text[i])) {
|
||||
break;
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
loop {
|
||||
if is_match_here(re, &text[i..], end) && range.contains(&i) {
|
||||
*end += i;
|
||||
return true;
|
||||
}
|
||||
if lazy {
|
||||
if i == n || !(mc.contains(text[i])) {
|
||||
return false;
|
||||
}
|
||||
i += 1;
|
||||
} else {
|
||||
if i == 0 {
|
||||
return false;
|
||||
}
|
||||
i -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test_case]
|
||||
fn test_regex() {
|
||||
let tests = [
|
||||
("", "aaa", true),
|
||||
("", "", true),
|
||||
("aaa", "aaa", true),
|
||||
("aaa", "bbb", false),
|
||||
("a.a", "aaa", true),
|
||||
("a.a", "aba", true),
|
||||
("a.a", "abb", false),
|
||||
|
||||
("a*", "aaa", true),
|
||||
("a*b", "aab", true),
|
||||
("a*b*", "aabb", true),
|
||||
("a*b*", "bb", true),
|
||||
("a.*", "abb", true),
|
||||
(".*", "aaa", true),
|
||||
("a.*", "a", true),
|
||||
|
||||
("a.+", "ab", true),
|
||||
("a.+", "abb", true),
|
||||
("a.+", "a", false),
|
||||
("a.+b", "ab", false),
|
||||
("a.+b", "abb", true),
|
||||
(".+", "abb", true),
|
||||
(".+", "b", true),
|
||||
|
||||
("a?b", "abb", true),
|
||||
("a?b", "bb", true),
|
||||
("a?b", "aabb", true),
|
||||
|
||||
("^a.*a$", "aaa", true),
|
||||
("^#.*", "#aaa", true),
|
||||
("^#.*", "a#aaa", false),
|
||||
(".*;$", "aaa;", true),
|
||||
(".*;$", "aaa;a", false),
|
||||
("^.*$", "aaa", true),
|
||||
|
||||
("a.b", "abb", true),
|
||||
("a.b", "a.b", true),
|
||||
("a\\.b", "abb", false),
|
||||
("a\\.b", "a.b", true),
|
||||
("a\\\\.b", "abb", false),
|
||||
("a\\\\.b", "a.b", false),
|
||||
("a\\\\.b", "a\\bb", true),
|
||||
("a\\\\.b", "a\\.b", true),
|
||||
("a\\\\\\.b", "a\\bb", false),
|
||||
("a\\\\\\.b", "a\\.b", true),
|
||||
("a\\\\\\.b", "a\\\\bb", false),
|
||||
("a\\\\\\.b", "a\\\\.b", false),
|
||||
("a\\\\\\\\.b", "a\\bb", false),
|
||||
("a\\\\\\\\.b", "a\\.b", false),
|
||||
("a\\\\\\\\.b", "a\\\\bb", true),
|
||||
("a\\\\\\\\.b", "a\\\\.b", true),
|
||||
|
||||
("a\\wb", "aéb", true),
|
||||
("a\\wb", "awb", true),
|
||||
("a\\wb", "abb", true),
|
||||
("a\\wb", "a1b", true),
|
||||
("a\\wb", "a.b", false),
|
||||
("a\\Wb", "aWb", false),
|
||||
("a\\Wb", "abb", false),
|
||||
("a\\Wb", "a1b", false),
|
||||
("a\\Wb", "a.b", true),
|
||||
("a\\db", "abb", false),
|
||||
("a\\db", "a1b", true),
|
||||
("a\\Db", "abb", true),
|
||||
("a\\Db", "a1b", false),
|
||||
("a\\sb", "abb", false),
|
||||
("a\\sb", "a b", true),
|
||||
("a\\Sb", "abb", true),
|
||||
("a\\Sb", "a b", false),
|
||||
|
||||
("a\\.*d", "a..d", true),
|
||||
("a\\.*d", "a.cd", false),
|
||||
("a\\w*d", "abcd", true),
|
||||
];
|
||||
for (re, text, is_match) in tests {
|
||||
assert!(Regex::new(re).is_match(text) == is_match, "Regex::new(\"{}\").is_match(\"{}\") == {}", re, text, is_match);
|
||||
}
|
||||
|
||||
assert_eq!(Regex::new(".*").find("abcd"), Some((0, 4)));
|
||||
assert_eq!(Regex::new("b.*c").find("aaabbbcccddd"), Some((3, 9)));
|
||||
assert_eq!(Regex::new("b.*?c").find("aaabbbcccddd"), Some((3, 7)));
|
||||
assert_eq!(Regex::new("a\\w*d").find("abcdabcd"), Some((0, 8)));
|
||||
assert_eq!(Regex::new("a\\w*?d").find("abcdabcd"), Some((0, 4)));
|
||||
}
|
|
@ -91,6 +91,10 @@ impl File {
|
|||
None
|
||||
}
|
||||
|
||||
/// Returns a clone of this file's `name` field.
pub fn name(&self) -> String {
|
||||
self.name.clone()
|
||||
}
|
||||
|
||||
/// Returns this file's `size` field cast to `usize`.
pub fn size(&self) -> usize {
|
||||
self.size as usize
|
||||
}
|
||||
|
|
|
@ -0,0 +1,134 @@
|
|||
use crate::{sys, usr};
|
||||
use crate::api::fs;
|
||||
use crate::api::regex::Regex;
|
||||
use crate::api::console::Style;
|
||||
|
||||
use alloc::format;
|
||||
use alloc::string::{String, ToString};
|
||||
use alloc::vec::Vec;
|
||||
use core::iter::FromIterator;
|
||||
|
||||
/// Output state shared by every file visited during one search.
struct PrintingState {
    /// `true` until the first file header has been printed; used to decide
    /// whether a blank separator line is needed before a header.
    is_first_match: bool,
    /// Set once a directory has been opened; file headers are only printed
    /// in that case.
    is_recursive: bool,
}

impl PrintingState {
    /// Initial state: nothing printed yet, no directory traversed yet.
    fn new() -> Self {
        let is_first_match = true;
        let is_recursive = false;
        Self { is_first_match, is_recursive }
    }
}
|
||||
|
||||
// > find /tmp -name *.txt -line hello
|
||||
pub fn main(args: &[&str]) -> usr::shell::ExitCode {
|
||||
let mut path: &str = &sys::process::dir();
|
||||
let mut name = None;
|
||||
let mut line = None;
|
||||
let mut i = 1;
|
||||
let n = args.len();
|
||||
while i < n {
|
||||
match args[i] {
|
||||
"--name" | "-n" => {
|
||||
if i + 1 < n {
|
||||
name = Some(args[i + 1]);
|
||||
i += 1;
|
||||
} else {
|
||||
println!("Missing name");
|
||||
return usr::shell::ExitCode::CommandError;
|
||||
}
|
||||
},
|
||||
"--line" | "-l" => {
|
||||
if i + 1 < n {
|
||||
line = Some(args[i + 1]);
|
||||
i += 1;
|
||||
} else {
|
||||
println!("Missing line");
|
||||
return usr::shell::ExitCode::CommandError;
|
||||
}
|
||||
},
|
||||
_ => path = args[i],
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
|
||||
if name.is_some() {
|
||||
todo!();
|
||||
}
|
||||
|
||||
let mut state = PrintingState::new();
|
||||
if let Some(pattern) = line {
|
||||
print_matching_lines(path, pattern, &mut state);
|
||||
}
|
||||
|
||||
usr::shell::ExitCode::CommandSuccessful
|
||||
}
|
||||
|
||||
fn print_matching_lines(path: &str, pattern: &str, state: &mut PrintingState) {
|
||||
if let Some(dir) = sys::fs::Dir::open(path) {
|
||||
state.is_recursive = true;
|
||||
for file in dir.read() {
|
||||
let file_path = format!("{}/{}", path, file.name());
|
||||
if file.is_dir() {
|
||||
print_matching_lines(&file_path, pattern, state);
|
||||
} else {
|
||||
print_matching_lines_in_file(&file_path, pattern, state);
|
||||
}
|
||||
}
|
||||
} else if sys::fs::File::open(path).is_some() {
|
||||
print_matching_lines_in_file(&path, pattern, state);
|
||||
}
|
||||
}
|
||||
|
||||
fn print_matching_lines_in_file(path: &str, pattern: &str, state: &mut PrintingState) {
|
||||
let name_color = Style::color("Cyan");
|
||||
let line_color = Style::color("Yellow");
|
||||
let match_color = Style::color("LightRed");
|
||||
let reset = Style::reset();
|
||||
|
||||
let re = Regex::new(pattern);
|
||||
if let Ok(lines) = fs::read_to_string(path) {
|
||||
let mut matches = Vec::new();
|
||||
for (i, line) in lines.split('\n').enumerate() {
|
||||
let line: Vec<char> = line.chars().collect();
|
||||
let mut l = String::new();
|
||||
let mut j = 0;
|
||||
while let Some((a, b)) = re.find(&String::from_iter(&line[j..])) {
|
||||
let m = j + a;
|
||||
let n = j + b;
|
||||
let before = String::from_iter(&line[j..m]);
|
||||
let matched = String::from_iter(&line[m..n]);
|
||||
l = format!("{}{}{}{}{}", l, before, match_color, matched, reset);
|
||||
j = n;
|
||||
if m == n || n >= line.len() {
|
||||
// Some patterns like "" or ".*?" would never move the
|
||||
// cursor on the line and some like ".*" would match the
|
||||
// whole line at once. In both cases we print the line,
|
||||
// and we color it in the latter case.
|
||||
break;
|
||||
}
|
||||
}
|
||||
if !l.is_empty() {
|
||||
let after = String::from_iter(&line[j..]);
|
||||
l.push_str(&after);
|
||||
matches.push((i + 1, l)); // 1-index line numbers
|
||||
}
|
||||
}
|
||||
if !matches.is_empty() {
|
||||
if state.is_recursive {
|
||||
if state.is_first_match {
|
||||
state.is_first_match = false;
|
||||
} else {
|
||||
println!();
|
||||
}
|
||||
println!("{}{}{}", name_color, path, reset);
|
||||
}
|
||||
let width = matches[matches.len() - 1].0.to_string().len();
|
||||
for (i, line) in matches {
|
||||
println!("{}{:>width$}:{} {}", line_color, i, reset, line, width = width);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -8,6 +8,7 @@ pub mod dhcp;
|
|||
pub mod disk;
|
||||
pub mod editor;
|
||||
pub mod env;
|
||||
pub mod find;
|
||||
pub mod geotime;
|
||||
pub mod halt;
|
||||
pub mod help;
|
||||
|
|
|
@ -132,7 +132,7 @@ pub fn exec(cmd: &str) -> ExitCode {
|
|||
"c" | "copy" => usr::copy::main(&args),
|
||||
"d" | "del" | "delete" => usr::delete::main(&args),
|
||||
"e" | "edit" => usr::editor::main(&args),
|
||||
"f" | "find" => ExitCode::CommandUnknown,
|
||||
"f" | "find" => usr::find::main(&args),
|
||||
"g" | "go" | "goto" => change_dir(&args),
|
||||
"h" | "help" => usr::help::main(&args),
|
||||
"i" => ExitCode::CommandUnknown,
|
||||
|
|
Loading…
Reference in New Issue