Skip to content

Commit

Permalink
feat: implement regexp look-arounds (\b, \B) and anchors (^, $)
Browse files Browse the repository at this point in the history
  • Loading branch information
plusvic committed Jul 19, 2023
1 parent 1a40cfa commit 1e45ca2
Show file tree
Hide file tree
Showing 6 changed files with 305 additions and 124 deletions.
20 changes: 16 additions & 4 deletions yara-x/src/re/compiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use std::mem::{size_of, size_of_val};

use regex_syntax::hir;
use regex_syntax::hir::{
visit, Class, ClassBytes, Hir, HirKind, Literal, Repetition,
visit, Class, ClassBytes, Hir, HirKind, Literal, Look, Repetition,
};

use yara_x_parser::ast::HexByte;
Expand Down Expand Up @@ -293,6 +293,16 @@ impl Compiler {
}
}

/// Emits the instruction that implements a look-around assertion.
///
/// Only the four assertions handled by this compiler are accepted:
///
/// * `Look::Start` (`^`) emits `Instr::START`
/// * `Look::End` (`$`) emits `Instr::END`
/// * `Look::WordAscii` (`\b`) emits `Instr::WORD_BOUNDARY`
/// * `Look::WordAsciiNegate` (`\B`) emits `Instr::WORD_BOUNDARY_NEG`
///
/// Returns whatever `emit_instr` returns for the emitted opcode
/// (presumably the location where the instruction was written).
///
/// # Panics
///
/// Panics if `look` is any other `Look` variant. The panic message names
/// the offending variant so the unsupported assertion is easy to identify.
fn visit_post_look(&mut self, look: &Look) -> Location {
    let opcode = match look {
        Look::Start => Instr::START,
        Look::End => Instr::END,
        Look::WordAscii => Instr::WORD_BOUNDARY,
        Look::WordAsciiNegate => Instr::WORD_BOUNDARY_NEG,
        // Any other look-around is not supported here. Report which one
        // was found instead of panicking with an empty message.
        unsupported => {
            unreachable!("unsupported look-around: {:?}", unsupported)
        }
    };

    self.emit_instr(opcode)
}

fn visit_pre_concat(&mut self) {
self.bookmarks.push(self.location());
// A new child of a `Concat` node is about to be processed,
Expand Down Expand Up @@ -673,8 +683,10 @@ impl hir::Visitor for &mut Compiler {

fn visit_pre(&mut self, hir: &Hir) -> Result<(), Self::Err> {
match hir.kind() {
HirKind::Empty => {}
HirKind::Literal(_) => {}
HirKind::Class(_) => {}
HirKind::Look(_) => {}
HirKind::Capture(_) => {
self.bookmarks.push(self.location());
}
Expand All @@ -687,7 +699,6 @@ impl hir::Visitor for &mut Compiler {
HirKind::Repetition(rep) => {
self.visit_pre_repetition(rep);
}
kind => unreachable!("{:?}", kind),
}

self.depth += 1;
Expand All @@ -697,8 +708,10 @@ impl hir::Visitor for &mut Compiler {

fn visit_post(&mut self, hir: &Hir) -> Result<(), Self::Err> {
let mut code_loc = match hir.kind() {
HirKind::Capture(_) => self.bookmarks.pop().unwrap(),
HirKind::Empty => self.location(),
HirKind::Literal(literal) => self.emit_literal(literal),
HirKind::Capture(_) => self.bookmarks.pop().unwrap(),
HirKind::Look(look) => self.visit_post_look(look),
hir_kind @ HirKind::Class(class) => {
if any_byte(hir_kind) {
self.emit_instr(Instr::ANY_BYTE)
Expand All @@ -715,7 +728,6 @@ impl hir::Visitor for &mut Compiler {
HirKind::Repetition(repeated) => {
self.visit_post_repetition(repeated)
}
_ => unreachable!(),
};

// If `zero_rep_depth` > 0 we are currently at a HIR node that is
Expand Down
107 changes: 92 additions & 15 deletions yara-x/src/re/instr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ pub type Offset = i16;

/// Instructions supported by the Pike VM.
pub enum Instr<'a> {
/// Match for the regexp has been found.
Match,

/// Matches any byte.
AnyByte,

Expand Down Expand Up @@ -119,24 +122,40 @@ pub enum Instr<'a> {
/// location of the jump opcode.
Jump(Offset),

/// Match for the regexp has been found.
Match,
/// Matches the start of the scanned data (^)
Start,

/// Matches the end of the scanned data ($)
End,

/// Matches a word boundary, i.e. a position where a character that is
/// part of the \w class is adjacent to one that is not (or to the start
/// or end of the data). Used for \b look-around assertions. This is a
/// zero-length match.
WordBoundary,

/// The negation of WordBoundary. Used for \B look-around assertions. This
/// is a zero-length match.
WordBoundaryNeg,

/// Not really an instruction, it is just a marker that indicates the end
/// of an instruction sequence.
Eoi,
}

impl<'a> Instr<'a> {
pub const SPLIT_A: u8 = 0x00;
pub const SPLIT_B: u8 = 0x01;
pub const SPLIT_N: u8 = 0x02;
pub const JUMP: u8 = 0x03;
pub const ANY_BYTE: u8 = 0x04;
pub const MASKED_BYTE: u8 = 0x05;
pub const CLASS_BITMAP: u8 = 0x06;
pub const CLASS_RANGES: u8 = 0x07;
pub const MATCH: u8 = 0x08;
pub const MATCH: u8 = 0x00;
pub const SPLIT_A: u8 = 0x01;
pub const SPLIT_B: u8 = 0x02;
pub const SPLIT_N: u8 = 0x03;
pub const JUMP: u8 = 0x04;
pub const ANY_BYTE: u8 = 0x05;
pub const MASKED_BYTE: u8 = 0x06;
pub const CLASS_BITMAP: u8 = 0x07;
pub const CLASS_RANGES: u8 = 0x08;
pub const START: u8 = 0x09;
pub const END: u8 = 0x0A;
pub const WORD_BOUNDARY: u8 = 0x0B;
pub const WORD_BOUNDARY_NEG: u8 = 0x0C;
}

/// A sequence of instructions for the Pike VM.
Expand Down Expand Up @@ -206,8 +225,7 @@ impl InstrSeq {
// offset that is relative to the start of the instruction.
self.seq.write_all(&[0x00; size_of::<Offset>()]).unwrap();
}
Instr::ANY_BYTE | Instr::MATCH => {}
_ => unreachable!(),
_ => {}
}

location
Expand Down Expand Up @@ -394,6 +412,9 @@ impl Display for InstrSeq {
Instr::AnyByte => {
writeln!(f, "{:05x}: ANY_BYTE", addr)?;
}
Instr::Byte(byte) => {
writeln!(f, "{:05x}: LIT {:#04x}", addr, byte)?;
}
Instr::MaskedByte(byte, mask) => {
writeln!(
f,
Expand Down Expand Up @@ -446,8 +467,17 @@ impl Display for InstrSeq {
}
writeln!(f)?;
}
Instr::Byte(byte) => {
writeln!(f, "{:05x}: LIT {:#04x}", addr, byte)?;
Instr::Start => {
writeln!(f, "{:05x}: START", addr)?;
}
Instr::End => {
writeln!(f, "{:05x}: END", addr)?;
}
Instr::WordBoundary => {
writeln!(f, "{:05x}: WORD_BOUNDARY", addr)?;
}
Instr::WordBoundaryNeg => {
writeln!(f, "{:05x}: WORD_BOUNDARY_NEG", addr)?;
}
Instr::Match => {
writeln!(f, "{:05x}: MATCH", addr)?;
Expand Down Expand Up @@ -538,6 +568,13 @@ pub(crate) fn decode_instr(code: &[u8]) -> (Instr, usize) {
let bitmap = &code[2..2 + 32];
(Instr::ClassBitmap(ClassBitmap(bitmap)), 2 + bitmap.len())
}
[OPCODE_PREFIX, Instr::START, ..] => (Instr::Start, 2),
[OPCODE_PREFIX, Instr::END, ..] => (Instr::End, 2),
[OPCODE_PREFIX, Instr::WORD_BOUNDARY, ..] => (Instr::WordBoundary, 2),
[OPCODE_PREFIX, Instr::WORD_BOUNDARY_NEG, ..] => {
(Instr::WordBoundaryNeg, 2)
}

[OPCODE_PREFIX, Instr::MATCH, ..] => (Instr::Match, 2),
[b, ..] => (Instr::Byte(b), 1),
[] => (Instr::Eoi, 0),
Expand All @@ -559,6 +596,9 @@ impl Cache {
pub fn epsilon_closure(
code: &[u8],
start: usize,
backwards: bool,
curr_byte: Option<&u8>,
prev_byte: Option<&u8>,
cache: &mut Cache,
closure: &mut Vec<usize>,
) {
Expand Down Expand Up @@ -609,6 +649,43 @@ pub fn epsilon_closure(
.fibers
.push((fiber as i64 + offset as i64).try_into().unwrap());
}
Instr::Start => {
if backwards {
if curr_byte.is_none() {
cache.fibers.push(next);
}
} else if prev_byte.is_none() {
cache.fibers.push(next);
}
}
Instr::End => {
if backwards {
if prev_byte.is_none() {
cache.fibers.push(next);
}
} else if curr_byte.is_none() {
cache.fibers.push(next);
}
}
Instr::WordBoundary | Instr::WordBoundaryNeg => {
let mut is_match = match (prev_byte, curr_byte) {
(Some(p), Some(c)) => {
p.is_ascii_alphanumeric() != c.is_ascii_alphanumeric()
}
(None, Some(b)) | (Some(b), None) => {
b.is_ascii_alphanumeric()
}
_ => false,
};

if matches!(instr, Instr::WordBoundaryNeg) {
is_match = !is_match;
}

if is_match {
cache.fibers.push(next)
}
}
Instr::Match => {
closure.push(fiber);
}
Expand Down
Loading

0 comments on commit 1e45ca2

Please sign in to comment.