From 422a4b4c9da5a7384f50464d314161e1b27e40a8 Mon Sep 17 00:00:00 2001 From: defuz Date: Thu, 18 Feb 2016 03:03:43 +0200 Subject: [PATCH] Fix #168 and using Arc for named groups --- src/program.rs | 14 ++--- src/re.rs | 140 +++++++++++++++++++++++++++++-------------------- 2 files changed, 90 insertions(+), 64 deletions(-) diff --git a/src/program.rs b/src/program.rs index 91d2d38535..029e48787d 100644 --- a/src/program.rs +++ b/src/program.rs @@ -8,7 +8,9 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. - +use std::{char, cmp}; +use std::collections::HashMap; +use std::sync::Arc; use syntax; @@ -57,7 +59,7 @@ pub struct Program { pub cap_names: Vec>, /// The map of named capture groups. The keys are group names and /// the values are group indices. - pub named_groups: ::std::collections::HashMap, + pub named_groups: Arc>, /// If the regular expression requires a literal prefix in order to have a /// match, that prefix is stored here as a DFA. pub prefixes: Prefix, @@ -89,7 +91,7 @@ impl Program { let (insts_len, ncaps) = (insts.len(), num_captures(&insts)); let create_threads = move || NfaThreads::new(insts_len, ncaps); let create_backtrack = move || BackMachine::new(); - let mut named_groups = ::std::collections::HashMap::new(); + let mut named_groups = HashMap::new(); for (i, name) in cap_names.iter().enumerate() { if let Some(ref name) = *name { named_groups.insert(name.to_owned(), i); @@ -99,7 +101,7 @@ impl Program { original: re.into(), insts: insts, cap_names: cap_names, - named_groups: named_groups, + named_groups: Arc::new(named_groups), prefixes: Prefix::Empty, prefixes_complete: false, anchored_begin: false, @@ -284,7 +286,7 @@ impl Program { for c in (s as u32)..(e as u32 + 1){ for alt in &orig { let mut alt = alt.clone(); - alt.push(::std::char::from_u32(c).unwrap()); + alt.push(char::from_u32(c).unwrap()); alts.push(alt); } } @@ -346,7 +348,7 @@ fn num_captures(insts: &[Inst]) -> usize { let mut n = 0; for inst in insts { if let Inst::Save(ref inst) = *inst { - n = ::std::cmp::max(n, inst.slot + 1) + n = cmp::max(n, inst.slot + 1) } } // There's exactly 2 Save slots for every capture. diff --git a/src/re.rs b/src/re.rs index d3c6f2bed4..5d3490d211 100644 --- a/src/re.rs +++ b/src/re.rs @@ -14,6 +14,8 @@ use std::ops::Index; #[cfg(feature = "pattern")] use std::str::pattern::{Pattern, Searcher, SearchStep}; use std::str::FromStr; +use std::collections::HashMap; +use std::sync::Arc; use program::{Program, MatchEngine}; use syntax; @@ -416,13 +418,13 @@ impl Regex { /// /// The `0`th capture group is always unnamed, so it must always be /// accessed with `at(0)` or `[0]`. - pub fn captures<'r, 't>(&'r self, text: &'t str) -> Option> { + pub fn captures<'t>(&self, text: &'t str) -> Option> { let mut locs = self.alloc_captures(); if exec(self, &mut locs, text, 0) { Some(Captures { - regex: self, text: text, locs: locs, + named_groups: NamedGroups::from_regex(self) }) } else { None @@ -816,6 +818,47 @@ impl<'r, 't> Iterator for RegexSplitsN<'r, 't> { } } +enum NamedGroups { + Native(&'static [(&'static str, usize)]), + Dynamic(Arc>), +} + +impl NamedGroups { + fn from_regex(regex: &Regex) -> NamedGroups { + match *regex { + Regex::Native(ExNative { ref groups, .. }) => + NamedGroups::Native(groups), + Regex::Dynamic(Program { ref named_groups, .. }) => + NamedGroups::Dynamic(named_groups.clone()) + } + } + + fn pos(&self, name: &str) -> Option { + match *self { + NamedGroups::Native(groups) => { + groups.binary_search_by(|&(n, _)| n.cmp(name)) + .ok().map(|i| groups[i].1) + }, + NamedGroups::Dynamic(ref groups) => { + groups.get(name).map(|i| *i) + }, + } + } + + fn iter<'n>(&'n self) -> Box + 'n> { + match *self { + NamedGroups::Native(groups) => { + Box::new(groups.iter().map(|&v| v)) + as Box + 'n> + }, + NamedGroups::Dynamic(ref groups) => { + Box::new(groups.iter().map(|(s, i)| (&s[..], *i))) + as Box + 'n> + }, + } + } +} + /// Captures represents a group of captured strings for a single match. /// /// The 0th capture always corresponds to the entire match. Each subsequent @@ -827,13 +870,13 @@ impl<'r, 't> Iterator for RegexSplitsN<'r, 't> { /// Positions returned from a capture group are always byte indices. /// /// `'t` is the lifetime of the matched text. -pub struct Captures<'r, 't> { - regex: &'r Regex, +pub struct Captures<'t> { text: &'t str, locs: Vec>, + named_groups: NamedGroups, } -impl<'r, 't> Captures<'r, 't> { +impl<'t> Captures<'t> { /// Returns the start and end positions of the Nth capture group. /// Returns `None` if `i` is not a valid capture group or if the capture /// group did not match anything. @@ -862,49 +905,29 @@ impl<'r, 't> Captures<'r, 't> { /// `name` isn't a valid capture group or didn't match anything, then /// `None` is returned. pub fn name(&self, name: &str) -> Option<&'t str> { - match *self.regex { - Regex::Native(ExNative { ref groups, .. }) => { - match groups.binary_search_by(|&(n, _)| n.cmp(name)) { - Ok(i) => self.at(groups[i].1), - Err(_) => None - } - }, - Regex::Dynamic(Program { ref named_groups, .. }) => { - named_groups.get(name).and_then(|i| self.at(*i)) - }, - } + self.named_groups.pos(name).and_then(|i| self.at(i)) } /// Creates an iterator of all the capture groups in order of appearance /// in the regular expression. - pub fn iter<'c>(&'c self) -> SubCaptures<'c, 'r, 't> { + pub fn iter<'c>(&'c self) -> SubCaptures<'c, 't> { SubCaptures { idx: 0, caps: self, } } /// Creates an iterator of all the capture group positions in order of /// appearance in the regular expression. Positions are byte indices /// in terms of the original string matched. - pub fn iter_pos<'c>(&'c self) -> SubCapturesPos<'c, 'r, 't> { - SubCapturesPos { idx: 0, caps: self, } + pub fn iter_pos<'c>(&'c self) -> SubCapturesPos<'c> { + SubCapturesPos { idx: 0, locs: &self.locs } } /// Creates an iterator of all named groups as an tuple with the group /// name and the value. The iterator returns these values in arbitrary /// order. - pub fn iter_named<'c>(&'c self) -> SubCapturesNamed<'c, 'r, 't> { - let iter = match *self.regex { - Regex::Native(ExNative { ref groups, .. }) => { - Box::new(groups.iter().map(|&v| v)) - as Box + 'r> - }, - Regex::Dynamic(Program { ref named_groups, .. }) => { - Box::new(named_groups.iter().map(|(s, i)| (&s[..], *i))) - as Box + 'r> - }, - }; + pub fn iter_named<'c: 't>(&'c self) -> SubCapturesNamed<'c, 't> { SubCapturesNamed { caps: self, - inner: iter + names: self.named_groups.iter() } } @@ -948,7 +971,7 @@ impl<'r, 't> Captures<'r, 't> { /// /// # Panics /// If there is no group at the given index. -impl<'r, 't> Index for Captures<'r, 't> { +impl<'t> Index for Captures<'t> { type Output = str; @@ -962,7 +985,7 @@ impl<'r, 't> Index for Captures<'r, 't> { /// /// # Panics /// If there is no group named by the given value. -impl<'r, 't> Index<&'t str> for Captures<'r, 't> { +impl<'t> Index<&'t str> for Captures<'t> { type Output = str; @@ -979,12 +1002,12 @@ impl<'r, 't> Index<&'t str> for Captures<'r, 't> { /// expression. /// /// `'t` is the lifetime of the matched text. -pub struct SubCaptures<'c, 'r: 'c, 't: 'c> { +pub struct SubCaptures<'c, 't: 'c> { idx: usize, - caps: &'c Captures<'r, 't>, + caps: &'c Captures<'t>, } -impl<'c, 'r, 't> Iterator for SubCaptures<'c, 'r, 't> { +impl<'c, 't> Iterator for SubCaptures<'c, 't> { type Item = Option<&'t str>; fn next(&mut self) -> Option> { @@ -1003,21 +1026,25 @@ impl<'c, 'r, 't> Iterator for SubCaptures<'c, 'r, 't> { /// Positions are byte indices in terms of the original string matched. /// /// `'t` is the lifetime of the matched text. -pub struct SubCapturesPos<'c, 'r: 'c, 't: 'c> { +pub struct SubCapturesPos<'c> { idx: usize, - caps: &'c Captures<'r, 't>, + locs: &'c [Option] } -impl<'c, 'r, 't> Iterator for SubCapturesPos<'c, 'r, 't> { +impl<'c> Iterator for SubCapturesPos<'c> { type Item = Option<(usize, usize)>; fn next(&mut self) -> Option> { - if self.idx < self.caps.len() { - self.idx += 1; - Some(self.caps.pos(self.idx - 1)) - } else { - None + if self.idx >= self.locs.len() { + return None } + let r = match (self.locs[self.idx], self.locs[self.idx + 1]) { + (Some(s), Some(e)) => Some((s, e)), + (None, None) => None, + _ => unreachable!() + }; + self.idx += 2; + Some(r) } } @@ -1025,19 +1052,16 @@ impl<'c, 'r, 't> Iterator for SubCapturesPos<'c, 'r, 't> { /// name and the value. /// /// `'t` is the lifetime of the matched text. -pub struct SubCapturesNamed<'c, 'r: 'c, 't: 'c> { - caps: &'c Captures<'r, 't>, - inner: Box + 'r>, +pub struct SubCapturesNamed<'c, 't: 'c> { + caps: &'c Captures<'t>, + names: Box + 'c>, } -impl<'c, 'r, 't> Iterator for SubCapturesNamed<'c, 'r, 't> { - type Item = (&'r str, Option<&'t str>); +impl<'c, 't: 'c> Iterator for SubCapturesNamed<'c, 't> { + type Item = (&'c str, Option<&'t str>); - fn next(&mut self) -> Option<(&'r str, Option<&'t str>)> { - match self.inner.next() { - Some((name, pos)) => Some((name, self.caps.at(pos))), - None => None - } + fn next(&mut self) -> Option<(&'c str, Option<&'t str>)> { + self.names.next().map(|(name, pos)| (name, self.caps.at(pos))) } } @@ -1056,9 +1080,9 @@ pub struct FindCaptures<'r, 't> { } impl<'r, 't> Iterator for FindCaptures<'r, 't> { - type Item = Captures<'r, 't>; + type Item = Captures<'t>; - fn next(&mut self) -> Option> { + fn next(&mut self) -> Option> { if self.last_end > self.search.len() { return None } @@ -1083,9 +1107,9 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> { self.skip_next_empty = true; } Some(Captures { - regex: self.re, text: self.search, - locs: caps + locs: caps, + named_groups: NamedGroups::from_regex(self.re), }) } }