-
Notifications
You must be signed in to change notification settings - Fork 449
/
parse.rs
5930 lines (5689 loc) · 200 KB
/
parse.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*!
This module provides a regular expression parser.
*/
use std::borrow::Borrow;
use std::cell::{Cell, RefCell};
use std::mem;
use std::result;
use crate::ast::{self, Ast, Position, Span};
use crate::either::Either;
use crate::is_meta_character;
type Result<T> = result::Result<T, ast::Error>;
/// A primitive is an expression with no sub-expressions. This includes
/// literals, assertions and non-set character classes. This representation
/// is used as intermediate state in the parser.
///
/// This does not include ASCII character classes, since they can only appear
/// within a set character class.
#[derive(Clone, Debug, Eq, PartialEq)]
enum Primitive {
Literal(ast::Literal),
Assertion(ast::Assertion),
Dot(Span),
Perl(ast::ClassPerl),
Unicode(ast::ClassUnicode),
}
impl Primitive {
/// Return the span of this primitive.
fn span(&self) -> &Span {
match *self {
Primitive::Literal(ref x) => &x.span,
Primitive::Assertion(ref x) => &x.span,
Primitive::Dot(ref span) => span,
Primitive::Perl(ref x) => &x.span,
Primitive::Unicode(ref x) => &x.span,
}
}
/// Convert this primitive into a proper AST.
fn into_ast(self) -> Ast {
match self {
Primitive::Literal(lit) => Ast::Literal(lit),
Primitive::Assertion(assert) => Ast::Assertion(assert),
Primitive::Dot(span) => Ast::Dot(span),
Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)),
Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)),
}
}
/// Convert this primitive into an item in a character class.
///
/// If this primitive is not a legal item (i.e., an assertion or a dot),
/// then return an error.
fn into_class_set_item<P: Borrow<Parser>>(
self,
p: &ParserI<'_, P>,
) -> Result<ast::ClassSetItem> {
use self::Primitive::*;
use crate::ast::ClassSetItem;
match self {
Literal(lit) => Ok(ClassSetItem::Literal(lit)),
Perl(cls) => Ok(ClassSetItem::Perl(cls)),
Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
}
}
/// Convert this primitive into a literal in a character class. In
/// particular, literals are the only valid items that can appear in
/// ranges.
///
/// If this primitive is not a legal item (i.e., a class, assertion or a
/// dot), then return an error.
fn into_class_literal<P: Borrow<Parser>>(
self,
p: &ParserI<'_, P>,
) -> Result<ast::Literal> {
use self::Primitive::*;
match self {
Literal(lit) => Ok(lit),
x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
}
}
}
/// Returns true if the given character is a hexadecimal digit.
fn is_hex(c: char) -> bool {
('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
}
/// Returns true if the given character is a valid in a capture group name.
///
/// If `first` is true, then `c` is treated as the first character in the
/// group name (which must be alphabetic or underscore).
fn is_capture_char(c: char, first: bool) -> bool {
c == '_'
|| (!first
&& (('0' <= c && c <= '9') || c == '.' || c == '[' || c == ']'))
|| ('A' <= c && c <= 'Z')
|| ('a' <= c && c <= 'z')
}
/// A builder for a regular expression parser.
///
/// This builder permits modifying configuration options for the parser.
#[derive(Clone, Debug)]
pub struct ParserBuilder {
ignore_whitespace: bool,
nest_limit: u32,
octal: bool,
}
impl Default for ParserBuilder {
fn default() -> ParserBuilder {
ParserBuilder::new()
}
}
impl ParserBuilder {
/// Create a new parser builder with a default configuration.
pub fn new() -> ParserBuilder {
ParserBuilder {
ignore_whitespace: false,
nest_limit: 250,
octal: false,
}
}
/// Build a parser from this configuration with the given pattern.
pub fn build(&self) -> Parser {
Parser {
pos: Cell::new(Position { offset: 0, line: 1, column: 1 }),
capture_index: Cell::new(0),
nest_limit: self.nest_limit,
octal: self.octal,
initial_ignore_whitespace: self.ignore_whitespace,
ignore_whitespace: Cell::new(self.ignore_whitespace),
comments: RefCell::new(vec![]),
stack_group: RefCell::new(vec![]),
stack_class: RefCell::new(vec![]),
capture_names: RefCell::new(vec![]),
scratch: RefCell::new(String::new()),
}
}
/// Set the nesting limit for this parser.
///
/// The nesting limit controls how deep the abstract syntax tree is allowed
/// to be. If the AST exceeds the given limit (e.g., with too many nested
/// groups), then an error is returned by the parser.
///
/// The purpose of this limit is to act as a heuristic to prevent stack
/// overflow for consumers that do structural induction on an `Ast` using
/// explicit recursion. While this crate never does this (instead using
/// constant stack space and moving the call stack to the heap), other
/// crates may.
///
/// This limit is not checked until the entire Ast is parsed. Therefore,
/// if callers want to put a limit on the amount of heap space used, then
/// they should impose a limit on the length, in bytes, of the concrete
/// pattern string. In particular, this is viable since this parser
/// implementation will limit itself to heap space proportional to the
/// length of the pattern string.
///
/// Note that a nest limit of `0` will return a nest limit error for most
/// patterns but not all. For example, a nest limit of `0` permits `a` but
/// not `ab`, since `ab` requires a concatenation, which results in a nest
/// depth of `1`. In general, a nest limit is not something that manifests
/// in an obvious way in the concrete syntax, therefore, it should not be
/// used in a granular way.
pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
self.nest_limit = limit;
self
}
/// Whether to support octal syntax or not.
///
/// Octal syntax is a little-known way of uttering Unicode codepoints in
/// a regular expression. For example, `a`, `\x61`, `\u0061` and
/// `\141` are all equivalent regular expressions, where the last example
/// shows octal syntax.
///
/// While supporting octal syntax isn't in and of itself a problem, it does
/// make good error messages harder. That is, in PCRE based regex engines,
/// syntax like `\0` invokes a backreference, which is explicitly
/// unsupported in Rust's regex engine. However, many users expect it to
/// be supported. Therefore, when octal support is disabled, the error
/// message will explicitly mention that backreferences aren't supported.
///
/// Octal syntax is disabled by default.
pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
self.octal = yes;
self
}
/// Enable verbose mode in the regular expression.
///
/// When enabled, verbose mode permits insignificant whitespace in many
/// places in the regular expression, as well as comments. Comments are
/// started using `#` and continue until the end of the line.
///
/// By default, this is disabled. It may be selectively enabled in the
/// regular expression by using the `x` flag regardless of this setting.
pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
self.ignore_whitespace = yes;
self
}
}
/// A regular expression parser.
///
/// This parses a string representation of a regular expression into an
/// abstract syntax tree. The size of the tree is proportional to the length
/// of the regular expression pattern.
///
/// A `Parser` can be configured in more detail via a
/// [`ParserBuilder`](struct.ParserBuilder.html).
#[derive(Clone, Debug)]
pub struct Parser {
/// The current position of the parser.
pos: Cell<Position>,
/// The current capture index.
capture_index: Cell<u32>,
/// The maximum number of open parens/brackets allowed. If the parser
/// exceeds this number, then an error is returned.
nest_limit: u32,
/// Whether to support octal syntax or not. When `false`, the parser will
/// return an error helpfully pointing out that backreferences are not
/// supported.
octal: bool,
/// The initial setting for `ignore_whitespace` as provided by
/// `ParserBuilder`. It is used when resetting the parser's state.
initial_ignore_whitespace: bool,
/// Whether whitespace should be ignored. When enabled, comments are
/// also permitted.
ignore_whitespace: Cell<bool>,
/// A list of comments, in order of appearance.
comments: RefCell<Vec<ast::Comment>>,
/// A stack of grouped sub-expressions, including alternations.
stack_group: RefCell<Vec<GroupState>>,
/// A stack of nested character classes. This is only non-empty when
/// parsing a class.
stack_class: RefCell<Vec<ClassState>>,
/// A sorted sequence of capture names. This is used to detect duplicate
/// capture names and report an error if one is detected.
capture_names: RefCell<Vec<ast::CaptureName>>,
/// A scratch buffer used in various places. Mostly this is used to
/// accumulate relevant characters from parts of a pattern.
scratch: RefCell<String>,
}
/// ParserI is the internal parser implementation.
///
/// We use this separate type so that we can carry the provided pattern string
/// along with us. In particular, a `Parser` internal state is not tied to any
/// one pattern, but `ParserI` is.
///
/// This type also lets us use `ParserI<&Parser>` in production code while
/// retaining the convenience of `ParserI<Parser>` for tests, which sometimes
/// work against the internal interface of the parser.
#[derive(Clone, Debug)]
struct ParserI<'s, P> {
/// The parser state/configuration.
parser: P,
/// The full regular expression provided by the user.
pattern: &'s str,
}
/// GroupState represents a single stack frame while parsing nested groups
/// and alternations. Each frame records the state up to an opening parenthesis
/// or a alternating bracket `|`.
#[derive(Clone, Debug)]
enum GroupState {
/// This state is pushed whenever an opening group is found.
Group {
/// The concatenation immediately preceding the opening group.
concat: ast::Concat,
/// The group that has been opened. Its sub-AST is always empty.
group: ast::Group,
/// Whether this group has the `x` flag enabled or not.
ignore_whitespace: bool,
},
/// This state is pushed whenever a new alternation branch is found. If
/// an alternation branch is found and this state is at the top of the
/// stack, then this state should be modified to include the new
/// alternation.
Alternation(ast::Alternation),
}
/// ClassState represents a single stack frame while parsing character classes.
/// Each frame records the state up to an intersection, difference, symmetric
/// difference or nested class.
///
/// Note that a parser's character class stack is only non-empty when parsing
/// a character class. In all other cases, it is empty.
#[derive(Clone, Debug)]
enum ClassState {
/// This state is pushed whenever an opening bracket is found.
Open {
/// The union of class items immediately preceding this class.
union: ast::ClassSetUnion,
/// The class that has been opened. Typically this just corresponds
/// to the `[`, but it can also include `[^` since `^` indicates
/// negation of the class.
set: ast::ClassBracketed,
},
/// This state is pushed when a operator is seen. When popped, the stored
/// set becomes the left hand side of the operator.
Op {
/// The type of the operation, i.e., &&, -- or ~~.
kind: ast::ClassSetBinaryOpKind,
/// The left-hand side of the operator.
lhs: ast::ClassSet,
},
}
impl Parser {
/// Create a new parser with a default configuration.
///
/// The parser can be run with either the `parse` or `parse_with_comments`
/// methods. The parse methods return an abstract syntax tree.
///
/// To set configuration options on the parser, use
/// [`ParserBuilder`](struct.ParserBuilder.html).
pub fn new() -> Parser {
ParserBuilder::new().build()
}
/// Parse the regular expression into an abstract syntax tree.
pub fn parse(&mut self, pattern: &str) -> Result<Ast> {
ParserI::new(self, pattern).parse()
}
/// Parse the regular expression and return an abstract syntax tree with
/// all of the comments found in the pattern.
pub fn parse_with_comments(
&mut self,
pattern: &str,
) -> Result<ast::WithComments> {
ParserI::new(self, pattern).parse_with_comments()
}
/// Reset the internal state of a parser.
///
/// This is called at the beginning of every parse. This prevents the
/// parser from running with inconsistent state (say, if a previous
/// invocation returned an error and the parser is reused).
fn reset(&self) {
// These settings should be in line with the construction
// in `ParserBuilder::build`.
self.pos.set(Position { offset: 0, line: 1, column: 1 });
self.ignore_whitespace.set(self.initial_ignore_whitespace);
self.comments.borrow_mut().clear();
self.stack_group.borrow_mut().clear();
self.stack_class.borrow_mut().clear();
}
}
impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
/// Build an internal parser from a parser configuration and a pattern.
fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> {
ParserI { parser, pattern }
}
/// Return a reference to the parser state.
fn parser(&self) -> &Parser {
self.parser.borrow()
}
/// Return a reference to the pattern being parsed.
fn pattern(&self) -> &str {
self.pattern.borrow()
}
/// Create a new error with the given span and error type.
fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error {
ast::Error { kind, pattern: self.pattern().to_string(), span }
}
/// Return the current offset of the parser.
///
/// The offset starts at `0` from the beginning of the regular expression
/// pattern string.
fn offset(&self) -> usize {
self.parser().pos.get().offset
}
/// Return the current line number of the parser.
///
/// The line number starts at `1`.
fn line(&self) -> usize {
self.parser().pos.get().line
}
/// Return the current column of the parser.
///
/// The column number starts at `1` and is reset whenever a `\n` is seen.
fn column(&self) -> usize {
self.parser().pos.get().column
}
/// Return the next capturing index. Each subsequent call increments the
/// internal index.
///
/// The span given should correspond to the location of the opening
/// parenthesis.
///
/// If the capture limit is exceeded, then an error is returned.
fn next_capture_index(&self, span: Span) -> Result<u32> {
let current = self.parser().capture_index.get();
let i = current.checked_add(1).ok_or_else(|| {
self.error(span, ast::ErrorKind::CaptureLimitExceeded)
})?;
self.parser().capture_index.set(i);
Ok(i)
}
/// Adds the given capture name to this parser. If this capture name has
/// already been used, then an error is returned.
fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
let mut names = self.parser().capture_names.borrow_mut();
match names
.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str())
{
Err(i) => {
names.insert(i, cap.clone());
Ok(())
}
Ok(i) => Err(self.error(
cap.span,
ast::ErrorKind::GroupNameDuplicate { original: names[i].span },
)),
}
}
/// Return whether the parser should ignore whitespace or not.
fn ignore_whitespace(&self) -> bool {
self.parser().ignore_whitespace.get()
}
/// Return the character at the current position of the parser.
///
/// This panics if the current position does not point to a valid char.
fn char(&self) -> char {
self.char_at(self.offset())
}
/// Return the character at the given position.
///
/// This panics if the given position does not point to a valid char.
fn char_at(&self, i: usize) -> char {
self.pattern()[i..]
.chars()
.next()
.unwrap_or_else(|| panic!("expected char at offset {}", i))
}
/// Bump the parser to the next Unicode scalar value.
///
/// If the end of the input has been reached, then `false` is returned.
fn bump(&self) -> bool {
if self.is_eof() {
return false;
}
let Position { mut offset, mut line, mut column } = self.pos();
if self.char() == '\n' {
line = line.checked_add(1).unwrap();
column = 1;
} else {
column = column.checked_add(1).unwrap();
}
offset += self.char().len_utf8();
self.parser().pos.set(Position { offset, line, column });
self.pattern()[self.offset()..].chars().next().is_some()
}
/// If the substring starting at the current position of the parser has
/// the given prefix, then bump the parser to the character immediately
/// following the prefix and return true. Otherwise, don't bump the parser
/// and return false.
fn bump_if(&self, prefix: &str) -> bool {
if self.pattern()[self.offset()..].starts_with(prefix) {
for _ in 0..prefix.chars().count() {
self.bump();
}
true
} else {
false
}
}
/// Returns true if and only if the parser is positioned at a look-around
/// prefix. The conditions under which this returns true must always
/// correspond to a regular expression that would otherwise be consider
/// invalid.
///
/// This should only be called immediately after parsing the opening of
/// a group or a set of flags.
fn is_lookaround_prefix(&self) -> bool {
self.bump_if("?=")
|| self.bump_if("?!")
|| self.bump_if("?<=")
|| self.bump_if("?<!")
}
/// Bump the parser, and if the `x` flag is enabled, bump through any
/// subsequent spaces. Return true if and only if the parser is not at
/// EOF.
fn bump_and_bump_space(&self) -> bool {
if !self.bump() {
return false;
}
self.bump_space();
!self.is_eof()
}
/// If the `x` flag is enabled (i.e., whitespace insensitivity with
/// comments), then this will advance the parser through all whitespace
/// and comments to the next non-whitespace non-comment byte.
///
/// If the `x` flag is disabled, then this is a no-op.
///
/// This should be used selectively throughout the parser where
/// arbitrary whitespace is permitted when the `x` flag is enabled. For
/// example, `{ 5 , 6}` is equivalent to `{5,6}`.
fn bump_space(&self) {
if !self.ignore_whitespace() {
return;
}
while !self.is_eof() {
if self.char().is_whitespace() {
self.bump();
} else if self.char() == '#' {
let start = self.pos();
let mut comment_text = String::new();
self.bump();
while !self.is_eof() {
let c = self.char();
self.bump();
if c == '\n' {
break;
}
comment_text.push(c);
}
let comment = ast::Comment {
span: Span::new(start, self.pos()),
comment: comment_text,
};
self.parser().comments.borrow_mut().push(comment);
} else {
break;
}
}
}
/// Peek at the next character in the input without advancing the parser.
///
/// If the input has been exhausted, then this returns `None`.
fn peek(&self) -> Option<char> {
if self.is_eof() {
return None;
}
self.pattern()[self.offset() + self.char().len_utf8()..].chars().next()
}
/// Like peek, but will ignore spaces when the parser is in whitespace
/// insensitive mode.
fn peek_space(&self) -> Option<char> {
if !self.ignore_whitespace() {
return self.peek();
}
if self.is_eof() {
return None;
}
let mut start = self.offset() + self.char().len_utf8();
let mut in_comment = false;
for (i, c) in self.pattern()[start..].char_indices() {
if c.is_whitespace() {
continue;
} else if !in_comment && c == '#' {
in_comment = true;
} else if in_comment && c == '\n' {
in_comment = false;
} else {
start += i;
break;
}
}
self.pattern()[start..].chars().next()
}
/// Returns true if the next call to `bump` would return false.
fn is_eof(&self) -> bool {
self.offset() == self.pattern().len()
}
/// Return the current position of the parser, which includes the offset,
/// line and column.
fn pos(&self) -> Position {
self.parser().pos.get()
}
/// Create a span at the current position of the parser. Both the start
/// and end of the span are set.
fn span(&self) -> Span {
Span::splat(self.pos())
}
/// Create a span that covers the current character.
fn span_char(&self) -> Span {
let mut next = Position {
offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
line: self.line(),
column: self.column().checked_add(1).unwrap(),
};
if self.char() == '\n' {
next.line += 1;
next.column = 1;
}
Span::new(self.pos(), next)
}
/// Parse and push a single alternation on to the parser's internal stack.
/// If the top of the stack already has an alternation, then add to that
/// instead of pushing a new one.
///
/// The concatenation given corresponds to a single alternation branch.
/// The concatenation returned starts the next branch and is empty.
///
/// This assumes the parser is currently positioned at `|` and will advance
/// the parser to the character following `|`.
#[inline(never)]
fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
assert_eq!(self.char(), '|');
concat.span.end = self.pos();
self.push_or_add_alternation(concat);
self.bump();
Ok(ast::Concat { span: self.span(), asts: vec![] })
}
/// Pushes or adds the given branch of an alternation to the parser's
/// internal stack of state.
fn push_or_add_alternation(&self, concat: ast::Concat) {
use self::GroupState::*;
let mut stack = self.parser().stack_group.borrow_mut();
if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
alts.asts.push(concat.into_ast());
return;
}
stack.push(Alternation(ast::Alternation {
span: Span::new(concat.span.start, self.pos()),
asts: vec![concat.into_ast()],
}));
}
/// Parse and push a group AST (and its parent concatenation) on to the
/// parser's internal stack. Return a fresh concatenation corresponding
/// to the group's sub-AST.
///
/// If a set of flags was found (with no group), then the concatenation
/// is returned with that set of flags added.
///
/// This assumes that the parser is currently positioned on the opening
/// parenthesis. It advances the parser to the character at the start
/// of the sub-expression (or adjoining expression).
///
/// If there was a problem parsing the start of the group, then an error
/// is returned.
#[inline(never)]
fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
assert_eq!(self.char(), '(');
match self.parse_group()? {
Either::Left(set) => {
let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
if let Some(v) = ignore {
self.parser().ignore_whitespace.set(v);
}
concat.asts.push(Ast::Flags(set));
Ok(concat)
}
Either::Right(group) => {
let old_ignore_whitespace = self.ignore_whitespace();
let new_ignore_whitespace = group
.flags()
.and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
.unwrap_or(old_ignore_whitespace);
self.parser().stack_group.borrow_mut().push(
GroupState::Group {
concat,
group,
ignore_whitespace: old_ignore_whitespace,
},
);
self.parser().ignore_whitespace.set(new_ignore_whitespace);
Ok(ast::Concat { span: self.span(), asts: vec![] })
}
}
}
/// Pop a group AST from the parser's internal stack and set the group's
/// AST to the given concatenation. Return the concatenation containing
/// the group.
///
/// This assumes that the parser is currently positioned on the closing
/// parenthesis and advances the parser to the character following the `)`.
///
/// If no such group could be popped, then an unopened group error is
/// returned.
#[inline(never)]
fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> {
use self::GroupState::*;
assert_eq!(self.char(), ')');
let mut stack = self.parser().stack_group.borrow_mut();
let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack
.pop()
{
Some(Group { concat, group, ignore_whitespace }) => {
(concat, group, ignore_whitespace, None)
}
Some(Alternation(alt)) => match stack.pop() {
Some(Group { concat, group, ignore_whitespace }) => {
(concat, group, ignore_whitespace, Some(alt))
}
None | Some(Alternation(_)) => {
return Err(self.error(
self.span_char(),
ast::ErrorKind::GroupUnopened,
));
}
},
None => {
return Err(self
.error(self.span_char(), ast::ErrorKind::GroupUnopened));
}
};
self.parser().ignore_whitespace.set(ignore_whitespace);
group_concat.span.end = self.pos();
self.bump();
group.span.end = self.pos();
match alt {
Some(mut alt) => {
alt.span.end = group_concat.span.end;
alt.asts.push(group_concat.into_ast());
group.ast = Box::new(alt.into_ast());
}
None => {
group.ast = Box::new(group_concat.into_ast());
}
}
prior_concat.asts.push(Ast::Group(group));
Ok(prior_concat)
}
/// Pop the last state from the parser's internal stack, if it exists, and
/// add the given concatenation to it. There either must be no state or a
/// single alternation item on the stack. Any other scenario produces an
/// error.
///
/// This assumes that the parser has advanced to the end.
#[inline(never)]
fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
concat.span.end = self.pos();
let mut stack = self.parser().stack_group.borrow_mut();
let ast = match stack.pop() {
None => Ok(concat.into_ast()),
Some(GroupState::Alternation(mut alt)) => {
alt.span.end = self.pos();
alt.asts.push(concat.into_ast());
Ok(Ast::Alternation(alt))
}
Some(GroupState::Group { group, .. }) => {
return Err(
self.error(group.span, ast::ErrorKind::GroupUnclosed)
);
}
};
// If we try to pop again, there should be nothing.
match stack.pop() {
None => ast,
Some(GroupState::Alternation(_)) => {
// This unreachable is unfortunate. This case can't happen
// because the only way we can be here is if there were two
// `GroupState::Alternation`s adjacent in the parser's stack,
// which we guarantee to never happen because we never push a
// `GroupState::Alternation` if one is already at the top of
// the stack.
unreachable!()
}
Some(GroupState::Group { group, .. }) => {
Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
}
}
}
/// Parse the opening of a character class and push the current class
/// parsing context onto the parser's stack. This assumes that the parser
/// is positioned at an opening `[`. The given union should correspond to
/// the union of set items built up before seeing the `[`.
///
/// If there was a problem parsing the opening of the class, then an error
/// is returned. Otherwise, a new union of set items for the class is
/// returned (which may be populated with either a `]` or a `-`).
#[inline(never)]
fn push_class_open(
&self,
parent_union: ast::ClassSetUnion,
) -> Result<ast::ClassSetUnion> {
assert_eq!(self.char(), '[');
let (nested_set, nested_union) = self.parse_set_class_open()?;
self.parser()
.stack_class
.borrow_mut()
.push(ClassState::Open { union: parent_union, set: nested_set });
Ok(nested_union)
}
/// Parse the end of a character class set and pop the character class
/// parser stack. The union given corresponds to the last union built
/// before seeing the closing `]`. The union returned corresponds to the
/// parent character class set with the nested class added to it.
///
/// This assumes that the parser is positioned at a `]` and will advance
/// the parser to the byte immediately following the `]`.
///
/// If the stack is empty after popping, then this returns the final
/// "top-level" character class AST (where a "top-level" character class
/// is one that is not nested inside any other character class).
///
/// If there is no corresponding opening bracket on the parser's stack,
/// then an error is returned.
#[inline(never)]
fn pop_class(
&self,
nested_union: ast::ClassSetUnion,
) -> Result<Either<ast::ClassSetUnion, ast::Class>> {
assert_eq!(self.char(), ']');
let item = ast::ClassSet::Item(nested_union.into_item());
let prevset = self.pop_class_op(item);
let mut stack = self.parser().stack_class.borrow_mut();
match stack.pop() {
None => {
// We can never observe an empty stack:
//
// 1) We are guaranteed to start with a non-empty stack since
// the character class parser is only initiated when it sees
// a `[`.
// 2) If we ever observe an empty stack while popping after
// seeing a `]`, then we signal the character class parser
// to terminate.
panic!("unexpected empty character class stack")
}
Some(ClassState::Op { .. }) => {
// This panic is unfortunate, but this case is impossible
// since we already popped the Op state if one exists above.
// Namely, every push to the class parser stack is guarded by
// whether an existing Op is already on the top of the stack.
// If it is, the existing Op is modified. That is, the stack
// can never have consecutive Op states.
panic!("unexpected ClassState::Op")
}
Some(ClassState::Open { mut union, mut set }) => {
self.bump();
set.span.end = self.pos();
set.kind = prevset;
if stack.is_empty() {
Ok(Either::Right(ast::Class::Bracketed(set)))
} else {
union.push(ast::ClassSetItem::Bracketed(Box::new(set)));
Ok(Either::Left(union))
}
}
}
}
/// Return an "unclosed class" error whose span points to the most
/// recently opened class.
///
/// This should only be called while parsing a character class.
#[inline(never)]
fn unclosed_class_error(&self) -> ast::Error {
for state in self.parser().stack_class.borrow().iter().rev() {
if let ClassState::Open { ref set, .. } = *state {
return self.error(set.span, ast::ErrorKind::ClassUnclosed);
}
}
// We are guaranteed to have a non-empty stack with at least
// one open bracket, so we should never get here.
panic!("no open character class found")
}
/// Push the current set of class items on to the class parser's stack as
/// the left hand side of the given operator.
///
/// A fresh set union is returned, which should be used to build the right
/// hand side of this operator.
#[inline(never)]
fn push_class_op(
&self,
next_kind: ast::ClassSetBinaryOpKind,
next_union: ast::ClassSetUnion,
) -> ast::ClassSetUnion {
let item = ast::ClassSet::Item(next_union.into_item());
let new_lhs = self.pop_class_op(item);
self.parser()
.stack_class
.borrow_mut()
.push(ClassState::Op { kind: next_kind, lhs: new_lhs });
ast::ClassSetUnion { span: self.span(), items: vec![] }
}
/// Pop a character class set from the character class parser stack. If the
/// top of the stack is just an item (not an operation), then return the
/// given set unchanged. If the top of the stack is an operation, then the
/// given set will be used as the rhs of the operation on the top of the
/// stack. In that case, the binary operation is returned as a set.
#[inline(never)]
fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet {
let mut stack = self.parser().stack_class.borrow_mut();
let (kind, lhs) = match stack.pop() {
Some(ClassState::Op { kind, lhs }) => (kind, lhs),
Some(state @ ClassState::Open { .. }) => {
stack.push(state);
return rhs;
}
None => unreachable!(),
};
let span = Span::new(lhs.span().start, rhs.span().end);
ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
span,
kind,
lhs: Box::new(lhs),
rhs: Box::new(rhs),
})
}
}
impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
/// Parse the regular expression into an abstract syntax tree.
fn parse(&self) -> Result<Ast> {
self.parse_with_comments().map(|astc| astc.ast)
}
/// Parse the regular expression and return an abstract syntax tree with
/// all of the comments found in the pattern.
fn parse_with_comments(&self) -> Result<ast::WithComments> {
assert_eq!(self.offset(), 0, "parser can only be used once");
self.parser().reset();
let mut concat = ast::Concat { span: self.span(), asts: vec![] };
loop {
self.bump_space();
if self.is_eof() {
break;
}
match self.char() {
'(' => concat = self.push_group(concat)?,
')' => concat = self.pop_group(concat)?,
'|' => concat = self.push_alternate(concat)?,
'[' => {
let class = self.parse_set_class()?;
concat.asts.push(Ast::Class(class));
}
'?' => {
concat = self.parse_uncounted_repetition(
concat,
ast::RepetitionKind::ZeroOrOne,
)?;
}
'*' => {
concat = self.parse_uncounted_repetition(
concat,
ast::RepetitionKind::ZeroOrMore,
)?;
}
'+' => {
concat = self.parse_uncounted_repetition(
concat,
ast::RepetitionKind::OneOrMore,
)?;
}
'{' => {
concat = self.parse_counted_repetition(concat)?;
}
_ => concat.asts.push(self.parse_primitive()?.into_ast()),
}
}
let ast = self.pop_group_end(concat)?;
NestLimiter::new(self).check(&ast)?;
Ok(ast::WithComments {