Skip to content

Commit

Permalink
syntax: allow Unicode in capture names
Browse files Browse the repository at this point in the history
This changes the rules for capture names to be much less restrictive.
Namely, the requirements are now:

1. Must begin with an `_` or any alphabetic codepoint.
2. After the first codepoint, the name may contain any sequence of
   alpha-numeric codepoints along with `_`, `.`, `[` and `]`.

Closes #595
  • Loading branch information
BurntSushi committed Mar 15, 2023
1 parent f6b1cf4 commit e47398b
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 8 deletions.
113 changes: 108 additions & 5 deletions regex-syntax/src/ast/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,11 @@ fn is_hex(c: char) -> bool {
/// If `first` is true, then `c` is treated as the first character in the
/// group name (which must be alphabetic or underscore).
fn is_capture_char(c: char, first: bool) -> bool {
c == '_'
|| (!first
&& (('0' <= c && c <= '9') || c == '.' || c == '[' || c == ']'))
|| ('A' <= c && c <= 'Z')
|| ('a' <= c && c <= 'z')
if first {
c == '_' || c.is_alphabetic()
} else {
c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
}
}

/// A builder for a regular expression parser.
Expand Down Expand Up @@ -3910,6 +3910,55 @@ bar
}))
);

assert_eq!(
parser("(?P<a¾>)").parse(),
Ok(Ast::Group(ast::Group {
span: Span::new(
Position::new(0, 1, 1),
Position::new(9, 1, 9),
),
kind: ast::GroupKind::CaptureName {
starts_with_p: true,
name: ast::CaptureName {
span: Span::new(
Position::new(4, 1, 5),
Position::new(7, 1, 7),
),
name: s("a¾"),
index: 1,
}
},
ast: Box::new(Ast::Empty(Span::new(
Position::new(8, 1, 8),
Position::new(8, 1, 8),
))),
}))
);
assert_eq!(
parser("(?P<名字>)").parse(),
Ok(Ast::Group(ast::Group {
span: Span::new(
Position::new(0, 1, 1),
Position::new(12, 1, 9),
),
kind: ast::GroupKind::CaptureName {
starts_with_p: true,
name: ast::CaptureName {
span: Span::new(
Position::new(4, 1, 5),
Position::new(10, 1, 7),
),
name: s("名字"),
index: 1,
}
},
ast: Box::new(Ast::Empty(Span::new(
Position::new(11, 1, 8),
Position::new(11, 1, 8),
))),
}))
);

assert_eq!(
parser("(?P<").parse().unwrap_err(),
TestError {
Expand Down Expand Up @@ -3968,6 +4017,60 @@ bar
},
}
);
assert_eq!(
parser("(?P<5>)").parse().unwrap_err(),
TestError {
span: span(4..5),
kind: ast::ErrorKind::GroupNameInvalid,
}
);
assert_eq!(
parser("(?P<5a>)").parse().unwrap_err(),
TestError {
span: span(4..5),
kind: ast::ErrorKind::GroupNameInvalid,
}
);
assert_eq!(
parser("(?P<¾>)").parse().unwrap_err(),
TestError {
span: Span::new(
Position::new(4, 1, 5),
Position::new(6, 1, 6),
),
kind: ast::ErrorKind::GroupNameInvalid,
}
);
assert_eq!(
parser("(?P<¾a>)").parse().unwrap_err(),
TestError {
span: Span::new(
Position::new(4, 1, 5),
Position::new(6, 1, 6),
),
kind: ast::ErrorKind::GroupNameInvalid,
}
);
assert_eq!(
parser("(?P<☃>)").parse().unwrap_err(),
TestError {
span: Span::new(
Position::new(4, 1, 5),
Position::new(7, 1, 6),
),
kind: ast::ErrorKind::GroupNameInvalid,
}
);
assert_eq!(
parser("(?P<a☃>)").parse().unwrap_err(),
TestError {
span: Span::new(
Position::new(5, 1, 6),
Position::new(8, 1, 7),
),
kind: ast::ErrorKind::GroupNameInvalid,
}
);
}

#[test]
Expand Down
10 changes: 9 additions & 1 deletion src/expand.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,8 @@ fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
})
}

/// Returns true if and only if the given byte is allowed in a capture name.
/// Returns true if and only if the given byte is allowed in a capture name
/// written in non-brace form.
fn is_valid_cap_letter(b: u8) -> bool {
match b {
b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true,
Expand Down Expand Up @@ -236,4 +237,11 @@ mod tests {
find!(find_cap_ref17, "$x_$y", c!("x_", 3));
find!(find_cap_ref18, "${#}", c!("#", 4));
find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
find!(find_cap_ref20, "${¾}", c!("¾", 5));
find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
find!(find_cap_ref23, "${☃}", c!("☃", 6));
find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
find!(find_cap_ref26, "${名字}", c!("名字", 9));
}
10 changes: 8 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -360,13 +360,19 @@ regex matches `abc` at positions `0`, `1`, `2` and `3`.
<pre class="rust">
(exp) numbered capture group (indexed by opening parenthesis)
(?P&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
(?&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
(?P&lt;name&gt;exp) named (also numbered) capture group (names must be alpha-numeric)
(?&lt;name&gt;exp) named (also numbered) capture group (names must be alpha-numeric)
(?:exp) non-capturing group
(?flags) set flags within current group
(?flags:exp) set flags for exp (non-capturing)
</pre>
Capture group names may contain any sequence of alpha-numeric Unicode
codepoints, as well as `.`, `_`, `[` and `]`. Names must start with either an `_` or
an alphabetic codepoint. Alphabetic codepoints correspond to the `Alphabetic`
Unicode property, while numeric codepoints correspond to the union of the
`Decimal_Number`, `Letter_Number` and `Other_Number` general categories.
Flags are each a single character. For example, `(?x)` sets the flag `x`
and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at
the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets
Expand Down

0 comments on commit e47398b

Please sign in to comment.