Skip to content

Commit

Permalink
perf: use memchr (#8)
Browse files Browse the repository at this point in the history
  • Loading branch information
jrandolf authored Jan 27, 2023
1 parent fb08d65 commit 2337137
Show file tree
Hide file tree
Showing 9 changed files with 57 additions and 169 deletions.
1 change: 0 additions & 1 deletion .clang-format

This file was deleted.

4 changes: 0 additions & 4 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,6 @@ jobs:
steps:
- name: Check out repository
uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c
- name: Install C linter
run: sudo apt-get install clang-format
- name: Lint C code
run: clang-format --dry-run --Werror **/*.{h,c}
- name: Set up Cargo cache
uses: ./.github/actions/cargo-cache
with:
Expand Down
79 changes: 1 addition & 78 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 7 additions & 14 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,34 +14,27 @@ keywords = ["address", "rfc5322", "rfc2822"]
default = ["normalization"]
comments = ["white-spaces"]
literals = []
normalization = [
"dep:icu_collections",
"dep:icu_datagen",
"dep:icu_normalizer",
"dep:icu_provider",
"dep:litemap",
"dep:zerovec",
]
normalization = ["dep:icu_collections", "dep:icu_datagen", "dep:icu_normalizer", "dep:icu_provider", "dep:litemap", "dep:zerovec"]
white-spaces = []

[dependencies]
serde = { version = "1.0", optional = true, features = ["derive"] }

icu_collections = { version = "1.0.0", optional = true }
icu_normalizer = { version = "1.0.0", optional = true }
icu_provider = { version = "1.0.1", optional = true }
litemap = { version = "0.6.0", optional = true }
zerovec = { version = "0.9.1", optional = true }

serde = { version = "1.0", optional = true, features = ["derive"] }

memchr = "2.5.0"

[dev-dependencies]
regex = "1.7.1"

[build-dependencies]
bindgen = "0.63.0"
cc = "1.0.78"
rustc_version = "0.4.0"

icu_datagen = { version = "1.0.2", optional = true }

rustc_version = "0.4.0"

[profile.release]
lto = true
14 changes: 0 additions & 14 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,6 @@ fn generate_icu_data() {
}

fn main() {
println!("cargo:rerun-if-changed=src/ascii.h");
bindgen::Builder::default()
.header("src/ascii.h")
.blocklist_type("max_align_t")
.blocklist_type("wchar_t")
.parse_callbacks(Box::new(bindgen::CargoCallbacks))
.generate()
.expect("Unable to generate bindings")
.write_to_file(PathBuf::from(env::var("OUT_DIR").unwrap()).join("ascii.rs"))
.expect("Couldn't write bindings!");

println!("cargo:rerun-if-changed=src/ascii.c");
cc::Build::new().file("src/ascii.c").compile("ascii");

#[cfg(feature = "normalization")]
generate_icu_data();

Expand Down
19 changes: 0 additions & 19 deletions src/ascii.c

This file was deleted.

12 changes: 0 additions & 12 deletions src/ascii.h

This file was deleted.

74 changes: 48 additions & 26 deletions src/ascii.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,36 @@
use std::alloc::{alloc, Layout};
use std::os::raw::c_char;

include!(concat!(env!("OUT_DIR"), "/ascii.rs"));
/// Returns the index of the first byte in `value` that also occurs in
/// `accept`, or `None` when no byte of `value` is contained in `accept`.
fn memcspn(value: &[u8], accept: &[u8]) -> Option<usize> {
    value
        .iter()
        .position(|&byte| memchr::memchr(byte, accept).is_some())
}

pub fn escape<const N: usize>(esc_chr: char, src: &str, cntl_chrs: [u8; N]) -> String {
let cap = src.len() << 1;
/// Escape ASCII characters in the given string. The first character in `cntl_chrs`
/// is used as the escape character.
pub fn escape<const N: usize>(value: &str, escape: [u8; N]) -> String {
let cap = value.len() << 1;
unsafe {
let dst = alloc(Layout::array::<u8>(cap).unwrap());
let len = ascii_escape(
esc_chr as c_char,
src.as_ptr() as *const c_char,
src.len(),
dst as *mut c_char,
cntl_chrs.as_ptr() as *const c_char,
N,
);
String::from_raw_parts(dst, len, cap)
let buffer = alloc(Layout::array::<u8>(cap).unwrap());
let mut src = value.as_bytes();
let mut dst = buffer;
while let Some(end) = memcspn(src, &escape) {
dst.copy_from_nonoverlapping(src.as_ptr(), end);
dst = dst.add(end);
dst.copy_from_nonoverlapping([escape[0], src[end]].as_ptr(), 2);
dst = dst.add(2);
src = &src[end + 1..];
}
if !src.is_empty() {
dst.copy_from_nonoverlapping(src.as_ptr(), src.len());
dst = dst.add(src.len());
}
String::from_raw_parts(buffer, dst.offset_from(buffer) as usize, cap)
}
}

Expand All @@ -25,12 +40,13 @@ mod tests {

/// Checks `escape` with backslash as the escape byte and `"` as an additional
/// byte to escape, covering empty, plain, multi-byte and boundary inputs.
#[test]
fn test_escape() {
    assert_eq!(escape("", [b'\\', b'"']), "");
    assert_eq!(escape("abc", [b'\\', b'"']), "abc");
    assert_eq!(escape("a\\b", [b'\\', b'"']), "a\\\\b");
    assert_eq!(escape("a\"b", [b'\\', b'"']), "a\\\"b");
    assert_eq!(escape("a\\\"b", [b'\\', b'"']), "a\\\\\\\"b");
    // Multi-byte characters must pass through untouched.
    assert_eq!(escape("😄\"😄😄", [b'\\', b'"']), "😄\\\"😄😄");
    // Escaped byte as the very last byte of the input.
    assert_eq!(escape("😄😄😄\"", [b'\\', b'"']), "😄😄😄\\\"");
    // Escaped byte as the very first byte, and two escaped bytes in a row.
    assert_eq!(escape("\"abc", [b'\\', b'"']), "\\\"abc");
    assert_eq!(escape("\"\"", [b'\\', b'"']), "\\\"\\\"");
}
}

Expand All @@ -43,36 +59,42 @@ mod benches {
/// Benchmarks `escape` on a 3-byte input that contains nothing to escape.
#[bench]
fn bench_no_escape_small(b: &mut test::Bencher) {
    let s = "abc";
    b.iter(|| escape(s, [b'\\', b'"']));
}

/// Benchmarks `escape` on a 26-byte input that contains nothing to escape.
#[bench]
fn bench_no_escape_medium(b: &mut test::Bencher) {
    let s = "abcdefghijklmnopqrstuvwxyz";
    b.iter(|| escape(s, [b'\\', b'"']));
}

/// Benchmarks `escape` on a longer input (the alphabet three times over) that
/// contains nothing to escape.
#[bench]
fn bench_no_escape_large(b: &mut test::Bencher) {
    let s = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz";
    b.iter(|| escape(s, [b'\\', b'"']));
}

/// Benchmarks `escape` on a short input containing a single backslash.
#[bench]
fn bench_escape_small(b: &mut test::Bencher) {
    let s = "a\\b";
    b.iter(|| escape(s, [b'\\', b'"']));
}

/// Benchmarks `escape` on an alphabet-length input with several backslashes
/// spread through it.
#[bench]
fn bench_escape_medium(b: &mut test::Bencher) {
    let s = "a\\bcdefgh\\ijklmnopqrst\\uvwxyz";
    b.iter(|| escape(s, [b'\\', b'"']));
}

/// Benchmarks `escape` on a longer input with several backslashes separated by
/// long runs of bytes that need no escaping.
#[bench]
fn bench_escape_large(b: &mut test::Bencher) {
    let s = "a\\bcdefgh\\ijklmnopqrst\\uvwxyzabcdefghijklmnopqrstuvwxyz\\abcdefghijklmnopqrstuvwxyz\\abcdefghijklmnopqrstuvwxyz";
    b.iter(|| escape(s, [b'\\', b'"']));
}

/// Benchmarks `escape` on an input made up solely of backslash/quote pairs, so
/// every byte of the input has to be escaped.
#[bench]
fn bench_many_escapes(b: &mut test::Bencher) {
    let escaped_bytes = [b'\\', b'"'];
    let input = "\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"";
    b.iter(|| escape(input, escaped_bytes));
}
}
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer};

#[inline]
fn quote(value: &str) -> String {
ascii::escape('\\', value, [b'"', b' ', b'\t'])
ascii::escape(value, [b'\\', b'"', b' ', b'\t'])
}

/// Address specification as defined in [RFC
Expand Down

0 comments on commit 2337137

Please sign in to comment.