Skip to content

Commit

Permalink
Escape str using SIMD
Browse files Browse the repository at this point in the history
  • Loading branch information
ijl committed Dec 22, 2023
1 parent 39f3996 commit fe8c0e3
Show file tree
Hide file tree
Showing 16 changed files with 1,107 additions and 16 deletions.
8 changes: 5 additions & 3 deletions .github/workflows/debug.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,12 @@ jobs:
]
python: [
{ version: '3.12', abi: 'cp312-cp312' },
{ version: '3.11', abi: 'cp311-cp311' },
{ version: '3.8', abi: 'cp38-cp38' },
]
features: [
"",
"--features=yyjson,simd-write",
]
env:
CC: "gcc"
CFLAGS: "-O2 -fno-plt"
Expand All @@ -36,9 +39,8 @@ jobs:
PATH="$HOME/.cargo/bin:$PATH" maturin build --release \
--out=dist \
--profile=dev \
--features=yyjson \
--interpreter python${{ matrix.python.version }} \
--target=x86_64-unknown-linux-gnu
--target=x86_64-unknown-linux-gnu ${{ matrix.features }}
- run: python -m pip install --user dist/orjson*.whl
- run: python -m pip install --user -r test/requirements.txt -r integration/requirements.txt
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/linux.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ jobs:
- run: |
maturin build --release --strip \
--out=dist \
--features=no-panic,yyjson \
--features=no-panic,simd-write,yyjson \
--compatibility manylinux_2_17 \
--interpreter python${{ matrix.python.version }} \
--target=x86_64-unknown-linux-gnu
Expand Down Expand Up @@ -147,7 +147,7 @@ jobs:
rustup-components: rust-src
target: ${{ matrix.platform.target }}
manylinux: musllinux_1_1
args: --release --strip --out=dist --features=no-panic,yyjson -i python${{ matrix.python.version }}
args: --release --strip --out=dist --features=no-panic,simd-write,yyjson -i python${{ matrix.python.version }}

- name: Set up QEMU
if: matrix.platform.arch != 'x86_64'
Expand Down Expand Up @@ -237,7 +237,7 @@ jobs:
rust-toolchain: nightly-2023-12-10
rustup-components: rust-src
manylinux: auto
args: --release --strip --out=dist --features=no-panic,yyjson -i python${{ matrix.python.version }}
args: --release --strip --out=dist --features=no-panic,simd-write,yyjson -i python${{ matrix.python.version }}

- uses: uraimo/run-on-arch-action@v2
name: Test
Expand Down
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ no-panic = [
"ryu/no-panic",
]

simd-write = [
"packed_simd",
"libc",
]

# Build yyjson as a backend and panic if it fails. The default is to attempt
# to build and on failure fall back to another backend.
yyjson = []
Expand All @@ -58,7 +63,9 @@ compact_str = { version = "0.7", default_features = false, features = ["serde"]
encoding_rs = { version = "0.8", default_features = false }
itoa = { version = "1", default_features = false }
itoap = { version = "1", features = ["std", "simd"] }
libc = { version = "0.2", default_features = false, optional = true }
once_cell = { version = "1", default_features = false, features = ["race"] }
packed_simd = { version = "0.3", default_features = false, optional = true }
pyo3-ffi = { version = "^0.20", default_features = false, features = ["extension-module"]}
ryu = { version = "1", default_features = false }
serde = { version = "1", default_features = false }
Expand Down
2 changes: 1 addition & 1 deletion ci/azure-macos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ steps:
PATH=$HOME/.cargo/bin:$PATH \
MACOSX_DEPLOYMENT_TARGET=$(macosx_deployment_target) \
PYO3_CROSS_LIB_DIR=$(python -c "import sysconfig;print(sysconfig.get_config_var('LIBDIR'))") \
maturin build --release --strip --features=no-panic,yyjson --interpreter $(interpreter) --target=universal2-apple-darwin
maturin build --release --strip --features=no-panic,simd-write,yyjson --interpreter $(interpreter) --target=universal2-apple-darwin
env:
CC: "clang"
CFLAGS: "-O2 -fno-plt -flto=thin -fstrict-aliasing"
Expand Down
2 changes: 1 addition & 1 deletion ci/azure-win.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ steps:
displayName: build dependencies
- script: python.exe -m pip install -r test\requirements.txt -r integration\requirements.txt
displayName: test dependencies
- script: maturin.exe build --release --features=no-panic,yyjson --strip --interpreter $(interpreter) --target $(target)
- script: maturin.exe build --release --features=no-panic,simd-write,yyjson --strip --interpreter $(interpreter) --target $(target)
displayName: build
- script: python.exe -m pip install orjson --no-index --find-links=D:\a\1\s\target\wheels
displayName: install
Expand Down
205 changes: 205 additions & 0 deletions src/serialize/backend/format_str/json.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
// SPDX-License-Identifier: (Apache-2.0 OR MIT)
// This is an adaptation of `src/value/ser.rs` from serde-json.

/// Writes `value` to `writer` as a JSON string.
///
/// The bytes of `value` are checked against the `ESCAPE` table, eight at a
/// time while more than eight bytes remain and one at a time afterwards. When
/// a group or byte with a non-zero table entry is found, the rest of the work
/// is delegated to `format_escaped_str_with_escapes`, passing the offset at
/// which that group or byte starts. If nothing needs escaping, the string is
/// passed unchanged to `writer.write_str`. An empty string is written directly
/// as `""`.
pub fn format_escaped_str<W, F>(writer: &mut W, formatter: &mut F, value: &str) -> io::Result<()>
where
    W: ?Sized + io::Write + WriteExt,
    F: ?Sized + Formatter,
{
    let bytes = value.as_bytes();
    let total = bytes.len();

    if total == 0 {
        reserve_minimum!(writer);
        return unsafe { writer.write_reserved_fragment(b"\"\"") };
    }

    // Offsets strictly below this bound have at least eight bytes after them,
    // so a group of eight starting there stays inside `bytes`.
    let wide_end = total.saturating_sub(8);

    // SAFETY: every offset handed to `code_at` is below `total`, and `ESCAPE`
    // has 256 entries so any `u8` is a valid index into it.
    unsafe {
        let code_at = |i: usize| -> u8 { *ESCAPE.get_unchecked(*bytes.get_unchecked(i) as usize) };

        let mut seen: u8 = __;
        let mut pos = 0;

        while pos < wide_end {
            seen |= code_at(pos)
                | code_at(pos + 1)
                | code_at(pos + 2)
                | code_at(pos + 3)
                | code_at(pos + 4)
                | code_at(pos + 5)
                | code_at(pos + 6)
                | code_at(pos + 7);
            if unlikely!(seen != __) {
                return format_escaped_str_with_escapes(writer, formatter, bytes, pos);
            }
            pos += 8;
        }

        while pos < total {
            seen |= code_at(pos);
            if unlikely!(seen != __) {
                return format_escaped_str_with_escapes(writer, formatter, bytes, pos);
            }
            pos += 1;
        }
    }

    writer.write_str(value)
}

/// Writes `value` as a quoted JSON string when at least one byte at or after
/// `initial` needs escaping.
///
/// The bytes before `initial` are written verbatim; the remainder is handed to
/// `format_escaped_str_contents`. The output is wrapped in `"` on both sides.
///
/// `initial` must be strictly less than `value.len()`: both slices below are
/// taken without bounds checks.
///
/// # Errors
///
/// Returns any error reported by the writer or the formatter. Earlier versions
/// of this function panicked on such errors instead of returning them.
fn format_escaped_str_with_escapes<W, F>(
    writer: &mut W,
    formatter: &mut F,
    value: &[u8],
    initial: usize,
) -> io::Result<()>
where
    W: ?Sized + io::Write + WriteExt,
    F: ?Sized + Formatter,
{
    // Reserve eight output bytes per input byte plus two for the quotes before
    // any `write_reserved_*` call.
    // NOTE(review): presumably sized to cover the longest escape (`\u00XX`)
    // with headroom -- confirm against the `WriteExt` contract.
    writer.reserve((value.len() * 8) + 2);

    // SAFETY: the caller guarantees `initial < value.len()`, so both unchecked
    // slices are in bounds; the `write_reserved_*` calls rely on the
    // reservation made above.
    unsafe {
        writer.write_reserved_punctuation(b'"')?;
        if initial > 0 {
            writer.write_reserved_fragment(value.get_unchecked(0..initial))?;
        }
        format_escaped_str_contents(writer, formatter, value.get_unchecked(initial..))?;
        writer.write_reserved_punctuation(b'"')?;
    }
    Ok(())
}

/// Writes `bytes` to `writer`, replacing every byte that has a non-zero entry
/// in `ESCAPE` with the escape produced by `formatter.write_char_escape`.
///
/// Runs of bytes that need no escaping are copied with
/// `write_reserved_fragment`; the caller is expected to have reserved enough
/// capacity in `writer` beforehand. Surrounding quotes are not written here.
///
/// An empty `bytes` writes nothing.
///
/// # Errors
///
/// Returns any error reported by the writer or the formatter.
fn format_escaped_str_contents<W, F>(
    writer: &mut W,
    formatter: &mut F,
    bytes: &[u8],
) -> io::Result<()>
where
    W: ?Sized + io::Write + WriteExt,
    F: ?Sized + Formatter,
{
    let len = bytes.len();
    // The loop below reads `bytes[idx]` without a bounds check before testing
    // for the end, so an empty slice has to be rejected up front.
    if len == 0 {
        return Ok(());
    }

    // `start` is the beginning of the pending run of unescaped bytes that has
    // not been written yet; `idx` is the scan position.
    let mut start = 0;
    let mut idx = 0;

    let mut escape: u8;
    loop {
        // Fast path: while more than four bytes remain, test four at once and
        // skip the whole group if none of them needs escaping.
        if idx < len.saturating_sub(4) {
            escape = 0;
            // SAFETY: `idx + 3 < len` by the condition above; `ESCAPE` has 256
            // entries, so any `u8` indexes it in bounds.
            unsafe {
                escape |= *ESCAPE.get_unchecked(*bytes.get_unchecked(idx) as usize);
                escape |= *ESCAPE.get_unchecked(*bytes.get_unchecked(idx + 1) as usize);
                escape |= *ESCAPE.get_unchecked(*bytes.get_unchecked(idx + 2) as usize);
                escape |= *ESCAPE.get_unchecked(*bytes.get_unchecked(idx + 3) as usize);
            }
            if escape == 0 {
                idx += 4;
                continue;
            }
        }

        // SAFETY: `idx < len` holds on every iteration: the function returns
        // early for empty input, each `break` below fires when `idx == len`,
        // and the fast path only advances while `idx + 4 < len`.
        let byte = unsafe { *bytes.get_unchecked(idx) };
        escape = unsafe { *ESCAPE.get_unchecked(byte as usize) };
        if escape == 0 {
            idx += 1;
            if idx == len {
                break;
            } else {
                continue;
            }
        }

        // Flush the unescaped run that precedes the byte needing an escape.
        if start < idx {
            // SAFETY: `start < idx < len`.
            unsafe { writer.write_reserved_fragment(bytes.get_unchecked(start..idx))? };
        }

        let char_escape = CharEscape::from_escape_table(escape, byte);
        formatter.write_char_escape(writer, char_escape)?;

        idx += 1;
        start = idx;
        if idx == len {
            break;
        }
    }

    // Flush whatever unescaped tail is left after the last escape.
    if start != len {
        // SAFETY: `start < len` here because `start` never exceeds `len`.
        unsafe { writer.write_reserved_fragment(bytes.get_unchecked(start..len))? };
    }
    Ok(())
}


/// The kind of escape sequence to emit for a byte that cannot appear verbatim
/// inside a JSON string.
pub enum CharEscape {
    /// The double quote `"`.
    Quote,
    /// The reverse solidus (backslash) `\`.
    ReverseSolidus,
    /// The backspace character, normally written as `\b`.
    Backspace,
    /// The form feed character, normally written as `\f`.
    FormFeed,
    /// The line feed character, normally written as `\n`.
    LineFeed,
    /// The carriage return character, normally written as `\r`.
    CarriageReturn,
    /// The horizontal tab character, normally written as `\t`.
    Tab,
    /// Any other ASCII control character, carrying the raw byte; normally
    /// written as `\u00XX` with `XX` being the byte in two hex digits.
    AsciiControl(u8),
}

impl CharEscape {
#[inline]
fn from_escape_table(escape: u8, byte: u8) -> CharEscape {
match escape {
self::BB => CharEscape::Backspace,
self::TT => CharEscape::Tab,
self::NN => CharEscape::LineFeed,
self::FF => CharEscape::FormFeed,
self::RR => CharEscape::CarriageReturn,
self::QU => CharEscape::Quote,
self::BS => CharEscape::ReverseSolidus,
self::UU => CharEscape::AsciiControl(byte),
_ => unreachable!(),
}
}
}

// Escape codes stored in `ESCAPE`. Each code is the character that follows the
// backslash in the JSON escape sequence for the bytes listed next to it.
const BB: u8 = b'b'; // 0x08, backspace
const TT: u8 = b't'; // 0x09, horizontal tab
const NN: u8 = b'n'; // 0x0A, line feed
const FF: u8 = b'f'; // 0x0C, form feed
const RR: u8 = b'r'; // 0x0D, carriage return
const QU: u8 = b'"'; // 0x22, double quote
const BS: u8 = b'\\'; // 0x5C, reverse solidus
const UU: u8 = b'u'; // every other byte in 0x00..=0x1F
const __: u8 = 0; // byte is written as-is

// Lookup table of escape codes, indexed by input byte. A value of b'x' at
// index i means that byte i is escaped as "\x" in JSON. A value of 0 (`__`)
// means that byte i is not escaped.
const ESCAPE: [u8; 256] = {
    let mut table = [__; 256];

    // All control characters default to the generic `\u00XX` form ...
    let mut code = 0;
    while code < 0x20 {
        table[code] = UU;
        code += 1;
    }

    // ... except the ones that have a dedicated short escape.
    table[0x08] = BB;
    table[0x09] = TT;
    table[0x0A] = NN;
    table[0x0C] = FF;
    table[0x0D] = RR;

    // The two printable characters that must always be escaped.
    table[0x22] = QU;
    table[0x5C] = BS;

    table
};
22 changes: 22 additions & 0 deletions src/serialize/backend/format_str/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// SPDX-License-Identifier: (Apache-2.0 OR MIT)

mod json;

#[cfg(feature = "simd-write")]
mod sonic;

#[cfg(not(feature = "simd-write"))]
pub use json::format_escaped_str;

/// Writes `value` to `writer` as a JSON string, choosing an implementation at
/// run time.
///
/// On x86 and x86_64 targets the `sonic` implementation is used when the CPU
/// reports AVX2 support. In every other case, including all other target
/// architectures, the scalar `json` implementation is used.
///
/// The function is `pub` so that the module exports the same symbol whether or
/// not the `simd-write` feature is enabled (without the feature it is
/// re-exported from `json`).
#[cfg(feature = "simd-write")]
pub fn format_escaped_str<W, F>(writer: &mut W, formatter: &mut F, value: &str) -> io::Result<()>
where
    W: ?Sized + io::Write + WriteExt,
    F: ?Sized + Formatter,
{
    // `is_x86_feature_detected!` does not compile on other architectures, so
    // the detection has to be gated on the target as well as on the feature.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if std::is_x86_feature_detected!("avx2") {
            return sonic::format_escaped_str(writer, formatter, value);
        }
    }

    json::format_escaped_str(writer, formatter, value)
}
Loading

0 comments on commit fe8c0e3

Please sign in to comment.