std: Optimize std.mem.indexOfSentinel #16646
Conversation
Some benchmark data would be nice.
Old version:

```zig
const std = @import("std");
const native_endian = @import("builtin").cpu.arch.endian();

pub fn indexOfSentinelNaive(comptime Elem: type, comptime sentinel: Elem, ptr: [*:sentinel]const Elem) usize {
    var i: usize = 0;
    while (ptr[i] != sentinel) {
        i += 1;
    }
    return i;
}

/// Broadcast `byte` into every byte of the integer type `T`.
inline fn splatInt(comptime T: type, byte: u8) T {
    const nb_bytes = @divExact(@bitSizeOf(T), 8);
    var buf: [nb_bytes]u8 = undefined;
    @memset(&buf, byte);
    return @bitCast(buf);
}

// https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
inline fn isAnyByteZero(comptime T: type, v: T) bool {
    const lsb = comptime splatInt(T, 0x01);
    const msb = comptime splatInt(T, 0x80);
    return (v -% lsb) & ~v & msb != 0;
}

inline fn isAnyByteEqual(comptime T: type, comptime needle: u8, v: T) bool {
    if (comptime needle == 0) return isAnyByteZero(T, v);
    return isAnyByteZero(T, v ^ (comptime splatInt(T, needle)));
}

inline fn findFirstZero(comptime T: type, v: T) usize {
    const m = comptime splatInt(T, 0x7f);
    const x = ~(((v & m) +% m) | v | m);
    return if (native_endian == .Little) @ctz(x) / 8 else @clz(x) / 8;
}

inline fn findFirstEqual(comptime T: type, comptime needle: u8, v: T) usize {
    if (comptime needle == 0) return findFirstZero(T, v);
    return findFirstZero(T, v ^ (comptime splatInt(T, needle)));
}

pub fn indexOfSentinelBytesGlibc(comptime Elem: type, comptime sentinel: Elem, ptr: [*:sentinel]const Elem) usize {
    if (Elem != u8 or sentinel != 0) return indexOfSentinelNaive(Elem, sentinel, ptr);
    const OpInt = usize;
    // Scan byte-by-byte until the pointer is word-aligned.
    var i: usize = 0;
    while (ptr[i] != sentinel) {
        if (std.mem.isAligned(@intFromPtr(ptr + i), @alignOf(OpInt))) break;
        i += 1;
    } else return i;
    // Then scan one word at a time.
    while (true) : (i += @sizeOf(OpInt)) {
        const bytes = std.mem.readIntNative(OpInt, ptr[i..][0..@sizeOf(OpInt)]);
        if (isAnyByteEqual(OpInt, sentinel, bytes))
            return i + findFirstEqual(OpInt, sentinel, bytes);
    }
    unreachable;
}

pub fn indexOfSentinelSimd(comptime Elem: type, comptime sentinel: Elem, ptr: [*:sentinel]const Elem) usize {
    const v_len = comptime std.simd.suggestVectorSize(Elem) orelse
        return indexOfSentinelBytesGlibc(Elem, sentinel, ptr);
    const V = @Vector(@as(u32, @intCast(v_len)), Elem);
    const I = std.meta.Int(.unsigned, v_len);
    if (v_len * @sizeOf(Elem) > std.mem.page_size) {
        // vector length would exceed page size
        return indexOfSentinelNaive(Elem, sentinel, ptr);
    }
    // Scan element-by-element until the pointer is vector-aligned.
    var i: usize = 0;
    while (ptr[i] != sentinel) {
        if (std.mem.isAligned(@intFromPtr(ptr + i), @alignOf(V))) break;
        i += 1;
    } else return i;
    // Then scan one vector at a time. An aligned read no larger than
    // a page cannot cross a page boundary, so it cannot fault.
    const m: V = @splat(sentinel);
    while (true) : (i += @sizeOf(V)) {
        const v: V = @bitCast(ptr[i..][0..@sizeOf(V)].*);
        const equals: I = @bitCast(v == m);
        if (equals != 0) {
            return i + (if (native_endian == .Little) @ctz(equals) else @clz(equals));
        }
    }
    unreachable;
}

pub fn main() !void {
    std.debug.print("compiled in {}\n", .{@import("builtin").mode});
    const bytes = bytes: {
        const fd = try std.os.openZ(std.os.argv[1], std.os.O.RDONLY, 0);
        defer std.os.close(fd);
        const len = (try std.os.fstat(fd)).size;
        break :bytes try std.os.mmap(null, @intCast(len), std.os.PROT.READ, std.os.MAP.PRIVATE | std.os.MAP.POPULATE, fd, 0);
    };
    defer std.os.munmap(bytes);
    const p: [*:0]const u8 = @ptrCast(bytes.ptr);
    {
        std.debug.print("warmup...\n", .{});
        var i: usize = 0;
        while (p[i] != 0) {
            i += 1;
        }
        asm volatile ("");
    }
    {
        std.debug.print("std... ", .{});
        var t = std.time.Timer.start() catch unreachable;
        const len = std.mem.indexOfSentinel(u8, 0, p);
        std.debug.print("{d}s ({d})\n", .{ @as(f64, @floatFromInt(t.read())) / @as(f64, @floatFromInt(std.time.ns_per_s)), len });
    }
    {
        std.debug.print("naive... ", .{});
        var t = std.time.Timer.start() catch unreachable;
        const len = indexOfSentinelNaive(u8, 0, p);
        std.debug.print("{d}s ({d})\n", .{ @as(f64, @floatFromInt(t.read())) / @as(f64, @floatFromInt(std.time.ns_per_s)), len });
    }
    {
        std.debug.print("int... ", .{});
        var t = std.time.Timer.start() catch unreachable;
        const len = indexOfSentinelBytesGlibc(u8, 0, p);
        std.debug.print("{d}s ({d})\n", .{ @as(f64, @floatFromInt(t.read())) / @as(f64, @floatFromInt(std.time.ns_per_s)), len });
    }
    {
        std.debug.print("vec... ", .{});
        var t = std.time.Timer.start() catch unreachable;
        const len = indexOfSentinelSimd(u8, 0, p);
        std.debug.print("{d}s ({d})\n", .{ @as(f64, @floatFromInt(t.read())) / @as(f64, @floatFromInt(std.time.ns_per_s)), len });
    }
}
```

On my machine:

```
compiled in builtin.OptimizeMode.ReleaseFast
warmup...
std... 0.570167127s (1207911144)
naive... 0.579053407s (1207911144)
int... 0.142770632s (1207911144)
vec... 0.075286838s (1207911144)
```

Specify the test file (uncompress with …).

Of course, this assumes the best case of everything being aligned.
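
As a sanity check (not part of the PR), the SIMD path can be compared against the naive scan at several starting offsets, so that both the unaligned prologue and the aligned vector loop are exercised. This sketch assumes it is appended to the benchmark file above, so `indexOfSentinelNaive` and `indexOfSentinelSimd` are in scope:

```zig
test "indexOfSentinelSimd agrees with the naive scan" {
    // Hypothetical test, reusing the helpers defined above.
    var buf: [512:0]u8 = undefined;
    @memset(&buf, 'a');
    buf[300] = 0; // first interior sentinel
    var start: usize = 0;
    while (start < 64) : (start += 1) {
        // Different offsets produce differently-aligned pointers.
        const p: [*:0]const u8 = buf[start..].ptr;
        try std.testing.expectEqual(
            indexOfSentinelNaive(u8, 0, p),
            indexOfSentinelSimd(u8, 0, p),
        );
    }
}
```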
I will make a PR to apply the x86-specific optimization here to …
(force-pushed from 36b7b37 to fa19266)
Not sure if this is still the case, but at one point we were using https://github.com/ziglang/gotta-go-fast to add benchmarks for optimization-focused changes like this.
(force-pushed from 09fdbea to 7a25b94)
(force-pushed from 8f4467b to 92acbef)
Done in #17161.
Should be merged after #16648.
Add a faster version of `std.mem.indexOfSentinel`. It outperforms both the common word-wise and the naive versions.
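
For reference, a minimal call-site sketch, assuming the public signature is unchanged by this PR:

```zig
const std = @import("std");

pub fn main() void {
    // A sentinel-terminated string literal coerces to [*:0]const u8.
    const s: [*:0]const u8 = "hello, world";
    // Scans until the 0 sentinel and returns its index (12 here),
    // i.e. the string length.
    const len = std.mem.indexOfSentinel(u8, 0, s);
    std.debug.print("len = {d}\n", .{len});
}
```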