std: Optimize std.mem.indexOfSentinel #16646

Closed

Conversation

@notcancername (Contributor) commented Aug 2, 2023

Should be merged after #16648
Adds a faster version of std.mem.indexOfSentinel based on SIMD vectors. It outperforms both the common word-wise (glibc-style) approach and the naive byte-at-a-time loop.
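
For context, std.mem.indexOfSentinel counts the elements before the sentinel of a sentinel-terminated pointer (the strlen of C strings, generalized). A minimal usage sketch:

const std = @import("std");

pub fn main() void {
    const msg: [*:0]const u8 = "hello";
    // Prints 5: the number of elements before the 0 sentinel.
    std.debug.print("{d}\n", .{std.mem.indexOfSentinel(u8, 0, msg)});
}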

@squeek502 (Collaborator)

Some benchmark data would be nice.

@notcancername (Contributor, Author)

Old version:

Source code
const std = @import("std");
const native_endian = @import("builtin").cpu.arch.endian();

pub fn indexOfSentinelNaive(comptime Elem: type, comptime sentinel: Elem, ptr: [*:sentinel]const Elem) usize {
    var i: usize = 0;
    while (ptr[i] != sentinel) {
        i += 1;
    }
    return i;
}

// Broadcast one byte into every byte of an integer of type T.
inline fn splatInt(comptime T: type, byte: u8) T {
    const nb_bytes = @divExact(@bitSizeOf(T), 8);
    var buf: [nb_bytes]u8 = undefined;
    @memset(&buf, byte);
    return @bitCast(buf);
}

// https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
// Subtracting the 0x01 splat borrows out of every zero byte; masking with
// ~v and the 0x80 splat leaves a set high bit exactly where v has a zero byte.
inline fn isAnyByteZero(comptime T: type, v: T) bool {
    const lsb = comptime splatInt(T, 0x01);
    const msb = comptime splatInt(T, 0x80);
    return (v -% lsb) & ~v & msb != 0;
}

inline fn isAnyByteEqual(comptime T: type, comptime needle: u8, v: T) bool {
    if(comptime needle == 0) return isAnyByteZero(T, v);
    return isAnyByteZero(T, v ^ (comptime splatInt(T, needle)));
}

inline fn findFirstZero(comptime T: type, v: T) usize {
    // Carry-free variant of the hack above: bit 7 of x is set iff the
    // corresponding byte of v is zero, so ctz/clz finds the first match.
    const m = comptime splatInt(T, 0x7f);
    const x = ~(((v & m) +% m) | v | m);
    return if (native_endian == .Little) @ctz(x) / 8 else @clz(x) / 8;
}

inline fn findFirstEqual(comptime T: type, comptime needle: u8, v: T) usize {
    if(comptime needle == 0) return findFirstZero(T, v);
    return findFirstZero(T, v ^ (comptime splatInt(T, needle)));
}

pub fn indexOfSentinelBytesGlibc(comptime Elem: type, comptime sentinel: Elem, ptr: [*:sentinel]const Elem) usize {
    if(Elem != u8 or sentinel != 0) return indexOfSentinelNaive(Elem, sentinel, ptr);
    const OpInt = usize;

    var i: usize = 0;

    // Advance byte-by-byte until the pointer is word-aligned; if the
    // sentinel shows up first, the loop's else branch returns early.
    while (ptr[i] != sentinel) {
        if (std.mem.isAligned(@intFromPtr(ptr + i), @alignOf(OpInt))) break;
        i += 1;
    } else return i;

    while(true) : (i += @sizeOf(OpInt)) {
        const bytes = std.mem.readIntNative(OpInt, ptr[i..][0..@sizeOf(OpInt)]);

        if(isAnyByteEqual(OpInt, sentinel, bytes))
            return i + findFirstEqual(OpInt, sentinel, bytes);
    }
    unreachable;
}

pub fn indexOfSentinelSimd(comptime Elem: type, comptime sentinel: Elem, ptr: [*:sentinel]const Elem) usize {
    const v_len = comptime std.simd.suggestVectorSize(Elem) orelse
        return indexOfSentinelBytesGlibc(Elem, sentinel, ptr);
    const V = @Vector(v_len, Elem);
    const I = std.meta.Int(.unsigned, v_len);

    if(v_len * @sizeOf(Elem) > std.mem.page_size) {
        // vector length would exceed page size
        return indexOfSentinelNaive(Elem, sentinel, ptr);
    }

    var i: usize = 0;

    while (ptr[i] != sentinel) {
        if(std.mem.isAligned(@intFromPtr(ptr + i), @alignOf(V))) break;
        i += 1;
    } else return i;

    const m: V = @splat(sentinel);

    while (true) : (i += v_len) {
        // Load v_len elements (not @sizeOf(V) bytes, which differs for
        // multi-byte Elem); the array result coerces to a vector.
        const v: V = ptr[i..][0..v_len].*;

        const equals: I = @bitCast(v == m);
        if(equals != 0) {
            return i + (if(native_endian == .Little) @ctz(equals) else @clz(equals));
        }
    }
    unreachable;
}

pub fn main() !void {
    std.debug.print("compiled in {}\n", .{@import("builtin").mode});
    const bytes = bytes: {
        const fd = try std.os.openZ(std.os.argv[1], std.os.O.RDONLY, 0);
        defer std.os.close(fd);
        const len = (try std.os.fstat(fd)).size;
        break :bytes try std.os.mmap(null, @intCast(len), std.os.PROT.READ, std.os.MAP.PRIVATE | std.os.MAP.POPULATE, fd, 0);
    };
    defer std.os.munmap(bytes);
    const p: [*:0]const u8 = @ptrCast(bytes.ptr);

    {
        std.debug.print("warmup...\n", .{});
        var i: usize = 0;
        while(p[i] != 0) {
            i += 1;
        }
        // Keep `i` alive so the warmup loop isn't optimized away.
        std.mem.doNotOptimizeAway(i);
    }
    {
        std.debug.print("std...     ", .{});
        var t = std.time.Timer.start() catch unreachable;
        const len = std.mem.indexOfSentinel(u8, 0, p);
        std.debug.print("{d}s ({d})\n", .{@as(f64, @floatFromInt(t.read())) / @as(f64, @floatFromInt(std.time.ns_per_s)), len});
    }
    {
        std.debug.print("naive...   ", .{});
        var t = std.time.Timer.start() catch unreachable;
        const len = indexOfSentinelNaive(u8, 0, p);
        std.debug.print("{d}s ({d})\n", .{@as(f64, @floatFromInt(t.read())) / @as(f64, @floatFromInt(std.time.ns_per_s)), len});
    }
    {
        std.debug.print("int...     ", .{});
        var t = std.time.Timer.start() catch unreachable;
        const len = indexOfSentinelBytesGlibc(u8, 0, p);
        std.debug.print("{d}s ({d})\n", .{@as(f64, @floatFromInt(t.read())) / @as(f64, @floatFromInt(std.time.ns_per_s)), len});
    }
    {
        std.debug.print("vec...      ", .{});
        var t = std.time.Timer.start() catch unreachable;
        const len = indexOfSentinelSimd(u8, 0, p);
        std.debug.print("{d}s ({d})\n", .{@as(f64, @floatFromInt(t.read())) / @as(f64, @floatFromInt(std.time.ns_per_s)), len});
    }

}

On my machine:

compiled in builtin.OptimizeMode.ReleaseFast
warmup...
std...     0.570167127s (1207911144)
naive...   0.579053407s (1207911144)
int...     0.142770632s (1207911144)
vec...      0.075286838s (1207911144)
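
That works out to roughly 2.1 GB/s for the current std and naive loops, about 8.5 GB/s for the word-wise version, and about 16 GB/s for the vector version (1207911144 bytes per run).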

Specify the test file (uncompress with xz -d) as argv[1].

Of course, this assumes the best case: the mmap'd buffer is page-aligned, so the scalar alignment prologue exits almost immediately.

@notcancername (Contributor, Author) commented Aug 2, 2023

I will make a PR to apply the x86-specific optimization here to std.simd.firstTrue and friends, then use std.simd.firstIndexOfValue.
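
For reference, a minimal sketch (not this PR's code) of the direction described above, assuming std.simd.firstIndexOfValue keeps its current shape; the exact API may differ between Zig versions:

const std = @import("std");

pub fn main() void {
    const V = @Vector(16, u8);
    // Load one 16-element chunk and ask std.simd for the first sentinel lane.
    const chunk: V = [_]u8{ 'h', 'i', 0 } ++ [_]u8{0xaa} ** 13;
    if (std.simd.firstIndexOfValue(chunk, 0)) |lane| {
        std.debug.print("sentinel at lane {d}\n", .{lane});
    }
}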

@notcancername force-pushed the optimize-indexofsentinel branch 3 times, most recently from 36b7b37 to fa19266 on August 2, 2023 02:40
@marler8997 (Contributor)

Not sure if this is still the case, but at one point we were using https://github.com/ziglang/gotta-go-fast to add benchmarks for optimization-focused changes like this.

@notcancername force-pushed the optimize-indexofsentinel branch from 09fdbea to 7a25b94 on August 4, 2023 16:46
@notcancername force-pushed the optimize-indexofsentinel branch from 8f4467b to 92acbef on August 12, 2023 16:55
@tiehuis (Member) commented Oct 3, 2023

Done in #17161.

@tiehuis closed this Oct 3, 2023