std: Optimize std.mem.indexOfSentinel #16646

Closed

Conversation

@notcancername (Contributor) commented Aug 2, 2023

Should be merged after #16648
Adds a faster version of std.mem.indexOfSentinel based on SIMD vectors. It outperforms both the common word-wise (glibc-style) approach and the naive byte-at-a-time loop.
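
For context, std.mem.indexOfSentinel counts the elements before the sentinel of a sentinel-terminated pointer (the strlen of C strings, generalized). A minimal usage sketch:

const std = @import("std");

pub fn main() void {
    const msg: [*:0]const u8 = "hello";
    // Prints 5: the number of elements before the 0 sentinel.
    std.debug.print("{d}\n", .{std.mem.indexOfSentinel(u8, 0, msg)});
}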

@squeek502 (Collaborator)

Some benchmark data would be nice.

@notcancername (Contributor, Author)

Old version:

Source code
const std = @import("std");
const native_endian = @import("builtin").cpu.arch.endian();

pub fn indexOfSentinelNaive(comptime Elem: type, comptime sentinel: Elem, ptr: [*:sentinel]const Elem) usize {
    var i: usize = 0;
    while (ptr[i] != sentinel) {
        i += 1;
    }
    return i;
}

// Broadcast one byte into every byte of an integer of type T.
inline fn splatInt(comptime T: type, byte: u8) T {
    const nb_bytes = @divExact(@bitSizeOf(T), 8);
    var buf: [nb_bytes]u8 = undefined;
    @memset(&buf, byte);
    return @bitCast(buf);
}

// https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
// Subtracting the 0x01 splat borrows out of every zero byte; masking with
// ~v and the 0x80 splat leaves a set high bit exactly where v has a zero byte.
inline fn isAnyByteZero(comptime T: type, v: T) bool {
    const lsb = comptime splatInt(T, 0x01);
    const msb = comptime splatInt(T, 0x80);
    return (v -% lsb) & ~v & msb != 0;
}

inline fn isAnyByteEqual(comptime T: type, comptime needle: u8, v: T) bool {
    if(comptime needle == 0) return isAnyByteZero(T, v);
    return isAnyByteZero(T, v ^ (comptime splatInt(T, needle)));
}

inline fn findFirstZero(comptime T: type, v: T) usize {
    // Carry-free variant of the hack above: bit 7 of x is set iff the
    // corresponding byte of v is zero, so ctz/clz finds the first match.
    const m = comptime splatInt(T, 0x7f);
    const x = ~(((v & m) +% m) | v | m);
    return if (native_endian == .Little) @ctz(x) / 8 else @clz(x) / 8;
}

inline fn findFirstEqual(comptime T: type, comptime needle: u8, v: T) usize {
    if(comptime needle == 0) return findFirstZero(T, v);
    return findFirstZero(T, v ^ (comptime splatInt(T, needle)));
}

pub fn indexOfSentinelBytesGlibc(comptime Elem: type, comptime sentinel: Elem, ptr: [*:sentinel]const Elem) usize {
    if(Elem != u8 or sentinel != 0) return indexOfSentinelNaive(Elem, sentinel, ptr);
    const OpInt = usize;

    var i: usize = 0;

    // Advance byte-by-byte until the pointer is word-aligned; if the
    // sentinel shows up first, the loop's else branch returns early.
    while (ptr[i] != sentinel) {
        if (std.mem.isAligned(@intFromPtr(ptr + i), @alignOf(OpInt))) break;
        i += 1;
    } else return i;

    while(true) : (i += @sizeOf(OpInt)) {
        const bytes = std.mem.readIntNative(OpInt, ptr[i..][0..@sizeOf(OpInt)]);

        if(isAnyByteEqual(OpInt, sentinel, bytes))
            return i + findFirstEqual(OpInt, sentinel, bytes);
    }
    unreachable;
}

pub fn indexOfSentinelSimd(comptime Elem: type, comptime sentinel: Elem, ptr: [*:sentinel]const Elem) usize {
    const v_len = comptime std.simd.suggestVectorSize(Elem) orelse
        return indexOfSentinelBytesGlibc(Elem, sentinel, ptr);
    const V = @Vector(v_len, Elem);
    const I = std.meta.Int(.unsigned, v_len);

    if(v_len * @sizeOf(Elem) > std.mem.page_size) {
        // vector length would exceed page size
        return indexOfSentinelNaive(Elem, sentinel, ptr);
    }

    var i: usize = 0;

    while (ptr[i] != sentinel) {
        if(std.mem.isAligned(@intFromPtr(ptr + i), @alignOf(V))) break;
        i += 1;
    } else return i;

    const m: V = @splat(sentinel);

    while (true) : (i += v_len) {
        // Load v_len elements (not @sizeOf(V) bytes, which differs for
        // multi-byte Elem); the array result coerces to a vector.
        const v: V = ptr[i..][0..v_len].*;

        const equals: I = @bitCast(v == m);
        if(equals != 0) {
            return i + (if(native_endian == .Little) @ctz(equals) else @clz(equals));
        }
    }
    unreachable;
}

pub fn main() !void {
    std.debug.print("compiled in {}\n", .{@import("builtin").mode});
    const bytes = bytes: {
        const fd = try std.os.openZ(std.os.argv[1], std.os.O.RDONLY, 0);
        defer std.os.close(fd);
        const len = (try std.os.fstat(fd)).size;
        break :bytes try std.os.mmap(null, @intCast(len), std.os.PROT.READ, std.os.MAP.PRIVATE | std.os.MAP.POPULATE, fd, 0);
    };
    defer std.os.munmap(bytes);
    const p: [*:0]const u8 = @ptrCast(bytes.ptr);

    {
        std.debug.print("warmup...\n", .{});
        var i: usize = 0;
        while(p[i] != 0) {
            i += 1;
        }
        // Keep `i` alive so the warmup loop isn't optimized away.
        std.mem.doNotOptimizeAway(i);
    }
    {
        std.debug.print("std...     ", .{});
        var t = std.time.Timer.start() catch unreachable;
        const len = std.mem.indexOfSentinel(u8, 0, p);
        std.debug.print("{d}s ({d})\n", .{@as(f64, @floatFromInt(t.read())) / @as(f64, @floatFromInt(std.time.ns_per_s)), len});
    }
    {
        std.debug.print("naive...   ", .{});
        var t = std.time.Timer.start() catch unreachable;
        const len = indexOfSentinelNaive(u8, 0, p);
        std.debug.print("{d}s ({d})\n", .{@as(f64, @floatFromInt(t.read())) / @as(f64, @floatFromInt(std.time.ns_per_s)), len});
    }
    {
        std.debug.print("int...     ", .{});
        var t = std.time.Timer.start() catch unreachable;
        const len = indexOfSentinelBytesGlibc(u8, 0, p);
        std.debug.print("{d}s ({d})\n", .{@as(f64, @floatFromInt(t.read())) / @as(f64, @floatFromInt(std.time.ns_per_s)), len});
    }
    {
        std.debug.print("vec...      ", .{});
        var t = std.time.Timer.start() catch unreachable;
        const len = indexOfSentinelSimd(u8, 0, p);
        std.debug.print("{d}s ({d})\n", .{@as(f64, @floatFromInt(t.read())) / @as(f64, @floatFromInt(std.time.ns_per_s)), len});
    }

}

On my machine:

compiled in builtin.OptimizeMode.ReleaseFast
warmup...
std...     0.570167127s (1207911144)
naive...   0.579053407s (1207911144)
int...     0.142770632s (1207911144)
vec...      0.075286838s (1207911144)
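
That works out to roughly 2.1 GB/s for the current std and naive loops, about 8.5 GB/s for the word-wise version, and about 16 GB/s for the vector version (1207911144 bytes per run).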

Specify the test file (uncompress with xz -d) as argv[1].

Of course, this assumes the best case: the mmap'd buffer is page-aligned, so the scalar alignment prologue exits almost immediately.

@notcancername (Contributor, Author) commented Aug 2, 2023

I will make a PR to apply the x86-specific optimization here to std.simd.firstTrue and friends, then use std.simd.firstIndexOfValue.
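
For reference, a minimal sketch (not this PR's code) of the direction described above, assuming std.simd.firstIndexOfValue keeps its current shape; the exact API may differ between Zig versions:

const std = @import("std");

pub fn main() void {
    const V = @Vector(16, u8);
    // Load one 16-element chunk and ask std.simd for the first sentinel lane.
    const chunk: V = [_]u8{ 'h', 'i', 0 } ++ [_]u8{0xaa} ** 13;
    if (std.simd.firstIndexOfValue(chunk, 0)) |lane| {
        std.debug.print("sentinel at lane {d}\n", .{lane});
    }
}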

@notcancername force-pushed the optimize-indexofsentinel branch 3 times, most recently from 36b7b37 to fa19266 on August 2, 2023 02:40
@marler8997 (Contributor)

Not sure if this is still the case, but at one point we were using https://github.com/ziglang/gotta-go-fast to add benchmarks for optimization-focused changes like this.

@notcancername force-pushed the optimize-indexofsentinel branch from 09fdbea to 7a25b94 on August 4, 2023 16:46
@notcancername force-pushed the optimize-indexofsentinel branch from 8f4467b to 92acbef on August 12, 2023 16:55
@tiehuis (Member) commented Oct 3, 2023

Done in #17161.

@tiehuis closed this Oct 3, 2023