Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

src: enable avx512 optimization for base64 encoding #43717

Closed
wants to merge 2 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 120 additions & 1 deletion src/base64-inl.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,32 @@
/*
* Copyright 2019 Daniel Lemire, Wojciech Muła

* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef SRC_BASE64_INL_H_
#define SRC_BASE64_INL_H_

Expand Down Expand Up @@ -120,7 +149,7 @@ size_t base64_decode(char* const dst, const size_t dstlen,
}


inline size_t base64_encode(const char* src,
inline size_t base64_encode_scalar(const char* src,
size_t slen,
char* dst,
size_t dlen,
Expand Down Expand Up @@ -182,6 +211,96 @@ inline size_t base64_encode(const char* src,
return dlen;
}


#if (defined(__x86_64) || defined(__x86_64__)) && \
(defined(__linux) || defined(__linux__))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see why this operation wouldn't apply outside of Linux? Should this be a check for gcc/clang instead?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The original code targets GCC on Linux, and I do not have a Windows machine to test it on. I will try removing line 216, adding a gcc/clang check instead, and letting the CI bot test it. Thanks!

#pragma GCC target("avx512vl", "avx512vbmi")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be a bit cleaner to use __attribute__((target(...))) on the function itself? Or is this necessary for the #include to work?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I chose #pragma so that I can explicitly reset the options afterwards and avoid polluting other code.

#include <immintrin.h>
// Base64-encodes `slen` bytes from `src` into `dst` using AVX-512 VBMI
// byte permutes and multishift. Input is processed in 48-byte blocks that
// each expand to 64 output bytes; whatever is left over is handed to
// base64_encode_scalar().
//
//   src, slen  input buffer and its length in bytes
//   dst, dlen  output buffer and its capacity in bytes; CHECK-fails when
//              dlen < base64_encoded_size(slen, mode)
//   mode       selects the alphabet via base64_select_table()
//
// Returns base64_encoded_size(slen, mode), i.e. the encoded length, not the
// capacity of the caller's buffer.
// NOTE(review): this is intended to match what base64_encode_scalar()
// returns -- confirm against its (not shown here) body.
inline size_t base64_encode_avx512vl(const char* src,
                                     size_t slen,
                                     char* dst,
                                     size_t dlen,
                                     Base64Mode mode) {
  // Each iteration loads 64 source bytes, consumes 48 of them and stores
  // 64 encoded bytes.
  constexpr size_t kLoadBytes = 64;
  constexpr size_t kConsumedBytes = 48;
  constexpr size_t kStoredBytes = 64;

  // Do the exact check as scalar algorithm.
  const size_t encoded_len = base64_encoded_size(slen, mode);
  CHECK(dlen >= encoded_len &&
        "not enough space provided for base64 encode");

  size_t dlen_remain = dlen;
  const char* lookup_tbl = base64_select_table(mode);

  // 32-bit input
  // [ 0  0  0  0  0  0  0  0|c1 c0 d5 d4 d3 d2 d1 d0|
  //  b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4]
  // output order [1, 2, 0, 1]
  // [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
  //  a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c5 c4 c3 c2]
  //
  // Every 32-bit lane therefore gathers one 3-byte input group; sixteen
  // lanes cover source bytes 0..47.
  const __m512i shuffle_input = _mm512_setr_epi32(
      0x01020001, 0x04050304, 0x07080607, 0x0a0b090a,
      0x0d0e0c0d, 0x10110f10, 0x13141213, 0x16171516,
      0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
      0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);

  // Loads 64 bytes of the alphabet into a register.
  // NOTE(review): assumes the table returned by base64_select_table() is at
  // least 64 bytes long -- confirm.
  const __m512i lookup =
      _mm512_loadu_si512(reinterpret_cast<const __m512i*>(lookup_tbl));

  // Bit offsets extracted by the multishift, per 64-bit lane
  // (high byte to low byte): 48, 54, 36, 42, 16, 22, 4, 10
  const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu);

  // A full 64-byte load is issued per iteration, so at least 64 readable
  // source bytes are required even though only 48 are consumed.
  while (slen >= kLoadBytes) {
    const __m512i v =
        _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src));

    // Reorder bytes into the "output order" layout described above.
    const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);

    // After multishift a single 32-bit lane has following layout
    // [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0|
    //  a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0]
    // (a = [10:17], b = [4:11], c = [22:27], d = [16:21])
    const __m512i indices = _mm512_multishift_epi64_epi8(shifts, in);

    // Note: the two higher bits of each indices' byte have garbage
    // but the following permutexvar instruction masks them out.

    // Translation 6-bit values to ASCII.
    const __m512i result = _mm512_permutexvar_epi8(indices, lookup);

    _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), result);

    dlen_remain -= kStoredBytes;
    dst += kStoredBytes;
    src += kConsumedBytes;
    slen -= kConsumedBytes;
  }

  // Encode the tail (fewer than 64 source bytes) with the scalar routine.
  if (slen > 0)
    base64_encode_scalar(src, slen, dst, dlen_remain, mode);

  // Report the encoded length rather than the capacity of `dst`, so that an
  // empty input or an oversized buffer does not yield an inflated count.
  return encoded_len;
}


// Runtime-dispatching entry point: uses the AVX-512 encoder when the running
// CPU reports both "avx512vl" and "avx512vbmi", and the scalar encoder
// otherwise. Arguments and return value are passed through unchanged.
inline size_t base64_encode(const char* src,
                            size_t slen,
                            char* dst,
                            size_t dlen,
                            Base64Mode mode) {
  const bool has_avx512 = __builtin_cpu_supports("avx512vl") &&
                          __builtin_cpu_supports("avx512vbmi");

  if (!has_avx512)
    return base64_encode_scalar(src, slen, dst, dlen, mode);

  return base64_encode_avx512vl(src, slen, dst, dlen, mode);
}
#pragma GCC reset_options
#else
// Portable entry point used when the x86-64/Linux AVX-512 path is compiled
// out: forwards every argument to base64_encode_scalar() and returns its
// result.
inline size_t base64_encode(const char* src,
                            size_t slen,
                            char* dst,
                            size_t dlen,
                            Base64Mode mode) {
  const size_t written = base64_encode_scalar(src, slen, dst, dlen, mode);
  return written;
}
#endif

} // namespace node

#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
Expand Down