Skip to content

Commit

Permalink
feat: add s390x architecture support (#1214)
Browse files Browse the repository at this point in the history
* fix(lua): use native architecture when compiling lua for s390x.

Signed-off-by: iko1 <me@remotecpp.dev>

* feat(server): implement CompareFP for s390x architecture.

Signed-off-by: iko1 <me@remotecpp.dev>

* feat: implement validate_ascii_fast function variant for s390x arch.

Signed-off-by: iko1 <me@remotecpp.dev>

* fix: add comments before s390x vector operations

Signed-off-by: iko1 <me@remotecpp.dev>

* fix validate_ascii_fast function logic after CR comment

Signed-off-by: iko1 <me@remotecpp.dev>

* Revert "fix(lua): use native architecture when compiling lua for s390x."

This reverts commit 6cc5d8a.

* fix(lua): use native architecture when compiling lua for s390x.

Signed-off-by: iko1 <me@remotecpp.dev>

* refactor validate_ascii_fast function after CR comment

Signed-off-by: iko1 <me@remotecpp.dev>

* include vecintrin.h from sse_port.h rather than the misleading filename

Signed-off-by: iko1 <me@remotecpp.dev>

---------

Signed-off-by: iko1 <me@remotecpp.dev>
  • Loading branch information
iko1 committed Jun 18, 2023
1 parent 6d4d740 commit 19d7622
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 2 deletions.
6 changes: 4 additions & 2 deletions patches/lua-v5.4.4.patch
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ index d42d14b7..75647e72 100644
#define LUAI_MAXSTACK 15000
#endif
diff --git a/makefile b/makefile
index d46e650c..e347e614 100644
index d46e650c..c27e5677 100644
--- a/makefile
+++ b/makefile
@@ -66,13 +66,23 @@ LOCAL = $(TESTS) $(CWARNS)
@@ -66,13 +66,25 @@ LOCAL = $(TESTS) $(CWARNS)


# enable Linux goodies
Expand All @@ -32,6 +32,8 @@ index d46e650c..e347e614 100644
+OPTFLAGS= -march=sandybridge
+else ifeq ($(uname_m), aarch64)
+OPTFLAGS= -march=armv8.2-a+fp16+rcpc+dotprod+crypto
+else ifeq ($(uname_m), s390x)
+OPTFLAGS= -march=native
+else
+ $(error ERROR: unknown architecture $(uname_m))
+endif
Expand Down
29 changes: 29 additions & 0 deletions src/core/dash_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -879,6 +879,34 @@ unsigned BucketBase<NUM_SLOTS, NUM_OVR>::UnsetStashPtr(uint8_t fp_hash, unsigned
return res;
}

#ifdef __s390x__
// s390x implementation of CompareFP.
// Compares fp against 16 bytes loaded from finger_arr_ and returns a bitmask in which
// bit i is set iff the i-th loaded byte equals fp.
template <unsigned NUM_SLOTS, unsigned NUM_OVR>
uint32_t BucketBase<NUM_SLOTS, NUM_OVR>::CompareFP(uint8_t fp) const {
  static_assert(FpArray{}.size() <= 16);

  // Broadcast fp into all 16 byte lanes.
  const vector unsigned char needle = vec_splats(fp);

  // Load 16 bytes starting at the beginning of the fingerprint array.
  const vector unsigned char fingerprints = vec_load_len(finger_arr_.data(), 16);

  // Lane-wise equality: matches[i] is non-zero when fingerprints[i] == needle[i], zero otherwise.
  const vector bool char matches = vec_cmpeq(needle, fingerprints);

  // Fold the 16 per-lane results into the low 16 bits of the result, lane i -> bit i.
  uint32_t bits = 0;
  for (unsigned lane = 0; lane < 16; ++lane) {
    bits |= (matches[lane] ? 1u : 0u) << lane;
  }

  return bits;
}
#else
template <unsigned NUM_SLOTS, unsigned NUM_OVR>
uint32_t BucketBase<NUM_SLOTS, NUM_OVR>::CompareFP(uint8_t fp) const {
static_assert(FpArray{}.size() <= 16);
Expand All @@ -898,6 +926,7 @@ uint32_t BucketBase<NUM_SLOTS, NUM_OVR>::CompareFP(uint8_t fp) const {
// Note: Last 2 operations can be combined in skylake with _mm_cmpeq_epi8_mask.
return mask;
}
#endif

// Bucket slot array goes from left to right: [x, x, ...]
// Shift right vacates the first slot on the left by shifting all the elements right and
Expand Down
37 changes: 37 additions & 0 deletions src/core/detail/bitpacking.cc
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,42 @@ static inline pair<const char*, uint8_t*> simd_variant2_pack(const char* ascii,
// See https://github.com/lemire/fastvalidate-utf-8/
// The function returns true (1) if all chars passed in src are
// 7-bit values (0x00..0x7F). Otherwise, it returns false (0).
#ifdef __s390x__
bool validate_ascii_fast(const char* src, size_t len) {
size_t i = 0;

// Initialize a vector in which all the elements are set to zero.
vector unsigned char has_error = vec_splat_s8(0);
if (len >= 16) {
for (; i <= len - 16; i += 16) {
// Load 16 bytes from buffer into a vector.
vector unsigned char current_bytes = vec_load_len((signed char*)(src + i), 16);
// Perform a bitwise OR operation between the current and the previously loaded contents.
has_error = vec_orc(has_error, current_bytes);
}
}

// Initialize a vector in which all the elements are set to an invalid ASCII value.
vector unsigned char rep_invalid_values = vec_splat_s8(0x80);

// Perform bitwise AND-complement operation between two vectors.
vector unsigned char andc_result = vec_andc(rep_invalid_values, has_error);

// Tests whether any of corresponding elements of the given vectors are not equal.
// After the bitwise operation, both vectors should be equal if ASCII values.
if (!vec_all_eq(rep_invalid_values, andc_result)) {
return false;
}

for (; i < len; i++) {
if (src[i] & 0x80) {
return false;
}
}

return true;
}
#else
bool validate_ascii_fast(const char* src, size_t len) {
size_t i = 0;
__m128i has_error = _mm_setzero_si128();
Expand All @@ -123,6 +159,7 @@ bool validate_ascii_fast(const char* src, size_t len) {

return !error_mask;
}
#endif

// len must be at least 16
void ascii_pack(const char* ascii, size_t len, uint8_t* bin) {
Expand Down
4 changes: 4 additions & 0 deletions src/core/sse_port.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,16 @@
#pragma once
#if defined(__aarch64__)
#include "base/sse2neon.h"
#elif defined(__s390x__)
#include <vecintrin.h>
#else
#include <emmintrin.h>
#include <tmmintrin.h>
#endif

namespace dfly {

#ifndef __s390x__
inline __m128i mm_loadu_si128(const __m128i* ptr) {
#if defined(__aarch64__)
__m128i res;
Expand All @@ -22,5 +25,6 @@ inline __m128i mm_loadu_si128(const __m128i* ptr) {
return _mm_loadu_si128(ptr);
#endif
}
#endif

} // namespace dfly

0 comments on commit 19d7622

Please sign in to comment.