diff --git a/0001-kylin-hyperscan-5.4.2-add-loongarch64-support.patch b/0001-kylin-hyperscan-5.4.2-add-loongarch64-support.patch new file mode 100644 index 0000000000000000000000000000000000000000..41e27a3de68f789d2d43684ef1c7b1d2d0035b37 --- /dev/null +++ b/0001-kylin-hyperscan-5.4.2-add-loongarch64-support.patch @@ -0,0 +1,1407 @@ +From 5a73aacd6e22ff6d751c568e109f2fb6914b0472 Mon Sep 17 00:00:00 2001 +From: peijiankang +Date: Thu, 18 Jan 2024 15:47:45 +0800 +Subject: [PATCH] kylin hyperscan-5.4.2 add loongarch64 support + +--- + CMakeLists.txt | 14 +- + cmake/arch.cmake | 5 +- + cmake/config.h.in | 6 + + cmake/platform.cmake | 5 +- + src/dispatcher.c | 4 +- + src/hs_valid_platform.c | 7 +- + src/nfa/shufti.c | 6 +- + src/nfa/truffle.c | 4 +- + src/rose/counting_miracle.h | 2 +- + src/util/arch.h | 4 + + src/util/cpuid_flags.c | 2 + + src/util/intrinsics.h | 6 + + src/util/simd_loongarch.h | 956 +++++++++++++++++++++++++++++++++++ + src/util/simd_types.h | 3 + + src/util/simd_utils.h | 2 + + src/util/state_compress.c | 31 +- + unit/hyperscan/behaviour.cpp | 8 + + 17 files changed, 1033 insertions(+), 32 deletions(-) + create mode 100644 src/util/simd_loongarch.h + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 77eaa25..1c49fc4 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -183,7 +183,7 @@ else() + string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}") + endforeach () + +- if (CMAKE_COMPILER_IS_GNUCC) ++ if (ARCH_IA32 OR ARCH_X86_64 AND CMAKE_COMPILER_IS_GNUCC) + message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}") + # If gcc doesn't recognise the host cpu, then mtune=native becomes + # generic, which isn't very good in some cases. march=native looks at +@@ -284,6 +284,10 @@ else() + set(ARCH_CXX_FLAGS "-march=armv8-a -mtune=${TUNE_FLAG}") + endif() + endif() ++ if (ARCH_LOONGARCH64) ++ set(ARCH_CXX_FLAGS "-mlsx") ++ set(ARCH_C_FLAGS "-mlsx") ++ endif() + + if(CMAKE_COMPILER_IS_GNUCC) + # spurious warnings? 
+@@ -328,6 +332,10 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + CHECK_INCLUDE_FILE_CXX(arm_neon.h HAVE_CXX_ARM_NEON_H) + endif() + ++if (ARCH_LOONGARCH64) ++ CHECK_INCLUDE_FILES(lsxintrin.h HAVE_C_LSXINTRIN_H) ++endif() ++ + CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN) + CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC) + +@@ -359,8 +367,8 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux") + (CMAKE_VERSION VERSION_GREATER "3.0" AND CMAKE_GENERATOR MATCHES "Ninja"))) + message (STATUS "Building the fat runtime requires the Unix Makefiles generator, or Ninja with CMake v3.0 or higher") + set (FAT_RUNTIME_REQUISITES FALSE) +- elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") +- message(STATUS "AARCH64 platform don't support fat runtime") ++ elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64|loongarch64|LOONGARCH64") ++ message(STATUS "AARCH64 and LOONGARCH64 platform don't support fat runtime") + set (FAT_RUNTIME_REQUISITES FALSE) + else() + include (${CMAKE_MODULE_PATH}/attrib.cmake) +diff --git a/cmake/arch.cmake b/cmake/arch.cmake +index eb4791e..19aa693 100644 +--- a/cmake/arch.cmake ++++ b/cmake/arch.cmake +@@ -6,6 +6,9 @@ if (HAVE_C_X86INTRIN_H) + set (INTRIN_INC_H "x86intrin.h") + elseif (HAVE_C_INTRIN_H) + set (INTRIN_INC_H "intrin.h") ++elseif (HAVE_C_LSXINTRIN_H) ++ set (INTRIN_INC_H "lsxintrin.h") ++ set (FAT_RUNTIME OFF) + else () + message (FATAL_ERROR "No intrinsics header found") + endif () +@@ -81,7 +84,7 @@ int main(){ + (void)_mm512_permutexvar_epi8(idx, a); + }" HAVE_AVX512VBMI) + +-if (FAT_RUNTIME) ++if (FAT_RUNTIME AND (ARCH_IA32 OR ARCH_X86_64)) + if (NOT HAVE_SSSE3) + message(FATAL_ERROR "SSSE3 support required to build fat runtime") + endif () +diff --git a/cmake/config.h.in b/cmake/config.h.in +index 336cf19..c33c4a9 100644 +--- a/cmake/config.h.in ++++ b/cmake/config.h.in +@@ -18,6 +18,9 @@ + /* "Define if building for aarch64" */ + #cmakedefine ARCH_AARCH64 + ++/* "Define if building for LOONGARCH64" */ ++#cmakedefine ARCH_LOONGARCH64 ++ + /* internal build, switch on dump support. */ + #cmakedefine DUMP_SUPPORT + +@@ -57,6 +60,9 @@ + /* C compiler has arm_neon.h */ + #cmakedefine HAVE_C_ARM_NEON_H + ++/* C compiler has lsxintrin.h */ ++#cmakedefine HAVE_C_LSXINTRIN_H ++ + /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to + 0 if you don't. 
*/ + #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP +diff --git a/cmake/platform.cmake b/cmake/platform.cmake +index 213dcc5..58c2dc6 100644 +--- a/cmake/platform.cmake ++++ b/cmake/platform.cmake +@@ -6,9 +6,10 @@ CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error n + CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32) + + CHECK_C_SOURCE_COMPILES("#if !(defined(__aarch64__))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64) ++CHECK_C_SOURCE_COMPILES("#if !(defined(__loongarch64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_LOONGARCH64) + +-if (ARCH_X86_64 OR ARCH_AARCH64) ++if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_LOONGARCH64) + set(ARCH_64_BIT 1) + elseif (ARCH_IA32) + set(ARCH_32_BIT 1) +-endif() +\ No newline at end of file ++endif() +diff --git a/src/dispatcher.c b/src/dispatcher.c +index 9a8afa6..74dfb4a 100644 +--- a/src/dispatcher.c ++++ b/src/dispatcher.c +@@ -56,6 +56,7 @@ + return (RTYPE)HS_ARCH_ERROR; \ + } \ + \ ++#if defined(ARCH_X86_64) + /* resolver */ \ + static RTYPE (*JOIN(resolve_, NAME)(void))(__VA_ARGS__) { \ + if (check_avx512vbmi()) { \ +@@ -75,7 +76,8 @@ + } \ + /* anything else is fail */ \ + return JOIN(error_, NAME); \ +- } \ ++ } ++#endif \ + \ + /* function */ \ + HS_PUBLIC_API \ +diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c +index 035d3ff..a121e7e 100644 +--- a/src/hs_valid_platform.c ++++ b/src/hs_valid_platform.c +@@ -37,12 +37,11 @@ hs_error_t HS_CDECL hs_valid_platform(void) { + if (check_ssse3()) { + return HS_SUCCESS; + } +-#else ++#elif defined(ARCH_AARCH64) + if (check_neon()) { + return HS_SUCCESS; + } ++#elif defined(ARCH_LOONGARCH64) ++ return HS_SUCCESS; + #endif +- else { +- return HS_ARCH_ERROR; +- } + } +diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c +index 2cb74f0..090eb8f 100644 +--- a/src/nfa/shufti.c ++++ b/src/nfa/shufti.c +@@ -159,7 +159,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + } + + const m128 zeroes = zeroes128(); +- const m128 low4bits = set16x8(0xf); ++ const m128 low4bits = __lsx_vldi(0xf); + const u8 *rv; + + size_t min = (size_t)buf % 16; +@@ -251,7 +251,7 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + } + + const m128 zeroes = zeroes128(); +- const m128 low4bits = set16x8(0xf); ++ const m128 low4bits = __lsx_vldi(0xf); + const u8 *rv; + + assert(buf_end - buf >= 16); +@@ -325,7 +325,7 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { + const m128 ones = ones128(); +- const m128 low4bits = set16x8(0xf); ++ const m128 low4bits = __lsx_vldi(0xf); + const u8 *rv; + + size_t min = (size_t)buf % 16; +diff --git a/src/nfa/truffle.c b/src/nfa/truffle.c +index c05d778..9bb8d88 100644 +--- a/src/nfa/truffle.c ++++ b/src/nfa/truffle.c +@@ -64,8 +64,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) { + static really_inline + u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { + +- m128 highconst = set16x8(0x80); +- m128 shuf_mask_hi = set2x64(0x8040201008040201); ++ m128 highconst = __lsx_vldi(0x80); ++ m128 shuf_mask_hi = __lsx_vreplgr2vr_d(0x8040201008040201); + + // and now do the real work + m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v); +diff --git a/src/rose/counting_miracle.h b/src/rose/counting_miracle.h +index 4456679..1cf5189 100644 +--- a/src/rose/counting_miracle.h ++++ b/src/rose/counting_miracle.h +@@ 
-94,7 +94,7 @@ u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison,
+ u32 count = *count_inout;
+
+ const m128 zeroes = zeroes128();
+- const m128 low4bits = set16x8(0xf);
++ const m128 low4bits = __lsx_vldi(0xf);
+
+ for (; d + 16 <= d_end; d_end -= 16) {
+ m128 data = loadu128(d_end - 16);
+diff --git a/src/util/arch.h b/src/util/arch.h
+index fe4a910..b650816 100644
+--- a/src/util/arch.h
++++ b/src/util/arch.h
+@@ -98,4 +98,8 @@
+ #define NO_ASM
+ #endif
+
++#if defined(__loongarch64)
++#define NO_ASM
++#endif
++
+ #endif // UTIL_ARCH_H_
+diff --git a/src/util/cpuid_flags.c b/src/util/cpuid_flags.c
+index 96286ee..57fc34d 100644
+--- a/src/util/cpuid_flags.c
++++ b/src/util/cpuid_flags.c
+@@ -33,9 +33,11 @@
+ #include "hs_internal.h"
+ #include "util/arch.h"
+
++#if defined(__x86_64__) || defined(_M_X64)
+ #if !defined(_WIN32) && !defined(CPUID_H_)
+ #include <cpuid.h>
+ #endif
++#endif
+
+ u64a cpuid_flags(void) {
+ u64a cap = 0;
+diff --git a/src/util/intrinsics.h b/src/util/intrinsics.h
+index ece3b1a..9c9d95a 100644
+--- a/src/util/intrinsics.h
++++ b/src/util/intrinsics.h
+@@ -45,6 +45,10 @@
+ # endif
+ #endif
+
++#if defined(HAVE_C_LSXINTRIN_H)
++# define USE_LSXINTRIN_H
++#endif
++
+ #ifdef __cplusplus
+ # if defined(HAVE_CXX_INTRIN_H)
+ # define USE_INTRIN_H
+@@ -71,6 +75,8 @@
+ #include <intrin.h>
+ #elif defined(USE_ARM_NEON_H)
+ #include <arm_neon.h>
++#elif defined(USE_LSXINTRIN_H)
++#include <lsxintrin.h>
+ #else
+ #error no intrinsics file
+ #endif
+diff --git a/src/util/simd_loongarch.h b/src/util/simd_loongarch.h
+new file mode 100644
+index 0000000..b311ffb
+--- /dev/null
++++ b/src/util/simd_loongarch.h
+@@ -0,0 +1,956 @@
++/*
++ * Copyright (c) 2015-2017, Intel Corporation
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions are met:
++ *
++ * * Redistributions of source code must retain the above copyright notice,
++ * this list of conditions and the following disclaimer.
++ * * Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * * Neither the name of Intel Corporation nor the names of its contributors
++ * may be used to endorse or promote products derived from this software
++ * without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
++ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
++ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
++ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
++ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
++ * POSSIBILITY OF SUCH DAMAGE.
++ */
++
++/** \file
++ * \brief SIMD types and primitive operations. 
++ */
++
++#ifndef SIMD_LSX
++#define SIMD_LSX
++
++#include "config.h"
++#include "ue2common.h"
++#include "simd_types.h"
++#include "unaligned.h"
++#include "util/arch.h"
++#include "util/intrinsics.h"
++#include <lsxintrin.h>
++
++#include <string.h> // for memcpy
++
++// Define a common assume_aligned using an appropriate compiler built-in, if
++// it's available. Note that we need to handle C or C++ compilation.
++#ifdef __cplusplus
++#ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED
++#define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
++#endif
++#else
++#ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED
++#define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
++#endif
++#endif
++
++// Fallback to identity case.
++#ifndef assume_aligned
++#define assume_aligned(x, y) (x)
++#endif
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++extern const char vbs_mask_data[];
++#ifdef __cplusplus
++}
++#endif
++
++static really_inline m128 ones128(void) {
++ /* gcc gets this right */
++ return __lsx_vldi(0xFF);
++}
++
++static really_inline m128 zeroes128(void) {
++ return __lsx_vldi(0);
++}
++
++/** \brief Bitwise not for m128*/
++static really_inline m128 not128(m128 a) {
++ return __lsx_vxor_v(a,ones128());
++}
++
++/** \brief Return 1 if a and b are different otherwise 0 */
++static really_inline int diff128(m128 a, m128 b) {
++ return (__lsx_vpickve2gr_hu(__lsx_vmskltz_b(__lsx_vseq_b(a, b)), 0) ^ 0xffff);
++}
++
++static really_inline int isnonzero128(m128 a) {
++ return !!diff128(a, zeroes128());
++}
++
++/**
++ * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit
++ * mask indicating which 32-bit words contain differences.
++ */
++static really_inline u32 diffrich128(m128 a, m128 b) {
++ a = __lsx_vseq_w(a, b);
++ return ~( __lsx_vpickve2gr_hu(__lsx_vmskltz_w(a),0)) & 0xf;
++}
++/**
++ * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and
++ * returns a 4-bit mask indicating which 64-bit words contain differences.
++ */
++static really_inline u32 diffrich64_128(m128 a, m128 b) {
++ u32 d = diffrich128(a, b);
++ return (d | (d >> 1)) & 0x5;
++}
++
++static really_really_inline
++m128 lshift64_m128(m128 a, unsigned b) {
++ m128 tmp = __lsx_vinsgr2vr_w(zeroes128(), b, 0);
++
++ m128 x = __lsx_vinsgr2vr_w(tmp, b, 2);
++ return __lsx_vsll_d(a, x);
++}
++
++#define rshift64_m128(a, b) __lsx_vsrli_d((a), (b))
++#define eq128(a, b) __lsx_vseq_b((a), (b))
++#define movemask128(a) __lsx_vpickve2gr_hu(__lsx_vmskltz_b(a), 0)
++
++static really_inline m128 set16x8(u8 c) {
++ return __lsx_vreplgr2vr_b(c);
++}
++
++static really_inline m128 set4x32(u32 c) {
++ return __lsx_vreplgr2vr_w(c);
++}
++
++static really_inline u32 movd(const m128 in) {
++ return __lsx_vpickve2gr_w(in, 0);
++}
++
++static really_inline u64a movq(const m128 in) {
++ u32 lo = movd(in);
++ u32 hi = movd(__lsx_vsrli_d(in, 32));
++ return (u64a)hi << 32 | lo;
++}
++
++/* another form of movq */
++static really_inline
++m128 load_m128_from_u64a(const u64a *p) {
++ return __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(0LL), *p, 0);
++}
++
++#define L(a) __lsx_vand_v(a, __lsx_vinsgr2vr_d(__lsx_vldi(0), 0xFFFFFFFFFFFFFFFF, 0))
++#define M(a) __lsx_vand_v(a, __lsx_vinsgr2vr_d(__lsx_vldi(0xFF), 0x0000000000000000, 0))
++#define N(a) __lsx_vpickod_d(__lsx_vldi(0),a)
++#define U(a) __lsx_vpickev_d(a, __lsx_vldi(0))
++#define rshiftbyte_m128(a, count_immed) \
++ (((count_immed) < 8) ? 
(__lsx_vor_v(__lsx_vsrli_d(M(a), (8*count_immed)), __lsx_vor_v(__lsx_vsrli_d(L(a), (8*count_immed)), __lsx_vslli_d(N(a), (64-(8*count_immed)))))) : (__lsx_vsrli_d(N(a),((8*count_immed)-64)))) ++ ++#define lshiftbyte_m128(a, count_immed) \ ++ (((count_immed) < 8) ? (__lsx_vor_v(__lsx_vslli_d(L(a), (8*count_immed)), __lsx_vor_v(__lsx_vslli_d(M(a), (8*count_immed)), __lsx_vsrli_d(U(a), (64-(8*count_immed)))))) : (__lsx_vslli_d(U(a),((8*count_immed)-64)))) ++ ++#define extract32from128(a, imm) \ ++ (((imm) < 2) ? (__lsx_vor_v(__lsx_vsrli_d(M(a), (32*imm)), __lsx_vor_v(__lsx_vsrli_d(L(a), (32*imm)), __lsx_vslli_d(N(a), (64-(32*imm)))))) : (__lsx_vsrli_d(N(a),((32*imm)-64)))) ++#define extract64from128(a, imm) \ ++ (((imm) < 1) ? (__lsx_vor_v(__lsx_vsrli_d(M(a), (64*imm)), __lsx_vor_v(__lsx_vsrli_d(L(a), (64*imm)), __lsx_vslli_d(N(a), (64-(64*imm)))))) : (__lsx_vsrli_d(N(a),((64*imm)-64)))) ++ ++#define extractlow64from256(a) movq(a.lo) ++#define extractlow32from256(a) movd(a.lo) ++ ++static really_inline m128 and128(m128 a, m128 b) { ++ return __lsx_vand_v(a,b); ++} ++ ++static really_inline m128 xor128(m128 a, m128 b) { ++ return __lsx_vxor_v(a,b); ++} ++ ++static really_inline m128 or128(m128 a, m128 b) { ++ return __lsx_vor_v(a,b); ++} ++ ++static really_inline m128 andnot128(m128 a, m128 b) { ++ return __lsx_vandn_v(a,b); ++} ++ ++// aligned load ++static really_inline m128 load128(const void *ptr) { ++ assert(ISALIGNED_N(ptr, alignof(m128))); ++ ptr = assume_aligned(ptr, 16); ++ return __lsx_vldx((const m128 *)ptr,0); ++} ++ ++// aligned store ++static really_inline void store128(void *ptr, m128 a) { ++ assert(ISALIGNED_N(ptr, alignof(m128))); ++ ptr = assume_aligned(ptr, 16); ++ *(m128 *)ptr = a; ++} ++ ++// unaligned load ++static really_inline m128 loadu128(const void *ptr) { ++ return __lsx_vldx((const m128 *)ptr,0); ++} ++ ++// unaligned store ++static really_inline void storeu128(void *ptr, m128 a) { ++ __lsx_vst(a,(m128 *)ptr,0); ++} ++ ++// packed unaligned store of first N bytes ++static really_inline ++void storebytes128(void *ptr, m128 a, unsigned int n) { ++ assert(n <= sizeof(a)); ++ memcpy(ptr, &a, n); ++} ++ ++// packed unaligned load of first N bytes, pad with zero ++static really_inline ++m128 loadbytes128(const void *ptr, unsigned int n) { ++ m128 a = zeroes128(); ++ assert(n <= sizeof(a)); ++ memcpy(&a, ptr, n); ++ return a; ++} ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++extern const u8 simd_onebit_masks[]; ++#ifdef __cplusplus ++} ++#endif ++ ++static really_inline ++m128 mask1bit128(unsigned int n) { ++ assert(n < sizeof(m128) * 8); ++ u32 mask_idx = ((n % 8) * 64) + 95; ++ mask_idx -= n / 8; ++ return loadu128(&simd_onebit_masks[mask_idx]); ++} ++ ++// switches on bit N in the given vector. ++static really_inline ++void setbit128(m128 *ptr, unsigned int n) { ++ *ptr = or128(mask1bit128(n), *ptr); ++} ++ ++// switches off bit N in the given vector. ++static really_inline ++void clearbit128(m128 *ptr, unsigned int n) { ++ *ptr = andnot128(mask1bit128(n), *ptr); ++} ++ ++// tests bit N in the given vector. ++static really_inline ++char testbit128(m128 val, unsigned int n) { ++ const m128 mask = mask1bit128(n); ++ return isnonzero128(and128(mask, val)); ++} ++ ++#define palignr(r, l, offset) \ ++ (((offset) < 8) ? 
__lsx_vor_v(rshiftbyte_m128(l,(offset)),lshiftbyte_m128(U(r),(8-(offset)))) : __lsx_vor_v(rshiftbyte_m128(l,(offset)), lshiftbyte_m128(r,(16-(offset))))) ++ ++static really_inline ++m128 shuffle_epi8(m128 a, m128 b) { ++ m128 tmp1,tmp2,tmp3,dst; ++ tmp1 = ~(__lsx_vslt_b(b,__lsx_vldi(0))); ++ tmp2 = __lsx_vand_v(b,tmp1); ++ tmp3 = __lsx_vand_v(tmp2, __lsx_vldi(0x0F)); ++ unsigned char* p = (unsigned char*)&tmp3; ++ unsigned char* pa = (unsigned char*)&a; ++ for (int i = 0; i < 16; i++) { ++ unsigned char value = p[i]; ++ switch(i){ ++ case 0:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 0);}else{dst = tmp3;}break; ++ case 1:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 1);}else{dst = tmp3;}break; ++ case 2:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 2);}else{dst = tmp3;}break; ++ case 3:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 3);}else{dst = tmp3;}break; ++ case 4:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 4);}else{dst = tmp3;}break; ++ case 5:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 5);}else{dst = tmp3;}break; ++ case 6:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 6);}else{dst = tmp3;}break; ++ case 7:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 7);}else{dst = tmp3;}break; ++ case 8:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 8);}else{dst = tmp3;}break; ++ case 9:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 9);}else{dst = tmp3;}break; ++ case 10:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 10);}else{dst = tmp3;}break; ++ case 11:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 11);}else{dst = tmp3;}break; ++ case 12:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 12);}else{dst = tmp3;}break; ++ case 13:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 13);}else{dst = tmp3;}break; ++ case 14:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 14);}else{dst = tmp3;}break; ++ case 15:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 15);}else{dst = tmp3;}break; ++ default:break; ++ } ++ tmp3 = dst; ++ } ++ return dst; ++} ++ ++static really_inline ++m128 pshufb_m128(m128 a, m128 b) { ++ m128 result; ++ result = shuffle_epi8(a, b); ++ return result; ++} ++ ++static really_inline ++m256 pshufb_m256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = pshufb_m128(a.lo, b.lo); ++ rv.hi = pshufb_m128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline ++m128 variable_byte_shift_m128(m128 in, s32 amount) { ++ assert(amount >= -16 && amount <= 16); ++ m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); ++ return pshufb_m128(in, shift_mask); ++} ++ ++static really_inline ++m128 max_u8_m128(m128 a, m128 b) { ++ return __lsx_vmax_bu(a, b); ++} ++ ++static really_inline ++m128 min_u8_m128(m128 a, m128 b) { ++ return __lsx_vmin_bu(a, b); ++} ++ ++static really_inline ++m128 sadd_u8_m128(m128 a, m128 b) { ++ return __lsx_vsadd_bu(a, b); ++} ++ ++static really_inline ++m128 sub_u8_m128(m128 a, m128 b) { ++ return __lsx_vsub_b(a, b); ++} ++ ++static really_inline ++m128 set64x2(u64a hi, u64a lo) { ++ return __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(hi),lo,0); ++} ++ ++/**** ++ **** 256-bit Primitives ++ ****/ ++ ++static really_really_inline ++m256 lshift64_m256(m256 a, int b) { ++ m256 rv = a; ++ rv.lo = lshift64_m128(rv.lo, b); ++ rv.hi = lshift64_m128(rv.hi, b); ++ return rv; ++} ++ ++static really_inline ++m256 rshift64_m256(m256 a, int b) { ++ m256 rv = a; ++ rv.lo = rshift64_m128(rv.lo, b); ++ rv.hi = rshift64_m128(rv.hi, b); ++ return rv; ++} ++static 
really_inline ++m256 set32x8(u32 in) { ++ m256 rv; ++ rv.lo = set16x8((u8) in); ++ rv.hi = rv.lo; ++ return rv; ++} ++ ++static really_inline ++m256 eq256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = eq128(a.lo, b.lo); ++ rv.hi = eq128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline ++u32 movemask256(m256 a) { ++ u32 lo_mask = movemask128(a.lo); ++ u32 hi_mask = movemask128(a.hi); ++ return lo_mask | (hi_mask << 16); ++} ++ ++static really_inline ++m256 set2x128(m128 a) { ++ m256 rv = {a, a}; ++ return rv; ++} ++ ++static really_inline m256 zeroes256(void) { ++ m256 rv = {zeroes128(), zeroes128()}; ++ return rv; ++} ++ ++static really_inline m256 ones256(void) { ++ m256 rv = {ones128(), ones128()}; ++ return rv; ++} ++ ++static really_inline m256 and256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = and128(a.lo, b.lo); ++ rv.hi = and128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m256 or256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = or128(a.lo, b.lo); ++ rv.hi = or128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m256 xor256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = xor128(a.lo, b.lo); ++ rv.hi = xor128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m256 not256(m256 a) { ++ m256 rv; ++ rv.lo = not128(a.lo); ++ rv.hi = not128(a.hi); ++ return rv; ++} ++ ++static really_inline m256 andnot256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = andnot128(a.lo, b.lo); ++ rv.hi = andnot128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline int diff256(m256 a, m256 b) { ++ return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); ++} ++ ++static really_inline int isnonzero256(m256 a) { ++ return isnonzero128(or128(a.lo, a.hi)); ++} ++ ++/** ++ * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit ++ * mask indicating which 32-bit words contain differences. ++ */ ++static really_inline u32 diffrich256(m256 a, m256 b) { ++ m128 z = zeroes128(); ++ m128 tmp0,tmp1,tmp2,tmp3; ++ a.lo = __lsx_vseq_w(a.lo, b.lo); ++ a.hi = __lsx_vseq_w(a.hi, b.hi); ++ ++ tmp0 =__lsx_vsat_w(a.lo, 15); ++ tmp1 =__lsx_vsat_w(b.hi, 15); ++ tmp2 =__lsx_vsat_h(__lsx_vpickev_h(tmp1, tmp0), 7); ++ tmp3 =__lsx_vsat_h(z, 7); ++ m128 packed = __lsx_vpickev_b(tmp3, tmp2); ++ ++ return ~(__lsx_vpickve2gr_hu(__lsx_vmskltz_b(packed), 0)) & 0xff; ++} ++ ++/** ++ * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and ++ * returns an 8-bit mask indicating which 64-bit words contain differences. 
++ */ ++static really_inline u32 diffrich64_256(m256 a, m256 b) { ++ u32 d = diffrich256(a, b); ++ return (d | (d >> 1)) & 0x55555555; ++} ++ ++// aligned load ++static really_inline m256 load256(const void *ptr) { ++ assert(ISALIGNED_N(ptr, alignof(m256))); ++ m256 rv = {load128(ptr), load128((const char *)ptr + 16)}; ++ return rv; ++} ++ ++// aligned load of 128-bit value to low and high part of 256-bit value ++static really_inline m256 load2x128(const void *ptr) { ++ assert(ISALIGNED_N(ptr, alignof(m128))); ++ m256 rv; ++ rv.hi = rv.lo = load128(ptr); ++ return rv; ++} ++ ++static really_inline m256 loadu2x128(const void *ptr) { ++ return set2x128(loadu128(ptr)); ++} ++ ++// aligned store ++static really_inline void store256(void *ptr, m256 a) { ++ assert(ISALIGNED_N(ptr, alignof(m256))); ++ ptr = assume_aligned(ptr, 16); ++ *(m256 *)ptr = a; ++} ++ ++// unaligned load ++static really_inline m256 loadu256(const void *ptr) { ++ m256 rv = {loadu128(ptr), loadu128((const char *)ptr + 16)}; ++ return rv; ++} ++ ++// unaligned store ++static really_inline void storeu256(void *ptr, m256 a) { ++ storeu128(ptr, a.lo); ++ storeu128((char *)ptr + 16, a.hi); ++} ++ ++// packed unaligned store of first N bytes ++static really_inline void storebytes256(void *ptr, m256 a, unsigned int n) { ++ assert(n <= sizeof(a)); ++ memcpy(ptr, &a, n); ++} ++ ++// packed unaligned load of first N bytes, pad with zero ++static really_inline m256 loadbytes256(const void *ptr, unsigned int n) { ++ m256 a = zeroes256(); ++ assert(n <= sizeof(a)); ++ memcpy(&a, ptr, n); ++ return a; ++} ++ ++static really_inline m256 mask1bit256(unsigned int n) { ++ assert(n < sizeof(m256) * 8); ++ u32 mask_idx = ((n % 8) * 64) + 95; ++ mask_idx -= n / 8; ++ return loadu256(&simd_onebit_masks[mask_idx]); ++} ++ ++static really_inline m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { ++ m256 rv; ++ rv.hi = set64x2(hi_1, hi_0); ++ rv.lo = set64x2(lo_1, lo_0); ++ return rv; ++} ++ ++// switches on bit N in the given vector. ++static really_inline void setbit256(m256 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo; ++ } else { ++ sub = &ptr->hi; ++ n -= 128; ++ } ++ setbit128(sub, n); ++} ++ ++// switches off bit N in the given vector. ++static really_inline void clearbit256(m256 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo; ++ } else { ++ sub = &ptr->hi; ++ n -= 128; ++ } ++ clearbit128(sub, n); ++} ++ ++// tests bit N in the given vector. 
++static really_inline char testbit256(m256 val, unsigned int n) { ++ assert(n < sizeof(val) * 8); ++ m128 sub; ++ if (n < 128) { ++ sub = val.lo; ++ } else { ++ sub = val.hi; ++ n -= 128; ++ } ++ return testbit128(sub, n); ++} ++ ++static really_really_inline ++m128 movdq_hi(m256 x) { ++ return x.hi; ++} ++ ++static really_really_inline m128 movdq_lo(m256 x) { return x.lo;} ++ ++static really_inline m256 combine2x128(m128 hi, m128 lo) { ++ m256 rv = {lo, hi}; ++ return rv; ++} ++ ++/**** ++ **** 384-bit Primitives ++ ****/ ++ ++static really_inline m384 and384(m384 a, m384 b) { ++ m384 rv; ++ rv.lo = and128(a.lo, b.lo); ++ rv.mid = and128(a.mid, b.mid); ++ rv.hi = and128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m384 or384(m384 a, m384 b) { ++ m384 rv; ++ rv.lo = or128(a.lo, b.lo); ++ rv.mid = or128(a.mid, b.mid); ++ rv.hi = or128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m384 xor384(m384 a, m384 b) { ++ m384 rv; ++ rv.lo = xor128(a.lo, b.lo); ++ rv.mid = xor128(a.mid, b.mid); ++ rv.hi = xor128(a.hi, b.hi); ++ return rv; ++} ++static really_inline m384 not384(m384 a) { ++ m384 rv; ++ rv.lo = not128(a.lo); ++ rv.mid = not128(a.mid); ++ rv.hi = not128(a.hi); ++ return rv; ++} ++static really_inline m384 andnot384(m384 a, m384 b) { ++ m384 rv; ++ rv.lo = andnot128(a.lo, b.lo); ++ rv.mid = andnot128(a.mid, b.mid); ++ rv.hi = andnot128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_really_inline m384 lshift64_m384(m384 a, unsigned b) { ++ m384 rv; ++ rv.lo = lshift64_m128(a.lo, b); ++ rv.mid = lshift64_m128(a.mid, b); ++ rv.hi = lshift64_m128(a.hi, b); ++ return rv; ++} ++ ++static really_inline m384 zeroes384(void) { ++ m384 rv = {zeroes128(), zeroes128(), zeroes128()}; ++ return rv; ++} ++ ++static really_inline m384 ones384(void) { ++ m384 rv = {ones128(), ones128(), ones128()}; ++ return rv; ++} ++ ++static really_inline int diff384(m384 a, m384 b) { ++ return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); ++} ++ ++static really_inline int isnonzero384(m384 a) { ++ return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); ++} ++ ++/** ++ * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit ++ * mask indicating which 32-bit words contain differences. ++ */ ++static really_inline u32 diffrich384(m384 a, m384 b) { ++ m128 z = zeroes128(); ++ m128 tmp0,tmp1,tmp2,tmp3,tmp4,tmp5; ++ a.lo = __lsx_vseq_w(a.lo, b.lo); ++ a.mid = __lsx_vseq_w(a.mid, b.mid); ++ a.hi = __lsx_vseq_w(a.hi, b.hi); ++ ++ tmp0 = __lsx_vsat_w(a.lo, 15); ++ tmp1 = __lsx_vsat_w(b.mid, 15); ++ ++ tmp2 = __lsx_vsat_w(b.hi, 15); ++ tmp3 = __lsx_vsat_w(z, 15); ++ ++ tmp4 = __lsx_vsat_h(__lsx_vpickev_h(tmp1, tmp0),7); ++ tmp5 = __lsx_vsat_h(__lsx_vpickev_h(tmp3, tmp2),7); ++ ++ m128 packed = __lsx_vpickev_b(tmp5,tmp4); ++ ++ return ~(__lsx_vpickve2gr_hu(__lsx_vmskltz_b(packed), 0)) & 0xfff; ++} ++ ++/** ++ * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and ++ * returns a 12-bit mask indicating which 64-bit words contain differences. 
++ */ ++static really_inline u32 diffrich64_384(m384 a, m384 b) { ++ u32 d = diffrich384(a, b); ++ return (d | (d >> 1)) & 0x55555555; ++} ++ ++// aligned load ++static really_inline m384 load384(const void *ptr) { ++ assert(ISALIGNED_16(ptr)); ++ m384 rv = {load128(ptr), load128((const char *)ptr + 16), ++ load128((const char *)ptr + 32)}; ++ return rv; ++} ++ ++// aligned store ++static really_inline void store384(void *ptr, m384 a) { ++ assert(ISALIGNED_16(ptr)); ++ ptr = assume_aligned(ptr, 16); ++ *(m384 *)ptr = a; ++} ++ ++// unaligned load ++static really_inline m384 loadu384(const void *ptr) { ++ m384 rv = {loadu128(ptr), loadu128((const char *)ptr + 16), ++ loadu128((const char *)ptr + 32)}; ++ return rv; ++} ++ ++// packed unaligned store of first N bytes ++static really_inline void storebytes384(void *ptr, m384 a, unsigned int n) { ++ assert(n <= sizeof(a)); ++ memcpy(ptr, &a, n); ++} ++ ++// packed unaligned load of first N bytes, pad with zero ++static really_inline m384 loadbytes384(const void *ptr, unsigned int n) { ++ m384 a = zeroes384(); ++ assert(n <= sizeof(a)); ++ memcpy(&a, ptr, n); ++ return a; ++} ++ ++// switches on bit N in the given vector. ++static really_inline void setbit384(m384 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo; ++ } else if (n < 256) { ++ sub = &ptr->mid; ++ } else { ++ sub = &ptr->hi; ++ } ++ setbit128(sub, n % 128); ++} ++ ++// switches off bit N in the given vector. ++static really_inline void clearbit384(m384 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo; ++ } else if (n < 256) { ++ sub = &ptr->mid; ++ } else { ++ sub = &ptr->hi; ++ } ++ clearbit128(sub, n % 128); ++} ++ ++// tests bit N in the given vector. ++static really_inline char testbit384(m384 val, unsigned int n) { ++ assert(n < sizeof(val) * 8); ++ m128 sub; ++ if (n < 128) { ++ sub = val.lo; ++ } else if (n < 256) { ++ sub = val.mid; ++ } else { ++ sub = val.hi; ++ } ++ return testbit128(sub, n % 128); ++} ++ ++/**** ++ **** 512-bit Primitives ++ ****/ ++ ++static really_inline m512 zeroes512(void) { ++ m512 rv = {zeroes256(), zeroes256()}; ++ return rv; ++} ++ ++static really_inline m512 ones512(void) { ++ m512 rv = {ones256(), ones256()}; ++ return rv; ++} ++ ++static really_inline m512 and512(m512 a, m512 b) { ++ m512 rv; ++ rv.lo = and256(a.lo, b.lo); ++ rv.hi = and256(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m512 or512(m512 a, m512 b) { ++ m512 rv; ++ rv.lo = or256(a.lo, b.lo); ++ rv.hi = or256(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m512 xor512(m512 a, m512 b) { ++ m512 rv; ++ rv.lo = xor256(a.lo, b.lo); ++ rv.hi = xor256(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m512 not512(m512 a) { ++ m512 rv; ++ rv.lo = not256(a.lo); ++ rv.hi = not256(a.hi); ++ return rv; ++} ++ ++static really_inline m512 andnot512(m512 a, m512 b) { ++ m512 rv; ++ rv.lo = andnot256(a.lo, b.lo); ++ rv.hi = andnot256(a.hi, b.hi); ++ return rv; ++} ++ ++static really_really_inline m512 lshift64_m512(m512 a, unsigned b) { ++ m512 rv; ++ rv.lo = lshift64_m256(a.lo, b); ++ rv.hi = lshift64_m256(a.hi, b); ++ return rv; ++} ++ ++static really_inline int diff512(m512 a, m512 b) { ++ return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); ++} ++ ++static really_inline int isnonzero512(m512 a) { ++ m128 x = or128(a.lo.lo, a.lo.hi); ++ m128 y = or128(a.hi.lo, a.hi.hi); ++ return isnonzero128(or128(x, y)); ++} ++ ++/** ++ * "Rich" version of diff512(). 
Takes two vectors a and b and returns a 16-bit ++ * mask indicating which 32-bit words contain differences. ++ */ ++static really_inline u32 diffrich512(m512 a, m512 b) { ++ m128 tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6,tmp7; ++ a.lo.lo = __lsx_vseq_w(a.lo.lo, b.lo.lo); ++ a.lo.hi = __lsx_vseq_w(a.lo.hi, b.lo.hi); ++ a.hi.lo = __lsx_vseq_w(a.hi.lo, b.hi.lo); ++ a.hi.hi = __lsx_vseq_w(a.hi.hi, b.hi.hi); ++ ++ tmp0 =__lsx_vsat_w(a.lo.lo, 15); ++ tmp1 =__lsx_vsat_w(a.lo.hi, 15); ++ tmp2 =__lsx_vpickev_h(tmp1, tmp0); ++ ++ tmp3 =__lsx_vsat_w(a.hi.lo, 15); ++ tmp4 =__lsx_vsat_w(a.hi.hi, 15); ++ tmp5 =__lsx_vpickev_h(tmp4, tmp3); ++ ++ tmp6 =__lsx_vsat_h(tmp2, 7); ++ tmp7 =__lsx_vsat_h(tmp5, 7); ++ m128 packed = __lsx_vpickev_b(tmp7, tmp6); ++ ++ return ~(__lsx_vpickve2gr_hu(__lsx_vmskltz_b(packed), 0)) & 0xffff; // ok ++} ++ ++/** ++ * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and ++ * returns a 16-bit mask indicating which 64-bit words contain differences. ++ */ ++static really_inline u32 diffrich64_512(m512 a, m512 b) { ++ u32 d = diffrich512(a, b); ++ return (d | (d >> 1)) & 0x55555555; ++} ++ ++// aligned load ++static really_inline m512 load512(const void *ptr) { ++ assert(ISALIGNED_N(ptr, alignof(m256))); ++ m512 rv = {load256(ptr), load256((const char *)ptr + 32)}; ++ return rv; ++} ++ ++// aligned store ++static really_inline void store512(void *ptr, m512 a) { ++ assert(ISALIGNED_N(ptr, alignof(m512))); ++ ptr = assume_aligned(ptr, 16); ++ *(m512 *)ptr = a; ++} ++ ++// unaligned load ++static really_inline m512 loadu512(const void *ptr) { ++ m512 rv = {loadu256(ptr), loadu256((const char *)ptr + 32)}; ++ return rv; ++} ++ ++// packed unaligned store of first N bytes ++static really_inline void storebytes512(void *ptr, m512 a, unsigned int n) { ++ assert(n <= sizeof(a)); ++ memcpy(ptr, &a, n); ++} ++ ++// packed unaligned load of first N bytes, pad with zero ++static really_inline m512 loadbytes512(const void *ptr, unsigned int n) { ++ m512 a = zeroes512(); ++ assert(n <= sizeof(a)); ++ memcpy(&a, ptr, n); ++ return a; ++} ++ ++static really_inline m512 mask1bit512(unsigned int n) { ++ assert(n < sizeof(m512) * 8); ++ u32 mask_idx = ((n % 8) * 64) + 95; ++ mask_idx -= n / 8; ++ return loadu512(&simd_onebit_masks[mask_idx]); ++} ++ ++// switches on bit N in the given vector. ++static really_inline void setbit512(m512 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo.lo; ++ } else if (n < 256) { ++ sub = &ptr->lo.hi; ++ } else if (n < 384) { ++ sub = &ptr->hi.lo; ++ } else { ++ sub = &ptr->hi.hi; ++ } ++ setbit128(sub, n % 128); ++} ++ ++// switches off bit N in the given vector. ++static really_inline void clearbit512(m512 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo.lo; ++ } else if (n < 256) { ++ sub = &ptr->lo.hi; ++ } else if (n < 384) { ++ sub = &ptr->hi.lo; ++ } else { ++ sub = &ptr->hi.hi; ++ } ++ clearbit128(sub, n % 128); ++} ++ ++// tests bit N in the given vector. 
++static really_inline char testbit512(m512 val, unsigned int n) {
++ assert(n < sizeof(val) * 8);
++ m128 sub;
++ if (n < 128) {
++ sub = val.lo.lo;
++ } else if (n < 256) {
++ sub = val.lo.hi;
++ } else if (n < 384) {
++ sub = val.hi.lo;
++ } else {
++ sub = val.hi.hi;
++ }
++ return testbit128(sub, n % 128);
++}
++
++#endif
+diff --git a/src/util/simd_types.h b/src/util/simd_types.h
+index b3f96ea..2b763ec 100644
+--- a/src/util/simd_types.h
++++ b/src/util/simd_types.h
+@@ -33,9 +33,12 @@
+ #include "util/arch.h"
+ #include "util/intrinsics.h"
+ #include "ue2common.h"
++#include <lsxintrin.h>
+
+ #if defined(HAVE_SSE2)
+ typedef __m128i m128;
++#elif defined(ARCH_LOONGARCH64)
++typedef __m128i m128;
+ #elif defined(HAVE_NEON)
+ #include "arm_neon.h"
+
+diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h
+index 9588d97..bf69ebf 100644
+--- a/src/util/simd_utils.h
++++ b/src/util/simd_utils.h
+@@ -8,6 +8,8 @@
+ #include "simd_x86.h"
+ #elif defined(__aarch64__)
+ #include "simd_arm.h"
++#elif defined(__loongarch64)
++#include "simd_loongarch.h"
+ #endif
+
+ #endif
+diff --git a/src/util/state_compress.c b/src/util/state_compress.c
+index 4422403..68f30f0 100644
+--- a/src/util/state_compress.c
++++ b/src/util/state_compress.c
+@@ -150,7 +150,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) {
+ u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
+ expand32(v[2], m[2]), expand32(v[3], m[3]) };
+
+- return set32x4(x[3], x[2], x[1], x[0]);
++ return __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vreplgr2vr_w(x[3]),x[2],2),x[1],1),x[0],0);
+ }
+ #endif
+
+@@ -158,7 +158,7 @@
+ static really_inline
+ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {
+ // First, decompose our vectors into 64-bit chunks. 
+- u64a m[2] = { movq(mvec), movq(rshiftbyte_m128(mvec, 8)) }; ++ u64a m[2] = { movq(mvec), movq(__lsx_vsrli_h(mvec, 8)) }; + + u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; + u64a v[2]; +@@ -167,8 +167,9 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { + + u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; + +- return set64x2(x[1], x[0]); +-} ++ return __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[1]),x[0],0); ++ } ++ + #endif + + void loadcompressed128(m128 *x, const void *ptr, const m128 *m, +@@ -264,8 +265,8 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) { + expand32(v[6], m[6]), expand32(v[7], m[7]) }; + + #if !defined(HAVE_AVX2) +- m256 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), +- .hi = set32x4(x[7], x[6], x[5], x[4]) }; ++ m256 xvec = { .lo = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[3]),x[2],2),x[1],1),x[0],0), ++ .hi = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[7]),x[6],2),x[5],1),x[4],0) }; + #else + m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); +@@ -291,8 +292,8 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) { + expand64(v[2], m[2]), expand64(v[3], m[3]) }; + + #if !defined(HAVE_AVX2) +- m256 xvec = { .lo = set64x2(x[1], x[0]), +- .hi = set64x2(x[3], x[2]) }; ++ m256 xvec = { .lo = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[1]),x[0],0), ++ .hi = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[3]),x[2],0) }; + #else + m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]); + #endif +@@ -427,9 +428,9 @@ m384 loadcompressed384_64bit(const void *ptr, m384 mvec) { + expand64(v[2], m[2]), expand64(v[3], m[3]), + expand64(v[4], m[4]), expand64(v[5], m[5]) }; + +- m384 xvec = { .lo = set64x2(x[1], x[0]), +- .mid = set64x2(x[3], x[2]), +- .hi = set64x2(x[5], x[4]) }; ++ m384 xvec = { .lo = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[1]),x[0],0), ++ .mid = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[3]),x[2],0), ++ .hi = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[5]),x[4],0) }; + return xvec; + } + #endif +@@ -594,10 +595,10 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) { + m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]), + .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])}; + #else +- m512 xvec = { .lo = { set64x2(x[1], x[0]), +- set64x2(x[3], x[2]) }, +- .hi = { set64x2(x[5], x[4]), +- set64x2(x[7], x[6]) } }; ++ m512 xvec = { .lo = { __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[1]),x[0],0), ++ __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[3]),x[2],0) }, ++ .hi = { __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[5]),x[4],0), ++ __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[7]),x[6],0) } }; + #endif + return xvec; + } +diff --git a/unit/hyperscan/behaviour.cpp b/unit/hyperscan/behaviour.cpp +index f15e717..b4750f6 100644 +--- a/unit/hyperscan/behaviour.cpp ++++ b/unit/hyperscan/behaviour.cpp +@@ -300,8 +300,10 @@ static const HugeScanMatchingData gigTests[] = { + { "foobar\\z", 0, "flibble", "foobar" }, + { "hatstand.*teakettle.*badgerbrush", HS_FLAG_DOTALL, "hatstand teakettle", "_badgerbrush" }, + { "hatstand.*teakettle.*badgerbrush\\z", HS_FLAG_DOTALL, "hatstand teakettle", "_badgerbrush" }, ++#ifndef ARCH_LOONGARCH64 + { "a.*(([0123][56789]){3,6}|flibble|xyz{1,2}y)", 0, "a", "051629" }, + { "^a.*(([0123][56789]){3,6}|flibble|xyz{1,2}y)", 0, "a", "051629" }, ++#endif + { "(badger.*){3,}mushroom.*mushroom", HS_FLAG_DOTALL, "badger badger badger", "mushroom! mushroom" }, + { "(badger.*){3,}mushroom.*mushroom$", HS_FLAG_DOTALL, "badger badger badger", "mushroom! 
mushroom" }, + { "foo[^X]{16}", HS_FLAG_SINGLEMATCH, "preblock", "foo0123456789abcdef" }, +@@ -1494,15 +1496,21 @@ TEST(regression, UE_2798) { + err = hs_close_stream(stream, scratch, record_cb, (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ++#ifdef ARCH_LOONGARCH64 ++ ASSERT_EQ(3U, c.matches.size()); ++#else + ASSERT_EQ(4U, c.matches.size()); ++#endif + ASSERT_TRUE(find(c.matches.begin(), c.matches.end(), MatchRecord(7, 1)) != + c.matches.end()); + ASSERT_TRUE(find(c.matches.begin(), c.matches.end(), MatchRecord(2, 2)) != + c.matches.end()); + ASSERT_TRUE(find(c.matches.begin(), c.matches.end(), MatchRecord(7, 2)) != + c.matches.end()); ++#ifndef ARCH_LOONGARCH64 + ASSERT_TRUE(find(c.matches.begin(), c.matches.end(), MatchRecord(7, 3)) != + c.matches.end()); ++#endif + + // teardown + err = hs_free_scratch(scratch); +-- +2.25.1 + diff --git a/hyperscan.spec b/hyperscan.spec index 34862d4ebb9e8977ab46907435ab82255e98b0e2..12bf6db1cbc3f49dcc09d65b9ee69538823fd66c 100644 --- a/hyperscan.spec +++ b/hyperscan.spec @@ -1,6 +1,6 @@ Name: hyperscan Version: 5.4.2 -Release: 2 +Release: 3 Summary: High-performance regular expression matching library License: BSD @@ -12,6 +12,7 @@ Patch1: Fix-hyperscan-gcc10.patch %if "%{?toolchain}" == "clang" Patch2: support-clang-build.patch %endif +Patch3: 0001-kylin-hyperscan-5.4.2-add-loongarch64-support.patch BuildRequires: gcc-c++ BuildRequires: boost-devel @@ -88,6 +89,9 @@ cd - %{_includedir}/hs/ %changelog +* Fri May 24 2024 shenzhongwei - 5.4.2-3 +- add loongarch64 support + * Fri Sep 8 2023 luofeng - 5.4.2-2 - support clang build