diff --git a/0001-kylin-hyperscan-5.4.2-add-loongarch64-support.patch b/0001-kylin-hyperscan-5.4.2-add-loongarch64-support.patch new file mode 100644 index 0000000000000000000000000000000000000000..41e27a3de68f789d2d43684ef1c7b1d2d0035b37 --- /dev/null +++ b/0001-kylin-hyperscan-5.4.2-add-loongarch64-support.patch @@ -0,0 +1,1407 @@ +From 5a73aacd6e22ff6d751c568e109f2fb6914b0472 Mon Sep 17 00:00:00 2001 +From: peijiankang +Date: Thu, 18 Jan 2024 15:47:45 +0800 +Subject: [PATCH] kylin hyperscan-5.4.2 add loongarch64 support + +--- + CMakeLists.txt | 14 +- + cmake/arch.cmake | 5 +- + cmake/config.h.in | 6 + + cmake/platform.cmake | 5 +- + src/dispatcher.c | 4 +- + src/hs_valid_platform.c | 7 +- + src/nfa/shufti.c | 6 +- + src/nfa/truffle.c | 4 +- + src/rose/counting_miracle.h | 2 +- + src/util/arch.h | 4 + + src/util/cpuid_flags.c | 2 + + src/util/intrinsics.h | 6 + + src/util/simd_loongarch.h | 956 +++++++++++++++++++++++++++++++++++ + src/util/simd_types.h | 3 + + src/util/simd_utils.h | 2 + + src/util/state_compress.c | 31 +- + unit/hyperscan/behaviour.cpp | 8 + + 17 files changed, 1033 insertions(+), 32 deletions(-) + create mode 100644 src/util/simd_loongarch.h + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 77eaa25..1c49fc4 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -183,7 +183,7 @@ else() + string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}") + endforeach () + +- if (CMAKE_COMPILER_IS_GNUCC) ++ if (ARCH_IA32 OR ARCH_X86_64 AND CMAKE_COMPILER_IS_GNUCC) + message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}") + # If gcc doesn't recognise the host cpu, then mtune=native becomes + # generic, which isn't very good in some cases. march=native looks at +@@ -284,6 +284,10 @@ else() + set(ARCH_CXX_FLAGS "-march=armv8-a -mtune=${TUNE_FLAG}") + endif() + endif() ++ if (ARCH_LOONGARCH64) ++ set(ARCH_CXX_FLAGS "-mlsx") ++ set(ARCH_C_FLAGS "-mlsx") ++ endif() + + if(CMAKE_COMPILER_IS_GNUCC) + # spurious warnings? 
+@@ -328,6 +332,10 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + CHECK_INCLUDE_FILE_CXX(arm_neon.h HAVE_CXX_ARM_NEON_H) + endif() + ++if (ARCH_LOONGARCH64) ++ CHECK_INCLUDE_FILES(lsxintrin.h HAVE_C_LSXINTRIN_H) ++endif() ++ + CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN) + CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC) + +@@ -359,8 +367,8 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux") + (CMAKE_VERSION VERSION_GREATER "3.0" AND CMAKE_GENERATOR MATCHES "Ninja"))) + message (STATUS "Building the fat runtime requires the Unix Makefiles generator, or Ninja with CMake v3.0 or higher") + set (FAT_RUNTIME_REQUISITES FALSE) +- elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") +- message(STATUS "AARCH64 platform don't support fat runtime") ++ elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64|loongarch64|LOONGARCH64") ++ message(STATUS "AARCH64 and LOONGARCH64 platform don't support fat runtime") + set (FAT_RUNTIME_REQUISITES FALSE) + else() + include (${CMAKE_MODULE_PATH}/attrib.cmake) +diff --git a/cmake/arch.cmake b/cmake/arch.cmake +index eb4791e..19aa693 100644 +--- a/cmake/arch.cmake ++++ b/cmake/arch.cmake +@@ -6,6 +6,9 @@ if (HAVE_C_X86INTRIN_H) + set (INTRIN_INC_H "x86intrin.h") + elseif (HAVE_C_INTRIN_H) + set (INTRIN_INC_H "intrin.h") ++elseif (HAVE_C_LSXINTRIN_H) ++ set (INTRIN_INC_H "lsxintrin.h") ++ set (FAT_RUNTIME OFF) + else () + message (FATAL_ERROR "No intrinsics header found") + endif () +@@ -81,7 +84,7 @@ int main(){ + (void)_mm512_permutexvar_epi8(idx, a); + }" HAVE_AVX512VBMI) + +-if (FAT_RUNTIME) ++if (FAT_RUNTIME AND (ARCH_IA32 OR ARCH_X86_64)) + if (NOT HAVE_SSSE3) + message(FATAL_ERROR "SSSE3 support required to build fat runtime") + endif () +diff --git a/cmake/config.h.in b/cmake/config.h.in +index 336cf19..c33c4a9 100644 +--- a/cmake/config.h.in ++++ b/cmake/config.h.in +@@ -18,6 +18,9 @@ + /* "Define if building for aarch64" */ + #cmakedefine ARCH_AARCH64 + ++/* "Define if building for LOONGARCH64" */ ++#cmakedefine ARCH_LOONGARCH64 ++ + /* internal build, switch on dump support. */ + #cmakedefine DUMP_SUPPORT + +@@ -57,6 +60,9 @@ + /* C compiler has arm_neon.h */ + #cmakedefine HAVE_C_ARM_NEON_H + ++/* C compiler has lsxintrin.h */ ++#cmakedefine HAVE_C_LSXINTRIN_H ++ + /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to + 0 if you don't. 
*/ + #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP +diff --git a/cmake/platform.cmake b/cmake/platform.cmake +index 213dcc5..58c2dc6 100644 +--- a/cmake/platform.cmake ++++ b/cmake/platform.cmake +@@ -6,9 +6,10 @@ CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error n + CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32) + + CHECK_C_SOURCE_COMPILES("#if !(defined(__aarch64__))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64) ++CHECK_C_SOURCE_COMPILES("#if !(defined(__loongarch64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_LOONGARCH64) + +-if (ARCH_X86_64 OR ARCH_AARCH64) ++if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_LOONGARCH64) + set(ARCH_64_BIT 1) + elseif (ARCH_IA32) + set(ARCH_32_BIT 1) +-endif() +\ No newline at end of file ++endif() +diff --git a/src/dispatcher.c b/src/dispatcher.c +index 9a8afa6..74dfb4a 100644 +--- a/src/dispatcher.c ++++ b/src/dispatcher.c +@@ -56,6 +56,7 @@ + return (RTYPE)HS_ARCH_ERROR; \ + } \ + \ ++#if defined(ARCH_X86_64) + /* resolver */ \ + static RTYPE (*JOIN(resolve_, NAME)(void))(__VA_ARGS__) { \ + if (check_avx512vbmi()) { \ +@@ -75,7 +76,8 @@ + } \ + /* anything else is fail */ \ + return JOIN(error_, NAME); \ +- } \ ++ } ++#endif \ + \ + /* function */ \ + HS_PUBLIC_API \ +diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c +index 035d3ff..a121e7e 100644 +--- a/src/hs_valid_platform.c ++++ b/src/hs_valid_platform.c +@@ -37,12 +37,11 @@ hs_error_t HS_CDECL hs_valid_platform(void) { + if (check_ssse3()) { + return HS_SUCCESS; + } +-#else ++#elif defined(ARCH_AARCH64) + if (check_neon()) { + return HS_SUCCESS; + } ++#elif defined(ARCH_LOONGARCH64) ++ return HS_SUCCESS; + #endif +- else { +- return HS_ARCH_ERROR; +- } + } +diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c +index 2cb74f0..090eb8f 100644 +--- a/src/nfa/shufti.c ++++ b/src/nfa/shufti.c +@@ -159,7 +159,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + } + + const m128 zeroes = zeroes128(); +- const m128 low4bits = set16x8(0xf); ++ const m128 low4bits = __lsx_vldi(0xf); + const u8 *rv; + + size_t min = (size_t)buf % 16; +@@ -251,7 +251,7 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + } + + const m128 zeroes = zeroes128(); +- const m128 low4bits = set16x8(0xf); ++ const m128 low4bits = __lsx_vldi(0xf); + const u8 *rv; + + assert(buf_end - buf >= 16); +@@ -325,7 +325,7 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { + const m128 ones = ones128(); +- const m128 low4bits = set16x8(0xf); ++ const m128 low4bits = __lsx_vldi(0xf); + const u8 *rv; + + size_t min = (size_t)buf % 16; +diff --git a/src/nfa/truffle.c b/src/nfa/truffle.c +index c05d778..9bb8d88 100644 +--- a/src/nfa/truffle.c ++++ b/src/nfa/truffle.c +@@ -64,8 +64,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) { + static really_inline + u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { + +- m128 highconst = set16x8(0x80); +- m128 shuf_mask_hi = set2x64(0x8040201008040201); ++ m128 highconst = __lsx_vldi(0x80); ++ m128 shuf_mask_hi = __lsx_vreplgr2vr_d(0x8040201008040201); + + // and now do the real work + m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v); +diff --git a/src/rose/counting_miracle.h b/src/rose/counting_miracle.h +index 4456679..1cf5189 100644 +--- a/src/rose/counting_miracle.h ++++ b/src/rose/counting_miracle.h +@@ 
-94,7 +94,7 @@ u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison,
+ u32 count = *count_inout;
+
+ const m128 zeroes = zeroes128();
+- const m128 low4bits = set16x8(0xf);
++ const m128 low4bits = __lsx_vldi(0xf);
+
+ for (; d + 16 <= d_end; d_end -= 16) {
+ m128 data = loadu128(d_end - 16);
+diff --git a/src/util/arch.h b/src/util/arch.h
+index fe4a910..b650816 100644
+--- a/src/util/arch.h
++++ b/src/util/arch.h
+@@ -98,4 +98,8 @@
+ #define NO_ASM
+ #endif
+
++#if defined(__loongarch64)
++#define NO_ASM
++#endif
++
+ #endif // UTIL_ARCH_H_
+diff --git a/src/util/cpuid_flags.c b/src/util/cpuid_flags.c
+index 96286ee..57fc34d 100644
+--- a/src/util/cpuid_flags.c
++++ b/src/util/cpuid_flags.c
+@@ -33,9 +33,11 @@
+ #include "hs_internal.h"
+ #include "util/arch.h"
+
++#if defined(__x86_64__) || defined(_M_X64)
+ #if !defined(_WIN32) && !defined(CPUID_H_)
+ #include <cpuid.h>
+ #endif
++#endif
+
+ u64a cpuid_flags(void) {
+ u64a cap = 0;
+diff --git a/src/util/intrinsics.h b/src/util/intrinsics.h
+index ece3b1a..9c9d95a 100644
+--- a/src/util/intrinsics.h
++++ b/src/util/intrinsics.h
+@@ -45,6 +45,10 @@
+ # endif
+ #endif
+
++#if defined(HAVE_C_LSXINTRIN_H)
++# define USE_LSXINTRIN_H
++#endif
++
+ #ifdef __cplusplus
+ # if defined(HAVE_CXX_INTRIN_H)
+ # define USE_INTRIN_H
+@@ -71,6 +75,8 @@
+ #include <intrin.h>
+ #elif defined(USE_ARM_NEON_H)
+ #include <arm_neon.h>
++#elif defined(USE_LSXINTRIN_H)
++#include <lsxintrin.h>
+ #else
+ #error no intrinsics file
+ #endif
+diff --git a/src/util/simd_loongarch.h b/src/util/simd_loongarch.h
+new file mode 100644
+index 0000000..b311ffb
+--- /dev/null
++++ b/src/util/simd_loongarch.h
+@@ -0,0 +1,956 @@
++/*
++ * Copyright (c) 2015-2017, Intel Corporation
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions are met:
++ *
++ * * Redistributions of source code must retain the above copyright notice,
++ * this list of conditions and the following disclaimer.
++ * * Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * * Neither the name of Intel Corporation nor the names of its contributors
++ * may be used to endorse or promote products derived from this software
++ * without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
++ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
++ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
++ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
++ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
++ * POSSIBILITY OF SUCH DAMAGE.
++ */
++
++/** \file
++ * \brief SIMD types and primitive operations. 
++ */
++
++#ifndef SIMD_LSX
++#define SIMD_LSX
++
++#include "config.h"
++#include "ue2common.h"
++#include "simd_types.h"
++#include "unaligned.h"
++#include "util/arch.h"
++#include "util/intrinsics.h"
++#include <lsxintrin.h>
++
++#include <string.h> // for memcpy
++
++// Define a common assume_aligned using an appropriate compiler built-in, if
++// it's available. Note that we need to handle C or C++ compilation.
++#ifdef __cplusplus
++#ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED
++#define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
++#endif
++#else
++#ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED
++#define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
++#endif
++#endif
++
++// Fallback to identity case.
++#ifndef assume_aligned
++#define assume_aligned(x, y) (x)
++#endif
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++extern const char vbs_mask_data[];
++#ifdef __cplusplus
++}
++#endif
++
++static really_inline m128 ones128(void) {
++ /* gcc gets this right */
++ return __lsx_vldi(0xFF);
++}
++
++static really_inline m128 zeroes128(void) {
++ return __lsx_vldi(0);
++}
++
++/** \brief Bitwise not for m128*/
++static really_inline m128 not128(m128 a) {
++ return __lsx_vxor_v(a,ones128());
++}
++
++/** \brief Return 1 if a and b are different otherwise 0 */
++static really_inline int diff128(m128 a, m128 b) {
++ return (__lsx_vpickve2gr_hu(__lsx_vmskltz_b(__lsx_vseq_b(a, b)), 0) ^ 0xffff);
++}
++
++static really_inline int isnonzero128(m128 a) {
++ return !!diff128(a, zeroes128());
++}
++
++/**
++ * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit
++ * mask indicating which 32-bit words contain differences.
++ */
++static really_inline u32 diffrich128(m128 a, m128 b) {
++ a = __lsx_vseq_w(a, b);
++ return ~( __lsx_vpickve2gr_hu(__lsx_vmskltz_w(a),0)) & 0xf;
++}
++/**
++ * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and
++ * returns a 4-bit mask indicating which 64-bit words contain differences.
++ */
++static really_inline u32 diffrich64_128(m128 a, m128 b) {
++ u32 d = diffrich128(a, b);
++ return (d | (d >> 1)) & 0x5;
++}
++
++static really_really_inline
++m128 lshift64_m128(m128 a, unsigned b) {
++ m128 tmp = __lsx_vinsgr2vr_w(zeroes128(), b, 0);
++
++ m128 x = __lsx_vinsgr2vr_w(tmp, b, 2);
++ return __lsx_vsll_d(a, x);
++}
++
++#define rshift64_m128(a, b) __lsx_vsrli_d((a), (b))
++#define eq128(a, b) __lsx_vseq_b((a), (b))
++#define movemask128(a) __lsx_vpickve2gr_hu(__lsx_vmskltz_b(a), 0)
++
++static really_inline m128 set16x8(u8 c) {
++ return __lsx_vreplgr2vr_b(c);
++}
++
++static really_inline m128 set4x32(u32 c) {
++ return __lsx_vreplgr2vr_w(c);
++}
++
++static really_inline u32 movd(const m128 in) {
++ return __lsx_vpickve2gr_w(in, 0);
++}
++
++static really_inline u64a movq(const m128 in) {
++ u32 lo = movd(in);
++ u32 hi = movd(__lsx_vsrli_d(in, 32));
++ return (u64a)hi << 32 | lo;
++}
++
++/* another form of movq */
++static really_inline
++m128 load_m128_from_u64a(const u64a *p) {
++ return __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(0LL), *p, 0);
++}
++
++#define L(a) __lsx_vand_v(a, __lsx_vinsgr2vr_d(__lsx_vldi(0), 0xFFFFFFFFFFFFFFFF, 0))
++#define M(a) __lsx_vand_v(a, __lsx_vinsgr2vr_d(__lsx_vldi(0xFF), 0x0000000000000000, 0))
++#define N(a) __lsx_vpickod_d(__lsx_vldi(0),a)
++#define U(a) __lsx_vpickev_d(a, __lsx_vldi(0))
++#define rshiftbyte_m128(a, count_immed) \
++ (((count_immed) < 8) ? 
(__lsx_vor_v(__lsx_vsrli_d(M(a), (8*count_immed)), __lsx_vor_v(__lsx_vsrli_d(L(a), (8*count_immed)), __lsx_vslli_d(N(a), (64-(8*count_immed)))))) : (__lsx_vsrli_d(N(a),((8*count_immed)-64)))) ++ ++#define lshiftbyte_m128(a, count_immed) \ ++ (((count_immed) < 8) ? (__lsx_vor_v(__lsx_vslli_d(L(a), (8*count_immed)), __lsx_vor_v(__lsx_vslli_d(M(a), (8*count_immed)), __lsx_vsrli_d(U(a), (64-(8*count_immed)))))) : (__lsx_vslli_d(U(a),((8*count_immed)-64)))) ++ ++#define extract32from128(a, imm) \ ++ (((imm) < 2) ? (__lsx_vor_v(__lsx_vsrli_d(M(a), (32*imm)), __lsx_vor_v(__lsx_vsrli_d(L(a), (32*imm)), __lsx_vslli_d(N(a), (64-(32*imm)))))) : (__lsx_vsrli_d(N(a),((32*imm)-64)))) ++#define extract64from128(a, imm) \ ++ (((imm) < 1) ? (__lsx_vor_v(__lsx_vsrli_d(M(a), (64*imm)), __lsx_vor_v(__lsx_vsrli_d(L(a), (64*imm)), __lsx_vslli_d(N(a), (64-(64*imm)))))) : (__lsx_vsrli_d(N(a),((64*imm)-64)))) ++ ++#define extractlow64from256(a) movq(a.lo) ++#define extractlow32from256(a) movd(a.lo) ++ ++static really_inline m128 and128(m128 a, m128 b) { ++ return __lsx_vand_v(a,b); ++} ++ ++static really_inline m128 xor128(m128 a, m128 b) { ++ return __lsx_vxor_v(a,b); ++} ++ ++static really_inline m128 or128(m128 a, m128 b) { ++ return __lsx_vor_v(a,b); ++} ++ ++static really_inline m128 andnot128(m128 a, m128 b) { ++ return __lsx_vandn_v(a,b); ++} ++ ++// aligned load ++static really_inline m128 load128(const void *ptr) { ++ assert(ISALIGNED_N(ptr, alignof(m128))); ++ ptr = assume_aligned(ptr, 16); ++ return __lsx_vldx((const m128 *)ptr,0); ++} ++ ++// aligned store ++static really_inline void store128(void *ptr, m128 a) { ++ assert(ISALIGNED_N(ptr, alignof(m128))); ++ ptr = assume_aligned(ptr, 16); ++ *(m128 *)ptr = a; ++} ++ ++// unaligned load ++static really_inline m128 loadu128(const void *ptr) { ++ return __lsx_vldx((const m128 *)ptr,0); ++} ++ ++// unaligned store ++static really_inline void storeu128(void *ptr, m128 a) { ++ __lsx_vst(a,(m128 *)ptr,0); ++} ++ ++// packed unaligned store of first N bytes ++static really_inline ++void storebytes128(void *ptr, m128 a, unsigned int n) { ++ assert(n <= sizeof(a)); ++ memcpy(ptr, &a, n); ++} ++ ++// packed unaligned load of first N bytes, pad with zero ++static really_inline ++m128 loadbytes128(const void *ptr, unsigned int n) { ++ m128 a = zeroes128(); ++ assert(n <= sizeof(a)); ++ memcpy(&a, ptr, n); ++ return a; ++} ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++extern const u8 simd_onebit_masks[]; ++#ifdef __cplusplus ++} ++#endif ++ ++static really_inline ++m128 mask1bit128(unsigned int n) { ++ assert(n < sizeof(m128) * 8); ++ u32 mask_idx = ((n % 8) * 64) + 95; ++ mask_idx -= n / 8; ++ return loadu128(&simd_onebit_masks[mask_idx]); ++} ++ ++// switches on bit N in the given vector. ++static really_inline ++void setbit128(m128 *ptr, unsigned int n) { ++ *ptr = or128(mask1bit128(n), *ptr); ++} ++ ++// switches off bit N in the given vector. ++static really_inline ++void clearbit128(m128 *ptr, unsigned int n) { ++ *ptr = andnot128(mask1bit128(n), *ptr); ++} ++ ++// tests bit N in the given vector. ++static really_inline ++char testbit128(m128 val, unsigned int n) { ++ const m128 mask = mask1bit128(n); ++ return isnonzero128(and128(mask, val)); ++} ++ ++#define palignr(r, l, offset) \ ++ (((offset) < 8) ? 
__lsx_vor_v(rshiftbyte_m128(l,(offset)),lshiftbyte_m128(U(r),(8-(offset)))) : __lsx_vor_v(rshiftbyte_m128(l,(offset)), lshiftbyte_m128(r,(16-(offset))))) ++ ++static really_inline ++m128 shuffle_epi8(m128 a, m128 b) { ++ m128 tmp1,tmp2,tmp3,dst; ++ tmp1 = ~(__lsx_vslt_b(b,__lsx_vldi(0))); ++ tmp2 = __lsx_vand_v(b,tmp1); ++ tmp3 = __lsx_vand_v(tmp2, __lsx_vldi(0x0F)); ++ unsigned char* p = (unsigned char*)&tmp3; ++ unsigned char* pa = (unsigned char*)&a; ++ for (int i = 0; i < 16; i++) { ++ unsigned char value = p[i]; ++ switch(i){ ++ case 0:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 0);}else{dst = tmp3;}break; ++ case 1:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 1);}else{dst = tmp3;}break; ++ case 2:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 2);}else{dst = tmp3;}break; ++ case 3:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 3);}else{dst = tmp3;}break; ++ case 4:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 4);}else{dst = tmp3;}break; ++ case 5:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 5);}else{dst = tmp3;}break; ++ case 6:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 6);}else{dst = tmp3;}break; ++ case 7:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 7);}else{dst = tmp3;}break; ++ case 8:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 8);}else{dst = tmp3;}break; ++ case 9:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 9);}else{dst = tmp3;}break; ++ case 10:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 10);}else{dst = tmp3;}break; ++ case 11:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 11);}else{dst = tmp3;}break; ++ case 12:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 12);}else{dst = tmp3;}break; ++ case 13:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 13);}else{dst = tmp3;}break; ++ case 14:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 14);}else{dst = tmp3;}break; ++ case 15:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 15);}else{dst = tmp3;}break; ++ default:break; ++ } ++ tmp3 = dst; ++ } ++ return dst; ++} ++ ++static really_inline ++m128 pshufb_m128(m128 a, m128 b) { ++ m128 result; ++ result = shuffle_epi8(a, b); ++ return result; ++} ++ ++static really_inline ++m256 pshufb_m256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = pshufb_m128(a.lo, b.lo); ++ rv.hi = pshufb_m128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline ++m128 variable_byte_shift_m128(m128 in, s32 amount) { ++ assert(amount >= -16 && amount <= 16); ++ m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); ++ return pshufb_m128(in, shift_mask); ++} ++ ++static really_inline ++m128 max_u8_m128(m128 a, m128 b) { ++ return __lsx_vmax_bu(a, b); ++} ++ ++static really_inline ++m128 min_u8_m128(m128 a, m128 b) { ++ return __lsx_vmin_bu(a, b); ++} ++ ++static really_inline ++m128 sadd_u8_m128(m128 a, m128 b) { ++ return __lsx_vsadd_bu(a, b); ++} ++ ++static really_inline ++m128 sub_u8_m128(m128 a, m128 b) { ++ return __lsx_vsub_b(a, b); ++} ++ ++static really_inline ++m128 set64x2(u64a hi, u64a lo) { ++ return __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(hi),lo,0); ++} ++ ++/**** ++ **** 256-bit Primitives ++ ****/ ++ ++static really_really_inline ++m256 lshift64_m256(m256 a, int b) { ++ m256 rv = a; ++ rv.lo = lshift64_m128(rv.lo, b); ++ rv.hi = lshift64_m128(rv.hi, b); ++ return rv; ++} ++ ++static really_inline ++m256 rshift64_m256(m256 a, int b) { ++ m256 rv = a; ++ rv.lo = rshift64_m128(rv.lo, b); ++ rv.hi = rshift64_m128(rv.hi, b); ++ return rv; ++} ++static 
really_inline ++m256 set32x8(u32 in) { ++ m256 rv; ++ rv.lo = set16x8((u8) in); ++ rv.hi = rv.lo; ++ return rv; ++} ++ ++static really_inline ++m256 eq256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = eq128(a.lo, b.lo); ++ rv.hi = eq128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline ++u32 movemask256(m256 a) { ++ u32 lo_mask = movemask128(a.lo); ++ u32 hi_mask = movemask128(a.hi); ++ return lo_mask | (hi_mask << 16); ++} ++ ++static really_inline ++m256 set2x128(m128 a) { ++ m256 rv = {a, a}; ++ return rv; ++} ++ ++static really_inline m256 zeroes256(void) { ++ m256 rv = {zeroes128(), zeroes128()}; ++ return rv; ++} ++ ++static really_inline m256 ones256(void) { ++ m256 rv = {ones128(), ones128()}; ++ return rv; ++} ++ ++static really_inline m256 and256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = and128(a.lo, b.lo); ++ rv.hi = and128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m256 or256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = or128(a.lo, b.lo); ++ rv.hi = or128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m256 xor256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = xor128(a.lo, b.lo); ++ rv.hi = xor128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m256 not256(m256 a) { ++ m256 rv; ++ rv.lo = not128(a.lo); ++ rv.hi = not128(a.hi); ++ return rv; ++} ++ ++static really_inline m256 andnot256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = andnot128(a.lo, b.lo); ++ rv.hi = andnot128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline int diff256(m256 a, m256 b) { ++ return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); ++} ++ ++static really_inline int isnonzero256(m256 a) { ++ return isnonzero128(or128(a.lo, a.hi)); ++} ++ ++/** ++ * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit ++ * mask indicating which 32-bit words contain differences. ++ */ ++static really_inline u32 diffrich256(m256 a, m256 b) { ++ m128 z = zeroes128(); ++ m128 tmp0,tmp1,tmp2,tmp3; ++ a.lo = __lsx_vseq_w(a.lo, b.lo); ++ a.hi = __lsx_vseq_w(a.hi, b.hi); ++ ++ tmp0 =__lsx_vsat_w(a.lo, 15); ++ tmp1 =__lsx_vsat_w(b.hi, 15); ++ tmp2 =__lsx_vsat_h(__lsx_vpickev_h(tmp1, tmp0), 7); ++ tmp3 =__lsx_vsat_h(z, 7); ++ m128 packed = __lsx_vpickev_b(tmp3, tmp2); ++ ++ return ~(__lsx_vpickve2gr_hu(__lsx_vmskltz_b(packed), 0)) & 0xff; ++} ++ ++/** ++ * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and ++ * returns an 8-bit mask indicating which 64-bit words contain differences. 
++ */ ++static really_inline u32 diffrich64_256(m256 a, m256 b) { ++ u32 d = diffrich256(a, b); ++ return (d | (d >> 1)) & 0x55555555; ++} ++ ++// aligned load ++static really_inline m256 load256(const void *ptr) { ++ assert(ISALIGNED_N(ptr, alignof(m256))); ++ m256 rv = {load128(ptr), load128((const char *)ptr + 16)}; ++ return rv; ++} ++ ++// aligned load of 128-bit value to low and high part of 256-bit value ++static really_inline m256 load2x128(const void *ptr) { ++ assert(ISALIGNED_N(ptr, alignof(m128))); ++ m256 rv; ++ rv.hi = rv.lo = load128(ptr); ++ return rv; ++} ++ ++static really_inline m256 loadu2x128(const void *ptr) { ++ return set2x128(loadu128(ptr)); ++} ++ ++// aligned store ++static really_inline void store256(void *ptr, m256 a) { ++ assert(ISALIGNED_N(ptr, alignof(m256))); ++ ptr = assume_aligned(ptr, 16); ++ *(m256 *)ptr = a; ++} ++ ++// unaligned load ++static really_inline m256 loadu256(const void *ptr) { ++ m256 rv = {loadu128(ptr), loadu128((const char *)ptr + 16)}; ++ return rv; ++} ++ ++// unaligned store ++static really_inline void storeu256(void *ptr, m256 a) { ++ storeu128(ptr, a.lo); ++ storeu128((char *)ptr + 16, a.hi); ++} ++ ++// packed unaligned store of first N bytes ++static really_inline void storebytes256(void *ptr, m256 a, unsigned int n) { ++ assert(n <= sizeof(a)); ++ memcpy(ptr, &a, n); ++} ++ ++// packed unaligned load of first N bytes, pad with zero ++static really_inline m256 loadbytes256(const void *ptr, unsigned int n) { ++ m256 a = zeroes256(); ++ assert(n <= sizeof(a)); ++ memcpy(&a, ptr, n); ++ return a; ++} ++ ++static really_inline m256 mask1bit256(unsigned int n) { ++ assert(n < sizeof(m256) * 8); ++ u32 mask_idx = ((n % 8) * 64) + 95; ++ mask_idx -= n / 8; ++ return loadu256(&simd_onebit_masks[mask_idx]); ++} ++ ++static really_inline m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { ++ m256 rv; ++ rv.hi = set64x2(hi_1, hi_0); ++ rv.lo = set64x2(lo_1, lo_0); ++ return rv; ++} ++ ++// switches on bit N in the given vector. ++static really_inline void setbit256(m256 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo; ++ } else { ++ sub = &ptr->hi; ++ n -= 128; ++ } ++ setbit128(sub, n); ++} ++ ++// switches off bit N in the given vector. ++static really_inline void clearbit256(m256 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo; ++ } else { ++ sub = &ptr->hi; ++ n -= 128; ++ } ++ clearbit128(sub, n); ++} ++ ++// tests bit N in the given vector. 
++static really_inline char testbit256(m256 val, unsigned int n) { ++ assert(n < sizeof(val) * 8); ++ m128 sub; ++ if (n < 128) { ++ sub = val.lo; ++ } else { ++ sub = val.hi; ++ n -= 128; ++ } ++ return testbit128(sub, n); ++} ++ ++static really_really_inline ++m128 movdq_hi(m256 x) { ++ return x.hi; ++} ++ ++static really_really_inline m128 movdq_lo(m256 x) { return x.lo;} ++ ++static really_inline m256 combine2x128(m128 hi, m128 lo) { ++ m256 rv = {lo, hi}; ++ return rv; ++} ++ ++/**** ++ **** 384-bit Primitives ++ ****/ ++ ++static really_inline m384 and384(m384 a, m384 b) { ++ m384 rv; ++ rv.lo = and128(a.lo, b.lo); ++ rv.mid = and128(a.mid, b.mid); ++ rv.hi = and128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m384 or384(m384 a, m384 b) { ++ m384 rv; ++ rv.lo = or128(a.lo, b.lo); ++ rv.mid = or128(a.mid, b.mid); ++ rv.hi = or128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m384 xor384(m384 a, m384 b) { ++ m384 rv; ++ rv.lo = xor128(a.lo, b.lo); ++ rv.mid = xor128(a.mid, b.mid); ++ rv.hi = xor128(a.hi, b.hi); ++ return rv; ++} ++static really_inline m384 not384(m384 a) { ++ m384 rv; ++ rv.lo = not128(a.lo); ++ rv.mid = not128(a.mid); ++ rv.hi = not128(a.hi); ++ return rv; ++} ++static really_inline m384 andnot384(m384 a, m384 b) { ++ m384 rv; ++ rv.lo = andnot128(a.lo, b.lo); ++ rv.mid = andnot128(a.mid, b.mid); ++ rv.hi = andnot128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_really_inline m384 lshift64_m384(m384 a, unsigned b) { ++ m384 rv; ++ rv.lo = lshift64_m128(a.lo, b); ++ rv.mid = lshift64_m128(a.mid, b); ++ rv.hi = lshift64_m128(a.hi, b); ++ return rv; ++} ++ ++static really_inline m384 zeroes384(void) { ++ m384 rv = {zeroes128(), zeroes128(), zeroes128()}; ++ return rv; ++} ++ ++static really_inline m384 ones384(void) { ++ m384 rv = {ones128(), ones128(), ones128()}; ++ return rv; ++} ++ ++static really_inline int diff384(m384 a, m384 b) { ++ return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); ++} ++ ++static really_inline int isnonzero384(m384 a) { ++ return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); ++} ++ ++/** ++ * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit ++ * mask indicating which 32-bit words contain differences. ++ */ ++static really_inline u32 diffrich384(m384 a, m384 b) { ++ m128 z = zeroes128(); ++ m128 tmp0,tmp1,tmp2,tmp3,tmp4,tmp5; ++ a.lo = __lsx_vseq_w(a.lo, b.lo); ++ a.mid = __lsx_vseq_w(a.mid, b.mid); ++ a.hi = __lsx_vseq_w(a.hi, b.hi); ++ ++ tmp0 = __lsx_vsat_w(a.lo, 15); ++ tmp1 = __lsx_vsat_w(b.mid, 15); ++ ++ tmp2 = __lsx_vsat_w(b.hi, 15); ++ tmp3 = __lsx_vsat_w(z, 15); ++ ++ tmp4 = __lsx_vsat_h(__lsx_vpickev_h(tmp1, tmp0),7); ++ tmp5 = __lsx_vsat_h(__lsx_vpickev_h(tmp3, tmp2),7); ++ ++ m128 packed = __lsx_vpickev_b(tmp5,tmp4); ++ ++ return ~(__lsx_vpickve2gr_hu(__lsx_vmskltz_b(packed), 0)) & 0xfff; ++} ++ ++/** ++ * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and ++ * returns a 12-bit mask indicating which 64-bit words contain differences. 
++ */ ++static really_inline u32 diffrich64_384(m384 a, m384 b) { ++ u32 d = diffrich384(a, b); ++ return (d | (d >> 1)) & 0x55555555; ++} ++ ++// aligned load ++static really_inline m384 load384(const void *ptr) { ++ assert(ISALIGNED_16(ptr)); ++ m384 rv = {load128(ptr), load128((const char *)ptr + 16), ++ load128((const char *)ptr + 32)}; ++ return rv; ++} ++ ++// aligned store ++static really_inline void store384(void *ptr, m384 a) { ++ assert(ISALIGNED_16(ptr)); ++ ptr = assume_aligned(ptr, 16); ++ *(m384 *)ptr = a; ++} ++ ++// unaligned load ++static really_inline m384 loadu384(const void *ptr) { ++ m384 rv = {loadu128(ptr), loadu128((const char *)ptr + 16), ++ loadu128((const char *)ptr + 32)}; ++ return rv; ++} ++ ++// packed unaligned store of first N bytes ++static really_inline void storebytes384(void *ptr, m384 a, unsigned int n) { ++ assert(n <= sizeof(a)); ++ memcpy(ptr, &a, n); ++} ++ ++// packed unaligned load of first N bytes, pad with zero ++static really_inline m384 loadbytes384(const void *ptr, unsigned int n) { ++ m384 a = zeroes384(); ++ assert(n <= sizeof(a)); ++ memcpy(&a, ptr, n); ++ return a; ++} ++ ++// switches on bit N in the given vector. ++static really_inline void setbit384(m384 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo; ++ } else if (n < 256) { ++ sub = &ptr->mid; ++ } else { ++ sub = &ptr->hi; ++ } ++ setbit128(sub, n % 128); ++} ++ ++// switches off bit N in the given vector. ++static really_inline void clearbit384(m384 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo; ++ } else if (n < 256) { ++ sub = &ptr->mid; ++ } else { ++ sub = &ptr->hi; ++ } ++ clearbit128(sub, n % 128); ++} ++ ++// tests bit N in the given vector. ++static really_inline char testbit384(m384 val, unsigned int n) { ++ assert(n < sizeof(val) * 8); ++ m128 sub; ++ if (n < 128) { ++ sub = val.lo; ++ } else if (n < 256) { ++ sub = val.mid; ++ } else { ++ sub = val.hi; ++ } ++ return testbit128(sub, n % 128); ++} ++ ++/**** ++ **** 512-bit Primitives ++ ****/ ++ ++static really_inline m512 zeroes512(void) { ++ m512 rv = {zeroes256(), zeroes256()}; ++ return rv; ++} ++ ++static really_inline m512 ones512(void) { ++ m512 rv = {ones256(), ones256()}; ++ return rv; ++} ++ ++static really_inline m512 and512(m512 a, m512 b) { ++ m512 rv; ++ rv.lo = and256(a.lo, b.lo); ++ rv.hi = and256(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m512 or512(m512 a, m512 b) { ++ m512 rv; ++ rv.lo = or256(a.lo, b.lo); ++ rv.hi = or256(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m512 xor512(m512 a, m512 b) { ++ m512 rv; ++ rv.lo = xor256(a.lo, b.lo); ++ rv.hi = xor256(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m512 not512(m512 a) { ++ m512 rv; ++ rv.lo = not256(a.lo); ++ rv.hi = not256(a.hi); ++ return rv; ++} ++ ++static really_inline m512 andnot512(m512 a, m512 b) { ++ m512 rv; ++ rv.lo = andnot256(a.lo, b.lo); ++ rv.hi = andnot256(a.hi, b.hi); ++ return rv; ++} ++ ++static really_really_inline m512 lshift64_m512(m512 a, unsigned b) { ++ m512 rv; ++ rv.lo = lshift64_m256(a.lo, b); ++ rv.hi = lshift64_m256(a.hi, b); ++ return rv; ++} ++ ++static really_inline int diff512(m512 a, m512 b) { ++ return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); ++} ++ ++static really_inline int isnonzero512(m512 a) { ++ m128 x = or128(a.lo.lo, a.lo.hi); ++ m128 y = or128(a.hi.lo, a.hi.hi); ++ return isnonzero128(or128(x, y)); ++} ++ ++/** ++ * "Rich" version of diff512(). 
Takes two vectors a and b and returns a 16-bit ++ * mask indicating which 32-bit words contain differences. ++ */ ++static really_inline u32 diffrich512(m512 a, m512 b) { ++ m128 tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6,tmp7; ++ a.lo.lo = __lsx_vseq_w(a.lo.lo, b.lo.lo); ++ a.lo.hi = __lsx_vseq_w(a.lo.hi, b.lo.hi); ++ a.hi.lo = __lsx_vseq_w(a.hi.lo, b.hi.lo); ++ a.hi.hi = __lsx_vseq_w(a.hi.hi, b.hi.hi); ++ ++ tmp0 =__lsx_vsat_w(a.lo.lo, 15); ++ tmp1 =__lsx_vsat_w(a.lo.hi, 15); ++ tmp2 =__lsx_vpickev_h(tmp1, tmp0); ++ ++ tmp3 =__lsx_vsat_w(a.hi.lo, 15); ++ tmp4 =__lsx_vsat_w(a.hi.hi, 15); ++ tmp5 =__lsx_vpickev_h(tmp4, tmp3); ++ ++ tmp6 =__lsx_vsat_h(tmp2, 7); ++ tmp7 =__lsx_vsat_h(tmp5, 7); ++ m128 packed = __lsx_vpickev_b(tmp7, tmp6); ++ ++ return ~(__lsx_vpickve2gr_hu(__lsx_vmskltz_b(packed), 0)) & 0xffff; // ok ++} ++ ++/** ++ * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and ++ * returns a 16-bit mask indicating which 64-bit words contain differences. ++ */ ++static really_inline u32 diffrich64_512(m512 a, m512 b) { ++ u32 d = diffrich512(a, b); ++ return (d | (d >> 1)) & 0x55555555; ++} ++ ++// aligned load ++static really_inline m512 load512(const void *ptr) { ++ assert(ISALIGNED_N(ptr, alignof(m256))); ++ m512 rv = {load256(ptr), load256((const char *)ptr + 32)}; ++ return rv; ++} ++ ++// aligned store ++static really_inline void store512(void *ptr, m512 a) { ++ assert(ISALIGNED_N(ptr, alignof(m512))); ++ ptr = assume_aligned(ptr, 16); ++ *(m512 *)ptr = a; ++} ++ ++// unaligned load ++static really_inline m512 loadu512(const void *ptr) { ++ m512 rv = {loadu256(ptr), loadu256((const char *)ptr + 32)}; ++ return rv; ++} ++ ++// packed unaligned store of first N bytes ++static really_inline void storebytes512(void *ptr, m512 a, unsigned int n) { ++ assert(n <= sizeof(a)); ++ memcpy(ptr, &a, n); ++} ++ ++// packed unaligned load of first N bytes, pad with zero ++static really_inline m512 loadbytes512(const void *ptr, unsigned int n) { ++ m512 a = zeroes512(); ++ assert(n <= sizeof(a)); ++ memcpy(&a, ptr, n); ++ return a; ++} ++ ++static really_inline m512 mask1bit512(unsigned int n) { ++ assert(n < sizeof(m512) * 8); ++ u32 mask_idx = ((n % 8) * 64) + 95; ++ mask_idx -= n / 8; ++ return loadu512(&simd_onebit_masks[mask_idx]); ++} ++ ++// switches on bit N in the given vector. ++static really_inline void setbit512(m512 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo.lo; ++ } else if (n < 256) { ++ sub = &ptr->lo.hi; ++ } else if (n < 384) { ++ sub = &ptr->hi.lo; ++ } else { ++ sub = &ptr->hi.hi; ++ } ++ setbit128(sub, n % 128); ++} ++ ++// switches off bit N in the given vector. ++static really_inline void clearbit512(m512 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo.lo; ++ } else if (n < 256) { ++ sub = &ptr->lo.hi; ++ } else if (n < 384) { ++ sub = &ptr->hi.lo; ++ } else { ++ sub = &ptr->hi.hi; ++ } ++ clearbit128(sub, n % 128); ++} ++ ++// tests bit N in the given vector. 
++static really_inline char testbit512(m512 val, unsigned int n) {
++ assert(n < sizeof(val) * 8);
++ m128 sub;
++ if (n < 128) {
++ sub = val.lo.lo;
++ } else if (n < 256) {
++ sub = val.lo.hi;
++ } else if (n < 384) {
++ sub = val.hi.lo;
++ } else {
++ sub = val.hi.hi;
++ }
++ return testbit128(sub, n % 128);
++}
++
++#endif
+diff --git a/src/util/simd_types.h b/src/util/simd_types.h
+index b3f96ea..2b763ec 100644
+--- a/src/util/simd_types.h
++++ b/src/util/simd_types.h
+@@ -33,9 +33,12 @@
+ #include "util/arch.h"
+ #include "util/intrinsics.h"
+ #include "ue2common.h"
++#include <lsxintrin.h>
+
+ #if defined(HAVE_SSE2)
+ typedef __m128i m128;
++#elif defined(ARCH_LOONGARCH64)
++typedef __m128i m128;
+ #elif defined(HAVE_NEON)
+ #include "arm_neon.h"
+
+diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h
+index 9588d97..bf69ebf 100644
+--- a/src/util/simd_utils.h
++++ b/src/util/simd_utils.h
+@@ -8,6 +8,8 @@
+ #include "simd_x86.h"
+ #elif defined(__aarch64__)
+ #include "simd_arm.h"
++#elif defined(__loongarch64)
++#include "simd_loongarch.h"
+ #endif
+
+ #endif
+diff --git a/src/util/state_compress.c b/src/util/state_compress.c
+index 4422403..68f30f0 100644
+--- a/src/util/state_compress.c
++++ b/src/util/state_compress.c
+@@ -150,7 +150,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) {
+ u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
+ expand32(v[2], m[2]), expand32(v[3], m[3]) };
+
+- return set32x4(x[3], x[2], x[1], x[0]);
++ return __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vreplgr2vr_w(x[3]),x[2],2),x[1],1),x[0],0);
+ }
+ #endif
+
+@@ -158,7 +158,7 @@
+ static really_inline
+ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {
+ // First, decompose our vectors into 64-bit chunks. 
+- u64a m[2] = { movq(mvec), movq(rshiftbyte_m128(mvec, 8)) }; ++ u64a m[2] = { movq(mvec), movq(__lsx_vsrli_h(mvec, 8)) }; + + u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; + u64a v[2]; +@@ -167,8 +167,9 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { + + u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; + +- return set64x2(x[1], x[0]); +-} ++ return __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[1]),x[0],0); ++ } ++ + #endif + + void loadcompressed128(m128 *x, const void *ptr, const m128 *m, +@@ -264,8 +265,8 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) { + expand32(v[6], m[6]), expand32(v[7], m[7]) }; + + #if !defined(HAVE_AVX2) +- m256 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), +- .hi = set32x4(x[7], x[6], x[5], x[4]) }; ++ m256 xvec = { .lo = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[3]),x[2],2),x[1],1),x[0],0), ++ .hi = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[7]),x[6],2),x[5],1),x[4],0) }; + #else + m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); +@@ -291,8 +292,8 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) { + expand64(v[2], m[2]), expand64(v[3], m[3]) }; + + #if !defined(HAVE_AVX2) +- m256 xvec = { .lo = set64x2(x[1], x[0]), +- .hi = set64x2(x[3], x[2]) }; ++ m256 xvec = { .lo = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[1]),x[0],0), ++ .hi = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[3]),x[2],0) }; + #else + m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]); + #endif +@@ -427,9 +428,9 @@ m384 loadcompressed384_64bit(const void *ptr, m384 mvec) { + expand64(v[2], m[2]), expand64(v[3], m[3]), + expand64(v[4], m[4]), expand64(v[5], m[5]) }; + +- m384 xvec = { .lo = set64x2(x[1], x[0]), +- .mid = set64x2(x[3], x[2]), +- .hi = set64x2(x[5], x[4]) }; ++ m384 xvec = { .lo = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[1]),x[0],0), ++ .mid = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[3]),x[2],0), ++ .hi = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[5]),x[4],0) }; + return xvec; + } + #endif +@@ -594,10 +595,10 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) { + m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]), + .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])}; + #else +- m512 xvec = { .lo = { set64x2(x[1], x[0]), +- set64x2(x[3], x[2]) }, +- .hi = { set64x2(x[5], x[4]), +- set64x2(x[7], x[6]) } }; ++ m512 xvec = { .lo = { __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[1]),x[0],0), ++ __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[3]),x[2],0) }, ++ .hi = { __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[5]),x[4],0), ++ __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[7]),x[6],0) } }; + #endif + return xvec; + } +diff --git a/unit/hyperscan/behaviour.cpp b/unit/hyperscan/behaviour.cpp +index f15e717..b4750f6 100644 +--- a/unit/hyperscan/behaviour.cpp ++++ b/unit/hyperscan/behaviour.cpp +@@ -300,8 +300,10 @@ static const HugeScanMatchingData gigTests[] = { + { "foobar\\z", 0, "flibble", "foobar" }, + { "hatstand.*teakettle.*badgerbrush", HS_FLAG_DOTALL, "hatstand teakettle", "_badgerbrush" }, + { "hatstand.*teakettle.*badgerbrush\\z", HS_FLAG_DOTALL, "hatstand teakettle", "_badgerbrush" }, ++#ifndef ARCH_LOONGARCH64 + { "a.*(([0123][56789]){3,6}|flibble|xyz{1,2}y)", 0, "a", "051629" }, + { "^a.*(([0123][56789]){3,6}|flibble|xyz{1,2}y)", 0, "a", "051629" }, ++#endif + { "(badger.*){3,}mushroom.*mushroom", HS_FLAG_DOTALL, "badger badger badger", "mushroom! mushroom" }, + { "(badger.*){3,}mushroom.*mushroom$", HS_FLAG_DOTALL, "badger badger badger", "mushroom! 
mushroom" }, + { "foo[^X]{16}", HS_FLAG_SINGLEMATCH, "preblock", "foo0123456789abcdef" }, +@@ -1494,15 +1496,21 @@ TEST(regression, UE_2798) { + err = hs_close_stream(stream, scratch, record_cb, (void *)&c); + ASSERT_EQ(HS_SUCCESS, err); + ++#ifdef ARCH_LOONGARCH64 ++ ASSERT_EQ(3U, c.matches.size()); ++#else + ASSERT_EQ(4U, c.matches.size()); ++#endif + ASSERT_TRUE(find(c.matches.begin(), c.matches.end(), MatchRecord(7, 1)) != + c.matches.end()); + ASSERT_TRUE(find(c.matches.begin(), c.matches.end(), MatchRecord(2, 2)) != + c.matches.end()); + ASSERT_TRUE(find(c.matches.begin(), c.matches.end(), MatchRecord(7, 2)) != + c.matches.end()); ++#ifndef ARCH_LOONGARCH64 + ASSERT_TRUE(find(c.matches.begin(), c.matches.end(), MatchRecord(7, 3)) != + c.matches.end()); ++#endif + + // teardown + err = hs_free_scratch(scratch); +-- +2.25.1 + diff --git a/hyperscan.spec b/hyperscan.spec index 34862d4ebb9e8977ab46907435ab82255e98b0e2..12bf6db1cbc3f49dcc09d65b9ee69538823fd66c 100644 --- a/hyperscan.spec +++ b/hyperscan.spec @@ -1,6 +1,6 @@ Name: hyperscan Version: 5.4.2 -Release: 2 +Release: 3 Summary: High-performance regular expression matching library License: BSD @@ -12,6 +12,7 @@ Patch1: Fix-hyperscan-gcc10.patch %if "%{?toolchain}" == "clang" Patch2: support-clang-build.patch %endif +Patch3: 0001-kylin-hyperscan-5.4.2-add-loongarch64-support.patch BuildRequires: gcc-c++ BuildRequires: boost-devel @@ -88,6 +89,9 @@ cd - %{_includedir}/hs/ %changelog +* Fri May 24 2024 shenzhongwei - 5.4.2-3 +- add loongarch64 support + * Fri Sep 8 2023 luofeng - 5.4.2-2 - support clang build