mcst-linux-kernel/patches-2024.06.26/xz-5.2.4/0003-xz-clmul.patch

From ce4c53685367184a03c1dd2088b80c44b6fce5f7 Mon Sep 17 00:00:00 2001
From: Ilya Kurdyukov <jpegqs@gmail.com>
Date: Mon, 27 Jun 2022 18:40:44 +0700
Subject: [PATCH] xz-5.2.5 faster CRC64 using CLMUL
---
configure.ac | 29 +++++++
src/liblzma/check/Makefile.inc | 4 +
src/liblzma/check/crc64_clmul.c | 140 ++++++++++++++++++++++++++++++++
src/liblzma/check/crc64_fast.c | 55 ++++++++++++-
4 files changed, 227 insertions(+), 1 deletion(-)
create mode 100644 src/liblzma/check/crc64_clmul.c
diff --git a/configure.ac b/configure.ac
index 2418e4b..1d20946 100644
--- a/configure.ac
+++ b/configure.ac
@@ -325,6 +325,35 @@ AM_CONDITIONAL(COND_ASM_X86, test "x$enable_assembler" = xx86)
AM_CONDITIONAL(COND_ASM_X86_64, test "x$enable_assembler" = xx86_64)
+#####################
+# CRC64 using CLMUL #
+#####################
+
+AC_MSG_CHECKING([if CLMUL instruction can be used])
+AC_ARG_ENABLE([clmul], AS_HELP_STRING([--enable-clmul],
+ [Much faster CRC64 using CLMUL instruction.]),
+ [], [enable_clmul=auto])
+if test "x$enable_clmul" = xauto; then
+ enable_clmul=no
+ case $host_os in
+ # Darwin and Windows could work too but I cannot test.
+ linux* | *bsd* | mingw* | cygwin | msys | *djgpp*)
+ case $host_cpu in
+ i?86 | x86_64 | e2k) enable_clmul=yes ;;
+ esac
+ ;;
+ esac
+fi
+if test "x$enable_clmul" = xyes; then
+	AC_DEFINE([HAVE_CRC64_CLMUL], [1], [Define to 1 if the architecture supports CLMUL.])
+elif test "x$enable_clmul" != xno; then
+ AC_MSG_RESULT([])
+ AC_MSG_ERROR([--enable-clmul accepts only `yes' or `no'])
+fi
+AC_MSG_RESULT([$enable_clmul])
+AM_CONDITIONAL(COND_CRC64_CLMUL, test "x$enable_clmul" = xyes)
+
+
#####################
# Size optimization #
#####################
diff --git a/src/liblzma/check/Makefile.inc b/src/liblzma/check/Makefile.inc
index dc011a3..ce1ff25 100644
--- a/src/liblzma/check/Makefile.inc
+++ b/src/liblzma/check/Makefile.inc
@@ -43,6 +43,10 @@ liblzma_la_SOURCES += check/crc64_x86.S
else
liblzma_la_SOURCES += check/crc64_fast.c
endif
+if COND_CRC64_CLMUL
+liblzma_la_SOURCES += check/crc64_clmul.c
+%-crc64_clmul.lo: CFLAGS += -msse4.1 -mpclmul
+endif
endif
endif
diff --git a/src/liblzma/check/crc64_clmul.c b/src/liblzma/check/crc64_clmul.c
new file mode 100644
index 0000000..a4acf01
--- /dev/null
+++ b/src/liblzma/check/crc64_clmul.c
@@ -0,0 +1,140 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file crc64_clmul.c
+/// \brief CRC64 calculation
+///
+/// Calculate the CRC64 using SSE4.1 and the CLMUL instruction.
+/// Derived from 'github.com/rawrunprotected/crc' and
+/// 'www.intel.com/content/dam/www/public/us/en/documents/white-papers/
+/// fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf'
+//
+// Author: Ilya Kurdyukov
+//
+// This file has been put into the public domain.
+// You can do whatever you want with this file.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include "check.h"
+#include "config.h"
+
+uint64_t lzma_crc64_clmul(const uint8_t *data, size_t length, uint64_t crc);
+
+#if defined(__e2k__) && __iset__ < 6
+#undef HAVE_CRC64_CLMUL
+#endif
+
+#if defined(HAVE_CRC64_CLMUL) && defined(__SSE4_1__) && defined(__PCLMUL__)
+#include <smmintrin.h>
+#include <wmmintrin.h>
+
+#if 0
+static uint64_t calc_lo(uint64_t p) {
+ int i; uint64_t a = p, b = 0;
+ for (i = 0; i < 64; i++) {
+ b = b >> 1 | a << 63;
+ a = (a >> 1) ^ (a & 1 ? p : 0);
+ }
+ return b;
+}
+
+static uint64_t calc_hi(uint64_t p, uint64_t a) {
+ int i;
+ for (i = 0; i < 64; i++)
+ a = (a >> 1) ^ (a & 1 ? p : 0);
+ return a;
+}
+#endif
+
+#define MASK_L(in, mask, r) r = _mm_shuffle_epi8(in, mask);
+#define MASK_H(in, mask, r) \
+ r = _mm_shuffle_epi8(in, _mm_xor_si128(mask, vsign));
+#define MASK_LH(in, mask, low, high) \
+ MASK_L(in, mask, low) MASK_H(in, mask, high)
+
+uint64_t lzma_crc64_clmul(const uint8_t *data, size_t length, uint64_t crc) {
+ // uint64_t p = 0xc96c5795d7870f42;
+ uint64_t i0 = 0x92d8af2baf0e1e85; // p << 1 | 1
+ uint64_t i1 = 0x9c3e466c172963d5; // calc_lo(p) << 1 | 1
+ uint64_t i2 = 0xdabe95afc7875f40; // calc_hi(p, 1)
+ uint64_t i3 = 0xe05dd497ca393ae4; // calc_hi(p, i2)
+ __m128i vfold0 = _mm_set_epi64x(i0, i1);
+ __m128i vfold1 = _mm_set_epi64x(i2, i3);
+
+ uintptr_t skipS = (uintptr_t)data & 15;
+ uintptr_t skipE = -(uintptr_t)(data + length) & 15;
+ __m128i vramp = _mm_setr_epi32(0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c);
+ __m128i vsign = _mm_set1_epi8(0x80);
+ __m128i maskS = _mm_sub_epi8(vramp, _mm_set1_epi8(skipS));
+ __m128i maskE = _mm_sub_epi8(vramp, _mm_set1_epi8(skipE));
+
+ uintptr_t length2 = skipS + length;
+ const __m128i* adata = (const __m128i*)((uintptr_t)data & -16);
+ __m128i v0, v1, v2, v3, vcrc, data0;
+
+#ifdef __i386__
+ vcrc = _mm_set_epi64x(0, ~crc);
+#else
+ vcrc = _mm_cvtsi64x_si128(~crc);
+#endif
+ if (!length) return crc;
+ data0 = _mm_load_si128(adata);
+ data0 = _mm_blendv_epi8(data0, _mm_setzero_si128(), maskS);
+ adata++;
+ if (length2 <= 16) {
+ __m128i maskL = _mm_add_epi8(vramp, _mm_set1_epi8(length - 16));
+ MASK_LH(vcrc, maskL, v0, v1)
+ MASK_L(data0, maskE, v3)
+ v0 = _mm_xor_si128(v0, v3);
+ v1 = _mm_alignr_epi8(v1, v0, 8);
+ } else {
+ __m128i data1 = _mm_load_si128(adata);
+ if (length <= 16) {
+ __m128i maskL = _mm_add_epi8(vramp, _mm_set1_epi8(length - 16));
+ MASK_LH(vcrc, maskL, v0, v1);
+ MASK_H(data0, maskE, v2)
+ MASK_L(data1, maskE, v3)
+ v0 = _mm_xor_si128(v0, v2);
+ v0 = _mm_xor_si128(v0, v3);
+ v1 = _mm_alignr_epi8(v1, v0, 8);
+ } else {
+ MASK_LH(vcrc, maskS, v0, v1)
+ v0 = _mm_xor_si128(v0, data0);
+ v1 = _mm_xor_si128(v1, data1);
+
+#define FOLD \
+ v1 = _mm_xor_si128(v1, _mm_clmulepi64_si128(v0, vfold1, 0x00)); \
+ v0 = _mm_xor_si128(v1, _mm_clmulepi64_si128(v0, vfold1, 0x11));
+
+ while (length2 > 32) {
+ adata++;
+ length2 -= 16;
+ FOLD
+ v1 = _mm_load_si128(adata);
+ }
+ if (length2 < 32) {
+ MASK_H(v0, maskE, v2)
+ MASK_L(v0, maskE, v0)
+ MASK_L(v1, maskE, v3)
+ v1 = _mm_or_si128(v2, v3);
+ }
+ FOLD
+ v1 = _mm_srli_si128(v0, 8);
+#undef FOLD
+ }
+ }
+
+ v1 = _mm_xor_si128(_mm_clmulepi64_si128(v0, vfold1, 0x10), v1);
+ v0 = _mm_clmulepi64_si128(v1, vfold0, 0x00);
+ v2 = _mm_clmulepi64_si128(v0, vfold0, 0x10);
+ v0 = _mm_xor_si128(_mm_xor_si128(v2, _mm_slli_si128(v0, 8)), v1);
+
+#ifdef __i386__
+ return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |
+ (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));
+#else
+ return ~_mm_extract_epi64(v0, 1);
+#endif
+}
+
+#endif
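For reference, the fold constants hard-coded at the top of lzma_crc64_clmul() can be reproduced from the CRC-64 polynomial with the calc_lo()/calc_hi() helpers that the patch keeps under "#if 0". The standalone sketch below is not part of the patch; it only recomputes and prints the values, which should match the i0..i3 comments above.

#include <stdint.h>
#include <stdio.h>

// calc_lo()/calc_hi() copied verbatim from the "#if 0" block in crc64_clmul.c.
static uint64_t calc_lo(uint64_t p) {
	int i; uint64_t a = p, b = 0;
	for (i = 0; i < 64; i++) {
		b = b >> 1 | a << 63;
		a = (a >> 1) ^ (a & 1 ? p : 0);
	}
	return b;
}

static uint64_t calc_hi(uint64_t p, uint64_t a) {
	int i;
	for (i = 0; i < 64; i++)
		a = (a >> 1) ^ (a & 1 ? p : 0);
	return a;
}

int main(void) {
	const uint64_t p = 0xc96c5795d7870f42; // reflected CRC-64 polynomial used by xz
	uint64_t i2 = calc_hi(p, 1);
	// Expected, per the comments in lzma_crc64_clmul():
	// i0 = 92d8af2baf0e1e85  i1 = 9c3e466c172963d5
	// i2 = dabe95afc7875f40  i3 = e05dd497ca393ae4
	printf("i0 = %016llx\n", (unsigned long long)(p << 1 | 1));
	printf("i1 = %016llx\n", (unsigned long long)(calc_lo(p) << 1 | 1));
	printf("i2 = %016llx\n", (unsigned long long)i2);
	printf("i3 = %016llx\n", (unsigned long long)calc_hi(p, i2));
	return 0;
}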
diff --git a/src/liblzma/check/crc64_fast.c b/src/liblzma/check/crc64_fast.c
index 8af54cd..a1091c3 100644
--- a/src/liblzma/check/crc64_fast.c
+++ b/src/liblzma/check/crc64_fast.c
@@ -7,7 +7,8 @@
/// idea that is used in crc32_fast.c, but for CRC64 we use only four tables
/// instead of eight to avoid increasing CPU cache usage.
//
-// Author: Lasse Collin
+// Authors: Lasse Collin
+// Ilya Kurdyukov (CLMUL detection code)
//
// This file has been put into the public domain.
// You can do whatever you want with this file.
@@ -15,8 +16,12 @@
///////////////////////////////////////////////////////////////////////////////
#include "check.h"
+#include "config.h"
#include "crc_macros.h"
+#if defined(__e2k__) && __iset__ < 6
+#undef HAVE_CRC64_CLMUL
+#endif
#ifdef WORDS_BIGENDIAN
# define A1(x) ((x) >> 56)
@@ -24,11 +29,59 @@
# define A1 A
#endif
+#ifdef HAVE_CRC64_CLMUL
+extern uint64_t lzma_crc64_fast(const uint8_t *buf, size_t size, uint64_t crc);
+extern uint64_t lzma_crc64_clmul(const uint8_t *buf, size_t size, uint64_t crc);
+static uint64_t lzma_crc64_detect(const uint8_t *buf, size_t size, uint64_t crc);
+
+#ifdef _MSC_VER
+#include <intrin.h>
+// void __cpuidex(int cpuInfo[4], int function_id, int subfunction_id);
+#define get_cpuid(a, c, out) __cpuidex(out, a, c)
+#elif defined(__GNUC__) && !defined(__e2k__)
+static inline void get_cpuid(int32_t a, int32_t c, int32_t out[4]) {
+ __asm__ ("cpuid" : "=a"(out[0]), "=b"(out[1]),
+ "=c"(out[2]), "=d"(out[3]) : "a"(a), "c"(c));
+}
+#else
+//#error
+#endif
+
+static uint64_t (*lzma_crc64_fn)(const uint8_t *buf, size_t size, uint64_t crc) = lzma_crc64_detect;
+
+static uint64_t lzma_crc64_detect(const uint8_t *buf, size_t size, uint64_t crc) {
+#if defined(__i386__) || defined(__x86_64__)
+ int type = 0;
+ do {
+ int32_t cpuid[4];
+ get_cpuid(0, 0, cpuid);
+ if (cpuid[0] < 1) break;
+ get_cpuid(1, 0, cpuid);
+ if (!(cpuid[2] & (1 << 19))) break; // SSE4.1
+ if (!(cpuid[2] & (1 << 1))) break; // PCLMULQDQ
+ type = 1;
+ } while (0);
+ lzma_crc64_fn = type ? lzma_crc64_clmul : lzma_crc64_fast;
+#elif defined(__e2k__) && __iset__ >= 6
+ lzma_crc64_fn = lzma_crc64_clmul;
+#else
+ lzma_crc64_fn = lzma_crc64_fast;
+#endif
+ return lzma_crc64_fn(buf, size, crc);
+}
+#endif
// See the comments in crc32_fast.c. They aren't duplicated here.
extern LZMA_API(uint64_t)
lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc)
{
+#ifdef HAVE_CRC64_CLMUL
+ return lzma_crc64_fn(buf, size, crc);
+}
+
+uint64_t lzma_crc64_fast(const uint8_t *buf, size_t size, uint64_t crc)
+{
+#endif
crc = ~crc;
#ifdef WORDS_BIGENDIAN
--
2.34.1
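
To cross-check the CLMUL path against the table-based lzma_crc64_fast(), a plain bit-by-bit implementation of the same reflected CRC-64 (polynomial 0xc96c5795d7870f42, noted in crc64_clmul.c) can serve as a reference. The sketch below is not part of the patch; crc64_ref() and the test in main() are hypothetical, but the function follows the same calling convention as lzma_crc64() (pass 0 as the initial crc, feed the previous result for further chunks), so its output can be compared with a liblzma built with --enable-clmul.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

// Bit-by-bit reflected CRC-64 using the polynomial from crc64_clmul.c.
static uint64_t crc64_ref(const uint8_t *buf, size_t size, uint64_t crc)
{
	const uint64_t poly = 0xc96c5795d7870f42;
	crc = ~crc;
	while (size--) {
		crc ^= *buf++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (poly & -(crc & 1));
	}
	return ~crc;
}

int main(void)
{
	// The usual CRC check string; CRC-64/XZ should give 995dc9bbdf1939fa.
	const char *s = "123456789";
	printf("%016llx\n", (unsigned long long)
			crc64_ref((const uint8_t *)s, strlen(s), 0));
	return 0;
}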