mcst-linux-kernel/patches-2024.06.26/zfs-2.1.5/0006-perf-vdev-raidz-bug110...

131 lines
4.4 KiB
Diff

Subject: raidz xor performance patch
Bug: 110247
Tags: perf e2k
diff -rupN a/module/zfs/vdev_raidz_math_impl.h b/module/zfs/vdev_raidz_math_impl.h
--- a/module/zfs/vdev_raidz_math_impl.h 2023-03-29 19:29:12.412225000 +0300
+++ b/module/zfs/vdev_raidz_math_impl.h 2023-03-29 19:29:26.578780657 +0300
@@ -386,13 +386,36 @@ raidz_generate_p_impl(raidz_row_t * cons
* @csize size of parity columns
* @dsize size of data column
*/
+
+#define MAS_CACHE_1_D 1UL /* DCACHE1 disabled only */
+#define MAS_DCACHE_SHIFT 0x05
+#define MAS_BYPASS_L1_CACHE (MAS_CACHE_1_D << MAS_DCACHE_SHIFT)
+
+#define E2K_PREFETCH_L2_256(addr) \
+({ \
+ int unused; \
+ asm volatile ( "ldb,0,sm %1, 0, %%empty, mas=%2\n" \
+ "ldb,2,sm %1, 64, %%empty, mas=%2\n" \
+ "ldb,3,sm %1, 128, %%empty, mas=%2\n" \
+ "ldb,5,sm %1, 192, %%empty, mas=%2" \
+ : "=r" (unused) \
+ : "r" (addr), \
+ "i" (MAS_BYPASS_L1_CACHE)); \
+})
+
static void
raidz_gen_pq_add(void **c, const void *dc, const size_t csize,
const size_t dsize)
{
+#if defined(__e2k__)
+ v_t* __restrict__ p = (v_t *)c[0];
+ v_t* __restrict__ q = (v_t *)c[1];
+const v_t* __restrict__ d = (const v_t *)dc;
+#else
v_t *p = (v_t *)c[0];
v_t *q = (v_t *)c[1];
const v_t *d = (const v_t *)dc;
+#endif
const v_t * const dend = d + (dsize / sizeof (v_t));
const v_t * const qend = q + (csize / sizeof (v_t));
@@ -400,6 +423,49 @@ raidz_gen_pq_add(void **c, const void *d
MUL2_SETUP();
+#if defined(__e2k__)
+ const v_t *d_range_start, *d_range_end, *qq, *pp;
+ // prefetch 16*3 Kilobytes to L2
+ for (d_range_start=d; d_range_start < dend; d_range_start+= (16*1024)/8) {
+ d = d_range_start;
+ d += 64/8;
+ if (dend - d_range_start < (16*1024)/8) d_range_end = dend;
+ else d_range_end = d_range_start + (16*1024)/8;
+ // do prefetch of range by 256 bytes (4 L2 lines)
+ // queue depth est at 28 x 4 ldb prefetch reqs
+ qq = q + 64/8;
+ pp = p + 64/8;
+ for(; d < d_range_end; d += 256/8, qq += 256/8, pp += 256/8) {
+ E2K_PREFETCH_L2_256(d);
+ E2K_PREFETCH_L2_256(qq);
+ E2K_PREFETCH_L2_256(pp);
+ }
+
+ // compute over the range
+ d = d_range_start;
+ for (; d < d_range_end; d += GEN_PQ_STRIDE, p += GEN_PQ_STRIDE,
+ q += GEN_PQ_STRIDE) {
+ LOAD(d, GEN_PQ_D);
+ LOAD(d+1, GEN_PQ_D1);
+ LOAD(d+2, GEN_PQ_D2);
+ LOAD(d+3, GEN_PQ_D3);
+ P_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, p);
+ P_D_SYNDROME(GEN_PQ_D1, GEN_PQ_C1, p+1);
+ P_D_SYNDROME(GEN_PQ_D2, GEN_PQ_C2, p+2);
+ P_D_SYNDROME(GEN_PQ_D3, GEN_PQ_C3, p+3);
+ Q_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, q);
+ Q_D_SYNDROME(GEN_PQ_D1, GEN_PQ_C1, q+1);
+ Q_D_SYNDROME(GEN_PQ_D2, GEN_PQ_C2, q+2);
+ Q_D_SYNDROME(GEN_PQ_D3, GEN_PQ_C3, q+3);
+ }
+ }
+ for (; q < qend; q += GEN_PQ_STRIDE) {
+ Q_SYNDROME(GEN_PQ_C, q);
+ Q_SYNDROME(GEN_PQ_C1, q+1);
+ Q_SYNDROME(GEN_PQ_C2, q+2);
+ Q_SYNDROME(GEN_PQ_C3, q+3);
+ }
+#else
for (; d < dend; d += GEN_PQ_STRIDE, p += GEN_PQ_STRIDE,
q += GEN_PQ_STRIDE) {
LOAD(d, GEN_PQ_D);
@@ -409,6 +475,7 @@ raidz_gen_pq_add(void **c, const void *d
for (; q < qend; q += GEN_PQ_STRIDE) {
Q_SYNDROME(GEN_PQ_C, q);
}
+#endif
}
diff -rupN a/module/zfs/vdev_raidz_math_scalar.c b/module/zfs/vdev_raidz_math_scalar.c
--- a/module/zfs/vdev_raidz_math_scalar.c 2023-03-29 19:27:18.819842165 +0300
+++ b/module/zfs/vdev_raidz_math_scalar.c 2023-03-29 19:29:26.578780657 +0300
@@ -177,10 +177,24 @@ static const struct {
#define GEN_P_DEFINE() v_t p0
#define GEN_P_P p0
+// for e2k prefetch
+#if defined(__e2k__)
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() v_t d0, c0, d1, c1, d2, c2, d3, c3
+#define GEN_PQ_D d0
+#define GEN_PQ_D1 d1
+#define GEN_PQ_D2 d2
+#define GEN_PQ_D3 d3
+#define GEN_PQ_C c0
+#define GEN_PQ_C1 c1
+#define GEN_PQ_C2 c2
+#define GEN_PQ_C3 c3
+#else
#define GEN_PQ_STRIDE 1
#define GEN_PQ_DEFINE() v_t d0, c0
#define GEN_PQ_D d0
#define GEN_PQ_C c0
+#endif
#define GEN_PQR_STRIDE 1
#define GEN_PQR_DEFINE() v_t d0, c0