131 lines
4.4 KiB
Diff
131 lines
4.4 KiB
Diff
Subject: raidz xor performance patch
|
|
Bug: 110247
|
|
Tags: perf e2k
|
|
|
|
diff -rupN a/module/zfs/vdev_raidz_math_impl.h b/module/zfs/vdev_raidz_math_impl.h
|
|
--- a/module/zfs/vdev_raidz_math_impl.h 2023-03-29 19:29:12.412225000 +0300
|
|
+++ b/module/zfs/vdev_raidz_math_impl.h 2023-03-29 19:29:26.578780657 +0300
|
|
@@ -386,13 +386,36 @@ raidz_generate_p_impl(raidz_row_t * cons
|
|
* @csize size of parity columns
|
|
* @dsize size of data column
|
|
*/
|
|
+
|
|
+#define MAS_CACHE_1_D 1UL /* DCACHE1 disabled only */
|
|
+#define MAS_DCACHE_SHIFT 0x05
|
|
+#define MAS_BYPASS_L1_CACHE (MAS_CACHE_1_D << MAS_DCACHE_SHIFT)
|
|
+
|
|
+#define E2K_PREFETCH_L2_256(addr) \
|
|
+({ \
|
|
+ int unused; \
|
|
+ asm ( "ldb,0,sm %1, 0, %%empty, mas=%2\n" \
|
|
+ "ldb,2,sm %1, 64, %%empty, mas=%2\n" \
|
|
+ "ldb,3,sm %1, 128, %%empty, mas=%2\n" \
|
|
+ "ldb,5,sm %1, 192, %%empty, mas=%2" \
|
|
+ : "=r" (unused) \
|
|
+ : "r" (addr), \
|
|
+ "i" (MAS_BYPASS_L1_CACHE)); \
|
|
+})
|
|
+
|
|
static void
|
|
raidz_gen_pq_add(void **c, const void *dc, const size_t csize,
|
|
const size_t dsize)
|
|
{
|
|
+#if defined(__e2k__)
|
|
+ v_t* __restrict__ p = (v_t *)c[0];
|
|
+ v_t* __restrict__ q = (v_t *)c[1];
|
|
+ const v_t* __restrict__ d = (v_t *)dc;
|
|
+#else
|
|
v_t *p = (v_t *)c[0];
|
|
v_t *q = (v_t *)c[1];
|
|
const v_t *d = (const v_t *)dc;
|
|
+#endif
|
|
const v_t * const dend = d + (dsize / sizeof (v_t));
|
|
const v_t * const qend = q + (csize / sizeof (v_t));
|
|
|
|
@@ -400,6 +423,49 @@ raidz_gen_pq_add(void **c, const void *d
|
|
|
|
MUL2_SETUP();
|
|
|
|
+#if defined(__e2k__)
|
|
+ const v_t *d_range_start, *d_range_end, *qq, *pp;
|
|
+ // prefetch up to 16 KiB from each of d, q and p (48 KiB total) into L2
|
|
+ for (d_range_start=d; d_range_start < dend; d_range_start+= (16*1024)/8) {
|
|
+ d = d_range_start;
|
|
+ d += 64/8;
|
|
+ d_range_end = d_range_start + (16*1024)/8;
|
|
+ if(d_range_end > dend) d_range_end = dend;
|
|
+ // prefetch the range in 256-byte steps (4 L2 lines of 64 bytes each)
|
|
+ // queue depth is estimated at 28 x 4 ldb prefetch requests
|
|
+ qq = q + 64/8;
|
|
+ pp = p + 64/8;
|
|
+ for(; d < d_range_end; d += 256/8, qq += 256/8, pp += 256/8) {
|
|
+ E2K_PREFETCH_L2_256(d);
|
|
+ E2K_PREFETCH_L2_256(qq);
|
|
+ E2K_PREFETCH_L2_256(pp);
|
|
+ }
|
|
+
|
|
+ // compute over the range
|
|
+ d = d_range_start;
|
|
+ for (; d < d_range_end; d += GEN_PQ_STRIDE, p += GEN_PQ_STRIDE,
|
|
+ q += GEN_PQ_STRIDE) {
|
|
+ LOAD(d, GEN_PQ_D);
|
|
+ LOAD(d+1, GEN_PQ_D1);
|
|
+ LOAD(d+2, GEN_PQ_D2);
|
|
+ LOAD(d+3, GEN_PQ_D3);
|
|
+ P_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, p);
|
|
+ P_D_SYNDROME(GEN_PQ_D1, GEN_PQ_C1, p+1);
|
|
+ P_D_SYNDROME(GEN_PQ_D2, GEN_PQ_C2, p+2);
|
|
+ P_D_SYNDROME(GEN_PQ_D3, GEN_PQ_C3, p+3);
|
|
+ Q_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, q);
|
|
+ Q_D_SYNDROME(GEN_PQ_D1, GEN_PQ_C1, q+1);
|
|
+ Q_D_SYNDROME(GEN_PQ_D2, GEN_PQ_C2, q+2);
|
|
+ Q_D_SYNDROME(GEN_PQ_D3, GEN_PQ_C3, q+3);
|
|
+ }
|
|
+ }
|
|
+ for (; q < qend; q += GEN_PQ_STRIDE) {
|
|
+ Q_SYNDROME(GEN_PQ_C, q);
|
|
+ Q_SYNDROME(GEN_PQ_C1, q+1);
|
|
+ Q_SYNDROME(GEN_PQ_C2, q+2);
|
|
+ Q_SYNDROME(GEN_PQ_C3, q+3);
|
|
+ }
|
|
+#else
|
|
for (; d < dend; d += GEN_PQ_STRIDE, p += GEN_PQ_STRIDE,
|
|
q += GEN_PQ_STRIDE) {
|
|
LOAD(d, GEN_PQ_D);
|
|
@@ -409,6 +475,7 @@ raidz_gen_pq_add(void **c, const void *d
|
|
for (; q < qend; q += GEN_PQ_STRIDE) {
|
|
Q_SYNDROME(GEN_PQ_C, q);
|
|
}
|
|
+#endif
|
|
}
|
|
|
|
|
|
diff -rupN a/module/zfs/vdev_raidz_math_scalar.c b/module/zfs/vdev_raidz_math_scalar.c
|
|
--- a/module/zfs/vdev_raidz_math_scalar.c 2023-03-29 19:27:18.819842165 +0300
|
|
+++ b/module/zfs/vdev_raidz_math_scalar.c 2023-03-29 19:29:26.578780657 +0300
|
|
@@ -177,10 +177,24 @@ static const struct {
|
|
#define GEN_P_DEFINE() v_t p0
|
|
#define GEN_P_P p0
|
|
|
|
+// e2k: unroll the PQ loop 4x (d0..d3, c0..c3) for the prefetching raidz_gen_pq_add()
|
|
+#if defined(__e2k__)
|
|
+#define GEN_PQ_STRIDE 4
|
|
+#define GEN_PQ_DEFINE() v_t d0, c0, d1, c1, d2, c2, d3, c3
|
|
+#define GEN_PQ_D d0
|
|
+#define GEN_PQ_D1 d1
|
|
+#define GEN_PQ_D2 d2
|
|
+#define GEN_PQ_D3 d3
|
|
+#define GEN_PQ_C c0
|
|
+#define GEN_PQ_C1 c1
|
|
+#define GEN_PQ_C2 c2
|
|
+#define GEN_PQ_C3 c3
|
|
+#else
|
|
#define GEN_PQ_STRIDE 1
|
|
#define GEN_PQ_DEFINE() v_t d0, c0
|
|
#define GEN_PQ_D d0
|
|
#define GEN_PQ_C c0
|
|
+#endif
|
|
|
|
#define GEN_PQR_STRIDE 1
|
|
#define GEN_PQR_DEFINE() v_t d0, c0
|