mcst-linux-kernel/patches-2024.06.26/xorg-server-21.1.8/0042-Carry-of-cycle-in-func...

320 lines
10 KiB
Diff

From 84a4f134911336235862994fc3b651360c9ebb90 Mon Sep 17 00:00:00 2001
Date: Tue, 16 Apr 2019 12:02:43 +0300
Subject: [PATCH] Move the per-row copy loop into fbBlt_memcpy_vis to optimize
 x11perf (bug 113290)
---
fb/fbblt.c | 263 ++++++++++++++++++++++++++++++++++++++++---------------------
1 file changed, 172 insertions(+), 91 deletions(-)
diff --git a/fb/fbblt.c b/fb/fbblt.c
index 45613bf..b7015e0 100644
--- a/fb/fbblt.c
+++ b/fb/fbblt.c
@@ -39,97 +39,201 @@
} \
}
-#if defined(__sparc_v9__)
-
-static inline void *
-fbBlt_memcpy_vis(void *dest, const void *sorc, int n)
+static inline void
+fbBlt_memcpy_vis(void *dest, const void *sorc, const int width_byte,
+ const int dst_byte_stride, const int src_byte_stride,
+ const int height, Bool upsidedown)
{
+ int k;
+
+#if defined(__sparc_v9__)
uint8_t *dst, *src, *ps;
int32_t i;
+ int n;
vis_d64 *addr_aligned;
vis_d64 data_hi, data_lo, data;
vis_d64 data_lo2, data2;
vis_d64 data_lo3, data3;
vis_d64 data_lo4, data4;
+#endif
- dst = (uint8_t *) dest;
- src = (uint8_t *) sorc;
+ if (!upsidedown) {
+#pragma loop count(1000)
+ for (k = 0; k < height; k++) {
+#if defined(__sparc_v9__)
+ n = width_byte;
- for (ps = src; ps < src + n; ps += 64) {
- __builtin_prefetch(ps, 0);
- }
+ dst = (uint8_t *) dest + k * dst_byte_stride;
+ src = (uint8_t *) sorc + k * src_byte_stride;
+
+ for (ps = src; ps < src + n; ps += 64) {
+ __builtin_prefetch(ps, 0);
+ }
#pragma loop count(1)
- while (n-- && (uintptr_t) dst & 7) {
- *dst++ = *src++;
- }
+ while (n-- && (uintptr_t) dst & 7) {
+ *dst++ = *src++;
+ }
- if (!((uintptr_t) src & 7)) {
+ if (!((uintptr_t) src & 7)) {
#pragma loop count(1000)
- while (n >= 64) {
- __builtin_prefetch(dst + 128, 1, 3);
- *(vis_d64 *) dst = *(vis_d64 *) src;
- *(vis_d64 *) (dst + 8) = *(vis_d64 *) (src + 8);
- *(vis_d64 *) (dst + 16) = *(vis_d64 *) (src + 16);
- *(vis_d64 *) (dst + 24) = *(vis_d64 *) (src + 24);
- *(vis_d64 *) (dst + 32) = *(vis_d64 *) (src + 32);
- *(vis_d64 *) (dst + 40) = *(vis_d64 *) (src + 40);
- *(vis_d64 *) (dst + 48) = *(vis_d64 *) (src + 48);
- *(vis_d64 *) (dst + 56) = *(vis_d64 *) (src + 56);
-
- dst += 64;
- src += 64;
- n -= 64;
+ while (n >= 64) {
+ __builtin_prefetch(dst + 128, 1, 3);
+ *(vis_d64 *) dst = *(vis_d64 *) src;
+ *(vis_d64 *) (dst + 8) = *(vis_d64 *) (src + 8);
+ *(vis_d64 *) (dst + 16) = *(vis_d64 *) (src + 16);
+ *(vis_d64 *) (dst + 24) = *(vis_d64 *) (src + 24);
+ *(vis_d64 *) (dst + 32) = *(vis_d64 *) (src + 32);
+ *(vis_d64 *) (dst + 40) = *(vis_d64 *) (src + 40);
+ *(vis_d64 *) (dst + 48) = *(vis_d64 *) (src + 48);
+ *(vis_d64 *) (dst + 56) = *(vis_d64 *) (src + 56);
+
+ dst += 64;
+ src += 64;
+ n -= 64;
- }
+ }
#pragma loop count(4)
- while (n >= 8) {
- *(vis_d64 *) dst = *(vis_d64 *) src;
- dst += 8;
- src += 8;
- n -= 8;
+ while (n >= 8) {
+ *(vis_d64 *) dst = *(vis_d64 *) src;
+ dst += 8;
+ src += 8;
+ n -= 8;
+ }
+ }
+ else {
+ i = 0;
+ addr_aligned = (vis_d64 *) vis_alignaddr(src, 0);
+ data_hi = addr_aligned[i];
+#pragma loop count(1000)
+ while (n >= 32) {
+ vis_prefetch_write((vis_d64 *) dst + 16);
+ data_lo = addr_aligned[i + 1];
+ data_lo2 = addr_aligned[i + 2];
+ data_lo3 = addr_aligned[i + 3];
+ data_lo4 = addr_aligned[i + 4];
+
+ data = vis_faligndata(data_hi, data_lo);
+ data2 = vis_faligndata(data_lo, data_lo2);
+ data3 = vis_faligndata(data_lo2, data_lo3);
+ data4 = vis_faligndata(data_lo3, data_lo4);
+
+ *(vis_d64 *) dst = data;
+ *(vis_d64 *) (dst + 8) = data2;
+ *(vis_d64 *) (dst + 16) = data3;
+ *(vis_d64 *) (dst + 24) = data4;
+
+ data_hi = data_lo4;
+
+ dst += 32;
+ src += 32;
+ n -= 32;
+ i += 4;
+ }
+ }
+
+#pragma loop count(16)
+ while (n-- >= 0) {
+ *dst++ = *src++;
+ }
+#else
+ MEMCPY_WRAPPED(dest + k * dst_byte_stride,
+ sorc + k * src_byte_stride,
+ width_byte);
+#endif
}
}
else {
- i = 0;
- addr_aligned = (vis_d64 *) vis_alignaddr(src, 0);
- data_hi = addr_aligned[i];
#pragma loop count(1000)
- while (n >= 32) {
- vis_prefetch_write((vis_d64 *) dst + 16);
- data_lo = addr_aligned[i + 1];
- data_lo2 = addr_aligned[i + 2];
- data_lo3 = addr_aligned[i + 3];
- data_lo4 = addr_aligned[i + 4];
-
- data = vis_faligndata(data_hi, data_lo);
- data2 = vis_faligndata(data_lo, data_lo2);
- data3 = vis_faligndata(data_lo2, data_lo3);
- data4 = vis_faligndata(data_lo3, data_lo4);
-
- *(vis_d64 *) dst = data;
- *(vis_d64 *) (dst + 8) = data2;
- *(vis_d64 *) (dst + 16) = data3;
- *(vis_d64 *) (dst + 24) = data4;
-
- data_hi = data_lo4;
-
- dst += 32;
- src += 32;
- n -= 32;
- i += 4;
- }
- }
+ for (k = height - 1; k >= 0; k--) {
+#if defined(__sparc_v9__)
+ n = width_byte;
+
+ dst = (uint8_t *) dest + k * dst_byte_stride;
+ src = (uint8_t *) sorc + k * src_byte_stride;
+
+ for (ps = src; ps < src + n; ps += 64) {
+ __builtin_prefetch(ps, 0);
+ }
+
+#pragma loop count(1)
+ while (n-- && (uintptr_t) dst & 7) {
+ *dst++ = *src++;
+ }
+
+ if (!((uintptr_t) src & 7)) {
+#pragma loop count(1000)
+ while (n >= 64) {
+ __builtin_prefetch(dst + 128, 1, 3);
+ *(vis_d64 *) dst = *(vis_d64 *) src;
+ *(vis_d64 *) (dst + 8) = *(vis_d64 *) (src + 8);
+ *(vis_d64 *) (dst + 16) = *(vis_d64 *) (src + 16);
+ *(vis_d64 *) (dst + 24) = *(vis_d64 *) (src + 24);
+ *(vis_d64 *) (dst + 32) = *(vis_d64 *) (src + 32);
+ *(vis_d64 *) (dst + 40) = *(vis_d64 *) (src + 40);
+ *(vis_d64 *) (dst + 48) = *(vis_d64 *) (src + 48);
+ *(vis_d64 *) (dst + 56) = *(vis_d64 *) (src + 56);
+
+ dst += 64;
+ src += 64;
+ n -= 64;
+
+ }
+
+#pragma loop count(4)
+ while (n >= 8) {
+ *(vis_d64 *) dst = *(vis_d64 *) src;
+ dst += 8;
+ src += 8;
+ n -= 8;
+ }
+ }
+ else {
+ i = 0;
+ addr_aligned = (vis_d64 *) vis_alignaddr(src, 0);
+ data_hi = addr_aligned[i];
+#pragma loop count(1000)
+ while (n >= 32) {
+ vis_prefetch_write((vis_d64 *) dst + 16);
+ data_lo = addr_aligned[i + 1];
+ data_lo2 = addr_aligned[i + 2];
+ data_lo3 = addr_aligned[i + 3];
+ data_lo4 = addr_aligned[i + 4];
+
+ data = vis_faligndata(data_hi, data_lo);
+ data2 = vis_faligndata(data_lo, data_lo2);
+ data3 = vis_faligndata(data_lo2, data_lo3);
+ data4 = vis_faligndata(data_lo3, data_lo4);
+
+ *(vis_d64 *) dst = data;
+ *(vis_d64 *) (dst + 8) = data2;
+ *(vis_d64 *) (dst + 16) = data3;
+ *(vis_d64 *) (dst + 24) = data4;
+
+ data_hi = data_lo4;
+
+ dst += 32;
+ src += 32;
+ n -= 32;
+ i += 4;
+ }
+ }
#pragma loop count(16)
- while (n-- >= 0) {
- *dst++ = *src++;
+ while (n-- >= 0) {
+ *dst++ = *src++;
+ }
+#else
+ MEMCPY_WRAPPED(dest + k * dst_byte_stride,
+ sorc + k * src_byte_stride,
+ width_byte);
+#endif
+ }
}
- return dest;
}
-#endif // __sparc_v9__
+
void
fbBlt(FbBits * srcLine,
@@ -167,32 +271,9 @@ fbBlt(FbBits * srcLine,
if (src_byte + width_byte <= dst_byte ||
dst_byte + width_byte <= src_byte)
{
- int i;
-
- if (!upsidedown)
- #pragma loop count(1000)
- for (i = 0; i < height; i++)
-#if defined(__sparc_v9__)
- fbBlt_memcpy_vis(dst_byte + i * dst_byte_stride,
- src_byte + i * src_byte_stride,
- width_byte);
-#else
- MEMCPY_WRAPPED(dst_byte + i * dst_byte_stride,
- src_byte + i * src_byte_stride,
- width_byte);
-#endif
- else
- #pragma loop count(1000)
- for (i = height - 1; i >= 0; i--)
-#if defined(__sparc_v9__)
- fbBlt_memcpy_vis(dst_byte + i * dst_byte_stride,
- src_byte + i * src_byte_stride,
- width_byte);
-#else
- MEMCPY_WRAPPED(dst_byte + i * dst_byte_stride,
- src_byte + i * src_byte_stride,
- width_byte);
-#endif
+ fbBlt_memcpy_vis(dst_byte, src_byte, width_byte,
+ dst_byte_stride, src_byte_stride,
+ height, upsidedown);
return;
}
}
--
2.16.4