320 lines
10 KiB
Diff
320 lines
10 KiB
Diff
From 84a4f134911336235862994fc3b651360c9ebb90 Mon Sep 17 00:00:00 2001
|
|
Date: Tue, 16 Apr 2019 12:02:43 +0300
|
|
Subject: [PATCH] Move the per-row loop into fbBlt_memcpy_vis for optimization
|
|
x11perf, bug113290
|
|
|
|
---
|
|
fb/fbblt.c | 263 ++++++++++++++++++++++++++++++++++++++++---------------------
|
|
1 file changed, 172 insertions(+), 91 deletions(-)
|
|
|
|
diff --git a/fb/fbblt.c b/fb/fbblt.c
|
|
index 45613bf..b7015e0 100644
|
|
--- a/fb/fbblt.c
|
|
+++ b/fb/fbblt.c
|
|
@@ -39,97 +39,201 @@
|
|
} \
|
|
}
|
|
|
|
-#if defined(__sparc_v9__)
|
|
-
|
|
-static inline void *
|
|
-fbBlt_memcpy_vis(void *dest, const void *sorc, int n)
|
|
+static inline void
|
|
+fbBlt_memcpy_vis(void *dest, const void *sorc, const int width_byte,
|
|
+ const int dst_byte_stride, const int src_byte_stride,
|
|
+ const int height, Bool upsidedown)
|
|
{
|
|
+ int k;
|
|
+
|
|
+#if defined(__sparc_v9__)
|
|
uint8_t *dst, *src, *ps;
|
|
int32_t i;
|
|
+ int n;
|
|
|
|
vis_d64 *addr_aligned;
|
|
vis_d64 data_hi, data_lo, data;
|
|
vis_d64 data_lo2, data2;
|
|
vis_d64 data_lo3, data3;
|
|
vis_d64 data_lo4, data4;
|
|
+#endif
|
|
|
|
- dst = (uint8_t *) dest;
|
|
- src = (uint8_t *) sorc;
|
|
+ if (!upsidedown) {
|
|
+#pragma loop count(1000)
|
|
+ for (k = 0; k < height; k++) {
|
|
+#if defined(__sparc_v9__)
|
|
+ n = width_byte;
|
|
|
|
- for (ps = src; ps < src + n; ps += 64) {
|
|
- __builtin_prefetch(ps, 0);
|
|
- }
|
|
+ dst = (uint8_t *) dest + k * dst_byte_stride;
|
|
+ src = (uint8_t *) sorc + k * src_byte_stride;
|
|
+
|
|
+ for (ps = src; ps < src + n; ps += 64) {
|
|
+ __builtin_prefetch(ps, 0);
|
|
+ }
|
|
|
|
#pragma loop count(1)
|
|
- while (n-- && (uintptr_t) dst & 7) {
|
|
- *dst++ = *src++;
|
|
- }
|
|
+ while (n-- && (uintptr_t) dst & 7) {
|
|
+ *dst++ = *src++;
|
|
+ }
|
|
|
|
- if (!((uintptr_t) src & 7)) {
|
|
+ if (!((uintptr_t) src & 7)) {
|
|
#pragma loop count(1000)
|
|
- while (n >= 64) {
|
|
- __builtin_prefetch(dst + 128, 1, 3);
|
|
- *(vis_d64 *) dst = *(vis_d64 *) src;
|
|
- *(vis_d64 *) (dst + 8) = *(vis_d64 *) (src + 8);
|
|
- *(vis_d64 *) (dst + 16) = *(vis_d64 *) (src + 16);
|
|
- *(vis_d64 *) (dst + 24) = *(vis_d64 *) (src + 24);
|
|
- *(vis_d64 *) (dst + 32) = *(vis_d64 *) (src + 32);
|
|
- *(vis_d64 *) (dst + 40) = *(vis_d64 *) (src + 40);
|
|
- *(vis_d64 *) (dst + 48) = *(vis_d64 *) (src + 48);
|
|
- *(vis_d64 *) (dst + 56) = *(vis_d64 *) (src + 56);
|
|
-
|
|
- dst += 64;
|
|
- src += 64;
|
|
- n -= 64;
|
|
+ while (n >= 64) {
|
|
+ __builtin_prefetch(dst + 128, 1, 3);
|
|
+ *(vis_d64 *) dst = *(vis_d64 *) src;
|
|
+ *(vis_d64 *) (dst + 8) = *(vis_d64 *) (src + 8);
|
|
+ *(vis_d64 *) (dst + 16) = *(vis_d64 *) (src + 16);
|
|
+ *(vis_d64 *) (dst + 24) = *(vis_d64 *) (src + 24);
|
|
+ *(vis_d64 *) (dst + 32) = *(vis_d64 *) (src + 32);
|
|
+ *(vis_d64 *) (dst + 40) = *(vis_d64 *) (src + 40);
|
|
+ *(vis_d64 *) (dst + 48) = *(vis_d64 *) (src + 48);
|
|
+ *(vis_d64 *) (dst + 56) = *(vis_d64 *) (src + 56);
|
|
+
|
|
+ dst += 64;
|
|
+ src += 64;
|
|
+ n -= 64;
|
|
|
|
- }
|
|
+ }
|
|
|
|
#pragma loop count(4)
|
|
- while (n >= 8) {
|
|
- *(vis_d64 *) dst = *(vis_d64 *) src;
|
|
- dst += 8;
|
|
- src += 8;
|
|
- n -= 8;
|
|
+ while (n >= 8) {
|
|
+ *(vis_d64 *) dst = *(vis_d64 *) src;
|
|
+ dst += 8;
|
|
+ src += 8;
|
|
+ n -= 8;
|
|
+ }
|
|
+ }
|
|
+ else {
|
|
+ i = 0;
|
|
+ addr_aligned = (vis_d64 *) vis_alignaddr(src, 0);
|
|
+ data_hi = addr_aligned[i];
|
|
+#pragma loop count(1000)
|
|
+ while (n >= 32) {
|
|
+ vis_prefetch_write((vis_d64 *) dst + 16);
|
|
+ data_lo = addr_aligned[i + 1];
|
|
+ data_lo2 = addr_aligned[i + 2];
|
|
+ data_lo3 = addr_aligned[i + 3];
|
|
+ data_lo4 = addr_aligned[i + 4];
|
|
+
|
|
+ data = vis_faligndata(data_hi, data_lo);
|
|
+ data2 = vis_faligndata(data_lo, data_lo2);
|
|
+ data3 = vis_faligndata(data_lo2, data_lo3);
|
|
+ data4 = vis_faligndata(data_lo3, data_lo4);
|
|
+
|
|
+ *(vis_d64 *) dst = data;
|
|
+ *(vis_d64 *) (dst + 8) = data2;
|
|
+ *(vis_d64 *) (dst + 16) = data3;
|
|
+ *(vis_d64 *) (dst + 24) = data4;
|
|
+
|
|
+ data_hi = data_lo4;
|
|
+
|
|
+ dst += 32;
|
|
+ src += 32;
|
|
+ n -= 32;
|
|
+ i += 4;
|
|
+ }
|
|
+ }
|
|
+
|
|
+#pragma loop count(16)
|
|
+ while (n-- >= 0) {
|
|
+ *dst++ = *src++;
|
|
+ }
|
|
+#else
|
|
+ MEMCPY_WRAPPED(dest + k * dst_byte_stride,
|
|
+ sorc + k * src_byte_stride,
|
|
+ width_byte);
|
|
+#endif
|
|
}
|
|
}
|
|
else {
|
|
- i = 0;
|
|
- addr_aligned = (vis_d64 *) vis_alignaddr(src, 0);
|
|
- data_hi = addr_aligned[i];
|
|
#pragma loop count(1000)
|
|
- while (n >= 32) {
|
|
- vis_prefetch_write((vis_d64 *) dst + 16);
|
|
- data_lo = addr_aligned[i + 1];
|
|
- data_lo2 = addr_aligned[i + 2];
|
|
- data_lo3 = addr_aligned[i + 3];
|
|
- data_lo4 = addr_aligned[i + 4];
|
|
-
|
|
- data = vis_faligndata(data_hi, data_lo);
|
|
- data2 = vis_faligndata(data_lo, data_lo2);
|
|
- data3 = vis_faligndata(data_lo2, data_lo3);
|
|
- data4 = vis_faligndata(data_lo3, data_lo4);
|
|
-
|
|
- *(vis_d64 *) dst = data;
|
|
- *(vis_d64 *) (dst + 8) = data2;
|
|
- *(vis_d64 *) (dst + 16) = data3;
|
|
- *(vis_d64 *) (dst + 24) = data4;
|
|
-
|
|
- data_hi = data_lo4;
|
|
-
|
|
- dst += 32;
|
|
- src += 32;
|
|
- n -= 32;
|
|
- i += 4;
|
|
- }
|
|
- }
|
|
+ for (k = height - 1; k >= 0; k--) {
|
|
+#if defined(__sparc_v9__)
|
|
+ n = width_byte;
|
|
+
|
|
+ dst = (uint8_t *) dest + k * dst_byte_stride;
|
|
+ src = (uint8_t *) sorc + k * src_byte_stride;
|
|
+
|
|
+ for (ps = src; ps < src + n; ps += 64) {
|
|
+ __builtin_prefetch(ps, 0);
|
|
+ }
|
|
+
|
|
+#pragma loop count(1)
|
|
+ while (n-- && (uintptr_t) dst & 7) {
|
|
+ *dst++ = *src++;
|
|
+ }
|
|
+
|
|
+ if (!((uintptr_t) src & 7)) {
|
|
+#pragma loop count(1000)
|
|
+ while (n >= 64) {
|
|
+ __builtin_prefetch(dst + 128, 1, 3);
|
|
+ *(vis_d64 *) dst = *(vis_d64 *) src;
|
|
+ *(vis_d64 *) (dst + 8) = *(vis_d64 *) (src + 8);
|
|
+ *(vis_d64 *) (dst + 16) = *(vis_d64 *) (src + 16);
|
|
+ *(vis_d64 *) (dst + 24) = *(vis_d64 *) (src + 24);
|
|
+ *(vis_d64 *) (dst + 32) = *(vis_d64 *) (src + 32);
|
|
+ *(vis_d64 *) (dst + 40) = *(vis_d64 *) (src + 40);
|
|
+ *(vis_d64 *) (dst + 48) = *(vis_d64 *) (src + 48);
|
|
+ *(vis_d64 *) (dst + 56) = *(vis_d64 *) (src + 56);
|
|
+
|
|
+ dst += 64;
|
|
+ src += 64;
|
|
+ n -= 64;
|
|
+
|
|
+ }
|
|
+
|
|
+#pragma loop count(4)
|
|
+ while (n >= 8) {
|
|
+ *(vis_d64 *) dst = *(vis_d64 *) src;
|
|
+ dst += 8;
|
|
+ src += 8;
|
|
+ n -= 8;
|
|
+ }
|
|
+ }
|
|
+ else {
|
|
+ i = 0;
|
|
+ addr_aligned = (vis_d64 *) vis_alignaddr(src, 0);
|
|
+ data_hi = addr_aligned[i];
|
|
+#pragma loop count(1000)
|
|
+ while (n >= 32) {
|
|
+ vis_prefetch_write((vis_d64 *) dst + 16);
|
|
+ data_lo = addr_aligned[i + 1];
|
|
+ data_lo2 = addr_aligned[i + 2];
|
|
+ data_lo3 = addr_aligned[i + 3];
|
|
+ data_lo4 = addr_aligned[i + 4];
|
|
+
|
|
+ data = vis_faligndata(data_hi, data_lo);
|
|
+ data2 = vis_faligndata(data_lo, data_lo2);
|
|
+ data3 = vis_faligndata(data_lo2, data_lo3);
|
|
+ data4 = vis_faligndata(data_lo3, data_lo4);
|
|
+
|
|
+ *(vis_d64 *) dst = data;
|
|
+ *(vis_d64 *) (dst + 8) = data2;
|
|
+ *(vis_d64 *) (dst + 16) = data3;
|
|
+ *(vis_d64 *) (dst + 24) = data4;
|
|
+
|
|
+ data_hi = data_lo4;
|
|
+
|
|
+ dst += 32;
|
|
+ src += 32;
|
|
+ n -= 32;
|
|
+ i += 4;
|
|
+ }
|
|
+ }
|
|
|
|
#pragma loop count(16)
|
|
- while (n-- >= 0) {
|
|
- *dst++ = *src++;
|
|
+ while (n-- >= 0) {
|
|
+ *dst++ = *src++;
|
|
+ }
|
|
+#else
|
|
+ MEMCPY_WRAPPED(dest + k * dst_byte_stride,
|
|
+ sorc + k * src_byte_stride,
|
|
+ width_byte);
|
|
+#endif
|
|
+ }
|
|
}
|
|
- return dest;
|
|
}
|
|
-#endif // __sparc_v9__
|
|
+
|
|
|
|
void
|
|
fbBlt(FbBits * srcLine,
|
|
@@ -167,32 +271,9 @@ fbBlt(FbBits * srcLine,
|
|
if (src_byte + width_byte <= dst_byte ||
|
|
dst_byte + width_byte <= src_byte)
|
|
{
|
|
- int i;
|
|
-
|
|
- if (!upsidedown)
|
|
- #pragma loop count(1000)
|
|
- for (i = 0; i < height; i++)
|
|
-#if defined(__sparc_v9__)
|
|
- fbBlt_memcpy_vis(dst_byte + i * dst_byte_stride,
|
|
- src_byte + i * src_byte_stride,
|
|
- width_byte);
|
|
-#else
|
|
- MEMCPY_WRAPPED(dst_byte + i * dst_byte_stride,
|
|
- src_byte + i * src_byte_stride,
|
|
- width_byte);
|
|
-#endif
|
|
- else
|
|
- #pragma loop count(1000)
|
|
- for (i = height - 1; i >= 0; i--)
|
|
-#if defined(__sparc_v9__)
|
|
- fbBlt_memcpy_vis(dst_byte + i * dst_byte_stride,
|
|
- src_byte + i * src_byte_stride,
|
|
- width_byte);
|
|
-#else
|
|
- MEMCPY_WRAPPED(dst_byte + i * dst_byte_stride,
|
|
- src_byte + i * src_byte_stride,
|
|
- width_byte);
|
|
-#endif
|
|
+ fbBlt_memcpy_vis(dst_byte, src_byte, width_byte,
|
|
+ dst_byte_stride, src_byte_stride,
|
|
+ height, upsidedown);
|
|
return;
|
|
}
|
|
}
|
|
--
|
|
2.16.4
|
|
|