[smooth] Remove SSE2.
As a result of 7b308a29dd, the regular 64-bit execution is now faster
than SSE2. The rendering speed of script fonts at 64 ppem or larger is
improved by about 3% without SSE2. See !314 for the testing results.
* src/smooth/ftgrays.c (gray_render_conic)[FT_INT64]: Remove SSE2 code.
This commit is contained in:
parent
6d9147263b
commit
5761778246
@ -997,49 +997,12 @@ typedef ptrdiff_t FT_PtrDist;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Benchmarking shows that using DDA to flatten the quadratic Bézier arcs
|
||||
* is slightly faster in the following cases:
|
||||
*
|
||||
* - When the host CPU is 64-bit.
|
||||
* - When SSE2 SIMD registers and instructions are available (even on
|
||||
* x86).
|
||||
*
|
||||
* For other cases, using binary splits is actually slightly faster.
|
||||
* For now, the code that uses DDA to render conic curves requires
|
||||
* `FT_Int64` to be defined. See for example
|
||||
* https://gitlab.freedesktop.org/freetype/freetype/-/issues/1071.
|
||||
*/
|
||||
#if ( defined( __SSE2__ ) || \
|
||||
defined( __x86_64__ ) || \
|
||||
defined( _M_AMD64 ) || \
|
||||
( defined( _M_IX86_FP ) && _M_IX86_FP >= 2 ) ) && \
|
||||
!defined( __VMS )
|
||||
# define FT_SSE2 1
|
||||
#else
|
||||
# define FT_SSE2 0
|
||||
#endif
|
||||
|
||||
#if FT_SSE2 || \
|
||||
defined( __aarch64__ ) || \
|
||||
defined( _M_ARM64 )
|
||||
# define BEZIER_USE_DDA 1
|
||||
#else
|
||||
# define BEZIER_USE_DDA 0
|
||||
#endif
|
||||
|
||||
/*
|
||||
* For now, the code that depends on `BEZIER_USE_DDA` requires `FT_Int64`
|
||||
* to be defined. If `FT_INT64` is not defined, meaning there is no
|
||||
* 64-bit type available, disable it to avoid compilation errors. See for
|
||||
* example https://gitlab.freedesktop.org/freetype/freetype/-/issues/1071.
|
||||
*/
|
||||
#if !defined( FT_INT64 )
|
||||
# undef BEZIER_USE_DDA
|
||||
# define BEZIER_USE_DDA 0
|
||||
#endif
|
||||
|
||||
#if BEZIER_USE_DDA
|
||||
|
||||
#if FT_SSE2
|
||||
# include <emmintrin.h>
|
||||
#endif
|
||||
#ifdef FT_INT64
|
||||
|
||||
#define LEFT_SHIFT( a, b ) (FT_Int64)( (FT_UInt64)(a) << (b) )
|
||||
|
||||
@ -1151,61 +1114,6 @@ typedef ptrdiff_t FT_PtrDist;
|
||||
* = (B << (33 - N)) + (A << (32 - 2*N))
|
||||
*/
|
||||
|
||||
#if FT_SSE2
|
||||
/* Experience shows that for small counts, SSE2 is actually slower. */
|
||||
if ( count > 4 )
|
||||
{
|
||||
union
|
||||
{
|
||||
struct { FT_Int64 ax, ay, bx, by; } i;
|
||||
struct { __m128i a, b; } vec;
|
||||
|
||||
} u;
|
||||
|
||||
union
|
||||
{
|
||||
struct { FT_Int32 px_lo, px_hi, py_lo, py_hi; } i;
|
||||
__m128i vec;
|
||||
|
||||
} v;
|
||||
|
||||
__m128i p, q, r;
|
||||
|
||||
|
||||
u.i.ax = ax;
|
||||
u.i.ay = ay;
|
||||
u.i.bx = bx;
|
||||
u.i.by = by;
|
||||
|
||||
q = _mm_load_si128( &u.vec.b );
|
||||
r = _mm_load_si128( &u.vec.a );
|
||||
|
||||
q = _mm_slli_epi64( q, shift + 17);
|
||||
r = _mm_slli_epi64( r, shift + shift );
|
||||
q = _mm_add_epi64( q, r );
|
||||
r = _mm_add_epi64( r, r );
|
||||
|
||||
v.i.px_lo = 0;
|
||||
v.i.px_hi = p0.x;
|
||||
v.i.py_lo = 0;
|
||||
v.i.py_hi = p0.y;
|
||||
|
||||
p = _mm_load_si128( &v.vec );
|
||||
|
||||
do
|
||||
{
|
||||
p = _mm_add_epi64( p, q );
|
||||
q = _mm_add_epi64( q, r );
|
||||
|
||||
_mm_store_si128( &v.vec, p );
|
||||
|
||||
gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi );
|
||||
} while ( --count );
|
||||
|
||||
return;
|
||||
}
|
||||
#endif /* FT_SSE2 */
|
||||
|
||||
rx = LEFT_SHIFT( ax, shift + shift );
|
||||
ry = LEFT_SHIFT( ay, shift + shift );
|
||||
|
||||
@ -1230,7 +1138,7 @@ typedef ptrdiff_t FT_PtrDist;
|
||||
} while ( --count );
|
||||
}
|
||||
|
||||
#else /* !BEZIER_USE_DDA */
|
||||
#else /* !FT_INT64 */
|
||||
|
||||
/*
|
||||
* Note that multiple attempts to speed up the function below
|
||||
@ -1324,7 +1232,7 @@ typedef ptrdiff_t FT_PtrDist;
|
||||
} while ( --draw );
|
||||
}
|
||||
|
||||
#endif /* !BEZIER_USE_DDA */
|
||||
#endif /* !FT_INT64 */
|
||||
|
||||
|
||||
/*
|
||||
|
Loading…
Reference in New Issue
Block a user