* src/smooth/ftgrays.c (gray_render_conic) [SSE2]: Improve flow.
This commit is contained in:
parent
e0e5b838e5
commit
7b308a29dd
@ -1095,16 +1095,17 @@ typedef ptrdiff_t FT_PtrDist;
|
||||
return;
|
||||
}
|
||||
|
||||
/* We can calculate the number of necessary bisections because */
|
||||
/* We can calculate the number of necessary segments because */
|
||||
/* each bisection predictably reduces deviation exactly 4-fold. */
|
||||
/* Even 32-bit deviation would vanish after 16 bisections. */
|
||||
shift = 0;
|
||||
shift = 16;
|
||||
do
|
||||
{
|
||||
dx >>= 2;
|
||||
shift += 1;
|
||||
dx >>= 2;
|
||||
shift--;
|
||||
|
||||
} while ( dx > ONE_PIXEL / 4 );
|
||||
count = 0x10000U >> shift;
|
||||
|
||||
/*
|
||||
* The (P0,P1,P2) arc equation, for t in [0,1] range:
|
||||
@ -1151,9 +1152,8 @@ typedef ptrdiff_t FT_PtrDist;
|
||||
*/
|
||||
|
||||
#if FT_SSE2
|
||||
/* Experience shows that for small shift values, */
|
||||
/* SSE2 is actually slower. */
|
||||
if ( shift > 2 )
|
||||
/* Experience shows that for small counts, SSE2 is actually slower. */
|
||||
if ( count > 4 )
|
||||
{
|
||||
union
|
||||
{
|
||||
@ -1169,9 +1169,7 @@ typedef ptrdiff_t FT_PtrDist;
|
||||
|
||||
} v;
|
||||
|
||||
__m128i a, b;
|
||||
__m128i r, q, q2;
|
||||
__m128i p;
|
||||
__m128i p, q, r;
|
||||
|
||||
|
||||
u.i.ax = ax;
|
||||
@ -1179,14 +1177,13 @@ typedef ptrdiff_t FT_PtrDist;
|
||||
u.i.bx = bx;
|
||||
u.i.by = by;
|
||||
|
||||
a = _mm_load_si128( &u.vec.a );
|
||||
b = _mm_load_si128( &u.vec.b );
|
||||
q = _mm_load_si128( &u.vec.b );
|
||||
r = _mm_load_si128( &u.vec.a );
|
||||
|
||||
r = _mm_slli_epi64( a, 33 - 2 * shift );
|
||||
q = _mm_slli_epi64( b, 33 - shift );
|
||||
q2 = _mm_slli_epi64( a, 32 - 2 * shift );
|
||||
|
||||
q = _mm_add_epi64( q2, q );
|
||||
q = _mm_slli_epi64( q, shift + 17);
|
||||
r = _mm_slli_epi64( r, shift + shift );
|
||||
q = _mm_add_epi64( q, r );
|
||||
r = _mm_add_epi64( r, r );
|
||||
|
||||
v.i.px_lo = 0;
|
||||
v.i.px_hi = p0.x;
|
||||
@ -1195,7 +1192,7 @@ typedef ptrdiff_t FT_PtrDist;
|
||||
|
||||
p = _mm_load_si128( &v.vec );
|
||||
|
||||
for ( count = 1U << shift; count > 0; count-- )
|
||||
do
|
||||
{
|
||||
p = _mm_add_epi64( p, q );
|
||||
q = _mm_add_epi64( q, r );
|
||||
@ -1203,22 +1200,25 @@ typedef ptrdiff_t FT_PtrDist;
|
||||
_mm_store_si128( &v.vec, p );
|
||||
|
||||
gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi );
|
||||
}
|
||||
} while ( --count );
|
||||
|
||||
return;
|
||||
}
|
||||
#endif /* FT_SSE2 */
|
||||
|
||||
rx = LEFT_SHIFT( ax, 33 - 2 * shift );
|
||||
ry = LEFT_SHIFT( ay, 33 - 2 * shift );
|
||||
rx = LEFT_SHIFT( ax, shift + shift );
|
||||
ry = LEFT_SHIFT( ay, shift + shift );
|
||||
|
||||
qx = LEFT_SHIFT( bx, 33 - shift ) + LEFT_SHIFT( ax, 32 - 2 * shift );
|
||||
qy = LEFT_SHIFT( by, 33 - shift ) + LEFT_SHIFT( ay, 32 - 2 * shift );
|
||||
qx = LEFT_SHIFT( bx, shift + 17 ) + rx;
|
||||
qy = LEFT_SHIFT( by, shift + 17 ) + ry;
|
||||
|
||||
rx *= 2;
|
||||
ry *= 2;
|
||||
|
||||
px = LEFT_SHIFT( p0.x, 32 );
|
||||
py = LEFT_SHIFT( p0.y, 32 );
|
||||
|
||||
for ( count = 1U << shift; count > 0; count-- )
|
||||
do
|
||||
{
|
||||
px += qx;
|
||||
py += qy;
|
||||
@ -1227,7 +1227,7 @@ typedef ptrdiff_t FT_PtrDist;
|
||||
|
||||
gray_render_line( RAS_VAR_ (FT_Pos)( px >> 32 ),
|
||||
(FT_Pos)( py >> 32 ) );
|
||||
}
|
||||
} while ( --count );
|
||||
}
|
||||
|
||||
#else /* !BEZIER_USE_DDA */
|
||||
|
Loading…
Reference in New Issue
Block a user