Rewrite FLAC__bitwriter_write_rice_signed_block for 32 bit words

Using a very different approach: instead of trying to write a full
32-bit bitwriter word, a 64-bit buffer is filled, of which the
first 32 bits are copied to the bitwriter as soon as they are
filled. This way, only very large symbols have to be splitted.

This approach is considerably simpler, and the fast path is taken
much more often.
This commit is contained in:
Martijn van Beurden 2023-02-18 19:18:23 +01:00
parent a336ea5264
commit b698ed45f3
1 changed files with 117 additions and 1 deletions

View File

@ -53,8 +53,10 @@
#if (ENABLE_64_BIT_WORDS == 0)
typedef FLAC__uint32 bwword;
typedef FLAC__uint64 FLAC__bwtemp;
#define FLAC__BYTES_PER_WORD 4 /* sizeof bwword */
#define FLAC__BITS_PER_WORD 32
#define FLAC__TEMP_BITS 64
/* SWAP_BE_WORD_TO_HOST swaps bytes in a bwword (which is always big-endian) if necessary to match host byte order */
#if WORDS_BIGENDIAN
#define SWAP_BE_WORD_TO_HOST(x) (x)
@ -533,14 +535,44 @@ FLAC__bool FLAC__bitwriter_write_rice_signed(FLAC__BitWriter *bw, FLAC__int32 va
}
#endif /* UNUSED */
#if (ENABLE_64_BIT_WORDS == 0)
#define EMPTY_WIDE_ACCUM_TO_BW { \
bw->buffer[bw->words++] = 0; \
bitpointer += FLAC__BITS_PER_WORD; \
}
#if WORDS_BIGENDIAN
#define WIDE_ACCUM_TO_BW { \
bw->accum = *((FLAC__int32 *)&wide_accum); \
bw->buffer[bw->words++] = SWAP_BE_WORD_TO_HOST(bw->accum); \
wide_accum <<= FLAC__BITS_PER_WORD; \
bitpointer += FLAC__BITS_PER_WORD; \
}
#else
#define WIDE_ACCUM_TO_BW { \
bw->accum = *(((FLAC__int32 *)&wide_accum)+1); \
bw->buffer[bw->words++] = SWAP_BE_WORD_TO_HOST(bw->accum); \
wide_accum <<= FLAC__BITS_PER_WORD; \
bitpointer += FLAC__BITS_PER_WORD; \
}
#endif
#endif
FLAC__bool FLAC__bitwriter_write_rice_signed_block(FLAC__BitWriter *bw, const FLAC__int32 *vals, uint32_t nvals, uint32_t parameter)
{
const FLAC__uint32 mask1 = (FLAC__uint32)0xffffffff << parameter; /* we val|=mask1 to set the stop bit above it... */
const FLAC__uint32 mask2 = (FLAC__uint32)0xffffffff >> (31-parameter); /* ...then mask off the bits above the stop bit with val&=mask2 */
FLAC__uint32 uval;
uint32_t left;
const uint32_t lsbits = 1 + parameter;
uint32_t msbits, total_bits;
#if (ENABLE_64_BIT_WORDS == 0)
FLAC__bwtemp wide_accum = 0;
FLAC__uint32 bitpointer = FLAC__TEMP_BITS;
#else
uint32_t left;
#endif
FLAC__ASSERT(0 != bw);
FLAC__ASSERT(0 != bw->buffer);
@ -548,6 +580,20 @@ FLAC__bool FLAC__bitwriter_write_rice_signed_block(FLAC__BitWriter *bw, const FL
/* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
#if (ENABLE_64_BIT_WORDS == 0)
if(bw->bits > 0) {
bitpointer -= bw->bits;
wide_accum = (FLAC__bwtemp)(bw->accum) << bitpointer;
}
{
FLAC__uint32 capacity_needed = (bw->words + 2 * nvals) * FLAC__BITS_PER_WORD + bw->bits;
/* Reserve two words per symbol, add space only when very large symbols are encountered */
if(bw->capacity * FLAC__BITS_PER_WORD <= capacity_needed && !bitwriter_grow_(bw, capacity_needed - bw->capacity * FLAC__BITS_PER_WORD))
return false;
}
#endif
while(nvals) {
/* fold signed to uint32_t; actual formula is: negative(v)? -2v-1 : 2v */
uval = *vals;
@ -557,6 +603,75 @@ FLAC__bool FLAC__bitwriter_write_rice_signed_block(FLAC__BitWriter *bw, const FL
msbits = uval >> parameter;
total_bits = lsbits + msbits;
uval |= mask1; /* set stop bit */
uval &= mask2; /* mask off unused top bits */
#if (ENABLE_64_BIT_WORDS == 0)
if(total_bits <= bitpointer) {
/* There is room enough to store the symbol whole at once */
wide_accum |= (FLAC__bwtemp)(uval) << (bitpointer - total_bits);
bitpointer -= total_bits;
if(bitpointer <= FLAC__BITS_PER_WORD) {
/* A word is finished, copy the upper 32 bits of the wide_accum */
WIDE_ACCUM_TO_BW
}
}
else {
/* The symbol needs to be split. This code isn't used often */
/* First check for space in the bitwriter */
if(total_bits > (2 * FLAC__BITS_PER_WORD)) {
FLAC__uint32 oversize_in_bits = total_bits - 2 * FLAC__BITS_PER_WORD;
FLAC__uint32 capacity_needed = (bw->words + 2 * nvals) * FLAC__BITS_PER_WORD + bw->bits + oversize_in_bits;
if(bw->capacity * FLAC__BITS_PER_WORD <= capacity_needed && !bitwriter_grow_(bw, capacity_needed * FLAC__BITS_PER_WORD - bw->capacity))
return false;
}
if(msbits > bitpointer) {
/* Conveniently empty wide_accum */
msbits -= bitpointer - FLAC__BITS_PER_WORD;
bitpointer = FLAC__BITS_PER_WORD;
WIDE_ACCUM_TO_BW
while(msbits > bitpointer) {
/* As the accumulator is already zero, we only need to
* assign zeroes to the bitbuffer */
EMPTY_WIDE_ACCUM_TO_BW
bitpointer -= FLAC__BITS_PER_WORD;
msbits -= FLAC__BITS_PER_WORD;
}
/* The remaining bits are zero, and the accumulator already is zero,
* so just subtract the number of bits from bitpointer. When storing,
* we can also just store 0 */
bitpointer -= msbits;
if(bitpointer <= FLAC__BITS_PER_WORD)
EMPTY_WIDE_ACCUM_TO_BW
}
else {
bitpointer -= msbits;
if(bitpointer <= FLAC__BITS_PER_WORD)
WIDE_ACCUM_TO_BW
}
/* The lsbs + stop bit always fit 32 bit, so this code mirrors the code above */
wide_accum |= (FLAC__bwtemp)(uval) << (bitpointer - lsbits);
bitpointer -= lsbits;
if(bitpointer <= FLAC__BITS_PER_WORD) {
/* A word is finished, copy the upper 32 bits of the wide_accum */
WIDE_ACCUM_TO_BW
}
}
vals++;
nvals--;
}
/* Now fixup remainer of wide_accum */
if(bitpointer < FLAC__TEMP_BITS) {
bw->accum = wide_accum >> bitpointer;
bw->bits = FLAC__TEMP_BITS - bitpointer;
}
else {
bw->accum = 0;
bw->bits = 0;
}
#else
if(bw->bits && bw->bits + total_bits < FLAC__BITS_PER_WORD) { /* i.e. if the whole thing fits in the current bwword */
/* ^^^ if bw->bits is 0 then we may have filled the buffer and have no free bwword to work in */
bw->bits += total_bits;
@ -624,6 +739,7 @@ break1:
vals++;
nvals--;
}
#endif
return true;
}