Rewrite FLAC__bitwriter_write_rice_signed_block for 32 bit words

Using a very different approach: instead of trying to write a full 32-bit bitwriter word, a 64-bit buffer is filled, of which the first 32 bits are copied to the bitwriter as soon as they are filled. This way, only very large symbols have to be splitted. This approach is considerably simpler, and the fast path is taken much more often.
2023-02-18 19:18:23 +01:00 · 2023-02-18 19:18:23 +01:00 · b698ed45f3
commit b698ed45f3
parent a336ea5264
1 changed files with 117 additions and 1 deletions
--- a/src/libFLAC/bitwriter.c
+++ b/src/libFLAC/bitwriter.c
@ -53,8 +53,10 @@
 #if (ENABLE_64_BIT_WORDS == 0)

 typedef FLAC__uint32 bwword;
+typedef FLAC__uint64 FLAC__bwtemp;
 #define FLAC__BYTES_PER_WORD 4		/* sizeof bwword */
 #define FLAC__BITS_PER_WORD 32
+#define FLAC__TEMP_BITS 64
 /* SWAP_BE_WORD_TO_HOST swaps bytes in a bwword (which is always big-endian) if necessary to match host byte order */
 #if WORDS_BIGENDIAN
 #define SWAP_BE_WORD_TO_HOST(x) (x)
@ -533,14 +535,44 @@ FLAC__bool FLAC__bitwriter_write_rice_signed(FLAC__BitWriter *bw, FLAC__int32 va
 }
 #endif /* UNUSED */

+#if (ENABLE_64_BIT_WORDS == 0)
+
+#define EMPTY_WIDE_ACCUM_TO_BW {  \
+	bw->buffer[bw->words++] = 0;  \
+	bitpointer += FLAC__BITS_PER_WORD;  \
+}
+
+#if WORDS_BIGENDIAN
+#define WIDE_ACCUM_TO_BW {  \
+	bw->accum = *((FLAC__int32 *)&wide_accum);  \
+	bw->buffer[bw->words++] = SWAP_BE_WORD_TO_HOST(bw->accum); \
+	wide_accum <<= FLAC__BITS_PER_WORD;  \
+	bitpointer += FLAC__BITS_PER_WORD;  \
+}
+#else
+#define WIDE_ACCUM_TO_BW {  \
+	bw->accum = *(((FLAC__int32 *)&wide_accum)+1); \
+	bw->buffer[bw->words++] = SWAP_BE_WORD_TO_HOST(bw->accum); \
+	wide_accum <<= FLAC__BITS_PER_WORD;  \
+	bitpointer += FLAC__BITS_PER_WORD;  \
+}
+#endif
+
+#endif
+
 FLAC__bool FLAC__bitwriter_write_rice_signed_block(FLAC__BitWriter *bw, const FLAC__int32 *vals, uint32_t nvals, uint32_t parameter)
 {
 	const FLAC__uint32 mask1 = (FLAC__uint32)0xffffffff << parameter; /* we val|=mask1 to set the stop bit above it... */
 	const FLAC__uint32 mask2 = (FLAC__uint32)0xffffffff >> (31-parameter); /* ...then mask off the bits above the stop bit with val&=mask2 */
 	FLAC__uint32 uval;
-	uint32_t left;
 	const uint32_t lsbits = 1 + parameter;
 	uint32_t msbits, total_bits;
+#if (ENABLE_64_BIT_WORDS == 0)
+	FLAC__bwtemp wide_accum = 0;
+	FLAC__uint32 bitpointer = FLAC__TEMP_BITS;
+#else
+	uint32_t left;
+#endif

 	FLAC__ASSERT(0 != bw);
 	FLAC__ASSERT(0 != bw->buffer);
@ -548,6 +580,20 @@ FLAC__bool FLAC__bitwriter_write_rice_signed_block(FLAC__BitWriter *bw, const FL
 	/* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
 	FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);

+#if (ENABLE_64_BIT_WORDS == 0)
+	if(bw->bits > 0) {
+		bitpointer -= bw->bits;
+		wide_accum = (FLAC__bwtemp)(bw->accum) << bitpointer;
+	}
+
+	{
+		FLAC__uint32 capacity_needed = (bw->words + 2 * nvals) * FLAC__BITS_PER_WORD + bw->bits;
+		/* Reserve two words per symbol, add space only when very large symbols are encountered */
+		if(bw->capacity * FLAC__BITS_PER_WORD <= capacity_needed && !bitwriter_grow_(bw, capacity_needed - bw->capacity * FLAC__BITS_PER_WORD))
+			return false;
+	}
+#endif
+
 	while(nvals) {
 		/* fold signed to uint32_t; actual formula is: negative(v)? -2v-1 : 2v */
 		uval = *vals;
@ -557,6 +603,75 @@ FLAC__bool FLAC__bitwriter_write_rice_signed_block(FLAC__BitWriter *bw, const FL
 		msbits = uval >> parameter;
 		total_bits = lsbits + msbits;

+		uval |= mask1; /* set stop bit */
+		uval &= mask2; /* mask off unused top bits */
+
+
+#if (ENABLE_64_BIT_WORDS == 0)
+		if(total_bits <= bitpointer) {
+			/* There is room enough to store the symbol whole at once */
+			wide_accum |= (FLAC__bwtemp)(uval) << (bitpointer - total_bits);
+			bitpointer -= total_bits;
+			if(bitpointer <= FLAC__BITS_PER_WORD) {
+				/* A word is finished, copy the upper 32 bits of the wide_accum */
+				WIDE_ACCUM_TO_BW
+			}
+		}
+		else {
+			/* The symbol needs to be split. This code isn't used often */
+			/* First check for space in the bitwriter */
+			if(total_bits > (2 * FLAC__BITS_PER_WORD)) {
+				FLAC__uint32 oversize_in_bits = total_bits - 2 * FLAC__BITS_PER_WORD;
+				FLAC__uint32 capacity_needed = (bw->words + 2 * nvals) * FLAC__BITS_PER_WORD + bw->bits  + oversize_in_bits;
+				if(bw->capacity * FLAC__BITS_PER_WORD <= capacity_needed && !bitwriter_grow_(bw, capacity_needed * FLAC__BITS_PER_WORD - bw->capacity))
+					return false;
+			}
+			if(msbits > bitpointer) {
+				/* Conveniently empty wide_accum */
+				msbits -= bitpointer - FLAC__BITS_PER_WORD;
+				bitpointer = FLAC__BITS_PER_WORD;
+				WIDE_ACCUM_TO_BW
+				while(msbits > bitpointer) {
+					/* As the accumulator is already zero, we only need to
+					 * assign zeroes to the bitbuffer */
+					EMPTY_WIDE_ACCUM_TO_BW
+					bitpointer -= FLAC__BITS_PER_WORD;
+					msbits -= FLAC__BITS_PER_WORD;
+				}
+				/* The remaining bits are zero, and the accumulator already is zero,
+				 * so just subtract the number of bits from bitpointer. When storing,
+				 * we can also just store 0 */
+				bitpointer -= msbits;
+				if(bitpointer <= FLAC__BITS_PER_WORD)
+					EMPTY_WIDE_ACCUM_TO_BW
+			}
+			else {
+				bitpointer -= msbits;
+				if(bitpointer <= FLAC__BITS_PER_WORD)
+					WIDE_ACCUM_TO_BW
+			}
+			/* The lsbs + stop bit always fit 32 bit, so this code mirrors the code above */
+                        wide_accum |= (FLAC__bwtemp)(uval) << (bitpointer - lsbits);
+                        bitpointer -= lsbits;
+                        if(bitpointer <= FLAC__BITS_PER_WORD) {
+                                /* A word is finished, copy the upper 32 bits of the wide_accum */
+                                WIDE_ACCUM_TO_BW
+                        }
+		}
+		vals++;
+		nvals--;
+	}
+	/* Now fixup remainer of wide_accum */
+	if(bitpointer < FLAC__TEMP_BITS) {
+		bw->accum = wide_accum >> bitpointer;
+		bw->bits = FLAC__TEMP_BITS - bitpointer;
+	}
+	else {
+		bw->accum = 0;
+		bw->bits = 0;
+	}
+#else
+
 		if(bw->bits && bw->bits + total_bits < FLAC__BITS_PER_WORD) { /* i.e. if the whole thing fits in the current bwword */
 			/* ^^^ if bw->bits is 0 then we may have filled the buffer and have no free bwword to work in */
 			bw->bits += total_bits;
@ -624,6 +739,7 @@ break1:
 		vals++;
 		nvals--;
 	}
+#endif
 	return true;
 }