Add use of x86 BMI2 to bitreader_read_rice_signed_block

This commit is contained in:
Martijn van Beurden 2023-03-08 08:42:02 +01:00
parent 75d596a234
commit 00cb41ee03
8 changed files with 175 additions and 144 deletions

View File

@ -36,7 +36,7 @@
* Unpublished debug routines from libFLAC. This should not be used from any
* client code other than code shipped with the FLAC sources.
*/
FLAC_API FLAC__bool FLAC__stream_encoder_disable_instruction_set(FLAC__StreamEncoder *encoder, FLAC__bool value);
FLAC_API FLAC__bool FLAC__stream_encoder_disable_instruction_set(FLAC__StreamEncoder *encoder, int value);
FLAC_API FLAC__bool FLAC__stream_encoder_disable_constant_subframes(FLAC__StreamEncoder *encoder, FLAC__bool value);
FLAC_API FLAC__bool FLAC__stream_encoder_disable_fixed_subframes(FLAC__StreamEncoder *encoder, FLAC__bool value);
FLAC_API FLAC__bool FLAC__stream_encoder_disable_verbatim_subframes(FLAC__StreamEncoder *encoder, FLAC__bool value);

View File

@ -56,6 +56,7 @@ EXTRA_DIST = \
CMakeLists.txt \
flac.pc.in \
libFLAC.m4 \
deduplication/bitreader_read_rice_signed_block.c \
deduplication/lpc_compute_autocorrelation_intrin.c \
deduplication/lpc_compute_autocorrelation_intrin_sse2.c \
deduplication/lpc_compute_autocorrelation_intrin_neon.c

View File

@ -39,6 +39,7 @@
#include "private/bitmath.h"
#include "private/bitreader.h"
#include "private/crc.h"
#include "private/cpu.h"
#include "private/macros.h"
#include "FLAC/assert.h"
#include "share/compat.h"
@ -831,149 +832,13 @@ FLAC__bool FLAC__bitreader_read_rice_signed(FLAC__BitReader *br, int *val, uint3
/* this is by far the most heavily used reader call. it ain't pretty but it's fast */
/* Decodes a run of Rice-coded signed values from `br` into `vals`.
 *
 * br        - bit reader to consume from; br->buffer must be non-NULL
 * vals      - output array
 * nvals     - number of values requested
 * parameter - Rice parameter (number of binary LSBs per symbol), must be < 32
 *
 * Returns true when the requested values were stored, false when one of the
 * bitreader helpers fails or when the unary part of a symbol exceeds `limit`
 * in the fast path (in that case the local read position is not written back
 * to *br).
 *
 * NOTE(review): when the reader starts with consumed_words >= words, the
 * jump to process_tail decodes one symbol before `val < end` is tested, so
 * a call with nvals == 0 appears to consume bits and write vals[0] -
 * confirm whether callers can pass 0. */
FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], uint32_t nvals, uint32_t parameter)
{
/* try and get br->consumed_words and br->consumed_bits into register;
* must remember to flush them back to *br before calling other
* bitreader functions that use them, and before returning */
uint32_t cwords, words, lsbs, msbs, x, y, limit;
uint32_t ucbits; /* keep track of the number of unconsumed bits in word */
brword b;
int *val, *end;
/* NOTE(review): this #include presumably pulls in a complete compound
 * statement with the same logic as the inline body around it; it sits
 * between the local declarations and the asserts here - confirm which of
 * the two is meant to remain */
#include "deduplication/bitreader_read_rice_signed_block.c"
FLAC__ASSERT(0 != br);
FLAC__ASSERT(0 != br->buffer);
/* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
FLAC__ASSERT(parameter < 32);
/* the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it */
limit = UINT32_MAX >> parameter; /* Maximal msbs that can occur with residual bounded to int32_t */
val = vals;
end = vals + nvals;
/* parameter == 0: symbols have no binary part, each one is a plain unary number */
if(parameter == 0) {
while(val < end) {
/* read the unary MSBs and end bit */
if(!FLAC__bitreader_read_unary_unsigned(br, &msbs))
return false;
/* Checking limit here would be overzealous: coding UINT32_MAX
* with parameter == 0 would take 4GiB */
/* unfold to signed: even msbs -> msbs/2, odd msbs -> -(msbs+1)/2 */
*val++ = (int)(msbs >> 1) ^ -(int)(msbs & 1);
}
return true;
}
FLAC__ASSERT(parameter > 0);
cwords = br->consumed_words;
words = br->words;
/* if we've not consumed up to a partial tail word... */
/* (the branch below is the opposite case: cwords has already reached
 * br->words, so decoding starts directly in the helper-based tail path) */
if(cwords >= words) {
x = 0;
goto process_tail;
}
ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
b = br->buffer[cwords] << br->consumed_bits; /* keep unconsumed bits aligned to left */
/* fast path: decode from the local copies b / ucbits / cwords */
while(val < end) {
/* read the unary MSBs and end bit */
x = y = COUNT_ZERO_MSBS2(b);
if(x == FLAC__BITS_PER_WORD) {
/* no stop bit in the rest of this word: all ucbits remaining bits
 * count as zeros, keep counting in the following words */
x = ucbits;
do {
/* didn't find stop bit yet, have to keep going... */
cwords++;
if (cwords >= words)
goto incomplete_msbs;
b = br->buffer[cwords];
y = COUNT_ZERO_MSBS2(b);
x += y;
} while(y == FLAC__BITS_PER_WORD);
}
/* shifted in two steps, presumably because y + 1 can reach the word width */
b <<= y;
b <<= 1; /* account for stop bit */
/* x may include bits of earlier words; the unsigned wraparound plus the
 * modulo reduces the result to the bits left in the current word
 * NOTE(review): relies on FLAC__BITS_PER_WORD being a power of two - confirm */
ucbits = (ucbits - x - 1) % FLAC__BITS_PER_WORD;
msbs = x;
/* reject symbols whose unary part is larger than limit */
if(x > limit)
return false;
/* read the binary LSBs */
x = (FLAC__uint32)(b >> (FLAC__BITS_PER_WORD - parameter)); /* parameter < 32, so we can cast to 32-bit uint32_t */
if(parameter <= ucbits) {
ucbits -= parameter;
b <<= parameter;
} else {
/* there are still bits left to read, they will all be in the next word */
cwords++;
if (cwords >= words)
goto incomplete_lsbs;
b = br->buffer[cwords];
ucbits += FLAC__BITS_PER_WORD - parameter;
x |= (FLAC__uint32)(b >> ucbits);
b <<= FLAC__BITS_PER_WORD - ucbits;
}
lsbs = x;
/* compose the value */
x = (msbs << parameter) | lsbs;
*val++ = (int)(x >> 1) ^ -(int)(x & 1);
continue;
/* at this point we've eaten up all the whole words */
/* slow path: decode through the bitreader helper functions. The two
 * if(0) blocks are never entered from above; they only host the labels
 * the fast path jumps to, which first flush the local word position to
 * *br before the helpers are called. */
process_tail:
do {
if(0) {
incomplete_msbs:
br->consumed_bits = 0;
br->consumed_words = cwords;
}
/* read the unary MSBs and end bit */
if(!FLAC__bitreader_read_unary_unsigned(br, &msbs))
return false;
/* x holds the zero bits already counted by the fast path (0 otherwise) */
msbs += x;
x = ucbits = 0;
if(0) {
incomplete_lsbs:
br->consumed_bits = 0;
br->consumed_words = cwords;
}
/* read the binary LSBs */
/* when entered via incomplete_lsbs, x already carries the ucbits high bits
 * of the binary part, so only the remaining parameter - ucbits are read */
if(!FLAC__bitreader_read_raw_uint32(br, &lsbs, parameter - ucbits))
return false;
lsbs = x | lsbs;
/* compose the value */
/* NOTE(review): msbs is not compared against limit on this path - confirm intended */
x = (msbs << parameter) | lsbs;
*val++ = (int)(x >> 1) ^ -(int)(x & 1);
x = 0;
/* reload the local copies from *br after the helper calls */
cwords = br->consumed_words;
words = br->words;
ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
b = cwords < br->capacity ? br->buffer[cwords] << br->consumed_bits : 0;
/* stay in the slow path only while cwords is still at or past br->words */
} while(cwords >= words && val < end);
}
if(ucbits == 0 && cwords < words) {
/* don't leave the head word with no unconsumed bits */
cwords++;
ucbits = FLAC__BITS_PER_WORD;
}
/* flush the local read position back to the reader */
br->consumed_bits = FLAC__BITS_PER_WORD - ucbits;
br->consumed_words = cwords;
return true;
}
#ifdef FLAC__BMI2_SUPPORTED
/* BMI2 variant of the Rice block reader. It has the same parameter list as
 * FLAC__bitreader_read_rice_signed_block; the function body is not written
 * here but supplied by the #include that follows the signature, and
 * FLAC__SSE_TARGET("bmi2") is applied to the definition. The stream decoder
 * selects this variant only when its CPU info reports bmi2. */
FLAC__SSE_TARGET("bmi2")
FLAC__bool FLAC__bitreader_read_rice_signed_block_bmi2(FLAC__BitReader *br, int vals[], uint32_t nvals, uint32_t parameter)
#include "deduplication/bitreader_read_rice_signed_block.c"
#endif
#if 0 /* UNUSED */
FLAC__bool FLAC__bitreader_read_golomb_signed(FLAC__BitReader *br, int *val, uint32_t parameter)

View File

@ -76,6 +76,7 @@ static const uint32_t FLAC__CPUINFO_X86_CPUID_FMA = 0x00001000;
/* these are flags in EBX of CPUID AX=00000007 */
static const uint32_t FLAC__CPUINFO_X86_CPUID_AVX2 = 0x00000020;
static const uint32_t FLAC__CPUINFO_X86_CPUID_BMI2 = 0x00000100;
static uint32_t
cpu_xgetbv_x86(void)
@ -186,6 +187,7 @@ x86_cpu_info (FLAC__CPUInfo *info)
info->x86.fma = (flags_ecx & FLAC__CPUINFO_X86_CPUID_FMA ) ? true : false;
cpuinfo_x86(7, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
info->x86.avx2 = (flags_ebx & FLAC__CPUINFO_X86_CPUID_AVX2 ) ? true : false;
info->x86.bmi2 = (flags_ebx & FLAC__CPUINFO_X86_CPUID_BMI2 ) ? true : false;
}
#if defined FLAC__CPU_IA32
@ -206,6 +208,7 @@ x86_cpu_info (FLAC__CPUInfo *info)
dfprintf(stderr, " AVX ........ %c\n", info->x86.avx ? 'Y' : 'n');
dfprintf(stderr, " FMA ........ %c\n", info->x86.fma ? 'Y' : 'n');
dfprintf(stderr, " AVX2 ....... %c\n", info->x86.avx2 ? 'Y' : 'n');
dfprintf(stderr, " BMI2 ....... %c\n", info->x86.bmi2 ? 'Y' : 'n');
}
/*

View File

@ -0,0 +1,143 @@
{
	/*
	 * Shared body of the Rice block readers. This file contains only a
	 * compound statement: it is #included directly after a function
	 * signature and uses `br`, `vals`, `nvals` and `parameter` from that
	 * signature.
	 *
	 * Decodes `nvals` Rice-coded signed values with Rice parameter
	 * `parameter` (must be < 32) from `br` into `vals`.
	 *
	 * Returns true when all requested values were stored (immediately,
	 * without touching the reader, on the tail path when nothing was
	 * requested). Returns false when one of the bitreader helpers fails or
	 * when the unary part of a symbol exceeds `limit` in the fast path; in
	 * the latter case the local read position is not written back to *br.
	 */

	/* try and get br->consumed_words and br->consumed_bits into register;
	 * must remember to flush them back to *br before calling other
	 * bitreader functions that use them, and before returning */
	uint32_t cwords, words, lsbs, msbs, x, y, limit;
	uint32_t ucbits; /* keep track of the number of unconsumed bits in word */
	brword b;
	int *val, *end;

	FLAC__ASSERT(0 != br);
	FLAC__ASSERT(0 != br->buffer);
	/* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
	FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
	FLAC__ASSERT(parameter < 32);
	/* the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it */

	limit = UINT32_MAX >> parameter; /* Maximal msbs that can occur with residual bounded to int32_t */

	val = vals;
	end = vals + nvals;

	/* parameter == 0: symbols have no binary part, each one is a plain unary number */
	if(parameter == 0) {
		while(val < end) {
			/* read the unary MSBs and end bit */
			if(!FLAC__bitreader_read_unary_unsigned(br, &msbs))
				return false;
			/* Checking limit here would be overzealous: coding UINT32_MAX
			 * with parameter == 0 would take 4GiB */
			/* unfold to signed: even msbs -> msbs/2, odd msbs -> -(msbs+1)/2 */
			*val++ = (int)(msbs >> 1) ^ -(int)(msbs & 1);
		}

		return true;
	}

	FLAC__ASSERT(parameter > 0);

	cwords = br->consumed_words;
	words = br->words;

	/* cwords has already reached br->words: start directly in the
	 * helper-based tail path */
	if(cwords >= words) {
		/* process_tail is entered in the middle of the loop below and
		 * always decodes one symbol before `val < end` is tested, so an
		 * empty request must be answered here. Nothing has been modified
		 * yet, so there is nothing to flush back to *br. */
		if(val >= end)
			return true;
		x = 0;
		goto process_tail;
	}

	ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
	b = br->buffer[cwords] << br->consumed_bits; /* keep unconsumed bits aligned to left */

	/* fast path: decode from the local copies b / ucbits / cwords */
	while(val < end) {
		/* read the unary MSBs and end bit */
		x = y = COUNT_ZERO_MSBS2(b);
		if(x == FLAC__BITS_PER_WORD) {
			/* no stop bit in the rest of this word: all ucbits remaining
			 * bits count as zeros, keep counting in the following words */
			x = ucbits;
			do {
				/* didn't find stop bit yet, have to keep going... */
				cwords++;
				if (cwords >= words)
					goto incomplete_msbs;
				b = br->buffer[cwords];
				y = COUNT_ZERO_MSBS2(b);
				x += y;
			} while(y == FLAC__BITS_PER_WORD);
		}
		/* shifted in two steps, presumably because y + 1 can reach the word width */
		b <<= y;
		b <<= 1; /* account for stop bit */
		/* x may include bits of earlier words; the unsigned wraparound plus
		 * the modulo reduces the result to the bits left in the current word
		 * NOTE(review): relies on FLAC__BITS_PER_WORD being a power of two - confirm */
		ucbits = (ucbits - x - 1) % FLAC__BITS_PER_WORD;
		msbs = x;

		/* reject symbols whose unary part is larger than limit */
		if(x > limit)
			return false;

		/* read the binary LSBs */
		x = (FLAC__uint32)(b >> (FLAC__BITS_PER_WORD - parameter)); /* parameter < 32, so we can cast to 32-bit uint32_t */
		if(parameter <= ucbits) {
			ucbits -= parameter;
			b <<= parameter;
		} else {
			/* there are still bits left to read, they will all be in the next word */
			cwords++;
			if (cwords >= words)
				goto incomplete_lsbs;
			b = br->buffer[cwords];
			ucbits += FLAC__BITS_PER_WORD - parameter;
			x |= (FLAC__uint32)(b >> ucbits);
			b <<= FLAC__BITS_PER_WORD - ucbits;
		}
		lsbs = x;

		/* compose the value */
		x = (msbs << parameter) | lsbs;
		*val++ = (int)(x >> 1) ^ -(int)(x & 1);

		continue;

		/* at this point we've eaten up all the whole words */
		/* slow path: decode through the bitreader helper functions. The two
		 * if(0) blocks are never entered from above; they only host the
		 * labels the fast path jumps to, which first flush the local word
		 * position to *br before the helpers are called. */
process_tail:
		do {
			if(0) {
incomplete_msbs:
				br->consumed_bits = 0;
				br->consumed_words = cwords;
			}

			/* read the unary MSBs and end bit */
			if(!FLAC__bitreader_read_unary_unsigned(br, &msbs))
				return false;
			/* x holds the zero bits already counted by the fast path (0 otherwise) */
			msbs += x;
			x = ucbits = 0;

			if(0) {
incomplete_lsbs:
				br->consumed_bits = 0;
				br->consumed_words = cwords;
			}

			/* read the binary LSBs */
			/* when entered via incomplete_lsbs, x already carries the ucbits
			 * high bits of the binary part, so only the remaining
			 * parameter - ucbits are read */
			if(!FLAC__bitreader_read_raw_uint32(br, &lsbs, parameter - ucbits))
				return false;
			lsbs = x | lsbs;

			/* compose the value */
			/* NOTE(review): msbs is not compared against limit on this path - confirm intended */
			x = (msbs << parameter) | lsbs;
			*val++ = (int)(x >> 1) ^ -(int)(x & 1);
			x = 0;

			/* reload the local copies from *br after the helper calls */
			cwords = br->consumed_words;
			words = br->words;
			ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
			b = cwords < br->capacity ? br->buffer[cwords] << br->consumed_bits : 0;
			/* stay in the slow path only while cwords is still at or past br->words */
		} while(cwords >= words && val < end);
	}

	if(ucbits == 0 && cwords < words) {
		/* don't leave the head word with no unconsumed bits */
		cwords++;
		ucbits = FLAC__BITS_PER_WORD;
	}

	/* flush the local read position back to the reader */
	br->consumed_bits = FLAC__BITS_PER_WORD - ucbits;
	br->consumed_words = cwords;

	return true;
}

View File

@ -88,6 +88,10 @@ FLAC__bool FLAC__bitreader_read_byte_block_aligned_no_crc(FLAC__BitReader *br, F
FLAC__bool FLAC__bitreader_read_unary_unsigned(FLAC__BitReader *br, uint32_t *val);
FLAC__bool FLAC__bitreader_read_rice_signed(FLAC__BitReader *br, int *val, uint32_t parameter);
FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], uint32_t nvals, uint32_t parameter);
#ifdef FLAC__BMI2_SUPPORTED
FLAC__bool FLAC__bitreader_read_rice_signed_block_bmi2(FLAC__BitReader *br, int vals[], uint32_t nvals, uint32_t parameter);
#endif
#if 0 /* UNUSED */
FLAC__bool FLAC__bitreader_read_golomb_signed(FLAC__BitReader *br, int *val, uint32_t parameter);
FLAC__bool FLAC__bitreader_read_golomb_unsigned(FLAC__BitReader *br, uint32_t *val, uint32_t parameter);

View File

@ -90,6 +90,7 @@
#define FLAC__AVX_SUPPORTED 1
#define FLAC__AVX2_SUPPORTED 1
#define FLAC__FMA_SUPPORTED 1
#define FLAC__BMI2_SUPPORTED 1
#endif
#elif defined __GNUC__ && !defined __clang__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) /* GCC 4.9+ */
#define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x)))
@ -102,6 +103,7 @@
#define FLAC__AVX_SUPPORTED 1
#define FLAC__AVX2_SUPPORTED 1
#define FLAC__FMA_SUPPORTED 1
#define FLAC__BMI2_SUPPORTED 1
#endif
#elif defined _MSC_VER
#define FLAC__SSE_TARGET(x)
@ -178,6 +180,7 @@ typedef struct {
FLAC__bool avx;
FLAC__bool avx2;
FLAC__bool fma;
FLAC__bool bmi2;
} FLAC__CPUInfo_x86;
typedef struct {

View File

@ -45,6 +45,7 @@
#include "protected/stream_decoder.h"
#include "private/bitreader.h"
#include "private/bitmath.h"
#include "private/cpu.h"
#include "private/crc.h"
#include "private/fixed.h"
#include "private/format.h"
@ -147,6 +148,7 @@ typedef struct FLAC__StreamDecoderPrivate {
size_t metadata_filter_ids_count, metadata_filter_ids_capacity; /* units for both are IDs, not bytes */
FLAC__Frame frame;
FLAC__bool cached; /* true if there is a byte in lookahead */
FLAC__CPUInfo cpuinfo;
FLAC__byte header_warmup[2]; /* contains the sync code and reserved bits */
FLAC__byte lookahead; /* temp storage when we need to look ahead one byte in the stream */
/* unaligned (original) pointers to allocated data */
@ -164,6 +166,7 @@ typedef struct FLAC__StreamDecoderPrivate {
FLAC__uint64 target_sample;
uint32_t unparseable_frame_count; /* used to tell whether we're decoding a future version of FLAC or just got a bad sync */
FLAC__bool got_a_frame; /* hack needed in Ogg FLAC seek routine to check when process_single() actually writes a frame */
FLAC__bool (*local_bitreader_read_rice_signed_block)(FLAC__BitReader *br, int vals[], uint32_t nvals, uint32_t parameter);
} FLAC__StreamDecoderPrivate;
/***********************************************************************
@ -369,6 +372,15 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_(
return decoder->protected_->initstate = FLAC__STREAM_DECODER_INIT_STATUS_ERROR_OPENING_FILE;
#endif
FLAC__cpu_info(&decoder->private_->cpuinfo);
decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block;
#ifdef FLAC__BMI2_SUPPORTED
if (decoder->private_->cpuinfo.x86.bmi2) {
decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block_bmi2;
}
#endif
/* from here on, errors are fatal */
if(!FLAC__bitreader_init(decoder->private_->input, read_callback_, decoder)) {
@ -2948,7 +2960,7 @@ FLAC__bool read_residual_partitioned_rice_(FLAC__StreamDecoder *decoder, uint32_
if(rice_parameter < pesc) {
partitioned_rice_contents->raw_bits[partition] = 0;
u = (partition == 0) ? partition_samples - predictor_order : partition_samples;
if(!FLAC__bitreader_read_rice_signed_block(decoder->private_->input, residual + sample, u, rice_parameter)){
if(!decoder->private_->local_bitreader_read_rice_signed_block(decoder->private_->input, residual + sample, u, rice_parameter)){
if(decoder->protected_->state == FLAC__STREAM_DECODER_READ_FRAME) {
/* no error was set, read_callback_ didn't set it, so
* invalid rice symbol was found */