From 00cb41ee034fd4c5a21bf4167ee9363cc4ab39e2 Mon Sep 17 00:00:00 2001 From: Martijn van Beurden Date: Wed, 8 Mar 2023 08:42:02 +0100 Subject: [PATCH] Add use of x86 BMI2 to bitreader_read_rice_signed_block --- include/share/private.h | 2 +- src/libFLAC/Makefile.am | 1 + src/libFLAC/bitreader.c | 149 +----------------- src/libFLAC/cpu.c | 3 + .../bitreader_read_rice_signed_block.c | 143 +++++++++++++++++ src/libFLAC/include/private/bitreader.h | 4 + src/libFLAC/include/private/cpu.h | 3 + src/libFLAC/stream_decoder.c | 14 +- 8 files changed, 175 insertions(+), 144 deletions(-) create mode 100644 src/libFLAC/deduplication/bitreader_read_rice_signed_block.c diff --git a/include/share/private.h b/include/share/private.h index 653c415d..b4312756 100644 --- a/include/share/private.h +++ b/include/share/private.h @@ -36,7 +36,7 @@ * Unpublished debug routines from libFLAC. This should not be used from any * client code other than code shipped with the FLAC sources. */ -FLAC_API FLAC__bool FLAC__stream_encoder_disable_instruction_set(FLAC__StreamEncoder *encoder, FLAC__bool value); +FLAC_API FLAC__bool FLAC__stream_encoder_disable_instruction_set(FLAC__StreamEncoder *encoder, int value); FLAC_API FLAC__bool FLAC__stream_encoder_disable_constant_subframes(FLAC__StreamEncoder *encoder, FLAC__bool value); FLAC_API FLAC__bool FLAC__stream_encoder_disable_fixed_subframes(FLAC__StreamEncoder *encoder, FLAC__bool value); FLAC_API FLAC__bool FLAC__stream_encoder_disable_verbatim_subframes(FLAC__StreamEncoder *encoder, FLAC__bool value); diff --git a/src/libFLAC/Makefile.am b/src/libFLAC/Makefile.am index 40e3308d..dd2a5e39 100644 --- a/src/libFLAC/Makefile.am +++ b/src/libFLAC/Makefile.am @@ -56,6 +56,7 @@ EXTRA_DIST = \ CMakeLists.txt \ flac.pc.in \ libFLAC.m4 \ + deduplication/bitreader_read_rice_signed_block.c \ deduplication/lpc_compute_autocorrelation_intrin.c \ deduplication/lpc_compute_autocorrelation_intrin_sse2.c \ deduplication/lpc_compute_autocorrelation_intrin_neon.c diff --git a/src/libFLAC/bitreader.c b/src/libFLAC/bitreader.c index 56f992e4..af878e06 100644 --- a/src/libFLAC/bitreader.c +++ b/src/libFLAC/bitreader.c @@ -39,6 +39,7 @@ #include "private/bitmath.h" #include "private/bitreader.h" #include "private/crc.h" +#include "private/cpu.h" #include "private/macros.h" #include "FLAC/assert.h" #include "share/compat.h" @@ -831,149 +832,13 @@ FLAC__bool FLAC__bitreader_read_rice_signed(FLAC__BitReader *br, int *val, uint3 /* this is by far the most heavily used reader call. it ain't pretty but it's fast */ FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], uint32_t nvals, uint32_t parameter) -{ - /* try and get br->consumed_words and br->consumed_bits into register; - * must remember to flush them back to *br before calling other - * bitreader functions that use them, and before returning */ - uint32_t cwords, words, lsbs, msbs, x, y, limit; - uint32_t ucbits; /* keep track of the number of unconsumed bits in word */ - brword b; - int *val, *end; +#include "deduplication/bitreader_read_rice_signed_block.c" - FLAC__ASSERT(0 != br); - FLAC__ASSERT(0 != br->buffer); - /* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */ - FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32); - FLAC__ASSERT(parameter < 32); - /* the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it */ - - limit = UINT32_MAX >> parameter; /* Maximal msbs that can occur with residual bounded to int32_t */ - - val = vals; - end = vals + nvals; - - if(parameter == 0) { - while(val < end) { - /* read the unary MSBs and end bit */ - if(!FLAC__bitreader_read_unary_unsigned(br, &msbs)) - return false; - /* Checking limit here would be overzealous: coding UINT32_MAX - * with parameter == 0 would take 4GiB */ - *val++ = (int)(msbs >> 1) ^ -(int)(msbs & 1); - } - - return true; - } - - FLAC__ASSERT(parameter > 0); - - cwords = br->consumed_words; - words = br->words; - - /* if we've not consumed up to a partial tail word... */ - if(cwords >= words) { - x = 0; - goto process_tail; - } - - ucbits = FLAC__BITS_PER_WORD - br->consumed_bits; - b = br->buffer[cwords] << br->consumed_bits; /* keep unconsumed bits aligned to left */ - - while(val < end) { - /* read the unary MSBs and end bit */ - x = y = COUNT_ZERO_MSBS2(b); - if(x == FLAC__BITS_PER_WORD) { - x = ucbits; - do { - /* didn't find stop bit yet, have to keep going... */ - cwords++; - if (cwords >= words) - goto incomplete_msbs; - b = br->buffer[cwords]; - y = COUNT_ZERO_MSBS2(b); - x += y; - } while(y == FLAC__BITS_PER_WORD); - } - b <<= y; - b <<= 1; /* account for stop bit */ - ucbits = (ucbits - x - 1) % FLAC__BITS_PER_WORD; - msbs = x; - - if(x > limit) - return false; - - /* read the binary LSBs */ - x = (FLAC__uint32)(b >> (FLAC__BITS_PER_WORD - parameter)); /* parameter < 32, so we can cast to 32-bit uint32_t */ - if(parameter <= ucbits) { - ucbits -= parameter; - b <<= parameter; - } else { - /* there are still bits left to read, they will all be in the next word */ - cwords++; - if (cwords >= words) - goto incomplete_lsbs; - b = br->buffer[cwords]; - ucbits += FLAC__BITS_PER_WORD - parameter; - x |= (FLAC__uint32)(b >> ucbits); - b <<= FLAC__BITS_PER_WORD - ucbits; - } - lsbs = x; - - /* compose the value */ - x = (msbs << parameter) | lsbs; - *val++ = (int)(x >> 1) ^ -(int)(x & 1); - - continue; - - /* at this point we've eaten up all the whole words */ -process_tail: - do { - if(0) { -incomplete_msbs: - br->consumed_bits = 0; - br->consumed_words = cwords; - } - - /* read the unary MSBs and end bit */ - if(!FLAC__bitreader_read_unary_unsigned(br, &msbs)) - return false; - msbs += x; - x = ucbits = 0; - - if(0) { -incomplete_lsbs: - br->consumed_bits = 0; - br->consumed_words = cwords; - } - - /* read the binary LSBs */ - if(!FLAC__bitreader_read_raw_uint32(br, &lsbs, parameter - ucbits)) - return false; - lsbs = x | lsbs; - - /* compose the value */ - x = (msbs << parameter) | lsbs; - *val++ = (int)(x >> 1) ^ -(int)(x & 1); - x = 0; - - cwords = br->consumed_words; - words = br->words; - ucbits = FLAC__BITS_PER_WORD - br->consumed_bits; - b = cwords < br->capacity ? br->buffer[cwords] << br->consumed_bits : 0; - } while(cwords >= words && val < end); - } - - if(ucbits == 0 && cwords < words) { - /* don't leave the head word with no unconsumed bits */ - cwords++; - ucbits = FLAC__BITS_PER_WORD; - } - - br->consumed_bits = FLAC__BITS_PER_WORD - ucbits; - br->consumed_words = cwords; - - return true; -} +#ifdef FLAC__BMI2_SUPPORTED +FLAC__SSE_TARGET("bmi2") +FLAC__bool FLAC__bitreader_read_rice_signed_block_bmi2(FLAC__BitReader *br, int vals[], uint32_t nvals, uint32_t parameter) +#include "deduplication/bitreader_read_rice_signed_block.c" +#endif #if 0 /* UNUSED */ FLAC__bool FLAC__bitreader_read_golomb_signed(FLAC__BitReader *br, int *val, uint32_t parameter) diff --git a/src/libFLAC/cpu.c b/src/libFLAC/cpu.c index 51f72ddf..a9d03d77 100644 --- a/src/libFLAC/cpu.c +++ b/src/libFLAC/cpu.c @@ -76,6 +76,7 @@ static const uint32_t FLAC__CPUINFO_X86_CPUID_FMA = 0x00001000; /* these are flags in EBX of CPUID AX=00000007 */ static const uint32_t FLAC__CPUINFO_X86_CPUID_AVX2 = 0x00000020; +static const uint32_t FLAC__CPUINFO_X86_CPUID_BMI2 = 0x00000100; static uint32_t cpu_xgetbv_x86(void) @@ -186,6 +187,7 @@ x86_cpu_info (FLAC__CPUInfo *info) info->x86.fma = (flags_ecx & FLAC__CPUINFO_X86_CPUID_FMA ) ? true : false; cpuinfo_x86(7, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx); info->x86.avx2 = (flags_ebx & FLAC__CPUINFO_X86_CPUID_AVX2 ) ? true : false; + info->x86.bmi2 = (flags_ebx & FLAC__CPUINFO_X86_CPUID_BMI2 ) ? true : false; } #if defined FLAC__CPU_IA32 @@ -206,6 +208,7 @@ x86_cpu_info (FLAC__CPUInfo *info) dfprintf(stderr, " AVX ........ %c\n", info->x86.avx ? 'Y' : 'n'); dfprintf(stderr, " FMA ........ %c\n", info->x86.fma ? 'Y' : 'n'); dfprintf(stderr, " AVX2 ....... %c\n", info->x86.avx2 ? 'Y' : 'n'); + dfprintf(stderr, " BMI2 ....... %c\n", info->x86.bmi2 ? 'Y' : 'n'); } /* diff --git a/src/libFLAC/deduplication/bitreader_read_rice_signed_block.c b/src/libFLAC/deduplication/bitreader_read_rice_signed_block.c new file mode 100644 index 00000000..75ed47f7 --- /dev/null +++ b/src/libFLAC/deduplication/bitreader_read_rice_signed_block.c @@ -0,0 +1,143 @@ +{ + /* try and get br->consumed_words and br->consumed_bits into register; + * must remember to flush them back to *br before calling other + * bitreader functions that use them, and before returning */ + uint32_t cwords, words, lsbs, msbs, x, y, limit; + uint32_t ucbits; /* keep track of the number of unconsumed bits in word */ + brword b; + int *val, *end; + + FLAC__ASSERT(0 != br); + FLAC__ASSERT(0 != br->buffer); + /* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */ + FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32); + FLAC__ASSERT(parameter < 32); + /* the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it */ + + limit = UINT32_MAX >> parameter; /* Maximal msbs that can occur with residual bounded to int32_t */ + + val = vals; + end = vals + nvals; + + if(parameter == 0) { + while(val < end) { + /* read the unary MSBs and end bit */ + if(!FLAC__bitreader_read_unary_unsigned(br, &msbs)) + return false; + /* Checking limit here would be overzealous: coding UINT32_MAX + * with parameter == 0 would take 4GiB */ + *val++ = (int)(msbs >> 1) ^ -(int)(msbs & 1); + } + + return true; + } + + FLAC__ASSERT(parameter > 0); + + cwords = br->consumed_words; + words = br->words; + + /* if we've not consumed up to a partial tail word... */ + if(cwords >= words) { + x = 0; + goto process_tail; + } + + ucbits = FLAC__BITS_PER_WORD - br->consumed_bits; + b = br->buffer[cwords] << br->consumed_bits; /* keep unconsumed bits aligned to left */ + + while(val < end) { + /* read the unary MSBs and end bit */ + x = y = COUNT_ZERO_MSBS2(b); + if(x == FLAC__BITS_PER_WORD) { + x = ucbits; + do { + /* didn't find stop bit yet, have to keep going... */ + cwords++; + if (cwords >= words) + goto incomplete_msbs; + b = br->buffer[cwords]; + y = COUNT_ZERO_MSBS2(b); + x += y; + } while(y == FLAC__BITS_PER_WORD); + } + b <<= y; + b <<= 1; /* account for stop bit */ + ucbits = (ucbits - x - 1) % FLAC__BITS_PER_WORD; + msbs = x; + + if(x > limit) + return false; + + /* read the binary LSBs */ + x = (FLAC__uint32)(b >> (FLAC__BITS_PER_WORD - parameter)); /* parameter < 32, so we can cast to 32-bit uint32_t */ + if(parameter <= ucbits) { + ucbits -= parameter; + b <<= parameter; + } else { + /* there are still bits left to read, they will all be in the next word */ + cwords++; + if (cwords >= words) + goto incomplete_lsbs; + b = br->buffer[cwords]; + ucbits += FLAC__BITS_PER_WORD - parameter; + x |= (FLAC__uint32)(b >> ucbits); + b <<= FLAC__BITS_PER_WORD - ucbits; + } + lsbs = x; + + /* compose the value */ + x = (msbs << parameter) | lsbs; + *val++ = (int)(x >> 1) ^ -(int)(x & 1); + + continue; + + /* at this point we've eaten up all the whole words */ +process_tail: + do { + if(0) { +incomplete_msbs: + br->consumed_bits = 0; + br->consumed_words = cwords; + } + + /* read the unary MSBs and end bit */ + if(!FLAC__bitreader_read_unary_unsigned(br, &msbs)) + return false; + msbs += x; + x = ucbits = 0; + + if(0) { +incomplete_lsbs: + br->consumed_bits = 0; + br->consumed_words = cwords; + } + + /* read the binary LSBs */ + if(!FLAC__bitreader_read_raw_uint32(br, &lsbs, parameter - ucbits)) + return false; + lsbs = x | lsbs; + + /* compose the value */ + x = (msbs << parameter) | lsbs; + *val++ = (int)(x >> 1) ^ -(int)(x & 1); + x = 0; + + cwords = br->consumed_words; + words = br->words; + ucbits = FLAC__BITS_PER_WORD - br->consumed_bits; + b = cwords < br->capacity ? br->buffer[cwords] << br->consumed_bits : 0; + } while(cwords >= words && val < end); + } + + if(ucbits == 0 && cwords < words) { + /* don't leave the head word with no unconsumed bits */ + cwords++; + ucbits = FLAC__BITS_PER_WORD; + } + + br->consumed_bits = FLAC__BITS_PER_WORD - ucbits; + br->consumed_words = cwords; + + return true; +} diff --git a/src/libFLAC/include/private/bitreader.h b/src/libFLAC/include/private/bitreader.h index 2943a955..b450c27b 100644 --- a/src/libFLAC/include/private/bitreader.h +++ b/src/libFLAC/include/private/bitreader.h @@ -88,6 +88,10 @@ FLAC__bool FLAC__bitreader_read_byte_block_aligned_no_crc(FLAC__BitReader *br, F FLAC__bool FLAC__bitreader_read_unary_unsigned(FLAC__BitReader *br, uint32_t *val); FLAC__bool FLAC__bitreader_read_rice_signed(FLAC__BitReader *br, int *val, uint32_t parameter); FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], uint32_t nvals, uint32_t parameter); +#ifdef FLAC__BMI2_SUPPORTED +FLAC__bool FLAC__bitreader_read_rice_signed_block_bmi2(FLAC__BitReader *br, int vals[], uint32_t nvals, uint32_t parameter); +#endif + #if 0 /* UNUSED */ FLAC__bool FLAC__bitreader_read_golomb_signed(FLAC__BitReader *br, int *val, uint32_t parameter); FLAC__bool FLAC__bitreader_read_golomb_unsigned(FLAC__BitReader *br, uint32_t *val, uint32_t parameter); diff --git a/src/libFLAC/include/private/cpu.h b/src/libFLAC/include/private/cpu.h index 1e444c09..4a79d64f 100644 --- a/src/libFLAC/include/private/cpu.h +++ b/src/libFLAC/include/private/cpu.h @@ -90,6 +90,7 @@ #define FLAC__AVX_SUPPORTED 1 #define FLAC__AVX2_SUPPORTED 1 #define FLAC__FMA_SUPPORTED 1 + #define FLAC__BMI2_SUPPORTED 1 #endif #elif defined __GNUC__ && !defined __clang__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) /* GCC 4.9+ */ #define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x))) @@ -102,6 +103,7 @@ #define FLAC__AVX_SUPPORTED 1 #define FLAC__AVX2_SUPPORTED 1 #define FLAC__FMA_SUPPORTED 1 + #define FLAC__BMI2_SUPPORTED 1 #endif #elif defined _MSC_VER #define FLAC__SSE_TARGET(x) @@ -178,6 +180,7 @@ typedef struct { FLAC__bool avx; FLAC__bool avx2; FLAC__bool fma; + FLAC__bool bmi2; } FLAC__CPUInfo_x86; typedef struct { diff --git a/src/libFLAC/stream_decoder.c b/src/libFLAC/stream_decoder.c index 4ee53452..2c2b77e0 100644 --- a/src/libFLAC/stream_decoder.c +++ b/src/libFLAC/stream_decoder.c @@ -45,6 +45,7 @@ #include "protected/stream_decoder.h" #include "private/bitreader.h" #include "private/bitmath.h" +#include "private/cpu.h" #include "private/crc.h" #include "private/fixed.h" #include "private/format.h" @@ -147,6 +148,7 @@ typedef struct FLAC__StreamDecoderPrivate { size_t metadata_filter_ids_count, metadata_filter_ids_capacity; /* units for both are IDs, not bytes */ FLAC__Frame frame; FLAC__bool cached; /* true if there is a byte in lookahead */ + FLAC__CPUInfo cpuinfo; FLAC__byte header_warmup[2]; /* contains the sync code and reserved bits */ FLAC__byte lookahead; /* temp storage when we need to look ahead one byte in the stream */ /* unaligned (original) pointers to allocated data */ @@ -164,6 +166,7 @@ typedef struct FLAC__StreamDecoderPrivate { FLAC__uint64 target_sample; uint32_t unparseable_frame_count; /* used to tell whether we're decoding a future version of FLAC or just got a bad sync */ FLAC__bool got_a_frame; /* hack needed in Ogg FLAC seek routine to check when process_single() actually writes a frame */ + FLAC__bool (*local_bitreader_read_rice_signed_block)(FLAC__BitReader *br, int vals[], uint32_t nvals, uint32_t parameter); } FLAC__StreamDecoderPrivate; /*********************************************************************** @@ -369,6 +372,15 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_( return decoder->protected_->initstate = FLAC__STREAM_DECODER_INIT_STATUS_ERROR_OPENING_FILE; #endif + FLAC__cpu_info(&decoder->private_->cpuinfo); + decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block; + +#ifdef FLAC__BMI2_SUPPORTED + if (decoder->private_->cpuinfo.x86.bmi2) { + decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block_bmi2; + } +#endif + /* from here on, errors are fatal */ if(!FLAC__bitreader_init(decoder->private_->input, read_callback_, decoder)) { @@ -2948,7 +2960,7 @@ FLAC__bool read_residual_partitioned_rice_(FLAC__StreamDecoder *decoder, uint32_ if(rice_parameter < pesc) { partitioned_rice_contents->raw_bits[partition] = 0; u = (partition == 0) ? partition_samples - predictor_order : partition_samples; - if(!FLAC__bitreader_read_rice_signed_block(decoder->private_->input, residual + sample, u, rice_parameter)){ + if(!decoder->private_->local_bitreader_read_rice_signed_block(decoder->private_->input, residual + sample, u, rice_parameter)){ if(decoder->protected_->state == FLAC__STREAM_DECODER_READ_FRAME) { /* no error was set, read_callback_ didn't set it, so * invalid rice symbol was found */