Enable encoder to use INT32_MIN as residual value

Because abs(INT32_MIN) is undefined behavior in C, it took some extra
work to enable the encoder to emit INT32_MIN as a residual. While the
expected compression gains are zero, this is done to ensure full spec
coverage in this regard.
This commit is contained in:
Martijn van Beurden 2022-06-29 12:00:13 +02:00
parent 633ab36ec5
commit 7e0a0e5723
7 changed files with 108 additions and 69 deletions

View File

@ -377,33 +377,32 @@ uint32_t FLAC__fixed_compute_best_predictor_limit_residual(const FLAC__int32 dat
#endif
{
FLAC__uint64 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0, smallest_error = UINT64_MAX;
FLAC__uint64 error_0, error_1, error_2, error_3, error_4;
FLAC__int64 error_0, error_1, error_2, error_3, error_4;
FLAC__bool order_0_is_valid = true, order_1_is_valid = true, order_2_is_valid = true, order_3_is_valid = true, order_4_is_valid = true;
uint32_t order = 0;
for(int i = 0; i < (int)data_len; i++) {
error_0 = local_abs64((FLAC__int64)data[i]);
error_1 = (i > 0) ? local_abs64((FLAC__int64)data[i] - data[i-1]) : 0 ;
error_2 = (i > 1) ? local_abs64((FLAC__int64)data[i] - 2 * (FLAC__int64)data[i-1] + data[i-2]) : 0;
error_3 = (i > 2) ? local_abs64((FLAC__int64)data[i] - 3 * (FLAC__int64)data[i-1] + 3 * (FLAC__int64)data[i-2] - data[i-3]) : 0;
error_4 = (i > 3) ? local_abs64((FLAC__int64)data[i] - 4 * (FLAC__int64)data[i-1] + 6 * (FLAC__int64)data[i-2] - 4 * (FLAC__int64)data[i-3] + data[i-4]) : 0;
error_0 = (FLAC__int64)data[i];
error_1 = (i > 0) ? (FLAC__int64)data[i] - data[i-1] : 0 ;
error_2 = (i > 1) ? (FLAC__int64)data[i] - 2 * (FLAC__int64)data[i-1] + data[i-2] : 0;
error_3 = (i > 2) ? (FLAC__int64)data[i] - 3 * (FLAC__int64)data[i-1] + 3 * (FLAC__int64)data[i-2] - data[i-3] : 0;
error_4 = (i > 3) ? (FLAC__int64)data[i] - 4 * (FLAC__int64)data[i-1] + 6 * (FLAC__int64)data[i-2] - 4 * (FLAC__int64)data[i-3] + data[i-4] : 0;
total_error_0 += error_0;
total_error_1 += error_1;
total_error_2 += error_2;
total_error_3 += error_3;
total_error_4 += error_4;
total_error_0 += local_abs64(error_0);
total_error_1 += local_abs64(error_1);
total_error_2 += local_abs64(error_2);
total_error_3 += local_abs64(error_3);
total_error_4 += local_abs64(error_4);
/* residual must not be INT32_MIN because abs(INT32_MIN) is undefined */
if(error_0 > INT32_MAX)
if(error_0 > INT32_MAX || error_0 < INT32_MIN)
order_0_is_valid = false;
if(error_1 > INT32_MAX)
if(error_1 > INT32_MAX || error_1 < INT32_MIN)
order_1_is_valid = false;
if(error_2 > INT32_MAX)
if(error_2 > INT32_MAX || error_2 < INT32_MIN)
order_2_is_valid = false;
if(error_3 > INT32_MAX)
if(error_3 > INT32_MAX || error_3 < INT32_MIN)
order_3_is_valid = false;
if(error_4 > INT32_MAX)
if(error_4 > INT32_MAX || error_4 < INT32_MIN)
order_4_is_valid = false;
}
@ -423,33 +422,33 @@ uint32_t FLAC__fixed_compute_best_predictor_limit_residual_33bit(const FLAC__int
#endif
{
FLAC__uint64 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0, smallest_error = UINT64_MAX;
FLAC__uint64 error_0, error_1, error_2, error_3, error_4;
FLAC__int64 error_0, error_1, error_2, error_3, error_4;
FLAC__bool order_0_is_valid = true, order_1_is_valid = true, order_2_is_valid = true, order_3_is_valid = true, order_4_is_valid = true;
uint32_t order = 0;
for(int i = 0; i < (int)data_len; i++) {
error_0 = local_abs64(data[i]);
error_1 = (i > 0) ? local_abs64(data[i] - data[i-1]) : 0 ;
error_2 = (i > 1) ? local_abs64(data[i] - 2 * data[i-1] + data[i-2]) : 0;
error_3 = (i > 2) ? local_abs64(data[i] - 3 * data[i-1] + 3 * data[i-2] - data[i-3]) : 0;
error_4 = (i > 3) ? local_abs64(data[i] - 4 * data[i-1] + 6 * data[i-2] - 4 * data[i-3] + data[i-4]) : 0;
error_0 = data[i];
error_1 = (i > 0) ? data[i] - data[i-1] : 0 ;
error_2 = (i > 1) ? data[i] - 2 * data[i-1] + data[i-2] : 0;
error_3 = (i > 2) ? data[i] - 3 * data[i-1] + 3 * data[i-2] - data[i-3] : 0;
error_4 = (i > 3) ? data[i] - 4 * data[i-1] + 6 * data[i-2] - 4 * data[i-3] + data[i-4] : 0;
total_error_0 += error_0;
total_error_1 += error_1;
total_error_2 += error_2;
total_error_3 += error_3;
total_error_4 += error_4;
total_error_0 += local_abs64(error_0);
total_error_1 += local_abs64(error_1);
total_error_2 += local_abs64(error_2);
total_error_3 += local_abs64(error_3);
total_error_4 += local_abs64(error_4);
/* residual must not be INT32_MIN because abs(INT32_MIN) is undefined */
if(error_0 > INT32_MAX)
if(error_0 > INT32_MAX || error_0 < INT32_MIN)
order_0_is_valid = false;
if(error_1 > INT32_MAX)
if(error_1 > INT32_MAX || error_1 < INT32_MIN)
order_1_is_valid = false;
if(error_2 > INT32_MAX)
if(error_2 > INT32_MAX || error_2 < INT32_MIN)
order_2_is_valid = false;
if(error_3 > INT32_MAX)
if(error_3 > INT32_MAX || error_3 < INT32_MIN)
order_3_is_valid = false;
if(error_4 > INT32_MAX)
if(error_4 > INT32_MAX || error_4 < INT32_MIN)
order_4_is_valid = false;
}

View File

@ -37,29 +37,23 @@
#include <config.h>
#endif
/*
* This is used to avoid overflow with unusual signals in 32-bit
* accumulator in the *precompute_partition_info_sums_* functions.
*/
#define FLAC__MAX_EXTRA_RESIDUAL_BPS 4
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/cpu.h"
#include "FLAC/format.h"
#ifdef FLAC__SSE2_SUPPORTED
extern void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps);
uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t max_residual_bps);
#endif
#ifdef FLAC__SSSE3_SUPPORTED
extern void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps);
uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t max_residual_bps);
#endif
#ifdef FLAC__AVX2_SUPPORTED
extern void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps);
uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t max_residual_bps);
#endif
#endif

View File

@ -828,8 +828,7 @@ FLAC__bool FLAC__lpc_compute_residual_from_qlp_coefficients_limit_residual(const
case 1: sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
}
residual_to_check = data[i] - (sum >> lp_quantization);
/* residual must not be INT32_MIN because abs(INT32_MIN) is undefined */
if(residual_to_check <= INT32_MIN || residual_to_check > INT32_MAX)
if(residual_to_check < INT32_MIN || residual_to_check > INT32_MAX)
return false;
else
residual[i] = residual_to_check;
@ -882,8 +881,7 @@ FLAC__bool FLAC__lpc_compute_residual_from_qlp_coefficients_limit_residual_33bit
case 1: sum += qlp_coeff[ 0] * data[i- 1];
}
residual_to_check = data[i] - (sum >> lp_quantization);
/* residual must not be INT32_MIN because abs(INT32_MIN) is undefined */
if(residual_to_check <= INT32_MIN || residual_to_check > INT32_MAX)
if(residual_to_check < INT32_MIN || residual_to_check > INT32_MAX)
return false;
else
residual[i] = residual_to_check;

View File

@ -231,7 +231,7 @@ static uint32_t find_best_partition_order_(
uint32_t rice_parameter_limit,
uint32_t min_partition_order,
uint32_t max_partition_order,
uint32_t bps,
uint32_t max_residual_bps,
FLAC__bool do_escape_coding,
uint32_t rice_parameter_search_dist,
FLAC__EntropyCodingMethod *best_ecm
@ -244,7 +244,7 @@ static void precompute_partition_info_sums_(
uint32_t predictor_order,
uint32_t min_partition_order,
uint32_t max_partition_order,
uint32_t bps
uint32_t max_residual_bps
);
static void precompute_partition_info_escapes_(
@ -349,7 +349,7 @@ typedef struct FLAC__StreamEncoderPrivate {
uint32_t current_frame_number;
FLAC__MD5Context md5context;
FLAC__CPUInfo cpuinfo;
void (*local_precompute_partition_info_sums)(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps);
void (*local_precompute_partition_info_sums)(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t max_residual_bps);
#ifndef FLAC__INTEGER_ONLY_LIBRARY
uint32_t (*local_fixed_compute_best_predictor)(const FLAC__int32 data[], uint32_t data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
uint32_t (*local_fixed_compute_best_predictor_wide)(const FLAC__int32 data[], uint32_t data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
@ -3873,7 +3873,7 @@ uint32_t evaluate_fixed_subframe_(
rice_parameter_limit,
min_partition_order,
max_partition_order,
subframe_bps,
(subframe_bps + order),
do_escape_coding,
rice_parameter_search_dist,
&subframe->data.fixed.entropy_coding_method
@ -3972,7 +3972,7 @@ uint32_t evaluate_lpc_subframe_(
rice_parameter_limit,
min_partition_order,
max_partition_order,
subframe_bps,
FLAC__lpc_max_residual_bps(subframe_bps, qlp_coeff, order, quantization),
do_escape_coding,
rice_parameter_search_dist,
&subframe->data.lpc.entropy_coding_method
@ -4046,7 +4046,7 @@ uint32_t find_best_partition_order_(
uint32_t rice_parameter_limit,
uint32_t min_partition_order,
uint32_t max_partition_order,
uint32_t bps,
uint32_t max_residual_bps,
FLAC__bool do_escape_coding,
uint32_t rice_parameter_search_dist,
FLAC__EntropyCodingMethod *best_ecm
@ -4060,7 +4060,7 @@ uint32_t find_best_partition_order_(
max_partition_order = FLAC__format_get_max_rice_partition_order_from_blocksize_limited_max_and_predictor_order(max_partition_order, blocksize, predictor_order);
min_partition_order = flac_min(min_partition_order, max_partition_order);
private_->local_precompute_partition_info_sums(residual, abs_residual_partition_sums, residual_samples, predictor_order, min_partition_order, max_partition_order, bps);
private_->local_precompute_partition_info_sums(residual, abs_residual_partition_sums, residual_samples, predictor_order, min_partition_order, max_partition_order, max_residual_bps);
if(do_escape_coding)
precompute_partition_info_escapes_(residual, raw_bits_per_partition, residual_samples, predictor_order, min_partition_order, max_partition_order);
@ -4138,7 +4138,7 @@ void precompute_partition_info_sums_(
uint32_t predictor_order,
uint32_t min_partition_order,
uint32_t max_partition_order,
uint32_t bps
uint32_t max_residual_bps
)
{
const uint32_t default_partition_samples = (residual_samples + predictor_order) >> max_partition_order;
@ -4150,22 +4150,33 @@ void precompute_partition_info_sums_(
{
const uint32_t threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples);
uint32_t partition, residual_sample, end = (uint32_t)(-(int)predictor_order);
/* WATCHOUT: "bps + FLAC__MAX_EXTRA_RESIDUAL_BPS" is the maximum assumed size of the average residual magnitude */
if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) {
if(max_residual_bps < threshold) {
for(partition = residual_sample = 0; partition < partitions; partition++) {
FLAC__uint32 abs_residual_partition_sum = 0;
end += default_partition_samples;
for( ; residual_sample < end; residual_sample++)
abs_residual_partition_sum += abs(residual[residual_sample]); /* abs(INT_MIN) is undefined, but if the residual is INT_MIN we have bigger problems */
abs_residual_partition_sum += abs(residual[residual_sample]);
abs_residual_partition_sums[partition] = abs_residual_partition_sum;
}
}
else { /* have to pessimistically use 64 bits for accumulator */
else if(max_residual_bps < 32) { /* have to pessimistically use 64 bits for accumulator */
for(partition = residual_sample = 0; partition < partitions; partition++) {
FLAC__uint64 abs_residual_partition_sum64 = 0;
end += default_partition_samples;
for( ; residual_sample < end; residual_sample++)
abs_residual_partition_sum64 += abs(residual[residual_sample]); /* abs(INT_MIN) is undefined, but if the residual is INT_MIN we have bigger problems */
abs_residual_partition_sum64 += abs(residual[residual_sample]);
abs_residual_partition_sums[partition] = abs_residual_partition_sum64;
}
}
else { /* must handle abs(INT32_MIN) */
for(partition = residual_sample = 0; partition < partitions; partition++) {
FLAC__uint64 abs_residual_partition_sum64 = 0;
end += default_partition_samples;
for( ; residual_sample < end; residual_sample++)
if(residual[residual_sample] == INT32_MIN)
abs_residual_partition_sum64 -= (FLAC__int64)INT32_MIN;
else
abs_residual_partition_sum64 += abs(residual[residual_sample]);
abs_residual_partition_sums[partition] = abs_residual_partition_sum64;
}
}

View File

@ -48,7 +48,7 @@
FLAC__SSE_TARGET("avx2")
void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps)
uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t max_residual_bps)
{
const uint32_t default_partition_samples = (residual_samples + predictor_order) >> max_partition_order;
uint32_t partitions = 1u << max_partition_order;
@ -60,7 +60,7 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual
const uint32_t threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples);
uint32_t partition, residual_sample, end = (uint32_t)(-(int32_t)predictor_order);
if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) {
if(max_residual_bps < threshold) {
for(partition = residual_sample = 0; partition < partitions; partition++) {
__m256i sum256 = _mm256_setzero_si256();
__m128i sum128;
@ -92,7 +92,7 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual
#endif
}
}
else { /* have to pessimistically use 64 bits for accumulator */
else if(max_residual_bps < 32) { /* have to pessimistically use 64 bits for accumulator */
for(partition = residual_sample = 0; partition < partitions; partition++) {
__m256i sum256 = _mm256_setzero_si256();
__m128i sum128;
@ -121,6 +121,18 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual
_mm_storel_epi64((__m128i*)(void*)(abs_residual_partition_sums+partition), sum128);
}
}
else { /* must handle abs(INT32_MIN) */
for(partition = residual_sample = 0; partition < partitions; partition++) {
FLAC__uint64 abs_residual_partition_sum64 = 0;
end += default_partition_samples;
for( ; residual_sample < end; residual_sample++)
if(residual[residual_sample] == INT32_MIN)
abs_residual_partition_sum64 -= (FLAC__int64)INT32_MIN;
else
abs_residual_partition_sum64 += abs(residual[residual_sample]);
abs_residual_partition_sums[partition] = abs_residual_partition_sum64;
}
}
}
/* now merge partitions for lower orders */

View File

@ -59,7 +59,7 @@ static inline __m128i local_abs_epi32(__m128i val)
FLAC__SSE_TARGET("sse2")
void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps)
uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t max_residual_bps)
{
const uint32_t default_partition_samples = (residual_samples + predictor_order) >> max_partition_order;
uint32_t partitions = 1u << max_partition_order;
@ -71,7 +71,7 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual
const uint32_t threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples);
uint32_t partition, residual_sample, end = (uint32_t)(-(int32_t)predictor_order);
if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) {
if(max_residual_bps < threshold) {
for(partition = residual_sample = 0; partition < partitions; partition++) {
__m128i mm_sum = _mm_setzero_si128();
uint32_t e1, e3;
@ -106,7 +106,7 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual
#endif
}
}
else { /* have to pessimistically use 64 bits for accumulator */
else if(max_residual_bps < 32) { /* have to pessimistically use 64 bits for accumulator */
for(partition = residual_sample = 0; partition < partitions; partition++) {
__m128i mm_sum = _mm_setzero_si128();
uint32_t e1, e3;
@ -135,6 +135,19 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual
_mm_storel_epi64((__m128i*)(void*)(abs_residual_partition_sums+partition), mm_sum);
}
}
else { /* must handle abs(INT32_MIN) */
for(partition = residual_sample = 0; partition < partitions; partition++) {
FLAC__uint64 abs_residual_partition_sum64 = 0;
end += default_partition_samples;
for( ; residual_sample < end; residual_sample++)
if(residual[residual_sample] == INT32_MIN)
abs_residual_partition_sum64 -= (FLAC__int64)INT32_MIN;
else
abs_residual_partition_sum64 += abs(residual[residual_sample]);
abs_residual_partition_sums[partition] = abs_residual_partition_sum64;
}
}
}
/* now merge partitions for lower orders */

View File

@ -48,7 +48,7 @@
FLAC__SSE_TARGET("ssse3")
void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps)
uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t max_residual_bps)
{
const uint32_t default_partition_samples = (residual_samples + predictor_order) >> max_partition_order;
uint32_t partitions = 1u << max_partition_order;
@ -60,7 +60,7 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residua
const uint32_t threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples);
uint32_t partition, residual_sample, end = (uint32_t)(-(int32_t)predictor_order);
if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) {
if(max_residual_bps < threshold) {
for(partition = residual_sample = 0; partition < partitions; partition++) {
__m128i mm_sum = _mm_setzero_si128();
uint32_t e1, e3;
@ -95,7 +95,7 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residua
#endif
}
}
else { /* have to pessimistically use 64 bits for accumulator */
else if(max_residual_bps < 32) { /* have to pessimistically use 64 bits for accumulator */
for(partition = residual_sample = 0; partition < partitions; partition++) {
__m128i mm_sum = _mm_setzero_si128();
uint32_t e1, e3;
@ -124,6 +124,18 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residua
_mm_storel_epi64((__m128i*)(void*)(abs_residual_partition_sums+partition), mm_sum);
}
}
else { /* must handle abs(INT32_MIN) */
for(partition = residual_sample = 0; partition < partitions; partition++) {
FLAC__uint64 abs_residual_partition_sum64 = 0;
end += default_partition_samples;
for( ; residual_sample < end; residual_sample++)
if(residual[residual_sample] == INT32_MIN)
abs_residual_partition_sum64 -= (FLAC__int64)INT32_MIN;
else
abs_residual_partition_sum64 += abs(residual[residual_sample]);
abs_residual_partition_sums[partition] = abs_residual_partition_sum64;
}
}
}
/* now merge partitions for lower orders */