audio: Remove 5.1->X SIMD converters, add SSE mono->stereo.

The 5.1 versions didn't use the new algorithm, and making that new algorithm work took so many permutes that it was significantly slower than just using the scalar versions. However, mono-to-stereo is an extremely common conversion, and it's trivial to accelerate it with plain SSE, so that was added!
2022-07-20 16:39:19 -04:00 · 2022-07-20 16:39:19 -04:00 · 9f56c7cf02
parent b83ae9f24c
commit 9f56c7cf02
1 changed files with 46 additions and 205 deletions
--- a/src/audio/SDL_audiocvt.c
+++ b/src/audio/SDL_audiocvt.c
@ -93,196 +93,6 @@
 * 8 channels (7.1) layout: FL+FR+FC+LFE+BL+BR+SL+SR
 */
 #if 0  /* !!! FIXME: these need to be updated to match the new scalar code. */
 #if HAVE_AVX_INTRINSICS
 /* MSVC will always accept AVX intrinsics when compiling for x64 */
 #if defined(__clang__) || defined(__GNUC__)
 __attribute__((target("avx")))
 #endif
 /* Convert from 5.1 to stereo. Average left and right, distribute center, discard LFE. */
 static void SDLCALL
 SDL_Convert51ToStereo_AVX(SDL_AudioCVT * cvt, SDL_AudioFormat format)
 {
    float *dst = (float *) cvt->buf;
    const float *src = dst;
    int i = cvt->len_cvt / (sizeof (float) * 6);
    const float two_fifths_f = 1.0f / 2.5f;
    const __m256 two_fifths_v = _mm256_set1_ps(two_fifths_f);
    const __m256 half = _mm256_set1_ps(0.5f);
    LOG_DEBUG_CONVERT("5.1", "stereo (using AVX)");
    SDL_assert(format == AUDIO_F32SYS);
    /* SDL's 5.1 layout: FL+FR+FC+LFE+BL+BR */
    while (i >= 4) {
        __m256 in0 = _mm256_loadu_ps(src + 0);  /* 0FL 0FR 0FC 0LF 0BL 0BR 1FL 1FR */
        __m256 in1 = _mm256_loadu_ps(src + 8);  /* 1FC 1LF 1BL 1BR 2FL 2FR 2FC 2LF */
        __m256 in2 = _mm256_loadu_ps(src + 16); /* 2BL 2BR 3FL 3FR 3FC 3LF 3BL 3BR */
        /* 0FL 0FR 0FC 0LF 2FL 2FR 2FC 2LF */
        __m256 temp0 = _mm256_blend_ps(in0, in1, 0xF0);
        /* 1FC 1LF 1BL 1BR 3FC 3LF 3BL 3BR */
        __m256 temp1 = _mm256_blend_ps(in1, in2, 0xF0);
        /* 0FC 0FC 1FC 1FC 2FC 2FC 3FC 3FC */
        __m256 fc_distributed = _mm256_mul_ps(half, _mm256_shuffle_ps(temp0, temp1, _MM_SHUFFLE(0, 0, 2, 2)));
        /* 0FL 0FR 1BL 1BR 2FL 2FR 3BL 3BR */
        __m256 permuted0 = _mm256_blend_ps(temp0, temp1, 0xCC);
        /* 0BL 0BR 1FL 1FR 2BL 2BR 3FL 3FR */
        __m256 permuted1 = _mm256_permute2f128_ps(in0, in2, 0x21);
        /*   0FL 0FR 1BL 1BR 2FL 2FR 3BL 3BR */
        /* + 0BL 0BR 1FL 1FR 2BL 2BR 3FL 3FR */
        /* =  0L  0R  1L  1R  2L  2R  3L  3R */
        __m256 out = _mm256_add_ps(permuted0, permuted1);
        out = _mm256_add_ps(out, fc_distributed);
        out = _mm256_mul_ps(out, two_fifths_v);
        _mm256_storeu_ps(dst, out);
        i -= 4; src += 24; dst += 8;
    }
    /* Finish off any leftovers with scalar operations. */
    while (i) {
        const float front_center_distributed = src[2] * 0.5f;
        dst[0] = (src[0] + front_center_distributed + src[4]) * two_fifths_f;  /* left */
        dst[1] = (src[1] + front_center_distributed + src[5]) * two_fifths_f;  /* right */
        i--; src += 6; dst+=2;
    }
    cvt->len_cvt /= 3;
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index] (cvt, format);
    }
 }
 #endif
 #if HAVE_SSE_INTRINSICS
 /* Convert from 5.1 to stereo. Average left and right, distribute center, discard LFE. */
 static void SDLCALL
 SDL_Convert51ToStereo_SSE(SDL_AudioCVT * cvt, SDL_AudioFormat format)
 {
    float *dst = (float *) cvt->buf;
    const float *src = dst;
    int i = cvt->len_cvt / (sizeof (float) * 6);
    const float two_fifths_f = 1.0f / 2.5f;
    const __m128 two_fifths_v = _mm_set1_ps(two_fifths_f);
    const __m128 half = _mm_set1_ps(0.5f);
    LOG_DEBUG_CONVERT("5.1", "stereo (using SSE)");
    SDL_assert(format == AUDIO_F32SYS);
    /* SDL's 5.1 layout: FL+FR+FC+LFE+BL+BR */
    /* Just use unaligned load/stores, if the memory at runtime is */
    /* aligned it'll be just as fast on modern processors */
    while (i >= 2) {
        /* Two 5.1 samples (12 floats) fit nicely in three 128bit */
        /* registers. Using shuffles they can be rearranged so that */
        /* the conversion math can be vectorized. */
        __m128 in0 = _mm_loadu_ps(src);     /* 0FL 0FR 0FC 0LF */
        __m128 in1 = _mm_loadu_ps(src + 4); /* 0BL 0BR 1FL 1FR */
        __m128 in2 = _mm_loadu_ps(src + 8); /* 1FC 1LF 1BL 1BR */
        /* 0FC 0FC 1FC 1FC */
        __m128 fc_distributed = _mm_mul_ps(half, _mm_shuffle_ps(in0, in2, _MM_SHUFFLE(0, 0, 2, 2)));
        /* 0FL 0FR 1BL 1BR */
        __m128 blended = _mm_shuffle_ps(in0, in2, _MM_SHUFFLE(3, 2, 1, 0));
        /*   0FL 0FR 1BL 1BR */
        /* + 0BL 0BR 1FL 1FR */
        /* =  0L  0R  1L  1R */
        __m128 out = _mm_add_ps(blended, in1);
        out = _mm_add_ps(out, fc_distributed);
        out = _mm_mul_ps(out, two_fifths_v);
        _mm_storeu_ps(dst, out);
        i -= 2; src += 12; dst += 4;
    }
    /* Finish off any leftovers with scalar operations. */
    while (i) {
        const float front_center_distributed = src[2] * 0.5f;
        dst[0] = (src[0] + front_center_distributed + src[4]) * two_fifths_f;  /* left */
        dst[1] = (src[1] + front_center_distributed + src[5]) * two_fifths_f;  /* right */
        i--; src += 6; dst+=2;
    }
    cvt->len_cvt /= 3;
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index] (cvt, format);
    }
 }
 #endif
 #if HAVE_NEON_INTRINSICS
 /* Convert from 5.1 to stereo. Average left and right, distribute center, discard LFE. */
 static void SDLCALL
 SDL_Convert51ToStereo_NEON(SDL_AudioCVT * cvt, SDL_AudioFormat format)
 {
    float *dst = (float *) cvt->buf;
    const float *src = dst;
    int i = cvt->len_cvt / (sizeof (float) * 6);
    const float two_fifths_f = 1.0f / 2.5f;
    const float32x4_t two_fifths_v = vdupq_n_f32(two_fifths_f);
    const float32x4_t half = vdupq_n_f32(0.5f);
    LOG_DEBUG_CONVERT("5.1", "stereo (using NEON)");
    SDL_assert(format == AUDIO_F32SYS);
    /* SDL's 5.1 layout: FL+FR+FC+LFE+BL+BR */
    /* Just use unaligned load/stores, it's the same NEON instructions and
       hopefully even unaligned NEON is faster than the scalar fallback. */
    while (i >= 2) {
        /* Two 5.1 samples (12 floats) fit nicely in three 128bit */
        /* registers. Using shuffles they can be rearranged so that */
        /* the conversion math can be vectorized. */
        const float32x4_t in0 = vld1q_f32(src);     /* 0FL 0FR 0FC 0LF */
        const float32x4_t in1 = vld1q_f32(src + 4); /* 0BL 0BR 1FL 1FR */
        const float32x4_t in2 = vld1q_f32(src + 8); /* 1FC 1LF 1BL 1BR */
        /* 0FC 0FC 1FC 1FC */
        const float32x4_t fc_distributed = vmulq_f32(half, vcombine_f32(vdup_lane_f32(vget_high_f32(in0), 0), vdup_lane_f32(vget_low_f32(in2), 0)));
        /* 0FL 0FR 1BL 1BR */
        const float32x4_t blended = vcombine_f32(vget_low_f32(in0), vget_high_f32(in2));
        /*   0FL 0FR 1BL 1BR */
        /* + 0BL 0BR 1FL 1FR */
        /* =  0L  0R  1L  1R */
        float32x4_t out = vaddq_f32(blended, in1);
        out = vaddq_f32(out, fc_distributed);
        out = vmulq_f32(out, two_fifths_v);
        vst1q_f32(dst, out);
        i -= 2; src += 12; dst += 4;
    }
    /* Finish off any leftovers with scalar operations. */
    while (i) {
        const float front_center_distributed = src[2] * 0.5f;
        dst[0] = (src[0] + front_center_distributed + src[4]) * two_fifths_f;  /* left */
        dst[1] = (src[1] + front_center_distributed + src[5]) * two_fifths_f;  /* right */
        i--; src += 6; dst+=2;
    }
    cvt->len_cvt /= 3;
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index] (cvt, format);
    }
 }
 #endif
 #endif
 #if HAVE_SSE3_INTRINSICS
 /* Convert from stereo to mono. Average left and right. */
 static void SDLCALL
@ -317,6 +127,45 @@ SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT * cvt, SDL_AudioFormat format)
 }
 #endif
 #if HAVE_SSE_INTRINSICS
 /* Convert from mono to stereo. Duplicate to stereo left and right. */
 static void SDLCALL
 SDL_ConvertMonoToStereo_SSE(SDL_AudioCVT * cvt, SDL_AudioFormat format)
 {
    float *dst = ((float *) (cvt->buf + (cvt->len_cvt * 2))) - 8;
    const float *src = ((const float *) (cvt->buf + cvt->len_cvt)) - 4;
    int i = cvt->len_cvt / sizeof (float);
    LOG_DEBUG_CONVERT("mono", "stereo (using SSE)");
    SDL_assert(format == AUDIO_F32SYS);
    /* Do SSE blocks as long as we have 16 bytes available.
       Just use unaligned load/stores, if the memory at runtime is
       aligned it'll be just as fast on modern processors */
    /* convert backwards, since output is growing in-place. */
    while (i >= 4) {   /* 4 * float32 */
        const __m128 input = _mm_loadu_ps(src);   /* A B C D */
        _mm_storeu_ps(dst, _mm_unpacklo_ps(input, input));  /* A A B B */
        _mm_storeu_ps(dst+4, _mm_unpackhi_ps(input, input));  /* C C D D */
        i -= 4; src -= 4; dst -= 8;
    }
    /* Finish off any leftovers with scalar operations. */
    src += 3; dst += 6;  /* adjust for smaller buffers. */
    while (i) {     /* convert backwards, since output is growing in-place. */
        const float srcFC = src[0];
        dst[1] /* FR */ = srcFC;
        dst[0] /* FL */ = srcFC;
        i--; src--; dst -= 2;
    }
    cvt->len_cvt *= 2;
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index] (cvt, format);
    }
 }
 #endif
 /* Include the autogenerated channel converters... */
 #include "SDL_audio_channel_converters.h"
@ -833,26 +682,18 @@ SDL_BuildAudioCVT(SDL_AudioCVT * cvt,
        return SDL_SetError("Invalid channel combination");
    } else if (channel_converter != NULL) {
        /* swap in some SIMD versions for a few of these. */
-        if (channel_converter == SDL_Convert51ToStereo) {
+        if (channel_converter == SDL_ConvertStereoToMono) {
            SDL_AudioFilter filter = NULL;
 #if 0 /* !!! FIXME: these have not been updated for the new formulas */
            #if HAVE_AVX_INTRINSICS
            if (!filter && SDL_HasAVX()) { filter = SDL_Convert51ToStereo_AVX; }
            #endif
            #if HAVE_SSE_INTRINSICS
            if (!filter && SDL_HasSSE()) { filter = SDL_Convert51ToStereo_SSE; }
            #endif
            #if HAVE_NEON_INTRINSICS
            if (!filter && SDL_HasNEON()) { filter = SDL_Convert51ToStereo_NEON; }
            #endif
 #endif
            if (filter) { channel_converter = filter; }
        } else if (channel_converter == SDL_ConvertStereoToMono) {
            SDL_AudioFilter filter = NULL;
            #if HAVE_SSE3_INTRINSICS
            if (!filter && SDL_HasSSE3()) { filter = SDL_ConvertStereoToMono_SSE3; }
            #endif
            if (filter) { channel_converter = filter; }
        } else if (channel_converter == SDL_ConvertMonoToStereo) {
            SDL_AudioFilter filter = NULL;
            #if HAVE_SSE_INTRINSICS
            if (!filter && SDL_HasSSE()) { filter = SDL_ConvertMonoToStereo_SSE; }
            #endif
            if (filter) { channel_converter = filter; }
        }
        if (SDL_AddAudioCVTFilter(cvt, channel_converter) < 0) {