/**
 * FreeRDP: A Remote Desktop Protocol Implementation
 * Digital Sound Processing - FFMPEG backend
 *
 * Copyright 2018 Armin Novak
 * Copyright 2018 Thincast Technologies GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <freerdp/config.h>

#include <math.h>
#include <limits.h>

#include <winpr/stream.h>

#include <freerdp/log.h>
#include <freerdp/codec/audio.h>

#include <libavcodec/avcodec.h>
#include <libavutil/channel_layout.h>

#if defined(SWRESAMPLE_FOUND)
#include <libswresample/swresample.h>
#elif defined(AVRESAMPLE_FOUND)
#include <libavresample/avresample.h>
#else
#error "libswresample or libavresample required"
#endif

#include "dsp.h"
#include "dsp_ffmpeg.h"

#define TAG FREERDP_TAG("dsp.ffmpeg")

struct S_FREERDP_DSP_CONTEXT
{
	AUDIO_FORMAT format;

	BOOL isOpen;
	BOOL encoder;

	UINT32 bufferedSamples;

	enum AVCodecID id;
	AVCodec* codec;
	AVCodecContext* context;
	AVFrame* frame;
	AVFrame* resampled;
	AVFrame* buffered;
	AVPacket* packet;
#if defined(SWRESAMPLE_FOUND)
	SwrContext* rcontext;
#else
	AVAudioResampleContext* rcontext;
#endif
	wStream* channelmix;
};

static BOOL ffmpeg_codec_is_filtered(enum AVCodecID id, BOOL encoder)
{
	switch (id)
	{
#if !defined(WITH_DSP_EXPERIMENTAL)
		case AV_CODEC_ID_ADPCM_IMA_OKI:
		case AV_CODEC_ID_MP3:
		case AV_CODEC_ID_ADPCM_MS:
		case AV_CODEC_ID_G723_1:
			return TRUE;
#endif

		case AV_CODEC_ID_NONE:
			return TRUE;

		case AV_CODEC_ID_GSM_MS:
		case AV_CODEC_ID_AAC:
		case AV_CODEC_ID_AAC_LATM:
#if !defined(WITH_DSP_EXPERIMENTAL)
			if (encoder)
				return TRUE;
#endif
			return FALSE;

		default:
			return FALSE;
	}
}

static enum AVCodecID ffmpeg_get_avcodec(const AUDIO_FORMAT* WINPR_RESTRICT format)
{
	if (!format)
		return AV_CODEC_ID_NONE;

	switch (format->wFormatTag)
	{
		case WAVE_FORMAT_UNKNOWN:
			return AV_CODEC_ID_NONE;

		case WAVE_FORMAT_PCM:
			switch (format->wBitsPerSample)
			{
				case 16:
					return AV_CODEC_ID_PCM_U16LE;

				case 8:
					return AV_CODEC_ID_PCM_U8;

				default:
					return AV_CODEC_ID_NONE;
			}

		case WAVE_FORMAT_DVI_ADPCM:
			return AV_CODEC_ID_ADPCM_IMA_OKI;

		case WAVE_FORMAT_ADPCM:
			return AV_CODEC_ID_ADPCM_MS;

		case WAVE_FORMAT_ALAW:
			return AV_CODEC_ID_PCM_ALAW;

		case WAVE_FORMAT_MULAW:
			return AV_CODEC_ID_PCM_MULAW;

		case WAVE_FORMAT_GSM610:
			return AV_CODEC_ID_GSM_MS;

		case WAVE_FORMAT_MSG723:
			return AV_CODEC_ID_G723_1;

		case WAVE_FORMAT_AAC_MS:
			return AV_CODEC_ID_AAC;

		case WAVE_FORMAT_OPUS:
			return AV_CODEC_ID_OPUS;

		default:
			return AV_CODEC_ID_NONE;
	}
}

static int ffmpeg_sample_format(const AUDIO_FORMAT* WINPR_RESTRICT format)
{
	switch (format->wFormatTag)
	{
		case WAVE_FORMAT_PCM:
			switch (format->wBitsPerSample)
			{
				case 8:
					return AV_SAMPLE_FMT_U8;

				case 16:
					return AV_SAMPLE_FMT_S16;

				default:
					return FALSE;
			}

		case WAVE_FORMAT_DVI_ADPCM:
		case WAVE_FORMAT_ADPCM:
			return AV_SAMPLE_FMT_S16P;

		case WAVE_FORMAT_MPEGLAYER3:
		case WAVE_FORMAT_AAC_MS:
			return AV_SAMPLE_FMT_FLTP;

		case WAVE_FORMAT_OPUS:
			return AV_SAMPLE_FMT_S16;

		case WAVE_FORMAT_MSG723:
		case WAVE_FORMAT_GSM610:
			return AV_SAMPLE_FMT_S16P;

		case WAVE_FORMAT_ALAW:
			return AV_SAMPLE_FMT_S16;

		default:
			return FALSE;
	}
}
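/*
 * Illustrative sketch (compiled out): how the two lookup helpers above combine for one
 * concrete format. The field values below are assumptions chosen for the example; only
 * wFormatTag influences the lookups for GSM 6.10.
 */
#if 0
static void example_lookup_gsm610(void)
{
	AUDIO_FORMAT fmt = { 0 };
	fmt.wFormatTag = WAVE_FORMAT_GSM610;
	fmt.nChannels = 1;
	fmt.nSamplesPerSec = 8000;

	const enum AVCodecID id = ffmpeg_get_avcodec(&fmt);  /* AV_CODEC_ID_GSM_MS */
	const int sampleFormat = ffmpeg_sample_format(&fmt); /* AV_SAMPLE_FMT_S16P */
	(void)id;
	(void)sampleFormat;
}
#endif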
static void ffmpeg_close_context(FREERDP_DSP_CONTEXT* WINPR_RESTRICT context)
{
	if (context)
	{
		if (context->context)
			avcodec_free_context(&context->context);

		if (context->frame)
			av_frame_free(&context->frame);

		if (context->resampled)
			av_frame_free(&context->resampled);

		if (context->buffered)
			av_frame_free(&context->buffered);

		if (context->packet)
			av_packet_free(&context->packet);

		if (context->rcontext)
		{
#if defined(SWRESAMPLE_FOUND)
			swr_free(&context->rcontext);
#else
			avresample_free(&context->rcontext);
#endif
		}

		context->id = AV_CODEC_ID_NONE;
		context->codec = NULL;
		context->isOpen = FALSE;
		context->context = NULL;
		context->frame = NULL;
		context->resampled = NULL;
		context->packet = NULL;
		context->rcontext = NULL;
	}
}

static BOOL ffmpeg_open_context(FREERDP_DSP_CONTEXT* WINPR_RESTRICT context)
{
	int ret = 0;

	if (!context || context->isOpen)
		return FALSE;

	const AUDIO_FORMAT* format = &context->format;

	if (!format)
		return FALSE;

#if LIBAVUTIL_VERSION_INT < AV_VERSION_INT(57, 28, 100)
	const int layout = av_get_default_channel_layout(format->nChannels);
#endif
	context->id = ffmpeg_get_avcodec(format);

	if (ffmpeg_codec_is_filtered(context->id, context->encoder))
		goto fail;

	if (context->encoder)
		context->codec = avcodec_find_encoder(context->id);
	else
		context->codec = avcodec_find_decoder(context->id);

	if (!context->codec)
		goto fail;

	context->context = avcodec_alloc_context3(context->codec);

	if (!context->context)
		goto fail;

	switch (context->id)
	{
		/* We need support for multichannel and sample rates != 8000 */
		case AV_CODEC_ID_GSM_MS:
			context->context->strict_std_compliance = FF_COMPLIANCE_UNOFFICIAL;
			break;

		case AV_CODEC_ID_AAC:
			context->context->profile = FF_PROFILE_AAC_MAIN;
			break;

		default:
			break;
	}

	context->context->max_b_frames = 1;
	context->context->delay = 0;
#if LIBAVUTIL_VERSION_INT < AV_VERSION_INT(57, 28, 100)
	context->context->channels = format->nChannels;
	context->context->channel_layout = layout;
#else
	av_channel_layout_default(&context->context->ch_layout, format->nChannels);
#endif
	context->context->sample_rate = format->nSamplesPerSec;
	context->context->block_align = format->nBlockAlign;
	context->context->bit_rate = format->nAvgBytesPerSec * 8;
	context->context->sample_fmt = ffmpeg_sample_format(format);
	context->context->time_base = av_make_q(1, context->context->sample_rate);

	if ((ret = avcodec_open2(context->context, context->codec, NULL)) < 0)
	{
		const char* err = av_err2str(ret);
		WLog_ERR(TAG, "Error avcodec_open2 %s [%d]", err, ret);
		goto fail;
	}

	context->packet = av_packet_alloc();

	if (!context->packet)
		goto fail;

	context->frame = av_frame_alloc();

	if (!context->frame)
		goto fail;

	context->resampled = av_frame_alloc();

	if (!context->resampled)
		goto fail;

	context->buffered = av_frame_alloc();

	if (!context->buffered)
		goto fail;

#if defined(SWRESAMPLE_FOUND)
	context->rcontext = swr_alloc();
#else
	context->rcontext = avresample_alloc_context();
#endif

	if (!context->rcontext)
		goto fail;

#if LIBAVUTIL_VERSION_INT < AV_VERSION_INT(57, 28, 100)
	context->frame->channel_layout = layout;
	context->frame->channels = format->nChannels;
#else
	av_channel_layout_default(&context->frame->ch_layout, format->nChannels);
#endif
	context->frame->sample_rate = format->nSamplesPerSec;
	context->frame->format = AV_SAMPLE_FMT_S16;

	if (context->encoder)
	{
		context->resampled->format = context->context->sample_fmt;
		context->resampled->sample_rate = context->context->sample_rate;
	}
	else
	{
		context->resampled->format = AV_SAMPLE_FMT_S16;
		context->resampled->sample_rate = format->nSamplesPerSec;
	}

#if LIBAVUTIL_VERSION_INT < AV_VERSION_INT(57, 28, 100)
	context->resampled->channel_layout = layout;
	context->resampled->channels = format->nChannels;
#else
	av_channel_layout_default(&context->resampled->ch_layout, format->nChannels);
#endif

	if (context->context->frame_size > 0)
	{
#if LIBAVUTIL_VERSION_INT < AV_VERSION_INT(57, 28, 100)
		context->buffered->channel_layout = context->resampled->channel_layout;
		context->buffered->channels = context->resampled->channels;
#else
		av_channel_layout_copy(&context->buffered->ch_layout, &context->resampled->ch_layout);
#endif
		context->buffered->format = context->resampled->format;
		context->buffered->nb_samples = context->context->frame_size;

		if ((ret = av_frame_get_buffer(context->buffered, 1)) < 0)
			goto fail;
	}

	context->isOpen = TRUE;
	return TRUE;
fail:
	ffmpeg_close_context(context);
	return FALSE;
}
#if defined(SWRESAMPLE_FOUND)
static BOOL ffmpeg_resample_frame(SwrContext* WINPR_RESTRICT context, AVFrame* WINPR_RESTRICT in,
                                  AVFrame* WINPR_RESTRICT out)
{
	int ret = 0;

	if (!swr_is_initialized(context))
	{
		if ((ret = swr_config_frame(context, out, in)) < 0)
		{
			const char* err = av_err2str(ret);
			WLog_ERR(TAG, "Error during resampling %s [%d]", err, ret);
			return FALSE;
		}

		if ((ret = (swr_init(context))) < 0)
		{
			const char* err = av_err2str(ret);
			WLog_ERR(TAG, "Error during resampling %s [%d]", err, ret);
			return FALSE;
		}
	}

	if ((ret = swr_convert_frame(context, out, in)) < 0)
	{
		const char* err = av_err2str(ret);
		WLog_ERR(TAG, "Error during resampling %s [%d]", err, ret);
		return FALSE;
	}

	return TRUE;
}
#else
static BOOL ffmpeg_resample_frame(AVAudioResampleContext* WINPR_RESTRICT context,
                                  AVFrame* WINPR_RESTRICT in, AVFrame* WINPR_RESTRICT out)
{
	int ret;

	if (!avresample_is_open(context))
	{
		if ((ret = avresample_config(context, out, in)) < 0)
		{
			const char* err = av_err2str(ret);
			WLog_ERR(TAG, "Error during resampling %s [%d]", err, ret);
			return FALSE;
		}

		if ((ret = (avresample_open(context))) < 0)
		{
			const char* err = av_err2str(ret);
			WLog_ERR(TAG, "Error during resampling %s [%d]", err, ret);
			return FALSE;
		}
	}

	if ((ret = avresample_convert_frame(context, out, in)) < 0)
	{
		const char* err = av_err2str(ret);
		WLog_ERR(TAG, "Error during resampling %s [%d]", err, ret);
		return FALSE;
	}

	return TRUE;
}
#endif

static BOOL ffmpeg_encode_frame(AVCodecContext* WINPR_RESTRICT context, AVFrame* WINPR_RESTRICT in,
                                AVPacket* WINPR_RESTRICT packet, wStream* WINPR_RESTRICT out)
{
	if (in->format == AV_SAMPLE_FMT_FLTP)
	{
		/* Clamp NaN and infinity to valid float sample values before encoding. */
		uint8_t** pp = in->extended_data;
#if LIBAVUTIL_VERSION_INT < AV_VERSION_INT(57, 28, 100)
		const int nr_channels = in->channels;
#else
		const int nr_channels = in->ch_layout.nb_channels;
#endif

		for (int y = 0; y < nr_channels; y++)
		{
			float* data = (float*)pp[y];

			for (int x = 0; x < in->nb_samples; x++)
			{
				const float val1 = data[x];

				if (isnan(val1))
					data[x] = 0.0f;
				else if (isinf(val1))
				{
					if (val1 < 0.0f)
						data[x] = -1.0f;
					else
						data[x] = 1.0f;
				}
			}
		}
	}

	/* send the frame with the raw audio data to the encoder */
	int ret = avcodec_send_frame(context, in);

	if (ret < 0)
	{
		const char* err = av_err2str(ret);
		WLog_ERR(TAG, "Error submitting the packet to the encoder %s [%d]", err, ret);
		return FALSE;
	}

	/* read all the output packets (in general there may be any number of them) */
	while (ret >= 0)
	{
		ret = avcodec_receive_packet(context, packet);

		if ((ret == AVERROR(EAGAIN)) || (ret == AVERROR_EOF))
			return TRUE;
		else if (ret < 0)
		{
			const char* err = av_err2str(ret);
			WLog_ERR(TAG, "Error during encoding %s [%d]", err, ret);
			return FALSE;
		}

		if (!Stream_EnsureRemainingCapacity(out, packet->size))
			return FALSE;

		Stream_Write(out, packet->data, packet->size);
		av_packet_unref(packet);
	}

	return TRUE;
}
static BOOL ffmpeg_fill_frame(AVFrame* WINPR_RESTRICT frame,
                              const AUDIO_FORMAT* WINPR_RESTRICT inputFormat,
                              const BYTE* WINPR_RESTRICT data, size_t size)
{
	int ret = 0;
	int bpp = 0;
#if LIBAVUTIL_VERSION_INT < AV_VERSION_INT(57, 28, 100)
	frame->channels = inputFormat->nChannels;
	frame->channel_layout = av_get_default_channel_layout(frame->channels);
#else
	av_channel_layout_default(&frame->ch_layout, inputFormat->nChannels);
#endif
	frame->sample_rate = inputFormat->nSamplesPerSec;
	frame->format = ffmpeg_sample_format(inputFormat);

	bpp = av_get_bytes_per_sample(frame->format);
	frame->nb_samples = (int)(size / inputFormat->nChannels / (size_t)bpp);

	if ((ret = avcodec_fill_audio_frame(frame, inputFormat->nChannels, frame->format, data,
	                                    (int)size, 1)) < 0)
	{
		const char* err = av_err2str(ret);
		WLog_ERR(TAG, "Error during audio frame fill %s [%d]", err, ret);
		return FALSE;
	}

	return TRUE;
}

#if defined(SWRESAMPLE_FOUND)
static BOOL ffmpeg_decode(AVCodecContext* WINPR_RESTRICT dec_ctx, AVPacket* WINPR_RESTRICT pkt,
                          AVFrame* WINPR_RESTRICT frame,
                          SwrContext* WINPR_RESTRICT resampleContext,
                          AVFrame* WINPR_RESTRICT resampled, wStream* WINPR_RESTRICT out)
#else
static BOOL ffmpeg_decode(AVCodecContext* dec_ctx, AVPacket* pkt, AVFrame* frame,
                          AVAudioResampleContext* resampleContext, AVFrame* resampled,
                          wStream* out)
#endif
{
	int ret = 0;

	/* send the packet with the compressed data to the decoder */
	ret = avcodec_send_packet(dec_ctx, pkt);

	if (ret < 0)
	{
		const char* err = av_err2str(ret);
		WLog_ERR(TAG, "Error submitting the packet to the decoder %s [%d]", err, ret);
		return FALSE;
	}

	/* read all the output frames (in general there may be any number of them) */
	while (ret >= 0)
	{
		ret = avcodec_receive_frame(dec_ctx, frame);

		if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF)
			return TRUE;
		else if (ret < 0)
		{
			const char* err = av_err2str(ret);
			WLog_ERR(TAG, "Error during decoding %s [%d]", err, ret);
			return FALSE;
		}

#if defined(SWRESAMPLE_FOUND)
		if (!swr_is_initialized(resampleContext))
		{
			if ((ret = swr_config_frame(resampleContext, resampled, frame)) < 0)
			{
#else
		if (!avresample_is_open(resampleContext))
		{
			if ((ret = avresample_config(resampleContext, resampled, frame)) < 0)
			{
#endif
				const char* err = av_err2str(ret);
				WLog_ERR(TAG, "Error during resampling %s [%d]", err, ret);
				return FALSE;
			}

#if defined(SWRESAMPLE_FOUND)
			if ((ret = (swr_init(resampleContext))) < 0)
#else
			if ((ret = (avresample_open(resampleContext))) < 0)
#endif
			{
				const char* err = av_err2str(ret);
				WLog_ERR(TAG, "Error during resampling %s [%d]", err, ret);
				return FALSE;
			}
		}

#if defined(SWRESAMPLE_FOUND)
		if ((ret = swr_convert_frame(resampleContext, resampled, frame)) < 0)
#else
		if ((ret = avresample_convert_frame(resampleContext, resampled, frame)) < 0)
#endif
		{
			const char* err = av_err2str(ret);
			WLog_ERR(TAG, "Error during resampling %s [%d]", err, ret);
			return FALSE;
		}

		{
#if LIBAVUTIL_VERSION_INT < AV_VERSION_INT(57, 28, 100)
			const size_t channels = resampled->channels;
#else
			const size_t channels = resampled->ch_layout.nb_channels;
#endif
			const size_t data_size = channels * resampled->nb_samples * 2;

			if (!Stream_EnsureRemainingCapacity(out, data_size))
				return FALSE;

			Stream_Write(out, resampled->data[0], data_size);
		}
	}

	return TRUE;
}

BOOL freerdp_dsp_ffmpeg_supports_format(const AUDIO_FORMAT* WINPR_RESTRICT format, BOOL encode)
{
	enum AVCodecID id = ffmpeg_get_avcodec(format);

	if (ffmpeg_codec_is_filtered(id, encode))
		return FALSE;

	if (encode)
		return avcodec_find_encoder(id) != NULL;
	else
		return avcodec_find_decoder(id) != NULL;
}
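/*
 * Illustrative sketch (compiled out): probing whether this backend can encode MS-AAC
 * before constructing a context. The format values below are assumptions chosen for
 * the example; on builds without WITH_DSP_EXPERIMENTAL this probe returns FALSE.
 */
#if 0
static BOOL example_can_encode_aac(void)
{
	AUDIO_FORMAT fmt = { 0 };
	fmt.wFormatTag = WAVE_FORMAT_AAC_MS;
	fmt.nChannels = 2;
	fmt.nSamplesPerSec = 44100;
	fmt.wBitsPerSample = 16;
	fmt.nBlockAlign = fmt.nChannels * fmt.wBitsPerSample / 8;
	fmt.nAvgBytesPerSec = fmt.nSamplesPerSec * fmt.nBlockAlign;

	return freerdp_dsp_ffmpeg_supports_format(&fmt, TRUE);
}
#endif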
FREERDP_DSP_CONTEXT* freerdp_dsp_ffmpeg_context_new(BOOL encode)
{
	FREERDP_DSP_CONTEXT* context = NULL;

#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(58, 10, 100)
	avcodec_register_all();
#endif
	context = calloc(1, sizeof(FREERDP_DSP_CONTEXT));

	if (!context)
		return NULL;

	context->channelmix = Stream_New(NULL, 1024);

	if (!context->channelmix)
	{
		WINPR_PRAGMA_DIAG_PUSH
		WINPR_PRAGMA_DIAG_IGNORED_MISMATCHED_DEALLOC
		freerdp_dsp_ffmpeg_context_free(context);
		WINPR_PRAGMA_DIAG_POP
		return NULL;
	}

	context->encoder = encode;
	return context;
}

void freerdp_dsp_ffmpeg_context_free(FREERDP_DSP_CONTEXT* context)
{
	if (context)
	{
		ffmpeg_close_context(context);
		Stream_Free(context->channelmix, TRUE);
		free(context);
	}
}

BOOL freerdp_dsp_ffmpeg_context_reset(FREERDP_DSP_CONTEXT* WINPR_RESTRICT context,
                                      const AUDIO_FORMAT* WINPR_RESTRICT targetFormat)
{
	if (!context || !targetFormat)
		return FALSE;

	ffmpeg_close_context(context);
	context->format = *targetFormat;
	return ffmpeg_open_context(context);
}

static BOOL freerdp_dsp_channel_mix(FREERDP_DSP_CONTEXT* WINPR_RESTRICT context,
                                    const BYTE* WINPR_RESTRICT src, size_t size,
                                    const AUDIO_FORMAT* WINPR_RESTRICT srcFormat,
                                    const BYTE** WINPR_RESTRICT data,
                                    size_t* WINPR_RESTRICT length,
                                    AUDIO_FORMAT* WINPR_RESTRICT dstFormat)
{
	UINT32 bpp = 0;
	size_t samples = 0;

	if (!context || !data || !length || !dstFormat)
		return FALSE;

	if (srcFormat->wFormatTag != WAVE_FORMAT_PCM)
		return FALSE;

	bpp = srcFormat->wBitsPerSample > 8 ? 2 : 1;
	samples = size / bpp / srcFormat->nChannels;
	*dstFormat = *srcFormat;

	if (context->format.nChannels == srcFormat->nChannels)
	{
		*data = src;
		*length = size;
		return TRUE;
	}

	Stream_SetPosition(context->channelmix, 0);

	/* Destination has more channels than source */
	if (context->format.nChannels > srcFormat->nChannels)
	{
		switch (srcFormat->nChannels)
		{
			case 1:
				if (!Stream_EnsureCapacity(context->channelmix, size * 2))
					return FALSE;

				for (UINT32 x = 0; x < samples; x++)
				{
					for (UINT32 y = 0; y < bpp; y++)
						Stream_Write_UINT8(context->channelmix, src[x * bpp + y]);

					for (UINT32 y = 0; y < bpp; y++)
						Stream_Write_UINT8(context->channelmix, src[x * bpp + y]);
				}

				Stream_SealLength(context->channelmix);
				*data = Stream_Buffer(context->channelmix);
				*length = Stream_Length(context->channelmix);
				dstFormat->nChannels = 2;
				return TRUE;

			case 2:  /* We only support stereo, so we can not handle this case. */
			default: /* Unsupported number of channels */
				WLog_WARN(TAG, "unsupported source channel count %" PRIu16, srcFormat->nChannels);
				return FALSE;
		}
	}

	/* Destination has less channels than source */
	switch (srcFormat->nChannels)
	{
		case 2:
			if (!Stream_EnsureCapacity(context->channelmix, size / 2))
				return FALSE;

			/* Simply drop second channel.
			 * TODO: Calculate average */
			for (UINT32 x = 0; x < samples; x++)
			{
				for (UINT32 y = 0; y < bpp; y++)
					Stream_Write_UINT8(context->channelmix, src[2 * x * bpp + y]);
			}

			Stream_SealLength(context->channelmix);
			*data = Stream_Buffer(context->channelmix);
			*length = Stream_Length(context->channelmix);
			dstFormat->nChannels = 1;
			return TRUE;

		case 1:  /* Invalid, do we want to use a 0 channel sound? */
		default: /* Unsupported number of channels */
			WLog_WARN(TAG, "unsupported channel count %" PRIu16, srcFormat->nChannels);
			return FALSE;
	}

	return FALSE;
}
BOOL freerdp_dsp_ffmpeg_encode(FREERDP_DSP_CONTEXT* WINPR_RESTRICT context,
                               const AUDIO_FORMAT* WINPR_RESTRICT format,
                               const BYTE* WINPR_RESTRICT data, size_t length,
                               wStream* WINPR_RESTRICT out)
{
	AUDIO_FORMAT fmt = { 0 };

	if (!context || !format || !data || !out || !context->encoder)
		return FALSE;

	/* https://github.com/FreeRDP/FreeRDP/issues/7607
	 *
	 * we get noisy data with channel transformation, so do it ourselves.
	 */
	if (!freerdp_dsp_channel_mix(context, data, length, format, &data, &length, &fmt))
		return FALSE;

	/* Create input frame */
	if (!ffmpeg_fill_frame(context->frame, format, data, length))
		return FALSE;

	/* Resample to desired format. */
	if (!ffmpeg_resample_frame(context->rcontext, context->frame, context->resampled))
		return FALSE;

	if (context->context->frame_size <= 0)
	{
		return ffmpeg_encode_frame(context->context, context->resampled, context->packet, out);
	}
	else
	{
		/* The codec requires fixed-size frames: accumulate resampled samples in the
		 * buffered frame and encode whenever a full frame_size worth is available. */
		int copied = 0;
		int rest = context->resampled->nb_samples;

		do
		{
			int inSamples = rest;

			if ((inSamples < 0) || (context->bufferedSamples > (UINT32)(INT_MAX - inSamples)))
				return FALSE;

			if (inSamples + (int)context->bufferedSamples > context->context->frame_size)
				inSamples = context->context->frame_size - (int)context->bufferedSamples;

#if LIBAVUTIL_VERSION_INT < AV_VERSION_INT(57, 28, 100)
			const int channels = context->context->channels;
#else
			const int channels = context->context->ch_layout.nb_channels;
#endif
			const int rc =
			    av_samples_copy(context->buffered->extended_data,
			                    context->resampled->extended_data, (int)context->bufferedSamples,
			                    copied, inSamples, channels, context->context->sample_fmt);

			if (rc < 0)
				return FALSE;

			rest -= inSamples;
			copied += inSamples;
			context->bufferedSamples += (UINT32)inSamples;

			if (context->context->frame_size <= (int)context->bufferedSamples)
			{
				/* Encode in desired format. */
				if (!ffmpeg_encode_frame(context->context, context->buffered, context->packet,
				                         out))
					return FALSE;

				context->bufferedSamples = 0;
			}
		} while (rest > 0);

		return TRUE;
	}
}

BOOL freerdp_dsp_ffmpeg_decode(FREERDP_DSP_CONTEXT* WINPR_RESTRICT context,
                               const AUDIO_FORMAT* WINPR_RESTRICT srcFormat,
                               const BYTE* WINPR_RESTRICT data, size_t length,
                               wStream* WINPR_RESTRICT out)
{
	if (!context || !srcFormat || !data || !out || context->encoder)
		return FALSE;

#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(58, 133, 100)
	av_init_packet(context->packet);
#endif
	context->packet->data = (uint8_t*)data;
	context->packet->size = (int)length;
	return ffmpeg_decode(context->context, context->packet, context->frame, context->rcontext,
	                     context->resampled, out);
}
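/*
 * Minimal end-to-end usage sketch (compiled out): create an encoder context, reset it to
 * the target format, feed PCM and collect the encoded payload in a wStream. The source
 * format values below are assumptions chosen for the example.
 */
#if 0
static BOOL example_encode(const AUDIO_FORMAT* target, const BYTE* pcm, size_t pcmBytes)
{
	BOOL rc = FALSE;
	wStream* out = Stream_New(NULL, 4096);
	FREERDP_DSP_CONTEXT* dsp = freerdp_dsp_ffmpeg_context_new(TRUE);

	AUDIO_FORMAT src = { 0 };
	src.wFormatTag = WAVE_FORMAT_PCM;
	src.nChannels = 2;
	src.nSamplesPerSec = 44100;
	src.wBitsPerSample = 16;
	src.nBlockAlign = src.nChannels * src.wBitsPerSample / 8;
	src.nAvgBytesPerSec = src.nSamplesPerSec * src.nBlockAlign;

	if (dsp && out && freerdp_dsp_ffmpeg_context_reset(dsp, target))
		rc = freerdp_dsp_ffmpeg_encode(dsp, &src, pcm, pcmBytes, out);

	freerdp_dsp_ffmpeg_context_free(dsp);
	Stream_Free(out, TRUE);
	return rc;
}
#endif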