File: SincResampler.cpp

package info (click to toggle)
webkit2gtk 2.48.5-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 429,764 kB
sloc: cpp: 3,697,587; javascript: 194,444; ansic: 169,997; python: 46,499; asm: 19,295; ruby: 18,528; perl: 16,602; xml: 4,650; yacc: 2,360; sh: 2,098; java: 1,993; lex: 1,327; pascal: 366; makefile: 298
file content (373 lines) | stat: -rw-r--r-- 15,436 bytes
parent folder | download | duplicates (6)
/*
 * Copyright (C) 2011 Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1.  Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 * 3.  Neither the name of Apple Inc. ("Apple") nor the names of
 *     its contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#if ENABLE(WEB_AUDIO)

#include "SincResampler.h"

#include "AudioBus.h"
#include "AudioUtilities.h"
#include "VectorMath.h"
#include <wtf/Algorithms.h>
#include <wtf/MathExtras.h>
#include <wtf/TZoneMallocInlines.h>
#include <wtf/text/ParsingUtilities.h>

#if USE(ACCELERATE)
#include <Accelerate/Accelerate.h>
#elif CPU(X86_SSE2)
#include <xmmintrin.h>
#elif HAVE(ARM_NEON_INTRINSICS)
#include <arm_neon.h>
#endif

// Initial input buffer layout, dividing into regions r0 to r4 (note: r0, r3
// and r4 will move after the first load):
//
// |----------------|----------------------------------------------------------------|----------------|
//
//                                              m_requestFrames
//                   <-------------------------------------------------------------------------------->
//                                           r0 (during first load)
//
//   kernelSize / 2   kernelSize / 2                                 kernelSize / 2     kernelSize / 2 
// <---------------> <--------------->                              <---------------> <--------------->
//         r1                r2                                             r3                r4
// 
//                             m_blockSize == r4 - r2
//                   <--------------------------------------->
//
//                                                  m_requestFrames
//                                    <------------------ ... ----------------->
//                                               r0 (during second load)
//
// On the second request r0 slides to the right by kernelSize / 2 and r3, r4
// and m_blockSize are reinitialized via step (3) in the algorithm below.
//
// These new regions remain constant until a Flush() occurs. While complicated,
// this allows us to reduce jitter by always requesting the same amount from the
// provided callback.

// The Algorithm:
//
// 1) Allocate m_inputBuffer of size: m_requestFrames + kernelSize; this ensures
//    there's enough room to read m_requestFrames from the callback into region
//    r0 (which will move between the first and subsequent passes).
//
// 2) Let r1, r2 each represent half the kernel centered around r0:
//
//        r0 = m_inputBuffer + kernelSize / 2
//        r1 = m_inputBuffer
//        r2 = r0
//
//    r0 is always m_requestFrames in size. r1, r2 are kernelSize / 2 in
//    size. r1 must be zero initialized to avoid convolution with garbage (see
//    step (5) for why).
//
// 3) Let r3, r4 each represent half the kernel right aligned with the end of
//    r0 and choose m_blockSize as the distance in frames between r4 and r2:
//
//        r3 = r0 + m_requestFrames - kernelSize
//        r4 = r0 + m_requestFrames - kernelSize / 2
//        m_blockSize = r4 - r2 = m_requestFrames - kernelSize / 2
//                      (first load only; once r0 has slid right in step (7), r4 - r2 == m_requestFrames)
//
// 4) Consume m_requestFrames frames into r0.
//
// 5) Position kernel centered at start of r2 and generate output frames until
//    the kernel is centered at the start of r4 or we've finished generating
//    all the output frames.
//
// 6) Wrap the leftover data from r3 to r1 and from r4 to r2.
//
// 7) If we're on the second load, in order to avoid overwriting the frames we
//    just wrapped from r4 we need to slide r0 to the right by the size of
//    r4, which is kernelSize / 2:
//
//        r0 = r0 + kernelSize / 2 = m_inputBuffer + kernelSize
//
//    r3, r4, and m_blockSize then need to be reinitialized, so goto (3).
//
// 8) Else, if we're not on the second load, goto (4).
//
// note: we're glossing over how the sub-sample handling works with m_virtualSourceIndex, etc.

namespace WebCore {

// NOTE(review): presumably supplies the allocator definitions that match a
// WTF_MAKE_TZONE_ALLOCATED declaration in SincResampler.h (header not visible here) — confirm.
WTF_MAKE_TZONE_ALLOCATED_IMPL(SincResampler);

// Number of taps in each windowed-sinc kernel, i.e. how many input frames are combined
// into a single output frame by convolve().
constexpr unsigned kernelSize { 32 };
// Number of steps the sub-sample interval [0, 1] is divided into; initializeKernel()
// generates one kernel per step boundary.
constexpr unsigned numberOfKernelOffsets { 32 };
// Kernels are stored back to back for offset indices 0..numberOfKernelOffsets inclusive.
// The "+ 1" kernel lets process() always fetch the kernel following the one it selected
// so the two can be interpolated.
constexpr unsigned kernelStorageSize { kernelSize * (numberOfKernelOffsets + 1) };

// The SIMD paths in convolve() consume four floats per iteration, and the 16-byte alignment
// of every kernel inside the storage relies on kernelSize * sizeof(float) being a multiple
// of 16. The buffer regions are kernelSize / 2 frames wide, which this also keeps exact.
static_assert(!(kernelSize % 4), "kernelSize must be a multiple of 4");
// initializeKernel() divides by numberOfKernelOffsets.
static_assert(numberOfKernelOffsets > 0, "numberOfKernelOffsets must be non-zero");

// Returns the number of output frames that one block of |blockSize| input frames yields
// when each output frame advances the source position by |scaleFactor| frames. The
// quotient is truncated toward zero.
static size_t calculateChunkSize(unsigned blockSize, double scaleFactor)
{
    const double outputFrames = blockSize / scaleFactor;
    return static_cast<size_t>(outputFrames);
}

// Constructs a resampler.
//
// scaleFactor:   stored in m_scaleFactor; process() advances the virtual source position by
//                this amount for every output frame it produces.
// requestFrames: number of frames asked of |provideInput| on every refill.
// provideInput:  callback invoked with the destination span and the number of frames to fill.
//
// The input buffer is sized m_requestFrames + kernelSize. m_r1 views the buffer from its
// first element and m_r2 starts kernelSize / 2 elements in; the remaining regions,
// m_blockSize and m_chunkSize are derived by updateRegions(). The kernel table is
// computed once, here.
SincResampler::SincResampler(double scaleFactor, unsigned requestFrames, Function<void(std::span<float> buffer, size_t framesToProcess)>&& provideInput)
    : m_scaleFactor(scaleFactor)
    , m_kernelStorage(kernelStorageSize)
    , m_requestFrames(requestFrames)
    , m_provideInput(WTFMove(provideInput))
    , m_inputBuffer(m_requestFrames + kernelSize) // See input buffer layout above.
    , m_r1(m_inputBuffer.span())
    , m_r2(m_inputBuffer.span().subspan(kernelSize / 2))
{
    ASSERT(m_provideInput);
    ASSERT(m_requestFrames > 0);
    // Lay out the regions for the first load, where r0 starts at the same place as r2.
    updateRegions(false);
    // Debug-only sanity check on the first-load layout: a block must span more than one kernel.
    ASSERT(m_blockSize > kernelSize);
    initializeKernel();
}

void SincResampler::updateRegions(bool isSecondLoad)
{
    // Setup various region pointers in the buffer (see diagram above). If we're
    // on the second load we need to slide m_r0 to the right by kernelSize / 2.
    m_r0 = m_inputBuffer.span().subspan(isSecondLoad ? kernelSize : kernelSize / 2);
    m_r3 = m_r0.subspan(m_requestFrames - kernelSize);
    m_r4 = m_r0.subspan(m_requestFrames - kernelSize / 2);
    m_blockSize = std::distance(m_r2.begin(), m_r4.begin());
    m_chunkSize = calculateChunkSize(m_blockSize, m_scaleFactor);

    // m_r1 at the beginning of the buffer.
    ASSERT(m_r1.data() == m_inputBuffer.data());
    // m_r1 left of m_r2, m_r4 left of m_r3 and size correct.
    ASSERT(std::distance(m_r1.begin(), m_r2.begin()) == std::distance(m_r3.begin(), m_r4.begin()));
    // m_r2 left of r3.
    ASSERT(m_r2.begin() <= m_r3.begin());
}

void SincResampler::initializeKernel()
{
    // Blackman window parameters.
    double alpha = 0.16;
    double a0 = 0.5 * (1.0 - alpha);
    double a1 = 0.5;
    double a2 = 0.5 * alpha;

    // sincScaleFactor is basically the normalized cutoff frequency of the low-pass filter.
    double sincScaleFactor = m_scaleFactor > 1.0 ? 1.0 / m_scaleFactor : 1.0;

    // The sinc function is an idealized brick-wall filter, but since we're windowing it the
    // transition from pass to stop does not happen right away. So we should adjust the
    // lowpass filter cutoff slightly downward to avoid some aliasing at the very high-end.
    // FIXME: this value is empirical and to be more exact should vary depending on kernelSize.
    sincScaleFactor *= 0.9;

    int n = kernelSize;
    int halfSize = n / 2;

    // Generates a set of windowed sinc() kernels.
    // We generate a range of sub-sample offsets from 0.0 to 1.0.
    for (unsigned offsetIndex = 0; offsetIndex <= numberOfKernelOffsets; ++offsetIndex) {
        double subsampleOffset = static_cast<double>(offsetIndex) / numberOfKernelOffsets;

        for (int i = 0; i < n; ++i) {
            // Compute the sinc() with offset.
            double s = sincScaleFactor * piDouble * (i - halfSize - subsampleOffset);
            double sinc = !s ? 1.0 : sin(s) / s;
            sinc *= sincScaleFactor;

            // Compute Blackman window, matching the offset of the sinc().
            double x = (i - subsampleOffset) / n;
            double window = a0 - a1 * cos(2.0 * piDouble * x) + a2 * cos(4.0 * piDouble * x);

            // Window the sinc() function and store at the correct offset.
            m_kernelStorage[i + offsetIndex * kernelSize] = sinc * window;
        }
    }
}

// Resamples all of |source| into |destination| in one call, using a temporary resampler
// that requests AudioUtilities::renderQuantumSize frames at a time.
//
// |destination| must hold exactly static_cast<size_t>(source.size() / scaleFactor) frames;
// this is enforced with a release assert.
void SincResampler::processBuffer(std::span<const float> source, std::span<float> destination, double scaleFactor)
{
    RELEASE_ASSERT(destination.size() == static_cast<size_t>(source.size() / scaleFactor));

    // Input provider: hands out |source| front to back and fills whatever part of a request
    // can no longer be satisfied with zeros. |source| is captured by reference so that the
    // consumed prefix stays consumed across calls.
    auto provideFromSource = [&source](std::span<float> buffer, size_t framesToProcess) mutable {
        size_t availableFrames = std::min(source.size(), framesToProcess);

        IGNORE_WARNINGS_BEGIN("restrict")
        memcpySpan(buffer, source.first(availableFrames));
        IGNORE_WARNINGS_END

        if (availableFrames < framesToProcess) {
            auto padding = buffer.subspan(availableFrames, framesToProcess - availableFrames);
            zeroSpan(padding);
        }

        skip(source, availableFrames);
    };

    SincResampler resampler(scaleFactor, AudioUtilities::renderQuantumSize, WTFMove(provideFromSource));

    // Produce the output one render quantum (or the final partial quantum) at a time.
    while (!destination.empty()) {
        unsigned framesInChunk = std::min<size_t>(destination.size(), AudioUtilities::renderQuantumSize);
        auto chunk = consumeSpan(destination, framesInChunk);
        resampler.process(chunk, framesInChunk);
    }
}

// Writes |framesToProcess| resampled frames to the front of |destination|, pulling
// m_requestFrames input frames from m_provideInput whenever the current block is used up.
// The position within the block (m_virtualSourceIndex) persists across calls.
void SincResampler::process(std::span<float> destination, size_t framesToProcess)
{
    unsigned framesRemaining = framesToProcess;

    // The very first call has to fill r0 before anything can be convolved.
    if (!m_isBufferPrimed) {
        m_provideInput(m_r0, m_requestFrames);
        m_isBufferPrimed = true;
    }

    size_t writeIndex = 0;
    while (framesRemaining) {
        // Generate output for as long as the virtual source position stays inside the block.
        while (m_virtualSourceIndex < m_blockSize) {
            // Split the virtual position into a whole frame index and a fraction.
            int wholeSourceIndex = static_cast<int>(m_virtualSourceIndex);
            double fraction = m_virtualSourceIndex - wholeSourceIndex;

            // The fraction falls between two precomputed kernel offsets: pick the lower one
            // and the one stored right after it.
            double scaledFraction = fraction * numberOfKernelOffsets;
            int lowerKernelIndex = static_cast<int>(scaledFraction);

            auto lowerKernel = m_kernelStorage.span().subspan(lowerKernelIndex * kernelSize);
            auto upperKernel = lowerKernel.subspan(kernelSize);

            // Both kernels are loaded with aligned SIMD loads, so they must be 16-byte aligned.
            ASSERT(!(reinterpret_cast<uintptr_t>(lowerKernel.data()) & 0x0F));
            ASSERT(!(reinterpret_cast<uintptr_t>(upperKernel.data()) & 0x0F));

            // Input frames the kernels are applied to, starting at the whole frame index.
            auto inputWindow = m_r1.subspan(wholeSourceIndex);

            // How far the fraction lies from the lower kernel towards the upper one.
            double upperKernelWeight = scaledFraction - lowerKernelIndex;

            destination[writeIndex++] = convolve(inputWindow, lowerKernel, upperKernel, upperKernelWeight);

            // Move on to the source position of the next output frame.
            m_virtualSourceIndex += m_scaleFactor;

            if (!--framesRemaining)
                return;
        }

        // The block is exhausted: rebase the virtual position onto the next block.
        ASSERT(m_virtualSourceIndex >= m_blockSize);
        m_virtualSourceIndex -= m_blockSize;

        // Wrap the trailing kernelSize frames (regions r3 and r4) back to the start of the
        // buffer (regions r1 and r2) so that the next block can convolve across the seam.
        auto trailingFrames = m_r3.first(kernelSize);
        memcpySpan(m_r1, trailingFrames);

        // While r0 still starts at r2 (first-load layout), switch to the second-load layout
        // so that the refill below does not overwrite the frames just wrapped into r2.
        if (m_r0.data() == m_r2.data())
            updateRegions(true);

        // Refill r0 with the next m_requestFrames input frames.
        m_provideInput(m_r0, m_requestFrames);
    }
}

// Produces a single output sample: the first kernelSize frames of |inputP| are multiplied
// and summed against |k1| and against |k2|, and the two sums are blended linearly, with
// |kernelInterpolationFactor| being the weight given to the |k2| sum.
//
// One of four implementations is selected at compile time (Accelerate, SSE, NEON, scalar).
// The SSE path loads |k1| and |k2| with aligned loads.
float SincResampler::convolve(std::span<const float> inputP, std::span<const float> k1, std::span<const float> k2, float kernelInterpolationFactor)
{
#if USE(ACCELERATE)
    auto input = inputP.first(kernelSize);
    float lowerSum = VectorMath::dotProduct(input, k1.first(kernelSize));
    float upperSum = VectorMath::dotProduct(input, k2.first(kernelSize));

    // Blend the two dot products.
    return (1.0f - kernelInterpolationFactor) * lowerSum + kernelInterpolationFactor * upperSum;
#elif CPU(X86_SSE2)
    __m128 lowerSums = _mm_setzero_ps();
    __m128 upperSums = _mm_setzero_ps();

    // |inputP| can start at any frame, so choose between aligned and unaligned loads for it.
    const bool inputIsAligned = !(reinterpret_cast<uintptr_t>(inputP.data()) & 0x0F);
    if (inputIsAligned) {
        for (unsigned i = 0; i < kernelSize; i += 4) {
            __m128 input = _mm_load_ps(inputP.subspan(i).data());
            lowerSums = _mm_add_ps(lowerSums, _mm_mul_ps(input, _mm_load_ps(k1.subspan(i).data())));
            upperSums = _mm_add_ps(upperSums, _mm_mul_ps(input, _mm_load_ps(k2.subspan(i).data())));
        }
    } else {
        for (unsigned i = 0; i < kernelSize; i += 4) {
            __m128 input = _mm_loadu_ps(inputP.subspan(i).data());
            lowerSums = _mm_add_ps(lowerSums, _mm_mul_ps(input, _mm_load_ps(k1.subspan(i).data())));
            upperSums = _mm_add_ps(upperSums, _mm_mul_ps(input, _mm_load_ps(k2.subspan(i).data())));
        }
    }

    // Blend the two sets of partial sums lane by lane.
    lowerSums = _mm_mul_ps(lowerSums, _mm_set_ps1(1.0f - kernelInterpolationFactor));
    upperSums = _mm_mul_ps(upperSums, _mm_set_ps1(kernelInterpolationFactor));
    __m128 blended = _mm_add_ps(lowerSums, upperSums);

    // Horizontal add: fold the high pair onto the low pair, then lane 1 onto lane 0.
    __m128 pairSums = _mm_add_ps(_mm_movehl_ps(blended, blended), blended);
    return _mm_cvtss_f32(_mm_add_ss(pairSums, _mm_shuffle_ps(pairSums, pairSums, 1)));
#elif HAVE(ARM_NEON_INTRINSICS)
    float32x4_t lowerSums = vmovq_n_f32(0);
    float32x4_t upperSums = vmovq_n_f32(0);

    // Walk the three spans in lockstep, four floats at a time.
    inputP = inputP.first(kernelSize);
    while (!inputP.empty()) {
        float32x4_t input = vld1q_f32(inputP.data());
        lowerSums = vmlaq_f32(lowerSums, input, vld1q_f32(k1.data()));
        upperSums = vmlaq_f32(upperSums, input, vld1q_f32(k2.data()));
        skip(inputP, 4);
        skip(k1, 4);
        skip(k2, 4);
    }

    // Blend the two sets of partial sums lane by lane.
    float32x4_t weightedLower = vmulq_f32(lowerSums, vmovq_n_f32(1.0 - kernelInterpolationFactor));
    float32x4_t blended = vmlaq_f32(weightedLower, upperSums, vmovq_n_f32(kernelInterpolationFactor));

    // Horizontal add of the four lanes.
    float32x2_t halves = vadd_f32(vget_high_f32(blended), vget_low_f32(blended));
    return vget_lane_f32(vpadd_f32(halves, halves), 0);
#else
    float lowerSum = 0;
    float upperSum = 0;

    // Scalar fallback: accumulate both dot products in a single pass over the taps.
    for (size_t tap = 0; tap < kernelSize; ++tap) {
        lowerSum += inputP[tap] * k1[tap];
        upperSum += inputP[tap] * k2[tap];
    }

    // Blend the two dot products.
    return (1.0f - kernelInterpolationFactor) * lowerSum + kernelInterpolationFactor * upperSum;
#endif
}

} // namespace WebCore

#endif // ENABLE(WEB_AUDIO)