/*
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
 * Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <stdint.h>

#include "vf_fsppdsp.h"

#include "libavutil/common.h"
#include "libavutil/mathematics.h"
#include "libavutil/mem_internal.h"

#define DCTSIZE 8

#define FIX(x,s)  (int)((x) * (1 << s) + 0.5)

#define MULTIPLY16H(x,k)   (((x) * (k)) >> 16)
#define THRESHOLD(r,x,t)                         \
    if (((unsigned)((x) + t)) >= t * 2) r = (x); \
    else r = 0;
#define DESCALE(x,n)  (((x) + (1 << ((n) - 1))) >> n)

typedef int32_t int_simd16_t;

enum {
    FIX_0_382683433   = FIX(0.382683433, 14),
    FIX_0_541196100   = FIX(0.541196100, 14),
    FIX_0_707106781   = FIX(M_SQRT1_2  , 14),
    FIX_1_306562965   = FIX(1.306562965, 14),
    FIX_1_414213562_A = FIX(M_SQRT2    , 14),
    FIX_1_847759065   = FIX(1.847759065, 13),
    FIX_2_613125930   = FIX(-2.613125930, 13),
    FIX_1_414213562   = FIX(M_SQRT2    , 13),
    FIX_1_082392200   = FIX(1.082392200, 13),
};

DECLARE_ALIGNED(8, const uint8_t, ff_fspp_dither)[8][8] = {
    {  0,  48,  12,  60,   3,  51,  15,  63, },
    { 32,  16,  44,  28,  35,  19,  47,  31, },
    {  8,  56,   4,  52,  11,  59,   7,  55, },
    { 40,  24,  36,  20,  43,  27,  39,  23, },
    {  2,  50,  14,  62,   1,  49,  13,  61, },
    { 34,  18,  46,  30,  33,  17,  45,  29, },
    { 10,  58,   6,  54,   9,  57,   5,  53, },
    { 42,  26,  38,  22,  41,  25,  37,  21, },
};

//This func reads from 1 slice, 1 and clears 0 & 1
void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
                      ptrdiff_t dst_stride, ptrdiff_t src_stride,
                      ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
{
#define STORE(pos)                                                             \
    temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        \
    src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          \
    temp = av_clip_uint8(temp);                                                \
    dst[x + pos] = temp;

    for (int y = 0; y < height; y++) {
        const uint8_t *d = ff_fspp_dither[y];
        for (int x = 0; x < width; x += 8) {
            int temp;
            STORE(0);
            STORE(1);
            STORE(2);
            STORE(3);
            STORE(4);
            STORE(5);
            STORE(6);
            STORE(7);
        }
        src += src_stride;
        dst += dst_stride;
    }
}

//This func reads from 2 slices, 0 & 2  and clears 2-nd
void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
                       ptrdiff_t dst_stride, ptrdiff_t src_stride,
                       ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
{
#define STORE2(pos)                                                                                       \
    temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale);  \
    src[x + pos + 16 * src_stride] = 0;                                                                   \
    temp = av_clip_uint8(temp);                                                                           \
    dst[x + pos] = temp;

    for (int y = 0; y < height; y++) {
        const uint8_t *d = ff_fspp_dither[y];
        for (int x = 0; x < width; x += 8) {
            int temp;
            STORE2(0);
            STORE2(1);
            STORE2(2);
            STORE2(3);
            STORE2(4);
            STORE2(5);
            STORE2(6);
            STORE2(7);
        }
        src += src_stride;
        dst += dst_stride;
    }
}

void ff_mul_thrmat_c(const int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q)
{
    for (int a = 0; a < 64; a++)
        thr_adr[a] = q * thr_adr_noq[a];
}

void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict data,
                       int16_t *restrict output, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
    int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;

    int16_t *wsptr;

    wsptr = output;

    for (; cnt > 0; cnt -= 2) { //start positions
        const int16_t *threshold = thr_adr;//threshold_mtx
        for (int ctr = DCTSIZE; ctr > 0; ctr--) {
            // Process columns from input, add to output.
            tmp0 = data[DCTSIZE * 0] + data[DCTSIZE * 7];
            tmp7 = data[DCTSIZE * 0] - data[DCTSIZE * 7];

            tmp1 = data[DCTSIZE * 1] + data[DCTSIZE * 6];
            tmp6 = data[DCTSIZE * 1] - data[DCTSIZE * 6];

            tmp2 = data[DCTSIZE * 2] + data[DCTSIZE * 5];
            tmp5 = data[DCTSIZE * 2] - data[DCTSIZE * 5];

            tmp3 = data[DCTSIZE * 3] + data[DCTSIZE * 4];
            tmp4 = data[DCTSIZE * 3] - data[DCTSIZE * 4];

            // Even part of FDCT

            tmp10 = tmp0 + tmp3;
            tmp13 = tmp0 - tmp3;
            tmp11 = tmp1 + tmp2;
            tmp12 = tmp1 - tmp2;

            d0 = tmp10 + tmp11;
            d4 = tmp10 - tmp11;

            z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2);
            d2 = tmp13 + z1;
            d6 = tmp13 - z1;

            // Even part of IDCT

            THRESHOLD(tmp0, d0, threshold[0 * 8]);
            THRESHOLD(tmp1, d2, threshold[2 * 8]);
            THRESHOLD(tmp2, d4, threshold[4 * 8]);
            THRESHOLD(tmp3, d6, threshold[6 * 8]);
            tmp0 += 2;
            tmp10 = (tmp0 + tmp2) >> 2;
            tmp11 = (tmp0 - tmp2) >> 2;

            tmp13 = (tmp1 + tmp3) >>2; //+2 !  (psnr decides)
            tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2

            tmp0 = tmp10 + tmp13; //->temps
            tmp3 = tmp10 - tmp13; //->temps
            tmp1 = tmp11 + tmp12; //->temps
            tmp2 = tmp11 - tmp12; //->temps

            // Odd part of FDCT

            tmp10 = tmp4 + tmp5;
            tmp11 = tmp5 + tmp6;
            tmp12 = tmp6 + tmp7;

            z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2);
            z2 = MULTIPLY16H(tmp10, FIX_0_541196100 << 2) + z5;
            z4 = MULTIPLY16H(tmp12, FIX_1_306562965 << 2) + z5;
            z3 = MULTIPLY16H(tmp11, FIX_0_707106781 << 2);

            z11 = tmp7 + z3;
            z13 = tmp7 - z3;

            d5 = z13 + z2;
            d3 = z13 - z2;
            d1 = z11 + z4;
            d7 = z11 - z4;

            // Odd part of IDCT

            THRESHOLD(tmp4, d1, threshold[1 * 8]);
            THRESHOLD(tmp5, d3, threshold[3 * 8]);
            THRESHOLD(tmp6, d5, threshold[5 * 8]);
            THRESHOLD(tmp7, d7, threshold[7 * 8]);

            //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
            z13 = tmp6 + tmp5;
            z10 = (tmp6 - tmp5) * 2;
            z11 = tmp4 + tmp7;
            z12 = (tmp4 - tmp7) * 2;

            tmp7  = (z11 + z13) >> 2; //+2 !
            tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562 << 1);
            z5    = MULTIPLY16H(z10 + z12, FIX_1_847759065);
            tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
            tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - !!

            tmp6 = tmp12 - tmp7;
            tmp5 = tmp11 - tmp6;
            tmp4 = tmp10 + tmp5;

            wsptr[DCTSIZE * 0] +=  (tmp0 + tmp7);
            wsptr[DCTSIZE * 1] +=  (tmp1 + tmp6);
            wsptr[DCTSIZE * 2] +=  (tmp2 + tmp5);
            wsptr[DCTSIZE * 3] +=  (tmp3 - tmp4);
            wsptr[DCTSIZE * 4] +=  (tmp3 + tmp4);
            wsptr[DCTSIZE * 5] +=  (tmp2 - tmp5);
            wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6);
            wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
            //
            data++; //next column
            wsptr++;
            threshold++;
        }
        data  += 8; //skip each second start pos
        wsptr   += 8;
    }
}

void ff_row_idct_c(const int16_t *restrict wsptr, int16_t *restrict output_adr,
                   ptrdiff_t output_stride, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z5, z10, z11, z12, z13;
    int16_t *outptr;

    cnt *= 4;
    outptr = output_adr;
    for (; cnt > 0; cnt--) {
        // Even part
        //Simd version reads 4x4 block and transposes it
        tmp10 = wsptr[2] +  wsptr[3];
        tmp11 = wsptr[2] -  wsptr[3];

        tmp13 = wsptr[0] +  wsptr[1];
        tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) * 4) - tmp13;//this shift order to avoid overflow

        tmp0 = tmp10 + tmp13; //->temps
        tmp3 = tmp10 - tmp13; //->temps
        tmp1 = tmp11 + tmp12;
        tmp2 = tmp11 - tmp12;

        // Odd part
        //Also transpose, with previous:
        // ---- ----      ||||
        // ---- ---- idct ||||
        // ---- ---- ---> ||||
        // ---- ----      ||||
        z13 = wsptr[4] + wsptr[5];
        z10 = wsptr[4] - wsptr[5];
        z11 = wsptr[6] + wsptr[7];
        z12 = wsptr[6] - wsptr[7];

        tmp7 = z11 + z13;
        tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);

        z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
        tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
        tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_

        tmp6 = tmp12 * 8 - tmp7;
        tmp5 = tmp11 * 8 - tmp6;
        tmp4 = tmp10 * 8 + tmp5;

        // Final output stage: descale and write column
        outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
        outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
        outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
        outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
        outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
        outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
        outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
        outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
        outptr++;

        wsptr += DCTSIZE;       // advance pointer to next row
    }
}

void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
                   ptrdiff_t line_size, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z1, z2, z3, z4, z5, z11, z13;
    int16_t *dataptr;

    cnt *= 4;
    // Pass 1: process rows.

    dataptr = data;
    for (; cnt > 0; cnt--) {
        tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
        tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
        tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
        tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
        tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
        tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
        tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
        tmp4 = pixels[line_size * 3] - pixels[line_size * 4];

        // Even part

        tmp10 = tmp0 + tmp3;
        tmp13 = tmp0 - tmp3;
        tmp11 = tmp1 + tmp2;
        tmp12 = tmp1 - tmp2;
        //Even columns are written first, this leads to different order of columns
        //in column_fidct(), but they are processed independently, so all ok.
        //Later in the row_idct() columns are read in the same order.
        dataptr[2] = tmp10 + tmp11;
        dataptr[3] = tmp10 - tmp11;

        z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2);
        dataptr[0] = tmp13 + z1;
        dataptr[1] = tmp13 - z1;

        // Odd part

        tmp10 = tmp4 + tmp5;
        tmp11 = tmp5 + tmp6;
        tmp12 = tmp6 + tmp7;

        z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2);
        z2 = MULTIPLY16H(tmp10,         FIX_0_541196100 << 2) + z5;
        z4 = MULTIPLY16H(tmp12,         FIX_1_306562965 << 2) + z5;
        z3 = MULTIPLY16H(tmp11,         FIX_0_707106781 << 2);

        z11 = tmp7 + z3;
        z13 = tmp7 - z3;

        dataptr[4] = z13 + z2;
        dataptr[5] = z13 - z2;
        dataptr[6] = z11 + z4;
        dataptr[7] = z11 - z4;

        pixels++;               // advance pointer to next column
        dataptr += DCTSIZE;
    }
}