1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185
|
/*
* Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_dsp/vpx_dsp_common.h"
void vpx_idct16x16_256_add_neon_pass1(const int16_t *input,
int16_t *output,
int output_stride);
void vpx_idct16x16_256_add_neon_pass2(const int16_t *src,
int16_t *output,
int16_t *pass1Output,
int16_t skip_adding,
uint8_t *dest,
int dest_stride);
void vpx_idct16x16_10_add_neon_pass1(const int16_t *input,
int16_t *output,
int output_stride);
void vpx_idct16x16_10_add_neon_pass2(const int16_t *src,
int16_t *output,
int16_t *pass1Output,
int16_t skip_adding,
uint8_t *dest,
int dest_stride);
#if HAVE_NEON_ASM
/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
extern void vpx_push_neon(int64_t *store);
extern void vpx_pop_neon(int64_t *store);
#endif // HAVE_NEON_ASM
void vpx_idct16x16_256_add_neon(const int16_t *input,
uint8_t *dest, int dest_stride) {
#if HAVE_NEON_ASM
int64_t store_reg[8];
#endif
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
#if HAVE_NEON_ASM
// save d8-d15 register values.
vpx_push_neon(store_reg);
#endif
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
vpx_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
vpx_idct16x16_256_add_neon_pass2(input+1,
row_idct_output,
pass1_output,
0,
dest,
dest_stride);
/* Parallel idct on the lower 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
vpx_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
vpx_idct16x16_256_add_neon_pass2(input+8*16+1,
row_idct_output+8,
pass1_output,
0,
dest,
dest_stride);
/* Parallel idct on the left 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
row_idct_output,
pass1_output,
1,
dest,
dest_stride);
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
row_idct_output+8,
pass1_output,
1,
dest+8,
dest_stride);
#if HAVE_NEON_ASM
// restore d8-d15 register values.
vpx_pop_neon(store_reg);
#endif
return;
}
void vpx_idct16x16_10_add_neon(const int16_t *input,
uint8_t *dest, int dest_stride) {
#if HAVE_NEON_ASM
int64_t store_reg[8];
#endif
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
#if HAVE_NEON_ASM
// save d8-d15 register values.
vpx_push_neon(store_reg);
#endif
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
vpx_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
vpx_idct16x16_10_add_neon_pass2(input+1,
row_idct_output,
pass1_output,
0,
dest,
dest_stride);
/* Skip Parallel idct on the lower 8 rows as they are all 0s */
/* Parallel idct on the left 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
row_idct_output,
pass1_output,
1,
dest,
dest_stride);
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
row_idct_output+8,
pass1_output,
1,
dest+8,
dest_stride);
#if HAVE_NEON_ASM
// restore d8-d15 register values.
vpx_pop_neon(store_reg);
#endif
return;
}
|