1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
|
/* { dg-do run } */
/* { dg-options "-mips3d forbid_cpu=octeon.* (REQUIRES_STDLIB)" } */
/* Matrix Multiplications */
#include <stdlib.h>
#include <stdio.h>
/* Vector of two floats (8 bytes in total).  This is the operand type
   passed to the __builtin_mips_* paired-single builtins used below.  */
typedef float v2sf __attribute__((vector_size(8)));
/* Input vector: the left-hand operand of every multiplication.  */
float a[4] = {1.1, 2.2, 3.3, 4.4};
/* Input 4x4 matrix.  Each routine computes, for every column i,
   the sum over j of a[j] * b[j][i].  */
float b[4][4] = {{1, 2, 3, 4},
{5, 6, 7, 8},
{9, 10, 11, 12},
{13, 14, 15, 16}};
/* Output vectors, one per implementation.  main() treats c[] as the
   reference and aborts unless d[], e[] and f[] compare exactly equal
   to it element by element.  */
float c[4]; /* Result for matrix_multiply1() */
float d[4]; /* Result for matrix_multiply2() */
float e[4]; /* Result for matrix_multiply3() */
float f[4]; /* Result for matrix_multiply4() */
/* Forward declarations so main() can call the routines defined after it.
   NOTE(review): NOMIPS16 is not defined in this file; presumably the
   test harness supplies it to keep the paired-single routines out of
   MIPS16 mode -- confirm against the testsuite setup.  */
void matrix_multiply1();
NOMIPS16 void matrix_multiply2();
NOMIPS16 void matrix_multiply3();
NOMIPS16 void matrix_multiply4();
/* Abort the program unless all four elements of GOT are exactly equal
   to the corresponding elements of the scalar reference result c[].
   The comparison is deliberately exact, with no tolerance.  */
static void
expect_matches_reference (const float *got)
{
  int k;

  for (k = 0; k < 4; k++)
    {
      if (got[k] != c[k])
        abort ();
    }
}

/* Test driver: compute the reference product with scalar floats, then
   run each paired-single variant and require a bit-for-bit match.
   Prints "Test Passes" and exits with status 0 on success; any
   mismatch terminates the program through abort().  */
int main ()
{
  /* Version 1.  Plain float calculations; fills c[].  */
  matrix_multiply1 ();

  /* Version 2.  Paired-single instructions inside the inner loop;
     fills d[].  */
  matrix_multiply2 ();
  expect_matches_reference (d);

  /* Version 3.  Paired-single instructions with the inner loop
     unrolled; fills e[].  */
  matrix_multiply3 ();
  expect_matches_reference (e);

  /* Version 4.  Paired-single instructions with every loop unrolled;
     fills f[].  */
  matrix_multiply4 ();
  expect_matches_reference (f);

  printf ("Test Passes\n");
  exit (0);
}
/* Scalar reference implementation.  For each column of b, stores in
   c[col] the sum over row of a[row] * b[row][col].  The running sum is
   kept in c[col] itself and the terms are added in increasing row
   order.  */
void matrix_multiply1()
{
  int col;

  for (col = 0; col < 4; col++)
    {
      int row = 0;

      c[col] = 0.0;
      while (row < 4)
        {
          c[col] = c[col] + a[row] * b[row][col];
          row++;
        }
    }
}
NOMIPS16 void matrix_multiply2()
{
int i, j;
v2sf m1, m2;
v2sf result, temp;
for (i = 0; i < 4; i++)
{
result = (v2sf) {0.0, 0.0};
for (j = 0; j < 4; j+=2)
{
/* Load two float values into m1 */
m1 = (v2sf) {a[j], a[j+1]};
m2 = (v2sf) {b[j][i], b[j+1][i]};
/* Multiply and add */
result += m1 * m2;
}
/* Reduction add at the end */
temp = __builtin_mips_addr_ps (result, result);
d[i] = __builtin_mips_cvt_s_pl (temp);
}
}
NOMIPS16 void matrix_multiply3()
{
int i;
v2sf m1, m2, n1, n2;
v2sf result, temp;
m1 = (v2sf) {a[0], a[1]};
m2 = (v2sf) {a[2], a[3]};
for (i = 0; i < 4; i++)
{
n1 = (v2sf) {b[0][i], b[1][i]};
n2 = (v2sf) {b[2][i], b[3][i]};
/* Multiply and add */
result = m1 * n1 + m2 * n2;
/* Reduction add at the end */
temp = __builtin_mips_addr_ps (result, result);
e[i] = __builtin_mips_cvt_s_pl (temp);
}
}
NOMIPS16 void matrix_multiply4()
{
v2sf m1, m2;
v2sf n1, n2, n3, n4, n5, n6, n7, n8;
v2sf temp1, temp2, temp3, temp4;
v2sf result1, result2;
/* Load a[0] a[1] values into m1
Load a[2] a[3] values into m2 */
m1 = (v2sf) {a[0], a[1]};
m2 = (v2sf) {a[2], a[3]};
/* Load b[0][0] b[1][0] values into n1
Load b[2][0] b[3][0] values into n2
Load b[0][1] b[1][1] values into n3
Load b[2][1] b[3][1] values into n4
Load b[0][2] b[1][2] values into n5
Load b[2][2] b[3][2] values into n6
Load b[0][3] b[1][3] values into n7
Load b[2][3] b[3][3] values into n8 */
n1 = (v2sf) {b[0][0], b[1][0]};
n2 = (v2sf) {b[2][0], b[3][0]};
n3 = (v2sf) {b[0][1], b[1][1]};
n4 = (v2sf) {b[2][1], b[3][1]};
n5 = (v2sf) {b[0][2], b[1][2]};
n6 = (v2sf) {b[2][2], b[3][2]};
n7 = (v2sf) {b[0][3], b[1][3]};
n8 = (v2sf) {b[2][3], b[3][3]};
temp1 = m1 * n1 + m2 * n2;
temp2 = m1 * n3 + m2 * n4;
temp3 = m1 * n5 + m2 * n6;
temp4 = m1 * n7 + m2 * n8;
result1 = __builtin_mips_addr_ps (temp1, temp2);
result2 = __builtin_mips_addr_ps (temp3, temp4);
f[0] = __builtin_mips_cvt_s_pu (result1);
f[1] = __builtin_mips_cvt_s_pl (result1);
f[2] = __builtin_mips_cvt_s_pu (result2);
f[3] = __builtin_mips_cvt_s_pl (result2);
}
|