File: mull.c

package info (click to toggle)
simde 0.8.2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie, trixie-backports
  • size: 58,264 kB
  • sloc: ansic: 817,393; sh: 315; makefile: 45; python: 26
file content (520 lines) | stat: -rw-r--r-- 29,171 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
#define SIMDE_TEST_ARM_NEON_INSN mull

#include "test-neon.h"
#include "../../../simde/arm/neon/mull.h"

/* Checks simde_vmull_s8 against a fixed table of cases.
 *
 * Each case supplies two 8-lane int8 operands and the 8-lane int16 result
 * that is expected; the expected lanes are the plain products a[k] * b[k].
 * Returns 0 once every case has been compared.
 *
 * The #else branch is compiled out by `#if 1`.  It prints random operands
 * together with the computed result and returns 1; it appears to be the
 * generator for the table below.
 */
static int
test_simde_vmull_s8 (SIMDE_MUNIT_TEST_ARGS) {
#if 1
  struct {
    int8_t a[8];   /* first operand */
    int8_t b[8];   /* second operand */
    int16_t r[8];  /* expected result */
  } test_vec[] = {
    { {  INT8_C(  80), -INT8_C(  57),      INT8_MIN, -INT8_C(  68),  INT8_C(  13), -INT8_C(  44),  INT8_C(   8),  INT8_C(  65) },
      { -INT8_C(  55), -INT8_C(  20),  INT8_C(  56), -INT8_C(  54),  INT8_C( 110),  INT8_C(  55), -INT8_C(  97), -INT8_C(   8) },
      { -INT16_C(  4400),  INT16_C(  1140), -INT16_C(  7168),  INT16_C(  3672),  INT16_C(  1430), -INT16_C(  2420), -INT16_C(   776), -INT16_C(   520) } },
    { {  INT8_C(  90),  INT8_C(  52),  INT8_C(  32),  INT8_C(  61), -INT8_C( 126),  INT8_C(  97),  INT8_C(  42), -INT8_C(  90) },
      {  INT8_C( 100),  INT8_C(  38), -INT8_C( 122),  INT8_C( 112), -INT8_C(  57),  INT8_C(  19), -INT8_C(  61),  INT8_C(  23) },
      {  INT16_C(  9000),  INT16_C(  1976), -INT16_C(  3904),  INT16_C(  6832),  INT16_C(  7182),  INT16_C(  1843), -INT16_C(  2562), -INT16_C(  2070) } },
    { { -INT8_C(  38),  INT8_C(  68), -INT8_C(  44), -INT8_C(  24),  INT8_C(  24), -INT8_C(  36),  INT8_C(  41), -INT8_C(  31) },
      { -INT8_C(  56),  INT8_C(  97), -INT8_C(  85),  INT8_C(  55), -INT8_C( 104),  INT8_C(  74),  INT8_C(  47), -INT8_C(  14) },
      {  INT16_C(  2128),  INT16_C(  6596),  INT16_C(  3740), -INT16_C(  1320), -INT16_C(  2496), -INT16_C(  2664),  INT16_C(  1927),  INT16_C(   434) } },
    { {  INT8_C( 126),  INT8_C(  80),  INT8_C(  48),  INT8_C(   1), -INT8_C(  79),  INT8_C(  90), -INT8_C(  89),  INT8_C(  21) },
      {      INT8_MIN,  INT8_C(  46), -INT8_C( 123),  INT8_C(  72),  INT8_C(  65),  INT8_C(  73),  INT8_C(  95),  INT8_C(  28) },
      { -INT16_C( 16128),  INT16_C(  3680), -INT16_C(  5904),  INT16_C(    72), -INT16_C(  5135),  INT16_C(  6570), -INT16_C(  8455),  INT16_C(   588) } },
    { { -INT8_C( 115),  INT8_C(  51),  INT8_C(   4), -INT8_C(  91),  INT8_C(  16),  INT8_C(  45), -INT8_C( 122), -INT8_C(  40) },
      { -INT8_C( 114),  INT8_C(  49),  INT8_C(  15),  INT8_C(  38),  INT8_C( 123),  INT8_C(  63),  INT8_C(  25), -INT8_C(   7) },
      {  INT16_C( 13110),  INT16_C(  2499),  INT16_C(    60), -INT16_C(  3458),  INT16_C(  1968),  INT16_C(  2835), -INT16_C(  3050),  INT16_C(   280) } },
    { { -INT8_C( 113),  INT8_C(  73), -INT8_C(   6),  INT8_C(  64), -INT8_C(  93), -INT8_C(  94),  INT8_C(  86),  INT8_C(  36) },
      { -INT8_C(  48), -INT8_C(  37),  INT8_C( 108),  INT8_C(  17),  INT8_C(  36), -INT8_C(  53),  INT8_C(  45), -INT8_C(  79) },
      {  INT16_C(  5424), -INT16_C(  2701), -INT16_C(   648),  INT16_C(  1088), -INT16_C(  3348),  INT16_C(  4982),  INT16_C(  3870), -INT16_C(  2844) } },
    { { -INT8_C(   1),  INT8_C(  49),  INT8_C(  86),  INT8_C(  15),  INT8_C(  94), -INT8_C(  36), -INT8_C(  25), -INT8_C(  20) },
      {  INT8_C(  13), -INT8_C(   9),  INT8_C(  19), -INT8_C( 120),  INT8_C(  54),  INT8_C(  44), -INT8_C( 126), -INT8_C(  59) },
      { -INT16_C(    13), -INT16_C(   441),  INT16_C(  1634), -INT16_C(  1800),  INT16_C(  5076), -INT16_C(  1584),  INT16_C(  3150),  INT16_C(  1180) } },
    { {  INT8_C( 117),  INT8_C( 124),  INT8_C(   5),  INT8_C(  24),  INT8_C(  30),  INT8_C(  91),  INT8_C(  60), -INT8_C(  18) },
      {  INT8_C(  55), -INT8_C(  88),  INT8_C(   0),  INT8_C(  91),  INT8_C( 116),  INT8_C(  45),  INT8_C(  13),  INT8_C( 115) },
      {  INT16_C(  6435), -INT16_C( 10912),  INT16_C(     0),  INT16_C(  2184),  INT16_C(  3480),  INT16_C(  4095),  INT16_C(   780), -INT16_C(  2070) } },
  };
  const size_t n_vec = sizeof(test_vec) / sizeof(test_vec[0]);

  for (size_t i = 0 ; i < n_vec ; ++i) {
    simde_int8x8_t lhs = simde_vld1_s8(test_vec[i].a);
    simde_int8x8_t rhs = simde_vld1_s8(test_vec[i].b);
    simde_int16x8_t r = simde_vmull_s8(lhs, rhs);

    /* Arguments kept verbatim in case the helper reports them by name. */
    simde_test_arm_neon_assert_equal_i16x8(r, simde_vld1q_s16(test_vec[i].r));
  }

  return 0;

#else
  fputc('\n', stdout);
  for (int round = 0 ; round < 8 ; ++round) {
    simde_int8x8_t lhs = simde_test_arm_neon_random_i8x8();
    simde_int8x8_t rhs = simde_test_arm_neon_random_i8x8();
    simde_int16x8_t product = simde_vmull_s8(lhs, rhs);

    simde_test_arm_neon_write_i8x8(2, lhs, SIMDE_TEST_VEC_POS_FIRST);
    simde_test_arm_neon_write_i8x8(2, rhs, SIMDE_TEST_VEC_POS_MIDDLE);
    simde_test_arm_neon_write_i16x8(2, product, SIMDE_TEST_VEC_POS_LAST);
  }
  return 1;
#endif
}

/* Checks simde_vmull_s16 against a fixed table of cases.
 *
 * Each case supplies two 4-lane int16 operands and the 4-lane int32 result
 * that is expected (the lane-wise product).  Returns 0 once every case has
 * been compared.
 *
 * The #else branch is compiled out by `#if 1`.  It prints random operands
 * together with the computed result and returns 1; it appears to be the
 * generator for the table below.
 */
static int
test_simde_vmull_s16 (SIMDE_MUNIT_TEST_ARGS) {
#if 1
  struct {
    int16_t a[4];  /* first operand */
    int16_t b[4];  /* second operand */
    int32_t r[4];  /* expected result */
  } test_vec[] = {
    { {  INT16_C( 11230),  INT16_C( 11512), -INT16_C( 32461), -INT16_C( 31562) },
      {  INT16_C( 29011), -INT16_C(  4051),  INT16_C( 24636), -INT16_C( 23193) },
      {  INT32_C(   325793530), -INT32_C(    46635112), -INT32_C(   799709196),  INT32_C(   732017466) } },
    { {  INT16_C( 17728),  INT16_C( 31395),  INT16_C(  5945),  INT16_C(  5959) },
      {  INT16_C(  2425), -INT16_C( 15905), -INT16_C( 10338),  INT16_C( 31939) },
      {  INT32_C(    42990400), -INT32_C(   499337475), -INT32_C(    61459410),  INT32_C(   190324501) } },
    { { -INT16_C( 17662),  INT16_C( 13993),  INT16_C( 24380), -INT16_C( 28486) },
      { -INT16_C(  6192),  INT16_C(  3200), -INT16_C(  6329), -INT16_C( 30542) },
      {  INT32_C(   109363104),  INT32_C(    44777600), -INT32_C(   154301020),  INT32_C(   870019412) } },
    { {  INT16_C( 21805),  INT16_C( 26114),  INT16_C( 18796), -INT16_C(  6787) },
      {  INT16_C( 23635), -INT16_C(  3674),  INT16_C( 27188),  INT16_C( 13933) },
      {  INT32_C(   515361175), -INT32_C(    95942836),  INT32_C(   511025648), -INT32_C(    94563271) } },
    { {  INT16_C(  5669),  INT16_C( 25196),  INT16_C(  9846),  INT16_C( 18162) },
      {  INT16_C( 29198),  INT16_C( 21843),  INT16_C(  1369), -INT16_C( 31011) },
      {  INT32_C(   165523462),  INT32_C(   550356228),  INT32_C(    13479174), -INT32_C(   563221782) } },
    { { -INT16_C(  8358), -INT16_C( 14612),  INT16_C( 26921),  INT16_C( 31916) },
      {  INT16_C( 21190), -INT16_C(  1427), -INT16_C(  9540), -INT16_C(  7632) },
      { -INT32_C(   177106020),  INT32_C(    20851324), -INT32_C(   256826340), -INT32_C(   243582912) } },
    { { -INT16_C( 25103),  INT16_C( 26436),  INT16_C( 14019), -INT16_C( 11859) },
      {  INT16_C(   168),  INT16_C(   295),  INT16_C(  1029),  INT16_C( 24456) },
      { -INT32_C(     4217304),  INT32_C(     7798620),  INT32_C(    14425551), -INT32_C(   290023704) } },
    { {  INT16_C( 29924),  INT16_C(  3366), -INT16_C( 11554), -INT16_C( 23415) },
      { -INT16_C(  2524), -INT16_C(  7778), -INT16_C( 12592), -INT16_C( 15933) },
      { -INT32_C(    75528176), -INT32_C(    26180748),  INT32_C(   145487968),  INT32_C(   373071195) } },
  };
  const size_t n_vec = sizeof(test_vec) / sizeof(test_vec[0]);

  for (size_t i = 0 ; i < n_vec ; ++i) {
    simde_int16x4_t lhs = simde_vld1_s16(test_vec[i].a);
    simde_int16x4_t rhs = simde_vld1_s16(test_vec[i].b);
    simde_int32x4_t r = simde_vmull_s16(lhs, rhs);

    /* Arguments kept verbatim in case the helper reports them by name. */
    simde_test_arm_neon_assert_equal_i32x4(r, simde_vld1q_s32(test_vec[i].r));
  }

  return 0;

#else
  fputc('\n', stdout);
  for (int round = 0 ; round < 8 ; ++round) {
    simde_int16x4_t lhs = simde_test_arm_neon_random_i16x4();
    simde_int16x4_t rhs = simde_test_arm_neon_random_i16x4();
    simde_int32x4_t product = simde_vmull_s16(lhs, rhs);

    simde_test_arm_neon_write_i16x4(2, lhs, SIMDE_TEST_VEC_POS_FIRST);
    simde_test_arm_neon_write_i16x4(2, rhs, SIMDE_TEST_VEC_POS_MIDDLE);
    simde_test_arm_neon_write_i32x4(2, product, SIMDE_TEST_VEC_POS_LAST);
  }
  return 1;
#endif
}

/* Checks simde_vmull_s32 against a fixed table of cases.
 *
 * Each case supplies two 2-lane int32 operands and the 2-lane int64 result
 * that is expected (the lane-wise product).  Returns 0 once every case has
 * been compared.
 *
 * The #else branch is compiled out by `#if 1`.  It prints random operands
 * together with the computed result and returns 1; it appears to be the
 * generator for the table below.
 */
static int
test_simde_vmull_s32 (SIMDE_MUNIT_TEST_ARGS) {
#if 1
  struct {
    int32_t a[2];  /* first operand */
    int32_t b[2];  /* second operand */
    int64_t r[2];  /* expected result */
  } test_vec[] = {
    { {  INT32_C(   930126813), -INT32_C(   560729004) },
      {  INT32_C(   166776726), -INT32_C(   422116933) },
      {  INT64_C(  155123504636954238),  INT64_C(  236693207412624732) } },
    { {  INT32_C(  1728012372),  INT32_C(   633898368) },
      { -INT32_C(  1137785715), -INT32_C(  1374263343) },
      { -INT64_C( 1966107792204865980), -INT64_C(  871143290329924224) } },
    { {  INT32_C(  1457882626),  INT32_C(   271874170) },
      {  INT32_C(    35267655),  INT32_C(  2045309221) },
      {  INT64_C(   51416101484262030),  INT64_C(  556066746852721570) } },
    { { -INT32_C(   757078191), -INT32_C(    84433043) },
      {  INT32_C(  1018635627), -INT32_C(  1897214580) },
      { -INT64_C(  771186817777310757),  INT64_C(  160187600213366940) } },
    { { -INT32_C(   823865517), -INT32_C(  1898047417) },
      {  INT32_C(  1636890684),  INT32_C(  1004173801) },
      { -INT64_C( 1348577789646143628), -INT64_C( 1905969489207122017) } },
    { { -INT32_C(   854738592), -INT32_C(   876084128) },
      { -INT32_C(  1241022678), -INT32_C(  1622806196) },
      {  INT64_C( 1060749976433789376),  INT64_C( 1421714751135657088) } },
    { {  INT32_C(   124594624),  INT32_C(  2123713602) },
      {  INT32_C(   786441796), -INT32_C(     9848161) },
      {  INT64_C(   97986419870504704), -INT64_C(   20914673470385922) } },
    { { -INT32_C(   724732300), -INT32_C(  1532979846) },
      {  INT32_C(  1616619284),  INT32_C(  1509925017) },
      { -INT64_C( 1171616211917673200), -INT64_C( 2314684620032207382) } },
  };
  const size_t n_vec = sizeof(test_vec) / sizeof(test_vec[0]);

  for (size_t i = 0 ; i < n_vec ; ++i) {
    simde_int32x2_t lhs = simde_vld1_s32(test_vec[i].a);
    simde_int32x2_t rhs = simde_vld1_s32(test_vec[i].b);
    simde_int64x2_t r = simde_vmull_s32(lhs, rhs);

    /* Arguments kept verbatim in case the helper reports them by name. */
    simde_test_arm_neon_assert_equal_i64x2(r, simde_vld1q_s64(test_vec[i].r));
  }

  return 0;

#else
  fputc('\n', stdout);
  for (int round = 0 ; round < 8 ; ++round) {
    simde_int32x2_t lhs = simde_test_arm_neon_random_i32x2();
    simde_int32x2_t rhs = simde_test_arm_neon_random_i32x2();
    simde_int64x2_t product = simde_vmull_s32(lhs, rhs);

    simde_test_arm_neon_write_i32x2(2, lhs, SIMDE_TEST_VEC_POS_FIRST);
    simde_test_arm_neon_write_i32x2(2, rhs, SIMDE_TEST_VEC_POS_MIDDLE);
    simde_test_arm_neon_write_i64x2(2, product, SIMDE_TEST_VEC_POS_LAST);
  }
  return 1;
#endif
}

/* Checks simde_vmull_u8 against a fixed table of cases.
 *
 * Each case supplies two 8-lane uint8 operands and the 8-lane uint16 result
 * that is expected (the lane-wise product).  Returns 0 once every case has
 * been compared.
 *
 * The #else branch is compiled out by `#if 1`.  It prints random operands
 * together with the computed result and returns 1; it appears to be the
 * generator for the table below.
 */
static int
test_simde_vmull_u8 (SIMDE_MUNIT_TEST_ARGS) {
#if 1
  struct {
    uint8_t a[8];   /* first operand */
    uint8_t b[8];   /* second operand */
    uint16_t r[8];  /* expected result */
  } test_vec[] = {
    { { UINT8_C( 94), UINT8_C(152), UINT8_C( 27), UINT8_C(118), UINT8_C(190), UINT8_C(231), UINT8_C( 17), UINT8_C(211) },
      { UINT8_C( 99), UINT8_C(112), UINT8_C( 80), UINT8_C(144), UINT8_C(181), UINT8_C(106), UINT8_C( 70), UINT8_C( 39) },
      { UINT16_C( 9306), UINT16_C(17024), UINT16_C( 2160), UINT16_C(16992), UINT16_C(34390), UINT16_C(24486), UINT16_C( 1190), UINT16_C( 8229) } },
    { { UINT8_C(237), UINT8_C(190), UINT8_C( 61), UINT8_C( 90), UINT8_C( 53), UINT8_C( 74), UINT8_C(239), UINT8_C( 23) },
      { UINT8_C( 70), UINT8_C(227),    UINT8_MAX, UINT8_C(159), UINT8_C(184), UINT8_C(227), UINT8_C(105), UINT8_C( 22) },
      { UINT16_C(16590), UINT16_C(43130), UINT16_C(15555), UINT16_C(14310), UINT16_C( 9752), UINT16_C(16798), UINT16_C(25095), UINT16_C(  506) } },
    { { UINT8_C(123), UINT8_C(132), UINT8_C(141), UINT8_C( 57), UINT8_C(108), UINT8_C(158), UINT8_C( 12), UINT8_C(207) },
      { UINT8_C( 14), UINT8_C( 93), UINT8_C( 96), UINT8_C(196), UINT8_C(199), UINT8_C(166), UINT8_C(235), UINT8_C(180) },
      { UINT16_C( 1722), UINT16_C(12276), UINT16_C(13536), UINT16_C(11172), UINT16_C(21492), UINT16_C(26228), UINT16_C( 2820), UINT16_C(37260) } },
    { { UINT8_C(100), UINT8_C( 40), UINT8_C( 15), UINT8_C(154), UINT8_C(114), UINT8_C(254), UINT8_C(177), UINT8_C(185) },
      { UINT8_C(226), UINT8_C(177), UINT8_C( 88), UINT8_C(154), UINT8_C(148), UINT8_C(193), UINT8_C(176), UINT8_C( 16) },
      { UINT16_C(22600), UINT16_C( 7080), UINT16_C( 1320), UINT16_C(23716), UINT16_C(16872), UINT16_C(49022), UINT16_C(31152), UINT16_C( 2960) } },
    { { UINT8_C( 69), UINT8_C( 61), UINT8_C( 73), UINT8_C(177), UINT8_C(220), UINT8_C( 86), UINT8_C(129), UINT8_C(234) },
      { UINT8_C(179), UINT8_C(225), UINT8_C(174), UINT8_C(122), UINT8_C(135), UINT8_C(153), UINT8_C( 46), UINT8_C(236) },
      { UINT16_C(12351), UINT16_C(13725), UINT16_C(12702), UINT16_C(21594), UINT16_C(29700), UINT16_C(13158), UINT16_C( 5934), UINT16_C(55224) } },
    { { UINT8_C(194), UINT8_C( 61), UINT8_C(134), UINT8_C( 52), UINT8_C( 60), UINT8_C( 55), UINT8_C(237), UINT8_C( 30) },
      { UINT8_C(232), UINT8_C( 69), UINT8_C(184), UINT8_C(125), UINT8_C(  6), UINT8_C(104), UINT8_C(141), UINT8_C( 76) },
      { UINT16_C(45008), UINT16_C( 4209), UINT16_C(24656), UINT16_C( 6500), UINT16_C(  360), UINT16_C( 5720), UINT16_C(33417), UINT16_C( 2280) } },
    { { UINT8_C(166), UINT8_C(214), UINT8_C(253), UINT8_C(130), UINT8_C( 44), UINT8_C(126), UINT8_C(108), UINT8_C(223) },
      { UINT8_C( 95), UINT8_C( 27), UINT8_C( 89), UINT8_C(231), UINT8_C(180), UINT8_C(136), UINT8_C(211), UINT8_C(118) },
      { UINT16_C(15770), UINT16_C( 5778), UINT16_C(22517), UINT16_C(30030), UINT16_C( 7920), UINT16_C(17136), UINT16_C(22788), UINT16_C(26314) } },
    { { UINT8_C(197), UINT8_C( 89), UINT8_C(171), UINT8_C(  1), UINT8_C(144), UINT8_C(152), UINT8_C( 31), UINT8_C(121) },
      { UINT8_C(222), UINT8_C(215), UINT8_C(246), UINT8_C(228), UINT8_C( 64), UINT8_C(131), UINT8_C( 48), UINT8_C(230) },
      { UINT16_C(43734), UINT16_C(19135), UINT16_C(42066), UINT16_C(  228), UINT16_C( 9216), UINT16_C(19912), UINT16_C( 1488), UINT16_C(27830) } },
  };
  const size_t n_vec = sizeof(test_vec) / sizeof(test_vec[0]);

  for (size_t i = 0 ; i < n_vec ; ++i) {
    simde_uint8x8_t lhs = simde_vld1_u8(test_vec[i].a);
    simde_uint8x8_t rhs = simde_vld1_u8(test_vec[i].b);
    simde_uint16x8_t r = simde_vmull_u8(lhs, rhs);

    /* Arguments kept verbatim in case the helper reports them by name. */
    simde_test_arm_neon_assert_equal_u16x8(r, simde_vld1q_u16(test_vec[i].r));
  }

  return 0;

#else
  fputc('\n', stdout);
  for (int round = 0 ; round < 8 ; ++round) {
    simde_uint8x8_t lhs = simde_test_arm_neon_random_u8x8();
    simde_uint8x8_t rhs = simde_test_arm_neon_random_u8x8();
    simde_uint16x8_t product = simde_vmull_u8(lhs, rhs);

    simde_test_arm_neon_write_u8x8(2, lhs, SIMDE_TEST_VEC_POS_FIRST);
    simde_test_arm_neon_write_u8x8(2, rhs, SIMDE_TEST_VEC_POS_MIDDLE);
    simde_test_arm_neon_write_u16x8(2, product, SIMDE_TEST_VEC_POS_LAST);
  }
  return 1;
#endif
}

/* Checks simde_vmull_u16 against a fixed table of cases.
 *
 * Each case supplies two 4-lane uint16 operands and the 4-lane uint32 result
 * that is expected (the lane-wise product).  Returns 0 once every case has
 * been compared.
 *
 * The #else branch is compiled out by `#if 1`.  It prints random operands
 * together with the computed result and returns 1; it appears to be the
 * generator for the table below.
 */
static int
test_simde_vmull_u16 (SIMDE_MUNIT_TEST_ARGS) {
#if 1
  struct {
    uint16_t a[4];  /* first operand */
    uint16_t b[4];  /* second operand */
    uint32_t r[4];  /* expected result */
  } test_vec[] = {
    { { UINT16_C(65254), UINT16_C(49526), UINT16_C(58343), UINT16_C(28199) },
      { UINT16_C( 3568), UINT16_C(53134), UINT16_C(38079), UINT16_C(44979) },
      { UINT32_C( 232826272), UINT32_C(2631514484), UINT32_C(2221643097), UINT32_C(1268362821) } },
    { { UINT16_C(41279), UINT16_C(54255), UINT16_C(49218), UINT16_C(49274) },
      { UINT16_C(44771), UINT16_C(52368), UINT16_C(62625), UINT16_C(34586) },
      { UINT32_C(1848102109), UINT32_C(2841225840), UINT32_C(3082277250), UINT32_C(1704190564) } },
    { { UINT16_C(37107), UINT16_C(55881), UINT16_C(28787), UINT16_C(25416) },
      { UINT16_C(54910), UINT16_C(15666), UINT16_C(58986), UINT16_C(43500) },
      { UINT32_C(2037545370), UINT32_C( 875431746), UINT32_C(1698029982), UINT32_C(1105596000) } },
    { { UINT16_C(56455), UINT16_C(51581), UINT16_C(63388), UINT16_C(32649) },
      { UINT16_C( 6821), UINT16_C(17995), UINT16_C(25870), UINT16_C(  462) },
      { UINT32_C( 385079555), UINT32_C( 928200095), UINT32_C(1639847560), UINT32_C(  15083838) } },
    { { UINT16_C( 6133), UINT16_C(27099), UINT16_C( 9351), UINT16_C( 1484) },
      { UINT16_C(65530), UINT16_C(25923), UINT16_C(12261), UINT16_C(27662) },
      { UINT32_C( 401895490), UINT32_C( 702487377), UINT32_C( 114652611), UINT32_C(  41050408) } },
    { { UINT16_C(35595), UINT16_C(43062), UINT16_C(49027), UINT16_C(10279) },
      { UINT16_C(29657), UINT16_C(59503), UINT16_C(15832), UINT16_C(52969) },
      { UINT32_C(1055640915), UINT32_C(2562318186), UINT32_C( 776195464), UINT32_C( 544468351) } },
    { { UINT16_C(50516), UINT16_C(56119), UINT16_C( 1001), UINT16_C(58337) },
      { UINT16_C( 9218), UINT16_C(59208), UINT16_C(22355), UINT16_C(24404) },
      { UINT32_C( 465656488), UINT32_C(3322693752), UINT32_C(  22377355), UINT32_C(1423656148) } },
    { { UINT16_C(35554), UINT16_C(25863), UINT16_C(11849), UINT16_C( 9102) },
      { UINT16_C(64929), UINT16_C(31243), UINT16_C(62522), UINT16_C(36424) },
      { UINT32_C(2308485666), UINT32_C( 808037709), UINT32_C( 740823178), UINT32_C( 331531248) } },
  };
  const size_t n_vec = sizeof(test_vec) / sizeof(test_vec[0]);

  for (size_t i = 0 ; i < n_vec ; ++i) {
    simde_uint16x4_t lhs = simde_vld1_u16(test_vec[i].a);
    simde_uint16x4_t rhs = simde_vld1_u16(test_vec[i].b);
    simde_uint32x4_t r = simde_vmull_u16(lhs, rhs);

    /* Arguments kept verbatim in case the helper reports them by name. */
    simde_test_arm_neon_assert_equal_u32x4(r, simde_vld1q_u32(test_vec[i].r));
  }

  return 0;

#else
  fputc('\n', stdout);
  for (int round = 0 ; round < 8 ; ++round) {
    simde_uint16x4_t lhs = simde_test_arm_neon_random_u16x4();
    simde_uint16x4_t rhs = simde_test_arm_neon_random_u16x4();
    simde_uint32x4_t product = simde_vmull_u16(lhs, rhs);

    simde_test_arm_neon_write_u16x4(2, lhs, SIMDE_TEST_VEC_POS_FIRST);
    simde_test_arm_neon_write_u16x4(2, rhs, SIMDE_TEST_VEC_POS_MIDDLE);
    simde_test_arm_neon_write_u32x4(2, product, SIMDE_TEST_VEC_POS_LAST);
  }
  return 1;
#endif
}

/* Checks simde_vmull_u32 against a fixed table of cases.
 *
 * Each case supplies two 2-lane uint32 operands and the 2-lane uint64 result
 * that is expected (the lane-wise product).  Returns 0 once every case has
 * been compared.
 *
 * The #else branch is compiled out by `#if 1`.  It prints random operands
 * together with the computed result and returns 1; it appears to be the
 * generator for the table below.
 */
static int
test_simde_vmull_u32 (SIMDE_MUNIT_TEST_ARGS) {
#if 1
  struct {
    uint32_t a[2];  /* first operand */
    uint32_t b[2];  /* second operand */
    uint64_t r[2];  /* expected result */
  } test_vec[] = {
    { { UINT32_C(1764671971), UINT32_C( 417693998) },
      { UINT32_C(3142976160), UINT32_C(3551123166) },
      { UINT64_C( 5546321935073211360), UINT64_C( 1483282832596957668) } },
    { { UINT32_C(1771619725), UINT32_C(3853953090) },
      { UINT32_C(4088780350), UINT32_C(1688137088) },
      { UINT64_C( 7243763919252403750), UINT64_C( 6506001146641201920) } },
    { { UINT32_C(3721252015), UINT32_C(4009079374) },
      { UINT32_C(2477411253), UINT32_C(2926007073) },
      { UINT64_C( 9219071617209924795), UINT64_C(11730594604542412302) } },
    { { UINT32_C(1528299288), UINT32_C(3863006887) },
      { UINT32_C( 618263972), UINT32_C(2441639906) },
      { UINT64_C(  944892388203651936), UINT64_C( 9432071772452032622) } },
    { { UINT32_C(2456704580), UINT32_C(3179307784) },
      { UINT32_C(3494980270), UINT32_C(2524887166) },
      { UINT64_C( 8586134036318636600), UINT64_C( 8027393420585500144) } },
    { { UINT32_C(1592891063), UINT32_C( 138686820) },
      { UINT32_C( 153886246), UINT32_C(3650794901) },
      { UINT64_C(  245124025972019498), UINT64_C(  506317135291904820) } },
    { { UINT32_C( 325781771), UINT32_C( 466742380) },
      { UINT32_C(2498437654), UINT32_C(2435541466) },
      { UINT64_C(  813945443653205234), UINT64_C( 1136770420429529080) } },
    { { UINT32_C(1676614911), UINT32_C(1953182798) },
      { UINT32_C(3883767890), UINT32_C(1489049677) },
      { UINT64_C( 6511583155237007790), UINT64_C( 2908386214483856246) } },
  };
  const size_t n_vec = sizeof(test_vec) / sizeof(test_vec[0]);

  for (size_t i = 0 ; i < n_vec ; ++i) {
    simde_uint32x2_t lhs = simde_vld1_u32(test_vec[i].a);
    simde_uint32x2_t rhs = simde_vld1_u32(test_vec[i].b);
    simde_uint64x2_t r = simde_vmull_u32(lhs, rhs);

    /* Arguments kept verbatim in case the helper reports them by name. */
    simde_test_arm_neon_assert_equal_u64x2(r, simde_vld1q_u64(test_vec[i].r));
  }

  return 0;

#else
  fputc('\n', stdout);
  for (int round = 0 ; round < 8 ; ++round) {
    simde_uint32x2_t lhs = simde_test_arm_neon_random_u32x2();
    simde_uint32x2_t rhs = simde_test_arm_neon_random_u32x2();
    simde_uint64x2_t product = simde_vmull_u32(lhs, rhs);

    simde_test_arm_neon_write_u32x2(2, lhs, SIMDE_TEST_VEC_POS_FIRST);
    simde_test_arm_neon_write_u32x2(2, rhs, SIMDE_TEST_VEC_POS_MIDDLE);
    simde_test_arm_neon_write_u64x2(2, product, SIMDE_TEST_VEC_POS_LAST);
  }
  return 1;
#endif
}

/* Checks simde_vmull_p8 against a fixed table of cases.
 *
 * Each case supplies two 8-lane poly8 operands and the 8-lane poly16 result
 * that is expected.  The expected lanes are consistent with carry-less
 * multiplication (partial products combined with XOR), e.g. 3 x 45 -> 119
 * in the second case.  Returns 0 once every case has been compared.
 *
 * The #else branch is compiled out by `#if 1`.  It prints random operands
 * together with the computed result and returns 1; it appears to be the
 * generator for the table below.
 */
static int
test_simde_vmull_p8 (SIMDE_MUNIT_TEST_ARGS) {
#if 1
  struct {
    simde_poly8_t a[8];   /* first operand */
    simde_poly8_t b[8];   /* second operand */
    simde_poly16_t r[8];  /* expected result */
  } test_vec[] = {
    { {  SIMDE_POLY8_C(   185),  SIMDE_POLY8_C(   129),  SIMDE_POLY8_C(   202),  SIMDE_POLY8_C(   179),
         SIMDE_POLY8_C(    49),  SIMDE_POLY8_C(   106),  SIMDE_POLY8_C(    44),  SIMDE_POLY8_C(    74) },
      {  SIMDE_POLY8_C(   172),  SIMDE_POLY8_C(   196),  SIMDE_POLY8_C(   174),  SIMDE_POLY8_C(   244),
         SIMDE_POLY8_C(   107),  SIMDE_POLY8_C(   166),  SIMDE_POLY8_C(   249),  SIMDE_POLY8_C(    44) },
      {  SIMDE_POLY16_C(   19596),  SIMDE_POLY16_C(   25284),  SIMDE_POLY16_C(   30892),  SIMDE_POLY16_C(   27356),
         SIMDE_POLY16_C(    3003),  SIMDE_POLY16_C(   14652),  SIMDE_POLY16_C(    6924),  SIMDE_POLY16_C(    2616) } },
    { {  SIMDE_POLY8_C(   120),  SIMDE_POLY8_C(     3),  SIMDE_POLY8_C(   247),  SIMDE_POLY8_C(    24),
         SIMDE_POLY8_C(     5),  SIMDE_POLY8_C(   122),  SIMDE_POLY8_C(    20),  SIMDE_POLY8_C(    38) },
      {  SIMDE_POLY8_C(   198),  SIMDE_POLY8_C(    45),  SIMDE_POLY8_C(    54),  SIMDE_POLY8_C(   185),
         SIMDE_POLY8_C(    37),  SIMDE_POLY8_C(    64),  SIMDE_POLY8_C(   214),  SIMDE_POLY8_C(   191) },
      {  SIMDE_POLY16_C(    8976),  SIMDE_POLY16_C(     119),  SIMDE_POLY16_C(    5026),  SIMDE_POLY16_C(    3672),
         SIMDE_POLY16_C(     177),  SIMDE_POLY16_C(    7808),  SIMDE_POLY16_C(    3640),  SIMDE_POLY16_C(    5218) } },
    { {  SIMDE_POLY8_C(    17),  SIMDE_POLY8_C(    86),  SIMDE_POLY8_C(   204),  SIMDE_POLY8_C(   173),
         SIMDE_POLY8_C(    69),  SIMDE_POLY8_C(    59),  SIMDE_POLY8_C(    27),  SIMDE_POLY8_C(    56) },
      {  SIMDE_POLY8_C(   122),  SIMDE_POLY8_C(     0),  SIMDE_POLY8_C(   185),  SIMDE_POLY8_C(    13),
         SIMDE_POLY8_C(   243),  SIMDE_POLY8_C(    23),  SIMDE_POLY8_C(     9),  SIMDE_POLY8_C(   227) },
      {  SIMDE_POLY16_C(    2010),  SIMDE_POLY16_C(       0),  SIMDE_POLY16_C(   30188),  SIMDE_POLY16_C(    1905),
         SIMDE_POLY16_C(   16383),  SIMDE_POLY16_C(     785),  SIMDE_POLY16_C(     195),  SIMDE_POLY16_C(    5448) } },
    { {  SIMDE_POLY8_C(    42),  SIMDE_POLY8_C(   173),  SIMDE_POLY8_C(    65),  SIMDE_POLY8_C(    38),
         SIMDE_POLY8_C(    80),  SIMDE_POLY8_C(   199),  SIMDE_POLY8_C(   134),  SIMDE_POLY8_C(   200) },
      {  SIMDE_POLY8_C(    70),  SIMDE_POLY8_C(   119),  SIMDE_POLY8_C(   226),  SIMDE_POLY8_C(   247),
         SIMDE_POLY8_C(   223),  SIMDE_POLY8_C(    37),  SIMDE_POLY8_C(   108),  SIMDE_POLY8_C(    80) },
      {  SIMDE_POLY16_C(    2684),  SIMDE_POLY16_C(   14195),  SIMDE_POLY16_C(   14434),  SIMDE_POLY16_C(    7378),
         SIMDE_POLY16_C(   14896),  SIMDE_POLY16_C(    6971),  SIMDE_POLY16_C(   14184),  SIMDE_POLY16_C(   16000) } },
    { {  SIMDE_POLY8_C(   127),  SIMDE_POLY8_C(   107),  SIMDE_POLY8_C(    68),  SIMDE_POLY8_C(   137),
         SIMDE_POLY8_C(   245),  SIMDE_POLY8_C(    88),  SIMDE_POLY8_C(    61),  SIMDE_POLY8_C(   182) },
      {  SIMDE_POLY8_C(    37),  SIMDE_POLY8_C(   182),  SIMDE_POLY8_C(   241),  SIMDE_POLY8_C(    64),
         SIMDE_POLY8_C(    63),  SIMDE_POLY8_C(   134),  SIMDE_POLY8_C(   160),  SIMDE_POLY8_C(   224) },
      {  SIMDE_POLY16_C(    3683),  SIMDE_POLY16_C(   16170),  SIMDE_POLY16_C(   16260),  SIMDE_POLY16_C(    8768),
         SIMDE_POLY16_C(    5267),  SIMDE_POLY16_C(   11728),  SIMDE_POLY16_C(    6432),  SIMDE_POLY16_C(   24640) } },
    { {  SIMDE_POLY8_C(     0),  SIMDE_POLY8_C(   196),  SIMDE_POLY8_C(    41),  SIMDE_POLY8_C(    67),
         SIMDE_POLY8_C(    61),  SIMDE_POLY8_C(   176),  SIMDE_POLY8_C(    91),  SIMDE_POLY8_C(   135) },
      {  SIMDE_POLY8_C(   166),  SIMDE_POLY8_C(   199),  SIMDE_POLY8_C(   200),  SIMDE_POLY8_C(   195),
         SIMDE_POLY8_C(   192),  SIMDE_POLY8_C(   118),  SIMDE_POLY8_C(   158),  SIMDE_POLY8_C(   222) },
      {  SIMDE_POLY16_C(       0),  SIMDE_POLY16_C(   20828),  SIMDE_POLY16_C(    8072),  SIMDE_POLY16_C(   12677),
         SIMDE_POLY16_C(    4544),  SIMDE_POLY16_C(   12960),  SIMDE_POLY16_C(   11058),  SIMDE_POLY16_C(   27930) } },
    { {  SIMDE_POLY8_C(   122),  SIMDE_POLY8_C(   199),  SIMDE_POLY8_C(   173),  SIMDE_POLY8_C(     4),
         SIMDE_POLY8_C(   213),  SIMDE_POLY8_C(    48),  SIMDE_POLY8_C(    71),  SIMDE_POLY8_C(   196) },
      {  SIMDE_POLY8_C(   179),  SIMDE_POLY8_C(     3),  SIMDE_POLY8_C(   160),  SIMDE_POLY8_C(   204),
         SIMDE_POLY8_C(    80),  SIMDE_POLY8_C(    29),  SIMDE_POLY8_C(    12),  SIMDE_POLY8_C(    98) },
      {  SIMDE_POLY16_C(   13678),  SIMDE_POLY16_C(     329),  SIMDE_POLY16_C(   17184),  SIMDE_POLY16_C(     816),
         SIMDE_POLY16_C(   14352),  SIMDE_POLY16_C(     624),  SIMDE_POLY16_C(     804),  SIMDE_POLY16_C(   10248) } },
    { {  SIMDE_POLY8_C(    82),  SIMDE_POLY8_C(   238),  SIMDE_POLY8_C(    78),  SIMDE_POLY8_C(    52),
         SIMDE_POLY8_C(   152),  SIMDE_POLY8_C(   159),  SIMDE_POLY8_C(   178),  SIMDE_POLY8_C(    24) },
      {  SIMDE_POLY8_C(   194),  SIMDE_POLY8_C(   223),  SIMDE_POLY8_C(   173),  SIMDE_POLY8_C(   189),
         SIMDE_POLY8_C(   250),  SIMDE_POLY8_C(   104),  SIMDE_POLY8_C(   117),  SIMDE_POLY8_C(    50) },
      {  SIMDE_POLY16_C(   15652),  SIMDE_POLY16_C(   18330),  SIMDE_POLY16_C(   11718),  SIMDE_POLY16_C(    7812),
         SIMDE_POLY16_C(   30064),  SIMDE_POLY16_C(   12504),  SIMDE_POLY16_C(   13210),  SIMDE_POLY16_C(     688) } },
  };
  const size_t n_vec = sizeof(test_vec) / sizeof(test_vec[0]);

  for (size_t i = 0 ; i < n_vec ; ++i) {
    simde_poly8x8_t lhs = simde_vld1_p8(test_vec[i].a);
    simde_poly8x8_t rhs = simde_vld1_p8(test_vec[i].b);
    simde_poly16x8_t r = simde_vmull_p8(lhs, rhs);

    /* Arguments kept verbatim in case the helper reports them by name. */
    simde_test_arm_neon_assert_equal_p16x8(r, simde_vld1q_p16(test_vec[i].r));
  }

  return 0;

#else
  fputc('\n', stdout);
  for (int round = 0 ; round < 8 ; ++round) {
    simde_poly8x8_t lhs = simde_test_arm_neon_random_p8x8();
    simde_poly8x8_t rhs = simde_test_arm_neon_random_p8x8();
    simde_poly16x8_t product = simde_vmull_p8(lhs, rhs);

    simde_test_arm_neon_write_p8x8(2, lhs, SIMDE_TEST_VEC_POS_FIRST);
    simde_test_arm_neon_write_p8x8(2, rhs, SIMDE_TEST_VEC_POS_MIDDLE);
    simde_test_arm_neon_write_p16x8(2, product, SIMDE_TEST_VEC_POS_LAST);
  }
  return 1;
#endif
}

#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) && !defined(SIMDE_BUG_GCC_113065)
static int
test_simde_vmull_p64 (SIMDE_MUNIT_TEST_ARGS) {
#if 1
  struct {
    simde_poly64_t a[1];
    simde_poly64_t b[1];
    simde_poly64_t r[2];
  } test_vec[] = {
    { {  SIMDE_POLY64_C( 7068902937866061824) },
      {  SIMDE_POLY64_C(14874139788804648960) },
      {  SIMDE_POLY64_C( 3152885789599675803),  SIMDE_POLY64_C( 3604645741034733568) } },
    { {  SIMDE_POLY64_C( 6112195614237017088) },
      {  SIMDE_POLY64_C(15100063711026538496) },
      {  SIMDE_POLY64_C( 4210036932911217869),  SIMDE_POLY64_C( 8872695348131266560) } },
    { {  SIMDE_POLY64_C( 6184300182471711744) },
      {  SIMDE_POLY64_C(15413425443050586112) },
      {  SIMDE_POLY64_C( 4320778884545928455),  SIMDE_POLY64_C(16183132146376900608) } },
    { {  SIMDE_POLY64_C(18427427496663795712) },
      {  SIMDE_POLY64_C(17337026183024695296) },
      {  SIMDE_POLY64_C( 5785060070923143728),  SIMDE_POLY64_C( 7243298849874247680) } },
    { {  SIMDE_POLY64_C( 5898302655747178496) },
      {  SIMDE_POLY64_C(14270322748663631872) },
      {  SIMDE_POLY64_C( 4430735050927400579),  SIMDE_POLY64_C(  281238396959981568) } },
    { {  SIMDE_POLY64_C( 4310271262858839552) },
      {  SIMDE_POLY64_C(11226476335783634944) },
      {  SIMDE_POLY64_C( 2293839743262371683),  SIMDE_POLY64_C(13199221759279104000) } },
    { {  SIMDE_POLY64_C( 1504197760910681088) },
      {  SIMDE_POLY64_C(12397747313193005056) },
      {  SIMDE_POLY64_C(  576860236216524364),  SIMDE_POLY64_C( 6977643691527634944) } },
    { {  SIMDE_POLY64_C(17342940921599655936) },
      {  SIMDE_POLY64_C( 2283591279968234496) },
      {  SIMDE_POLY64_C(  735715569773265056),  SIMDE_POLY64_C( 3633255345754734592) } },
  };

  for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) {
    simde_poly64_t a = test_vec[i].a[0];
    simde_poly64_t b = test_vec[i].b[0];
    simde_poly128_t r = simde_vmull_p64(a, b);
    simde_poly128_t mask = HEDLEY_STATIC_CAST(simde_poly128_t, 0xFFFFFFFFFFFFFFFFull);
    simde_poly64_t top_r = HEDLEY_STATIC_CAST(simde_poly64_t, ((r >> 64) & mask));
    simde_poly64_t bottom_r = HEDLEY_STATIC_CAST(simde_poly64_t, (r & mask));

    simde_assert_equal_p64(top_r, test_vec[i].r[0]);
    simde_assert_equal_p64(bottom_r, test_vec[i].r[1]);
  }

  return 0;

#else
  fputc('\n', stdout);
  for (int i = 0 ; i < 8 ; i++) {
    simde_poly64x1_t a = simde_test_arm_neon_random_p64x1();
    simde_poly64x1_t b = simde_test_arm_neon_random_p64x1();
    simde_poly64x2_t r = simde_vmull_p64(a, b);

    simde_test_arm_neon_write_p64x1(2, a, SIMDE_TEST_VEC_POS_FIRST);
    simde_test_arm_neon_write_p64x1(2, b, SIMDE_TEST_VEC_POS_MIDDLE);
    simde_test_arm_neon_write_p64x2(2, r, SIMDE_TEST_VEC_POS_LAST);
  }
  return 1;
#endif
}
#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) && !defined(SIMDE_BUG_GCC_113065) */

/* List of the vmull tests in this file, one entry per variant.  The macros
 * are defined outside this file; presumably each entry refers to the
 * matching test_simde_<name> function -- confirm against the test headers. */
SIMDE_TEST_FUNC_LIST_BEGIN
SIMDE_TEST_FUNC_LIST_ENTRY(vmull_s8)
SIMDE_TEST_FUNC_LIST_ENTRY(vmull_s16)
SIMDE_TEST_FUNC_LIST_ENTRY(vmull_s32)
SIMDE_TEST_FUNC_LIST_ENTRY(vmull_u8)
SIMDE_TEST_FUNC_LIST_ENTRY(vmull_u16)
SIMDE_TEST_FUNC_LIST_ENTRY(vmull_u32)

SIMDE_TEST_FUNC_LIST_ENTRY(vmull_p8)
/* Same guard as the definition of test_simde_vmull_p64, so the entry is only
 * listed when that function is compiled. */
#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) && !defined(SIMDE_BUG_GCC_113065)
  SIMDE_TEST_FUNC_LIST_ENTRY(vmull_p64)
#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) && !defined(SIMDE_BUG_GCC_113065) */
SIMDE_TEST_FUNC_LIST_END

#include "test-neon-footer.h"