File: fpstring.c

package info (click to toggle)
ghc 9.10.3-3
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 169,076 kB
  • sloc: haskell: 713,554; ansic: 84,184; cpp: 30,255; javascript: 9,003; sh: 7,870; fortran: 3,527; python: 3,228; asm: 2,523; makefile: 2,324; yacc: 1,570; lisp: 532; xml: 196; perl: 111; csh: 2
file content (318 lines) | stat: -rw-r--r-- 10,740 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
/*
 * Copyright (c) 2003 David Roundy
 * Copyright (c) 2005-6 Don Stewart
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the authors or the names of any contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "HsFFI.h"
#include "MachDeps.h"

#include "fpstring.h"
#if defined(__x86_64__)
#include <x86intrin.h>
#include <cpuid.h>
#endif

#include <stdint.h>
#include <stdbool.h>

#if defined(__x86_64__) && (__GNUC__ >= 7 || __GNUC__ == 6 && __GNUC_MINOR__ >= 3 || defined(__clang_major__)) && !defined(__STDC_NO_ATOMICS__)
#include <stdatomic.h>
#define USE_SIMD_COUNT
#endif

/* copy a string in reverse */
void fps_reverse(unsigned char *q, unsigned char *p, size_t n) {
    p += n-1;
    while (n-- != 0)
        *q++ = *p--;
}

/* duplicate a string, interspersing the character through the elements
   of the duplicated string */
void fps_intersperse(unsigned char *q,
                     unsigned char *p,
                     size_t n,
                     unsigned char c) {
#if defined(__x86_64__)
  {
    const __m128i separator = _mm_set1_epi8(c);
    const unsigned char *const p_begin = p;
    const unsigned char *const p_end = p_begin + n - 9;
    while (p < p_end) {
      const __m128i eight_src_bytes = _mm_loadl_epi64((__m128i *)p);
      const __m128i sixteen_dst_bytes = _mm_unpacklo_epi8(eight_src_bytes, separator);
      _mm_storeu_si128((__m128i *)q, sixteen_dst_bytes);
      p += 8;
      q += 16;
    }
    n -= p - p_begin;
  }
#endif
    while (n > 1) {
        *q++ = *p++;
        *q++ = c;
        n--;
    }
    if (n == 1)
        *q = *p;
}

/* find maximum char in a packed string */
unsigned char fps_maximum(unsigned char *p, size_t len) {
    unsigned char *q, c = *p;
    for (q = p; q < p + len; q++)
        if (*q > c)
            c = *q;
    return c;
}

/* find minimum char in a packed string */
unsigned char fps_minimum(unsigned char *p, size_t len) {
    unsigned char *q, c = *p;
    for (q = p; q < p + len; q++)
        if (*q < c)
            c = *q;
    return c;
}

int fps_compare(const void *a, const void *b) {
    return (int)*(unsigned char*)a - (int)*(unsigned char*)b;
}

void fps_sort(unsigned char *p, size_t len) {
    return qsort(p, len, 1, fps_compare);
}

// We don't actually always use these unaligned write functions on the
// Haskell side, but the macros we check there aren't visible here...
void fps_unaligned_write_u16(uint16_t x, uint8_t *p) {
  memcpy(p, &x, 2);
  return;
}

void fps_unaligned_write_u32(uint32_t x, uint8_t *p) {
  memcpy(p, &x, 4);
  return;
}

void fps_unaligned_write_u64(uint64_t x, uint8_t *p) {
  memcpy(p, &x, 8);
  return;
}

void fps_unaligned_write_HsFloat(HsFloat x, uint8_t *p) {
  memcpy(p, &x, SIZEOF_HSFLOAT);
}

void fps_unaligned_write_HsDouble(HsDouble x, uint8_t *p) {
  memcpy(p, &x, SIZEOF_HSDOUBLE);
}

uint64_t fps_unaligned_read_u64(uint8_t *p) {
  uint64_t ans;
  memcpy(&ans, p, 8);
  return ans;
}

/* count the number of occurrences of a char in a string */
size_t fps_count_naive(unsigned char *str, size_t len, unsigned char w) {
    size_t c;
    for (c = 0; len-- != 0; ++str)
        if (*str == w)
            ++c;
    return c;
}


#ifdef USE_SIMD_COUNT
__attribute__((target("sse4.2")))
size_t fps_count_cmpestrm(unsigned char *str, size_t len, unsigned char w) {
    const __m128i pat = _mm_set1_epi8(w);

    size_t res = 0;

    size_t i = 0;

    for (; i < len && (intptr_t)(str + i) % 64; ++i) {
        res += str[i] == w;
    }

    for (size_t end = len - 128; i < end; i += 128) {
        __m128i p0 = _mm_load_si128((const __m128i*)(str + i + 16 * 0));
        __m128i p1 = _mm_load_si128((const __m128i*)(str + i + 16 * 1));
        __m128i p2 = _mm_load_si128((const __m128i*)(str + i + 16 * 2));
        __m128i p3 = _mm_load_si128((const __m128i*)(str + i + 16 * 3));
        __m128i p4 = _mm_load_si128((const __m128i*)(str + i + 16 * 4));
        __m128i p5 = _mm_load_si128((const __m128i*)(str + i + 16 * 5));
        __m128i p6 = _mm_load_si128((const __m128i*)(str + i + 16 * 6));
        __m128i p7 = _mm_load_si128((const __m128i*)(str + i + 16 * 7));
        // Here, cmpestrm compares two strings in the following mode:
        // * _SIDD_SBYTE_OPS: interprets the strings as consisting of 8-bit chars,
        // * _SIDD_CMP_EQUAL_EACH: computes the number of `i`s
        //    for which `p[i]`, a part of `str`, is equal to `pat[i]`
        //    (the latter being always equal to `w`).
        //
        // q.v. https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrm&expand=835
#define MODE _SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_EACH
        __m128i r0 = _mm_cmpestrm(p0, 16, pat, 16, MODE);
        __m128i r1 = _mm_cmpestrm(p1, 16, pat, 16, MODE);
        __m128i r2 = _mm_cmpestrm(p2, 16, pat, 16, MODE);
        __m128i r3 = _mm_cmpestrm(p3, 16, pat, 16, MODE);
        __m128i r4 = _mm_cmpestrm(p4, 16, pat, 16, MODE);
        __m128i r5 = _mm_cmpestrm(p5, 16, pat, 16, MODE);
        __m128i r6 = _mm_cmpestrm(p6, 16, pat, 16, MODE);
        __m128i r7 = _mm_cmpestrm(p7, 16, pat, 16, MODE);
#undef MODE
        res += _popcnt64(_mm_extract_epi64(r0, 0));
        res += _popcnt64(_mm_extract_epi64(r1, 0));
        res += _popcnt64(_mm_extract_epi64(r2, 0));
        res += _popcnt64(_mm_extract_epi64(r3, 0));
        res += _popcnt64(_mm_extract_epi64(r4, 0));
        res += _popcnt64(_mm_extract_epi64(r5, 0));
        res += _popcnt64(_mm_extract_epi64(r6, 0));
        res += _popcnt64(_mm_extract_epi64(r7, 0));
    }

    for (; i < len; ++i) {
        res += str[i] == w;
    }

    return res;
}

__attribute__((target("avx2")))
size_t fps_count_avx2(unsigned char *str, size_t len, unsigned char w) {
    __m256i pat = _mm256_set1_epi8(w);

    size_t prefix = 0, res = 0;

    size_t i = 0;

    for (; i < len && (intptr_t)(str + i) % 64; ++i) {
        prefix += str[i] == w;
    }

    for (size_t end = len - 128; i < end; i += 128) {
        __m256i p0 = _mm256_load_si256((const __m256i*)(str + i + 32 * 0));
        __m256i p1 = _mm256_load_si256((const __m256i*)(str + i + 32 * 1));
        __m256i p2 = _mm256_load_si256((const __m256i*)(str + i + 32 * 2));
        __m256i p3 = _mm256_load_si256((const __m256i*)(str + i + 32 * 3));
        __m256i r0 = _mm256_cmpeq_epi8(p0, pat);
        __m256i r1 = _mm256_cmpeq_epi8(p1, pat);
        __m256i r2 = _mm256_cmpeq_epi8(p2, pat);
        __m256i r3 = _mm256_cmpeq_epi8(p3, pat);
        res += _popcnt64(_mm256_extract_epi64(r0, 0));
        res += _popcnt64(_mm256_extract_epi64(r0, 1));
        res += _popcnt64(_mm256_extract_epi64(r0, 2));
        res += _popcnt64(_mm256_extract_epi64(r0, 3));
        res += _popcnt64(_mm256_extract_epi64(r1, 0));
        res += _popcnt64(_mm256_extract_epi64(r1, 1));
        res += _popcnt64(_mm256_extract_epi64(r1, 2));
        res += _popcnt64(_mm256_extract_epi64(r1, 3));
        res += _popcnt64(_mm256_extract_epi64(r2, 0));
        res += _popcnt64(_mm256_extract_epi64(r2, 1));
        res += _popcnt64(_mm256_extract_epi64(r2, 2));
        res += _popcnt64(_mm256_extract_epi64(r2, 3));
        res += _popcnt64(_mm256_extract_epi64(r3, 0));
        res += _popcnt64(_mm256_extract_epi64(r3, 1));
        res += _popcnt64(_mm256_extract_epi64(r3, 2));
        res += _popcnt64(_mm256_extract_epi64(r3, 3));
    }

    // _mm256_cmpeq_epi8(p, pat) returns a SIMD vector
    // with `i`th byte consisting of eight `1`s if `p[i] == pat[i]`,
    // and of eight `0`s otherwise,
    // hence each matching byte is counted 8 times by popcnt.
    // Dividing by 8 corrects for that.
    res /= 8;

    res += prefix;

    for (; i < len; ++i) {
        res += str[i] == w;
    }

    return res;
}

typedef size_t (*fps_impl_t) (unsigned char*, size_t, unsigned char);

fps_impl_t select_fps_simd_impl() {
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    uint32_t ecx1 = 0;
    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
        ecx1 = ecx;
    }

    const bool has_xsave = ecx1 & (1 << 26);
    const bool has_popcnt = ecx1 & (1 << 23);

    if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
        const bool has_avx2 = has_xsave && (ebx & (1 << 5));
        if (has_avx2 && has_popcnt) {
            return &fps_count_avx2;
        }
    }

    const bool has_sse42 = ecx1 & (1 << 19);
    if (has_sse42 && has_popcnt) {
        return &fps_count_cmpestrm;
    }

    return &fps_count_naive;
}
#endif



size_t fps_count(unsigned char *str, size_t len, unsigned char w) {
#ifndef USE_SIMD_COUNT
    return fps_count_naive(str, len, w);
#else
    // 1024 is a rough guesstimate of the string length
    // for which the extra performance of the main SIMD loop
    // starts to compensate the extra work and extra branching outside the SIMD loop.
    // The real optimal number depends on the specific μarch
    // and isn't worth optimizing for in this context,
    // since counting characters in shorter strings is unlikely to be a hot spot.
    if (len <= 1024) {
        return fps_count_naive(str, len, w);
    }

    static _Atomic fps_impl_t s_impl = (fps_impl_t)NULL;
    fps_impl_t impl = atomic_load_explicit(&s_impl, memory_order_relaxed);
    if (!impl) {
      impl = select_fps_simd_impl();
      atomic_store_explicit(&s_impl, impl, memory_order_relaxed);
    }

    return (*impl)(str, len, w);
#endif
}