/// \file
/// \brief String tokenization
/// \ingroup cgraph_utils
///
/// This is essentially equivalent to `strtok` but with two main improvements:
///
/// 1. The input string is not modified. This means, if you have a `const`
/// string, you do not need to `strdup` it in order to tokenize it. This
/// (combined with other properties like no opaque struct pointers) enables
/// you to tokenize a string with no heap allocation.
///
/// 2. No global state. All the state for tokenization is contained in the
/// `tok_t` struct.
///
/// The above two properties are intended to make string tokenization scalable
/// (no locks, no thread-shared state) and transparent to the compiler (a good
/// optimizing compiler implements all the string.h functions we use as
/// built-ins and, if `separators` is a compile-time literal, can typically
/// flatten everything into a tight loop with no function calls).
///
/// Sample usage:
///
/// const char my_input[] = "foo; bar:/baz";
/// for (tok_t t = tok(my_input, ";:/"); !tok_end(&t); tok_next(&t)) {
/// strview_t s = tok_get(&t);
/// printf("%.*s\n", (int)s.size, s.data);
/// }
/// // prints “foo”, “ bar”, “baz”
#include <assert.h>
#include <stdbool.h> // `bool` return type of tok_end; do not rely on strview.h to include it
#include <stddef.h>
#include <string.h>
#include <util/strview.h>
/// state for an in-progress string tokenization
///
/// Callers should treat this as opaque and use only `tok`, `tok_end`,
/// `tok_get`, and `tok_next`; an exhausted tokenizer is represented by
/// `next.data == NULL`.
typedef struct {
const char *start; ///< start of the string being scanned
const char *separators; ///< characters to treat as token separators
strview_t next; ///< next token to yield; `.data == NULL` once exhausted
} tok_t;
/// begin tokenization of a new string
///
/// \param input NUL-terminated string to scan; it is not copied, so it must
///   outlive all use of the returned tokenizer
/// \param separators non-empty, duplicate-free set of delimiter characters
/// \return a tokenizer positioned at the first token of `input`
static inline tok_t tok(const char *input, const char *separators) {
  assert(input != NULL);
  assert(separators != NULL);
  assert(strcmp(separators, "") != 0 &&
         "at least one separator must be provided");

#ifndef NDEBUG
  // each separator character should appear at most once in the set
  for (const char *a = separators; *a != '\0'; ++a) {
    for (const char *b = a + 1; *b != '\0'; ++b) {
      assert(*a != *b && "duplicate separator characters");
    }
  }
#endif

  tok_t t = {.start = input, .separators = separators};

  // the first token spans from the start of the string up to (but not
  // including) the first separator, or the whole string if there is none
  t.next = (strview_t){.data = input, .size = strcspn(input, separators)};

  return t;
}
/// is this tokenizer exhausted?
///
/// \param t tokenizer to query
/// \return true if there are no further tokens to yield
static inline bool tok_end(const tok_t *t) {
  assert(t != NULL);
  // a NULL token pointer is the sentinel `tok_next` stores on exhaustion
  const bool exhausted = t->next.data == NULL;
  return exhausted;
}
/// get the current token
///
/// \param t tokenizer to read; must not be exhausted (see `tok_end`)
/// \return a view of the token the tokenizer is currently positioned at; the
///   view aliases the original input string and is not NUL-terminated
static inline strview_t tok_get(const tok_t *t) {
assert(t != NULL);
assert(t->next.data != NULL && "extracting from an exhausted tokenizer");
return t->next;
}
/// advance to the next token in the string being scanned
///
/// \param t tokenizer to advance; must not already be exhausted
static inline void tok_next(tok_t *t) {
  assert(t != NULL);
  assert(t->start != NULL);
  assert(t->separators != NULL);
  assert(t->next.data != NULL && "advancing an exhausted tokenizer");

  // resume from where the previous token ended
  const char *start = t->next.data + t->next.size;

  // If we are at the end of the string, we are done. `start` always points
  // within the input string, so landing on the NUL terminator is equivalent
  // to the previous `start == t->start + strlen(t->start)` check, but avoids
  // an O(length) rescan per advance that made full tokenization quadratic.
  if (*start == '\0') {
    t->next = (strview_t){0};
    return;
  }

  // skip the separator characters themselves; note that `strspn` treats a run
  // of adjacent separators as a single delimiter
  start += strspn(start, t->separators);

  // find the end of the next token
  size_t size = strcspn(start, t->separators);
  t->next = (strview_t){.data = start, .size = size};
}