/// \file
/// \brief String tokenization
/// \ingroup cgraph_utils
///
/// This is essentially equivalent to `strtok` but with two main improvements:
///
/// 1. The input string is not modified. This means, if you have a `const`
/// string, you do not need to `strdup` it in order to tokenize it. This
/// (combined with other properties like no opaque struct pointers) enables
/// you to tokenize a string with no heap allocation.
///
/// 2. No global state. All the state for tokenization is contained in the
/// `tok_t` struct.
///
/// The above two properties are intended to make string tokenization scalable
/// (no locks, no thread-shared state) and transparent to the compiler (a good
/// optimizing compiler implements all the string.h functions we use as
/// built-ins and, if `separators` is a compile-time literal, can typically
/// flatten everything into a tight loop with no function calls).
///
/// Sample usage:
///
/// const char my_input[] = "foo; bar:/baz";
/// for (tok_t t = tok(my_input, ";:/"); !tok_end(&t); tok_next(&t)) {
/// strview_t s = tok_get(&t);
/// printf("%.*s\n", (int)s.size, s.data);
/// }
/// // prints “foo”, “ bar”, “baz”
#include <assert.h>
#include <stdbool.h> // `bool` return type of tok_end; do not rely on strview.h to include it
#include <stddef.h>
#include <string.h>
#include <util/strview.h>
/// state for an in-progress string tokenization
///
/// Callers should treat this as opaque and use only `tok`, `tok_end`,
/// `tok_get`, and `tok_next`; an exhausted tokenizer is represented by
/// `next.data == NULL`.
typedef struct {
const char *start; ///< start of the string being scanned
const char *separators; ///< characters to treat as token separators
strview_t next; ///< next token to yield; `.data == NULL` once exhausted
} tok_t;
/// begin tokenization of a new string
///
/// \param input NUL-terminated string to scan; it is not copied, so it must
///   outlive all use of the returned tokenizer
/// \param separators non-empty, duplicate-free set of delimiter characters
/// \return a tokenizer positioned at the first token of `input`
static inline tok_t tok(const char *input, const char *separators) {
  assert(input != NULL);
  assert(separators != NULL);
  assert(strcmp(separators, "") != 0 &&
         "at least one separator must be provided");

#ifndef NDEBUG
  // each separator character should appear at most once in the set
  for (const char *a = separators; *a != '\0'; ++a) {
    for (const char *b = a + 1; *b != '\0'; ++b) {
      assert(*a != *b && "duplicate separator characters");
    }
  }
#endif

  tok_t t = {.start = input, .separators = separators};

  // the first token spans from the start of the string up to (but not
  // including) the first separator, or the whole string if there is none
  t.next = (strview_t){.data = input, .size = strcspn(input, separators)};

  return t;
}
/// is this tokenizer exhausted?
///
/// \param t tokenizer to query
/// \return true if there are no further tokens to yield
static inline bool tok_end(const tok_t *t) {
  assert(t != NULL);
  // a NULL token pointer is the sentinel `tok_next` stores on exhaustion
  const bool exhausted = t->next.data == NULL;
  return exhausted;
}
/// get the current token
///
/// \param t tokenizer to read; must not be exhausted (see `tok_end`)
/// \return a view of the token the tokenizer is currently positioned at; the
///   view aliases the original input string and is not NUL-terminated
static inline strview_t tok_get(const tok_t *t) {
assert(t != NULL);
assert(t->next.data != NULL && "extracting from an exhausted tokenizer");
return t->next;
}
/// advance to the next token in the string being scanned
///
/// \param t tokenizer to advance; must not already be exhausted
static inline void tok_next(tok_t *t) {
  assert(t != NULL);
  assert(t->start != NULL);
  assert(t->separators != NULL);
  assert(t->next.data != NULL && "advancing an exhausted tokenizer");

  // resume from where the previous token ended
  const char *start = t->next.data + t->next.size;

  // If we are at the end of the string, we are done. `start` always points
  // within the input string, so landing on the NUL terminator is equivalent
  // to the previous `start == t->start + strlen(t->start)` check, but avoids
  // an O(length) rescan per advance that made full tokenization quadratic.
  if (*start == '\0') {
    t->next = (strview_t){0};
    return;
  }

  // skip the separator characters themselves; note that `strspn` treats a run
  // of adjacent separators as a single delimiter
  start += strspn(start, t->separators);

  // find the end of the next token
  size_t size = strcspn(start, t->separators);
  t->next = (strview_t){.data = start, .size = size};
}