1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339
|
/** \file tokenize.c
*
* Tokenize a string, accommodating quoted strings.
*
* @addtogroup autoopts
* @{
*/
/*
* This file defines the string_tokenize interface
* This file is part of AutoOpts, a companion to AutoGen.
* AutoOpts is free software.
* AutoOpts is Copyright (C) 1992-2015 by Bruce Korb - all rights reserved
*
* AutoOpts is available under any one of two licenses. The license
* in use must be one of these two and the choice is under the control
* of the user of the license.
*
* The GNU Lesser General Public License, version 3 or later
* See the files "COPYING.lgplv3" and "COPYING.gplv3"
*
* The Modified Berkeley Software Distribution License
* See the file "COPYING.mbsd"
*
* These files have the following sha256 sums:
*
* 8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95 COPYING.gplv3
* 4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b COPYING.lgplv3
* 13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239 COPYING.mbsd
*/
#include <errno.h>
#include <stdlib.h>
#define cc_t const unsigned char
#define ch_t unsigned char
/*
 * Forward declarations of the file-local helper functions defined below.
 */
/* = = = START-STATIC-FORWARD = = = */
static void
copy_cooked(ch_t ** ppDest, char const ** ppSrc);
static void
copy_raw(ch_t ** ppDest, char const ** ppSrc);
static token_list_t *
alloc_token_list(char const * str);
/* = = = END-STATIC-FORWARD = = = */
/**
 * Copy the contents of a double quoted string into the output buffer.
 *
 * On entry, *ppSrc addresses the opening double quote; copying starts
 * with the character after it.  Backslash sequences are translated by
 * ao_string_cook_escape_char().
 *
 * @param[in,out] ppDest  where to store the next output character;
 *                        advanced past the last character stored, but
 *                        only when the closing quote is found.
 * @param[in,out] ppSrc   the input scan pointer.  It is set to the
 *                        character following the closing quote, or to
 *                        NULL when the string ends before a closing
 *                        quote is seen.
 */
static void
copy_cooked(ch_t ** ppDest, char const ** ppSrc)
{
    ch_t *       out = (ch_t *)*ppDest;
    ch_t const * in  = (ch_t const *)(*ppSrc + 1);
    ch_t         ch;

    while ((ch = *(in++)) != '"') {
        if (ch == NUL) {
            /* unterminated quoted string */
            *ppSrc = NULL;
            return;
        }

        if (ch == '\\') {
            /*
             * The helper stores the translated character in "ch" and its
             * return value is the distance to advance the scan pointer.
             * A result of 0x7F means "store nothing" -- presumably that
             * is what the helper substitutes for an escaped newline;
             * confirm against ao_string_cook_escape_char().
             */
            in += ao_string_cook_escape_char((char const *)in,
                                             (char *)&ch, 0x7F);
            if (ch == 0x7F)
                continue;
        }

        *(out++) = ch;
    }

    *ppDest = (ch_t *)out;       /* next spot for storing character */
    *ppSrc  = (char const *)in;  /* char following closing quote    */
}
/**
 * Copy the contents of a single quoted string into the output buffer.
 *
 * On entry, *ppSrc addresses the opening apostrophe; copying starts with
 * the character after it.  A backslash is special only in front of:
 *
 *   - a newline (NL, CR or CR-NL): backslash and newline are both dropped
 *   - another backslash:           a single backslash is stored
 *   - an apostrophe:               the apostrophe is stored
 *
 * In front of anything else the backslash is stored as an ordinary
 * character and the following character is handled on the next pass.
 *
 * @param[in,out] ppDest  where to store the next output character;
 *                        advanced past the last character stored, but
 *                        only when the closing apostrophe is found.
 * @param[in,out] ppSrc   the input scan pointer.  It is set to the
 *                        character following the closing apostrophe, or
 *                        to NULL when the string ends first.
 */
static void
copy_raw(ch_t ** ppDest, char const ** ppSrc)
{
    ch_t * out = *ppDest;
    cc_t * in  = (cc_t *)(*ppSrc + 1);

    while (*in != '\'') {
        ch_t ch = *(in++);

        if (ch == NUL) {
            /* unterminated quoted string */
            *ppSrc = NULL;
            return;
        }

        if (ch == '\\') {
            ch_t nxt = *in;

            if (nxt == NUL) {
                /* the input ends right after the backslash */
                *ppSrc = NULL;
                return;
            }

            if (nxt == '\r') {
                /* drop backslash + CR, and a NL directly after the CR */
                in++;
                if (*in == NL)
                    in++;
                continue;
            }

            if (nxt == NL) {
                /* drop backslash + NL */
                in++;
                continue;
            }

            if ((nxt == '\'') || (nxt == '\\')) {
                /* store the escaped character instead of the backslash */
                ch = nxt;
                in++;
            }
        }

        *(out++) = ch;
    }

    *ppDest = out;                      /* next spot for storing character */
    *ppSrc  = (char const *)(in + 1);   /* char following closing quote    */
}
/**
 * Allocate the single memory block that holds the token pointer array
 * and the token text.
 *
 * The number of tokens is estimated by counting the white space separated
 * words in the input once its leading white space is skipped.  Without
 * quoted strings the estimate is exact; with them it may be too high,
 * which only wastes a few pointers.
 *
 * @param[in] str  the string that is going to be tokenized
 *
 * @returns the new token list with tkn_list[0] addressing the area where
 *          the token text is to be stored, or NULL with errno set to
 *          ENOENT (NULL, empty or all white space input) or ENOMEM.
 */
static token_list_t *
alloc_token_list(char const * str)
{
    token_list_t * res;
    char const *   scan;
    size_t         text_len;

    /* allow for trailing NULL pointer & NUL on string */
    int slot_ct = 2;

    if (str != NULL)
        str = SPN_WHITESPACE_CHARS(str);

    if ((str == NULL) || (*str == NUL)) {
        errno = ENOENT;
        return NULL;
    }

    /*
     * Step from word to word.  "str" addresses a non-white space
     * character here, so each pass skips one word and the white space
     * that follows it.
     */
    scan = str;
    do {
        slot_ct++;
        scan = BRK_WHITESPACE_CHARS(scan + 1);
        scan = SPN_WHITESPACE_CHARS(scan);
    } while (*scan != NUL);

    text_len = (size_t)(scan - str);
    res = malloc(sizeof(*res) + text_len
                 + ((size_t)slot_ct * sizeof(ch_t *)));
    if (res == NULL) {
        errno = ENOMEM;
        return NULL;
    }

    /* the token text is stored right behind the pointer slots */
    res->tkn_list[0] = (ch_t *)(res->tkn_list + (slot_ct - 1));
    return res;
}
/*=export_func ao_string_tokenize
*
* what: tokenize an input string
*
* arg: + char const * + string + string to be tokenized +
*
* ret_type: token_list_t *
* ret_desc: pointer to a structure that lists each token
*
* doc:
*
* This function will convert one input string into a list of strings.
* The list of strings is derived by separating the input based on
* white space separation. However, if the input contains either single
* or double quote characters, then the text after that character up to
* a matching quote will become the string in the list.
*
* The returned pointer should be deallocated with @code{free(3C)} when
* you are done using the data. The data are placed in a single block of
* allocated memory. Do not deallocate individual token/strings.
*
* The structure pointed to will contain at least these two fields:
* @table @samp
* @item tkn_ct
* The number of tokens found in the input string.
* @item tkn_list
* An array of @code{tkn_ct + 1} pointers to substring tokens, with
* the last pointer set to NULL.
* @end table
*
* There are two types of quoted strings: single quoted (@code{'}) and
* double quoted (@code{"}). Singly quoted strings are fairly raw in that
* escape characters (@code{\\}) are simply another character, except when
* preceding the following characters:
* @example
* @code{\\} double backslashes reduce to one
* @code{'} incorporates the single quote into the string
* @code{\n} suppresses both the backslash and newline character
* @end example
*
* Double quote strings are formed according to the rules of string
* constants in ANSI-C programs.
*
* example:
* @example
* #include <stdlib.h>
* int ix;
* token_list_t * ptl = ao_string_tokenize(some_string);
* for (ix = 0; ix < ptl->tkn_ct; ix++)
* do_something_with_tkn(ptl->tkn_list[ix]);
* free(ptl);
* @end example
* Note that everything is freed with the one call to @code{free(3C)}.
*
* err:
* NULL is returned and @code{errno} will be set to indicate the problem:
* @itemize @bullet
* @item
* @code{EINVAL} - There was an unterminated quoted string.
* @item
* @code{ENOENT} - The input string was empty.
* @item
* @code{ENOMEM} - There is not enough memory.
* @end itemize
=*/
/*
 * Split "str" into white space separated tokens, honoring single and
 * double quoted strings.
 *
 * Returns a single malloc()ed block holding the token count, a NULL
 * terminated array of token pointers and the token text.  On failure
 * NULL is returned with errno set: ENOENT or ENOMEM as set by
 * alloc_token_list(), or EINVAL for an unterminated quoted string.
 */
token_list_t *
ao_string_tokenize(char const * str)
{
    token_list_t * res = alloc_token_list(str);
    ch_t * pzDest;

    /*
     * On failure, alloc_token_list() has already set errno.
     */
    if (res == NULL)
        return res;

    /*
     * alloc_token_list() sized the pointer array from the input with its
     * leading white space skipped, so skip it here as well.  Otherwise,
     * an empty first token would be emitted that was never counted and
     * the closing NULL pointer could be stored on top of the token text.
     * A non-NULL "res" guarantees "str" is not NULL and that a non-white
     * space character follows.
     */
    str = SPN_WHITESPACE_CHARS(str);

    /*
     * Now copy each token into the output buffer.
     */
    pzDest = (ch_t *)(res->tkn_list[0]);
    res->tkn_ct = 0;

    do {
        res->tkn_list[ res->tkn_ct++ ] = pzDest;

        for (;;) {
            int ch = (ch_t)*str;

            if (IS_WHITESPACE_CHAR(ch)) {
            found_white_space:
                /* the token is complete; step to the start of the next */
                str = SPN_WHITESPACE_CHARS(str+1);
                break;
            }

            switch (ch) {
            case '"':
            case '\'':
                /*
                 * Quoted text is appended to the current token.  The
                 * copy routines leave "str" after the closing quote or
                 * set it to NULL when there is no closing quote.
                 */
                if (ch == '"')
                    copy_cooked(&pzDest, &str);
                else
                    copy_raw(&pzDest, &str);

                if (str == NULL) {
                    free(res);
                    errno = EINVAL;
                    return NULL;
                }
                if (IS_WHITESPACE_CHAR(*str))
                    goto found_white_space;
                break;

            case NUL:
                goto copy_done;

            default:
                str++;
                *(pzDest++) = (unsigned char)ch;
            }
        } copy_done:;

        /*
         * NUL terminate the last token and see if we have any more tokens.
         */
        *(pzDest++) = NUL;
    } while (*str != NUL);

    res->tkn_list[ res->tkn_ct ] = NULL;
    return res;
}
#ifdef TEST
#include <stdio.h>
#include <string.h>

/*
 * Test driver: tokenize every command line argument and print either the
 * resulting tokens or the errno value describing the failure.
 * Returns 1 after printing a usage message when exactly the program name
 * was passed, otherwise 0.
 */
int
main(int argc, char ** argv)
{
    int arg_ix;

    if (argc == 1) {
        printf("USAGE: %s arg [ ... ]\n", *argv);
        return 1;
    }

    for (arg_ix = 1; arg_ix < argc; arg_ix++) {
        char *         text = argv[arg_ix];
        token_list_t * tkns = ao_string_tokenize(text);
        int            tkn_ix;

        if (tkns == NULL) {
            printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
                   text, errno, strerror(errno));
            continue;
        }

        printf("Parsed string ``%s''\ninto %d tokens:\n", text, tkns->tkn_ct);

        /* the first token is printed before the count is consulted */
        tkn_ix = 0;
        do {
            printf(" %3d: ``%s''\n", tkn_ix + 1, tkns->tkn_list[tkn_ix]);
            tkn_ix++;
        } while (tkn_ix < tkns->tkn_ct);

        /* one free() releases the pointers and the token text */
        free(tkns);
    }

    return 0;
}
#endif
/** @}
*
* Local Variables:
* mode: C
* c-file-style: "stroustrup"
* indent-tabs-mode: nil
* End:
* end of autoopts/tokenize.c */
|