File: hxindex.c

package info (click to toggle)
html-xml-utils 6.1-1
links: PTS, VCS
area: main
in suites: wheezy
size: 1,620 kB
sloc: ansic: 10,027; sh: 2,135; lex: 189; yacc: 125; perl: 123; makefile: 122
file content (839 lines) | stat: -rw-r--r-- 24,489 bytes
/*
 * Insert an index between "<!--begin-index-->" and "<!--end-index-->",
 * or replacing the comment "<!--index-->"
 *
 * The index links to elements with ID attributes as well as with
 * empty <A NAME> elements.
 *
 * Any <A> tags with a class of "bctarget" are not copied, but
 * regenerated. They are assumed to be backwards-compatible versions
 * of ID attributes on their parent elements. But if the option -t or
 * -x are given, those <A> elements are removed.
 *
 * There's a limit of 100000 index terms (10^(MAXIDLEN-1)).
 *
 * Index terms are elements with a class of "index", "index-inst" or
 * "index-def", as well as all <dfn> elements. The contents of the
 * element is the index term, unless the element has a title
 * attribute. The title attribute can contain "|" and "!!":
 *
 * "term"
 * "term1|term2|term3|..."
 * "term!!subterm!!subsubterm!!..."
 * "term1!!subterm1|term2!!subterm2|..."
 * etc.
 *
 * For backward compatibility with an earlier Perl program, "::" is
 * accepted as an alternative for "!!", but it is better not to use
 * both separators in the same project, since the sorting maybe
 * adversely affected.
 *
 * Class "index-def" results in a bold entry in the index, "index" in
 * a normal one. "index-inst" is an alias for "index", provided for
 * backward compatibility.
 *
 * Copyright © 1994-2005 World Wide Web Consortium
 * See http://www.w3.org/Consortium/Legal/copyright-software
 *
 * Author: Bert Bos <bert@w3.org>
 * Created: 11 Apr 2000
 * Version: $Id: hxindex.c,v 1.15 2012/03/08 22:22:20 bbos Exp $
 *
 **/
#include <config.h>
#include <assert.h>
#include <locale.h>
#include <wchar.h>
#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
#include <iconv.h>
#include <unistd.h>
#if STDC_HEADERS
# include <string.h>
#else
# ifndef HAVE_STRCHR
#  define strchr index
#  define strrchr rindex
# endif
# ifndef HAVE_STRSTR
#  include "strstr.e"
# endif
#endif
#ifdef HAVE_ERRNO_H
#  include <errno.h>
#else
extern int errno;
char *strerror(int errnum);
int strerror_r(int errnum, char *buf, size_t n);
#endif

#ifdef HAVE_ERRNO_H
#  include <errno.h>
#endif

#ifdef HAVE_SEARCH_H
#  include <search.h>
#else
#  include "search-freebsd.h"
#endif
#include "export.h"
#include "types.e"
#include "heap.e"
#include "tree.e"
#include "html.e"
#include "scan.e"
#include "dict.e"
#include "openurl.e"
#include "genid.e"
#include "errexit.e"
#include "class.e"

#undef USE_DATA_ATTRIBUTE	/* Data attributes are a proposal in HTML5 */

#define BEGIN_INDEX "begin-index" /* <!--begin-index--> */
#define END_INDEX "end-index"	/* <!--end-index--> */
#define INDEX "index"		/* <!--index--> */
#define INDEX_INST "index-inst"	/* class="index-inst" */
#define INDEX_DEF "index-def"	/* class="index-def" */
#define TARGET "bctarget"	/* CLASS="...bctarget..." */

#define MAXSTR 2048		/* Max. length of URL + term */
#define MAXSUBS 20		/* Max. depth of subterms */
#define SECNO "secno"		/* Class of elements that define section # */
#define NO_NUM "no-num"		/* Class of elements without a section # */

typedef struct _indexterm {
  string url;
  int importance;		/* 1 (low) or 2 (high) */
  string secno;
  string *terms;		/* Array of subterms */
  wchar_t **sortkeys;		/* Array of normalized subterms */
  int nrkeys;			/* Length of term and sortkeys arrays */
} *Indexterm;

static Tree tree;
static Boolean xml = False;	/* Use <empty /> convention */
static string base = NULL;	/* (Rel.) URL of output file */
static string indexdb = NULL;	/* Persistent store of terms */
static string* userclassnames = NULL;	/* Persistent store of class names */
static FILE *globalfile;	/* Must be global for twalk */
static Indexterm globalprevious; /* Must be global for twalk */
static string globalurlprevious;/* Must be global for twalk */
static Boolean bctarget = True;	/* Add <A name=> after IDs */
static Boolean use_secno = False; /* Anchor text is "#" instead of section # */
static Boolean final = False;	/* Leave used attributes in document */
static Boolean trim_punct = True; /* Remove trailing punctuation from terms */


/* handle_error -- called when a parse error occurred */
static void handle_error(void *clientdata, const string s, int lineno)
{
  (void) fprintf(stderr, "%d: %s\n", lineno, s);
}

/* start -- called before the first event is reported */
static void* start(void)
{
  tree = create();
  return NULL;
}
  
/* end -- called after the last event is reported */
static void end(void *clientdata)
{
  /* skip */
}

/* handle_comment -- called after a comment is parsed */
static void handle_comment(void *clientdata, string commenttext)
{
  tree = append_comment(tree, commenttext);
}

/* handle_text -- called after a tex chunk is parsed */
static void handle_text(void *clientdata, string text)
{
  tree = append_text(tree, text);
}

/* handle_declaration -- called after a declaration is parsed */
static void handle_decl(void *clientdata, string gi,
			string fpi, string url)
{
  tree = append_declaration(tree, gi, fpi, url);
}

/* handle_proc_instr -- called after a PI is parsed */
static void handle_pi(void *clientdata, string pi_text)
{
  tree = append_procins(tree, pi_text);
}

/* handle_starttag -- called after a start tag is parsed */
static void handle_starttag(void *clientdata, string name, pairlist attribs)
{
  conststring id;

  tree = html_push(tree, name, attribs);

  /* If it has an ID, store it (so we don't accidentally generate it) */
  if ((id = pairlist_get(attribs, "id"))) storeID(id);
}

/* handle_emptytag -- called after an empty tag is parsed */
static void handle_emptytag(void *clientdata, string name, pairlist attribs)
{
  conststring id;

  tree = html_push(tree, name, attribs);

  /* If it has an ID, store it (so we don't accidentally generate it) */
  if ((id = pairlist_get(attribs, "id"))) storeID(id);
}

/* handle_endtag -- called after an endtag is parsed (name may be "") */
static void handle_endtag(void *clientdata, string name)
{
  tree = html_pop(tree, name);
}

/* trim -- remove leading and trailing white space, collapse white space */
static void trim(string s)
{
  string t;
  int i, j;

  t = newstring(s);
  for (i = 0; isspace(t[i]); i++); /* Skip leading white space */
  for (j = 0; t[i]; i++)
    if (!isspace(t[i])) s[j++] = t[i];
    else if (!isspace(t[i-1])) s[j++] = ' ';
  if (j == 0) s[j] = '\0';
  else if (isspace(s[j-1])) s[j-1] = '\0';
  else s[j] = '\0';
  dispose(t);
}

/* parse_subterms -- parse s to create terms & sortkeys array in an Indexterm */
static void parse_subterms(const Indexterm term, const conststring s)
{
  enum {TEXT, TAG, DQUOTE, SQUOTE} state;
  string h, k, p, q;
  iconv_t cd;
  size_t len, len2;
  int i;

  /* Create the terms array and count the number of subterms */
  h = newstring(s);
  trim(h);
  term->nrkeys = 1;
  newarray(term->terms, 1);
  term->terms[0] = h;
  while ((k = strstr(h, "!!")) || (k = strstr(h, "::"))) {
    h = k + 2;
    *k = '\0';
    renewarray(term->terms, term->nrkeys + 1);
    trim(h);				/* Remove leading & trailing space */
    term->terms[term->nrkeys] = h;	/* All terms point into h */
    term->nrkeys++;
  }

  /* Create the sortkeys array by normalizing each term */
  newarray(term->sortkeys, term->nrkeys);
  for (i = 0; i < term->nrkeys; i++) {

    /* First remove mark-up and expand the standard XML entities */
    h = newstring(term->terms[i]);
    for (state = TEXT, p = q = h; *p; p++) {
      switch (state) {
      case TEXT:
	if (*p == '<') state = TAG;
	else if (hasprefix(p, "&lt;"))   {*(q++) = '<'; p += 3;}
	else if (hasprefix(p, "&#60;"))  {*(q++) = '<'; p += 4;}
	else if (hasprefix(p, "&#x3c;")) {*(q++) = '<'; p += 5;}
	else if (hasprefix(p, "&#x3C;")) {*(q++) = '<'; p += 5;}
	else if (hasprefix(p, "&gt;"))   {*(q++) = '>'; p += 3;}
	else if (hasprefix(p, "&#62;"))  {*(q++) = '>'; p += 4;}
	else if (hasprefix(p, "&#x3e;")) {*(q++) = '>'; p += 5;}
	else if (hasprefix(p, "&#x3E;")) {*(q++) = '>'; p += 5;}
	else if (hasprefix(p, "&quot;")) {*(q++) = '"'; p += 5;}
	else if (hasprefix(p, "&#34;"))  {*(q++) = '"'; p += 4;}
	else if (hasprefix(p, "&#x22;")) {*(q++) = '"'; p += 5;}
	else if (hasprefix(p, "&amp;"))  {*(q++) = '&'; p += 4;}
	else if (hasprefix(p, "&#38;"))  {*(q++) = '&'; p += 4;}
	else if (hasprefix(p, "&#x26;")) {*(q++) = '&'; p += 5;}
	else *(q++) = tolower(*p);
	break;
      case TAG:
	if (*p == '>') state = TEXT;
	else if (*p == '"') state = DQUOTE;
	else if (*p == '\'') state = SQUOTE;
	break;
      case DQUOTE:
	if (*p == '"') state = TAG;
	break;
      case SQUOTE:
	if (*p == '\'') state = TAG;
	break;
      default:
	assert(!"Cannot happen!");
      }
    }
    *q = '\0';

    if (trim_punct) {
      /* Remove some trailing white space and punctuation */
      for (q--; q != h && strspn(q, " \r\n\t\f\v,:;!?"); q--) *q = '\0';

      /* Remove final '.' only if it is the only '.' in the term */
      if ((q = strrchr(h, '.')) && !*(q+1) && q == strchr(h, '.')) *q = '\0';
    }

    /* Then convert from UTF-8 to wchar_t */
    cd = iconv_open("wchar_t", "UTF-8");
    if (cd == (iconv_t)(-1)) {perror("hxindex"); exit(1);}
    len = strlen(h) + 1;
    newarray(term->sortkeys[i], len); /* Large enough */
    p = (string) term->sortkeys[i];
    len2 = len * sizeof(term->sortkeys[i][0]);
    if (iconv(cd, &h, &len, &p, &len2) == (size_t)(-1)) {
      perror("hxindex"); exit(1);
    }
    /* *p = L'\0'; */
    if (iconv_close(cd) == -1) {perror("hxindex"); exit(1);}
  }
}

/* folding_cmp -- compare two arrays of sort keys */
static int folding_cmp(wchar_t **a, const int alen, wchar_t **b, const int blen)
{
  int i, j;

  assert(a && alen >= 0);
  assert(b && blen >= 0);
  for (i = 0;; i++) {
    if (i == alen) return i == blen ? 0 : -1;
    if (i == blen) return 1;
    assert(a[i] && b[i]);
    if ((j = wcscoll(a[i], b[i])) != 0) return j;
  }
  assert(! "Cannot happen!");
}

/* indent -- print newline and n times 2 spaces */
static void indent(int n)
{
  putchar('\n');
  for (; n > 0; n--) printf("  ");
}

/* print_full_term -- print the full term for use in a TITLE attribute */
static void print_full_term(const Indexterm term)
{
  enum {TEXT, TAG, DQUOTE, SQUOTE} state;
  string h;
  int i;

  for (state = TEXT, i = 0; i < term->nrkeys; i++) {
    if (i != 0) fputs(", ", stdout);
    for (h = term->terms[i]; *h; h++)
      switch (state) {
      case TEXT:
	if (*h == '<') state = TAG;
	else if (*h == '"') fputs("&quot;", stdout);
	else putchar(*h);
	break;
      case TAG:
	if (*h == '>') state = TEXT;
	else if (*h == '"') state = DQUOTE;
	else if (*h == '\'') state = SQUOTE;
	break;
      case DQUOTE:
	if (*h == '"') state = TAG;
	break;
      case SQUOTE:
	if (*h == '\'') state = TAG;
	break;
      default:
	assert(!"Cannot happen!");
      }
  }
}

/* write_index_item -- write one item in the list of index terms */
static void write_index_item(const void *term1, const VISIT which,
			     const int depth)
{
  Indexterm term = *(Indexterm*)term1;
  int i, j;

  if (which != postorder && which != leaf) return;

  /* Count how many subterms are equal to the previous entry */
  i = 0;
  while (i < min(term->nrkeys, globalprevious->nrkeys) &&
	 !folding_cmp(term->sortkeys + i, 1, globalprevious->sortkeys + i, 1))
    i++;

  /* Close lists as needed */
  for (j = globalprevious->nrkeys - 1; j > i; j--) {
    indent(j);
    printf("</ul>");
  }

  /* Open a list if needed */
  if (term->nrkeys > globalprevious->nrkeys && globalprevious->nrkeys == i) {
    indent(i);
    printf("<ul>");
  }

  /* Print new subterms, if any */
  for (j = i; j < term->nrkeys; j++) {
    indent(j);
    printf("<li>%s", term->terms[j]);
    if (j != term->nrkeys - 1) {
      indent(j + 1);
      printf("<ul>");
    }
  }

  int listmode = 0;

  if (folding_cmp(globalprevious->sortkeys, globalprevious->nrkeys,
		  term->sortkeys, term->nrkeys) == 0)
    if (globalurlprevious) {
      string d = strchr(globalurlprevious,'#');
      if (d)
        listmode = strncmp(globalurlprevious, term->url, d - globalurlprevious);
      else
        listmode = strcmp(globalurlprevious, term->url);
    }

  /* Print a link */
  switch (term->importance) {
    case 1:
      printf("%s <a href=\"%s\" title=\"", listmode ? ";" : ",", term->url);
      print_full_term(term);
      printf("\">%s</a>", use_secno ? term->secno : "#");
      break;
    case 2:
      printf("%s <a href=\"%s\" title=\"", listmode ? ";" : ",", term->url);
      print_full_term(term);
      printf("\"><strong>%s</strong></a>", use_secno ? term->secno : "#");
      break;
    default:
      assert(! "Cannot happen\n");
  }

  /* Remember this term */
  globalprevious = term;
  globalurlprevious = term->url;

}

/* mkindex -- write out an index */
static void mkindex(Indexterm terms)
{
  int i;

  printf("<ul class=\"indexlist\">");

  /* Initialize globalprevious to a term with an unlikely sortkey */
  new(globalprevious);
  globalprevious->nrkeys = 1;
  newarray(globalprevious->sortkeys, globalprevious->nrkeys);
  newarray(globalprevious->sortkeys[0], 15);
  wcscpy(globalprevious->sortkeys[0], L"zzzzzzzzzzzzzz");

  twalk(terms, write_index_item);

  /* Close all open lists */
  for (i = 0; i < globalprevious->nrkeys; i++) printf("\n</ul>");
}

/* expand -- write the tree, add <A NAME> if needed and replace <!--index--> */
static void expand(Tree t, Boolean *write, Indexterm terms)
{
  conststring val;
  Tree h;
  pairlist a;
  string s;
  Boolean do_tag;

  for (h = t->children; h != NULL; h = h->sister) {
    switch (h->tp) {
      case Text:
	if (*write) printf("%s", h->text);
	break;
      case Comment:
	s = newstring(h->text);
	trim(s);
	if (eq(s, INDEX) || eq(s, BEGIN_INDEX)) {
	  if (!final) printf("<!--%s-->\n", BEGIN_INDEX);
	  mkindex(terms);
	  if (!final) printf("<!--%s-->", END_INDEX);
	  if (eq(s, BEGIN_INDEX)) *write = False;	/* Skip old index */
	} else if (eq(s, END_INDEX)) {
	  *write = True;
	} else {
	  printf("<!--%s-->", h->text);
	}
	dispose(s);
	break;
      case Declaration:
	printf("<!DOCTYPE %s", h->name);
	if (h->text) printf(" PUBLIC \"%s\"", h->text);
	if (h->url) printf(" %s\"%s\"", h->text ? "" : "SYSTEM ", h->url);
	printf(">");
	break;
      case Procins:
	if (*write) printf("<?%s>", h->text);
	break;
      case Element:
	if (*write) {
	  /* If an <a> was inserted by index itself, remove it */
	  do_tag = !eq(h->name, "a") || !has_class(h->attribs, TARGET);
	  if (do_tag) {
	    printf("<%s", h->name);
	    for (a = h->attribs; a != NULL; a = a->next) {
	      printf(" %s", a->name);
	      if (a->value != NULL) printf("=\"%s\"", a->value);
	    }
	    assert(! is_empty(h->name) || h->children == NULL);
	    printf(xml && is_empty(h->name) ? " />" : ">");
	    /* Insert an <A NAME> if element has an ID and is not <A> */
	    if (bctarget && is_mixed(h->name) && (val = get_attrib(h, "id"))
		&& !eq(h->name, "a") && ! xml)
	      printf("<a class=\"%s\" name=\"%s\"></a>", TARGET, val);
	  }
	  expand(h, write, terms);
	  if (do_tag && ! is_empty(h->name)) printf("</%s>", h->name);
	}
	break;
      case Root:
	assert(! "Cannot happen");
	break;
      default:
	assert(! "Cannot happen");
    }
  }
}

/* termcmp -- comparison routine for Indexterms */
static int termcmp(const void *a1, const void *b1)
{
  Indexterm a = (Indexterm)a1, b = (Indexterm)b1;
  int r;

  assert(a);
  assert(b);
  assert(a->sortkeys);
  assert(b->sortkeys);
  assert(a->nrkeys > 0);
  assert(b->nrkeys > 0);

  r = folding_cmp(a->sortkeys, a->nrkeys, b->sortkeys, b->nrkeys);
  if (r != 0) return r;
  return strcmp(a->url, b->url); /* Terms are equal, compare URL instead */
}

/* copy_contents -- recursively expand contents of element t into a string */
static void copy_contents(Tree t, string *s)
{
  Tree h;
  int i;
  pairlist a;
  string p;

  for (h = t->children; h != NULL; h = h->sister) {
    switch (h->tp) {
      case Text:
	i = *s ? strlen(*s) : 0;
	renewarray(*s, i + strlen(h->text) + 1);
	/* Copy, but transform all whitespace to spaces */
	for (p = h->text; *p; p++, i++) (*s)[i] = isspace(*p) ? ' ' : *p;
	(*s)[i] = '\0';
	break;
      case Comment: break;
      case Declaration: break;
      case Procins: break;
      case Element:
	/* Only certain tags are retained */
	if (eq(h->name, "span") || eq(h->name, "code") || eq(h->name, "tt")
	    || eq(h->name, "acronym") || eq(h->name, "abbr")
	    || eq(h->name, "bdo") || eq(h->name, "kbd") || eq(h->name, "samp")
	    || eq(h->name, "sub") || eq(h->name, "sup")
	    || eq(h->name, "var")) {
	  strapp(s, "<", h->name, NULL);
	  for (a = h->attribs; a != NULL; a = a->next) {
	    if (! a->value) strapp(s, " ", a->name, NULL);
	    else strapp(s, " ", a->name, "=\"", a->value, "\"", NULL);
	  }
	  assert(! is_empty(h->name) || h->children == NULL);
	  if (is_empty(h->name)) {
	    strapp(s, xml ? " />" : ">", NULL);
	  } else {
	    strapp(s, ">", NULL);
	    copy_contents(h, s);
	    strapp(s, "</", h->name, ">", NULL);
	  }
	} else {				/* Ignore tag, copy contents */
	  copy_contents(h, s);
	}
	break;
      case Root: assert(! "Cannot happen"); break;
      default: assert(! "Cannot happen");
    }
  }
}

/* copy_to_index -- copy the contents of element h to the index db */
static void copy_to_index(Tree t, Indexterm *terms, int importance,
			  conststring secno)
{
  conststring id, title;
  string h;
  Indexterm term;
  int i, n;

  id = get_attrib(t, "id");
#ifdef USE_DATA_ATTRIBUTE
  if (! (title = get_attrib(t, "data-index")))
#endif
    title = get_attrib(t, "title");

  /* Get term either from title attribute or contents */
  if (title) {

    i = 0;
    while (title[i]) {
      n = strcspn(title + i, "|");		/* Find | or \0 */
      new(term);
      term->importance = importance;
      term->secno = newstring(secno);
      term->url = NULL;
      strapp(&term->url, base, "#", id, NULL);
      h = newnstring(title + i, n);
      parse_subterms(term, h);
      if (! tsearch(term, (void**)terms, termcmp))
	errexit("Out of memory while parsing term %s\n", h);
      i += n;
      if (title[i]) i++;			/* Skip '|' */
    }
    if (final)					/* Remove used attribute */
#ifdef USE_DATA_ATTRIBUTE
      if (!delete_attrib(t, "data-index"))
#endif
	delete_attrib(t, "title");

  } else {					/* Recursively copy contents */

    h = NULL;
    copy_contents(t, &h);
    if (h) {					/* Non-empty contents */
      new(term);
      term->importance = importance;
      term->secno = newstring(secno);
      term->url = NULL;
      strapp(&term->url, base, "#", id, NULL);
      parse_subterms(term, h);
      if (! tsearch(term, (void**)terms, termcmp))
	errexit("Out of memory while parsing term %s\n", h);
    }

  }
}

/* collect -- collect index terms, add IDs where needed */
static void collect(Tree t, Indexterm *terms, string *secno)
{
  int importance;
  Tree h;

  for (h = t->children; h != NULL; h = h->sister) {
    switch (h->tp) {
      case Text: case Comment: case Declaration: case Procins: break;
      case Element:
	if (has_class(h->attribs, SECNO)) {
	  dispose(*secno);
	  copy_contents(h, secno);
	  trim(*secno);
	} else if (has_class(h->attribs, NO_NUM)) {
	  dispose(*secno);
	  *secno = newstring("#");
	}
	if (eq(h->name, "dfn")) importance = 2;
	else if (has_class(h->attribs,INDEX)||has_class(h->attribs,INDEX_INST))
	  importance = 1;
	else if (userclassnames && has_class_in_list(h->attribs, userclassnames))
	  importance = 1;
	else if (has_class(h->attribs, INDEX_DEF)) importance = 2;
	else importance = 0;
	if (importance != 0) {
	  /* Give it an ID, if it doesn't have one */
	  if (! get_attrib(h, "id")) set_attrib(h, "id", gen_id(h));
	  copy_to_index(h, terms, importance, *secno);
	} else {
	  collect(h, terms, secno);
	}
	break;
      case Root: assert(! "Cannot happen"); break;
      default: assert(! "Cannot happen");
    }
  }
}

/* load_index -- read persistent term db from file */
static void load_index(const string indexdb, Indexterm *terms)
{
  FILE *f;
  int n, n3;
  char line[MAXSTR];
  Indexterm term;
  string h;

  if (! (f = fopen(indexdb, "r"))) return;	/* Assume file not found... */

  while (fgets(line, sizeof(line), f)) {
    chomp(line);
    n = strcspn(line, "\t");
    if (line[n] != '\t') errexit("Illegal syntax in %s\n", indexdb);
    new(term);
    h = newnstring(line, n);
    switch (line[n + 1]) {
      case '1': term->importance = 1; break;
      case '2': term->importance = 2; break;
      default: errexit("Error in %s (column 2 must be '1' or '2')\n", indexdb);
    }
    if (line[n+2] != '\t') errexit("Illegal syntax in %s\n", indexdb);
    n3 = n + 3 + strcspn(line + n + 3, "\t");
    if (line[n3] != '\t') errexit("Illegal syntax in %s\n", indexdb);
    term->url = newnstring(line + n + 3, n3 - n - 3);
    term->secno = newstring(line + n3 + 1);
    parse_subterms(term, h);
    if (! tsearch(term, (void**)terms, termcmp))
      errexit("Out of memory while loading %s\n", indexdb);
  }

  fclose(f);
}

/* save_a_term -- write one term to globalfile */
static void save_a_term(const void *term1, const VISIT which, const int dp)
{
  Indexterm term = *(Indexterm*)term1;
  int i;

  if (which == endorder || which == leaf) {
    for (i = 0; i < term->nrkeys; i++) {
      if (i > 0) fprintf(globalfile, "!!");
      fprintf(globalfile, "%s", term->terms[i]);
    }
    fprintf(globalfile, "\t%d\t%s\t%s\n", term->importance, term->url,
	    term->secno);
  }
}

/* save_index -- write terms to file */
static void save_index(const string indexdb, Indexterm terms)
{
  if (! (globalfile = fopen(indexdb, "w")))
    errexit("%s: %s\n", indexdb, strerror(errno));
  twalk(terms, save_a_term);
  fclose(globalfile);
}

/* usage -- print usage message and exit */
static void usage(string name)
{
  errexit("Version %s\nUsage: %s [-i indexdb] [-b base] [-x] [-t] [-n] [-c userclass] [html-file]\n",
	  VERSION, name);
}

/* tokenize -- split string s into tokens at each comma, return an array */
static string * tokenize(string s)
{
  string * t;
  int i, n;

  assert(s && s[0]);
  for (t = NULL, n = 0; *s; s += i + 1, n++) {
    i = strcspn(s, ",");
    renewarray(t, n + 1);
    t[n] = newnstring(s, i);
  }
  renewarray(t, n + 1);		/* Make final item NULL */
  t[n] = NULL;
  return t;
}

/* main */
int main(int argc, char *argv[])
{
  Boolean write = True;
  Indexterm termtree = NULL;	/* Sorted tree of terms */
  string secno;
  int c, status = 200;

  /* Bind the parser callback routines to our handlers */
  set_error_handler(handle_error);
  set_start_handler(start);
  set_end_handler(end);
  set_comment_handler(handle_comment);
  set_text_handler(handle_text);
  set_decl_handler(handle_decl);
  set_pi_handler(handle_pi);
  set_starttag_handler(handle_starttag);
  set_emptytag_handler(handle_emptytag);
  set_endtag_handler(handle_endtag);

  yyin = NULL;

  while ((c = getopt(argc, argv, "txbi:cnfr")) != -1)
  switch (c) {
  case 't': bctarget = False; break; /* Don't write <a name> after each ID */
  case 'x': xml = True; break;	/* Output as XML */
  case 'b': base = newstring(optarg); break; /* Set base of URL */
  case 'i': indexdb = newstring(optarg); break;	/* Set name of index db */
  case 'c': userclassnames = tokenize(optarg); break; /* Set class names */
  case 'n': use_secno = True; break; /* Print section numbers instead of "#" */
  case 'f': final = True; break; /* "Final": remove used attributes */
  case 'r': trim_punct = False; break; /* Do not remove trailing punctuation */
  default: usage(argv[0]);
  }
  if (optind == argc) yyin = stdin;
  else if (argc > optind + 1) usage(argv[0]);
  else if (eq(argv[optind], "-")) yyin = stdin;
  else yyin = fopenurl(argv[optind], "r", &status);

  if (yyin == NULL) {perror(argv[optind]); exit(1);}
  if (status != 200) errexit("%s : %s\n", argv[optind], http_strerror(status));

  if (!base) base = newstring("");

  /* Apply user's locale */
  setlocale(LC_ALL, "");

  /* Read the index DB into memory */
  if (indexdb) load_index(indexdb, &termtree);

  /* Parse, build tree, collect existing IDs */
  if (yyparse() != 0) exit(3);

  /* Scan for index terms, add them to the tree, add IDs where needed */
  secno = newstring("#");
  collect(get_root(tree), &termtree, &secno);

  /* Write out the document, adding <A NAME> and replacing <!--index--> */
  expand(get_root(tree), &write, termtree);

  /* Store terms to file */
  if (indexdb) save_index(indexdb, termtree);

  fclose(yyin);
#if 0
  tree_delete(tree);				/* Just to test memory mgmt */
#endif
  return 0;
}