File: unhtml.c

package info (click to toggle)
unhtml 2.3.9-6
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 252 kB
sloc: ansic: 486; makefile: 71; awk: 2
file content (293 lines) | stat: -rw-r--r-- 6,334 bytes
parent folder | download | duplicates (4)
/*
 * File:        unhtml.c
 * Program:     unhtml
 * Written by:  Kevin Swan, 013639s@dragon.acadiau.ca
 * Completed:   February 3, 1998
 * Version:     2.3
 *
 * Usage:
 *     unhtml -version | [ filename ]
 *
 * Specification:
 * unhtml is a program which removes HTML formatting from a stream, writing
 * the output to a stream.  If it is invoked with a file name, it attempts
 * to read from the named file.  If it is invoked without a filename, it
 * reads from stdin.  It writes all output to stdout.
 */

#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "ops.h"
#include "esc.h"


/* Banner printed on stdout when the program is invoked with "-version". */
char *VERSION = "unhtml  Version 2.3  Copyright (C) 1998 by Kevin Swan";

/* Command-line synopsis shown in the usage messages written to stderr. */
char *USAGE = "unhtml -version | [ filename ]";



/*
 * Double the size of the heap buffer that holds the tag being read.
 *
 *   tag       in/out: address of the buffer pointer; updated on success.
 *   tag_size  in/out: current buffer size in bytes; updated on success.
 *
 * Returns 1 on success.  On failure a message is written to stderr,
 * *tag and *tag_size are left untouched (the old buffer remains valid and
 * is still owned by the caller) and 0 is returned.
 */
static int grow_tag_buffer(char **tag, int *tag_size) {
  char *bigger;
  int new_size;

  /* Doubling a signed int past INT_MAX would be undefined behaviour. */
  if (*tag_size > INT_MAX / 2) {
    fprintf (stderr, "Tag too long (more than %d bytes).\n", *tag_size);
    return 0;
  }

  new_size = *tag_size * 2;
  bigger = realloc(*tag, new_size);
  if (!bigger) {
    fprintf (stderr, "Cannot malloc tag space (%d bytes).\n", new_size);
    return 0;
  }

  *tag = bigger;
  *tag_size = new_size;
  return 1;
}

/*
 * Program entry point.
 *
 *   argc, argv  Standard command line.  Accepts either no argument (read
 *               stdin), "-version" (print the version banner and exit) or
 *               a single file name to read from.
 *
 * Returns 0 on success and 1 on a usage error, when the input file cannot
 * be opened, or when the tag buffer cannot be allocated or grown.
 */
int main(int argc, char *argv[]) {

  FILE *inStream = NULL;
  char *tag = NULL;
  int tag_size;
  int status = 1;
  int ch;
  int i, j;

  /*
   * Do argument checking.  More than one command line argument is a usage
   * error: report it and stop rather than silently reading stdin.
   */
  if (argc > 2) {
    fprintf (stderr, "Usage: %s\n", USAGE);
    return 1;
  }

  /*
   * If the user simply requested the version of the program, print that
   * information and terminate.
   */
  if (argc == 2 && strcmp (argv[1], "-version") == 0) {
    printf ("%s\n", VERSION);
    return 0;
  }

  /*
   * Allocate tag space, now that we know we need to do some actual work.
   */
  tag_size = MAX_TAG_SIZE;
  tag = malloc(tag_size);
  if (!tag) {
    fprintf (stderr, "Cannot malloc tag space (%d bytes).\n", tag_size);
    return 1;
  }

  /*
   * If an input file was specified, try to open a read stream on it.
   * Otherwise, just use the standard input stream.
   */
  if (argc == 2) {
    inStream = fopen (argv[1], "r");
    if (inStream == NULL) {
      fprintf (stderr, "Error opening file [%s] for reading.\n", argv[1]);
      fprintf (stderr, "Usage: %s\n", USAGE);
      goto done;
    }
  } else {
    inStream = stdin;
  }

  /*
   * Read characters from the stream, copying them through until we hit an
   * opener for an HTML tag.
   */
  for (;;) {

    ch = fgetc (inStream);

    /*
     * If we hit the end of the file, we're done.
     */
    if (ch == EOF)
      break;

    /*
     * If the character is not a tag opener, just print it.
     */
    if (ch != '<') {
      m_putchar (ch);
      continue;
    }

    /*
     * If we get this far, we've hit a possible HTML tag.  Read it into
     * the variable tag.
     */
    memset(tag, 0, tag_size);
    i = 0;
    while (ch != EOF) {
      tag[i] = ch;

      /*
       * A '<' that is not followed by '/' or a letter is not treated as
       * a tag: emit both characters unchanged and stop collecting.
       */
      if (i == 1 && ch != '/' && !isalpha(ch)) {
        m_putchar(tag[0]);
        m_putchar(ch);
        break;
      }
      i++;

      if (ch == '>') {
        /*
         * Terminate explicitly: after the buffer has been grown the bytes
         * past i are uninitialised, so the memset above cannot be relied
         * on to end the string.
         */
        tag[i] = '\0';

        /*
         * If it's really an html tag, then toss it.  Otherwise, it could
         * have been just a '<' sign in the text, so print it back out.
         */
        if (!isRealHtmlTag(tag)) {
          fprintf(stderr, "not: %s\n", tag);
          for (j = 0; j < i; j++)
            m_putchar(tag[j]);
        }
        break;
      }

      /* Keep room for the next character plus the terminating NUL. */
      if (i >= tag_size - 1) {
        if (!grow_tag_buffer(&tag, &tag_size))
          goto done;
      }

      ch = fgetc (inStream);
    }

    tag[i] = '\0';

#ifdef DEBUG
    fprintf (stderr, "Read in the tag \"%s\"\n", tag);
#endif

    /*
     * Anything other than a script opener needs no further handling.
     * Note that no special comment skipping is done here: a "<!" sequence
     * is passed through by the check on the second character above.
     */
    if (!isScriptOpeningTag (tag))
      continue;

#ifdef DEBUG
    fprintf (stderr, "\"%s\" is a script opener.\n", tag);
#endif

    /*
     * Script opener: we may find '>' characters inside the
     * <SCRIPT></SCRIPT> pair that are not associated with a tag, so skip
     * ahead to the closing script tag.  Candidate tags are collected one
     * at a time, restarting at every '<', so that a '<' used in a
     * comparison in the scripting language cannot swallow the closer.
     */
    for (;;) {

#ifdef DEBUG
      fprintf (stderr, "1. Read till we hit a '<'.\n");
#endif

      /*
       * Read until we hit a '<' or run out of input.
       */
      do {
        ch = fgetc (inStream);
      } while (ch != EOF && ch != '<');

      if (ch == EOF)
        break;

#ifdef DEBUG
      fprintf (stderr, "2. Read till we hit a '>' or a '<', filling the buffer.\n");
#endif

      /*
       * Hit a '<'.  Read till we hit a '>' or a '<', filling the buffer.
       */
      memset(tag, 0, tag_size);
      tag[0] = '<';
      i = 1;
      ch = fgetc (inStream);
      while (ch != EOF) {
        tag[i] = ch;
        i++;

        if (i >= tag_size - 1) {
          if (!grow_tag_buffer(&tag, &tag_size))
            goto done;
        }

        if (ch == '>' || ch == '<')
          break;

        ch = fgetc (inStream);
      }

      /* i is always below tag_size - 1 here, so this write is in bounds. */
      tag[i] = '\0';

      if (ch == EOF)
        break;

#ifdef DEBUG
      fprintf (stderr, "Read tag: \"%s\"\n", tag);
#endif

      /*
       * A second '<' means the first one was not a tag opener.  Push it
       * back so the next pass starts collecting from it.
       */
      if (ch == '<') {
        ungetc (ch, inStream);
        continue;
      }

      if (isScriptClosingTag(tag))
        break;
    }

#ifdef DEBUG
    fprintf(stderr, "Got to the end of the script, found \"%s\"\n", tag);
#endif

    /*
     * At this point, we should be ready to read the first character
     * after the closing '>' of the </SCRIPT> tag (or we are at EOF, which
     * the next read at the top of the loop will detect).
     */
  }

  m_putchar(EOF); /* for the rare case in which chars remain in bff */
  status = 0;

done:
  /*
   * Release the tag buffer and try to peacefully close the stream, if it
   * is not stdin.  Error paths jump here without the final m_putchar(EOF)
   * call above.
   */
  free(tag);

  if (inStream != NULL && inStream != stdin)
    if (fclose(inStream))
      fprintf (stderr, "Error %d closing file.\n", errno);

  return status;
}