File: unhtml.c

package info (click to toggle)
unhtml 2.2-5
links: PTS
area: main
in suites: potato
size: 92 kB
ctags: 30
sloc: ansic: 382; makefile: 52; awk: 2
file content (253 lines) | stat: -rw-r--r-- 5,534 bytes
parent folder | download | duplicates (2)
/*
 * File:        unhtml.c
 * Program:     unhtml
 * Written by:  Kevin Swan, 013639s@dragon.acadiau.ca
 * Completed:   February 3, 1998
 * Version:     2.3
 *
 * Usage:
 *     unhtml -version | [ filename ]
 *
 * Specification:
 * unhtml is a program which removes HTML formatting from a stream, writing
 * the result to an output stream.  If it is invoked with a file name, it
 * attempts to read from the named file.  If it is invoked without a file
 * name, it reads from stdin.  It writes all output to stdout.
 */



#include "ops.h"
#include "esc.h"


char *VERSION = "unhtml  Version 2.3  Copyright (C) 1998 by Kevin Swan";
char *USAGE   = "unhtml -version | [ filename ]";



/*
 * Entry point: copy the input to the output with HTML tags removed.
 *
 * Arguments:
 *   argv[1] may be "-version" (print the version banner and exit) or the
 *   name of a file to read.  With no argument the input is stdin.
 *
 * Everything outside of '<' ... '>' is handed to m_putchar().  Tags are
 * discarded.  The contents of a <SCRIPT> ... </SCRIPT> pair are discarded
 * as well, because scripts may contain '<' and '>' characters that do not
 * delimit tags.  HTML comments get no special treatment: they end at the
 * first '>' like any other tag.
 *
 * Returns 0 on success, 1 on a usage error, when the input file cannot be
 * opened, or when a tag does not fit in MAX_TAG_SIZE bytes (including the
 * terminating NUL).
 */
int main(int argc, char *argv[]) {

  FILE *inStream;
  char tag[MAX_TAG_SIZE];
  int ch;     /* int, not char: must hold every byte value as well as EOF */
  int i;

  /*
   * Do argument checking.  If more than one command line argument was
   * given, print a usage error and stop.
   */
  if (argc > 2) {
    fprintf (stderr, "Usage: %s\n", USAGE);
    return 1;
  }

  /*
   * If the user simply requested the version of the program, print that
   * information and terminate.
   */
  if (argc == 2 && strcmp (argv[1], "-version") == 0) {
    printf ("%s\n", VERSION);
    return 0;
  }

  /*
   * If an input file was specified, try to open a read stream on it.
   * Otherwise, just use the standard input stream.
   */
  if (argc == 2) {
    if ((inStream = fopen (argv[1], "r")) == NULL) {
      fprintf (stderr, "Error opening file [%s] for reading.\n", argv[1]);
      /* Never pass a non-literal string as the format argument. */
      fprintf (stderr, "Usage: %s\n", USAGE);
      return 1;
    }
  } else {
    inStream = stdin;
  }

  /*
   * Read characters from the stream until we hit an opener for an HTML tag.
   */
  while (1) {

    ch = fgetc (inStream);

    /*
     * If we hit the end of the file, we're done.
     */
    if (ch == EOF)
      break;

    /*
     * If the character is not a tag opener, just print it.
     */
    if (ch != '<') {
      m_putchar (ch);
      continue;
    }

    /*
     * We've hit an HTML tag.  Read it, up to and including the closing
     * '>', into the variable tag.  The bound is checked before every
     * store so that one byte always remains for the terminating NUL.
     */
    i = 0;
    while (ch != EOF) {
      if (i >= MAX_TAG_SIZE - 1)
        goto tag_too_large;

      tag[i++] = (char) ch;
      if (ch == '>')
        break;

      ch = fgetc (inStream);
    }

    tag[i] = '\0';

#ifdef DEBUG
    fprintf (stderr, "Read in the tag \"%s\"\n", tag);
#endif

    /*
     * Any tag other than a script opener is simply dropped.
     */
    if (!isScriptOpeningTag (tag))
      continue;

#ifdef DEBUG
    fprintf (stderr, "\"%s\" is a script opener.\n", tag);
#endif

    /*
     * Script opener: skip ahead to the closing script tag.  The loop
     * restarts whenever a second '<' shows up before a '>', so that a
     * '<' used in some comparison in the scripting language cannot
     * swallow the real </SCRIPT> tag while the buffer is being filled.
     */
    while (1) {

#ifdef DEBUG
      fprintf (stderr, "1. Read till we hit a '<'.\n");
#endif

      /*
       * Read until we hit a '<'.
       */
      do {
        ch = fgetc (inStream);
      } while (ch != EOF && ch != '<');

      if (ch == EOF)
        break;

#ifdef DEBUG
      fprintf (stderr, "2. Read till we hit a '>' or a '<', filling the buffer.\n");
#endif

      /*
       * Hit a '<'.  Read till we hit a '>' or a '<', filling the buffer.
       */
      tag[0] = '<';
      i = 1;
      ch = fgetc (inStream);
      while (ch != EOF) {
        if (i >= MAX_TAG_SIZE - 1)
          goto tag_too_large;

        tag[i++] = (char) ch;
        if ((ch == '>') || (ch == '<'))
          break;

        ch = fgetc (inStream);
      }

      tag[i] = '\0';

      if (ch == EOF)
        break;

#ifdef DEBUG
      fprintf (stderr, "Read tag: \"%s\"\n", tag);
#endif

      if (ch == '<') {
        /* Give the '<' back so the next pass starts a fresh candidate. */
        ungetc (ch, inStream);
        continue;
      }

      if (isScriptClosingTag (tag))
        break;
    }

#ifdef DEBUG
    fprintf (stderr, "Got to the end of the script, found \"%s\"\n", tag);
#endif

    /*
     * At this point the next character read is the first one after the
     * closing '>' of the </SCRIPT> tag (or EOF, which ends the outer loop).
     */
  }

  m_putchar (EOF); /* for the rare case in which chars remain in bff */

  /*
   * Try to peacefully close the stream, if it is not stdin.
   */
  if (inStream != stdin)
    if (fclose (inStream))
      fprintf (stderr, "Error %d closing file.\n", errno);

  return 0;

tag_too_large:
  fprintf (stderr, "Error: encountered a tag larger than %d.\n", MAX_TAG_SIZE);
  fprintf (stderr, "Recompile with a larger MAX_TAG_SIZE value.\n");
  if (inStream != stdin)
    fclose (inStream);
  return 1;
}