File: reader.cc

package info (click to toggle)
word2x 1.5-2
links: PTS
area: main
in suites: hamm
size: 228 kB
ctags: 326
sloc: cpp: 3,076; ansic: 350; makefile: 64
file content (709 lines) | stat: -rw-r--r-- 14,288 bytes
/* $Id: reader.cc,v 1.10 1997/03/25 23:26:02 dps Exp $ */
/* Reads the word document */

#include <iostream.h>
#include <stdio.h>
extern "C" {
#include <string.h>
#include <ctype.h>
}
#include "word6.h"
#include "interface.h"
#include "reader.h"

#define RESUME_CHARS 30    
/* The reader is built as a stack of filters. This function is the lowest
   layer: it fetches one character from the word document and classifies
   it. Control characters are returned as a CH_* code with CONTROL_FLAG
   or-ed in; everything else (including EOF) is returned unchanged. */
static int read_character(FILE *in)
{
    for (;;)
    {
        const int ch=fgetc(in);
        int nxt, run;

        switch(ch)
        {
        case 0:
            /* Junk skipping hack: read on until RESUME_CHARS printable
               ASCII characters have been seen in a row */
            run=0;
            nxt=0;
            while (run<RESUME_CHARS && (nxt=fgetc(in))!=EOF)
                run=(nxt>=' ' && nxt<127) ? run+1 : 0;
            if (nxt==EOF)
                break;          /* Ran out of file: go round and read again */
            /* Step back so the printable run is delivered to the caller */
            fseek(in, -RESUME_CHARS, SEEK_CUR);
            return CH_SUSPECT;

        case PAR_END:
            return (CH_PAR | CONTROL_FLAG);

        case TABLE_SEP:
            /* A doubled separator ends a row, a single one ends a field */
            nxt=fgetc(in);
            if (nxt==ch)
                return (CH_ROW | CONTROL_FLAG);
            ungetc(nxt, in);    /* Not ours, push it back */
            return (CH_FIELD | CONTROL_FLAG);

        case START_CTL:
            return (CH_SPEC | CONTROL_FLAG);

        case END_CTL:
            return (CH_ENDSPEC | CONTROL_FLAG);

        case HARD_RETURN:
            return (CH_HDRTN | CONTROL_FLAG);

        default:
            if (ch>=' ' || ch==EOF)
                return ch;
            return (CH_OTHER | CONTROL_FLAG);
        }
    }
}


/* Reads one raw chunk (a paragraph, a table field, a table row or whatever
   is left before end of file) into the text buffer. Embedded items are
   copied through unprocessed, bracketed by CH_SPEC and CH_ENDSPEC, so that
   read_chunk can split them out later. On return type holds the kind of
   chunk and tptr points at the start of the collected text. */
void chunk_reader::read_chunk_raw(void)
{
    int in_embed=0;
    int ch;

    text.zero();                // Start from an empty buffer
    for (;;)
    {
        ch=read_character(in);
        if (ch==(EOF | CONTROL_FLAG))
            break;

        /* Ordinary characters are simply collected */
        if ((ch & CONTROL_FLAG)==0)
        {
            text.add(ch);
            continue;
        }

        const int code=ch & ~CONTROL_FLAG;

        /* Inside an embedded item only the end marker means anything */
        if (in_embed)
        {
            if (code==CH_ENDSPEC)
            {
                in_embed=0;
                text.add(code);
            }
            continue;
        }

        /* Terminators: hand the chunk back to the caller */
        if (code==CH_PAR || code==CH_FIELD || code==CH_ROW)
        {
            type=code;
            tptr=text;
            return;
        }

        if (code==CH_HDRTN)
        {
            text.add('\n');     // Hard return becomes a newline
            continue;
        }

        if (code==CH_OTHER)
            continue;           // Uninteresting control character

        if (code==CH_SPEC)
        {
            text.add(code);     // Keep the marker and enter embed mode
            in_embed=1;
            continue;
        }

        if (code==CH_ENDSPEC)
        {
            cerr<<"Suprious ^U ignored\n";
            continue;
        }

        cerr<<"Unexpected value "<<(code & (~CONTROL_FLAG))\
            <<" switch\n";
    }

    /* Ran out of input */
    type=CH_EOF;
    tptr=text;
    return;
}


/* This function takes the chunks produced by read_chunk_raw and hands them
   out in contiguous pieces of the same type. Embedded items are separated
   from the surrounding text here:
     - if the pending text starts with CH_SPEC, the text up to (but not
       including) the matching CH_ENDSPEC is returned with type CH_SPEC;
     - otherwise the text up to the next CH_SPEC is returned with
       (PART_FLAG | type), or, if there is no further embedded item, the
       rest of the text is returned with the plain type.
   tptr remembers where to resume; it is reset to NULL once the raw chunk
   has been used up, so that the next call reads a fresh one. */
struct chunk_rtn chunk_reader::read_chunk(void)
{
    const char *s;		// Save stupid compilers
    struct chunk_rtn res;

    if (tptr==NULL)
	this->read_chunk_raw();

    s=tptr;

    /* Embed */
    if (*s==CH_SPEC)
    {
	while (*(++s))
	{
	    if (*s==CH_ENDSPEC)
		break;
	    res.txt.add(*s);
	}
	/* Resume after the end marker. If the embed was never terminated
	   (read_chunk_raw hit end of file inside it) s is on the
	   terminating NUL: stay there rather than stepping past the end of
	   the buffer, so the next call returns the empty remainder. */
	tptr=(*s!='\0') ? s+1 : s;
	res.type=CH_SPEC;
	return res;
    }

    /* Normal */
    while (*s)
    {
	if (*s==CH_SPEC)
	{
	    /* Stop in front of the embed; it is returned by the next call */
	    tptr=s;
	    res.type=(PART_FLAG | type);
	    return res;
	}

	res.txt.add(*s);
	s++;
    }
    res.type=type;
    tptr=NULL;
    text.zero();		// Save memory
    return res;
}



/*
 * Scans forward from s over text that looks like a continuation of the
 * maths that came just before it: zero or more groups of the form
 *     [spaces] <one of + - * / => [spaces] <word>
 * where a word runs up to the next white space that is outside any
 * parentheses (or to the end of the string).
 *
 * Returns a pointer just past the last such group, or s itself if the text
 * does not start with a binary operator.
 */
const char *math_forward_scan(const char *s)
{
    const char *scan, *end;
    int blvl;

    end=scan=s;

    while (1)
    {
        /* Skip spaces. The cast matters: passing a negative char (any
           high-bit character) to isspace is undefined. */
        while (isspace((unsigned char) *scan))
            scan++;

        /* No binary operator means no more equation text */
        if (*scan!='+' && *scan!='-' && *scan!='*' && *scan!='/' &&
            *scan!='=')
            break;

        /* Step over the operator and any spaces after it */
        scan++;
        while (isspace((unsigned char) *scan))
            scan++;

        /* Grab the next word, keeping bracketed text together */
        blvl=0;
        while (!isspace((unsigned char) *scan) || blvl>0)
        {
            if (*scan=='\0')
                break;          // Never run off the end of the string
            if (*scan=='(')
                blvl++;
            else if (*scan==')')
                blvl--;
            scan++;
        }

        end=scan;               // Everything up to here belongs to the maths
    }
    return end;
}

/*
 * Scans backwards from the end of s over text that looks like it should
 * have been prepended to the maths that follows: zero or more groups of the
 * form
 *     <word> [spaces] <one of + - * / => [spaces]
 * read right to left, where a word runs back to the previous white space
 * that is outside any parentheses.
 *
 * Returns:
 *   - s, if s is empty or the scan reaches the start of the string;
 *   - a pointer to the white space in front of the earliest word consumed;
 *   - a pointer to the last character of s, if the text does not end in a
 *     binary operator (callers compare the result against that position).
 */
const char *math_reverse_scan(const char *s)
{
    const char *scan, *start;
    int blvl;

    /* Nothing to scan; also avoids forming a pointer in front of s */
    if (*s=='\0')
        return s;

    start=scan=s+strlen(s)-1;

    while (scan>=s)
    {
        /* Skip spaces. The cast matters: passing a negative char (any
           high-bit character) to isspace is undefined. */
        while (scan>=s && isspace((unsigned char) *scan))
            scan--;
        if (scan<s)
            return s;

        /* No binary operator means no more equation text */
        if (*scan!='+' && *scan!='-' && *scan!='*' && *scan!='/' &&
            *scan!='=')
            break;

        /* Step over the operator and any spaces in front of it */
        scan--;
        while (scan>=s && isspace((unsigned char) *scan))
            scan--;
        if (scan<s)
            return s;

        /* Grab the previous word, keeping bracketed text together.
           Brackets are counted in reverse since we walk right to left. */
        blvl=0;
        while (!isspace((unsigned char) *scan) || blvl>0)
        {
            if (*scan==')')
                blvl++;
            else if (*scan=='(')
                blvl--;
            if (scan==s)
                return s;       // Reached the start of the string
            scan--;
        }
        start=scan;             // Everything from here on belongs to the maths
    }
    return start;
}

/*
 * Hands out tokens one at a time from the output queue, calling rd_token
 * to refill it whenever it runs dry. The first time rd_token reports that
 * there is nothing more to read a closing T_DOC token is queued; after
 * that NULL is returned. (Private: the result still needs postprocessing
 * to compensate for equation abuse by word users.)
 */
const tok_seq::tok *tok_seq::feed_token(void)
{
    for (;;)
    {
        /* Something is queued: deliver it */
        if (!output.is_empty())
            return output.dequeue();

        /* Queue is empty: try to read more, then look again */
        if (rd_token())
            continue;

        /* Input exhausted and the end marker has already gone out */
        if (done_end)
            return NULL;

        /* Input exhausted: queue the end-of-document marker exactly once */
        tok *fin=new(tok)(T_DOC, "End of word2x output", tok::TOK_END);
        output.enqueue(fin);
        done_end=1;
    }
}

/*
 * Private token reader; compensates for equation abuse.
 *
 * Takes the next token from saved_tok if one is parked there, otherwise
 * from feed_token(), and returns it (NULL when feed_token() has no more).
 * Before returning it looks ahead:
 *   - paragraph start followed by an "eq " field: the tail of the paragraph
 *     text selected by math_reverse_scan is moved to the front of the
 *     field's text;
 *   - "eq " field followed by a paragraph: the head of the paragraph text
 *     selected by math_forward_scan is appended to the field's text;
 *   - "eq " field followed by another "eq " field: the two are merged.
 * Look-ahead tokens are either parked in saved_tok or pushed back onto the
 * front of the output queue with insert(). Token text is replaced using
 * malloc/free; allocation failure is fatal.
 */
const tok_seq::tok *tok_seq::math_collect(void)
{
    const tok *rdt, *ntok, *nntok;
    const char *mptr, *endptr;
    char *s, *t;
    
 math_aggregate: ;
    if ((rdt=this->saved_tok)==NULL)
    {
	if ((rdt=this->feed_token())==NULL)
		return NULL;
    }
    else
	saved_tok=NULL;
    
    switch (rdt->tok & (~PART_FLAG))
    {
    case T_PARAGRAPH:
	/* NOTE(review): `==` binds tighter than `&`, so the middle term is
	   evaluated as rdt->tok & (PART_FLAG==0), not
	   (rdt->tok & PART_FLAG)==0. Left unchanged because correcting it
	   would change which paragraphs get processed -- confirm intent. */
	if (rdt->end!=tok::TOK_START || (rdt->tok & PART_FLAG==0)
	    || rdt->data.d==NULL)
	    break;
	if ((ntok=this->feed_token())==NULL)
	    break;
	/* Passed all the easy rejection cases, invoke math_reverse_scan */
	saved_tok=ntok;		// Look-ahead token is returned by the next call
	if (ntok->tok==T_SPEC && ntok->end==tok::TOK_START &&
	    ntok->data.d!=NULL && strncmp(ntok->data.d, "eq ", 3)==0)
	{
	    mptr=math_reverse_scan(rdt->data.d);
	    endptr=rdt->data.d+strlen(rdt->data.d)-1;	// Last character
	    if (mptr>=endptr)
		break;		// Nothing worth moving
	    /* Allocate memory: s gets the text before mptr plus NUL */
	    if ((s=(char *) malloc(mptr-rdt->data.d+1))==NULL)
	    {
		cerr<<"Malloc read_token::malloc failure (fatal)\n";
		exit(1);
	    }
	    /* t gets "eq " (3), the moved text mptr..endptr inclusive
	       (endptr-mptr+1), the rest of the field (strlen-3) and a NUL,
	       i.e. strlen+(endptr-mptr)+2 bytes. The previous size was one
	       byte short, so the strcpy below overran the buffer. */
	    if ((t=(char *)
		 malloc(strlen(ntok->data.d)+(endptr-mptr)+2))==NULL)
	    {
		free((void *) s);
		cerr<<"Malloc read_token::malloc failure (fatal)\n";
		exit(1);
	    }
	    /* Compute result strings */
	    memcpy(s, rdt->data.d, mptr-rdt->data.d);
	    *(s+(mptr-rdt->data.d))='\0';
	    memcpy(t, ntok->data.d, 3);
	    memcpy(t+3, mptr, endptr-mptr+1);
	    strcpy(t+3+(endptr-mptr)+1, ntok->data.d+3);
	    /* Replace original data */
	    free((void *) rdt->data.d);
	    ((tok *) rdt)->data.d=s;
	    free((void *) ntok->data.d);
	    ((tok *) ntok)->data.d=t;
	}
	break;
	    
	    
    case T_SPEC:
	if (rdt->end!=tok::TOK_START || rdt->data.d==NULL ||
	    strncmp(rdt->data.d, "eq ", 3)!=0)
	    break;
	if ((nntok=this->feed_token())==NULL)
	    break;		// this is the end of the SPEC.
	if (nntok->tok!=T_SPEC || nntok->end!=tok::TOK_END)
	{
	    cerr<<"Unexpected value of nntok: type "
		<<nntok->tok<<" end "<<nntok->end<<"\n";
	}
	if ((ntok=this->feed_token())==NULL)
	{
	    output.insert(nntok);
	    break;
	}
	/* Passed all the easy rejection cases, invoke math_forward_scan */
	saved_tok=ntok;
	if (ntok->tok==T_PARAGRAPH && ntok->end!=tok::TOK_END &&
	    ntok->data.d!=NULL)
	{
	    mptr=math_forward_scan(ntok->data.d);
	    endptr=ntok->data.d+strlen(ntok->data.d);	// Terminating NUL
	    if (mptr==ntok->data.d)
	    {
		/* Nothing to absorb: put both look-ahead tokens back */
		output.insert(ntok); // This comes out second
		output.insert(nntok);
		saved_tok=NULL;
		break;
	    }
	    /* Allocate memory */
	    if (*mptr!='\0')
	    {
		/* s gets what is left of the paragraph: endptr-mptr
		   characters plus a NUL. The previous size omitted the NUL,
		   so the terminator was written one past the buffer. */
		if ((s=(char *) malloc(endptr-mptr+1))==NULL)
		{
		    cerr<<"Malloc read_token::malloc failure (fatal)\n";
		    exit(1);
		}
		memcpy(s, mptr, endptr-mptr);
		*(s+(endptr-mptr))='\0';
	    }
	    else
		s=NULL;		// Whole paragraph absorbed, nothing left

	    /* t gets the field text followed by the absorbed text and NUL */
	    if ((t=(char *)
		 malloc(strlen(rdt->data.d)+mptr-ntok->data.d+1))==NULL)
	    {
		if (s!=NULL)
		    free((void *) s);
		cerr<<"Malloc read_token::malloc failure (fatal)\n";
		exit(1);
	    }
	    endptr=rdt->data.d+strlen(rdt->data.d);
	    memcpy(t, rdt->data.d, endptr-rdt->data.d);
	    memcpy(t+(endptr-rdt->data.d), ntok->data.d, mptr-ntok->data.d);
	    *(t+(endptr-rdt->data.d)+(mptr-ntok->data.d))='\0';
	    /* Adjust result */
	    free((void *) rdt->data.d);
	    ((tok *) rdt)->data.d=t;
	    if (*mptr=='\0')
	    {
		/* If we consumed 100% continue seeking */
		delete(ntok);
		saved_tok=rdt;
		output.insert(nntok); // Re-insert end of spec.
		goto math_aggregate;
	    }
	    free((void *) ntok->data.d);
	    ((tok *) ntok)->data.d=s;
	    /* Not all consumed, return result */
	}
	else if (ntok->tok==T_SPEC && ntok->end==tok::TOK_START &&
		 ntok->data.d!=NULL && strncmp(ntok->data.d, "eq ", 3)==0)
	{
	    /* Combine consecutive eq's: first text, then the second text
	       without its "eq " prefix, then NUL */
	    endptr=rdt->data.d+strlen(rdt->data.d);
	    if ((t=(char *)
		 malloc((endptr-rdt->data.d)+strlen(ntok->data.d)-2))==NULL)
	    {
		cerr<<"Malloc read_token::malloc failure (fatal)\n";
		exit(1);
	    }
	    memcpy(t, rdt->data.d, endptr-rdt->data.d);
	    strcpy(t+(endptr-rdt->data.d), ntok->data.d+3);
	    delete(nntok);	// Rely on end of spec following this eq
	    delete(ntok);	// Junk this eq
	    free((void *) rdt->data.d);
	    ((tok *) rdt)->data.d=t;
	    saved_tok=rdt;
	    goto math_aggregate;
	}
	/* Put the look-ahead tokens back in their original order */
	output.insert(ntok); // This comes out second
    	output.insert(nntok);
	saved_tok=NULL;
	break;

		
    default:
	break;
    }
    return rdt;
}


/* Public token reader. Returns the next token from math_collect, except
   that at the start of a paragraph it reads ahead to the end of that
   paragraph: if the paragraph holds little text and at least one
   non-paragraph token, the paragraph tokens are dropped and only the other
   tokens are passed on. Returns NULL when there are no more tokens. */
const tok_seq::tok *tok_seq::read_token(void)
{
    const tok *first=this->math_collect();

    if (first==NULL)
        return NULL;

    /* Only the start of a paragraph needs a closer look */
    if (first->tok!=T_PARAGRAPH || first->end!=tok::TOK_START)
        return first;

    /*
     * Collect tokens up to the end of the paragraph on a private queue,
     * adding up the amount of paragraph text and counting everything
     * that is not a paragraph token.
     */
    fifo<tok> *pending=new(fifo<tok>);
    const tok *cur=first;
    int chars=0;
    int others=0;

    for (;;)
    {
        pending->enqueue(cur);

        if (cur->tok!=T_PARAGRAPH)
            others++;
        else
        {
            if (cur->end==tok::TOK_END)
                break;          /* Paragraph finished */
            if (cur->data.d!=NULL)
                chars+=strlen(cur->data.d);
            if (chars>DISPL_TRESHOLD)
                break;          /* Too much text, stop looking */
        }

        /* Anything unexpected disqualifies the paragraph outright */
        if (cur->tok!=T_SPEC && cur->tok!=T_OTHER && cur->tok!=T_PARAGRAPH)
        {
            chars+=DISPL_TRESHOLD;
            break;
        }

        cur=this->math_collect();
        if (cur==NULL)
            break;
    }

    /*
     * NOTE(review): math_collect may still be holding a look-ahead token in
     * saved_tok at this point; it hands that out before anything taken from
     * the output queue, i.e. before the tokens put back below -- confirm
     * that this ordering is intended.
     */
    if (chars<DISPL_TRESHOLD && others>0)
    {
        /*
         * Little text and something that survives the filter: keep only the
         * non-paragraph tokens. insert() reverses the order, so the queue is
         * reversed first (uncommon case, the extra cost is acceptable).
         */
        pending->rev();
        while ((cur=pending->dequeue())!=NULL)
        {
            if (cur->tok==T_PARAGRAPH)
                delete(cur);
            else
                output.insert(cur);
        }
    }
    else
    {
        /* Ordinary paragraph: put everything back at the front unchanged */
        output.ins_trans(pending);
    }
    delete(pending);

    return output.dequeue();
}
    


/*
 * Refill the token queue.
 */
int tok_seq::rd_token(void)
{
    struct chunk_rtn r;
    tok *t;
    int i;

    r=read_chunk();
    if (r.type==CH_EOF)
	return 0;

    switch(r.type & ~PART_FLAG)
    {
    case CH_ROW:
	if (table==NULL)
	    table=new(table_info);
	/* Handle 1 field rows properly */
	if (table->col==0)
	{
	    t=new(tok)(T_ROW, NULL, tok::TOK_START);
	    table->enqueue(t);
	}
	table->col++;
	if (table->col>table->cols)
	    table->cols=table->col;
	table->rows++;
	table->tok_push(T_FIELD, &(r.txt));
	t=new(tok)(T_ROW, NULL, tok::TOK_END);
	table->enqueue(t);
	table->col=0;
	break;
	
    case CH_FIELD:
	if (table==NULL)
	{
	    table=new(table_info);
	}
	if (table->col==0)
	{
	    t=new(tok)(T_ROW, NULL, tok::TOK_START);
	    table->enqueue(t);
	}
	table->col++;
	table->tok_push(T_FIELD, &(r.txt));
	break;
	

    case CH_PAR:
	if (table!=NULL)
	{
	    /* Table handling */
	    if (table->col!=0)
	    {
		table->tok_push(T_FIELD, &(r.txt));
		t=new(tok)(T_ROW, NULL, tok::TOK_END);
		table->enqueue(t);
		t=new(tok)(T_ROW, NULL, tok::TOK_START);
		table->enqueue(t);
		for (i=0; i<table->col; i++)
		{
		    t=new(tok)(T_FIELD, "\0", tok::TOK_START);
		    table->enqueue(t);
		    t=new(tok)(T_FIELD, NULL, tok::TOK_END);
		    table->enqueue(t);
		}
		table->rows++;
		break;
	    }
	    table->finish(&output);
	    delete(table);
	    table=NULL;
	}

	if (r.type & PART_FLAG)
	{
	    tok *td;
	    td=new(tok)(T_PARAGRAPH, (const char *) (r.txt), tok::TOK_START);
	    output.enqueue(td);
	}
	else
	    tok_push(T_PARAGRAPH, &(r.txt));
	break;
	
    case CH_SPEC:
	tok_push(T_SPEC, &(r.txt));
	break;

    default:
	break;
    }

    return 1;
}

/*
 * Writes a short description of a token to os in the form
 * "(type,data,start|end)". For dtype 1 the data is the token text, cut
 * down to its first 7 characters followed by "..." when it is longer than
 * 10 characters, or "(null)" when the token carries no text. For dtype 0
 * the data is the table size as "<rows>x<cols>". Returns os.
 */
ostream &operator<<(ostream &os, const tok_seq::tok *d)
{
    os<<'('<<d->tok<<',';
    switch(d->dtype)
    {
    case 1:
	if (d->data.d==NULL)
	{
	    /* Streaming a null char pointer is undefined; say so instead */
	    os<<"(null)";
	}
	else if (strlen(d->data.d)>10)
	{
	    char foo[11];

	    memcpy(foo, d->data.d, 7);	// First 7 characters of the text
	    memcpy(foo+7, "...", 4);	// Ellipsis and terminating NUL
	    os<<foo;
	}
	else
	    os<<d->data.d;
	break;
    case 0:
	os<<d->data.table.rows<<'x'<<d->data.table.cols;
	break;
    }
    os<<','<<((d->end==tok_seq::tok::TOK_START) ? "start" : "end")<<')';
    return os;
}

/*
 * Assigns d to this token. Text payloads are duplicated with strdup so the
 * two tokens never share a string; any other payload is copied as is.
 * Returns *this.
 *
 * NOTE(review): text previously held by this token is not released here,
 * because ownership of the old pointer cannot be established from this
 * file -- confirm against the constructors and destructor.
 */
tok_seq::tok &tok_seq::tok::operator=(const tok_seq::tok &d)
{
    if (this==&d)
	return (*this);		// Self-assignment: nothing to copy

    tok=d.tok;
    end=d.end;
    dtype=d.dtype;
    if (d.dtype==TEXT)
    {
	/* Never keep a stale pointer when the source has no text */
	data.d=(d.data.d!=NULL) ? strdup(d.data.d) : NULL;
    }
    else
    {
	/* Non-text payload (e.g. table dimensions) must be copied too */
	data=d.data;
    }
    return (*this);
}