File: BasicDocument.cc

package info (click to toggle)
htdig 1%3A3.2.0b6-21
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 21,292 kB
sloc: ansic: 49,632; cpp: 46,468; sh: 17,400; xml: 4,180; perl: 2,543; makefile: 888; php: 79; asm: 14
file content (316 lines) | stat: -rw-r--r-- 7,858 bytes
//
// BasicDocument.cc
//
// 2/6/2002 created for libhtdig to simplify & mimic Document.cc
//
// Neal Richter nealr@rightnow.com
//
//
// BasicDocument: This class holds everything there is to know about a document.
//           The actual contents of the document may or may not be present at
//           all times for memory conservation reasons.
//
//           This is a basic extensible container for holding plain-text documents.
//
//           Uses any Parser with parse method handling this class.
//           
// Part of the ht://Dig package   <https://htdig.sourceforge.net/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: BasicDocument.cc,v 1.3 2004/05/28 13:15:28 lha Exp $
//
//--------------------------------------------------------------------

#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#include <signal.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <ctype.h>

#include "BasicDocument.h"
#include "TextCollector.h"
#include "StringList.h"
#include "htdig.h"
#include "Plaintext.h"
#include "HTML.h"
#include "ExternalParser.h"
#include "lib.h"

#include "defaults.h"

// SIGNAL_HANDLER: pointer to a function returning void and taking
// unspecified (variadic) arguments.  The "#if 1" below hard-wires this
// definition; the SIG_PF-based alternative in the #else branch is never
// compiled.
#if 1
typedef void (*SIGNAL_HANDLER) (...);
#else
typedef SIG_PF SIGNAL_HANDLER;
#endif

//*****************************************************************************
// BasicDocument::BasicDocument(char *loc, int suggested_size)
//   Build an empty document: every text field is cleared, the cached
//   document length is zeroed and the contents buffer is pre-allocated.
//
//   loc            - when non-null it is handed to Location() so that it
//                    becomes the location of this document.
//   suggested_size - when positive it is used as the size of the contents
//                    buffer; otherwise the "max_doc_size" configuration
//                    value is used.  In both cases 100 is added to the
//                    size before it is passed to contents.allocate().
//
BasicDocument::BasicDocument(char *loc, int suggested_size)
{
    // Bring every member to a known, empty state.
    document_length = 0;
    id = 0;
    location = 0;
    title = 0;
    metacontent = 0;
    contents = 0;

    HtConfiguration *config = HtConfiguration::config();

    // NOTE: the original author remarks that the assignment of
    // max_doc_size probably needs to be moved, according to a
    // configuration value.
    const int buffer_size = (suggested_size > 0)
        ? suggested_size
        : config->Value("max_doc_size");

    contents.allocate(buffer_size + 100);

    contentType = "";

    if (loc)
        Location(loc);
}


//*****************************************************************************
// BasicDocument::~BasicDocument()
//   No explicit cleanup is performed in this body.  When MEM_DEBUG is
//   enabled, a trace line is written to cout showing the address of the
//   object being destroyed and the address of a freshly allocated (and
//   immediately released) char.
//
BasicDocument::~BasicDocument()
{
    // We delete only the derived class objects

#if MEM_DEBUG
    char *probe = new char;
    cout << "==== BasicDocument deleted: " << this;
    cout << " new at " << ((void *) probe) << endl;
    delete probe;
#endif
}


//*****************************************************************************
// void BasicDocument::Reset()
//   Put the BasicDocument back into its initial, empty state: the cached
//   length is zeroed and every text field (including the content type)
//   is cleared.
//
void
BasicDocument::Reset()
{
    // Cached length bookkeeping.
    document_length = 0;

    // Text fields.
    contentType = 0;
    contents = 0;
    metacontent = 0;
    title = 0;
    location = 0;
    id = 0;
}

//*****************************************************************************
// int BasicDocument::Length()
//   Return the length of the BasicDocument, i.e. the cumulative size of
//   its location, title, metacontent, contents and id strings.
//
//   The value is cached in document_length.  It is only recomputed when
//   document_length is negative; a non-negative value is returned as is.
//   NOTE(review): the constructor and Reset() in this file set
//   document_length to 0, so the negative "needs recomputing" state is
//   presumably set by code outside this file -- confirm.
//
int
BasicDocument::Length()
{
    // Fast path: a cached value is available.
    if (document_length >= 0)
        return (document_length);

    int total = 0;
    total += location.length();
    total += title.length();
    total += metacontent.length();
    total += contents.length();
    total += id.length();

    document_length = total;
    return (document_length);
}


//*****************************************************************************
// Parsable *BasicDocument::getParsable()
//   Pick a parser for this document based on its content type and hand
//   it the document contents.
//
//   Selection order:
//     1. an external parser, if ExternalParser::canParse() accepts the type
//     2. "text/html"   -> builtin HTML parser
//     3. "text/plain"  -> builtin Plaintext parser
//     4. "text/css"    -> no parser (NULL is returned)
//     5. other "text/" -> Plaintext parser, assumed to be plain text
//     6. anything else -> no parser (NULL is returned)
//   Type comparisons are done with mystrncasecmp() on the leading
//   characters of the content type.
//
//   The HTML and Plaintext parsers are created on first use and kept in
//   function-local statics.  The ExternalParser is deleted and re-created
//   on every call that selects it, so a pointer returned earlier for an
//   external parser is no longer valid after the next such call.
//
Parsable *
BasicDocument::getParsable()
{
    static HTML *html = 0;
    static Plaintext *plaintext = 0;
    static ExternalParser *externalParser = 0;

    Parsable *chosen = 0;

    if (ExternalParser::canParse(contentType))
    {
        // Replace whatever external parser a previous call left behind
        // (deleting a null pointer is a no-op).
        delete externalParser;
        externalParser = new ExternalParser(contentType);
        chosen = externalParser;
    }
    else
    {
        char *type = (char *) contentType;

        if (mystrncasecmp(type, "text/html", 9) == 0)
        {
            if (!html)
                html = new HTML();
            chosen = html;
        }
        else if (mystrncasecmp(type, "text/plain", 10) == 0)
        {
            if (!plaintext)
                plaintext = new Plaintext();
            chosen = plaintext;
        }
        else if (mystrncasecmp(type, "text/css", 8) == 0)
        {
            // Style sheets are deliberately not parsed.
            return NULL;
        }
        else if (mystrncasecmp(type, "text/", 5) == 0)
        {
            // Unknown text subtype: fall back to the plain text parser.
            if (!plaintext)
                plaintext = new Plaintext();
            chosen = plaintext;
            if (debug > 1)
            {
                cout << '"' << contentType << "\" not a recognized type.  Assuming text/plain\n";
            }
        }
        else
        {
            if (debug > 1)
            {
                cout << '"' << contentType << "\" not a recognized type.  Ignoring\n";
            }
            return NULL;
        }
    }

    chosen->setContents(contents.get(), contents.length());
    return chosen;
}

//*****************************************************************************
// int BasicDocument::SelfParseable()
//   Test whether this document is self parseable, i.e. whether its
//   content type starts (case-insensitively) with
//   "text/vnd.customdocument".
//
//   Returns TRUE when it does, FALSE otherwise.
//
int
BasicDocument::SelfParseable()
{
    // Compare all 23 characters of "text/vnd.customdocument".  Comparing
    // only the first 10 would accept every type that merely starts with
    // "text/vnd.c".
    if (mystrncasecmp((char *) contentType, "text/vnd.customdocument", 23) == 0)
    {
        return (TRUE);
    }

    return (FALSE);
}


//*****************************************************************************
// Parsable *BasicDocument::internalParser()
int     
BasicDocument::internalParser(TextCollector & textcollector)
{
    HtConfiguration* config= HtConfiguration::config();
    char *position = NULL;
    static int minimumWordLength = config->Value("minimum_word_length", 3);
    int wordIndex = 1;
    String word;
    int letter_count = 0;

    //First Process Title
    textcollector.got_title((char *) title);

    //Next Process Contents
    position = contents;

    while (*position)
    {
        word = 0;

        if (HtIsStrictWordChar(*position))
        {
            //
            // Start of a word.  Try to find the whole thing
            //
            //TODO NEAL RICHTER  Imposed a 50-letter word length limit here
            //
            while (*position && HtIsWordChar(*position) && (letter_count < 50))
            {
                word << *position;
                position++;
                letter_count++;
            }

            letter_count = 0;
            if (word.length() >= minimumWordLength)
            {
                textcollector.got_word((char *) word, wordIndex++, 0);
            }
        }
        
        if (*position)
            position++;

    }//end while

    textcollector.got_head((char*) contents);

      //Third, Process MetaContent
    position = metacontent;
    textcollector.got_meta_dsc(metacontent);
    

    //max_meta_description_length???
    
    while (*position)
    {
        word = 0;

        if (HtIsStrictWordChar(*position))
        {
            //
            // Start of a word.  Try to find the whole thing
            //
            while (*position && HtIsWordChar(*position) && (letter_count < 50))
            {
                word << *position;
                position++;
                letter_count++;
            }

            letter_count = 0;

            if (word.length() >= minimumWordLength)
            {
                textcollector.got_word((char *) word, wordIndex++, 9);
            }
        }
        
        if (*position)
            position++;

    }//end while

    return(1);
}