File: URL.cc

package info (click to toggle)
htdig 1%3A3.2.0b6-21
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 21,292 kB
sloc: ansic: 49,632; cpp: 46,468; sh: 17,400; xml: 4,180; perl: 2,543; makefile: 888; php: 79; asm: 14
file content (936 lines) | stat: -rw-r--r-- 23,391 bytes
//
// URL.cc
//
// URL: A URL parsing class, implementing as closely as possible the standard
//      laid out in RFC2396 (e.g. http://www.faqs.org/rfcs/rfc2396.html)
//      including support for multiple services. (schemes in the RFC)
//
// Part of the ht://Dig package   <https://htdig.sourceforge.net/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later 
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: URL.cc,v 1.16 2004/06/04 08:51:01 angusgb Exp $
//

#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#include "URL.h"
#include "QuotedStringList.h"
#include "Dictionary.h"
#include "HtConfiguration.h"
#include "StringMatch.h"
#include "StringList.h"
#include "HtURLRewriter.h"

#include <string.h>
#include <stdlib.h>
#include <stdio.h>

#ifdef HAVE_STD
#include <fstream>
#ifdef HAVE_NAMESPACES
using namespace std;
#endif
#else
#include <fstream.h>
#endif /* HAVE_STD */

#include <sys/types.h>
#include <ctype.h>

#ifndef _MSC_VER /* _WIN32 */
#include <sys/socket.h>
#include <netdb.h>
#include <arpa/inet.h>
#endif

// Port returned by URL::DefaultPort() for the "news" service.
#define NNTP_DEFAULT_PORT 119

// Table consulted by URL::slashes(): maps a service (protocol) name to a
// one-character String holding the number of slashes that follow its colon.
// It is allocated and filled on the first call to URL::slashes().
static Dictionary	*slashCount = 0;

//*****************************************************************************
// URL::URL()
//   Default constructor: builds a blank URL in which every member is
//   initialized from 0.
//
URL::URL()
    : _url(0)
    , _path(0)
    , _service(0)
    , _host(0)
    , _port(0)
    , _normal(0)
    , _hopcount(0)
    , _signature(0)
    , _user(0)
{
    // Nothing else to do; all state is set by the initializer list.
}


//*****************************************************************************
// URL::URL(const URL& rhs)
//   Copy constructor: duplicates every member of rhs, including its
//   hop count, normalization flag and cached signature.
//
URL::URL(const URL& rhs)
    : _url(rhs._url)
    , _path(rhs._path)
    , _service(rhs._service)
    , _host(rhs._host)
    , _port(rhs._port)
    , _normal(rhs._normal)
    , _hopcount(rhs._hopcount)
    , _signature(rhs._signature)
    , _user(rhs._user)
{
    // Member-wise copy only; no re-parsing is performed.
}


//*****************************************************************************
// URL::URL(const String &nurl)
//   Construct a URL from its textual form.  The members start out blank
//   and are then filled in by parse().
//
URL::URL(const String &nurl)
    : _url(0)
    , _path(0)
    , _service(0)
    , _host(0)
    , _port(0)
    , _normal(0)
    , _hopcount(0)
    , _signature(0)
    , _user(0)
{
    // Split the string into service, host, port and path.
    parse(nurl);
}


//*****************************************************************************
// const URL &URL::operator = (const URL &rhs)
//   Assignment operator: member-wise copy of rhs into this object.
//   Returns a reference to this object.
//
const URL &URL::operator = (const URL &rhs)
{
    // Assigning an object to itself is a no-op.
    if (this != &rhs)
    {
        // Textual components
        _url = rhs._url;
        _path = rhs._path;
        _service = rhs._service;
        _host = rhs._host;

        // Port and bookkeeping state
        _port = rhs._port;
        _normal = rhs._normal;
        _hopcount = rhs._hopcount;
        _signature = rhs._signature;
        _user = rhs._user;
    }

    return *this;
}

//*****************************************************************************
// URL::URL(const String &url, const URL &parent)
//   Parse a reference given a parent url.  This is needed to resolve relative
//   references which do NOT have a full url.
//
//   url    - the reference as found in the parent document.  Whitespace is
//            stripped (or, with allow_space_in_url, embedded spaces become
//            "%20") and any "#anchor" is removed before resolution.
//   parent - the URL of the document containing the reference.  Service,
//            host, port, user, normalization flag and signature start out
//            as copies of the parent's; the hop count is the parent's + 1.
//
//   Resolution cases, in the order they are tested:
//     - empty after anchor removal: same page as parent (same hop count)
//     - "http://..." or any other "scheme:..." that is not "http:":
//       treated as a complete URL and handed to parse()
//     - "//host/...": completed with the parent's service, then parse()
//     - "/path" (optionally prefixed by "http:"): absolute path on the
//       parent's server
//     - anything else: path relative to the parent's path
//
URL::URL(const String &url, const URL &parent)
: _url(0),
    _path(0),
    _service(parent._service),
    _host(parent._host),
    _port(parent._port),
    _normal(parent._normal),
    _hopcount(parent._hopcount + 1), // Since this is one hop *after* the parent, we should account for this
    _signature(parent._signature),
    _user(parent._user)
{
    HtConfiguration* config= HtConfiguration::config();
    int  allowspace = config->Boolean("allow_space_in_url", 0);
    String      temp;
    const char *urp = url.get();

    //
    // Copy the reference into temp, dropping whitespace.  The <ctype.h>
    // classifiers are only defined for values representable as unsigned
    // char (or EOF), and plain char may be negative for bytes >= 0x80,
    // so every argument is converted to unsigned char first.
    //
    while (*urp)
    {
        if (*urp == ' ' && temp.length() > 0 && allowspace)
        {
            // Replace space character with %20 if there's more non-space
            // characters to come...
            const char *s = urp+1;
            while (*s && isspace(static_cast<unsigned char>(*s)))
                s++;
            if (*s)
                temp << "%20";
        }
        else if (!isspace(static_cast<unsigned char>(*urp)))
            temp << *urp;
        urp++;
    }
    char* ref = temp;

    //
    // Strip any optional anchor from the reference.  If, however, the
    // reference contains CGI parameters after the anchor, the parameters
    // will be moved left to replace the anchor.  The overall effect is that
    // the anchor is removed.
    // Thanks goes to David Filiatrault <dwf@WebThreads.Com> for suggesting
    // this removal process.
    //
    char	*anchor = strchr(ref, '#');
    char	*params = strchr(ref, '?');
    if (anchor)
    {
        *anchor = '\0';
        if (params)
        {
            if (anchor < params)
            {
                // Slide "?params" down over the "#anchor" text.
                while (*params)
                {
                    *anchor++ = *params++;
                }
                *anchor = '\0';
            }
        }
    }

    //
    // If, after the removal of a possible '#' we have nothing left,
    // we just want to use the base URL (we're on the same page but
    // different anchors)
    //
    if (!*ref)
    {
        // We've already copied much of the info
        _url = parent._url;
        _path = parent._path;
        // Since this is on the same page, we want the same hopcount
        _hopcount = parent._hopcount;
        return;
    }

    // OK, now we need to work out what type of child URL this is
    char	*p = ref;
    while (isalpha(static_cast<unsigned char>(*p)))  // Skip through the service portion
        p++;
    int	hasService = (*p == ':');
        // Why single out http?  Shouldn't others be the same?
        // Child URL of the form  https:/child  or  ftp:child  called "full"
        // How about using slashes()?
    if (hasService && ((strncmp(ref, "http://", 7) == 0) ||
                       (strncmp(ref, "http:", 5) != 0)))
    {
        //
        // No need to look at the parent url since this is a complete url...
        //
        parse(ref);
    }
    else if (strncmp(ref, "//", 2) == 0)
    {
        // look at the parent url's _service, to make this is a complete url...
        String	fullref(parent._service);
        fullref << ':' << ref;
        parse((char*)fullref);
    }
    else
    {
        if (hasService)
            ref = p + 1;	// Relative URL, skip "http:"

        if (*ref == '/')
        {
            //
            // The reference is on the same server as the parent, but
            // an absolute path was given...
            //
            _path = ref;

            //
            // Get rid of loop-causing constructs in the path
            //
            normalizePath();
        }
        else
        {
            //
            // The reference is relative to the parent
            //

            // Start from the parent's path without its query string.
            _path = parent._path;
            int i = _path.indexOf('?');
            if (i >= 0)
            {
                _path.chop(_path.length() - i);
            }

            //
            // Remove any leading "./" sequences which could get us into
            // recursive loops.
            //
            while (strncmp(ref, "./", 2) == 0)
                ref += 2;

            if (_path.last() == '/')
            {
                //
                // Parent was a directory.  Easy enough: just append
                // the current ref to it
                //
                _path << ref;
            }
            else
            {
                //
                // Parent was a file.  We need to strip the last part
                // of the path before we add the reference to it.
                //
                String	parentDir = _path;
                p = strrchr((char*)parentDir, '/');
                if (p)
                {
                    p[1] = '\0';	// keep everything up to the last '/'
                    _path = parentDir.get();
                    _path << ref;
                }
                else
                {
                    //
                    // Something must be wrong since there were no '/'
                    // found in the parent url.
                    //
                    // We do nothing here.  The new url is the parent.
                    //
                }
            }

            //
            // Get rid of loop-causing constructs in the path
            //
            normalizePath();
        }

        //
        // Build the url.  (Note, the host name has NOT been normalized!)
        // No need for this if we have called URL::parse.
        //
        constructURL();
    }
}


//*****************************************************************************
// void URL::rewrite()
//   Hand the stored url text to the HtURLRewriter singleton; when it
//   reports a positive result, re-parse the url so the other members
//   are refreshed from the new text.
//
void URL::rewrite()
{
    // Nothing to do unless the rewriter reports a positive count.
    if (!(HtURLRewriter::instance()->replace(_url) > 0))
        return;

    parse(_url.get());
}


//*****************************************************************************
// void URL::parse(const String &u)
//   Given a URL string, extract the service, host, port, and path from it.
//
//   u - the URL text.  Whitespace is stripped first (or, with
//       allow_space_in_url set, embedded spaces become "%20") and anything
//       from '#' onwards is discarded.
//
//   On return _service, _host, _port, _path and _user describe the URL,
//   the path has been through normalizePath(), and _url has been rebuilt
//   by constructURL().  _normal and _signature are reset.  When the text
//   contains no ':' the service is taken to be "http".
//
//   NOTE: the text is split with strtok(), which keeps hidden global state;
//   the order of the strtok() calls below must not be changed.
//
void URL::parse(const String &u)
{
    HtConfiguration* config= HtConfiguration::config();
    int  allowspace = config->Boolean("allow_space_in_url", 0);
    String	temp;
    const char *urp = u.get();

    //
    // Copy the URL into temp, dropping whitespace.  The <ctype.h>
    // classifiers are only defined for values representable as unsigned
    // char (or EOF), and plain char may be negative for bytes >= 0x80,
    // so every argument is converted to unsigned char first.
    //
    while (*urp)
    {
        if (*urp == ' ' && temp.length() > 0 && allowspace)
        {
            // Replace space character with %20 if there's more non-space
            // characters to come...
            const char *s = urp+1;
            while (*s && isspace(static_cast<unsigned char>(*s)))
                s++;
            if (*s)
                temp << "%20";
        }
        else if (!isspace(static_cast<unsigned char>(*urp)))
            temp << *urp;
        urp++;
    }
    char	*nurl = temp;

    //
    // Ignore any part of the URL that follows the '#' since this is just
    // an index into a document.
    //
    char	*p = strchr(nurl, '#');
    if (p)
        *p = '\0';

    // Some members need to be reset.  If not, the caller would
    // have used URL::URL(char *ref, URL &parent)
    // (which may call us, if the URL is found to be absolute).
    _normal = 0;
    _signature = 0;
    _user = 0;

    //
    // Extract the service
    //
    p = strchr(nurl, ':');
    if (p)
    {
        _service = strtok(nurl, ":");
        p = strtok(0, "\n");		// p: everything after "service:"
    }
    else
    {
        _service = "http";
        p = strtok(nurl, "\n");
    }
    _service.lowercase();

    //
    // Extract the host
    //
    if (!p || strncmp(p, "//", 2) != 0)
    {
        // No host specified, it's all a path.
        _host = 0;
        _port = 0;
        _url = 0;
        if (p)		// if non-NULL, skip (some) leading slashes in path
        {
            int i;
            for (i = slashes (_service); i > 0 && *p == '/'; i--)
                p++;
            if (i)	// if fewer slashes than specified for protocol don't
                        // delete any. -> Backwards compatible (necessary??)
                p -= slashes (_service) - i;
        }
        _path = p;
        if (strcmp((char*)_service, "file") == 0 || slashes (_service) < 2)
          _host = "localhost";
    }
    else
    {
        p += 2;

        //
        // p now points to the host
        //
        char	*q = strchr(p, ':');
        char	*slash = strchr(p, '/');

        _path = "/";
        if (strcmp((char*)_service, "file") == 0)
          {
            // These should be of the form file:/// (i.e. no host)
            // if there is a file://host/path then strip the host
            if (strncmp(p, "/", 1) != 0)
              {
                p = strtok(p, "/");
                _path << strtok(0, "\n");
              }
            else
              _path << strtok(p+1, "\n");	// _path is "/" - don't double
            _host = "localhost";
            _port = 0;
          }
        else if (q && ((slash && slash > q) || !slash))
        {
            // A ':' appears before the first '/': "host:port/path"
            _host = strtok(p, ":");
            p = strtok(0, "/");
            if (p)
              _port = atoi(p);
            if (!p || _port <= 0)
               _port = DefaultPort();
            //
            // The rest of the input string is the path.
            //
            _path << strtok(0, "\n");

        }
        else
        {
            // No explicit port: "host/path"
            _host = strtok(p, "/");
            _host.chop(" \t");
            _port = DefaultPort();

            //
            // The rest of the input string is the path.
            //
            _path << strtok(0, "\n");

        }

        // Check to see if host contains a user@ portion
        int atMark = _host.indexOf('@');
        if (atMark != -1)
          {
            _user = _host.sub(0, atMark);
            _host = _host.sub(atMark + 1);
          }
    }

    //
    // Get rid of loop-causing constructs in the path
    //
    normalizePath();

    //
    // Build the url.  (Note, the host name has NOT been normalized!)
    //
    constructURL();
}


//*****************************************************************************
// void URL::normalizePath()
//   Rewrite _path into a minimal form: collapse "//" (unless the
//   allow_double_slash option is set), drop "/./" and a trailing "/.",
//   resolve "/../" and a trailing "/..", turn "%7E" into "~", lowercase
//   the path when case_sensitive is off, and finally let removeIndex()
//   strip a default document name.  Only the part of the path before the
//   first '?' is rewritten; the query string is left alone.
// Called from: URL(const String &url, const URL &parent)
//		URL::parse(const String &u)
//
void URL::normalizePath()
{
    //
    // Rewrite the path to be the minimal.
    // Remove "//", "/../" and "/./" components
    //
    HtConfiguration* config= HtConfiguration::config();

    int	i, limit;
    String	newPath;
    int	pathend = _path.indexOf('?');	// Don't mess up query strings.
    if (pathend < 0)
        pathend = _path.length();

    //
    // get rid of "//" first, or "/foo//../" will become "/foo/" not "/"
    // Some database lookups interpret empty paths (// != /), so give
    // the use the option to turn this off.
    //
    if (!config->Boolean ("allow_double_slash"))
        while ((i = _path.indexOf("//")) >= 0 && i < pathend)
        {
            newPath = _path.sub(0, i).get();
            newPath << _path.sub(i + 1).get();
            _path = newPath;
            pathend = _path.indexOf('?');
            if (pathend < 0)
                pathend = _path.length();
        }

    //
    // Next get rid of redundant "/./".  This could cause infinite
    // loops.  Moreover, "/foo/./../" should become "/", not "/foo/"
    //
    while ((i = _path.indexOf("/./")) >= 0 && i < pathend)
    {
        newPath = _path.sub(0, i).get();
        newPath << _path.sub(i + 2).get();
        _path = newPath;
        pathend = _path.indexOf('?');
        if (pathend < 0)
            pathend = _path.length();
    }

    //
    // A trailing "/." (just before the query string or the end).  The
    // test looks at the last two characters of the path directly:
    // searching for the *first* "/." would miss the trailing one whenever
    // an earlier component starts with a dot (e.g. "/.cvs/dir/.").
    //
    if (pathend >= 2 && strncmp(_path.get() + pathend - 2, "/.", 2) == 0)
    {
        i = pathend - 2;
        newPath = _path.sub(0, i+1).get();		// keep trailing slash
        newPath << _path.sub(i + 2).get();
        _path = newPath;
        pathend--;
    }

    //
    // Now that "empty" path components are gone, remove ("/../").
    //
    while ((i = _path.indexOf("/../")) >= 0 && i < pathend)
    {
        if ((limit = _path.lastIndexOf('/', i - 1)) >= 0)
        {
            // Drop the component preceding "/../" together with the "/..".
            newPath = _path.sub(0, limit).get();
            newPath << _path.sub(i + 3).get();
            _path = newPath;
        }
        else
        {
            // ".." at the top level: nothing to climb out of, just drop it.
            _path = _path.sub(i + 3).get();
        }
        pathend = _path.indexOf('?');
        if (pathend < 0)
            pathend = _path.length();
    }

    //
    // A trailing "/..", again tested at the end of the path itself so an
    // earlier component such as "/..foo" cannot hide it.
    //
    if (pathend >= 3 && strncmp(_path.get() + pathend - 3, "/..", 3) == 0)
    {
        i = pathend - 3;
        if ((limit = _path.lastIndexOf('/', i - 1)) >= 0)
            newPath = _path.sub(0, limit+1).get();	// keep trailing slash
        else
        {
            // A string literal (not the character '/') is assigned so the
            // value cannot be taken for an integer.
            // NOTE(review): String's assignment overloads are not visible
            // here -- confirm there is no operator=(char).
            newPath = "/";
        }
        newPath << _path.sub(i + 3).get();
        _path = newPath;
        pathend = _path.indexOf('?');
        if (pathend < 0)
            pathend = _path.length();
    }
    // The RFC gives us a choice of what to do when we have .. left and
    // we're at the top level. By principle of least surprise, we'll just
    // toss any "leftovers" Otherwise, we'd have a loop here to add them.

    // Finally change all "%7E" to "~" for sanity
    while ((i = _path.indexOf("%7E")) >= 0 && i < pathend)
    {
        newPath = _path.sub(0, i).get();
        newPath << "~";
        newPath << _path.sub(i + 3).get();
        _path = newPath;
        pathend = _path.indexOf('?');
        if (pathend < 0)
            pathend = _path.length();
    }

    // If the server *isn't* case sensitive, we want to lowercase the path
    if (!config->Boolean("case_sensitive", 1))
        _path.lowercase();

    // And don't forget to remove index.html or similar file.
    // (the check for services that must keep it is inside removeIndex)
    removeIndex(_path, _service);
}

//*****************************************************************************
// void URL::dump()
//   Debugging aid: print service, user, host, port, path and url to cout,
//   one "name = value" pair per line.
//
void URL::dump()
{
    cout << "service = " << _service.get() << endl
         << "user = " << _user.get() << endl
         << "host = " << _host.get() << endl
         << "port = " << _port << endl
         << "path = " << _path << endl
         << "url = " << _url << endl;
}


//*****************************************************************************
// void URL::path(const String &newpath)
//
void URL::path(const String &newpath)
{
	HtConfiguration* config= HtConfiguration::config();
    _path = newpath;
    if (!config->Boolean("case_sensitive",1))
      _path.lowercase();
    constructURL();
}


//*****************************************************************************
// void URL::removeIndex(String &path, String &service)
//   Attempt to remove the remove_default_doc from the end of a URL path if
//   the service allows that.  (File, ftp don't.  Do others?)
//   This needs to be done to normalize the paths and make .../ the
//   same as .../index.html
//
//   path    - path to edit in place; left untouched when it is empty,
//             contains a '?', or contains no '/'
//   service - service the path belongs to; "file" and "ftp" paths are
//             never modified
//
//   The list of default document names is read from the remove_default_doc
//   option once, on the first call that gets that far, and cached for the
//   lifetime of the process.
//
// Called from: URL::normalize() from URL::signature()  [redundant?]
// 		URL::normalizePath()
//
void URL::removeIndex(String &path, String &service)
{
    HtConfiguration* config= HtConfiguration::config();
    static StringMatch *defaultdoc = 0;

    // Decide from the service that was passed in, not from the _service
    // member, so the function honours its own parameter.
    if (strcmp((char*)service, "file") == 0 ||
        strcmp((char*)service, "ftp")  == 0)
        return;

    if (path.length() == 0 || strchr((char*)path, '?'))
        return;

    // Index of the first character after the last '/'.
    int filename = path.lastIndexOf('/') + 1;
    if (filename == 0)
        return;

    if (! defaultdoc)
    {
      StringList  l(config->Find("remove_default_doc"), " \t");
      defaultdoc = new StringMatch();
      defaultdoc->IgnoreCase();
      defaultdoc->Pattern(l.Join('|'));
    }

    // Chop the file name only when a pattern matches it and the match
    // extends to the very end of the path.
    int which, length;
    if (defaultdoc->hasPattern() &&
            defaultdoc->CompareWord((char*)path.sub(filename), which, length) &&
            filename+length == path.length())
        path.chop(path.length() - filename);
}


//*****************************************************************************
// void URL::normalize()
//   Make sure that URLs are always in the same format.
//
//   Does nothing when the URL has no service, is already normalized, or
//   its service does not take two slashes (see slashes()).  Otherwise:
//     - removes a default document name from the path
//     - lowercases the host
//     - when allow_virtual_hosts is off, replaces the host by the first
//       host name that was seen for the same IPv4 address
//     - applies server_aliases (ServerAlias())
//     - rebuilds _url, sets _normal and clears the cached signature
//
//   If the host cannot be resolved the function returns early and the URL
//   is left un-normalized (_normal stays 0).
//
void URL::normalize()
{
    HtConfiguration* config= HtConfiguration::config();

    if (_service.length() == 0 || _normal)
        return;

    // if service specifies "doesn't specify an IP host", don't normalize it
    if (slashes (_service) != 2)
        return;

    // (the check for services that keep their index file is internal)
    removeIndex(_path, _service);

    //
    // Convert a hostname to an IP address
    //
    _host.lowercase();

    if (!config->Boolean("allow_virtual_hosts", 1))
    {
        // Cache: host name -> raw address bytes (as a String)
        static Dictionary	hostbyname;

        // The address is kept in an in_addr so that exactly
        // sizeof(s_addr) bytes are written and read.  Copying the bytes
        // into a wider integer and then truncating it would pick up
        // uninitialized bytes, or the wrong half on big-endian hosts.
        struct in_addr		addr;
        struct hostent		*hp;
        const int		addrlen = (int) sizeof(addr.s_addr);

        addr.s_addr = 0;

        String	*ip = (String *) hostbyname[_host];
        if (ip)
        {
            // Cached entries are only ever stored with addrlen bytes.
            memcpy((char *) &addr.s_addr, ip->get(), addrlen);
        }
        else
        {
            // Dotted-quad hosts are converted directly...
            addr.s_addr = inet_addr(_host.get());
            if (addr.s_addr == 0xffffffff)
            {
                // ...anything else goes through the resolver.
                hp = gethostbyname(_host.get());
                if (hp == NULL || hp->h_length != addrlen)
                {
                    // Unresolvable, or not an address of the expected size.
                    return;
                }
                memcpy((char *) &addr.s_addr, (char *)hp->h_addr, addrlen);
                ip = new String((char *) &addr.s_addr, addrlen);
                hostbyname.Add(_host, ip);
            }
        }

        // Map: address -> first host name seen for it.  Every later host
        // that resolves to the same address is renamed to that first name.
        static Dictionary	machines;
        String			key;
        key << int(addr.s_addr);
        String			*realname = (String *) machines[key];
        if (realname)
            _host = realname->get();
        else
            machines.Add(key, new String(_host));
    }
    ServerAlias();

    //
    // Reconstruct the url
    //
    constructURL();
    _normal = 1;
    _signature = 0;
}


//*****************************************************************************
// const String &URL::signature()
//   Return a string which uniquely identifies the server the current
//   URL is referring to.
//   This is the first portion of a url: service://user@host:port/
//   (in short this is the URL pointing to the root of this server)
//   The string is built on first use and cached in _signature; the URL
//   is normalized first if that has not happened yet.
//
const String &URL::signature()
{
    if (!_signature.length())
    {
        // The signature is based on the normalized host/port.
        if (!_normal)
            normalize();

        _signature = _service;
        _signature << "://";

        // Optional "user@" part
        if (_user.length())
            _signature << _user << '@';

        _signature << _host;
        _signature << ':';
        _signature << _port;
        _signature << '/';
    }

    return _signature;
}

//*****************************************************************************
// void URL::ServerAlias()
// Takes care of the server aliases, which attempt to simplify virtual
// host problems
//
// The server_aliases option is read once, on the first call, as a list of
// whitespace-separated "from=to" entries (entries without '=' are skipped).
// Both sides are lowercased and get ":80" appended when they contain no
// ':'.  On every call "_host:_port" is looked up in that table; on a match
// _host is replaced by the alias host and, when the alias carries a
// numeric port, _port is replaced as well.
//
void URL::ServerAlias()
{
  HtConfiguration* config= HtConfiguration::config();
  static Dictionary *serveraliases= 0;

  if (! serveraliases)
    {
      String l= config->Find("server_aliases");
      String from, *to;
      serveraliases = new Dictionary();
      char *p = strtok(l, " \t");
      char *salias= NULL;
      while (p)
        {
          salias = strchr(p, '=');
          if (! salias)
            {
              // Not of the form from=to: ignore this entry.
              p = strtok(0, " \t");
              continue;
            }
          *salias++= '\0';		// split "from=to" in place
          from = p;
          from.lowercase();
          if (from.indexOf(':') == -1)
            from.append(":80");
          to= new String(salias);
          to->lowercase();
          if (to->indexOf(':') == -1)
            to->append(":80");
          serveraliases->Add(from.get(), to);
          // fprintf (stderr, "Alias: %s->%s\n", from.get(), to->get());
          p = strtok(0, " \t");
        }
    }

  String *al= 0;
  int newport;
  int delim;
  String serversig = _host;
  serversig << ':' << _port;
  if ((al= (String *) serveraliases->Find(serversig)))
    {
      delim= al->indexOf(':');
      // fprintf(stderr, "\nOld URL: %s->%s\n", (char *) serversig, (char *) *al);
      _host= al->sub(0,delim).get();
      // Only take over the port when one was actually parsed; a
      // malformed alias such as "host:" or "host:abc" would otherwise
      // leave newport uninitialized.
      if (sscanf((char*)al->sub(delim+1), "%d", &newport) == 1)
        _port= newport;
      // fprintf(stderr, "New URL: %s:%d\n", (char *) _host, _port);
    }
}

//*****************************************************************************
// int URL::slash(const String &protocol)
// Returns number of slashes folowing the service name for protocol
//
int
URL::slashes(const String &protocol)
{
    if (!slashCount)
    {
	HtConfiguration* config= HtConfiguration::config();
	slashCount = new Dictionary();

	slashCount->Add (String("mailto"), new String("0"));
	slashCount->Add (String("news"),   new String("0"));
	slashCount->Add (String("http"),   new String("2"));
	slashCount->Add (String("ftp"),    new String("2"));
	// file:///  has three, but the last counts as part of the path...
	slashCount->Add (String("file"),   new String("2"));
	
	QuotedStringList	qsl(config->Find("external_protocols"), " \t");
	String			from;
	int			i;
	int			sep,colon;

	for (i = 0; qsl[i]; i += 2)
	{
	    from = qsl[i];
	    sep = from.indexOf("->");
	    if (sep != -1)
		from = from.sub(0, sep).get();  // "get" aids portability...

	    colon = from.indexOf(":");
	    // if service specified as "help:/" or "man:", note trailing slashes
	    // Default is 2.
	    if (colon != -1)
	    {
		int i;
		char count [2];
		for (i = colon+1; from[i] == '/'; i++)
		    ;
		count [0] = i - colon + '0' - 1;
		count [1] = '\0';
		from = from.sub(0,colon).get();
		slashCount->Add (from, new String (count));
	    } else
		slashCount->Add (from, new String ("2"));
	}
    }
    
    // Default to two slashes for unknown protocols
    String *count = (String *)slashCount->Find(protocol);
    return count ? (count->get()[0] - '0') : 2;
}

//*****************************************************************************
// void URL::constructURL()
// Constructs the _url member from everything else
// Also ensures the port number is correct for the service
// Called from  URL::URL(const String &url, const URL &parent)
//		URL::parse(const String &u)
//		URL::path(const String &newpath)
//		URL::normalize()
//
void URL::constructURL()
{
    if (strcmp((char*)_service, "file") != 0 && _host.length() == 0) {
	_url = "";
	return;
    }

    _url = _service;
    _url << ":";

    // Add correct number of slashes after service name
    int i;
    for (i = slashes (_service); i > 0; i--)
    {
	_url << "/";
    }

    if (slashes (_service) == 2)	// services specifying a particular
    {					// IP host must begin "service://"
	if (strcmp((char*)_service, "file") != 0)
	  {
	    if (_user.length())
	      _url << _user << '@';
	    _url << _host;
	  }

       if (_port != DefaultPort() && _port != 0)  // Different than the default port
	  _url << ':' << _port;
    }

    _url << _path;
}


///////
   //    Get the default port for the recognised service.
   //    Services that are not listed below get 80.
///////

int URL::DefaultPort()
{
   // Recognised services and their default ports, tested in this order.
   static const struct
   {
      const char *name;
      int         port;
   } knownServices[] =
   {
      { "http",   80 },
      { "https",  443 },
      { "ftp",    21 },
      { "gopher", 70 },
      { "file",   0 },
      { "news",   NNTP_DEFAULT_PORT }
   };
   const unsigned int count = sizeof(knownServices) / sizeof(knownServices[0]);

   for (unsigned int k = 0; k < count; k++)
   {
      if (strcmp((char*)_service, knownServices[k].name) == 0)
         return knownServices[k].port;
   }

   return 80;
}