File: print-urlrefs.c

package info (click to toggle)
glimpse 4.1-2
  • links: PTS
  • area: non-free
  • in suites: slink
  • size: 2,344 kB
  • ctags: 2,254
  • sloc: ansic: 32,194; makefile: 561; sh: 170; perl: 142
file content (129 lines) | stat: -rw-r--r-- 4,526 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
/* RCS/CVS identification string recording this file's revision; it is not referenced anywhere else in this file. */
static char rcsid[] = "$Id: print-urlrefs.c,v 1.9 1995/01/10 16:30:39 hardy Exp $";
/*
 *  print-urlrefs - Reads SOIF and prints normalized URLs from the 
 *  URL-References attribute.  Used to extract URLs from HTML object sums.
 *
 *  Usage: print-urlrefs
 *
 *  Darren Hardy, hardy@cs.colorado.edu, July 1994
 *
 *  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *  
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder. 
 *  
 *  This copyright notice applies to all code in Harvest other than
 *  subsystems developed elsewhere, which contain other copyright notices
 *  in their source text.
 *  
 *  The Harvest software was developed by the Internet Research Task
 *  Force Research Group on Resource Discovery (IRTF-RD).  The Harvest
 *  software may be used for academic, research, government, and internal
 *  business purposes without charge.  If you wish to sell or distribute
 *  the Harvest software to commercial clients or partners, you must
 *  license the software.  See
 *  http://harvest.cs.colorado.edu/harvest/copyright,licensing.html#licensing.
 *  
 *  The Harvest software is provided ``as is'', without express or
 *  implied warranty, and with no support nor obligation to assist in its
 *  use, correction, modification or enhancement.  We assume no liability
 *  with respect to the infringement of copyrights, trade secrets, or any
 *  patents, and are not responsible for consequential damages.  Proper
 *  use of the Harvest software is entirely the responsibility of the user.
 *  
 *  For those who are using Harvest for non-commercial purposes, you may
 *  make derivative works, subject to the following constraints:
 *  
 *  - You must include the above copyright notice and these accompanying 
 *    paragraphs in all forms of derivative works, and any documentation 
 *    and other materials related to such distribution and use acknowledge 
 *    that the software was developed at the above institutions.
 *  
 *  - You must notify IRTF-RD regarding your distribution of the 
 *    derivative work.
 *  
 *  - You must clearly notify users that your are distributing a modified 
 *    version and not the original Harvest software.
 *  
 *  - Any derivative product is also subject to the restrictions of the 
 *    copyright, including distribution and use limitations.
 */
#include <stdio.h>
#include <string.h>
#include <time.h>
#include "util.h"
#include "url.h"
#include "template.h"

/*
 *  urlref_is_usable - Returns nonzero if one line of URL-References data
 *  is worth trying to resolve, or zero if the line should be skipped.
 */
static int urlref_is_usable(const char *ref)
{
	/* Remove poorly formatted lines */
	if (strchr(ref, '=') || strchr(ref, ' ') || strchr(ref, '<'))
		return 0;

	/* Skip mailto: links; compare the whole prefix, colon included */
	if (strncmp(ref, "mailto:", strlen("mailto:")) == 0)
		return 0;

	/* Skip "http:" references that lack the "://" separator */
	if (strncmp(ref, "http:", strlen("http:")) == 0 &&
	    strstr(ref, "://") == NULL)
		return 0;

	return 1;
}

/*
 *  resolve_urlref - Builds in `out' (capacity `outsz' bytes, including
 *  the terminating NUL) the URL that the non-empty reference `ref'
 *  names when it is found in the document whose URL is `base':
 *
 *    - a reference containing "://" is copied unchanged;
 *    - a reference starting with '/' replaces everything after the
 *      "scheme://host" prefix of `base';
 *    - any other reference replaces everything after the last '/'
 *      of `base'.
 *
 *  Neither input string is modified.  Returns 1 on success.  Returns 0,
 *  leaving `out' unspecified, if the result does not fit in `out' or if
 *  `base' cannot be used to resolve the reference.
 */
static int resolve_urlref(const char *base, const char *ref,
			  char *out, size_t outsz)
{
	const char *sep, *auth, *cut;
	size_t keep, add_slash;
	size_t reflen = strlen(ref);

	if (strstr(ref, "://") != NULL) {
		/* Is this URL ok as-is?  If so, save it */
		if (reflen >= outsz)
			return 0;
		memcpy(out, ref, reflen + 1);
		return 1;
	}

	if (base == NULL)
		return 0;

	/*
	 *  Find where the host part of the base starts, so that the
	 *  slashes of "://" are never mistaken for path separators.
	 */
	sep = strstr(base, "://");
	auth = (sep != NULL) ? sep + 3 : base;

	if (ref[0] == '/') {
		/* This URL is relative to the top of the base URL */
		if (sep == NULL)
			return 0;	/* no scheme://host to anchor to */
		cut = strchr(auth, '/');
		add_slash = 0;		/* ref supplies the leading '/' */
	} else {
		/* This URL is relative to the directory of the base URL */
		cut = strrchr(auth, '/');
		add_slash = 1;
	}

	/* With no '/' after the host, the whole base is the prefix */
	keep = (cut != NULL) ? (size_t) (cut - base) : strlen(base);

	/* Refuse anything that would not fit rather than overflow `out' */
	if (keep + add_slash + reflen >= outsz)
		return 0;

	memcpy(out, base, keep);
	if (add_slash)
		out[keep++] = '/';
	memcpy(out + keep, ref, reflen + 1);
	return 1;
}

/*
 *  print_urlrefs - Prints to stdout, one per line, the URL of every
 *  usable line in the "URL-References" attribute of template `t',
 *  after resolving it against t->url and passing it through url_open().
 *  Lines that are rejected by urlref_is_usable(), cannot be resolved,
 *  or are refused by url_open() are silently skipped.
 *
 *  The attribute value is split into lines in place (each newline is
 *  overwritten with a NUL), so it is modified by this call.
 */
static void print_urlrefs(Template *t)
{
	AVPair *avp;
	char *line, *next, url[BUFSIZ];
	URL *up;

	if ((avp = extract_AVPair(t->list, "URL-References")) == NULL)
		return;
	if (avp->value == NULL)
		return;

	/*
	 *  For each line in the data, grab the URL.  The lines are split
	 *  by hand so the loop does not depend on strtok()'s hidden
	 *  static state staying intact across the calls made below.
	 */
	for (line = avp->value; line != NULL; line = next) {
		next = strchr(line, '\n');
		if (next != NULL)
			*next++ = '\0';

		if (line[0] == '\0')
			continue;	/* blank line */
		if (!urlref_is_usable(line))
			continue;
		if (!resolve_urlref(t->url, line, url, sizeof(url)))
			continue;

		/* Parse the resolved URL and print it if ok */
		if ((up = url_open(url)) != NULL) {
			printf("%s\n", up->url);
			url_close(up);
		}
	}
}


/*
 *  main - Parses templates from stdin until parse_template() returns
 *  NULL.  For each template it first prints the URLs produced by
 *  print_urlrefs() and then the template's own URL, one per line on
 *  stdout, and frees the template.  The command-line arguments are
 *  not used.  Always returns 0.
 */
int main(int argc, char *argv[])
{
	Template *template;

	init_parse_template_file(stdin);
	while ((template = parse_template()) != NULL) {
		print_urlrefs(template);
		printf("%s\n", template->url);
		free_template(template);
	}
	finish_parse_template();

	/* Returning from main is equivalent to exit(0) and needs no <stdlib.h> */
	return 0;
}