1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
|
/*
** This is a simple program used to retrieve an HTML document using
** HTTP. The program also fetches all images that the document
** references.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include "getpage.h"
#define stricmp strcasecmp
/*
** One Image record exists for every distinct image URL that has been
** requested.  The records are chained together through pNext so that
** a URL seen earlier can be found again.
*/
typedef struct Image {
  char *zUrl;            /* The URL this image was requested from */
  char *zLocal;          /* Name of the local file for the image */
  struct Image *pNext;   /* Next record in the list of all images */
} Image;
/*
** File-scope state shared by the handlers below.
**
** global_nErr is given an explicit "int" type: the implicit-int form
** "static global_nErr" is not valid C99 or later.
*/
static FILE *html;              /* Html output to this file. */
static int nImage = 0;          /* Number of images loaded so far */
static Image *pImage = 0;       /* List of all images */
static int global_nErr = 0;     /* System wide errors */
static char baseUrl[1000];      /* The base URL */
static int quiet = 0;           /* The quiet flag */
/*
** Make sure the given URL is loaded as a local file.  Return the
** name of the local file.
**
** If zUrl is already on the pImage list, the local name recorded for
** it is returned and nothing is fetched.  Otherwise a new Image record
** is allocated, given the name "imageN" (N is the incremented nImage
** counter), pushed onto the front of the list, and HttpFetch() is
** called for it.
**
** The returned pointer refers to storage inside the list record, so it
** stays valid after the caller releases zUrl.  The process exits with
** status 1 if memory cannot be allocated.
*/
static char *GetImage(char *zUrl){
  /* Bytes reserved after the URL copy for the generated local name */
  enum { LOCAL_NAME_SPACE = 100 };
  Image *p;
  size_t nUrl;            /* Length of zUrl, not counting the terminator */

  for(p=pImage; p; p=p->pNext){
    if( strcmp(p->zUrl,zUrl)==0 ){
      return p->zLocal;
    }
  }

  /* A single allocation holds the record, the URL copy and the local
  ** file name, in that order. */
  nUrl = strlen(zUrl);
  p = malloc( sizeof(*p) + nUrl + LOCAL_NAME_SPACE );
  if( p==0 ){
    fprintf(stderr,"out of memory\n");
    exit(1);
  }
  p->zUrl = (char*)&p[1];
  memcpy(p->zUrl, zUrl, nUrl+1);
  p->zLocal = &p->zUrl[nUrl+1];
  /* The URL terminator uses one of the reserved bytes, which leaves
  ** LOCAL_NAME_SPACE-1 bytes for the local name. */
  snprintf(p->zLocal, LOCAL_NAME_SPACE-1, "image%d", ++nImage);
  p->pNext = pImage;
  pImage = p;

  /* NOTE(review): the result of HttpFetch() is ignored here, so a
  ** failed image download is not reported -- confirm this is intended. */
  HttpFetch(zUrl, p->zLocal, quiet, 0, 0);
  return p->zLocal;
}
/*
** Print a usage comment on stderr and exit with status 1.  This
** routine does not return.
**
** argv0 is the program name to show in the message.  The message
** lists the -quiet option because main() accepts it.
*/
void usage(char *argv0){
  fprintf(stderr,"Usage: %s [-quiet] URL\n",argv0);
  exit(1);
}
/*
** Handle anything that isn't markup.  main() registers this routine
** for words, white space and comments.
**
** zText is copied to the output file unchanged.  It is written with
** fputs() and never used as a printf format, so a '%' in the document
** is output literally instead of being treated as a conversion.
*/
static void WordHandler(const char *zText, void *notUsed){
  (void)notUsed;
  fputs(zText, html);
}
/*
** Write one markup element to the output file.  This is the handler
** for every tag that needs no special treatment.
**
** argv[0] is the tag name.  The entries after it are taken two at a
** time as an attribute name followed by its value, and each pair is
** written as name="value".  A final entry with no partner is not
** written.
*/
static void DefaultMarkup(int argc, const char **argv, void *notUsed){
  int k = 1;

  fprintf(html, "<%s", argv[0]);
  while( k+1<argc ){
    const char *zName = argv[k];
    const char *zValue = argv[k+1];
    fprintf(html, " %s=\"%s\"", zName, zValue);
    k += 2;
  }
  fputs(">", html);
}
/*
** Handler for <IMG> markup
*/
static void ImageMarkup(int argc, const char **argv, void *notUsed){
int i;
for(i=1; i<argc-1; i+=2){
if( stricmp(argv[i],"src")==0 ){
const char *azUrl[2];
char *zResolved;
azUrl[0] = argv[i+1];
azUrl[1] = 0;
zResolved = ResolveUrl(baseUrl, azUrl);
if( !quiet ){
printf("Resolved: (%s) (%s) -> (%s)\n",baseUrl, azUrl[0], zResolved);
}
argv[i+1] = GetImage(zResolved);
/* printf("%s -> %s -> argv[i+1]\n",argv[i+1], zResolved); */
free(zResolved);
}
}
DefaultMarkup(argc, argv, 0);
}
/*
** Handler for <BASE> markup.
**
** The value of each "href" attribute (matched without regard to case)
** is copied into baseUrl.  Nothing is written to the output file.
**
** The copy is bounded by snprintf(), so a value that does not fit is
** truncated and always zero-terminated.  The previous
** sprintf("%.*s", sizeof(baseUrl), ...) passed a size_t where an int
** precision is required and could write one byte past the end of
** baseUrl.
*/
static void BaseMarkup(int argc, const char **argv, void *notUsed){
  int i;
  (void)notUsed;
  for(i=1; i<argc-1; i+=2){
    if( stricmp(argv[i],"href")==0 ){
      if( !quiet ){
        printf("Base Href=%s\n",argv[i+1]);
      }
      snprintf(baseUrl, sizeof(baseUrl), "%s", argv[i+1]);
    }
  }
}
/*
** Name of a temporary file.
**
** main() passes this name to HttpFetch() for the base page
** (presumably as the file to store the page in -- confirm against
** getpage.h) and then reopens it for reading as the parser input.
** main() unlinks the file only when the fetch fails; the unlink after
** a successful open is commented out.
*/
static char zTemp[] = "index.html.orig";
/*
** The main routine.
**
** Command line:  PROGRAM [-quiet] URL
**
** "-quiet" sets the quiet flag.  Any other argument beginning with
** '-' produces the usage message.  When several URLs are given the
** last one is used.
**
** The page is fetched into zTemp, which is then read back through the
** SGML parser.  The registered handlers write the result to
** "index.html".
**
** The process exits with status 1 if the page cannot be fetched or a
** file cannot be opened.  Otherwise the return value is global_nErr,
** which is incremented if closing the output file reports an error.
*/
int main(int argc, char **argv){
  int i;                /* Loop counter */
  int rc;               /* Result code */
  char *zUrl = 0;       /* The URL */
  FILE *in;             /* For reading the raw html */

  if( argc<2 ) usage(argv[0]);
  for(i=1; i<argc; i++){
    if( strcmp(argv[i],"-quiet")==0 ){
      quiet = 1;
    }else if( argv[i][0]=='-' ){
      usage(argv[0]);
    }else{
      zUrl = argv[i];
    }
  }
  if( zUrl==0 ) usage(argv[0]);

  /* Any result other than 200 is treated as a failed fetch. */
  rc = HttpFetch(zUrl, zTemp, quiet, sizeof(baseUrl), baseUrl);
  if( rc!=200 ){
    unlink(zTemp);
    fprintf(stderr,"Unable to fetch base page %s\n", zUrl);
    exit(1);
  }

  /* zTemp is not unlinked after this point, so it stays on disk. */
  in = fopen(zTemp,"r");
  if( in==0 ){
    perror("can't reopen temporary file!");
    exit(1);
  }
  html = fopen("index.html","w");
  if( html==0 ){
    /* Report first so that fclose() cannot disturb errno. */
    perror("can't open output file \"index.html\"");
    fclose(in);
    exit(1);
  }

  SgmlWordHandler(WordHandler);
  SgmlSpaceHandler(WordHandler);
  SgmlCommentHandler(WordHandler);
  SgmlDefaultMarkupHandler(DefaultMarkup);
  SgmlHandler("img", ImageMarkup);
  SgmlHandler("base", BaseMarkup);
  SgmlParse(in, 0);

  fclose(in);
  /* Buffered output is flushed by fclose(), so a failure here means
  ** index.html may be incomplete. */
  if( fclose(html)!=0 ){
    perror("error writing output file \"index.html\"");
    global_nErr++;
  }
  return global_nErr;
}
|