File: parser.c

package info (click to toggle)
webdruid 0.5.4-5
  • links: PTS
  • area: main
  • in suites: lenny
  • size: 2,236 kB
  • ctags: 806
  • sloc: ansic: 10,823; sh: 2,763; makefile: 162
file content (589 lines) | stat: -rw-r--r-- 18,557 bytes parent folder | download | duplicates (8)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
/*
    The WebDruid - a web server log analysis program

    Copyright (C) 2003-2004  Fabien Chevalier (fabien@juliana-multimedia.com)

    Original webalizer copyright:
    Copyright (C) 1997-2001  Bradford L. Barrett (brad@mrunix.net)

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version, and provided that the above
    copyright and permission notice is included with all distributed
    copies of this or derived software.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA

    This software uses the gd graphics library, which is copyright by
    Quest Protein Database Center, Cold Spring Harbor Labs.  Please
    see the documentation supplied with the library for additional
    information and license terms, or visit www.boutell.com/gd/ for the
    most recent version of the library and supporting documentation.

    W3C log format support generously donated by Klaus Reimer <k@ailis.de>

*/

/*********************************************/
/* STANDARD INCLUDES                         */
/*********************************************/

#include "config.h"

#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>                           /* normal stuff             */
#include <ctype.h>
#include <sys/utsname.h>
#include <sys/times.h>

/* ensure getopt */
#ifdef HAVE_GETOPT_H
#include <getopt.h>
#endif

/* ensure sys/types */
#ifndef _SYS_TYPES_H
#include <sys/types.h>
#endif

/* some systems need this */
#ifdef HAVE_MATH_H
#include <math.h>
#endif

/* SunOS 4.x Fix */
#ifndef CLK_TCK
#define CLK_TCK _SC_CLK_TCK
#endif

#include "webdruid.h"                        /* main header              */
#include "lang.h"
#include "parser.h"

/* forward declarations of the file-local helpers defined below */

static void fmt_logrec(char *buffer);
static int  parse_record_clf(char *buffer, struct log_struct *log_record);
static int  parse_record_w3c(char *buffer, struct log_struct *log_record);

/* lower-case month abbreviations matched against the CLF timestamp;
   kept in English on purpose, they must not follow the UI language */

static char *log_month[12] = {
   "jan", "feb", "mar", "apr", "may", "jun",
   "jul", "aug", "sep", "oct", "nov", "dec"
};

/* scratch buffer the CLF parser copies the current line into so it can
   be shown in debug output */
static char tmp_buf[BUFSIZE];

/*********************************************/
/* FMT_LOGREC - terminate log fields w/zeros */
/*********************************************/

/*
 * Split a log line into fields in place.
 *
 * Every space or tab that is not inside double quotes, square brackets
 * or parentheses is overwritten with '\0'.  A double quote toggles the
 * "quoted" state; brackets and parentheses are counted (never below
 * zero) and are ignored entirely while inside quotes.
 */
static void fmt_logrec(char *buffer)
{
   int  in_quotes     = 0;   /* non-zero between a pair of '"'      */
   int  bracket_depth = 0;   /* number of currently open '['        */
   int  paren_depth   = 0;   /* number of currently open '('        */
   char *pos;

   for (pos = buffer; *pos != '\0'; pos++)
   {
      const char ch = *pos;

      if (ch == '"')
      {
         in_quotes = !in_quotes;
         continue;
      }

      /* nothing inside a quoted string is structural */
      if (in_quotes) continue;

      if (ch == '[')
         bracket_depth++;
      else if (ch == ']')
      {
         if (bracket_depth > 0) bracket_depth--;
      }
      else if (ch == '(')
         paren_depth++;
      else if (ch == ')')
      {
         if (paren_depth > 0) paren_depth--;
      }
      else if ((ch == ' ' || ch == '\t') &&
               bracket_depth == 0 && paren_depth == 0)
      {
         /* field separator at top level: terminate the field here */
         *pos = '\0';
      }
   }
}

/*********************************************/
/* PARSE_RECORD - uhhh, you know...          */
/*********************************************/

/*
 * Parse one raw log line into *log_record.
 *
 * The record is zeroed first, then the line is handed to the parser
 * selected by the global log_type (CLF is used for LOG_CLF and for any
 * unrecognised value, W3C for LOG_W3C).  The handlers split `buffer`
 * in place, so its contents are modified.
 *
 * NOTE the inverted result: returns 0 when the handler reported the
 * record as usable (handler returned 1) and 1 in every other case.
 */
int parse_record(char *buffer, struct log_struct * log_record)
{
   int ret;

   /* clear out structure; this also resets year/month/day/hour/min/sec */
   memset(log_record, 0, sizeof(struct log_struct));

#ifdef USE_DNS
   /* always clear the address of the record we were given, never some
      other object */
   #ifdef USE_IPV6
   memset(&log_record->addr,0,sizeof(struct sockaddr_storage));
   #else
   memset(&log_record->addr,0,sizeof(struct in_addr));
   #endif
#endif

   /* call appropriate handler */
   switch (log_type)
   {
      default:
      case LOG_CLF:   ret = parse_record_clf(buffer, log_record);   break; /* clf   */
      case LOG_W3C:   ret = parse_record_w3c(buffer, log_record);   break; /* w3c   */
   }

   /* handlers return 1 for a usable record; callers of this function
      expect 0 for success */
   return (ret == 1) ? 0 : 1;
}

/*********************************************/
/* PARSE_RECORD_CLF - web log handler        */
/*********************************************/

/*
 * Parse one line in Common Log Format (with optional referrer and
 * user-agent fields appended) into *log_record.
 *
 * `buffer` is split in place by fmt_logrec() and partly rewritten while
 * the user name is extracted, so the caller's line is modified.
 * Oversized fields are truncated to the destination size; a warning is
 * printed to stderr when `verbose` is set.
 *
 * Returns 1 if the record may be valid, 0 if it must be ignored.  A
 * record rejected by the final date check also increments total_bad;
 * the earlier structural rejections do not.
 */
static int parse_record_clf(char *buffer, struct log_struct * log_record)
{
   int size, i;
   char *cp1, *cp2, *cpx, *eob, *eos;
   char datetime[29];  /* raw timestamp        */

   /* The timestamp is inspected at fixed offsets further down; zero the
      whole array so a short field never exposes indeterminate bytes. */
   memset(datetime, 0, sizeof(datetime));

   size = strlen(buffer);                 /* get length of buffer        */
   eob = buffer+size;                     /* calculate end of buffer     */
   fmt_logrec(buffer);                    /* separate fields with \0's   */

   /* HOSTNAME */
   cp1 = cpx = buffer; cp2=log_record->hostname;
   eos = (cp1+MAXHOST)-1;
   if (eos >= eob) eos=eob-1;

   while ( (*cp1 != '\0') && (cp1 != eos) ) *cp2++ = *cp1++;
   *cp2 = '\0';
   if (*cp1 != '\0')
   {
      if (verbose)
      {
         fprintf(stderr,"%s",_("Warning: Truncating oversized hostname"));
         if (debug_mode) fprintf(stderr,": %s\n",cpx);
         else fprintf(stderr,"\n");
      }
      while (*cp1 != '\0') cp1++;
   }
   if (cp1 < eob) cp1++;

   /* skip next field (ident) */
   while ( (*cp1 != '\0') && (cp1 < eob) ) cp1++;
   if (cp1 < eob) cp1++;

   /* IDENT (authuser) field */
   cpx = cp1;
   cp2 = log_record->ident;
   eos = (cp1+MAXIDENT-1);
   if (eos >= eob) eos=eob-1;

   /* Copy everything up to the '[' that opens the timestamp.  A user
      name containing spaces was split by fmt_logrec(), so terminators
      met on the way are turned back into spaces (in the source too). */
   while ( (*cp1 != '[') && (cp1 < eos) )
   {
      if (*cp1=='\0') *cp1=' ';
      *cp2++=*cp1++;
   }
   *cp2='\0';                             /* cp2 stays on the terminator */

   if (cp1 >= eob) return 0;

   /* check if oversized username */
   if (*cp1 != '[')
   {
      if (verbose)
      {
         fprintf(stderr,"%s",_("Warning: Truncating oversized username"));
         if (debug_mode) fprintf(stderr,": %s\n",cpx);
         else fprintf(stderr,"\n");
      }
      while ( (*cp1 != '[') && (cp1 < eob) ) cp1++;
   }

   /* strip trailing space(s); never step in front of the ident array,
      which would happen for an empty or all-blank field otherwise */
   while ( (cp2 > log_record->ident) && (*(cp2-1) == ' ') ) *--cp2='\0';

   /* date/time string */
   cpx = cp1;
   cp2 = datetime;
   eos = (cp1+28);
   if (eos >= eob) eos=eob-1;

   while ( (*cp1 != '\0') && (cp1 != eos) ) *cp2++ = *cp1++;
   *cp2 = '\0';
   if (*cp1 != '\0')
   {
      if (verbose)
      {
         fprintf(stderr,"%s",_("Warning: Truncating oversized date field"));
         if (debug_mode) fprintf(stderr,": %s\n",cpx);
         else fprintf(stderr,"\n");
      }
      while (*cp1 != '\0') cp1++;
   }
   if (cp1 < eob) cp1++;

   /* minimal sanity check on timestamp */
   if ( (datetime[0] != '[') ||
        (datetime[3] != '/') ||
        (cp1 >= eob))  return 0;

   /* HTTP request */
   cpx = cp1;
   cp2 = log_record->url;
   eos = (cp1+MAXURL-1);
   if (eos >= eob) eos = eob-1;

   while ( (*cp1 != '\0') && (cp1 != eos) ) *cp2++ = *cp1++;
   *cp2 = '\0';
   if (*cp1 != '\0')
   {
      if (verbose)
      {
         fprintf(stderr,"%s",_("Warning: Truncating oversized request field"));
         if (debug_mode) fprintf(stderr,": %s\n",cpx);
         else fprintf(stderr,"\n");
      }
      while (*cp1 != '\0') cp1++;
   }
   if (cp1 < eob) cp1++;

   /* the request must be a quoted string and something must follow it */
   if ( (log_record->url[0] != '"') ||
        (cp1 >= eob) ) return 0;

   /* response code */
   log_record->resp_code = atoi(cp1);

   /* xfer size */
   while ( (*cp1 != '\0') && (cp1 < eob) ) cp1++;
   if (cp1 < eob) cp1++;
   if (*cp1<'0'||*cp1>'9') log_record->xfer_size=0;
   else log_record->xfer_size = strtoul(cp1,NULL,10);

   /* done with CLF record  -- check if we have more stuff to read*/
   if (cp1 < eob)
   {
      while ( (*cp1 != '\0') && (*cp1 != '\n') && (cp1 < eob) ) cp1++;
      if (cp1 < eob) cp1++;
      /* get referrer if present */
      cpx = cp1;
      cp2 = log_record->refer;
      eos = (cp1+MAXREF-1);
      if (eos >= eob) eos = eob-1;

      while ( (*cp1 != '\0') && (*cp1 != '\n') && (cp1 != eos) ) *cp2++ = *cp1++;
      *cp2 = '\0';
      if (*cp1 != '\0')
      {
         if (verbose)
         {
            fprintf(stderr,"%s",_("Warning: Truncating oversized referrer field"));
            if (debug_mode) fprintf(stderr,": %s\n",cpx);
            else fprintf(stderr,"\n");
         }
         while (*cp1 != '\0') cp1++;
      }
      if (cp1 < eob) cp1++;

      /* user agent, silently truncated if too long */
      cpx = cp1;
      cp2 = log_record->agent;
      eos = cp1+(MAXAGENT-1);
      if (eos >= eob) eos = eob-1;

      while ( (*cp1 != '\0') && (cp1 != eos) ) *cp2++ = *cp1++;
      *cp2 = '\0';
   }

   /* keep a copy of the line for the debug message below.  strncpy does
      not terminate when the source fills the destination, so do it
      explicitly.  NOTE(review): buffer was already split by fmt_logrec()
      at this point, so the copy stops at the first terminator. */
   strncpy(tmp_buf, buffer, BUFSIZE);
   tmp_buf[BUFSIZE-1] = '\0';

   /* convert month name to lowercase (ctype functions need a value
      representable as unsigned char) */
   for (i=4; i<7; i++)
         datetime[i]=(char)tolower((unsigned char)datetime[i]);

   /* get year/month/day/hour/min/sec values    */
   log_record->month = 1;
   for (i=0; i<12; i++)
   {
         if (strncmp(log_month[i],&datetime[4],3)==0)
         { log_record->month = i+1; break; }
   }

   log_record->year = atoi(&datetime[8]);    /* get year number (int)   */
   log_record->day  = atoi(&datetime[1]);    /* get day number          */
   log_record->hour = atoi(&datetime[13]);   /* get hour number         */
   log_record->min  = atoi(&datetime[16]);   /* get minute number       */
   log_record->sec  = atoi(&datetime[19]);   /* get second number       */

   /* minimal sanity check on date; i>=12 means no month name matched */
   if ((i>=12)||(log_record->min>59)||(log_record->sec>59)||(log_record->year<1990))
   {
         total_bad++;                /* if a bad date, bump counter      */
         if (verbose)
         {
            fprintf(stderr,"%s: %s [%lu]",
               _("Error: bad record (bad date)"),datetime,total_rec);
            if (debug_mode)
               fprintf(stderr,":\n%s\n",tmp_buf);
            else
               fprintf(stderr,"\n");
         }
         return 0;                   /* and ignore this record           */
   }

   return 1;     /* maybe a valid record, return with TRUE */
}

/*********************************************/
/* PARSE_RECORD_W3C - w3c log handler        */
/*********************************************/

/*
   Patch: webalizer-w3c
   For: Webalizer 2.01.10
   From: Klaus Reimer <k@ailis.de>
   Desc: Implement W3C extended log file format support.
   Version: 3

   This patch adds W3C extended log file format support to webalizer.
   This log file format is used by Microsoft's IIS web server, but other
   programs may also use it, as the format was defined by the W3C. With this
   patch webalizer can read the W3C headers and parse the data lines according
   to the specified Fields header. Format switching within a single logfile is
   supported. IIS uses this format switching to write process accounting
   information to the same logfile. These accounting lines produce
   "bad record" warnings because they do not hold any information usable by
   webalizer. You can just ignore them, or disable process accounting logs in
   the IIS configuration if you don't need them.

   IIS seems to be very buggy. I found three bugs in the current IIS version
   which affect logfiles:
   1. If output buffering is enabled in ASP scripts, the sc-bytes field is
      always 0. This prevents accurate measurement of download traffic.
   2. Sometimes IIS writes nonsense into the log, so the number of entries
      does not conform to the number of fields specified in the logfile
      header. This patch does its best to filter out these lines.
   3. IIS writes a lot of null characters to the end of each logfile. These
      lines are ignored by this patch.

   Known Bugs in this patch: The user-agent field is not completely
   "urldecoded". Only the '+' signs are replaced by space characters. Other
   url-encoded characters are not touched.
*/

/*
 * Column layout of the current W3C log format.
 *
 * Each member holds the 1-based position of a column as announced by
 * the most recent "#Fields:" header.  The W3C parser zeroes the whole
 * structure before reading a header and numbers columns from 1, so a
 * value of 0 means the column is not present.
 */
struct  field_index_struct
{
   int date;     /* position of "date"                                 */
   int time;     /* position of "time"                                 */
   int ip;       /* position of "c-ip"                                 */
   int username; /* position of "cs-username"                          */
   int method;   /* request method; not assigned by the header parser  */
   int url;      /* position of "cs-uri-stem"                          */
   int query;    /* position of "cs-uri-query"                         */
   int status;   /* position of "sc-status"                            */
   int size;     /* position of "sc-bytes"                             */
   int referer;  /* position of "cs(Referer)"                          */
   int agent;    /* position of "cs(User-Agent)"                       */
   int fields;   /* total number of columns in this format             */
};

/*
 * Per-line view of a W3C data record.
 *
 * Each member points at the start of the matching column inside the
 * already split line buffer, or is NULL when the current format does
 * not contain that column.  Nothing here owns memory.
 */
struct  fields_struct
{
   char *date;     /* date column                */
   char *time;     /* time column                */
   char *ip;       /* client address column      */
   char *username; /* authenticated user column  */
   char *url;      /* URI stem column            */
   char *query;    /* URI query string column    */
   char *status;   /* response status column     */
   char *size;     /* bytes sent column          */
   char *referer;  /* referrer column            */
   char *agent;    /* user agent column          */
};

/*
 * Parse one line of a W3C extended format log into *log_record.
 *
 * `buffer` is split in place by fmt_logrec().  A line starting with
 * '#' is a header: a "#Fields:" header replaces the remembered column
 * layout, every other header is skipped.  The layout lives in a
 * function-local static, so it carries over to the following calls.
 * A data line is accepted only if its number of columns equals the
 * number announced by the last "#Fields:" header and a URL column is
 * present.
 *
 * Returns 1 when a data record was stored, 0 for empty lines, header
 * lines and rejected data lines.
 */
static int parse_record_w3c(char *buffer, struct log_struct * log_record)
{
   int size;
   char *eob;
   char *cp;
   int index;
   static struct field_index_struct field_index;
   struct fields_struct fields;

   size = strlen(buffer);                 /* get length of buffer        */
   eob = buffer + size;                   /* calculate end of buffer     */
   fmt_logrec(buffer);                    /* separate fields with \0's   */

   cp = buffer;

   /* Check if the line is empty or a line suffers from the IIS
      Null-Character bug and abort parsing if found. */
   if (*cp == '\0') return 0;

   /* If it's a header line ignore it or parse the Fields header if found */
   if (*cp == '#')
   {
      cp++;
      if (!strcmp(cp, "Fields:"))
      {
         /* Reset the field indices */
         memset(&field_index, 0, sizeof(struct field_index_struct));
         while (*cp) cp++;
         cp++;
         index = 1;
         while ((cp < eob) && (*cp != '\r') && (*cp != '\n'))
         {
            /* Set the field index */
            if (!strcmp(cp, "date")) field_index.date = index;
            if (!strcmp(cp, "time")) field_index.time = index;
            if (!strcmp(cp, "c-ip")) field_index.ip = index;
            if (!strcmp(cp, "cs-uri-stem")) field_index.url = index;
            if (!strcmp(cp, "cs-uri-query")) field_index.query = index;
            if (!strcmp(cp, "sc-status")) field_index.status = index;
            if (!strcmp(cp, "cs(Referer)")) field_index.referer = index;
            if (!strcmp(cp, "sc-bytes")) field_index.size = index;
            if (!strcmp(cp, "cs(User-Agent)")) field_index.agent = index;
            if (!strcmp(cp, "cs-username")) field_index.username = index;

            /* Continue with the next field */
            while (*cp) cp++;
            cp++;
            index++;
         }
         field_index.fields = index -1;
      }

      /* Return because this header line is completely parsed */
      return 0;
   }

   /* A data line has been found */

   /* Check that the number of entries in this line conforms to the
      format specified in the header */
   index = 1;
   while ((cp < eob) && (*cp != '\r') && (*cp != '\n')) {
      while (*cp) cp++;
      cp++;
      index++;
   }
   if (index-1 != field_index.fields) return 0;

   /* Reset pointer */
   cp = buffer;

   /* Reset the field pointers and begin parsing the data line */
   memset(&fields, 0, sizeof(struct fields_struct));
   index = 1;
   while ((cp < eob) && (*cp != '\r') && (*cp != '\n'))
   {
      /* Set the field pointers; absent columns have index 0 and can
         never match because counting starts at 1 */
      if (index == field_index.date) fields.date = cp;
      if (index == field_index.time) fields.time = cp;
      if (index == field_index.ip) fields.ip = cp;
      if (index == field_index.url) fields.url = cp;
      if (index == field_index.query) fields.query = cp;
      if (index == field_index.status) fields.status = cp;
      if (index == field_index.referer) fields.referer = cp;
      if (index == field_index.size) fields.size = cp;
      if (index == field_index.agent) fields.agent = cp;
      if (index == field_index.username) fields.username = cp;

      /* Continue with the next data field */
      while (*cp) cp++;
      cp++;
      index++;
   }

   /* Save URL (with the query string appended unless it is "-").
      strncpy() does not terminate on truncation, so every bounded copy
      below sets the last byte itself instead of relying on the caller
      having zeroed the record. */
   if (fields.url)
   {
      if (fields.query && (fields.query[0]!='-'))
         snprintf(log_record->url, MAXURL, "%s?%s", fields.url, fields.query);
      else
      {
         strncpy(log_record->url, fields.url, MAXURL - 1);
         log_record->url[MAXURL - 1] = '\0';
      }
   } else return 0;

   /* Save hostname */
   if (fields.ip)
   {
      strncpy(log_record->hostname, fields.ip, MAXHOST - 1);
      log_record->hostname[MAXHOST - 1] = '\0';
   }

   /* Save response code */
   if (fields.status) log_record->resp_code = atoi(fields.status);

   /* Save referer */
   if (fields.referer)
   {
      strncpy(log_record->refer, fields.referer, MAXREF - 1);
      log_record->refer[MAXREF - 1] = '\0';
   }

   /* Save transfer size */
   if (fields.size) log_record->xfer_size = strtoul(fields.size, NULL, 10);

   /* Save user agent; only '+' is decoded (to a space), other
      url-encoded characters are left as they are */
   if (fields.agent)
   {
      cp = fields.agent;
      while (*cp)
      {
        if (*cp == '+') *cp = ' ';
        cp++;
      }
      strncpy(log_record->agent, fields.agent, MAXAGENT - 1);
      log_record->agent[MAXAGENT - 1] = '\0';
   }

   /* Save auth username */
   if (fields.username)
   {
      strncpy(log_record->ident, fields.username, MAXIDENT - 1);
      log_record->ident[MAXIDENT - 1] = '\0';
   }

   /* Parse date (year-month-day) and save it; a missing '-' separator
      rejects the record */
   if (fields.date)
   {
      log_record->year = atoi(fields.date);
      while ((fields.date[0] != '\0') && (fields.date[0] != '-')) fields.date++;
      if (fields.date[0] == '\0') return 0;
      fields.date++;
      log_record->month = atoi(fields.date);
      while ((fields.date[0] != '\0') && (fields.date[0] != '-')) fields.date++;
      if (fields.date[0] == '\0') return 0;
      fields.date++;
      log_record->day = atoi(fields.date);
   }

   /* Parse time (hour:min:sec) and save it; a missing ':' separator
      rejects the record */
   if (fields.time)
   {
      log_record->hour = atoi(fields.time);
      while ((fields.time[0] != '\0') && (fields.time[0] != ':')) fields.time++;
      if (fields.time[0] == '\0') return 0;
      fields.time++;
      log_record->min = atoi(fields.time);
      while ((fields.time[0] != '\0') && (fields.time[0] != ':')) fields.time++;
      if (fields.time[0] == '\0') return 0;
      fields.time++;
      log_record->sec = atoi(fields.time);
   }

   return 1;
}