File: clfmerge.cpp

package info (click to toggle)
logtools 0.13e%2Bnmu3
  • links: PTS
  • area: main
  • in suites: forky, trixie
  • size: 324 kB
  • sloc: cpp: 1,110; sh: 116; makefile: 78
file content (356 lines) | stat: -rw-r--r-- 8,921 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
#include <stdio.h>
#include <unordered_map>
#include <stdlib.h>
#include <map>
#include <cstring>
#include <unistd.h>
#include <ctype.h>
#include <string>

#include "logtools.h"

// Size in bytes of the buffer a LogFile uses to hold one raw input line
// (the buffer for rewritten lines is sized relative to it).
#define BUF_SIZE 4096

using namespace std;
using namespace __gnu_cxx;

// Hash functor for NUL-terminated C-string keys.  Only the first two
// characters of the key contribute to the hash value; an empty key hashes
// to 0.
struct hashing_func {
  unsigned long operator()(const char *key) const
  {
    // Go through unsigned char so that bytes >= 0x80 are not sign-extended
    // into a huge hash value on platforms where char is signed.
    const unsigned long first = static_cast<unsigned char>(key[0]);
    if(first == 0)
      return 0; // empty key: key[1] would lie beyond the terminator
    const unsigned long second = static_cast<unsigned char>(key[1]);
    return (first << 8) + second;
  }
};

// Equality functor for C-string keys: two keys are equal when strcmp()
// reports that the characters they point at are the same, regardless of
// whether the pointers themselves are equal.
struct eqstr
{
  bool operator()(const char *s1, const char *s2) const
  {
    const int difference = strcmp(s1, s2);
    return !difference;
  }
};

// Lookup table from three-letter month abbreviation (e.g. "Jan") to its
// two-digit number string (e.g. "01").  It is filled in by main() and read
// by LogFile::setDate().  Keys are compared by string content (eqstr).
unordered_map<const char *, const char *, hashing_func, eqstr> months;

// Reader for one input log file.  It holds a single "current" line at a time
// together with a date key extracted from that line, so that several files
// can be merged by comparing their current dates.
class LogFile
{
public:
  LogFile();
  // close the file if one was opened
  ~LogFile(){ if(m_fp) fclose(m_fp); }

  // open a file and read its first parseable line.
  // return 0 for success, 1 for minor error (no line could be read, e.g. a
  // 0 length file) and 2 for serious error (file could not be opened)
  int open(const char *name, bool domain_mangling);

  // get the date string of the current item
  const char *date() const { return m_date; }
  // get the full data for the current item.  This marks the item as consumed:
  // valid() returns false until getLine() succeeds again.
  const char *line() { m_valid = false; return m_line; }

  // get a line from the file and parse it (lines that can't be parsed are
  // skipped).  return 0 for success and 1 for EOF
  int getLine();

  // true while the current item has been read but not yet fetched via line()
  bool valid() const { return m_valid; }

  // when true, lines that can't be parsed are reported on stderr
  bool verbose = false;

private:
  bool m_valid;
  FILE *m_fp;
  // date of the current item as "yyyymmddhh:mm:ss" plus terminator
  char m_date[17] = {0};
  // the line as read from the file
  char m_lineBuf[BUF_SIZE];
  // the rewritten line used for domain mangling; it has room for the 7
  // characters of "http://" that may be added
  char m_lineBuf2[BUF_SIZE + 7];
  // points at whichever buffer holds the current item's text
  char *m_line;
  // true when domain mangling was requested in open()
  bool m_web_first = false;

  // store the date in numeric format so that strcmp() can be used to compare
  // dates.  Returns 1 for error and 0 for OK.
  int setDate();

  // not copyable: the object owns the FILE handle and m_line points into
  // its own buffers
  LogFile(const LogFile&) = delete;
  LogFile & operator=(const LogFile&) = delete;
};

// Construct a reader with no file attached and no valid item.  m_line refers
// to the raw line buffer, which starts out as an empty string.
LogFile::LogFile()
 : m_valid(false)
 , m_fp(NULL)
 , m_line(m_lineBuf)
{
  *m_lineBuf = '\0';
}

// Common log format:
// 1.2.3.4 - - [23/Aug/2000:12:00:32 +0200] etc
int LogFile::setDate()
{
  unsigned int i;
  // find '[' or abort if we can't
  for(i = 0; m_lineBuf[i] != '[' && m_lineBuf[i] != '\0'; i++)
  { }

  // if not enough data left for the full date then return
  if(i + 21 > strlen(m_lineBuf))
    return 1;
  memcpy(m_date, &m_lineBuf[i + 8], 4);
  char mon[4];
  memcpy(mon, &m_lineBuf[i + 4], 3);
  mon[3] = '\0';
  const char *m = months[mon];
  if(!m) return 1;
  strcpy(&m_date[4], m);
  memcpy(&m_date[6], &m_lineBuf[i + 1], 2);
  memcpy(&m_date[8], &m_lineBuf[i + 13], 8);
  m_date[16] = '\0';
  if(m_web_first) // make m_lineBuf2 have the data for the mangled line
  {
    unsigned int end_webname;
    // find where the domain name ends
    for(end_webname = 0; m_lineBuf[end_webname] != ' ' && m_lineBuf[end_webname] != '\0'; end_webname++)
    { }

    if(end_webname >= i)
      return 1;
    for(i = 0; i < end_webname; i++)
      m_lineBuf[end_webname] = tolower(m_lineBuf[end_webname]);

    // there will be more than 40 chars in between
    unsigned int start_url = end_webname + 40;
    // search for the start quote character
    for(; m_lineBuf[start_url] != '\"' && m_lineBuf[start_url] != '\0'; start_url++)
    { }
    // search for the space in the web request
    for(; m_lineBuf[start_url] != ' ' && m_lineBuf[start_url] != '\0'; start_url++)
    { }

    if(strlen(&m_lineBuf[start_url]) < 6) return 1;

    memcpy(m_lineBuf2, &m_lineBuf[end_webname + 1], start_url - end_webname);
    m_line = &m_lineBuf2[start_url - end_webname]; // m_line points to next char
    if(strncmp(&m_lineBuf[start_url + 1], "http://", 7))
    {
      strcpy(m_line, "http://");
      m_line += 7;
      memcpy(m_line, m_lineBuf, end_webname);
      m_line += end_webname;
      if(m_lineBuf[start_url + 1] != '/')
      {
        // if URL doesn't start with a '/' then we add one
        *m_line = '/';
        m_line++;
      }
    }
    strcpy(m_line, &m_lineBuf[start_url + 1]);
    m_line = m_lineBuf2;
  }
  return 0;
}

// Open the named file for reading and load its first parseable line.
// domain_mangling is remembered in m_web_first for use when lines are parsed.
// Returns 2 if the file can't be opened (a message is written to stderr),
// 1 if no line could be read from it and 0 on success.
int LogFile::open(const char *name, bool domain_mangling)
{
  m_web_first = domain_mangling;

  FILE *const stream = fopen(name, "r");
  m_fp = stream;
  if(stream == NULL)
  {
    fprintf(stderr, "Can't open %s.\n", name);
    return 2;
  }

  return getLine() ? 1 : 0;
}

// Read lines from the file until one can be parsed by setDate().
// On success the line becomes the current item, valid() becomes true and 0
// is returned.  Lines that can't be parsed are skipped (and reported on
// stderr when verbose is set).  Returns 1 when no more data can be read.
// Note that fgets() delivers a line longer than the buffer in several
// pieces, each of which is treated as a line of its own.
int LogFile::getLine()
{
  for(;;)
  {
    // if can't get more data then return 1
    if(!fgets(m_lineBuf, sizeof(m_lineBuf) - 1, m_fp))
      return 1;
    m_line = m_lineBuf;

    // Strip the trailing line-end characters.  This is done by hand rather
    // than with strtok(): strtok() leaves a line consisting only of line-end
    // characters unmodified and keeps hidden state between calls.
    size_t len = strlen(m_lineBuf);
    while(len > 0 && (m_lineBuf[len - 1] == '\n' || m_lineBuf[len - 1] == '\r'))
      m_lineBuf[--len] = '\0';

    // if setDate() returns 1 then we can't parse the line so we keep looping
    // if setDate() returns 0 then return success!
    if(!setDate())
    {
      m_valid = true;
      return 0;
    }
    if(verbose)
      fprintf(stderr, "Skipping bad line: %s\n", m_line);
  }
}

// Pointer-to-LogFile alias, used as the element type of the array of open
// input files.
using PLogFile = LogFile *;

int item_compare(const void *a, const void *b)
{
  const LogFile * const left = *(LogFile * const *)a;
  const LogFile * const right = *(LogFile * const *)b;
  return strcmp(left->date(), right->date());
}

// "Less than" functor for string keys, ordering them by strcmp() of their
// C-string form.  The arguments are taken by const reference so that a
// comparison doesn't copy both strings.
struct ltstr
{
  bool operator()(const std::string &s1, const std::string &s2) const
  {
    return strcmp(s1.c_str(), s2.c_str()) < 0;
  }
};

// Write the usage text and the program version to stderr, then terminate the
// program with exit code ERR_PARAM.  arg is printed as the program name.
void usage(const char *const arg)
{
  fprintf(stderr,
    "usage: %s [OPTION] [filenames]"
    "\n"
    "This program merges web logs in common log format into a single stream\n"
    "on standard output.  It reads from multiple input files and outputs the\n"
    "data in-order as much as is possible.  If there is only a single input\n"
    "file it will re-order it (with a 1000 line buffer size) to deal with web\n"
    "servers that output data out of order.\n"
    "\nVersion: " VERSION "\n",
    arg);
  exit(ERR_PARAM);
}

int main(int argc, char **argv)
{
  if(argc == 1)
    return 0;

  unsigned int map_items = 0;
  bool set_map_items = false, domain_mangling = false, verbose = false;
  int int_c;
  optind = 0;
  while(-1 != (int_c = getopt(argc, argv, "b:hdv")) )
  {
    switch(char(int_c))
    {
      case '?':
      case ':':
      case 'h':
        usage(argv[0]);
      break;
      case 'b':
        set_map_items = true;
        map_items = atoi(optarg);
      break;
      case 'd':
        domain_mangling = true;
      case 'v':
        verbose = true;
      break;
    }
  }
  months["Jan"] = "01";
  months["Feb"] = "02";
  months["Mar"] = "03";
  months["Apr"] = "04";
  months["May"] = "05";
  months["Jun"] = "06";
  months["Jul"] = "07";
  months["Aug"] = "08";
  months["Sep"] = "09";
  months["Oct"] = "10";
  months["Nov"] = "11";
  months["Dec"] = "12";

  multimap<const string, const string, ltstr> outputMap;

  LogFile **items = new PLogFile[argc - optind];

  unsigned int item_count = 0;
  int i;
  for(i = optind; i < argc; i++)
  {
    items[item_count] = new LogFile;
    items[item_count]->verbose = verbose;
    int rc = items[item_count]->open(argv[i], domain_mangling);
    // if rc==2 then file not found, if rc==1 then 0 length file
    if(rc > 1)
      return ERR_INPUT;
    if(rc == 1)
      delete items[item_count];
    else
      item_count++;
  }

  if(!set_map_items)
  {
    map_items = item_count * 400;
    if(map_items < 4000)
      map_items = 4000;
  }
  while(item_count > 1)
  {
    qsort(items, item_count, sizeof(LogFile *), item_compare);
    while(items[0]->valid() && strcmp(items[0]->date(), items[1]->date()) <= 0)
    {
      if(map_items > 0)
      {
        outputMap.insert(pair<string, string>(items[0]->date(), items[0]->line()));
        while(outputMap.size() > map_items)
        {
          printf("%s\n", outputMap.begin()->second.c_str());
          outputMap.erase(outputMap.begin());
        }
      }
      else
      {
        printf("%s\n", items[0]->line());
      }
      if(items[0]->getLine())
      {
        delete(items[0]);
        item_count--;
        items[0] = items[item_count];
        break;
      }
    }
  }
  if(item_count == 1)
  {
    if(map_items > 0)
    {
      do
      {
        outputMap.insert(pair<string, string>(items[0]->date(), items[0]->line()));
      } while(!items[0]->getLine() && outputMap.size() < map_items);

      if(items[0]->valid())
      {
        do
        {
          outputMap.insert(pair<string, string>(items[0]->date(), items[0]->line()));
          CPCCHAR tmp = outputMap.begin()->second.c_str();
          if(printf("%s\n", tmp) != int(strlen(tmp) + 1))
          {
            fprintf(stderr, "Can't write output!\n");
              return ERR_OUTPUT;
          }
          outputMap.erase(outputMap.begin());
        } while(!items[0]->getLine());
      }
      delete items[0];
      while(!outputMap.empty())
      {
        CPCCHAR tmp = outputMap.begin()->second.c_str();
        if(printf("%s\n", tmp) != int(strlen(tmp) + 1))
        {
          fprintf(stderr, "Can't write output!\n");
            return ERR_OUTPUT;
        }
        outputMap.erase(outputMap.begin());
      }
    }
    else
    {
      do
      {
        CPCCHAR tmp = items[0]->line();
        if(printf("%s\n", tmp) != int(strlen(tmp) + 1))
        {
          fprintf(stderr, "Can't write output!\n");
            return ERR_OUTPUT;
        }
      } while(!items[0]->getLine());
    }
  }
  delete items;
  return 0;
}