File: analyze-traffic.pl

package info (click to toggle)
inn2 2.5.4-3
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 11,720 kB
  • ctags: 8,983
  • sloc: ansic: 92,499; sh: 13,509; perl: 12,921; makefile: 2,985; yacc: 842; python: 342; lex: 255
file content (185 lines) | stat: -rwxr-xr-x 6,958 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/usr/bin/perl -w

########################################################################
#
# analyze-traffic.pl
#
# Written by Jeffrey M. Vinocur <jeff@litech.org>
# This work is hereby placed in the public domain by its author.
#
# Script for keeping track of which newsgroups are receiving the most
# traffic (by article count or byte usage), and which peer is most
# responsible for the traffic in each high-traffic group.
#
########################################################################
#
# Usage:
#
# 1.  Add an entry in $pathetc/newsfeeds like the one below, and issue
#     `ctlinnd reload newsfeeds traffic` (you can change the path to 
#     whatever you like).
#
#             analyze!\
#                 :*\
#                 :Tf,WgsbmnN:/usr/local/news/log/traffic
#
#     You may find it useful to restrict the articles being logged,
#     either by modifying the wildmat pattern, or by using the /exclude
#     notation to indicate articles that have passed through some
#     servers should not be included.
#
#     Also, if $USE_ALL_GROUPS (see below) is false, you can leave out
#     the N flag, which may eliminate some parsing errors.
#
# 2.  Wait for some data.
#
# 3.  Run analyze-traffic.pl on the logged data (you can pass the
#     filename as an argument or feed the data on standard input).
#     You probably want to pipe it into a file, `less`, or `tail` as the
#     output is a line for every group that has received an article
#     according to the input data.
#
#     There are some options hardcoded into the script below, under
#     "Constants" -- check for customization, if you like.
#
# 4.  Be sure to comment out the newsfeeds entry when done, or set
#     up some sort of log rotation, or INN will eventually fill up your
#     disk...
#
########################################################################
#
# Implementation notes and known bugs:
#
# - We try (if $USE_ALL_GROUPS is set, below) to count crossposted
#   articles towards each listed group (even ones not carried on the
#   server!),
#   but since some articles have funky Newsgroups headers, that can
#   backfire.  So parsing can fail, which usually results in the
#   relevant line being skipped, but occasionally can cause Perl to
#   issue warnings (and perhaps produce funny things in the output).
#
#   A workaround would be to repeat e.g. the Message-ID at the end of
#   the input format (i.e. WgsbmnNm), and then the script could read as
#   many lines as necessary until that ad hoc end-of-record marker
#   appeared.  I haven't found a need for this yet, though.
#
# - The input format is a sequence of lines, each containing a number of
#   space-separated fields.  Check newsfeeds(5) for what the semantics
#   are, but an example line (wrapped), for reference, looks like:
#
#       rec.aviation.military [space]
#       news-out.maxwell.syr.edu [space]
#       2796 [space]
#       <3Jvua.104184$My6.1642017@twister.tampabay.rr.com> [space]
#       @030247454E45524C31000016AD3100000004@ [space]
#       rec.aviation.military,rec.travel.usa-canada, [no space here]
#           sci.electronics.design,sci.econ,sci.environment
#
# - The output format is a sequence of lines, one for each newsgroup,
#   with three tab-separated fields.  They are sorted by either the
#   second or third field, depending on $SORT_BY_SIZE, below.  The first
#   field is the name of the newsgroup.  The second is the total number
#   of articles appearing in that newsgroup followed by, in parentheses,
#   the short name of the peer (see about $TLD_REGEX below) responsible
#   for the most articles and the percentage it made up.  The third is
#   the total number of kilobytes of (accepted) traffic in that
#   newsgroup, followed similarly by the peer responsible for the most
#   traffic in that group.  It looks something like this:
#   
#       news.lists.filters  1057 arts (63% syr)  7105.9 KB (36% cox)
#
#   The short names are made by taking the last component of the
#   (dot-separated) peer name that doesn't match /$TLD_REGEX/.  The idea
#   is that, for example, "isc.org" would be listed as "isc", and
#   "demon.co.uk" would be listed as "demon".  Adjust $TLD_REGEX as
#   needed to trim the top-level domains in your part of the world.
#
#   If your peers have very long short names, the output may look
#   somewhat funny.  Similar things can happen with newsgroup names, so
#   those longer than $FIELD1_WIDTH will be truncated to fit.  (You can
#   set $FIELD1_WIDTH to '' to skip this truncation, in which case the
#   first column will not be space-padded and the output will look a bit
#   ragged.)
#
########################################################################
#
# Constants:

# Tunable settings; see the usage notes in the header comment above.
my $USE_ALL_GROUPS = 1;  # if 0, use only group article is stored under
my $SORT_BY_SIZE = 1;    # if 0, sort output by number of articles
my $FIELD1_WIDTH = 30;   # maximum length of newsgroup name, '' for none

# Top-level domain labels stripped from peer names by trim() when
# building the short peer name shown in the output.
my $TLD_REGEX = '^(?:com|net|org|edu|gov|mil|ac|co|uk|au|ca|de)$';
                         # feel free to add any others as needed


########################################################################

use strict;

# Accumulated per-group statistics, keyed by newsgroup name.  Each value
# is a hashref: { count => N, bytes => N, peers => { peer => { count, bytes } } }.
my %stats;

while( <> ) {
    # One log line per article, space-separated fields in the WgsbmnN
    # order described in the header comment: storage group, peer,
    # byte count, Message-ID, storage token, then the (comma-separated,
    # possibly space-split) Newsgroups header.
    my ($group, $peer, $bytes, $id, $token, @Newsgroups) = split;
    # A well-formed line has a Newsgroups field (when we use it) or at
    # least a token; anything shorter is truncated/garbled input.
    next unless ($USE_ALL_GROUPS ? @Newsgroups : $token);  # bad input line

    # The Newsgroups header may contain embedded whitespace around the
    # commas, so split each fragment on commas with optional whitespace.
    my @groups = map { split /\s*,\s*/ } @Newsgroups;

    # Credit the article to every crossposted group, or just the storage
    # group when $USE_ALL_GROUPS is off (or parsing yielded no groups).
    foreach ($USE_ALL_GROUPS && @groups ? @groups : $group) {
        # Autovivify this group's record on first sight.
        my $s = $stats{$_} ||= { count => 0,
                                 bytes => 0,
                                 peers => {},
                                 };

        $s->{count}++;
        $s->{bytes} += $bytes;
        $s->{peers}->{$peer}->{count}++;
        $s->{peers}->{$peer}->{bytes} += $bytes;
    }
}


my $f = $SORT_BY_SIZE ? 'bytes' : 'count';

foreach (sort { $stats{$a}->{$f} <=> $stats{$b}->{$f} } (keys %stats)) {
    my %s = %{$stats{$_}};

    my ($topcount,$topcountwho) = &max('count', $s{peers});
    my ($topbytes,$topbyteswho) = &max('bytes', $s{peers});
    $topcountwho = &trim($topcountwho);
    $topbyteswho = &trim($topbyteswho);
    my $countf = int(100 * $topcount / $s{count});
    my $bytesf = int(100 * $topbytes / $s{bytes});
    my $kb = 0.1 * int($s{bytes} * 10 / 1024);

    my $ng = $FIELD1_WIDTH eq '' ? $_ : substr($_,0,$FIELD1_WIDTH);
    print +(sprintf("%-${FIELD1_WIDTH}s\t", $ng)),
          "$s{count} arts ($countf% $topcountwho)\t",
          "${kb} KB ($bytesf% $topbyteswho)\n";
}

1;

# Shorten a peer hostname for display: scan the dot-separated labels
# from the right and return the first one that is not a recognized
# top-level-domain label (per $TLD_REGEX).  If every label matches (or
# the name has no labels), fall back to returning the full name.
# E.g. "news.isc.org" -> "isc", "demon.co.uk" -> "demon".
sub trim {
    my ($name) = @_;

    my @labels = split(/\./, $name);
    while (@labels) {
        my $label = pop @labels;
        return $label unless $label =~ /$TLD_REGEX/o;
    }
    return $name;
}

# Find the peer with the largest value of $field (either 'count' or
# 'bytes') in a peers hash of the form { peer => { count, bytes } }.
# Returns a two-element list: the winning value and the peer's name.
# When no entry exceeds zero, returns (0, undef); ties go to whichever
# peer the hash happens to yield first.
sub max {
    my ($field, $listref) = @_;

    my $best   = 0;
    my $winner;
    foreach my $peer (keys %{$listref}) {
        my $value = $listref->{$peer}->{$field};
        next unless $value > $best;
        $best   = $value;
        $winner = $peer;
    }

    return ($best, $winner);
}