Category:
Author/Contact Info: Val Polyakov vpolyakov@katrillion.com
Description: A reporting module. Woo hoo!
#!/usr/bin/perl -w

package Report;

use strict;
use Compress::Zlib;

# ---------------------------------------------------------------------------
# Module-wide configuration.
# ---------------------------------------------------------------------------

# Directory holding the raw logs and directory holding the gzip'ed caches.
my $log_dir   = "/path/to/logs";
my $cache_dir = "/path/to/log_cache";

# Directories (relative to the current working directory) that hold the
# filter definition files and the map files.
my $flt_dir = 'flt';
my $map_dir = 'map';

# Known server names.  The list is kept on one line per entry so that no
# string literal can be split by line wrapping.
my @servers = (
        "server1",
        "server2",
        "server3",
        "server4",
        "server5",
        "server6",
);

# Maps a tag name to the file-name suffix of its cache files.
my %cache_map = (
        'all'      => '-total.cache.gz',
        'www'      => '-www.cache.gz',
        'channel1' => '-channel1.cache.gz',
        'channel2' => '-channel2.cache.gz',
        'channel3' => '-channel3.cache.gz',
);


# Constructor.
#
# Can be called as a class method (Report->new) or on an existing object
# ($obj->new), in which case the new object is blessed into the same class.
#
# The returned object has these defaults:
#   TAG       undef       (left undefined on purpose: _check_tags() uses
#                          "not defined" as the signal to default to 'all')
#   FLT       []          (compiled filter rules, filled by _load_filter())
#   FLT_FILE  'standard'  (name of the filter file)
#   DAILY / VERBOSE / HITS / LINES   0
#   START / END           yesterday's date as YYYY-MM-DD
sub new
{
        my $proto = shift;
        my $class = ref($proto) || $proto;

        # "Yesterday" is taken as 24 hours before now, in local time.
        my @date = localtime(time - (24 * 60 * 60));
        my $yesterday = sprintf("%d-%02d-%02d",
                $date[5] + 1900, $date[4] + 1, $date[3]);

        my $self = {
                TAG      => undef,
                # An empty array ref (rather than undef) so that code which
                # dereferences the rule list before any filter has been
                # loaded does not die under "use strict".
                FLT      => [],
                FLT_FILE => 'standard',
                DAILY    => 0,
                VERBOSE  => 0,
                HITS     => 0,
                LINES    => 0,
                START    => $yesterday,
                END      => $yesterday,
        };

        return bless($self, $class);
}

# Validate the object's tag list.
#
# If no tags have been set (TAG is undefined or an empty list) the list
# defaults to ('all').  An empty list must be treated like "unset": run()
# builds its file-name pattern from the tags, and an empty alternation
# would match every file in the cache directory.
#
# Dies with "Unrecognized tag: <tag>" if a tag is not a key of %cache_map.
sub _check_tags
{
        my $self = shift;

        if (!defined($self->{TAG}) || !@{$self->{TAG}})
        {
                $self->{TAG} = ['all'];
        }

        foreach my $tag (@{$self->{TAG}})
        {
                if (!defined($tag)) { die "Unrecognized tag: (undef)\n" }
                if (!exists($cache_map{$tag})) { die "Unrecognized tag: $tag\n" }
        }
        return;
}

# Validate the object's START and END dates.
#
# Both must be defined, have the form YYYY-MM-DD (zero padded), and START
# must not be later than END.  Zero padding is required because dates are
# compared as strings, both here and against the cache file names in run();
# an unpadded date such as 2003-1-5 would compare incorrectly.
#
# Dies with "Invalid date or date range: (START) -> (END)" otherwise.
sub _check_dates
{
        my $self = shift;
        my ($start, $end) = ($self->{START}, $self->{END});
        my $date_re = qr/\A\d{4}-\d{2}-\d{2}\z/;

        my $ok = defined($start) && defined($end)
                && $start =~ $date_re
                && $end   =~ $date_re
                && $start le $end;

        if (!$ok)
        {
                # Avoid "uninitialized" warnings while building the message.
                $start = '' unless defined($start);
                $end   = '' unless defined($end);
                die "Invalid date or date range: ($start) -> ($end)";
        }
        return;
}

# Load filter rules from the file "$flt_dir/<FLT_FILE>" and append them to
# the object's FLT list.
#
# File format: lines starting with '#' are comments.  Every other line that
# contains
#       accept|reject  <one whitespace>  =|!  <one whitespace>  <regex>
# becomes one rule, stored as [ compiled_regex, flag, action ].  Lines that
# do not match this shape are ignored.
#
# Dies if the filter file cannot be opened (the same style run() uses for
# the cache directory), and - via qr// - if a rule's regex is invalid.
# Returns 1 on success.
sub _load_filter
{
        my $self = shift;
        my $path = "$flt_dir/$self->{FLT_FILE}";

        # Make sure FLT is an array ref even if the file holds no rules.
        $self->{FLT} ||= [];

        open(my $fh, '<', $path)
                or die "Couldn't open filter file $path: $!\n";

        while (my $line = <$fh>)
        {
                next if $line =~ /^#/;

                my ($action, $flag, $pattern) =
                        $line =~ /(accept|reject)\s(=|!)\s(.*)/;
                next unless defined($action) && defined($flag) && defined($pattern);

                push @{$self->{FLT}}, [ qr/$pattern/, $flag, $action ];
        }
        close($fh);
        return 1;
}

# Configure the object from a command-line style argument list, then
# validate the tags and dates and load the filter file.
#
# Recognised options:
#   -t, --tag TAG        add a tag (may be repeated)
#   -s, --start DATE     set the start date
#   -e, --end DATE       set the end date
#   -d, --date DATE      set both start and end date
#   -f, --filter NAME    set the filter file name
#   -a, --daily          set the DAILY flag
#   -v, --verbose        increase verbosity (may be repeated)
# Unrecognised arguments are silently skipped.
#
# Dies with a "Missing ..." message when an option that takes a value is
# the last argument, and with whatever _check_tags(), _check_dates() or
# _load_filter() die with.
sub init
{
        my $self = shift;

        # A lexical is used instead of $_ so the caller's $_ is not clobbered.
        while (defined(my $opt = shift))
        {
                if ($opt eq '-t' || $opt eq '--tag')
                {
                        # Fetch the value first: "push LIST, shift or die"
                        # binds as "push(...) or die" and could never die.
                        my $tag = shift or die "Missing tag!\n";
                        push @{$self->{TAG}}, $tag;
                }
                elsif ($opt eq '-s' || $opt eq '--start')
                {
                        $self->{START} = shift or die "Missing start date!\n";
                }
                elsif ($opt eq '-e' || $opt eq '--end')
                {
                        $self->{END} = shift or die "Missing end date!\n";
                }
                elsif ($opt eq '-d' || $opt eq '--date')
                {
                        $self->{START} = $self->{END} = shift or die "Missing date!\n";
                }
                elsif ($opt eq '-f' || $opt eq '--filter')
                {
                        $self->{FLT_FILE} = shift or die "Missing filter!";
                }
                elsif ($opt eq '-a' || $opt eq '--daily')
                {
                        $self->{DAILY} = 1;
                }
                elsif ($opt eq '-v' || $opt eq '--verbose')
                {
                        $self->{VERBOSE}++;
                }
        }

        _check_tags($self);
        _check_dates($self);
        _load_filter($self);
}

# Return the list of tags; an empty list if no tags have been set yet
# (dereferencing an undefined TAG would die under "use strict").
sub get_tags { my $self = shift; return @{ $self->{TAG} || [] } }

# Accessor: the START date stored on the object.
sub get_start_date
{
        my ($self) = @_;
        return $self->{START};
}

# Accessor: the END date stored on the object.
sub get_end_date
{
        my ($self) = @_;
        return $self->{END};
}

# Return the current filter.
#
# In list context: the loaded rules, each an array ref of
# [ compiled_regex, flag, action ] (empty list if none are loaded).
# In scalar context: the filter file name (FLT_FILE).
#
# The rules live under the key FLT; that is where _load_filter() stores
# them, so that is the key read here.
sub get_filter
{
        my $self = shift;
        return wantarray ? @{ $self->{FLT} || [] } : $self->{FLT_FILE};
}

# Replace the object's tag list with the given tags and validate it.
#
# Called with no tags, the list is reset to "unset" (undef) so that
# _check_tags() applies its default of ('all').
#
# The former "(@)" prototype is dropped: prototypes are ignored on method
# calls and "(@)" imposes no constraint on plain calls either.
#
# Dies (via _check_tags) on an unrecognised tag.
sub set_tags
{
        my $self = shift;
        $self->{TAG} = @_ ? [@_] : undef;

        # The object must be passed explicitly; _check_tags() is called as a
        # plain function, not as a method.
        _check_tags($self);
        return;
}

# Set the start date, the end date, or both, then validate the range.
#
#   $type  'START' sets only the start date, 'END' only the end date;
#          anything else (including undef) sets both to $val.
#   $val   the date string.
#
# Dies (via _check_dates) if the resulting dates are invalid.
sub set_date
{
        my ($self, $type, $val) = @_;
        $type = '' unless defined($type);

        if    ($type eq 'START') { $self->{START} = $val }
        elsif ($type eq 'END')   { $self->{END} = $val }
        else                     { $self->{START} = $self->{END} = $val }

        # The object must be passed explicitly; _check_dates() is called as
        # a plain function, not as a method.
        _check_dates($self);
        return;
}

# Switch to a different filter file: store its name, discard the rules
# loaded so far and load the rules from the new file.
sub set_filter
{
        my ($self, $filter) = @_;
        $self->{FLT_FILE} = $filter;

        # Reset to an empty list rather than deleting the key, so FLT is a
        # valid array ref even when the new file contains no rules.
        $self->{FLT} = [];

        return _load_filter($self);
}

# Register a callback on the object.
#
# The callback is stored under the key "HOOK_<name>"; run() invokes the one
# stored as HOOK_PROC.  Returns the callback that was stored.
sub hook
{
        my ($self, $name, $callback) = @_;
        my $slot = "HOOK_" . $name;
        return $self->{$slot} = $callback;
}

# Load a pattern-to-value map from "$map_dir/<map_file>".
#
# File format: lines starting with '#' are comments.  Every other line of
# the form
#       <regex> TAB(s) <value>
# yields one entry [ compiled_regex, value ].  The value is the text after
# the last run of tabs on the line; lines without a tab are ignored.
#
# The split is done so that a run of several tabs (used to align columns)
# is consumed entirely by the separator; a greedy "(.*)\t+" would leave all
# but one of those tabs inside the regex.
#
# Returns the list of entries in file order.  Dies if the file cannot be
# opened (the same style run() uses for the cache directory), and - via
# qr// - if a regex is invalid.
sub load_map
{
        my ($self, $map_file) = @_;
        my @re_map;
        my $path = "$map_dir/$map_file";

        open(my $fh, '<', $path)
                or die "Couldn't open map file $path: $!\n";

        while (my $line = <$fh>)
        {
                next if $line =~ /^#/;

                my ($pattern, $val) = $line =~ /^(.*?)\t+([^\t\n]*)$/;
                next unless defined($pattern) && defined($val);

                push @re_map, [ qr/$pattern/, $val ];
        }
        close($fh);
        return @re_map;
}

# Format a number with thousands separators: 1234567 -> "1,234,567".
#
#   $num  the number to format.
#
# Only the leading integer part is grouped, so an optional sign and a
# fractional part are left intact ("-123456" -> "-123,456",
# "1234.56" -> "1,234.56").  Values that do not start with digits are
# returned unchanged, as is undef.
sub commanum {
        my ($self, $num) = @_;

        return $num unless defined($num);

        # Repeatedly peel the last three digits off the leading run of
        # digits until fewer than four remain in front of the first comma.
        my $formatted = "$num";
        1 while $formatted =~ s/^([-+]?\d+)(\d{3})/$1,$2/;

        return $formatted;
}

# Process the cache files and feed every accepted line to the PROC hook.
#
# Steps:
#   1. List $cache_dir and keep the files whose name ends in the cache
#      suffix of one of the object's tags (see %cache_map).
#   2. From each name take YYYY-MM-DD-<tag>; skip files whose date lies
#      outside START..END (string comparison of zero-padded dates).
#   3. Read the gzip'ed file line by line.  A data line is
#      "<views> TAB <url>".  The filter rules are tried in order; the first
#      rule that applies decides: 'accept' calls the hook registered as
#      HOOK_PROC with ($year, $month, $day, $tag, $url, $views), any other
#      action drops the line.  A rule with flag '=' applies when the URL
#      matches its regex, one with flag '!' when it does not.
#   4. LINES is incremented for every line read, parseable or not.
#
# Progress goes to STDERR depending on VERBOSE (>0: dates and files,
# >1: the filter rules, >2: a line count every 10000 lines).
#
# Dies if the cache directory or a cache file cannot be opened, or if a
# line is accepted while no PROC hook is registered.
sub run
{
        my $self = shift;

        # Work on copies that are always valid lists, so a missing filter or
        # tag list does not die on an undefined array reference.
        my @filters = @{ $self->{FLT} || [] };
        my @tags    = @{ $self->{TAG} || [] };
        @tags = ('all') unless @tags;

        if ($self->{VERBOSE}) { print STDERR "Using dates: $self->{START} -> $self->{END}\n" }
        if ($self->{VERBOSE} > 1)
        {
                print STDERR "Filters:\n";
                print STDERR "$$_[2]\t$$_[1]\t$$_[0]\n" foreach @filters;
        }

        opendir(my $dh, $cache_dir)
                or die "Couldn't open directory: $!\n";
        my @filelist = sort(readdir $dh);
        closedir $dh;

        # The suffixes are literal text, so quote them: an unquoted '.'
        # would match any character.
        my $suffixes = join('|', map { quotemeta($_) } @cache_map{@tags});
        my $file_pat = qr/(?:$suffixes)$/;

        # No special handling of '.' and '..' is needed: neither ends in a
        # cache suffix.  (Dropping the first two sorted entries is not safe,
        # because other names can sort before '.'.)
        foreach my $file (@filelist)
        {
                next unless $file =~ $file_pat;

                my ($year, $month, $day, $cur_tag) =
                        $file =~ /(\d{4})-(\d{2})-(\d{2})-(\w+)/
                        or next;        # name carries no date stamp

                my $date_stamp = "$year-$month-$day";
                next if $date_stamp lt $self->{START} || $date_stamp gt $self->{END};

                my $path = "$cache_dir/$file";
                if ($self->{VERBOSE}) { print STDERR "grabbing $path\n" }

                my $gz = gzopen($path, "rb")
                        or die "Couldn't open $path: $gzerrno\n";

                my $line;
                while ($gz->gzreadline($line) > 0)
                {
                        # Lines that are not "<views> TAB <url>" are counted
                        # but never reach the filters or the hook.
                        if (my ($views, $url) = $line =~ /([0-9]+)\t(\S+)/)
                        {
                                RULE:
                                foreach my $rule (@filters)
                                {
                                        my ($re, $flag, $action) = @$rule;
                                        my $hit = ($url =~ $re) ? 1 : 0;

                                        next RULE unless ($flag eq '=' && $hit)
                                                      || ($flag eq '!' && !$hit);

                                        if ($action eq 'accept')
                                        {
                                                my $proc = $self->{HOOK_PROC};
                                                defined($proc)
                                                        or die "No PROC hook registered\n";
                                                $proc->($year, $month, $day, $cur_tag, $url, $views);
                                        }
                                        last RULE;      # first applicable rule wins
                                }
                        }

                        $self->{LINES}++;
                        if ($self->{VERBOSE} > 2 && $self->{LINES} % 10000 == 0)
                        {
                                print STDERR "$self->{LINES} lines processed\n";
                        }
                }
                $gz->gzclose();
        }
        return;
}
1;
</readmore>