1: #!/usr/bin/perl
   2: 
   3: #############################
   4: #
   5: # A simple multi-format log parser which is intended
   6: # to be used as a filter.  Could be faster, but it does
   7: # allow you to define a pretty output format.
   8: #
   9: # Author: Chris Jensen
  10: #
  11: 
  12: use Getopt::Long;
  13: 
  14: my %optctl;
  15: GetOptions (\%optctl, "type|t=s", "pattern|p=s");
  16: 
  17: 
  18: my $log_formats = {
  19: 
  20:   'common'   => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) (\d+)},
  21:                  [ qw( _ h l u t r c b ) ] ],
  22: 
  23: 
  24:   'virtual'  => [ qr{(\S+) (\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) (\d+)},
  25:                  [ qw( _ v h l u t r c b ) ] ],
  26: 
  27:   'combined' => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) (\d+) \"([^\"]*)\" \"([^\"]*)\"},
  28:                  [ qw( _ h l u t r c b R A ) ] ],
  29: 
  30:   'referer'  => [ qr{(\S+) \-\> (\S+)},
  31:                  [ qw( _ R r ) ] ],
  32: 
  33:   'agent'    => [ qr{(\S+)},
  34:                  [ qw( _ A ) ] ],
  35: 
  36:   'extended' => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) (\d+) \"([^\"]*)\" \"([^\"]*)\" (\d+) (\d+)},
  37:                  [ qw( _ h l u t r c b R A P T ) ] ],
  38: 
  39:   'custom'   => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) (\d+) \"([^\"]*)\" \"([^\"]*)\" (\d+)},
  40:                  [ qw( _ h l u t r c b A R T ) ] ],
  41: 
  42: };
  43: 
  44: 
  45: my $type = $optctl{type} || 'common';
  46: my $pattern = $optctl{pattern} or usage();
  47: 
  48: my ($format, $access) = @{$log_formats->{$type}};
  49: 
  50: 
  51: foreach my $pat (split(/\%/, $pattern)) {
  52:     if ($pat =~ /^([^a-zA-Z\_\%]*)(.)/) {
  53:        push(@pats, $2);
  54:     }
  55: }
  56: 
  57: my $outpat = $pattern;
  58: $outpat =~ s/(\%[^a-zA-Z\_\%]*)([a-zA-Z\_])/$1s/g;
  59: 
  60: 
  61: my $qrtime    = qr{(\d+)\/(\w+)\/(\d+)\:(\d+)\:(\d+)\:(\d+)\s};
  62: my $qruri     = qr{(\w+)\s([^\?]*)\??([^\s]*)?\s(.*)};
  63: my $qrsession = qr{(\w*)\-(\w*)};
  64: my $qrref     = qr{.*\:\/\/([^\/]+)(\/.*)};
  65: 
  66: 
  67: while(<STDIN>) {
  68:     my @vals;
  69:     my %info;
  70:     if (/$format/) {
  71: 	for ($x = 0; $x <= $#{$access}; $x++) {
  72:             $info{$access->[$x]} = $$x;
  73: 	    if ($access->[$x] eq "r") {
  74:                 $info{$access->[$x]} =~ /$qruri/;
  75:                 $info{'a'} = $1;
  76:                 $info{'f'} = $2;
  77:  		$info{'q'} = $3;
  78:                 $info{'p'} = $4;
  79:             }
  80: 	    if ($access->[$x] eq "t") {
  81: 		$info{$access->[$x]} =~ /$qrtime/;
  82: 		$info{'d'} = $1;
  83: 		$info{'m'} = $2;
  84: 		$info{'y'} = $3;
  85: 		$info{'H'} = $4;
  86: 		$info{'M'} = $5;
  87: 		$info{'S'} = $6;
  88: 	    }
  89: 	    if ($access->[$x] eq "u") {
  90: 		$info{$access->[$x]} =~ /$qrsession/;
  91: 		$info{'s'} = $1;
  92: 		$info{'i'} = $2;
  93: 	    }
  94: 	    if ($access->[$x] eq "R") {
  95: 		$info{$access->[$x]} =~ /$qrref/;
  96: 		$info{'o'} = $1;
  97: 		$info{'F'} = $2;
  98: 	    }
  99: 	}
 100: 
 101: 	foreach my $pat (@pats) {
 102: 	   push(@vals, $info{$pat}) if exists($info{$pat});
 103: 	}
 104: 
 105: 	printf $outpat, @vals;
 106:         print "\n";
 107:     }
 108: }
 109: 
 110: sub usage {
 111: 
 112: print qq{
 113: usage: logparse [-t=<type>] -p=<pattern>
 114: 
 115: example: tail -50 access_log | logparse -t=extended -p="%H:%M  %-15o  %f"
 116: 
 117: Formatting characters:
 118: 
 119:   _   - The name of this filter script
 120:   v   - The virtual host name/address
 121:   h   - The host IP name/address
 122:   l   - The remote logname
 123:   u   - Remote User/Session
 124:   t   - The time of the request
 125:   r   - The full request
 126:   c   - The HTTP code (302, 200, etc)
 127:   b   - Bytes
 128:   R   - Referrer string
 129:   A   - User Agent string
 130:   P   - Process ID
 131:   T   - Time taken in seconds
 132: 
 133: Request string breakdown:
 134: 
 135:   a   - Action/Method (GET, POST, etc)
 136:   f   - File path
 137:   q   - Query string
 138:   p   - HTTP protocol version
 139: 
 140: Time of request breakdown:
 141: 
 142:   d   - Day of the month
 143:   m   - Month (Apr, May, etc)
 144:   y   - Year
 145:   H   - Hour
 146:   M   - Minute
 147:   S   - Second
 148: 
 149: User Session breakdown:
 150: 
 151:   s   - Session ID
 152:   i   - User ID
 153: 
 154: Referrer string breakdown:
 155: 
 156:   o   - Host of referrer
 157:   F   - File path of referrer
 158: 
 159: };
 160: 
 161:    exit(0);
 162: 
 163: }

Replies are listed 'Best First'.
Re: Multi-format Log Parser
by tstock (Curate) on Oct 05, 2001 at 10:14 UTC
    Very nice.

    This script should make my life easier when working with log files from the shell. The important part is that it will make using certain commands with pipes much easier, like 'sort', 'uniq', 'cut' and 'grep'.

    One nice addition would be the _option_ to build the regexp by analysing the httpd.conf file given the format name.

    Another one would be to maybe eliminate the need to do the uncompression of the log file outside the script, with zcat or zgrep.

    Update:
    You could use Compress::Zlib for this
      I typically do something like this:

      zgrep -h '2001:13:5' *access_log* | logparse -p="%o" | sort | uniq -c | sort -r | head -30

      Get the top 30 hosts that referred traffic during the 10-minute block between 1:50 and 2:00 PM. If I know there was an issue, like a spike or a dip, I use this filter to investigate what may have caused it. Works well with tail -f too.

      I thought (briefly) about auto-building the regex by analyzing a conf file and/or the actual log, but it was quicker to write and hard-code them myself, and I don't change log file formats very often. You can do some pretty funky things with log file formats, and it doesn't seem like it would be easy to anticipate all those possibilities.

      Would it still be useful as a filter if you use Compress::Zlib? I don't want to pass more data into the script than necessary. Still thinking about the idea.
        Would it still be useful as a filter if you use Compress::Zlib?

        It would only be useful in the cases where you zcat to your filter first, then grep on the resulting format. For example, grepping on a URL could bring up referers and requests alike, so if you filter referers out first, you can then grep OK for requests. The same goes for some other fields, like status codes vs size vs IP.
Re: Multi-format Log Parser
by mattr (Curate) on Nov 13, 2001 at 15:34 UTC
    Very nice! This would be wonderful if I could get it to run.. perl 5.00502. Are you doing anything with 5.6? Lots of other perl utils run fine. Getopts seems to eat perl's -w..

    bash-2.02# which logparse
    /root/bin/logparse
    bash-2.02# logparse
    bash: /root/bin/logparse: No such file or directory

      Often that message means that the #! line in the script isn't pointing to a real file. Where is your perl located? Do you have a /usr/bin/perl like the script assumes?

      -Blake