#!/usr/bin/perl -w
use strict;

# small script to merge two or more apache logfiles so that entries
# appear in the right chronological order, optionally adding a describing
# field to every entry in the resulting combined logfile.

# useful for merging files to generate combined statistics for more than one
# server/vhost/etc.

# syntax: logmerge.pl [-o outfile] [[-r|-i] -d desc1,...,descN] [-f file] [infiles]

# -o output   : file to write log to. Assume STDOUT if omitted.
# -r          : Insert descriptions into request header (/a.html becomes
#               /description/a.html)
# -i          : insert descriptions as the first field of every log entry
# -d descN    : one entry for each logfile describing it (e.g. sv1,sv2,etc.);
#               if there are more logfiles than entries, empty descriptions
#               will be inserted for some logfiles (- for -i, nothing for -r)
# -f file     : Read the list of logfiles to merge from file, use - for STDIN
# logfiles    : two or more logfiles containing data in CLF or similar log
#               formats (the first four fields need to look right, change 
#               the regex in the code if that's not the case)

# example : find -name "access.log" | ./logmerge -f -

# If you have a _large_ number of logfiles (say, more than your system allows
# a perl script to open), you might have to invoke the program several times 
# in a tree-like structure.

# 2001 by Eike Frost (mailto:Eike.Frost@gmx.de)

use Getopt::Std;
use IO::File;
use IO::Handle;
use Time::Local;

# command line options, filled in by getopts below
our ($opt_o,$opt_r,$opt_i,$opt_d,$opt_f);

# -r and -i are plain switches; -o, -d and -f each take an argument.
# (declaring -i as "i:" would make it swallow the option following it)
getopts ('o:rid:f:')
  or die ("usage: logmerge.pl [-o outfile] [[-r|-i] -d desc1,...,descN] [-f file] [infiles]\n");
my (@filelist, @logfiles, @descs, $output, $finished, %timehash);

# prepare list of logfiles
if (defined $opt_f) {
  # one-argument IO::File->new is kept on purpose so that "-" means STDIN
  my $listfh = IO::File->new ($opt_f) or die ($opt_f.' '.$!);
  @filelist = <$listfh>;
  undef $listfh;
  # strip the line endings and ignore blank lines in the list
  chomp @filelist;
  @filelist = grep { /\S/ } @filelist;
} else {
  @filelist = @ARGV;
}
(scalar @filelist == 0) and die ("no files to merge\n");

# read descriptions
defined $opt_d and (@descs = split (/,/, $opt_d));

# open logfiles, in the order given (the index into @logfiles is also
# the index into @descs)
foreach my $name (@filelist) {
  my $handle = IO::File->new ($name) or die ($name . ' ' . $!);
  push @logfiles, $handle;
}

# open outputfile; fall back to STDOUT only when -o was not given,
# a failure to open the requested file is fatal instead of silent
if (defined $opt_o) {
  $output = IO::File->new ($opt_o, "w") or die ($opt_o . ' ' . $!);
} else {
  $output = IO::File->new;
  $output->fdopen (fileno(STDOUT),"w") or die ('STDOUT ' . $!);
}

# convert the timefield from the logfile (e.g. "10/Oct/2000:13:55:36 -0700")
# to a unix timestamp (seconds since the epoch, UTC) for serializing.
# a missing timezone offset is treated as +0000.
# returns nothing (undef in scalar context) if the field cannot be parsed
# or does not describe a valid date.
sub converttounixtime ($) {
  my $logtime = shift;
  # timegm expects months counted from 0
  my %months = ('Jan'=>0,'Feb'=>1,'Mar'=>2,'Apr'=>3,'May'=>4,'Jun'=>5,
                'Jul'=>6,'Aug'=>7,'Sep'=>8,'Oct'=>9,'Nov'=>10,'Dec'=>11);
  defined $logtime or return;
  my ($day,$month,$year,$hour,$minute,$second,$sign,$offhours,$offminutes) =
    $logtime =~ m!^(\d{1,2})/([A-Za-z]{3})/(\d{4}):(\d{2}):(\d{2}):(\d{2})(?:\s+([+-])(\d{2})(\d{2}))?\s*$!
    or return;
  exists $months{$month} or return;
  # timegm croaks on out-of-range values (e.g. 31/Feb), treat as unparsable
  my $timestamp = eval { timegm ($second,$minute,$hour,$day,$months{$month},$year) };
  defined $timestamp or return;
  if (defined $sign) {
    # the logged time is local time; local = UTC + offset, so subtract it
    my $offset = $offhours*3600 + $offminutes*60;
    $timestamp -= ($sign eq '-') ? -$offset : $offset;
  }
  return $timestamp;
}

# sort key of the last entry with a valid time field, per logfile number
my @lastkey;

# insert a logentry into the timehash
# reads the next line of logfile number $number and stores it as
# [line, number] under a key that sorts chronologically as a string:
# the zero-padded unix timestamp, with 'a's appended to keep keys unique.
# a line without a usable time field reuses the key of the previous valid
# entry of the same logfile, so it is kept and stays next to it.
# does nothing once the logfile is exhausted.
sub insertfrom ($) {
  my $number = shift;
  my $lf = $logfiles[$number];
  my $logline = <$lf>;
  # test definedness, a final line consisting of "0" is not end-of-file
  defined $logline or return;
  my $unixtime;
  if ($logline =~ /[^ ]+ [^ ]+ [^ ]+ \[([^\]]+)\]/) {
    $unixtime = converttounixtime ($1);
  }
  my $timestamp;
  if (defined $unixtime) {
    # fixed width so that the string sort of the keys is chronological
    $timestamp = sprintf ("%012d", $unixtime);
    $lastkey[$number] = $timestamp;
  } elsif (defined $lastkey[$number]) {
    $timestamp = $lastkey[$number];
  } else {
    $timestamp = sprintf ("%012d", 0);
  }
  while (defined $timehash{$timestamp}) { $timestamp .= 'a'; }
  $timehash{$timestamp} = [$logline, $number];
}

# write out a logentry to the outfile
# inserting description as specified per -i, -r, and -d if possible
# takes the timehash key of the entry to write.
# -r : "/description" is put in front of the requested path, entries
#      without a description are written unchanged (or as for -i, if
#      -i was given as well)
# -i : the description, or "-" if there is none, becomes the first field
# an empty description (as in "-d a,,c") counts as no description.
# returns the result of print, i.e. true if the entry was written.
sub writelog ($) {
  my $key = shift;
  my ($towrite, $number) = @{$timehash{$key}};
  my $desc = $descs[$number];
  my $hasdesc = (defined $desc and length $desc);
  if ($opt_r && $hasdesc) {
    # $1 is everything up to and including the request method
    $towrite =~ s{([^"]+"[^ ]+ )(.*)}{$1/${desc}$2};
  } elsif ($opt_i) {
    $towrite = ($hasdesc ? $desc : "-") . " " . $towrite;
  }
  return print $output $towrite;
}

# fill the initial timehash with the first entry of every logfile
for my $counter (0 .. $#logfiles) {
  insertfrom ($counter);
}

# main loop, finishes when every logfile eof'ed
# the timehash holds at most one pending entry per logfile; the entry
# with the smallest key is written and replaced by the next entry from
# the same logfile
while (keys %timehash) {
  my $oldest = (sort keys %timehash)[0];
  writelog ($oldest) or die $!;
  # refill before deleting, so the new key cannot collide with the old one
  insertfrom ($timehash{$oldest}[1]);
  delete $timehash{$oldest};
}

# buffered write errors (e.g. a full disk) only show up on close
$output->close or die ("error closing output: $!\n");