#!/bin/perl
#
# dmining.pl - filter "distribute message" traffic out of raw log files.
#
# For every plain file directly inside the given directory, the script reads
# lines of the form
#
#     <timestamp> <key> Distribute message from: <sender>
#     <timestamp> <key> Distributed: <recipient>
#
# and writes one "sender,recipient,recipient..." record per sender message to
# <directory>/filtered/<file>.  A record is written when the NEXT sender
# message with the same key is seen; whatever is still pending at end of file
# is only reported as "Incomplete" in the per-file log under
# <directory>/logdir/<file>.
#
use strict;
use warnings;
use File::Path;

# Constants
my $from = "Distribute message from:";
my $to   = "Distributed:";

# Handle that doLog() writes to; set by processFile() while a file is open.
my $log_fh;

# Run only when executed as a script, so the helper subs can be loaded
# (e.g. by a test) without triggering the command-line handling.
exit( main(@ARGV) ) unless caller();

#
# main(@args) - entry point; returns the process exit status.
#
sub main {
    my @args = @_;

    if ( @args != 1 ) {
        usage();
        return 1;
    }

    #
    # Parse the command line argument
    #
    my $dir = shift @args;
    if ( !-e $dir ) {
        print "'$dir' does not exists. Exiting the script.\n";
        return 1;
    }
    elsif ( !-d $dir ) {
        print "'$dir' is not a valid directory. Exiting the script.\n";
        return 1;
    }

    my $outdir = "$dir/filtered";
    my $logdir = "$dir/logdir";

    # Exit status 0 when a plain file is in the way is kept from the
    # original script.
    prepareDirectory($outdir) or return 0;
    prepareDirectory($logdir) or return 0;

    opendir( my $dh, $dir ) or die "Can't open directory '$dir' $!\n";
    my @files = readdir($dh);
    closedir($dh);

    # Every directory entry is announced (including '.', '..' and the two
    # output directories), but only plain files are actually processed.
    foreach my $file (@files) {
        print "Processing $dir/$file\n";
        if ( -f "$dir/$file" ) {
            processFile( "$dir/$file", "$outdir/$file", "$logdir/$file" );
        }
        print "Processed $dir/$file\n";
    }

    return 0;
}

#
# prepareDirectory($path) - make $path an empty, existing directory.
#
# Returns 1 on success.  If $path is a plain file, prints a message and
# returns 0 without touching it.  Dies if the directory cannot be removed
# or created.
#
sub prepareDirectory {
    my ($path) = @_;

    if ( -e $path ) {
        if ( -f $path ) {
            print "'$path' is a file. Rename the file.\n";
            return 0;
        }
        rmtree( $path, 0 ) or die "Could not delete '$path' $!\n";
    }
    mkpath($path) or die "Could not create '$path' $!\n";

    return 1;
}

#
# processFile($in_path, $out_path, $log_path) - filter one raw log file.
#
# Writes completed "sender,recipient,..." records to $out_path and a trace of
# every decision to $log_path.  An unreadable input file is skipped with a
# warning; failure to open or close an output file is fatal.
#
sub processFile {
    my ( $in_path, $out_path, $log_path ) = @_;

    my $in;
    if ( !open( $in, '<', $in_path ) ) {
        warn "Could not open '$in_path' for reading: $!\n";
        return;
    }
    open( my $out, '>', $out_path )
      or die "Could not open '$out_path' for writing: $!\n";
    open( my $log, '>', $log_path )
      or die "Could not open '$log_path' for writing: $!\n";
    $log_fh = $log;

    my %map  = ();    # key -> pending record, always ending in ','
    my %lmap = ();    # key -> raw line of the sender message still pending

    while ( my $line = <$in> ) {
        chomp($line);
        doLog( "Processing", $line );

        # Only lines that start with a digit carry a timestamp.
        if ( $line !~ /^[0-9]/ ) {
            doLog( "Ignoring", $line );
            next;
        }

        my ( undef, $data ) = removetimestamp($line);
        my ( $key, $value ) = keyValue($data);
        if ( !defined $key ) {

            # No "<timestamp> <key> <value>" structure.
            doLog( "Ignoring", $line );
            next;
        }

        if ( $value =~ /^\Q$from\E/ ) {
            my $sender = getSenderName($value);
            if ( defined $map{$key} ) {

                # A new sender message closes the previous record for this
                # key: emit it without its trailing comma.
                my $val = $map{$key};
                doLog( "End of sender", $val );
                print {$out} substr( $val, 0, length($val) - 1 ), "\n";
                $map{$key}  = "$sender,";
                $lmap{$key} = $line;
                doLog( "Replacing", $sender );
            }
            else {
                doLog( "New Entry", "$data" );
                $map{$key}  = "$sender,";
                $lmap{$key} = $line;
            }
        }
        elsif ( $value =~ /^\Q$to\E/ ) {
            my $recipient = getRecipientName($value);
            if ( defined $map{$key} ) {
                my $val = $map{$key};
                doLog( "Adding recipeint:", "$recipient to $val" );
                $val .= "$recipient,";
                $map{$key} = $val;
            }
            else {

                # Recipient without a preceding sender message.
                doLog( "Ignoring", $line );
            }
        }
        else {
            doLog( "Ignoring", $line );
        }
    }

    # Records still pending at end of file are never written to the output;
    # they are only reported in the log.
    for my $mkey ( sort keys %lmap ) {
        doLog( "Incomplete", $lmap{$mkey} );
    }

    close($in);
    close($out) or die "Could not close '$out_path' $!\n";
    $log_fh = undef;
    close($log) or die "Could not close '$log_path' $!\n";

    return;
}

#
# removetimestamp($line) - split off the leading timestamp.
#
# Returns ($timestamp, $rest) split at the first space, or an empty list
# when $line is undefined or contains no space.
#
sub removetimestamp {
    my ($line) = @_;
    return unless defined $line;

    my $ind = index( $line, " " );
    return if $ind == -1;

    return ( substr( $line, 0, $ind ), substr( $line, $ind + 1 ) );
}

#
# keyValue($line) - split "<key> <value>" at the first space.
#
# Returns ($key, $value), or an empty list when $line is undefined or
# contains no space.
#
sub keyValue {
    my ($line) = @_;
    return unless defined $line;

    my $ind = index( $line, " " );
    return if $ind == -1;

    return ( substr( $line, 0, $ind ), substr( $line, $ind + 1 ) );
}

#
# textAfterColon($text) - whitespace-trimmed text following the first ':'.
#
# Returns nothing when $text is undefined or has no colon.
#
sub textAfterColon {
    my ($text) = @_;
    return unless defined $text;

    my $ind = index( $text, ":" );
    return if $ind == -1;

    my $name = substr( $text, $ind + 1 );
    $name =~ s/^\s+//;    #remove leading spaces
    $name =~ s/\s+$//;    #remove trailing spaces
    return $name;
}

#
# getSenderName($from_line) - sender name from a "...from: <name>" value.
#
sub getSenderName {
    my ($from_line) = @_;
    return textAfterColon($from_line);
}

#
# getRecipientName($to_line) - recipient name from a "Distributed: <name>"
# value.
#
sub getRecipientName {
    my ($to_line) = @_;
    return textAfterColon($to_line);
}

#
# doLog($msg, $line) - append "$msg: $line" to the log of the file currently
# being processed.  Does nothing when no log file is open.
#
sub doLog {
    my ( $msg, $line ) = @_;
    return unless defined $log_fh;
    print {$log_fh} "$msg: $line\n";
    return;
}

#
# usage() - print command-line help to standard output.
#
sub usage {
    print "Usage:\n";
    print "   cmd: perl dmining.pl <directory-name>\n";
    print "   where:\n";
    print "       directory-name: the absolute or relative path to raw data\n";
    return;
}

1;