Wow! Thanks a bunch, man. That gave me good insight. Surely there is always another way to do it! :-)
However, in my program the main pain point (in terms of performance, i.e. time to execute) is a function called 'applyWhiteList', in which each entry of the log file (around 21 million entries in total) is checked against the entries of the (CSV) hash; this is where most of the time is eaten up. Only if a record is not found there does it need to be compared and categorized for other things.
Here is the complete executable code. I am a beginner in Perl, so please bear with me and my long-winded way of doing things.
#!/usr/bin/perl
use strict;
use Thread;
# Directory that holds the pre-processed *.log files (first argument).
my $preprocessLogFileDir;
# Directory that holds the white-list file csv_file.csv (second argument).
my $whiteListCsvFile;

# Exactly two arguments are required; anything else prints the usage text and
# exits with a non-zero status so that calling scripts can detect the failure.
if (@ARGV != 2)
{
    help_message();
    exit 1;
}

$preprocessLogFileDir = shift @ARGV;
$whiteListCsvFile     = shift @ARGV;

# Every run must start from an empty output directory, because all result
# files are opened in append mode further down.
if (! -d "out-single-file")
{
    mkdir("out-single-file")
        or die "Cannot create directory out-single-file: $!\n";
}
else
{
    # glob() expands the wildcard and the list form of system() bypasses the
    # shell, so odd file names cannot be misinterpreted.
    my @stale_entries = glob("out-single-file/*");
    if (@stale_entries)
    {
        system("rm", "-rf", @stale_entries) == 0
            or die "Cannot empty directory out-single-file\n";
    }
}

# The white list is mandatory; stop early when it is missing.
if (! -e "$whiteListCsvFile/csv_file.csv")
{
    print "The white list file(csv_file.csv) is not present in $whiteListCsvFile\n";
    print "Please provide the directory location of the whilte list file\n";
    print "Exiting.\n";
    exit 1;
}
# Print the usage text that is shown when the argument count is wrong.
# Takes no arguments and writes four lines to the currently selected handle.
sub help_message
{
    my @usage_lines = (
        "Insufficient or excess number of arguments\n",
        "The script takes only two arguments and in-order mentioned\n",
        "1. Directory location of the pre-process log files\n",
        "2. Directory location of the white-list file (csv_file.csv)\n",
    );
    print join('', @usage_lines);
}
# White-list group names; getWhiteListCsvArrays() looks for each of them,
# preceded by a double quote, in the second CSV column.
my @whiteListPatterns = ('xxx', 'yyy', 'asv:d:xxy xyz');

# Raw CSV rows of the white list, grouped under a key derived from the pattern.
my %master;

# Handle of the worker thread started by splitFiles().
my $thr;

# Main flow: load the white list, collect the 1xx/2xx records from the log
# files, then categorize the collected records.
getWhiteListCsvArrays();
getFiles();
splitFiles();
# Start one processFiles() worker thread for every file in out-single-file
# whose name ends in "total", wait for all of them and print each result.
#
# Takes no arguments and returns nothing.
# Dies when the output directory cannot be read.
sub splitFiles
{
    #`split -l 2000000 out-single-file/1xx_2xx_total -a 1 temp`;

    # Take a snapshot of the directory before any worker runs: processFiles()
    # creates further files ending in "total" (audios_total, videos_total,
    # data_traffic_total), and those must not be picked up as input.
    opendir(my $dir_handle, "out-single-file")
        or die "Cannot read directory out-single-file: $!\n";
    my @total_files = grep { /total$/ } readdir($dir_handle);
    closedir($dir_handle);

    my @workers;
    foreach my $total_file (@total_files)
    {
        # Scalar assignment on purpose: the context in which a thread is
        # created decides the context of its return value at join() time.
        $thr = Thread->new(\&processFiles, $total_file);
        push @workers, $thr;
    }

    if (!@workers)
    {
        warn "No file ending in 'total' found in out-single-file; nothing to process.\n";
    }

    # Join every worker, not only the one started last.
    foreach my $worker (@workers)
    {
        my $res_thr = $worker->join();
        print "result-thread=$res_thr\n";
    }

    # Presumably cleans up the pieces written by the disabled split step above.
    unlink glob("temp*");
    return;
}
# Copy every record with a 1xx or 2xx response code from the *log files in
# $preprocessLogFileDir into out-single-file/1xx_2xx_total.
#
# A record is a line with at least eleven tab separated fields; the response
# code is the third field.  Directory entries that are not plain files ending
# in "log" are reported and skipped.
#
# Takes no arguments and returns nothing.
# Dies when the input directory cannot be read or the output file cannot be
# written; an unreadable log file is only reported and skipped.
sub getFiles
{
    my $out_path = "out-single-file/1xx_2xx_total";
    # Opened on the first matching record and then kept open: reopening the
    # file for every single record dominates the run time on large logs.
    my $out_handle;

    opendir(my $dir_handle, $preprocessLogFileDir)
        or die "Cannot read directory $preprocessLogFileDir: $!\n";
    while (defined(my $dir_file = readdir($dir_handle)))
    {
        my $log_path = "$preprocessLogFileDir/$dir_file";
        if ($dir_file !~ /log$/ || ! -f $log_path)
        {
            print "\n$preprocessLogFileDir/$dir_file is either a directory or does not end with 'log'.";
            print "Probabally not a pre-process file. Ignoring the file.\n\n";
            next;
        }

        print "Reading the file:$preprocessLogFileDir/$dir_file\n";
        my $log_handle;
        if (!open($log_handle, '<', $log_path))
        {
            warn "Cannot open $log_path: $!\n";
            next;
        }

        while (my $line = <$log_handle>)
        {
            # Fewer than ten tabs means fewer than eleven fields: not a record.
            next if ($line =~ tr/\t//) < 10;

            # Only the response code (third field) is needed here.
            my $rc = (split(/\t/, $line, 4))[2];
            next unless $rc =~ /^[12]/;

            if (!$out_handle)
            {
                open($out_handle, '>>', $out_path)
                    or die "Cannot append to $out_path: $!\n";
            }
            print {$out_handle} $line;
        }
        close($log_handle);
    }
    closedir($dir_handle);

    # Close explicitly so that buffered write errors are not lost and the
    # data is on disk before the next processing step reads it.
    if ($out_handle)
    {
        close($out_handle) or die "Cannot finish writing $out_path: $!\n";
    }
    return;
}
# Categorize every record of out-single-file/$file as audio, video or other
# data traffic and append it to the matching result files in out-single-file.
#
# A record is a line with at least eleven tab separated fields, of which
# field 3 is the response code, field 6 the content length and field 10 the
# content type.  Useful audio/video records and all other records are passed
# on to applyBlackList().
#
# Parameters:
#   $file - name of the input file inside out-single-file.
# Returns the number of records that were categorized (0 when the input file
# cannot be opened).
# Dies when a result file cannot be written.
sub processFiles
{
    my ($file) = @_;
    print "processing the file:$file\n";

    my $in_path = "out-single-file/$file";
    my $in_handle;
    if (!open($in_handle, '<', $in_path))
    {
        warn "Cannot open $in_path: $!\n";
        return 0;
    }

    # One append handle per result file, opened on first use.  Reopening the
    # files for every record is by far the most expensive part of this loop.
    my %handle_for;
    my $emit = sub
    {
        my ($name, $record) = @_;
        my $handle = $handle_for{$name};
        if (!$handle)
        {
            my $out_path = "out-single-file/$name";
            open($handle, '>>', $out_path)
                or die "Cannot append to $out_path: $!\n";
            $handle_for{$name} = $handle;
        }
        print {$handle} $record;
    };

    # File name extensions that identify media when no content type is given.
    # NOTE(review): ".acc" looks like it might be meant to be ".aac" - confirm.
    my $audio_ext = qr/\.wma\?|\.mp3\?|\.wav\?|\.acc\?|\.m4a\?|\.m4r\?|\.m4p\?|\.asf\?/;
    my $video_ext = qr/\.mp4\?|\.3gp\?|\.3g2\?|\.flv\?|\.wmv\?|\.mov\?|\.avi\?|\.mpg\?|\.mpeg\?|\.m4v\?/;

    my $categorized = 0;
    while (my $line = <$in_handle>)
    {
        # Fewer than ten tabs means fewer than eleven fields: not a record.
        next if ($line =~ tr/\t//) < 10;

        my ($rc, $cl, $ct) = (split(/\t/, $line, 11))[2, 5, 9];

        # The white-list step is currently disabled; $retval stays 0.
        #my $retval=applyWhiteList($line,$rc,$cl,$ct);
        my $retval = 0;
        next if $retval;

        if ($ct =~ /audio\// || ($ct =~ /^$/ && $line =~ $audio_ext))
        {
            $emit->("audios_total", $line);
            if ($cl == 2)
            {
                $emit->("audios_dropped", $line);
            }
            else
            {
                $emit->("audios_useful", $line);
                applyBlackList($line, "audios_useful", $ct);
            }
        }
        elsif ($ct =~ /video\// || ($ct =~ /^$/ && $line =~ $video_ext))
        {
            $emit->("videos_total", $line);
            # NOTE(review): the "== 2" test is already covered by "< 10000".
            if ($cl == 2 || $cl < 10000)
            {
                $emit->("videos_dropped", $line);
            }
            else
            {
                $emit->("videos_useful", $line);
                applyBlackList($line, "videos_useful", $ct);
            }
        }
        else
        {
            $emit->("data_traffic_total", $line);
            applyBlackList($line, "data_traffic_total", $ct);
        }
        $categorized++;
    }
    close($in_handle);

    # Buffered write errors only show up when the handle is closed.
    foreach my $name (sort keys %handle_for)
    {
        close($handle_for{$name})
            or die "Cannot finish writing out-single-file/$name: $!\n";
    }
    return $categorized;
}
{
    # Compiled rules, keyed by the raw CSV row text.  A row is parsed and its
    # patterns are compiled once instead of once per log record; rows whose
    # patterns do not compile are remembered as undef.
    my %rule_for_row;

    # Turn one raw white-list CSV row into a rule: a hash reference with the
    # compiled mandatory pattern under "man" and, when present, the compiled
    # optional patterns under "op1" and "op2".  Returns undef (after a
    # warning) when one of the patterns is not a valid regular expression.
    sub _compileWhiteListRow
    {
        my ($row) = @_;
        my @row_csv = split(",", $row);

        # Columns 4 and 5 (domain name and the part after it) together form
        # the mandatory pattern; only the surrounding quotes are removed.
        my ($domain, $after_domain) = @row_csv[3, 4];
        foreach my $part ($domain, $after_domain)
        {
            $part = '' unless defined $part;
            $part =~ s/"(.*?)"/$1/g;
        }
        my %pattern_text = (man => $domain . $after_domain);

        # Columns 6 and 7 are optional patterns.  The first one is used when
        # it is non-empty, the second one only when it is longer than one
        # character, exactly as before.
        foreach my $optional (["op1", $row_csv[5], 1], ["op2", $row_csv[6], 2])
        {
            my ($name, $text, $min_length) = @$optional;
            next unless defined $text;
            $text =~ s/\n//g;               # Remove the new-lines if any.
            next if length($text) < $min_length;
            $text =~ s/"(.*?)"/$1/g;        # Remove the double quotes.
            $text =~ s/\?/\\\?/g;           # Escape the special character "?".
            # An empty pattern would make Perl silently reuse the last
            # successfully matched regex, so treat it as "not checked".
            next unless length($text);
            $pattern_text{$name} = $text;
        }

        my %rule;
        foreach my $name (keys %pattern_text)
        {
            my $text = $pattern_text{$name};
            # The (?:...) wrapper keeps an empty mandatory pattern from
            # triggering the "reuse last pattern" rule; it then matches
            # every URL.
            my $compiled = eval { qr/(?:$text)/ };
            if (!defined $compiled)
            {
                warn "Skipping white-list row with invalid pattern '$text': $@";
                return;
            }
            $rule{$name} = $compiled;
        }
        return \%rule;
    }

    # Check one log record against the white list in %master and, on the
    # first matching rule, append the record to the result file of that group.
    #
    # The URL is the second tab separated field of the record.  Its scheme
    # and every ":digits" sequence are removed before matching.  URLs that
    # start with an IP address or have no "/" separated second part are
    # never white-listed.
    # NOTE(review): the ":digits" removal is not limited to the port after
    # the host name - confirm that this is intended.
    #
    # Parameters:
    #   $line - the complete log record.
    #   $rc   - response code of the record.
    #   $cl   - content length of the record.
    #   $ct   - content type of the record, handed on to applyBlackList().
    # Returns 1 when the record matched a white-list rule and was written,
    # 0 otherwise.
    # Dies when a result file cannot be written.
    sub applyWhiteList
    {
        my ($line,$rc,$cl,$ct) = @_;

        my $url = (split("\t", $line))[1];
        return 0 unless defined $url;
        $url =~ s{^\w+://}{};       # removes http:// etc.
        $url =~ s/:\d+//g;          # removes port number.

        return 0 if $url =~ /^\d+\.\d+\.\d+\.\d+/;   # ignore if ip-address.
        my @urlparts = split("/", $url);
        return 0 if @urlparts < 2;

        # Sorted so that the winning group does not depend on hash order.
        foreach my $key (sort keys %master)
        {
            my $key_cont = $key . "_cont";
            foreach my $row (@{$master{$key}})
            {
                if (!exists $rule_for_row{$row})
                {
                    $rule_for_row{$row} = _compileWhiteListRow($row);
                }
                my $rule = $rule_for_row{$row};
                next unless $rule;

                # The mandatory pattern and every optional pattern that is
                # present must all be found in the URL.
                next unless $url =~ $rule->{man};
                next if $rule->{op1} && $url !~ $rule->{op1};
                next if $rule->{op2} && $url !~ $rule->{op2};

                # Small or partial (206) responses of the AS_D groups go to
                # a separate "_cont" file.
                my $target = $key;
                if (($cl < 5000 || $rc == 206) && $key =~ /^AS_D/)
                {
                    $target = $key_cont;
                }

                my $out_path = "out-single-file/$target";
                open(my $out_handle, '>>', $out_path)
                    or die "Cannot append to $out_path: $!\n";
                print {$out_handle} $line;
                close($out_handle)
                    or die "Cannot finish writing $out_path: $!\n";

                # Groups whose key starts with AD skip the black list.
                applyBlackList($line, $target, $ct) if $key !~ /^AD/;
                return 1;
            }
        }
        return 0;
    }
}
# Load the white list: fill %master with the raw rows of
# $whiteListCsvFile/csv_file.csv, grouped by white-list pattern.
#
# A row belongs to a pattern when its second column contains a double quote
# directly followed by the pattern.  The hash key is the pattern with every
# whitespace character, "+", "|" and ":" replaced by "_".  Comment rows
# (starting with # or "#), empty rows and rows starting with a comma are
# ignored.
#
# Takes no arguments and returns nothing.
# Dies when the CSV file cannot be opened.
sub getWhiteListCsvArrays
{
    %master=();
    return unless @whiteListPatterns;

    # Prepare key, matcher and row list per pattern once, so that the CSV
    # file has to be read only a single time.
    my @groups;
    foreach my $wlp (@whiteListPatterns)
    {
        my $key=$wlp;
        $key =~ s/[\s+|:]/_/g;
        push @groups, { key => $key, matcher => qr/"$wlp/, rows => [] };
    }

    my $csv_path = "$whiteListCsvFile/csv_file.csv";
    open(my $csv_handle, '<', $csv_path)
        or die "Cannot open $csv_path: $!\n";
    while (my $line = <$csv_handle>)
    {
        # The tests have to be combined so that ANY of them skips the row;
        # joining the negated tests with "||" never skipped anything.
        next if $line =~ /^#/ || $line =~ /^$/ || $line =~ /^,/ || $line =~ /^"#/;

        my @csv = split(",", $line);
        next unless defined $csv[1];
        foreach my $group (@groups)
        {
            push @{ $group->{rows} }, $line if $csv[1] =~ $group->{matcher};
        }
    }
    close($csv_handle);

    # Store the rows pattern by pattern; a key only comes into existence
    # when at least one row matched its pattern.
    foreach my $group (@groups)
    {
        next unless @{ $group->{rows} };
        push @{ $master{ $group->{key} } }, @{ $group->{rows} };
    }
    return;
}
# Append a record to one of two result files in out-single-file, depending
# on whether it is black-listed.
#
# A record is black-listed when its content type matches one of the image,
# CSS or Java related types below, or when the record text contains one of
# the listed file name extensions.  Black-listed records go to
# "<file>Dropped_due_to_black_list", all others to "<file>Blk_applied".
#
# Parameters:
#   $line - the complete log record.
#   $file - base name of the result file.
#   $ct   - content type of the record.
# Returns 1.
# Dies when the result file cannot be written, so records are never lost
# silently.
sub applyBlackList
{
    my($line,$file,$ct)=@_;

    my $black_listed =
           $ct   =~ /image|text\/css|text\/java|text\/x-java|application\/java|application\/x-java/
        || $line =~ /\.gif|\.jpg|\.png|\.jpeg|\.ico|\.js|\.css|\.swf/;

    my $suffix   = $black_listed ? "Dropped_due_to_black_list" : "Blk_applied";
    my $out_path = "out-single-file/" . $file . $suffix;

    open(my $out_handle, '>>', $out_path)
        or die "Cannot append to $out_path: $!\n";
    print {$out_handle} $line;
    close($out_handle)
        or die "Cannot finish writing $out_path: $!\n";
    return 1;
}
|