#!/usr/bin/perl
#
# Sorts tab-separated pre-process log lines into category files.
#
# Usage: <script> <preprocess-log-dir> <white-list-dir>
#
#   1. Every "*log" file in <preprocess-log-dir> is scanned and the lines
#      whose response code starts with 1 or 2 are collected in
#      out-single-file/1xx_2xx_total.
#   2. Every "*total" file present in out-single-file at that point is
#      handed to a worker thread, which routes each line to an audio,
#      video or data-traffic file and then applies the black list.
#
# Fields are read by position: response code = field 3, content length =
# field 6, content type = field 10 (1-based); a line needs at least 11
# tab-separated fields to be considered.

use strict;
use warnings;
use threads;
use File::Path qw(remove_tree);
use IO::Handle;

# Directory (relative to the current directory) that receives all output.
my $OUT_DIR = 'out-single-file';

# Patterns looked up in the second column of the white-list CSV.
my @whiteListPatterns = ("xxx", "yyy", "asv:d:xxy xyz");

# White-list rows, keyed by the sanitised pattern name.
my %master;

# Cache of open append handles, keyed by output file name.  Each thread
# works on its own copy; the cache is emptied before threads are created.
my %handle_for;

# URL suffixes that mark a line as audio/video when no content type is set.
# NOTE(review): ".acc" may be a typo for ".aac" - kept as found, confirm.
my $AUDIO_URL_RE = qr/\.wma\?|\.mp3\?|\.wav\?|\.acc\?|\.m4a\?|\.m4r\?|\.m4p\?|\.asf\?/;
my $VIDEO_URL_RE = qr/\.mp4\?|\.3gp\?|\.3g2\?|\.flv\?|\.wmv\?|\.mov\?|\.avi\?|\.mpg\?|\.mpeg\?|\.m4v\?/;

my ($preprocessLogFileDir, $whiteListCsvFile);

# Exactly two arguments are required.
if (@ARGV != 2) {
    help_message();
    exit 1;
}
($preprocessLogFileDir, $whiteListCsvFile) = @ARGV;

# Validate the input before touching any previous output.
if (!-e "$whiteListCsvFile/csv_file.csv") {
    print "The white list file(csv_file.csv) is not present in $whiteListCsvFile\n";
    print "Please provide the directory location of the whilte list file\n";
    print "Exiting.\n";
    exit 1;
}

# Start from an empty output directory.
if (!-d $OUT_DIR) {
    mkdir $OUT_DIR or die "Cannot create $OUT_DIR: $!";
}
else {
    # Same scope as the shell glob "out-single-file/*": hidden entries stay.
    my @stale = glob "$OUT_DIR/*";
    remove_tree(@stale) if @stale;
}

getWhiteListCsvArrays();
getFiles();
splitFiles();

# Prints the usage text to STDOUT.
sub help_message {
    print "Insufficient or excess number of arguments\n";
    print "The script takes only two arguments and in-order mentioned\n";
    print "1. Directory location of the pre-process log files\n";
    print "2. Directory location of the white-list file (csv_file.csv)\n";
    return;
}

# Numifies a field without "isn't numeric" warnings; undef counts as 0.
# Plain Perl numification is kept so comparisons behave as they would
# without warnings enabled.
sub _as_number {
    my ($value) = @_;
    return 0 unless defined $value;
    no warnings 'numeric';
    return $value + 0;
}

# Splits a log line on tabs and returns (response code, content length,
# content type), or an empty list if the line has fewer than 11 fields.
sub _parse_log_line {
    my ($line) = @_;
    my @fields = split /\t/, $line, 12;
    return if @fields < 11;
    return @fields[2, 5, 9];
}

# Appends $line to $OUT_DIR/$name.  The handle is opened once and cached;
# autoflush keeps every line a single write, so lines stay intact even if
# several threads append to the same file.  Dies on open/write failure.
sub append_line {
    my ($name, $line) = @_;
    my $fh = $handle_for{$name};
    if (!$fh) {
        my $path = "$OUT_DIR/$name";
        open $fh, '>>', $path or die "Cannot open $path for appending: $!";
        $fh->autoflush(1);
        $handle_for{$name} = $fh;
    }
    print {$fh} $line or die "Cannot write to $OUT_DIR/$name: $!";
    return;
}

# Closes every cached output handle and empties the cache.
sub close_outputs {
    for my $name (sort keys %handle_for) {
        my $fh = delete $handle_for{$name};
        close $fh or warn "Error closing $OUT_DIR/$name: $!\n";
    }
    return;
}

# Starts one worker thread per "*total" file in the output directory and
# waits for all of them.
sub splitFiles {
    # Take a snapshot of the directory BEFORE starting any worker: the
    # workers create further "*total" files in this same directory, and
    # those must not be picked up as input.
    opendir my $dh, $OUT_DIR or die "Cannot open directory $OUT_DIR: $!";
    my @inputs = grep { /total$/ } readdir $dh;
    closedir $dh;

    my @workers;
    for my $name (@inputs) {
        # Scalar context here makes join() return a scalar as well.
        my $worker = threads->create(\&processFiles, $name);
        if (!defined $worker) {
            warn "Could not start a thread for $name\n";
            next;
        }
        push @workers, $worker;
    }

    # Join every worker, not just the last one started.
    for my $worker (@workers) {
        my $result = $worker->join();
        $result = '' unless defined $result;
        print "result-thread=$result\n";
    }

    # Kept from the original: remove leftover "temp*" files in the current
    # directory.  NOTE(review): the split step that produced them is
    # disabled - confirm this clean-up is still wanted.
    unlink glob 'temp*';
    return;
}

# Scans $preprocessLogFileDir and copies every line with a 1xx or 2xx
# response code from the "*log" files into out-single-file/1xx_2xx_total.
sub getFiles {
    opendir my $dh, $preprocessLogFileDir
        or die "Cannot open directory $preprocessLogFileDir: $!";
    my @entries = readdir $dh;
    closedir $dh;

    for my $entry (@entries) {
        my $path = "$preprocessLogFileDir/$entry";

        if (!($entry =~ /log$/ && -f $path)) {
            print "\n$path is either a directory or does not end with 'log'.";
            print "Probabally not a pre-process file. Ignoring the file.\n\n";
            next;
        }

        print "Reading the file:$path\n";
        my $in;
        if (!open $in, '<', $path) {
            warn "Cannot read $path: $!\n";
            next;
        }
        while (my $line = <$in>) {
            my ($rc) = _parse_log_line($line);
            next unless defined $rc;
            append_line('1xx_2xx_total', $line) if $rc =~ /^[12]/;
        }
        close $in;
    }

    # Flush and release the output before worker threads read it.
    close_outputs();
    return;
}

# Worker: routes every well-formed line of out-single-file/$file to the
# audio, video or data-traffic outputs.  Returns the number of lines routed.
sub processFiles {
    my ($file) = @_;
    print "processing the file:$file\n";

    my $path = "$OUT_DIR/$file";
    my $in;
    if (!open $in, '<', $path) {
        warn "Cannot read $path: $!\n";
        return 0;
    }

    my $routed = 0;
    while (my $line = <$in>) {
        my @parsed = _parse_log_line($line);
        next unless @parsed;
        my ($rc, $cl, $ct) = @parsed;
        my $size = _as_number($cl);

        # White-list filtering is switched off, as in the original
        # (would be: applyWhiteList($line, $rc, $cl, $ct)).
        my $whitelisted = 0;
        next if $whitelisted;
        $routed++;

        if ($ct =~ m{audio/} || ($ct =~ /^$/ && $line =~ $AUDIO_URL_RE)) {
            append_line('audios_total', $line);
            # NOTE(review): the meaning of content length 2 is not
            # visible here - confirm.
            if ($size == 2) {
                append_line('audios_dropped', $line);
            }
            else {
                append_line('audios_useful', $line);
                applyBlackList($line, 'audios_useful', $ct);
            }
        }
        elsif ($ct =~ m{video/} || ($ct =~ /^$/ && $line =~ $VIDEO_URL_RE)) {
            append_line('videos_total', $line);
            # The original also tested "== 2", which "< 10000" covers.
            if ($size < 10000) {
                append_line('videos_dropped', $line);
            }
            else {
                append_line('videos_useful', $line);
                applyBlackList($line, 'videos_useful', $ct);
            }
        }
        else {
            append_line('data_traffic_total', $line);
            applyBlackList($line, 'data_traffic_total', $ct);
        }
    }
    close $in;
    close_outputs();
    return $routed;
}

# Returns true when an optional white-list parameter does not rule out
# $url: either there is nothing to check (value shorter than $min_length,
# or empty once unquoted) or the value is found in the URL.
sub _optional_param_ok {
    my ($url, $raw, $min_length) = @_;
    my $param = defined $raw ? $raw : '';
    $param =~ s/\n//g;
    return 1 if length($param) < $min_length;

    $param =~ s/"(.*?)"/$1/g;    # drop surrounding double quotes
    $param =~ s/\?/\\\?/g;       # escape "?" before using it as a pattern
    # An empty pattern would silently reuse the last successful regex.
    return 1 if $param eq '';
    return $url =~ /$param/ ? 1 : 0;
}

# Checks the URL (second tab-separated field of $line) against the
# white-list rows in %master.  On the first matching row the line is
# written to the file named after the row's key (or "<key>_cont", see
# below) and 1 is returned; otherwise 0.  URLs starting with an IP address
# and URLs without a "/" are never matched.
# Not called at present: the call in processFiles is disabled.
sub applyWhiteList {
    my ($line, $rc, $cl, $ct) = @_;

    my @fields = split /\t/, $line;
    my $url = $fields[1];
    return 0 unless defined $url;
    $url =~ s{^\w+://}{};    # removes http:// etc.
    $url =~ s/:\d+//g;       # removes port number.
    return 0 if $url =~ /^\d+\.\d+\.\d+\.\d+/;    # ignore if ip-address.

    my @urlparts = split m{/}, $url;
    return 0 if @urlparts < 2;

    for my $key (keys %master) {
        my $key_cont = $key . "_cont";

        for my $row (@{ $master{$key} }) {
            my @row_csv = split /,/, $row;

            # Columns 4 and 5 are the mandatory parts: the domain name and
            # the part after it.  They are used as one pattern.
            my ($domain, $rest) = map { defined $_ ? $_ : '' } @row_csv[3, 4];
            s/"(.*?)"/$1/g for $domain, $rest;
            my $mandatory = $domain . $rest;
            # Skip rows without a mandatory part (see _optional_param_ok
            # for why an empty pattern must not reach the match).
            next if $mandatory eq '';
            next unless $url =~ /$mandatory/;

            # Columns 6 and 7 are optional.  The different minimum
            # lengths (1 and 2) are kept from the original.
            # NOTE(review): confirm the difference is intended.
            next unless _optional_param_ok($url, $row_csv[5], 1)
                     && _optional_param_ok($url, $row_csv[6], 2);

            if ((_as_number($cl) < 5000 || _as_number($rc) == 206)
                && $key =~ /^AS_D/)
            {
                append_line($key_cont, $line);
                applyBlackList($line, $key_cont, $ct);
                return 1;
            }

            append_line($key, $line);
            applyBlackList($line, $key, $ct) if $key !~ /^AD/;
            return 1;
        }
    }
    return 0;
}

# Fills %master: for each entry of @whiteListPatterns, collects the rows of
# <white-list-dir>/csv_file.csv whose second column contains '"<pattern>'.
# Rows that are empty or start with '#', ',' or '"#' are ignored.
sub getWhiteListCsvArrays {
    %master = ();

    my $csv_path = "$whiteListCsvFile/csv_file.csv";
    open my $csv, '<', $csv_path or die "Cannot open $csv_path: $!";
    # All four tests must hold; joined with "||" the filter let every
    # row through.
    my @rows = grep { !/^#/ && !/^$/ && !/^,/ && !/^"#/ } <$csv>;
    close $csv;

    for my $wlp (@whiteListPatterns) {
        # Whitespace, '+', '|' and ':' become '_' in the key.
        (my $key = $wlp) =~ s/[\s+|:]/_/g;

        for my $row (@rows) {
            my @csv = split /,/, $row;
            next unless defined $csv[1] && $csv[1] =~ /"$wlp/;
            push @{ $master{$key} }, $row;
        }
    }
    return;
}

# Appends $line to "<file>Dropped_due_to_black_list" when the content type
# or the line itself looks like an image, stylesheet, script or flash
# resource, and to "<file>Blk_applied" otherwise.  Returns 1.
sub applyBlackList {
    my ($line, $file, $ct) = @_;

    my $black_listed =
        $ct =~ m{image|text/css|text/java|text/x-java|application/java|application/x-java}
        || $line =~ /\.gif|\.jpg|\.png|\.jpeg|\.ico|\.js|\.css|\.swf/;

    my $suffix = $black_listed ? "Dropped_due_to_black_list" : "Blk_applied";
    append_line($file . $suffix, $line);
    return 1;
}
In reply to Re^6: how to split huge file reading into multiple threads
by sagarika
in thread how to split huge file reading into multiple threads
by sagarika
| For: | Use: |
| & | &amp;amp; |
| < | &amp;lt; |
| > | &amp;gt; |
| [ | &amp;#91; |
| ] | &amp;#93; |