Thats the kind of steer that I need. Perhaps you can help with problem of eliminating the garbage in a more efficient manner
I currently use the following to identify and remove garbage from a file. I compile the regexes because I am using them repeatedly over
and over again different lines in the file. I have a vague recollection that (?<![A-Z])(AUD.*) using lookbehind and lookaheads
causes a large overhead. Runs really slowly hogging all the memory on a file with 150000 lines for 20 mins. No doubt this can be improved:-
sub ValidateInputFile{
my($line,
$llen,
$LineNo,
$ErrChar,
$ErrCount,
$mtch_junk,
$tncount,
@check,
$SOF_fh);
$tncount=0;
$ErrCount=0;
$SOF_fh=OpenFile($fqd_sof, "READ");
$SOFV_fh=OpenFile($fqd_sof.'validated', "WRITE");
$SOFR_fh=OpenFile($fqd_sof.'removed', "WRITE");
@HeaderGarbage = ('^LD 20',
'^PT0000 ',
'^REQ\: PRT',
'^TYPE\: TNB',
'^TN $',
'^CDEN ',
'^CUST ',
'^DATE ',
'^PAGE ',
'^DES ');
foreach $RegEx (@HeaderGarbage){
$CHRegex = qr/($RegEx\n)/;
push @HeaderRegExes, $CHRegex;
}
@ErrCodes = ('(^\*.*\n)',
'(ADMIN VSID.*)',
'(BUG.*\n)',
'(?<![A-Z])(AUD.*)',
'(SCH.*\n)',
'(MAINT.*\n)',
'(OPRDATA.*\n)',
'(TIM000.*\n)',
'(DTC103.*)',
'(DTC001.*)',
'(DSL.*TIME.*\n)',
'(FIR110.*\n)',
'(FIR103.*\n)',
'(^ERR.*\n)',
'(DCH:.*)',
'(^VAS008.*\n)',
'(^INDEX.*VTN.*\n)',
'(^IND.*\n)',
'(^ NET 56 P.*\n)',
'(00A1491A 00000C16 00000000 000014F0 00000000 000000
+00 00000000 128)',
'(0000 00000000 0 00000000 00000000 00000000 0000000
+0 00000000 00000000)');
foreach $RegEx (@ErrCodes){
$CBRegex = qr/$RegEx/;
push @BodyRegExes, $CBRegex;
}
while ($line = ($SOF_fh->getline)){
$LineNo++;
$llen=length($line);
#Remove the DOS line endings
$line=~s/\r\n/\n/;
# Following code removes spurious switch codes that cant be suppre
+ssed
# First lets strip the first 20 lines of the TNB file of garbage
if ($LineNo < 30){
#print $LineNo."\n";
#print $line;
foreach $ErrChar (@HeaderRegExes){
undef $mtch_junk;
#print "Checking ".$ErrChar."\n";
if ($line=~/$ErrChar/){
$ErrCount++;
#print " Found ".$ErrChar."\n";
$line=~s/$ErrChar//;
$mtch_junk = $1;
if ($LineNo != $LastInvLine){
# Only print the line the first time we go over it
# Because $1 isn't reset if a subsequent attempt t
+o match fails
if ($mtch_junk){
$LastInvLine = $LineNo;
$SOFR_fh->write("$LineNo($ErrChar)->$mtch_
+junk\n");
}
}
}
}
}
# Now attempt to remove spurious codes
foreach $ErrChar (@BodyRegExes){
undef $mtch_junk;
# Need to ensure this doesn't match BAUD just AUD
if ($line=~/$ErrChar/){
$ErrCount++;
$line=~s/$ErrChar//;
$mtch_junk = $1;
if ($LineNo != $LastInvLine){
# Only print the line the first time we go over it
# Because $1 isn't reset if a subsequent attempt t
+o match fails
if ($mtch_junk){
$LastInvLine = $LineNo;
$SOFR_fh->write("$LineNo($ErrChar)->$mtch_junk\n")
+;
}
}
}
}
if ($line =~ /^(TN\s+\d+\s+\d+)/) {
$tncount++;
#$LOG_fh->write(msg("INFO","Found $1","FORMAT"),8);
#print $1."\n";
}
$SOFV_fh->write($line);
}
$LOG_fh->write(msg("INFO","Removed $ErrCount occurrences of non TN dat
+a","FORMAT"),4);
msg("INFO","Removed $ErrCount occurrences of non TN data","PRINT");
$LOG_fh->write(msg("INFO","Found $tncount occurrences of TN at beginni
+ng of line","FORMAT"),4);
msg("INFO","Found $tncount occurrences of TN at beginning of line","PR
+INT");
$SwitchCount{$switch_id}{'REMOVED'} = $ErrCount;
$SwitchCount{$switch_id}{'TN_COUNT'} = $tncount;
$SOF_fh->close or throw TNBCriticalException("Cannot close switch outp
+ut file");
undef $SOF_fh;
undef $SOFV_fh;
undef $SOFR_fh;
return $tncount;
}
Simon |