#!/usr/bin/perl -w
#
# Scan a directory of SEC 8-K filings for announcements about an executive's
# death, illness or leave of absence.  For every filing that matches, one
# tab-separated row (file name, header fields and the matching text snippet)
# is appended to the output file and the filing itself is copied into the
# output directory.
use strict;
use warnings;

use Benchmark;
# Get the HTML-Format package from the package manager.
use HTML::Formatter;
# Get the HTML-TREE from the package manager
use HTML::TreeBuilder;
use HTML::FormatText;
use File::Copy;

my $startTime = Benchmark->new;

# Specifies the directory of the output file
my $outfile = "D:\\output\\2009\\2009.01.xls";
my $outdir  = "D:\\output\\2009\\01";

# Specifies the directory with the 8-Ks
my $direct = "D:\\8K\\2009\\01";

# Primary anchor: an executive title followed, within 200 characters, by a
# death / illness / leave keyword.
my $anchorstring = '(chief\s*exec|ceo|chairman|founder|president)(.){0,200}(\s+dies|\s+died|\s+passed|\s+death|(medical(.){0,30}leave)|(leave(.){0,30}medical)|(diagnos(.){0,30}cancer)|(leave\s*of\s*absence)|(personal\s*reasons)|(medical\s*treatment))';

# Fallback anchor: the keyword comes first, the executive title afterwards.
my $anchorstring2 = '(death|passing)(.){0,200}(chief\s*exec|ceo|chairman|founder|president)';

# Compile the two "anchor plus up to 200 characters of context on each side"
# patterns once, instead of re-interpolating them for every filing.  Capture
# group 1 is the whole snippet.  Order matters: the primary anchor is tried
# first, the fallback only when the primary one does not match.
my @context_patterns =
    map { qr/((?:.){0,200}$_(?:.){0,200})/is } $anchorstring, $anchorstring2;

# Collect the names of the files to read.  Dot entries (including . and ..)
# are skipped, and so is anything that is not a plain file, so a stray
# subdirectory cannot abort the whole run.
opendir(my $dir_handle, $direct) or die "Can't open directory $direct: $!";
my @filings = grep { !/^\./ && -f "$direct\\$_" } readdir($dir_handle);
closedir($dir_handle);

# Open (and overwrite) the output file
# Print first line of output file
open(my $output, '>', $outfile) or die "can't open $outfile: $!";
print {$output} "file \t form_type \t HTML \t cik \t report_date \t file_date \t name \t text \n";

my $total    = scalar @filings;
my $icounter = 0;

# Loop over each file in the array
for my $file (@filings) {
    $icounter++;
    print " File number $icounter out of $total \n";

    my $path = "$direct\\$file";

    # Read the entire filing into $data in one go.
    my $data = do {
        open(my $slurp, '<', $path) or die "can't open $file: $!";
        local $/;    # no record separator: a single read returns the whole file
        my $content = <$slurp>;
        close($slurp) or die "cannot close $file: $!";
        defined $content ? $content : '';
    };

    # Basic data from the filing header; the defaults mark "not found".
    my $cik         = -99;
    my $report_date = -99;
    my $file_date   = -99;
    my $form_type   = "Not Found";
    my $name        = "";
    my $HTML        = ($data =~ m/<HTML>/i) ? 1 : 0;

    if ($data =~ m/^\s*FORM\s*TYPE:\s*(.*$)/m)                         { $form_type   = $1; }
    if ($data =~ m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m)               { $cik         = $1; }
    if ($data =~ m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m)    { $report_date = $1; }
    if ($data =~ m/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m)              { $file_date   = $1; }
    if ($data =~ m/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m)          { $name        = $1; }

    # Extract the announcement snippet; filings without one are skipped.
    my $announcement_text = find_announcement(\$data, $HTML, @context_patterns);
    next unless defined $announcement_text;

    # Clean up a bit
    $announcement_text =~ s/[^[:ascii:]]+//g;
    $announcement_text =~ s/\s+/ /mg;

    print {$output} "$file \t $form_type \t $HTML \t $cik \t $report_date \t $file_date \t $name \t $announcement_text \n";
    copy($path, "$outdir\\$file")
        or die "can't copy $path to $outdir: $!";
}

# Buffered write errors only surface at close, so check it.
close($output) or die "cannot close $outfile: $!";

# Show how long it took to run the program
my $endTime = Benchmark->new;
my $runTime = timediff($endTime, $startTime);
print("Processing files took ", timestr($runTime));

# find_announcement(\$data, $is_html, @patterns)
#
# Tries each compiled pattern in order against the filing text and returns
# the snippet captured by the first one that matches.  For HTML filings the
# snippet is converted to plain text first.  Returns nothing (undef in
# scalar context) when no pattern matches.
#
#   \$data    - reference to the filing text (a reference avoids copying
#               the whole filing on every call)
#   $is_html  - true when the filing contains an <HTML> tag
#   @patterns - compiled regexes whose capture group 1 is the snippet
sub find_announcement {
    my ($data_ref, $is_html, @patterns) = @_;

    for my $pattern (@patterns) {
        next unless ${$data_ref} =~ $pattern;
        my $snippet = $1;
        return $is_html ? html_to_text($snippet) : $snippet;
    }
    return;
}

# html_to_text($fragment)
#
# Strips HTML markup from $fragment and returns the text formatted by
# HTML::FormatText with a left margin of 0 and a right margin of 60.
sub html_to_text {
    my ($fragment) = @_;

    my $tree = HTML::TreeBuilder->new;
    $tree->parse($fragment);
    # Signal end of input; without it the parser may still be holding
    # trailing text that never makes it into the tree.
    $tree->eof;

    my $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 60);
    my $text      = $formatter->format($tree);

    # Release the tree explicitly once the text has been produced.
    $tree->delete;
    return $text;
}
In reply to Help with code optimization by hyu968
| For: | Use: |
| & | &amp; |
| < | &lt; |
| > | &gt; |
| [ | &#91; |
| ] | &#93; |