Hello PerlMonks, Inherited a program that creates the "Out of Memory" error. Modified the code to read one line at a time rather than store the entire file in a variable (or so I thought). Program runs/works, but still encounters the "Out of Memory" error (obviously I failed). I suspect that the problem lies with a foreach statement at line 59. I'm thinking that the notion of reading all files into an array is unnecessary. Hence, lines 51-59 can be dropped. However, my dilemma now is how do I assign the filename to $file in lines 63 and 86? Apologize for such a trivial question to experienced users. Program is below. I am grateful for any insight you may have. Thanks!!
#!/usr/bin/perl -w #use strict; # This program extracts data from an SEC filing, including chunks of t +ext use File::stat; #This program is going to obtain and extract the entire audit opinion +but you can #extract whatever text you are interested in by changing the regular e +xpressions for the start #and end strings below. #This program was written by Andy Leone, May 15 2007 and updated July + 25, 2008. #You are free to use this program for your own use. My only request i +s #that you make an acknowledgement in any research manuscripts that #benefit from the program. my $startstring='((^\s*?)((We\s*(have|were)\s*(audited\s*the\s*(Statem +ent\s*of\s*Financial\s*Condition|consolidated|accompanying|combined|b +alance\s*sheets)|(completed\s*|engaged\s*to\s*perform)\s*an\s*integra +ted\s*audit))|In\s*our\s*opinion,\s*the\s*(consolidated|accompanying) +))'; my $startstringhtm='(((We\s*(have|were)\s*(audited\s*the\s*(Statement\ +s*of\s*Financial\s*Condition|consolidated|accompanying|combined|balan +ce\s*sheets)|(completed\s*|engaged\s*to\s*perform)\s*an\s*integrated\ +s*audit))|In\s*our\s*opinion,\s*the\s*(consolidated|accompanying)))'; + #Specify the end of the text you are looking for. my $endstring='((^\s*)/s/|^\s*(Date:\s*)?(\d{1,2}\s*)?((January|Februa +ry|March|April|May|June|July|August|September|October|November|Decemb +er))\s*(\d{1,2},)?\s*\d{4}(\s*$|,\s{0,3}except|\s*/s|\s*s/|\d{1,2}))' +; my $endstringhtm='((>|^\s*)(/s/)?\s*(Date:\s*)?(\d{1,2}\s*)?(January|F +ebruary|March|April|May|June|July|August|September|October|November|D +ecember))\s*(&\w+?;\s*)?(\d{1,2},)?\s*\d{4}(\s*$|\s*[,\(]\s{0,3}excep +t|\s*with\s*respect\s*to\s*our\s*opinion|<\/P>|<BR>|\s{0,1}\<\/FONT\> +|\d{1,2})'; #Specify the directory containing the files that you want to read my $direct="E:\\Research\\SEC filings 10K and 10Q\\Data\\Filing Docs\\ +2008test"; my $outfile="E:\\Research\\SEC filings 10K and 10Q\\Data\\Header Data\ +\Data2008test.txt"; #If Windows "\\", if Mac "/"; my $slash='\\'; $outfiler=">$outfile"; open(OUTPUT, "$outfiler") || die "file for 2006 1: $!"; #The following two steps open the directory containing the files you p +lan to read #and then stores the name of each file in an array called @New1. opendir(DIR1,"$direct")||die "Can't open directory"; my @New1=readdir(DIR1); #We will now loop through each file. THe file names #have been stored in the array called @New1; foreach $file(@New1) { #This prevents me from reading the first two entries in a directory . +and ..; if ($file=~/^\./){next;} #Initialize the variable names. my $cik=-99; my $form_type=""; my $report_date=-99; my $file_date=-99; my $name=""; my $sic=-99; my $HTML=0; my $Audit_Opinion="Not Found"; my $Going_Concern=0; my $ao="Not Found"; my $tree="Empty"; my $data=""; #Open the file and put the file in variable called $data #$data will contain the entire filing { # this step removes the default end of line character (\n) # so the the entire file can be read in at once. local $/; #read the contents into data open (INPUT, "$direct$slash"."$file"); while ($data=<INPUT> ) { #The following steps obtain basic data from the filings if ($data=~m/<HTML>/i){$HTML=1;} if($data=~m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m){$cik=$1;} if($data=~m/^\s*FORM\s*TYPE:\s*(.*$)/m){$form_type=$1;} if($data=~m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m){$report +_date=$1;} if($data=~m/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m){$file_date=$1;} if($data=~m/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m){$name=$1;} if($data=~m/^\s*STANDARD\s*INDUSTRIAL\s*CLASSIFICATION:.*?\[(\d{4})/ +m){$sic=$1;} my $filesize = -s $direct . '/' . $file; my $sb = stat($direct . '/' . $file)->size; print OUTPUT "$cik,$form_type,$report_date,$file_date,$name,$sic,$file +size,$sb\n"; } close INPUT or die "cannot close $file: $!"; } }
In reply to READ one line at a time by wrkrbeee
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |