in reply to Re: Getstore to avoid of memory?
in thread Getstore to avoid of memory?
#!/usr/bin/perl -w #use strict; use Tie::File; use Fcntl; use LWP::Simple; #First year you want downloaded files for for: my $startyear=2016; #Last year you want files for: my $endyear=2016; #First qtr you want files for (usually 1): my $startqtr=1; #Last qtr you want files for (usually 4): my $endqtr=1; #The directory you want your index files to be stored in. my $inddirect="C:/Volumes/EDGAR1/Edgar/full-index"; #The directory you are going to download filings to my $direct="G:/Research/SEC filings 10K and 10Q/Data"; #The file that will contain the filings you want to download. my $outfile="G:/Research/SEC filings 10K and 10Q/Data/sizefiles1.txt"; #Specify the directory containing the results/output; my $write_dir = 'G:\Research\SEC filings 10K and 10Q\Data\Header Data\ +data2016.txt'; my $base_url = 'http://www.sec.gov/Archives'; my $line_count=0; my $cik=-99; my $form_type=""; my $report_date=-99; my $file_date=-99; my $name=""; my $count=0; #Initialize file counter variable; my $file_count = 0; my $formget1='(10-K )'; my $formget2='(10-K405 )'; my $formget3='(10KSB )'; my $formget4='(10-KSB )'; my $formget5='(10KSB40 )'; my $formget6='(10-KT )'; my $formget7='(10KT405 )'; my $slash='/'; for($yr=$startyear;$yr<=$endyear;$yr++) { #loop through all the index quarters you specified if($yr<$endyear){$eqtr=4}else{$eqtr=$endqtr} for($qtr=$startqtr;$qtr<=$eqtr;$qtr++) { #Open the index file open(INPUT, "$inddirect/company$qtr$yr.idx") || die "file for company$ +qtr$yr.idx: $!"; #Open the file you want to write to. The first time through #the file is opened to "replace" the existing file. #After that, it is opened to append ">>". if ($yr==$startyear && $qtr==$startqtr) {$outfiler=">$outfile";} else{$outfiler=">>$outfile";} open(OUTPUT, "$outfiler") || die "file for 2006 1: $!"; $count=1; while ($line=<INPUT>) { #ignore the first 10 lines because they only contain header informatio +n if ($.<11) {next}; $company_name=substr($line,0,60); $form_type=substr($line,62,12); my $cik=substr($line,74,10); $file_date=substr($line,86,10); $file_date=~s/\-//g; my $fullfilename=trim(substr($line,98,43)); if ($form_type=~/^$formget1(?!\/)/) { print OUTPUT "$fullfilename\n" ; $count++; print $line_count, " ", $form_type, " ", $base_url,"/",$fullfilena +me,"\n"; } elsif ($form_type=~/^$formget2(?!\/)/) { print OUTPUT "$fullfilename\n" ; $count++; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\n +"; } elsif ($form_type=~/^$formget3(?!\/)/) { print OUTPUT "$fullfilename\n" ; $count++; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\n +"; } elsif ($form_type=~/^$formget4(?!\/)/) { print OUTPUT "$fullfilename\n" ; $count++; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\n +"; } elsif ($form_type=~/^$formget5(?!\/)/) { print OUTPUT "$fullfilename\n" ; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\ +n"; $count++; } elsif ($form_type=~/^$formget6(?!\/)/) { print OUTPUT "$fullfilename\n" ; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\ +n"; $count++; } elsif ($form_type=~/^$formget7(?!\/)/) { print OUTPUT "$fullfilename\n" ; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\ +n"; $count++; } } close(INPUT); close(OUTPUT); # check to see if directory exists. If not, create it. unless(-d "$direct$slash$yr"){ mkdir("$direct$slash$yr") or die; } #Open the directory and get put the names of all files into the array +@old opendir(DIR,"$direct$slash$yr")||die "Can't open directory"; @Old=readdir(DIR); #The tie statement assigns the file containing the #files you want to download to the array @New1. tie(@New1,Tie::File,"$outfile", mode=> O_RDWR) or die "Cannot tie file BOO: $!n"; #checks to see what files on the current index listing are not in the +directory #defines a hash called seen. %seen=(); #defines an array called @aonly. @aonly=(); #build lookup table. This step is building a lookup table(hash). #each filename (from OLD) has a value of 1 assigned to it. foreach $item(@Old){$seen{$item}=1} #for each item in the New1 array, which we got from the txt file #containing all the files we want to download, add #it to the array, @aonly, as long is it is not already #in the current directory. We do this so we don't download #a file we have already downloaded. foreach $item(@New1){ $item=~/(edgar\/data\/.*\/)(.*\.txt)/; unless($seen{$item}){ push(@aonly,$item); } } #downloads all the files in the @oanly array which are the files not i +n the directory foreach $filetoget(@aonly) { $fullfile="$base_url/$filetoget"; $fonly=$filetoget; for my $line (split qr/\'\n'/, get($fullfile)) { while ($line_count < 2) { if($line=~m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m){$cik=$1;} if($line=~m/^\s*FORM\s*TYPE:\s*(.*$)/m){$form_type=$1;} if($line=~m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m){$ +report_date=$1;} if($line=~m/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m){$file_date= +$1;} if($line=~m/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m){$name=$ +1;} $line_count++; print "$cik, $form_type, $report_date, $file_date, $name\n"; print "$line_count\n"; } # end of while loop; } # end of for my $line split loop; ### Now write the results to file!; #Open the ouput file; open my $FH_OUT, '>>',$write_dir or die "Can't open file $write_dir"; #Save/write results/output; $,='|'; print $FH_OUT "$cik$,$form_type$,$report_date$,$file_date$,$name$,\n"; $line_count=0; #Update file counter; ++$file_count; print "$file_count lines read from $fullfile\n"; #closedir($dir_handle); close($FH_OUT); } # end of foreach file to get loop; #end of qtr loop } #end of year loop } sub trim { my $new_phrase; my $phrase = shift(@_); $phrase =~ s/^\s+//; $phrase =~ s/\s+$//; $new_phrase = "$phrase"; return "$new_phrase"; }
|
---|
Replies are listed 'Best First'. | |
---|---|
Re^3: Getstore to avoid of memory?
by GotToBTru (Prior) on Mar 01, 2017 at 22:42 UTC | |
Re^3: Getstore to avoid of memory?
by huck (Prior) on Mar 02, 2017 at 01:59 UTC | |
by huck (Prior) on Mar 02, 2017 at 02:25 UTC | |
by wrkrbeee (Scribe) on Mar 02, 2017 at 14:01 UTC | |
Re^3: Getstore to avoid of memory?
by Anonymous Monk on Mar 02, 2017 at 00:26 UTC |