My apologies, revised/cleaned code appears below:
#!/usr/bin/perl
#
# Builds a list of 10-K-family filings from EDGAR quarterly "company" index
# files, fetches each listed filing from the SEC archive, and appends a few
# fields from the filing's header to a pipe-delimited results file.
#
# Output per filing (one line, trailing '|' kept for compatibility):
#   cik|form_type|report_date|file_date|name|
#
use strict;
use warnings;

# Tie::File and Fcntl are no longer used below; they stay loaded so that any
# code relying on them being present keeps working.
use Tie::File;
use Fcntl;
use LWP::Simple;

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# First year you want downloaded files for:
my $startyear = 2016;
# Last year you want files for:
my $endyear = 2016;
# First qtr you want files for (usually 1). Applies to $startyear only.
my $startqtr = 1;
# Last qtr you want files for (usually 4). Applies to $endyear only.
my $endqtr = 1;
# The directory you want your index files to be stored in.
my $inddirect = "C:/Volumes/EDGAR1/Edgar/full-index";
# The directory you are going to download filings to.
my $direct = "G:/Research/SEC filings 10K and 10Q/Data";
# The file that will contain the filings you want to download.
my $outfile = "G:/Research/SEC filings 10K and 10Q/Data/sizefiles1.txt";
# The file that receives the extracted header fields.
my $write_dir = 'G:\Research\SEC filings 10K and 10Q\Data\Header Data\data2016.txt';
my $base_url  = 'http://www.sec.gov/Archives';

# Form types to keep. Each must be followed by a space in the index's form
# column, so amended ("10-K/A") and longer variants are only matched when
# listed here explicitly.
my @form_types = ('10-K', '10-K405', '10KSB', '10-KSB', '10KSB40', '10-KT', '10KT405');
my $form_alternation = join '|', map { quotemeta } @form_types;
my $wanted_form      = qr/^(?:$form_alternation) (?!\/)/;

# ---------------------------------------------------------------------------
# Main loop
# ---------------------------------------------------------------------------

my $file_count = 0;      # filings processed so far in this run
my %already_fetched;     # filing path => 1 once handled, to avoid repeats
my $list_mode = '>';     # first quarter replaces $outfile, later ones append

for my $yr ($startyear .. $endyear) {
    # Only the first year honours $startqtr and only the last year honours
    # $endqtr; every year in between runs quarters 1 through 4.
    my $first_qtr = $yr == $startyear ? $startqtr : 1;
    my $last_qtr  = $yr <  $endyear   ? 4         : $endqtr;

    for my $qtr ($first_qtr .. $last_qtr) {
        my @filings = collect_filings($yr, $qtr, $list_mode);
        $list_mode = '>>';

        # Check to see if the per-year directory exists. If not, create it.
        # Nothing in this script stores files there; the folder is created
        # only because earlier versions of the script did so.
        my $year_dir = "$direct/$yr";
        unless (-d $year_dir) {
            mkdir($year_dir) or die "Cannot create directory $year_dir: $!\n";
        }

        for my $filetoget (@filings) {
            # The same filing can be listed more than once in an index;
            # fetch it a single time.
            next if $already_fetched{$filetoget}++;

            my $fullfile = "$base_url/$filetoget";
            my $content  = get($fullfile);
            unless (defined $content) {
                warn "Could not fetch $fullfile; skipping\n";
                next;
            }

            my $header = extract_header($content);
            print join(', ', @{$header}{qw(cik form_type report_date file_date name)}), "\n";

            append_header_row($header);

            ++$file_count;
            print "$file_count files processed (latest: $fullfile)\n";
        }
    }
}

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# collect_filings($yr, $qtr, $mode)
#   Reads "$inddirect/company$qtr$yr.idx", writes the path of every filing
#   whose form type is wanted to $outfile (opened with $mode, '>' or '>>'),
#   prints a progress line per match, and returns the list of paths.
#   Dies if either file cannot be opened.
sub collect_filings {
    my ($yr, $qtr, $mode) = @_;

    my $index_file = "$inddirect/company$qtr$yr.idx";
    open(my $index_fh, '<', $index_file)
        or die "file for company$qtr$yr.idx: $!\n";
    open(my $list_fh, $mode, $outfile)
        or die "Cannot open $outfile: $!\n";

    my @filings;
    my $count = 1;
    while (my $line = <$index_fh>) {
        # Ignore the first 10 lines because they only contain header
        # information.
        next if $. < 11;
        # Too short to hold the fixed-width columns read below.
        next if length($line) < 99;

        my $form_type = substr($line, 62, 12);
        next unless $form_type =~ $wanted_form;

        # The file name is the last column, so take the rest of the line
        # instead of a fixed width that could truncate long paths.
        my $fullfilename = trim(substr($line, 98));
        next if $fullfilename eq '';

        print {$list_fh} "$fullfilename\n";
        print $count, " ", $form_type, " ", $base_url, "/", $fullfilename, "\n";
        $count++;
        push @filings, $fullfilename;
    }

    close($index_fh);
    close($list_fh) or die "Cannot close $outfile: $!\n";
    return @filings;
}

# extract_header($content)
#   Pulls the header fields out of a filing's full text. Returns a hash ref
#   with keys cik, form_type, report_date, file_date, name. A field that is
#   not found keeps its placeholder (-99 for numeric fields, "" for text) so
#   values never carry over from a previously processed filing.
sub extract_header {
    my ($content) = @_;

    my %header = (
        cik         => -99,
        form_type   => "",
        report_date => -99,
        file_date   => -99,
        name        => "",
    );

    # /m lets ^ and $ match at line boundaries inside the whole document, so
    # the text does not need to be split into lines first.
    if ($content =~ m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m) {
        $header{cik} = $1;
    }
    if ($content =~ m/^\s*FORM\s*TYPE:\s*(.*$)/m) {
        $header{form_type} = trim($1);
    }
    if ($content =~ m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m) {
        $header{report_date} = $1;
    }
    if ($content =~ m/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m) {
        $header{file_date} = $1;
    }
    if ($content =~ m/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m) {
        $header{name} = trim($1);
    }

    return \%header;
}

# append_header_row($header)
#   Appends one pipe-delimited row to $write_dir. The separator is written
#   explicitly rather than through the global $, so that other print
#   statements in the script are unaffected. Dies on open/close failure.
sub append_header_row {
    my ($header) = @_;

    open(my $out_fh, '>>', $write_dir)
        or die "Can't open file $write_dir: $!\n";
    print {$out_fh}
        join('|', @{$header}{qw(cik form_type report_date file_date name)}), "|\n";
    close($out_fh) or die "Can't close file $write_dir: $!\n";
    return;
}

# trim($phrase)
#   Returns $phrase with leading and trailing whitespace removed; an
#   undefined argument yields the empty string.
sub trim {
    my ($phrase) = @_;
    return '' unless defined $phrase;
    $phrase =~ s/^\s+//;
    $phrase =~ s/\s+$//;
    return $phrase;
}

In reply to Re^2: Getstore to avoid of memory? by wrkrbeee
in thread Getstore to avoid of memory? by wrkrbeee

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post, it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.