comment on

My apologies, revised/cleaned code appears below:


#!/usr/bin/perl -w

#use strict;

use Tie::File;
use Fcntl;
use LWP::Simple;

# ---------------------------------------------------------------------------
# Run configuration and shared state.
# ---------------------------------------------------------------------------

#First year you want downloaded files for:
my $startyear=2016;
#Last year you want files for:
my $endyear=2016;
#First qtr you want files for (usually 1):
my $startqtr=1;
#Last qtr you want files for (usually 4):
my $endqtr=1;

#The directory you want your index files to be stored in.
my $inddirect="C:/Volumes/EDGAR1/Edgar/full-index";
#The directory you are going to download filings to
my $direct="G:/Research/SEC filings 10K and 10Q/Data";
#The file that will contain the filings you want to download.
my $outfile="G:/Research/SEC filings 10K and 10Q/Data/sizefiles1.txt";
#The FILE (despite the variable name) that the extracted header records are
#appended to.  It must be a single-line literal: a line break inside the
#quotes becomes part of the path.
my $write_dir = 'G:\Research\SEC filings 10K and 10Q\Data\Header Data\data2016.txt';
#Every filing path taken from the index is appended to this URL prefix.
my $base_url  = 'http://www.sec.gov/Archives';

#Header fields of the filing currently being parsed; -99 / "" mean "not found".
my $line_count=0;
my $cik=-99;
my $form_type="";
my $report_date=-99;
my $file_date=-99;
my $name="";

#Running number of index entries that matched a wanted form type.
my $count=0;

#Initialize file counter variable;
my $file_count = 0;

#Form types to collect.  Each value is used as a regex fragment anchored at
#the start of the index's form-type column; the trailing blank keeps "10-K "
#from also matching "10-K405", and amended forms ("10-K/A") are excluded by
#the matching code.
my $formget1='(10-K )';
my $formget2='(10-K405 )';
my $formget3='(10KSB )';
my $formget4='(10-KSB )';
my $formget5='(10KSB40 )';
my $formget6='(10-KT )';
my $formget7='(10KT405 )';
#Path separator used when building the per-year download directory.
my $slash='/';

# Main driver.  For every requested year/quarter:
#   1. read the quarterly company index and keep the paths of filings whose
#      form type matches one of $formget1..$formget7 (also written to $outfile);
#   2. make sure the per-year directory under $direct exists and list it;
#   3. download each wanted filing that is not already in that directory,
#      pull five header fields out of it and append them to $write_dir as one
#      '|'-separated record.
# Dies on any file/directory error; a failed download is reported with warn
# and skipped so one bad URL does not abort the run.

# One pattern built from the configured form types replaces seven identical
# elsif branches.  The (?!\/) look-ahead rejects amended forms such as 10-K/A.
my $form_alternatives = join '|',
    $formget1, $formget2, $formget3, $formget4,
    $formget5, $formget6, $formget7;
my $wanted_form_re = qr/^(?:$form_alternatives)(?!\/)/;

# Filing paths already handled during this run.  The index repeats a path when
# it is listed under several companies; each filing is fetched only once.
my %processed;

for my $yr ($startyear .. $endyear) {

    # Only the first year starts at $startqtr and only the last year stops at
    # $endqtr; every year in between covers quarters 1 through 4.
    my $first_qtr = ($yr == $startyear) ? $startqtr : 1;
    my $last_qtr  = ($yr == $endyear)   ? $endqtr   : 4;

    #loop through all the index quarters you specified
    for my $qtr ($first_qtr .. $last_qtr) {

        #Open the index file
        my $index_file = "$inddirect/company$qtr$yr.idx";
        open(my $index_fh, '<', $index_file)
            or die "file for company$qtr$yr.idx: $!";

        #Open the file you want to write to.  The first time through
        #the file is opened to replace the existing file.
        #After that, it is opened to append.
        my $list_mode = ($yr == $startyear && $qtr == $startqtr) ? '>' : '>>';
        open(my $list_fh, $list_mode, $outfile)
            or die "Can't open $outfile: $!";

        # Paths found in THIS quarter's index.  Working from this list (instead
        # of re-reading all of $outfile) keeps earlier quarters from being
        # downloaded again.
        my @wanted_files;
        my $match_count = 0;

        while (my $line = <$index_fh>) {

            #ignore the first 10 lines because they only contain header information
            next if $. < 11;

            # The file name starts at column 98; a shorter line cannot hold one.
            next if length($line) < 99;

            my $index_form = substr($line, 62, 12);
            next unless $index_form =~ $wanted_form_re;

            my $fullfilename = trim(substr($line, 98, 43));
            print {$list_fh} "$fullfilename\n";
            push @wanted_files, $fullfilename;

            $match_count++;
            print "$match_count $index_form $base_url/$fullfilename\n";
        }

        close($index_fh);
        close($list_fh) or die "Can't close $outfile: $!";

        # check to see if directory exists.  If not, create it.
        my $year_dir = "$direct$slash$yr";
        unless (-d $year_dir) {
            mkdir($year_dir) or die "Can't create directory $year_dir: $!";
        }

        #Build a lookup table of the file names already in the directory so
        #we don't download a file we have already downloaded.
        opendir(my $dir_handle, $year_dir)
            or die "Can't open directory $year_dir: $!";
        my %on_disk = map { $_ => 1 } readdir($dir_handle);
        closedir($dir_handle);

        #downloads all the wanted files which are not in the directory
        for my $filetoget (@wanted_files) {

            # Directory entries are bare file names, so compare on the last
            # path component rather than on the whole edgar/data/... path.
            my ($basename) = $filetoget =~ m{([^/]+)\z};
            $basename = $filetoget unless defined $basename;
            next if $on_disk{$basename};
            next if $processed{$filetoget}++;

            my $fullfile = "$base_url/$filetoget";

            # get() returns undef on failure; skip instead of writing a record
            # made of stale or default values.
            my $content = get($fullfile);
            unless (defined $content) {
                warn "Could not download $fullfile; skipping\n";
                next;
            }

            # Each field is looked up once in the whole document (/m lets ^
            # anchor at any line start) and starts from its "not found"
            # default, so nothing leaks over from the previous filing.
            my $cik = $content =~ /^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m
                ? $1 : -99;
            my $form_type = $content =~ /^\s*FORM\s*TYPE:\s*(.*$)/m
                ? trim($1) : "";
            my $report_date = $content =~ /^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m
                ? $1 : -99;
            my $file_date = $content =~ /^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m
                ? $1 : -99;
            my $name = $content =~ /^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m
                ? trim($1) : "";

            print "$cik, $form_type, $report_date, $file_date, $name\n";

            ### Now write the results to file!;

            #Open the output file;
            open(my $FH_OUT, '>>', $write_dir)
                or die "Can't open file $write_dir: $!";

            #Save/write results/output.  The record is joined explicitly; the
            #global $, is left alone so later list-form prints are unaffected.
            my $record = join('|', $cik, $form_type, $report_date, $file_date, $name) . "|\n";
            print {$FH_OUT} $record;

            close($FH_OUT) or die "Can't close file $write_dir: $!";

            #Update file counter (this counts filings, not lines);
            ++$file_count;

            print "$file_count lines read from $fullfile\n";

        }         # end of foreach file to get loop;

    #end of qtr loop
    }
#end of year loop
}

# trim($text): return a copy of $text with leading and trailing whitespace
# removed.  Interior whitespace is left untouched and the caller's value is
# not modified.
sub trim {
    my ($text) = @_;

    # Alias the private copy to $_ so both substitutions act on it in place.
    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }

    return "$text";
}
[download]

In reply to Re^2: Getstore to avoid of memory? by wrkrbeee
in thread Getstore to avoid of memory? by wrkrbeee

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&amp;`
	<		`&lt;`
	>		`&gt;`
	[		`&#91;`
	]		`&#93;`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.