For the fiscal year ending December 31, 2015 #### #!/usr/bin/perl -w use strict; use warnings; use Getopt::Long qw/GetOptions/; my $base_url = 'http://www.sec.gov/Archives'; my $debug =0; my $stillerror =0; my $waserror =0; my $percent =5; my $onlyrun =0; my $idxgz =''; my $sleep =0; GetOptions ("debug=i" => \$debug ,"stillerror!" => \$stillerror ,"waserror!" => \$waserror ,"percent=i" => \$percent # 0 means run all ,"onlyrun=i" => \$onlyrun # 0 means run all ,"sleep=i" => \$sleep # between process steps ,"idxgz=s" => \$idxgz ) or die("Error in command line arguments\n"); my @aonly; # need to be cd'd to dir with these in it or -idxgz needs full path # 2016QTR4company.gz from https://www.sec.gov/Archives/edgar/full-index/2016/QTR4/company.gz # 2016QTR3company.gz from https://www.sec.gov/Archives/edgar/full-index/2016/QTR3/company.gz # 2016QTR2company.gz from https://www.sec.gov/Archives/edgar/full-index/2016/QTR2/company.gz # 2016QTR1company.gz from https://www.sec.gov/Archives/edgar/full-index/2016/QTR1/company.gz if (0) { #Assign variable to file with URLs; my $urls = 'c:/my documents/research/sec filings/10K and 10Q/data/urls/sizefiles1.txt'; #my $urls = 'g:/research/SEC filings 10K and 10Q/data/urls'; #open text file with URLs, read URLs into array; open (my $fh, "<",$urls) or die "can't open $urls: $!"; while (my $url=<$fh>) {chomp $url; push @aonly,$url;} #close text file with URLs; close $fh or die "Cannot closee $urls: $!"; } if ($idxgz) { from_idx(\@aonly,$percent,$idxgz); } if ($waserror) { # had problems my @add=qw! edgar/data/1606163/0001144204-16-089184.txt edgar/data/1496443/0001019687-16-005668.txt edgar/data/1375195/0001144204-16-091408.txt edgar/data/910638/0001171843-16-008156.txt edgar/data/812149/0001144204-16-074455.txt edgar/data/786947/0001144204-16-085018.txt edgar/data/1459417/0001047469-16-010989.txt edgar/data/34782/0000034782-16-000102.txt edgar/data/1110783/0001110783-16-000532.txt edgar/data/1499275/0001499275-16-000012.txt edgar/data/2969/0001193125-16-773346.txt edgar/data/1446806/0001446806-16-000020.txt !; print '** adding had problems:'.scalar(@add)."\n"; unshift @aonly,@add; # put them first } if ($stillerror) { # still a prob my @add=qw! edgar/data/1167419/0001079973-16-000870.txt edgar/data/1445918/0001079973-16-000855.txt edgar/data/1031093/0001079973-16-000755.txt edgar/data/1568079/0001079973-16-000879.txt edgar/data/1493137/0001079973-16-000865.txt edgar/data/1598893/0001079973-16-000780.txt edgar/data/1353970/0001079973-16-000846.txt edgar/data/1628468/0001493152-16-013426.txt edgar/data/62234/0001144204-16-088204.txt edgar/data/715812/0000715812-16-000006.txt edgar/data/786110/0001193125-16-751374.txt edgar/data/1584754/0001615774-16-007904.txt edgar/data/1634293/0001690824-16-000006.txt edgar/data/786110/0001193125-16-751374.txt edgar/data/1432967/0001683168-16-000003.txt edgar/data/715812/0000715812-16-000006.txt edgar/data/715812/0000715812-16-000007.txt edgar/data/1496741/0001515971-16-000566.txt edgar/data/1622244/0001213900-16-012573.txt edgar/data/1413507/0001413507-16-000126.txt !; print '** adding still problems:'.scalar(@add)."\n"; unshift @aonly,@add; # put them first } print '** @aonly size:'.scalar(@aonly)."\n"; my $file_count=0; my $FH_OUT; my $fn_OUT = "c:/my documents/research/sec filings/Data2016_fiscal_year.txt"; #open ( $FH_OUT, '>>', $fn_OUT) or die "Couldn't open $!"; $FH_OUT=\*STDOUT; my @fields=qw/cik form_type report_date file_date name fiscal_year_ended/; my $mons=qr/January|February|March |April|May|June |July|August|September |October|November|December |Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec /ix; foreach my $filetoget(@aonly) { last if ($onlyrun && $file_count>=$onlyrun); my $fullfile="$base_url/$filetoget"; my $partial=get_trunc ($fullfile,100000); my $res=process ($partial,$fullfile); if (scalar(keys(%$res))) { my $lineout=''; for my $field (@fields){ if ($res->{$field}) {$lineout.=$res->{$field}} $lineout.='|'; } print $FH_OUT $lineout."\n"; } $file_count++; sleep $sleep if ($sleep); } print '** Processed:'.$file_count."\n"; close ($FH_OUT); exit; sub get_trunc { # content_cb hints via http://www.perlmonks.org/?node_id=1183107 my $fullfile=shift; my $truncsize=shift; use LWP::UserAgent; my $received_size = 0; my $partial = ''; my $ua = LWP::UserAgent->new; my $response = $ua->get($fullfile , ':content_cb'=> sub { my ($data, $response, $protocol) = @_; $partial.=$data; $received_size += length $data; # die inside this callback interrupts the request, not the program!! die if ($received_size>$truncsize); } ); return $partial; } # get_trunc sub process { use HTML::Parser (); my $partial=shift; my $fullfile=shift; my $res={}; #print $fullfile."\n"; if ($partial) { # when the search was for the simple text fields it was easy ... but ... # there was all sorts of html tags before and in the middle of dates, # the  's and dates split across lines # so just get rid of all the htmp so the date is inside the next 88 chars of trigger phrase my $line=''; my $hp = HTML::Parser->new( api_version => 3, text_h=>[ sub {$line .= shift},'dtext' ] ); $hp->parse( $partial ); # use HTML::Entities qw/decode_entities/; # my $line=decode_entities($line); # this is redundent after change to dtext rather than text { if($line=~m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m) {$res->{cik} =$1;} if($line=~m/^\s*FORM\s*TYPE:\s*(.*$)/m) {$res->{form_type} =$1;} if($line=~m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m) {$res->{report_date}=$1;} if($line=~m/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m) {$res->{file_date} =$1;} if($line=~m/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m) {$res->{name} =$1;} if($line=~m/(for\s+the\s+fiscal\s+year.*?\s+end.*?\s+.{88})/si){ my $next_n=$1; $next_n=~s/[^ a-zA-Z0-9,]+/ /gs; # be brutal $next_n=~s/ {2,}/ /gs; if($next_n=~m/($mons)\s+(\d{1,2})(?:st|th)*,?\s+(\d{2,4})/i){ # december 31, 2015 style $res->{fiscal_year_ended} =$1.' '.$2.', '.$3; } elsif($next_n=~m/(\d{1,2})\s+($mons)\s+(\d{2,4})/i){ # 30 december 2015 style $res->{fiscal_year_ended} =$2.' '.$1.', '.$3; } unless( (!$waserror) && ($res->{fiscal_year_ended}) ) { print '** '.$fullfile."\n"; print '** '.$next_n."\n"; # from before being brutal # my $hex=unpack("H*", $next_n); # my $len=length($hex); # print '** '; for (my $i=0;$i<$len;$i=$i+2){print substr($hex,$i,1);}print "\n"; # print '** '; for (my $i=1;$i<$len;$i=$i+2){print substr($hex,$i,1);}print "\n"; } } } } # success return $res; } # process sub from_idx{ my ($aonly,$percent,$fn)=@_; use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ; use IO::File; my $idx= new IO::Uncompress::Gunzip $fn or die "IO::Uncompress::Gunzip failed: $GunzipError\n"; my $tested=0; my $ct=0; my $elig=0; while (my $line=<$idx>) { # last if ($ct>=50); #testing $tested++; my $ok=0; if ($line=~/10\-K/){ $ok=1;} # if ($line=~/10\-Q/){ $ok=1;} # 10-q dont have any dates next unless ($ok); chomp $line; $elig++; next if ($percent && $percent<=rand(100)); # % chance to be included my @data=split(' ',$line); push @$aonly,$data[-1]; $ct++; } close $idx; print '** Fn::'.$fn."\n"; print '** Tested:'.$tested."\n"; print '** Elig :'.$elig."\n"; print '** Used :'.$ct."\n"; } # from_idx #### # perl -still #### ** For the fiscal year ended or X TRANSITION REPORT PURSUANT TO SECTION 13 OR 15 d OF THE SEC #### ** FOR THE FISCAL YEAR ENDED 12 31 2015 UNITED STATES SECURITIES AND EXCHANGE COMMISSION WASHINGTON, D C 2