For the fiscal year ending December 31, 2015
####
#!/usr/bin/perl -w
use strict; use warnings;
use Getopt::Long qw/GetOptions/;
# Root URL that every relative EDGAR path (edgar/data/...) is appended to.
my $base_url = 'http://www.sec.gov/Archives';

# Command-line settings with their defaults:
#   $debug      integer; parsed here but not read by the visible code
#   $stillerror when set, the "still a problem" filings are queued first
#   $waserror   when set, the "had problems" filings are queued first and
#               process() prints its diagnostics for every filing
#   $percent    percent chance that an eligible index line is used
#   $onlyrun    maximum number of filings to process
#   $sleep      seconds to sleep after each filing
#   $idxgz      path to a gzipped EDGAR company index to draw filings from
my ($debug, $stillerror, $waserror) = (0, 0, 0);
my ($percent, $onlyrun, $sleep)     = (5, 0, 0);
my $idxgz = '';

GetOptions(
    "debug=i"     => \$debug,
    "stillerror!" => \$stillerror,
    "waserror!"   => \$waserror,
    "percent=i"   => \$percent,    # 0 means run all
    "onlyrun=i"   => \$onlyrun,    # 0 means run all
    "sleep=i"     => \$sleep,      # between process steps
    "idxgz=s"     => \$idxgz,
) or die("Error in command line arguments\n");
# Work list: relative EDGAR paths of the filings to fetch and process.
my @aonly;
# need to be cd'd to dir with these in it or -idxgz needs full path
# 2016QTR4company.gz from https://www.sec.gov/Archives/edgar/full-index/2016/QTR4/company.gz
# 2016QTR3company.gz from https://www.sec.gov/Archives/edgar/full-index/2016/QTR3/company.gz
# 2016QTR2company.gz from https://www.sec.gov/Archives/edgar/full-index/2016/QTR2/company.gz
# 2016QTR1company.gz from https://www.sec.gov/Archives/edgar/full-index/2016/QTR1/company.gz

# Disabled alternative source: read one path per line from a local text file.
# Change the condition to a true value to enable it.
if (0)
{
    #Assign variable to file with URLs;
    my $urls = 'c:/my documents/research/sec filings/10K and 10Q/data/urls/sizefiles1.txt';
    #my $urls = 'g:/research/SEC filings 10K and 10Q/data/urls';
    #open text file with URLs, read URLs into array;
    open (my $fh, "<",$urls) or die "can't open $urls: $!";
    while (my $url=<$fh>) {chomp $url; push @aonly,$url;}
    #close text file with URLs;
    close $fh or die "Cannot close $urls: $!";
}

# With -idxgz, append filings sampled from the gzipped company index.
if ($idxgz) {
    from_idx(\@aonly,$percent,$idxgz);
}
# With -waserror, queue the filings that previously gave trouble ahead of
# everything already in @aonly.
if ($waserror) {
    my @had_problems = qw(
        edgar/data/1606163/0001144204-16-089184.txt
        edgar/data/1496443/0001019687-16-005668.txt
        edgar/data/1375195/0001144204-16-091408.txt
        edgar/data/910638/0001171843-16-008156.txt
        edgar/data/812149/0001144204-16-074455.txt
        edgar/data/786947/0001144204-16-085018.txt
        edgar/data/1459417/0001047469-16-010989.txt
        edgar/data/34782/0000034782-16-000102.txt
        edgar/data/1110783/0001110783-16-000532.txt
        edgar/data/1499275/0001499275-16-000012.txt
        edgar/data/2969/0001193125-16-773346.txt
        edgar/data/1446806/0001446806-16-000020.txt
    );
    my $how_many = @had_problems;
    print '** adding had problems:', $how_many, "\n";
    unshift @aonly, @had_problems;    # put them first
}
# With -stillerror, queue the filings that are still failing ahead of
# everything already in @aonly.
if ($stillerror)
{
    # still a prob
    # The hand-maintained list repeats some filings; keep only the first
    # occurrence of each so no filing is fetched and reported twice.
    my %seen;
    my @add = grep { !$seen{$_}++ } qw!
        edgar/data/1167419/0001079973-16-000870.txt
        edgar/data/1445918/0001079973-16-000855.txt
        edgar/data/1031093/0001079973-16-000755.txt
        edgar/data/1568079/0001079973-16-000879.txt
        edgar/data/1493137/0001079973-16-000865.txt
        edgar/data/1598893/0001079973-16-000780.txt
        edgar/data/1353970/0001079973-16-000846.txt
        edgar/data/1628468/0001493152-16-013426.txt
        edgar/data/62234/0001144204-16-088204.txt
        edgar/data/715812/0000715812-16-000006.txt
        edgar/data/786110/0001193125-16-751374.txt
        edgar/data/1584754/0001615774-16-007904.txt
        edgar/data/1634293/0001690824-16-000006.txt
        edgar/data/786110/0001193125-16-751374.txt
        edgar/data/1432967/0001683168-16-000003.txt
        edgar/data/715812/0000715812-16-000006.txt
        edgar/data/715812/0000715812-16-000007.txt
        edgar/data/1496741/0001515971-16-000566.txt
        edgar/data/1622244/0001213900-16-012573.txt
        edgar/data/1413507/0001413507-16-000126.txt
    !;
    print '** adding still problems:'.scalar(@add)."\n";
    unshift @aonly,@add; # put them first
}
# Total number of filings queued from all sources.
print '** @aonly size:'.scalar(@aonly)."\n";
# Count of filings handled so far by the main loop.
my $file_count = 0;

# Results file name; only the disabled open() below refers to it.
my $fn_OUT = 'c:/my documents/research/sec filings/Data2016_fiscal_year.txt';

# Result rows currently go to STDOUT. To append to $fn_OUT instead:
#open ( $FH_OUT, '>>', $fn_OUT) or die "Couldn't open $!";
my $FH_OUT = \*STDOUT;

# Column order of each pipe-delimited output row.
my @fields = (
    'cik',
    'form_type',
    'report_date',
    'file_date',
    'name',
    'fiscal_year_ended',
);

# Case-insensitive month matcher. Full names are listed before the
# three-letter abbreviations so the longer spelling is tried first.
my $mons = do {
    my @full  = qw(January February March April May June
                   July August September October November December);
    my @short = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
    my $alternatives = join '|', @full, @short;
    qr/$alternatives/i;
};
# Main loop: fetch the head of each queued filing, extract its fields and
# emit one pipe-delimited row per filing that yielded anything.
FILING:
for my $path (@aonly) {
    # -onlyrun caps the number of filings handled (0 = no cap).
    last FILING if $onlyrun && $file_count >= $onlyrun;

    my $url  = "$base_url/$path";
    my $head = get_trunc($url, 100000);
    my $res  = process($head, $url);

    if (%$res) {
        # Every field is followed by '|'; missing or false values are left empty.
        my $row = join '', map { ($res->{$_} || '') . '|' } @fields;
        print $FH_OUT $row."\n";
    }

    $file_count++;
    sleep $sleep if $sleep;
}
print '** Processed:'.$file_count."\n";
close($FH_OUT);
exit;
# get_trunc($fullfile, $truncsize)
#
# Download the beginning of the document at URL $fullfile and return it as a
# string. Chunks are accumulated as they arrive; once more than $truncsize
# bytes have been received the transfer is aborted, so the result may be
# longer than $truncsize by up to one chunk.
#
# Returns the text received, or '' if nothing arrived. A failed request is
# reported on STDERR instead of being silently treated as an empty document.
sub get_trunc {
    # content_cb hints via http://www.perlmonks.org/?node_id=1183107
    my ($fullfile, $truncsize) = @_;
    use LWP::UserAgent;
    my $received_size = 0;
    my $partial = '';
    my $ua = LWP::UserAgent->new;
    my $response = $ua->get(
        $fullfile,
        ':content_cb' => sub {
            my ($data, $response, $protocol) = @_;
            $partial .= $data;
            $received_size += length $data;
            # die inside this callback interrupts the request, not the program!!
            die if ($received_size > $truncsize);
        },
    );
    # NOTE(review): an aborted-by-callback transfer is expected to keep its
    # successful status, so only genuine fetch failures should land here.
    unless ($response->is_success) {
        warn '** fetch failed: '.$fullfile.' ('.$response->status_line.")\n";
    }
    return $partial;
} # get_trunc
# process($partial, $fullfile)
#
# Extract filing metadata from $partial, the leading text of an EDGAR
# filing. $fullfile is only used to label diagnostic output.
#
# Returns a hash reference that may contain the keys cik, form_type,
# report_date, file_date, name and fiscal_year_ended; the hash is empty when
# $partial is empty or nothing matched.
#
# Relies on the file-level $mons (month-name regex) and $waserror (when set,
# the diagnostic lines are printed for every filing, not just failures).
sub process {
    use HTML::Parser ();
    my ($partial, $fullfile) = @_;
    my $res = {};
    #print $fullfile."\n";
    return $res unless $partial;

    # when the search was for the simple text fields it was easy ... but ...
    # there was all sorts of html tags before and in the middle of dates,
    # the 's and dates split across lines
    # so strip all the html so the date is inside the next 88 chars of the
    # trigger phrase. 'dtext' delivers the text with entities already decoded.
    my $line = '';
    my $hp = HTML::Parser->new(
        api_version => 3,
        text_h      => [ sub { $line .= shift }, 'dtext' ],
    );
    $hp->parse($partial);
    $hp->eof;    # flush any text the parser is still buffering

    # Header fields, each on its own line of the filing header.
    if ($line =~ m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m)              { $res->{cik}         = $1; }
    if ($line =~ m/^\s*FORM\s*TYPE:\s*(.*$)/m)                        { $res->{form_type}   = $1; }
    if ($line =~ m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m)   { $res->{report_date} = $1; }
    if ($line =~ m/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m)             { $res->{file_date}   = $1; }
    if ($line =~ m/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m)         { $res->{name}        = $1; }

    # Fiscal year end: take the trigger phrase plus the 88 characters after it.
    if ($line =~ m/(for\s+the\s+fiscal\s+year.*?\s+end.*?\s+.{88})/si) {
        my $next_n = $1;
        $next_n =~ s/[^ a-zA-Z0-9,]+/ /gs; # be brutal
        $next_n =~ s/ {2,}/ /gs;

        if ($next_n =~ m/($mons)\s+(\d{1,2})(?:st|nd|rd|th)?,?\s+(\d{2,4})/i) { # december 31, 2015 style
            $res->{fiscal_year_ended} = $1.' '.$2.', '.$3;
        }
        elsif ($next_n =~ m/(\d{1,2})\s+($mons)\s+(\d{2,4})/i) { # 30 december 2015 style
            $res->{fiscal_year_ended} = $2.' '.$1.', '.$3;
        }
        else {
            # 12/31/2015 style: the clean-up above has already turned the
            # separators into spaces. Read as month day year and accept only
            # a plausible month and day.
            my ($mon, $day, $year) =
                $next_n =~ m/(?<!\d)(\d{1,2})\s+(\d{1,2})\s+(\d{4}|\d{2})(?!\d)/;
            if (defined $mon && $mon >= 1 && $mon <= 12 && $day >= 1 && $day <= 31) {
                my @month_name = qw(January February March April May June
                                    July August September October November December);
                $res->{fiscal_year_ended} = $month_name[$mon - 1].' '.($day + 0).', '.$year;
            }
        }

        # Show what was examined when no date was found, or always with -waserror.
        if ($waserror || !$res->{fiscal_year_ended}) {
            print '** '.$fullfile."\n";
            print '** '.$next_n."\n";
            # from before being brutal
            # my $hex=unpack("H*", $next_n);
            # my $len=length($hex);
            # print '** '; for (my $i=0;$i<$len;$i=$i+2){print substr($hex,$i,1);}print "\n";
            # print '** '; for (my $i=1;$i<$len;$i=$i+2){print substr($hex,$i,1);}print "\n";
        }
    }
    return $res;
} # process
# from_idx($aonly, $percent, $fn)
#
# Read the gzip-compressed index file $fn line by line and append the last
# whitespace-separated field of each selected line to the array referenced
# by $aonly.
#
# A line is eligible when it contains "10-K" anywhere, so variants such as
# "10-K/A" are eligible too. Each eligible line is used with a $percent
# percent chance; a $percent of 0 uses every eligible line.
#
# Prints a summary (file name and counts) to STDOUT. Dies if $fn cannot be
# opened for decompression.
sub from_idx{
    my ($aonly,$percent,$fn)=@_;
    use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
    use IO::File;
    my $idx = IO::Uncompress::Gunzip->new($fn)
        or die "IO::Uncompress::Gunzip failed: $GunzipError\n";
    my $tested = 0;    # lines read
    my $elig   = 0;    # lines mentioning 10-K
    my $ct     = 0;    # lines actually queued
    while (my $line = <$idx>) {
        # last if ($ct>=50); #testing
        $tested++;
        # 10-Q filings are deliberately not selected: they dont have any dates
        next unless $line =~ /10\-K/;
        chomp $line;
        $elig++;
        next if ($percent && $percent <= rand(100)); # % chance to be included
        my @data = split(' ', $line);
        push @$aonly, $data[-1];
        $ct++;
    }
    close $idx;
    print '** Fn::'.$fn."\n";
    print '** Tested:'.$tested."\n";
    print '** Elig :'.$elig."\n";
    print '** Used :'.$ct."\n";
} # from_idx
####
# perl -still
####
** For the fiscal year ended or X TRANSITION REPORT PURSUANT TO SECTION 13 OR 15 d OF THE SEC
####
** FOR THE FISCAL YEAR ENDED 12 31 2015 UNITED STATES SECURITIES AND EXCHANGE COMMISSION WASHINGTON, D C 2