<P STYLE="font: 10pt Times New Roman, Times, Serif; margin: 0; text-al
+ign: center">For the fiscal year ending December 31, 2015
Oh, well that changes everything doesnt it?
Just incase wrkrbeee comes back, or someone else is interested, here is refactored code instead of Re^3: Getstore to avoid of memory?
#!/usr/bin/perl -w
use strict; use warnings;
use Getopt::Long qw/GetOptions/;
my $base_url = 'http://www.sec.gov/Archives';
my $debug =0;
my $stillerror =0;
my $waserror =0;
my $percent =5;
my $onlyrun =0;
my $idxgz ='';
my $sleep =0;
GetOptions ("debug=i" => \$debug
,"stillerror!" => \$stillerror
,"waserror!" => \$waserror
,"percent=i" => \$percent # 0 means run all
,"onlyrun=i" => \$onlyrun # 0 means run all
,"sleep=i" => \$sleep # between process step
+s
,"idxgz=s" => \$idxgz
) or die("Error in command line arguments\n");
my @aonly;
# need to be cd'd to dir with these in it or -idxgz needs full path
# 2016QTR4company.gz from https://www.sec.gov/Archives/edgar/full-ind
+ex/2016/QTR4/company.gz
# 2016QTR3company.gz from https://www.sec.gov/Archives/edgar/full-ind
+ex/2016/QTR3/company.gz
# 2016QTR2company.gz from https://www.sec.gov/Archives/edgar/full-ind
+ex/2016/QTR2/company.gz
# 2016QTR1company.gz from https://www.sec.gov/Archives/edgar/full-ind
+ex/2016/QTR1/company.gz
if (0)
{
#Assign variable to file with URLs;
my $urls = 'c:/my documents/research/sec filings/10K and 10Q/data/
+urls/sizefiles1.txt';
#my $urls = 'g:/research/SEC filings 10K and 10Q/data/urls';
#open text file with URLs, read URLs into array;
open (my $fh, "<",$urls) or die "can't open $urls: $!";
while (my $url=<$fh>) {chomp $url; push @aonly,$url;}
#close text file with URLs;
close $fh or die "Cannot closee $urls: $!";
}
if ($idxgz) {
from_idx(\@aonly,$percent,$idxgz);
}
if ($waserror)
{
# had problems
my @add=qw!
edgar/data/1606163/0001144204-16-089184.txt
edgar/data/1496443/0001019687-16-005668.txt
edgar/data/1375195/0001144204-16-091408.txt
edgar/data/910638/0001171843-16-008156.txt
edgar/data/812149/0001144204-16-074455.txt
edgar/data/786947/0001144204-16-085018.txt
edgar/data/1459417/0001047469-16-010989.txt
edgar/data/34782/0000034782-16-000102.txt
edgar/data/1110783/0001110783-16-000532.txt
edgar/data/1499275/0001499275-16-000012.txt
edgar/data/2969/0001193125-16-773346.txt
edgar/data/1446806/0001446806-16-000020.txt
!;
print '** adding had problems:'.scalar(@add)."\n";
unshift @aonly,@add; # put them first
}
if ($stillerror)
{
# still a prob
my @add=qw!
edgar/data/1167419/0001079973-16-000870.txt
edgar/data/1445918/0001079973-16-000855.txt
edgar/data/1031093/0001079973-16-000755.txt
edgar/data/1568079/0001079973-16-000879.txt
edgar/data/1493137/0001079973-16-000865.txt
edgar/data/1598893/0001079973-16-000780.txt
edgar/data/1353970/0001079973-16-000846.txt
edgar/data/1628468/0001493152-16-013426.txt
edgar/data/62234/0001144204-16-088204.txt
edgar/data/715812/0000715812-16-000006.txt
edgar/data/786110/0001193125-16-751374.txt
edgar/data/1584754/0001615774-16-007904.txt
edgar/data/1634293/0001690824-16-000006.txt
edgar/data/786110/0001193125-16-751374.txt
edgar/data/1432967/0001683168-16-000003.txt
edgar/data/715812/0000715812-16-000006.txt
edgar/data/715812/0000715812-16-000007.txt
edgar/data/1496741/0001515971-16-000566.txt
edgar/data/1622244/0001213900-16-012573.txt
edgar/data/1413507/0001413507-16-000126.txt
!;
print '** adding still problems:'.scalar(@add)."\n";
unshift @aonly,@add; # put them first
}
print '** @aonly size:'.scalar(@aonly)."\n";
my $file_count=0;
my $FH_OUT;
my $fn_OUT = "c:/my documents/research/sec filings/Data2016_fiscal_yea
+r.txt";
#open ( $FH_OUT, '>>', $fn_OUT) or die "Couldn't open $!";
$FH_OUT=\*STDOUT;
my @fields=qw/cik form_type report_date file_date name fiscal_year_end
+ed/;
my $mons=qr/January|February|March
|April|May|June
|July|August|September
|October|November|December
|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec
/ix;
foreach my $filetoget(@aonly) {
last if ($onlyrun && $file_count>=$onlyrun);
my $fullfile="$base_url/$filetoget";
my $partial=get_trunc ($fullfile,100000);
my $res=process ($partial,$fullfile);
if (scalar(keys(%$res))) {
my $lineout='';
for my $field (@fields){
if ($res->{$field}) {$lineout.=$res->{$field}}
$lineout.='|';
}
print $FH_OUT $lineout."\n";
}
$file_count++;
sleep $sleep if ($sleep);
}
print '** Processed:'.$file_count."\n";
close ($FH_OUT);
exit;
sub get_trunc {
# content_cb hints via http://www.perlmonks.org/?node_id=1183107
my $fullfile=shift;
my $truncsize=shift;
use LWP::UserAgent;
my $received_size = 0;
my $partial = '';
my $ua = LWP::UserAgent->new;
my $response = $ua->get($fullfile
, ':content_cb'=> sub {
my ($data, $response, $protocol) = @_;
$partial.=$data;
$received_size += length $data;
+
# die inside this callback interrupts t
+he request, not the program!!
die if ($received_size>$truncsize);
}
);
return $partial;
} # get_trunc
sub process {
use HTML::Parser ();
my $partial=shift;
my $fullfile=shift;
my $res={};
#print $fullfile."\n";
if ($partial) {
# when the search was for the simple text fields it was easy ..
+. but ...
# there was all sorts of html tags before and in the middle of d
+ates,
# the 's and dates split across lines
# so just get rid of all the htmp so the date is inside the nex
+t 88 chars of trigger phrase
my $line='';
my $hp = HTML::Parser->new( api_version => 3,
text_h=>[ sub {$line .= shift},'dtext
+' ]
);
$hp->parse( $partial );
# use HTML::Entities qw/decode_entities/;
# my $line=decode_entities($line); # this is redundent after chan
+ge to dtext rather than text
{
if($line=~m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m)
+{$res->{cik} =$1;}
if($line=~m/^\s*FORM\s*TYPE:\s*(.*$)/m)
+{$res->{form_type} =$1;}
if($line=~m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m)
+{$res->{report_date}=$1;}
if($line=~m/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m)
+{$res->{file_date} =$1;}
if($line=~m/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m)
+{$res->{name} =$1;}
if($line=~m/(for\s+the\s+fiscal\s+year.*?\s+end.*?\s+.{88})/si
+){
my $next_n=$1;
$next_n=~s/[^ a-zA-Z0-9,]+/ /gs; # be brutal
$next_n=~s/ {2,}/ /gs;
if($next_n=~m/($mons)\s+(\d{1,2})(?:st|th)*,?\s+(\d{2,4})/
+i){ # december 31, 2015 style
$res->{fiscal_year_ended} =$1.' '.$2.', '.$3;
}
elsif($next_n=~m/(\d{1,2})\s+($mons)\s+(\d{2,4})/i){ # 30
+december 2015 style
$res->{fiscal_year_ended} =$2.' '.$1.', '.$3;
}
unless( (!$waserror) && ($res->{fiscal_year_ended}) ) {
print '** '.$fullfile."\n";
print '** '.$next_n."\n";
# from before being brutal
# my $hex=unpack("H*", $next_n);
# my $len=length($hex);
# print '** '; for (my $i=0;$i<$len;$i=$i+2){print subs
+tr($hex,$i,1);}print "\n";
# print '** '; for (my $i=1;$i<$len;$i=$i+2){print subs
+tr($hex,$i,1);}print "\n";
}
}
}
} # success
return $res;
} # process
sub from_idx{
my ($aonly,$percent,$fn)=@_;
use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
use IO::File;
my $idx= new IO::Uncompress::Gunzip $fn or die "IO::Uncompress::Gunz
+ip failed: $GunzipError\n";
my $tested=0;
my $ct=0;
my $elig=0;
while (my $line=<$idx>) {
# last if ($ct>=50); #testing
$tested++;
my $ok=0;
if ($line=~/10\-K/){ $ok=1;}
# if ($line=~/10\-Q/){ $ok=1;} # 10-q dont have any dates
next unless ($ok);
chomp $line;
$elig++;
next if ($percent && $percent<=rand(100)); # % chance to be includ
+ed
my @data=split(' ',$line);
push @$aonly,$data[-1];
$ct++;
}
close $idx;
print '** Fn::'.$fn."\n";
print '** Tested:'.$tested."\n";
print '** Elig :'.$elig."\n";
print '** Used :'.$ct."\n";
} # from_idx
Notice that i deleted using split at all since dates were split across lines.
and fixed a regexp problem when the OR list of months was not in its own grouping parens(duh!!!).
I use HTML::Parser to remove all the html-code, cuz there was often huge html code after the trigger phrase and in at least one case there was html code between the month and date (font changes).
And i brutally kill everything that isnt in a list of approved chars because translated to a char that didnt look right in the output, and there was something else that counted as \s in the date that didnt look good either after the decode_entities($line). (edit: and \n could end up in the captured date too!)
I split getting and processing just because.
if you want to try this you can use
# perl <fn of this download> -still
I ran all of 2016QTR1company.gz last nite(-idxgz 2016QTR1company.gz -percent 100
), resulting in @aonly size:6608, and it showed few errors, mostly totally missing dates
** For the fiscal year ended or X TRANSITION REPORT PURSUANT TO SECTIO
+N 13 OR 15 d OF THE SEC
and a few like this with totaly numeric dates
** FOR THE FISCAL YEAR ENDED 12 31 2015 UNITED STATES SECURITIES AND E
+XCHANGE COMMISSION WASHINGTON, D C 2
Also note that if you run with -was, it forces the printout of $next_n even if it matched ok, so you wouldnt want to run it with any other flag but -still.
|