#!/usr/bin/perl -w
use strict;
use warnings;

# This program extracts data from SEC filings. For every file in the input
# directory it writes one comma-separated row of header data to the output
# file:
#
#   cik,form_type,report_date,file_date,name,sic,filesize,filesize
#
# (the file size is written twice: once from -s and once from File::stat,
# exactly as in the original output format).
#
# The script was designed to also extract chunks of text (the audit opinion);
# the start/end patterns below configure that, and you can extract whatever
# text you are interested in by changing them. Note that the header pass in
# this file does not itself use those patterns.
#
# This program was written by Andy Leone, May 15 2007 and updated July 25, 2008.
# You are free to use this program for your own use. My only request is
# that you make an acknowledgement in any research manuscripts that
# benefit from the program.
use File::stat;

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Specify the start of the text you are looking for (plain-text and HTML
# variants).
my $startstring='((^\s*?)((We\s*(have|were)\s*(audited\s*the\s*(Statement\s*of\s*Financial\s*Condition|consolidated|accompanying|combined|balance\s*sheets)|(completed\s*|engaged\s*to\s*perform)\s*an\s*integrated\s*audit))|In\s*our\s*opinion,\s*the\s*(consolidated|accompanying)))';
my $startstringhtm='(((We\s*(have|were)\s*(audited\s*the\s*(Statement\s*of\s*Financial\s*Condition|consolidated|accompanying|combined|balance\s*sheets)|(completed\s*|engaged\s*to\s*perform)\s*an\s*integrated\s*audit))|In\s*our\s*opinion,\s*the\s*(consolidated|accompanying)))';

# Specify the end of the text you are looking for.
my $endstring='((^\s*)/s/|^\s*(Date:\s*)?(\d{1,2}\s*)?((January|February|March|April|May|June|July|August|September|October|November|December))\s*(\d{1,2},)?\s*\d{4}(\s*$|,\s{0,3}except|\s*/s|\s*s/|\d{1,2}))';
# NOTE(review): this literal contains a real line break between '<\/P>|' and
# '|\s{0,1}'. It is reproduced exactly as found; it looks like an HTML tag
# alternative may have been stripped from the source at that position --
# confirm against the author's original before relying on this pattern.
my $endstringhtm='((>|^\s*)(/s/)?\s*(Date:\s*)?(\d{1,2}\s*)?(January|February|March|April|May|June|July|August|September|October|November|December))\s*(&\w+?;\s*)?(\d{1,2},)?\s*\d{4}(\s*$|\s*[,\(]\s{0,3}except|\s*with\s*respect\s*to\s*our\s*opinion|<\/P>|
|\s{0,1}\<\/FONT\>|\d{1,2})';

# Specify the directory containing the files that you want to read.
my $direct="E:\\Research\\SEC filings 10K and 10Q\\Data\\Filing Docs\\2008test";
my $outfile="E:\\Research\\SEC filings 10K and 10Q\\Data\\Header Data\\Data2008test.txt";

# Path separator: if Windows "\\", if Mac "/".
my $slash='\\';

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

# Three-argument open with a lexical handle: the output name is never
# interpreted as a mode string.
open(my $out_fh, '>', $outfile)
    or die "Can't open output file $outfile: $!";

# Read the names of all entries in the input directory into @New1.
opendir(my $dir_fh, $direct)
    or die "Can't open directory $direct: $!";
my @New1 = readdir($dir_fh);
closedir($dir_fh);

# Loop through each file name stored in @New1.
foreach my $file (@New1) {

    # Skip "." and ".." (and any other dot-files).
    next if $file =~ /^\./;

    # Build the path once so that open, -s and stat all see the same name.
    my $path = $direct . $slash . $file;

    # Subdirectories and other non-files cannot be slurped; skip them
    # instead of aborting the whole run.
    next unless -f $path;

    # Initialize the per-filing fields. -99 / "" mark "not found in header".
    my $cik           = -99;
    my $form_type     = "";
    my $report_date   = -99;
    my $file_date     = -99;
    my $name          = "";
    my $sic           = -99;
    my $HTML          = 0;

    # Placeholders for the text-extraction step; not written to the output
    # by the header pass below.
    my $Audit_Opinion = "Not Found";
    my $Going_Concern = 0;
    my $ao            = "Not Found";
    my $tree          = "Empty";

    # Read the entire filing into $data. Localizing $/ to undef removes the
    # default end-of-line record separator so the file is read in one go.
    my $in_fh;
    unless (open($in_fh, '<', $path)) {
        warn "cannot open $path: $!\n";
        next;
    }
    my $data = do { local $/; <$in_fh> };
    close($in_fh) or die "cannot close $file: $!";
    $data = '' unless defined $data;

    # The following steps obtain basic data from the filing.
    # NOTE(review): the HTML test pattern was empty in the source as found;
    # '<HTML>' is reconstructed from the flag it sets -- confirm.
    if ($data =~ m/<HTML>/i) { $HTML = 1; }
    if ($data =~ m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m)            { $cik         = $1; }
    if ($data =~ m/^\s*FORM\s*TYPE:\s*(.*$)/m)                      { $form_type   = $1; }
    if ($data =~ m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m) { $report_date = $1; }
    if ($data =~ m/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m)           { $file_date   = $1; }
    if ($data =~ m/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m)       { $name        = $1; }
    if ($data =~ m/^\s*STANDARD\s*INDUSTRIAL\s*CLASSIFICATION:.*?\[(\d{4})/m) { $sic = $1; }

    # '.*$' also captures a trailing carriage return on CRLF files, which
    # would otherwise end up in the middle of the output row.
    $form_type =~ s/\s+\z//;
    $name      =~ s/\s+\z//;

    # File size, reported both via -s and via File::stat (stat returns an
    # object because File::stat is loaded; it is undef on failure).
    my $filesize = -s $path;
    my $st = stat($path);
    unless ($st) {
        warn "cannot stat $path: $!\n";
        next;
    }
    my $sb = $st->size;

    print {$out_fh} "$cik,$form_type,$report_date,$file_date,$name,$sic,$filesize,$sb\n";
}

# Buffered write errors only surface at close, so check it.
close($out_fh) or die "cannot close $outfile: $!";