#!/usr/bin/perl -w
#use strict;
# This program extracts data from an SEC filing, including chunks of text
use File::stat;
#This program is going to obtain and extract the entire audit opinion but you can
#extract whatever text you are interested in by changing the regular expressions for the start
#and end strings below.
#This program was written by Andy Leone, May 15 2007 and updated July 25, 2008.
#You are free to use this program for your own use. My only request is
#that you make an acknowledgement in any research manuscripts that
#benefit from the program.
#Regular-expression sources (kept as single-quoted strings) that mark where the
#text to extract begins and ends. None of them is referenced in the visible
#part of this file; how they are compiled (for example which modifiers are
#applied) is decided wherever they are used.
#
#Start of the text, plain-text variant: optional leading whitespace after "^",
#followed by either
#  "We have|were audited the" + one of "Statement of Financial Condition",
#      "consolidated", "accompanying", "combined", "balance sheets", or
#  "We have|were completed|engaged to perform an integrated audit", or
#  "In our opinion, the consolidated|accompanying".
#NOTE(review): "^" only matches at the start of every line when the pattern is
#applied with the /m modifier - confirm at the point of use.
my $startstring='((^\s*?)((We\s*(have|were)\s*(audited\s*the\s*(Statement\s*of\s*Financial\s*Condition|consolidated|accompanying|combined|balance\s*sheets)|(completed\s*|engaged\s*to\s*perform)\s*an\s*integrated\s*audit))|In\s*our\s*opinion,\s*the\s*(consolidated|accompanying)))';
#Start of the text, HTML variant: the same alternatives as $startstring but
#without the leading "^\s*?" anchor, so the phrase may appear anywhere in a line.
my $startstringhtm='(((We\s*(have|were)\s*(audited\s*the\s*(Statement\s*of\s*Financial\s*Condition|consolidated|accompanying|combined|balance\s*sheets)|(completed\s*|engaged\s*to\s*perform)\s*an\s*integrated\s*audit))|In\s*our\s*opinion,\s*the\s*(consolidated|accompanying)))';
#Specify the end of the text you are looking for.
#Plain-text variant: either the literal text "/s/" after optional leading
#whitespace, or a date at the start of a line: optional "Date:", optional day
#number, a month name, optional "dd,", a four-digit year, and then one of
#end of line, ", except", "/s", "s/" or one to two digits.
my $endstring='((^\s*)/s/|^\s*(Date:\s*)?(\d{1,2}\s*)?((January|February|March|April|May|June|July|August|September|October|November|December))\s*(\d{1,2},)?\s*\d{4}(\s*$|,\s{0,3}except|\s*/s|\s*s/|\d{1,2}))';
#HTML variant: the date may follow ">" (the end of a tag) or the start of a
#line, may be preceded by "/s/", and may have an HTML entity ("&name;") between
#the month and the rest of the date. After the year it accepts end of line,
#", except" or "( except", "with respect to our opinion", "</P>", a single
#space, "</FONT>" or one to two digits.
#NOTE(review): the lone-space alternative ("| |") may originally have been an
#HTML entity that was decoded when this file was copied - confirm against the
#author's original.
my $endstringhtm='((>|^\s*)(/s/)?\s*(Date:\s*)?(\d{1,2}\s*)?(January|February|March|April|May|June|July|August|September|October|November|December))\s*(&\w+?;\s*)?(\d{1,2},)?\s*\d{4}(\s*$|\s*[,\(]\s{0,3}except|\s*with\s*respect\s*to\s*our\s*opinion|<\/P>| |\s{0,1}\<\/FONT\>|\d{1,2})';
#Specify the directory containing the files that you want to read
my $direct="E:\\Research\\SEC filings 10K and 10Q\\Data\\Filing Docs\\2008test";
#Specify the file that receives one comma-separated line per filing.
#It is truncated and rewritten on every run.
my $outfile="E:\\Research\\SEC filings 10K and 10Q\\Data\\Header Data\\Data2008test.txt";
#Separator placed between $direct and a file name:
#"\\" on Windows, "/" on Mac.
my $slash='\\';
#Open the output file for writing. The mode is passed as its own argument
#(three-argument open) so that no character in the path can be mistaken for
#part of the open mode.
open(OUTPUT, '>', $outfile) || die "Cannot open output file $outfile for writing: $!";
#The following steps open the directory containing the files you plan to read,
#store the name of every entry in an array called @New1, and release the
#directory handle again because the names are all that is needed afterwards.
opendir(DIR1, $direct) || die "Can't open directory $direct: $!";
my @New1=readdir(DIR1);
closedir(DIR1);
#We will now loop through each file. The file names
#have been stored in the array called @New1.
#For every plain file one comma-separated line is written to OUTPUT:
#  cik,form_type,report_date,file_date,name,sic,filesize,filesize
#Fields that are not found in a filing keep their initial values below.
foreach my $file (@New1)
{
    #Skip ".", ".." and every other entry whose name starts with a dot.
    next if $file=~/^\./;

    #Build the full path once so that opening and sizing the file always
    #refer to exactly the same name.
    my $path=$direct.$slash.$file;

    #readdir also returns sub-directories; only plain files are read.
    next unless -f $path;

    #Initialize the variable names.
    my $cik=-99;
    my $form_type="";
    my $report_date=-99;
    my $file_date=-99;
    my $name="";
    my $sic=-99;
    #Set to 1 when the filing contains an <HTML> tag. It is not part of the
    #output line.
    my $HTML=0;

    #Read the file and put it in a variable called $data.
    #$data will contain the entire filing.
    my $data;
    {
        #Undefining the input record separator for this block makes a single
        #read return the whole file instead of one line.
        local $/;
        open(INPUT, '<', $path) or die "cannot open $path: $!";
        $data=<INPUT>;
        close(INPUT) or die "cannot close $file: $!";
    }
    #Nothing could be read from the file, so there is nothing to report.
    next unless defined $data;

    #The following steps obtain basic data from the filing.
    if ($data=~m/<HTML>/i){$HTML=1;}
    if($data=~m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m){$cik=$1;}
    if($data=~m/^\s*FORM\s*TYPE:\s*(.*$)/m){$form_type=$1;}
    if($data=~m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m){$report_date=$1;}
    if($data=~m/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m){$file_date=$1;}
    if($data=~m/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m){$name=$1;}
    if($data=~m/^\s*STANDARD\s*INDUSTRIAL\s*CLASSIFICATION:.*?\[(\d{4})/m){$sic=$1;}

    #"(.*$)" keeps everything up to the line feed, which includes the carriage
    #return of a CRLF-terminated line. Remove trailing whitespace so that it
    #cannot break the output line.
    $form_type=~s/\s+$//;
    $name=~s/\s+$//;

    #The file size is written twice, once from the -s file test and once from
    #File::stat, to keep the column layout of the output file unchanged.
    my $filesize = -s $path;
    my $st = stat($path) or die "cannot stat $path: $!";
    my $sb = $st->size;

    print OUTPUT "$cik,$form_type,$report_date,$file_date,$name,$sic,$filesize,$sb\n";
}