in reply to Re: Perl Script in Windows Works, but not in Unix
in thread Perl Script in Windows Works, but not in Unix
#!/usr/bin/env perl # name # Script to extract "item 7" from a 10-K report. # This will write the "good" part of the file to stdout, and will writ +e # a "schema string" on a single line to stderr. $dirtoget="/10K-txt/"; $dirwrite="/MDA/"; opendir(IMD, $dirtoget) || die("Cannot open directory"); @thefiles= readdir(IMD); closedir(IMD); foreach $f (@thefiles) { unless ( ($f eq ".") || ($f eq "..") ) { $fr="$dirtoget$f"; open(FILEREAD, "< $fr"); $f=~s/.txt/.mda/g; $fw="$dirwrite$f"; #print $f1; open(FILEWRITE, "> $fw"); $x=""; while($line = <FILEREAD>) { $x .= $line; } # read the whole file into +one string close FILEREAD; # $x =~ s/^(\-|\s)*$//gs; # not in Bryan's script; probably not used i +n generating the data # these are the magic regexps that break the file into parts based on +the names of items 7, 7a, and 8: #$x =~ s/([^\"])(item\s+7[^0-9a-z\"]*management(?:[^0-9a-z]{0,3}s)?\s+ +discussions?\s+and\s+analysis(?:\s+of\s+(?:financial\s+conditions?|re +sults\s+of\s+operations?))?(?:\s+and\s+results\s+of\s+operations?|\s+ +and\s+financial\s+conditions?)?)/$1#######ITEM7:$2#######/gis; # Item7. MANAGEMENT'S DISCUSSION AND ANALYSIS OR PLAN OF OPERATIONS. #ITEM7. MANAGEMENT'S DISCUSSION AND ANALYSIS. $x =~ s/([^\"])(\nitem\s*(?:6|7)[^0-9a-z\"]*(?:management(?:[^0-9a-z]{ +0,3}s)?\s+)?\s*discussions?\s+and\s+analysis(?:\s+of\s+(?:financial\s ++conditions?|results\s+of\s+operations?))?(?:\s+and\s+results\s+of\s+ +operations?|\s+and\s+financial\s+conditions?|\s+or\s+plan\s+of\s+oper +ations?)?)/$1#######ITEM7:$2#######/gis; #$x =~ s/([^\"])(item\s*(?:\d\d?)[^0-9a-z\"]*management(?:[^0-9a-z]{0, +3}s)?\s+discussions?\s+and\s+analysis(?:\s+of\s+(?:financial\s+condit +ions?|results\s+of\s+operations?))?(?:\s+and\s+results\s+of\s+operati +ons?|\s+and\s+financial\s+conditions?|\s+or\s+plan\s+of\s+operations? +)?)/$1#######ITEM7:$2#######/gis; $x =~ s/([^\"])(\nitem\s*7[^0-9a-z\"]*a[^0-9a-z\"]*(?:(?:quantitative| +quantitive)\s+and\s+(?:qualitative|qualification)\s+disclosures?\s+ab +out\s+)?market\s+risk)/$1#######ITEM7A:$2#######/gis; #QUANTITIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK $x =~ s/([^\"])(\nitem\s*(?:7|8)[^0-9a-z\"]*.{0,40}financial\s+stateme +nts)/$1#######ITEM8:$2#######/gis; #FINANCIAL STATEMENTS $x =~ s/([^\"])(\nitem\s*(?:8|9)[^0-9a-z\"]*.{0,40}changes?\s+in\s+and +\s+disagreements?)/$1#######ITEM9:$2#######/gis; # changes in and disagreements with accountants on accounting and fina +ncial disclosure $x =~ s/([^\"])(\nitem\s*(?:8|9)[^0-9a-z\"]*a.{0,40}controls?\s+.{0,10 +}procedures?)/$1#######ITEM9A:$2#######/gis; #ITEM 9A(T). Controls and Procedures Disclosure Controls and Procedur +es $x =~ s/([^\"])(\nitem\s*(?:9|10)[^0-9a-z\"]*.{0,40}directors?.{0,10}e +xecutive\s+officers)/$1#######ITEM10:$2#######/gis; # Directors, Executive Officers and Corporate Governance # Directors and Executive Officers, Promoters and Control Persons $x =~ s/([^\"])(\nitem\s*(?:10|11)[^0-9a-z\"]*.{0,40}executive\s+compe +nsation)/$1#######ITEM11:$2#######/gis; # Executive Compensation $x =~ s/([^\"])(\nitem\s*(?:11|12)[^0-9a-z\"]*.{0,40}security\s+owners +hip)/$1#######ITEM12:$2#######/gis; # Security Ownership of Certain Beneficial Owners and Management and R +elated Stockholder Matters $x =~ s/([^\"])(\nitem\s*(?:12|13)[^0-9a-z\"]*.{0,50}transactions?)/$1 +#######ITEM13:$2#######/gis; # Transactions with Related Persons and Director Independence # CERTAIN RELATIONSHIPS AND RELATED TRANSACTIONS #CERTAIN RELATIONSHIPS AND RELATED TRANSACTIONS, AND DIRECTOR INDEPEND +ENCE #CERTAIN RELATIONSHIPS AND RELATED PERSON TRANSACTIONS AND DIRECTOR IN +DEPENDENCE $x =~ s/([^\"])(\nitem\s*14[^0-9a-z\"]*.{0,40}principal\s+account)/$1# +######ITEM14:$2#######/gis; # Principal Accountant Fees and Services #Principal Accounting Fees and Services $x =~ s/([^\"])(\nitem\s*15[^0-9a-z\"]*.{0,40}exhibits?(?:\s+and\s+fin +ancial)?)/$1#######ITEM15:$2#######/gis; # Exhibits and Financial Statement Schedules @X = (split /\#\#\#\#\#\#\#/, $x); # tokenized raw file with some note +s for($i = 0; $i < scalar(@X); ++$i) { if($X[$i] =~ m/^(ITEM(?:7|7A|8|9|9A|10|11|12|13|14|15)):(.*)$/s) { #print $X[$i],"\n"; $Z[$i] = $2; # this is what will get written out, maybe $Y[$i] = $i . ':' . $1; # this is for the schema output (stderr) #print $Y[$i],"\n"; } else { $Z[$i] = $X[$i]; # this is what will get written out, maybe $Y[$i] = $i . ':' . length_in_words($X[$i]); # for schema output +(stderr) #print $Y[$i],"\n"; } } $y = join " ", @Y; # magic schema string (with indices) #print $y,"\n"; (@O) = ($y =~ m/((?:\d+:ITEM7 \d+:\d+ )+(?:\d+:ITEM8 \d+:\d+\s*)+(?:\d ++:ITEM7A \d+:\d+ )+)(?:\d+:ITEM8 \d+:\d+\s*)+/g); # find all matches +of ITEM7 stuff ITEM7A stuff ITEM8, and remember them so we can decide + which one to extract ... #print @O; (@M) = ($y =~ m/((?:\d+:ITEM7 \d+:\d+ )+(?:\d+:ITEM7A \d+:\d+ )*)(?:\d ++:ITEM8 \d+:\d+\s*)+/g); # find all matches of ITEM7 stuff ITEM7A stu +ff ITEM8, and remember them so we can decide which one to extract ... #print scalar(@M),"\n"; #(@N) = ($y =~ m/((?:\d+:ITEM7 \d+:\d+ )+(?:\d+:ITEM7A \d+:\d+ )*(?:\d ++:ITEM8 \d+:\d+\s*))(?:\d+:ITEM8 \d+:\d+\s*)+/g); # find all matches +of ITEM7 stuff ITEM7A stuff ITEM8, and remember them so we can decide + which one to extract ... #print $N[0],"\n"; #print $Z[1],"\n"; #print $Z[2],"\n"; #print $Z[3],"\n"; #print $Z[4],"\n"; if(scalar(@M)==0) { (@M) = ($y =~ m/((?:\d+:ITEM7 \d+:\d+ )+(?:\d+:ITEM7A \d+:\d+ )*)( +?:\d+:ITEM9 \d+:\d+\s*)+/g); # find all matches of ITEM7 stuff ITEM7A + stuff ITEM8, and remember them so we can decide which one to extract + ... } if(scalar(@M)==0) { (@M) = ($y =~ m/((?:\d+:ITEM7 \d+:\d+ )+(?:\d+:ITEM7A \d+:\d+ )*)( +?:\d+:ITEM10 \d+:\d+\s*)+/g); # find all matches of ITEM7 stuff ITEM7 +A stuff ITEM8, and remember them so we can decide which one to extrac +t ... } if(scalar(@M)==0) { (@M) = ($y =~ m/((?:\d+:ITEM7 \d+:\d+ )+(?:\d+:ITEM7A \d+:\d+ )*)( +?:\d+:ITEM11 \d+:\d+\s*)+/g); # find all matches of ITEM7 stuff ITEM7 +A stuff ITEM8, and remember them so we can decide which one to extrac +t ... } if(scalar(@M)==0) { (@M) = ($y =~ m/((?:\d+:ITEM7 \d+:\d+ )+(?:\d+:ITEM7A \d+:\d+ )*)( +?:\d+:ITEM12 \d+:\d+\s*)+/g); # find all matches of ITEM7 stuff ITEM7 +A stuff ITEM8, and remember them so we can decide which one to extrac +t ... } if(scalar(@M)==0) { (@M) = ($y =~ m/((?:\d+:ITEM7 \d+:\d+ )+(?:\d+:ITEM7A \d+:\d+ )*)( +?:\d+:ITEM13 \d+:\d+\s*)+/g); # find all matches of ITEM7 stuff ITEM7 +A stuff ITEM8, and remember them so we can decide which one to extrac +t ... } if(scalar(@M)==0) { (@M) = ($y =~ m/((?:\d+:ITEM7 \d+:\d+ )+(?:\d+:ITEM7A \d+:\d+ )*)( +?:\d+:ITEM14 \d+:\d+\s*)+/g); # find all matches of ITEM7 stuff ITEM7 +A stuff ITEM8, and remember them so we can decide which one to extrac +t ... } if(scalar(@M)==0) { (@M) = ($y =~ m/((?:\d+:ITEM7 \d+:\d+ )+(?:\d+:ITEM7A \d+:\d+ )*)( +?:\d+:ITEM15 \d+:\d+\s*)+/g); # find all matches of ITEM7 stuff ITEM7 +A stuff ITEM8, and remember them so we can decide which one to extrac +t ... } # ... figure out which match is the best one, if there are any. curre +ntly, # "best" equals "longest number of words excluding the headers." $best = 0; $bestseq = ""; for($i = 0; $i < scalar(@O); ++$i) { $m = $O[$i]; $m =~ s/\d+://g; (@m) = (split / /, $m); $v = 0; map { $v += $_ if($_ =~ m/^\d+$/); } @m; if($v > $best) { $best = $v; $bestseq = $O[$i]; } } for($i = 0; $i < scalar(@M); ++$i) { $m = $M[$i]; $m =~ s/\d+://g; (@m) = (split / /, $m); $v = 0; map { $v += $_ if($_ =~ m/^\d+$/); } @m; if($v > $best) { $best = $v; $bestseq = $M[$i]; } } $content=""; # if we have a best match, write it out! if($bestseq ne "") { $bestseq =~ s/:\S+//g; (@m) = (split / /, $bestseq); map{ $Z[$_] =~ s/\s*$/\n/; $content="$content$Z[$_]"; } @m; } if(length_in_words($content)<2000) { $content =~ m/([^\"])(pages\s+([0-9]+)[^0-9]+([0-9]+))/s; $y=$3; if($y ne ""){ $y=$y-1; } $z=$4; # print "$y "; # print "$z \n"; # $x =~ m/(\n$y\n)/s; # print $1; $x =~ s/(\n$y\n)/##########START:/gis; $x =~ s/(\n$z\n)/##########/gis; @X = (split /\#\#\#\#\#\#\#\#\#\#/, $x); # tokenized raw file with + some notes # print scalar(@X); for($i = 0; $i < scalar(@X); ++$i) { if($X[$i] =~ m/^(START):(.*)$/s) { $Z = $2; # this is what will get written out, maybe } } if(length_in_words($Z)>$best) { $content = $Z;} # if($bestseq ne "") { # print $Z; # } } if($content ne "") { print FILEWRITE $content; # $bestseq =~ s/:\S+//g; # (@m) = (split / /, $bestseq); # map { $Z[$_] =~ s/\s*$/\n/; print $Z[$_]; $kept{$_} = 1; } @m; } @X = (); @Y = (); @Z = (); @O = (); @M = (); @m = (); #else { print STDERR "(NO_MATCH) "; } # write out the schema of the file to stderr so we can think more # deeply about our regexps and debug ... put stars next to the bits we + extract #$y =~ s/\b\d+://g; # (strip human-unreadable indices out of magic sch +ema string) #@Y = split / /, $y; #for($i = 0; $i < scalar(@Y); ++$i) { # print STDERR "*" if(defined $kept{$i}); # print STDERR $Y[$i], " "; #} #print STDERR "\n"; close FILEWRITE; } } sub length_in_words { my $x = shift; my @k; return scalar(@k = $x =~ m/(\S+)/sg); }
|
|---|