corpus - print a large corpus of code text

Corpus is a Perl script which scans your code directories looking for executable text files, plus any non-executable files with optional specified extensions, and puts them in a nice HTML file suitable for submitting to a code review or federal agency. It optionally adds line numbers too, and will ignore any directory's contents (including subdirs) containing the file .nocorpus. Meant for documenting work, it undoubtedly duplicates lots of other pretty printers out there, and it doesn't even do any syntax highlighting! Like all good tools, I wrote it for myself, but maybe it will help someone else.

#!/usr/bin/perl
# corpus - Generate a text of all code in a corpus
# Written by S. Flitman, MD; released under GPLv3
# 
# 032710 Initial implementation
# 032810 Unangle to avoid premature termination, also unamp
# 032810 Fold long lines to avoid problems, -n for line numbers
# 032810 If .nocorpus is in a folder it is ignored
# 032810 Better displayed names for duplicates in different subdirs li
+ke index.fcgi
# 032810 Wanted extensions will also be included, even if not executab
+le

use warnings;
use strict 'vars';
use vars qw/%opt %doc $tpl $pgbrk $lnbrk $maxline %maxlineByFontsize
   $title $sigIgnore $reWantExts $nLines/;
use File::Find;
use Getopt::Std;

$pgbrk=qq{<p style="page-break-before: always">&nbsp;</p>\n};
$lnbrk=qq{ &crarr;\n};
$maxline=0;
%maxlineByFontsize=( 12=>120,10=>144,8=>180 );
$sigIgnore='.nocorpus';

getopts('f:hno:t:T:w:x:',\%opt);
if (!@ARGV || $opt{h}) {
   print <<"EOT";
Usage: corpus [-f size] [-hn] [-t title] [-T tpl] [-o file] [-w ext] [
+-x pat] [folder]...

    Grab all code in folders and produced a nice listing
    in HTML to stdout or to outfile, folds long lines too

    -f N      Font size N pt, also sets how long a line has to be to g
+et folded
    -h        This help
    -n        Add line numbers
    -o X      Send output to file X
    -t X      Use X as title for generated HTML page
    -T X      Use alternate template file X     
    -w X,Y..  Additional wanted extensions, no dots, separate by comma
    -x P      Also extract all lines matching pattern P to STDERR
EOT
   exit(1);
}
$opt{f}||=12;
$maxline=$maxlineByFontsize{$opt{f}};
die "Bad fontsize $opt{f}\n" unless $maxline;
$title=$opt{t}||join(',',@ARGV);
if ($opt{T}) {
   open(TPL,$opt{T}) || die "$opt{T}: $!";
   $tpl.=$_ while (<TPL>);
   close TPL;
} else {
   $tpl=<<"EOT";
<table border=2 width="100%" height="100%" cellspacing=0 cellpadding=3
+>
 <tr height=16>
  <td align=left><em>:n:</em></td>
  <td align=left>:name:</td>
  <td align=center>:modified:</td>
  <td align=right>:size:</td>
 </tr>
 <tr>
  <td colspan=4 valign=top>
   <pre>
:text:
   </pre>
  </td>
</table>
EOT
}
$reWantExts=join('|',split(/,/,$opt{w}));

find({ wanted=>\&process,follow=>0,no_chdir=>1 },@ARGV);

sub process { 
   my $file=$File::Find::name;
   $File::Find::prune=1,return if -e "$File::Find::dir/$sigIgnore";
   my ($tplFile,$text,@s,$ln,$name,$fWantExts);
   return if -l $file;  # no symlinks
   $fWantExts=$file=~/\.($reWantExts)$/o;
   return unless (-x $file && -T $file) or $fWantExts;
   $name=substr($file,$fWantExts ? rindex($file,'/',rindex($file,'/')-
+1)+1 : rindex($file,'/')+1);
   $ln=0;
   open(FILE,$file) || die "$file: $!";
   while (<FILE>) {
      print STDERR "$name: $_" if $opt{x} && /$opt{x}/o;
      $text.=($opt{n} ? sprintf('%4d  ',++$ln) : '').$_;
      ++$nLines;
   }
   close FILE;
   $text=~s/&/&amp;/g;
   $text=~s/</&lt;/g;
   $text=~s/>/&gt;/g;
   @s=stat($file);
   $tplFile=$tpl;
   $tplFile=~s/:text:/$text/;
   $tplFile=~s!:modified:!getdate('M0/DD/YY',$s[9])!e;
   $tplFile=~s!:size:!comma($s[7])!e;
   $doc{$file}{name}=$name;
   $doc{$file}{text}=$tplFile;
}

if ($opt{o}) {
   open(STDOUT,'>',$opt{o}) || die "$opt{o}: $!";
} 
print <<"EOT";
<html>
<head>
<title>$title</title>
<style>
pre { font-size:$opt{f}pt }
</style>
</head>
<body>
<center>
EOT

# disambiguate filenames in corpus
my ($i,@doc,$text,$name,$file,%docnames,@names);
for $file (keys %doc) {
   push @{$docnames{$doc{$file}{name}}},$file;
}
for $name (keys %docnames) {
   if (scalar @{$docnames{$name}}>1) {            # not unique?
      for $file (@{$docnames{$name}}) {
         $doc{$file}{name}=substr($file,rindex($file,'/',rindex($file,
+'/')-1)+1);
      }
   } 
}
@doc=sort keys %doc;
for ($i=0; $i<=$#doc; $i++) {
   $file=$doc[$i];
   $text=$doc{$file}{text};
   $text=~s/:n:/sprintf('%03d',$i+1)/e;
   $text=~s/:name:/$doc{$file}{name}/;
   $text=~s/^(.{$maxline})(.*)$/$1$lnbrk$2/omg;
   print $text;
   print $pgbrk if $i<$#doc;
}

print <<"EOT";
</center>
</body>
</html>
EOT

print STDERR comma($i)," files, ",comma($nLines)," lines\n";
exit;

sub fmtdate {  # format a date. MMMM=full month name, MMM=short month 
+name,
   # M0=2 digit month, leading 0, MM=digit month, no leading 0
   # YY=2 digit year, YYYY=full year; DD=2 digit day, dd=day
   # DDDD=ordinal day +st/nd/rd/th; WWW=weekday
   # All with leading 0: hh=2 digit hour, mm=2 digit minute, ss=2 digi
+t second
   my ($fmt,$mon,$day,$yr,$hr,$min,$sec)=@_;   # note month is 1..12
   my @months=qw/January February March April May June July August
                 September October November December/;
   $fmt=~s/WWW/substr('MonTueWedThuFriSatSun',Day_of_Week($yr,$mon,$da
+y)*3-3,3)/e;
   $fmt=~s/MMMM/$months[$mon-1]/;
   $fmt=~s/MMM/substr($months[$mon-1],0,3)/e;
   $fmt=~s/M0/sprintf("%02d",$mon)/e;
   $fmt=~s/MM/$mon/;
   $fmt=~s/YYYY/$yr/;
   $fmt=~s/YY/sprintf("%02d",$yr%100)/e;
   $fmt=~s/DDDD/getOrdinal($day)/e;
   $fmt=~s/DD/sprintf("%02d",$day)/e;
   $fmt=~s/dd/$day/;
   $fmt=~s/hh/sprintf("%02d",$hr)/e;
   $fmt=~s/mm/sprintf("%02d",$min)/e;
   $fmt=~s/ss/sprintf("%02d",$sec)/e;
   $fmt;
}

sub getdate {
   my ($fmt,$time)=@_;
   $fmt||='M0DDYY_hhmmss';
   $time||=time;
   my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)
       =localtime($time);
   $year+=1900 if $year<1900;
   return fmtdate($fmt,$mon+1,$mday,$year,$hour,$min,$sec);
}

sub getOrdinal {
   my $n=shift;
   my @ordend=qw/th st nd rd th th th th th th/;
   $n . ($n>=10 && $n<=20 ? 'th' : $ordend[$n%10]);
}

sub comma {        # print longs formatted with commas
   local($_) = shift;
   1 while s/^(-?\d+)(\d{3})/$1,$2/;
   return $_;
}
[download]

Comment on corpus - print a large corpus of code text Select or Download Code