Corpus is a Perl script which scans your code directories looking for executable text files, plus any non-executable files with optional specified extensions, and puts them in a nice HTML file suitable for submitting to a code review or federal agency. It optionally adds line numbers too, and will ignore any directory's contents (including subdirs) containing the file
. Meant for documenting work, it undoubtedly duplicates lots of other pretty printers out there, and it doesn't even do any syntax highlighting! Like all good tools, I wrote it for myself, but maybe it will help someone else.
#!/usr/bin/perl
# corpus - Generate a text of all code in a corpus
# Written by S. Flitman, MD; released under GPLv3
#
# 032710 Initial implementation
# 032810 Unangle to avoid premature termination, also unamp
# 032810 Fold long lines to avoid problems, -n for line numbers
# 032810 If .nocorpus is in a folder it is ignored
# 032810 Better displayed names for duplicates in different subdirs li
+ke index.fcgi
# 032810 Wanted extensions will also be included, even if not executab
+le
use warnings;
use strict 'vars';
use vars qw/%opt %doc $tpl $pgbrk $lnbrk $maxline %maxlineByFontsize
$title $sigIgnore $reWantExts $nLines/;
use File::Find;
use Getopt::Std;
$pgbrk=qq{<p style="page-break-before: always"> </p>\n};
$lnbrk=qq{ ↵\n};
$maxline=0;
%maxlineByFontsize=( 12=>120,10=>144,8=>180 );
$sigIgnore='.nocorpus';
getopts('f:hno:t:T:w:x:',\%opt);
if (!@ARGV || $opt{h}) {
print <<"EOT";
Usage: corpus [-f size] [-hn] [-t title] [-T tpl] [-o file] [-w ext] [
+-x pat] [folder]...
Grab all code in folders and produced a nice listing
in HTML to stdout or to outfile, folds long lines too
-f N Font size N pt, also sets how long a line has to be to g
+et folded
-h This help
-n Add line numbers
-o X Send output to file X
-t X Use X as title for generated HTML page
-T X Use alternate template file X
-w X,Y.. Additional wanted extensions, no dots, separate by comma
-x P Also extract all lines matching pattern P to STDERR
EOT
exit(1);
}
$opt{f}||=12;
$maxline=$maxlineByFontsize{$opt{f}};
die "Bad fontsize $opt{f}\n" unless $maxline;
$title=$opt{t}||join(',',@ARGV);
if ($opt{T}) {
open(TPL,$opt{T}) || die "$opt{T}: $!";
$tpl.=$_ while (<TPL>);
close TPL;
} else {
$tpl=<<"EOT";
<table border=2 width="100%" height="100%" cellspacing=0 cellpadding=3
+>
<tr height=16>
<td align=left><em>:n:</em></td>
<td align=left>:name:</td>
<td align=center>:modified:</td>
<td align=right>:size:</td>
</tr>
<tr>
<td colspan=4 valign=top>
<pre>
:text:
</pre>
</td>
</table>
EOT
}
$reWantExts=join('|',split(/,/,$opt{w}));
find({ wanted=>\&process,follow=>0,no_chdir=>1 },@ARGV);
sub process {
my $file=$File::Find::name;
$File::Find::prune=1,return if -e "$File::Find::dir/$sigIgnore";
my ($tplFile,$text,@s,$ln,$name,$fWantExts);
return if -l $file; # no symlinks
$fWantExts=$file=~/\.($reWantExts)$/o;
return unless (-x $file && -T $file) or $fWantExts;
$name=substr($file,$fWantExts ? rindex($file,'/',rindex($file,'/')-
+1)+1 : rindex($file,'/')+1);
$ln=0;
open(FILE,$file) || die "$file: $!";
while (<FILE>) {
print STDERR "$name: $_" if $opt{x} && /$opt{x}/o;
$text.=($opt{n} ? sprintf('%4d ',++$ln) : '').$_;
++$nLines;
}
close FILE;
$text=~s/&/&/g;
$text=~s/</</g;
$text=~s/>/>/g;
@s=stat($file);
$tplFile=$tpl;
$tplFile=~s/:text:/$text/;
$tplFile=~s!:modified:!getdate('M0/DD/YY',$s[9])!e;
$tplFile=~s!:size:!comma($s[7])!e;
$doc{$file}{name}=$name;
$doc{$file}{text}=$tplFile;
}
if ($opt{o}) {
open(STDOUT,'>',$opt{o}) || die "$opt{o}: $!";
}
print <<"EOT";
<html>
<head>
<title>$title</title>
<style>
pre { font-size:$opt{f}pt }
</style>
</head>
<body>
<center>
EOT
# disambiguate filenames in corpus
my ($i,@doc,$text,$name,$file,%docnames,@names);
for $file (keys %doc) {
push @{$docnames{$doc{$file}{name}}},$file;
}
for $name (keys %docnames) {
if (scalar @{$docnames{$name}}>1) { # not unique?
for $file (@{$docnames{$name}}) {
$doc{$file}{name}=substr($file,rindex($file,'/',rindex($file,
+'/')-1)+1);
}
}
}
@doc=sort keys %doc;
for ($i=0; $i<=$#doc; $i++) {
$file=$doc[$i];
$text=$doc{$file}{text};
$text=~s/:n:/sprintf('%03d',$i+1)/e;
$text=~s/:name:/$doc{$file}{name}/;
$text=~s/^(.{$maxline})(.*)$/$1$lnbrk$2/omg;
print $text;
print $pgbrk if $i<$#doc;
}
print <<"EOT";
</center>
</body>
</html>
EOT
print STDERR comma($i)," files, ",comma($nLines)," lines\n";
exit;
sub fmtdate { # format a date. MMMM=full month name, MMM=short month
+name,
# M0=2 digit month, leading 0, MM=digit month, no leading 0
# YY=2 digit year, YYYY=full year; DD=2 digit day, dd=day
# DDDD=ordinal day +st/nd/rd/th; WWW=weekday
# All with leading 0: hh=2 digit hour, mm=2 digit minute, ss=2 digi
+t second
my ($fmt,$mon,$day,$yr,$hr,$min,$sec)=@_; # note month is 1..12
my @months=qw/January February March April May June July August
September October November December/;
$fmt=~s/WWW/substr('MonTueWedThuFriSatSun',Day_of_Week($yr,$mon,$da
+y)*3-3,3)/e;
$fmt=~s/MMMM/$months[$mon-1]/;
$fmt=~s/MMM/substr($months[$mon-1],0,3)/e;
$fmt=~s/M0/sprintf("%02d",$mon)/e;
$fmt=~s/MM/$mon/;
$fmt=~s/YYYY/$yr/;
$fmt=~s/YY/sprintf("%02d",$yr%100)/e;
$fmt=~s/DDDD/getOrdinal($day)/e;
$fmt=~s/DD/sprintf("%02d",$day)/e;
$fmt=~s/dd/$day/;
$fmt=~s/hh/sprintf("%02d",$hr)/e;
$fmt=~s/mm/sprintf("%02d",$min)/e;
$fmt=~s/ss/sprintf("%02d",$sec)/e;
$fmt;
}
sub getdate {
my ($fmt,$time)=@_;
$fmt||='M0DDYY_hhmmss';
$time||=time;
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)
=localtime($time);
$year+=1900 if $year<1900;
return fmtdate($fmt,$mon+1,$mday,$year,$hour,$min,$sec);
}
sub getOrdinal {
my $n=shift;
my @ordend=qw/th st nd rd th th th th th th/;
$n . ($n>=10 && $n<=20 ? 'th' : $ordend[$n%10]);
}
sub comma { # print longs formatted with commas
local($_) = shift;
1 while s/^(-?\d+)(\d{3})/$1,$2/;
return $_;
}