#!/usr/bin/perl # corpus - Generate a text of all code in a corpus # Written by S. Flitman, MD; released under GPLv3 # # 032710 Initial implementation # 032810 Unangle to avoid premature termination, also unamp # 032810 Fold long lines to avoid problems, -n for line numbers # 032810 If .nocorpus is in a folder it is ignored # 032810 Better displayed names for duplicates in different subdirs like index.fcgi # 032810 Wanted extensions will also be included, even if not executable use warnings; use strict 'vars'; use vars qw/%opt %doc $tpl $pgbrk $lnbrk $maxline %maxlineByFontsize $title $sigIgnore $reWantExts $nLines/; use File::Find; use Getopt::Std; $pgbrk=qq{

 

\n}; $lnbrk=qq{ ↵\n}; $maxline=0; %maxlineByFontsize=( 12=>120,10=>144,8=>180 ); $sigIgnore='.nocorpus'; getopts('f:hno:t:T:w:x:',\%opt); if (!@ARGV || $opt{h}) { print <<"EOT"; Usage: corpus [-f size] [-hn] [-t title] [-T tpl] [-o file] [-w ext] [-x pat] [folder]... Grab all code in folders and produced a nice listing in HTML to stdout or to outfile, folds long lines too -f N Font size N pt, also sets how long a line has to be to get folded -h This help -n Add line numbers -o X Send output to file X -t X Use X as title for generated HTML page -T X Use alternate template file X -w X,Y.. Additional wanted extensions, no dots, separate by comma -x P Also extract all lines matching pattern P to STDERR EOT exit(1); } $opt{f}||=12; $maxline=$maxlineByFontsize{$opt{f}}; die "Bad fontsize $opt{f}\n" unless $maxline; $title=$opt{t}||join(',',@ARGV); if ($opt{T}) { open(TPL,$opt{T}) || die "$opt{T}: $!"; $tpl.=$_ while (); close TPL; } else { $tpl=<<"EOT";
:n: :name: :modified: :size:
:text:
   
EOT } $reWantExts=join('|',split(/,/,$opt{w})); find({ wanted=>\&process,follow=>0,no_chdir=>1 },@ARGV); sub process { my $file=$File::Find::name; $File::Find::prune=1,return if -e "$File::Find::dir/$sigIgnore"; my ($tplFile,$text,@s,$ln,$name,$fWantExts); return if -l $file; # no symlinks $fWantExts=$file=~/\.($reWantExts)$/o; return unless (-x $file && -T $file) or $fWantExts; $name=substr($file,$fWantExts ? rindex($file,'/',rindex($file,'/')-1)+1 : rindex($file,'/')+1); $ln=0; open(FILE,$file) || die "$file: $!"; while () { print STDERR "$name: $_" if $opt{x} && /$opt{x}/o; $text.=($opt{n} ? sprintf('%4d ',++$ln) : '').$_; ++$nLines; } close FILE; $text=~s/&/&/g; $text=~s//>/g; @s=stat($file); $tplFile=$tpl; $tplFile=~s/:text:/$text/; $tplFile=~s!:modified:!getdate('M0/DD/YY',$s[9])!e; $tplFile=~s!:size:!comma($s[7])!e; $doc{$file}{name}=$name; $doc{$file}{text}=$tplFile; } if ($opt{o}) { open(STDOUT,'>',$opt{o}) || die "$opt{o}: $!"; } print <<"EOT"; $title
EOT # disambiguate filenames in corpus my ($i,@doc,$text,$name,$file,%docnames,@names); for $file (keys %doc) { push @{$docnames{$doc{$file}{name}}},$file; } for $name (keys %docnames) { if (scalar @{$docnames{$name}}>1) { # not unique? for $file (@{$docnames{$name}}) { $doc{$file}{name}=substr($file,rindex($file,'/',rindex($file,'/')-1)+1); } } } @doc=sort keys %doc; for ($i=0; $i<=$#doc; $i++) { $file=$doc[$i]; $text=$doc{$file}{text}; $text=~s/:n:/sprintf('%03d',$i+1)/e; $text=~s/:name:/$doc{$file}{name}/; $text=~s/^(.{$maxline})(.*)$/$1$lnbrk$2/omg; print $text; print $pgbrk if $i<$#doc; } print <<"EOT";
EOT print STDERR comma($i)," files, ",comma($nLines)," lines\n"; exit; sub fmtdate { # format a date. MMMM=full month name, MMM=short month name, # M0=2 digit month, leading 0, MM=digit month, no leading 0 # YY=2 digit year, YYYY=full year; DD=2 digit day, dd=day # DDDD=ordinal day +st/nd/rd/th; WWW=weekday # All with leading 0: hh=2 digit hour, mm=2 digit minute, ss=2 digit second my ($fmt,$mon,$day,$yr,$hr,$min,$sec)=@_; # note month is 1..12 my @months=qw/January February March April May June July August September October November December/; $fmt=~s/WWW/substr('MonTueWedThuFriSatSun',Day_of_Week($yr,$mon,$day)*3-3,3)/e; $fmt=~s/MMMM/$months[$mon-1]/; $fmt=~s/MMM/substr($months[$mon-1],0,3)/e; $fmt=~s/M0/sprintf("%02d",$mon)/e; $fmt=~s/MM/$mon/; $fmt=~s/YYYY/$yr/; $fmt=~s/YY/sprintf("%02d",$yr%100)/e; $fmt=~s/DDDD/getOrdinal($day)/e; $fmt=~s/DD/sprintf("%02d",$day)/e; $fmt=~s/dd/$day/; $fmt=~s/hh/sprintf("%02d",$hr)/e; $fmt=~s/mm/sprintf("%02d",$min)/e; $fmt=~s/ss/sprintf("%02d",$sec)/e; $fmt; } sub getdate { my ($fmt,$time)=@_; $fmt||='M0DDYY_hhmmss'; $time||=time; my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) =localtime($time); $year+=1900 if $year<1900; return fmtdate($fmt,$mon+1,$mday,$year,$hour,$min,$sec); } sub getOrdinal { my $n=shift; my @ordend=qw/th st nd rd th th th th th th/; $n . ($n>=10 && $n<=20 ? 'th' : $ordend[$n%10]); } sub comma { # print longs formatted with commas local($_) = shift; 1 while s/^(-?\d+)(\d{3})/$1,$2/; return $_; }