#!/opt/local/bin/perl -w =pod # -------------------------------------------------------------------- The software provided here is released by the National Institute of Standards and Technology (NIST), an agency of the U.S. Department of Commerce, Gaithersburg MD 20899, USA. The software bears no warranty, either expressed or implied. NIST does not assume legal liability nor responsibility for a User's use of the software or the results of such use. Please note that within the United States, copyright protection, under Section 105 of the United States Code, Title 17, is not available for any work of the United States Government and/or for any works created by United States Government employees. User acknowledges that this software contains work which was created by NIST employees and is therefore in the public domain and not subject to copyright. The User may use, distribute, or incorporate this software provided the User acknowledges this via an explicit acknowledgment of NIST-related contributions to the User's work. User also agrees to acknowledge, via an explicit acknowledgment, that any modifications or alterations have been made to this software before redistribution. # -------------------------------------------------------------------- =cut =pod This code reduces an input file to a group of meta-characters. I chose the 8 characters below because the low order bits can be used in future in a digest that I have in mind. @ block boundary A alphanumeric B binary (above 0x7f / 0177 UNLESS in UTF-16 mode) C control chars (below 0x20 / 0040) D whitespace E 0x00 F 0xff G other (TBD) Mapping into a set that can take advantage of perlbio algorithms might be useful. UTF-16 (and -32?) need to be fleshed out. Entropy calculation needs to be fleshed out. =cut use strict; use Getopt::Std; use vars qw( $opt_M $opt_h $opt_U $opt_i $opt_o $opt_b $opt_n $bsize $utf $data $bcount %count $mstr ); my $LOBLOCK = 64; my $HIBLOCK = 8192; %count = ( '@', 0, 'A', 0, 'B', 0, 'C', 0, 'D', 0, 'E', 0, 'F', 0, 'G', 0 ); $mstr =''; # -- command line arguments getopts('hnMU:i:o:b:') or $opt_h = 1; if (! $opt_i) { giveHelp("input file is MANDATORY"); } if (! -e "$opt_i") { giveHelp("input file does not exist: $opt_i"); } if (-e "$opt_o") { giveHelp("output file cannot be overwritten: $opt_o"); } if (defined $opt_b) { $bsize = int($opt_b); if (($bsize < $LOBLOCK) || ($bsize > $HIBLOCK)) { giveHelp("block size ($opt_b) must range from $LOBLOCK to $HIBLOCK"); } } else { $bsize = 4096; # default } if (defined $opt_U) { $utf = int($opt_U); if (($utf != 8) && ($utf != 16)) { giveHelp("UTF size ($opt_U) must be 8 or 16"); } } else { $utf = 8; # default } if ($opt_h) { giveHelp("help"); } # -- hack for now if ($utf != 8) { giveHelp("UTF size ($utf) is not supported yet"); } # -- file I/O open(FIN,"$opt_i") or die "$0 : cannot open input file $opt_i for reading\n"; if ($opt_M) { open(FOUT,">$opt_o") or die "$0 : cannot open output file $opt_o for writing\n"; } # -- process the file by blocks $bcount=0; # this is redundant with $count{'@'} while(read(FIN,$data,$bsize)) { while (length($data)) { my $known = 0; if ($data =~ /^(\000+)/ ) { notate('E', length($1)); $known=1; } if ($data =~ /^(\377+)/ ) { notate('F', length($1)); $known=1; } if ($data =~ /^([\d\w]+)/ ) { notate('A', length($1)); $known=1; } if ($data =~ /^([\001-\037]+)/ ) { notate('C', length($1)); $known=1; } if ($data =~ /^([\200-\376]+)/ ) { notate('B', length($1)); $known=1; } if ($data =~ /^(\s+)/ ) { notate('D', length($1)); $known=1; } if (! $known) { notate('G', 1); } } if (! $opt_M) { $mstr .= '@'; if ($opt_n) { $mstr .= "\n"; } } else { print FOUT '@'; if ($opt_n) { print FOUT "\n"; } } $bcount++; $count{'@'}++; } close(FIN); # -- file output if built in memory if (! $opt_M) { open(FOUT,">$opt_o") or die "$0 : cannot open output file $opt_o for writing\n"; print FOUT $mstr ; } close(FOUT); # -- print some stats print STDERR "\n$0 summary $opt_i : $bcount $bsize blocks"; my $cc = 0; for my $k (sort {$a cmp $b} (keys %count)) { print STDERR ", $k $count{$k}"; if ($k ne '@') { $cc += $count{$k}; } } print STDERR ", chars $cc\n"; exit; # -- usage help, and warnings sub giveHelp { my $msg = shift; if (defined $msg && ($msg ne 'help')) { print STDERR "$0 Warning : $msg\n"; } print STDERR <