#!/usr/bin/perl
use strict;
use warnings;

# A simple-minded attempt to generate a "signature" value that could be
# used as a means of numerically measuring document similarity.
#
# Usage: $0 < doctable > sig.list
#
# Input on stdin is one doc entry per line (whitespace separated):
#     file.name  doc_id  begin_offset  end_offset
# The offsets are handed to seek()/read() unchanged, so they are positions
# within file.name as those built-ins count them.
#
# Output on stdout is one line per entry:
#     doc_id <TAB> signature

# @LETTERS is ordered according to relative letter frequency in English
# news text (based on a 1-month sample of AP newswire).
my @LETTERS = qw/e a t i n o s r h l d c u m p g f w y b v k j x z q/;

# %WEIGHT_OF stores a distinct power of 2 for each letter; more frequent
# letters are assigned lower powers (e => 1, a => 2, t => 4, ...).
my %WEIGHT_OF = map { $LETTERS[$_] => 1 << $_ } 0 .. $#LETTERS;

# Run as a script only; when this file is loaded with require/do the
# helpers below are available without touching stdin.
main() unless caller;

# doc_signature($text)
#
# Returns the signature of $text: ASCII letters are folded to lower case,
# every other character is discarded, and the occurrence count of each
# letter is multiplied by that letter's weight and summed.  Returns 0 for
# undefined or letter-free text.
sub doc_signature {
    my ($text) = @_;
    return 0 unless defined $text;

    # tr/// (not lc) so that only the 26 ASCII letters are ever folded.
    my $letters = $text;
    $letters =~ tr/A-Z/a-z/;
    $letters =~ tr/a-z//cd;

    my %count_of;
    $count_of{$_}++ for split //, $letters;

    my $sig = 0;
    $sig += $count_of{$_} * $WEIGHT_OF{$_} for keys %count_of;
    return $sig;
}

# main()
#
# Reads the doc table from stdin, extracts each document's text from its
# file and prints "doc_id\tsignature".  Dies if arguments are given, if a
# file cannot be opened, or if seek/read fails; warns about (and skips)
# malformed table lines, and warns when a document is shorter than its
# table entry claims.
sub main {
    die "Usage: $0 < doctable > sig.list\n" if @ARGV;

    my $in_fh;        # handle of the file currently being read
    my $open_name;    # its name, so consecutive entries reuse the handle
    my $offset = 0;   # where the next read() on $in_fh would start

    LINE:
    while ( my $line = <STDIN> ) {
        my ( $fn, $id, $so, $eo ) = split ' ', $line;
        next LINE unless defined $fn;    # blank line

        unless ( defined $eo
            && $so =~ /\A\d+\z/
            && $eo =~ /\A\d+\z/
            && $so <= $eo )
        {
            warn "line $.: malformed doc entry skipped\n";
            next LINE;
        }

        if ( !defined $open_name || $fn ne $open_name ) {
            close $in_fh if $in_fh;
            undef $open_name;

            # Three-argument open: the name comes from input data and must
            # never be interpreted as a mode or a pipe.
            open $in_fh, '<', $fn or die "$fn: $!";
            $open_name = $fn;
            $offset    = 0;
        }

        # Entries that follow one another in the file need no seek.
        if ( $so != $offset ) {
            seek( $in_fh, $so, 0 ) or die "$fn: seek to $so: $!";
        }

        my $text = '';
        my $want = $eo - $so;
        my $got  = read( $in_fh, $text, $want );
        die "$fn: read: $!" unless defined $got;
        warn "$fn: $id: expected $want bytes, got $got\n" if $got < $want;

        # Track what was really consumed so a short read cannot make the
        # next entry skip a seek it needs.
        $offset = $so + $got;

        # The signature is an integer sum; %f is kept so the output format
        # stays the same as before.
        printf "%s\t%f\n", $id, doc_signature($text);
    }

    close $in_fh if $in_fh;
    return;
}