http://qs1969.pair.com?node_id=30216
Category: Miscellaneous
Author/Contact Info Chad Johnston cjohnston@rockstardevelopment.com Guildenstern
Description: Originally written to find common substrings in raw packet data. Can take a file of strings and will count all occurrences of substrings. Can take parameters for smallest substring and number of minimum matches to show to speed it up. I only tested it with the characters 0-9 and A-Z, so the regex may need to be protected if you use different data.
# Packet parser
# Version 1.5 - 29 Aug 2000
#
# I wrote this to inspect raw hex data from network packets
# and find how many times all possible substrings occur.
# Example 1: I was able to determine for one product that
# a particular string was present in all packets and decrypted
# to the administrator's username.
# Example 2: Another program uses tokens inserted into the packets
# to verify to teh application that the remote connections is allowed.
# I was able to determine what that token was using this program.
#
# Typical usage: pp.pl -f myinput.txt > output.fil
use strict;
use IO::File;        # we _need_ a temp file
sub usage();
sub get_cmd_opts();

usage() unless @ARGV;

my $level;           # length of substring to cut off at
my $fname;           # file (data) to be parsed
my $threshold;       # number of matches to cut off at
my $sortby;          # sort method
my @packets = ();    # input data
my $tmpfile;         # tempfile for storing intermediate results
my %all = ();        # substrings and match numbers

get_cmd_opts();

while (<>){push(@packets,$_);}
chomp(@packets);

@packets = sort { length($a) <=> length($b) } @packets;

$tmpfile = IO::File->new_tmpfile or die "Could not create tempfile: $!
+\n";
$tmpfile->autoflush(1);

do
{
 my $p = $packets[0];                  # get first packet
 foreach my $l ($level..length($p))    # substring lengths come from h
+ere
 {
  foreach my $pos (0..length($p)-$l)   # position of substring
  {
   my $str = substr($p,$pos,$l);       # create substr
    my $num = 0;                       # number matches for this subst
+r
    for (0..$#packets)                   # check against all packets
    {
     if ($l <= length($packets[$_]))   # useless to check if our subst
+r is too long
     {
      pos($packets[$_]) = 0;           # makes subsequent matches work
      while ($packets[$_] =~ /$str/g)  # find all matches
      {
       $num ++; 
      }
     }
    }
   if ($num >= $threshold)        # enough matches to keep it?
   {
    print $tmpfile "$str:$num\n"; # dump to temp
   }
  }
 }
 shift(@packets);  # get rid of the packet we just checked
} while ($#packets >= 0);


# first step is done, now read results from tempfile and
# generate output
seek($tmpfile,0,0) or die "Seek error: $!\n";

while (<$tmpfile>)
{
 chomp($_);
 my ($str,$val) = split(":",$_);
 if (!exists($all{$str})) # duplicates mess up counts
 {
  $all{$str} = $val;
 }
}

#my $value;
my @sort = ();

# hack to sort by command line param
# could be improved, i'm sure
if ($sortby eq "l")
{
 # reverse sort by length
 @sort = sort {length($b) <=> length($a) } keys %all;
}
else
{
 # reverse sort by number
 # should also sort by length since output looks a
 # tad funky
 @sort = sort { $all{$b} <=> $all{$a} } keys %all;
}

foreach (@sort)
{
 if (($all{$_} >= $threshold) && (length($_) >= $level))
 {
  print "(",length($_),") \"$_\": $all{$_} times.\n";
 }
 delete($all{$_});
}
undef @sort;

exit;

sub get_cmd_opts()
{
 use Getopt::Std;
 my %opts = ();       # command line params
 
 getopts('f:l:s:n:',\%opts);

 if (!($fname = $opts{'f'})) # file name
 {
  usage(); 
 }
 push(@ARGV,$fname); # cheap hack to avoid opening file, etc, later

 $level = exists($opts{'l'}) ? $opts{'l'} : 5;  # minimum substr lengt
+h

 if ((!exists($opts{'s'})) || ($opts{'s'} eq 'l')) # sort method
 {
  $sortby = "l";   # by length of string
 }
 elsif ($opts{'s'} eq 'f')
 {
  $sortby = "f";   # by frequency of match
 }
 else
 {
  usage();
 }

 $threshold = exists($opts{'n'}) ? $opts{'n'} : 2; # minimum match cou
+nt
}

sub usage()
{
 print "Packet Parser v1.5\n";
 print "Reads in a file of strings and shows all matches for substring
+s.\n";
 print "Note: Tested only with 0-9 and A-Z since that's all we need to
+ make hex characters.\n";
 print "Usage: pp.pl -f inputfile [-l matchlen] [-s sorttype] [-n thre
+shold]\n";
 print "-f\tinputfile: File containing data to be evaluated.\n";
 print "-l\tmatchlen: Specify minimum string length to show matches fo
+r. (Default: 5)\n";
 print "-s\tsorttype: Use l to specify sort by string length. Use f to
+ specify sort by match frequency. (Default: l)\n";
 print "-n\tthreshold: Specify minimum match number in output. (Defaul
+t: 2)\n";
 exit;
}