Substring Finding/Counting

Category:	Miscellaneous
Author/Contact Info	Chad Johnston cjohnston@rockstardevelopment.com Guildenstern
Description:	Originally written to find common substrings in raw packet data. Can take a file of strings and will count all occurrences of substrings. Can take parameters for smallest substring and number of minimum matches to show to speed it up. I only tested it with the characters 0-9 and A-Z, so the regex may need to be protected if you use different data.
# Packet parser # Version 1.5 - 29 Aug 2000 # # I wrote this to inspect raw hex data from network packets # and find how many times all possible substrings occur. # Example 1: I was able to determine for one product that # a particular string was present in all packets and decrypted # to the administrator's username. # Example 2: Another program uses tokens inserted into the packets # to verify to teh application that the remote connections is allowed. # I was able to determine what that token was using this program. # # Typical usage: pp.pl -f myinput.txt > output.fil use strict; use IO::File; # we _need_ a temp file sub usage(); sub get_cmd_opts(); usage() unless @ARGV; my $level; # length of substring to cut off at my $fname; # file (data) to be parsed my $threshold; # number of matches to cut off at my $sortby; # sort method my @packets = (); # input data my $tmpfile; # tempfile for storing intermediate results my %all = (); # substrings and match numbers get_cmd_opts(); while (<>){push(@packets,$_);} chomp(@packets); @packets = sort { length($a) <=> length($b) } @packets; $tmpfile = IO::File->new_tmpfile or die "Could not create tempfile: $! +\n"; $tmpfile->autoflush(1); do { my $p = $packets[0]; # get first packet foreach my $l ($level..length($p)) # substring lengths come from h +ere { foreach my $pos (0..length($p)-$l) # position of substring { my $str = substr($p,$pos,$l); # create substr my $num = 0; # number matches for this subst +r for (0..$#packets) # check against all packets { if ($l <= length($packets[$_])) # useless to check if our subst +r is too long { pos($packets[$_]) = 0; # makes subsequent matches work while ($packets[$_] =~ /$str/g) # find all matches { $num ++; } } } if ($num >= $threshold) # enough matches to keep it? { print $tmpfile "$str:$num\n"; # dump to temp } } } shift(@packets); # get rid of the packet we just checked } while ($#packets >= 0); # first step is done, now read results from tempfile and # generate output seek($tmpfile,0,0) or die "Seek error: $!\n"; while (<$tmpfile>) { chomp($_); my ($str,$val) = split(":",$_); if (!exists($all{$str})) # duplicates mess up counts { $all{$str} = $val; } } #my $value; my @sort = (); # hack to sort by command line param # could be improved, i'm sure if ($sortby eq "l") { # reverse sort by length @sort = sort {length($b) <=> length($a) } keys %all; } else { # reverse sort by number # should also sort by length since output looks a # tad funky @sort = sort { $all{$b} <=> $all{$a} } keys %all; } foreach (@sort) { if (($all{$_} >= $threshold) && (length($_) >= $level)) { print "(",length($_),") \"$_\": $all{$_} times.\n"; } delete($all{$_}); } undef @sort; exit; sub get_cmd_opts() { use Getopt::Std; my %opts = (); # command line params getopts('f:l:s:n:',\%opts); if (!($fname = $opts{'f'})) # file name { usage(); } push(@ARGV,$fname); # cheap hack to avoid opening file, etc, later $level = exists($opts{'l'}) ? $opts{'l'} : 5; # minimum substr lengt +h if ((!exists($opts{'s'})) \|\| ($opts{'s'} eq 'l')) # sort method { $sortby = "l"; # by length of string } elsif ($opts{'s'} eq 'f') { $sortby = "f"; # by frequency of match } else { usage(); } $threshold = exists($opts{'n'}) ? $opts{'n'} : 2; # minimum match cou +nt } sub usage() { print "Packet Parser v1.5\n"; print "Reads in a file of strings and shows all matches for substring +s.\n"; print "Note: Tested only with 0-9 and A-Z since that's all we need to + make hex characters.\n"; print "Usage: pp.pl -f inputfile [-l matchlen] [-s sorttype] [-n thre +shold]\n"; print "-f\tinputfile: File containing data to be evaluated.\n"; print "-l\tmatchlen: Specify minimum string length to show matches fo +r. (Default: 5)\n"; print "-s\tsorttype: Use l to specify sort by string length. Use f to + specify sort by match frequency. (Default: l)\n"; print "-n\tthreshold: Specify minimum match number in output. (Defaul +t: 2)\n"; exit; }

Back to Code Catacombs