package Security::Monitoring::Detection::Markov; use 5.006; use strict; use warnings; use Storable qw(nstore retrieve); use Carp; use Security::Monitoring::Utils; our $dict_file = '/usr/share/dict/words'; =head1 NAME Markov chain based automata class =head1 VERSION Version 0.01 =cut our $VERSION = '0.01'; =head1 SYNOPSIS This module allows you to summon Markov chain based automata That, after being given a dictionary to learn from will give marks to words based on how likely these words are to be part of the language used in the dictionary they learnt. =head1 SUBROUTINES/METHODS =head2 new Summoning method. give it a number (it will be a percentage) to determine when the automaton will trigger an alert. This percentage is the deviation score from the mean of all learnt words. eg 10% with a mean score of all words learnt of 1 will make the automaton trigger an alert if after examining a word it obtains a score that is greate or lower than his mean by at least 10% =head3 use my $automaton = $class->new($alert_level); =cut sub new { my $class = shift; my $alert_level = shift; my $self = {}; bless $self, $class; $self->_init($alert_level); $self; } =head2 _init separate init function to make inheritance easier =cut sub _init{ my $self = shift; my $alert_level = shift; if( !defined($alert_level) || !($alert_level =~ /\d+/)){ croak "alert leve undefined!\n"; } my @keys = qw(t_number word_count mean_score t_table_ref alert_level false_positives); my %t_table;#hash that will later contain the transitions my %false_positives;#hash that will contian the false positives foreach my $key (@keys){ $self->{$key} = 0; } $self->{t_table_ref} = \%t_table; $self->{false_positives} = \%false_positives; } =head2 learn this method is used by an automaton to look through a dictionary in the form of multiple lines. beware that telling an already educated automaton to learn will change whatever results you obtain this function will take a filehandle to the dictionary file (one word per line) and will proceed to populate the automaton t_table with it. each object has a transition table wich is in face made of a number of hash table : first level one has one key for each element of the alphabet and another one for the empty word, each value is a reference to another hashtable containing as keys the letter to which the transition is mad and as value the number of transitions made to that key the last parameter is a boolean, leave it empty to indicate its a file and not just a string. =head3 use $automaton->learn($file_name[,$is_string]); if you do not provide the $is_string parameter then it will behave as if reading from a file =cut sub learn { my ($self,$file_name,$is_string) = @_; my $fh; my $hash = $self->{t_table_ref}; my $words_learnt = 0; #chooses whether we ar elearning from a string or a file if(!defined($is_string)){ if (!defined($file_name)){ croak("using an undefined filename\n"); } open $fh, '<',$file_name or croak qq(could not open dictionary file $file_name :$!\n); } else{ if (!defined $file_name){ croak "sorry, that string is undefined, can not read from it\n"; } elsif ($file_name eq ''){ croak "sorry, that string is empty, nothing to read!\n" } open $fh, '<',\$file_name; } #reads each word and update the transitiontable with it while(<$fh>){ my $word = $_; chomp $word; $word =~ s/(\w)/lc($1)/ge;#put it in full lowercase $word =~ s/[^[[::alphanum::]]]//;#gets rid of non alphanumeric #symbols my @letters = split '',$word; my $last_letter = 'empty';#first iteration starts from the empty #letter for(my $i = 0; $i <= $#letters;$i++){ my $cur_letter = $letters[$i]; #if there is no subkey in the transition table then we start #with one which value is set at 1 $hash->{$last_letter}->{$cur_letter} = defined $hash->{$last_letter}->{$cur_letter} ? $hash->{$last_letter}->{$cur_letter}++ : 1; $self->{t_number}++; $last_letter = $cur_letter; } $words_learnt++; } my $total = 0; seek $fh,0,0; while (<$fh>){ my $word = $_; chomp $word; $word =~ s/(\w)/lc($1)/ge;#put it in full lowercase $word =~ s/[^[[::alphanum::]]]//;#gets rid of non alphanumeric $total += $self->get_score($word); } $self->{mean_score} = (($self->{mean_score} * $self->{word_count}) +($total * $words_learnt)) / ($self->{word_count} + $words_learnt); $self->{word_count}+=$words_learnt; close $fh; return $words_learnt; } =head2 _transitions getter for t_number field =cut sub _transitions{ my $self = shift; $self->{t_number}; } =head2 get_score This method is used once the automaton has learnt enough words (enough being left to the judgement of the user, dictionary files are a good source I reckon) using it it will give you a score that will determine wether that letter ordering is unusual (the higher the score the most unusual it will be). The score should be around 0 for known words, normal transitions such as consumn to vowels should not add too much to the score. strange ones( such as xxxbxczbbwdx)will do. Keep in mind that a completely unknown transition will give one point, anything else will give a score calculated using the following formula : 1 - (number of time this transition already occured / total of transitions recorded); =head3 use $automaton->get_score($word); =cut sub get_score{ my ($self,$word) = @_; my $score = 0; chomp $word; $word =~ s/(\w)/lc($1)/ge; $word =~ s/[^[[::alphanum::]]]//;#gets rid of non alphanumeric #symbols if(defined $self->{false_positives}->{$word}){ return $self->{mean_score}; } my @letters = split('',$word); my $last_letter = "empty"; foreach my $l (@letters){ $score += defined $self->{t_table_ref}->{$last_letter}->{$l}? $self->{t_table_ref}->{$last_letter}->{$l} / $self->{t_number}: 1; $last_letter = $l; } return $score; } =head2 flag_as_false_positive this function allows you to flag a specific word as a false positive afterward whenever the autmaton tries to calculate the score of the word it will return his mean score to prevent triggering an alert =cut sub flag_as_false_positive{ my($self,$word) = @_; $word =~ s/(\w)/lc($1)/ge; $word =~ s/[^[[::alphanum::]]]//;#gets rid of non alphanumeric #symbols $self->{false_positives}->{$word} = 1; } =head2 get_false_positives returns a reference to a list of flagged false positives from the automata =cut sub get_false_positives{ my $self = shift; my @false_positives = keys %{$self->{false_positives}}; return \@false_positives; } =head2 sys_learn this subroutines uses the learn function on the system's /usr/share/dict/words file if it exists. if it does not it will return 0. If it does it will return the number of learnt words if the file is empty it will return -1; =head3 use $automaton->sys_learn; =cut sub sys_learn{ my $self = shift; if (!ref $self){ croak("can not syslearn as class!\n"); } my $file_name = $dict_file; if (-e $file_name){ my $learnt = $self->learn($file_name); return $learnt; } else{ croak "bad package var dict_file\n"; } } 1; # End of Markov