package Markov::Ndimensional;

# by bliako
# for https://perlmonks.org/?node_id=1228191
# 20/01/2019

use strict;
use warnings;

our $VERSION = 0.1;

use Exporter qw/import/;
our @EXPORT = qw/learn predict range save_state load_state/;

use Storable;
use Data::Dump;

# * reads in some data (from file or string),
# * splits it into symbols according to a separator,
#   or works character-by-character if the separator is left undef,
# * so eventually it has a stream of symbols.
# These symbols can be words in text (if separator = word boundary = '\b')
# or they can be letters in text (if the separator is left undefined)
# (in this case you must arrange that the input has each word on a line of its own).
# Symbols are then put together to form "ngrams" and the frequency of
# each ngram is counted.
# Input parameter 'ngram-length' refers to the number of symbols in an ngram.
# For example, with ngram-length=2 and a word-boundary separator
# we get
#   'hello','there' => 5
#   'hello','you'   => 10
#   'goodbye','all' => 5
#   etc.
# The numbers are the counts for each ngram as read from the input data.
# Input parameter 'counts' can be used to supply such frequency data
# without re-reading a huge corpus each time. The param must be a hashref
# where key=ngram (its symbols separated by anything you like, no quotes necessary,
# but do use some separator in order not to confuse composite words).
# With that frequency data, we proceed to calculate a probability
# distribution ('dist') which is just the normalised frequencies.
# Then we build the cumulative probability distribution but with a twist:
# the ngrams are broken into (n-1)grams and 1grams (from left to right for latin scripts).
# The (n-1)gram is the key. For all possible endings of that key (i.e.
# all the 1grams with which it ends) we count frequencies, normalise to
# probabilities and then build the cumulative probability.
# For example, consider the following 'counts'
# (let's assume we are dealing with letters, ngram-length=3):
#   HEL => 10
#   HEA => 20
#   HEN => 5
#   ALA => 10
#   ALB => 20
# The cumulative-probability-distribution will be something like this
# (stored as a flat arrayref of symbol => cumulative-probability pairs):
#   HE => [
#     L => 10/(10+20+5)
#     A => 10/(10+20+5) + 20/(10+20+5)
#     N => 10/(10+20+5) + 20/(10+20+5) + 5/(10+20+5)
#   ]
#   AL => [ ... ] # left as an exercise for the reader
# The sub returns a hashref made of the following entries:
#   'counts' => frequency(=counts) of each ngram as it occurred in the input data
#               (omitted if 'avoid-to-return' says so)
#   'dist' => probability distribution (i.e. normalised frequency/counts) of the above
#   'twisted-dist' => as explained before (only included if asked for via 'need-to-return')
#   'cum-twisted-dist' => as explained before.
#   'N' => the ngram-length used.
# Refer to the 'predict()' sub for how the 'cum-twisted-dist' hash is used to make a prediction.
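# Example usage (a minimal sketch which re-uses the 'counts' shown above; a real
# application would supply its own data, e.g. via 'input-filename' or 'input-string'):
#
#   use Markov::Ndimensional;
#   my $state = learn({
#       'ngram-length' => 3,
#       'counts'       => { 'HEL'=>10, 'HEA'=>20, 'HEN'=>5, 'ALA'=>10, 'ALB'=>20 },
#   }) or die "learn() failed";
#   # $state->{'dist'} and $state->{'cum-twisted-dist'} are now ready for predict()/range().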
sub learn {
    my $params = $_[0] // {};
    my $parent = ( caller(1) )[3] || "N/A";
    my $whoami = ( caller(0) )[3];

    my $ngram_length = $params->{'ngram-length'} // 1;

    # optional separator, can be left undef in which case we work letter-by-letter;
    # if defined it has the form of a regex, e.g. '.' or '\b'
    my $separator = $params->{'separator'};
    my $internal_separator = defined $separator
        ? $params->{'internal-separator'} // '\|/'
        : '';

    # optional debug parameter
    my $DEBUG = $params->{'debug'} // 0;

    # optionally specify here 'counts' or 'twisted-dist'
    # in order to return them
    my $need = $params->{'need-to-return'} // {};
    # optionally specify here what not to return
    my $avoid = $params->{'avoid-to-return'} // {};

    # counts the occurrence of each combination of symbols of length 'ngram_length';
    # if specified in params, data will be appended and the distribution
    # re-calculated with old and new data:
    my $counts = undef;

    my $remove_these_characters_from_input = $params->{'remove-these-characters'};

    # TODO: separator for symbols
    if( defined($counts=$params->{'counts'}) ){
        my ($anykey) = keys %$counts;
        # NOTE: this sanity check assumes single-character symbols (no separator)
        my $le = length($anykey);
        if( $le != $ngram_length ){
            print STDERR "$whoami (via $parent) : input counts data was for ngram-length of $le and not $ngram_length which you requested.\n";
            return undef
        }
    } else { $counts = {} } # <<< fresh counts

    my ($infile, $instring, $howmany, $m);
    my $remove_chars_regex = defined($remove_these_characters_from_input)
        ? qr/${remove_these_characters_from_input}/
        : undef;

    if( defined($infile=$params->{'input-filename'}) ){
        $howmany = 0;
        $counts = {};
        my $FH;
        if( ! open $FH, '<', $infile ){
            print STDERR "$whoami (via $parent) : could not open file '$infile' for reading, $!\n";
            return undef
        }
        while( my $line = <$FH> ){
            if( defined $remove_chars_regex ){ $line =~ s/${remove_chars_regex}/ /g; }
            $howmany += _process_line(\$line, $counts, $ngram_length, $separator, $internal_separator)
        }
        close($FH);
        if( $DEBUG ){ print "$whoami (via $parent) : file read OK '$infile': $howmany items read.\n" }
    } elsif( defined($instring=$params->{'input-string'}) ){
        $howmany = 0;
        $counts = {};
        foreach my $line (split /^/, $instring){
            $howmany += _process_line(\$line, $counts, $ngram_length, $separator, $internal_separator)
        }
        if( $DEBUG ){ print "$whoami (via $parent) : string read OK: $howmany items read.\n" }
    } elsif( defined($m=$params->{'input-array-ngrams'}) ){
        $counts = {};
        $howmany = scalar(@$m) - $ngram_length + 1;
        my ($ngram, $i);
        for($i=0;$i<$howmany;$i++){
            # take a sliding window of exactly ngram_length symbols
            $ngram = join($internal_separator, @{$m}[$i..($i+$ngram_length-1)]);
            $counts->{$ngram}++;
        }
        if( $DEBUG ){ print "$whoami (via $parent) : array-ngrams read OK: $howmany items read.\n" }
    } elsif( defined($m=$params->{'input-array-symbols'}) ){
        $counts = {};
        $howmany = scalar(@$m);
        my $i;
        for($i=0;$i<$howmany;$i++){
            $counts->{$m->[$i]}++;
        }
        if( $DEBUG ){ print "$whoami (via $parent) : array-symbols read OK: $howmany items read.\n" }
    } elsif( ! defined($counts=$params->{'counts'}) ){
        print STDERR "$whoami (via $parent) : either 'input-filename', 'input-string', 'input-array-symbols', 'input-array-ngrams' or 'counts' (which is a hashref of symbol counts) must be specified.\n";
        return undef
    }

    # create normalised counts = probability distribution (into %dist)
    my %dist = %$counts;
    my $sum = 0;
    $sum += $_ for values %dist;
    $dist{$_} /= $sum for keys %dist;
    if( $DEBUG ){ print "$whoami (via $parent) : normalised counts OK:\n".Data::Dump::dump(%dist)."\n" }

    my ($asy, $lasy, $c, $i, $l, $arr);

    # make a cumulative distribution but break each ngram into 2 parts:
    # the last part is the last symbol and the first part is the rest.
    # We do a lookup by using the first part so that we have all the available
    # choices for what may follow that first part as a cum-prob-dist.
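    # As a worked example (a sketch re-using the HEL/HEA/HEN counts from the comment
    # at the top; the order of the pairs depends on Perl's hash ordering):
    #   after the loop just below    : 'HE' => [ 'L',10,    'A',20,    'N',5    ]
    #   after the normalisation loop : 'HE' => [ 'L',10/35, 'A',20/35, 'N',5/35 ]
    #   after the cumulative pass    : 'HE' => [ 'L',0.286, 'A',0.857, 'N',1.0  ]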
    my %twisted_dist = ();
    foreach $asy (keys %$counts){
        $c = $counts->{$asy};
        # the internal_separator must be escaped (\Q) because it may contain regex metacharacters
        my @lasys = split /\Q${internal_separator}/, $asy;
        $lasy = pop @lasys;
        $asy = join $internal_separator, @lasys;
        # if ngram-length == 1 then the (n-1)gram key is the empty string
        if( $asy eq '' ){ $asy = '' }
        if( ! exists $twisted_dist{$asy} ){ $twisted_dist{$asy} = [] }
        push @{$twisted_dist{$asy}}, ($lasy, $c);
    }
    # normalise it, remember: it's an array of 'symbol','count','symbol','count', ...
    foreach $asy (keys %twisted_dist){
        $arr = $twisted_dist{$asy};
        $l = scalar(@$arr);
        $sum = 0;
        for($i=1;$i<$l;$i+=2){ $sum += $arr->[$i] }
        for($i=1;$i<$l;$i+=2){ $arr->[$i] /= $sum }
    }
    if( $DEBUG ){ print "$whoami (via $parent) : made twisted distribution OK:\n".Data::Dump::dump(\%twisted_dist)."\n" }

    my $cum_twisted_dist = undef;
    if( $need->{'all'} or $need->{'twisted-dist'} ){
        # the caller also wants the plain twisted dist, so work on a copy
        $cum_twisted_dist = {};
        foreach $asy (keys %twisted_dist){ $cum_twisted_dist->{$asy} = [@{$twisted_dist{$asy}}] }
    } else { $cum_twisted_dist = \%twisted_dist }

    # do the cumulative thing:
    foreach $asy (keys %$cum_twisted_dist){
        $arr = $cum_twisted_dist->{$asy};
        $l = scalar(@$arr);
        for($i=3;$i<$l;$i+=2){ $arr->[$i] += $arr->[$i-2] }
        # force the last item to 1 because rounding errors might leave it at .9999
        $arr->[-1] = 1.0;
    }
    if( $DEBUG ){ print "$whoami (via $parent) : made cumulative twisted distribution OK:\n".Data::Dump::dump($cum_twisted_dist)."\n" }

    my %ret = (
        'cum-twisted-dist' => $cum_twisted_dist,
        'dist' => \%dist,
        'N' => $ngram_length
    );
    if( $need->{'all'} or $need->{'twisted-dist'} ){ $ret{'twisted-dist'} = \%twisted_dist }
    if( ! $avoid->{'counts'} ){ $ret{'counts'} = $counts }
    return \%ret
}

# return the next symbol in sequence, weighted on probabilities
# and given the previous symbols (inp).
# 'dat' must be built using learn(); it is the 'cum-twisted-dist' entry (in the returned hash).
# Returns undef on failure.
#
# The input is a 'cum-twisted-dist' hashref (as described in 'learn()').
# Assuming that we processed our data with ngram-length=3,
# this function will take as input the first 2 symbols
# and will predict the 3rd based on the probability distribution
# built from the input data during the 'learn()' stage.
# For example, if we built, with ngram-length=3, the distribution of
# triplets of bases of the human genome (A,T,C,G), then
# during the 'learn()' stage we would have built the following data:
#   'counts', e.g. ATC => 1000
#   'dist', e.g. ATC => 1000/(total number of triplets)
#     << which is the normalised frequency
#   'cum-twisted-dist', with a twist, e.g.
#     AT => [ C=>0.2, G=>0.3, A=>0.8, T=>1.0 ]
#     << a flat arrayref of (symbol, cumulative probability) pairs, the last one always 1.0
# This last 'cum-twisted-dist' is our 'dat' input parameter.
# Then we can ask the system to predict the third base
# given the first two bases, 'AT'.
# The prediction will be random but weighted on the cum-twisted-dist.
# In our example, predict(AT) will yield an A most of the time, followed by
# equally-probable C and T and then G (1 in 10).
sub predict {
    my $dat = $_[0]; # the hashref with the cum-prob distribution ('cum-twisted-dist')
    my $inp = $_[1]; # a string of the previous (n-1) symbols joined by the internal separator, e.g. 'AT' or 'I\|/want'
    my $c = $dat->{$inp};
    if( ! defined $c ){ return undef }
    my $l = scalar(@$c);
    my $r = rand;
    my $i;
    for($i=1;$i<$l;$i+=2){
        if( $r < $c->[$i] ){ return $c->[$i-1] }
    }
    return undef
}
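# Example usage of predict() (a minimal sketch; it assumes $state holds the result of
# learn() run with ngram-length=3 over letter data, as in the genome example above,
# and the seed 'AT' is made up):
#
#   my $dat = $state->{'cum-twisted-dist'};
#   my $sequence = 'AT'; # a seed of ngram-length-1 symbols which must exist in the data
#   for (1..100){
#       my $next = predict($dat, substr($sequence, -2));
#       last unless defined $next; # dead end: this 2-symbol key was never seen
#       $sequence .= $next;
#   }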
sub save_state {
    my $dat = $_[0];
    my $outfile = $_[1];
    if( ! Storable::store($dat, $outfile) ){
        print STDERR 'save_state()'.": call to ".'Storable::store()'." has failed for output file '$outfile'.\n";
        return 0
    }
    return 1
}
sub load_state {
    my $infile = $_[0];
    my $ret = Storable::retrieve($infile);
    if( ! defined($ret) ){
        print STDERR 'load_state()'.": call to ".'Storable::retrieve()'." has failed for input file '$infile'.\n";
        return 0
    }
    return $ret
}

# Same lookup as 'predict()' but it does not pick a symbol.
# It returns all the possible options the (n-1)gram input has,
# as an arrayref of (symbol, cumulative probability) pairs.
# Will return undef if the input has never been seen and does
# not exist in 'dat' (which is a 'cum-twisted-dist', see 'learn()').
sub range {
    #my $dat = $_[0]; # the hashref with the cum-prob distribution ('cum-twisted-dist')
    #my $inp = $_[1]; # must be a string of the previous (n-1) symbols, as in predict()
    #return $dat->{$inp};
    return $_[0]->{$_[1]}
}

# private subs

# Processes a line of text which consists of words separated by an optional separator.
# By specifying a separator we can process sentences where words are our basic symbols; in this
# way we can have arbitrary-length symbols.
# On the other hand, the separator can be left undef. In this case we treat the line character-by-character
# and each ngram consists of exactly ngram_length letters.
# It returns the number of ngrams found in this line (>=0).
# It assumes that a line contains only complete words; no half word can end the line.
sub _process_line {
    my $line = ${$_[0]}; # expecting a scalar ref to the line to process
    my $output_ref = $_[1];
    my $ngram_length = $_[2];
    my $sep = $_[3]; # optional, can be left undefined for no separator and work letter-by-letter
    my $intsep = $_[4]; # optional and only if sep is defined, it sets the internal separator between symbols

    $line =~ s/>.*$//;  # discard anything from a '>' to the end of the line
    $line =~ s/\s+/ /g;
    $line =~ s/\s+$//;
    return 0 if $line =~ /^\s*$/; # nothing to do

    my ($howmany);
    if( defined $sep ){
        # we have a separator, so we need to regex
        $howmany = 0;
        my $temp;
        while( $line =~ /(((.+?)(?=${sep}+|$)){${ngram_length}})/g ){
            $temp = $1;
            $temp =~ s/^${sep}+//;
            $temp =~ s/${sep}+/${intsep}/g;
            $output_ref->{$temp}++;
            $howmany++;
        }
    } else {
        # no separator: every substring of ngram_length characters is an ngram
        $howmany = length($line)-$ngram_length+1;
        for(my $from=$howmany;$from-->0;){
            $output_ref->{substr $line, $from, $ngram_length}++;
        }
    }
    return $howmany;
}

1;
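# Example of saving/restoring a learnt state and inspecting the available options
# with range() (a minimal sketch; the filename 'markov.state' and the key 'AT' are made up):
#
#   save_state($state, 'markov.state') or die "save_state() failed";
#   # ... later, possibly in another script ...
#   my $state2 = load_state('markov.state') or die "load_state() failed";
#   # all (symbol, cumulative-probability) pairs that may follow 'AT', or undef if never seen:
#   my $options = range($state2->{'cum-twisted-dist'}, 'AT');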