CT => ["A", 0.7, "G", 1],
GA => ["T", 0.8, "C", 1],
GC => ["A", 0.2, "T", 1],
GT => ["A", 1],
####
AAA => 0.0335,
CCC => 0.0158,
GGG => 0.0158,
TTT => 0.0350,
GCG => 0.0030,
...
####
package Markov::Ndimensional;
# by bliako
# for https://perlmonks.org/?node_id=1228191
# 20/01/2019
use strict;
use warnings;
our $VERSION = 0.1;
use Exporter qw/import/;
our @EXPORT = qw/learn predict range save_state load_state/;
use Storable;
use Data::Dump;
# * reads in some data (from file or string),
# * splits it into symbols according to a separator,
# or works character-by-character if the separator is left undef,
# * ends up with a sequence of symbols.
# These symbols can be words in text (if separator = word boundary = '\b')
# or they can be letters in text (if the separator is left undefined)
# (in the latter case you must arrange for the input to have each word on a line of its own).
# Symbols are then put together to form "ngrams" and the frequency of
# each ngram is counted.
# Input parameter 'ngram-length' refers to the number of symbols in an ngram.
# For example, with an ngram-length=2 and a word-boundary separator
# we get
# 'hello','there' => 5
# 'hello','you' => 10
# 'goodbye','all' => 5
# etc.
# the numbers are the counts for each ngram as read from the input data.
# Input parameter 'counts' can be used to supply such frequency data
# without re-reading a huge corpus each time. The param must be a hashref
# whose keys are ngrams (symbols joined by any separator you like, no quotes
# necessary, but do use some separator so that composite words are not confused).
# With that frequency data, we proceed to calculate a probability
# distribution ('dist') which is just normalised frequencies.
# Then we build the cumulative probability distribution but with a twist:
# the ngrams are broken into (n-1)grams and 1grams (from left to right, for Latin scripts).
# The leading (n-1)gram is the key. For all possible endings of that (i.e.
# all the 1grams with which it ends) we count frequencies, normalise to
# probabilities and then build the cumulative probability.
# For example consider the following 'counts'
# (let's assume we are dealing with letters, ngram-length=3):
# HEL => 10
# HEA => 20
# HEN => 5
# ALA => 10
# ALB => 20
# The cumulative-probability-distribution will be something like this:
# HE => {
# L => 10/(10+20+5)
# A => 10/(10+20+5) + 20/(10+20+5)
# N => 10/(10+20+5) + 20/(10+20+5) + 5/(10+20+5) = 1.0
# }
# (internally each entry is stored as a flat array of symbol/cumulative-probability
# pairs, e.g. HE => ['L', 10/35, 'A', 30/35, 'N', 1.0], in no guaranteed order)
# AL # left as an exercise for the reader
# The sub returns a hashref made of these hashrefs (plus 'N', the ngram-length):
# 'counts' => frequency(=counts) of each ngram as it occurred in the input data
# 'dist' => probability distribution (i.e. normalised frequency/counts) of the above
# 'twisted-dist' => as explained before (returned only if requested via 'need-to-return').
# 'cum-twisted-dist' => as explained before.
# Refer to the 'predict()' sub for how the 'cum-twisted-dist' hash is used to make a prediction.
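#
# A minimal usage sketch (in comments; 'dna.txt' is a hypothetical input file,
# processed letter-by-letter since 'separator' is left undef):
#
# use Markov::Ndimensional;
# my $state = learn({
# 'ngram-length' => 3,
# 'input-filename' => 'dna.txt',
# }) or die "learn() failed";
# my $next = predict($state->{'cum-twisted-dist'}, 'AT'); # predict the 3rd symbol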
sub learn {
my $params = $_[0] // {};
my $parent = ( caller(1) )[3] || "N/A";
my $whoami = ( caller(0) )[3];
my $ngram_length = $params->{'ngram-length'} // 1;
# optional separator, can be left undef in which case we work letter-by-letter
# if defined it has the form of a regex, e.g. '.' or '\b'
my $separator = $params->{'separator'};
my $internal_separator = defined $separator ? $params->{'internal-separator'} // '\|/' : '';
# optional debug parameter
my $DEBUG = $params->{'debug'} // 0;
# optionally specify here 'all' or 'twisted-dist' (as hash keys set to 1)
# in order to have them returned
my $need = $params->{'need-to-return'} // {};
# optionally specify here what not to return, e.g. {'counts'=>1}
my $avoid = $params->{'avoid-to-return'} // {};
# counts the occurrence of each combination of symbols of length 'ngram_length'
# if specified in params, data will be appended and distribution
# re-calculated with old and new data:
my $counts = undef;
my $remove_these_characters_from_input = $params->{'remove-these-characters'};
# TODO: separator for symbols
if( defined($counts=$params->{'counts'}) ){
# sanity check: key length vs requested ngram-length; note this simple check
# is only meaningful for single-character symbols with no internal separator
my ($anykey) = keys %$counts;
my $le = length($anykey);
if( $le != $ngram_length ){ print STDERR "$whoami (via $parent) : input counts data was for ngram-length of $le and not $ngram_length which you requested.\n"; return undef }
} else { $counts = {} } # <<< fresh counts
my ($infile, $instring, $howmany, $m);
my $remove_chars_regex = defined($remove_these_characters_from_input) ? qr/${remove_these_characters_from_input}/ : undef;
if( defined($infile=$params->{'input-filename'}) ){
$howmany = 0;
# do not reset $counts here: if 'counts' was supplied, new data is appended to it
my $FH;
if( ! open $FH, '<', $infile ){ print STDERR "$whoami (via $parent) : could not open file '$infile' for reading, $!"; return undef }
while( my $line = <$FH> ){
if( defined $remove_chars_regex ){ $line =~ s/${remove_chars_regex}/ /g; }
$howmany += _process_line(\$line, $counts, $ngram_length, $separator, $internal_separator)
}
close($FH);
if( $DEBUG ){ print "$whoami (via $parent) : file read OK '$infile': $howmany items read.\n" }
} elsif( defined($instring=$params->{'input-string'}) ){
$howmany = 0;
# again, do not reset $counts: supplied 'counts' (if any) are appended to
foreach my $line (split /^/, $instring){
$howmany += _process_line(\$line, $counts, $ngram_length, $separator, $internal_separator)
}
if( $DEBUG ){ print "$whoami (via $parent) : string read OK: $howmany items read.\n" }
} elsif( defined($m=$params->{'input-array-ngrams'}) ){
# again, do not reset $counts: supplied 'counts' (if any) are appended to
$howmany = scalar(@$m) - $ngram_length + 1;
my ($ngram, $i);
for($i=0;$i<$howmany;$i++){
$ngram = join($internal_separator, @{$m}[$i..($i+$ngram_length-1)]); # slice of exactly ngram_length symbols
$counts->{$ngram}++;
}
if( $DEBUG ){ print "$whoami (via $parent) : array-ngrams read OK: $howmany items read.\n" }
} elsif( defined($m=$params->{'input-array-symbols'}) ){
# again, do not reset $counts: supplied 'counts' (if any) are appended to
$howmany = scalar(@$m);
my $i;
for($i=0;$i<$howmany;$i++){
$counts->{$m->[$i]}++;
}
if( $DEBUG ){ print "$whoami (via $parent) : array-symbols read OK: $howmany items read.\n" }
} elsif( ! defined($counts=$params->{'counts'}) ){ print STDERR "$whoami (via $parent) : one of 'input-filename', 'input-string', 'input-array-symbols', 'input-array-ngrams' or 'counts' (a hashref of ngram counts) must be specified.\n"; return undef }
# create normalised counts = probability distribution (into %dist)
my %dist = %$counts;
my $sum = 0; $sum+=$_ for values %dist;
$dist{$_} /= $sum for keys %dist;
if( $DEBUG ){ print "$whoami (via $parent) : normalised counts OK:\n".Data::Dump::dump(%dist)."\n" }
my ($asy, $lasy, $c, $i, $l, $arr);
# make a cumulative distribution but break it into 2 parts
# the last part is the last symbol and the first part are the rest
# we do a lookup by using the first part so that we have all the available
# choices for what may follow that first part as cum-prob-dist
my %twisted_dist = ();
foreach $asy (keys %$counts){
$c = $counts->{$asy};
# internal_separator must be escaped (\Q)...
my @lasys = split /\Q${internal_separator}/, $asy;
$lasy = pop @lasys;
$asy = join $internal_separator, @lasys;
# if ngram-length == 1 the leading part is empty: all 1grams share the '' key
if( ! exists $twisted_dist{$asy} ){ $twisted_dist{$asy} = [] }
push @{$twisted_dist{$asy}}, ($lasy, $c);
}
# normalise it, remember: it's an array of 'symbol','count','symbol','count', ...
foreach $asy (keys %twisted_dist){
$arr = $twisted_dist{$asy};
$l = scalar(@$arr);
$sum = 0; for($i=1;$i<$l;$i+=2){ $sum += $arr->[$i] }
for($i=1;$i<$l;$i+=2){ $arr->[$i] /= $sum }
}
if( $DEBUG ){ print "$whoami (via $parent) : made twisted distribution OK:\n".Data::Dump::dump(\%twisted_dist)."\n" }
my $cum_twisted_dist = undef;
if( $need->{'all'} or $need->{'twisted-dist'} ){
$cum_twisted_dist = {};
foreach $asy (keys %twisted_dist){
$cum_twisted_dist->{$asy} = [@{$twisted_dist{$asy}}]
}
} else { $cum_twisted_dist = \%twisted_dist }
# do the cumulative thing:
foreach $asy (keys %$cum_twisted_dist){
$arr = $cum_twisted_dist->{$asy};
$l = scalar(@$arr);
for($i=3;$i<$l;$i+=2){ $arr->[$i] += $arr->[$i-2] }
# force the last item to 1 because rounding errors might leave it at .9999
$arr->[-1] = 1.0;
}
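# e.g. for the HEL/HEA/HEN counts above, the entry for key 'HE' was the twisted
# dist ['L',10/35, 'A',20/35, 'N',5/35]; after this loop it is the cumulative
# ['L',10/35, 'A',30/35, 'N',1.0] (pair order depends on hash iteration order).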
if( $DEBUG ){ print "$whoami (via $parent) : made cumulative twisted distribution OK:\n".Data::Dump::dump($cum_twisted_dist)."\n" }
my %ret = (
'cum-twisted-dist' => $cum_twisted_dist,
'dist' => \%dist,
'N' => $ngram_length
);
if( $need->{'all'} or $need->{'twisted-dist'} ){ $ret{'twisted-dist'} = \%twisted_dist }
if( ! $avoid->{'counts'} ){ $ret{'counts'} = $counts }
return \%ret
}
# return the next symbol in sequence, weighted by the probabilities
# and given the previous symbols (inp)
# 'dat' must be built using learn(); it is the 'cum-twisted-dist' entry (in the returned hash)
# returns undef on failure.
#
# The input is a 'cum-twisted-dist' hashref (as described in 'learn()')
# Assuming that we processed our data with ngram-length=3,
# then this function will take as input the first 2 symbols
# and will predict the 3rd based on the probability distribution
# built from the input data during the 'learn()' stage.
# For example, if built with ngram-length=3 the distribution of
# triplets of bases of the human genome (A,T,C,G), then
# during the 'learn()' stage we would have built the following data
# 'counts', e.g. ATC => 1000
# 'dist', e.g. ATC => 1000/(total number of triplets) # << which is the normalised frequency
# 'cum-twisted-dist', with a twist, e.g.
# AT => ['C',0.2, 'G',0.3, 'A',0.8, 'T',1.0] # << flat array of cumulative probabilities, last entry always 1
# This last 'cum-twisted-dist' is our 'dat' input parameter.
# Then we can ask the system to predict the third base
# given the first two bases, 'AT'.
# The prediction is random but weighted by the cum-twisted-dist.
# In our example, predict() for 'AT' will yield an A most of the time (5 in 10),
# followed by equally-probable C and T (2 in 10 each) and then G (1 in 10).
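# A sketch of a call (assuming $state came from learn() with ngram-length=3,
# letter symbols, no separator):
# my $third_base = predict($state->{'cum-twisted-dist'}, 'AT');
# # one of 'A','C','G','T' here, or undef if 'AT' was never seen in the input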
sub predict {
my $dat = $_[0]; # the hashref with the cum-prob distribution ('cum-twisted-dist')
my $inp = $_[1]; # a string of the leading (n-1) symbols joined by the internal separator, e.g. 'AT' or 'I|want'
my $c = $dat->{$inp};
if( ! defined $c ){ return undef }
my $l = scalar(@$c);
my $r = rand;
my $i;
for($i=1;$i<$l;$i+=2){
if( $r < $c->[$i] ){ return $c->[$i-1] }
}
return undef
}
sub save_state {
my $dat = $_[0];
my $outfile = $_[1];
if( ! Storable::store($dat, $outfile) ){ print STDERR 'save_state()'.": call to ".'Storable::store()'." has failed for output file '$outfile'.\n"; return 0 }
return 1
}
sub load_state {
my $infile = $_[0];
# Storable::retrieve() can die on corrupt input, so trap that too;
# return undef (not 0) on failure so callers' defined() checks work
my $ret = eval { Storable::retrieve($infile) };
if( $@ or ! defined($ret) ){ print STDERR 'load_state()'.": call to ".'Storable::retrieve()'." has failed for input file '$infile'.\n"; return undef }
return $ret
}
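# A save/load round-trip sketch ('model.state' is a hypothetical filename):
# save_state($state, 'model.state') or die "save_state() failed";
# my $restored = load_state('model.state') // die "load_state() failed";
# # $restored->{'cum-twisted-dist'} etc. are usable exactly as before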
# Same as 'predict()' but it does not make a prediction.
# It returns all the possible continuations of the (n-1)gram input
# as an arrayref of (symbol, cumulative-probability) pairs.
# It will return undef if the input has never been seen and does
# not exist in 'dat' (which is a 'cum-twisted-dist', see 'learn()').
sub range {
#my $dat = $_[0]; # the hashref with the cum-prob distribution ('cum-twisted-dist')
#my $inp = $_[1]; # must be a string of length = ngram_length
#return $dat->{$inp};
return $_[0]->{$_[1]}
}
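# e.g. my $options = range($state->{'cum-twisted-dist'}, 'AT');
# $options is the flat [symbol, cum-prob, symbol, cum-prob, ...] arrayref for 'AT'
# (or undef if 'AT' was never seen); successive differences of the cum-probs
# give each symbol's individual probability.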
# private subs
# processes a line of text consisting of symbols separated by an optional separator.
# By specifying a separator we can process sentences where words are our basic symbols. In this
# way we can have arbitrary-length symbols.
# On the other hand, the separator can be left undef. In this case the line is treated
# character-by-character: each symbol is a single letter and each ngram consists of exactly ngram_length letters.
# It returns the number of ngrams found in this line (>=0).
# It assumes that a line contains only complete words; no half word may end the line.
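# e.g. with separator undef and ngram_length=3, the line 'ATTAC' increments
# counts for the overlapping ngrams 'ATT', 'TTA' and 'TAC'; with separator='\s',
# intsep='|' and ngram_length=2, 'hello there world' contributes 'hello|there'
# (matches are taken left to right and do not overlap).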
sub _process_line {
my $line = ${$_[0]}; # expecting a scalar ref to the line to process
my $output_ref = $_[1];
my $ngram_length = $_[2];
my $sep = $_[3]; # optional, can be left undefined for no separator and work letter-by-letter
my $intsep = $_[4]; # optional and only if sep is defined, it sets the internal separator between symbols
$line =~ s/>.*$//; # strip FASTA headers (drop everything from '>' to end-of-line)
$line =~ s/\s+/ /g; $line =~ s/\s+$//;
return 0 if $line =~ /^\s*$/; # nothing to do
my ($howmany);
if( defined $sep ){
# we have a separator: match ngram_length separated symbols with a regex
$howmany = 0;
my $temp;
while( $line =~ /(((.+?)(?=${sep}+|$)){${ngram_length}})/g ){
$temp = $1; $temp =~ s/^${sep}+//; $temp =~ s/${sep}+/${intsep}/g;
$output_ref->{$temp}++;
$howmany++;
}
} else {
$howmany = length($line)-$ngram_length+1;
for(my $from=$howmany;$from-->0;){
$output_ref->{substr $line, $from, $ngram_length}++;
}
}
return $howmany;
}
1;
####
#!/usr/bin/env perl
# FILE: analyse_DNA_sequence.pl
# by bliako
use strict;
use warnings;
use Getopt::Long;
use Data::Dump qw/dump/;
use Markov::Ndimensional;
my $input_fasta_filename = undef;
my $input_state_filename = undef;
my $output_state_filename = undef;
my $output_stats_filename = undef;
my $separator = undef;
my $ngram_length = -1;
if( ! Getopt::Long::GetOptions(
'input-fasta=s' => \$input_fasta_filename,
'input-state=s' => \$input_state_filename,
'output-state=s' => \$output_state_filename,
'output-stats=s' => \$output_stats_filename,
'ngram-length=i' => \$ngram_length,
'separator=s' => \$separator,
'help|h' => sub { print STDERR usage($0); exit(0) }
) ){ print STDERR usage($0) . "\n\nSomething wrong with command-line parameters...\n"; exit(1); }
if( $ngram_length <= 0 ){ print STDERR "$0 : ngram-length must be a positive integer.\n"; exit(1) }
my %params = ();
if( defined($output_state_filename) ){ $params{'need-to-return'} = {'all'=>1} }
else { $params{'avoid-to-return'} = {'counts'=>1} }
my $state = undef;
if( defined($input_state_filename) ){
$state = load_state($input_state_filename);
if( ! defined($state) ){ print STDERR "$0 : call to ".'load_state()'." has failed.\n"; exit(1) }
$params{'counts'} = $state->{'counts'};
}
if( defined($input_fasta_filename) ){
$state = learn({
%params,
'ngram-length' => $ngram_length,
'separator' => $separator,
'input-filename' => $input_fasta_filename,
});
if( ! defined($state) ){ print STDERR "$0 : call to ".'learn()'." has failed.\n"; exit(1) }
}
if( ! defined($state) ){ print STDERR "$0 : --input-state and/or --input-fasta must be specified.\n"; exit(1) }
if( defined($output_state_filename) ){
if( ! save_state($state, $output_state_filename) ){ print STDERR "$0 : call to ".'save_state()'." has failed.\n"; exit(1) }
}
if( defined($output_stats_filename) ){
my $FH;
if( ! open $FH, '>', $output_stats_filename ){ print STDERR "$0 : could not open stats file '$output_stats_filename' for writing, $!\n"; exit(1) }
print $FH Data::Dump::dump($state);
close($FH);
} else {
print Data::Dump::dump($state);
}
print "\n$0 : done.\n";
exit(0);
sub usage {
return "Usage : $0 \n";
}
####
#!/usr/bin/env perl
# FILE: analyse_text.pl
# by bliako
use strict;
use warnings;
use Getopt::Long;
use Data::Dump qw/dump/;
use Markov::Ndimensional;
my $input_corpus_filename = undef;
my $input_state_filename = undef;
my $output_state_filename = undef;
my $output_stats_filename = undef;
my $separator = '\s';
my $internal_separator = '|';
my $ngram_length = -1;
if( ! Getopt::Long::GetOptions(
'input-corpus=s' => \$input_corpus_filename,
'input-state=s' => \$input_state_filename,
'output-state=s' => \$output_state_filename,
'output-stats=s' => \$output_stats_filename,
'ngram-length=i' => \$ngram_length,
'separator=s' => \$separator,
'help|h' => sub { print STDERR usage($0); exit(0) }
) ){ print STDERR usage($0) . "\n\nSomething wrong with command-line parameters...\n"; exit(1); }
if( $ngram_length <= 0 ){ print STDERR "$0 : ngram-length must be a positive integer.\n"; exit(1) }
my %params = ();
if( defined($output_state_filename) ){ $params{'need-to-return'} = {'all'=>1} }
else { $params{'avoid-to-return'} = {'counts'=>1} }
my $state = undef;
if( defined($input_state_filename) ){
$state = load_state($input_state_filename);
if( ! defined($state) ){ print STDERR "$0 : call to ".'load_state()'." has failed.\n"; exit(1) }
$params{'counts'} = $state->{'counts'};
}
if( defined($input_corpus_filename) ){
$state = learn({
%params,
'ngram-length' => $ngram_length,
'separator' => $separator,
'internal-separator' => $internal_separator,
'remove-these-characters' => '[^a-zA-Z]',
'input-filename' => $input_corpus_filename,
});
if( ! defined($state) ){ print STDERR "$0 : call to ".'learn()'." has failed.\n"; exit(1) }
}
if( ! defined($state) ){ print STDERR "$0 : --input-state and/or --input-corpus must be specified.\n"; exit(1) }
if( defined($output_state_filename) ){
if( ! save_state($state, $output_state_filename) ){ print STDERR "$0 : call to ".'save_state()'." has failed.\n"; exit(1) }
}
if( defined($output_stats_filename) ){
my $FH;
if( ! open $FH, '>', $output_stats_filename ){ print STDERR "$0 : could not open stats file '$output_stats_filename' for writing, $!\n"; exit(1) }
print $FH Data::Dump::dump($state);
close($FH);
} else {
print Data::Dump::dump($state);
}
print "\n$0 : done.\n";
exit(0);
sub usage {
return "Usage : $0 \n";
}
####
#!/usr/bin/env perl
# FILE: predict_text.pl
# by bliako
use strict;
use warnings;
use Getopt::Long;
use Data::Dump qw/dump/;
use Markov::Ndimensional;
my $input_state_filename = undef;
my $separator = '\s';
my $internal_separator = '|';
my $seed = undef;
my $num_iterations = 100;
if( ! Getopt::Long::GetOptions(
'input-state=s' => \$input_state_filename,
'separator=s' => \$separator,
'num-iterations=i' => \$num_iterations,
'seed=s' => \$seed,
'help|h' => sub { print STDERR usage($0); exit(0) }
) ){ print STDERR usage($0) . "\n\nSomething wrong with command-line parameters...\n"; exit(1); }
if( ! defined($input_state_filename) ){ print STDERR "$0 : --input-state must be used to specify a state file to read. In order to produce a state use analyse_text.pl\n"; exit(1); }
my $state = load_state($input_state_filename);
if( ! defined($state) ){ print STDERR "$0 : call to ".'load_state()'." has failed.\n"; exit(1) }
my $ngram_length = $state->{'N'};
print "$0 : read state from '$input_state_filename', ngram-length is $ngram_length.\n";
if( defined $seed ){
my @crap = split /${separator}/, $seed;
if( scalar(@crap) != ($ngram_length-1) ){ print STDERR "$0 : the ngram-length from state file '$input_state_filename' is $ngram_length but the provided seed assumes length ".(scalar(@crap)+1)."\n"; exit(1) }
$seed = join $internal_separator, @crap;
} else {
# no seed given: pick an arbitrary (n-1)gram from the learned distribution
$seed = (keys %{$state->{'cum-twisted-dist'}})[0]
}
print "$0 : starting with seed '$seed' ...\n";
my $w = $state->{'cum-twisted-dist'};
(my $printable = $seed) =~ s/\Q${internal_separator}\E/ /g; print $printable; # show the seed with spaces for readability
for(1..$num_iterations){
my $ret = predict($w, $seed);
if( ! defined($ret) ){ last }
print " $ret";
# slide the window: drop the oldest symbol, append the prediction, and
# re-join with the internal separator so the next lookup key matches the state's keys
my @y = split /\Q${internal_separator}\E/, $seed; shift @y;
$seed = join $internal_separator, @y, $ret;
}
print "\n$0 : done.\n";
exit(0);
sub usage {
return "Usage : $0 \n";
}
####
$ wget ftp://ftp.ncbi.nih.gov/genomes/Homo_sapiens/CHR_20/hs_ref_GRCh38.p12_chr20.fa.gz # warning ~100MB
$ gunzip hs_ref_GRCh38.p12_chr20.fa.gz # the analyser expects the uncompressed .fa
# this will build the 3-dim probability distribution of the input DNA seq and serialise it to the state file
$ analyse_DNA_sequence.pl --input-fasta hs_ref_GRCh38.p12_chr20.fa --ngram-length 3 --output-state hs_ref_GRCh38.p12_chr20.fa.3.state --output-stats stats.txt
# now work with some text, e.g. http://www.gutenberg.org/files/84/84-0.txt (easy on the gutenberg servers!!!)
$ analyse_text.pl --input-corpus ShelleyFrankenstein.txt --ngram-length 2 --output-state shelley.state
$ predict_text.pl --input-state shelley.state
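# a hypothetical quick check (assumes Markov/Ndimensional.pm is under ./Markov/):
# predict the base following 'AT' straight from the serialised DNA state
$ perl -I. -MMarkov::Ndimensional -e 'my $s = load_state("hs_ref_GRCh38.p12_chr20.fa.3.state") or die; print predict($s->{"cum-twisted-dist"}, "AT") // "(unseen)", "\n";'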