package Markov::Ndimensional;

# by bliako
# for https://perlmonks.org/?node_id=1228191
# 20/01/2019

use strict;
use warnings;

our $VERSION = 0.1;

use Exporter qw/import/;
our @EXPORT = qw/learn predict range save_state load_state/;

use Storable;
use Data::Dump;

# * reads in some data (from file or string),
# * splits it into symbols according to a separator,
#   or works character-by-character if the separator is left undef,
# * so eventually it has a stream of symbols.
# These symbols can be words in text (if separator = word boundary = '\b')
# or they can be letters in text (if the separator is left undefined)
# (in this case you must arrange that the input has each word on a line of its own).
# Symbols are then put together to form "ngrams" and the frequency of
# each ngram is counted.
# Input parameter 'ngram-length' refers to the number of symbols in an ngram.
# For example, with ngram-length=2 and a word-boundary separator
# we get
#   'hello','there' => 5
#   'hello','you'   => 10
#   'goodbye','all' => 5
#   etc.
# The numbers are the counts for each ngram as read from the input data.
# Input parameter 'counts' can be used to supply such frequency data
# without re-reading a huge corpus each time. The param must be a hashref
# where key=ngram (its symbols separated by anything you like, no quotes necessary,
# but do use some separator in order not to confuse composite words).
# With that frequency data, we proceed to calculate a probability
# distribution ('dist') which is just the normalised frequencies.
# Then we build the cumulative probability distribution but with a twist:
# the ngrams are broken into (n-1)grams and 1grams (from left to right for latin scripts).
# The (n-1)gram is the key. For all possible endings of that key (i.e.
# all the 1grams with which it ends) we count frequencies, normalise to
# probabilities and then build the cumulative probability.
# For example, consider the following 'counts'
# (let's assume we are dealing with letters, ngram-length=3):
#   HEL => 10
#   HEA => 20
#   HEN => 5
#   ALA => 10
#   ALB => 20
# The cumulative-probability-distribution will be something like this
# (stored as a flat arrayref of symbol => cumulative-probability pairs):
#   HE => [
#     L => 10/(10+20+5)
#     A => 10/(10+20+5) + 20/(10+20+5)
#     N => 10/(10+20+5) + 20/(10+20+5) + 5/(10+20+5)
#   ]
#   AL => [ ... ] # left as an exercise for the reader
# The sub returns a hashref made of the following entries:
#   'counts' => frequency(=counts) of each ngram as it occurred in the input data
#               (omitted if 'avoid-to-return' says so)
#   'dist' => probability distribution (i.e. normalised frequency/counts) of the above
#   'twisted-dist' => as explained before (only included if asked for via 'need-to-return')
#   'cum-twisted-dist' => as explained before.
#   'N' => the ngram-length used.
# Refer to the 'predict()' sub for how the 'cum-twisted-dist' hash is used to make a prediction.
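# Example usage (a minimal sketch which re-uses the 'counts' shown above; a real
# application would supply its own data, e.g. via 'input-filename' or 'input-string'):
#
#   use Markov::Ndimensional;
#   my $state = learn({
#       'ngram-length' => 3,
#       'counts'       => { 'HEL'=>10, 'HEA'=>20, 'HEN'=>5, 'ALA'=>10, 'ALB'=>20 },
#   }) or die "learn() failed";
#   # $state->{'dist'} and $state->{'cum-twisted-dist'} are now ready for predict()/range().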
sub learn {
    my $params = $_[0] // {};
    my $parent = ( caller(1) )[3] || "N/A";
    my $whoami = ( caller(0) )[3];

    my $ngram_length = $params->{'ngram-length'} // 1;

    # optional separator, can be left undef in which case we work letter-by-letter;
    # if defined it has the form of a regex, e.g. '.' or '\b'
    my $separator = $params->{'separator'};
    my $internal_separator = defined $separator
        ? $params->{'internal-separator'} // '\|/'
        : '';

    # optional debug parameter
    my $DEBUG = $params->{'debug'} // 0;

    # optionally specify here 'counts' or 'twisted-dist'
    # in order to return them
    my $need = $params->{'need-to-return'} // {};
    # optionally specify here what not to return
    my $avoid = $params->{'avoid-to-return'} // {};

    # counts the occurrence of each combination of symbols of length 'ngram_length';
    # if specified in params, data will be appended and the distribution
    # re-calculated with old and new data:
    my $counts = undef;

    my $remove_these_characters_from_input = $params->{'remove-these-characters'};

    # TODO: separator for symbols
    if( defined($counts=$params->{'counts'}) ){
        my ($anykey) = keys %$counts;
        # NOTE: this sanity check assumes single-character symbols (no separator)
        my $le = length($anykey);
        if( $le != $ngram_length ){
            print STDERR "$whoami (via $parent) : input counts data was for ngram-length of $le and not $ngram_length which you requested.\n";
            return undef
        }
    } else { $counts = {} } # <<< fresh counts

    my ($infile, $instring, $howmany, $m);
    my $remove_chars_regex = defined($remove_these_characters_from_input)
        ? qr/${remove_these_characters_from_input}/
        : undef;

    if( defined($infile=$params->{'input-filename'}) ){
        $howmany = 0;
        $counts = {};
        my $FH;
        if( ! open $FH, '<', $infile ){
            print STDERR "$whoami (via $parent) : could not open file '$infile' for reading, $!\n";
            return undef
        }
        while( my $line = <$FH> ){
            if( defined $remove_chars_regex ){ $line =~ s/${remove_chars_regex}/ /g; }
            $howmany += _process_line(\$line, $counts, $ngram_length, $separator, $internal_separator)
        }
        close($FH);
        if( $DEBUG ){ print "$whoami (via $parent) : file read OK '$infile': $howmany items read.\n" }
    } elsif( defined($instring=$params->{'input-string'}) ){
        $howmany = 0;
        $counts = {};
        foreach my $line (split /^/, $instring){
            $howmany += _process_line(\$line, $counts, $ngram_length, $separator, $internal_separator)
        }
        if( $DEBUG ){ print "$whoami (via $parent) : string read OK: $howmany items read.\n" }
    } elsif( defined($m=$params->{'input-array-ngrams'}) ){
        $counts = {};
        $howmany = scalar(@$m) - $ngram_length + 1;
        my ($ngram, $i);
        for($i=0;$i<$howmany;$i++){
            # take a sliding window of exactly ngram_length symbols
            $ngram = join($internal_separator, @{$m}[$i..($i+$ngram_length-1)]);
            $counts->{$ngram}++;
        }
        if( $DEBUG ){ print "$whoami (via $parent) : array-ngrams read OK: $howmany items read.\n" }
    } elsif( defined($m=$params->{'input-array-symbols'}) ){
        $counts = {};
        $howmany = scalar(@$m);
        my $i;
        for($i=0;$i<$howmany;$i++){
            $counts->{$m->[$i]}++;
        }
        if( $DEBUG ){ print "$whoami (via $parent) : array-symbols read OK: $howmany items read.\n" }
    } elsif( ! defined($counts=$params->{'counts'}) ){
        print STDERR "$whoami (via $parent) : either 'input-filename', 'input-string', 'input-array-symbols', 'input-array-ngrams' or 'counts' (which is a hashref of symbol counts) must be specified.\n";
        return undef
    }

    # create normalised counts = probability distribution (into %dist)
    my %dist = %$counts;
    my $sum = 0;
    $sum += $_ for values %dist;
    $dist{$_} /= $sum for keys %dist;
    if( $DEBUG ){ print "$whoami (via $parent) : normalised counts OK:\n".Data::Dump::dump(%dist)."\n" }

    my ($asy, $lasy, $c, $i, $l, $arr);

    # make a cumulative distribution but break each ngram into 2 parts:
    # the last part is the last symbol and the first part is the rest.
    # We do a lookup by using the first part so that we have all the available
    # choices for what may follow that first part as a cum-prob-dist.
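    # As a worked example (a sketch re-using the HEL/HEA/HEN counts from the comment
    # at the top; the order of the pairs depends on Perl's hash ordering):
    #   after the loop just below    : 'HE' => [ 'L',10,    'A',20,    'N',5    ]
    #   after the normalisation loop : 'HE' => [ 'L',10/35, 'A',20/35, 'N',5/35 ]
    #   after the cumulative pass    : 'HE' => [ 'L',0.286, 'A',0.857, 'N',1.0  ]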
    my %twisted_dist = ();
    foreach $asy (keys %$counts){
        $c = $counts->{$asy};
        # the internal_separator must be escaped (\Q) because it may contain regex metacharacters
        my @lasys = split /\Q${internal_separator}/, $asy;
        $lasy = pop @lasys;
        $asy = join $internal_separator, @lasys;
        # if ngram-length == 1 then the (n-1)gram key is the empty string
        if( $asy eq '' ){ $asy = '' }
        if( ! exists $twisted_dist{$asy} ){ $twisted_dist{$asy} = [] }
        push @{$twisted_dist{$asy}}, ($lasy, $c);
    }
    # normalise it, remember: it's an array of 'symbol','count','symbol','count', ...
    foreach $asy (keys %twisted_dist){
        $arr = $twisted_dist{$asy};
        $l = scalar(@$arr);
        $sum = 0;
        for($i=1;$i<$l;$i+=2){ $sum += $arr->[$i] }
        for($i=1;$i<$l;$i+=2){ $arr->[$i] /= $sum }
    }
    if( $DEBUG ){ print "$whoami (via $parent) : made twisted distribution OK:\n".Data::Dump::dump(\%twisted_dist)."\n" }

    my $cum_twisted_dist = undef;
    if( $need->{'all'} or $need->{'twisted-dist'} ){
        # the caller also wants the plain twisted dist, so work on a copy
        $cum_twisted_dist = {};
        foreach $asy (keys %twisted_dist){ $cum_twisted_dist->{$asy} = [@{$twisted_dist{$asy}}] }
    } else { $cum_twisted_dist = \%twisted_dist }

    # do the cumulative thing:
    foreach $asy (keys %$cum_twisted_dist){
        $arr = $cum_twisted_dist->{$asy};
        $l = scalar(@$arr);
        for($i=3;$i<$l;$i+=2){ $arr->[$i] += $arr->[$i-2] }
        # force the last item to 1 because rounding errors might leave it at .9999
        $arr->[-1] = 1.0;
    }
    if( $DEBUG ){ print "$whoami (via $parent) : made cumulative twisted distribution OK:\n".Data::Dump::dump($cum_twisted_dist)."\n" }

    my %ret = (
        'cum-twisted-dist' => $cum_twisted_dist,
        'dist' => \%dist,
        'N' => $ngram_length
    );
    if( $need->{'all'} or $need->{'twisted-dist'} ){ $ret{'twisted-dist'} = \%twisted_dist }
    if( ! $avoid->{'counts'} ){ $ret{'counts'} = $counts }
    return \%ret
}

# return the next symbol in sequence, weighted on probabilities
# and given the previous symbols (inp).
# 'dat' must be built using learn(); it is the 'cum-twisted-dist' entry (in the returned hash).
# Returns undef on failure.
#
# The input is a 'cum-twisted-dist' hashref (as described in 'learn()').
# Assuming that we processed our data with ngram-length=3,
# this function will take as input the first 2 symbols
# and will predict the 3rd based on the probability distribution
# built from the input data during the 'learn()' stage.
# For example, if we built, with ngram-length=3, the distribution of
# triplets of bases of the human genome (A,T,C,G), then
# during the 'learn()' stage we would have built the following data:
#   'counts', e.g. ATC => 1000
#   'dist', e.g. ATC => 1000/(total number of triplets)
#     << which is the normalised frequency
#   'cum-twisted-dist', with a twist, e.g.
#     AT => [ C=>0.2, G=>0.3, A=>0.8, T=>1.0 ]
#     << a flat arrayref of (symbol, cumulative probability) pairs, the last one always 1.0
# This last 'cum-twisted-dist' is our 'dat' input parameter.
# Then we can ask the system to predict the third base
# given the first two bases, 'AT'.
# The prediction will be random but weighted on the cum-twisted-dist.
# In our example, predict(AT) will yield an A most of the time, followed by
# equally-probable C and T and then G (1 in 10).
sub predict {
    my $dat = $_[0]; # the hashref with the cum-prob distribution ('cum-twisted-dist')
    my $inp = $_[1]; # a string of the previous (n-1) symbols joined by the internal separator, e.g. 'AT' or 'I\|/want'
    my $c = $dat->{$inp};
    if( ! defined $c ){ return undef }
    my $l = scalar(@$c);
    my $r = rand;
    my $i;
    for($i=1;$i<$l;$i+=2){
        if( $r < $c->[$i] ){ return $c->[$i-1] }
    }
    return undef
}
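# Example usage of predict() (a minimal sketch; it assumes $state holds the result of
# learn() run with ngram-length=3 over letter data, as in the genome example above,
# and the seed 'AT' is made up):
#
#   my $dat = $state->{'cum-twisted-dist'};
#   my $sequence = 'AT'; # a seed of ngram-length-1 symbols which must exist in the data
#   for (1..100){
#       my $next = predict($dat, substr($sequence, -2));
#       last unless defined $next; # dead end: this 2-symbol key was never seen
#       $sequence .= $next;
#   }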
sub save_state {
    my $dat = $_[0];
    my $outfile = $_[1];
    if( ! Storable::store($dat, $outfile) ){
        print STDERR 'save_state()'.": call to ".'Storable::store()'." has failed for output file '$outfile'.\n";
        return 0
    }
    return 1
}
sub load_state {
    my $infile = $_[0];
    my $ret = Storable::retrieve($infile);
    if( ! defined($ret) ){
        print STDERR 'load_state()'.": call to ".'Storable::retrieve()'." has failed for input file '$infile'.\n";
        return 0
    }
    return $ret
}

# Same lookup as 'predict()' but it does not pick a symbol.
# It returns all the possible options the (n-1)gram input has,
# as an arrayref of (symbol, cumulative probability) pairs.
# Will return undef if the input has never been seen and does
# not exist in 'dat' (which is a 'cum-twisted-dist', see 'learn()').
sub range {
    #my $dat = $_[0]; # the hashref with the cum-prob distribution ('cum-twisted-dist')
    #my $inp = $_[1]; # must be a string of the previous (n-1) symbols, as in predict()
    #return $dat->{$inp};
    return $_[0]->{$_[1]}
}

# private subs

# Processes a line of text which consists of words separated by an optional separator.
# By specifying a separator we can process sentences where words are our basic symbols; in this
# way we can have arbitrary-length symbols.
# On the other hand, the separator can be left undef. In this case we treat the line character-by-character
# and each ngram consists of exactly ngram_length letters.
# It returns the number of ngrams found in this line (>=0).
# It assumes that a line contains only complete words; no half word can end the line.
sub _process_line {
    my $line = ${$_[0]}; # expecting a scalar ref to the line to process
    my $output_ref = $_[1];
    my $ngram_length = $_[2];
    my $sep = $_[3]; # optional, can be left undefined for no separator and work letter-by-letter
    my $intsep = $_[4]; # optional and only if sep is defined, it sets the internal separator between symbols

    $line =~ s/>.*$//;  # discard anything from a '>' to the end of the line
    $line =~ s/\s+/ /g;
    $line =~ s/\s+$//;
    return 0 if $line =~ /^\s*$/; # nothing to do

    my ($howmany);
    if( defined $sep ){
        # we have a separator, so we need to regex
        $howmany = 0;
        my $temp;
        while( $line =~ /(((.+?)(?=${sep}+|$)){${ngram_length}})/g ){
            $temp = $1;
            $temp =~ s/^${sep}+//;
            $temp =~ s/${sep}+/${intsep}/g;
            $output_ref->{$temp}++;
            $howmany++;
        }
    } else {
        # no separator: every substring of ngram_length characters is an ngram
        $howmany = length($line)-$ngram_length+1;
        for(my $from=$howmany;$from-->0;){
            $output_ref->{substr $line, $from, $ngram_length}++;
        }
    }
    return $howmany;
}

1;
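# Example of saving/restoring a learnt state and inspecting the available options
# with range() (a minimal sketch; the filename 'markov.state' and the key 'AT' are made up):
#
#   save_state($state, 'markov.state') or die "save_state() failed";
#   # ... later, possibly in another script ...
#   my $state2 = load_state('markov.state') or die "load_state() failed";
#   # all (symbol, cumulative-probability) pairs that may follow 'AT', or undef if never seen:
#   my $options = range($state2->{'cum-twisted-dist'}, 'AT');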