comment on

here is a random_data() which simulates the statistical properties of the 3 lines of data baxy77bax provided:

5' EDIT on the script below, nothing important

# by bliako on 10-09-2021
# for https://perlmonks.org/?node_id=11136582
# and specifically https://perlmonks.org/?node_id=11136632
use strict;
use warnings;

use Data::Dumper;

# easily reproduce results or comment it out
srand 1234;

# how many lines of new data to produce
my $lines_to_produce = 1000;

# this is the actual data we want to discover
# and simulate its statistical properties:
my @actual_data = qw/
    ABBCBCAAAAABBCBCACCCAAAAACAAAAABBBBBAAAAABBAAAAAAAABBCCCACCAABC
    BCCCBCAACAABBBCAAACCAAAAACAAAAABBBBBAAAAABBAAAAAAAABBCCCACCABBC
    ABCCBBBAAAABBABCACABCCCCCCAAAAABBCBBCCCCAAAAAAAAAAAAACCCACCACCC
/;

my $random_data = random_data(\@actual_data, $lines_to_produce);

# and calculate and print statistics on the data just produced
my ($xdist1, $xdist2) = CumProbDists($random_data);

## end

sub random_data {
    my $actual_data = shift;
        my $lines = shift;

    # this calculates the mono-gram and di-gram
    # prob.dist of the actual data.
    # mono-gram P(A), di-gram P(A|B)
    my ($dist1, $dist2) = CumProbDists($actual_data);

    my $width = length $actual_data->[0];
    my @results;
    while( $lines-- ){
        my @line;
        my $letter = which_letter($dist1);
        push @line, $letter;
        for (2..$width){
            $letter = which_letter($dist2->{$letter});
            push @line, $letter;
        }
        push @results, join('', @line);
    }
    return \@results;
}
#### end main ####

# given some data in the form of an arrayref of strings, it will
# calculate cumulative probability distribution (1st frequency, then p
+.d.
# and then cumulative p.d.)
sub CumProbDists {
    # make a copy because it destructs $data
    my $data = [ @{$_[0]} ];
    # the results:
    my %dist1; # cpd for each letter A, B, C
    my %dist2; # cpd for each digram, e.g. A->A, A->B, C->A etc.
    for my $aline (@$data){
        ###################################
        # I hope this somewhat obsene regex
        # does not violate any CoCs
        ###################################
            while( $aline =~ s/^(.)(.)/$2/g ){
                    $dist1{$1}++;
                    $dist2{$1}->{$2}++;
            }
    }
    print "Frequencies:\n";
    print Dumper(\%dist1);
    print Dumper(\%dist2);

    # convert to prob.dist.
    my $sum = 0;
    $sum += $_ for values %dist1;
    $_ /= $sum for values %dist1;
    for my $v1 (keys %dist1){
        $sum = 0;
        $sum += $_ for values %{$dist2{$v1}};
        $_ /= $sum for values %{$dist2{$v1}};
    }
    print "Probability Distribution:\n";
    print Dumper(\%dist1);
    print Dumper(\%dist2);

    # convert to cumulative prob.dist.
    $sum = 0;
    for (sort keys %dist1){
        $dist1{$_} += $sum;
        $sum = $dist1{$_};
    }
    for my $v1 (keys %dist1){
        $sum = 0;
        for (sort keys %{$dist2{$v1}}){
            $dist2{$v1}->{$_} += $sum;
            $sum = $dist2{$v1}->{$_};
        }
    }
    print "Cumulative Probability distribution:\n";
    print Dumper(\%dist1);
    print Dumper(\%dist2);

    return (\%dist1, \%dist2)
}
# given a cum-prob-dist (as a hashref where key is the letter to choos
+e,
# and value is the cum-prob-dist)
# it will return a letter randomly but satisfying the distribution
sub which_letter {
    my $dist = shift;
    my $rand = rand;
    for (sort keys %$dist){
        if( $rand <= $dist->{$_} ){ return $_ }
    }
}
[download]

bw, bliako

In reply to Re^4: a random_data() implementation by bliako
in thread How to efficently pack a string of 63 characters by baxy77bax

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.