in reply to Re^5: a random_data() implementation
in thread How to efficently pack a string of 63 characters
Now I found some time.
Here is the testing script, incorporating my random_data() into the original test script that LanX and tybalt89 posted:
use strict; # https://perlmonks.org/?node_id=11136582
use warnings; # originally https://perlmonks.org/?displaytype=displaycode;node_id=11136614;abspart=1;part=1
use feature 'say'; # using bliako's random_data() from https://perlmonks.org/?node_id=11136639
use Data::Dumper; # easily reproduce results or comment it out

srand 1234;

my $factor = 3333;

my $data = <<END;
ABBCBCAAAAABBCBCACCCAAAAACAAAAABBBBBAAAAABBAAAAAAAABBCCCACCAABC
BCCCBCAACAABBBCAAACCAAAAACAAAAABBBBBAAAAABBAAAAAAAABBCCCACCABBC
ABCCBBBAAAABBABCACABCCCCCCAAAAABBCBBCCCCAAAAAAAAAAAAACCCACCACCC
END

my @data = split /\n/, $data;

# push @data, map{rotate($data[-$_])} reverse 1..3 for 1.. $factor; # original code by LanX
#push @data, map{random_data()} reverse 1..3 for 1.. $factor; # code to use the

my $random_data = random_data_bliako(\@data, $factor);
# and calculate and print statistics on the data just produced
my ($xdist1, $xdist2) = CumProbDists($random_data);

$data = join "", @data, @$random_data; # remove \n they can be re-inserted later

for my $try (
  [ 'gzip/gunzip', \&compgzip, \&uncompgzip ],
  [ '2 bit code, 6 bit runlength', \&comp62, \&uncomp62 ],
  [ '2 bits per letter', \&comp2bits, \&uncomp2bits ],
  [ 'groups of 5,2,1', \&comp5, \&uncomp5 ],
  )
{
  my ($method, $comp, $uncomp) = @$try;
  print "\n------------------------------ Compression by $method\n\n";
  print "length of data @{[ $data =~ tr/ABC// ]}\n";
  my $compressed = $comp->($data);
  print "length of compressed data @{[ length $compressed ]}\n";
  #use Data::Dump 'dd'; dd $compressed;
  # print unpack('H*', $compressed), "\n";
  my $uncompressed = $uncomp->($compressed);
  printf "compressed to %.1f%%\n",
    100 * length($compressed) / length $uncompressed;
  print $data eq $uncompressed ? "MATCH" : "************ no MATCH", "\n";
}

# ---- by lanx
sub rotate { # fake data from original sample
  my $dbg = 0;
  # say "--- O: ",
  my $orig = shift;
  # say
  my $rnd = int rand 63;
  my $head = reverse substr $orig, 0, $rnd, "";
  # say "N: ", $orig . "|" . $head;
  my $new = $orig . $head;
  return $new;
}

sub random_data {
  #say
  my $new = join "", map { chr int(rand 3) + ord "A" } 1..63;
  return $new;
}

# compress by groups of 5,2,1 to single letter
sub comp5 {
  my @code = map glob('{A,B,C}' x $_), 5, 2, 1;
  my %code;
  @code{@code} = map chr, 1 .. @code;
  local $" = '|';
  shift =~ s/(@code)/$code{$1}/gr
}

sub uncomp5 {
  my @code = map glob('{A,B,C}' x $_), 5, 2, 1;
  my %code;
  @code{map chr, 1 .. @code} = @code;
  join '', @code{split //, shift};
}

# compress by lower two bits of letter
sub comp2bits {
  my ($ans, $n) = ('', 0);
  vec($ans, $n++, 2) = 3 & ord $_ for split //, shift;
  $ans;
}

sub uncomp2bits {
  my $comp = shift;
  join '', map { ('', 'A', 'B', 'C')[ vec $comp, $_, 2 ] } 0 .. -1 + 4 * length $comp;
}

# compress by runlength or 6 bits length and 2 bits letter code
sub comp62 {
  shift =~ s/([ABC])\1{0,62}/ chr( length($&) << 2 | ord($1) & 3) /ger;
}

sub uncomp62 {
  shift =~ s/./ (($& & "\3") | '@') x (ord($&) >> 2) /gesr;
}

# compress by gzip
use IO::Compress::Gzip qw(gzip);
sub compgzip { gzip \(shift) => \(my $output); $output; }

use IO::Uncompress::Gunzip qw(gunzip);
sub uncompgzip { gunzip \(shift) => \(my $output); $output; }

sub random_data_bliako {
  my $actual_data = shift;
  my $lines = shift;

  # this calculates the mono-gram and di-gram
  # prob.dist of the actual data.
  # mono-gram P(A), di-gram P(A|B)
  my ($dist1, $dist2) = CumProbDists($actual_data);
  my $width = length $actual_data->[0];

  my @results;
  while( $lines-- ){
    my @line;
    my $letter = which_letter($dist1);
    push @line, $letter;
    for (2..$width){
      $letter = which_letter($dist2->{$letter});
      push @line, $letter;
    }
    push @results, join('', @line);
  }
  return \@results;
}
#### end main ####

# given some data in the form of an arrayref of strings, it will
# calculate cumulative probability distribution (1st frequency, then p.d.
# and then cumulative p.d.)
sub CumProbDists {
  # make a copy because it destructs $data
  my $data = [ @{$_[0]} ];

  # the results:
  my %dist1; # cpd for each letter A, B, C
  my %dist2; # cpd for each digram, e.g. A->A, A->B, C->A etc.

  for my $aline (@$data){
    ###################################
    # I hope this somewhat obscene regex
    # does not violate any CoCs
    ###################################
    while( $aline =~ s/^(.)(.)/$2/g ){
      $dist1{$1}++;
      $dist2{$1}->{$2}++;
    }
  }
  print "Frequencies:\n";
  print Dumper(\%dist1);
  print Dumper(\%dist2);

  # convert to prob.dist.
  my $sum = 0;
  $sum += $_ for values %dist1;
  $_ /= $sum for values %dist1;
  for my $v1 (keys %dist1){
    $sum = 0;
    $sum += $_ for values %{$dist2{$v1}};
    $_ /= $sum for values %{$dist2{$v1}};
  }
  print "Probability Distribution:\n";
  print Dumper(\%dist1);
  print Dumper(\%dist2);

  # convert to cumulative prob.dist.
  $sum = 0;
  for (sort keys %dist1){
    $dist1{$_} += $sum;
    $sum = $dist1{$_};
  }
  for my $v1 (keys %dist1){
    $sum = 0;
    for (sort keys %{$dist2{$v1}}){
      $dist2{$v1}->{$_} += $sum;
      $sum = $dist2{$v1}->{$_};
    }
  }
  print "Cumulative Probability distribution:\n";
  print Dumper(\%dist1);
  print Dumper(\%dist2);

  return (\%dist1, \%dist2)
}

# given a cum-prob-dist (as a hashref where key is the letter to choose,
# and value is the cum-prob-dist)
# it will return a letter randomly but satisfying the distribution
sub which_letter {
  my $dist = shift;
  my $rand = rand;
  for (sort keys %$dist){
    if( $rand <= $dist->{$_} ){ return $_ }
  }
}
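As an aside, the digram counting inside CumProbDists() could also be written without the string-eating regex. The following is only a sketch of an equivalent, non-destructive version (my own rewrite, not part of the script above); it counts the same things: every letter except the last one, and every adjacent pair.

use strict;
use warnings;
use Data::Dumper;

# count monograms and digrams of a line without consuming it
sub count_ngrams {
    my ($line, $dist1, $dist2) = @_;
    for my $i ( 0 .. length($line) - 2 ) {
        my $a = substr $line, $i,     1;   # current letter
        my $b = substr $line, $i + 1, 1;   # letter following it
        $dist1->{$a}++;
        $dist2->{$a}{$b}++;
    }
}

my (%dist1, %dist2);
count_ngrams($_, \%dist1, \%dist2) for 'ABBCBCA', 'BCCCBCA';
print Dumper(\%dist1, \%dist2);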
Here are the statistical distributions of the 3-line data provided initially:
Frequencies:
$VAR1 = {
          'B' => 44,
          'A' => 93,
          'C' => 49
        };
$VAR1 = {
          'C' => {
                   'C' => 25,
                   'B' => 5,
                   'A' => 19
                 },
          'A' => {
                   'C' => 11,
                   'A' => 66,
                   'B' => 16
                 },
          'B' => {
                   'C' => 16,
                   'A' => 6,
                   'B' => 22
                 }
        };
Probability Distribution:
$VAR1 = {
          'B' => '0.236559139784946',
          'A' => '0.5',
          'C' => '0.263440860215054'
        };
$VAR1 = {
          'C' => {
                   'C' => '0.510204081632653',
                   'B' => '0.102040816326531',
                   'A' => '0.387755102040816'
                 },
          'A' => {
                   'C' => '0.118279569892473',
                   'A' => '0.709677419354839',
                   'B' => '0.172043010752688'
                 },
          'B' => {
                   'C' => '0.363636363636364',
                   'A' => '0.136363636363636',
                   'B' => '0.5'
                 }
        };
Cumulative Probability distribution:
$VAR1 = {
          'B' => '0.736559139784946',
          'A' => '0.5',
          'C' => '1'
        };
$VAR1 = {
          'C' => {
                   'C' => '1',
                   'B' => '0.489795918367347',
                   'A' => '0.387755102040816'
                 },
          'A' => {
                   'C' => '1',
                   'A' => '0.709677419354839',
                   'B' => '0.881720430107527'
                 },
          'B' => {
                   'C' => '1',
                   'A' => '0.136363636363636',
                   'B' => '0.636363636363636'
                 }
        };
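To make the sampling step concrete: which_letter() is a plain inverse-CDF lookup against these cumulative values. Here is a minimal, self-contained sketch of that lookup (the letter names and numbers are copied from the monogram dump above; the helper name pick() is mine, not from the script):

use strict;
use warnings;

# cumulative monogram distribution from the dump above
my %cum = ( A => 0.5, B => 0.736559139784946, C => 1 );

# return the first letter (in sorted key order) whose cumulative
# probability is >= a uniform random draw in [0,1)
sub pick {
    my $r = rand;
    for my $letter ( sort keys %cum ) {
        return $letter if $r <= $cum{$letter};
    }
}

# drawing many samples should roughly reproduce the original
# frequencies: ~50% A, ~24% B, ~26% C
my %count;
$count{ pick() }++ for 1 .. 100_000;
printf "%s: %.3f\n", $_, $count{$_} / 100_000 for sort keys %count;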
And here are the compression comparisons:
------------------------------ Compression by gzip/gunzip

length of data 210168
length of compressed data 45076
compressed to 21.4%
MATCH

------------------------------ Compression by 2 bit code, 6 bit runlength

length of data 210168
length of compressed data 83690
compressed to 39.8%
MATCH

------------------------------ Compression by 2 bits per letter

length of data 210168
length of compressed data 52542
compressed to 25.0%
MATCH

------------------------------ Compression by groups of 5,2,1

length of data 210168
length of compressed data 42035
compressed to 20.0%
MATCH
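For what it's worth, a back-of-the-envelope check of why the fixed-width schemes land where they do (my own arithmetic, not output of the script): comp5() has 3^5 + 3^2 + 3 = 255 group codes, so every 5 letters fit in one byte, hence ~20%; 2 bits per letter is 2/8 = 25%; and log2(3)/8 is about 19.8%, the floor for independent, uniformly distributed letters. Going much below that would require exploiting the skew toward 'A' and the digram correlations in this data.

use strict;
use warnings;

printf "groups of 5,2,1  : %d codes (fit in one byte), ratio %.1f%%\n",
    3**5 + 3**2 + 3, 100 / 5;
printf "2 bits per letter: ratio %.1f%%\n", 100 * 2 / 8;
printf "entropy floor    : %.1f%% for uniform independent {A,B,C}\n",
    100 * log(3) / log(2) / 8;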
bw, bliako