use strict; # https://perlmonks.org/?node_id=11136582 use warnings; # originally https://perlmonks.org/?displaytype=displaycode;node_id=11136614;abspart=1;part=1 use feature 'say'; # using bliako's random_data() from https://perlmonks.org/?node_id=11136639 use Data::Dumper; # easily reproduce results or comment it out srand 1234; my $factor = 3333; my $data = <($data); print "length of compressed data @{[ length $compressed ]}\n"; #use Data::Dump 'dd'; dd $compressed; # print unpack('H*', $compressed), "\n"; my $uncompressed = $uncomp->($compressed); printf "compressed to %.1f%%\n", 100 * length($compressed) / length $uncompressed; print $data eq $uncompressed ? "MATCH" : "************ no MATCH", "\n"; } # ---- by lanx sub rotate { # fake data from original sample my $dbg =0; # say "--- O: ", my $orig =shift; # say my $rnd = int rand 63; my $head = reverse substr $orig,0,$rnd,""; # say "N: ",$orig."|".$head; my $new = $orig.$head; return $new; } sub random_data { #say my $new = join "", map{ chr int(rand 3) + ord "A" } 1..63; return $new; } # compress by groups of 5,2,1 to single letter sub comp5 { my @code = map glob('{A,B,C}' x $_), 5, 2, 1; my %code; @code{@code} = map chr, 1 .. @code; local $" = '|'; shift =~ s/(@code)/$code{$1}/gr } sub uncomp5 { my @code = map glob('{A,B,C}' x $_), 5, 2, 1; my %code; @code{map chr, 1 .. @code} = @code; join '', @code{split //, shift}; } # compress by lower two bits of letter sub comp2bits { my ($ans, $n) = ('', 0); vec($ans, $n++, 2) = 3 & ord $_ for split //, shift; $ans; } sub uncomp2bits { my $comp = shift; join '', map { ('', 'A', 'B', 'C')[ vec $comp, $_, 2] } 0 .. -1 + 4 * length $comp; } # compress by runlength or 6 bits length and 2 bits letter code sub comp62 { shift =~ s/([ABC])\1{0,62}/ chr( length($&) << 2 | ord($1) & 3) /ger; } sub uncomp62 { shift =~ s/./ (($& & "\3") | '@') x (ord($&) >> 2) /gesr; } # compress by gzip use IO::Compress::Gzip qw(gzip); sub compgzip { gzip \(shift) => \(my $output); $output; } use IO::Uncompress::Gunzip qw(gunzip); sub uncompgzip { gunzip \(shift) => \(my $output); $output; } sub random_data_bliako { my $actual_data = shift; my $lines = shift; # this calculates the mono-gram and di-gram # prob.dist of the actual data. # mono-gram P(A), di-gram P(A|B) my ($dist1, $dist2) = CumProbDists($actual_data); my $width = length $actual_data->[0]; my @results; while( $lines-- ){ my @line; my $letter = which_letter($dist1); push @line, $letter; for (2..$width){ $letter = which_letter($dist2->{$letter}); push @line, $letter; } push @results, join('', @line); } return \@results; } #### end main #### # given some data in the form of an arrayref of strings, it will # calculate cumulative probability distribution (1st frequency, then p.d. # and then cumulative p.d.) sub CumProbDists { # make a copy because it destructs $data my $data = [ @{$_[0]} ]; # the results: my %dist1; # cpd for each letter A, B, C my %dist2; # cpd for each digram, e.g. A->A, A->B, C->A etc. for my $aline (@$data){ ################################### # I hope this somewhat obsene regex # does not violate any CoCs ################################### while( $aline =~ s/^(.)(.)/$2/g ){ $dist1{$1}++; $dist2{$1}->{$2}++; } } print "Frequencies:\n"; print Dumper(\%dist1); print Dumper(\%dist2); # convert to prob.dist. my $sum = 0; $sum += $_ for values %dist1; $_ /= $sum for values %dist1; for my $v1 (keys %dist1){ $sum = 0; $sum += $_ for values %{$dist2{$v1}}; $_ /= $sum for values %{$dist2{$v1}}; } print "Probability Distribution:\n"; print Dumper(\%dist1); print Dumper(\%dist2); # convert to cumulative prob.dist. $sum = 0; for (sort keys %dist1){ $dist1{$_} += $sum; $sum = $dist1{$_}; } for my $v1 (keys %dist1){ $sum = 0; for (sort keys %{$dist2{$v1}}){ $dist2{$v1}->{$_} += $sum; $sum = $dist2{$v1}->{$_}; } } print "Cumulative Probability distribution:\n"; print Dumper(\%dist1); print Dumper(\%dist2); return (\%dist1, \%dist2) } # given a cum-prob-dist (as a hashref where key is the letter to choose, # and value is the cum-prob-dist) # it will return a letter randomly but satisfying the distribution sub which_letter { my $dist = shift; my $rand = rand; for (sort keys %$dist){ if( $rand <= $dist->{$_} ){ return $_ } } }