package LZW; use strict; use warnings; use Data::Dumper; use constant MAGIC_1=>0x2f; # magic number 1 (Set to 0x1f when compress compatible) use constant MAGIC_2=>0x9d; # compress magic number 2 use constant MAGIC=>pack 'C*',MAGIC_1,MAGIC_2; use constant BIT_MASK=>0x1f; # bit_mask for third byte to get # the max bits in the file. use constant BLOCK_MODE=>0x80; # bit_mask for whether we are # compress 2.0 compat (we arent) use constant WIPE=>256; # code to cause a string table dump use constant INIT_BITS=>9; # number of bits for the first emitted code use constant MAX_BITS=>12; # maximum bits to represent. this means # we can have 2**MAX_BITS codes and corresponding # hash table entries. use constant CODE_EOF=>2**MAX_BITS+1; # we use this to signal EOF. $|++; our $DEBUG=0; # used for debugging sub qquote { my $val=shift; return 'undef' unless defined $val; return qq("") unless length $val; my @parts=$val=~m/\G(.{1,80})/sg; #warn scalar @parts,"\n"; return (@parts>1?"\n":"").join(".\n", map { Data::Dumper::qquote($_) } @parts); } BEGIN { $Data::Dumper::Useqq=1; my @chars=map { chr ($_) } 0..255; sub _init { my $self=shift; print STDERR "_init()\n" if $DEBUG; my %default=( # index is the codes id, value is the string codes => [ @chars,'' ], # lookup from string to id strs => { map { $_ => ord($_) } @chars[0..255] }, # number of bits to output per code code_bits => INIT_BITS, # (2**code_bits) When the number of codes equals # this we add a bit to the code length threshold => (2**INIT_BITS), ); $self->{$_}=$default{$_} foreach keys %default; return $self; } sub init { my $self=shift; %$self=( # (2**n-1) Size of buffer holding chars representing bits. buf_max => 1023, # user supplied attributes go above here %$self, # per object initializers go below here # string representing the bitstream of the compressed file bit_buf => "", in => undef, # in filehandle out => undef, # out filehandle rbits =>0, # bits read wbits =>0, # bits written ); $self->_init(); } } sub new { my $class=shift; my $self=bless {@_},$class; return $self->init() } sub _open_files { my ($self,$infile,$outfile)=@_; if (-e $outfile) { if ($self->{overwrite}) { warn "Overwriting '$outfile'\n"; } else { Carp::croak("Won't overwrite '$outfile' unless". " overwrite option set\n"); } } open my $in,"<",$infile or die "Reading '$infile' : $!"; open my $out,">",$outfile or die "Writing '$outfile' : $!"; binmode $in; binmode $out; return ($in,$out); } sub expand_file { my ($self,$infile,$outfile)=@_; $infile or die "Must have a file to expand!"; unless ($outfile) { $outfile=$infile; $outfile=~s/\.plzw\z// or $outfile.=".out"; } warn "Expanding '$infile' to '$outfile'\n"; $self->expand_fh($self->_open_files($infile,$outfile)); return $outfile; } sub compress_file { my ($self,$infile,$outfile)=@_; $outfile||= "$infile.plzw"; warn "Compressing '$infile' to '$outfile'\n"; $self->compress_fh($self->_open_files($infile,$outfile)); return $outfile; } sub compress_fh { my ($self,$in,$out)=@_; $self->init() if $self->{in} or $self->{out}; @{$self}{qw(in out)}=($in,$out); my $string; my $buf=""; my $codes=$self->{codes}; read($in,$string,1) or die "Empty file or error! $!"; # compress signature, mode and maxbit and first char. print $out MAGIC,chr(BLOCK_MODE|MAX_BITS),$string; print STDERR "O > char : ".qquote($string)."\n" if $DEBUG; $self->{rbits}+=32; my $notfirst=0; while (<$in>) { $buf.=$_; print STDERR "READ:".qquote($buf)."\n" if $DEBUG; while ($buf) { my $char=substr($buf,0,1,''); my $append=$string.$char; $self->{rbits}+=8; if (exists($self->{strs}{$append})) { $string=$append; } else { if ($notfirst) { $self->output_code($self->{strs}{$string}) } else { $notfirst=1; } # add the code if (@$codes{strs}{$append}=$#$codes; printf STDERR "C>> %6d : %s\n",$#$codes,qquote($append) if $DEBUG; } elsif ($self->{wbits}/$self->{rbits}<.7) { $self->output_code(WIPE); $self->_init(); #$char=""; } $string=$char } } } $self->output_code($self->{strs}{$string} ) if length $string; $self->output_code( CODE_EOF ); warn "Read %d bits, wrote %d bits producing %%%5.2f compression\n", $self->{rbits},$self->{wbits},$self->{wbits}/$self->{rbits}; return 1 } # output_code $fh $code # Takes a filehandle and a numeric code and outputs it with the # correct number of code_bits for the number of codes so far involved. # thus we use 9 bit codes until we hit 512 then we go to 10 bit codes # etc. # We cache the codes emitted until we have a multiple of 8 (as we output # bytes/chars and not code_bits), and at the same time we do a bit of caching # so we call print and pack less often. sub output_code { my ($self,$code)=@_; Carp::confess 'undef!' unless defined $code; my $out=$self->{out}; # the condition here is against <= $#.. different from input_code while ($self->{threshold} <= $#{$self->{codes}} and $self->{code_bits}{threshold}<<=1; $self->{code_bits}++; } unless ($code==CODE_EOF) { my $bits=sprintf "%0*b",$self->{code_bits},$code; $self->{bit_buf}.=$bits; $self->{wbits}+=$self->{code_bits}; printf STDERR "O > %-6d : %12s : %s \t| ", $code,$bits,qquote($self->{codes}[$code]) if $DEBUG; } my $len=length($self->{bit_buf}); if (($len>$self->{buf_max}) or (CODE_EOF == $code)) { my $chunk=substr($self->{bit_buf},0, ($len>$self->{buf_max} ? int($len / 8) * 8 : $len),"" ); my $pack=pack "B*",$chunk; printf STDERR "\nOC> %2d:%2d:%4d:%s\n %s\n", length($pack),$self->{code_bits},length($chunk), qquote($chunk),qquote($pack),"\n" if $DEBUG; print $out $pack; #$self->{bit_buf}=""; } } sub expand_fh { my $self=shift; my ($in,$out)=@_; $self->init() if $self->{in} or $self->{out}; $self->{in}=$in; $self->{out}=$out; my $head; my $bytes=read($in,$head,3); unless ($bytes == 3 and substr($head,0,2) eq MAGIC) { Carp::confess("Not a Perl LZW file"); } if (ord(substr($head,3,1)) & BIT_MASK > MAX_BITS) { Carp::confess("Can't decompress this file, it used too large ". " a bitsize for me to handle.\n"); } my ($char,$old_code); read($in,$char,1) or die "Bad file!\n"; $old_code=ord($char); my $codes=$self->{codes}; print STDERR "First char: $char\n" if $DEBUG; print $out $char; $char=""; #reset char so the first symbol entered isnt the first char doubled. #old_code holds its ord value anyway. while ((my $new_code=$self->input_code()) != CODE_EOF) { if ($new_code==WIPE) { $self->_init(); $char=""; $old_code=WIPE; next; } print STDERR "Read: $new_code " .qquote($codes->[$new_code])."\n" if $DEBUG; my $str; if ( defined $codes->[$new_code]) { $str=$codes->[$new_code]; print STDERR "Found $new_code: ",qquote($str),"\n" if $DEBUG>1; } else { # this handles the KwKwK case $str=$codes->[$old_code].$char; print STDERR "Initializing current string for $new_code (", qquote($str).") from ".qquote($codes->[$old_code]), " and ",qquote($char)."\n" if $DEBUG; } $self->output_decoded($str); $char=substr($str,0,1); if (@$codes[$old_code].$char; printf STDERR "E<< Created: %6d : %s\n", $#$codes,qquote($codes->[$old_code].$char) if $DEBUG; } $old_code=$new_code; } } # input_code $in # Reads a code from the input filehandle. # This is a bit tricky. We are extracting variable numbers of code_bits from the stream each time. # As with the output we maintain a cache, but this time for the purpose of allowing us to easily # slice off the correct set of code_bits from the front of the stream. sub input_code { my $self=shift; my $in=$self->{in}; # make sure we are reading in the correct number of bits # the condition here is against <= @.. different from output_code while ($self->{threshold} <= @{$self->{codes}} and $self->{code_bits}{threshold}<<=1; $self->{code_bits}++; } # do we need to fill the buffer up with more bits? if (length($self->{bit_buf})<$self->{code_bits}) { if (not(eof $in)) { local $/=\do{my $size=int ($self->{buf_max} / 8)}; $self->{bit_buf}.=unpack "B*",scalar <$in>; print STDERR "RC< ",qquote($self->{bit_buf})."\n" if $DEBUG; } else { if ($DEBUG && $self->{bit_buf}=~/1/) { # this actually shouldnt be an issue but we do it anyway Carp::carp(length($self->{bit_buf})."<".$self->{code_bits}. ": $self->{bit_buf} : Corrupt file?\n".Dumper($self)); } return CODE_EOF; } } # read the bits from the buffer my $code_bits=substr($self->{bit_buf},0,$self->{code_bits},""); # now turn it into a number my $code=unpack('n',pack "B*",(0 x (16-length $code_bits)).$code_bits); print STDERR " < $code $self->{code_bits} '$code_bits'\n" if $DEBUG; return $code; } sub output_decoded { my ($self,$str)=@_; my $fh=$self->{out}; print $fh $str; } unless (caller) { local $DEBUG=1; my $obj=__PACKAGE__->new(overwrite=>1); my $compressed_as=$obj->compress_file($0); my $expanded_as=$obj->expand_file($compressed_as,$compressed_as.".out"); } 1; __END__ # From http://dogma.net/markn/articles/lzw/lzw.htm # decompress Read OLD_CODE output OLD_CODE CHARACTER = OLD_CODE WHILE there are still input characters DO Read NEW_CODE IF NEW_CODE is not in the translation table THEN STRING = get translation of OLD_CODE STRING = STRING+CHARACTER ELSE STRING = get translation of NEW_CODE END of IF output STRING CHARACTER = first character in STRING add OLD_CODE + CHARACTER to the translation table OLD_CODE = NEW_CODE END of WHILE # compress STRING = get input character WHILE there are still input characters DO CHARACTER = get input character IF STRING+CHARACTER is in the string table then STRING = STRING+character ELSE output the code for STRING add STRING+CHARACTER to the string table STRING = CHARACTER END of IF END of WHILE output the code for STRING