in reply to Processing an encoded file backwards
I've actually thought about this myself. seek, tell, sysseek, and sysread all operate on bytes, while read operates on bytes or characters depending on the I/O layers. So because we can only seek in bytes, I think the only way to approach it is to first read a chunk of bytes from the end of the file, and then look at what was read to determine whether a UTF-8 encoded character was chopped off - specifically, whether the block of data begins with one or more bytes of the form 10xxxxxx (the two high bits are 10), since those are UTF-8 continuation bytes. Discard those bytes (and move the current position forward by the same amount, so that the next chunk ends with the complete character), and you should then have a buffer that can be correctly decoded as UTF-8 and that you can inspect for how many characters it contains, how many lines, etc., depending on what you actually want your window to be measured in. So I took this opportunity to finally express my idea in code :-)
# readbackwards_utf8($fn, $window)
#
# Returns an iterator (a code ref) that walks the UTF-8 encoded file $fn
# from its end towards its beginning.
#
#   $fn     - name of the file to read; it is opened with the :raw layer.
#   $window - maximum number of *bytes* fetched per call. Must be >= 4 so
#             that a window can always hold one complete UTF-8 character.
#
# Each call of the iterator returns the next chunk (moving backwards) as a
# decoded character string. A chunk never starts in the middle of a
# character: leading continuation bytes are left for the following call.
# Once the start of the file has been reached the iterator closes the
# handle and returns nothing (undef in scalar context) on every call.
#
# Dies on a bad window, on open/seek/read failure, and on malformed UTF-8.
sub readbackwards_utf8 {
    my ($fn, $window) = @_;
    die "Bad window $window" unless $window >= 4;
    open my $fh, '<:raw', $fn or die "open $fn: $!";
    my $curpos = (-s $fh) || 0;    # byte offset where the next chunk ends

    return sub {
        if ( $curpos < 1 ) {
            close $fh if $fh;
            $fh = undef;
            return;
        }

        my $bytes = $curpos < $window ? $curpos : $window;
        seek($fh, $curpos -= $bytes, 0) or die "seek $curpos $fn: $!";

        # read() returns undef on error, so check that before comparing.
        my $got = read($fh, my $buf, $bytes);
        defined $got
            or die "read $bytes bytes at $curpos from $fn: $!";
        $got == $bytes
            or die "read $bytes bytes at $curpos from $fn: short read ($got bytes)";

        # A character cut in half by the window shows up as leading
        # continuation bytes (10xxxxxx). Hand them back to the next call.
        # A valid character has at most 3 of them, and at offset 0 there is
        # no earlier byte they could belong to, so anything beyond that is
        # malformed input and is left in place for the decode check below.
        # Since $curpos > 0 implies a full window of >= 4 bytes was read,
        # skipping at most 3 guarantees every call makes progress.
        my $skip = 0;
        if ( $curpos > 0 ) {
            $skip++
                while $skip < 3
                && ( ord( substr $buf, $skip, 1 ) & 0b11000000 ) == 0b10000000;
        }
        if ($skip) {
            substr( $buf, 0, $skip, '' );
            $curpos += $skip;
        }

        utf8::decode($buf)
            or die "Malformed UTF-8 in $fn near byte offset $curpos";
        return $buf;
    };
}
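It would be pretty easy to wrap the iterator which the above code returns into another iterator that counts characters and lines, and returns chunks of that size. Of course, this is specific to UTF-8. For encodings with a fixed width, like UTF-32, it would be somewhat easier; UTF-16 is also simpler as long as you only seek in multiples of two bytes, although characters outside the BMP are stored as surrogate pairs and would need a similar check at the start of each chunk.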
# Test script for readbackwards_utf8(): writes a small UTF-8 file containing
# 1-, 2-, 3- and 4-byte characters (16 bytes in total) and checks the chunks
# the iterator returns for every window size from 4 to 17 bytes.
use strict;
use warnings;
use open qw/:std :utf8/;
use Test::More;
use File::Temp qw/tempfile/;

my ($tempfh, $filename) = tempfile( UNLINK => 1 );
binmode $tempfh, ':encoding(UTF-8)';
print $tempfh "H\N{U+20AC}ll\N{U+00F6}, \N{U+1F5FA}!\n";
close $tempfh or die "close $filename: $!";
#system('hexdump','-C',$filename);

# Each entry: [ \@window_sizes, @chunks_expected_in_order ].
my @cases = (
    [ [4],      "!\n", "\N{U+1F5FA}", "\N{U+00F6}, ", "ll", "H\N{U+20AC}" ],
    [ [5],      "!\n", " \N{U+1F5FA}", "ll\N{U+00F6},", "H\N{U+20AC}" ],
    [ [6],      "\N{U+1F5FA}!\n", "ll\N{U+00F6}, ", "H\N{U+20AC}" ],
    [ [7],      " \N{U+1F5FA}!\n", "ll\N{U+00F6},", "H\N{U+20AC}" ],
    [ [8..9],   ", \N{U+1F5FA}!\n", "H\N{U+20AC}ll\N{U+00F6}" ],
    [ [10],     "\N{U+00F6}, \N{U+1F5FA}!\n", "H\N{U+20AC}ll" ],
    [ [11],     "l\N{U+00F6}, \N{U+1F5FA}!\n", "H\N{U+20AC}l" ],
    [ [12..14], "ll\N{U+00F6}, \N{U+1F5FA}!\n", "H\N{U+20AC}" ],
    [ [15],     "\N{U+20AC}ll\N{U+00F6}, \N{U+1F5FA}!\n", "H" ],
    [ [16..17], "H\N{U+20AC}ll\N{U+00F6}, \N{U+1F5FA}!\n" ],
);

for my $case (@cases) {
    my ($windows, @expected) = @$case;
    for my $window (@$windows) {
        # Pass the loop variable itself, so every listed size is exercised.
        my $next = readbackwards_utf8($filename, $window);
        my $index = 0;
        for my $chunk (@expected) {
            is scalar $next->(), $chunk, "window $window: chunk $index";
            $index++;
        }
        is scalar $next->(), undef, "window $window: exhausted";
        is scalar $next->(), undef, "window $window: stays exhausted";
    }
}

done_testing;
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^2: Processing an encoded file backwards
by LanX (Saint) on Jan 18, 2020 at 20:41 UTC | |
by haukex (Archbishop) on Jan 18, 2020 at 20:56 UTC | |
by LanX (Saint) on Jan 18, 2020 at 21:16 UTC | |
by haukex (Archbishop) on Jan 18, 2020 at 21:53 UTC | |
by LanX (Saint) on Jan 18, 2020 at 22:30 UTC | |
|