#!/usr/bin/perl # Id: unicount.pl # Subject: show some Unicode statistics use warnings ; use strict ; use Data::Alias ; binmode STDOUT, ':utf8' ; my @table = # +--Name------+---qRegexp--------+-C-+-L-+-U-+ ( [ 'xdigit' , qr/[[:xdigit:]]/ , 0 , 0 , 0 ] , [ 'ascii' , qr/[[:ascii:]]/ , 0 , 0 , 0 ] , [ '\\d' , qr/\d/ , 0 , 0 , 0 ] , [ 'digit' , qr/[[:digit:]]/ , 0 , 0 , 0 ] , [ 'IsNumber' , qr/\p{IsNumber}/ , 0 , 0 , 0 ] , [ 'alpha' , qr/[[:alpha:]]/ , 0 , 0 , 0 ] , [ 'alnum' , qr/[[:alnum:]]/ , 0 , 0 , 0 ] , [ 'word' , qr/[[:word:]]/ , 0 , 0 , 0 ] , [ 'graph' , qr/[[:graph:]]/ , 0 , 0 , 0 ] , [ 'print' , qr/[[:print:]]/ , 0 , 0 , 0 ] , [ 'blank' , qr/[[:blank:]]/ , 0 , 0 , 0 ] , [ 'space' , qr/[[:space:]]/ , 0 , 0 , 0 ] , [ 'punct' , qr/[[:punct:]]/ , 0 , 0 , 0 ] , [ 'cntrl' , qr/[[:cntrl:]]/ , 0 , 0 , 0 ] , ) ; my @codepoints = ( 0x0000 .. 0xD7FF, 0xE000 .. 0xFDCF, 0xFDF0 .. 0xFFFD, 0x10000 .. 0x1FFFD, 0x20000 .. 0x2FFFD, # 0x30000 .. 0x3FFFD, # etc. ) ; for my $row ( @table ) { alias my ($name, $qrx, $count, $lower, $upper) = @$row ; printf "\n%s\n", $name ; my $n = 0 ; for ( @codepoints ) { local $_ = chr ; # int-2-char conversion $n++ ; if ( /$qrx/ ) { $count++ ; $lower++ if / [[:lower:]] /x ; $upper++ if / [[:upper:]] /x ; } } my $show_lower_upper = ($lower || $upper) ? sprintf( ' (lower:%6d, upper:%6d)' , $lower , $upper ) : '' ; printf "%6d /%6d =%7.3f%%%s\n" , $count , $n , 100 * $count / $n , $show_lower_upper } print "\n" ; __END__