in reply to Re: Entity statistics
in thread Entity statistics
See this example (using PRE instead of CODE to include the section mark):
#!/usr/bin/perl
use warnings;
use strict;
use feature qw{ say };
use experimental qw( signatures );
use utf8;
use XML::LibXML;
use Encode qw{ encode };
# Write the sample XML document used by this script to the file named by
# $xml, replacing any previous content. The file is written as UTF-8.
# Returns a true value on success; dies with the file name and the system
# error if the file cannot be opened, written, or closed.
sub create_xml($xml) {
    open my $out, '>:encoding(UTF-8)', $xml
        or die "Cannot open '$xml' for writing: $!";

    # Non-interpolating indented heredoc: the delimiter's indentation is
    # stripped from every line, so the document starts at column 0.
    print {$out} <<~'__XML__' or die "Cannot write to '$xml': $!";
        <?xml version="1.0"?>
        <!DOCTYPE root [
        <!ENTITY sect "§">
        ]>
        <root link="Art.VV">
        A § 1 A
        B Art.XVI B
        C § 9 C
        D § 7 D
        E § 6 E
        <!-- Should comments be included in statistics? Art.XXX -->
        <?print "Should processing instructions be included?" Art.2 ?>
        </root>
        __XML__

    # Buffered write errors only surface when the handle is closed.
    close $out or die "Cannot close '$xml': $!";
    return 1;
}
# Parse the file named by $xml with XML::LibXML and print the resulting
# document to standard output.
# NOTE(review): presumably load_xml dies on malformed input, which is what
# makes this a validity check - confirm against the XML::LibXML docs.
sub validate_xml($xml) {
    my %parser_args = (location => $xml);
    my $document    = 'XML::LibXML'->load_xml(%parser_args);
    print $document;
}
# Count how often each pattern of interest occurs in the file named by $xml
# and print one "pattern:<TAB>count" line per pattern, encoded as UTF-8.
#
# The file is read as plain text rather than parsed as XML, so occurrences
# inside attributes, comments and processing instructions are counted too.
# A pattern that never matches is reported with a count of 0.
# Dies with the file name and the system error if the file cannot be opened
# or closed.
sub generate_statistics($xml) {
    my @regexes = (qr/§\s*[0-9]/, qr/Art\.\s*[0-9IVX]/);

    open my $in, '<:encoding(UTF-8)', $xml
        or die "Cannot open '$xml' for reading: $!";
    my $string = do { local $/; <$in> } // '';
    close $in or die "Cannot close '$xml': $!";

    # Assigning the list of matches to an empty list and taking that in
    # scalar context yields the number of matches - 0 when there are none,
    # so no tally is ever left undefined.
    my @tally = map { scalar(() = $string =~ /$_/g) } @regexes;

    for my $i (0 .. $#regexes) {
        say encode('UTF-8', "$regexes[$i]:\t$tally[$i]");
    }
    return;
}
# Driver: create the sample file, parse and print it, report the pattern
# counts on the raw text, then remove the file again.
my $xml_file = '1.xml';
for my $step (\&create_xml, \&validate_xml, \&generate_statistics) {
    $step->($xml_file);
}
unlink $xml_file;
<?xml version="1.0"?>
<!DOCTYPE root [
<!ENTITY sect "§">
]>
<root link="Art.VV">
A § 1 A
B Art.XVI B
C § 9 C
D § 7 D
E § 6 E
<!-- Should comments be included in statistics? Art.XXX -->
<?print "Should processing instructions be included?" Art.2 ?>
</root>
(?^u:§\s*[0-9]): 1
(?^:Art\.\s*[0-9IVX]): 4
Update: Included §.
Update 2: Print the XML to show how some representations of the section mark are equivalent.
Update 3: Added an attribute.
|
|---|