Hello Monks,
I have been working on some code that will take two html documents that holds translations, compairs it to the actual pages, and sets an order of which paragraphs match the closest.
The first issue, which may be the simplist to solve, is that my %liveData variable, when sent through XMLout produces bad xml. The tags are like <1> and <comp 1="101" 2="342">
The next problem is that this runs very slow. My whole process of doing this is very brute force. There must be a better way. Essentially I am running the levenshtein difference on each paragraph to each paragraph i find.
any help you can provide would be a great help
#!/usr/bin/perl
use strict;
use Encoding;
use Data::Dumper;
use feature ':5.10';
use Text::Levenshtein qw(distance);
use XML::Simple;
binmode STDOUT,":utf8";
$|++;
#grab translation files
open (E,"<:encoding(UTF-8)","site/en.htm");
open (F,"<:encoding(UTF-8)","site/fr.htm");
my ($en,$fr);
say "Pulling Translations";
$en.=$_ for (<E>);
$fr.=$_ for (<F>);
#parse them into memory
say "Parsing into memory";
$fr=~s/<span lang=FR-CA>//g;
$fr=~s/<\/span><\/p>/<\/p>/g;
my %Data;
while ($en=~/<p.*?>(?<id>\d+):(?<count>\d+):<\/p>.*?<p.*?margin.*?>(?<
+content>.*?)<\/p>/msig){
$Data{$+{id}}={count=>$+{count}, en=>$+{content}};
}
while ($fr=~/<p.*?>(?<id>\d+):(?<count>\d+):<\/p>.*?<p.*?margin.*?>(?<
+content>.*?)<\/p>/msig){
$Data{$+{id}}->{fr}=$+{content};
}
#Check for missing data
say "Corruption Check";
for (keys %Data){
die 'data corruption'.Dumper $Data{$_} unless (exists $Data{$_}->{
+en} && exists $Data{$_}->{fr} );
}
#Scan files for matches
#Get all paras in file
#dump into memory. could just use the original 'get' code
my %phrase;
say "Draw Live Snapshot";
foreach my $fname(`find ./live -type f | grep htm\$ `){
chomp $fname;
$fname=~m/(32brigade.*)/;
my $floc = $fname;
open (F, $fname);
my $file;
$file.=$_ for (<F>);
$file=~/\Q<!-- InstanceBeginEditable name="Content" -->\E(.*?)<!--
+ InstanceEndEditable -->/sm;
$file=$1;
#while ($file=~s/<img (?<img>.*?)>//smg){
# my $tmp = $+{img};
# my ($alt,$src);
# $tmp=~/alt=.*?"(?<alt>.*?)"/smi;
# $phrase{$+{alt}}++ if (exists $+{alt});
#}
while ($file=~m|<p>(?<para>.*?)</p>|gsm){
$phrase{$+{para}}->{'count'}++;
$phrase{$+{para}}->{'clean'}=$+{para};
$phrase{$+{para}}->{'clean'}=~s/<img (?<img>.*?)>//smg;
#store the file location
push @{$phrase{$+{para}}->{'file'}}, $floc;
}
# print $file;
}
#Re-sort the data
say "Sorting Data from snapshot";
my %liveData;
my $c;
for (sort {$a cmp $a} keys %phrase){
$c++;
$liveData{$c}={
dirty=>$_,
clean=>$phrase{$_}->{'clean'},
file=>$phrase{$_}->{'file'},
count=>$phrase{$_}->{'count'}
};
}
#Compare all. Record for each Data the levenshtein
say "Comparing with Levenshtein...";
my ($pc,$ptotal)=1; #process counter
$ptotal +=1 for (keys %liveData);
for my $ld(keys %liveData){
say "$pc / $ptotal - ".sprintf("%.3f",$pc++ / $ptotal * 100),"% Pr
+ocessing: $ld";
say $liveData{$ld}->{clean};
for my $rd(keys %Data){
#so... $para->{name=>...,comp=>{id ref,lev}
#limiter
#unless (length ($liveData{$ld}->{clean})>2*length($Data{$rd}-
+>{en})){
$liveData{$ld}->{comp}->{$rd}=distance ($liveData{$ld}->{c
+lean},$Data{$rd}->{en});
#}
}
#sort lev.
my $c=0;
my $i=0;
my $top;
my @m;
for my $id (sort {$liveData{$ld}->{comp}->{$a} <=> $liveData{$ld}-
+>{comp}->{$b}} keys %{$liveData{$ld}->{comp}}){
say "-"x200,"\nTop Match\n".$Data{$id}->{en} if $i++ < 1;
push @m, "-->$id: ". $liveData{$ld}->{comp}->{$id};
last if $c++ == 5;
}
say "Top 5 Matches:";
say for @m;
print "\n";
say "Backing Up\n","="x200;
my $x = XMLout(\%liveData);
open (X,">:encoding(UTF-8)","xml/liveData.xml");
print X $x;
close (X);
}
#implement highest within tolerance
#check for near misses
#ask for not even close
#See what was missed
my $c;
for my $datum (keys %Data){
if ($Data{$datum}->{count}>0){
print $datum;
$c++;
}
}
say "Failures: $c";