comment on

Hello Monks,

I have been working on some code that will take two html documents that holds translations, compairs it to the actual pages, and sets an order of which paragraphs match the closest.

The first issue, which may be the simplist to solve, is that my %liveData variable, when sent through XMLout produces bad xml. The tags are like <1> and <comp 1="101" 2="342">

The next problem is that this runs very slow. My whole process of doing this is very brute force. There must be a better way. Essentially I am running the levenshtein difference on each paragraph to each paragraph i find.

any help you can provide would be a great help

#!/usr/bin/perl
use strict;
use Encoding;
use Data::Dumper;
use feature ':5.10';
use Text::Levenshtein qw(distance);
use XML::Simple;
binmode STDOUT,":utf8";
$|++;
#grab translation files
open (E,"<:encoding(UTF-8)","site/en.htm");
open (F,"<:encoding(UTF-8)","site/fr.htm");
my ($en,$fr);
say "Pulling Translations";
$en.=$_ for (<E>);
$fr.=$_ for (<F>);
#parse them into memory
say "Parsing into memory";
$fr=~s/<span lang=FR-CA>//g;
$fr=~s/<\/span><\/p>/<\/p>/g;
my %Data;
while ($en=~/<p.*?>(?<id>\d+):(?<count>\d+):<\/p>.*?<p.*?margin.*?>(?<
+content>.*?)<\/p>/msig){
    $Data{$+{id}}={count=>$+{count}, en=>$+{content}};
}
while ($fr=~/<p.*?>(?<id>\d+):(?<count>\d+):<\/p>.*?<p.*?margin.*?>(?<
+content>.*?)<\/p>/msig){
    $Data{$+{id}}->{fr}=$+{content};
}
#Check for missing data
say "Corruption Check";
for (keys %Data){
    die 'data corruption'.Dumper $Data{$_} unless (exists $Data{$_}->{
+en} && exists $Data{$_}->{fr} );
}
#Scan files for matches

#Get all paras in file
#dump into memory. could just use the original 'get' code
my %phrase;
say "Draw Live Snapshot";
foreach my $fname(`find  ./live -type f | grep htm\$ `){
    chomp $fname;
    $fname=~m/(32brigade.*)/;
    my $floc = $fname;
    open (F, $fname);
    my $file;
    $file.=$_ for (<F>);
    $file=~/\Q<!-- InstanceBeginEditable name="Content" -->\E(.*?)<!--
+ InstanceEndEditable -->/sm;
    $file=$1;
    #while ($file=~s/<img (?<img>.*?)>//smg){
    #    my $tmp = $+{img};
    #    my ($alt,$src);
    #    $tmp=~/alt=.*?"(?<alt>.*?)"/smi;
    #    $phrase{$+{alt}}++ if (exists $+{alt});
    #}
    while ($file=~m|<p>(?<para>.*?)</p>|gsm){
        $phrase{$+{para}}->{'count'}++;
        $phrase{$+{para}}->{'clean'}=$+{para};
        $phrase{$+{para}}->{'clean'}=~s/<img (?<img>.*?)>//smg;
        #store the file location
        push @{$phrase{$+{para}}->{'file'}}, $floc;
    }
#    print $file;
    
}
#Re-sort the data
say "Sorting Data from snapshot";
my %liveData;
my $c;
for (sort {$a cmp $a} keys %phrase){
$c++;
$liveData{$c}={
        dirty=>$_,
        clean=>$phrase{$_}->{'clean'},
        file=>$phrase{$_}->{'file'},
        count=>$phrase{$_}->{'count'}
        };
}
#Compare all. Record for each Data the levenshtein
say "Comparing with Levenshtein...";
my ($pc,$ptotal)=1; #process counter
$ptotal +=1 for (keys %liveData);
for my $ld(keys %liveData){
    say "$pc / $ptotal - ".sprintf("%.3f",$pc++ / $ptotal * 100),"% Pr
+ocessing: $ld"; 
    say $liveData{$ld}->{clean};
    for my $rd(keys %Data){
        #so... $para->{name=>...,comp=>{id ref,lev}
        #limiter
        #unless (length ($liveData{$ld}->{clean})>2*length($Data{$rd}-
+>{en})){
            $liveData{$ld}->{comp}->{$rd}=distance ($liveData{$ld}->{c
+lean},$Data{$rd}->{en});
        #}
    }
    #sort lev.
    my $c=0; 
    my $i=0;
    my $top;
    my @m;
    for my $id (sort {$liveData{$ld}->{comp}->{$a} <=> $liveData{$ld}-
+>{comp}->{$b}} keys %{$liveData{$ld}->{comp}}){
        say "-"x200,"\nTop Match\n".$Data{$id}->{en} if $i++ < 1;
        push @m, "-->$id: ". $liveData{$ld}->{comp}->{$id};
        last if $c++ == 5;
    }
    say "Top 5 Matches:";
    say for @m;
    print "\n";
    say "Backing Up\n","="x200;
    my $x = XMLout(\%liveData);
    open (X,">:encoding(UTF-8)","xml/liveData.xml");
    print X $x;
    close (X);
}
#implement highest within tolerance
#check for near misses
#ask for not even close



#See what was missed
my $c;
for my $datum (keys %Data){
    if ($Data{$datum}->{count}>0){
        print $datum;
        $c++;
    }
}
say "Failures: $c";
[download]

In reply to XML::Simple saving incorrectly. Levenshtein running slow by zer

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.