zer has asked for the wisdom of the Perl Monks concerning the following question:
I have been working on some code that will take two html documents that holds translations, compairs it to the actual pages, and sets an order of which paragraphs match the closest.
The first issue, which may be the simplist to solve, is that my %liveData variable, when sent through XMLout produces bad xml. The tags are like <1> and <comp 1="101" 2="342">
The next problem is that this runs very slow. My whole process of doing this is very brute force. There must be a better way. Essentially I am running the levenshtein difference on each paragraph to each paragraph i find.
any help you can provide would be a great help
#!/usr/bin/perl use strict; use Encoding; use Data::Dumper; use feature ':5.10'; use Text::Levenshtein qw(distance); use XML::Simple; binmode STDOUT,":utf8"; $|++; #grab translation files open (E,"<:encoding(UTF-8)","site/en.htm"); open (F,"<:encoding(UTF-8)","site/fr.htm"); my ($en,$fr); say "Pulling Translations"; $en.=$_ for (<E>); $fr.=$_ for (<F>); #parse them into memory say "Parsing into memory"; $fr=~s/<span lang=FR-CA>//g; $fr=~s/<\/span><\/p>/<\/p>/g; my %Data; while ($en=~/<p.*?>(?<id>\d+):(?<count>\d+):<\/p>.*?<p.*?margin.*?>(?< +content>.*?)<\/p>/msig){ $Data{$+{id}}={count=>$+{count}, en=>$+{content}}; } while ($fr=~/<p.*?>(?<id>\d+):(?<count>\d+):<\/p>.*?<p.*?margin.*?>(?< +content>.*?)<\/p>/msig){ $Data{$+{id}}->{fr}=$+{content}; } #Check for missing data say "Corruption Check"; for (keys %Data){ die 'data corruption'.Dumper $Data{$_} unless (exists $Data{$_}->{ +en} && exists $Data{$_}->{fr} ); } #Scan files for matches #Get all paras in file #dump into memory. could just use the original 'get' code my %phrase; say "Draw Live Snapshot"; foreach my $fname(`find ./live -type f | grep htm\$ `){ chomp $fname; $fname=~m/(32brigade.*)/; my $floc = $fname; open (F, $fname); my $file; $file.=$_ for (<F>); $file=~/\Q<!-- InstanceBeginEditable name="Content" -->\E(.*?)<!-- + InstanceEndEditable -->/sm; $file=$1; #while ($file=~s/<img (?<img>.*?)>//smg){ # my $tmp = $+{img}; # my ($alt,$src); # $tmp=~/alt=.*?"(?<alt>.*?)"/smi; # $phrase{$+{alt}}++ if (exists $+{alt}); #} while ($file=~m|<p>(?<para>.*?)</p>|gsm){ $phrase{$+{para}}->{'count'}++; $phrase{$+{para}}->{'clean'}=$+{para}; $phrase{$+{para}}->{'clean'}=~s/<img (?<img>.*?)>//smg; #store the file location push @{$phrase{$+{para}}->{'file'}}, $floc; } # print $file; } #Re-sort the data say "Sorting Data from snapshot"; my %liveData; my $c; for (sort {$a cmp $a} keys %phrase){ $c++; $liveData{$c}={ dirty=>$_, clean=>$phrase{$_}->{'clean'}, file=>$phrase{$_}->{'file'}, count=>$phrase{$_}->{'count'} }; } #Compare all. Record for each Data the levenshtein say "Comparing with Levenshtein..."; my ($pc,$ptotal)=1; #process counter $ptotal +=1 for (keys %liveData); for my $ld(keys %liveData){ say "$pc / $ptotal - ".sprintf("%.3f",$pc++ / $ptotal * 100),"% Pr +ocessing: $ld"; say $liveData{$ld}->{clean}; for my $rd(keys %Data){ #so... $para->{name=>...,comp=>{id ref,lev} #limiter #unless (length ($liveData{$ld}->{clean})>2*length($Data{$rd}- +>{en})){ $liveData{$ld}->{comp}->{$rd}=distance ($liveData{$ld}->{c +lean},$Data{$rd}->{en}); #} } #sort lev. my $c=0; my $i=0; my $top; my @m; for my $id (sort {$liveData{$ld}->{comp}->{$a} <=> $liveData{$ld}- +>{comp}->{$b}} keys %{$liveData{$ld}->{comp}}){ say "-"x200,"\nTop Match\n".$Data{$id}->{en} if $i++ < 1; push @m, "-->$id: ". $liveData{$ld}->{comp}->{$id}; last if $c++ == 5; } say "Top 5 Matches:"; say for @m; print "\n"; say "Backing Up\n","="x200; my $x = XMLout(\%liveData); open (X,">:encoding(UTF-8)","xml/liveData.xml"); print X $x; close (X); } #implement highest within tolerance #check for near misses #ask for not even close #See what was missed my $c; for my $datum (keys %Data){ if ($Data{$datum}->{count}>0){ print $datum; $c++; } } say "Failures: $c";
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: XML::Simple saving incorrectly. Levenshtein running slow
by BrowserUk (Patriarch) on Aug 23, 2010 at 16:49 UTC | |
|
Re: XML::Simple saving incorrectly. Levenshtein running slow
by psini (Deacon) on Aug 23, 2010 at 17:04 UTC |