#!/usr/bin/perl
# Match paragraphs found in a snapshot of the live site (./live) against the
# English side of the translation files (site/en.htm, site/fr.htm) using
# Levenshtein distance.  For every live paragraph the closest translation
# entries are printed, and the accumulated results are written to
# xml/liveData.xml after each paragraph has been processed.
#
# NOTE(review): this file was found with everything collapsed onto one line
# and with every "<...>" sequence missing (readline operators, named-capture
# names, literal HTML tags inside regexes).  Constructs that the surrounding
# code pins down (<E>/<F>, capture names used via %+) have been restored.
# Literal tag patterns that cannot be recovered from the code are marked
# NOTE(review)/FIXME below and must be confirmed against version control.
use strict;
use warnings;
use Encoding;
use Data::Dumper;
use feature ':5.10';
use Text::Levenshtein qw(distance);
use XML::Simple;

binmode STDOUT, ":utf8";
$|++;

# Number of candidates listed per live paragraph (heading says "Top 5").
my $TOP_MATCHES = 5;

# Return the whole content of $path, opened with $mode (an open() mode,
# optionally carrying an I/O layer).  Dies if the file cannot be opened.
sub slurp_file {
    my ($mode, $path) = @_;
    open my $fh, $mode, $path or die "Cannot open $path: $!";
    my $content = do { local $/; <$fh> };
    close $fh;
    return $content;
}

#grab translation files
say "Pulling Translations";
my $en = slurp_file('<:encoding(UTF-8)', 'site/en.htm');
my $fr = slurp_file('<:encoding(UTF-8)', 'site/fr.htm');

#parse them into memory
say "Parsing into memory";

# FIXME(review): the pattern of this substitution is empty in the file as
# found; presumably a tag pattern was lost -- restore the intended pattern.
$fr =~ s///g;
$fr =~ s/<\/span><\/p>/<\/p>/g;

# One translation entry.  The capture names are the ones read through %+
# below.  NOTE(review): the order of the two numeric groups (id first, then
# count) is an assumption, and any literal tags that originally surrounded
# the groups are missing -- confirm.
my $entry_re = qr/(?<id>\d+):(?<count>\d+):<\/p>.*?(?<content>.*?)<\/p>/msi;

my %Data;
while ($en =~ /$entry_re/g) {
    $Data{ $+{id} } = { count => $+{count}, en => $+{content} };
}
while ($fr =~ /$entry_re/g) {
    $Data{ $+{id} }->{fr} = $+{content};
}

#Check for missing data
say "Corruption Check";
for my $id (keys %Data) {
    die 'data corruption' . Dumper($Data{$id})
        unless exists $Data{$id}->{en} && exists $Data{$id}->{fr};
}

#Scan files for matches
#Get all paras in file
#dump into memory. could just use the original 'get' code
my %phrase;
say "Draw Live Snapshot";
foreach my $fname (`find ./live -type f | grep htm\$ `) {
    chomp $fname;

    # The original also matched /(32brigade.*)/ against the path but never
    # used the capture; the stored location has always been the full path.
    my $floc = $fname;

    # Three-argument open (inside slurp_file) so that a file name coming from
    # find can never be interpreted as an open mode.
    my $file = slurp_file('<', $fname);

    # FIXME(review): the literal marker inside \Q...\E (and whatever followed
    # the capture) is empty in the file as found, so this currently captures
    # an empty string -- restore the real content delimiters.
    # Skip the file when the content section is not found instead of reusing
    # a stale $1 from an earlier match.
    next unless $file =~ /\Q\E(.*?)/sm;
    $file = $1;

    #while ($file=~s/.*?)>//smg){
    #    my $tmp = $+{img};
    #    my ($alt,$src);
    #    $tmp=~/alt=.*?"(?.*?)"/smi;
    #    $phrase{$+{alt}}++ if (exists $+{alt});
    #}

    # NOTE(review): the <p>...</p> delimiters are reconstructed; the file as
    # found shows only blank lines around the capture -- confirm.
    while ($file =~ m|<p>(?<para>.*?)</p>|gsm) {
        # Copy the capture first: a successful s/// below resets %+, so
        # reading $+{para} afterwards would no longer yield the paragraph.
        my $para = $+{para};

        # NOTE(review): pattern reconstructed from the residue ".*?)>" and
        # the commented-out block above, which reads $+{img} -- presumably
        # it strips <img ...> tags; confirm.
        (my $clean = $para) =~ s/<img.*?>//smg;

        my $entry = $phrase{$para} ||= {};
        $entry->{'count'}++;
        $entry->{'clean'} = $clean;

        #store the file location
        push @{ $entry->{'file'} }, $floc;
    }
}

#Re-sort the data
say "Sorting Data from snapshot";
my %liveData;
my $seq = 0;

# Plain string sort (the original comparator compared $a with itself, which
# left the order unspecified).
for my $dirty (sort keys %phrase) {
    $seq++;
    $liveData{$seq} = {
        dirty => $dirty,
        clean => $phrase{$dirty}->{'clean'},
        file  => $phrase{$dirty}->{'file'},
        count => $phrase{$dirty}->{'count'},
    };
}

#Compare all. Record for each Data the levenshtein
say "Comparing with Levenshtein...";
my $pc     = 1;                  #process counter
my $ptotal = keys %liveData;     # number of live paragraphs

for my $ld (sort { $a <=> $b } keys %liveData) {
    my $live = $liveData{$ld};

    say "$pc / $ptotal - " . sprintf("%.3f", $pc / $ptotal * 100),
        "% Processing: $ld";
    $pc++;
    say $live->{clean};

    # $live->{comp} maps translation id => Levenshtein distance to this
    # paragraph.
    my $comp = $live->{comp} ||= {};
    for my $rd (keys %Data) {
        #limiter
        #unless (length ($live->{clean})>2*length($Data{$rd}->{en})){
        $comp->{$rd} = distance($live->{clean}, $Data{$rd}->{en});
        #}
    }

    #sort lev.
    my @ranked = sort { $comp->{$a} <=> $comp->{$b} } keys %$comp;

    # Keep exactly $TOP_MATCHES entries (the original kept one too many).
    splice @ranked, $TOP_MATCHES if @ranked > $TOP_MATCHES;

    say "-" x 200, "\nTop Match\n" . $Data{ $ranked[0] }->{en} if @ranked;

    say "Top 5 Matches:";
    say "-->$_: " . $comp->{$_} for @ranked;
    print "\n";

    # Rewrite the XML dump after every paragraph, so the results gathered so
    # far are on disk if the run is interrupted.
    say "Backing Up\n", "=" x 200;
    open my $xml_fh, '>:encoding(UTF-8)', 'xml/liveData.xml'
        or die "Cannot write xml/liveData.xml: $!";
    print {$xml_fh} XMLout(\%liveData);
    close $xml_fh or die "Cannot close xml/liveData.xml: $!";
}

# TODO: implement highest within tolerance
# TODO: check for near misses
# TODO: ask for not even close

#See what was missed
my $failures = 0;
for my $datum (keys %Data) {
    if ($Data{$datum}->{count} > 0) {
        print $datum;
        $failures++;
    }
}
say "Failures: $failures";