CPAN Algorithm::Diff method:

time to compare same two files 1000 times:
40.28 sec
39.68 sec
39.97 sec
40.17 sec
39.70 sec
39.71 sec
39.61 sec
39.82 sec
39.71 sec
39.60 sec

avg. = 39.83 sec

diffutils method:

time to compare same two files 1000 times:
11.68 sec
11.72 sec
11.63 sec
11.69 sec
11.78 sec
11.62 sec
11.62 sec
11.78 sec
11.66 sec
11.62 sec

avg. = 11.68 sec

####

CPAN Algorithm::Diff method:

time to compare 105 file pairs, 10 times:
926.7 sec
924.2 sec

time to compare 105 file pairs, 1 time:
91.02 sec
91.09 sec
91.09 sec
91.10 sec
90.93 sec
91.19 sec
93.42 sec
91.58 sec

avg time for a single comparison of 105 file pairs:
92.23 secs

diffutils method:

time to compare 105 file pairs, 10 times:
16.76 sec
16.65 sec
16.67 sec
16.68 sec
16.85 sec
16.71 sec
16.72 sec
16.80 sec
16.92 sec
16.84 sec

avg. time for single comparison of 105 file pairs:
1.676 secs

####

repeatedly compare same two files 1000 times:
average times:
Algorithm::Diff   39.83 sec
diffutils         11.68 sec

compare 105 different pairs of files 1 time:
average times:
Algorithm::Diff   92.23 sec
diffutils         1.676 sec

####

## this is the framework:
## One of the two code snippets below is
## substituted for ### DIFF ALGORITHM HERE..

#!/usr/bin/perl
# Benchmark harness for a single pair of files.
# Usage: script FILE1 FILE2
# Slurps both files, puts every whitespace-separated token on its own
# line, then times 1000 runs of the snippet pasted at the marker below.
# The elapsed wall-clock time is reported on STDERR.

use strict;
use warnings;
use lib "/Users/allasso/AWS/utility/cpan/lib/perl5/site_perl";
require Algorithm::Diff;
use Time::HiRes qw( time );

my ($source_path_1, $source_path_2) = @ARGV;
die "usage: $0 FILE1 FILE2\n"
    unless defined $source_path_1 && defined $source_path_2;

my $filestring_1 = slurp_file($source_path_1, 'source file 1');
my $filestring_2 = slurp_file($source_path_2, 'source file 2');

# One token per line, so the comparison is word-by-word.
$filestring_1 =~ s@\s+@\n@g;
$filestring_2 =~ s@\s+@\n@g;

my $time = time();

for my $count (0..999) {

    ### DIFF ALGORITHM HERE..

}

# Elapsed seconds, properly rounded to two decimals.
printf STDERR "\n\net: %.2f\n", time() - $time;

exit;

# Return the whole content of $path as one string.
# Dies if the file cannot be opened: timing a comparison of data that
# was never read would give a meaningless result.
sub slurp_file {
    my ($path, $label) = @_;
    open(my $fh, '<', $path)
        or die "unable to open $label: $path: $!\n";
    my $content = do { local $/; readline($fh) };
    close($fh);
    return $content;
}

## this is the CPAN Algorithm::Diff code:

my @seq1 = split(/\n/, $filestring_1);
my @seq2 = split(/\n/, $filestring_2);

my $diff = Algorithm::Diff->new( \@seq1, \@seq2 );

$diff->Base( 1 );   # Return line numbers, not indices
while ( $diff->Next() ) {
    next if $diff->Same();
    my $sep = '';
    if ( !$diff->Items(2) ) {
        printf "%d,%dd%d\n", $diff->Get(qw( Min1 Max1 Max2 ));
    }
    elsif ( !$diff->Items(1) ) {
        printf "%da%d,%d\n", $diff->Get(qw( Max1 Min2 Max2 ));
    }
    else {
        $sep = "---\n";
        printf "%d,%dc%d,%d\n", $diff->Get(qw( Min1 Max1 Min2 Max2 ));
    }
    # split() removed the newlines, so each item needs its own "\n".
    print "< $_\n" for $diff->Items(1);
    print $sep;
    print "> $_\n" for $diff->Items(2);
}

## this is the diffutils code:

# Write both token streams to scratch files, then let diff(1) compare them.
for my $tmp_spec ( [ "/tmp/diff_774885959483_1", $filestring_1 ],
                   [ "/tmp/diff_774885959483_2", $filestring_2 ] ) {
    my ($tmp_path, $content) = @{$tmp_spec};
    open(my $tmp_fh, '>', $tmp_path)
        or die "unable to open temporary file $tmp_path: $!\n";
    print {$tmp_fh} $content;
    # Buffered write errors only show up at close time.
    close($tmp_fh)
        or die "unable to write temporary file $tmp_path: $!\n";
}

print "$source_path_1 ::: $source_path_2\n";
print `diff --suppress-common-lines -y /tmp/diff_774885959483_1 /tmp/diff_774885959483_2`;

####

## framework for comparing every pair of files found in two directories:

#!/usr/bin/perl
# Benchmark harness for two directory trees.
# Usage: script DIR1 DIR2
# Collects the *.htm* files below each directory, pairs them by position
# in the sorted lists, and runs the pasted snippet on every pair, 10 times.
# The elapsed wall-clock time is reported on STDERR.

use strict;
use warnings;
use lib "/Users/allasso/AWS/utility/cpan/lib/perl5/site_perl";
require Algorithm::Diff;
use File::Find;
use Time::HiRes qw( time );

my ($source_path_1, $source_path_2) = @ARGV;
die "usage: $0 DIR1 DIR2\n"
    unless defined $source_path_1 && defined $source_path_2;

# Drop trailing slashes.
$source_path_1 =~ s@\x2f*$@@;
$source_path_2 =~ s@\x2f*$@@;

my @src_list_1 = find_html_files($source_path_1);
my @src_list_2 = find_html_files($source_path_2);

# Files are paired by index, which only makes sense for equal-length lists.
die "file counts differ: " . scalar(@src_list_1) . " under $source_path_1, "
    . scalar(@src_list_2) . " under $source_path_2\n"
    if @src_list_1 != @src_list_2;

my $time = time();

for my $count (0..9) {

    for my $pair_idx (0 .. $#src_list_1) {

        my $file_src_1 = $src_list_1[$pair_idx];
        my $file_src_2 = $src_list_2[$pair_idx];

        my $filestring_1 = slurp_file($file_src_1, 'source file 1');
        my $filestring_2 = slurp_file($file_src_2, 'source file 2');

        # One token per line, so the comparison is word-by-word.
        $filestring_1 =~ s@\s+@\n@g;
        $filestring_2 =~ s@\s+@\n@g;

        ### DIFF ALGORITHM HERE..

    }
}

# Elapsed seconds, properly rounded to two decimals.
printf STDERR "\n\net: %.2f\n", time() - $time;

exit;

# Return the sorted paths of all plain files below $root whose name
# contains ".htm". Sorting gives both trees the same relative order.
sub find_html_files {
    my ($root) = @_;
    my @found;
    find(
        sub { push @found, $File::Find::name if /\.htm/ && -f $_ },
        $root
    );
    my @sorted = sort @found;
    return @sorted;
}

# Return the whole content of $path as one string; dies if it cannot be opened.
sub slurp_file {
    my ($path, $label) = @_;
    open(my $fh, '<', $path)
        or die "unable to open $label: $path: $!\n";
    my $content = do { local $/; readline($fh) };
    close($fh);
    return $content;
}

####

## full recursive script using CPAN Algorithm::Diff :

#!/usr/bin/perl
# Usage: script DIR1 DIR2
# Compares every pair of *.htm* files from the two directory trees with
# Algorithm::Diff, 10 times over, printing the differences on STDOUT and
# the elapsed wall-clock time on STDERR.

use strict;
use warnings;
use lib "/Users/allasso/AWS/utility/cpan/lib/perl5/site_perl";
require Algorithm::Diff;
use File::Find;
use Time::HiRes qw( time );

my ($source_path_1, $source_path_2) = @ARGV;
die "usage: $0 DIR1 DIR2\n"
    unless defined $source_path_1 && defined $source_path_2;

# Drop trailing slashes.
$source_path_1 =~ s@\x2f*$@@;
$source_path_2 =~ s@\x2f*$@@;

my @src_list_1 = find_html_files($source_path_1);
my @src_list_2 = find_html_files($source_path_2);

# Files are paired by index, which only makes sense for equal-length lists.
die "file counts differ: " . scalar(@src_list_1) . " under $source_path_1, "
    . scalar(@src_list_2) . " under $source_path_2\n"
    if @src_list_1 != @src_list_2;

my $time = time();

for my $count (0..9) {

    for my $pair_idx (0 .. $#src_list_1) {

        my $file_src_1 = $src_list_1[$pair_idx];
        my $file_src_2 = $src_list_2[$pair_idx];

        my $filestring_1 = slurp_file($file_src_1, 'source file 1');
        my $filestring_2 = slurp_file($file_src_2, 'source file 2');

        # One token per line, so the comparison is word-by-word.
        $filestring_1 =~ s@\s+@\n@g;
        $filestring_2 =~ s@\s+@\n@g;

        ## begin CPAN algorithm:

        my @seq1 = split(/\n/, $filestring_1);
        my @seq2 = split(/\n/, $filestring_2);

        my $diff = Algorithm::Diff->new( \@seq1, \@seq2 );

        $diff->Base( 1 );   # Return line numbers, not indices
        while ( $diff->Next() ) {
            next if $diff->Same();
            my $sep = '';
            if ( !$diff->Items(2) ) {
                printf "%d,%dd%d\n", $diff->Get(qw( Min1 Max1 Max2 ));
            }
            elsif ( !$diff->Items(1) ) {
                printf "%da%d,%d\n", $diff->Get(qw( Max1 Min2 Max2 ));
            }
            else {
                $sep = "---\n";
                printf "%d,%dc%d,%d\n", $diff->Get(qw( Min1 Max1 Min2 Max2 ));
            }
            # split() removed the newlines, so each item needs its own "\n".
            print "< $_\n" for $diff->Items(1);
            print $sep;
            print "> $_\n" for $diff->Items(2);
        }

        ## end CPAN algorithm

    }
}

# Elapsed seconds, properly rounded to two decimals.
printf STDERR "\n\net: %.2f\n", time() - $time;

exit;

# Return the sorted paths of all plain files below $root whose name
# contains ".htm". Sorting gives both trees the same relative order.
sub find_html_files {
    my ($root) = @_;
    my @found;
    find(
        sub { push @found, $File::Find::name if /\.htm/ && -f $_ },
        $root
    );
    my @sorted = sort @found;
    return @sorted;
}

# Return the whole content of $path as one string; dies if it cannot be opened.
sub slurp_file {
    my ($path, $label) = @_;
    open(my $fh, '<', $path)
        or die "unable to open $label: $path: $!\n";
    my $content = do { local $/; readline($fh) };
    close($fh);
    return $content;
}

## full recursive script using diffutils :

#!/usr/bin/perl
# Usage: script DIR1 DIR2
# Compares every pair of *.htm* files from the two directory trees with
# diff(1), 10 times over, printing the differences on STDOUT and the
# elapsed wall-clock time on STDERR.

use strict;
use warnings;
use lib "/Users/allasso/AWS/utility/cpan/lib/perl5/site_perl";
require Algorithm::Diff;   # not used here; kept so both scripts load the same modules
use File::Find;
use Time::HiRes qw( time );

my ($source_path_1, $source_path_2) = @ARGV;
die "usage: $0 DIR1 DIR2\n"
    unless defined $source_path_1 && defined $source_path_2;

# Drop trailing slashes.
$source_path_1 =~ s@\x2f*$@@;
$source_path_2 =~ s@\x2f*$@@;

my @src_list_1 = find_html_files($source_path_1);
my @src_list_2 = find_html_files($source_path_2);

# Files are paired by index, which only makes sense for equal-length lists.
die "file counts differ: " . scalar(@src_list_1) . " under $source_path_1, "
    . scalar(@src_list_2) . " under $source_path_2\n"
    if @src_list_1 != @src_list_2;

my $time = time();

for my $count (0..9) {

    for my $pair_idx (0 .. $#src_list_1) {

        my $file_src_1 = $src_list_1[$pair_idx];
        my $file_src_2 = $src_list_2[$pair_idx];

        my $filestring_1 = slurp_file($file_src_1, 'source file 1');
        my $filestring_2 = slurp_file($file_src_2, 'source file 2');

        # One token per line, so the comparison is word-by-word.
        $filestring_1 =~ s@\s+@\n@g;
        $filestring_2 =~ s@\s+@\n@g;

        ## begin diffutils algorithm:

        # Write both token streams to scratch files for diff(1).
        for my $tmp_spec ( [ "/tmp/diff_774885959483_1", $filestring_1 ],
                           [ "/tmp/diff_774885959483_2", $filestring_2 ] ) {
            my ($tmp_path, $content) = @{$tmp_spec};
            open(my $tmp_fh, '>', $tmp_path)
                or die "unable to open temporary file $tmp_path: $!\n";
            print {$tmp_fh} $content;
            # Buffered write errors only show up at close time.
            close($tmp_fh)
                or die "unable to write temporary file $tmp_path: $!\n";
        }

        #print "$file_src_1 ::: $file_src_2\n";
        print `diff --suppress-common-lines -y /tmp/diff_774885959483_1 /tmp/diff_774885959483_2`;

        ## end diffutils algorithm

    }
}

# Elapsed seconds, properly rounded to two decimals.
printf STDERR "\n\net: %.2f\n", time() - $time;

exit;

# Return the sorted paths of all plain files below $root whose name
# contains ".htm". Sorting gives both trees the same relative order.
sub find_html_files {
    my ($root) = @_;
    my @found;
    find(
        sub { push @found, $File::Find::name if /\.htm/ && -f $_ },
        $root
    );
    my @sorted = sort @found;
    return @sorted;
}

# Return the whole content of $path as one string; dies if it cannot be opened.
sub slurp_file {
    my ($path, $label) = @_;
    open(my $fh, '<', $path)
        or die "unable to open $label: $path: $!\n";
    my $content = do { local $/; readline($fh) };
    close($fh);
    return $content;
}