$new_guy has asked for the wisdom of the Perl Monks concerning the following question:
Dear Monks,
I am becoming a bit more daring in trying to understand and use perl. Now I wrote the script below, which does excatly what i want but at an alarmingly slow pace. Is there any way you think I could speed it up as well as learn, so I could do this again in the future. My script compares entities in lines between 2 files
#!/usr/bin/perl use Bio::Perl; use IO::String; use Bio::SeqIO; use List::Util 'max'; use Text::CSV; use Array::Utils qw(:all); if (scalar(@ARGV) != 2) { print "\n"; print "Usage: compare.pl <master_file> <query_file>\n" ; print "The master file is the annotated one\n"; print "The query file is the non-annoted subset file\n"; print "\n"; exit(); } my ($file1,$file2) = @ARGV; #read in orthomcl results, gene-number and + taxa-number open(INFILE, $file1); ##remove/overwrite on all similarly named output files my $remove1 = "compared_annotations.txt"; if (unlink($remove1) == 1) { print "Existing \"$remove1\" file was removed\ +n"; } #Create an output file my $outputfile = "compared_annotations.txt"; if (! open(POS, ">>$outputfile") ) { print "Cannot open file \"$outputfile\" to write to!!\n\n" +; exit; } # For each line in the input file (i.e. each ortholog group)... while (my $line = <INFILE>) { chomp; # First, get the cluster number. our ($cluster, $other) = split(/\s/, $line, 2); #print "$cluster : ************************************************ +********************************************************************* +**************\n"; #print "$other\n"; #declare variables my $a; my $b; my $c; my $d; my $e; my $f; my $g; my $i; our @a = (); #make @a public my @c = (); ##remove white spaces in the data! (very annoying) $a = $other; $a =~ s/[\t ]+/ /g; $b = $a; $b =~ s/^ //mg; $c = $b; $c =~ s/ $//mg; chomp $c; # remove trailing white space in $c ('ugly stuff') #now break the entries into pieces and store them in an array! $d = $c; #remove comma's $d =~ s/,3/ 3/g; $d =~ s/\),/) /g; #@a = split(/(\))\s/, $d); #@a = split(/\)\s([^\)])/, $d); #@a = split(/(\))/, $d); $d =~ s/\)\s/)>/g; $e = $d; $e =~ s/\(unknown\)//g; #remove the unknown brackets from the draf +t genomes $f = $e; #push(@a, $f); @a = split(/>/, $f); # split the second half of the genome info in + a line into an array #print "@a***\n\n\n"; #foreach $g(@a){ #print "$g~~\n"; #check out the split bits #print "$g\n"; #} my $non_ommitted_pattern =~ /(\W+).+/; foreach (@a) {our @match = grep {$_ == $non_ommitted_pattern} @a; #print "@match\n\n\n\n"; } my ($w1, @w1) = read_query_file($file2); #pretty important - ie a +llows printing in the sub } ################################################### sub read_query_file ### Opens and reads file data into an array ### { my ($filename) = @_; unless (open(FILEDATA, $filename)) {print "\nCannot open file \"$filename\".\n"; exit; } while (my $line2 = <FILEDATA>){ chomp; # First, get the subset cluster number. our ($cluster1, $other1) = split(/\s/, $line2, 2); chomp $other1; #print "$cluster1 ############################################### +##################################################################### +#####\n"; #print "$other1\n"; my @A2 = split(/\s/, $other1); chomp @A2; #foreach my $A2(@A2){ #print "$A2\n"; #} #now do the matching foreach $i(@a){ foreach my $i2(@A2){ if($i =~ m/$i2/){ #print POS "$cluster $cluster1 ## $i $i2 @match\n\n"; print POS "$cluster,, $cluster1,, @match,,\n"; next; } else{next;} } } } next; #return; # } } ################################################### 1;
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: file comparisons: - making it faster
by moritz (Cardinal) on Jun 20, 2011 at 14:05 UTC | |
|
Re: file comparisons: - making it faster
by roboticus (Chancellor) on Jun 20, 2011 at 14:59 UTC | |
|
Re: file comparisons: - making it faster
by Marshall (Canon) on Jun 20, 2011 at 15:14 UTC | |
|
Re: file comparisons: - making it faster
by rev_1318 (Chaplain) on Jun 20, 2011 at 14:42 UTC |