comment on

Dear Monks,

I am becoming a bit more daring in trying to understand and use perl. Now I wrote the script below, which does excatly what i want but at an alarmingly slow pace. Is there any way you think I could speed it up as well as learn, so I could do this again in the future. My script compares entities in lines between 2 files

#!/usr/bin/perl 

use Bio::Perl;
use IO::String;
use Bio::SeqIO;
use List::Util 'max';
use Text::CSV;
use Array::Utils qw(:all);

if (scalar(@ARGV) != 2) {
    
    print "\n";
    print "Usage: compare.pl <master_file> <query_file>\n" ;
    print "The master file is the annotated one\n";
    print "The query file is the non-annoted subset file\n";
    print "\n"; exit();
}

my ($file1,$file2) = @ARGV; #read in orthomcl results, gene-number and
+ taxa-number
open(INFILE, $file1);

##remove/overwrite on all similarly named output files
my $remove1 = "compared_annotations.txt";
if (unlink($remove1) == 1) {
                        print "Existing \"$remove1\" file was removed\
+n";
                          }

#Create an output file
my $outputfile = "compared_annotations.txt";
           if (! open(POS, ">>$outputfile") ) {
            print "Cannot open file \"$outputfile\" to write to!!\n\n"
+;
                exit;
        } 

# For each line in the input file (i.e. each ortholog group)...
while (my $line = <INFILE>) {
    chomp;

# First, get the cluster number.
    our ($cluster, $other) = split(/\s/, $line, 2);
    
   #print "$cluster : ************************************************
+*********************************************************************
+**************\n";
   #print "$other\n";
   
#declare variables   
    my $a;
    my $b;
    my $c;
    my $d;
    my $e;
    my $f;
    my $g;
    my $i;
    our @a = (); #make @a public
    my @c = ();
        
##remove white spaces in the data! (very annoying)
    $a = $other; 
    $a =~ s/[\t ]+/ /g;
    $b = $a;
    $b =~ s/^ //mg;
    $c = $b;
    $c =~ s/ $//mg;
    
    chomp $c; # remove trailing white space in $c ('ugly stuff')

#now break the entries into pieces and store them in an array!    
    $d = $c;
    
#remove comma's

    $d =~ s/,3/  3/g;
    $d =~ s/\),/)  /g;
     
    #@a = split(/(\))\s/, $d);
    #@a = split(/\)\s([^\)])/, $d);
    #@a = split(/(\))/, $d);

    $d =~ s/\)\s/)>/g;
    
    $e = $d;
    
    $e =~ s/\(unknown\)//g; #remove the unknown brackets from the draf
+t genomes 
    
    $f = $e;

    #push(@a, $f);
    
    @a = split(/>/, $f); # split the second half of the genome info in
+ a line into an array 
    
    #print "@a***\n\n\n";
    
    #foreach $g(@a){
        #print "$g~~\n";   #check out the split bits
    #print "$g\n";
    #}
    
       my $non_ommitted_pattern =~ /(\W+).+/;
    foreach (@a) {our @match = grep {$_ == $non_ommitted_pattern} @a;
    #print "@match\n\n\n\n";
    }
    
    my ($w1, @w1) = read_query_file($file2);  #pretty important - ie a
+llows printing in the sub

}
    
###################################################
sub read_query_file
### Opens and reads file data into an array ### 
    {
    my ($filename) = @_;    
    unless (open(FILEDATA, $filename))
        {print "\nCannot open file \"$filename\".\n";
        exit;
        }
    while (my $line2 = <FILEDATA>){
    chomp;
    # First, get the subset cluster number.
    our ($cluster1, $other1) = split(/\s/, $line2, 2);
    
    chomp $other1;
    
    #print "$cluster1  ###############################################
+#####################################################################
+#####\n";
    #print "$other1\n";
    
   my @A2 = split(/\s/, $other1);
    chomp @A2;
   
   #foreach my $A2(@A2){
    #print "$A2\n";
   #}
           
#now do the matching
        
    foreach $i(@a){
    foreach my $i2(@A2){
        if($i =~ m/$i2/){
        #print POS "$cluster $cluster1 ## $i  $i2 @match\n\n";
        print POS "$cluster,, $cluster1,, @match,,\n";
        next;
        }
        
        else{next;}
        }
    }
    }
    
           next;
        #return;
     #   }
    }
###################################################        
1;
[download]

In reply to file comparisons: - making it faster by $new_guy

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.