Re^3: Data munging

Replies are listed 'Best First'.
Re^4: Data munging by umasuresh (Hermit) on Jan 22, 2010 at 20:41 UTC
copy & paste errors. Here is the code with use strict and all variables initialized and tested. use strict; use warnings; use Data::Dumper; # if (@ARGV < 1) # { # print "Usage: $0 inputDir \n"; #e.g ./ # exit; # } my $file1 = "ref.txt"; my $file2 = "query.txt"; # my($key1, $key2, $hash1, $hash2); open (IN1,'<'.$file1) \|\| die "*can't open the file $!\n"; my @lines1 = <IN1>; close IN1; #$i=0; for (@lines1) { chomp; my @a1 = split(/\t/, $_); my $key1 = $a1[0]; my $rs = $a1[1]; my $re = $a1[2]; #push(@{ $hash1->{$key1} } , "$rs\t$re" ); push(@{ $hash1->{$key1} } , $_ ); } open (IN2,'<'.$file2) \|\| die "can't open the file $!\n"; my @lines2 = <IN2>; close IN1; for (@lines2) { chomp; my @a2 = split(/\t/, $_); my $qs = $a2[1]; my $qe = $a2[2]; my $key2 = $a2[0]; #push(@{ $hash2->{$key2} } , "$qs\t$qe"); push(@{ $hash2->{$key2} } , $_ ); } #print Dumper(\%$hash2); my @common_keys = grep { exists $hash1->{$_} } sort keys %$hash2; my %seen; for (sort @common_keys) { for my $r (0..$#{ $hash1->{$_} }) { for my $q (0..$#{ $hash2->{$_} }) { #print "@{ $hash1->{$_} }[$i]\t@{ $hash2->{$_} }[$i]\t"; + my ($query_key, $query_start, $query_end, @qtail) = split( +/\t/, $hash2->{$_}[$q]); my ($ref_key, $ref_start, $ref_end, @rtail) = split(/\t/, +$hash1->{$_}[$r]); if( ($query_start >= $ref_start && $query_start < += $ref_end) \|\| ($query_end >= $ref_start && $query_end <= $r +ef_end) \|\| ($ref_start >= $query_start && $ref_start <= +$query_end) \|\| ($ref_end >= $query_start && $ref_end <= $que +ry_end) ) { $seen{$_}{$query_start}++; } } } } #print Dumper(\%seen); my $overlap_count; for my $key (sort keys %$hash2) { for my $i (0..$#{ $hash2->{$key} } ) { my @s = split(/\t/, $hash2->{$key}[$i]); my @head = @s[0..2]; my @tail = @s[3..$#s]; #print "$tail[0]\n"; my $start = $s[1]; # print "*$start\n"; if( exists $seen{$key}{$start} ) { $overlap_count = $seen{$key}{$start}; #print "$hash2->{$key}[$i]\t$overlap_count\n" print map {"$_\t"} insert_field(\@head, \$overlap_count, \ +@tail); print "\n" } else { $overlap_count = 0; #print "$hash2->{$key}[$i]\t$overlap_count\n"; print map {"$_\t"} insert_field(\@head, \$overlap_count, \ +@tail); print "\n" } #print "\n"; } } sub insert_field { my ($head, $insert, $tail) = @_; my @line; push(@line, @$head, $$insert, @$tail); return @line; } [download]	[reply] [d/l]
Re^5: Data munging by BrowserUk (Patriarch) on Jan 23, 2010 at 01:50 UTC
Hm. This is what I get from your latest code: C:\test>junk95 . Use of uninitialized value $start in exists at C:\test\junk95.pl line +95, <IN2> line 8. Use of uninitialized value $_ in concatenation (.) or string at C:\tes +t\junk95.pl line 108, <IN2> line 8. Use of uninitialized value $_ in concatenation (.) or string at C:\tes +t\junk95.pl line 108, <IN2> line 8. c1 100 12000 + AT 0 Use of uninitialized value $start in exists at C:\test\junk95.pl line +95, <IN2> line 8. Use of uninitialized value $_ in concatenation (.) or string at C:\tes +t\junk95.pl line 108, <IN2> line 8. Use of uninitialized value $_ in concatenation (.) or string at C:\tes +t\junk95.pl line 108, <IN2> line 8. c1 19800 20000 - AG 0 Use of uninitialized value $start in exists at C:\test\junk95.pl line +95, <IN2> line 8. Use of uninitialized value $_ in concatenation (.) or string at C:\tes +t\junk95.pl line 108, <IN2> line 8. Use of uninitialized value $_ in concatenation (.) or string at C:\tes +t\junk95.pl line 108, <IN2> line 8. c1 20049 20800 - GC 0 Use of uninitialized value $start in exists at C:\test\junk95.pl line +95, <IN2> line 8. Use of uninitialized value $_ in concatenation (.) or string at C:\tes +t\junk95.pl line 108, <IN2> line 8. Use of uninitialized value $_ in concatenation (.) or string at C:\tes +t\junk95.pl line 108, <IN2> line 8. c10 10080 10000 - TT 0 Use of uninitialized value $start in exists at C:\test\junk95.pl line +95, <IN2> line 8. Use of uninitialized value $_ in concatenation (.) or string at C:\tes +t\junk95.pl line 108, <IN2> line 8. Use of uninitialized value $_ in concatenation (.) or string at C:\tes +t\junk95.pl line 108, <IN2> line 8. c11 10078 14008 - TG 0 Use of uninitialized value $start in exists at C:\test\junk95.pl line +95, <IN2> line 8. Use of uninitialized value $_ in concatenation (.) or string at C:\tes +t\junk95.pl line 108, <IN2> line 8. Use of uninitialized value $_ in concatenation (.) or string at C:\tes +t\junk95.pl line 108, <IN2> line 8. c15 10078 14008 - TC 0 Use of uninitialized value $start in exists at C:\test\junk95.pl line +95, <IN2> line 8. Use of uninitialized value $_ in concatenation (.) or string at C:\tes +t\junk95.pl line 108, <IN2> line 8. Use of uninitialized value $_ in concatenation (.) or string at C:\tes +t\junk95.pl line 108, <IN2> line 8. c9 10078 14008 - AG 0 Use of uninitialized value $start in exists at C:\test\junk95.pl line +95, <IN2> line 8. Use of uninitialized value $_ in concatenation (.) or string at C:\tes +t\junk95.pl line 108, <IN2> line 8. Use of uninitialized value $_ in concatenation (.) or string at C:\tes +t\junk95.pl line 108, <IN2> line 8. c9 1078 10008 - TA 0 [download] Since I've already written it: #! perl -slw use strict; my %ref; open REF, '<', 'ref.txt' or die $!; while( <REF> ) { chomp; my @cols = split ' '; push @{ $ref{ $cols[0] } }, [ @cols[ 1, 2 ] ]; } close REF; open QUERY, '<', 'query.txt' or die $!; while( <QUERY> ) { chomp; my @cols = split ' '; if( exists $ref{ $cols[ 0 ] } ) { for my $ref ( @{ $ref{ $cols[ 0 ] } } ) { my( $sRef, $eRef ) = @$ref; next if $cols[ 1 ] > $eRef or $cols[ 2 ] < $sRef; my $so = $sRef > $cols[ 1 ] ? $sRef : $cols[ 1 ]; my $eo = $eRef < $cols[ 2 ] ? $eRef : $cols[ 2 ]; print join "\t", @cols[ 0 .. 2 ], ( $eo - $so + 1 ), @cols +[ 3 .. $#cols ]; } } } close QUERY; __END__ C:\test>819005 c1 100 12000 15 + AT c1 100 12000 171 + AT c1 100 12000 11901 + AT c1 100 12000 1 + AT c1 19800 20000 9 - AG c9 10078 14008 9 - AG c9 1078 10008 123 - TA [download] Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error. "Science is about questioning the status quo. Questioning authority". In the absence of evidence, opinion is indistinguishable from prejudice. "I'd rather go naked than blow up my ass"	[reply] [d/l] [select]
Re^6: Data munging by umasuresh (Hermit) on Jan 25, 2010 at 18:46 UTC
BrowserUk, Thank you for trying and posting your version. I have been trying my code at my end and it has been working. I couldn't figure out why you were getting the errors that you listed in your post until now. I copied the ref.txt and query.txt from my earlier post and then tried my code and I got the same error as you. I realized that while formatting the code all tabs in the ref.txt and query.txt got replaced by spaces in the input text files while my local copy was still tab delimited. Once I replaced the tabs i.e. making the ref.txt and query.txt as tab delimited files, my code worked. May be I didn't explain the desired output properly. 1. match keys in ref and text, using the start and end, keep count of the number of overlapping fragments between the two. 2. If there is overlap, foreach key foreach start in the query, report the number of overlapping fragments with the reference. 3. If there are no overlaps insert a 0 in the desired column. Below is the desired output: `c1 100 12000 4 + AT c1 19800 20000 1 - AG c1 20049 20800 0 - GC c10 10080 10000 0 - TT c11 10078 14008 0 - TG c15 10078 14008 0 - TC c9 10078 14008 1 - AG c9 1078 10008 1 - TA` [download]	[reply] [d/l]
Re^7: Data munging by BrowserUk (Patriarch) on Jan 25, 2010 at 23:19 UTC
Re^4: Data munging by umasuresh (Hermit) on Jan 22, 2010 at 19:49 UTC
Hi BroswerUk, I am not trying to fool anyone to solve my problem! There has been a copy & paste error. I had commented use strict; at the beginning because it was giving me errors and that part was not copied in the code block that I posted! Initially I used %hash1 and %hash2, but then I changed them to references. Apparently I had missed the last block of code in the copy & paste. Here is the updated version with all variables initialized and tested. use strict; use warnings; use Data::Dumper; # if (@ARGV < 1) # { # print "Usage: $0 inputDir \n"; #e.g ./ # exit; # } my $file1 = "ref.txt"; my $file2 = "query.txt"; # my($key1, $key2, $hash1, $hash2); open (IN1,'<'.$file1) \|\| die "*can't open the file $!\n"; my @lines1 = <IN1>; close IN1; #$i=0; for (@lines1) { chomp; my @a1 = split(/\t/, $_); my $key1 = $a1[0]; my $rs = $a1[1]; my $re = $a1[2]; #push(@{ $hash1->{$key1} } , "$rs\t$re" ); push(@{ $hash1->{$key1} } , $_ ); } open (IN2,'<'.$file2) \|\| die "can't open the file $!\n"; my @lines2 = <IN2>; close IN1; for (@lines2) { chomp; my @a2 = split(/\t/, $_); my $qs = $a2[1]; my $qe = $a2[2]; my $key2 = $a2[0]; #push(@{ $hash2->{$key2} } , "$qs\t$qe"); push(@{ $hash2->{$key2} } , $_ ); } #print Dumper(\%$hash2); my @common_keys = grep { exists $hash1->{$_} } sort keys %$hash2; my %seen; for (sort @common_keys) { for my $r (0..$#{ $hash1->{$_} }) { for my $q (0..$#{ $hash2->{$_} }) { #print "@{ $hash1->{$_} }[$i]\t@{ $hash2->{$_} }[$i]\t"; + my ($query_key, $query_start, $query_end, @qtail) = split( +/\t/, $hash2->{$_}[$q]); my ($ref_key, $ref_start, $ref_end, @rtail) = split(/\t/, +$hash1->{$_}[$r]); if( ($query_start >= $ref_start && $query_start < += $ref_end) \|\| ($query_end >= $ref_start && $query_end <= $r +ef_end) \|\| ($ref_start >= $query_start && $ref_start <= +$query_end) \|\| ($ref_end >= $query_start && $ref_end <= $que +ry_end) ) { $seen{$_}{$query_start}++; } } } } #print Dumper(\%seen); my $overlap_count; for my $key (sort keys %$hash2) { for my $i (0..$#{ $hash2->{$key} } ) { my @s = split(/\t/, $hash2->{$key}[$i]); my @head = @s[0..2]; my @tail = @s[3..$#s]; #print "$tail[0]\n"; my $start = $s[1]; # print "*$start\n"; if( exists $seen{$key}{$start} ) { $overlap_count = $seen{$key}{$start}; #print "$hash2->{$key}[$i]\t$overlap_count\n" print map {"$_\t"} insert_field(\@head, \$overlap_count, \ +@tail); print "\n" } else { $overlap_count = 0; #print "$hash2->{$key}[$i]\t$overlap_count\n"; print map {"$_\t"} insert_field(\@head, \$overlap_count, \ +@tail); print "\n" } #print "\n"; } } sub insert_field { my ($head, $insert, $tail) = @_; my @line; push(@line, @$head, $$insert, @$tail); return @line; } [download] Your comment about me fooling is unacceptable to me. Please turn off the use strict at the beginning and check if you get an answer! Everyone can make mistakes when copying and pasting even if we check before pasting!	[reply] [d/l]


use strict;
use warnings;
use Data::Dumper;

# if (@ARGV < 1)
# {
    # print "Usage: $0 inputDir \n";
    #e.g ./
    # exit;
# }
my $file1 = "ref.txt";
my $file2 = "query.txt";
#

my($key1, $key2, $hash1, $hash2);

open (IN1,'<'.$file1) || die "***can't open the file $!\n";


my @lines1 = <IN1>;
close IN1;
#$i=0;
for (@lines1)
{
    chomp;
    
    my @a1 = split(/\t/, $_);
    
    my $key1 = $a1[0];
    my $rs = $a1[1];
    my $re = $a1[2];
    
    #push(@{ $hash1->{$key1} } , "$rs\t$re" );
    push(@{ $hash1->{$key1} } , $_ );

}


open (IN2,'<'.$file2) || die "***can't open the file $!\n";
my @lines2 = <IN2>;
close IN1;

for (@lines2)
{
    chomp;
    
    my @a2 = split(/\t/, $_);
    
    my $qs = $a2[1];
    my $qe = $a2[2];
    my $key2 = $a2[0];
    #push(@{ $hash2->{$key2} } , "$qs\t$qe");
    push(@{ $hash2->{$key2} } , $_ );

}

#print Dumper(\%$hash2);

my @common_keys = grep { exists $hash1->{$_} } sort keys %$hash2;

my %seen;
for (sort @common_keys)
{
    for my $r (0..$#{ $hash1->{$_} })
    {
        for my $q (0..$#{ $hash2->{$_} })
        {
            #print "@{ $hash1->{$_} }[$i]\t@{ $hash2->{$_} }[$i]\t";  
+      
            my ($query_key, $query_start, $query_end, @qtail) = split(
+/\t/, $hash2->{$_}[$q]);
            my ($ref_key, $ref_start, $ref_end, @rtail) = split(/\t/, 
+$hash1->{$_}[$r]);
            if(          ($query_start >= $ref_start && $query_start <
+= $ref_end) || 
                         ($query_end >= $ref_start && $query_end <= $r
+ef_end) || 
                         ($ref_start >= $query_start && $ref_start <= 
+$query_end) ||
                         ($ref_end >= $query_start && $ref_end <= $que
+ry_end) )  
            {
                $seen{$_}{$query_start}++;
            }
        }   
    }
    
}
#print Dumper(\%seen);
my $overlap_count;
for my $key (sort keys %$hash2)
{    
    for my $i (0..$#{ $hash2->{$key} } )
    {
        my @s = split(/\t/, $hash2->{$key}[$i]);
        my @head = @s[0..2];
        my @tail = @s[3..$#s];
        #print "***$tail[0]\n";
        my $start = $s[1];
        
        # print "***$start\n";
        if( exists $seen{$key}{$start} )
        {
            $overlap_count = $seen{$key}{$start};
            #print "$hash2->{$key}[$i]\t$overlap_count\n"
            
            print map {"$_\t"} insert_field(\@head, \$overlap_count, \
+@tail); 
            print "\n"
        }
        else
        {
            
            $overlap_count = 0;
            #print "$hash2->{$key}[$i]\t$overlap_count\n";
            print map {"$_\t"} insert_field(\@head, \$overlap_count, \
+@tail); 
            print "\n"
        }
        #print "\n";
    }
}





sub insert_field
{
    my ($head, $insert, $tail) = @_;
    my @line;
    push(@line, @$head, $$insert, @$tail);
    
    return @line;
    
}
[download]

[reply]
[d/l]

Hm. This is what I get from your latest code:

C:\test>junk95 .
Use of uninitialized value $start in exists at C:\test\junk95.pl line 
+95, <IN2> line 8.
Use of uninitialized value $_ in concatenation (.) or string at C:\tes
+t\junk95.pl line 108, <IN2> line 8.
Use of uninitialized value $_ in concatenation (.) or string at C:\tes
+t\junk95.pl line 108, <IN2> line 8.
c1    100    12000    +    AT                   0
Use of uninitialized value $start in exists at C:\test\junk95.pl line 
+95, <IN2> line 8.
Use of uninitialized value $_ in concatenation (.) or string at C:\tes
+t\junk95.pl line 108, <IN2> line 8.
Use of uninitialized value $_ in concatenation (.) or string at C:\tes
+t\junk95.pl line 108, <IN2> line 8.
c1    19800    20000    -    AG                 0
Use of uninitialized value $start in exists at C:\test\junk95.pl line 
+95, <IN2> line 8.
Use of uninitialized value $_ in concatenation (.) or string at C:\tes
+t\junk95.pl line 108, <IN2> line 8.
Use of uninitialized value $_ in concatenation (.) or string at C:\tes
+t\junk95.pl line 108, <IN2> line 8.
c1    20049    20800    -    GC                 0
Use of uninitialized value $start in exists at C:\test\junk95.pl line 
+95, <IN2> line 8.
Use of uninitialized value $_ in concatenation (.) or string at C:\tes
+t\junk95.pl line 108, <IN2> line 8.
Use of uninitialized value $_ in concatenation (.) or string at C:\tes
+t\junk95.pl line 108, <IN2> line 8.
c10    10080    10000    -    TT                        0
Use of uninitialized value $start in exists at C:\test\junk95.pl line 
+95, <IN2> line 8.
Use of uninitialized value $_ in concatenation (.) or string at C:\tes
+t\junk95.pl line 108, <IN2> line 8.
Use of uninitialized value $_ in concatenation (.) or string at C:\tes
+t\junk95.pl line 108, <IN2> line 8.
c11    10078    14008    -    TG                        0
Use of uninitialized value $start in exists at C:\test\junk95.pl line 
+95, <IN2> line 8.
Use of uninitialized value $_ in concatenation (.) or string at C:\tes
+t\junk95.pl line 108, <IN2> line 8.
Use of uninitialized value $_ in concatenation (.) or string at C:\tes
+t\junk95.pl line 108, <IN2> line 8.
c15    10078    14008    -    TC                        0
Use of uninitialized value $start in exists at C:\test\junk95.pl line 
+95, <IN2> line 8.
Use of uninitialized value $_ in concatenation (.) or string at C:\tes
+t\junk95.pl line 108, <IN2> line 8.
Use of uninitialized value $_ in concatenation (.) or string at C:\tes
+t\junk95.pl line 108, <IN2> line 8.
c9    10078    14008    -    AG                 0
Use of uninitialized value $start in exists at C:\test\junk95.pl line 
+95, <IN2> line 8.
Use of uninitialized value $_ in concatenation (.) or string at C:\tes
+t\junk95.pl line 108, <IN2> line 8.
Use of uninitialized value $_ in concatenation (.) or string at C:\tes
+t\junk95.pl line 108, <IN2> line 8.
c9    1078    10008    -    TA                  0
[download]

Since I've already written it:

#! perl -slw
use strict;

my %ref;
open REF, '<', 'ref.txt' or die $!;
while( <REF> ) {
    chomp;
    my @cols = split ' ';
    push @{ $ref{ $cols[0] } }, [ @cols[ 1, 2 ] ];
}
close REF;

open QUERY, '<', 'query.txt' or die $!;
while( <QUERY> ) {
    chomp;
    my @cols = split ' ';
    if( exists $ref{ $cols[ 0 ] } ) {
        for my $ref ( @{ $ref{ $cols[ 0 ] } } ) {
            my( $sRef, $eRef ) = @$ref;
            next if $cols[ 1 ] > $eRef or $cols[ 2 ] < $sRef;
            my $so = $sRef > $cols[ 1 ] ? $sRef : $cols[ 1 ];
            my $eo = $eRef < $cols[ 2 ] ? $eRef : $cols[ 2 ];
            print join "\t", @cols[ 0 .. 2 ], ( $eo - $so + 1 ), @cols
+[ 3 .. $#cols ];
        }
    }
}
close QUERY;
__END__
C:\test>819005
c1      100     12000   15      +       AT
c1      100     12000   171     +       AT
c1      100     12000   11901   +       AT
c1      100     12000   1       +       AT
c1      19800   20000   9       -       AG
c9      10078   14008   9       -       AG
c9      1078    10008   123     -       TA
[download]

Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.

"Science is about questioning the status quo. Questioning authority".

In the absence of evidence, opinion is indistinguishable from prejudice.

"I'd rather go naked than blow up my ass"

[reply]
[d/l]
[select]

c1    100    12000    4    +    AT
c1    19800    20000    1    -    AG
c1    20049    20800    0    -    GC
c10    10080    10000    0    -    TT
c11    10078    14008    0    -    TG
c15    10078    14008    0    -    TC
c9    10078    14008    1    -    AG
c9    1078    10008    1    -    TA
[download]

[reply]
[d/l]


use strict;
use warnings;
use Data::Dumper;

# if (@ARGV < 1)
# {
    # print "Usage: $0 inputDir \n";
    #e.g ./
    # exit;
# }
my $file1 = "ref.txt";
my $file2 = "query.txt";
#

my($key1, $key2, $hash1, $hash2);

open (IN1,'<'.$file1) || die "***can't open the file $!\n";


my @lines1 = <IN1>;
close IN1;
#$i=0;
for (@lines1)
{
    chomp;
    
    my @a1 = split(/\t/, $_);
    
    my $key1 = $a1[0];
    my $rs = $a1[1];
    my $re = $a1[2];
    
    #push(@{ $hash1->{$key1} } , "$rs\t$re" );
    push(@{ $hash1->{$key1} } , $_ );

}


open (IN2,'<'.$file2) || die "***can't open the file $!\n";
my @lines2 = <IN2>;
close IN1;

for (@lines2)
{
    chomp;
    
    my @a2 = split(/\t/, $_);
    
    my $qs = $a2[1];
    my $qe = $a2[2];
    my $key2 = $a2[0];
    #push(@{ $hash2->{$key2} } , "$qs\t$qe");
    push(@{ $hash2->{$key2} } , $_ );

}

#print Dumper(\%$hash2);

my @common_keys = grep { exists $hash1->{$_} } sort keys %$hash2;

my %seen;
for (sort @common_keys)
{
    for my $r (0..$#{ $hash1->{$_} })
    {
        for my $q (0..$#{ $hash2->{$_} })
        {
            #print "@{ $hash1->{$_} }[$i]\t@{ $hash2->{$_} }[$i]\t";  
+      
            my ($query_key, $query_start, $query_end, @qtail) = split(
+/\t/, $hash2->{$_}[$q]);
            my ($ref_key, $ref_start, $ref_end, @rtail) = split(/\t/, 
+$hash1->{$_}[$r]);
            if(          ($query_start >= $ref_start && $query_start <
+= $ref_end) || 
                         ($query_end >= $ref_start && $query_end <= $r
+ef_end) || 
                         ($ref_start >= $query_start && $ref_start <= 
+$query_end) ||
                         ($ref_end >= $query_start && $ref_end <= $que
+ry_end) )  
            {
                $seen{$_}{$query_start}++;
            }
        }   
    }
    
}
#print Dumper(\%seen);
my $overlap_count;
for my $key (sort keys %$hash2)
{    
    for my $i (0..$#{ $hash2->{$key} } )
    {
        my @s = split(/\t/, $hash2->{$key}[$i]);
        my @head = @s[0..2];
        my @tail = @s[3..$#s];
        #print "***$tail[0]\n";
        my $start = $s[1];
        
        # print "***$start\n";
        if( exists $seen{$key}{$start} )
        {
            $overlap_count = $seen{$key}{$start};
            #print "$hash2->{$key}[$i]\t$overlap_count\n"
            
            print map {"$_\t"} insert_field(\@head, \$overlap_count, \
+@tail); 
            print "\n"
        }
        else
        {
            
            $overlap_count = 0;
            #print "$hash2->{$key}[$i]\t$overlap_count\n";
            print map {"$_\t"} insert_field(\@head, \$overlap_count, \
+@tail); 
            print "\n"
        }
        #print "\n";
    }
}





sub insert_field
{
    my ($head, $insert, $tail) = @_;
    my @line;
    push(@line, @$head, $$insert, @$tail);
    
    return @line;
    
}
[download]

[reply]
[d/l]