in reply to Re: Data munging
in thread Data munging
1. match the common keys with multiple values representing fragments of each key. 2. for each key I compare the start and end columns of each fragment (column1 & column2). 3. for each key if the fragments overlap between the query and reference keep count of the overlaps. 4. report the query sequence and insert the overlap count in column3, retaining all the other columns.
Note: The reference and query files are really large running >= 300,000 lines. This post is really long, so please bear with me! My questions are below: 1. I feel that the code can be much simpler than what I have! 2. Again memory issue is a concern for large files! Here are the reference and query text files:------ref.txt--------- c1 120 134 - AG c2 120 134 + TC c1 130 300 - AA c1 12 13000 - AU c9 14000 14008 - GN c9 900 1200 - GX c10 10040 10050 + GG c1 19992 20005 - GG c1 12000 14000 + TT -----query.txt-------- c1 100 12000 + AT c1 19800 20000 - AG c1 20049 20800 - GC c9 10078 14008 - AG c11 10078 14008 - TG c15 10078 14008 - TC c9 1078 10008 - TA c10 10080 10000 - TT ------code------------- use strict; use warnings; use Data::Dumper; # if (@ARGV < 1) # { # print "Usage: $0 inputDir \n"; #e.g ./ # exit; # } my $file1 = "ref.txt"; my $file2 = "query.txt"; # my($key1, $key2, %hash1, %hash2); open (IN1,'<'.$file1) || die "***can't open the file $!\n"; my @lines1 = <IN1>; close IN1; #$i=0; for (@lines1) { chomp; my @a1 = split(/\t/, $_); my $key1 = $a1[0]; my $rs = $a1[1]; my $re = $a1[2]; #push(@{ $hash1->{$key1} } , "$rs\t$re" ); push(@{ $hash1->{$key1} } , $_ ); } open (IN2,'<'.$file2) || die "***can't open the file $!\n"; my @lines2 = <IN2>; close IN1; for (@lines2) { chomp; my @a2 = split(/\t/, $_); my $qs = $a2[1]; my $qe = $a2[2]; my $key2 = $a2[0]; #push(@{ $hash2->{$key2} } , "$qs\t$qe"); push(@{ $hash2->{$key2} } , $_ ); } #print Dumper(\%$hash2); @common_keys = grep { exists $hash1->{$_} } sort keys %$hash2; my %seen; for (sort @common_keys) { for my $r (0..$#{ $hash1->{$_} }) { for my $q (0..$#{ $hash2->{$_} }) { my ($query_key, $query_start, $query_end, @qtail) = split(/\t/, $hash2->{$_}[$q]); my ($ref_key, $ref_start, $ref_end, @rtail) = split(/\t/, +$hash1->{$_}[$r]); if( ($query_start >= $ref_start && $query_start < += $ref_end) || ($query_end >= $ref_start && $query_end <= $r +ef_end) || ($ref_start >= $query_start && $ref_start <= +$query_end) || ($ref_end >= $query_start && $ref_end <= $que +ry_end) ) { $seen{$_}{$query_start}++; } } } } #print Dumper(\%seen); my $overlap_count; for my $key (sort keys %$hash2) { for my $i (0..$#{ $hash2->{$key} } ) { my @s = split(/\t/, $hash2->{$key}[$i]); my @head = @s[0..2]; my @tail = @s[3..$#s]; #print "***$tail[0]\n"; my $start = $s[1]; # print "***$start\n"; if( exists $seen{$key}{$start} ) { $overlap_count = $seen{$key}{$start}; #print "$hash2->{$key}[$i]\t$overlap_count\n" print map {"$_\t"} insert_field(\@head, \$overlap_count, \ +@tail); print "\n" } else { $overlap_count = 0; #print "$hash2->{$key}[$i]\t$overlap_count\n"; print map {"$_\t"} insert_field(\@head, \$overlap_count, \ +@tail); print "\n" } #print "\n"; } }
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^3: Data munging
by BrowserUk (Patriarch) on Jan 22, 2010 at 18:19 UTC | |
by umasuresh (Hermit) on Jan 22, 2010 at 20:41 UTC | |
by BrowserUk (Patriarch) on Jan 23, 2010 at 01:50 UTC | |
by umasuresh (Hermit) on Jan 25, 2010 at 18:46 UTC | |
by BrowserUk (Patriarch) on Jan 25, 2010 at 23:19 UTC | |
by umasuresh (Hermit) on Jan 22, 2010 at 19:49 UTC |