chrM:307 0 AGCGGGGA 129 chrM:307 0 AGCGGGGA 130 chrM:307 0 AGCGGGGA 129 chrM:308 0 AGCGGGGA 129 chrM:308 0 AGCGGGGA 130 chrM:308 0 AGCGGGGA 129 chrM:309 0 AGCGGGGA 129 chrM:309 0 AGCGGGGA 130 chrM:309 0 AGCGGGGA 129 chrM:307 0 TCAAAATG 130 chrM:308 0 TCAAAATG 130 chrM:309 0 TCAAAATG 130 chrM:307 0 TCACGGTG 130 chrM:308 0 TCACGGTG 130 chrM:309 0 TCACGGTG 130 chrM:307 0 TCAGCCTG 129 chrM:308 0 TCAGCCTG 129 chrM:309 0 TCAGCCTG 129 chrM:307 0 TCAGGGAG 130 chrM:308 0 TCAGGGAG 130 chrM:309 0 TCAGGGAG 130 chrM:307 1 TCAGGGTG 106 chrM:307 2 TCAGGGTG 130 chrM:307 2 TCAGGGTG 129 chrM:308 1 TCAGGGTG 106 chrM:308 2 TCAGGGTG 130 chrM:308 2 TCAGGGTG 129 chrM:309 1 TCAGGGTG 106 chrM:309 2 TCAGGGTG 130 chrM:309 2 TCAGGGTG 129 #### chrM:307 0 AGCGGGGA 130 #-->Line1 chrM:307 0 TCAAAATG 130 #-->Line2 chrM:307 0 TCACGGTG 130 #-->Line3 chrM:307 0 TCAGGGAG 130 #-->Line4 chrM:307 2 TCAGGGTG 130 #-->Line5 chrM:307 0 AGCGGGGA 129 #-->Line6 chrM:307 0 AGCGGGGA 129 #-->Line7 chrM:307 0 TCAGCCTG 129 #-->Line8 chrM:307 2 TCAGGGTG 129 #-->Line9 chrM:307 1 TCAGGGTG 106 #-->Line10 #### CLUSTER chrM:307 0 AGCGGGGA 130 #--> Select, first line sorted by col3, col2, col4(rev sort) # Line1 chrM:307 0 AGCGGGGA 129 #--> Select line with similar UMI (NO mismatch) # Line6 chrM:307 0 AGCGGGGA 129 #--> Select line with similar UMI (NO mismatch) # Line7 CLUSTER chrM:307 0 TCAAAATG 130 #--> Now selected, Second line sorted by col3, col2, col4(rev sort) #Line2 # NO similar UMI, 1 line cluster CLUSTER chrM:307 0 TCACGGTG 130 #--> Now selected, Third line sorted by col3, col2, col4(rev sort) # Line3 chrM:307 0 TCAGGGAG 130 #--> Select line with similar UMI (Two mismatch with line3 UMI) # Line4 chrM:307 2 TCAGGGTG 130 #--> Select line with similar UMI (One mismatch with line3 UMI) # Line5 chrM:307 2 TCAGGGTG 129 #--> Select line with similar UMI (One mismatch with line3 UMI) # Line9 chrM:307 1 TCAGGGTG 106 #--> Select line with similar UMI (One mismatch with line3 UMI) # Line10 CLUSTER chrM:307 0 TCAGCCTG 129 
#--> Now selected, line#8 sorted by col3, col2, col4(rev sort) #Line8, make single line cluster CLUSTER ....and so on for next chr positions... #### CLUSTER chrM:307 1 TCAGGGTG 106 chrM:307 2 TCAGGGTG 130 chrM:307 2 TCAGGGTG 129 chrM:307 0 TCAGGGAG 130 chrM:307 0 TCAGCCTG 129 chrM:307 0 TCACGGTG 130 CLUSTER chrM:307 0 TCAAAATG 130 CLUSTER chrM:307 0 AGCGGGGA 129 chrM:307 0 AGCGGGGA 130 chrM:307 0 AGCGGGGA 129 CLUSTER chrM:308 1 TCAGGGTG 106 chrM:308 2 TCAGGGTG 130 chrM:308 2 TCAGGGTG 129 chrM:308 0 TCAGGGAG 130 chrM:308 0 TCAGCCTG 129 chrM:308 0 TCACGGTG 130 CLUSTER chrM:308 0 TCAAAATG 130 CLUSTER chrM:308 0 AGCGGGGA 129 chrM:308 0 AGCGGGGA 130 chrM:308 0 AGCGGGGA 129 CLUSTER chrM:309 1 TCAGGGTG 106 chrM:309 2 TCAGGGTG 130 chrM:309 2 TCAGGGTG 129 chrM:309 0 TCAGGGAG 130 chrM:309 0 TCAGCCTG 129 chrM:309 0 TCACGGTG 130 CLUSTER chrM:309 0 TCAAAATG 130 CLUSTER chrM:309 0 AGCGGGGA 129 chrM:309 0 AGCGGGGA 130 chrM:309 0 AGCGGGGA 129 CLUSTER ####
#!/usr/bin/env perl
#
# Group tab-separated records into clusters of similar UMIs, one chromosome
# position at a time.
#
# Usage: perl <this script> <input_file> <sample_id>
#
# Input : one record per line with at least four tab-separated columns:
#         position, column 2 (numeric), UMI, column 4 (numeric).
# Output: "<sample_id>.cluster_ChrPos.txt" and the same text on STDOUT.
#         Every cluster is preceded by a "DELIMIT" line and the report ends
#         with one more "DELIMIT" line.
#
# Clustering rule (taken from the worked example that accompanies this
# script): within one position the records are ordered by column 4
# descending, then UMI, then column 2. The first record that is not yet in
# a cluster becomes the seed, and every remaining record whose UMI differs
# from the seed's UMI in at most MAX_MISMATCHES places joins its cluster.
# This repeats until no record is left.
#
# NOTE(review): the text in front of the "#!" line is sample data, not Perl.
# Either move it to a separate file or run the script with `perl -x`.

use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

# The original pattern builder wildcarded two UMI positions at a time, which
# is where the tolerance of two mismatches comes from.
use constant MAX_MISMATCHES => 2;

# Run only when executed as a script so the subs can be loaded on their own.
main(@ARGV) unless caller;

# Read the input file, cluster it and write the report to file and STDOUT.
#   $InSam    - path of the tab-separated input file
#   $SampleID - prefix of the output file name
# Dies when an argument is missing or a file cannot be opened/written.
sub main {
    my ($InSam, $SampleID) = @_;
    die "Usage: $0 <input_file> <sample_id>\n"
        unless defined $InSam && defined $SampleID;

    open my $in_fh, '<', $InSam or die "Can't open $InSam: $!\n";
    my @records = read_records($in_fh);
    close $in_fh;

    my $report = format_clusters(cluster_records(@records));

    my $out_path = $SampleID . '.cluster_ChrPos.txt';
    open my $out_fh, '>', $out_path
        or die "Cannot create file for output: $!";
    print {$out_fh} $report;
    # Buffered write errors only surface when the handle is closed.
    close $out_fh or die "Cannot write $out_path: $!\n";

    print $report;
    return;
}

# Read every line from $fh and return the list of parsed records.
# Blank lines are ignored; unparsable lines are reported and skipped.
sub read_records {
    my ($fh) = @_;

    my @records;
    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;
        next if $line =~ /\A\s*\z/;

        my $record = parse_record($line);
        if (!defined $record) {
            warn "Skipping malformed line $.: $line\n";
            next;
        }
        push @records, $record;
    }
    return @records;
}

# Split one input line into a record hash with the keys
# line (unchanged text), pos, col2, umi and col4.
# Returns nothing when the line has fewer than four columns, an empty UMI,
# or a non-numeric column 2 / column 4.
sub parse_record {
    my ($line) = @_;

    my ($pos, $col2, $umi, $col4) = split /\t/, $line;
    return unless defined $col4 && length $umi;
    return unless looks_like_number($col2) && looks_like_number($col4);

    return {
        line => $line,
        pos  => $pos,
        col2 => $col2,
        umi  => $umi,
        col4 => $col4,
    };
}

# True when both UMIs have the same length and differ in at most
# MAX_MISMATCHES character positions.
sub umis_similar {
    my ($left, $right) = @_;
    return 0 if length $left != length $right;

    # String XOR yields a NUL byte wherever the two strings agree, so the
    # number of non-NUL bytes is the number of mismatching positions.
    my $difference = "$left" ^ "$right";
    my $mismatches = ($difference =~ tr/\0//c);
    return $mismatches <= MAX_MISMATCHES ? 1 : 0;
}

# Cluster the given records and return a list of clusters; each cluster is
# an array reference whose first element is the seed record.
# Positions are processed in the order in which they first appear.
sub cluster_records {
    my @records = @_;

    my @positions;
    my %records_at;
    for my $record (@records) {
        my $pos = $record->{pos};
        push @positions, $pos unless exists $records_at{$pos};
        push @{ $records_at{$pos} }, $record;
    }

    my @clusters;
    for my $pos (@positions) {
        my @pending = sort {
                   $b->{col4} <=> $a->{col4}
                or $a->{umi}  cmp $b->{umi}
                or $a->{col2} <=> $b->{col2}
        } @{ $records_at{$pos} };

        while (@pending) {
            my $seed = shift @pending;
            my (@members, @leftover);
            for my $candidate (@pending) {
                if (umis_similar($seed->{umi}, $candidate->{umi})) {
                    push @members, $candidate;
                }
                else {
                    push @leftover, $candidate;
                }
            }
            push @clusters, [ $seed, @members ];
            @pending = @leftover;
        }
    }
    return @clusters;
}

# Render clusters as text: "DELIMIT" before every cluster, one record per
# line, and a closing "DELIMIT". An empty cluster list gives "DELIMIT" only.
sub format_clusters {
    my @clusters = @_;

    my $report = '';
    for my $cluster (@clusters) {
        $report .= "DELIMIT\n";
        $report .= "$_->{line}\n" for @{$cluster};
    }
    return $report . "DELIMIT\n";
}