chrM:307 0 AGCGGGGA 129
chrM:307 0 AGCGGGGA 130
chrM:307 0 AGCGGGGA 129
chrM:308 0 AGCGGGGA 129
chrM:308 0 AGCGGGGA 130
chrM:308 0 AGCGGGGA 129
chrM:309 0 AGCGGGGA 129
chrM:309 0 AGCGGGGA 130
chrM:309 0 AGCGGGGA 129
chrM:307 0 TCAAAATG 130
chrM:308 0 TCAAAATG 130
chrM:309 0 TCAAAATG 130
chrM:307 0 TCACGGTG 130
chrM:308 0 TCACGGTG 130
chrM:309 0 TCACGGTG 130
chrM:307 0 TCAGCCTG 129
chrM:308 0 TCAGCCTG 129
chrM:309 0 TCAGCCTG 129
chrM:307 0 TCAGGGAG 130
chrM:308 0 TCAGGGAG 130
chrM:309 0 TCAGGGAG 130
chrM:307 1 TCAGGGTG 106
chrM:307 2 TCAGGGTG 130
chrM:307 2 TCAGGGTG 129
chrM:308 1 TCAGGGTG 106
chrM:308 2 TCAGGGTG 130
chrM:308 2 TCAGGGTG 129
chrM:309 1 TCAGGGTG 106
chrM:309 2 TCAGGGTG 130
chrM:309 2 TCAGGGTG 129
####
chrM:307 0 AGCGGGGA 130 #-->Line1
chrM:307 0 TCAAAATG 130 #-->Line2
chrM:307 0 TCACGGTG 130 #-->Line3
chrM:307 0 TCAGGGAG 130 #-->Line4
chrM:307 2 TCAGGGTG 130 #-->Line5
chrM:307 0 AGCGGGGA 129 #-->Line6
chrM:307 0 AGCGGGGA 129 #-->Line7
chrM:307 0 TCAGCCTG 129 #-->Line8
chrM:307 2 TCAGGGTG 129 #-->Line9
chrM:307 1 TCAGGGTG 106 #-->Line10
####
CLUSTER
chrM:307 0 AGCGGGGA 130 #--> Select, first line sorted by col3, col2, col4(rev sort) # Line1
chrM:307 0 AGCGGGGA 129 #--> Select line with similar UMI (NO mismatch) # Line6
chrM:307 0 AGCGGGGA 129 #--> Select line with similar UMI (NO mismatch) # Line7
CLUSTER
chrM:307 0 TCAAAATG 130 #--> Next seed: second line in the order sorted by col3, col2, col4(rev sort) # Line2 # no similar UMI, so this is a one-line cluster
CLUSTER
chrM:307 0 TCACGGTG 130 #--> Now selected, Third line sorted by col3, col2, col4(rev sort) # Line3
chrM:307 0 TCAGGGAG 130 #--> Select line with similar UMI (two mismatches with the Line3 UMI) # Line4
chrM:307 2 TCAGGGTG 130 #--> Select line with similar UMI (One mismatch with line3 UMI) # Line5
chrM:307 2 TCAGGGTG 129 #--> Select line with similar UMI (One mismatch with line3 UMI) # Line9
chrM:307 1 TCAGGGTG 106 #--> Select line with similar UMI (One mismatch with line3 UMI) # Line10
CLUSTER
chrM:307 0 TCAGCCTG 129 #--> Next seed: Line8 in the order sorted by col3, col2, col4(rev sort) # Line8, which becomes a single-line cluster
CLUSTER
... and so on for the remaining chr positions ...
####
CLUSTER
chrM:307 1 TCAGGGTG 106
chrM:307 2 TCAGGGTG 130
chrM:307 2 TCAGGGTG 129
chrM:307 0 TCAGGGAG 130
chrM:307 0 TCAGCCTG 129
chrM:307 0 TCACGGTG 130
CLUSTER
chrM:307 0 TCAAAATG 130
CLUSTER
chrM:307 0 AGCGGGGA 129
chrM:307 0 AGCGGGGA 130
chrM:307 0 AGCGGGGA 129
CLUSTER
chrM:308 1 TCAGGGTG 106
chrM:308 2 TCAGGGTG 130
chrM:308 2 TCAGGGTG 129
chrM:308 0 TCAGGGAG 130
chrM:308 0 TCAGCCTG 129
chrM:308 0 TCACGGTG 130
CLUSTER
chrM:308 0 TCAAAATG 130
CLUSTER
chrM:308 0 AGCGGGGA 129
chrM:308 0 AGCGGGGA 130
chrM:308 0 AGCGGGGA 129
CLUSTER
chrM:309 1 TCAGGGTG 106
chrM:309 2 TCAGGGTG 130
chrM:309 2 TCAGGGTG 129
chrM:309 0 TCAGGGAG 130
chrM:309 0 TCAGCCTG 129
chrM:309 0 TCACGGTG 130
CLUSTER
chrM:309 0 TCAAAATG 130
CLUSTER
chrM:309 0 AGCGGGGA 129
chrM:309 0 AGCGGGGA 130
chrM:309 0 AGCGGGGA 129
CLUSTER
####
#!/usr/bin/env perl
# Cluster records by chromosome position and UMI similarity.
#
# Usage: perl <this_script> <input_file> <sample_id>
#
# Input:  one record per line, whitespace-separated (tabs or spaces).
#         Column 1 is the position key (e.g. chrM:307) and column 3 is the
#         UMI; the remaining columns are carried through untouched.
#
# Method: records are grouped by column 1, groups keeping the order in which
#         their key first appears.  Inside a group the first record that is
#         not yet clustered becomes the seed of a new cluster, and every
#         later unclustered record whose UMI has the same length as the
#         seed's and differs from it in at most MAX_UMI_MISMATCHES positions
#         joins that cluster.  Records are never re-sorted here, so the
#         choice of seeds follows the input order; pre-sort the input to
#         control it.
#
# Output: written to "<sample_id>.cluster_ChrPos.txt" and echoed to STDOUT.
#         Every cluster is preceded by a "DELIMIT" line and a final "DELIMIT"
#         line closes the report.  Records are emitted without any padding.
use strict;
use warnings;

# Largest number of differing UMI positions still treated as "similar".
use constant MAX_UMI_MISMATCHES => 2;

# Run only when executed as a script, so the subs below can also be loaded
# with require/do without touching @ARGV or the filesystem.
main(@ARGV) unless caller;

# Entry point: read $in_path, cluster it, write and echo the report.
# Dies when an argument is missing or a file cannot be opened or written.
sub main {
    my ($in_path, $sample_id) = @_;
    die "Usage: $0 <input_file> <sample_id>\n"
        unless defined $in_path && defined $sample_id;

    # Three-argument open: the file name can never be read as an open mode.
    open my $in, '<', $in_path or die "Can't open $in_path: $!\n";
    my ($order, $records_for) = read_groups($in);
    close $in;

    my @clusters = map { cluster_records($records_for->{$_}) } @{$order};
    my $report   = format_clusters(@clusters);

    my $out_path = $sample_id . '.cluster_ChrPos.txt';
    open my $out, '>', $out_path or die "Cannot create file for output: $!";
    print {$out} $report;
    # Buffered write errors only surface when the handle is closed.
    close $out or die "Cannot finish writing $out_path: $!";

    print $report;
    return;
}

# Read every record from $fh and bucket it by position key.
# Returns (\@keys_in_first_seen_order, \%records_for_key); each record is a
# hash with the original line (line ending removed) and its UMI.
# Blank lines are ignored; lines with fewer than three columns are reported
# on STDERR and skipped because they carry no UMI.
sub read_groups {
    my ($fh) = @_;
    my @order;
    my %records_for;

    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;
        next if $line !~ /\S/;

        my @fields = split ' ', $line;
        if (@fields < 3) {
            warn "Skipping malformed line $.: $line\n";
            next;
        }

        my ($chr_pos, $umi) = @fields[0, 2];
        push @order, $chr_pos unless exists $records_for{$chr_pos};
        push @{ $records_for{$chr_pos} }, { line => $line, umi => $umi };
    }

    return (\@order, \%records_for);
}

# Number of positions at which two UMIs differ.
# Returns undef (empty list in list context) when the lengths differ, since
# a position-by-position comparison is not defined in that case.
sub umi_mismatches {
    my ($left, $right) = @_;
    return if length($left) != length($right);

    # String XOR yields a NUL byte wherever both UMIs agree; count the rest.
    my $diff = $left ^ $right;
    return ($diff =~ tr/\0//c);
}

# Split the records of one position (array ref, input order) into clusters.
# Returns a list of array refs, each holding the lines of one cluster with
# the seed first and the members in input order.
sub cluster_records {
    my ($records) = @_;
    my @pending = @{$records};
    my @clusters;

    while (@pending) {
        my $seed    = shift @pending;
        my @members = ($seed->{line});
        my @leftover;

        for my $record (@pending) {
            my $distance = umi_mismatches($seed->{umi}, $record->{umi});
            if (defined $distance && $distance <= MAX_UMI_MISMATCHES) {
                push @members, $record->{line};
            }
            else {
                push @leftover, $record;
            }
        }

        # Only records that did not join this seed can seed or join later clusters.
        @pending = @leftover;
        push @clusters, \@members;
    }

    return @clusters;
}

# Render clusters as text: "DELIMIT" before each cluster, one record per
# line, and a closing "DELIMIT" line (also the whole report for no input).
sub format_clusters {
    my (@clusters) = @_;
    my $text = '';

    for my $cluster (@clusters) {
        $text .= join "\n", 'DELIMIT', @{$cluster}, '';
    }

    return $text . "DELIMIT\n";
}

# True value so the file can be loaded with require.
1;