in reply to Counting overlapping dimers for multiple sequences
Assuming that you are storing your sequences in files with meaningful names, you could do something like this. Given these two example sequence files:
knoppix@Microknoppix:~/perl/Monks$ head -99 spw961600_seq* ==> spw961600_seqA <== TCCAGATCCCTGGGGCCCCTGGGTGAGGGCAGCCAGACGCAACGTCTGGAGGAAGCT ==> spw961600_seqB <== CTGCGTTTCGACGCCATGGCTGAGCTGGAGACGGTCCTGCCCATGCTGCTC knoppix@Microknoppix:~/perl/Monks$
this script
use strict;
use warnings;

use Data::Dumper;

# Count overlapping dimers (two-character windows) in every sequence file
# matching the glob below.  The result is a hash keyed by file name, plus a
# "totals" entry that sums the counts over all files; each value is a hash
# mapping a dimer to the number of times it occurs.

my @seq_files = glob q{spw961600_seq*};

my %counts;

foreach my $seq_file ( @seq_files ) {
    my $dimer_count = count_dimers( slurp_file( $seq_file ) );

    # Files that yield no dimers create no entry at all, so the dump only
    # lists files that actually contributed counts.
    foreach my $dimer ( keys %{ $dimer_count } ) {
        $counts{ totals }->{ $dimer }    += $dimer_count->{ $dimer };
        $counts{ $seq_file }->{ $dimer } += $dimer_count->{ $dimer };
    }
}

# Dump() is the documented entry point; it selects the XS or pure-Perl
# implementation itself, whereas Dumpxs() is only defined when the XS
# part of Data::Dumper could be loaded.
print Data::Dumper->Dump( [ \ %counts ], [ qw{ *counts } ] );

# slurp_file( $path )
#   Returns the entire content of $path as a single string.
#   Dies with an "open: < PATH: REASON" message if the file cannot be opened.
sub slurp_file {
    my ( $path ) = @_;

    open my $fh, q{<}, $path
        or die qq{open: < $path: $!\n};

    local $/;                   # slurp mode: read the whole file in one go
    my $content = <$fh>;

    return $content;            # $fh is closed when it goes out of scope
}

# count_dimers( $seq )
#   Returns a hash reference mapping each overlapping dimer found in $seq to
#   its number of occurrences.  All whitespace is removed first, so a
#   trailing newline or CR/LF never forms part of a dimer and a sequence
#   wrapped over several lines is counted as one continuous sequence
#   (this assumes one sequence per input string).
#   An undefined, empty or single-character sequence yields an empty hash.
sub count_dimers {
    my ( $seq ) = @_;

    my %count_of;
    return \%count_of unless defined $seq;

    ( my $residues = $seq ) =~ s{\s+}{}g;

    # The look-ahead consumes nothing, so the match position advances one
    # character at a time and every overlapping pair is captured.
    while ( $residues =~ m{(?=(..))}g ) {
        $count_of{ $1 } ++;
    }

    return \%count_of;
}
builds this data structure
%counts = ( 'spw961600_seqB' => { 'AC' => 2, 'AG' => 2, 'CC' => 4, 'TG' => 7, 'AT' => 2, 'TC' => 3, 'GA' => 4, 'TT' => 2, 'CT' => 6, 'GG' => 3, 'CG' => 4, 'CA' => 2, 'GC' => 7, 'GT' => 2 }, 'spw961600_seqA' => { 'AC' => 2, 'AG' => 6, 'CC' => 7, 'TG' => 4, 'AT' => 1, 'TC' => 3, 'AA' => 2, 'GA' => 5, 'CT' => 4, 'CG' => 2, 'GG' => 9, 'GC' => 5, 'CA' => 4, 'GT' => 2 }, 'totals' => { 'AC' => 4, 'AG' => 8, 'CC' => 11, 'TG' => 11, 'AT' => 3, 'TC' => 6, 'AA' => 2, 'GA' => 9, 'TT' => 2, 'CT' => 10, 'CG' => 6, 'GG' => 12, 'GC' => 12, 'CA' => 6, 'GT' => 4 } );
I hope I have understood your question correctly and that this will help you move forward.
Cheers,
JohnGG
|
|---|