in reply to Re^2: mismatching characters in dna sequence
in thread mismatching characters in dna sequence

Character-by-character processing of strings is the single biggest weakness in Perl's arsenal. As you've already used PDL, you probably have a compiler, so dropping into Inline::C should not be a problem for you.

The two routines below differ in the way they return their results. dnacmp() returns a list of strings like this: "123:A:C", whereas dnacmp2() concatenates all of those into a single string for return:

#! perl -slw
use strict;

# Two C routines (compiled through Inline::C) that walk a pair of strings in
# step and report every position at which the characters differ.
#
#   dnacmp( a, b )  - returns a list with one "pos:charA:charB" string per
#                     mismatch.
#   dnacmp2( a, b ) - returns one string holding the same entries, each
#                     followed by a single space.
#
# Positions are zero-based (the counter starts at 0). Both loops stop at the
# first NUL of either argument, so only the common prefix length of the two
# strings is compared; any extra tail on the longer string is not reported.
use Inline C => Config => BUILD_NOISY => 1;
use Inline C => <<'END_C', NAME => 'dnacmp', CLEAN_AFTER_BUILD => 0;

void dnacmp( char *a, char *b ) {
    int n = 0;
    Inline_Stack_Vars;
    Inline_Stack_Reset;
    while( *a && *b ) {
        if( *a != *b ) {
            Inline_Stack_Push( sv_2mortal( newSVpvf( "%d:%c:%c", n, *a, *b ) ) );
        }
        a++; b++; n++;
    }
    Inline_Stack_Done;
    return;
}

SV *dnacmp2( char *a, char *b ) {
    SV *res = newSVpv( "", 0 );
    int n = 0;
    while( *a && *b ) {
        if( *a != *b ) sv_catpvf( res, "%d:%c:%c ", n, *a, *b );
        ++a, ++b, ++n;
    }
    return res;
}

END_C

use Data::Dump qw[ pp ];
use Time::HiRes qw[ time ];

# rndStr( $len, @alphabet ): build a string of $len characters, each picked
# at random from @alphabet.
sub rndStr {
    my( $len, @alphabet ) = @_;
    return join '', map { $alphabet[ rand @alphabet ] } 1 .. $len;
}

# Quick demonstration of both return styles.
print for dnacmp( 'ATTCCGGG', 'ATACCGC' );
print dnacmp2( 'ATTCGGG', 'ATACCGGC' );

# -N=... and -L=... on the command line (enabled by the -s switch on the
# shebang line) override these defaults: number of sequences and their length.
our $N //= 10e3;
our $L //= 8;

my @seqs;
$#seqs = $N-1;                      # pre-size the array
$seqs[ $_ ] = rndStr( $L, qw[ A C G T ] ) for 0 .. $#seqs;

# Compare every unordered pair once: N*(N-1)/2 calls to dnacmp2().
my $start = time;
for my $i ( 0 .. $#seqs ) {
    for my $j ( $i+1 .. $#seqs ) {
        print "@seqs[ $i, $j ] : ", dnacmp2( @seqs[ $i, $j ] );
    }
}
printf STDERR "Took %f seconds to cross compare $N sequences (%d comparisons)\n",
    time() - $start, ($N**2-$N)/2;

__END__
C:\test>dnacmp -N=142 -L=40 2:T:A 6:G:C 2:T:A 4:G:C Took 0.233414 seconds to cross compare 142 sequences (10011 comparison +s) C:\test>head out.dat 2:T:A 6:G:C 2:T:A 4:G:C CTTTACGGTCGCCAGTGCGCTAGCGGTGCTGGGTATGTCC TTTCGGTCAGGAATGGCGGCCAGCATATA +ATCGGAGCCGT : 0:C:T 3:T:C 4:A:G 5:C:G 6:G:T 7:G:C 8:T:A 9:C:G 11:C:A +12:C:A 13:A:T 15:T:G 16:G:C 17:C:G 20:T:C 24:G:A 25:G:T 26:T:A 27:G:T + 28:C:A 29:T:A 30:G:T 31:G:C 33:T:G 35:T:G 36:G:C 37:T:C 38:C:G 39:C: +T CTTTACGGTCGCCAGTGCGCTAGCGGTGCTGGGTATGTCC CCGGGAACTCATTGACTCATCTCTGCACG +CGGAAAGAGAC : 1:T:C 2:T:G 3:T:G 4:A:G 5:C:A 6:G:A 7:G:C 10:G:A 11:C:T + 12:C:T 13:A:G 14:G:A 15:T:C 16:G:T 18:G:A 19:C:T 20:T:C 21:A:T 22:G: +C 23:C:T 25:G:C 26:T:A 27:G:C 28:C:G 29:T:C 32:G:A 33:T:A 35:T:G 36:G +:A 37:T:G 38:C:A CTTTACGGTCGCCAGTGCGCTAGCGGTGCTGGGTATGTCC ACACCGTTACAGGAGCTTACGATAACGAC +CTGACCAATAC : 0:C:A 1:T:C 2:T:A 
3:T:C 4:A:C 5:C:G 6:G:T 7:G:T 8:T:A 1 +0:G:A 11:C:G 12:C:G 15:T:C 16:G:T 17:C:T 18:G:A 20:T:G 22:G:T 23:C:A +24:G:A 25:G:C 26:T:G 27:G:A 29:T:C 30:G:T 32:G:A 33:T:C 34:A:C 35:T:A + 36:G:A 38:C:A CTTTACGGTCGCCAGTGCGCTAGCGGTGCTGGGTATGTCC TTTATGCTGTAAACTAGACTAACTCCCTT +AGCTACTGACG : 0:C:T 3:T:A 4:A:T 5:C:G 6:G:C 7:G:T 8:T:G 9:C:T 10:G:A +11:C:A 12:C:A 13:A:C 14:G:T 15:T:A 17:C:A 18:G:C 19:C:T 20:T:A 22:G:C + 23:C:T 24:G:C 25:G:C 26:T:C 27:G:T 28:C:T 29:T:A 31:G:C 32:G:T 33:T: +A 34:A:C 37:T:A 39:C:G CTTTACGGTCGCCAGTGCGCTAGCGGTGCTGGGTATGTCC CTCGTTCGGTATATGGCTGACGGAATTCG +CTACGGGCACG : 2:T:C 3:T:G 4:A:T 5:C:T 6:G:C 8:T:G 9:C:T 10:G:A 11:C:T + 12:C:A 13:A:T 15:T:G 16:G:C 17:C:T 19:C:A 20:T:C 21:A:G 23:C:A 24:G: +A 25:G:T 27:G:C 28:C:G 29:T:C 30:G:T 31:G:A 32:G:C 33:T:G 34:A:G 35:T +:G 36:G:C 37:T:A 39:C:G CTTTACGGTCGCCAGTGCGCTAGCGGTGCTGGGTATGTCC AAAAGGTGCGGCACTAGCGTAAACTGCAC +GAGCGTACACC : 0:C:A 1:T:A 2:T:A 3:T:A 4:A:G 5:C:G 6:G:T 8:T:C 9:C:G 1 +2:C:A 13:A:C 14:G:T 15:T:A 19:C:T 20:T:A 22:G:A 24:G:T 26:T:C 27:G:A +29:T:G 30:G:A 32:G:C 33:T:G 34:A:T 35:T:A 36:G:C 37:T:A CTTTACGGTCGCCAGTGCGCTAGCGGTGCTGGGTATGTCC TCAAGGGCGGCGGTAAAACAATGATGTAC +ACCTCGTCGCG : 0:C:T 1:T:C 2:T:A 3:T:A 4:A:G 5:C:G 7:G:C 8:T:G 9:C:G 1 +0:G:C 11:C:G 12:C:G 13:A:T 14:G:A 15:T:A 16:G:A 17:C:A 18:G:C 19:C:A +20:T:A 21:A:T 23:C:A 24:G:T 27:G:A 29:T:A 30:G:C 31:G:C 32:G:T 33:T:C + 34:A:G 36:G:C 37:T:G 39:C:G C:\test>tail out.dat TACATGCCCTATCCAGGCAACGCATTAGGCACTGCGTGTA ACAATGCGACGGCTCCAGACTTGATACCT +GTATCAAACTA : 0:T:A 1:A:C 2:C:A 7:C:G 8:C:A 9:T:C 10:A:G 11:T:G 13:C: +T 14:A:C 15:G:C 16:G:A 17:C:G 19:A:C 20:C:T 21:G:T 22:C:G 25:T:A 26:A +:C 27:G:C 28:G:T 29:C:G 30:A:T 31:C:A 33:G:C 34:C:A 35:G:A 36:T:A 37: +G:C TACATGCCCTATCCAGGCAACGCATTAGGCACTGCGTGTA GGTGGAAACGGCGTGTGTAGAAATTTATT +CACGGTACTGT : 0:T:G 1:A:G 2:C:T 3:A:G 4:T:G 5:G:A 6:C:A 7:C:A 9:T:G 1 +0:A:G 11:T:C 12:C:G 13:C:T 14:A:G 15:G:T 17:C:T 19:A:G 20:C:A 21:G:A +22:C:A 23:A:T 27:G:T 28:G:T 32:T:G 34:C:T 35:G:A 36:T:C 37:G:T 38:T:G + 39:A:T 
TACATGCCCTATCCAGGCAACGCATTAGGCACTGCGTGTA GATAGTTCCAAGCCACCTCCCACCCTGCT +CAACATCGCTA : 0:T:G 2:C:T 4:T:G 5:G:T 6:C:T 9:T:A 11:T:G 15:G:C 16:G: +C 17:C:T 18:A:C 19:A:C 21:G:A 23:A:C 24:T:C 26:A:G 27:G:C 28:G:T 31:C +:A 32:T:C 33:G:A 34:C:T 35:G:C 36:T:G 37:G:C TACATGCCCTATCCAGGCAACGCATTAGGCACTGCGTGTA AAGGTTGGTACTTACTAACCGGTTTGAGC +CGTTCAACGAA : 0:T:A 2:C:G 3:A:G 5:G:T 6:C:G 7:C:G 8:C:T 9:T:A 10:A:C +12:C:T 13:C:A 14:A:C 15:G:T 16:G:A 17:C:A 18:A:C 19:A:C 20:C:G 22:C:T + 23:A:T 25:T:G 28:G:C 30:A:G 31:C:T 33:G:C 34:C:A 35:G:A 36:T:C 38:T: +A ACAATGCGACGGCTCCAGACTTGATACCTGTATCAAACTA GGTGGAAACGGCGTGTGTAGAAATTTATT +CACGGTACTGT : 0:A:G 1:C:G 2:A:T 3:A:G 4:T:G 5:G:A 6:C:A 7:G:A 8:A:C 9 +:C:G 11:G:C 12:C:G 14:C:G 15:C:T 16:A:G 17:G:T 19:C:G 20:T:A 21:T:A 2 +2:G:A 23:A:T 25:A:T 26:C:A 27:C:T 29:G:C 30:T:A 31:A:C 32:T:G 33:C:G +34:A:T 36:A:C 37:C:T 38:T:G 39:A:T ACAATGCGACGGCTCCAGACTTGATACCTGTATCAAACTA GATAGTTCCAAGCCACCTCCCACCCTGCT +CAACATCGCTA : 0:A:G 1:C:A 2:A:T 4:T:G 5:G:T 6:C:T 7:G:C 8:A:C 9:C:A 1 +0:G:A 13:T:C 14:C:A 16:A:C 17:G:T 18:A:C 20:T:C 21:T:A 22:G:C 23:A:C +24:T:C 25:A:T 26:C:G 29:G:C 30:T:A 32:T:C 33:C:A 34:A:T 35:A:C 36:A:G ACAATGCGACGGCTCCAGACTTGATACCTGTATCAAACTA AAGGTTGGTACTTACTAACCGGTTTGAGC +CGTTCAACGAA : 1:C:A 2:A:G 3:A:G 5:G:T 6:C:G 8:A:T 9:C:A 10:G:C 11:G:T + 12:C:T 13:T:A 15:C:T 17:G:A 18:A:C 20:T:G 21:T:G 22:G:T 23:A:T 25:A: +G 26:C:A 27:C:G 28:T:C 29:G:C 30:T:G 31:A:T 36:A:C 37:C:G 38:T:A GGTGGAAACGGCGTGTGTAGAAATTTATTCACGGTACTGT GATAGTTCCAAGCCACCTCCCACCCTGCT +CAACATCGCTA : 1:G:A 3:G:A 5:A:T 6:A:T 7:A:C 9:G:A 10:G:A 11:C:G 12:G: +C 13:T:C 14:G:A 15:T:C 16:G:C 18:A:C 19:G:C 20:A:C 22:A:C 23:T:C 24:T +:C 26:A:G 27:T:C 31:C:A 32:G:C 33:G:A 35:A:C 36:C:G 37:T:C 38:G:T 39: +T:A GGTGGAAACGGCGTGTGTAGAAATTTATTCACGGTACTGT AAGGTTGGTACTTACTAACCGGTTTGAGC +CGTTCAACGAA : 0:G:A 1:G:A 2:T:G 4:G:T 5:A:T 6:A:G 7:A:G 8:C:T 9:G:A 1 +0:G:C 11:C:T 12:G:T 13:T:A 14:G:C 16:G:A 17:T:A 18:A:C 19:G:C 20:A:G +21:A:G 22:A:T 25:T:G 27:T:G 28:T:C 30:A:G 31:C:T 32:G:T 33:G:C 34:T:A + 
37:T:G 38:G:A 39:T:A GATAGTTCCAAGCCACCTCCCACCCTGCTCAACATCGCTA AAGGTTGGTACTTACTAACCGGTTTGAGC +CGTTCAACGAA : 0:G:A 2:T:G 3:A:G 4:G:T 6:T:G 7:C:G 8:C:T 10:A:C 11:G:T + 12:C:T 13:C:A 14:A:C 15:C:T 16:C:A 17:T:A 20:C:G 21:A:G 22:C:T 23:C: +T 24:C:T 25:T:G 26:G:A 27:C:G 28:T:C 30:A:G 31:A:T 32:C:T 33:A:C 34:T +:A 35:C:A 36:G:C 37:C:G 38:T:A

With the rise and rise of 'Social' network sites: 'Computers are making people easier to use everyday'
Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
"Science is about questioning the status quo. Questioning authority".
In the absence of evidence, opinion is indistinguishable from prejudice.

The start of some sanity?

Replies are listed 'Best First'.
Re^4: mismatching characters in dna sequence
by prbndr (Acolyte) on Dec 30, 2011 at 20:31 UTC

    ok, i think inline::c is a little beyond me. i'm trying it out with the following code:

    # V5: Inline::C
    #!/opt/local/bin/perl
    # NOTE(review): a #! line is only treated as a shebang when it is the very
    # first line of the file; here it follows a comment line.
    use strict;

    # dnacmp( a, b )  - list of "pos:charA:charB" strings, one per mismatch.
    # dnacmp2( a, b ) - the same entries joined into one string, each followed
    #                   by a space.
    # Positions are zero-based; comparison stops at the end of the shorter string.
    use Inline C => Config => BUILD_NOISY => 1;
    use Inline C => <<'END_C', NAME => 'dnacmp', CLEAN_AFTER_BUILD => 0;

    void dnacmp( char *a, char *b ) {
        int n = 0;
        Inline_Stack_Vars;
        Inline_Stack_Reset;
        while( *a && *b ) {
            if( *a != *b ) {
                Inline_Stack_Push( sv_2mortal( newSVpvf( "%d:%c:%c", n, *a, *b ) ) );
            }
            a++; b++; n++;
        }
        Inline_Stack_Done;
        return;
    }

    SV *dnacmp2( char *a, char *b ) {
        SV *res = newSVpv( "", 0 );
        int n = 0;
        while( *a && *b ) {
            if( *a != *b ) sv_catpvf( res, "%d:%c:%c ", n, *a, *b );
            ++a, ++b, ++n;
        }
        return res;
    }

    END_C

    use Bio::Seq;
    use Bio::DB::Sam;

    my $bamfile = Bio::DB::Sam->new( -bam => "eg1.bam" );
    my @allReads = $bamfile->features( -type => 'match' );

    for my $read (@allReads) {
        # NOTE(review): $ref, $matches and $query are never declared; under
        # "use strict" that is a compile-time "Global symbol ... requires
        # explicit package name" error.
        ( $ref, $matches, $query ) = $read->padded_alignment;

        # NOTE(review): single quotes do not interpolate, so dnacmp2 is handed
        # the literal text '$ref' and '$query' rather than the variables' values.
        print $read->qname, "\n", dnacmp2('$ref', '$query');
    }

    first, i extract some information from the .bam file. then, i create the target and the test sequence on the fly in the subsequent for loop. $ref represents the target and $query represents the test (both are just strings). i then try to feed these two variables into dnacmp2 and print a few more things before it ($read->qname represents a sequence identifier), but it throws the following error:

    Global symbol "$query" requires explicit package name at mutations.pl line 133.
    Execution of mutations.pl aborted due to compilation errors.
    Pradeep-Bandarus-MacBook-Pro:test pradeepbandaru$ perl mutations.pl
    Global symbol "$ref" requires explicit package name at mutations.pl line 133.
    Global symbol "$matches" requires explicit package name at mutations.pl line 133.
    Global symbol "$query" requires explicit package name at mutations.pl line 133.
    Execution of mutations.pl aborted due to compilation errors.
    sorry if this is a rookie question, but what's going on with this?

      Hm. The first and most obvious problem is you've forgotten my here:

      my( $ref, $matches, $query ) = $read->padded_alignment; #...^^

      A more subtle potential problem is not having the shebang line as the first line of the file:

      # V5: Inline::C #!/opt/local/bin/perl

      I don't know what that first line is meant to do, but either delete it or move it below the Inline C code.

      Make those two corrections and then see what happens.

      You do have a C compiler correctly installed don't you?


      With the rise and rise of 'Social' network sites: 'Computers are making people easier to use everyday'
      Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
      "Science is about questioning the status quo. Questioning authority".
      In the absence of evidence, opinion is indistinguishable from prejudice.

      The start of some sanity?

        thanks...this is really quick! it was really the missing "my" that did me in. is there a quick way to tabulate all the respective transitions? e.g.
        A:T = 234 A:G = 492 A:C = 273 T:A = 400 ...
        and so on... basically output the total number of each type of transition?