Beefy Boxes and Bandwidth Generously Provided by pair Networks
We don't bite newbies here... much
 
PerlMonks  

comment on

( [id://3333]=superdoc: print w/replies, xml ) Need Help??
#!/usr/bin/perl # Module and script to build a desired amino acid # sequence given a collection of nucleotide snippets # Author: Runrig (http://www.perlmonks.org) <dougw@cpan.org> # Copyright: This is free software. You can redistribute # and/or modify it under the same terms as perl itself. package DNA::Stitch; use strict; use warnings; # Table of nucleotide sequences and their corresponding # amino acid. my %amino = qw( GCT A GCC A GCA A GCG A TGT C TGC C GAT D GAC D GAA E GAG E TTT F TTC F GGT G GGC G GGA G GGG G CAT H CAC H ATT I ATC I ATA I AAA K AAG K TTA L TTG L CTT L CTA L CTG L CTC L ATG M AAT N AAC + N CCT P CCC P CCA P CCG P CAA Q CAG Q CGC R CGT R CGA R CGG R AGA R AGG R TCT S TCC S TCA S TCG S AGT S AGC S ACT T ACC T ACA T ACG T GTT V GTC V GTA V GTG V TGG W TAT Y TAC Y ); # For better performance, we could cache the following # globals (with Storable or by just Dumping them here # as hardcoded) rather than recalculating them every # time. # amino_st: For each one or two nucleotides which start # an amino acid, create a character class of which amino acids # those nucleotides can start. # first: For each amino acid, create an array of what # nucleotides can start the amino acid. # suffix: For each amino acid, create an array of what # the next nucleotide can be given the first one or two nucleotides. my (%amino_st, %first, %suffix); while ( my ($nuke, $amino) = each %amino) { my ($one, $two, $three) = split '', $nuke; undef $amino_st{$one}{$amino}; undef $amino_st{$one . $two}{$amino}; undef $first{$amino}{$one}; undef $suffix{$amino}{$one}{$two . $three}; undef $suffix{$amino}{$one . $two}{$three}; } # Regex to match invalid amino acid sequences my $invalid_amino_seq = '[^' . join('', keys %first) . ']'; $invalid_amino_seq = qr/$invalid_amino_seq/; # Collapse keys of hash ref to a simple array ref $first{$_} = [ keys %{$first{$_}} ] for keys %first; $amino_st{$_} = join '', keys %{$amino_st{$_}} for keys %amino_st; # Create a character class if there's more than one amino acid for (values %amino_st) { $_ = "[$_]" if length > 1; } # Given the first one one or two nucleotides of an amino acid, # create a regex to match the rest of the nucleotides. my %match; for my $amino (keys %suffix) { for my $prefix (keys %{$suffix{$amino}}) { my @suffix = keys %{$suffix{$amino}{$prefix}}; my %uniq; undef @uniq{map substr($_, 0, 1), @suffix}; $suffix{$amino}{$prefix} = [ keys %uniq ]; # Just join all clauses with 'or', because trying to use # character classes here only benefits the last few # clauses of a long 'or' string my $match = '^(?:' . join("|", @suffix) . ')$'; $match{$amino}{$prefix} = qr/$match/; } } # Find what amino acid sequences can match a given nucleotide # sequence for a given collection of nucleotide 'snippets'. # If you often use the same collection, you could safely cache # it with Storable. sub new { my $class = shift; my @library; for my $snip (@_) { die "Snippets must have 2 or more nucleotides" unless length($snip) >= 2; die "Bad nucleotide sequence '$snip'" if $snip =~ /[^ACGT]/; my $nuc_seq = $snip; my $prefix = ''; # First assume the amino acid starts with the first nucleotide # in a sequence. Then shift off the first nucleotide and assume # that the previous amino acid ends with the shifted nucleotide # and the amino acid sequence starts with the resulting # sequence. Repeat for the next nucleotide. SHIFT_SEQ: for my $i (0..2) { my $amino_seq = ''; while ($nuc_seq =~ /(...)/g) { next SHIFT_SEQ unless exists $amino{$1}; $amino_seq .= $amino{$1}; } my $suffix = ''; my $amino_len = length($amino_seq); if (my $suffix_len = length($nuc_seq) % 3) { $suffix = substr($nuc_seq, -$suffix_len); next SHIFT_SEQ unless exists $amino_st{$suffix}; $amino_seq .= $amino_st{$suffix}; } push @{ $library[$i] }, { SNIP=>$snip, PREFIX=>$prefix, $amino_seq ? (MATCH=>qr/^$amino_seq/) : (), SUFFIX=>$suffix, LENGTH=>$amino_len, }; } continue { $prefix .= substr($nuc_seq, 0, 1, '') if $i < 2} } bless \@library, $class; } # Build a target amino acid sequence given a collection of # nucleotide sequences, how many times we are allowed to 'glue' # them together, and how many extra nucleotides we are allowed # to just 'splice' on. sub build { my $lib = shift; die "Target is empty" if !defined($_[0]) or $_[0] eq ''; die "Bad amino acid in target" if $_[0] =~ $invalid_amino_seq; die "Glue must be integer" if defined($_[1]) and $_[1] =~ /\D/; die "Splices must be integer" if defined($_[2]) and $_[2] =~ /\D/; my $target = shift; my $glue = shift || 0; my $splices = shift || 0; $lib->_build($target, $glue, $splices, '', ''); } # Build the target amino acid, but accept additional parameters # which indicate what amino acid occurs before the target sequence # which we haven't completed yet, and what nucleotides we have # so far that begin that amino acid. sub _build { my ($lib, $target, $glue, $splices, $prefix_amino, $prefix_nuc) = @_; my $prefix_len = length $prefix_nuc; my $shift_num = (3 - $prefix_len) % 3; my @results; SEQ: for my $seq (@{ $lib->[$shift_num] }) { if ($prefix_amino) { next SEQ unless $seq->{PREFIX} =~ $match{$prefix_amino}{$prefix_nuc}; unless ($target) { push @results, [ $seq->{SNIP} ]; next SEQ; } } next SEQ if exists $seq->{MATCH} and $target !~ $seq->{MATCH}; my $nxt_target = substr($target, $seq->{LENGTH}); my $nxt_prefix_amino = $seq->{SUFFIX} ? substr($nxt_target, 0, 1, '') : ''; unless ( $nxt_target or $nxt_prefix_amino ) { push @results, [ $seq->{SNIP} ]; next SEQ; } next SEQ unless $glue; my $nxt = $lib->_build( $nxt_target, $glue-1, $splices, $nxt_prefix_amino, $seq->{SUFFIX}, ); next SEQ unless @$nxt; unshift @$_, $seq->{SNIP} for @$nxt; push @results, @$nxt; } return \@results unless $splices--; # Now allow for splicing on a single nucleotide instead of using # one of the snippets. if ($prefix_amino) { my $nxt_prefix_amino = ($prefix_len >= 2) ? '' : $prefix_amino; SPLICE1: for my $splice ( @{$suffix{$prefix_amino}{$prefix_nuc}} ) { push(@results, [ $splice ]), next SPLICE1 unless $target or $nxt_prefix_amino; my $nxt = $lib->_build( $target, $glue, $splices, $nxt_prefix_amino, $prefix_nuc.$splice ); next SPLICE1 unless @$nxt; unshift @$_, $splice for @$nxt; push @results, @$nxt; } return \@results; } $prefix_amino = substr($target, 0, 1, ''); SPLICE2: for my $splice ( @{$first{$prefix_amino}} ) { my $nxt = $lib->_build( $target, $glue, $splices, $prefix_amino, $splice ); next SPLICE2 unless @$nxt; unshift @$_, $splice for @$nxt; push @results, @$nxt; } return \@results; } package main; use strict; use warnings; my $glues = 7; my $splices = 2; my $target = "ETIWITWIKNGFMTR"; my @snips = qw( AACGGA AC ACCCGA CA CGATGTTGGATTCCAGATTATGCTA GAGACGATT GAGAGACCAGAGGACCCTCGA GAAACGATGAGGGGAATAA GCGGGTT GG GGTACTG TG TGTTTG TGGATA TTGGATTAAA ); my $lib = DNA::Stitch->new(@snips); print "@$_\n" for @{ $lib->build($target, $glues, $splices) };

In reply to Amino acid sequence builder by runrig

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Are you posting in the right place? Check out Where do I post X? to know for sure.
  • Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
    <code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
  • Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
  • Want more info? How to link or How to display code and escape characters are good places to start.
Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others exploiting the Monastery: (5)
As of 2024-04-19 13:49 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found