comment on

#!/usr/bin/perl

# Module and script to build a desired amino acid
# sequence given a collection of nucleotide snippets
# Author: Runrig (http://www.perlmonks.org) <dougw@cpan.org>
# Copyright: This is free software. You can redistribute
# and/or modify it under the same terms as perl itself.

package DNA::Stitch;

use strict;
use warnings;

# Table of nucleotide sequences and their corresponding
# amino acid.
my %amino = qw(
 GCT A GCC A GCA A GCG A TGT C TGC C GAT D GAC D GAA E GAG E
 TTT F TTC F GGT G GGC G GGA G GGG G CAT H CAC H ATT I ATC I
 ATA I AAA K AAG K TTA L TTG L CTT L CTA L CTG L CTC L ATG M AAT N AAC
+ N
 CCT P CCC P CCA P CCG P CAA Q CAG Q CGC R CGT R CGA R CGG R AGA R
 AGG R TCT S TCC S TCA S TCG S AGT S AGC S ACT T ACC T ACA T
 ACG T GTT V GTC V GTA V GTG V TGG W TAT Y TAC Y
);

# For better performance, we could cache the following
# globals (with Storable or by just Dumping them here
# as hardcoded) rather than recalculating them every
# time.
# amino_st: For each one or two nucleotides which start
# an amino acid, create a character class of which amino acids
# those nucleotides can start.
# first: For each amino acid, create an array of what
# nucleotides can start the amino acid.
# suffix: For each amino acid, create an array of what
# the next nucleotide can be given the first one or two nucleotides.
my (%amino_st, %first, %suffix);
while ( my ($nuke, $amino) = each %amino) {
  my ($one, $two, $three) = split '', $nuke;
  undef $amino_st{$one}{$amino};
  undef $amino_st{$one . $two}{$amino};
  undef $first{$amino}{$one};
  undef $suffix{$amino}{$one}{$two . $three};
  undef $suffix{$amino}{$one . $two}{$three};
}
# Regex to match invalid amino acid sequences
my $invalid_amino_seq = '[^' . join('', keys %first) . ']';
$invalid_amino_seq = qr/$invalid_amino_seq/;

# Collapse keys of hash ref to a simple array ref
$first{$_} = [ keys %{$first{$_}} ] for keys %first;
$amino_st{$_} = join '', keys %{$amino_st{$_}} for keys %amino_st;
# Create a character class if there's more than one amino acid
for (values %amino_st) {
  $_ = "[$_]" if length > 1;
}

# Given the first one one or two nucleotides of an amino acid,
# create a regex to match the rest of the nucleotides.
my %match;
for my $amino (keys %suffix) {
  for my $prefix (keys %{$suffix{$amino}}) {
    my @suffix = keys %{$suffix{$amino}{$prefix}};
    my %uniq; undef @uniq{map substr($_, 0, 1), @suffix};
    $suffix{$amino}{$prefix} = [ keys %uniq ];
    # Just join all clauses with 'or', because trying to use
    # character classes here only benefits the last few
    # clauses of a long 'or' string
    my $match = '^(?:' . join("|", @suffix) . ')$';
    $match{$amino}{$prefix} = qr/$match/;
  }
}

# Find what amino acid sequences can match a given nucleotide
# sequence for a given collection of nucleotide 'snippets'.
# If you often use the same collection, you could safely cache
# it with Storable.
sub new {
  my $class = shift;
  my @library;
  for my $snip (@_) {
    die "Snippets must have 2 or more nucleotides"
      unless length($snip) >= 2;
    die "Bad nucleotide sequence '$snip'" if $snip =~ /[^ACGT]/;
    my $nuc_seq = $snip;
    my $prefix = '';
    # First assume the amino acid starts with the first nucleotide
    # in a sequence. Then shift off the first nucleotide and assume
    # that the previous amino acid ends with the shifted nucleotide
    # and the amino acid sequence starts with the resulting
    # sequence. Repeat for the next nucleotide.
    SHIFT_SEQ: for my $i (0..2) {
      my $amino_seq = '';
      while ($nuc_seq =~ /(...)/g) {
        next SHIFT_SEQ unless exists $amino{$1};
        $amino_seq .= $amino{$1};
      }
      my $suffix = '';
      my $amino_len = length($amino_seq);
      if (my $suffix_len = length($nuc_seq) % 3) {
        $suffix = substr($nuc_seq, -$suffix_len);
        next SHIFT_SEQ unless exists $amino_st{$suffix};
        $amino_seq .= $amino_st{$suffix};
      }
      push @{ $library[$i] }, {
        SNIP=>$snip, PREFIX=>$prefix,
        $amino_seq ? (MATCH=>qr/^$amino_seq/) : (),
        SUFFIX=>$suffix,
        LENGTH=>$amino_len,
      };
    } continue { $prefix .= substr($nuc_seq, 0, 1, '') if $i < 2}
  }
  bless \@library, $class;
}

# Build a target amino acid sequence given a collection of
# nucleotide sequences, how many times we are allowed to 'glue'
# them together, and how many extra nucleotides we are allowed
# to just 'splice' on.
sub build {
  my $lib = shift;
  die "Target is empty" if !defined($_[0]) or $_[0] eq '';
  die "Bad amino acid in target" if $_[0] =~ $invalid_amino_seq;
  die "Glue must be integer" if defined($_[1]) and $_[1] =~ /\D/;
  die "Splices must be integer" if defined($_[2]) and $_[2] =~ /\D/;
  my $target = shift;
  my $glue = shift || 0;
  my $splices = shift || 0;
  $lib->_build($target, $glue, $splices, '', '');
}

# Build the target amino acid, but accept additional parameters
# which indicate what amino acid occurs before the target sequence
# which we haven't completed yet, and what nucleotides we have
# so far that begin that amino acid.
sub _build {
  my ($lib, $target, $glue, $splices,
      $prefix_amino, $prefix_nuc) = @_;
  my $prefix_len = length $prefix_nuc;
  my $shift_num = (3 - $prefix_len) % 3;
  my @results;
  SEQ: for my $seq (@{ $lib->[$shift_num] }) {
    if ($prefix_amino) {
      next SEQ unless
        $seq->{PREFIX} =~ $match{$prefix_amino}{$prefix_nuc};
      unless ($target) {
        push @results, [ $seq->{SNIP} ];
        next SEQ;
      }
    }
    next SEQ if exists $seq->{MATCH} and $target !~ $seq->{MATCH};
    my $nxt_target = substr($target, $seq->{LENGTH});
    my $nxt_prefix_amino = $seq->{SUFFIX} ?
      substr($nxt_target, 0, 1, '') : '';
    unless ( $nxt_target or $nxt_prefix_amino ) {
      push @results, [ $seq->{SNIP} ];
      next SEQ;
    }
    next SEQ unless $glue;
    my $nxt = $lib->_build(
      $nxt_target, $glue-1, $splices,
      $nxt_prefix_amino, $seq->{SUFFIX},
    );
    next SEQ unless @$nxt;
    unshift @$_, $seq->{SNIP} for @$nxt;
    push @results, @$nxt;
  }
  return \@results unless $splices--;
  # Now allow for splicing on a single nucleotide instead of using
  # one of the snippets.
  if ($prefix_amino) {
    my $nxt_prefix_amino = ($prefix_len >= 2) ? '' : $prefix_amino;
    SPLICE1: for my $splice (
      @{$suffix{$prefix_amino}{$prefix_nuc}}
    ) {
      push(@results, [ $splice ]), next SPLICE1
        unless $target or $nxt_prefix_amino;
      my $nxt = $lib->_build(
        $target, $glue, $splices,
        $nxt_prefix_amino, $prefix_nuc.$splice
      );
      next SPLICE1 unless @$nxt;
      unshift @$_, $splice for @$nxt;
      push @results, @$nxt;
    }
    return \@results;
  }
  $prefix_amino = substr($target, 0, 1, '');
  SPLICE2: for my $splice ( @{$first{$prefix_amino}} ) {
    my $nxt = $lib->_build(
      $target, $glue, $splices, $prefix_amino, $splice
    );
    next SPLICE2 unless @$nxt;
    unshift @$_, $splice for @$nxt;
    push @results, @$nxt;
  }
  return \@results;
}

package main;

use strict;
use warnings;

my $glues = 7;
my $splices = 2;
my $target = "ETIWITWIKNGFMTR";
my @snips = qw( AACGGA AC ACCCGA CA
 CGATGTTGGATTCCAGATTATGCTA GAGACGATT
 GAGAGACCAGAGGACCCTCGA GAAACGATGAGGGGAATAA
 GCGGGTT GG GGTACTG TG TGTTTG TGGATA TTGGATTAAA
);

my $lib = DNA::Stitch->new(@snips);
print "@$_\n"
  for @{ $lib->build($target, $glues, $splices) };
[download]

In reply to Amino acid sequence builder by runrig

Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.


We don't bite newbies here... much
	PerlMonks