#!/usr/bin/perl
# Module and script to build a desired amino acid
# sequence given a collection of nucleotide snippets
# Author: Runrig (http://www.perlmonks.org) <dougw@cpan.org>
# Copyright: This is free software. You can redistribute
# and/or modify it under the same terms as perl itself.
package DNA::Stitch;
use strict;
use warnings;
# Table of nucleotide sequences and their corresponding
# amino acid.
my %amino = qw(
GCT A GCC A GCA A GCG A TGT C TGC C GAT D GAC D GAA E GAG E
TTT F TTC F GGT G GGC G GGA G GGG G CAT H CAC H ATT I ATC I
ATA I AAA K AAG K TTA L TTG L CTT L CTA L CTG L CTC L ATG M AAT N AAC
+ N
CCT P CCC P CCA P CCG P CAA Q CAG Q CGC R CGT R CGA R CGG R AGA R
AGG R TCT S TCC S TCA S TCG S AGT S AGC S ACT T ACC T ACA T
ACG T GTT V GTC V GTA V GTG V TGG W TAT Y TAC Y
);
# For better performance, we could cache the following
# globals (with Storable or by just Dumping them here
# as hardcoded) rather than recalculating them every
# time.
# amino_st: For each one or two nucleotides which start
# an amino acid, create a character class of which amino acids
# those nucleotides can start.
# first: For each amino acid, create an array of what
# nucleotides can start the amino acid.
# suffix: For each amino acid, create an array of what
# the next nucleotide can be given the first one or two nucleotides.
my (%amino_st, %first, %suffix);
while ( my ($nuke, $amino) = each %amino) {
my ($one, $two, $three) = split '', $nuke;
undef $amino_st{$one}{$amino};
undef $amino_st{$one . $two}{$amino};
undef $first{$amino}{$one};
undef $suffix{$amino}{$one}{$two . $three};
undef $suffix{$amino}{$one . $two}{$three};
}
# Regex to match invalid amino acid sequences
my $invalid_amino_seq = '[^' . join('', keys %first) . ']';
$invalid_amino_seq = qr/$invalid_amino_seq/;
# Collapse keys of hash ref to a simple array ref
$first{$_} = [ keys %{$first{$_}} ] for keys %first;
$amino_st{$_} = join '', keys %{$amino_st{$_}} for keys %amino_st;
# Create a character class if there's more than one amino acid
for (values %amino_st) {
$_ = "[$_]" if length > 1;
}
# Given the first one one or two nucleotides of an amino acid,
# create a regex to match the rest of the nucleotides.
my %match;
for my $amino (keys %suffix) {
for my $prefix (keys %{$suffix{$amino}}) {
my @suffix = keys %{$suffix{$amino}{$prefix}};
my %uniq; undef @uniq{map substr($_, 0, 1), @suffix};
$suffix{$amino}{$prefix} = [ keys %uniq ];
# Just join all clauses with 'or', because trying to use
# character classes here only benefits the last few
# clauses of a long 'or' string
my $match = '^(?:' . join("|", @suffix) . ')$';
$match{$amino}{$prefix} = qr/$match/;
}
}
# Find what amino acid sequences can match a given nucleotide
# sequence for a given collection of nucleotide 'snippets'.
# If you often use the same collection, you could safely cache
# it with Storable.
sub new {
my $class = shift;
my @library;
for my $snip (@_) {
die "Snippets must have 2 or more nucleotides"
unless length($snip) >= 2;
die "Bad nucleotide sequence '$snip'" if $snip =~ /[^ACGT]/;
my $nuc_seq = $snip;
my $prefix = '';
# First assume the amino acid starts with the first nucleotide
# in a sequence. Then shift off the first nucleotide and assume
# that the previous amino acid ends with the shifted nucleotide
# and the amino acid sequence starts with the resulting
# sequence. Repeat for the next nucleotide.
SHIFT_SEQ: for my $i (0..2) {
my $amino_seq = '';
while ($nuc_seq =~ /(...)/g) {
next SHIFT_SEQ unless exists $amino{$1};
$amino_seq .= $amino{$1};
}
my $suffix = '';
my $amino_len = length($amino_seq);
if (my $suffix_len = length($nuc_seq) % 3) {
$suffix = substr($nuc_seq, -$suffix_len);
next SHIFT_SEQ unless exists $amino_st{$suffix};
$amino_seq .= $amino_st{$suffix};
}
push @{ $library[$i] }, {
SNIP=>$snip, PREFIX=>$prefix,
$amino_seq ? (MATCH=>qr/^$amino_seq/) : (),
SUFFIX=>$suffix,
LENGTH=>$amino_len,
};
} continue { $prefix .= substr($nuc_seq, 0, 1, '') if $i < 2}
}
bless \@library, $class;
}
# Build a target amino acid sequence given a collection of
# nucleotide sequences, how many times we are allowed to 'glue'
# them together, and how many extra nucleotides we are allowed
# to just 'splice' on.
sub build {
my $lib = shift;
die "Target is empty" if !defined($_[0]) or $_[0] eq '';
die "Bad amino acid in target" if $_[0] =~ $invalid_amino_seq;
die "Glue must be integer" if defined($_[1]) and $_[1] =~ /\D/;
die "Splices must be integer" if defined($_[2]) and $_[2] =~ /\D/;
my $target = shift;
my $glue = shift || 0;
my $splices = shift || 0;
$lib->_build($target, $glue, $splices, '', '');
}
# Build the target amino acid, but accept additional parameters
# which indicate what amino acid occurs before the target sequence
# which we haven't completed yet, and what nucleotides we have
# so far that begin that amino acid.
sub _build {
my ($lib, $target, $glue, $splices,
$prefix_amino, $prefix_nuc) = @_;
my $prefix_len = length $prefix_nuc;
my $shift_num = (3 - $prefix_len) % 3;
my @results;
SEQ: for my $seq (@{ $lib->[$shift_num] }) {
if ($prefix_amino) {
next SEQ unless
$seq->{PREFIX} =~ $match{$prefix_amino}{$prefix_nuc};
unless ($target) {
push @results, [ $seq->{SNIP} ];
next SEQ;
}
}
next SEQ if exists $seq->{MATCH} and $target !~ $seq->{MATCH};
my $nxt_target = substr($target, $seq->{LENGTH});
my $nxt_prefix_amino = $seq->{SUFFIX} ?
substr($nxt_target, 0, 1, '') : '';
unless ( $nxt_target or $nxt_prefix_amino ) {
push @results, [ $seq->{SNIP} ];
next SEQ;
}
next SEQ unless $glue;
my $nxt = $lib->_build(
$nxt_target, $glue-1, $splices,
$nxt_prefix_amino, $seq->{SUFFIX},
);
next SEQ unless @$nxt;
unshift @$_, $seq->{SNIP} for @$nxt;
push @results, @$nxt;
}
return \@results unless $splices--;
# Now allow for splicing on a single nucleotide instead of using
# one of the snippets.
if ($prefix_amino) {
my $nxt_prefix_amino = ($prefix_len >= 2) ? '' : $prefix_amino;
SPLICE1: for my $splice (
@{$suffix{$prefix_amino}{$prefix_nuc}}
) {
push(@results, [ $splice ]), next SPLICE1
unless $target or $nxt_prefix_amino;
my $nxt = $lib->_build(
$target, $glue, $splices,
$nxt_prefix_amino, $prefix_nuc.$splice
);
next SPLICE1 unless @$nxt;
unshift @$_, $splice for @$nxt;
push @results, @$nxt;
}
return \@results;
}
$prefix_amino = substr($target, 0, 1, '');
SPLICE2: for my $splice ( @{$first{$prefix_amino}} ) {
my $nxt = $lib->_build(
$target, $glue, $splices, $prefix_amino, $splice
);
next SPLICE2 unless @$nxt;
unshift @$_, $splice for @$nxt;
push @results, @$nxt;
}
return \@results;
}
package main;
use strict;
use warnings;
my $glues = 7;
my $splices = 2;
my $target = "ETIWITWIKNGFMTR";
my @snips = qw( AACGGA AC ACCCGA CA
CGATGTTGGATTCCAGATTATGCTA GAGACGATT
GAGAGACCAGAGGACCCTCGA GAAACGATGAGGGGAATAA
GCGGGTT GG GGTACTG TG TGTTTG TGGATA TTGGATTAAA
);
my $lib = DNA::Stitch->new(@snips);
print "@$_\n"
for @{ $lib->build($target, $glues, $splices) };
|