#!/usr/local/bin/perl package BioParser; use strict; use warnings; use autodie; use Carp qw/croak cluck/; use DBM::Deep; use Fcntl; use feature qw/say switch/; use Statistics::Descriptive; use Storable; use GD::Graph::linespoints; $|++; my $file = q{}; my @data = (); # bp_content method sub bp_count { my ($data) = @_; # obtain the count of each nucleotide in the sequence my $a_count = ($data =~ tr/Aa//); my $c_count = ($data =~ tr/Cc//); my $g_count = ($data =~ tr/Gg//); my $t_count = ($data =~ tr/Tt//); return ($a_count, $c_count, $g_count, $t_count); } # calc_annealing_temp method sub calc_annealing_temp { my ($data) = @_; # returns the result of calling the calc_melting_temp method # in order to establish an initial temp return (&calc_melting_temp() - 5); } # calc_melting_temp method sub calc_melting_temp { my ($data) = @_; # obtain the nucleotide counts my ($a_count, $c_count, $g_count, $t_count) = &bp_count($data); # calculate approximate melting point temperature my $mp = 4 * ($g_count + $c_count) + 2 * ($a_count + $t_count); return $mp; } # 'codon_to_amino_acid' function sub codon_to_aa { # store the codon in the 'codon' variable my ($codon) = @_; # scan the value in the codon variable given ($codon) { # convert the codon to the appropriate amino acid and return it when (/GC./i) { 'A'; } when (/TG[TC]/i) { 'C'; } when (/GA[TC]/i) { 'D'; } when (/GA[AG]/i) { 'E'; } when (/TT[TC]/i) { 'F'; } when (/GG./i) { 'G'; } when (/CA[TC]/i) { 'H'; } when (/AT[TCA]/i) {'I'; } when (/AA[AG]/i) { 'K'; } when (/TT[AG]|CT./i) { 'L'; } when (/ATG/i) { 'M'; } when (/AA[TC]/i) { 'N'; } when (/CC./i) { 'P'; } when (/CA[AG]/i) { 'Q'; } when (/CG.|AG[AG]/i) { 'R'; } when (/TC.|AG[TC]/i) { 'S'; } when (/AC./i) { 'T'; } when (/GT./i) { 'V'; } when (/TGG/i) { 'W'; } when (/TA[TC]/i) { 'Y'; } when (/TA[AG]|TGA/i) { '*'; } # none of the previous regex patterns matched default { say "I'm sorry. I don't recognize that codon!"; } } } # complement method sub complement { my ($data) = @_; my $complement = q{}; # transliterate the sequence using base pairing rules ($complement = $data) =~ tr/acgtACGT/tgcaTGCA/; return $complement; } sub connect_db { # import the DBI module use DBI; # accept the necessary parameters for database connectivity # $db is the database and $sql is the sql statement my ($db, $sql) = @_; # connect to the database with no user ID or password specified # Handle errors and assign a reference to the $dbh scalar. # State the reason for the database related error my $dbh = DBI->connect("dbi:SQLite:dbname=$db", "", "", {RaiseError => 1}, ) or die $DBI::errstr; # Prepare the sql statement and store it in the statement handler # scalar my $sth = $dbh->prepare($sql); # Execute the sql statement stored in the statement handler $sth->execute(); # Retrieve the results from the query and store the results in the # result scalar my $result = $sth->fetch(); # Loop over the array reference and display the record(s) followed # by a new line for (@$result) { print $_, "\n"; } $sth->finish(); # Close the database connection $dbh->disconnect(); } # display_graphic method sub display_graphic { my ($data) = @_; } # display_sequence method sub display_sequence { my ($data) = @_; # displays the sequence unaltered return $data; } # gc_content method sub gc_content { my ($data) = @_; # establish lexical variables my ($a_count, $c_count, $g_count, $t_count); # obtain the length of the sequence my $len1 = length $data; # obtain the count of each nucleotide in the sequence $a_count = ($data =~ tr/Aa//); $c_count = ($data =~ tr/Cc//); $g_count = ($data =~ tr/Gg//); $t_count = ($data =~ tr/Tt//); # calculate the percentage of g/c content my $gc_content = (($g_count + $c_count) / $len1) * 100; return $gc_content; } # multiple_sequence_alignment method sub perform_multiple_alignment { # receives input and output files my ($in_file, $out_file) = @_; # display the message as a HERE document, uses the external program, muscle, # to perform the multiple sequence alignment using the specified parameters # with the clustalw format used by default. print<<'EOF'; Please wait while I perform your multiple sequence alignment. Thank you. EOF my @alignment = `muscle -in $in_file -clw`; print<<'EOF'; Alignment complete. Please retrieve your results in the designated output file. Thank you. EOF } # read_fasta_file method sub read_fasta_file { # receive the file name my ($file) = @_; my @data; # open the file for reading sysopen FH, $file, 'O_RDONLY', 0755; # read the entire file and store it in the @data array @data = ; # close the file handle close FH; # remove the fasta header from the data for (@data) { s/^\>.*$//; } # return the modified data to the caller return @data; } # read_genbank_file method sub read_genbank_file { # receive the file my ($file) = @_; } # read_pdb_file method sub read_pdb_file { # receive the file my ($file) = @_; } # read_swiss_prot_file method sub read_swiss_prot_file { # receive the file my ($file) = @_; } # reverse_complement method sub reverse_complement { my ($data) = @_; my $complement = q{}; my $revcom = q{}; # transliterate the sequence using base pairing rules # copies the initial sequence into the complement variable ($complement = $data) =~ tr/acgtACGT/tgcaTGCA/; # reverses the complement, stores the value in the revcom variable # and returns it $revcom = reverse $complement; return $revcom; } sub search_sequence { my ($data) = @_; } # seq_length method sub seq_length { my ($data) = @_; # return the length of the sequence return length ($data); } sub store_sequence_info { my ($data, %args) = @_ } # transcribe method sub transcribe { my ($data) = @_; my $mrna = q{}; # copies the sequence into the mrna variable then globally substitutes # thymine for uracil ($mrna = $data) =~ s/t/u/gi; return $mrna; } # 'translate_sequence' function sub translate_sequence { # my ($data) = @_; # declare loop counter variable my $i = q{}; # initialize the 'codon' variable my $codon = q{}; # initialize the 'protein' variable my $protein = q{}; # iterate over the dna sequence and extract each codon it finds # pass the codon to the 'codon2aa' function and assemble the protein # chain for ($i = 0; $i < (length ($data) - 2); $i += 3) { $codon = substr($data, $i, 3); $protein .= codon_to_aa($codon); } # return the protein to the calling object return $protein; } sub usage { my ($data) = @_; print "Usage forms:\n\n"; print '$obj = new GCCBioParser(-attribute => value)', "\n\n"; print '$obj->method(argument)', "\n"; } # 'AUTOLOAD' function sub AUTOLOAD { # declare the autoload variable to handle undefined functions our $AUTOLOAD; # warn the user that the function called does not exist warn "\nI don't see a function called $AUTOLOAD.\n"; warn "Perhaps you intended to call a different function?\n"; } # 'DESTROY' function # Note: This function is called automatically upon program completion sub DESTROY { # Freeing allocated memory... } 1; =head1 DESCRIPTION Allows the user to input a genetic sequence via a file read where upon the the sequence is parsed and the relevent information is parsed =head2 Methods B - Called automatically when the user attempts to call a non-existent method. B - Helper method (not called directly). Returns the individual counts of each nucelotide in the biological sequence. B - Displays the estimated annealing point temperature of the biological sequence. Calls the calc_melting_point method in order to obtain the starting temperature. B - Displays the estimated melting point temperature of the biological sequence. Calls the bp_count helper method in order to obtain the necessary frequencies. B - Receives a condon and returns the appropriate amino acid. B - Returns the complement of the sequence. Adheres to Chargaff's base pairing rules. B - Initiates a connection to a local or remote database for sequence and/or information retrieval. B - Called automatically when an object exits scope and the memory is was using needs to be re-claimed. B - Creates a two-dimensional or three-dimensional graphic representing aspects of the sequences under examination. The type of graphic is passed to the method in the form of a hash attribute with the image stored on a physical memory device. B - Returns the sequence passed to the constructor in the form of a file. B - Calculates the percentage of guanosine and cytosine, GC, in the biological sequence. B - Receives two file names, one for input and another for output. Retrieves two or more biological sequences contained within the input file and performs a multiple sequence alignment with the result stored in the output file passed to the method. B - Receives a fasta file and stores the file contents in an array. B - Receives a genbank file and stores the file contents in an array. B - Receives a swiss-prot file and stores the file contents in an array. B - Retrieves a complex data structure, via data deserialization, from a physical device. B - Returns the reverse complement of the sequence passed to the constructor using Chargaff's base pairing rules. B - Returns the length of the sequence. B - Stores a complex data structure, via data serialization, on a physical device for subsequent retrieval. B - Creates and stores sequence information, e.g. Genbank section data (accession, origin, etc.) in a DBM hash for rapid retrieval. B - Transcribes and displays the DNA -> mRNA sequence to standard output. B - Translates and displays the mRNA sequence into a polypeptide chain of amino acids. Uses the IUPAC naming conventions. B - Displays examples of how the module can be used for common tasks pertaining to bioinformatics and/or computational biology. =head1 REQUIRED ARGUMENTS =head1 OPTIONS The user can specify the format of the biological sequence, e.g., fasta, Genbank, protein, etc. =head1 DEPENDENCIES This library requires the user to have installed several, at the time of this writing, non-core modules. =head1 INCOMPATIBILITIES Every reasonable effort has been made to maintain compatibility with previous versions of Perl. =head1 BUGS AND LIMITATIONS None are known at this time =head1 AUTHOR =head1 LICENSE AND COPYRIGHT Copyright 2013 This library is free software; you may redistribute it under the same terms as Perl itself.