comment on

I have the following Perl problem: I have created a flat MySQL database consisting of 2 tables, and I need to write a Perl parser to load data from a GenBank file into the tables (http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord). I want the elements of the variables to be in long lists so that I can load them straight into the columns. I also need to remove duplicate genes. My first idea was the following:

#!/usr/bin/perl -w

# Preamble: strictures, the modules this loader depends on, and the list of
# subroutines it exports.
use strict;
use warnings;

use DBI;
use DBD::mysql;
use BeginPerlBioinfo;

#use Test::More tests=> 15

use Exporter;

# `our` must be lowercase to be the keyword, and @ISA has to name the
# Exporter class exactly (class names are case-sensitive); otherwise
# import() is never inherited and nothing is exported.
our @ISA    = qw(Exporter);
our @EXPORT = qw(get_file_data get_accession get_gene get_origin get_translation);

 

 

# Open a connection to the MySQL database and return the handle.
#
# Takes no arguments; the connection settings are the literals below.
# Returns: a DBI database handle with RaiseError enabled and PrintError
#          disabled.
# Dies:    when the connection cannot be established.
sub connect_to_db {
    my $dbname   = "1";
    my $dbserver = "";
    my $username = "";
    my $password = "p";

    # No spaces around '=' in the DSN: the driver splits each attribute on
    # '=', so "host = x" would yield an attribute named "host " (with a
    # trailing space) and the host setting would be ignored.
    my $datasource = "dbi:mysql:database=$dbname;host=$dbserver";

    my $dbh = DBI->connect(
        $datasource, $username, $password,
        {
            PrintError => 0,    # disable PrintError
            RaiseError => 1,    # enable RaiseError
        }
    ) or die "Error opening database: $DBI::errstr\n";

    return $dbh;
}

 

# Connect once and prepare the insert statement. The two values are supplied
# through placeholders when the statement is executed, so they never have to
# be quoted by hand.
my $dbh = connect_to_db();
my $sth = $dbh->prepare("INSERT INTO Chrom_4_AC (AC_number, Gene_id)
                       VALUES                 (?        ,?       )");

#Declare and initialize variables
my $genbank              = 'chrom_CDS-4.txt';
my $record               = "";
my $gene_id              = "";
my $AC_number            = "";
my $DNA_seq              = "";
my $start_stop_cs        = "";
my $aa_seq               = "";
my $transl_start_site    = "";
my $protein_prod_names   = "";
my $chromosomal_location = "";

#Open the Genbank file

#unless (open (GBFILE, $genbank)) {

#        print "CANNOT open Genbank file \n";

#                             exit;

#                            

# $/="//\n';   #set input separator and read in arecord to scalar

#$record = <GBFILE>

#reset input separator

#$/=$save_input_separator;

 

#$record = ~/^(LOCUS.**ORIGIN\s*\n)(.*)\/\/\n/s);

#print $record;

 

# Read a whole file into memory.
#
# Parameters: $filename - path of the file to read.
# Returns:    the list of lines in the file, line endings included.
# On failure: prints a message to STDERR and exits with a non-zero status
#             (the original exited with status 0, which looks like success
#             to the calling shell).
sub get_file_data {
    my ($filename) = @_;

    unless (defined $filename && length $filename) {
        print STDERR "CANNOT open file: no file name given\n";
        exit 1;
    }

    # Three-argument open with a lexical handle: the name is always treated
    # as a plain path, never as a mode or pipe, and surrounding whitespace
    # in the name is preserved.
    open(my $fh, '<', $filename) or do {
        print STDERR "CANNOT open file '$filename': $!\n";
        exit 1;
    };

    my @filedata = <$fh>;
    close $fh;

    return @filedata;
}

 

                # Extract the accession number from a GenBank record.
                #
                # Parameters: $genbank - text of a GenBank record.
                # Returns:    the first word following the ACCESSION keyword, or
                #             the string "error" when none is found.
                sub get_accession {
                    my $genbank = shift;

                    return "error" unless defined $genbank;

                    # `=~` binds the match to $genbank. The original `= ~/.../`
                    # assigned the bitwise negation of a match against $_, which
                    # is always true and never sets $1 from the record.
                    if ($genbank =~ /ACCESSION\s*(\w+)/) {
                        return $1;
                    }

                    return "error";
                }

# Extract the first gene name from a GenBank record.
#
# Parameters: $genbank - text of a GenBank record.
# Returns:    the value of the first gene="..." qualifier, or the string
#             "error" when the record has none.
sub get_gene {
    my $genbank = shift;

    return "error" unless defined $genbank;

    # `=~` is required here; the original `=` overwrote $genbank with the
    # result of matching $_ instead of searching the record.
    if ($genbank =~ /gene="(.*?)"/s) {
        return $1;
    }

    return "error";
}

               

               

                # Extract the nucleotide sequence from a GenBank record.
                #
                # Parameters: $genbank - text of a GenBank record.
                # Returns:    the sequence that follows the ORIGIN line, upper-cased
                #             and with all whitespace and position numbers removed,
                #             or the string "error" when there is no ORIGIN line.
                sub get_origin {
                    # Take the record as an argument, like the sibling extractors;
                    # the original read a file-level $genbank instead.
                    my $genbank = shift;

                    return "error" unless defined $genbank;

                    # The sequence starts on the line after ORIGIN and runs up to
                    # the "//" terminator line, or to the end of the text when the
                    # terminator has already been stripped. The lazy match stops
                    # at the first terminator, so a string holding several
                    # records yields only the first sequence.
                    if ($genbank =~ m{^ORIGIN[^\n]*\n(.*?)(?:^//|\z)}ms) {
                        my $seq = $1;
                        $seq =~ s/[\s\d]//g;
                        return uc($seq);
                    }

                    return "error";
                }

                               

                                # Pull the protein translation out of a GenBank record.
                                #
                                # Parameters: $genbank - text of a GenBank record.
                                # Returns:    the value of the first translation="..."
                                #             qualifier with all whitespace removed and
                                #             upper-cased, or the string "error" when the
                                #             qualifier is absent.
                                sub get_translation {
                                    my ($genbank) = @_;

                                    # Guard clause: nothing to extract without the qualifier.
                                    return "error" unless $genbank =~ /translation="(.*?)"/s;

                                    # The value is wrapped over several lines in the flat
                                    # file; drop the line breaks and the indentation.
                                    (my $pro = $1) =~ s/[\s]//g;

                                    return uc $pro;
                                }
[download]

##### not finished, and the connection to the database is not closed yet ##### But I do not know whether this is a good way to solve such problems. Then I thought of using hashes, where the key is a kind of counter and the value is each element I want to extract.


#!/usr/bin/perl   -w

# Write one output file per GenBank entry.
#
# Usage: script [file ...]
# With no arguments, or with "-", the records are read from standard input.
# For every entry the fields below are written to "<AC_number>.fasta", one
# per line, each prefixed with ">".

use strict;
use Data::Dumper;

my $fastaSuffix = ".fasta";

@ARGV = qw(-) unless @ARGV;

foreach my $filename (@ARGV) {

    # Keep "-" meaning standard input: a three-argument open would look for
    # a file literally named "-".
    my $in;
    if ($filename eq '-') {
        $in = \*STDIN;
    }
    else {
        open $in, '<', $filename or die "Couldn't open '$filename': $!";
    }

    my $genbank = GBlite->new($in);

    # NOTE(review): this loop assumes every entry returned by nextENTRY is an
    # object offering the accessors called below; confirm against GBlite.
    while (my $entry = $genbank->nextENTRY) {
        my $ac_number            = $entry->AC_number;
        my $gene_id              = $entry->gene_id;
        my $DNA_seq              = $entry->DNA_seq;
        my $start_stop_cs        = $entry->start_stop_cs;
        my $aa_seq               = $entry->aa_seq;
        my $transl_start_site    = $entry->transl_start_site;
        my $protein_prod_names   = $entry->protein_prod_names;
        my $chromosomal_location = $entry->chromosomal_location;

        my $fasta = $ac_number . $fastaSuffix;
        warn "[working '$fasta']\n";

        open my $out, '>', $fasta or die "Couldn't open '$fasta': $!";
        print {$out} ">$ac_number\n", ">$gene_id\n", ">$DNA_seq\n",
            ">$start_stop_cs\n", ">$aa_seq\n", ">$transl_start_site\n",
            ">$protein_prod_names\n", ">$chromosomal_location\n";

        # Buffered write errors only surface when the handle is closed.
        close $out or die "Couldn't close '$fasta': $!";
    }

    close $in unless $filename eq '-';
}




 

package GBlite;

use strict;

# GBlite - a small line-oriented reader for GenBank flat files.
#
# Only the accession numbers of each entry are extracted so far.
# TODO: gene_id, DNA_seq, start_stop_cs, aa_seq, transl_start_site,
# protein_prod_names and chromosomal_location are not parsed yet.

# Constructor.
#
# Parameters: $fh - GLOB reference to an open, readable filehandle.
# Returns:    a new reader object.
# Dies:       when $fh is not a GLOB reference.
sub new {
    my ($class, $fh) = @_;

    # The parentheses matter: `ref $fh !~ /GLOB/` parses as
    # ref($fh !~ /GLOB/), which is always false, so the check never fired.
    if (ref($fh) !~ /GLOB/) {
        die "GBlite error: new expects a GLOB reference not $fh\n";
    }

    # Two-argument bless so that subclasses get objects of their own class.
    my $this = bless {}, $class;
    $this->{FH}       = $fh;
    $this->{LASTLINE} = "";
    $this->{DONE}     = 0;
    return $this;
}

# Advance the input to the next LOCUS line.
#
# Returns: 1 when the reader is positioned on a LOCUS line (kept in
#          LASTLINE), 0 when the input is exhausted.
sub fastForward {
    my ($this) = @_;

    return 0 if $this->{DONE};
    return 1 if $this->{LASTLINE} =~ /^LOCUS/;

    my $FH = $this->{FH};
    while (defined(my $line = <$FH>)) {
        if ($line =~ /^LOCUS/) {
            $this->{LASTLINE} = $line;
            return 1;
        }
    }

    $this->{DONE} = 1;
    return 0;
}

# Read the next entry.
#
# Returns: a reference to an array holding the entry's accession numbers,
#          duplicates removed, in order of first appearance; 0 when there
#          are no more entries.
sub nextENTRY {
    my ($this) = @_;

    $this->fastForward or return 0;
    my $FH = $this->{FH};

    # The LOCUS line has been consumed; forget it so that the next
    # fastForward call does not treat it as a fresh entry.
    $this->{LASTLINE} = "";

    # Get the accession numbers, which may span several lines.
    my @ac_numbers;
    my %seen;
    my $in_accession = 0;
    my $stopped      = 0;

    while (defined(my $line = <$FH>)) {

        # VERSION ends the accession block; "//" and LOCUS are additional
        # stops so that an entry lacking a VERSION line cannot swallow the
        # entries that follow it.
        if ($line =~ m{^(?:VERSION|LOCUS|//)}) {
            $this->{LASTLINE} = $line;
            $stopped = 1;
            last;
        }

        my $values;
        if ($line =~ /^ACCESSION\s+(.*)/) {
            $in_accession = 1;
            $values       = $1;
        }
        elsif ($in_accession && $line =~ /^\s+(\S.*)/) {
            $values = $1;    # indented continuation of the ACCESSION line
        }
        else {
            $in_accession = 0;
            next;
        }

        push @ac_numbers, grep { !$seen{$_}++ } split ' ', $values;
    }

    # The loop ran off the end of the input: nothing left to read.
    $this->{DONE} = 1 unless $stopped;

    return \@ac_numbers;
}

# The method was originally defined under this misspelled name; keep it as an
# alias so that any caller using it continues to work.
*netxEntry = \&nextENTRY;
[download]

###### not finished ###### I am completely lost now. Could you give me an idea of how to solve this problem? I would appreciate any help.

In reply to some perl advice by malaguena

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&amp;`
	<		`&lt;`
	>		`&gt;`
	[		`&#91;`
	]		`&#93;`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.