I have a Perl problem which is the following: I have created a flat MySQL database, consists of 2 tables and I need to write a Perl parser to load data from a Genbank file into the tables (http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord). I want the elements of the the variables to be in long lists in order to load them straight into the columns. Also, I need to remove gene duplicates as well. I firstly was thinking about this below:
#!/usr/bin/perl -w #use warnings; use strict; use DBI; use DBD::mysql; use BeginPerlBioinfo; # + + #use Test::More tests=> 15 use Exporter; OUR @ISA = qw(EXPORTER); OUR @EXPORT=qw(get_file_data get_accession get_gene get_origin get_tra +nslation); sub connect_to_db { my ($dbh, $sth, $...., $..., $..., $...); my $dbname = "1"; my $dbserver = ""; my $datasource = "dbi:mysql:database=$dbname;host = $dbserver"; my $username = ""; my $password = "p"; my $dbh= DBI->connect($datasource,$username, $password,{ PrintError=>0, #disable PrintError RaiseError=>1 #enable RaiseError }) || die "Error opening database: $DBI::errstr\n"; return ($dbh); } $sth = $dbh-> prepare("INSERT INTO Chrom_4_AC (AC_number, Gene_id) VALUES (? ,? )"); + # use placeholders #Declare and initialize variables my $genbank='chrom_CDS-4.txt'; my record=""; my $gene_id=""; my $AC_number=""; my $DNA_seq=""; my $start_stop_cs=""; my aa_seq=""; my $transl_start_site=""; my $protein_prod_names=""; my $chromosomal_location=""; + #Open the Genbank file #unless (open (GBFILE, $genbank)) { # print "CANNOT open Genbank file \n"; # exit; # # $/="//\n'; #set input separator and read in arecord to scalar #$record = <GBFILE> #reset input separator #$/=$save_input_separator; #$record = ~/^(LOCUS.**ORIGIN\s*\n)(.*)\/\/\n/s); #print $record; sub get_file_data { my ($filename) = @_; my @filedata =(); unless (open(GET_FILE_DATA, $filename)) { print STDERR "CANNOT open file\n"; exit; } @filedata = <GET_FILE_DATA>; close GET_FILE_DATA; return @filedata; } sub get_accession { my $genbank = shift; if ($genbank = ~/ACCESSION\s*(\w+)/) { return $1; } else { return "error"; } } + + sub get_gene { my $genbank= shift; if($genbank=/gene="(.*?)"/s) { return $1; } else { return "error"; } } sub get_origin { my $seq; chop $seq; if($genbank =~/ORIGIN|s*(.*)\/\//s) { $seq = $1; } else { return "error"; } $seq =~s/[\s\d]//g; return uc($seq); } sub get_translation { my $genbank=shift; my $pro; if($genbank=~/translation="(.*?)"/s) { $pro = $1; } else { return "error"; } $pro=~s/[\s]//g; return uc($pro); }
##### not filished, neither the connection with the database is closed##### ] But I do not know if it is a good way to solve such problems. Then, I thought to use hashes., where key is a kind of counter and value each element I want extract.
#!/usr/bin/perl -w use strict; use Data::Dumper; my $fastaSuffix =".fasta"; @ARGV =qw(-) unless @ARGV; foreach my $filename (@ARGV) { local *FILE; open FILE, "<$filename" or die "Couldn't open '$filename': $!"; my $genbank = new GBlite(\*FILE); while (my $entry= $genbank->nextENTRY) { my $ac_number=$entry->AC_number; my $gene_id=$entry->gene_id; my $DNA_seq=$entry->DNA_seq; my $start_stop_cs=$entry->start_stop_cs; my $aa_seq=$entry->aa_seq; my $transl_start_site=$entry->transl_start_site; my $protein_prod_names=$entry->protein_prod_names; my $chromosomal_location=$entry->chromosomal_location; my $fasta = $ac_number . $fastaSuffix; local *FASTA; warn "[working '$fasta']\n"; open FASTA, ">$fasta" or die "Couldn't open '$fasta': $!"; print FASTA ">$ac_number\n", ">$gene_id\n", ">$DNA_seq\n", ">$start_st +op_cs\n", ">$aa_seq\n", ">$transl_start_site\n", ">$protein_prod_nam +es\n", ">$chromosomal_location\n"; package GBlite; use strict; sub new { my ($class, $fh) = @_; if (ref $fh !~ /GLOB/) {die "GBlite error: new expects a GLOB refere +nce not $fh\n"} my $this =bless{}; $this-> {FH}=$fh; $this-> {LASTLINE}=""; $this-> {DONE}=0; return $this; } sub netxEntry { my ($this)=@_; $this->fastForward or return 0; my $FH = $this ->{FH}; #Theses are the fields that will be kept my ($ac_number, $gene_id, $DNA_seq, $start_stop_cs, $aa_seq, $transl_s +tart_site, $protein_prod_names, $chromosomal_location); #get ac_number which may be in several lines my %ac_number; while (<$FH>) { if (/^VERSION/) { $this->{LASTLINE}=$_; last; } else { $_= ~m/s*(\w+)/; my @ac_number=split; foreach my $ac_number(@ac_number) { $ac_number{$ac_number}++; } } } $ac_number=[keys %ac_number]; }
######not finished############## I am completely lost now.. Could you give me an idea how to solve this problem.. I appreciate any help.

In reply to some perl advice by malaguena

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post, it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.