#!/usr/bin/perl -w
#
# makeGIList.pl
#
# Reads a FASTA file whose definition lines carry "/gb=<accession>" and
# "/gi=<number>" tokens, extracts the requested identifier from every
# definition line, and appends the identifiers (one per line) to an output
# file.  The first line written is ">nucleotide" or ">protein", depending on
# the -p flag.
#
package main;
require 5.004;
use strict;
use Getopt::Std;
use IO::File;

########## CONSTANTS ###############
#my $ID_LIMIT = 10000 ;

# WRITE BUFFER: number of ids accumulated in memory before they are appended
# to the output file.
my $BUFFER_LIST = 2000;

########### VARIABLES ####################
# Command line flags.  Getopt::Std::getopts() stores each flag in the package
# variable $opt_<letter>, so these have to stay package variables.
use vars qw($opt_h $opt_v $opt_d $opt_i $opt_o $opt_g $opt_p);

# Run the program only when this file is executed as a script.  When the file
# is loaded with require/do, only the subroutines are defined.
main() unless caller;

#################### Main Program ##############################

# main()
#   Parses the command line, validates the flags, then streams the input file
#   and appends every id found to the output file in batches of $BUFFER_LIST.
#   Dies with a message on any usage or I/O error; exits with status 0 on
#   success.
sub main {
    getopts('i:o:g:p:hvd');

    $opt_h and die helpText();

    # DEBUG MODE (undocumented in help): forces verbose output and
    # hard-coded test paths.
    if ($opt_d) {
        $opt_v = 'v';
        $opt_i = "D:\\datasets\\ncbi\\UniGene\\Ta.seq.uniq";
        $opt_o = "D:\\datasets\\ncbi\\UniGene\\gbnumTest.id";
    }

    # Input and output paths are mandatory.
    $opt_i or die "an input file with its path is required";
    $opt_o or die "an output file with its path is required";
    my $inputFile  = $opt_i;
    my $outputFile = $opt_o;

    # id to fetch: GI, accession (gb, the default), or both.
    $opt_g = 'gb' unless $opt_g;
    my $id = lc $opt_g;
    ($id eq 'gi' or $id eq 'gb' or $id eq 'both')
        or die "unknown -g value '$opt_g': expected gi, gb or both\n";

    # getIDnumber() only knows how to extract 'gi' and 'gb'; say so instead
    # of silently producing an output file that holds nothing but the header.
    $id eq 'both'
        and warn "warning: -g both is accepted, but no id extraction is "
               . "implemented for it; no ids will be written\n";

    # Type of sequence file: nucleotide (default) or protein.
    $opt_p = 'n' unless $opt_p;
    my $sequenceType = lc $opt_p;
    ($sequenceType eq 'n' or $sequenceType eq 'p')
        or die "unknown -p value '$opt_p': expected n or p\n";

    # Debug mode: print the variables mapped from the input flags.
    if ($opt_d) {
        my $helpFlag = defined $opt_h ? $opt_h : '';   # avoid an undef warning
        print "\$opt_d: $opt_d DEBUG mode on\n";        # undocumented
        print "\$opt_h: $helpFlag\n";
        print "\$opt_v: $opt_v\n";
        print "\$opt_i: $opt_i ==> \$inputFile: $inputFile\n";
        print "\$opt_o: $opt_o ==> \$outputFile: $outputFile\n";
        print "\$opt_g: $opt_g ==> \$id: $id\n";
        print "\$opt_p: $opt_p\==> \$sequenceType: $sequenceType\n";
    }

    #################### END OF INPUT FLAGS ########################

    # Open source file.
    my $input = IO::File->new($inputFile, 'r')
        or die "Can not open $inputFile: $!.\n";
    $opt_v and print "opened file $inputFile\n";

    # Open destination file (append mode) and write its first line, which
    # states whether the ids are for 'n' nucleotides or 'p' proteins.
    my $output = IO::File->new($outputFile, 'a')
        or die "Can not write to $outputFile: $!.\n";
    $opt_v and print "opened file to write: $outputFile.\n";
    my $header = $sequenceType eq 'n' ? '>nucleotide' : '>protein';
    print $output "$header\n"
        or die "can not write to $outputFile: $!\n";
    $output->close or die "can not close $outputFile $!\n";
    $opt_v and print "closed file to write after writing first line\n\n";

    $opt_v and print "finding $id numbers\n";

    my $cntr_idNumber = 0;     # number of ids found so far
    my @outputBuffer  = ();    # ids waiting to be written

    # Read the input file line by line.
    while (defined(my $line = <$input>)) {
        chomp $line;

        my $number = getIDnumber($line, $id);
        next unless length $number;    # no id on this line

        $cntr_idNumber++;
        $opt_v and print "$cntr_idNumber) $number\n";
        push @outputBuffer, $number;

        # Buffered output: write every $BUFFER_LIST ids.
        if (@outputBuffer == $BUFFER_LIST) {
            flushBuffer($outputFile, \@outputBuffer, "$BUFFER_LIST ids");
        }
    }
    $input->close;

    # Flush whatever is left in the buffer.
    flushBuffer($outputFile, \@outputBuffer, 'remainder of ids');

    $opt_v and print "$cntr_idNumber records\n";
    $opt_v and print "done\n";
    exit 0;
}

#################### END of Main Program #######################

# flushBuffer($outputFile, \@buffer, $what)
#   Appends every element of @buffer to $outputFile, one per line, then
#   empties @buffer.  $what is a short description of the batch and is used
#   only in verbose and error messages.  Dies if the file cannot be opened,
#   written or closed.
sub flushBuffer {
    my ($outputFile, $buffer, $what) = @_;

    my $output = IO::File->new($outputFile, 'a')
        or die "Can not open $outputFile: $!.\n";
    $opt_v and print "opened file to write $what: $outputFile.\n";

    foreach my $item (@$buffer) {
        print $output "$item\n"
            or die "can not write to $outputFile: $!\n";
    }

    # Buffered write errors only surface at close, so check it.
    $output->close
        or die "can not close $outputFile after writing $what $!\n";
    $opt_v and print "Closed $outputFile after writing $what\n\n";

    @$buffer = ();    # reset buffer
    return;
}

# getIDnumber($line, $id)
#   $line  one line of the input file, already chomped
#   $id    which identifier to extract: 'gi' or 'gb'
#
#   Returns the value that follows "/gi=" or "/gb=" on a FASTA definition
#   line (a line starting with '>'), without the "/xx=" prefix and without
#   surrounding whitespace.  Returns the empty string for blank lines,
#   sequence lines, definition lines without the requested token, and for
#   any other $id (including 'both', which has no extraction rule here).
sub getIDnumber {
    my ($line, $id) = @_;

    return '' unless defined $line and defined $id;
    return '' unless $line =~ /^>/;              # not a definition line
    $opt_d and print "\$line: $line\n";

    return '' unless $id eq 'gi' or $id eq 'gb';

    # \S+ stops at the first whitespace, so the id is captured cleanly even
    # when it is the last token on the line.
    if ($line =~ m{/\Q$id\E=(\S+)}) {
        my $found = $1;
        $opt_d and print "/$id=$found: ";
        return $found;
    }

    return '';
}

# helpText()
#   Returns the usage message shown for -h.
sub helpText {
    return <<"QQ_HELP_QQ";

DESCRIPTION
    This program reads a fasta 'Genbank formatted file', retrieves the
    gb numbers by default, or gi numbers if specified and writes the
    numbers to a text file

AVAILABILITY
    Requires perl 5.004 or higher

SYNOPSIS
    makeGIList.pl -h -i [path/fileToRead] -o [path/fileToWrite]

OPTIONS
    -h                      Print help message
    -v                      Verbose mode
    -i [path/fileToRead]    File to process
    -o [path/fileToWrite]   Destination
    -g [gi|gb|both]         Make file with GI or Accession number or both
    -p [n|p]                nucleotide or protein, DEFAULT = 'n'

EXIT STATUS
    0       Successful completion
    non-0   Failure

QQ_HELP_QQ
}