#!/usr/bin/perl -w
#
# makeGIList.pl
#
# Scans a FASTA file whose definition lines carry "/gi=<id>" and/or
# "/gb=<id>" fields, extracts the requested identifier(s) from every
# definition line, and appends them, one per line, to a text file.
#
# The file is written as a "modulino": the script body only runs when the
# file is executed directly, so getIDnumber() can be loaded with require()
# and exercised on its own.

package main;

require 5.004;
use strict;
use Getopt::Std;

########## CONSTANTS ###############
my $ID_LIMIT = 10000 ;    # NOTE(review): not referenced anywhere in this file

# Getopt::Std::getopts() stores each flag in the package variable $opt_<flag>.
use vars qw($opt_h $opt_v $opt_d $opt_i $opt_o $opt_g $opt_p) ;

# Run the program only when executed as a script, not when require()d.
exit run_script() unless caller ;

#################### Main Program ##############################

# usage()
#   Returns the help text printed for -h.
sub usage {
    return <<"QQ_HELP_QQ" ;

DESCRIPTION
    This program reads a fasta 'Genbank formatted file', retrieves the
    gi numbers, and writes the numbers to a text file

AVAILABILITY
    Requires perl 5.004 or higher

SYNOPSIS
    makeGIList.pl -h -i [path/fileToRead] -o [path/fileToWrite]

OPTIONS
    -h                      Print help message
    -v                      Verbose mode
    -i [path/fileToRead]    File to process
    -o [path/fileToWrite]   Destination (identifiers are appended)
    -g [gi|gb|both]         Make file with GI or Accession number or both,
                            DEFAULT = 'gb'
    -p [n|p]                nucleotide or protein, DEFAULT = 'n'

EXIT STATUS
    0           Successful completion
    non-zero    Fail

QQ_HELP_QQ
}

# run_script()
#   Parses the command line, opens the input and output files, and writes
#   every identifier found by getIDnumber() to the output file.
#   Returns 0 on success; dies with a message on any error.
sub run_script {
    getopts('i:o:g:p:hvd') ;

    $opt_h and die usage() ;

    # -d (undocumented): debug mode with hard-coded test paths.
    if ($opt_d) {
        $opt_v = 'v' ;
        $opt_i = "D:\\datasets\\ncbi\\UniGene\\Ta.seq.uniq" ;
        $opt_o = "D:\\datasets\\ncbi\\UniGene\\gbnumTest.id" ;
    }

    # Input path parameter
    $opt_i or die "an input file with its path is required" ;
    my $inputFile = $opt_i ;

    # output path and filename
    $opt_o or die "an ouput file with its path is required" ;
    my $outputFile = $opt_o ;

    # id to fetch: GI, Accession, or both.  Matching is case-insensitive and
    # an unrecognised value is an error instead of silently yielding an
    # undefined id (and therefore an empty run).
    my $id ;
    if ($opt_g) {
        $id = lc $opt_g ;
        ($id eq 'gi' or $id eq 'gb' or $id eq 'both')
            or die "invalid value '$opt_g' for -g: expected gi, gb or both\n" ;
    }
    else {
        $opt_g = 'gb' ;
        $id    = $opt_g ;
    }

    # type of file.  The default follows the help text ('n'); the value is
    # currently only echoed in debug output.
    my $sequenceType ;
    if ($opt_p) {
        $sequenceType = lc $opt_p ;
        ($sequenceType eq 'n' or $sequenceType eq 'p')
            or die "invalid value '$opt_p' for -p: expected n or p\n" ;
    }
    else {
        $opt_p        = 'n' ;
        $sequenceType = $opt_p ;
    }

    # Debug mode: print out variables mapped from input flags.
    if ($opt_d) {
        # $opt_h is never set here (-h exits above); avoid an undef warning.
        my $helpFlag = defined $opt_h ? $opt_h : '' ;
        print "\$opt_d: $opt_d DEBUG mode on\n" ;    # undocumented
        print "\$opt_h: $helpFlag\n" ;
        print "\$opt_v: $opt_v\n" ;
        print "\$opt_i: $opt_i ==> \$inputFile: $inputFile\n" ;
        print "\$opt_o: $opt_o ==> \$outputFile: $outputFile\n" ;
        print "\$opt_g: $opt_g ==> \$id: $id\n" ;
        print "\$opt_p: $opt_p ==> \$sequenceType: $sequenceType\n" ;
    }

    # Two-argument open and bareword handles are kept on purpose: the script
    # declares "require 5.004", which predates three-argument open.

    # open source file
    open(INPUT, "<$inputFile")
        or die "Can not open $inputFile: $!.\n" ;
    $opt_v and print "opened file $inputFile\n" ;

    # open destination file (append mode)
    open(OUTPUT, ">>$outputFile")
        or die "Can not write to $outputFile: $!.\n" ;
    $opt_v and print "opened file to write: $outputFile.\n" ;

    $opt_v and print "finding $id numbers\n" ;

    while (my $line = <INPUT>) {
        chomp($line) ;
        my $key = getIDnumber($line, $id) ;
        print OUTPUT "$key\n" if length $key ;
    }

    close(INPUT) ;
    # Buffered write errors only surface at close, so check it.
    close(OUTPUT) or die "Can not write to $outputFile: $!.\n" ;

    $opt_v and print "done\n" ;
    return 0 ;
}

#################### END of Main Program #######################

# getIDnumber($line, $id)
#   Extracts an identifier from one line of a FASTA file.
#
#   $line  one input line, already chomped
#   $id    'gi'   -> value of the "/gi=" field
#          'gb'   -> value of the "/gb=" field
#          'both' -> "<gi>\t<gb>" (a missing field is left empty)
#
#   Returns the identifier text, or '' when the line is not a definition
#   line (does not start with '>'), when the requested field is absent, or
#   when $id is not one of the values above.
sub getIDnumber {
    my ($line, $id) = @_ ;

    return '' unless defined $line and defined $id ;
    return '' unless $line =~ /^>/ ;    # only definition lines carry ids

    # A field value runs up to the next whitespace or the end of the line.
    my $gi = ($line =~ m{/gi=(\S+)}) ? $1 : '' ;
    my $gb = ($line =~ m{/gb=(\S+)}) ? $1 : '' ;

    return $gi if $id eq 'gi' ;
    return $gb if $id eq 'gb' ;

    if ($id eq 'both') {
        return '' unless length($gi) or length($gb) ;
        return "$gi\t$gb" ;
    }

    return '' ;
}

1;