#! /usr/bin/perl -w
# Script to Create Database files
#
# For every Entrez Gene dump listed in @inputfiles (read from the $input
# directory) this script:
#   * writes "<name>.txt" next to the input, containing a fixed tab-separated
#     header line followed by the data rows of the input, and
#   * appends a section to "myentrezgenefilesummary.txt" giving, per column,
#     the length of the longest value seen.  Those lengths are meant to help
#     choose varchar sizes for the SQL tables.
#
# The first line of an input file is treated as a header row and is replaced
# by the fixed header below, except for the files named in
# %has_no_header_row, whose first line is already data.
use strict;
use warnings;

# Directory holding the input files; outputs are written here as well.
my $input = "C:\\Elliott\\Database\\Repository\\";

# Files to process, in order.
my @inputfiles = (
    "gene2accession", "gene2go", "gene2sts", "gene2unigene",
    "gene2pubmed", "gene2refseq", "gene_history", "gene_info",
    "gene_refseq_uniprotkb_collab", "generifs_basic",
    "hiv_interactions", "interactions",
);

# gene2accession and gene2refseq share the same column layout.
my $accession_header = join "\t", qw(
    Taxon GeneID Status
    RNA_Nucleotide_Accession RNA_Nucleotide_gi
    Protein_Accession Protein_gi
    Genomic_Nucleotide_Accession Genomic_Nucleotide_gi
    Genomic_Accession_Start_Pos Genomic_Accession_End_Pos
    Orientation Assembly
);

# Output header (tab-separated column names) for each input file.
# NOTE(review): a few names look misspelled (UnigeneUD,
# InteractorProdictAccession); they are kept exactly as they were because
# they are emitted output that other tooling may already depend on.
my %header_for = (
    gene2accession => $accession_header,
    gene2refseq    => $accession_header,
    gene2go        => join("\t", qw(
        Taxon GeneID GO_ID Evidence Qualifier GO_term PubMedID Category
    )),
    gene2pubmed    => join("\t", qw(Taxon GeneID PubMedID)),
    gene2sts       => join("\t", qw(GeneID UniSTSID)),
    gene2unigene   => join("\t", qw(GeneID UnigeneUD)),
    gene_history   => join("\t", qw(
        Taxon GeneID Discontinued_GeneID Discontinued_Symbol
    )),
    gene_info      => join("\t", qw(
        Taxon GeneID Symbol LocusTag Synonyms dbXrefs Chromosome
        Map_Location Description Type_Of_Gene
        Symbol_From_Nomenclature_Authority
        Full_Name_From_Nomenclature_Authority
        Nomenclature_Status Other_Designations Modification_Date
    )),
    gene_refseq_uniprotkb_collab => join("\t", qw(
        NCBI_Protein_Accession UniProtKB_Protein_Accession
    )),
    generifs_basic => join("\t", qw(
        Taxon GeneID PubMedID LastUpdate GeneRIFText
    )),
    hiv_interactions => join("\t", qw(
        Taxon GeneID ProductAccession ProductName InteractionShortName
        InteractorTaxon InteractorGeneID InteractorProdictAccession
        InteractorProductName PubMedID LastUpdate GeneRIFText
    )),
    interactions   => join("\t", qw(
        Taxon GeneID ProteinAccession GeneName KeyPhrase InteractorTaxon
        InteractorGeneID InteractionType InteractorProductAccession
        InteractorProductName ComplexID ComplexIDType ComplexName PubMedID
        LastUpdate GeneRIFText InteractionID InteractionIDType
    )),
);

# Files whose first line is data rather than a header row.
my %has_no_header_row = map { $_ => 1 } qw(generifs_basic hiv_interactions);

my $summary = $input . "myentrezgenefilesummary.txt";
open(my $summary_fh, '>', $summary) or die "Cannot open $summary: $!";
print {$summary_fh} "This file lists the processed files their field and maximum field size\n";
print {$summary_fh} "This data can be used to determine the varchar field sizes in the novel therapies SQL database\n";

foreach my $name (@inputfiles) {
    process_file($name, $summary_fh);
}

# Buffered write errors only surface at close, so check it.
close($summary_fh) or die "Cannot close $summary: $!";

# process_file($name, $summary_fh)
#
# Converts one input file "$input$name" into "$input$name.txt" and appends
# that file's header and per-column maximum field lengths to $summary_fh.
# Dies if $name has no entry in %header_for, or if a file cannot be opened
# or a written file cannot be closed.
sub process_file {
    my ($name, $summary_fh) = @_;

    my $header = $header_for{$name};
    if (!defined $header) {
        # Fail loudly (non-zero exit, STDERR) and before touching any file,
        # instead of exiting with status 0 half-way through.
        die "Header line for this input file not definaed please contact system administator.\n";
    }

    my $infile = $input . $name;
    my $output = $input . $name . ".txt";

    open(my $in_fh,  '<', $infile) or die "Cannot open $infile: $!";
    open(my $out_fh, '>', $output) or die "Cannot open $output: $!";
    print "FILE ", $infile, " OPEN", "\n";

    my @max_length;       # longest value seen so far, per column index
    my $line_number = 0;

    while (my $line = <$in_fh>) {
        chomp $line;
        $line_number++;

        if ($line_number == 1) {
            # The header is emitted only once a first line has been read,
            # so an empty input yields an empty output file.
            print {$out_fh} "$header\n";
            print {$summary_fh} "Field lengths for file $name\n";
            print {$summary_fh} "$header\n";

            # For most files the first line is the source's own header row
            # and is dropped; for the others it is data and falls through.
            next unless $has_no_header_row{$name};
        }

        print {$out_fh} "$line\n";

        # split drops trailing empty fields, so rows may differ in length;
        # columns not seen before start from the current value.
        my @fields = split /\t/, $line;
        for my $i (0 .. $#fields) {
            my $len = length $fields[$i];
            if (!defined $max_length[$i] || $max_length[$i] < $len) {
                $max_length[$i] = $len;
            }
        }
    }

    # One tab-terminated length per column, then a blank separator line.
    print {$summary_fh} $_, "\t" for @max_length;
    print {$summary_fh} "\n\n";

    print "FINSHED PROCESSING ", $infile, "\n";

    close($in_fh);
    close($out_fh) or die "Cannot close $output: $!";
    return;
}

# is_writable_fh($fh)
#
# Debugging aid: returns the result of printing an empty string to $fh,
# i.e. true when the handle accepts output.  $\ is localised to '' so the
# probe itself writes nothing.
sub is_writable_fh {
    my ($fh) = @_;
    local $\ = '';
    return print {$fh} '';
}