UPDATE Slightly better formatting
This is a GUI designed to display gene sequences for different bacteria. The file format is thus:
<cluster start> CGSSp11BS70_00075
<genes start>
>CGSSp11BS70_00075
ATGCATCTTTATTTTATGAAATTTTTTACTATTTTTTCCATTTTACCACAGAAAAAGTTG
AAAAACAAAGAAGAATCATACTTTTTTGTAATTTATAATAACATGTTTTCAAACACATTA
ATTGACACGAAGTTAATATTATTATACAATTATCTTATGGAAGTCTCAGATTTATATAGT
ATCAGATTAAAACATATTCTAGCAAGGGAGATAGCA
>RefSpSangerINV104B_02430
ATGCATCTTTATTTTATGAAATTTTTTACTATTTTTTCCATTTTACCACAGAAAAAGTTG
AAAAACAAAGAAGAATCATACTTTTTTGTAATTTATAATAACATGTTTTCAAACACATTA
ATTGACACGAAGTTAATATTATTATACAATTATCTTATGGAAGTCTCAGATTTATATAGT
ATCAGATTAAAACATATTCTAGCAAGGGAGATAGCA
<genes stop>
<adjuncts start>
<adjuncts stop>
<strains start>
CGSSp11BS70 1
CGSSp14BS292 0
CGSSp14BS69 0
CGSSp18BS74 0
CGSSp19BS75 0
CGSSp23BS72 0
CGSSp3BS71 0
CGSSp6BS73 0
CGSSp9BS68 0
CGSSp9vBS293 0
CGSSpBS397 0
CGSSpBS455 0
CGSSpBS457 0
CGSSpBS458 0
RefSpD39 0
RefSpPAT6420135 0
RefSpR6 0
RefSpSanger23F 0
RefSpSangerINV104B 1
RefSpSangerINV200 0
RefSpSangerOXC141 0
RefSpTIGR4 0
RefSpTIGR6706B 0
gnl 1
<strains stop>
<cluster stop>
This is one record. The idea is a "cluster" is named by a representative gene name which belongs to a group of related genes. The lines which begin with:
>strain_gene.name
Are the sequence associated with that particular gene. My program should read through the file, when it is in the portion of the record which examines the representative sequence $isGene is set to true, and adds all lines which are DNA (of that sequence). When $isStrains is true that means its counting the number of total strains that belong to the cluster etc.
The hash (%cHash) which I return should be of the type:
clutster_name=> array[0] == genesequence
array1 == list
of all
strains
in cluster
Hrm. Ok, that's kinda it. Please let me know any parts which are not clear!
#!/usr/local/bin/perl
use Tk;
use Cwd;
use warnings;
use strict;
use Data::Dumper;
# Main Window
my $mw = new MainWindow;
my $lab = $mw -> Label( -text => " Gene Cluster View
+ Utility ",
-font => "ansi 12 bold")
-> grid( -row =>1,
-column =>1,
-columnspan =>3,
-sticky =>"nsew");
#Declare that there is a menu
my $mbar = $mw -> Menu();
$mw -> configure(-menu => $mbar);
#The Main Buttons
my $file = $mbar -> cascade( -label => "File",
-underline => 0,
-tearoff => 0);
my $others = $mbar -> cascade( -label => "Others",
-underline => 0,
-tearoff => 0);
my $help = $mbar -> cascade( -label => "Help",
-underline => 0,
-tearoff => 0);
## File Menu ##
$file -> command( -label => "New",
-underline => 0);
#-command => sub { $txt -> delete('1.0','end');
+} );
$file -> command( -label => "Load clusterReport",
-underline => 0,
-command => [\&open_file, "Open"]);
$file -> command( -label => "Save",
-underline => 0,
-command => [\&save_file, "Save"]);
$file -> separator();
$file -> command( -label => "Exit",
-underline => 1,
-command => sub { exit } );
##List Boxes for the Genes##
my $strain_label = $mw -> Label( -text => " Strain List
+ ",
-font => "ansi 12 bold"
+)
-> grid( -row =>2,
-column =>1 );
my $core_label = $mw -> Label( -text => " Core Gene
+s List ",
-font => "ansi 12 bold"
+)
-> grid( -row =>2,
-column =>2 );
my $distributed_label = $mw ->Label( -text => " Distributed
+Genes List ",
-font => "ansi 12 bold"
+)
-> grid( -row =>2,
-column =>3 );
my $strain_lst = $mw
->Scrolled( 'Listbox',
-scrollbars =>"oe, os")
-> grid( -row =>3,
-column =>1 );
my $core_lst = $mw ->Scrolled( 'Listbox',
-scrollbars =>"oe, os")
-> grid( -row =>3,
-column =>2 );
my $distributed_lst = $mw
->Scrolled( 'Listbox',
-scrollbars =>"oe, os")
-> grid( -row =>3,
-column =>3 );
my $gene_txt = $mw
->Scrolled( 'Text',
-scrollbars =>"oe, os")
-> grid( -row =>4,
-column =>1,
-columnspan =>3 );
MainLoop;
#################
###SUBROUTINES###
#################
sub open_file {
my $types;
my $open_file_loc = $mw->getOpenFile(-filetypes => $types);
print qq{You chose to open "$open_file_loc"\n} if $open_file_loc
+;
my ($nRef, $hRef) = &parse_file($open_file_loc);
&disp_contents($nRef, $hRef);
}
sub save_file {
my $types;
my $save = $mw->getSaveFile(-filetypes => $types,
-initialfile => 'test');
print qq{You chose to save as "$save"\n} if $save;
}
#Parse the clusterGene file takes the file location as input
sub parse_file{
my $cluster;
my $strain;
my $isStrainLst = 0;
my $iscluster = 0;
my $isGene = 0;
my %cHash;
my $file_loc=shift;
my $count=0;
my $numStrains=0;
chdir "/";
open (F, "$file_loc") or die "Cannot open file because:\n $!";
while (<F>){
chomp;
#Reset bools if we reach end of record
if ( $isGene && /^>/){
$isGene = 0;
}
if(/<strains stop>/){
$isStrainLst = 0;
$numStrains=$count;#annoying, this will happen a lot. try
+to think of a better way
}
#Check if we are in a gene, if so we need to add its sequence
if ($isGene){
if (exists $cHash{$cluster}){
$cHash{$cluster}[0]=$cHash{$cluster}[0].$_;
}
else{
$cHash{$cluster}[0]=$_;
#print "Cluster is ".$cluster." and gene is $_\n";
}
}
if ($isStrainLst){
/\s+([a-zA-Z0-9]+)\s+([0-1])/;
push @{$cHash{$cluster}[1]}, $1 unless($2==0);#make HoAoA
$count++ unless ($numStrains>0);#count to determine core/d
+ist
}
if(/(<cluster start> )([a-zA-Z0-9]+)(_[a-zA-Z0-9]+)($)/){
$cluster = $2.$3;
$strain = $2;
}
#This is the start of the representative sequence
if (/^>$cluster/){
$isGene = 1;
}
#Find strain list(to diff core and dist genes)
if (/<strains start>/){
$isStrainLst=1;
}
}
return ($numStrains, \%cHash);
}
sub disp_contents{
my ($numStrains, $hashref) = @_;
print $numStrains;
if (ref($hashref) ne "HASH"){die "Expecting a hash ref got". ref($
+hashref);}
print Dumper($hashref);
}
|