in reply to Re^2: Having a problem with my code!
in thread Having a problem with my code!
I'm not sure about that. For example, I have no idea which variable this is
$data or %data? One is local, one is global.%data->{$_} = $df; #store constructed data set
I also can't get your program to produce any output, much less something with "star" or "mature" which would demonstrate your bug.
I've made a simple modification below to avoid using Switch when possible (no need to use Switch if perl comes with given/when/default)
#!/usr/bin/perl -- use strict; #~ $ perl -Mversion -le" print version->declare(q!v5.9.2!)->numify " use if $] > 5.009002 , feature => 'switch'; # given/when/default use if $] < 5.009003 , Switch => 'Perl6'; # given/when/default use HTML::Entities; use FileHandle; my $usage; $usage = "Usage:\n"; $usage .= "get_mirbase_fromhtml.pl batchfile\n"; $usage .= "batch file line format: sample_label (tab) path_to_file\n"; our $batchfile = shift or die $usage; our %paths; #store tag=>file here our %parsed_Files ; #tag=>result of parsing goes here; Contains 2Dim Arrays + of string our %data; #tag=>desired data for output; our %mirnas ; #master list of mirnas, each file will have a list with counts in +it, this one is just name=>1 to keep record #read batch file readbatch(); #execute parse on html files do_parsing(); #get desired data from parsed tables use_data(); use DDS; Dump( \%paths, \%parsed_Files, \%data, \%mirnas, ); #do any transformations/replacements (blanks with zeroes, etc.) #output data exit 0; sub use_data { my $file; my $data; my $df; my $i; my ( $predmirb_t, $undetmirb_t ); foreach ( sort( keys(%paths) ) ) { my $f=$_; $file = $parsed_Files{$_}; $df = DataFormat->new(); $predmirb_t = $file->{predmirb}->{table}; $undetmirb_t = $file->{undetmirb}->{table}; #print $_,"\n"; #predicted foreach ( $i = 0 ; $i <= $#$predmirb_t ; ++$i ) { if ($i) { $data = miRNA_data->new(); $data->{ID} = $$predmirb_t[$i][0]; $data->{total} = $$predmirb_t[$i][4]; $data->{mature} = $$predmirb_t[$i][5]; $data->{loop} = $$predmirb_t[$i][6]; $data->{star} = $$predmirb_t[$i][7]; $data->{mirbase_name} = $$predmirb_t[$i][9]; #store in format structure $df->{predmirb}->{ $data->{mirbase_name} . '|' . $data +->{ID} } = $data; } } #undetected my ($star); my ( $last1, $last3 ); my $name; my (%counts); foreach ( $i = 0 ; $i <= $#$undetmirb_t ; ++$i ) { if ($i) { $last1 = substr( $$undetmirb_t[$i][9], -1 ); $last3 = substr( $$undetmirb_t[$i][9], -3 ); if ( $last1 eq '*' || $last3 eq '-3p' ) { #mirdeep thinks 3p is star #print "This is a star seq!\n"; $star=1; } elsif ( $last3 eq '-5p' || $last1 ne '*' ) { # normally would have to check for -3p but that wa +s already done #mirdeep thinks 5p is mature #print "This is a mature seq!\n"; $star=0; } else { # should not be possible to get here print "Column 1: $$undetmirb_t[$i][0]\tColumn 10: +$$undetmirb_t[$i][9]\n"; die( "Something is wrong!! Sequence found doesn't +seem to be mature or star\n"); } if($last1 eq '*'){ $name = substr($$undetmirb_t[$i][9],0,-1); #print $$undetmirb_t[$i][9],"\t",$name,"\n"; } elsif($last3 eq '-5p'||$last3 eq '-3p'){ $name = substr($$undetmirb_t[$i][9],0,-3); #print $$undetmirb_t[$i][9],"\t",$name,"\n"; } else { $name =$$undetmirb_t[$i][9]; } print $name,"\n"; #determine if precursor is already marked in hash, if it is, get refer +ence, if not, make new one and store it # if (!defined($df->{undetmirb}->{$$undetmirb_t[$i][9]. + '|'. $$undetmirb_t[$i][0] }) ) if (!defined($df->{undetmirb}->{$name}) ) { $data = miRNA_data->new(); $data->{ID} = $$undetmirb_t[$i][0]; $data->{mirbase_name} = $$undetmirb_t[$i][9]; $data->{short_name} = $name; $data->{total} = $$undetmirb_t[$i][4]; $data->{loop} = ''; if($star){ $data->{star} = $$undetmirb_t[$i][5]; $data->{mature} = ''; } else { $data->{mature} = $$undetmirb_t[$i][5]; $data->{star} = ''; } # $df->{undetmirb} ->{ $data->{mirbase_name} . '|' +. $data->{ID} } = $data; $df->{undetmirb} ->{$name } = $data; $counts{$name}=0; } ++$counts{$name}; if($counts{$name}>2) { print $f,"\t",$$undetmirb_t[$i][0],"\n"; print "Name used: $name\n"; die "Same miRNA detected more than 2 times! (star +& mature)\n"; } if($counts{name}==2){ print "Duplicate Detected! \t",$f,"\t",$$undetmirb +_t[$i][0],"\n"; } #determine if line represents star or mature & store count in appropr +iate place if($star == 1){ $data->{star} = $$undetmirb_t[$i][5]; } elsif($star == 0) { $data->{mature} = $$undetmirb_t[$i][5]; } } } #undetected followup: Calc totals foreach(sort(keys(%{$df->{undetmirb}}))){ my $d=$df->{undetmirb}->{$_}; print $d->{mirbase_name}."\t".$d->{short_name}."\t".$d->{m +ature}."\t".$d->{star}."\t".$d->{total}."\n"; } #consolidate Hashes together $data{$_} = $df; #store constructed data set } return; } sub readbatch { my $f = FileHandle->new( '<' . $batchfile ); $f or die "File $f did not open!\n"; my @lines = <$f>; chomp(@lines); my @splt; foreach (@lines) { @splt = split( /\t/, $_ ); if ( $#splt + 1 != 2 ) { die "Bad Batch File Format!\n"; } $paths{ $splt[0] } = $splt[1]; } undef $f; } sub do_parsing { my $p; foreach ( sort( keys(%paths) ) ) { print "Label: $_ File: $paths{$_}\n"; $p = new MirDeepParse; $p->parse_file( $paths{$_} ); $parsed_Files{$_} = $p->getResult(); } } sub printTable { my $table = shift or die "No table given to printTable!\n"; foreach (@$table) { foreach (@$_) { print $_, "\t"; } print "\n"; } } package miRNA_data; sub new { my $class = shift; my $self = { ID => '', #tag ID / provisional ID/precursor nam +e (for undetected seqs) mirbase_name => '', #name of miRNA short_name=>'', #mirbase name with */-3p/-5p removed total => 0, mature => 0, loop => 0, star => 0, duplicates => 0 #bool will be 1 if duplicates are fou +nd }; bless $self, $class; return $self; } package DataFormat; sub new { my $class = shift; my $self = { label => shift, predmirb => {} , #will store miRNA_data entries from predicted mirbase set undetmirb => {} , #will store miRNA_data entries from undetected mirbase se +t mirb => {} #This will be the consolidation point keys constructed fr +om mirbase_name|ID }; bless( $self, $class ); #turns var into an object } # define the subclass package MirDeepParse; use base "HTML::Parser"; our $retval; our $row; our $col; our $cell_content; our $append_to_cell; our $inside_table; our $outside_row; our $outside_row_started; #arrays of data #state flags (for position in tables) #states: #'' -- initial state; Vars need initializing; #0.33--waiting for marker text; #0.67 -- found marker text for statistics table; waiting for table tag +s #1 -- in stat table #1.33 -- Table tags indicate end of table; waiting for marker text #1.67 -- found marker for novel; waiting for table tags; #2 -- in novel table #2.33 -- Table tags indicate end of table; waiting for marker text #2.67 -- found marker for predicted mirbase; waiting for table tags; #3 -- in predicted mirbase table #3.33 -- Table tags indicate end of table; waiting for marker text #3.67 -- found marker for undetected mirbase; waiting for table tags; #4 -- in undetected mirbase table #4.33 -- Table tags indicate end of table; waiting for EOF #column id depends on table: # stat table columns: #title row: (blank), novel miRNAs[3], known miRBase miRNAs[2], (bla +nk), blank #rest: miRDeep2 score, predicted by miRDeep2, estimated false posit +ives, estimated true positives, in species, in data, detected by miRD +eep2, estimated signal-to-noise, excision gearing # novel & predicted mirbase tables columns # provisional id, miRDeep2 score, estimated probability that the mi +RNA candidate is a true positive, rfam alert, total read count, matur +e read count, loop read count, star read count, significant randfold +p-value, miRBase miRNA, example miRBase miRNA with the same seed, UCS +C browser, NCBI blastn, consensus mature sequence, consensus star seq +uence, consensus precursor sequence # undetected table columns # miRBase precursor id, -, -, -, total read count, mature read coun +t(s), -, star read count, remaining reads, -, -, UCSC browser, NCBI b +lastn, miRBase mature sequence(s), miRBase star sequence(s), miRBase +precursor sequence sub new { my $class = shift; my $self = $class->SUPER::new(@_); $self->reset_vars(@_); #our @stats; #our @novel; #our @predicted; #our @undetected; return $self; } sub getResult { my $self = shift; my $t = $retval; undef $retval; return $t; } sub reset_vars { my $self = shift; $self->{state} = 0.33; $retval = mirdeep_html->new(); $row = 0; $col = 0; $cell_content = ''; $append_to_cell = 0; $outside_row = 1; $outside_row_started = 0; $inside_table = 0; } sub getState { my $self = shift; return $self->{state}; } sub text { my ( $self, $text ) = @_; # just print out the original text given ( $self->{state} ) { #print "Current State: $state\n"; #print "$text\n"; when (0) { die("Parsing Attempted before Initializing vars\n") +; } when (0.33) { if ( $text eq "Survey of miRDeep2 performance for score cut-offs -10 + to 10" ) { #print "STAT TABLE MARKER FOUND!\n"; $self->{state} = 0.67; return; } } when (0.67) { } when (1) { $self->checktext( $retval->{stats}, $text ); } when (1.33) { if ( $text eq "novel miRNAs predicted by miRDeep2" ) { #print "NOVEL TABLE MARKER FOUND!\n"; $self->{state} = 1.67; return; } } when (1.67) { } when (2) { $self->checktext( $retval->{novel}, $text ); } when (2.33) { if ( $text eq "miRBase miRNAs in dataset" ) { #print "PREDICTED MIRBASE TABLE MARKER FOUND!\n"; $self->{state} = 2.67; return; } } when (2.67) { } when (3) { $self->checktext( $retval->{predmirb}, $text ); +} when (3.33) { if ( $text eq "miRBase miRNAs not detected by miRDeep2" ) +{ #print "UNDETECTED MIRBASE TABLE MARKER FOUND!\n"; $self->{state} = 3.67; return; } } when (3.67) { } when (4) { $self->checktext( $retval->{undetmirb}, $text ); + } when (4.33) { } default { die "Unrecognized state encountered!\n"; } } #print $text; } sub checktext { my $self = shift; my $curtable = shift; my $text = shift; if ($append_to_cell) { $cell_content .= $text; #print $text; } } sub comment { my ( $self, $comment ) = @_; #Nothing to do, html comments not important here } sub start { my ( $self, $tag, $attr, $attrseq, $origtext ) = @_; # print out original text #print $origtext; given ( $self->{state} ) { when (0) { die("Parsing Attempted before Initializing vars\ +n"); } when (0.33) { } when (0.67) { if ( $tag eq 'table' ) { $self->{state} = 1; $inside_table = 1; return; } } when (1) { $self->checktagStart( $retval->{stats}, $tag ); } when (1.33) { } when (1.67) { if ( $tag eq 'table' ) { $self->{state} = 2; $inside_table = 1; return; } } when (2) { $self->checktagStart( $retval->{novel}, $tag ); } when (2.33) { } when (2.67) { if ( $tag eq 'table' ) { $self->{state} = 3; $inside_table = 1; return; } } when (3) { $self->checktagStart( $retval->{predmirb}, $tag ); +} when (3.33) { } when (3.67) { if ( $tag eq 'table' ) { $self->{state} = 4; $inside_table = 1; return; } } when (4) { $self->checktagStart( $retval->{undetmirb}, $tag ); + } when (4.33) { } default { die "Unrecognized state encountered!\n"; } } } sub end { my ( $self, $tag, $origtext ) = @_; # print out original text #print $origtext; given ( $self->{state} ) { when (0) { die("Parsing Attempted before Initializing vars\ +n"); } when (0.33) { } when (0.67) { } when (1) { $self->checktagEnd( $retval->{stats}, 1.33, $tag + ); } when (1.33) { } when (1.67) { } when (2) { $self->checktagEnd( $retval->{novel}, 2.33, $tag + ); } when (2.33) { } when (2.67) { } when (3) { $self->checktagEnd( $retval->{predmirb}, 3.33, $ +tag ); } when (3.33) { } when (3.67) { } when (4) { $self->checktagEnd( $retval->{undetmirb}, 4.33, +$tag ); } when (4.33) { } default { die "Unrecognized state encountered!\n"; } } } sub checktagStart { my $self = shift; my $curtable = shift; my $tag = shift; if ( $tag eq 'tr' ) { $outside_row = 0; $outside_row_started = 0; #print "<tr>\n"; ++$row; $curtable->addrow(); #print STDERR "-TR TAG STARTED-\n"; } elsif ( $inside_table && ( $tag eq 'td' || $tag eq 'th' ) ) { if ( $outside_row && !$outside_row_started ) { $outside_row_started = 1; ++$row; $curtable->addrow(); } if ($append_to_cell) { $cell_content = decode($cell_content); #print "inside a table?: $inside_table\n"; #print "outside row?: $outside_row\n"; #print "outside row started?: $outside_row_started;\n"; $curtable->addcell($cell_content); $cell_content = getstring(); } #print "\t<td>"; $append_to_cell = 1; ++$col; #print STDERR "-TD TAG STARTED-\n"; } elsif ( $tag eq 'span' ) { $append_to_cell = 0; } } sub checktagEnd { my $self = shift; my $curtable = shift; my $nextstate = shift; my $tag = shift; if ( $tag eq "table" ) { #print "TABLE TAG END!\n"; $self->{state} = $nextstate; $inside_table = 0; $outside_row_started = 0; #print "State Changed to $nextstate \n"; return; } elsif ( $tag eq 'tr' ) { $outside_row = 1; #print "</tr>\n"; #print STDERR "-TR TAG ENDED-\n"; } elsif ( $inside_table && ( $tag eq 'td' || $tag eq 'th' ) ) { #print "</td>\n"; $append_to_cell = 0; $cell_content = decode($cell_content); #print "inside a table?: $inside_table\n"; #print "outside row?: $outside_row\n"; #print "outside row started?: $outside_row_started;\n"; $curtable->addcell($cell_content); #print STDERR $cell_content,"\n"; $cell_content = getstring(); #print STDERR $cell_content,"\n"; #print STDERR "-TD TAG ENDED-\n"; } elsif ( $tag eq 'span' ) { $append_to_cell = 1; } } sub getstring { my $str = ''; return $str; } sub decode { my $str = shift; $str =~ s/&#([0123456789]+)/getstrequiv($1)/eg; $str = HTML::Entities::decode_entities($str); return $str; } sub getstrequiv { my $num = shift; given ($num) { when (177) { return '+/-'; #since the single char is non-ascii, better to +replace it } when (160) { return ' '; #it'll be handled by decode_entities } default { die "unrecognized code: $num!"; } } } package mirdeep_html; sub new { my $class = shift; my $self = { stats => data_table->new(), novel => data_table->new(), predmirb => data_table->new(), undetmirb => data_table->new() }; bless $self, $class; return $self; } package data_table; sub new { my $class = shift; my $self = { table => getarray(), col => -1, row => -1 }; bless $self, $class; return $self; } sub getarray { my @arr; return \@arr; } sub addcell { if ( $#_ + 1 != 2 ) { die "Wrong number of arguments to addcell! Need: 2, Given: " . + @_ . " \n"; } my $self = shift; my $item = shift; if ( !defined( ${ $self->{table} }[ $self->{row} ] ) ) { die "Row referenced by index $self->{row} not defined!\n"; } push @{ $self->{table}[ $self->{row} ] }, $item; ++$self->{col}; } sub addrow { my $self = shift; my @row; push( @{ $self->{table} }, \@row ); ++$self->{row}; $self->{col} = -1; }
__END__ Label: sample_label File: result_19_11_2010_t_15_48_17.html $HASH1 = { "sample_label " => ' result_19_11_2010_t_15_48_17.html' }; $HASH2 = { "sample_label " => bless( { novel => bless( { col => -1, row => -1, table => [] }, 'data_table' ), predmirb => bless( { col => -1, row => -1, table => [] }, 'data_table' ), stats => bless( { col => -1, row => -1, table => [] }, 'data_table' ), undetmirb => bless( { col => -1, row => -1, table => [] }, 'data_table' ) }, 'mirdeep_html' ) }; $HASH3 = { "sample_label " => bless( { label => undef, mirb => {}, predmirb => {}, undetmirb => {} }, 'DataFormat' ) }; $HASH4 = {};
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^4: Having a problem with my code!
by kotoroshinoto (Initiate) on Dec 22, 2010 at 22:04 UTC |