Thanks for that info, I somewhat suspected that already, but my main point was just to get it to work first. I'm pretty sure they're not responsible for the error, because it happened after the program is finished with the parser.

I'm not sure about that. For example, I have no idea which variable this is

%data->{$_} = $df; #store constructed data set
$data or %data? One is local, one is global.

I also can't get your program to produce any output, much less something with "star" or "mature" which would demonstrate your bug.

I've made a simple modification below to avoid using Switch when possible (no need to use Switch if perl comes with given/when/default)

#!/usr/bin/perl -- use strict; #~ $ perl -Mversion -le" print version->declare(q!v5.9.2!)->numify " use if $] > 5.009002 , feature => 'switch'; # given/when/default use if $] < 5.009003 , Switch => 'Perl6'; # given/when/default use HTML::Entities; use FileHandle; my $usage; $usage = "Usage:\n"; $usage .= "get_mirbase_fromhtml.pl batchfile\n"; $usage .= "batch file line format: sample_label (tab) path_to_file\n"; our $batchfile = shift or die $usage; our %paths; #store tag=>file here our %parsed_Files ; #tag=>result of parsing goes here; Contains 2Dim Arrays + of string our %data; #tag=>desired data for output; our %mirnas ; #master list of mirnas, each file will have a list with counts in +it, this one is just name=>1 to keep record #read batch file readbatch(); #execute parse on html files do_parsing(); #get desired data from parsed tables use_data(); use DDS; Dump( \%paths, \%parsed_Files, \%data, \%mirnas, ); #do any transformations/replacements (blanks with zeroes, etc.) #output data exit 0; sub use_data { my $file; my $data; my $df; my $i; my ( $predmirb_t, $undetmirb_t ); foreach ( sort( keys(%paths) ) ) { my $f=$_; $file = $parsed_Files{$_}; $df = DataFormat->new(); $predmirb_t = $file->{predmirb}->{table}; $undetmirb_t = $file->{undetmirb}->{table}; #print $_,"\n"; #predicted foreach ( $i = 0 ; $i <= $#$predmirb_t ; ++$i ) { if ($i) { $data = miRNA_data->new(); $data->{ID} = $$predmirb_t[$i][0]; $data->{total} = $$predmirb_t[$i][4]; $data->{mature} = $$predmirb_t[$i][5]; $data->{loop} = $$predmirb_t[$i][6]; $data->{star} = $$predmirb_t[$i][7]; $data->{mirbase_name} = $$predmirb_t[$i][9]; #store in format structure $df->{predmirb}->{ $data->{mirbase_name} . '|' . $data +->{ID} } = $data; } } #undetected my ($star); my ( $last1, $last3 ); my $name; my (%counts); foreach ( $i = 0 ; $i <= $#$undetmirb_t ; ++$i ) { if ($i) { $last1 = substr( $$undetmirb_t[$i][9], -1 ); $last3 = substr( $$undetmirb_t[$i][9], -3 ); if ( $last1 eq '*' || $last3 eq '-3p' ) { #mirdeep thinks 3p is star #print "This is a star seq!\n"; $star=1; } elsif ( $last3 eq '-5p' || $last1 ne '*' ) { # normally would have to check for -3p but that wa +s already done #mirdeep thinks 5p is mature #print "This is a mature seq!\n"; $star=0; } else { # should not be possible to get here print "Column 1: $$undetmirb_t[$i][0]\tColumn 10: +$$undetmirb_t[$i][9]\n"; die( "Something is wrong!! Sequence found doesn't +seem to be mature or star\n"); } if($last1 eq '*'){ $name = substr($$undetmirb_t[$i][9],0,-1); #print $$undetmirb_t[$i][9],"\t",$name,"\n"; } elsif($last3 eq '-5p'||$last3 eq '-3p'){ $name = substr($$undetmirb_t[$i][9],0,-3); #print $$undetmirb_t[$i][9],"\t",$name,"\n"; } else { $name =$$undetmirb_t[$i][9]; } print $name,"\n"; #determine if precursor is already marked in hash, if it is, get refer +ence, if not, make new one and store it # if (!defined($df->{undetmirb}->{$$undetmirb_t[$i][9]. + '|'. $$undetmirb_t[$i][0] }) ) if (!defined($df->{undetmirb}->{$name}) ) { $data = miRNA_data->new(); $data->{ID} = $$undetmirb_t[$i][0]; $data->{mirbase_name} = $$undetmirb_t[$i][9]; $data->{short_name} = $name; $data->{total} = $$undetmirb_t[$i][4]; $data->{loop} = ''; if($star){ $data->{star} = $$undetmirb_t[$i][5]; $data->{mature} = ''; } else { $data->{mature} = $$undetmirb_t[$i][5]; $data->{star} = ''; } # $df->{undetmirb} ->{ $data->{mirbase_name} . '|' +. $data->{ID} } = $data; $df->{undetmirb} ->{$name } = $data; $counts{$name}=0; } ++$counts{$name}; if($counts{$name}>2) { print $f,"\t",$$undetmirb_t[$i][0],"\n"; print "Name used: $name\n"; die "Same miRNA detected more than 2 times! (star +& mature)\n"; } if($counts{name}==2){ print "Duplicate Detected! \t",$f,"\t",$$undetmirb +_t[$i][0],"\n"; } #determine if line represents star or mature & store count in appropr +iate place if($star == 1){ $data->{star} = $$undetmirb_t[$i][5]; } elsif($star == 0) { $data->{mature} = $$undetmirb_t[$i][5]; } } } #undetected followup: Calc totals foreach(sort(keys(%{$df->{undetmirb}}))){ my $d=$df->{undetmirb}->{$_}; print $d->{mirbase_name}."\t".$d->{short_name}."\t".$d->{m +ature}."\t".$d->{star}."\t".$d->{total}."\n"; } #consolidate Hashes together $data{$_} = $df; #store constructed data set } return; } sub readbatch { my $f = FileHandle->new( '<' . $batchfile ); $f or die "File $f did not open!\n"; my @lines = <$f>; chomp(@lines); my @splt; foreach (@lines) { @splt = split( /\t/, $_ ); if ( $#splt + 1 != 2 ) { die "Bad Batch File Format!\n"; } $paths{ $splt[0] } = $splt[1]; } undef $f; } sub do_parsing { my $p; foreach ( sort( keys(%paths) ) ) { print "Label: $_ File: $paths{$_}\n"; $p = new MirDeepParse; $p->parse_file( $paths{$_} ); $parsed_Files{$_} = $p->getResult(); } } sub printTable { my $table = shift or die "No table given to printTable!\n"; foreach (@$table) { foreach (@$_) { print $_, "\t"; } print "\n"; } } package miRNA_data; sub new { my $class = shift; my $self = { ID => '', #tag ID / provisional ID/precursor nam +e (for undetected seqs) mirbase_name => '', #name of miRNA short_name=>'', #mirbase name with */-3p/-5p removed total => 0, mature => 0, loop => 0, star => 0, duplicates => 0 #bool will be 1 if duplicates are fou +nd }; bless $self, $class; return $self; } package DataFormat; sub new { my $class = shift; my $self = { label => shift, predmirb => {} , #will store miRNA_data entries from predicted mirbase set undetmirb => {} , #will store miRNA_data entries from undetected mirbase se +t mirb => {} #This will be the consolidation point keys constructed fr +om mirbase_name|ID }; bless( $self, $class ); #turns var into an object } # define the subclass package MirDeepParse; use base "HTML::Parser"; our $retval; our $row; our $col; our $cell_content; our $append_to_cell; our $inside_table; our $outside_row; our $outside_row_started; #arrays of data #state flags (for position in tables) #states: #'' -- initial state; Vars need initializing; #0.33--waiting for marker text; #0.67 -- found marker text for statistics table; waiting for table tag +s #1 -- in stat table #1.33 -- Table tags indicate end of table; waiting for marker text #1.67 -- found marker for novel; waiting for table tags; #2 -- in novel table #2.33 -- Table tags indicate end of table; waiting for marker text #2.67 -- found marker for predicted mirbase; waiting for table tags; #3 -- in predicted mirbase table #3.33 -- Table tags indicate end of table; waiting for marker text #3.67 -- found marker for undetected mirbase; waiting for table tags; #4 -- in undetected mirbase table #4.33 -- Table tags indicate end of table; waiting for EOF #column id depends on table: # stat table columns: #title row: (blank), novel miRNAs[3], known miRBase miRNAs[2], (bla +nk), blank #rest: miRDeep2 score, predicted by miRDeep2, estimated false posit +ives, estimated true positives, in species, in data, detected by miRD +eep2, estimated signal-to-noise, excision gearing # novel & predicted mirbase tables columns # provisional id, miRDeep2 score, estimated probability that the mi +RNA candidate is a true positive, rfam alert, total read count, matur +e read count, loop read count, star read count, significant randfold +p-value, miRBase miRNA, example miRBase miRNA with the same seed, UCS +C browser, NCBI blastn, consensus mature sequence, consensus star seq +uence, consensus precursor sequence # undetected table columns # miRBase precursor id, -, -, -, total read count, mature read coun +t(s), -, star read count, remaining reads, -, -, UCSC browser, NCBI b +lastn, miRBase mature sequence(s), miRBase star sequence(s), miRBase +precursor sequence sub new { my $class = shift; my $self = $class->SUPER::new(@_); $self->reset_vars(@_); #our @stats; #our @novel; #our @predicted; #our @undetected; return $self; } sub getResult { my $self = shift; my $t = $retval; undef $retval; return $t; } sub reset_vars { my $self = shift; $self->{state} = 0.33; $retval = mirdeep_html->new(); $row = 0; $col = 0; $cell_content = ''; $append_to_cell = 0; $outside_row = 1; $outside_row_started = 0; $inside_table = 0; } sub getState { my $self = shift; return $self->{state}; } sub text { my ( $self, $text ) = @_; # just print out the original text given ( $self->{state} ) { #print "Current State: $state\n"; #print "$text\n"; when (0) { die("Parsing Attempted before Initializing vars\n") +; } when (0.33) { if ( $text eq "Survey of miRDeep2 performance for score cut-offs -10 + to 10" ) { #print "STAT TABLE MARKER FOUND!\n"; $self->{state} = 0.67; return; } } when (0.67) { } when (1) { $self->checktext( $retval->{stats}, $text ); } when (1.33) { if ( $text eq "novel miRNAs predicted by miRDeep2" ) { #print "NOVEL TABLE MARKER FOUND!\n"; $self->{state} = 1.67; return; } } when (1.67) { } when (2) { $self->checktext( $retval->{novel}, $text ); } when (2.33) { if ( $text eq "miRBase miRNAs in dataset" ) { #print "PREDICTED MIRBASE TABLE MARKER FOUND!\n"; $self->{state} = 2.67; return; } } when (2.67) { } when (3) { $self->checktext( $retval->{predmirb}, $text ); +} when (3.33) { if ( $text eq "miRBase miRNAs not detected by miRDeep2" ) +{ #print "UNDETECTED MIRBASE TABLE MARKER FOUND!\n"; $self->{state} = 3.67; return; } } when (3.67) { } when (4) { $self->checktext( $retval->{undetmirb}, $text ); + } when (4.33) { } default { die "Unrecognized state encountered!\n"; } } #print $text; } sub checktext { my $self = shift; my $curtable = shift; my $text = shift; if ($append_to_cell) { $cell_content .= $text; #print $text; } } sub comment { my ( $self, $comment ) = @_; #Nothing to do, html comments not important here } sub start { my ( $self, $tag, $attr, $attrseq, $origtext ) = @_; # print out original text #print $origtext; given ( $self->{state} ) { when (0) { die("Parsing Attempted before Initializing vars\ +n"); } when (0.33) { } when (0.67) { if ( $tag eq 'table' ) { $self->{state} = 1; $inside_table = 1; return; } } when (1) { $self->checktagStart( $retval->{stats}, $tag ); } when (1.33) { } when (1.67) { if ( $tag eq 'table' ) { $self->{state} = 2; $inside_table = 1; return; } } when (2) { $self->checktagStart( $retval->{novel}, $tag ); } when (2.33) { } when (2.67) { if ( $tag eq 'table' ) { $self->{state} = 3; $inside_table = 1; return; } } when (3) { $self->checktagStart( $retval->{predmirb}, $tag ); +} when (3.33) { } when (3.67) { if ( $tag eq 'table' ) { $self->{state} = 4; $inside_table = 1; return; } } when (4) { $self->checktagStart( $retval->{undetmirb}, $tag ); + } when (4.33) { } default { die "Unrecognized state encountered!\n"; } } } sub end { my ( $self, $tag, $origtext ) = @_; # print out original text #print $origtext; given ( $self->{state} ) { when (0) { die("Parsing Attempted before Initializing vars\ +n"); } when (0.33) { } when (0.67) { } when (1) { $self->checktagEnd( $retval->{stats}, 1.33, $tag + ); } when (1.33) { } when (1.67) { } when (2) { $self->checktagEnd( $retval->{novel}, 2.33, $tag + ); } when (2.33) { } when (2.67) { } when (3) { $self->checktagEnd( $retval->{predmirb}, 3.33, $ +tag ); } when (3.33) { } when (3.67) { } when (4) { $self->checktagEnd( $retval->{undetmirb}, 4.33, +$tag ); } when (4.33) { } default { die "Unrecognized state encountered!\n"; } } } sub checktagStart { my $self = shift; my $curtable = shift; my $tag = shift; if ( $tag eq 'tr' ) { $outside_row = 0; $outside_row_started = 0; #print "<tr>\n"; ++$row; $curtable->addrow(); #print STDERR "-TR TAG STARTED-\n"; } elsif ( $inside_table && ( $tag eq 'td' || $tag eq 'th' ) ) { if ( $outside_row && !$outside_row_started ) { $outside_row_started = 1; ++$row; $curtable->addrow(); } if ($append_to_cell) { $cell_content = decode($cell_content); #print "inside a table?: $inside_table\n"; #print "outside row?: $outside_row\n"; #print "outside row started?: $outside_row_started;\n"; $curtable->addcell($cell_content); $cell_content = getstring(); } #print "\t<td>"; $append_to_cell = 1; ++$col; #print STDERR "-TD TAG STARTED-\n"; } elsif ( $tag eq 'span' ) { $append_to_cell = 0; } } sub checktagEnd { my $self = shift; my $curtable = shift; my $nextstate = shift; my $tag = shift; if ( $tag eq "table" ) { #print "TABLE TAG END!\n"; $self->{state} = $nextstate; $inside_table = 0; $outside_row_started = 0; #print "State Changed to $nextstate \n"; return; } elsif ( $tag eq 'tr' ) { $outside_row = 1; #print "</tr>\n"; #print STDERR "-TR TAG ENDED-\n"; } elsif ( $inside_table && ( $tag eq 'td' || $tag eq 'th' ) ) { #print "</td>\n"; $append_to_cell = 0; $cell_content = decode($cell_content); #print "inside a table?: $inside_table\n"; #print "outside row?: $outside_row\n"; #print "outside row started?: $outside_row_started;\n"; $curtable->addcell($cell_content); #print STDERR $cell_content,"\n"; $cell_content = getstring(); #print STDERR $cell_content,"\n"; #print STDERR "-TD TAG ENDED-\n"; } elsif ( $tag eq 'span' ) { $append_to_cell = 1; } } sub getstring { my $str = ''; return $str; } sub decode { my $str = shift; $str =~ s/&#([0123456789]+)/getstrequiv($1)/eg; $str = HTML::Entities::decode_entities($str); return $str; } sub getstrequiv { my $num = shift; given ($num) { when (177) { return '+/-'; #since the single char is non-ascii, better to +replace it } when (160) { return '&nbsp'; #it'll be handled by decode_entities } default { die "unrecognized code: $num!"; } } } package mirdeep_html; sub new { my $class = shift; my $self = { stats => data_table->new(), novel => data_table->new(), predmirb => data_table->new(), undetmirb => data_table->new() }; bless $self, $class; return $self; } package data_table; sub new { my $class = shift; my $self = { table => getarray(), col => -1, row => -1 }; bless $self, $class; return $self; } sub getarray { my @arr; return \@arr; } sub addcell { if ( $#_ + 1 != 2 ) { die "Wrong number of arguments to addcell! Need: 2, Given: " . + @_ . " \n"; } my $self = shift; my $item = shift; if ( !defined( ${ $self->{table} }[ $self->{row} ] ) ) { die "Row referenced by index $self->{row} not defined!\n"; } push @{ $self->{table}[ $self->{row} ] }, $item; ++$self->{col}; } sub addrow { my $self = shift; my @row; push( @{ $self->{table} }, \@row ); ++$self->{row}; $self->{col} = -1; }
__END__ Label: sample_label File: result_19_11_2010_t_15_48_17.html $HASH1 = { "sample_label " => ' result_19_11_2010_t_15_48_17.html' }; $HASH2 = { "sample_label " => bless( { novel => bless( { col => -1, row => -1, table => [] }, 'data_table' ), predmirb => bless( { col => -1, row => -1, table => [] }, 'data_table' ), stats => bless( { col => -1, row => -1, table => [] }, 'data_table' ), undetmirb => bless( { col => -1, row => -1, table => [] }, 'data_table' ) }, 'mirdeep_html' ) }; $HASH3 = { "sample_label " => bless( { label => undef, mirb => {}, predmirb => {}, undetmirb => {} }, 'DataFormat' ) }; $HASH4 = {};
I suggest you take this modification, do some more cleanup, reduce the line count (html/code, divide and conquer) and ask again :)

In reply to Re^3: Having a problem with my code! by Anonymous Monk
in thread Having a problem with my code! by kotoroshinoto

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post, it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.