##
#!/usr/bin/perl --
use strict;
#~ $ perl -Mversion -le" print version->declare(q!v5.9.2!)->numify "
use if $] > 5.009002 , feature => 'switch'; # given/when/default
use if $] < 5.009003 , Switch => 'Perl6'; # given/when/default
use HTML::Entities;
use FileHandle;
my $usage;
$usage = "Usage:\n";
$usage .= "get_mirbase_fromhtml.pl batchfile\n";
$usage .= "batch file line format: sample_label (tab) path_to_file\n";
our $batchfile = shift or die $usage;
our %paths; #store tag=>file here
our %parsed_Files
; #tag=>result of parsing goes here; Contains 2Dim Arrays of string
our %data; #tag=>desired data for output;
our %mirnas
; #master list of mirnas, each file will have a list with counts in it, this one is just name=>1 to keep record
#read batch file
readbatch();
#execute parse on html files
do_parsing();
#get desired data from parsed tables
use_data();
use DDS; Dump(
\%paths,
\%parsed_Files,
\%data,
\%mirnas,
);
#do any transformations/replacements (blanks with zeroes, etc.)
#output data
exit 0;
sub use_data {
my $file;
my $data;
my $df;
my $i;
my ( $predmirb_t, $undetmirb_t );
foreach ( sort( keys(%paths) ) ) {
my $f=$_;
$file = $parsed_Files{$_};
$df = DataFormat->new();
$predmirb_t = $file->{predmirb}->{table};
$undetmirb_t = $file->{undetmirb}->{table};
#print $_,"\n";
#predicted
foreach ( $i = 0 ; $i <= $#$predmirb_t ; ++$i ) {
if ($i) {
$data = miRNA_data->new();
$data->{ID} = $$predmirb_t[$i][0];
$data->{total} = $$predmirb_t[$i][4];
$data->{mature} = $$predmirb_t[$i][5];
$data->{loop} = $$predmirb_t[$i][6];
$data->{star} = $$predmirb_t[$i][7];
$data->{mirbase_name} = $$predmirb_t[$i][9];
#store in format structure
$df->{predmirb}->{ $data->{mirbase_name} . '|' . $data->{ID} } =
$data;
}
}
#undetected
my ($star);
my ( $last1, $last3 );
my $name;
my (%counts);
foreach ( $i = 0 ; $i <= $#$undetmirb_t ; ++$i ) {
if ($i) {
$last1 = substr( $$undetmirb_t[$i][9], -1 );
$last3 = substr( $$undetmirb_t[$i][9], -3 );
if ( $last1 eq '*' || $last3 eq '-3p' ) {
#mirdeep thinks 3p is star
#print "This is a star seq!\n";
$star=1;
}
elsif ( $last3 eq '-5p' || $last1 ne '*' )
{
# normally would have to check for -3p but that was already done
#mirdeep thinks 5p is mature
#print "This is a mature seq!\n";
$star=0;
}
else { # should not be possible to get here
print "Column 1: $$undetmirb_t[$i][0]\tColumn 10: $$undetmirb_t[$i][9]\n";
die( "Something is wrong!! Sequence found doesn't seem to be mature or star\n");
}
if($last1 eq '*'){
$name = substr($$undetmirb_t[$i][9],0,-1);
#print $$undetmirb_t[$i][9],"\t",$name,"\n";
} elsif($last3 eq '-5p'||$last3 eq '-3p'){
$name = substr($$undetmirb_t[$i][9],0,-3);
#print $$undetmirb_t[$i][9],"\t",$name,"\n";
} else {
$name =$$undetmirb_t[$i][9];
}
print $name,"\n";
#determine if precursor is already marked in hash, if it is, get reference, if not, make new one and store it
# if (!defined($df->{undetmirb}->{$$undetmirb_t[$i][9]. '|'. $$undetmirb_t[$i][0] }) )
if (!defined($df->{undetmirb}->{$name}) )
{
$data = miRNA_data->new();
$data->{ID} = $$undetmirb_t[$i][0];
$data->{mirbase_name} = $$undetmirb_t[$i][9];
$data->{short_name} = $name;
$data->{total} = $$undetmirb_t[$i][4];
$data->{loop} = '';
if($star){
$data->{star} = $$undetmirb_t[$i][5];
$data->{mature} = '';
} else {
$data->{mature} = $$undetmirb_t[$i][5];
$data->{star} = '';
}
# $df->{undetmirb} ->{ $data->{mirbase_name} . '|' . $data->{ID} } = $data;
$df->{undetmirb} ->{$name } = $data;
$counts{$name}=0;
}
++$counts{$name};
if($counts{$name}>2) {
print $f,"\t",$$undetmirb_t[$i][0],"\n";
print "Name used: $name\n";
die "Same miRNA detected more than 2 times! (star & mature)\n";
}
if($counts{name}==2){
print "Duplicate Detected! \t",$f,"\t",$$undetmirb_t[$i][0],"\n";
}
#determine if line represents star or mature & store count in appropriate place
if($star == 1){
$data->{star} = $$undetmirb_t[$i][5];
} elsif($star == 0) {
$data->{mature} = $$undetmirb_t[$i][5];
}
}
}
#undetected followup: Calc totals
foreach(sort(keys(%{$df->{undetmirb}}))){
my $d=$df->{undetmirb}->{$_};
print $d->{mirbase_name}."\t".$d->{short_name}."\t".$d->{mature}."\t".$d->{star}."\t".$d->{total}."\n";
}
#consolidate Hashes together
$data{$_} = $df; #store constructed data set
}
return;
}
sub readbatch {
my $f = FileHandle->new( '<' . $batchfile );
$f or die "File $f did not open!\n";
my @lines = <$f>;
chomp(@lines);
my @splt;
foreach (@lines) {
@splt = split( /\t/, $_ );
if ( $#splt + 1 != 2 ) { die "Bad Batch File Format!\n"; }
$paths{ $splt[0] } = $splt[1];
}
undef $f;
}
sub do_parsing {
my $p;
foreach ( sort( keys(%paths) ) ) {
print "Label: $_ File: $paths{$_}\n";
$p = new MirDeepParse;
$p->parse_file( $paths{$_} );
$parsed_Files{$_} = $p->getResult();
}
}
sub printTable {
my $table = shift or die "No table given to printTable!\n";
foreach (@$table) {
foreach (@$_) {
print $_, "\t";
}
print "\n";
}
}
package miRNA_data;
sub new {
my $class = shift;
my $self = {
ID => '', #tag ID / provisional ID/precursor name (for undetected seqs)
mirbase_name => '', #name of miRNA
short_name=>'', #mirbase name with */-3p/-5p removed
total => 0,
mature => 0,
loop => 0,
star => 0,
duplicates => 0 #bool will be 1 if duplicates are found
};
bless $self, $class;
return $self;
}
package DataFormat;
sub new {
my $class = shift;
my $self = {
label => shift,
predmirb => {}
, #will store miRNA_data entries from predicted mirbase set
undetmirb => {}
, #will store miRNA_data entries from undetected mirbase set
mirb =>
{} #This will be the consolidation point keys constructed from mirbase_name|ID
};
bless( $self, $class ); #turns var into an object
}
# define the subclass
package MirDeepParse;
use base "HTML::Parser";
our $retval;
our $row;
our $col;
our $cell_content;
our $append_to_cell;
our $inside_table;
our $outside_row;
our $outside_row_started;
#arrays of data
#state flags (for position in tables)
#states:
#'' -- initial state; Vars need initializing;
#0.33--waiting for marker text;
#0.67 -- found marker text for statistics table; waiting for table tags
#1 -- in stat table
#1.33 -- Table tags indicate end of table; waiting for marker text
#1.67 -- found marker for novel; waiting for table tags;
#2 -- in novel table
#2.33 -- Table tags indicate end of table; waiting for marker text
#2.67 -- found marker for predicted mirbase; waiting for table tags;
#3 -- in predicted mirbase table
#3.33 -- Table tags indicate end of table; waiting for marker text
#3.67 -- found marker for undetected mirbase; waiting for table tags;
#4 -- in undetected mirbase table
#4.33 -- Table tags indicate end of table; waiting for EOF
#column id depends on table:
# stat table columns:
#title row: (blank), novel miRNAs[3], known miRBase miRNAs[2], (blank), blank
#rest: miRDeep2 score, predicted by miRDeep2, estimated false positives, estimated true positives, in species, in data, detected by miRDeep2, estimated signal-to-noise, excision gearing
# novel & predicted mirbase tables columns
# provisional id, miRDeep2 score, estimated probability that the miRNA candidate is a true positive, rfam alert, total read count, mature read count, loop read count, star read count, significant randfold p-value, miRBase miRNA, example miRBase miRNA with the same seed, UCSC browser, NCBI blastn, consensus mature sequence, consensus star sequence, consensus precursor sequence
# undetected table columns
# miRBase precursor id, -, -, -, total read count, mature read count(s), -, star read count, remaining reads, -, -, UCSC browser, NCBI blastn, miRBase mature sequence(s), miRBase star sequence(s), miRBase precursor sequence
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
$self->reset_vars(@_);
#our @stats;
#our @novel;
#our @predicted;
#our @undetected;
return $self;
}
sub getResult {
my $self = shift;
my $t = $retval;
undef $retval;
return $t;
}
sub reset_vars {
my $self = shift;
$self->{state} = 0.33;
$retval = mirdeep_html->new();
$row = 0;
$col = 0;
$cell_content = '';
$append_to_cell = 0;
$outside_row = 1;
$outside_row_started = 0;
$inside_table = 0;
}
sub getState {
my $self = shift;
return $self->{state};
}
sub text {
my ( $self, $text ) = @_;
# just print out the original text
given ( $self->{state} ) {
#print "Current State: $state\n";
#print "$text\n";
when (0) { die("Parsing Attempted before Initializing vars\n"); }
when (0.33) {
if ( $text eq
"Survey of miRDeep2 performance for score cut-offs -10 to 10" )
{
#print "STAT TABLE MARKER FOUND!\n";
$self->{state} = 0.67;
return;
}
}
when (0.67) { }
when (1) { $self->checktext( $retval->{stats}, $text ); }
when (1.33) {
if ( $text eq "novel miRNAs predicted by miRDeep2" ) {
#print "NOVEL TABLE MARKER FOUND!\n";
$self->{state} = 1.67;
return;
}
}
when (1.67) { }
when (2) { $self->checktext( $retval->{novel}, $text ); }
when (2.33) {
if ( $text eq "miRBase miRNAs in dataset" ) {
#print "PREDICTED MIRBASE TABLE MARKER FOUND!\n";
$self->{state} = 2.67;
return;
}
}
when (2.67) { }
when (3) { $self->checktext( $retval->{predmirb}, $text ); }
when (3.33) {
if ( $text eq "miRBase miRNAs not detected by miRDeep2" ) {
#print "UNDETECTED MIRBASE TABLE MARKER FOUND!\n";
$self->{state} = 3.67;
return;
}
}
when (3.67) { }
when (4) { $self->checktext( $retval->{undetmirb}, $text ); }
when (4.33) { }
default { die "Unrecognized state encountered!\n"; }
}
#print $text;
}
sub checktext {
my $self = shift;
my $curtable = shift;
my $text = shift;
if ($append_to_cell) {
$cell_content .= $text;
#print $text;
}
}
sub comment {
my ( $self, $comment ) = @_;
#Nothing to do, html comments not important here
}
sub start {
my ( $self, $tag, $attr, $attrseq, $origtext ) = @_;
# print out original text
#print $origtext;
given ( $self->{state} ) {
when (0) { die("Parsing Attempted before Initializing vars\n"); }
when (0.33) { }
when (0.67) {
if ( $tag eq 'table' ) {
$self->{state} = 1;
$inside_table = 1;
return;
}
}
when (1) { $self->checktagStart( $retval->{stats}, $tag ); }
when (1.33) { }
when (1.67) {
if ( $tag eq 'table' ) {
$self->{state} = 2;
$inside_table = 1;
return;
}
}
when (2) { $self->checktagStart( $retval->{novel}, $tag ); }
when (2.33) { }
when (2.67) {
if ( $tag eq 'table' ) {
$self->{state} = 3;
$inside_table = 1;
return;
}
}
when (3) { $self->checktagStart( $retval->{predmirb}, $tag ); }
when (3.33) { }
when (3.67) {
if ( $tag eq 'table' ) {
$self->{state} = 4;
$inside_table = 1;
return;
}
}
when (4) { $self->checktagStart( $retval->{undetmirb}, $tag ); }
when (4.33) { }
default { die "Unrecognized state encountered!\n"; }
}
}
sub end {
my ( $self, $tag, $origtext ) = @_;
# print out original text
#print $origtext;
given ( $self->{state} ) {
when (0) { die("Parsing Attempted before Initializing vars\n"); }
when (0.33) { }
when (0.67) { }
when (1) { $self->checktagEnd( $retval->{stats}, 1.33, $tag ); }
when (1.33) { }
when (1.67) { }
when (2) { $self->checktagEnd( $retval->{novel}, 2.33, $tag ); }
when (2.33) { }
when (2.67) { }
when (3) { $self->checktagEnd( $retval->{predmirb}, 3.33, $tag ); }
when (3.33) { }
when (3.67) { }
when (4) { $self->checktagEnd( $retval->{undetmirb}, 4.33, $tag ); }
when (4.33) { }
default { die "Unrecognized state encountered!\n"; }
}
}
sub checktagStart {
my $self = shift;
my $curtable = shift;
my $tag = shift;
if ( $tag eq 'tr' ) {
$outside_row = 0;
$outside_row_started = 0;
#print "\n";
++$row;
$curtable->addrow();
#print STDERR "-TR TAG STARTED-\n";
}
elsif ( $inside_table && ( $tag eq 'td' || $tag eq 'th' ) ) {
if ( $outside_row && !$outside_row_started ) {
$outside_row_started = 1;
++$row;
$curtable->addrow();
}
if ($append_to_cell) {
$cell_content = decode($cell_content);
#print "inside a table?: $inside_table\n";
#print "outside row?: $outside_row\n";
#print "outside row started?: $outside_row_started;\n";
$curtable->addcell($cell_content);
$cell_content = getstring();
}
#print "\t";
$append_to_cell = 1;
++$col;
#print STDERR "-TD TAG STARTED-\n";
}
elsif ( $tag eq 'span' ) {
$append_to_cell = 0;
}
}
sub checktagEnd {
my $self = shift;
my $curtable = shift;
my $nextstate = shift;
my $tag = shift;
if ( $tag eq "table" ) {
#print "TABLE TAG END!\n";
$self->{state} = $nextstate;
$inside_table = 0;
$outside_row_started = 0;
#print "State Changed to $nextstate \n";
return;
}
elsif ( $tag eq 'tr' ) {
$outside_row = 1;
#print " \n";
#print STDERR "-TR TAG ENDED-\n";
}
elsif ( $inside_table && ( $tag eq 'td' || $tag eq 'th' ) ) {
#print "\n";
$append_to_cell = 0;
$cell_content = decode($cell_content);
#print "inside a table?: $inside_table\n";
#print "outside row?: $outside_row\n";
#print "outside row started?: $outside_row_started;\n";
$curtable->addcell($cell_content);
#print STDERR $cell_content,"\n";
$cell_content = getstring();
#print STDERR $cell_content,"\n";
#print STDERR "-TD TAG ENDED-\n";
}
elsif ( $tag eq 'span' ) {
$append_to_cell = 1;
}
}
sub getstring {
my $str = '';
return $str;
}
sub decode {
my $str = shift;
$str =~ s/([0123456789]+)/getstrequiv($1)/eg;
$str = HTML::Entities::decode_entities($str);
return $str;
}
sub getstrequiv {
my $num = shift;
given ($num) {
when (177) {
return
'+/-'; #since the single char is non-ascii, better to replace it
}
when (160) {
return ' '; #it'll be handled by decode_entities
}
default {
die "unrecognized code: $num!";
}
}
}
package mirdeep_html;
sub new {
my $class = shift;
my $self = {
stats => data_table->new(),
novel => data_table->new(),
predmirb => data_table->new(),
undetmirb => data_table->new()
};
bless $self, $class;
return $self;
}
package data_table;
sub new {
my $class = shift;
my $self = {
table => getarray(),
col => -1,
row => -1
};
bless $self, $class;
return $self;
}
sub getarray {
my @arr;
return \@arr;
}
sub addcell {
if ( $#_ + 1 != 2 ) {
die "Wrong number of arguments to addcell! Need: 2, Given: " . @_
. " \n";
}
my $self = shift;
my $item = shift;
if ( !defined( ${ $self->{table} }[ $self->{row} ] ) ) {
die "Row referenced by index $self->{row} not defined!\n";
}
push @{ $self->{table}[ $self->{row} ] }, $item;
++$self->{col};
}
sub addrow {
my $self = shift;
my @row;
push( @{ $self->{table} }, \@row );
++$self->{row};
$self->{col} = -1;
}
####
__END__
Label: sample_label File: result_19_11_2010_t_15_48_17.html
$HASH1 = { "sample_label " => ' result_19_11_2010_t_15_48_17.html' };
$HASH2 = { "sample_label " => bless( {
novel => bless( {
col => -1,
row => -1,
table => []
}, 'data_table' ),
predmirb => bless( {
col => -1,
row => -1,
table => []
}, 'data_table' ),
stats => bless( {
col => -1,
row => -1,
table => []
}, 'data_table' ),
undetmirb => bless( {
col => -1,
row => -1,
table => []
}, 'data_table' )
}, 'mirdeep_html' ) };
$HASH3 = { "sample_label " => bless( {
label => undef,
mirb => {},
predmirb => {},
undetmirb => {}
}, 'DataFormat' ) };
$HASH4 = {};