#!/usr/bin/perl use strict; use warnings; use diagnostics; use HTML::TableExtract; use FileHandle; my $usage; $usage = "Usage:\n"; $usage .= "get_mirbase_fromhtml.pl batchfile\n"; $usage .= "batch file line format: sample_label (tab) path_to_file\n"; our $batchfile = shift or die $usage; our %paths; #store tag=>file here our %data; #tag=>desired data for output; our %mirnas ; #master list of mirnas, each file will have a list with counts in it, this one is just name=>1 to keep record #read batch file readbatch(); #execute parse on html files do_parsing(); #do any transformations/replacements (blanks with zeroes, etc.) #output data exit 0; sub readbatch { my $f = FileHandle->new( '<' . $batchfile ); $f or die "File $f did not open!\n"; my @lines = <$f>; chomp(@lines); my @splt; foreach (@lines) { @splt = split( /\t/, $_ ); if ( $#splt + 1 != 2 ) { die "Bad Batch File Format!\n"; } $paths{ $splt[0] } = $splt[1]; } undef $f; } sub do_parsing { foreach ( sort( keys(%paths) ) ) { print "Label: $_ File: $paths{$_}\n"; $data{$_} = getMirb( $_,$paths{$_} ); } } sub printTable { my $table = shift or die "No table given to printTable!\n"; foreach (@$table) { foreach (@$_) { print $_, "\t"; } print "\n"; } } sub getMirb { my $label = shift or die "Must give label as first arg to getMirb\n"; my $file = shift or die "Must give filename as second arg to getMirb\n"; my ( %Predmirb, %UndetMirb, %Combined ); #hashes of data items my ( @Predmirb_t, @UndetMirb_t ); #tables @Predmirb_t = getPredMirb($file); # printTable( \@Predmirb_t ); convertPredmirb( \@Predmirb_t, \%Predmirb ); @UndetMirb_t = getUndetMirb($file); # printTable( \@UndetMirb_t ); convertUndetmirb( \@UndetMirb_t, \%UndetMirb ); combinePred_Undet( \%Predmirb, \%UndetMirb, \%Combined ); return %Combined; } sub getshortname { my $name = shift; my ( $last1, $last3 ); my $short; $last1 = substr( $$_[1], -1 ); $last3 = substr( $$_[1], -3 ); if ( $last1 eq '*' ) { $short = substr( $$_[1], 0, -1 ); #print $$_[1],"\t",$short,"\n"; } elsif ( $last3 eq '-5p' || $last3 eq '-3p' ) { $short = substr( $$_[1], 0, -3 ); #print $$_[1],"\t",$short,"\n"; } else { $short = $$_[1]; } # print $short, "\n"; return $short; } sub convertPredmirb { my ( $arr, $hash ) = @_; my $data; foreach (@$arr) { $data = miRNA_data->new(); $data->{ID} = $$_[0]; $data->{mirbase_name} = $$_[1]; $data->{total} = $$_[2]; $data->{mature} = $$_[3]; $data->{loop} = $$_[4]; $data->{star} = $$_[5]; $data->{short_name} = getshortname( $data->{mirbase_name} ); #store in format structure $hash->{ $data->{mirbase_name} . '|' . $data->{ID} } = $data; } } sub convertUndetmirb { my ( $arr, $hash ) = @_; my ( $star, $data ); my ( $last1, $last3, $name ); my %counts={}; foreach (@$arr) { $last1 = substr( $$_[1], -1 ); $last3 = substr( $$_[1], -3 ); if ( $last1 eq '*' || $last3 eq '-3p' ) { #mirdeep thinks 3p is star #print "This is a star seq!\n"; $star = 1; } elsif ( $last3 eq '-5p' || $last1 ne '*' ) { # normally would have to check for -3p but that was already done #mirdeep thinks 5p is mature #print "This is a mature seq!\n"; $star = 0; } else { # should not be possible to get here print "Column 1: $$_[0]\tColumn 10: $$_[1]\n"; die( "Something is wrong!! Sequence found doesn't seem to be mature or star\n" ); } $name = getshortname( $$_[1] ); print "$$_[1] \t shortens to \t $name\n"; #determine if precursor is already marked in hash, if it is, get reference, if not, make new one and store it if ( !defined( $hash->{$name} ) ) { $data = miRNA_data->new(); $data->{ID} = $$_[0]; $data->{mirbase_name} = $$_[1]; $data->{total} = $$_[2]; $data->{loop} = $$_[4]; $data->{short_name} = $name; if ($star) { $data->{star} = $$_[3]; $data->{mature} = ''; } else { $data->{mature} = $$_[3]; $data->{star} = ''; } $hash->{ $data->{short_name} } = $data; $counts{ $data->{short_name} } = 1; } else { ++$counts{ $data->{short_name} }; if ( $counts{ $data->{short_name} } > 2 ) { print $data->{mirbase_name}, "\n"; print "Name used: $name\n"; die "Same miRNA detected more than 2 times! (star & mature)\n"; } if ( $counts{ $data->{short_name} } == 2 ) {print "Duplicate Detected! \t", $data->{mirbase_name},"\n";} #determine if line represents star or mature & store count in appropriate place if ( $star == 1 ) { $data->{star} = $$_[3]; } elsif ( $star == 0 ) { $data->{mature} = $$_[3]; } } } } sub combinePred_Undet { my ( $pred, $undet, $comb ) = @_; #TODO fill this out warn "warning:: combinePred_Undet not implemented yet!\n"; } sub getPredMirb { my $file = shift or die "Must give filename as arg to getPredMirb\n"; my $PredMirbParser = HTML::TableExtract->new( headers => ['tag id'], slice_columns => 0, keep_headers => 0 ); my @arr; $PredMirbParser->parse_file($file); for my $table ( $PredMirbParser->tables ) { for my $row ( $table->rows ) { push( @arr, parsePredMirbRow($row) ); } } return @arr; } sub parsePredMirbRow { my $row = shift; my @data = (); #ID push( @data, $$row[0] ); #mirbase name push( @data, $$row[9] ); #total push( @data, $$row[4] ); #mature push( @data, $$row[5] ); #loop push( @data, $$row[6] ); #star push( @data, $$row[7] ); # # foreach (@$row) { # if ( defined($_) ) { # print $_, "\t"; # } # else { # print "_\t"; # } # } # print "\n"; # print join("\t",@data),"\n"; return \@data; } sub getUndetMirb { my $file = shift or die "Must give filename as arg to getUndetMirb\n"; my $UndetMirbParser = HTML::TableExtract->new( headers => ['miRBase precursor id'], slice_columns => 0, keep_headers => 0 ); my @arr; $UndetMirbParser->parse_file($file); for my $table ( $UndetMirbParser->tables ) { for my $row ( $table->rows ) { push( @arr, parseUndetPredMirbRow($row) ); } } return @arr; } sub parseUndetPredMirbRow { my $row = shift; my @data; #ID push( @data, $$row[0] ); #mirbase name push( @data, $$row[9] ); #total push( @data, $$row[4] ); #mature push( @data, $$row[5] ); #loop push( @data, '' ); #star push( @data, '' ); # foreach (@$row) { # if ( defined($_) ) { # print $_, "\t"; # } # else { # print "_\t"; # } # } # print "\n"; # print join("\t",@data),"\n"; return \@data; } package miRNA_data; sub new { my $class = shift; my $self = { ID => '', #tag ID / provisional ID/precursor name (for undetected seqs) mirbase_name => '', #name of miRNA short_name => '', #mirbase name with */-3p/-5p removed total => 0, mature => 0, loop => 0, star => 0, duplicates => 0 #bool will be 1 if duplicates are found }; bless $self, $class; return $self; } package DataFormat; sub new { my $class = shift; my $self = { label => shift, predmirb => {}, #will store miRNA_data entries from predicted mirbase set undetmirb => {}, #will store miRNA_data entries from undetected mirbase set mirb => {} #This will be the consolidation point keys constructed from mirbase_name|ID }; bless( $self, $class ); #turns var into an object } package mirdeep_html; sub new { my $class = shift; my $self = { stats => data_table->new(), novel => data_table->new(), predmirb => data_table->new(), undetmirb => data_table->new() }; bless $self, $class; return $self; } package data_table; sub new { my $class = shift; my $self = { table => getarray(), col => -1, row => -1 }; bless $self, $class; return $self; } sub getarray { my @arr; return \@arr; } sub addcell { if ( $#_ + 1 != 2 ) { die "Wrong number of arguments to addcell! Need: 2, Given: " . @_ . " \n"; } my $self = shift; my $item = shift; if ( !defined( ${ $self->{table} }[ $self->{row} ] ) ) { die "Row referenced by index $self->{row} not defined!\n"; } push @{ $self->{table}[ $self->{row} ] }, $item; ++$self->{col}; } sub addrow { my $self = shift; my @row; push( @{ $self->{table} }, \@row ); ++$self->{row}; $self->{col} = -1; }