#!/usr/bin/perl
use strict;
use warnings;
use diagnostics;
use HTML::TableExtract;
use FileHandle;

my $usage;
$usage = "Usage:\n";
$usage .= "get_mirbase_fromhtml.pl batchfile\n";
$usage .= "batch file line format: sample_label (tab) path_to_file\n";
our $batchfile = shift or die $usage;
our %paths;    #store tag=>file here
our %data;     #tag=>desired data for output;
our %mirnas
  ; #master list of mirnas, each file will have a list with counts in it, this one is just name=>1 to keep record

#read batch file
readbatch();

#execute parse on html files
do_parsing();

#do any transformations/replacements (blanks with zeroes, etc.)

#output data

exit 0;

sub readbatch {
	my $f = FileHandle->new( '<' . $batchfile );
	$f or die "File $f did not open!\n";
	my @lines = <$f>;
	chomp(@lines);
	my @splt;
	foreach (@lines) {
		@splt = split( /\t/, $_ );
		if ( $#splt + 1 != 2 ) { die "Bad Batch File Format!\n"; }
		$paths{ $splt[0] } = $splt[1];
	}
	undef $f;
}

sub do_parsing {
	foreach ( sort( keys(%paths) ) ) {
		print "Label: $_ File: $paths{$_}\n";
		$data{$_} = getMirb( $_,$paths{$_} );
	}
}

sub printTable {
	my $table = shift or die "No table given to printTable!\n";
	foreach (@$table) {
		foreach (@$_) {
			print $_, "\t";
		}
		print "\n";
	}
}

sub getMirb {
	my $label = shift or die "Must give label as first arg to getMirb\n";
	my $file  = shift or die "Must give filename as second arg to getMirb\n";
	my ( %Predmirb, %UndetMirb, %Combined );    #hashes of data items
	my ( @Predmirb_t, @UndetMirb_t );           #tables
	@Predmirb_t = getPredMirb($file);
#	printTable( \@Predmirb_t );
	convertPredmirb( \@Predmirb_t, \%Predmirb );
	@UndetMirb_t = getUndetMirb($file);
#	printTable( \@UndetMirb_t );
	convertUndetmirb( \@UndetMirb_t, \%UndetMirb );
	combinePred_Undet( \%Predmirb, \%UndetMirb, \%Combined );
	return %Combined;
}

sub getshortname {
	my $name = shift;
	my ( $last1, $last3 );
	my $short;
	$last1 = substr( $$_[1], -1 );
	$last3 = substr( $$_[1], -3 );
	if ( $last1 eq '*' ) {
		$short = substr( $$_[1], 0, -1 );

		#print $$_[1],"\t",$short,"\n";
	}
	elsif ( $last3 eq '-5p' || $last3 eq '-3p' ) {
		$short = substr( $$_[1], 0, -3 );

		#print $$_[1],"\t",$short,"\n";
	}
	else { $short = $$_[1]; }
#	print $short, "\n";
	return $short;
}

sub convertPredmirb {
	my ( $arr, $hash ) = @_;
	my $data;
	foreach (@$arr) {
		$data                 = miRNA_data->new();
		$data->{ID}           = $$_[0];
		$data->{mirbase_name} = $$_[1];
		$data->{total}        = $$_[2];
		$data->{mature}       = $$_[3];
		$data->{loop}         = $$_[4];
		$data->{star}         = $$_[5];
		$data->{short_name}   = getshortname( $data->{mirbase_name} );

		#store in format structure
		$hash->{ $data->{mirbase_name} . '|' . $data->{ID} } = $data;
	}
}

sub convertUndetmirb {
	my ( $arr, $hash ) = @_;
	my ( $star, $data );
	my ( $last1, $last3, $name );
	my %counts={};
	foreach (@$arr) {
		$last1 = substr( $$_[1], -1 );
		$last3 = substr( $$_[1], -3 );
		if ( $last1 eq '*' || $last3 eq '-3p' ) {

			#mirdeep thinks 3p is star
			#print "This is a star seq!\n";
			$star = 1;
		}
		elsif ( $last3 eq '-5p' || $last1 ne '*' ) {

			# normally would have to check for -3p but that was already done
			#mirdeep thinks 5p is mature
			#print "This is a mature seq!\n";
			$star = 0;
		}
		else {

			# should not be possible to get here
			print "Column 1: $$_[0]\tColumn 10: $$_[1]\n";
			die(
"Something is wrong!! Sequence found doesn't seem to be mature or star\n"
			);
		}
		$name = getshortname( $$_[1] );
		print "$$_[1] \t shortens to \t $name\n";
#determine if precursor is already marked in hash, if it is, get reference, if not, make new one and store it
		if ( !defined( $hash->{$name} ) ) {
			$data                 = miRNA_data->new();
			$data->{ID}           = $$_[0];
			$data->{mirbase_name} = $$_[1];
			$data->{total}        = $$_[2];
			$data->{loop}         = $$_[4];
			$data->{short_name}   = $name;

			if ($star) {
				$data->{star}   = $$_[3];
				$data->{mature} = '';
			}
			else {
				$data->{mature} = $$_[3];
				$data->{star}   = '';
			}
			$hash->{ $data->{short_name} } = $data;
			$counts{ $data->{short_name} } = 1;
		}
		else {
			++$counts{ $data->{short_name} };
			if ( $counts{ $data->{short_name} } > 2 ) {
				print $data->{mirbase_name}, "\n";
				print "Name used: $name\n";
				die "Same miRNA detected more than 2 times! (star & mature)\n";
			}
			if ( $counts{ $data->{short_name} } == 2 ) {print "Duplicate Detected! \t", $data->{mirbase_name},"\n";}

 #determine if line represents star or mature & store count in appropriate place
			if ( $star == 1 ) {
				$data->{star} = $$_[3];
			}
			elsif ( $star == 0 ) {
				$data->{mature} = $$_[3];
			}
		}
	}
}

sub combinePred_Undet {
	my ( $pred, $undet, $comb ) = @_;
	#TODO fill this out
	warn "warning:: combinePred_Undet not implemented yet!\n";
}

sub getPredMirb {
	my $file = shift or die "Must give filename as arg to getPredMirb\n";
	my $PredMirbParser = HTML::TableExtract->new(
		headers       => ['tag id'],
		slice_columns => 0,
		keep_headers  => 0
	);
	my @arr;
	$PredMirbParser->parse_file($file);
	for my $table ( $PredMirbParser->tables ) {
		for my $row ( $table->rows ) {
			push( @arr, parsePredMirbRow($row) );
		}
	}
	return @arr;
}

sub parsePredMirbRow {
	my $row  = shift;
	my @data = ();

	#ID
	push( @data, $$row[0] );

	#mirbase name
	push( @data, $$row[9] );

	#total
	push( @data, $$row[4] );

	#mature
	push( @data, $$row[5] );

	#loop
	push( @data, $$row[6] );

	#star
	push( @data, $$row[7] );

	#
	#	foreach (@$row) {
	#		if ( defined($_) ) {
	#			print $_, "\t";
	#		}
	#		else {
	#			print "_\t";
	#		}
	#	}
	#	print "\n";
	#	print join("\t",@data),"\n";
	return \@data;
}

sub getUndetMirb {
	my $file = shift or die "Must give filename as arg to getUndetMirb\n";
	my $UndetMirbParser = HTML::TableExtract->new(
		headers       => ['miRBase precursor id'],
		slice_columns => 0,
		keep_headers  => 0
	);
	my @arr;
	$UndetMirbParser->parse_file($file);
	for my $table ( $UndetMirbParser->tables ) {
		for my $row ( $table->rows ) {
			push( @arr, parseUndetPredMirbRow($row) );
		}
	}
	return @arr;
}

sub parseUndetPredMirbRow {
	my $row = shift;
	my @data;

	#ID
	push( @data, $$row[0] );

	#mirbase name
	push( @data, $$row[9] );

	#total
	push( @data, $$row[4] );

	#mature
	push( @data, $$row[5] );

	#loop
	push( @data, '' );

	#star
	push( @data, '' );

	#	foreach (@$row) {
	#		if ( defined($_) ) {
	#			print $_, "\t";
	#		}
	#		else {
	#			print "_\t";
	#		}
	#	}
	#	print "\n";
	#	print join("\t",@data),"\n";
	return \@data;
}

package miRNA_data;

sub new {
	my $class = shift;
	my $self  = {
		ID => '',  #tag ID / provisional ID/precursor name (for undetected seqs)
		mirbase_name => '',    #name of miRNA
		short_name   => '',    #mirbase name with */-3p/-5p removed
		total        => 0,
		mature       => 0,
		loop         => 0,
		star         => 0,
		duplicates   => 0      #bool will be 1 if duplicates are found

	};
	bless $self, $class;
	return $self;
}

package DataFormat;

sub new {
	my $class = shift;
	my $self  = {
		label    => shift,
		predmirb => {},    #will store miRNA_data entries from predicted mirbase set
		undetmirb => {},    #will store miRNA_data entries from undetected mirbase set
		mirb => {} #This will be the consolidation point keys constructed from mirbase_name|ID
	};
	bless( $self, $class );    #turns var into an object
}

package mirdeep_html;

sub new {
	my $class = shift;
	my $self  = {
		stats     => data_table->new(),
		novel     => data_table->new(),
		predmirb  => data_table->new(),
		undetmirb => data_table->new()
	};
	bless $self, $class;
	return $self;
}

package data_table;

sub new {
	my $class = shift;
	my $self  = {
		table => getarray(),
		col   => -1,
		row   => -1
	};
	bless $self, $class;
	return $self;
}

sub getarray {
	my @arr;
	return \@arr;
}

sub addcell {
	if ( $#_ + 1 != 2 ) {
		die "Wrong number of arguments to addcell! Need: 2, Given: " . @_
		  . " \n";
	}
	my $self = shift;
	my $item = shift;
	if ( !defined( ${ $self->{table} }[ $self->{row} ] ) ) {
		die "Row referenced by index $self->{row} not defined!\n";
	}

	push @{ $self->{table}[ $self->{row} ] }, $item;
	++$self->{col};
}

sub addrow {
	my $self = shift;
	my @row;
	push( @{ $self->{table} }, \@row );
	++$self->{row};
	$self->{col} = -1;
}