#!/usr/bin/perl

use strict;
use DBI;

use vars qw(%ODBC_type $db_connect_string $db_connect_user $db_connect_pass);
use vars qw($anal_distinct_values_limit $anal_dribble_rows $anal_threshold_percentile);

#-----------------------------------------------------------
# Site-specific tailoring:
#
# Configure the following for your relational database
#
my $host = $ENV{ORACLE_HOST} || `hostname`;
chomp $host;
$db_connect_string = "dbi:Oracle:host=$host;sid=$ENV{ORACLE_SID}";
$db_connect_user = $ENV{PMS_LOGIN};
$db_connect_pass = $ENV{PMS_PASSWD};

%ODBC_type = (
	3 => 'Numeric',
	12 => 'Character',
);
#
# The values below affect the output of the analysis.
$anal_distinct_values_limit = 100; # Display everything for fewer that this number of values
$anal_dribble_rows = 5;		   # Show isolated values if this accounts for < threshold
$anal_threshold_percentile = 1;    # Display everything for freqs > this percentile
# For a column which has fewer than $anal_distinct_values_limit values, all values and 
# counts will be shown. If there are more than this number of values, all values that have
# a count of more than $anal_threshold_percentile as a percent of the number of rows, are 
# output with their counts.
#
# Also, the tail of the distribution - all values with fewer than $anal_dribble_rows counts
# are output - unless there are too many of them.
#
# This script also detects and reports unique values

$| = 1;

print "Connecting to database...\n";
my $dbh = DBI->connect($db_connect_string,$db_connect_user,$db_connect_pass,
		{RaiseError=>1,AutoCommit=>1});

if (!@ARGV) {
	print <<END;

Usage: perl dbanal.pl schema table [table...]

Script creates files of the form table.anal in the current working directory

END

	exit 1;
}

my $schema = shift @ARGV;

foreach my $table (@ARGV) {
	my $outf = $table.'.anal';
	open OUT,">$outf" or die "Failed to create $outf: $!\n";

	print "\nTable $table ";

# No rows query to get the column details

	my $sth = $dbh->prepare("select * from $schema.$table where 0 = 1");
	$sth->execute;

	my @columns = @{$sth->{NAME}};
	my @types = @{$sth->{TYPE}};

# Now get the row count

	my $rows = $dbh->selectall_arrayref("select count(*) from $schema.$table")->[0][0];

# Iterate around columns

	COL:
	foreach my $col (@columns) {
		print ".";

# Get column's data type

		my $type = shift @types;
		my $typename = $ODBC_type{$type} || "Unknown data type $type";

		print OUT "\n$col($typename)\n";

# Now get the distribution for the column

		my $dist = $dbh->selectall_arrayref
			("select $col,count(*) from $schema.$table group by $col");
# Make it ranked:
# Sort by count(*) descending, then value ascending. 

		my @dist_ranked = sort {$b->[1] <=> $a->[1] ||
			(($type == 3) ? ($a->[0] <=> $b->[0]) : ($a->[0] cmp $b->[0]))} @$dist;

# Put quotes round values if it's not a numeric

		@dist_ranked = map {$_->[0] = "'$_->[0]'";$_} @dist_ranked
			if $type != 3;

		if (@dist_ranked < $anal_distinct_values_limit) {

# Easy case, few distinct values.
			&output_val($rows,@$_) foreach (@dist_ranked);
		} else {

# Detect unique values
			if ($dist_ranked[0][1] == 1) {
				&output_val($rows, "Unique values");
			} else {

# Print top of distribution
				my $phrase = "Top";
				my $accum_freq = 0;
				while ($dist_ranked[0][1] * 100 / $rows 
						> $anal_threshold_percentile) {
					my $val = shift @dist_ranked;
					&output_val($rows ,@$val);
					$phrase = "Next lowest";
					$accum_freq += $val->[1];
				}

# and print the point at which it tails off
			        &output_val($rows, "$phrase freq", $dist_ranked[0][1]);

# Now find if there is a tail worth printing out

				while ((($rows - $accum_freq) * 100 / $rows >
				    $anal_threshold_percentile) ||
				    ($dist_ranked[0][1] > $anal_dribble_rows) ||
				    (@dist_ranked > $anal_distinct_values_limit)) {

					my $val = shift @dist_ranked;
					$accum_freq += $val->[1];
					next COL if !@dist_ranked;
					redo if $dist_ranked[0][1] == $val->[1];
				}
				print OUT " "x20,"Tail values\n" if @dist_ranked;
				foreach (@dist_ranked) {
					&output_val($rows ,@$_);
				}
			}
		}
	}
	close OUT;
}
print "\n";
$dbh->disconnect;

# output_val - do the formatting here ( called from several places)

sub output_val {
	my ($total,$value,$freq) = @_;

	printf OUT "           %-30s %10d(%3.1f\%)\n",$value,$freq,100*$freq/$total;
}