#!/usr/bin/perl use strict; use DBI; use vars qw(%ODBC_type $db_connect_string $db_connect_user $db_connect_pass); use vars qw($anal_distinct_values_limit $anal_dribble_rows $anal_threshold_percentile); #----------------------------------------------------------- # Site-specific tailoring: # # Configure the following for your relational database # my $host = $ENV{ORACLE_HOST} || `hostname`; chomp $host; $db_connect_string = "dbi:Oracle:host=$host;sid=$ENV{ORACLE_SID}"; $db_connect_user = $ENV{PMS_LOGIN}; $db_connect_pass = $ENV{PMS_PASSWD}; %ODBC_type = ( 3 => 'Numeric', 12 => 'Character', ); # # The values below affect the output of the analysis. $anal_distinct_values_limit = 100; # Display everything for fewer that this number of values $anal_dribble_rows = 5; # Show isolated values if this accounts for < threshold $anal_threshold_percentile = 1; # Display everything for freqs > this percentile # For a column which has fewer than $anal_distinct_values_limit values, all values and # counts will be shown. If there are more than this number of values, all values that have # a count of more than $anal_threshold_percentile as a percent of the number of rows, are # output with their counts. # # Also, the tail of the distribution - all values with fewer than $anal_dribble_rows counts # are output - unless there are too many of them. # # This script also detects and reports unique values $| = 1; print "Connecting to database...\n"; my $dbh = DBI->connect($db_connect_string,$db_connect_user,$db_connect_pass, {RaiseError=>1,AutoCommit=>1}); if (!@ARGV) { print <$outf" or die "Failed to create $outf: $!\n"; print "\nTable $table "; # No rows query to get the column details my $sth = $dbh->prepare("select * from $schema.$table where 0 = 1"); $sth->execute; my @columns = @{$sth->{NAME}}; my @types = @{$sth->{TYPE}}; # Now get the row count my $rows = $dbh->selectall_arrayref("select count(*) from $schema.$table")->[0][0]; # Iterate around columns COL: foreach my $col (@columns) { print "."; # Get column's data type my $type = shift @types; my $typename = $ODBC_type{$type} || "Unknown data type $type"; print OUT "\n$col($typename)\n"; # Now get the distribution for the column my $dist = $dbh->selectall_arrayref ("select $col,count(*) from $schema.$table group by $col"); # Make it ranked: # Sort by count(*) descending, then value ascending. my @dist_ranked = sort {$b->[1] <=> $a->[1] || (($type == 3) ? ($a->[0] <=> $b->[0]) : ($a->[0] cmp $b->[0]))} @$dist; # Put quotes round values if it's not a numeric @dist_ranked = map {$_->[0] = "'$_->[0]'";$_} @dist_ranked if $type != 3; if (@dist_ranked < $anal_distinct_values_limit) { # Easy case, few distinct values. &output_val($rows,@$_) foreach (@dist_ranked); } else { # Detect unique values if ($dist_ranked[0][1] == 1) { &output_val($rows, "Unique values"); } else { # Print top of distribution my $phrase = "Top"; my $accum_freq = 0; while ($dist_ranked[0][1] * 100 / $rows > $anal_threshold_percentile) { my $val = shift @dist_ranked; &output_val($rows ,@$val); $phrase = "Next lowest"; $accum_freq += $val->[1]; } # and print the point at which it tails off &output_val($rows, "$phrase freq", $dist_ranked[0][1]); # Now find if there is a tail worth printing out while ((($rows - $accum_freq) * 100 / $rows > $anal_threshold_percentile) || ($dist_ranked[0][1] > $anal_dribble_rows) || (@dist_ranked > $anal_distinct_values_limit)) { my $val = shift @dist_ranked; $accum_freq += $val->[1]; next COL if !@dist_ranked; redo if $dist_ranked[0][1] == $val->[1]; } print OUT " "x20,"Tail values\n" if @dist_ranked; foreach (@dist_ranked) { &output_val($rows ,@$_); } } } } close OUT; } print "\n"; $dbh->disconnect; # output_val - do the formatting here ( called from several places) sub output_val { my ($total,$value,$freq) = @_; printf OUT " %-30s %10d(%3.1f\%)\n",$value,$freq,100*$freq/$total; }