#!/usr/bin/perl use strict; use DBI; use vars qw(%ODBC_type $db_connect_string $db_connect_user $db_connect +_pass); use vars qw($anal_distinct_values_limit $anal_dribble_rows $anal_thres +hold_percentile); #----------------------------------------------------------- # Site-specific tailoring: # # Configure the following for your relational database # my $host = $ENV{ORACLE_HOST} || `hostname`; chomp $host; $db_connect_string = "dbi:Oracle:host=$host;sid=$ENV{ORACLE_SID}"; $db_connect_user = $ENV{PMS_LOGIN}; $db_connect_pass = $ENV{PMS_PASSWD}; %ODBC_type = ( 3 => 'Numeric', 12 => 'Character', ); # # The values below affect the output of the analysis. $anal_distinct_values_limit = 100; # Display everything for fewer that + this number of values $anal_dribble_rows = 5; # Show isolated values if this accou +nts for < threshold $anal_threshold_percentile = 1; # Display everything for freqs > th +is percentile # For a column which has fewer than $anal_distinct_values_limit values +, all values and # counts will be shown. If there are more than this number of values, +all values that have # a count of more than $anal_threshold_percentile as a percent of the +number of rows, are # output with their counts. # # Also, the tail of the distribution - all values with fewer than $ana +l_dribble_rows counts # are output - unless there are too many of them. # # This script also detects and reports unique values $| = 1; print "Connecting to database...\n"; my $dbh = DBI->connect($db_connect_string,$db_connect_user,$db_connect +_pass, {RaiseError=>1,AutoCommit=>1}); if (!@ARGV) { print <<END; Usage: perl dbanal.pl schema table [table...] Script creates files of the form table.anal in the current working dir +ectory END exit 1; } my $schema = shift @ARGV; foreach my $table (@ARGV) { my $outf = $table.'.anal'; open OUT,">$outf" or die "Failed to create $outf: $!\n"; print "\nTable $table "; # No rows query to get the column details my $sth = $dbh->prepare("select * from $schema.$table where 0 = 1" +); $sth->execute; my @columns = @{$sth->{NAME}}; my @types = @{$sth->{TYPE}}; # Now get the row count my $rows = $dbh->selectall_arrayref("select count(*) from $schema. +$table")->[0][0]; # Iterate around columns COL: foreach my $col (@columns) { print "."; # Get column's data type my $type = shift @types; my $typename = $ODBC_type{$type} || "Unknown data type $type"; print OUT "\n$col($typename)\n"; # Now get the distribution for the column my $dist = $dbh->selectall_arrayref ("select $col,count(*) from $schema.$table group by $col") +; # Make it ranked: # Sort by count(*) descending, then value ascending. my @dist_ranked = sort {$b->[1] <=> $a->[1] || (($type == 3) ? ($a->[0] <=> $b->[0]) : ($a->[0] cmp $b->[ +0]))} @$dist; # Put quotes round values if it's not a numeric @dist_ranked = map {$_->[0] = "'$_->[0]'";$_} @dist_ranked if $type != 3; if (@dist_ranked < $anal_distinct_values_limit) { # Easy case, few distinct values. &output_val($rows,@$_) foreach (@dist_ranked); } else { # Detect unique values if ($dist_ranked[0][1] == 1) { &output_val($rows, "Unique values"); } else { # Print top of distribution my $phrase = "Top"; my $accum_freq = 0; while ($dist_ranked[0][1] * 100 / $rows > $anal_threshold_percentile) { my $val = shift @dist_ranked; &output_val($rows ,@$val); $phrase = "Next lowest"; $accum_freq += $val->[1]; } # and print the point at which it tails off &output_val($rows, "$phrase freq", $dist_ranked[0] +[1]); # Now find if there is a tail worth printing out while ((($rows - $accum_freq) * 100 / $rows > $anal_threshold_percentile) || ($dist_ranked[0][1] > $anal_dribble_rows) || (@dist_ranked > $anal_distinct_values_limit)) { my $val = shift @dist_ranked; $accum_freq += $val->[1]; next COL if !@dist_ranked; redo if $dist_ranked[0][1] == $val->[1]; } print OUT " "x20,"Tail values\n" if @dist_ranked; foreach (@dist_ranked) { &output_val($rows ,@$_); } } } } close OUT; } print "\n"; $dbh->disconnect; # output_val - do the formatting here ( called from several places) sub output_val { my ($total,$value,$freq) = @_; printf OUT " %-30s %10d(%3.1f\%)\n",$value,$freq,100*$fr +eq/$total; }
Edited 2002-05-20 by Ovid
In reply to Generic database table analysis by rinceWind
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |