Category: Database
Author/Contact Info rinceWind
Description: I have lost count of the number of times I have been asked "What's in the database" or "Can I see the data", by business analysts and users. If the DB table is small enough I usually give them an Excel spreadsheet with the whole table in.

This script caters for larger tables, which would take forever to produce - and not be what the user wanted anyway. What the code produces is an analysis of the field values occurring in every field - descending frequencies.

#!/usr/bin/perl

use strict;
use DBI;

use vars qw(%ODBC_type $db_connect_string $db_connect_user $db_connect
+_pass);
use vars qw($anal_distinct_values_limit $anal_dribble_rows $anal_thres
+hold_percentile);

#-----------------------------------------------------------
# Site-specific tailoring:
#
# Configure the following for your relational database
#
my $host = $ENV{ORACLE_HOST} || `hostname`;
chomp $host;
$db_connect_string = "dbi:Oracle:host=$host;sid=$ENV{ORACLE_SID}";
$db_connect_user = $ENV{PMS_LOGIN};
$db_connect_pass = $ENV{PMS_PASSWD};

%ODBC_type = (
    3 => 'Numeric',
    12 => 'Character',
);
#
# The values below affect the output of the analysis.
$anal_distinct_values_limit = 100; # Display everything for fewer that
+ this number of values
$anal_dribble_rows = 5;           # Show isolated values if this accou
+nts for < threshold
$anal_threshold_percentile = 1;    # Display everything for freqs > th
+is percentile
# For a column which has fewer than $anal_distinct_values_limit values
+, all values and 
# counts will be shown. If there are more than this number of values, 
+all values that have
# a count of more than $anal_threshold_percentile as a percent of the 
+number of rows, are 
# output with their counts.
#
# Also, the tail of the distribution - all values with fewer than $ana
+l_dribble_rows counts
# are output - unless there are too many of them.
#
# This script also detects and reports unique values

$| = 1;

print "Connecting to database...\n";
my $dbh = DBI->connect($db_connect_string,$db_connect_user,$db_connect
+_pass,
        {RaiseError=>1,AutoCommit=>1});

if (!@ARGV) {
    print <<END;

Usage: perl dbanal.pl schema table [table...]

Script creates files of the form table.anal in the current working dir
+ectory

END

    exit 1;
}

my $schema = shift @ARGV;

foreach my $table (@ARGV) {
    my $outf = $table.'.anal';
    open OUT,">$outf" or die "Failed to create $outf: $!\n";

    print "\nTable $table ";

# No rows query to get the column details

    my $sth = $dbh->prepare("select * from $schema.$table where 0 = 1"
+);
    $sth->execute;

    my @columns = @{$sth->{NAME}};
    my @types = @{$sth->{TYPE}};

# Now get the row count

    my $rows = $dbh->selectall_arrayref("select count(*) from $schema.
+$table")->[0][0];

# Iterate around columns

    COL:
    foreach my $col (@columns) {
        print ".";

# Get column's data type

        my $type = shift @types;
        my $typename = $ODBC_type{$type} || "Unknown data type $type";

        print OUT "\n$col($typename)\n";

# Now get the distribution for the column

        my $dist = $dbh->selectall_arrayref
            ("select $col,count(*) from $schema.$table group by $col")
+;
# Make it ranked:
# Sort by count(*) descending, then value ascending. 

        my @dist_ranked = sort {$b->[1] <=> $a->[1] ||
            (($type == 3) ? ($a->[0] <=> $b->[0]) : ($a->[0] cmp $b->[
+0]))} @$dist;

# Put quotes round values if it's not a numeric

        @dist_ranked = map {$_->[0] = "'$_->[0]'";$_} @dist_ranked
            if $type != 3;

        if (@dist_ranked < $anal_distinct_values_limit) {

# Easy case, few distinct values.
            &output_val($rows,@$_) foreach (@dist_ranked);
        } else {

# Detect unique values
            if ($dist_ranked[0][1] == 1) {
                &output_val($rows, "Unique values");
            } else {

# Print top of distribution
                my $phrase = "Top";
                my $accum_freq = 0;
                while ($dist_ranked[0][1] * 100 / $rows 
                        > $anal_threshold_percentile) {
                    my $val = shift @dist_ranked;
                    &output_val($rows ,@$val);
                    $phrase = "Next lowest";
                    $accum_freq += $val->[1];
                }

# and print the point at which it tails off
                    &output_val($rows, "$phrase freq", $dist_ranked[0]
+[1]);

# Now find if there is a tail worth printing out

                while ((($rows - $accum_freq) * 100 / $rows >
                    $anal_threshold_percentile) ||
                    ($dist_ranked[0][1] > $anal_dribble_rows) ||
                    (@dist_ranked > $anal_distinct_values_limit)) {

                    my $val = shift @dist_ranked;
                    $accum_freq += $val->[1];
                    next COL if !@dist_ranked;
                    redo if $dist_ranked[0][1] == $val->[1];
                }
                print OUT " "x20,"Tail values\n" if @dist_ranked;
                foreach (@dist_ranked) {
                    &output_val($rows ,@$_);
                }
            }
        }
    }
    close OUT;
}
print "\n";
$dbh->disconnect;

# output_val - do the formatting here ( called from several places)

sub output_val {
    my ($total,$value,$freq) = @_;

    printf OUT "           %-30s %10d(%3.1f\%)\n",$value,$freq,100*$fr
+eq/$total;
}

Edited 2002-05-20 by Ovid

Replies are listed 'Best First'.
Re: Generic database table analysis
by gav^ (Curate) on May 21, 2002 at 01:38 UTC
    Perhaps shortening analysis to 'anal' isn't such a good idea when you end up with $anal_dribble_rows?

    :)

    gav^