comment on

Thanks. I will try this out.

And here is some simple code I have so far. It just loops through the column and collects characteristics of the column into attributes that might be useful. For example, one thing it does is determine if the row is sorted. If it is, and the header is not in the proper sort order, it sets a "has_sort_mismatch" flag. One thing the code accounts for is that the header might be multiple rows anywhere in the first 5 rows of the spreadsheet. That makes things slightly trickier.

package Data::Column ;

use Moose;
use Scalar::Util qw(looks_like_number);
use Log::Log4perl::Shortcuts qw (:all);
use namespace::autoclean;

has 'first_cell'            => (is => 'ro', isa => 'Defined', lazy => 
+1, default => 0, writer => '_set_first_cell' );
has 'blank_count'           => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_blank_count' );
has 'mixed_case_count'           => (is => 'ro', isa => 'Int', lazy =>
+ 1, default => 0, writer => '_set_mixed_case_count' );
has 'upper_case_count'           => (is => 'ro', isa => 'Int', lazy =>
+ 1, default => 0, writer => '_set_upper_case_count' );
has 'lower_case_count'           => (is => 'ro', isa => 'Int', lazy =>
+ 1, default => 0, writer => '_set_lower_case_count' );
has 'preceding_blank_count' => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_preceding_blank_count' );
has 'following_blank_count' => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_following_blank_count' );
has 'unique_count'          => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_unique_count' );
has 'strings_count'         => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_strings_count' );
has 'min_length'            => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_min_length' );
has 'max_length'            => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_max_length' );
has 'numbers_count'         => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_numbers_count' );
has 'percent_blank'         => (is => 'ro', isa => 'Num', lazy => 1, d
+efault => 0, writer => '_set_percent_blank' );
has 'percent_strings'       => (is => 'ro', isa => 'Num', lazy => 1, d
+efault => 0, writer => '_set_percent_strings' );
has 'percent_numbers'       => (is => 'ro', isa => 'Num', lazy => 1, d
+efault => 0, writer => '_set_percent_numbers' );
has 'non_blank_count'       => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_non_blank_count' );
has 'spaces_count'          => (is => 'ro', isa => 'Int', lazy => 1, d
+efault => 0, writer => '_set_spaces_count' );
has 'is_all_strings'        => (is => 'ro', isa => 'Bool', lazy => 1, 
+default => 0, writer => '_set_is_all_strings' );
has 'has_sort_mismatch'     => (is => 'ro', isa => 'Bool', lazy => 1, 
+default => 0, writer => '_set_has_sort_mismatch' );
has 'is_all_numbers'        => (is => 'ro', isa => 'Bool', lazy => 1, 
+default => 0, writer => '_set_is_all_numbers' );
has 'is_pure'               => (is => 'ro', isa => 'Bool', lazy => 1, 
+default => 0, writer => '_set_is_pure' );
has 'is_sorted'             => (is => 'ro', isa => 'Bool', lazy => 1, 
+default => 0, writer => '_set_is_sorted' );
has 'is_mixed'              => (is => 'ro', isa => 'Bool', lazy => 1, 
+default => 0, writer => '_set_is_mixed' );
has 'first_five_rows'       => (is => 'ro', isa => 'ArrayRef[Int]', la
+zy => 1, default => 0, writer => '_set_first_five_rows' );
has 'data'                  => (traits => ['Array'], is => 'ro', requi
+red => 1,
                                isa => 'ArrayRef', default => sub { []
+ },
                                handles => { count => 'count', is_empt
+y => 'is_empty', elements => 'elements', get_data => 'get' },
                              );

sub BUILD {
  my $s = shift;

  my ($blank_count, $string_count, $number_count, $strings_with_spaces
+) = (0) x 4;
  my $last_cell;
  my $count = 0;
  my %uniques = ();
  my $sort_order = '';
  my $current_sort_order;
  my $is_sorted = 1;
  my $min_length = 0;
  my $max_length = 0;
  my $first_non_blank_data_cell = 0;
  my @first_five_rows = ();
  my $preceding_blank_row_count = 0;
  my $following_blank_row_count = 0;
  my $lower_case_count = 0;
  my $upper_case_count = 0;
  my $mixed_case_count = 0;

  foreach my $cell ( $s->elements ) {
    $count++;
    my $is_string = 0;
    if (!$cell) {
      $blank_count++;
      push (@first_five_rows, 0) if $count < 6;
      next;
    }

    if (!$first_non_blank_data_cell && $count > 5) {
      $first_non_blank_data_cell = $count;
    }

    push (@first_five_rows, 1) if $count < 6;

    if (length $cell < $min_length || !$min_length) {
      $min_length = length $cell;
    }

    if (length $cell > $max_length || !$max_length) {
      $max_length = length $cell;
    }

    $uniques{$cell} = 1;

    if ( !looks_like_number($cell) ) {
      $string_count++;
      $is_string = 1;
      my $has_lower = $cell =~ /[a-z]/;
      my $has_upper = $cell =~ /[A-Z]/;
      $mixed_case_count++ if ($has_lower && $has_upper);
      $lower_case_count++ if $has_lower && !$has_upper;
      $upper_case_count++ if $has_upper && !$has_lower;
    } elsif ($cell) {
      $number_count++;
    }
    if ($cell =~ / /) {
      $strings_with_spaces++;
    }

    if ($is_sorted && $count > 4 && $last_cell && ( $cell ne $last_cel
+l ) ) {
      if ($is_string) {
        if ($cell gt $last_cell) {
          $current_sort_order = 'asc';
        } else {
          $current_sort_order = 'desc';
        }
      } else {
        if ($cell > $last_cell) {
          $current_sort_order = 'asc';
        } else {
          $current_sort_order = 'desc';
        }
      }

      $sort_order = $current_sort_order if !$sort_order;

      if ( $sort_order && $sort_order ne $current_sort_order) {
        $is_sorted = 0;
      }
    } elsif ($count > 4) {
      $last_cell = $cell;
    }
  }

  my $non_blank_count = $s->count - $blank_count;

  if ($non_blank_count == $string_count && $string_count) {
    $s->_set_is_all_strings(1);
    $s->_set_is_pure(1);
  } elsif ($non_blank_count == $number_count && $number_count) {
    $s->_set_is_all_numbers(1);
    $s->_set_is_pure(1);
  } else {
    $s->_set_is_mixed(1);
  }

  my $first_non_blank_row = 0;
  my $row_count = 0;
  foreach my $row (@first_five_rows) {
    if ($row) {
      $first_non_blank_row = $row_count;
      last;
    }
    $row_count++;
  }
  foreach my $row (@first_five_rows[$row_count + 1, 4]) {
    if ($row && $following_blank_row_count) {
      last;
    }
    if (!$row) {
      $following_blank_row_count++;
    }
  }

  my $sort_mismatch = 0;
  my $first_cell = $s->get_data($first_non_blank_row);
  my $first_cell_is_all_upper = 0;
  my $first_cell_is_all_lower = 0;
  my $first_cell_is_mixed = 0;
  if (!looks_like_number($first_cell)) {
    my $first_cell_has_lower = $first_cell =~ /[a-z]/;
    my $first_cell_has_upper = $first_cell =~ /[A-Z]/;
    my $first_cell_is_mixed if ($first_cell_has_lower && $first_cell_h
+as_upper);
    if (!$first_cell_is_mixed) {
      if ($first_cell =~ /[a-z]/) {
        $first_cell_is_all_lower = 1;
      } elsif ($first_cell =~ /[A-Z]/) {
        $first_cell_is_all_upper = 1;
      } else {
        $first_cell_is_mixed = 1;
      }
    }
  }

  if ($is_sorted) {
    my $first_cell_is_greater;
    if (looks_like_number $first_cell) {
      $first_cell_is_greater = $first_cell > $first_non_blank_data_cel
+l;
    } else {
      $first_cell_is_greater = $first_cell gt $first_non_blank_data_ce
+ll;
    }
    logd($first_cell_is_greater);
    if ($first_cell_is_greater && $sort_order eq 'asc') {
      logd('hi');
      $sort_mismatch = 1;
    }
    if (!$first_cell_is_greater && $sort_order eq 'desc') {
      $sort_mismatch = 1;
    }
  }

  my $non_blank = $s->count - $blank_count;
  $s->_set_non_blank_count($non_blank);
  $s->_set_has_sort_mismatch($sort_mismatch);
  $s->_set_mixed_case_count($mixed_case_count);
  $s->_set_upper_case_count($upper_case_count);
  $s->_set_lower_case_count($lower_case_count);
  $s->_set_percent_blank($blank_count / $s->count * 100);
  $s->_set_percent_strings($string_count / $non_blank * 100);
  $s->_set_percent_numbers($number_count / $non_blank * 100);
  $s->_set_spaces_count($strings_with_spaces);
  $s->_set_blank_count($blank_count);
  $s->_set_max_length($max_length);
  $s->_set_min_length($min_length);
  $s->_set_strings_count($string_count);
  $s->_set_numbers_count($number_count);
  $s->_set_unique_count(scalar keys %uniques);
  $s->_set_preceding_blank_count($first_non_blank_row);
  $s->_set_following_blank_count($following_blank_row_count);
  $s->_set_is_sorted($is_sorted) if $s->count > 6;
  $s->_set_first_five_rows(\@first_five_rows);
  $s->_set_first_cell($first_cell);
}

sub has_blanks {
  my $s = shift;

  return $s->blank_count > 0;
}
[download]

$PM = "Perl Monk's";
$MCF = "Most Clueless ~~Friar~~ ~~Abbot~~ ~~Bishop~~ ~~Pontiff~~ ~~Deacon~~ ~~Curate~~ ~~Priest~~ Vicar";
$nysus = $PM . ' ' . $MCF;
Click here if you love Perl Monks

In reply to Re^2: Useful heuristics for analyzing arrays of data to determine column header by nysus
in thread Useful heuristics for analyzing arrays of data to determine column header by nysus

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.