package Data::Column ; use Moose; use Scalar::Util qw(looks_like_number); use Log::Log4perl::Shortcuts qw (:all); use namespace::autoclean; has 'first_cell' => (is => 'ro', isa => 'Defined', lazy => 1, default => 0, writer => '_set_first_cell' ); has 'blank_count' => (is => 'ro', isa => 'Int', lazy => 1, default => 0, writer => '_set_blank_count' ); has 'mixed_case_count' => (is => 'ro', isa => 'Int', lazy => 1, default => 0, writer => '_set_mixed_case_count' ); has 'upper_case_count' => (is => 'ro', isa => 'Int', lazy => 1, default => 0, writer => '_set_upper_case_count' ); has 'lower_case_count' => (is => 'ro', isa => 'Int', lazy => 1, default => 0, writer => '_set_lower_case_count' ); has 'preceding_blank_count' => (is => 'ro', isa => 'Int', lazy => 1, default => 0, writer => '_set_preceding_blank_count' ); has 'following_blank_count' => (is => 'ro', isa => 'Int', lazy => 1, default => 0, writer => '_set_following_blank_count' ); has 'unique_count' => (is => 'ro', isa => 'Int', lazy => 1, default => 0, writer => '_set_unique_count' ); has 'strings_count' => (is => 'ro', isa => 'Int', lazy => 1, default => 0, writer => '_set_strings_count' ); has 'min_length' => (is => 'ro', isa => 'Int', lazy => 1, default => 0, writer => '_set_min_length' ); has 'max_length' => (is => 'ro', isa => 'Int', lazy => 1, default => 0, writer => '_set_max_length' ); has 'numbers_count' => (is => 'ro', isa => 'Int', lazy => 1, default => 0, writer => '_set_numbers_count' ); has 'percent_blank' => (is => 'ro', isa => 'Num', lazy => 1, default => 0, writer => '_set_percent_blank' ); has 'percent_strings' => (is => 'ro', isa => 'Num', lazy => 1, default => 0, writer => '_set_percent_strings' ); has 'percent_numbers' => (is => 'ro', isa => 'Num', lazy => 1, default => 0, writer => '_set_percent_numbers' ); has 'non_blank_count' => (is => 'ro', isa => 'Int', lazy => 1, default => 0, writer => '_set_non_blank_count' ); has 'spaces_count' => (is => 'ro', isa => 'Int', lazy => 1, default => 0, writer => '_set_spaces_count' ); has 'is_all_strings' => (is => 'ro', isa => 'Bool', lazy => 1, default => 0, writer => '_set_is_all_strings' ); has 'has_sort_mismatch' => (is => 'ro', isa => 'Bool', lazy => 1, default => 0, writer => '_set_has_sort_mismatch' ); has 'is_all_numbers' => (is => 'ro', isa => 'Bool', lazy => 1, default => 0, writer => '_set_is_all_numbers' ); has 'is_pure' => (is => 'ro', isa => 'Bool', lazy => 1, default => 0, writer => '_set_is_pure' ); has 'is_sorted' => (is => 'ro', isa => 'Bool', lazy => 1, default => 0, writer => '_set_is_sorted' ); has 'is_mixed' => (is => 'ro', isa => 'Bool', lazy => 1, default => 0, writer => '_set_is_mixed' ); has 'first_five_rows' => (is => 'ro', isa => 'ArrayRef[Int]', lazy => 1, default => 0, writer => '_set_first_five_rows' ); has 'data' => (traits => ['Array'], is => 'ro', required => 1, isa => 'ArrayRef', default => sub { [] }, handles => { count => 'count', is_empty => 'is_empty', elements => 'elements', get_data => 'get' }, ); sub BUILD { my $s = shift; my ($blank_count, $string_count, $number_count, $strings_with_spaces) = (0) x 4; my $last_cell; my $count = 0; my %uniques = (); my $sort_order = ''; my $current_sort_order; my $is_sorted = 1; my $min_length = 0; my $max_length = 0; my $first_non_blank_data_cell = 0; my @first_five_rows = (); my $preceding_blank_row_count = 0; my $following_blank_row_count = 0; my $lower_case_count = 0; my $upper_case_count = 0; my $mixed_case_count = 0; foreach my $cell ( $s->elements ) { $count++; my $is_string = 0; if (!$cell) { $blank_count++; push (@first_five_rows, 0) if $count < 6; next; } if (!$first_non_blank_data_cell && $count > 5) { $first_non_blank_data_cell = $count; } push (@first_five_rows, 1) if $count < 6; if (length $cell < $min_length || !$min_length) { $min_length = length $cell; } if (length $cell > $max_length || !$max_length) { $max_length = length $cell; } $uniques{$cell} = 1; if ( !looks_like_number($cell) ) { $string_count++; $is_string = 1; my $has_lower = $cell =~ /[a-z]/; my $has_upper = $cell =~ /[A-Z]/; $mixed_case_count++ if ($has_lower && $has_upper); $lower_case_count++ if $has_lower && !$has_upper; $upper_case_count++ if $has_upper && !$has_lower; } elsif ($cell) { $number_count++; } if ($cell =~ / /) { $strings_with_spaces++; } if ($is_sorted && $count > 4 && $last_cell && ( $cell ne $last_cell ) ) { if ($is_string) { if ($cell gt $last_cell) { $current_sort_order = 'asc'; } else { $current_sort_order = 'desc'; } } else { if ($cell > $last_cell) { $current_sort_order = 'asc'; } else { $current_sort_order = 'desc'; } } $sort_order = $current_sort_order if !$sort_order; if ( $sort_order && $sort_order ne $current_sort_order) { $is_sorted = 0; } } elsif ($count > 4) { $last_cell = $cell; } } my $non_blank_count = $s->count - $blank_count; if ($non_blank_count == $string_count && $string_count) { $s->_set_is_all_strings(1); $s->_set_is_pure(1); } elsif ($non_blank_count == $number_count && $number_count) { $s->_set_is_all_numbers(1); $s->_set_is_pure(1); } else { $s->_set_is_mixed(1); } my $first_non_blank_row = 0; my $row_count = 0; foreach my $row (@first_five_rows) { if ($row) { $first_non_blank_row = $row_count; last; } $row_count++; } foreach my $row (@first_five_rows[$row_count + 1, 4]) { if ($row && $following_blank_row_count) { last; } if (!$row) { $following_blank_row_count++; } } my $sort_mismatch = 0; my $first_cell = $s->get_data($first_non_blank_row); my $first_cell_is_all_upper = 0; my $first_cell_is_all_lower = 0; my $first_cell_is_mixed = 0; if (!looks_like_number($first_cell)) { my $first_cell_has_lower = $first_cell =~ /[a-z]/; my $first_cell_has_upper = $first_cell =~ /[A-Z]/; my $first_cell_is_mixed if ($first_cell_has_lower && $first_cell_has_upper); if (!$first_cell_is_mixed) { if ($first_cell =~ /[a-z]/) { $first_cell_is_all_lower = 1; } elsif ($first_cell =~ /[A-Z]/) { $first_cell_is_all_upper = 1; } else { $first_cell_is_mixed = 1; } } } if ($is_sorted) { my $first_cell_is_greater; if (looks_like_number $first_cell) { $first_cell_is_greater = $first_cell > $first_non_blank_data_cell; } else { $first_cell_is_greater = $first_cell gt $first_non_blank_data_cell; } logd($first_cell_is_greater); if ($first_cell_is_greater && $sort_order eq 'asc') { logd('hi'); $sort_mismatch = 1; } if (!$first_cell_is_greater && $sort_order eq 'desc') { $sort_mismatch = 1; } } my $non_blank = $s->count - $blank_count; $s->_set_non_blank_count($non_blank); $s->_set_has_sort_mismatch($sort_mismatch); $s->_set_mixed_case_count($mixed_case_count); $s->_set_upper_case_count($upper_case_count); $s->_set_lower_case_count($lower_case_count); $s->_set_percent_blank($blank_count / $s->count * 100); $s->_set_percent_strings($string_count / $non_blank * 100); $s->_set_percent_numbers($number_count / $non_blank * 100); $s->_set_spaces_count($strings_with_spaces); $s->_set_blank_count($blank_count); $s->_set_max_length($max_length); $s->_set_min_length($min_length); $s->_set_strings_count($string_count); $s->_set_numbers_count($number_count); $s->_set_unique_count(scalar keys %uniques); $s->_set_preceding_blank_count($first_non_blank_row); $s->_set_following_blank_count($following_blank_row_count); $s->_set_is_sorted($is_sorted) if $s->count > 6; $s->_set_first_five_rows(\@first_five_rows); $s->_set_first_cell($first_cell); } sub has_blanks { my $s = shift; return $s->blank_count > 0; }