Here is the algorithm.
is a central fragment. The score zero means the data field value is no longer needed; a score equal one means the value is needed in the results; and a score greater than one means the value is needed but which record is undetermined.$freq->{$fld_no}->{ $record[$fld_num] } = 0;
In the above, I write of an ideal solution size. Determining the best possible and worst possible size of our answer allows breaking off processing when it's pointless.
Possibly change the permutation machine so that each "permutation" array is treated as a wheel, reducing the number of arrays needing to be created. I think the walking is the time sink; not creating the walkways.
Here's unguaranteed code.
#!/usr/bin/perl use warnings; use strict; use FileHandle; use Clone 'clone'; use Data::Compare; $|++; my @files = qw( File1 File2 File3 ); my $sig_flds = [ 0 .. 2 ]; # config, the fields that we care about @$sig_flds = sort @$sig_flds; my $answer = []; my $freq = {}; my $input; # AoA ref # put data into an array for my $file (@files) { my $fh = FileHandle->new( $file, 'r' ) or die "Cannot open $file"; scalar <$fh>; # eat header line my $data = [<$fh>]; chomp @$data; $_ = [ parse_csv( $_, ',' ) ] for @$data; close $fh or die "Cannot close $file"; push @$input, @$data; } my $sanity_check = make_ck_list( $sig_flds, $input); sub make_ck_list { my ( $sig_flds, $input) = @_; my %answer_check; for my $rec ( @$input ) { for my $fld ( @$sig_flds ) { $answer_check{ $fld }{ $rec->[$fld] } = 1; } } return \%answer_check; } print "Orig unscored input size(", scalar(@$input), "):$/"; display( $sig_flds, $input, $freq ); sample( $sig_flds, $input, $answer, $freq ); print "Solution size(", scalar(@$answer), "):$/"; display( $sig_flds, $answer, $freq ); my $answer_list = make_ck_list( $sig_flds, $answer); print "Bad answer$/" unless Compare( $answer_list, $sanity_check ); exit; # create a smallest possible sample with all values represented. sub sample { my ( $sig_flds, $input, $answer, $freq ) = @_; local $_; my $part_answer = []; $input = _reduce_input( $sig_flds, $input, $part_answer, $freq ); # are we lucky/done ? return do { @$answer = @$part_answer; return } if !defined $input; # reassemble data w/o duplicates @$input = ( @$input, @$part_answer ); # partition into separate problems my $partition = partition( $sig_flds, $input ); # solve each problem, dumping all the solutions together # process each partition my $ct; for my $part (@$partition) { # print "Starting partition", ++$ct, $/; $freq = {}; $part_answer = []; # this does a little extra work this time $input = _reduce_input( $sig_flds, $part, $part_answer, $freq +); solve_part( $sig_flds, $input, $part_answer, $freq ) if defined $input; push @$answer, @$part_answer; $input = []; } return; } # solve_part -- solve a partition of data by going # through all permutations until a good solution # is found sub solve_part { my ( $sig_flds, $input, $answer, $freq ) = @_; $answer = [] if not defined $answer; my $part_answer = $answer; # factor to predict size of solution my ( $best_possible_size, $worst_possible_size ) = size_answer( $sig_flds, $input, $answer, $freq ); # master needs list; my %master_needs = %$freq; # delete @$part_answer items from needs for my $rec (@$part_answer) { for my $fld (@$sig_flds) { if ( exists $master_needs{$fld}->{ $rec->[$fld] } ) { delete $master_needs{$fld}->{ $rec->[$fld] }; } } } my $iter_perms = permute(@$input); my $cur_best_size = $worst_possible_size; my $best_answer; my $total_perms = factorial( scalar @$input ); my $ct; while ( my @perm = $iter_perms->() ) { ++$ct; my $curr_ans; my $needs = clone( \%master_needs ); @$curr_ans = @$part_answer; while (@perm) { my $curr_rec = pop @perm; if ( useful( $sig_flds, $curr_rec, $needs ) ) { push @$curr_ans, $curr_rec; for my $fld (@$sig_flds) { delete $needs->{$fld}{ $curr_rec->[$fld] }; } } } if ( $ct % 1000 == 0 ) { # XXX #print # " Permutation: $ct/$total_perms " # . "target size: $best_possible_size best $cur_best_si +ze$/"; } if ( empty( $sig_flds, $needs ) ) { if ( @$curr_ans < $cur_best_size ) { $best_answer = $curr_ans; $cur_best_size = @$curr_ans; #print "$/Permutation $ct$/"; #print "We have a new contender: "; #print "Target size: $best_possible_size "; #print "Contender's size: $cur_best_size$/"; if ( $cur_best_size <= $best_possible_size ) { @$answer = @$best_answer; return; } #display( $sig_flds, $best_answer, $freq ); } } } @$answer = @$best_answer; return; } # size_answer -- look at the data and determine: # 1. the smallest possible size of an answer (or less if we must gues +s), # 2. the largest possible size of an answer (or more if we must guess +), # This is an optimization: if smallest is accurate, we can break off # when one of the best solutions is found; if largest is accurate we # can stop processing a permutation when its solution is too large. # I haven't considered how well this can be estimated. # , the smallest is the size of the largest set of # data values in a significant field; and the largest is the greater +of # the input size or ( sum of significant field sets plus scalar @$sig_ +flds # minus one. # sub size_answer { my ( $sig_flds, $input, $answer, $freq ) = @_; my ( $best_possible_size, $worst_possible_size ); # XXX stub return ( 1, 1000 ); } # _reduce_input -- Remove duplicate records and cull # items that are easy to determine are unneeded. # Put items that are definitely needed into @$part_answer. # Delete culls from @$input. And build occurrence %$frequency. sub _reduce_input { my ( $sig_flds, $input, $part_answer, $freq ) = @_; local $_; # create frequency info for my $fld (@$sig_flds) { ++$freq->{$fld}->{ $_->[$fld] } for @$input; } my $start; do { return if not defined @$input; $start = @$input; # progress flag # sort and remove dupes @$input = sort { cmp_on_flds( $sig_flds, $freq, $a, $b ) } @$i +nput; $input = unique_on_flds( $sig_flds, $input, $freq ); my $progress; do { $progress = 0; $_ = -1; while ( ref $input && $_ < @$input - 1 ) { $_++; if ( moveable( $_, $sig_flds, $input, $freq ) ) { move( $_, $sig_flds, $input, $part_answer, $freq ) +; ++$progress; next; } if ( removeable( $_, $sig_flds, $input, $freq ) ) { remove( $_, $sig_flds, $input, $part_answer, $freq + ); ++$progress; } } } while ($progress); } while ( ref $input && $start != @$input ); return $input; } sub factorial { my $n = int(shift); my $ret = $n; while ( $n > 1 ) { $ret *= ( $n - 1 ); $n -= 1; } return $ret; } # return true if rec will fill any possible need in answer sub useful { my ( $sig_flds, $curr_rec, $needs ) = @_; for my $fld (@$sig_flds) { if ( exists $needs->{$fld}{ $curr_rec->[$fld] } ) { return 1; } } return; } # check a needs/freq hash for emptiness sub empty { my ( $sig_flds, $needs ) = @_; local $_; die "No tying" if defined tied $needs; for (@$sig_flds) { return 1 if "0" ne $needs->{$_}; } return 0; } # move item from input to output sub move { my ( $rec, $sig_flds, $input, $answer, $freq ) = @_; local $, = ' '; # print "moving @{$input->[$rec]}$/"; for my $fld (@$sig_flds) { $freq->{$fld}->{ $input->[$rec]->[$fld] } = 0; } push @$answer, $input->[$rec]; splice @$input, $rec, 1; return; } # determine if item should go to output sub moveable { my ( $rec, $sig_flds, $input, $freq ) = @_; # is moveable if any fld is unique for my $fld (@$sig_flds) { return 1 if 1 == $freq->{$fld}->{ $input->[$rec]->[$fld] }; } return 0; } # remove unnecessary item from input sub remove { my ( $rec, $sig_flds, $input, $answer, $freq ) = @_; local $, = ' '; # print "removing @{$input->[$rec]}$/"; for my $fld (@$sig_flds) { if ( $freq->{$fld}->{ $input->[$rec]->[$fld] } != 0 ) { --$freq->{$fld}->{ $input->[$rec]->[$fld] }; } } splice @$input, $rec, 1; return; } # determine if item is unneeded in input sub removeable { my ( $rec, $sig_flds, $input, $freq ) = @_; my $flds_needed = 0; my $removeable = 0; my $score = 0; # removeable if only 1 field needed and that's not unique or if # no fields needed for my $fld (@$sig_flds) { ++$flds_needed if $freq->{$fld}->{ $input->[$rec]->[$fld] }; $score += $freq->{$fld}->{ $input->[$rec]->[$fld] }; } $removeable = 1 if 1 == $flds_needed || 0 == $flds_needed; return $removeable; } # cmp using a \@list_of_field_indices cmp two \@arrays, # if both records' fields have a frequency of 0 their values don't mat +ter sub cmp_on_flds { my ( $sig_flds, $freq, $aa, $bb ) = @_; my $ret; for my $fld (@$sig_flds) { no warnings 'uninitialized'; if ( 0 == $freq->{$fld}->{ $aa->[$fld] } && 0 == $freq->{$fld}->{ $bb->[$fld] } ) { next; } $ret = $aa->[$fld] cmp $bb->[$fld]; return $ret if $ret; } return 0; } # delete items from a sorted \@array where \@fields_indexed are same # and adjust frequency info sub unique_on_flds { my ( $sig_flds, $data, $freq ) = @_; my ( $prev, $ret ); while (@$data) { my $not_dupe = @$sig_flds; my $curr = shift @$data; $not_dupe = cmp_on_flds( $sig_flds, $freq, $curr, $prev ); $prev = $curr; if ($not_dupe) { push @$ret, $curr; } else { # adjust frequencies for my $fld (@$sig_flds) { --$freq->{$fld}->{ $curr->[$fld] }; } } } return $ret; } # display scored records sub display { my ( $sig_flds, $aref, $freq ) = @_; local $_; local $" = ' '; for my $el (@$aref) { no warnings 'uninitialized'; for (@$sig_flds) { printf "%4i %-6s ", $freq->{$_}->{ $el->[$_] }, $el->[$_]; } print " @$el"; print $/; } print $/; return; } # this is from perlmonks.org!Kraythorne to parse input files sub parse_csv { my $text = shift; my $delim = shift; # not used my $type = undef; if ( $delim && $delim =~ /comma|tab|pipe|fixed/i ) { ( # $type, undef, $delim ) = $delim =~ m/\A (.+) \s - \s (.+) \z/xms; } my @new = (); if ($delim) { while ( $text =~ m{ # the first part groups the phrase inside the quotes. # see explanation of this pattern in MRE "([^\"\\]*(?:\\.[^\"\\]*)*)"$delim? | ([^$delim]+)$delim? | $delim }gx ) { if ( defined $+ ) { push( @new, $+ ); } else { push( @new, '' ); } } push( @new, '' ) if substr( $text, -1, 1 ) eq ($delim); } else { $new[0] = $text; } return @new; # list of values that were comma-separated } # permute -- generate an iter yielding all the permutations # of a list, basically lifted from "Higher Order Perl", # M.J. Dominus, pp 134-135 sub permute { my @data = @_; my $perm = 0; return sub { do { $perm++; return @data } if !$perm; my $i; my $p = $perm; for ( $i = 1; $i <= @data && $p % $i == 0; $i++ ) { $p /= $i; } my $d = $p % $i; my $j = @data - $i; return if $j < 0; @data[ $j + 1 .. $#data ] = reverse @data[ $j + 1 .. $#data ]; @data[ $j, $j + $d ] = @data[ $j + $d, $j ]; $perm++; return @data; }; } # partition -- split an array so that each partition shares # no values per significant field w/ another partition # example: # part 1: ( [a,a,a],[b,a,a],[b,c,d],[e,f,d] ) # part 2: ( [f,g,g],[x,g,x],[y,z,x],[z,z,z] ) # sub partition { my ( $sig_flds, $input ) = @_; my $ret; while (@$input) { my $part = []; my $part_has = {}; my $curr = pop @$input; _add_rec2part( $sig_flds, $curr, $part, $part_has ); my $remainder = $input; my $next_remainder = []; while (@$remainder) { $curr = pop @$remainder; if ( _part_needs_rec( $sig_flds, $curr, $part, $part_has ) + ) { _add_rec2part( $sig_flds, $curr, $part, $part_has ); } else { push @$next_remainder, $curr; } } push @$ret, $part; @$remainder = @$next_remainder; } return $ret; } # boolean, does the partition need this record sub _part_needs_rec { my ( $sig_flds, $rec, $part, $freq ) = @_; for my $f (@$sig_flds) { if ( $freq->{$f}->{ $rec->[$f] } ) { return 1; } } return; } # add record to partition and adjust partition's needs hash sub _add_rec2part { my ( $sig_flds, $rec, $part, $freq ) = @_; push @$part, $rec; for my $fld (@$sig_flds) { $freq->{$fld}->{ $rec->[$fld] } = 1; } return; }
In reply to Re: Most efficient record selection method?
by rir
in thread Most efficient record selection method?
by Kraythorne
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |