in reply to Fast/Efficient Sort for Large Files
sub fill_wk_files { # this subroutine fills the first two work files for a tape sort fr +om # the specified input file my $in_fi_param = shift; my $curr_wk_fi = shift; my $non_curr_wk_fi = shift; my $spare_wk_fi; my $wk_rec; while (1) { # will eventually return straight out of the inmost loo +p my @fill_array = (); for (my $fill_ct = 0; $fill_ct < 25000; $fill_ct++) { # 25000 is a tuning magic number if (not defined($wk_rec = <$in_fi_param>)) { # sort the final batch of records, output them and return foreach(sort(@fill_array)) { print $curr_wk_fi "$_\n"; } return; } rotor_print(); # progress monitor chomp $wk_rec; push(@fill_array, $wk_rec); # top up the buffer array } # sort the latest batch of records, output them and switch the w +ork files foreach(sort(@fill_array)) { print $curr_wk_fi "$_\n"; } $spare_wk_fi = $curr_wk_fi; $curr_wk_fi = $non_curr_wk_fi; $non_curr_wk_fi = $spare_wk_fi; } return; # dummy long stop } sub merge_sort_seqs { # this subroutine alternates pairs of work files, merging records f +rom one lot into # the other, returning the number of sorted sequences it produced a +nd the number of # records output as a list to drive later logic my $in_fi_1_param = shift; my $in_fi_2_param = shift; my $curr_out_fi = shift; my $non_curr_out_fi = shift; my $spare_out_fi; my $in_1_rec; my $in_2_rec; my $curr_rec; # this flag will be true if the current merged record is to come fr +om the first file my $curr_from_1; my $sort_seq_cter = 0; my $out_ct = 0; # needs to be maintained within merge_sort_seqs # inputting the first records to initialise the process my $eof_1_flag = (defined($in_1_rec = <$in_fi_1_param>)) ? 0 : 1; # flags need to + be explicit my $eof_2_flag = (defined($in_2_rec = <$in_fi_2_param>)) ? 0 : 1; # flags need to + be explicit my $seq_1_break; my $seq_2_break; if ($eof_1_flag and $eof_2_flag) { # special case - no records at a +ll return (0, 0); } while (not ($eof_1_flag and $eof_2_flag)) { # some records left $sort_seq_cter++; # reset the break flags to false in most cases, but allow for eo +f $seq_1_break = ($eof_1_flag) ? 1 : 0; # flags need to be explici +t $seq_2_break = ($eof_2_flag) ? 1 : 0; # flags need to be explici +t while (not ($seq_1_break and $seq_2_break)) { # merge and output the next pair of sorted sequences (one mig +ht be empty) rotor_print(); # progress monitor # work out record to output if ($seq_1_break) { # special case - only second file chomp $in_2_rec; $curr_from_1 = 0; $curr_rec = $in_2_rec; } elsif ($seq_2_break) { # special case - only first file chomp $in_1_rec; $curr_from_1 = 1; $curr_rec = $in_1_rec; } else { chomp $in_1_rec; chomp $in_2_rec; $curr_from_1 = ($in_1_rec lt $in_2_rec) ? 1 : 0; # flags n +eed to be explicit $curr_rec = ($curr_from_1) ? $in_1_rec : $in_2_rec; } print $curr_out_fi "$curr_rec\n"; $out_ct++; # needs to be maintained within merge_sort_seqs if ($curr_from_1) { # input the next record from the first work file, setting +eof and break flags $eof_1_flag = (defined($in_1_rec = <$in_fi_1_param>)) ? 0 : 1; # flags n +eed to be explicit if ($eof_1_flag) { $seq_1_break = 1; } else { chomp $in_1_rec; $seq_1_break = ($in_1_rec lt $curr_rec) ? 1 : 0; # flags need to be ex +plicit } } else { # input the next record from the second work file, setting + eof and break flags $eof_2_flag = (defined($in_2_rec = <$in_fi_2_param>)) ? 0 : 1; # flags n +eed to be explicit if ($eof_2_flag) { $seq_2_break = 1; } else { chomp $in_2_rec; $seq_2_break = ($in_2_rec lt $curr_rec) ? 1 : 0; # flags need to be ex +plicit } } } # switch the output work files $spare_out_fi = $curr_out_fi; $curr_out_fi = $non_curr_out_fi; $non_curr_out_fi = $spare_out_fi; } return ($sort_seq_cter, $out_ct); } sub tape_sort { # this subroutine carries out a tape sort on the input and output f +iles specified, # treating the entire record as a key # i.e. it applies the tape sort algorithm using sequentially access +ed temporary files # it returns the number of records output my ($in_fi_prm, $out_fi_prm) = @_; my $in_fi; my $out_fi; # ids to be used in merge passes my $wk_fi_id_1 = $sort_wk_dir . "Sortwk_1.tmp"; my $wk_fi_id_2 = $sort_wk_dir . "Sortwk_2.tmp"; my $wk_fi_id_3 = $sort_wk_dir . "Sortwk_3.tmp"; my $wk_fi_id_4 = $sort_wk_dir . "Sortwk_4.tmp"; # handles to be used in merge passes my $wk_fi_1; my $wk_fi_2; my $wk_fi_3; my $wk_fi_4; open ($in_fi, "<", $in_fi_prm) or die "Can't open $in_fi_prm for in +put: $!"; open ($wk_fi_1, ">", $wk_fi_id_1) or die "Can't open $wk_fi_id_1 fo +r output: $!"; open ($wk_fi_2, ">", $wk_fi_id_2) or die "Can't open $wk_fi_id_2 fo +r output: $!"; reset_rotor("preparing work files from $in_fi_prm for tape sort"); fill_wk_files($in_fi, $wk_fi_1, $wk_fi_2); close ($wk_fi_1) or die "Error closing file: $!"; close ($wk_fi_2) or die "Error closing file: $!"; close ($in_fi) or die "Error closing file: $!"; # more ids to be used in merge passes my $in_wk_a = $wk_fi_id_1; my $in_wk_b = $wk_fi_id_2; my $out_wk_a = $wk_fi_id_3; my $out_wk_b = $wk_fi_id_4; my $spare_wk_a; my $spare_wk_b; # this will count how many sorted stretches were produced on a pass my $sort_seq_ct = 999999; # dummy sentinel value to start loop off my $sort_ct; # to count records sorted my $pass_ct; # to count passes made while ($sort_seq_ct > 2) { # this will alternate pairs of work files, merging records from +one lot # into the other, until there are only two (or fewer) input work + files left open ($wk_fi_1, "<", $in_wk_a) or die "Can't open $in_wk_a for i +nput: $!"; open ($wk_fi_2, "<", $in_wk_b) or die "Can't open $in_wk_b for i +nput: $!"; open ($wk_fi_3, ">", $out_wk_a) or die "Can't open $out_wk_a for + output: $!"; open ($wk_fi_4, ">", $out_wk_b) or die "Can't open $out_wk_b for + output: $!"; $pass_ct++; reset_rotor("merging work records for tape sort - pass $pass_ct" +); ($sort_seq_ct, $sort_ct) = merge_sort_seqs($wk_fi_1, $wk_fi_2, $ +wk_fi_3, $wk_fi_4); print "\nNumber of sorted sequences generated: $sort_seq_ct\n"; close ($wk_fi_1) or die "Error closing file: $!"; close ($wk_fi_2) or die "Error closing file: $!"; close ($wk_fi_3) or die "Error closing file: $!"; close ($wk_fi_4) or die "Error closing file: $!"; # switch input and output file ids $spare_wk_a = $in_wk_a; $in_wk_a = $out_wk_a; $out_wk_a = $spare_wk_a; $spare_wk_b = $in_wk_b; $in_wk_b = $out_wk_b; $out_wk_b = $spare_wk_b; } # do a final merge on the last sorted sequences, outputting to the +desired output file # freeing up space as soon as convenient warn "Could not delete both of $out_wk_a and $out_wk_b\n" if (unlink ($out_wk_a, $out_wk_b) != 2); open ($wk_fi_1, "<", $in_wk_a) or die "Can't open $in_wk_a for inpu +t: $!"; open ($wk_fi_2, "<", $in_wk_b) or die "Can't open $in_wk_b for inpu +t: $!"; open ($out_fi, ">", $out_fi_prm) or die "Can't open $out_fi_prm for + output: $!"; reset_rotor("merging work records to $out_fi_prm for tape sort - fi +nal pass"); # the final parameter is a dummy ($sort_seq_ct, $sort_ct) = merge_sort_seqs($wk_fi_1, $wk_fi_2, $out +_fi, $out_fi); if ($sort_seq_ct > 1) { die "Could not complete sort from $in_fi_prm to $out_fi_prm"; } close ($wk_fi_1) or die "Error closing file: $!"; close ($wk_fi_2) or die "Error closing file: $!"; # freeing up space as soon as convenient warn "Could not delete both of $in_wk_a and $in_wk_b\n" if (unlink ($in_wk_a, $in_wk_b) != 2); close ($out_fi) or die "Error closing file: $!"; return $sort_ct; }
Edit by tye to add READMORE
|
|---|