open(IN1, '<', $input1) or die "Can't read source file $input1 : $!\n"; my $minlength = 1<<20; my %distrbtn; while() { next if />/; chomp; my $len = length(); $distrbtn{$len}++; $minlength = $len if ( $minlength > $len ); } close IN1; $minlength -= 3; open(IN2, '<', $input2) or die "Can't read source file $input2 : $!\n"; my $header; my @source; my %source_lengths; while() { chomp; if ( />/ ) { $header = $_; } elsif ( length() >= $minlength ) { push @source, $header; push @source, $_; push @{$source_lengths{length()}}, $#source; } } close IN2; #### my $max_source_length = ( sort {$b<=>$a} keys %source_lengths )[0]; for my $key ( sort {$a<=>$b} keys %distrbtn ) { my $size = $key - 3; my $freq = $distrbtn{$key}; # find the first set of source strings of equal or greater size: my $source_key = $key; while ( $source_key <= $max_source_length and not exists( $source_lengths{$source_key} )) { $source_key++; } if ( $source_key > $max_source_length ) { die "We can't do this: strings from $input2 aren't long enough"; } ... } #### my @usable_sources = @{$source_lengths{$source_key}}; printf "for an input1 string of length %d, we can choose from %d input2 strings\n", $size, scalar @usable_sources; # in case we want to add more sources that happen to be longer: $source_key++; while ( $source_key <= $max_source_length and scalar @usable_sources < $freq ) { push @usable_sources, @{$source_lengths{$source_key}} if exists($source_lengths{$source_key}; } if ( $freq > @usable_sources ) { warn "We ran short of desired frequency for length $size\n"; elsif ( $freq < @usable_sources ) { # do something to randomly remove items from @usable_sources... } for my $offset ( @usable_sources ) { my $header = $source[$offset-1]; my $string = $source[$offset]; # do whatever... }