#!/usr/bin/perl -w

=head1 Simple Data Sampler

This program extracts a set of random lines from the file(s) specified.

=head1 Usage:

    sample_lines.pl [<Option>*] <InFile> <InFile>*

=head1 Options:

=over

=item -per-thousand <p>

Control number of lines in the sample--try to keep <p> lines for every
thousand seen.  <p> may be fractional (default=6.5).

I<NOTE:> We don't try to enforce the number of lines per thousand to this
value, we just use it to choose when to print a line (with possible
contiguous lines).

=item -contiguous <c>

Keep <c> lines after each line selected to print (so we always get at
least <c> contiguous lines), default=0.

I<NOTE:> See -contig-max

=item -contig-max <cm>

Randomizes number of contiguous lines to print after selected lines (see
-contiguous).  Print between <c> and <cm> lines (inclusive) after each
selected line.

=item -minimum-skip <ms>

Minimum number of lines to skip between selected lines.

=back

The options are implemented very simply, as this isn't supposed to be the
"ultimate data sampler", just a simple way to get a random set of lines
from a text file.

=cut

use strict;
use warnings;

use Getopt::Long;

#####
# Handle command-line options
#####
my $contig_min;
my $contig_max;
my $minimum_skip;
my $per_thousand = 6.5;

# -per-thousand is declared as a float (=f) so that fractional rates, like
# the 6.5 default, can also be given on the command line.
GetOptions(
    "contiguous=i"   => \$contig_min,
    "contig-max=i"   => \$contig_max,
    "minimum-skip=i" => \$minimum_skip,
    "per-thousand=f" => \$per_thousand,
) or die "Usage: $0 [-per-thousand <p>] [-contiguous <c>] "
       . "[-contig-max <cm>] [-minimum-skip <ms>] <InFile> <InFile>*\n";

# Giving either contiguous option enables contiguous printing; fill in the
# missing bound and never let the maximum fall below the minimum.
if (defined $contig_max) {
    $contig_min = 0 unless defined $contig_min;
    $contig_max = $contig_min if $contig_max < $contig_min;
}
if (defined $contig_min) {
    $contig_max = $contig_min if !defined $contig_max;
}

# Convert "lines per thousand" into a per-line selection probability.
$per_thousand = $per_thousand / 1000.0;

# Pick how many lines to print after a selected line: a whole number drawn
# from $min .. $max, both ends included.  Expects $max >= $min.
sub _contiguous_count {
    my ($min, $max) = @_;

    # rand(N) yields a value in [0, N), so N must be the number of
    # candidate counts for $max itself to be reachable.
    return $min + int rand($max - $min + 1);
}

#####
# Sample the data
#####
for my $in_file (@ARGV) {
    open my $in_fh, '<', $in_file or die "Can't open '$in_file': $!\n";

    LINE:
    while (my $line = <$in_fh>) {

        # Keep this line only when the random draw falls within the rate.
        next LINE if $per_thousand < rand();
        print $line;

        if (defined $contig_min) {
            for (1 .. _contiguous_count($contig_min, $contig_max)) {
                my $extra = <$in_fh>;

                # The file may end in the middle of a contiguous run.
                last LINE unless defined $extra;
                print $extra;
            }
        }

        if (defined $minimum_skip) {
            for (1 .. $minimum_skip) {
                my $skipped = <$in_fh>;
                last LINE unless defined $skipped;
            }
        }
    }

    close $in_fh or die "...closing '$in_file': $!\n";
}
In reply to Data Sampler (Extract sample from large text file) by roboticus
| For: | Use: | ||
| & | &amp; | ||
| < | &lt; | ||
| > | &gt; | ||
| [ | &#91; | ||
| ] | &#93; |