use strict;
use warnings;
use Text::CSV;
use DBI;
# Input file, and output-file template where %d is the interval number.
my $input_filename  = 'td.data';
my $output_filename = 'split_%d.data';

# Map a locus position to its interval index: loci are grouped one
# million per output file (interval 0 covers loci 0..999_999, etc.).
sub calculate_interval {
    my ($locus) = @_;
    return int($locus / 1_000_000);
}
# Treat the tab-separated input file as a SQL table named "genetics"
# via DBD::CSV, so we can let the database layer sort by locus.
my $dbh = DBI->connect ("dbi:CSV:", undef, undef, {
    csv_eol      => "\n",
    csv_sep_char => "\t",        # input is tab-separated, not comma-separated
    csv_class    => "Text::CSV_XS",
    csv_null     => 1,           # treat empty fields as NULL
    csv_tables   => { genetics => {
        f_file    => $input_filename,
        # Column 5 ("locus") is the one we split on; the rest are opaque.
        col_names => [qw(a b c d locus f g h i j k l m n o)],
    }},
    RaiseError => 1,
    PrintError => 1,
}) or die $DBI::errstr;
# Magic: have the CSV engine return the rows ordered by locus.
my $sth = $dbh->prepare("select * from genetics order by locus");
$sth->execute;

# Grunt work: stream the sorted rows into one output file per interval.
$, = "\t";                     # print a list with tab separators
my $output;
my $output_interval = -1;      # sentinel: no output file open yet
while (my @row = $sth->fetchrow_array) {
    my $interval = calculate_interval $row[4];
    # Numeric comparison: intervals are integers, not strings.
    if ($interval != $output_interval) {
        # Interval changed: finish the current file (checking close
        # surfaces any buffered write errors) and start the next one.
        if (defined $output) {
            close $output or die "close: $!";
        }
        $output_interval = $interval;
        my $filename = sprintf $output_filename, $interval;
        open $output, '>', $filename
            or die "$filename: $!";   # report the real name, not the template
    }
    print {$output} @row, "\n";
}
# Explicitly close the last file so write errors are not silently lost.
close $output or die "close: $!" if defined $output;
####
0 50 4 46 723430 0 2 1 2 1 1 1 1 3 1
0 50 4 46 5533723430 0 2 1 2 1 1 1 1 3 1
0 50 4 46 33723430 0 2 1 2 1 1 1 1 3 1
0 50 2 48 654732 0 1 1 1 0 2 3 2 1 3
####
split_0.data:0 50 2 48 654732 0 1 1 1 0 2 3 2 1 3
split_0.data:0 50 4 46 723430 0 2 1 2 1 1 1 1 3 1
split_33.data:0 50 4 46 33723430 0 2 1 2 1 1 1 1 3 1
split_5533.data:0 50 4 46 5533723430 0 2 1 2 1 1 1 1 3 1