Beefy Boxes and Bandwidth Generously Provided by pair Networks
Think about Loose Coupling
 
PerlMonks  

File Chunkifier

by husker (Chaplain)
on Jun 15, 2004 at 14:09 UTC ( [id://366897]=sourcecode: print w/replies, xml ) Need Help??
Category: Utility Scripts
Author/Contact Info husker (Mark Landin)
Description: Splits a file into N evenly-sized chunks, or into chunks with at most N lines each. Works on Windows or UNIX. Allows header or footer text to be prepended/appended to each output file.
#!/opt/perl5/bin/perl -w
##  
##  Split a file into 'num' equal-size chunks, or into chunks with 
##  at most 'num' lines each.
##
##  Usage:
##     chunkify <-n num | -l num> [-v] [-o <output name>] 
##         [-f <footer>] [-h <header>]  <input file>
##
#

use strict;
use Getopt::Std;

##
##  File-wide state.  These are file lexicals rather than parameters
##  because the main loop and write_chunk() further down read and update
##  them directly.
##
my ($verbose, $outfront, $outback, $froot, $chunks, $lines);
my ($infile, $headfile, @headlines, $footfile, @footlines);
my ($rem,@line,$fname,$x,$y,$opts);
my (%Options);

##
##  Parse the switches first.  getopts() removes every switch it
##  recognises from @ARGV, so what is left afterwards is the input file.
##
$opts = getopts ('vo:h:f:n:l:',\%Options);

$lines=0;
$chunks=0;

##
##  getopts() returns false on a bad switch.  The checks below look for a
##  switch that is present but has no value, i.e. its argument was missing.
##  After explaining, show the usage text and stop: carrying on would try
##  to split the file with a chunk size of zero.
##
if (! $opts) {
    foreach my $need ([n => 'a numeric argument'],
                      [l => 'a numeric argument'],
                      [o => 'a filename'],
                      [h => 'a filename'],
                      [f => 'a filename']) {
        my ($switch, $what) = @$need;
        if (exists $Options{$switch} && !defined $Options{$switch}) {
            print "\nERROR: -$switch requires $what if used.\n";
        }
    }
    ShowUsage();
    exit(1);
}

##
##  The input file is the last remaining argument.  Taking it only after
##  getopts() has run means a switch value (e.g. the 3 in "-n 3") can
##  never be mistaken for the file name.
##
if (! @ARGV) {
    print "\nERROR: No input file specified.\n";
    ShowUsage();
    exit(1);
}
$infile = $ARGV[$#ARGV];

##
##  Right now, assume the output file is based on the input file.
##  This can be overridden by the -o option.
##
$froot = $infile;

if ($Options{v}) {
    $verbose=$Options{v};
}
if ($Options{o}) {
    $froot = $Options{o};
}
if ($Options{l} && $Options{n}) {
    print "\nERROR: Cannot use -l and -n simultaneously.\n";
    exit(1);
}
if (! $Options{l} && ! $Options{n}) {
    print "\nERROR: Either -n or -l must be specified.\n";
    exit (1);
}

##
##  Whichever of -n / -l was given must be a positive whole number; the
##  value is later used as a divisor and as a modulus, so zero or a
##  non-numeric string would abort with "Illegal modulus zero".
##
foreach my $switch ('n', 'l') {
    next unless $Options{$switch};
    if ($Options{$switch} !~ /\A\d+\z/ || $Options{$switch} < 1) {
        print "\nERROR: -$switch requires a positive whole number.\n";
        exit(1);
    }
}

if ($Options{n}) {
    $chunks=$Options{n};
}
if ($Options{l}) {
    $lines=$Options{l};
}
if ($Options{f}) {
    $footfile=$Options{f};
}
if ($Options{h}) {
    $headfile=$Options{h};
}

$x=0;
$y=1;

##
##  Split the output name around the first '%': the sequence number goes
##  between the two halves.  The limit of 2 keeps any further '%' as
##  literal text instead of silently dropping what follows it.  When
##  there is no '%', $outback stays undefined and the sequence number is
##  simply appended.
##
($outfront,$outback) = split (/%/,$froot,2);
$outfront = '' unless defined $outfront;


##
##  If header and/or footer files were specified, get their contents.
##  Three-argument open is used so that characters such as '>' or '|' in
##  a file name can never change the open mode.
##

if (defined $headfile) {
    open (my $head_fh, '<', $headfile) || die "Could not open header file $headfile: $!";
    @headlines = <$head_fh>;
    close ($head_fh);
}

if (defined $footfile) {
    open (my $foot_fh, '<', $footfile) || die "Could not open footer file $footfile: $!";
    @footlines = <$foot_fh>;
    close ($foot_fh);
}

##  The input file keeps the bareword handle INFILE because the rest of
##  the script, including write_chunk(), reads from it by that name.
open (INFILE, '<', $infile) || die "Could not open file $infile: $!";

##
##  Either we were given the "lines" option, or we were given the "chunks" option.
##  If we got chunks, we need to figure out how many lines that's going to be, and
##  the only way is to read through the file, counting lines.
##

if ($chunks != 0) {
    ##  First pass: count the lines.  The counter is reset here so the
    ##  count does not depend on initialisation done elsewhere.
    $x = 0;
    while (<INFILE>) {
        $x++;
    }
    $lines = int($x / $chunks);

    ##  Did it divide out evenly? (eg: 141 lines into 13 "equal" files ain't evenly)
    ##  If not, the first $rem files each get one extra line.
    $rem = $x - ($chunks * $lines);
    foreach my $slot (1..$chunks) {
        $line[$slot] = $lines + ($slot <= $rem ? 1 : 0);
    }
    print "Input file has $x lines; will put at least $lines lines in each output file.\n" if $verbose;

    ##  Reopen the input so the splitting pass starts from the first line.
    ##  The parentheses matter: "close INFILE || die" binds the || to the
    ##  handle name, so the die could never fire.
    close (INFILE) || die "ERROR: Could not close $infile: $!";
    open (INFILE, '<', $infile) || die "Could not open file $infile: $!";
}
    
##
##  Main pass: keep producing numbered output files until the input is
##  exhausted.  $y is the running sequence number; it also indexes @line,
##  which holds a per-file line count when one was computed.
##
$y = 0;
while (! eof(INFILE)) {
    my $seq = sprintf("%0.3d", ++$y);

    ##  Sequence number goes between the two halves of the output name,
    ##  or on the end when there is no second half.
    $fname = defined $outback
           ? $outfront . $seq . $outback
           : $outfront . $seq;

    ##  Use this file's own line count if one exists, otherwise keep
    ##  whatever $lines already holds.
    if (defined $line[$y]) {
        $lines = $line[$y];
    }

    if ($verbose) {
        print "Creating $fname with $lines lines\n";
    }
    write_chunk();
}

close(INFILE);

##
##  write_chunk: write the next output file.
##
##  Takes no arguments; works on the file-wide variables instead:
##    $fname      - name of the output file to create (overwritten if present)
##    $lines      - maximum number of input lines to copy into it
##    @headlines  - text written before the copied lines (may be empty)
##    @footlines  - text written after the copied lines (may be empty)
##    INFILE      - open input handle; reading continues from where the
##                  previous call stopped
##    $x          - reset to 0, then counts the lines copied by this call
##
##  Dies on any open, write or close failure, or if $lines is below 1.
##
sub write_chunk {
    ##  $lines is used as a modulus below; refuse zero up front with a
    ##  clear message instead of failing with "Illegal modulus zero".
    die "ERROR: chunk size for $fname must be at least 1 line\n"
        unless $lines >= 1;

    open (OUTFILE, '>', $fname) || die "ERROR Could not open output file $fname: $!";
    $x=0;

    ##  Note the low-precedence "or" on every print.  With "||" the die
    ##  binds to the data being printed: an array is then forced into
    ##  scalar context (so its element count is printed, not its text),
    ##  and a line consisting of just "0" triggers the die by mistake.
    if (@headlines) {
        print OUTFILE @headlines or die "ERROR writing header to $fname: $!";
    }
    while(<INFILE>) {
        print OUTFILE $_ or die "ERROR writing to output file $fname: $!";
        $x++;
        last unless $x % $lines;    # stop once $lines lines have been copied
    }
    if (@footlines) {
        print OUTFILE @footlines or die "ERROR writing footer to $fname: $!";
    }

    ##  Parenthesised so the die applies to close() and not to the handle name.
    close (OUTFILE) || die "ERROR closing $fname: $!";
}

##
##  ShowUsage: print the command-line help text to standard output.
##  Takes no arguments and does not exit; the caller decides what to do next.
##
sub ShowUsage {
    ##  Single-quoted heredoc: the text contains '%' and '<' characters
    ##  and must be printed literally, with no interpolation.
    print <<'EOD';

Usage: chunkify.pl <-n num | -l num> [-o <output filename>] [-v]
        [-h <header text file>] [-f <footer text file>] <input file name>

Required parameters (specify only one):
   -n : split input file into 'num' (more-or-less) equal-size files.
   -l : split input file into files with 'num' lines.
   In most cases, not all output files will have the same number of lines. If -l is used,
     each file will have that many lines, except for the last file, which may have fewer.
     If -n is used, each file will have X or X+1 lines, where X is calculated based on
     number of output files required and size of the input file.
Optional parameters:
   -v : verbose output
   -o : output file name. Use % sign to indicate position of sequence number. If % not
        specified, sequence number will be appended.
    Examples:
       file%.txt: will generate file001.txt, file002.txt, file003.txt ...
       file.txt: will generate file.txt001, file.txt002, file.txt003 ...
       %file: will generate 001file, 002file, 003file ...
    If -o is not specified, output filename will be built from input file name, with
    sequence number appended.
   -h : file containing text to be prepended to each created output file.
   -f : file containing text to be appended to each created output file.
EOD
}

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: sourcecode [id://366897]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others admiring the Monastery: (4)
As of 2024-04-25 13:37 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found