| Category: | Utility Scripts |
| Author/Contact Info | Corion |
| Description: | A counterpart to part, it allows you to join two files side by side according to common values. This is similar to the join UNIX command except that the join command expects the input files to be sorted according to the keys, while this program will slurp the second file into a hash and then output the result according to the order of the first file. Optionally (and untested) it can use a tied hash as on-disk storage in the case that the storage for the files is larger than the available RAM. |
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use File::Temp qw( :POSIX );
use vars qw($VERSION);
$VERSION = '0.03';
# Try to load Pod::Usage and install a fallback if it doesn't exist
eval {
require Pod::Usage;
Pod::Usage->import();
1;
} or do {
*pod2usage = sub {
die "Error in command line.\n";
};
};
GetOptions(
"disk" => \my $do_tie,
"on|j=s" => \my $joincol,
"left|j1|1=s" => \my @left_key_cols,
"right|j2|2=s" => \my @right_key_cols,
"output|o" => \my @output_fieldlist,
"delimiter|t|d=s" => \my $delimiter,
"output-delimiter|od" => \my $output_delimiter,
"missing|v=i" => \my @missing,
"null|n=s" => \my $nullvalue,
"warn-on-duplicates|u=s" => \my @warn_on_duplicates,
"die-on-duplicates|s=s" => \my @die_on_duplicates,
"smart-duplicates" => \my $smart_duplicates,
"progress|verbose|p" => \my $progress,
'help' => \my $help,
'version' => \my $version,
) or pod2usage(2);
pod2usage(1) if $help;
if (defined $version) {
print "$VERSION\n";
exit 0;
};
pod2usage("$0: No files given.") if ((@ARGV == 0) && (-t STDIN));
$delimiter ||= "\t";
$nullvalue ||= "";
$output_delimiter ||= $delimiter;
$joincol ||= 1;
if (! @left_key_cols) { @left_key_cols = $joincol; };
if (! @right_key_cols) { @right_key_cols = $joincol; };
my %output_missing = map { $_ => $_ } @missing;
my %col_count;
for (\@left_key_cols, \@right_key_cols) {
@$_ = map { split /,/ } @$_;
};
if (@left_key_cols != @right_key_cols) {
local $" = ",";
warn "Left keys: @left_key_cols\n";
warn "Right keys: @right_key_cols\n";
die "Differing number of key columns between left and right - that
+ is wrong.\n";
};
# Adjust the indices for the join columns:
for (@left_key_cols, @right_key_cols) {
$_--
};
my @output_cols = ('1.*','2.%');
if (@output_fieldlist) {
@output_cols = map { split /,/ } @output_fieldlist;
};
if (@warn_on_duplicates and not @die_on_duplicates) {
@warn_on_duplicates = (2);
};
my %on_duplicate;
$on_duplicate{ $_ } = sub { warn "Duplicate key '$_[0]' for row >>$_[1
+]<< in file $_[2]\n" } for @warn_on_duplicates;
$on_duplicate{ $_ } = sub { die "Duplicate key '$_[0]' for row >>$_[1
+]<< in file $_[2]\n" } for @die_on_duplicates;
my %right; # The index into the right file
my %seen; # The keys we processed from the left file
my @CLEANUP;
if ($do_tie) {
require DB_File;
my $rn = tmpnam;
tie %right, 'DB_File', $rn;
my $sn = tmpnam;
tie %seen, 'DB_File', $sn;
push @CLEANUP, $rn, $sn;
};
END {
if ($do_tie) {
untie %right;
untie %seen;
};
for (@CLEANUP) {
unlink $_ or warn "Couldn't remove tempfile '$_' : $!\n";
};
};
my ($left,$right) = @ARGV;
# Read the right file into the hash
open my $rfh, "<", $right
or die "Couldn't read '$right': $!";
open my $lfh, "<", $left
or die "Couldn't read '$left': $!";
sub key {
my ($cols,$col_info) = @_;
return join $delimiter, @{ $cols }[ @$col_info ];
};
sub output {
my @lr = (@_);
my @output = map { /^(\d)\.(\d+)/ or die "Invalid column spec '$_'
+"; $lr[$1-1]->[$2-1] } @output_cols;
print join($delimiter, @output), "\n";
};
sub expand_output_columns {
my (@list) = @_;
my %keycols = (
1 => +{ map { $_+1 => 1 } @left_key_cols },
2 => +{ map { $_+1 => 1 } @right_key_cols },
);
my @res = map { /(\d)\.\*/ ? (map { "$1.$_" } (1..$col_count{ $1 }
+))
: /(\d)\.\%/ ? (map { "$1.$_" } grep { ! exists $key
+cols{$1}{$_}} (1..$col_count{ $1 }))
: $_
} @list;
@res
};
warn "Reading $right"
if $progress;
while (<$rfh>) {
chomp;
my @right_cols = split /\Q$delimiter\E/;
$col_count{ 2 } ||= @right_cols;
my $key = key( \@right_cols, \@right_key_cols );
if ($right{ $key } and $on_duplicate{2}) {
my $diff = $right{ $key } ne $_;
if ($diff or !$smart_duplicates) {
$on_duplicate{2}->($key,$_,$right)
};
};
$right{ $key } = $_;
};
# Read the left file and output the generated lines (if any)
warn "Processing $left"
if $progress;
my $expanded_output_columns;
while (<$lfh>) {
chomp;
my @left_cols = split /\Q$delimiter\E/;
$col_count{ 1 } ||= @left_cols;
my $key = key( \@left_cols, \@left_key_cols );
if ($seen{ $key } and $on_duplicate{1}) {
my $diff = $seen{ $key } ne $_;
if ($diff or !$smart_duplicates) {
$on_duplicate{1}->($key,$_,$left)
};
};
$seen{ $key }++;
my $out;
my @right_cols;
if (exists $right{ $key }) {
@right_cols = split /\Q$delimiter\E/, $right{ $key };
} else {
@right_cols = ($nullvalue) x $col_count{ 2 };
};
if (exists $right{ $key } or $output_missing{1}) {
if (! $expanded_output_columns) {
@output_cols = expand_output_columns(@output_cols);
$expanded_output_columns++;
};
output \@left_cols, \@right_cols;
};
};
@output_cols = expand_output_columns(@output_cols);
if ($output_missing{2}) {
warn "Writing right-missing keys"
if $progress;
my @left_cols = ($nullvalue) x $col_count{ 1 };
while ((my ($key,$v)) = each %right) {
if (! $seen{ $key }) {
my @right_cols = split /\Q$delimiter\E/, $v;
output \@left_cols, \@right_cols;
};
};
};
__END__
=head1 NAME
join - join two files by common key columns
=head1 SYNOPSIS
join.pl [OPTIONS] FILE1 FILE2
join.pl --on 1,2 file1.txt file2.txt
join.pl --left 1,2 --right 3,4 file1.txt file2.txt
=head1 OPTIONS
=item B<--on COL> - specify a single column number to join both files
+on
This is a shorthand for C<--left COL --right COL>
=item B<--missing FILE> - output rows only in one file
C<--missing 1> will output rows that only exist in the left file.
=item B<--null VAL> - string for the null value
When a row is output through the C<--missing> option, the missing
values will be replaced by the value given.
The default is an empty string, "".
Example: --null NULL
=item B<--warn-on-duplicates FILE> - output a warning if duplicate key
+s are found in the file
=item B<--die-on-duplicates FILE> - die if duplicate keys are found in
+ the file
These options govern how the program behaves when it encounters
duplicate keys in a file.
=item B<--smart-duplicates> - be smart about duplicates
This setting enables smart duplicate handling that will
only consider a row as duplicate if the key is identical but
the remaining values differ.
=item B<--left COL1,COL2> - specify key columns for the left file
=item B<--right COL1,COL2> - specify key columns for the right file
The column counts starting at 1. The default column is 1.
=item B<--output COL1,COL2> - specify columns to output
If you want to reorder or omit columns use this to
list the columns. Each column must be in the format
C<M.N> where M is either 1 for the left file or
2 for the right file, and N is the column number.
There are two shorthands:
C<M.*> will include all columns from the source file
in source order.
C<M.%> will include all columns from the source file
except the key columns in source order.
The default is C<1.* 2.%>, which will append
the non-key columns of the right file to the left file.
=item B<--delimiter DEL> - specify column input delimiter
The default input delimiter is a tab. No automatic
delimiter recognition is done yet.
=item B<--output-delimiter DEL> - specify output column delimiter
The output column delimiter defaults to the input
column delimiter.
=item B<--progress> - be verbose in the progress
Some diagnostic messages will be output to STDERR
as the program progresses.
=item B<--disk> - use disk memory for joining instead of RAM
This will use disk memory for storing the index instead
of using RAM.
=item B<--version> - print program version
Outputs the program version.
=item B<--help> - print this page
Outputs this help text.
|
|
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: join - join two files according to a common key
by Anonymous Monk on Mar 12, 2018 at 16:09 UTC | |
by Anonymous Monk on Mar 12, 2018 at 16:11 UTC |