#!/usr/bin/perl

=intro
Loci File Filter

This program will accept as input one file of genome loci (such as a VCF file)
and one BED file of regions, filtering the loci file to include only the loci
in the regions specified in the BED file. The program should be invoked with
two arguments: first the name of the BED file, then the name of the loci file.

Specifications:
Both files should be whitespace-delimited 
   The loci file should have at least 2 columns
      column 1 is the chromosome number
      column 2 is the coordinate of the locus
         any (optional) extra columns will be kept in the output file
   The BED file has exactly 3 columns
      column 1 is the chromosome number
      column 2 is the start coordinate
         the start coordinate itself is excluded i.e. output begins from start coordinate + 1
      column 3 is the end coordinate
         the end coordinate itself is included
Both files may have header/comment lines beginning with a '#' character
   These lines in the loci file are always included in the output file in their entirety
Both files may or may not have the letters 'chr' before the chromosome number
Both files should be sorted by coordinate, but chromosome order may differ between files
=cut

# Get input filenames on program invocation:
die "give the BED and loci (VCF) filenames as invocation arguments"
  unless @ARGV == 2;
my ( $bed, $loci ) = @ARGV;

# Check order of filenames given on invocation is correct:
my ($ext) = $bed =~ /(\.[^.]+)$/;    # get the file extension (should be '.bed')
if ( $ext ne '.bed' ) {
    die "please enter the BED filename *first*, followed by the loci filename";
}

# Get 2 input and 1 output filehandles:
open my $bed_file,  '<', "$bed"  or die "cannot open BED file: $!\n";
open my $loci_file, '<', "$loci" or die "cannot open loci file: $!\n";
open my $out_file, '>', "$loci.filtered_loci"
  or die "cannot open output file: $!\n";

# Build a hash structure from the BED file. Each coordinate between the start and
# the end coordinates is given a true value (i.e. 1); everything else is excluded:
my %sticks;
while (<$bed_file>) {
    chomp;
    next if /^#/;   # skip lines beginning with '#'

    my ( $bed_chr, $bed_start, $bed_end ) = split(/\s+/);
    $bed_chr =~ s/\D*(\d+)/$1/;    # remove 'chr' (should it exist)

    foreach my $coord ( ( $bed_start + 1 ) .. $bed_end ) {
        vec( $sticks{$bed_chr} //= '', $coord, 1 ) = 1;
    }
}

# Run through each line of the loci file:
while (<$loci_file>) {
    chomp;

    # keep lines beginning with '#':
    if (/^#/) {
        print $out_file "$_\n";
        next;    # go immediately to next line in the loci file
    }

	# of the remaining lines, only keep those that have a 'true' entry in the BED file hash:
    my ( $loci_chr, $loci_pos ) = split(/\s+/);    # ignore extra columns
    $loci_chr =~ s/\D*(\d+)/$1/;    # remove 'chr' (should it exist)

    if ( vec( $sticks{$loci_chr}, $loci_pos, 1 ) ) {
        print $out_file "$_\n";
    }
}