#!/usr/bin/perl =intro Loci File Filter This program will accept as input one file of genome loci (such as a VCF file) and one BED file of regions, filtering the loci file to include only the loci in the regions specified in the BED file. The program should be invoked with two arguments: first the name of the BED file, then the name of the loci file. Specifications: Both files should be whitespace-delimited The loci file should have at least 2 columns column 1 is the chromosome number column 2 is the coordinate of the locus any (optional) extra columns will be kept in the output file The BED file has exactly 3 columns column 1 is the chromosome number column 2 is the start coordinate the start coordinate itself is excluded i.e. output begins from start coordinate + 1 column 3 is the end coordinate the end coordinate itself is included Both files may have header/comment lines beginning with a '#' character These lines in the loci file are always included in the output file in their entirety Both files may or may not have the letters 'chr' before the chromosome number Both files should be sorted by coordinate, but chromosome order may differ between files =cut # Get input filenames on program invocation: die "give the BED and loci (VCF) filenames as invocation arguments" unless @ARGV == 2; my ( $bed, $loci ) = @ARGV; # Check order of filenames given on invocation is correct: my ($ext) = $bed =~ /(\.[^.]+)$/; # get the file extension (should be '.bed') if ( $ext ne '.bed' ) { die "please enter the BED filename *first*, followed by the loci filename"; } # Get 2 input and 1 output filehandles: open my $bed_file, '<', "$bed" or die "cannot open BED file: $!\n"; open my $loci_file, '<', "$loci" or die "cannot open loci file: $!\n"; open my $out_file, '>', "$loci.filtered_loci" or die "cannot open output file: $!\n"; # Build a hash structure from the BED file. Each coordinate between the start and # the end coordinates is given a true value (i.e. 1); everything else is excluded: my %sticks; while (<$bed_file>) { chomp; next if /^#/; # skip lines beginning with '#' my ( $bed_chr, $bed_start, $bed_end ) = split(/\s+/); $bed_chr =~ s/\D*(\d+)/$1/; # remove 'chr' (should it exist) foreach my $coord ( ( $bed_start + 1 ) .. $bed_end ) { vec( $sticks{$bed_chr} //= '', $coord, 1 ) = 1; } } # Run through each line of the loci file: while (<$loci_file>) { chomp; # keep lines beginning with '#': if (/^#/) { print $out_file "$_\n"; next; # go immediately to next line in the loci file } # of the remaining lines, only keep those that have a 'true' entry in the BED file hash: my ( $loci_chr, $loci_pos ) = split(/\s+/); # ignore extra columns $loci_chr =~ s/\D*(\d+)/$1/; # remove 'chr' (should it exist) if ( vec( $sticks{$loci_chr}, $loci_pos, 1 ) ) { print $out_file "$_\n"; } }