11 608502 11 1016988 11 1017495 11 1018088 11 3661585 #### #!/usr/bin/perl use warnings; use strict; use v5.10; # Create program to read in a series of VCF files, outputting a two-column # file consisting of chromosome and coordinate for each site seen in one or # more files ('master-site-list'). die "Give the names of each VCF" if @ARGV < 1; my $number_of_files = @ARGV; my $infile; my $current_file_number = 0; my %hash_of_chroms; open my $outfile, '>', "master-site-list" or die "Cannot open output file: $!"; until ( $current_file_number >= $number_of_files ) { open $infile, '<', $ARGV[$current_file_number] or die "Cannot open VCF file: $!"; while (<$infile>) { chomp; next if /^#/; my ( $chr, $coord ) = split(/\s+/); push @{ $hash_of_chroms{$chr} }, $coord unless defined $hash_of_chroms{$chr}[$coord]; # don't add duplicates } $current_file_number++; close $infile; } my %sorted_hash_of_chroms; foreach my $chrom ( keys %hash_of_chroms ) { push @{ $sorted_hash_of_chroms{$chrom} }, sort { $hash_of_chroms{$a}->[0] <=> $hash_of_chroms{$b}->[0] } keys %hash_of_chroms; } foreach my $chrom ( keys %hash_of_chroms ) { say $outfile "$chrom\t@{ $hash_of_chroms{$chrom} }"; } close $outfile; #### ##fileformat=VCFv4.1 ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19238 1 874816 . C CT 10 FINAL_AMBG SF=bcm_illum,bi_illum;validation_ratio=52/1019;sets=bcm_illum,bi_illum;samples=NA19238 GT 1/0 1 1647893 . C CTTTCTT 30 FINAL_NOT_CONFIRMED SF=bcm_illum,bcm_solid;validation_ratio=0/3380;sets=bcm_illum,bcm_solid;samples=NA19238 GT 0/0 1 7889972 rs57875989 GAGAATCCATCCCATCCTACTGCCAGCGCTCTGTCCACAGGATCGCCTCCCATGA G 33190 DESIGN_FAIL SF=bi_illum GT ./. 1 14106394 . A ACTC 100 FINAL_CONFIRMED SF=bcm_illum,bcm_solid,bc_illum;validation_ratio=864/1754;sets=bcm_illum,bcm_solid,bc_illum;samples=NA19238 GT 1/0