#!/usr/bin/perl
use strict;
use warnings;
use v5.14;

# Two-pass converter for a per-chromosome exome SNP file:
#
#   Pass 1: transpose the whitespace-separated input table (columns become
#           rows) and write it to an intermediate tab-separated file, in
#           preparation for using BioPerl.
#   Pass 2: re-read the intermediate file, drop the rows that are not needed,
#           reduce each genotype cell to "allele allele" and write CSV.
#
# Usage: <script> CHR CONT
#   CHR  - chromosome identifier used in the file names
#   CONT - suffix used in the file names
#
# Files (all relative to the current directory):
#   reads   chr${CHR}_exome_snps_only.vcf_${CONT}
#   writes  chr${CHR}_exome_snps_only_transposed_${CONT}   (intermediate)
#   writes  chr${CHR}_exome_snps_processed_${CONT}         (final CSV)

die "need two arguments (i.e. chr cont) at invocation" unless @ARGV == 2;

chomp( my $chr_num = shift );
chomp( my $cont    = shift );

# ---------------------------------------------------------------------------
# Pass 1: transpose columns to rows.
# ---------------------------------------------------------------------------

open my $in_file, '<', "chr${chr_num}_exome_snps_only.vcf_$cont"
    or die "Cannot open input file: $!";
open my $out_file, '>', "chr${chr_num}_exome_snps_only_transposed_$cont"
    or die "Cannot open output file: $!";

# $transposed[$column] collects, in input order, the value found in that
# column on every input line. It is filled while reading so the input is
# held in memory only once.
my @transposed = ();

while ( my $line = <$in_file> ) {
    chomp $line;
    my @fields = split ' ', $line;
    for my $column ( 0 .. $#fields ) {
        push @{ $transposed[$column] }, $fields[$column];
    }
}
close $in_file or die "Cannot close input file: $!";

# Join with tabs so no row ends in a stray separator; a trailing tab would
# become an empty trailing CSV field on the header row in pass 2.
for my $new_row (@transposed) {
    print {$out_file} join( "\t", @{$new_row} ), "\n";
}

# The intermediate file is re-read below. It must be closed (and therefore
# flushed) first, otherwise pass 2 can see a truncated file.
close $out_file or die "Cannot close output file: $!";

# ---------------------------------------------------------------------------
# Pass 2: remove unnecessary rows, clean up the SNPs and put into CSV format.
# ---------------------------------------------------------------------------

open my $new_in_file, '<', "chr${chr_num}_exome_snps_only_transposed_$cont"
    or die "Cannot open new input file: $!";
open my $new_out_file, '>', "chr${chr_num}_exome_snps_processed_$cont"
    or die "Cannot open new output file: $!";

# After transposition each row is one column of the input. Going by the
# sample header shown at the end of this file, row 1 is #CHROM, row 2 is POS,
# rows 3-9 are ID .. FORMAT and every later row is one sample.
while ( my $line = <$new_in_file> ) {
    chomp $line;

    next if $. == 1;    # remove header line

    if ( $. == 2 ) {
        $line =~ s/POS/SAMPLE/;
        $line =~ s/\s+/,/g;    # replace whitespace with a comma on ID row
    }

    next if $. >= 3 and $. <= 9;    # remove unwanted lines

    # Clean up the SNPs & put into CSV format: a cell such as
    # "0|0:0.000:..." becomes ",0 0".
    $line =~ s/\s+(\d)\|(\d)\S+/,$1 $2/g;

    print {$new_out_file} "$line\n";
}

close $new_in_file  or die "Cannot close new input file: $!";
close $new_out_file or die "Cannot close new output file: $!";

=pod

sample $in_file (note: hundreds more columns i.e. individuals beginning with HG, and hundreds of thousands more rows):

    #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00553 HG00554 HG00637
    22 16287226 . C T 100 PASS SNP;BAVGPOST=0.986;BRSQ=0.635;LDAF=0.0167;AVGPOST=0.9859;RSQ=0.6343;ERATE=0.0026;THETA=0.0022;AC=23;AN=2184 GT:DS:GL:BD 0|0:0.000:-0.48,-0.48,-0.48:0.021 0|0:0.000:-0.00,-2.69,-5.00:0.0001 0|0:0.000:-0.00,-2.53,-5.00:0.0001
    22 16287365 . C T 100 PASS SNP;BAVGPOST=0.998;BRSQ=0.540;LDAF=0.0030;AVGPOST=0.9959;RSQ=0.4172;ERATE=0.0009;THETA=0.0201;AC=2;AN=2184 GT:DS:GL:BD 0|0:0.000:-0.48,-0.48,-0.48:0.0024 0|0:0.000:-0.00,-4.40,-5.00:0 0|0:0.000:-0.02,-1.38,-5.00:0.0002
    22 16287649 . G A 100 PASS SNP;BAVGPOST=0.949;BRSQ=0.519;LDAF=0.0645;AVGPOST=0.9342;RSQ=0.5421;ERATE=0.0115;THETA=0.0095;AC=81;AN=2184 GT:DS:GL:BD 0|0:0.050:-0.17,-0.50,-2.67:0.072 0|0:0.000:-0.00,-2.34,-5.00:0.0002 0|0:0.000:-0.01,-1.86,-5.00:0.0024
    22 16287784 . C T 100 PASS SNP;BAVGPOST=0.998;BRSQ=0.844;LDAF=0.0089;AVGPOST=0.9941;RSQ=0.7151;ERATE=0.0030;THETA=0.0043;AC=13;AN=2184 GT:DS:GL:BD 0|0:0.000:-0.48,-0.48,-0.48:0.0104 0|0:0.000:-0.00,-2.79,-5.00:0.0001 0|0:0.000:0.00,-5.00,-5.00:0
    22 16287851 . G A 100 PASS SNP;BAVGPOST=0.939;BRSQ=0.764;LDAF=0.1473;AVGPOST=0.9183;RSQ=0.7385;ERATE=0.0015;THETA=0.0309;AC=280;AN=2184 GT:DS:GL:BD 0|0:0.100:-0.08,-0.78,-5.00:0.0406 0|0:0.000:-0.00,-2.76,-5.00:0 0|0:0.000:-0.00,-3.49,-5.00:0
    22 16287912 . G A 100 PASS SNP;BAVGPOST=0.998;BRSQ=0.800;LDAF=0.0057;AVGPOST=0.9983;RSQ=0.8580;ERATE=0.0006;THETA=0.0210;AC=11;AN=2184 GT:DS:GL:BD 0|0:0.000:-0.12,-0.63,-4.70:0.0028 0|0:0.000:-0.01,-1.91,-5.00:0.0003 0|0:0.000:-0.00,-2.30,-5.00:0

sample $new_out_file (note: hundreds of thousands more columns, hundreds more rows)

    SAMPLE,16287215,16287226,16287365,16287649,16287784
    HG00553,0 0,0 0,0 0,0 0,0 0
    HG00554,0 0,0 0,0 0,0 0,0 0
    HG00637,0 0,0 0,0 0,0 0,0 0
    HG00638,0 0,0 0,0 0,0 0,0 0
    HG00640,0 0,0 0,0 0,0 0,0 0
    HG00641,0 0,0 0,0 0,0 0,0 0
    HG00731,0 0,0 0,0 0,0 0,0 0