#!/usr/bin/perl
use warnings;
use strict;

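# Formats a PennCNV output file to remove extraneous information. Also removes
# CNVs containing fewer than 5 consecutive SNPs. Invoke this program with the
# name of the file to be formatted; the input file is left unchanged and the
# result is written to a new file with '.truncated' appended to that name.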

# Open the input file named on the command line and create the output file,
# which has the same name with the suffix '.truncated' appended.
# Fail early with a usage message when no filename was supplied, rather than
# letting open() report a confusing error for an undefined path.
my $input_path = $ARGV[0];
die "Usage: $0 <PennCNV output file>\n"
  unless defined $input_path and length $input_path;
my $output_path = $input_path . '.truncated';

# The path is included in each message so the failing file is identifiable.
open( my $file, '<', $input_path )
  or die "Cannot open '$input_path' for reading, $!";
open( my $out, '>', $output_path )
  or die "Cannot open '$output_path' for writing, $!";

# Write the column headings to the output file, left-justified in fixed-width
# columns and followed by a blank line.
my @column_headings = (
    'Sample I.D.',
    'Chromosome & coordinates',
    'Copy number',
    'No. SNPs',
    'CNV length (bp)',
    'First SNP',
    'Last SNP',
    'Overlapping Gene(s)',
);
printf {$out} "%-28s %-30s %-12s %-10s %-18s %-22s %-21s %s \n\n",
  @column_headings;

# Pattern for one annotated PennCNV call. It is compiled once, outside the
# loop, and uses the '/x' flag to allow arbitrary whitespace so it can be
# split over several lines with a comment per field. Only the fields that are
# printed, plus the bare SNP count used for filtering, are captured; the other
# fields must be present but are discarded.
# NOTE(review): only numeric chromosome names (chr followed by digits) and SNP
# names starting with 'rs' match; other lines are skipped - confirm intended.
my $cnv_line_re = qr{
    (chr\d+:\d+-\d+)        # chromosome & coordinates
    \s+
    (numsnp=(\d+))          # number of SNPs with its label, and the bare count
    \s+
    (length=\S+)            # CNV length (bp)
    \s+
    state\d+                # HMM state (not printed)
    ,
    (cn=\d+)                # copy number
    \s+
    \S+                     # file directory (not printed)
    /
    (\S+)                   # sample I.D.
    \s+
    (startsnp=rs\d+)        # first SNP in CNV
    \s+
    (endsnp=rs\d+)          # last SNP in CNV
    \s+
    (\S+)                   # gene(s) overlapping CNV
    \s+
    \S+                     # distance of gene(s) from CNV (not printed)
}x;

# CNVs supported by fewer SNPs than this are dropped from the output.
my $min_snps = 5;

# Read the input one line at a time, skipping anything that is not a
# recognisable CNV call or that has too few SNPs.
while ( my $line = <$file> ) {

    # Capture straight into named variables instead of relying on the numbered
    # capture variables surviving a later match.
    my (
        $region,      $num_snps,  $snp_count, $length,
        $copy_number, $sample_id, $first_snp, $last_snp,
        $genes
    ) = $line =~ $cnv_line_re
      or next;

    # Compare the count numerically so that 0 and zero-padded values are
    # filtered as well as 1 to 4.
    next if $snp_count < $min_snps;

    # Print each line to output file using left-justified formatting:
    printf {$out} "%-28s %-30s %-12s %-10s %-18s %-22s %-21s %s \n",
      $sample_id, $region, $copy_number, $num_snps, $length,
      $first_snp, $last_snp, $genes
      or die "Cannot write to output file, $!";
}

# Close the open filehandles. Output is buffered, so a failed write (for
# example on a full disk) may only be reported when the output handle is
# closed; check that close so an incomplete result is not taken for success.
close($file);
close($out) or die "Cannot finish writing '$ARGV[0].truncated', $!";