#!/usr/local/bin/perl

# this script finds the residue numbers that are associated
# with a particular domain and retrieves the catalytic site 
# residues for that domain

use strict;
use English;
use Data::Dumper;
use UNIVERSAL qw(isa);
use FileHandle;

# declare some variables

# Working variables shared by the sections below.

my $domain;    # name of the domain structure currently being processed
my $fsg;       # not assigned in the visible code (only a commented-out use)

# Directory in which the domain structure files are stored. The domain
# name is appended directly to this string when a structure file is
# opened, so the value must end with a path separator.

my $dir = "<my directory>";

# Both input files are required. Fail early with a usage message rather
# than with a confusing "unable to open" error on an empty file name.

if (@ARGV < 2)
{
    die "Usage: $0 <domain_list_file> <csa_data_file>\n";
}

# First argument: a file listing the domains of interest, one per line.
# We go through this file one domain structure at a time so we can pull
# out the appropriate CSA data.

my $domFile = shift;

# Second argument: a file containing CSA data (from the website).

my $csafile = shift;

# Lookup of CSA records, keyed by "pdb.chain.residue".

my %csa_hash;

# Residues of each domain: $dom_hash{domain name}{residue number} = 1.

my %dom_hash;

# lets open the CSA data file

# Load the CSA records into %csa_hash, keyed by "pdb.chain.residue".
#
# A lexical handle and three-argument open are used so that the file
# name is always taken literally (no mode characters or pipes in the
# name are interpreted).

open(my $csa_fh, '<', $csafile) or die "unable to open $csafile: $!\n";

while (my $csa_line = <$csa_fh>)
{
    chomp $csa_line;

    # nothing to record on a blank line
    next unless $csa_line =~ /\S/;

    my @csa_data = split(/,/, $csa_line);

    # a usable record needs the PDB code (column 0), the chain
    # (column 3) and the residue number (column 4); shorter lines
    # would only produce meaningless keys such as "pdb.."
    next if @csa_data < 5;

    my ($pdb, $chain, $res) = @csa_data[0, 3, 4];

    # Remove stray white space from the PDB code and residue number so
    # that padded fields still compare equal later on. The chain is left
    # untouched because the comparison code reads it back as exactly one
    # character at a fixed offset of the key.
    s/\s+//g for ($pdb, $res);

    # the three values are concatenated to obtain a unique hash key
    $csa_hash{"$pdb.$chain.$res"} = 1;
}

close($csa_fh);

# debug aid: dump the hash containing the CSA data

#print Dumper(%csa_hash); looks good

# lets open the domFile file

# Read the list of domains and, for each one, collect the residue
# numbers of its CA atoms into %dom_hash:
#
#     $dom_hash{domain name}{residue number} = 1
#
# Lexical handles and three-argument open are used throughout, and every
# handle is closed once it has been read.

open(my $dom_fh, '<', $domFile) or die "ERROR: Unable to open $domFile for reading: $!\n";

while (my $dom_line = <$dom_fh>)
{
    # The domain name is the first white-space separated field.
    # Splitting on ' ' (rather than /\s+/) ignores leading white space,
    # so an indented line does not yield an empty domain name.
    my @data = split(' ', $dom_line);

    # skip blank lines instead of trying to open the bare directory
    next unless @data;

    $domain = $data[0];
    #$fsg = $data[1];

    # the structure file lives directly under $dir
    my $structure_file = "$dir$domain";

    open(my $pdb_fh, '<', $structure_file)
        or die "Sorry, unable to open $structure_file for reading: $!\n";

    while (my $line = <$pdb_fh>)
    {
        chomp $line;

        # NOTE(review): fields are taken by white-space splitting, which
        # assumes neighbouring columns of the structure file never run
        # together - confirm against the files actually used.
        my @ff = split(/\s+/, $line);

        # both the atom name (field 2) and the residue number (field 5)
        # must be present
        next if @ff < 6;

        my ($atom_name, $number) = @ff[2, 5];

        # only CA atoms are of interest: one entry per residue
        if ($atom_name eq "CA")
        {
            $dom_hash{$domain}{$number} = 1;
        }
    }

    close($pdb_fh);
}

close($dom_fh);

# ok, now we have both the CSA and domain information in hashes
# it should be easy to compare them

# Report every domain residue that is also a CSA catalytic residue.
#
# The CSA keys ("pdb.chain.res") are decoded once into a two-level index
#
#     $csa_index{pdb code . chain}{residue number} = 1
#
# so that each domain residue is checked with a hash lookup instead of a
# scan over the whole of %csa_hash.

my %csa_index;

for my $key (keys %csa_hash)
{
    # Fixed layout of the key: four-character PDB code, '.', one
    # character chain, '.', residue number starting at offset 7.
    # Shorter keys cannot hold a residue number.
    next if length($key) < 8;

    my $pdbchain = substr($key, 0, 4) . substr($key, 5, 1);
    my $res      = substr($key, 7, 6);

    # really remove the white space: a bare match (=~ /\S/g) leaves
    # the string unchanged, a substitution is needed
    $pdbchain =~ s/\s+//g;
    $res      =~ s/\s+//g;

    $csa_index{$pdbchain}{$res} = 1;
}

# Domains and residues are visited in sorted order so that the output is
# the same from run to run.

for my $protein_dom (sort keys %dom_hash)
{
    # the first five characters of the domain name are the PDB code
    # followed by the chain
    my $domchain = substr($protein_dom, 0, 5);
    $domchain =~ s/\s+//g;

    # no CSA data at all for this PDB chain
    next unless exists $csa_index{$domchain};

    my $site_residues = $csa_index{$domchain};

    # order by the leading integer of the residue number where there is
    # one, falling back to plain string order
    my @residues = sort {
        my ($num_a) = $a =~ /\A(-?\d+)/;
        my ($num_b) = $b =~ /\A(-?\d+)/;
        (defined $num_a && defined $num_b && $num_a <=> $num_b) || $a cmp $b
    } keys %{ $dom_hash{$protein_dom} };

    for my $residue (@residues)
    {
        next unless exists $site_residues->{$residue};

        # Output format kept as: domain chain, CSA chain, domain residue,
        # CSA residue. The two halves are equal by construction.
        print "$domchain $domchain $residue $residue\n";
    }
}