#!/usr/bin/perl -w use strict; ## watch typos and errors $|++; ## output my $begin = time; ## just seeing how long we take ## change these my $root = 'C:\Windows\'; ## where the base of our script is my $source = "$root/PDBFind.txt"; ## do NOT use the full PDBFind.TXT file, I mean it! my $out = "$root/out.txt"; ## file to save blast results to my $blast_exe = 'C:\Blast\bl2seq.exe'; ## where our blast program is my $options = '-G 11 -E 1 -W 3 -X 50 -F F -p blastp'; ## our blast options my $min_size = 10; ## don't blast shorty sequences, this isn't a protein anyway ## prolly don't change these my $blast_i = "$root/i.$$"; ## temp files to hold sequences my $blast_j = "$root/j.$$"; ## quick recap so we know what we are doing and where we are print <<"ENDOFPRINT"; SCRIPT : $0 IN : '$source' OUT : '$out' BLAST : '$blast_exe $options -i "$blast_i" -j "$blast_j"' ENDOFPRINT ## lines in PDBFind I care about my $DELIMITER = '//'; my $ID = 'ID : '; my $CHAIN = 'Chain : '; my $SEQUENCE = ' Sequence : '; my $COMPOUND = 'Compound : '; my $SOURCE = 'Source : '; my $HEADER = 'Header : '; ## counters my $pdb_count = 0; my $chain_count = 0; my $blast_count = 0; my $wild_count = 0; my $stub_count = 0; my $dna_count = 0; ## variables relevant to data I am looking for my ( $id, $chain ); my ( @chain_seq_pairs, @chain_wild, @chain_stubby, @chain_dna ); ## open files open( SOURCE, "<$source" ) or die "Can not open $source for reading: $!"; open( OUT, ">$out" ) or die "Can not open $out for writing: $!"; ## parse source and blast chains while ( ) { chomp; next if ( /^\s*$/ ); ## match lines if ( /^$DELIMITER/ ) { ## a holder my @been_blasted; ## a little sanity next unless $id; ## up counter $pdb_count++; ## part of final counter print OUT "${ID}${id}\n"; ## blast everything foreach my $i ( @chain_seq_pairs ) { foreach my $j ( @been_blasted ) { print OUT "${id} $i->[0] $j->[0]", '-' x 30, "\n"; ## fill our blast i and j temp files open( BLASTI, ">$blast_i" ) or die "Can not open $blast_i for writing: $!"; open( BLASTJ, ">$blast_j" ) or die "Can not open $blast_j for writing: $!"; print BLASTI $i->[1]; print BLASTJ $j->[1]; close( BLASTI ); close( BLASTJ ); ## blast away my @blast_results = split( "\n", `$blast_exe $options -i "$blast_i" -j "$blast_j"` ); ## increase counter $blast_count++; ## now cycle through lines, ## put filters, what have yous here foreach my $line ( @blast_results ) { print OUT "${id} $i->[0] $j->[0]| $line\n"; } } ## save current sequence to be blasted against push( @been_blasted, $i ); } print OUT "\n"; ## increase counters $chain_count += scalar( @chain_seq_pairs ); $wild_count += scalar( @chain_wild ); $stub_count += scalar( @chain_stubby ); $dna_count += scalar( @chain_dna ); ## clean up a few things undef @chain_seq_pairs; undef @chain_dna; undef @chain_stubby; undef @chain_wild; undef $id; undef $chain; next; } if ( /^$ID/ ) { $id = $'; next; } ## These are noops, but this is how you can catch their values ## if ( /^$COMPOUND/ ) {} ## if ( /^$HEADER/ ) {} ## if ( /^$SOURCE/ ) {} if ( /^$CHAIN/ ) { $chain = $'; next; } if ( /^$SEQUENCE/ ) { my $tmp = $'; my $bad = 0; if ( $tmp =~ /^x+$/i ) { ## we're a straight wild card chain ## ignore because that means we can be anything ## just a bunch of aa's stuck together but ## we don't know / care about specifics push( @chain_wild, [ $chain, $tmp ] ); $bad++; } if ( $tmp ne uc( $tmp ) ) { ## we're dna, crude test, but it works on the data ## DNA is lower case push( @chain_dna, [ $chain, $tmp ] ); $bad++; } if ( length( $tmp ) <= $min_size ) { ## we're too short push( @chain_stubby, [ $chain, $tmp ] ); $bad++; } next if $bad; push( @chain_seq_pairs, [ $chain, $tmp ] ); next; } } ## clean up close( OUT ); close( SOURCE ); unlink( $blast_i, $blast_j ); ## no error checking?! my $end = time - $begin; print <<"ENDOFPRINT"; PDBs found : $pdb_count Chains found : $chain_count Blasts performed : $blast_count Wild sequences : $wild_count Stubby sequences : $stub_count DNA sequences : $dna_count TIME : $end seconds ENDOFPRINT print "\nDone\n"; exit(0);