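#### Each record in the file has the following fields, in this order ###
Exon # (optionally followed by a line such as "4 different hits")
Gene id
Nm_id
snoRNA Key
Sequence alignment text: Query line, match line, Sbjct line (repeated once per hit)
Gene name and weblink
## Start of the records ###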
3
GI:91982771
NM_001040105.1
snoRNA 10
Query 4 TGGAGTCAAT 13
||||||||||
Sbjct 4854 TGGAGTCAAT 4845
Homo sapiens mucin 17, cell surface associated (MUC17), mRNA.
http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&dopt=GenBank&RID=UDU305DZ01N&log%24=nuclalign&blast_rank=97&list_uids=91982771
3
GI:154448895
NM_001100162.1
snoRNA 25, 26 and 27
Query 2 CCTGGAGTCGAGTG 15
||||||||||||||
Sbjct 146 CCTGGAGTCGAGTG 133
Homo sapiens exportin 7 (XPO7), transcript variant 3, mRNA.
http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&dopt=GenBank&RID=UDW41RSS01S&log%24=nuclalign&blast_rank=2&list_uids=154448895
31
4 different hits
GI:153945877
NM_002458.1
snoRNA 25, 26 and 27
Query 3 CTGGAGTCGAGTG 15
|||||||||||||
Sbjct 6818 CTGGAGTCGAGTG 6806
Query 3 CTGGAGTCGAGTG 15
|||||||||||||
Sbjct 8489 CTGGAGTCGAGTG 8477
Query 3 CTGGAGTCGAGTG 15
|||||||||||||
Sbjct 10589 CTGGAGTCGAGTG 10577
Query 3 CTGGAGTCGAGTG 15
|||||||||||||
Sbjct 12260 CTGGAGTCGAGTG 12248
Homo sapiens mucin 5B, oligomeric mucus/gel-forming (MUC5B), mRNA.
http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&dopt=GenBank&RID=UDW41RSS01S&log%24=nuclalign&blast_rank=9&list_uids=153945877
4
GI:150418008
NM_206862.2
snoRNA 25, 26 and 27
Query 1 ACCTGGAGTCGAG 13
|||||||||||||
Sbjct 4775 ACCTGGAGTCGAG 4763
Homo sapiens transforming, acidic coiled-coil containing protein 2 (TACC2), transcript variant 1, mRNA.
http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&dopt=GenBank&RID=UDW41RSS01S&log%24=nuclalign&blast_rank=10&list_uids=150418008
####
#!/usr/local/bin/perl
use strict;
use warnings;
# Read the BLAST-hit record file line by line.  Every non-blank line that
# belongs to a record (from its exon-number line through its URL line) is
# echoed to the output file, and each recognised field is collected into
# its own array.  The arrays hold the matching input lines.
my $in_path  = "F:/Bioinformatics_NCBI/20MARCH_10/PERL Analysis/test.txt";
my $out_path = "F:/Bioinformatics_NCBI/20MARCH_10/PERL Analysis/testOut.txt";    #TESTING

open my $in_fh, '<', $in_path
    or die "Cannot open '$in_path' for reading: $!\n";
open my $out_fh, '>', $out_path
    or die "Cannot open '$out_path' for writing: $!\n";

# @exonNumbers is declared here as well: it is pushed to below and an
# undeclared array is a compile-time error under "use strict".
my (@snoRNA, @exonNumbers, @geneID, @productID, @geneNames, @references,
    @queries, @subjects);

# A record ends with its web link; the same pattern collects the references.
my $url_re = qr{https?://};

LINE:
while ( my $line = <$in_fh> ) {
    chomp $line;
    $line =~ s/\r\z//;    # tolerate CRLF input read on a non-Windows perl

    # Flip-flop range: true from an exon-number line up to and including the
    # URL line of the same record.  The closing pattern must not require a
    # newline, because chomp has already removed it.
    if ( $line =~ /^\d+$/ .. $line =~ $url_re ) {

        # Only lines containing a word character or a pipe are of interest;
        # anything else (e.g. a blank line) is skipped entirely.
        next LINE unless $line =~ /[\w|]/;
        print {$out_fh} $line, "\n"
            or die "Cannot write to '$out_path': $!\n";
    }

    if ( $line =~ /^snoRNA[\s-]*\d/ ) {    # snoRNA key, e.g. "snoRNA 25, 26 and 27"
        push @snoRNA, $line;
    }
    if ( $line =~ /^\d+$/ ) {              # exon number (a line of digits only)
        push @exonNumbers, $line;
    }
    if ( $line =~ /^GI:\d+/ ) {            # gene id, e.g. "GI:91982771"
        push @geneID, $line;
    }
    if ( $line =~ /^NM_\d+(?:\.\d+)?/ ) {  # gene product id, e.g. "NM_002458.1"
        ( my $product_id = $line ) =~ s/\s+$//;    # drop trailing blanks
        push @productID, $product_id;
    }
    if ( $line =~ /homo sapiens\b/i ) {    # gene name (single-line only)
        push @geneNames, $line;
    }
    if ( $line =~ $url_re ) {              # web reference (single-line only)
        push @references, $line;
    }

    # Alignment lines: "Query <pos> <bases> <pos>" / "Sbjct <pos> <bases> <pos>".
    # The whitespace between the label and the start position is removed
    # before the line is stored; the rest of the line is kept verbatim.
    if ( my ( $label, $alignment )
        = $line =~ /^(Query|Sbjct)\s+(\d+\s+[acgt].*)/i )
    {
        my $target = lc($label) eq 'query' ? \@queries : \@subjects;
        push @{$target}, $label . $alignment;
    }
}

close $in_fh
    or die "Cannot close '$in_path': $!\n";

# Buffered write errors only surface at close, so check it.
close $out_fh
    or die "Cannot close '$out_path': $!\n";
####