Hello, I'm a beginner programmer, and I know absolutely nothing about Perl. :) However my colleague gave me a Perl script, which basically searches for a list of keywords in an XML file, then it displays the matching lines. But my colleague needs other lines from the XML entry which contained the matching line. How can I do this? Here is the code:
#!/usr/bin/perl use strict; use warnings FATAL => 'all'; our $VERSION=1.01; use English '-no_match_vars'; use Getopt::Long; use Pod::Usage; use Readonly; use Term::ANSIColor qw(:constants); use Try::Tiny; use XML::Simple; use Win32::Console::ANSI; $Term::ANSIColor::AUTORESET = 1; Readonly::Scalar my $EXIT_SUCCESS => 0; Readonly::Scalar my $EXIT_SUCCESS_NO_HIT => 0; Readonly::Scalar my $EXIT_SUCCESS_HIT => 1; Readonly::Scalar my $EXIT_FAILURE_USAGE => 2; Readonly::Scalar my $EXIT_FAILURE_BLAST => 3; Readonly::Scalar my $EXIT_FAILURE_LIST => 4; Readonly::Scalar my $EXIT_FAILURE_GENERIC => 5; Readonly::Scalar my $EMPTY_LIST_SIZE => -1; my $blastFile; my $blackListFile; my $verbose; my $caseSensitive; my $outFile; sub fatalError { my ($msg, $st) = @_; print {*STDERR} RED "*** $msg$RS"; exit ((defined $st)?$st:$EXIT_FAILURE_GENERIC); } sub logWarning { my $msg = shift; print {*STDERR} RED "* $msg$RS"; return; } sub logMsg { my $msg = shift; return unless $verbose; print {*STDERR} WHITE "$msg$RS"; return; } sub parseCmdline { my $help; my $man; GetOptions ( 'verbose' => \$verbose, 'help' => \$help, 'man' => \$man, 'blast=s' => \$blastFile, 'list=s' => \$blackListFile, 'outfile=s' => \$outFile, 'case' => \$caseSensitive, ) or pod2usage(-exitval => $EXIT_FAILURE_USAGE); pod2usage(-exitval => $EXIT_SUCCESS) if ($help); pod2usage(-exitval => $EXIT_SUCCESS, -verbose => 2) if ($man); pod2usage(-exitval => $EXIT_FAILURE_USAGE) if ( ($#ARGV>$EMPTY_LIST_SIZE) or (not defined $blastFile) or (not defined $blackListFile) ); return; } sub readBlast { logMsg("Reading BLAST file $blastFile"); my $blast; try { $blast = XMLin($blastFile); } catch { fatalError("Cannot read BLAST file: $_", $EXIT_FAILURE_BLAST); }; return $blast; } sub getMicroOrgs { my $blast = shift; logMsg('Getting micro organisms from BLAST file'); my $hits = $blast->{BlastOutput_iterations}->{Iteration}->{Iterati +on_hits}->{Hit}; my $ret; push @{$ret}, $_->{Hit_def} foreach (@{$hits}); fatalError('No BLAST iteration found in BLAST file', $EXIT_FAILURE +_BLAST) if ($#{$ret} == $EMPTY_LIST_SIZE); return $ret; } sub readBlackList { my $ret = []; my $fh; logMsg('Reading black list'); open $fh, '<', $blackListFile or fatalError("Cannot open black lis +t file: $ERRNO", $EXIT_FAILURE_LIST); while (my $line = <$fh>) { chomp $line; $line =~ s/#.*//g; next if $line =~ /^\s*$/; if ($caseSensitive) { push @{$ret}, { entry=>$line, rexp=>qr/$line/ #compiled regular expression, so that +we only have to use =~ on the stored value }; } else { push @{$ret}, { entry=>$line, rexp=>qr/$line/i #compiled regular expression, so that + we only have to use =~ on the stored value }; } } close $fh or logWarning("Cannot close $blackListFile: $ERRNO"); return $ret; } sub getHits { my ($tmp1, $tmp2) = @_; logMsg('Checking hits'); my @microOrgs = @{$tmp1}; my @blackList = @{$tmp2}; my $ret = { patterns => {}, cnt => 0 }; foreach my $pa (@blackList) { $ret->{patterns}->{$pa->{entry}} = []; foreach my $mo (@microOrgs) { if ($mo =~ $pa->{rexp}) { push @{$ret->{patterns}->{$pa->{entry}}}, $mo; $ret->{cnt} = $ret->{cnt} + 1; } } } return $ret; } sub printReport { my $hits = shift; print {*STDOUT} CYAN "+--------+$RS"; print {*STDOUT} CYAN "| Report |$RS"; print {*STDOUT} CYAN "+--------+$RS"; print {*STDOUT} $RS; print {*STDOUT} CYAN 'Report generated at ' . localtime(time) . ' +by ' . getlogin . $RS; print {*STDOUT} $RS; print {*STDOUT} CYAN sprintf("BLAST file: %s$RS", $blastFile); print {*STDOUT} CYAN sprintf("Black list file: %s$RS", $blackListF +ile); print {*STDOUT} $RS; print {*STDOUT} CYAN 'Clash search result: '; if ($hits->{cnt} == 0) { print {*STDOUT} GREEN "No clash $RS"; return; } print {*STDOUT} RED sprintf("Clash: %d match(es) found$RS", $hits- +>{cnt}); print {*STDOUT} $RS; print {*STDOUT} CYAN "Clash details$RS"; print {*STDOUT} CYAN "-------------$RS"; foreach my $pattern (keys %{$hits->{patterns}}) { print {*STDOUT} CYAN "$pattern: "; my @hits = @{$hits->{patterns}->{$pattern}}; #if ($#hits == $EMPTY_LIST_SIZE) { # print {*STDOUT} GREEN '0'; # print {*STDOUT} CYAN " match$RS"; #} else { if ($#hits != $EMPTY_LIST_SIZE) { print {*STDOUT} RED ($#hits + 1); print {*STDOUT} CYAN " match(es)$RS"; foreach my $entry (@hits) { print {*STDOUT} RED "\t$entry$RS"; } } } return; } sub saveReport { my $hits = shift; logMsg("Saving report to $outFile in HTML format"); my $fh; open $fh, '>', $outFile or fatalError("Cannot open $outFile for wr +iting: $ERRNO"); print {$fh} '<?xml version="1.0"?>' . $RS; print {$fh} '<html><head><title>blastHit report</title></head><bod +y>'; print {$fh} '<h1>Report</h1>'; print {$fh} '<p>Report generated at ' . localtime(time) . ' by ' . + getlogin . '</p>'; print {$fh} sprintf('<p>BLAST file: %s</p>', $blastFile); print {$fh} sprintf('<p>Black list file: %s</p>', $blackListFile); print {$fh} '<h2>Clash search result</h2>'; if ($hits->{cnt} == 0) { print {$fh} '<p style="color:#005500">No clash</p>'; return; } print {$fh} sprintf ('<p style="color:#990000">Clash: %d match(es) + found</p>', $hits->{cnt}); print {$fh} '<h3>Clash(es) details</h3>'; print {$fh} '<ul>'; foreach my $pattern (keys %{$hits->{patterns}}) { my @hits = @{$hits->{patterns}->{$pattern}}; if ($#hits > $EMPTY_LIST_SIZE) { print {$fh} sprintf('<li>%s: <span style="color:#990000">% +d hit(s)</span></li>', $pattern, ($#hits+1)); print {$fh} '<ul>'; foreach my $entry (@hits) { print {$fh} sprintf('<li style="color:#990000">%s</li> +', $entry); } print {$fh} '</ul>'; } } print {$fh} '<ul>'; print {$fh} '</body></html>'; close $fh or logWarning("Cannot close $outFile: $ERRNO"); return; } sub main { my $blast = readBlast(); my $microOrgs = getMicroOrgs($blast); my $blackList = readBlackList(); my $hits = getHits($microOrgs, $blackList); printReport($hits); saveReport($hits) if ($outFile); return ($hits->{cnt}==0) ? $EXIT_SUCCESS_HIT : $EXIT_SUCCESS_NO_HI +T; } parseCmdline(); main();
(Copyright E<<>CE<>> 2014 Michael Hooreman Use is free to use, adapt and retribribute this tool, but must keep this notice and the original copyright notice. It is strongly appreciate to share any change or comment with the author.)
As I understood it first reads the XML file into the blast variable, then creates an array (microOrgs) of the Hit_def fields of the XML. Then it reads the keywords file, then compares the keywords with the Hit_def array and displays the matches.
As the seemingly simplest solution for me it would be OK if the elements of the microOrgs array were not just the Hit_def fields, but the corresponding Hit_num and Hsp_identity fields too. Dereferencing from the blast variable:
$blast->{BlastOutput_iterations}->{Iteration}->{Iteration_hits}->{Hit} +->{Hit_num} $blast->{BlastOutput_iterations}->{Iteration}->{Iteration_hits}->{Hit} +->{Hit_hsps}->{Hsp}->{Hsp_identity}
So then one element would look like eg. Hit_def | Hit_num | Hsp_identity, and the keywords are searched against this, and so in the output for a match we would get all three information on the same line.
I hope it is clear. Thanks a lot for any help in advance!
In reply to multiple XML fields in one line by smice
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |