Cappadonna3030 has asked for the wisdom of the Perl Monks concerning the following question:

Hello: I recently wrote a script that will extract medical data from XML. I've been using XML::DOM; however, upon further testing, I realize that for my needs (this will be an automated script handling up to seven 20k XML documents at one time) DOM isn't flexible enough.

Upon further research, I realize that XML::SAX (or any general event-driven XML parser) is probably my best bet for my requirements. However, I've never used SAX before and I have no idea how to begin to change over my code. I've been poring over web documents (including an online PDF from the Perl and XML books from O'Reilly and New Riders), and I am still lost (or maybe it's panic taking hold?).

Here is my original, XML::DOM code

package CathReport;

require XML::DOM;
use Date::Manip;
use POSIX qw(ceil floor);
use strict;
use warnings;

# Create an empty CathReport object.
#
# Every report section starts out undefined; CamPatData() fills in the
# sections it extracts from a document.
#
# Works as a class method (CathReport->new), as an instance method
# ($report->new) and, for backward compatibility, as a plain function call
# (CathReport::new()), in which case the object is blessed into this package.
sub new {
    my $class = shift;
    $class = ref($class) || $class || __PACKAGE__;

    my $self = {
        CASE_HEADER => undef,
        PATIENT     => undef,
        STAFF       => undef,
        PROC        => undef,
        LAB         => undef,
        ALDRETE     => undef,
        MED         => undef,
        CONDITIONS  => undef,
    };

    # Two-argument bless so that subclasses get objects of their own class.
    return bless $self, $class;
}

# Return the text of the first <$tag> element found below $node.
#
# Returns undef when the element does not exist or has no child node,
# instead of dying on a method call against undef the way a bare
# ->item(0)->getFirstChild->getNodeValue chain does.
sub _first_text {
    my ($node, $tag) = @_;

    # Explicit undef (not a bare return): callers place the result in
    # fixed positions of lists, so an empty list would shift later fields.
    my $elem = $node->getElementsByTagName($tag)->item(0);
    return undef unless defined $elem;

    my $child = $elem->getFirstChild;
    return undef unless defined $child;

    return $child->getNodeValue;
}

# Return attribute $attr of the first <$tag> element found below $node,
# or undef when there is no such element.
sub _first_attr {
    my ($node, $tag, $attr) = @_;

    my $elem = $node->getElementsByTagName($tag)->item(0);
    return undef unless defined $elem;

    return $elem->getAttribute($attr);
}

# Convert a timestamp such as "2006-06-10T17:59:00" to "HH:MM" (Date::Manip
# format %R).  Returns the empty string when the timestamp is missing or
# cannot be parsed, so the result is always usable as a hash key.
sub _clock_time {
    my ($stamp) = @_;
    return '' unless defined $stamp;

    $stamp =~ s/T/ /g;
    my $date  = ParseDate($stamp);
    my $clock = UnixDate($date, "%R");

    return defined $clock ? $clock : '';
}

# Parse one XML file with XML::DOM and store the extracted data on the
# object.
#
# Usage:   $report->CamPatData($file);
# Returns: the object itself, whether or not a file name was given.
#
# Sections stored on the object:
#   PATIENT  array ref: last name, first name, birth date (mm/dd/YYYY), sex,
#            height, height units, weight, weight units, study id
#   STAFF    hash ref: Role attribute        => "FirstName LastName"
#   LAB      hash ref: BCMeasurement         => value formatted with %.2f
#   PROC     hash ref: ProcNum attribute     => Procedure attribute
#   NOTES    hash ref: event time (HH:MM)    => event text
#   ONASS    hash ref: assessment time HH:MM =>
#            [systolic, diastolic, heart rate, SaO2, resp rate, ""]
#   ALDRETE  hash ref: AldreteType           =>
#            [activity, respiration, LOC, color, total score, circulation]
#
# Elements that are absent from a document (or empty) yield undef for the
# corresponding field instead of aborting the whole run.  When an element
# type occurs more than once where a single value is kept (SA_PATIENT,
# CathStudy), the last occurrence wins.  NOTES and ONASS are keyed by HH:MM,
# so two entries within the same minute overwrite each other.
sub CamPatData {
    my $obj = shift;

    Date_Init("TZ=EST");

    return $obj unless @_;

    my $file   = shift;
    my $parser = XML::DOM::Parser->new();
    my $doc    = $parser->parsefile($file);

    # Defined events.
    # NOTE(review): this table is collected but never stored on the object
    # (CASE_HEADER is never filled in either) - confirm where it belongs.
    my %cathevents;
    for my $event ($doc->getElementsByTagName('CathDefinedEvents')) {
        my $event_name = $event->getAttribute('DefinedEventText');
        $cathevents{$event_name} = _first_text($event, 'EventTime');
    }

    # Patient demographics.
    my ($ln, $fn, $dob, $sex);
    for my $patient ($doc->getElementsByTagName('SA_PATIENT')) {
        $ln  = _first_text($patient, 'MY_LAST_NAME');
        $fn  = _first_text($patient, 'MY_FIRST_NAME');
        $dob = _first_text($patient, 'MY_BIRTH_DATE');
        $sex = _first_text($patient, 'MY_GENDER');
    }

    # Study-level measurements.
    my ($htn, $htu, $wtn, $wtu, $patnum);
    for my $study ($doc->getElementsByTagName('CathStudy')) {
        $htn    = _first_text($study, 'PatHeight');
        $htu    = _first_attr($study, 'PatHeight', 'Units');
        $wtn    = _first_text($study, 'PatWeight');
        $wtu    = _first_attr($study, 'PatWeight', 'Units');
        $patnum = _first_text($study, 'StudyID');
    }

    # Round up, but leave a missing measurement undefined rather than
    # reporting a made-up 0.
    $wtn = ceil($wtn) if defined $wtn;
    $htn = ceil($htn) if defined $htn;

    if (defined $dob) {
        my $date = ParseDate($dob);
        $dob = UnixDate($date, "%m/%d/%Y");
    }

    my @patdemo = ($ln, $fn, $dob, $sex, $htn, $htu, $wtn, $wtu, $patnum);

    # Staff, keyed by role.
    my %staff_set;
    for my $member ($doc->getElementsByTagName('CathStaff')) {
        my $last  = _first_text($member, 'LastName');
        my $first = _first_text($member, 'FirstName');
        my $role  = $member->getAttribute('Role');
        $staff_set{$role} = join ' ',
            map { defined $_ ? $_ : '' } $first, $last;
    }

    # Lab values, keyed by measurement name.
    my %lab;
    for my $sample ($doc->getElementsByTagName('BloodComposition')) {
        my $measurement = $sample->getAttribute('BCMeasurement');
        my $value       = _first_text($sample, 'Value');
        $lab{$measurement} =
            defined $value ? sprintf("%.2f", $value) : undef;
    }

    # Procedures, keyed by procedure number.
    my %inv;
    for my $proc ($doc->getElementsByTagName('CathProcedure')) {
        my $procname = $proc->getAttribute('Procedure');
        my $procnum  = $proc->getAttribute('ProcNum');
        $inv{$procnum} = "$procname";
    }

    # Log notes, keyed by time of day.
    my %notes;
    for my $entry ($doc->getElementsByTagName('CathLogEvent')) {
        my $clock = _clock_time(_first_text($entry, 'EventTime'));
        $notes{$clock} = _first_text($entry, 'EventText');
    }

    # Ongoing assessments (vital signs), keyed by time of day.
    my %vital;
    for my $oa ($doc->getElementsByTagName('CathOA')) {
        my $clock = _clock_time(_first_text($oa, 'OAtime'));
        my $sysbp = _first_text($oa, 'Systolic');
        my $diabp = _first_text($oa, 'Diastolic');
        my $hrt   = _first_text($oa, 'HeartRate');
        my $sao2  = _first_text($oa, 'SaO2');
        my $resp  = _first_text($oa, 'RespRate');
        my $oacom = "";    # OAText is deliberately not read at present
        $vital{$clock} = [$sysbp, $diabp, $hrt, $sao2, $resp, $oacom];
    }

    # Aldrete scores, keyed by score type.
    my %aldrete;
    for my $ald ($doc->getElementsByTagName('CathAldrete')) {
        my $aldtype  = $ald->getAttribute('AldreteType');
        my $aldact   = _first_text($ald, 'Activity');
        my $aldresp  = _first_text($ald, 'Respiration');
        my $aldcirc  = _first_text($ald, 'Circulation');
        my $aldloc   = _first_text($ald, 'LOC');
        my $aldcolor = _first_text($ald, 'Color');
        my $aldtot   = _first_text($ald, 'TotalScore');

        # Circulation is stored last; this order is what existing callers
        # of ALDRETE receive, so it is kept as is.
        $aldrete{$aldtype} =
            [$aldact, $aldresp, $aldloc, $aldcolor, $aldtot, $aldcirc];
    }

    # Everything kept above is a plain string copied out of the tree, so
    # the tree can be released now.  XML::DOM trees contain circular
    # references and are not freed unless dispose is called.
    $doc->dispose;

    #$obj->{CONDITIONS} = \%cond;
    $obj->{PATIENT} = \@patdemo;
    $obj->{STAFF}   = \%staff_set;
    $obj->{LAB}     = \%lab;
    $obj->{PROC}    = \%inv;
    $obj->{NOTES}   = \%notes;
    $obj->{ONASS}   = \%vital;
    $obj->{ALDRETE} = \%aldrete;

    return $obj;
}

1;

Any ideas would be appreciated.

Cappadonna

Readmore tags added by GrandFather

Replies are listed 'Best First'.
Re: Rewriting XML::DOM based module as XML::SAX
by planetscape (Chancellor) on Jun 10, 2006 at 17:59 UTC
Re: Rewriting XML::DOM based module as XML::SAX
by GrandFather (Saint) on Jun 10, 2006 at 20:10 UTC

    or for something only a little different it may be worth taking a look at XML::Twig.

    The code you present could be mapped easily into almost any XML parsing module. What are the new areas that you want to be able to handle that DOM is a pain for?

    I notice that at one point you have a for loop over 'SA_PATIENT' elements and retain data from the last element found which is later saved to @patdemo. Is it intended behaviour that you only retain the last element's data?

    By the way, you could have trimmed the code down to about a dozen lines and included some sample data (keep the data small too) for posting purposes - then people may feel more inclined to post sample solutions using other XML systems (I started, but couldn't be bothered - too much code all the same).


    DWIM is Perl's answer to Gödel

      Well, there are two really. First, every XML file will have the same general structure but not the same information. For example, XML file A may have a social security number while B doesn't have one. Or, study C may contain EKG readings that don't show up in A or B. Using DOM, I would have to build a level of intelligence and redundancy to ensure that the script doesn't crap out if it doesn't see certain information. From what I've read, the event-driven model is more appropriate, since I am getting data fed from another server.

      The other is speed. On my local workstation, the script runs fine. But this module will be part of a (slightly) larger automated script for a hospital in which multiple patients (XML files) will be input at the same time. Like most patient care systems, I can't afford for this thing to crash because of memory hogging.

      Okay:

      Here is the code:

      # Trimmed-down CamPatData: parse one XML file with XML::DOM and store
      # the patient demographics on the object.
      #
      # Usage:   $report->CamPatData($file);
      # Returns: the object itself, whether or not a file name was given.
      #
      # $obj->{PATIENT} is set to an array ref holding: last name, first
      # name, birth date (mm/dd/YYYY), sex, height, height units, weight,
      # weight units, study id.  A field whose element is absent or empty is
      # undef instead of aborting the run.  If SA_PATIENT or CathStudy occurs
      # more than once, the last occurrence wins.
      sub CamPatData {
          my $obj = shift;

          Date_Init("TZ=EST");

          return $obj unless @_;

          my $file   = shift;
          my $parser = XML::DOM::Parser->new();
          my $doc    = $parser->parsefile($file);

          # First <$tag> element below $node, or undef when there is none.
          my $first_elem = sub {
              my ($node, $tag) = @_;
              return $node->getElementsByTagName($tag)->item(0);
          };

          # Text of the first <$tag> element below $node; undef when the
          # element is missing or has no child node.  Explicit undef keeps
          # the positions of the PATIENT list stable.
          my $text_of = sub {
              my ($node, $tag) = @_;
              my $elem = $first_elem->($node, $tag);
              return undef unless defined $elem;
              my $child = $elem->getFirstChild;
              return undef unless defined $child;
              return $child->getNodeValue;
          };

          # Attribute $attr of the first <$tag> element below $node, or
          # undef when there is no such element.
          my $attr_of = sub {
              my ($node, $tag, $attr) = @_;
              my $elem = $first_elem->($node, $tag);
              return undef unless defined $elem;
              return $elem->getAttribute($attr);
          };

          #Get Patient Data
          my ($ln, $fn, $dob, $sex);
          for my $patient ($doc->getElementsByTagName('SA_PATIENT')) {
              $ln  = $text_of->($patient, 'MY_LAST_NAME');
              $fn  = $text_of->($patient, 'MY_FIRST_NAME');
              $dob = $text_of->($patient, 'MY_BIRTH_DATE');
              $sex = $text_of->($patient, 'MY_GENDER');
          }

          my ($htn, $htu, $wtn, $wtu, $patnum);
          for my $study ($doc->getElementsByTagName('CathStudy')) {
              $htn    = $text_of->($study, 'PatHeight');
              $htu    = $attr_of->($study, 'PatHeight', 'Units');
              $wtn    = $text_of->($study, 'PatWeight');
              $wtu    = $attr_of->($study, 'PatWeight', 'Units');
              $patnum = $text_of->($study, 'StudyID');
          }

          #Rounding Off (a missing measurement stays undef, not 0)
          $wtn = ceil($wtn) if defined $wtn;
          $htn = ceil($htn) if defined $htn;

          if (defined $dob) {
              my $date = ParseDate($dob);
              $dob = UnixDate($date, "%m/%d/%Y");
          }

          my @patdemo =
              ($ln, $fn, $dob, $sex, $htn, $htu, $wtn, $wtu, $patnum);

          # Only plain strings were copied out of the tree, so release it;
          # XML::DOM trees are not freed unless dispose is called.
          $doc->dispose;

          # The snippet as posted built @patdemo and then dropped it; store
          # it the same way the full module does.
          $obj->{PATIENT} = \@patdemo;

          return $obj;
      }

      Here is the Sample Data:

      </CathStudy> <SA_PATIENT> <MY_PATIENT_ID>XXXXX</MY_PATIENT_ID> <MY_BIRTH_DATE>123-45-6789</MY_BIRTH_DATE> <MY_FIRST_NAME>BERT</MY_FIRST_NAME> <MY_GENDER>M</MY_GENDER> <MY_LAST_NAME>PUPPET</MY_LAST_NAME> <Address>123 SESAME ST</Address> <City>NEW YORK</City> <State>NY</State> <Zip>10110</Zip> <Race>Caucasian</Race> </SA_PATIENT>

      Any suggestions?

        Looks like the simplest way to do this might be to slurp all the data into a hash using XML::Simple then either clean up the hash or extract the data from it into another hash as is appropriate:

        use strict;
        use warnings;

        use Data::Dump::Streamer;
        use XML::Simple;

        # Slurp everything from the DATA filehandle into one string;
        # $/ is undefined only inside the do block.
        my $str = do {
            local $/ = undef;
            <DATA>;
        };

        # XMLin returns a hash reference here; copy it into a plain hash.
        my $parsed = XMLin($str);
        my %data   = %{$parsed};

        # Show the resulting structure.
        Dump(\%data);