comment on

I have a file that looks very much like a hash, and I want to convert it into one. Based on suggestions from the following node, I tokenized the file and am using XML::Simple to read it into a hash.

This works fine for small files (maybe 800 lines), but when the file gets to be too long (20,000 lines), this method takes a very long time.

Does anyone have any suggestions for speeding this up?

Here is the file structure:

Record FileRecord  "1.8"
{
  Record LotRecord "1234567.123"
  {
    Record WaferRecord "01"
    {
      Field DieOrigin 2 {0, 0}
      Field OrientationInstructions 1 {""}
      Field ProcessEquipmentState 7 {"NONE", "", "", "", "", "", ""}
      Field SampleCenterLocation 2 {-100000, 900000}
      Field SlotNumber 1 {2}

      List DefectList
      {
        Columns 32 { int32 DEFECTID,  int32 XREL,  int32 YREL,  int32 
+XINDEX,  int32 YINDEX,  
        int32 XSIZE,  int32 YSIZE,  float DEFECTAREA,  int32 DSIZE,  i
+nt32 CLASSNUMBER,  
        int32 TEST,  int32 CLUSTERNUMBER,  int32 ROUGHBINNUMBER,  int3
+2 FINEBINNUMBER,  int32 REVIEWSAMPLE,  
        float CONTRAST,  int32 CHANNELID,  int32 MANSEMCLASS,  int32 A
+UTOONSEMCLASS,  int32 MICROSIGCLASS,  
        int32 MACROSIGCLASS,  int32 AUTOOFFSEMCLASS,  int32 AUTOOFFOPT
+ADC,  int32 FACLASS,  int32 INTENSITY,  
        float KILLPROB,  int32 MACROSIGID,  int32 REGIONID,  ImageList
+ IMAGEINFO,  
        int32 POLARITY,  float CRITICALAREA,  int32 MANOPTCLASS  }
        Data 902
        {
          1 11111 722222 -3 16 1000 600 58000 2000 0 1 0 11 0 0 0.0000
+ 
            0 0 0 0 0 0 0 0 0 0.0000 0 0  N 0 0.0000 0 ;
          1 11111 722222 -3 16 1000 600 58000 2000 0 1 0 11 0 0 0.0000
+ 
            0 0 0 0 0 0 0 0 0 0.0000 0 0  N 0 0.0000 0 ;
[download]

Here is the current code I am using:

use strict;
use warnings;
use XML::Simple;

# Convert a brace-structured "Record / Field / List / Data" text file into
# XML, parse that XML into a hash with XML::Simple, and save the generated
# XML to test.xml.
#
# Input line kinds recognised (after quotes are blanked and leading
# whitespace is removed):
#   Record <Name> <value>     -> <Name value="..."> ... closed by the matching }
#   Field  <Name> <n> {...}   -> <Name>contents-without-whitespace</Name>
#   List   <Name>             -> <Name>, followed by a Columns {...} spec whose
#                                column names become the per-row element names
#   Data   <count>            -> <DATA VALUE = "count">, followed by rows that
#                                are separated by ';' and end at a line
#                                finishing with }
#   }                         -> closes the most recently opened element
# Every other line is ignored.
#
# NOTE(review): for large inputs the XMLin() call is the likely bottleneck,
# not the tokenising below. XML::Simple's documentation describes selecting
# the underlying parser via $XML::Simple::PREFERRED_PARSER -- confirm which
# parser is installed before tuning anything else.

# Escape the characters that are markup in XML text and attribute values.
# Double quotes never reach this point: they are blanked out beforehand.
sub xml_escape {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

my $file = 'SINGLE.000';
open(my $in, '<', $file) or die "Cannot open '$file' for reading: $!";
my @file = <$in>;
close($in) or die "Cannot close '$file': $!";

my @newfile;     # generated XML, one element of output per entry
my @tags;        # stack of currently open element names
my @datatags;    # column names from the most recent Columns spec

my $index = 0;   # current position in @file

while ($index <= $#file) {

    my $currentline = $file[$index];
    $currentline =~ tr/"/ /;     # blank out the quotes
    $currentline =~ s/^\s+//;    # drop leading whitespace

    if ($currentline =~ /^Record\b/) {
        # Anchored like the List/Data tests, so that a Field whose name
        # merely contains "Record" is not mistaken for a Record.
        my (undef, $name, $value) = split ' ', $currentline;
        $value = 'none' unless $value;
        push @tags, $name;       # remembered so the matching } can close it
        push @newfile, "<$name value=\"" . xml_escape($value) . "\">\n";
        $index++;
    }

    elsif ($currentline =~ /^\}/) {
        # A closing brace ends whichever element was opened last.
        my $currenttag = pop @tags;
        die "Unbalanced '}' at line " . ($index + 1) . " of '$file'\n"
            unless defined $currenttag;
        push @newfile, "</$currenttag>\n";
        $index++;
    }

    elsif ($currentline =~ /^Field\b/) {
        # Pull out the text between the braces; only trust $1 when the
        # substitution actually matched.
        my $value = '';
        if ($currentline =~ s/\{(.*)\}//) {
            ($value = $1) =~ s/\s//g;
        }
        my $name = (split ' ', $currentline)[1];
        push @newfile, "<$name>" . xml_escape($value) . "</$name>\n";
        $index++;
    }

    elsif ($currentline =~ /^List/) {
        my $name = (split ' ', $currentline)[1];
        push @tags, $name;
        push @newfile, "<$name>\n";
        $index++;

        # Collect lines up to and including the first one holding a }, which
        # ends the Columns spec. Bounded by the end of the file so truncated
        # input cannot loop forever.
        my @column_lines;
        while ($index <= $#file) {
            my $raw = $file[$index++];
            chomp $raw;
            push @column_lines, $raw;
            last if $raw =~ /\}/;
        }

        # The spec is the innermost {...}; each comma-separated entry is
        # "<type> <NAME>", so the column name is the entry's last word. This
        # keeps digits and type-like substrings inside names intact.
        my $spec  = join '', @column_lines;
        my $inner = $spec =~ /\{([^{}]*)\}/ ? $1 : '';
        @datatags = map  { my @words = split ' ', $_; $words[-1] }
                    grep { /\S/ }
                    split /,/, $inner;
    }

    elsif ($currentline =~ /^Data/) {
        my $count = (split ' ', $currentline)[1];
        $count = '' unless defined $count;
        push @tags, 'DATA';
        push @newfile, "<DATA VALUE = \"$count\">\n";
        $index++;

        # Gather the row lines. The line ending in } is left in place so the
        # closing-brace branch above emits </DATA> on the next iteration.
        my @row_lines;
        while ($index <= $#file && $file[$index] !~ /\}\s*$/) {
            my $raw = $file[$index++];
            chomp $raw;
            push @row_lines, $raw;
        }

        my $rows = join '', @row_lines;
        $rows =~ s/\{|\}$|\"//g;    # clean up list

        # Row values are assumed to be plain whitespace-separated tokens
        # (numbers and short flags, as in the sample) -- TODO confirm.
        for my $record (split /;/, $rows) {
            (my $fields = $record) =~ s/\s+/,/g;
            my @defectdata = split /,/, $fields;

            # Leading whitespace yields an empty first field; drop only that.
            shift @defectdata if @defectdata && $defectdata[0] eq '';

            # Whitespace-only fragments (e.g. after the final ';') hold no row.
            next unless @defectdata;

            # Capture the id once so the open and close tags always agree.
            my $defect_id = $defectdata[0];
            push @newfile, "<i$defect_id>\n";

            my $column = 0;
            for my $tag (@datatags) {
                my $value = $defectdata[$column++];
                $value = 0 unless $value;    # missing or empty counts as 0
                push @newfile, "<$tag>$value</$tag>\n";
            }

            push @newfile, "</i$defect_id>\n";
        }
    }

    else {
        $index++;
    }
}

die "Unclosed element(s) at end of '$file': @tags\n" if @tags;

unshift @newfile, "<?xml version=\"1.0\"?>\n";    # add the xml header info

my $newfile = join '', @newfile;
my $data    = XMLin($newfile);
my %data    = %{$data};

open(my $out, '>', 'test.xml') or die "Cannot open 'test.xml' for writing: $!";
print {$out} @newfile          or die "Cannot write to 'test.xml': $!";
close($out)                    or die "Cannot close 'test.xml': $!";
[download]

~~David~~

In reply to Recursive Hash and XML::Simple Processing Time by ~~David~~

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&amp;`
	<		`&lt;`
	>		`&gt;`
	[		`&#91;`
	]		`&#93;`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.