No, I don't know how to monitor that, and I'm not sure this is a memory issue. I just bought this desktop, which has 16 GB of memory. Below is the major part of the code.
# Extract the auditor's name, city, state and the audit date from each input
# document and write one tab-separated record per file to OUTFILE.
#
# Relies on @infiles (paths to read) and the OUTFILE handle, both of which are
# set up outside this excerpt.

# Legal-form suffix that ends an audit firm's name.  Matched case-sensitively;
# every dot is escaped so that "P.A." no longer matches arbitrary characters.
my $firm_suffix = qr/(?:LLP|L\.L\.P\.|LLC|LTD|P\.A\.|P\.C\.|PC)/;

foreach my $path (@infiles) {

    # Three-argument open with a lexical handle: the path is never parsed for
    # a mode, and the handle is closed as soon as the file has been read.
    my $fh;
    unless (open $fh, '<', $path) {
        print STDERR "could not open $path: $!\n";
        next;
    }
    my $data = do { local $/; <$fh> };    # slurp the whole file
    close $fh;
    $data = '' unless defined $data;

    # If the document looks like HTML, turn tags into line breaks and decode
    # the handful of entities the later patterns care about.
    if ($data =~ /<p>|<\/P>|&nbsp;|<html>/i) {
        $data =~ s/<[^>]+>/\n/g;
        $data =~ s/&nbsp;//ig;
        $data =~ s/&lt;/</ig;
        $data =~ s/&gt;/>/ig;
        $data =~ s/&quot;/"/ig;
        $data =~ s/&amp;/&/ig;
        $data =~ s/&#\d+;/&/ig;
        $data =~ s/&&/ &/ig;
        $data =~ s/(\w+)&(\d+)/$1 $2/ig;
    }

    # Locate the start of the audit report / consent text.  "have audited"
    # takes priority over "our audits" / "consent to".
    my ($keyword1) = $data =~ /(have(\s*|\n)audited [^a])/i;
    my ($keyword2) = $data =~ /(our(\s+|\n)audits|consent to)/i;
    my $anchor = defined $keyword1 ? $keyword1 : $keyword2;

    # $block is the anchor text plus up to 350 following lines.  The anchor is
    # text captured from the document, so it is quoted with \Q...\E before it
    # is used as a pattern.
    my $block = '';
    if (defined $anchor) {
        ($block) = $data =~ /(\Q$anchor\E(?:[^\n]*\n){1,350})/i;
        $block = '' unless defined $block;
    }

    # Cut the block at the next report heading, if there is one.  The pattern
    # must not be able to match the empty string, otherwise it always
    # "matches" at offset 0 and the untrimmed branch can never be taken.
    my ($keyword3) = $block =~ /(((REPORT(\s+|\n)OF|CONSENT OF)|)\s*INDEPENDENT\s*(CERTIFIED|registered|registered CERTIFIED|)\s*PUBLIC\s*(ACCOUNTANTS|accounting(\s+|\n)firm)|REPORT OF INDEPENDENT ACCOUNTANTS|REPORT OF INDEPENDENT AUDITORS)/im;
    my $lines = defined $keyword3
        ? substr($block, 0, index($block, $keyword3))
        : $block;

    my ($auditorcity, $auditorstate, $date_audited);

    # First choice: a line that starts with a capitalised word and ends with
    # a firm suffix.
    my ($auditor) = $lines =~ /^\s*((?:[A-Z]\w+|\/\w+|&\s*\w+).+?\s*${firm_suffix}(?:\.|))$/m;

    # "City, State" and the date that follow the firm name.
    if (defined $auditor) {
        ($auditorcity)  = $lines =~ /\Q$auditor\E\s*(?:(?:.+?\s*${firm_suffix}(?:\.|))|\(.+\)|)\s*(.+?)(?<![\d]),\s*(?:.+?)$/m;
        ($auditorstate) = $lines =~ /\Q$auditor\E\s*(?:(?:.+?\s*${firm_suffix}(?:\.|)|\(.+\)|)\s*.+?(?<![\d]),\s*)(.+?)$/m;

        # An unmatched state contributes an empty prefix, so any date matches.
        my $state_text = defined $auditorstate ? $auditorstate : '';
        ($date_audited) = $lines =~ /\Q$state_text\E\s*(\w+\s*(\d\d|\d),\s*\d{4})\s*($|except|\()/m;
    }

    # Fallback: a stand-alone "City, State" line and a stand-alone date line.
    if (!defined $auditorcity && !defined $auditorstate) {
        ($auditorcity)  = $lines =~ /^\s*(\w+)(?<![\d]),\s*(?:\w+)(?<!LLP)$/m;
        ($auditorstate) = $lines =~ /^\s*(?:\w+(?<![\d]),\s*)(\w+)(?<!LLP)$/m;
        ($date_audited) = $lines =~ /^\s*(\w+\s*(\d\d|\d),\s*\d{4})\s*($|except|\()/m;
    }

    # Fallback: multi-word city and/or state on a line of their own.
    if (defined $auditor && !defined $auditorcity && !defined $auditorstate) {
        ($auditorcity)  = $lines =~ /^\s*(\w+|\w+ \w+|\w+ \w+ \w+)(?:(?<![\d]),\s*(?:\w+|\w+ \w+|\w+ [^a] \w+))(?<![\d])$/m;
        ($auditorstate) = $lines =~ /^\s*(?:(?:\w+|\w+ \w+|\w+ \w+ \w+)[^\d],\s*)(\w+|\w+ \w+|\w+ [^a] \w+)(?<![\d])$/m;
    }

    # Fallback: "City, State" immediately followed by the date already found.
    if (!defined $auditorcity && !defined $auditorstate && defined $date_audited) {
        ($auditorcity)  = $lines =~ /^\s*(\w+|\w+ \w+|\w+ \w+ \w+)(?:(?<![\d]),\s*(?:\w+|\w+ \w+|\w+ [^a] \w+))\s*\Q$date_audited\E/m;
        ($auditorstate) = $lines =~ /^\s*(?:(?:\w+|\w+ \w+|\w+ \w+ \w+)[^\d],\s*)(\w+|\w+ \w+|\w+ [^a] \w+)\s*\Q$date_audited\E/m;
    }

    # Auditor fallbacks: the text right before the city, then a "/s/"
    # signature line with or without the city after it.
    if (!defined $auditor && defined $auditorcity) {
        ($auditor) = $lines =~ /\n{2,}\s*(.+?)?\s+\Q$auditorcity\E/m;
    }
    if (!defined $auditor && defined $auditorcity) {
        ($auditor) = $lines =~ /(?:\/s\/)\s*([^-]+?)\s*\Q$auditorcity\E/m;
    }
    if (!defined $auditor) {
        ($auditor) = $lines =~ /(?:\/s\/)\s*([^-]+?)$/m;
    }

    # Still nothing: search again in the 90 lines following "consent to".
    if (!defined $auditor) {
        ($lines) = $data =~ /(consent to(?:[^\n]*\n){1,90})/im;
        $lines = '' unless defined $lines;
        ($auditor) = $lines =~ /^\s*(.+?\s*${firm_suffix})$/m;
        if (defined $auditor && !defined $auditorcity && !defined $auditorstate) {
            ($auditorcity)  = $lines =~ /\Q$auditor\E\s*(.+?),\s*(?:.+?)$/m;
            ($auditorstate) = $lines =~ /\Q$auditor\E\s*(?:.+?,\s*)(.+?)$/m;
        }
    }

    # Last resort: a well-known firm name anywhere in the document.
    if (!defined $auditor) {
        ($auditor) = $data =~ /((PWC|KPMG|ERNST & YOUNG|DELOITTE & TOUCHE|PRICEWATERHOUSECOOPERS|Young LLP)\s*(LLP|))/i;
    }

    # City/state directly after the auditor name ...
    if (defined $auditor && !defined $auditorcity && !defined $auditorstate) {
        ($auditorcity)  = $lines =~ /\Q$auditor\E\s*(.+?)(?<![\d]),\s*(?:.+?)$/m;
        ($auditorstate) = $lines =~ /\Q$auditor\E\s*(?:.+?(?<![\d]),\s*)(.+?)$/m;
    }

    # ... or after the auditor name and a date.
    if (defined $auditor && !defined $auditorcity && !defined $auditorstate) {
        ($auditorcity)  = $lines =~ /\Q$auditor\E\s*(?:\w+ (?:\d|\d\d)),\s*\d{2,4}\s*(.+?),(?:.+?)$/m;
        ($auditorstate) = $lines =~ /\Q$auditor\E\s*(?:\w+ (?:\d|\d\d)),\s*\d{2,4}\s*(?:.+?),(.+?)$/m;
    }

    if (!defined $date_audited) {
        ($date_audited) = $lines =~ /^\s*(\w+\s*(\d\d|\d),\s*\d{4})(,|$)/m;
    }

    # One record per file; fields that were not found are written as empty.
    my $record = join "\t",
        map { defined $_ ? $_ : '' }
        $auditor, $auditorcity, $auditorstate, $date_audited;
    print OUTFILE "$record\n";
}    # end of per-file loop (the posted excerpt stops at the print statements)

In reply to Re^2: extracting strings from text files by perlyr
in thread extracting strings from text files by perlyr

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post, it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.