leighgable has asked for the wisdom of the Perl Monks concerning the following question:
And here is some sample data. Normally it's in several files in a directory.#!/usr/bin/perl # turn on perl safety features use strict; use warnings; #initialize modules use XML::Twig; use DirHandle; my ($dir, $filepath, @filename, @filepath_list, %company_hist, @co +mpany_list, %reference_hist, @reference_list); $dir = $ARGV[0] or die "Must specify directory"; @filepath_list = get_file_list($dir); foreach $filepath (@filepath_list) { my $twig = XML::Twig -> new( twig_roots => { 'article/reference' => \&get_ref, company => \&get_code }); $company_hist{$_}++ for @company_list; #sort results from my @unique_comp = keys %company_hist; #"get_code" sub then @company_list = ( sort { $company_hist{$b} <=> #return 3 most f +req $company_hist{$a} } @unique_comp )[0..2]; #codes. #my $ref_length = scalar( @reference_list ); #take ref list and $reference_hist{$_}++ for @reference_list; #eliminate duplicate +s my @unique_ref = keys %reference_hist; #return a tally my $uni_count = scalar ( @unique_ref ); my @by_date_tally = get_date(\@unique_ref); my $dup_count = (scalar( @reference_list ) - $uni_count); #my($k, $v); #while ( ($k,$v) = each %reference_hist ) { #print "$k => $v\n"; #} #print "File name: ", print_file_name($filepath), "\n"; #print "Total Dupicate Articles: $dup_count\n"; #print "Total Articles Found: $uni_count\n"; #print "@company_list\n"; undef %company_hist; #reinitialize global undef @company_list; #vars. undef %reference_hist; # undef @reference_list; # $twig->parsefile($filepath); # purge to save mem. $twig->purge; } #end of foreach loop exit(0); sub get_file_list { $dir = shift; print $dir, "\n"; my $dh = DirHandle->new($dir) or die "can't open directory"; return sort # sort pathnames grep { -f } # choose only files map { "$dir/$_" } # create full paths grep { !/^\./ } # filter out dot files $dh->read(); # read all filenames } sub print_file_name { my($path, $position, $path_strip); #take filepath and $path = $_[0]; #return filename. $position = rindex($path,"/") + 1; # $path_strip = substr($path, $position); #print "For file: $path_strip\n"; return $path_strip; } sub get_code { my $company; #get company code my( $twig, $elt)= @_; #attribute and $company = $elt->{'att'}->{'code'}; #put into array push @company_list, $company; #return @company_list; } sub get_ref { my( $twig, $elt)= @_; #take reference elt my $ref = $elt; #and return just the my $position = rindex($ref->text(), "/") + 1; #reference ID str +ing my $ref_strip = substr($ref->text(), $position); push @reference_list, $ref_strip; } sub get_date { my $ref; #my @refs = @_; foreach $ref (@_) { print @$ref, "\t"; } print "\n\n"; }
<article> <accessionno>MTPW000020090731e57v004mr</accessionno> <reference>distdoc:archive/ArchiveDoc::Article/MTPW000020090 +731e57v004mr</reference> <baselanguage>EN</baselanguage> <copyright>(c) 2009 M2 Communications, Ltd. All Rights Reser +ved. </copyright> <headline> <paragraph display="Proportional" truncation="None" lang=" +EN">Anadys Pharmaceuticals, Inc (NASDAQ:ANDS) is the Highest Percenta +ge Gainers Among NASDAQ Stocks During Morning Trading Hours; Microsof +t Corporation (NASDAQ:<hlt>MSFT</hlt>) And Orthofix International NV +(NASDAQ:OFIX) Round Out Top Three Percentage Gainers During Morning T +rading Hours</paragraph> </headline> <publicationdate> <date>2009-07-31</date> </publicationdate> <sourcename>M2 Presswire</sourcename> <company code="oficks"> <name>Orthofix International N.V.</name> <newsmentions>0</newsmentions> <newshits>0</newshits> </company> <company code="scrptg"> <name>Anadys Pharmaceuticals Inc</name> <newsmentions>0</newsmentions> <newshits>0</newshits> </company> <company code="mcrost"> <name>Microsoft Corporation</name> <newsmentions>0</newsmentions> <newshits>0</newshits> </company> <industry code="i3302"> <name>Computers/Electronics</name> </industry> <industry code="i3302021"> <name>Applications Software</name> </industry> <industry code="i257"> <name>Pharmaceuticals</name> </industry> <industry code="i330202"> <name>Software</name> </industry> <industry code="icomp"> <name>Computing</name> </industry> <industry code="i3302020"> <name>Systems Software</name> </industry> <industry code="i372"> <name>Medical Equipment/Supplies</name> </industry> <industry code="i951"> <name>Health Care</name> </industry> <industry code="iphmed"> <name>Medical/Surgical Instruments/Apparatus/Devices</name +> </industry> <region code="usa"> <name>United States</name> </region> <region code="namz"> <name>North American Countries/Regions</name> </region> <newssubject code="c42" position="0"> <name>Labor/Personnel Issues</name> </newssubject> <newssubject code="ghepat" position="0"> <name>Hepatitis</name> </newssubject> <newssubject code="mstock" position="0"> <name>Stock Exchanges</name> </newssubject> <newssubject code="npress" position="0"> <name>Press Release</name> </newssubject> <newssubject code="ccat" position="0"> <name>Corporate/Industrial News</name> </newssubject> <newssubject code="gcat" position="0"> <name>Political/General News</name> </newssubject> <newssubject code="ghea" position="0"> <name>Health</name> </newssubject> <newssubject code="gmed" position="0"> <name>Medical Conditions</name> </newssubject> <newssubject code="m11" position="0"> <name>Equity Markets</name> </newssubject> <newssubject code="mcat" position="0"> <name>Commodity/Financial Market News</name> </newssubject> <newssubject code="ncat" position="0"> <name>Content Types</name> </newssubject> <newssubject code="nfact" position="0"> <name>Factiva Filters</name> </newssubject> <newssubject code="nfce" position="0"> <name>FC&E Exclusion Filter</name> </newssubject> <newssubject code="nfcpin" position="0"> <name>FC&E Industry News Filter</name> </newssubject> <sourcecode>MTPW</sourcecode> <tailparagraphs> </tailparagraphs> <contact>Liquid Tycoon | e-mail: info@LiquidTycoon.com | Tel: +1 214 556 6798 </contact> <logo source="http://logos.factiva.com.ezproxy.insead.edu" i +mage="mtpwLogo.gif"></logo> <wordcount>679</wordcount> </article>
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Returning the right results, but at the wrong time from subroutines
by moritz (Cardinal) on Sep 06, 2009 at 22:08 UTC | |
by leighgable (Acolyte) on Sep 07, 2009 at 00:50 UTC | |
|
Re: Returning the right results, but at the wrong time from subroutines
by toolic (Bishop) on Sep 06, 2009 at 23:03 UTC | |
by leighgable (Acolyte) on Sep 07, 2009 at 00:55 UTC | |
|
Re: Returning the right results, but at the wrong time from subroutines
by Jenda (Abbot) on Sep 07, 2009 at 23:12 UTC |