Sang has asked for the wisdom of the Perl Monks concerning the following question:
Any suggestions on a better way to get the same results? As the comments state I'm going to try moving everything over from regexp to HTML::TokeParser, curious if there's an easier way to store the data than an array of arrays with its resultant need for refs, etc.
This is my first crack at perl so, please, be brutal. =)
#! perl use URI::URL (); use LWP::Simple qw( get ); use strict; my @document = get_page_to_array(); my ($STF1, $STF3) = parse_to_arrays(@document); arrays_to_webpage( $STF1, $STF3 ); # Begin Ye Olde Nasty Newbie Code # The weird STF1/STF3 vars/arrays are named after the # datasets being retrieved, e.g. 'Summary Tape File 1A' sub get_page_to_array { # Form the query, get the page, store it in $document # Is there a way to format the query below using just the # CGI module? # Use url() from URI::URL not CGI (only affects CGI version) my $url = URI::URL::url('http://www.census.gov/cgi-bin/gazetteer'); $url->query_form( city => "Tulsa", state => "OK" ); my $document = get( $url ); # Put each link on its own line $document =~ s|</a>,\s|</a>\n|g; # Make an array out of the string my @document = split /\n/, $document; return @document; } sub parse_to_arrays { # Regexp parse the webpage to grab the elements we want # The page is dynamically generated so it's fairly "safe" # to do this with regular expressions. Next step is going # to be trying to get the same results with HTML::TokeParser # The site doesn't close its <LI> tags though so I'm not sure # how I'm going to grab the text between them without regexp my $entry = ''; my @tmp = my @STF1 = my @STF3 = (); my @document = @_; foreach( @document ) { # Grab the text between the <LI> tags m|^<li><strong>(.*?)</strong>(.*?)<br>| and do { $entry = $1 . $2; + next}; # Grab the actual links m|<a href=(.*?)>(.*?)</a>| and do { my $url = $1; my $text = $2; # Store the links/text for STF1 & STF3 data # seperately because we're going to request # different things from each one later if ( $text eq 'STF1A' ) { # Send us the data, not a data selection page $url=~s/CMD=TABLES/CMD=RET/; @tmp = ( $entry, $url, $text ); push @STF1, [ @tmp ]; } elsif ( $text eq 'STF3A' ) { $url=~s/CMD=TABLES/CMD=RET/; @tmp = ( $entry, $url, $text ); push @STF3, [ @tmp ]; } else { # Do nothing...should I put something here? }; next; }; } # Return references to the arrays my $STF1 = \@STF1; my $STF3 = \@STF3; return ($STF1, $STF3); } sub arrays_to_webpage{ # Make the page my $i = my $aref = (); my ( $STF1, $STF3 ) = @_; # Variables below tell it what tables # we want and how to format the output my $STF1_Tables = '/FMT=HTML/T=P1'; my $STF3_Tables = '/FMT=HTML/T=P2'; open( OUTPUT, ">output.html" ) || die "Couldn't open 'output.html': +$!\n"; print OUTPUT "<HTML><BODY>\n"; # Ick...there has got to be a better way to define the range on this for $i ( 0 .. scalar( @$STF1 )-1 ) { $aref = @$STF1[$i]; print OUTPUT "<LI>$aref->[0] </LI><a href=$aref->[1]$STF1_Tables>$ +aref->[2]</a><br/>\n"; } print OUTPUT "<P>\n"; for $i ( 0 .. scalar( @$STF3 )-1 ) { $aref = @$STF3[$i]; print OUTPUT "<LI>$aref->[0] </LI><a href=$aref->[1]$STF3_Tables>$ +aref->[2]</a><br/>\n"; } print OUTPUT "</BODY></HTML>"; close( OUTPUT ); }
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Critique / Suggestions?
by chromatic (Archbishop) on Jan 05, 2002 at 13:00 UTC | |
by Sang (Acolyte) on Jan 05, 2002 at 13:34 UTC | |
|
Re: Critique / Suggestions?
by Juerd (Abbot) on Jan 05, 2002 at 17:58 UTC |