#! perl

# Fetch the U.S. Census gazetteer results for a place, pick out the links to
# the STF1A and STF3A tables, and write them to a small HTML page.
#
# The STF1/STF3 names come from the datasets being retrieved,
# e.g. 'Summary Tape File 1A'.

use strict;
use warnings;

use URI::URL ();
use LWP::Simple qw( get );

# NOTE(review): the HTML tag names inside these two patterns are a
# reconstruction. The copy of this script that was reviewed had its markup
# stripped, leaving only the capture groups. Confirm both patterns against
# the real gazetteer output before relying on them.
#
# $ENTRY_RE: the heading of one gazetteer entry; $1 and $2 are joined to
#            form the entry label.
# $LINK_RE:  one anchor; $1 is the href, $2 is the link text.
my $ENTRY_RE = qr{^<li><strong>(.*?)</strong>(.*?)<br>}i;
my $LINK_RE  = qr{<a\s+href="?([^">]*)"?\s*>(.*?)</a>}i;

my @document = get_page_to_array();
my ( $STF1, $STF3 ) = parse_to_arrays(@document);
arrays_to_webpage( $STF1, $STF3 );

# get_page_to_array( [$city, $state] )
#
# Queries the gazetteer CGI and returns the response as a list of lines,
# additionally broken after every ", " so that links which share a line in
# the page each end up on a line of their own.
#
# $city and $state are optional and default to "Tulsa" and "OK".
# Dies if the page cannot be fetched.
sub get_page_to_array {
    my ( $city, $state ) = @_;
    $city  = 'Tulsa' unless defined $city;
    $state = 'OK'    unless defined $state;

    # Use url() from URI::URL, not the one from CGI.
    my $url = URI::URL::url('http://www.census.gov/cgi-bin/gazetteer');
    $url->query_form( city => $city, state => $state );

    # LWP::Simple::get returns undef on any failure.
    my $document = get($url);
    defined $document
        or die "Couldn't fetch '$url'\n";

    # Put each link on its own line.
    $document =~ s|,\s|\n|g;

    return split /\n/, $document;
}

# parse_to_arrays( @lines )
#
# Scans the page lines for entry headings and for the STF1A / STF3A links
# that follow them. The page is machine generated, so plain regular
# expressions are used rather than an HTML parser.
#
# Returns two array references, ( $STF1, $STF3 ). Each element is an array
# reference [ $entry, $url, $text ] where $entry is the most recently seen
# heading, $url is the link target with CMD=TABLES rewritten to CMD=RET
# (to request the data rather than a table-selection page), and $text is the
# link text ('STF1A' or 'STF3A').
sub parse_to_arrays {
    my @lines = @_;

    # Links are stored per dataset because different tables are requested
    # from each one later.
    my %rows_for = ( STF1A => [], STF3A => [] );
    my $entry    = '';

    for my $line (@lines) {

        # A heading line starts a new entry and is not searched for links.
        if ( $line =~ $ENTRY_RE ) {
            $entry = $1 . $2;
            next;
        }

        next unless $line =~ $LINK_RE;
        my ( $url, $text ) = ( $1, $2 );

        # Links to anything other than the two datasets are ignored.
        my $rows = $rows_for{$text} or next;

        # Send us the data, not a data selection page.
        $url =~ s/CMD=TABLES/CMD=RET/;
        push @$rows, [ $entry, $url, $text ];
    }

    return ( $rows_for{STF1A}, $rows_for{STF3A} );
}

# arrays_to_webpage( $STF1, $STF3 [, $path] )
#
# Writes an HTML page listing the STF1 links followed by the STF3 links.
# Each link target gets a suffix selecting the output format and table.
#
# $path is optional and defaults to 'output.html'.
# Dies if the file cannot be opened or closed. Returns 1 on success.
sub arrays_to_webpage {
    my ( $STF1, $STF3, $path ) = @_;
    $path = 'output.html' unless defined $path;

    # These tell the server which tables we want and how to format them.
    my $STF1_Tables = '/FMT=HTML/T=P1';
    my $STF3_Tables = '/FMT=HTML/T=P2';

    open( my $out, '>', $path )
        or die "Couldn't open '$path': $!\n";

    # NOTE(review): the page skeleton below is reconstructed; the original
    # markup was not recoverable. Only the row contents (entry label, link
    # target plus table suffix, link text) are taken from the original code.
    print {$out} "<html><body>\n<ul>\n";
    print {$out} _rows_to_html( $STF1, $STF1_Tables );
    print {$out} "</ul>\n<hr>\n<ul>\n";
    print {$out} _rows_to_html( $STF3, $STF3_Tables );
    print {$out} "</ul>\n</body></html>\n";

    # Buffered write errors only surface at close.
    close($out)
        or die "Couldn't close '$path': $!\n";

    return 1;
}

# Renders a list of [ $entry, $url, $text ] rows as HTML list items, with
# $table_suffix appended to every link target. Values are emitted as found
# in the fetched page; no additional HTML escaping is applied.
sub _rows_to_html {
    my ( $rows, $table_suffix ) = @_;

    return join '', map {
        my ( $entry, $url, $text ) = @$_;
        qq{<li>$entry</li>\n<li><a href="$url$table_suffix">$text</a></li>\n};
    } @{ $rows || [] };
}