#! perl
use URI::URL ();
use LWP::Simple qw( get );
use strict;
# Main flow: download the gazetteer page as a list of lines, pull the
# STF1A/STF3A link records out of it, then hand both sets to the page writer.
my @page_lines = get_page_to_array();
my ( $stf1_links, $stf3_links ) = parse_to_arrays(@page_lines);
arrays_to_webpage( $stf1_links, $stf3_links );
# Begin Ye Olde Nasty Newbie Code
# The weird STF1/STF3 vars/arrays are named after the
# datasets being retrieved, e.g. 'Summary Tape File 1A'
# Fetch the Census gazetteer result page for Tulsa, OK and return it as a
# list of lines.
#
# Arguments: none.
# Returns:   a list of strings -- the page text with every "comma +
#            whitespace" separator turned into a line break, then split on
#            newlines.  Returns an empty list (after a warning on STDERR)
#            when the page could not be fetched.
sub get_page_to_array {
    # Form the query with url() from URI::URL rather than CGI
    # (open question from the author: can this be done with CGI alone?).
    my $url = URI::URL::url('http://www.census.gov/cgi-bin/gazetteer');
    $url->query_form( city => "Tulsa", state => "OK" );

    # LWP::Simple::get() returns undef when the request fails; say so
    # instead of silently carrying on with an empty document.
    my $document = get($url);
    if ( !defined $document ) {
        warn "Couldn't fetch '$url'\n";
        return;
    }

    # Put each link on its own line: links on the page are separated by
    # a comma followed by whitespace.
    $document =~ s|,\s|\n|g;

    # Make an array out of the string
    my @lines = split /\n/, $document;
    return @lines;
}
# Extract the STF1A and STF3A data links from the gazetteer page.
#
# Arguments: the page as a list of lines (one link per line at most is
#            examined, matching what get_page_to_array produces).
# Returns:   two array references, ( $STF1, $STF3 ).  Every element is a
#            three-item array ref [ $entry, $url, $text ]:
#              $entry - text of the most recently seen heading line
#              $url   - link target with CMD=TABLES rewritten to CMD=RET
#              $text  - link text, 'STF1A' or 'STF3A'
#            Links with any other text are ignored.
#
# The page is dynamically generated so it's fairly "safe" to do this with
# regular expressions.  Next step is going to be trying to get the same
# results with HTML::TokeParser.  The site doesn't close all of its tags
# though, so it isn't obvious how to grab the text between them without a
# regexp.
#
# NOTE(review): the literal tag names in the original patterns were lost,
# so both patterns below are written tag-name-agnostically.  Confirm them
# against the markup the site really returns.
sub parse_to_arrays {
    my @document = @_;

    my $tag = qr{<[^>]*>};

    # Heading line: starts with markup (but not with a link), then two
    # runs of plain text separated by markup and terminated by a tag.
    my $heading_re = qr{
        \A (?! <a\b ) (?:$tag)+
        ( [^<]* )
        (?:$tag)+
        ( [^<]* )
        $tag
    }xi;

    # Link: <a href=URL>TEXT</a>, with or without quotes around the URL.
    my $link_re = qr{
        <a \s+ href \s* = \s* ["']? ( [^"'\s>]+ ) ["']? [^>]* >
        ( .*? )
        </a>
    }xi;

    my $entry = '';
    my ( @STF1, @STF3 );

    # Store the links/text for STF1 & STF3 data separately because we're
    # going to request different things from each one later.
    my %links_for = ( STF1A => \@STF1, STF3A => \@STF3 );

    LINE:
    foreach my $line (@document) {
        next LINE unless defined $line;

        # Grab the text between the tags of a heading line
        if ( my ( $name, $rest ) = $line =~ $heading_re ) {
            $entry = $name . $rest;
            next LINE;
        }

        # Grab the actual links
        if ( my ( $url, $text ) = $line =~ $link_re ) {
            my $bucket = $links_for{$text};
            next LINE unless $bucket;

            # Send us the data, not a data selection page
            $url =~ s/CMD=TABLES/CMD=RET/;
            push @{$bucket}, [ $entry, $url, $text ];
        }
    }

    # Return references to the arrays
    return ( \@STF1, \@STF3 );
}
# Write the collected links to 'output.html' in the current directory.
#
# Arguments: ( $STF1, $STF3 ) -- two array references whose elements are
#            [ $entry, $url, $text ] records (as built by parse_to_arrays).
#            A missing/undef reference is treated as an empty list.
# Output:    one list per data set; each record becomes
#              <li>ENTRY <a href="URL + table selector">TEXT</a></li>
# Returns:   1 on success.
# Dies:      when the file cannot be opened or closed.
#
# The record fields are copied verbatim: they were lifted out of an HTML
# page, so they are assumed to be HTML-safe already -- TODO confirm.
# NOTE(review): the markup of the original print statements was lost; the
# page skeleton used here (html/body/ul) is a reconstruction.
sub arrays_to_webpage{
    my ( $STF1, $STF3 ) = @_;

    # Variables below tell it what tables
    # we want and how to format the output
    my $STF1_Tables = '/FMT=HTML/T=P1';
    my $STF3_Tables = '/FMT=HTML/T=P2';

    open( my $out, '>', 'output.html' )
        or die "Couldn't open 'output.html': $!\n";

    print {$out} "<html><body>\n";

    # Same layout for both data sets; only the table selector differs.
    for my $section ( [ $STF1, $STF1_Tables ], [ $STF3, $STF3_Tables ] ) {
        my ( $records, $tables ) = @{$section};

        print {$out} "<ul>\n";
        for my $record ( @{ $records || [] } ) {
            my ( $entry, $url, $text ) = @{$record};
            print {$out} qq{<li>$entry <a href="$url$tables">$text</a></li>\n};
        }
        print {$out} "</ul>\n";
    }

    print {$out} "</body></html>\n";

    # Buffered write errors only show up at close time.
    close($out) or die "Couldn't close 'output.html': $!\n";
    return 1;
}