#!/usr/bin/perl -wT
# The Pig-Latinizer: A CGI Program for Surfing the Web in Pig Latin
# by David Glick [davisagli], 6/20/2001
# This is my first CGI program, so comments/improvements are very welcome!
# - can't translate anything created in javascripts
# Main - forms on translated pages will at best not lead to
# limitations: another translated page, and at worst will not work
# - doesn't translate text in pictures :)
# always!
use strict;
use warnings;
use CGI qw/:standard/; # standard CGI.pm
use HTML::Parser(); # for parsing HTML
use HTML::Entities; # for decoding HTML character entities
require HTML::Element; # for manipulating HTML
require LWP::Simple; # for retrieving document to translate
use URI; # for resolving relative URI's into absolute ones
use URI::Escape; # for decoding URI character escapes
# initialize variables
my $base = 0; # does file already contain tag?
my $pig = ''; # result of translation
my $error = ''; # to hold any error messages
# main code block:
# retrieve URI of file to translate, and taint-check it
my $t_uri = param('page') || '';
my ($uri) = ($t_uri =~ /^(http:\/\/.+)$/);
# if we have a url that looks valid...
if ( defined $uri ) {
# retrieve content-type of this file
my ($content_type) = LWP::Simple::head( $uri );
$content_type ||= 'text/html';
# if it's an HTML doc...
if ($content_type =~ /text\/html/) {
# decode URI escapes and retrieve the file
$uri = uri_unescape( $uri );
my $html = LWP::Simple::get($uri) or ($error .= p("Error retrieving
file $uri" . br . "Please check your spelling or try another
website.") and sendform());
print header;
# start the parser -- which basically calls &pigify for each plain-text
# block and &tag for each opening HTML tag, adding the result of
# everything to $result
HTML::Parser->new( api_version => 3,
default_h => [sub { $pig .= shift }, 'text'],
text_h=>[sub{$pig.=(shift()?shift():pigify(shift))},'is_cdata, text'],
start_h => [sub {$pig .= tag(@_) }, 'tagname, attr, text']
)->parse($html);
# add base tag if needed -- this tells the browser where relative URLs
# in the doc are relative to
$pig = "$pig" unless $base;
print $pig;
# if it wasn't in HTML format, redirect the browser to access the file
# directly instead of through this script (this prevents the script from
# downloading large images/files only to forward them to the user) :
} else {
print redirect( -uri => $uri );
}
# end program so we don't print out the form
exit 0
}
# the following only runs if a valid URL was not supplied:
$error .= p('You must enter a valid URL beginning with http://')
if param('notfirst');
sendform();
# (end of main code block)
sub sendform
# prints out a form where the user can enter a URL to translate
# (including any error messages)
{
# format the error message in red
$error = font( {color => '#FF0000'}, $error) if $error;
#print out the form
print header,
start_html( -title => 'The Pig-Latinizer: Surf the Web in Pig Latin' ),
h1( 'The Pig-Latinizer: Surf the Web in Pig Latin' ),
$error,
start_form( -method => 'get' ),
textfield( -name => 'page',
-default => 'http://',
-size => 50 ),
submit( -value => "Pigify" ),
hidden( -name => 'notfirst', # this field is so we can suppress
-value => '1' ), # errors on the first run
end_form,
end_html;
# end the script
exit 0;
}
sub tag
# (called by the parser for each opening HTML tag)
# Points links to pigified version of pages, and handles some translation.
{
my ($name, $attr, $code) = @_;
my $tag = 0;
# check if this is a tag (if so, we won't supply one later)
if ($name eq 'base' and exists $attr->{href}) {
$base = 1;
return $code;
}
# create an HTML element object from this tag
$tag = HTML::Element->new($name,%{$attr});
# point links (in a and area tags) to pigified pages
if ($name eq 'a' or $name eq 'area' and exists $attr->{href}) {
$tag->attr('href', url . "?page=" . URI->new_abs($tag->attr('href'),$uri));
}
# point to pigified versions of frames
if ($name eq 'frame' and exists $attr->{src}) {
$tag->attr('src', url . "?page=" . URI->new_abs($tag->attr('src'),$uri));
}
# translate alt attributes
if (exists $attr->{alt}) {
$tag->attr('alt', pigify($tag->attr('alt')));
}
# translate value attributes
if (exists $attr->{value}) {
$tag->attr('value', pigify($tag->attr('value')));
}
# return the new HTML for this opening tag
$tag->starttag;
}
sub pigify
# translates an entire phrase into pig latin
{
my ($text) = @_;
$text = decode_entities($text);
# I love this regexp :) or rather, the fact that I finally
# managed to successfully create it...
$text =~ s/(?:^|(?<=[^a-z])) # start match either at beginning of string
# or following a non-letter
((?:qu # then match either 'qu'
|y(?=[aeiou]) # or y used as consonant at start of word
|[^\W\daeiouy]+ # or series of consonants
|(?=y[^aeiou]) # or nothing preceding y used as vowel (i.e. yttrium)
|(?=[aeiou]))) # or nothing preceding a vowel
# (this is all captured into $1)
([a-z']*) # then capture a series of letters and ' into $2
(?:(?=[^a-z])|$) # and end match prior to non-letter or end of string
# replace, adding 'ay' if started with consonant or 'way' if
# started with vowel ($1 eq '' if it started with a vowel)
/fixcaps($2.($1||'w').'ay',"$1$2")/egisx;
$text = encode_entities($text);
}
sub fixcaps
# called by &pigify to convert translated word to correct case,
# based on the original untranslated word
{
my ($word, $orig) = @_;
$word=($orig=~/[A-Z]/?($orig=~/^[A-Z']+$/?uc$word:ucfirst lc$word):lc$word);
}