#!/usr/bin/perl -wT # The Pig-Latinizer: A CGI Program for Surfing the Web in Pig Latin # by David Glick [davisagli], 6/20/2001 # This is my first CGI program, so comments/improvements are very welcome! # - can't translate anything created in javascripts # Main - forms on translated pages will at best not lead to # limitations: another translated page, and at worst will not work # - doesn't translate text in pictures :) # always! use strict; use warnings; use CGI qw/:standard/; # standard CGI.pm use HTML::Parser(); # for parsing HTML use HTML::Entities; # for decoding HTML character entities require HTML::Element; # for manipulating HTML require LWP::Simple; # for retrieving document to translate use URI; # for resolving relative URI's into absolute ones use URI::Escape; # for decoding URI character escapes # initialize variables my $base = 0; # does file already contain tag? my $pig = ''; # result of translation my $error = ''; # to hold any error messages # main code block: # retrieve URI of file to translate, and taint-check it my $t_uri = param('page') || ''; my ($uri) = ($t_uri =~ /^(http:\/\/.+)$/); # if we have a url that looks valid... if ( defined $uri ) { # retrieve content-type of this file my ($content_type) = LWP::Simple::head( $uri ); $content_type ||= 'text/html'; # if it's an HTML doc... if ($content_type =~ /text\/html/) { # decode URI escapes and retrieve the file $uri = uri_unescape( $uri ); my $html = LWP::Simple::get($uri) or ($error .= p("Error retrieving file $uri" . br . "Please check your spelling or try another website.") and sendform()); print header; # start the parser -- which basically calls &pigify for each plain-text # block and &tag for each opening HTML tag, adding the result of # everything to $result HTML::Parser->new( api_version => 3, default_h => [sub { $pig .= shift }, 'text'], text_h=>[sub{$pig.=(shift()?shift():pigify(shift))},'is_cdata, text'], start_h => [sub {$pig .= tag(@_) }, 'tagname, attr, text'] )->parse($html); # add base tag if needed -- this tells the browser where relative URLs # in the doc are relative to $pig = "$pig" unless $base; print $pig; # if it wasn't in HTML format, redirect the browser to access the file # directly instead of through this script (this prevents the script from # downloading large images/files only to forward them to the user) : } else { print redirect( -uri => $uri ); } # end program so we don't print out the form exit 0 } # the following only runs if a valid URL was not supplied: $error .= p('You must enter a valid URL beginning with http://') if param('notfirst'); sendform(); # (end of main code block) sub sendform # prints out a form where the user can enter a URL to translate # (including any error messages) { # format the error message in red $error = font( {color => '#FF0000'}, $error) if $error; #print out the form print header, start_html( -title => 'The Pig-Latinizer: Surf the Web in Pig Latin' ), h1( 'The Pig-Latinizer: Surf the Web in Pig Latin' ), $error, start_form( -method => 'get' ), textfield( -name => 'page', -default => 'http://', -size => 50 ), submit( -value => "Pigify" ), hidden( -name => 'notfirst', # this field is so we can suppress -value => '1' ), # errors on the first run end_form, end_html; # end the script exit 0; } sub tag # (called by the parser for each opening HTML tag) # Points links to pigified version of pages, and handles some translation. { my ($name, $attr, $code) = @_; my $tag = 0; # check if this is a tag (if so, we won't supply one later) if ($name eq 'base' and exists $attr->{href}) { $base = 1; return $code; } # create an HTML element object from this tag $tag = HTML::Element->new($name,%{$attr}); # point links (in a and area tags) to pigified pages if ($name eq 'a' or $name eq 'area' and exists $attr->{href}) { $tag->attr('href', url . "?page=" . URI->new_abs($tag->attr('href'),$uri)); } # point to pigified versions of frames if ($name eq 'frame' and exists $attr->{src}) { $tag->attr('src', url . "?page=" . URI->new_abs($tag->attr('src'),$uri)); } # translate alt attributes if (exists $attr->{alt}) { $tag->attr('alt', pigify($tag->attr('alt'))); } # translate value attributes if (exists $attr->{value}) { $tag->attr('value', pigify($tag->attr('value'))); } # return the new HTML for this opening tag $tag->starttag; } sub pigify # translates an entire phrase into pig latin { my ($text) = @_; $text = decode_entities($text); # I love this regexp :) or rather, the fact that I finally # managed to successfully create it... $text =~ s/(?:^|(?<=[^a-z])) # start match either at beginning of string # or following a non-letter ((?:qu # then match either 'qu' |y(?=[aeiou]) # or y used as consonant at start of word |[^\W\daeiouy]+ # or series of consonants |(?=y[^aeiou]) # or nothing preceding y used as vowel (i.e. yttrium) |(?=[aeiou]))) # or nothing preceding a vowel # (this is all captured into $1) ([a-z']*) # then capture a series of letters and ' into $2 (?:(?=[^a-z])|$) # and end match prior to non-letter or end of string # replace, adding 'ay' if started with consonant or 'way' if # started with vowel ($1 eq '' if it started with a vowel) /fixcaps($2.($1||'w').'ay',"$1$2")/egisx; $text = encode_entities($text); } sub fixcaps # called by &pigify to convert translated word to correct case, # based on the original untranslated word { my ($word, $orig) = @_; $word=($orig=~/[A-Z]/?($orig=~/^[A-Z']+$/?uc$word:ucfirst lc$word):lc$word); }