Pig Latin Web Filter

   1: #!/usr/bin/perl -wT
   2: 
   3: # The Pig-Latinizer: A CGI Program for Surfing the Web in Pig Latin
   4: # by David Glick [davisagli], 6/20/2001
   5: 
   6: # This is my first CGI program, so comments/improvements are very welcome!
   7: 
   8: #               - can't translate anything created in javascripts
   9: #     Main      - forms on translated pages will at best not lead to
  10: #  limitations:   another translated page, and at worst will not work
  11: #               - doesn't translate text in pictures :)
  12: 
  13: # always!
  14: use strict;
  15: use warnings;
  16: 
  17: use CGI qw/:standard/;  # standard CGI.pm
  18: use HTML::Parser();     # for parsing HTML
  19: use HTML::Entities;     # for decoding HTML character entities
  20: require HTML::Element;  # for manipulating HTML
  21: require LWP::Simple;    # for retrieving document to translate
  22: use URI;                # for resolving relative URI's into absolute ones
  23: use URI::Escape;        # for decoding URI character escapes
  24: 
  25: # initialize variables
  26: my $base = 0;           # does file already contain <BASE HREF=...> tag?
  27: my $pig = '';           # result of translation
  28: my $error = '';         # to hold any error messages
  29: 
  30: # main code block:
  31: 
  32: # retrieve URI of file to translate, and taint-check it
  33: my $t_uri = param('page') || '';
  34: my ($uri) = ($t_uri =~ /^(http:\/\/.+)$/);
  35: 
  36: # if we have a url that looks valid...
  37: if ( defined $uri ) {
  38: 
  39:   # retrieve content-type of this file
  40:   my ($content_type) = LWP::Simple::head( $uri );
  41:   $content_type ||= 'text/html';
  42: 
  43:   # if it's an HTML doc...
  44:   if ($content_type =~ /text\/html/) {
  45: 
  46:     # decode URI escapes and retrieve the file
  47:     $uri = uri_unescape( $uri );
  48:     my $html = LWP::Simple::get($uri) or ($error .= p("Error retrieving
  49:          file $uri" . br . "Please check your spelling or try another
  50:          website.") and sendform());
  51: 
  52:     print header;
  53: 
  54:     # start the parser -- which basically calls &pigify for each plain-text
  55:     # block and &tag for each opening HTML tag, adding the result of
  56:     # everything to $result
  57: 
  58:     HTML::Parser->new( api_version => 3,
  59:       default_h => [sub { $pig .= shift }, 'text'],
  60:       text_h=>[sub{$pig.=(shift()?shift():pigify(shift))},'is_cdata, text'],
  61:       start_h => [sub {$pig .= tag(@_) }, 'tagname, attr, text']
  62:       )->parse($html);
  63: 
  64:     # add base tag if needed -- this tells the browser where relative URLs
  65:     # in the doc are relative to
  66:     $pig = "<base href=\"$uri\">$pig" unless $base;
  67: 
  68:     print $pig;
  69: 
  70:   # if it wasn't in HTML format, redirect the browser to access the file
  71:   # directly instead of through this script (this prevents the script from
  72:   # downloading large images/files only to forward them to the user) :
  73: 
  74:   } else {
  75:     print redirect( -uri  =>  $uri );
  76:   }
  77: 
  78:   # end program so we don't print out the form
  79:   exit 0
  80: }
  81: 
  82: # the following only runs if a valid URL was not supplied:
  83: 
  84: $error .= p('You must enter a valid URL beginning with http://')
  85:   if param('notfirst');
  86: sendform();
  87: 
  88: # (end of main code block)
  89: 
  90: sub sendform
  91: # prints out a form where the user can enter a URL to translate
  92: # (including any error messages)
  93: {
  94:   # format the error message in red
  95:   $error = font( {color => '#FF0000'}, $error) if $error;
  96: 
  97:   #print out the form
  98:   print header,
  99:         start_html( -title => 'The Pig-Latinizer: Surf the Web in Pig Latin' ),
 100:         h1( 'The Pig-Latinizer: Surf the Web in Pig Latin' ),
 101:         $error,
 102:         start_form( -method   => 'get' ),
 103:         textfield( -name      => 'page',
 104:                    -default   => 'http://',
 105:                    -size      => 50 ),
 106:         submit( -value => "Pigify" ),
 107:         hidden( -name  => 'notfirst',   # this field is so we can suppress
 108:                 -value => '1' ),        # errors on the first run
 109:         end_form,
 110:         end_html;
 111: 
 112:   # end the script
 113:   exit 0;
 114: }
 115: 
 116: sub tag
 117: # (called by the parser for each opening HTML tag)
 118: # Points links to pigified version of pages, and handles some translation.
 119: {
 120:   my ($name, $attr, $code) = @_;
 121:   my $tag = 0;
 122: 
 123:   # check if this is a <base href=...> tag (if so, we won't supply one later)
 124:   if ($name eq 'base' and exists $attr->{href}) {
 125:     $base = 1;
 126:     return $code;
 127:   }
 128: 
 129:   # create an HTML element object from this tag
 130:   $tag = HTML::Element->new($name,%{$attr});
 131: 
 132:   # point links (in a and area tags) to pigified pages
 133:   if ($name eq 'a' or $name eq 'area' and exists $attr->{href}) {
 134:     $tag->attr('href', url . "?page=" . URI->new_abs($tag->attr('href'),$uri));
 135:   }
 136: 
 137:   # point to pigified versions of frames
 138:   if ($name eq 'frame' and exists $attr->{src}) {
 139:     $tag->attr('src', url . "?page=" . URI->new_abs($tag->attr('src'),$uri));
 140:   }
 141: 
 142:   # translate alt attributes
 143:   if (exists $attr->{alt}) {
 144:     $tag->attr('alt', pigify($tag->attr('alt')));
 145:   }
 146: 
 147:   # translate value attributes
 148:   if (exists $attr->{value}) {
 149:     $tag->attr('value', pigify($tag->attr('value')));
 150:   }
 151: 
 152:   # return the new HTML for this opening tag
 153:   $tag->starttag;
 154: }
 155: 
 156: sub pigify
 157: # translates an entire phrase into pig latin
 158: {
 159:   my ($text) = @_;
 160:   $text = decode_entities($text);
 161: 
 162:   # I love this regexp :)  or rather, the fact that I finally
 163:   # managed to successfully create it...
 164: 
 165:   $text =~ s/(?:^|(?<=[^a-z])) # start match either at beginning of string
 166:                                # or following a non-letter
 167: 
 168:      ((?:qu           # then match either 'qu'
 169:      |y(?=[aeiou])    # or y used as consonant at start of word
 170:      |[^\W\daeiouy]+  # or series of consonants
 171:      |(?=y[^aeiou])   # or nothing preceding y used as vowel (i.e. yttrium)
 172:      |(?=[aeiou])))   # or nothing preceding a vowel
 173:                       # (this is all captured into $1)
 174: 
 175:      ([a-z']*)        # then capture a series of letters and ' into $2
 176:      (?:(?=[^a-z])|$) # and end match prior to non-letter or end of string
 177: 
 178:       # replace, adding 'ay' if started with consonant or 'way' if
 179:       # started with vowel ($1 eq '' if it started with a vowel)
 180:       /fixcaps($2.($1||'w').'ay',"$1$2")/egisx;
 181: 
 182:   $text = encode_entities($text);
 183: }
 184: 
 185: sub fixcaps
 186: # called by &pigify to convert translated word to correct case,
 187: # based on the original untranslated word
 188: {
 189:   my ($word, $orig) = @_;
 190:   $word=($orig=~/[A-Z]/?($orig=~/^[A-Z']+$/?uc$word:ucfirst lc$word):lc$word);
 191: }
Comment on Pig Latin Web Filter Download Code
Replies are listed 'Best First'.
Re: Pig Latin Web Filter by orkysoft (Friar) on Jun 21, 2001 at 03:20 UTC
I suppose you could make an HTTP proxy to work around the forms problem :-)	[reply]