1: #!/usr/bin/perl -wT
2:
3: # The Pig-Latinizer: A CGI Program for Surfing the Web in Pig Latin
4: # by David Glick [davisagli], 6/20/2001
5:
6: # This is my first CGI program, so comments/improvements are very welcome!
7:
8: # - can't translate anything created in javascripts
9: # Main - forms on translated pages will at best not lead to
10: # limitations: another translated page, and at worst will not work
11: # - doesn't translate text in pictures :)
12:
13: # always!
14: use strict;
15: use warnings;
16:
17: use CGI qw/:standard/; # standard CGI.pm
18: use HTML::Parser(); # for parsing HTML
19: use HTML::Entities; # for decoding HTML character entities
20: require HTML::Element; # for manipulating HTML
21: require LWP::Simple; # for retrieving document to translate
22: use URI; # for resolving relative URI's into absolute ones
23: use URI::Escape; # for decoding URI character escapes
24:
25: # initialize variables
26: my $base = 0; # does file already contain <BASE HREF=...> tag?
27: my $pig = ''; # result of translation
28: my $error = ''; # to hold any error messages
29:
30: # main code block:
31:
32: # retrieve URI of file to translate, and taint-check it
33: my $t_uri = param('page') || '';
34: my ($uri) = ($t_uri =~ /^(http:\/\/.+)$/);
35:
36: # if we have a url that looks valid...
37: if ( defined $uri ) {
38:
39: # retrieve content-type of this file
40: my ($content_type) = LWP::Simple::head( $uri );
41: $content_type ||= 'text/html';
42:
43: # if it's an HTML doc...
44: if ($content_type =~ /text\/html/) {
45:
46: # decode URI escapes and retrieve the file
47: $uri = uri_unescape( $uri );
48: my $html = LWP::Simple::get($uri) or ($error .= p("Error retrieving
49: file $uri" . br . "Please check your spelling or try another
50: website.") and sendform());
51:
52: print header;
53:
54: # start the parser -- which basically calls &pigify for each plain-text
55: # block and &tag for each opening HTML tag, adding the result of
56: # everything to $result
57:
58: HTML::Parser->new( api_version => 3,
59: default_h => [sub { $pig .= shift }, 'text'],
60: text_h=>[sub{$pig.=(shift()?shift():pigify(shift))},'is_cdata, text'],
61: start_h => [sub {$pig .= tag(@_) }, 'tagname, attr, text']
62: )->parse($html);
63:
64: # add base tag if needed -- this tells the browser where relative URLs
65: # in the doc are relative to
66: $pig = "<base href=\"$uri\">$pig" unless $base;
67:
68: print $pig;
69:
70: # if it wasn't in HTML format, redirect the browser to access the file
71: # directly instead of through this script (this prevents the script from
72: # downloading large images/files only to forward them to the user) :
73:
74: } else {
75: print redirect( -uri => $uri );
76: }
77:
78: # end program so we don't print out the form
79: exit 0
80: }
81:
82: # the following only runs if a valid URL was not supplied:
83:
84: $error .= p('You must enter a valid URL beginning with http://')
85: if param('notfirst');
86: sendform();
87:
88: # (end of main code block)
89:
90: sub sendform
91: # prints out a form where the user can enter a URL to translate
92: # (including any error messages)
93: {
94: # format the error message in red
95: $error = font( {color => '#FF0000'}, $error) if $error;
96:
97: #print out the form
98: print header,
99: start_html( -title => 'The Pig-Latinizer: Surf the Web in Pig Latin' ),
100: h1( 'The Pig-Latinizer: Surf the Web in Pig Latin' ),
101: $error,
102: start_form( -method => 'get' ),
103: textfield( -name => 'page',
104: -default => 'http://',
105: -size => 50 ),
106: submit( -value => "Pigify" ),
107: hidden( -name => 'notfirst', # this field is so we can suppress
108: -value => '1' ), # errors on the first run
109: end_form,
110: end_html;
111:
112: # end the script
113: exit 0;
114: }
115:
116: sub tag
117: # (called by the parser for each opening HTML tag)
118: # Points links to pigified version of pages, and handles some translation.
119: {
120: my ($name, $attr, $code) = @_;
121: my $tag = 0;
122:
123: # check if this is a <base href=...> tag (if so, we won't supply one later)
124: if ($name eq 'base' and exists $attr->{href}) {
125: $base = 1;
126: return $code;
127: }
128:
129: # create an HTML element object from this tag
130: $tag = HTML::Element->new($name,%{$attr});
131:
132: # point links (in a and area tags) to pigified pages
133: if ($name eq 'a' or $name eq 'area' and exists $attr->{href}) {
134: $tag->attr('href', url . "?page=" . URI->new_abs($tag->attr('href'),$uri));
135: }
136:
137: # point to pigified versions of frames
138: if ($name eq 'frame' and exists $attr->{src}) {
139: $tag->attr('src', url . "?page=" . URI->new_abs($tag->attr('src'),$uri));
140: }
141:
142: # translate alt attributes
143: if (exists $attr->{alt}) {
144: $tag->attr('alt', pigify($tag->attr('alt')));
145: }
146:
147: # translate value attributes
148: if (exists $attr->{value}) {
149: $tag->attr('value', pigify($tag->attr('value')));
150: }
151:
152: # return the new HTML for this opening tag
153: $tag->starttag;
154: }
155:
156: sub pigify
157: # translates an entire phrase into pig latin
158: {
159: my ($text) = @_;
160: $text = decode_entities($text);
161:
162: # I love this regexp :) or rather, the fact that I finally
163: # managed to successfully create it...
164:
165: $text =~ s/(?:^|(?<=[^a-z])) # start match either at beginning of string
166: # or following a non-letter
167:
168: ((?:qu # then match either 'qu'
169: |y(?=[aeiou]) # or y used as consonant at start of word
170: |[^\W\daeiouy]+ # or series of consonants
171: |(?=y[^aeiou]) # or nothing preceding y used as vowel (i.e. yttrium)
172: |(?=[aeiou]))) # or nothing preceding a vowel
173: # (this is all captured into $1)
174:
175: ([a-z']*) # then capture a series of letters and ' into $2
176: (?:(?=[^a-z])|$) # and end match prior to non-letter or end of string
177:
178: # replace, adding 'ay' if started with consonant or 'way' if
179: # started with vowel ($1 eq '' if it started with a vowel)
180: /fixcaps($2.($1||'w').'ay',"$1$2")/egisx;
181:
182: $text = encode_entities($text);
183: }
184:
185: sub fixcaps
186: # called by &pigify to convert translated word to correct case,
187: # based on the original untranslated word
188: {
189: my ($word, $orig) = @_;
190: $word=($orig=~/[A-Z]/?($orig=~/^[A-Z']+$/?uc$word:ucfirst lc$word):lc$word);
191: }