0: #!/usr/bin/perl -wT
1:
2: # The Pig-Latinizer: A CGI Program for Surfing the Web in Pig Latin
3: # by David Glick [davisagli], 6/20/2001
4:
5: # This is my first CGI program, so comments/improvements are very welcome!
6:
7: # - can't translate anything created in javascripts
8: # Main - forms on translated pages will at best not lead to
9: # limitations: another translated page, and at worst will not work
10: # - doesn't translate text in pictures :)
11:
12: # always!
13: use strict;
14: use warnings;
15:
16: use CGI qw/:standard/; # standard CGI.pm
17: use HTML::Parser(); # for parsing HTML
18: use HTML::Entities; # for decoding HTML character entities
19: require HTML::Element; # for manipulating HTML
20: require LWP::Simple; # for retrieving document to translate
21: use URI; # for resolving relative URI's into absolute ones
22: use URI::Escape; # for decoding URI character escapes
23:
24: # initialize variables
25: my $base = 0; # does file already contain <BASE HREF=...> tag?
26: my $pig = ''; # result of translation
27: my $error = ''; # to hold any error messages
28:
29: # main code block:
30:
31: # retrieve URI of file to translate, and taint-check it
32: my $t_uri = param('page') || '';
33: my ($uri) = ($t_uri =~ /^(http:\/\/.+)$/);
34:
35: # if we have a url that looks valid...
36: if ( defined $uri ) {
37:
38: # retrieve content-type of this file
39: my ($content_type) = LWP::Simple::head( $uri );
40: $content_type ||= 'text/html';
41:
42: # if it's an HTML doc...
43: if ($content_type =~ /text\/html/) {
44:
45: # decode URI escapes and retrieve the file
46: $uri = uri_unescape( $uri );
47: my $html = LWP::Simple::get($uri) or ($error .= p("Error retrieving
48: file $uri" . br . "Please check your spelling or try another
49: website.") and sendform());
50:
51: print header;
52:
53: # start the parser -- which basically calls &pigify for each plain-text
54: # block and &tag for each opening HTML tag, adding the result of
55: # everything to $result
56:
57: HTML::Parser->new( api_version => 3,
58: default_h => [sub { $pig .= shift }, 'text'],
59: text_h=>[sub{$pig.=(shift()?shift():pigify(shift))},'is_cdata, text'],
60: start_h => [sub {$pig .= tag(@_) }, 'tagname, attr, text']
61: )->parse($html);
62:
63: # add base tag if needed -- this tells the browser where relative URLs
64: # in the doc are relative to
65: $pig = "<base href=\"$uri\">$pig" unless $base;
66:
67: print $pig;
68:
69: # if it wasn't in HTML format, redirect the browser to access the file
70: # directly instead of through this script (this prevents the script from
71: # downloading large images/files only to forward them to the user) :
72:
73: } else {
74: print redirect( -uri => $uri );
75: }
76:
77: # end program so we don't print out the form
78: exit 0
79: }
80:
81: # the following only runs if a valid URL was not supplied:
82:
83: $error .= p('You must enter a valid URL beginning with http://')
84: if param('notfirst');
85: sendform();
86:
87: # (end of main code block)
88:
89: sub sendform
90: # prints out a form where the user can enter a URL to translate
91: # (including any error messages)
92: {
93: # format the error message in red
94: $error = font( {color => '#FF0000'}, $error) if $error;
95:
96: #print out the form
97: print header,
98: start_html( -title => 'The Pig-Latinizer: Surf the Web in Pig Latin' ),
99: h1( 'The Pig-Latinizer: Surf the Web in Pig Latin' ),
100: $error,
101: start_form( -method => 'get' ),
102: textfield( -name => 'page',
103: -default => 'http://',
104: -size => 50 ),
105: submit( -value => "Pigify" ),
106: hidden( -name => 'notfirst', # this field is so we can suppress
107: -value => '1' ), # errors on the first run
108: end_form,
109: end_html;
110:
111: # end the script
112: exit 0;
113: }
114:
115: sub tag
116: # (called by the parser for each opening HTML tag)
117: # Points links to pigified version of pages, and handles some translation.
118: {
119: my ($name, $attr, $code) = @_;
120: my $tag = 0;
121:
122: # check if this is a <base href=...> tag (if so, we won't supply one later)
123: if ($name eq 'base' and exists $attr->{href}) {
124: $base = 1;
125: return $code;
126: }
127:
128: # create an HTML element object from this tag
129: $tag = HTML::Element->new($name,%{$attr});
130:
131: # point links (in a and area tags) to pigified pages
132: if ($name eq 'a' or $name eq 'area' and exists $attr->{href}) {
133: $tag->attr('href', url . "?page=" . URI->new_abs($tag->attr('href'),$uri));
134: }
135:
136: # point to pigified versions of frames
137: if ($name eq 'frame' and exists $attr->{src}) {
138: $tag->attr('src', url . "?page=" . URI->new_abs($tag->attr('src'),$uri));
139: }
140:
141: # translate alt attributes
142: if (exists $attr->{alt}) {
143: $tag->attr('alt', pigify($tag->attr('alt')));
144: }
145:
146: # translate value attributes
147: if (exists $attr->{value}) {
148: $tag->attr('value', pigify($tag->attr('value')));
149: }
150:
151: # return the new HTML for this opening tag
152: $tag->starttag;
153: }
154:
155: sub pigify
156: # translates an entire phrase into pig latin
157: {
158: my ($text) = @_;
159: $text = decode_entities($text);
160:
161: # I love this regexp :) or rather, the fact that I finally
162: # managed to successfully create it...
163:
164: $text =~ s/(?:^|(?<=[^a-z])) # start match either at beginning of string
165: # or following a non-letter
166:
167: ((?:qu # then match either 'qu'
168: |y(?=[aeiou]) # or y used as consonant at start of word
169: |[^\W\daeiouy]+ # or series of consonants
170: |(?=y[^aeiou]) # or nothing preceding y used as vowel (i.e. yttrium)
171: |(?=[aeiou]))) # or nothing preceding a vowel
172: # (this is all captured into $1)
173:
174: ([a-z']*) # then capture a series of letters and ' into $2
175: (?:(?=[^a-z])|$) # and end match prior to non-letter or end of string
176:
177: # replace, adding 'ay' if started with consonant or 'way' if
178: # started with vowel ($1 eq '' if it started with a vowel)
179: /fixcaps($2.($1||'w').'ay',"$1$2")/egisx;
180:
181: $text = encode_entities($text);
182: }
183:
184: sub fixcaps
185: # called by &pigify to convert translated word to correct case,
186: # based on the original untranslated word
187: {
188: my ($word, $orig) = @_;
189: $word=($orig=~/[A-Z]/?($orig=~/^[A-Z']+$/?uc$word:ucfirst lc$word):lc$word);
190: } In reply to Pig Latin Web Filter by davisagli
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |