comment on

0: #!/usr/bin/perl -wT
1: 
2: # The Pig-Latinizer: A CGI Program for Surfing the Web in Pig Latin
3: # by David Glick [davisagli], 6/20/2001
4: 
5: # This is my first CGI program, so comments/improvements are very welcome!
6: 
7: #               - can't translate anything created in javascripts
8: #     Main      - forms on translated pages will at best not lead to
9: #  limitations:   another translated page, and at worst will not work
10: #               - doesn't translate text in pictures :)
11: 
12: # always!
13: use strict;
14: use warnings;
15: 
16: use CGI qw/:standard/;  # standard CGI.pm
17: use HTML::Parser();     # for parsing HTML
18: use HTML::Entities;     # for decoding HTML character entities
19: require HTML::Element;  # for manipulating HTML
20: require LWP::Simple;    # for retrieving document to translate
21: use URI;                # for resolving relative URI's into absolute ones
22: use URI::Escape;        # for decoding URI character escapes
23: 
24: # initialize variables
25: my $base = 0;           # does file already contain <BASE HREF=...> tag?
26: my $pig = '';           # result of translation
27: my $error = '';         # to hold any error messages
28: 
29: # main code block:
30: 
31: # retrieve URI of file to translate, and taint-check it
32: my $t_uri = param('page') || '';
33: my ($uri) = ($t_uri =~ /^(http:\/\/.+)$/);
34: 
35: # if we have a url that looks valid...
36: if ( defined $uri ) {
37: 
38:   # retrieve content-type of this file
39:   my ($content_type) = LWP::Simple::head( $uri );
40:   $content_type ||= 'text/html';
41: 
42:   # if it's an HTML doc...
43:   if ($content_type =~ /text\/html/) {
44: 
45:     # decode URI escapes and retrieve the file
46:     $uri = uri_unescape( $uri );
47:     my $html = LWP::Simple::get($uri) or ($error .= p("Error retrieving
48:          file $uri" . br . "Please check your spelling or try another
49:          website.") and sendform());
50: 
51:     print header;
52: 
53:     # start the parser -- which basically calls &pigify for each plain-text
54:     # block and &tag for each opening HTML tag, adding the result of
55:     # everything to $result
56: 
57:     HTML::Parser->new( api_version => 3,
58:       default_h => [sub { $pig .= shift }, 'text'],
59:       text_h=>[sub{$pig.=(shift()?shift():pigify(shift))},'is_cdata, text'],
60:       start_h => [sub {$pig .= tag(@_) }, 'tagname, attr, text']
61:       )->parse($html);
62: 
63:     # add base tag if needed -- this tells the browser where relative URLs
64:     # in the doc are relative to
65:     $pig = "<base href=\"$uri\">$pig" unless $base;
66: 
67:     print $pig;
68: 
69:   # if it wasn't in HTML format, redirect the browser to access the file
70:   # directly instead of through this script (this prevents the script from
71:   # downloading large images/files only to forward them to the user) :
72: 
73:   } else {
74:     print redirect( -uri  =>  $uri );
75:   }
76: 
77:   # end program so we don't print out the form
78:   exit 0
79: }
80: 
81: # the following only runs if a valid URL was not supplied:
82: 
83: $error .= p('You must enter a valid URL beginning with http://')
84:   if param('notfirst');
85: sendform();
86: 
87: # (end of main code block)
88: 
89: sub sendform
90: # prints out a form where the user can enter a URL to translate
91: # (including any error messages)
92: {
93:   # format the error message in red
94:   $error = font( {color => '#FF0000'}, $error) if $error;
95: 
96:   #print out the form
97:   print header,
98:         start_html( -title => 'The Pig-Latinizer: Surf the Web in Pig Latin' ),
99:         h1( 'The Pig-Latinizer: Surf the Web in Pig Latin' ),
100:         $error,
101:         start_form( -method   => 'get' ),
102:         textfield( -name      => 'page',
103:                    -default   => 'http://',
104:                    -size      => 50 ),
105:         submit( -value => "Pigify" ),
106:         hidden( -name  => 'notfirst',   # this field is so we can suppress
107:                 -value => '1' ),        # errors on the first run
108:         end_form,
109:         end_html;
110: 
111:   # end the script
112:   exit 0;
113: }
114: 
115: sub tag
116: # (called by the parser for each opening HTML tag)
117: # Points links to pigified version of pages, and handles some translation.
118: {
119:   my ($name, $attr, $code) = @_;
120:   my $tag = 0;
121: 
122:   # check if this is a <base href=...> tag (if so, we won't supply one later)
123:   if ($name eq 'base' and exists $attr->{href}) {
124:     $base = 1;
125:     return $code;
126:   }
127: 
128:   # create an HTML element object from this tag
129:   $tag = HTML::Element->new($name,%{$attr});
130: 
131:   # point links (in a and area tags) to pigified pages
132:   if ($name eq 'a' or $name eq 'area' and exists $attr->{href}) {
133:     $tag->attr('href', url . "?page=" . URI->new_abs($tag->attr('href'),$uri));
134:   }
135: 
136:   # point to pigified versions of frames
137:   if ($name eq 'frame' and exists $attr->{src}) {
138:     $tag->attr('src', url . "?page=" . URI->new_abs($tag->attr('src'),$uri));
139:   }
140: 
141:   # translate alt attributes
142:   if (exists $attr->{alt}) {
143:     $tag->attr('alt', pigify($tag->attr('alt')));
144:   }
145: 
146:   # translate value attributes
147:   if (exists $attr->{value}) {
148:     $tag->attr('value', pigify($tag->attr('value')));
149:   }
150: 
151:   # return the new HTML for this opening tag
152:   $tag->starttag;
153: }
154: 
155: sub pigify
156: # translates an entire phrase into pig latin
157: {
158:   my ($text) = @_;
159:   $text = decode_entities($text);
160: 
161:   # I love this regexp :)  or rather, the fact that I finally
162:   # managed to successfully create it...
163: 
164:   $text =~ s/(?:^|(?<=[^a-z])) # start match either at beginning of string
165:                                # or following a non-letter
166: 
167:      ((?:qu           # then match either 'qu'
168:      |y(?=[aeiou])    # or y used as consonant at start of word
169:      |[^\W\daeiouy]+  # or series of consonants
170:      |(?=y[^aeiou])   # or nothing preceding y used as vowel (i.e. yttrium)
171:      |(?=[aeiou])))   # or nothing preceding a vowel
172:                       # (this is all captured into $1)
173: 
174:      ([a-z']*)        # then capture a series of letters and ' into $2
175:      (?:(?=[^a-z])|$) # and end match prior to non-letter or end of string
176: 
177:       # replace, adding 'ay' if started with consonant or 'way' if
178:       # started with vowel ($1 eq '' if it started with a vowel)
179:       /fixcaps($2.($1||'w').'ay',"$1$2")/egisx;
180: 
181:   $text = encode_entities($text);
182: }
183: 
184: sub fixcaps
185: # called by &pigify to convert translated word to correct case,
186: # based on the original untranslated word
187: {
188:   my ($word, $orig) = @_;
189:   $word=($orig=~/[A-Z]/?($orig=~/^[A-Z']+$/?uc$word:ucfirst lc$word):lc$word);
190: }

In reply to Pig Latin Web Filter by davisagli

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.