#!/usr/bin/perl
# $Header: Penyemakan Ejaan (html2text) $
use integer;
use HTML::Parser;
use CGI;
# Event-driven parser (HTML::Parser, API version 3).  Each handler is
# registered together with the argspec naming the values it is passed.
my $parser = HTML::Parser->new(
    api_version => 3,
    start_h     => [ \&start, "tagname, attr" ],
    end_h       => [ \&end,   "tagname" ],
    text_h      => [ \&text,  "dtext" ],
);
####
# @stack holds the saved $doc values of the enclosing elements
# (pushed by start, restored by popit).
my @stack;
# $count is the next section number to hand out within the document
# (each section being defined by an anchor).
my $count;
# $doc accumulates the text of the document.  It might be a string, or
# an array ref containing subsidiary $doc values or further array refs
# which contain a tag and a $doc value.
my $doc;
# $debug is raised by each -debug option; $width is the output width
# handed to format for the whole document.
my ($debug, $width) = (0, 76);
# %names maps an anchor name to the section number assigned to it.
my %names;
# FILE OUTPUT
# Copy /home/ayob/temp/test.txt to /home/ayob/projekmajor/cc, then append a
# trailer line that starts with the number of lines copied.
#
# The package globals set by the original code are still set: @file (the
# lines read), $file (their count) and $n (left equal to $file).
# NOTE(review): despite their names, "Out" is the handle that is READ and
# "In" is the handle that is WRITTEN; the names are kept because other code
# in this file refers to In.
# NOTE(review): $operator, $nombor and $nama are not assigned anywhere in
# this file, so they interpolate as empty strings -- confirm where they
# were meant to come from.
open(Out, '<', '/home/ayob/temp/test.txt') or die "Fail tidak ada\n";
# Slurp every line.  The read operator was missing from this statement
# (it read "@file = ;"), which is a syntax error; <Out> is the only handle
# open for reading at this point.
@file = <Out>;
$file = @file;
close Out;
open(In, '>', '/home/ayob/projekmajor/cc') or die "File does not exists\n";
# Copy exactly the lines that were read (the old loop ran one index past
# the end of @file).
print In @file;
$n = $file;
print In "\n$n.\t$operator-$nombor\t$nama";
# Buffered write errors only surface at close, so check it.
close(In) or die "$0: closing /home/ayob/projekmajor/cc: $!\n";
# Restore the previous context, and return the old one.
#
# Pops the most recently saved value off @stack into $doc and returns the
# value $doc held before the call.  When @stack is empty $doc becomes undef.
# With $debug set, a trace line is written to STDERR.
sub popit {
    my $finished = $doc;
    $doc = pop(@stack);
    # The trace used to be printed to the In handle, which is closed before
    # the parser ever runs, so the output was silently discarded.
    print STDERR "POP[$#stack]: doc-> {$doc}\n" if $debug;
    return $finished;
}
# Handle a start tag (registered as the parser's start_h handler).
#
#   $tag   - the tag name
#   $attrs - hash ref of the tag's attributes
#
# Behaviour by tag:
#   ul                       ignored entirely
#   h1 h2 h3 p cite pre li   save $doc on @stack and start a fresh string
#   table tr                 save $doc and start a fresh list
#   td th                    save $doc and leave $doc undefined
#   a href="#name"           number the name on first use, emit "[N] "
#   a name="name"            emit "[N] " if the name was numbered earlier
#   em                       emit "_"
# Any tag other than "a" first makes sure $doc is defined.
# With $debug set, the tag and its attributes are traced on STDERR.
sub start {
    my ($tag, $attrs) = @_;
    return if $tag eq "ul";

    if ($tag ne "a" && !defined $doc) {
        $doc = [];
    }

    # The trace used to be printed to the In handle, which is closed before
    # the parser ever runs, so the output was silently discarded.
    if ($debug) {
        print STDERR "<$tag>\n";
        print STDERR map(" $_=$$attrs{$_}\n", keys %$attrs);
    }

    if (grep { $tag eq $_ } qw(h1 h2 h3 p cite pre li)) {
        # block-level text: collect it into a new string
        push(@stack, $doc);
        $doc = "";
        print STDERR "NEW[$#stack]: doc-> {$doc}\n" if $debug;
    }
    elsif (grep { $tag eq $_ } qw(table tr td th)) {
        push(@stack, $doc);
        if ($tag eq "td" || $tag eq "th") {
            # a cell starts undefined so that it can become either a
            # string (plain text) or a list (nested block elements)
            undef $doc;
            print STDERR "NEW[$#stack]: doc-> {undef}\n" if $debug;
        }
        else {
            $doc = [];
            print STDERR "NEW[$#stack]: doc-> {$doc}\n" if $debug;
        }
    }
    elsif ($tag eq "a") {
        if (exists $$attrs{'href'} && $$attrs{'href'} =~ /^\#/) {
            # link to an anchor in this document
            (my $name = $$attrs{'href'}) =~ s/^\#//;
            $names{$name} = $count++ if !exists $names{$name};
            text("[" . $names{$name} . "] ");
        }
        elsif (exists $$attrs{'name'}) {
            # the anchor itself: only marked if something linked to it
            # earlier in the document
            my $name = $$attrs{'name'};
            text("[" . $names{$name} . "] ") if exists $names{$name};
        }
    }
    elsif ($tag eq "em") {
        text("_");
    }
    return;
}
# Handle an end tag (registered as the parser's end_h handler).
#
#   $tag - the tag name
#
# Behaviour by tag:
#   h1 h2 h3 p cite pre li   restore the enclosing $doc and append
#                            [tag, collected text] to it
#   em                       emit "_"
#   table                    restore the enclosing $doc and append
#                            ["table", rows, widest row's cell count]
#   tr                       restore the table's $doc and append the row
#   td th                    restore the row's $doc and append the cell
# With $debug set, the tag is traced on STDERR.
#
# NOTE(review): the pushes assume the restored $doc is an array ref (or
# undef).  If it is a plain string -- text seen before a nested block --
# @$doc is a symbolic reference under this file's lack of "use strict" and
# the element is not added to the document.  Confirm whether that case
# needs handling.
sub end {
    my ($tag) = @_;
    # The trace used to be printed to the In handle, which is closed before
    # the parser ever runs, and had lost its leading "</".
    print STDERR "</$tag>\n" if $debug;

    if (grep { $tag eq $_ } qw(h1 h2 h3 p cite pre li)) {
        my $content = popit();
        push(@$doc, [$tag, $content]);
    }
    elsif ($tag eq "em") {
        text("_");
    }
    elsif ($tag eq "table") {
        my $rows = popit();
        # the table is as wide as its widest row
        my $columns = 0;
        for my $row (@$rows) {
            my $cells = $#$row + 1;
            $columns = $cells if $cells > $columns;
        }
        push(@$doc, ["table", $rows, $columns]);
    }
    elsif ($tag eq "tr") {
        # XXX rowspan
        my $row = popit();
        push(@$doc, $row);
    }
    elsif ($tag eq "td" || $tag eq "th") {
        # XXX colspan
        # a cell that saw neither text nor a nested block is an empty string
        $doc = "" if !defined $doc;
        my $cell = popit();
        push(@$doc, $cell);
    }
    return;
}
# Handle text (registered as the parser's text_h handler, and also called
# directly to insert markers such as "_" and "[N] ").
#
#   $text - the text to append
#
# The text is appended to $doc when $doc is a string; an undefined $doc
# first becomes the empty string.  When $doc is a reference the text is
# dropped.  With $debug set, the resulting $doc is traced on STDERR.
sub text {
    my ($text) = @_;
    $doc = "" if !defined $doc;
    return if ref $doc;

    $doc .= $text;
    # The trace used to be printed to the In handle, which is closed before
    # the parser ever runs, so the output was silently discarded.
    print STDERR "TXT[$#stack]: doc-> {$doc}\n" if $debug;
    return;
}
# Lay out a document as plain text.
#
#   $doc   - either a plain string (treated as a single paragraph) or an
#            array ref of elements [type, value, cols], where type is one
#            of h1, h2, h3, p, cite, li, pre or table.  For a table, value
#            is an array ref of rows (each an array ref of cells, each cell
#            itself a $doc) and cols is the number of columns.
#   $width - the width to lay the text out in
#
# Returns the list of output lines, without trailing newlines.
#
# Every call is written &format(...): the & sigil keeps the call from being
# parsed as Perl's built-in "format" keyword.
sub format ($$) {
    my ($doc, $width) = @_;
    my @ret = ();

    # A bare string is laid out as one paragraph.
    if (!ref $doc) {
        return &format([["p", $doc]], $width);
    }

    for my $e (@$doc) {
        my ($type, $value, $cols) = @$e;
        if ($type eq "h1") {
            # Collapse EVERY run of whitespace (the substitution used to
            # lack /g, so only the first run was collapsed and embedded
            # newlines broke the centring and the underline).
            $value =~ s/\s+/ /g;
            $value =~ s/^ //;
            $value =~ s/ $//;
            my $len = length $value;
            push(@ret, "") if ($#ret >= 0 && $ret[$#ret] ne "");
            # centre the heading when it is narrower than the page
            my $padding = ($len < $width
                           ? (" " x (($width - $len) / 2))
                           : "");
            push(@ret, $padding . $value);
            push(@ret, $padding . ("-" x $len));
        }
        elsif ($type eq "p"
               || $type eq "cite"
               || $type eq "h2"
               || $type eq "h3"
               || $type eq "li") {
            my @s = split(/\s+/, $value);
            # paragraphs and list items get one separator entry, the other
            # types two, pushed only if the previous line is not blank
            my @break = ($type eq "p" || $type eq "li") ? ("") : ("", "");
            push(@ret, @break) if ($#ret >= 0 && $ret[$#ret] ne "");
            my $x;                      # current length of $line
            my $line;                   # line being filled, undef if none
            my $prefix = "";            # starts every continuation line
            if ($type eq "li") {
                $prefix = " ";
                $line = " *";
                $x = 2;
            }
            for my $word (@s) {
                # flush the current line if this word would not fit on it
                if (defined $line && $x + 1 + length $word > $width) {
                    push(@ret, $line);
                    push(@ret, '-' x length $line) if ($type eq 'h2');
                    undef $line;
                }
                if (!defined $line) {
                    $line = $prefix;
                    $x = length $prefix;
                }
                if ($x != 0) {
                    $line .= " ";
                    ++$x;
                }
                $line .= "$word";
                $x += length $word;
            }
            if (defined $line) {
                push(@ret, $line);
                push(@ret, '-' x length $line) if ($type eq 'h2');
            }
        }
        elsif ($type eq "pre") {
            push(@ret, "") if ($#ret >= 0 && $ret[$#ret] ne "");
            push(@ret, split(/\n/, $value));
        }
        elsif ($type eq "table") {
            # A table without any cells has nothing to draw, and would
            # otherwise divide by zero below.
            next if !$cols;
            push(@ret, "") if ($#ret >= 0 && $ret[$#ret] ne "");
            # for each row, fit into 1/N of the space available
            # XXX do something more sophisticated
            my $colwidth = ($width - 1) / $cols - 1;
            # Format every cell once; both passes below reuse the result.
            my @formatted_rows;
            for my $row (@$value) {
                push(@formatted_rows,
                     [ map { [ &format($_, $colwidth) ] } @$row ]);
            }
            # this will be the maximum column width
            my @max;
            # Pass 1 measures the columns, pass 2 draws the table.
            for my $try (1, 2) {
                my $divider;
                my $total;
                if ($try == 2) {
                    $total = -1;
                    for my $w (@max) {
                        $total += $w + 1;
                    }
                    push(@ret, "," . ("-" x $total) . ".");
                    $divider = "+";
                    for my $w (@max) {
                        $divider .= "-" x $w;
                        $divider .= "+";
                    }
                }
                my $first = 1;
                for my $formatted (@formatted_rows) {
                    if ($first) {
                        $first = 0;
                    }
                    elsif ($try == 2) {
                        push(@ret, $divider);
                    }
                    # pick apart the formatted cells row by row, and
                    # reassemble
                    my $r = 0;
                    for (;;) {
                        my $line = "|";
                        my $keepgoing = 0;
                        my $c = 0;
                        for my $cell (@$formatted) {
                            my $part;
                            if ($r <= $#$cell) {
                                $part = $$cell[$r];
                                $keepgoing = 1;
                            }
                            else {
                                $part = "";
                            }
                            my $pl = length $part;
                            if ($try == 1) {
                                # allow one extra column where there is room
                                ++$pl if ($pl < $colwidth);
                                if (!defined $max[$c] || $pl > $max[$c]) {
                                    $max[$c] = $pl;
                                }
                            }
                            else {
                                $part .= " " x ($max[$c] - $pl)
                                    if ($pl < $max[$c]);
                                $line .= "$part|";
                            }
                            ++$c;
                        }
                        # stop once no cell has a line left at this depth
                        last if !$keepgoing;
                        if ($try == 2) {
                            push(@ret, $line);
                        }
                        ++$r;
                    }
                }
                if ($try == 2) {
                    push(@ret, "`" . ("-" x $total) . "'");
                }
            }
        }
    }
    return @ret;
}
# Command line:  [-debug ...] [--] [file ...]
#
# Each named file is converted and printed to standard output in turn.
# "-debug" raises the debug level; "--" ends option processing, so any
# later argument is taken as a file name even if it starts with "-".
# With no file arguments, standard input is converted instead.

# Parse one input (a file name or a filehandle) and print its plain-text
# rendering to STDOUT.  Dies if the input cannot be opened or the output
# cannot be written.
sub convert_input {
    my ($source) = @_;
    # start every document from a clean state
    $doc = [];
    $count = 1;
    %names = ();
    @stack = ();
    # parse_file returns undef when a named file cannot be opened
    defined $parser->parse_file($source)
        or die "$0: $source: $!\n";
    (print STDOUT map("$_\n", &format($doc, $width)))
        or die "$0: writing to stdout: $!\n";
    return;
}

my $files = 0;
my $options_done = 0;
while (@ARGV) {
    my $arg = shift @ARGV;
    if (!$options_done && $arg =~ /^-/) {
        if ($arg eq "--") {
            # Only option processing stops here.  Previously the whole loop
            # ended, so files named after "--" were ignored and standard
            # input was read instead.
            $options_done = 1;
        }
        elsif ($arg eq "-debug") {
            ++$debug;
        }
        else {
            die "$0: unknown option '$arg'\n";
        }
        next;
    }
    convert_input($arg);
    ++$files;
}
convert_input(*STDIN) if !$files;
(close STDOUT) or die "$0: closing stdout: $!\n";