use warnings; use strict; use utf8; use Fcntl; use HTML::TreeBuilder; use HTML::Entities; use HTML::TableContentParser;

=head1 HTML to TWiki text file converter

This script takes an HTML document generated by exporting a Word document as HTML and generates TWiki pages suitable for dropping directly into the TWiki folder.

Two passes are required by html2wiki. The first pass generates a translation file containing various substitution strings that control behaviour of the conversion process in the second pass. The translation file will require editing before the second pass if you expect good results.

Note that the entire document is processed in memory so there may be performance issues when large documents are being processed.

Some manual preprocessing of the document to remove tables of contents, tables of figures and indexes will yield a better result. Generally the search facilities provided by TWiki obviate the need for such tables in any case.

Word tables can cause all sorts of grief. Watch where tables are anchored - html2wiki gets confused at times and may either split tables or lose parts of a table if the table is anchored to a heading paragraph. Heading lines in tables can also cause trouble, resulting in the heading cells being put into separate rows.

=head2 Running html2wiki

For both passes html2wiki takes the path to the HTML document to be converted on the command line.

The first pass generates a translation file in the document's location with the document's name and a .html2wiki extension.

The second pass generates a file or files in the document's location with names generated using an abbreviated form of the document's name as a base and incorporating an abbreviated form of the headings text.

During either pass html2wiki may generate warning or error messages.

=head2 Translation File

The translation file consists of a number of sections.
Each section is a list of lines containing configuration information or comprising a match key, substitution text, substitution parameters and (optional) context text.

Any line starting with a # is a comment and is ignored by html2wiki except the special #= section markers.

=head3 XlateParams

The translate parameters section provides general configuration information. The parameters are commented where necessary and generally comprise the following entries:

=head4 "WikiNameRoot"

Provides the base for the generated file names. This name must conform to TWiki file naming conventions which follow WikiWord conventions.

=head4 "SamePageHeader"

Provides the header level below which information is kept in the same topic page. html2wiki will generate new topic pages (new files) for each heading it finds down to this level.

=head4 "ParentTopicName"

This is the WikiWord name of the (parent) topic which contains the link to the first topic page of the generated files. This should always be provided. If it is not then the topic linking information provided by TWiki when browsing the generated pages will be broken.

=head4 "AuthorsWikiName"

This is the WikiWord name of the putative author of the generated files.

=head3 Subs

The "subs" section allows substitution strings to be provided. The substitutions are performed on the generated TWiki text.

=head3 ElementSubs

This section allows management of HTML elements by either ignoring them or pretending that they are a different type of element. This can be used to translate one type of element into another, for example, or to translate particular paragraph styles to heading elements.

Use "-" to have an element ignored. Note that this does not generally ignore the contents of the element, but will suppress the direct effects of the element. For example a paragraph element with a particular set of attributes may be ignored so that it doesn't generate a paragraph break (this may be useful in tables).

=cut

#The following variables are initialised in either the main block
#or in LoadXlateFile. They are used in many places.
my $OriginalName;    # Name of the source (HTML) file
my @WikiFiles;       # Files generated and referenced (ie image files)
my $TranslateName;   # Name of the translation file
my %XlateParams;     # Hash of general translation parameter names and values
my %ElementSubs;     # Control processing for various elements
my %Anchors;         # Active anchors. A name and matching href are required for linking
my @Subs;            # Substitution pairs for the final pass
my %ParentPageNames; # Names that have been used for parent pages
my $Noisy = 0;       # Generate progress messages

# Abbreviate is used to generate abbreviated file names from headings for topic
# pages.
#
# Takes a list of words. For each word the first character is kept as is and
# the remainder is lower cased. If the remainder is longer than four characters
# and contains no digit it is also stripped of non-letters and vowels, runs of
# a repeated character are squeezed and a few consonant clusters are shortened.
#
# Returns the abbreviated words as a list in list context, or joined with
# single spaces in scalar context. Undefined and empty words are skipped.
sub Abbreviate {
    my @Result;

    foreach my $Arg (@_) {
        # Test definedness/emptiness rather than truth so that a word such as
        # "0" is abbreviated instead of ending the loop early.
        next if ! defined $Arg or $Arg eq "";

        my $Rest = $Arg;    # Work on a copy; @_ aliases the caller's values
        my $Abbrev = substr $Rest, 0, 1, ""; #Retain first char and preserve its case

        $Rest =~ tr/A-Z/a-z/;
        if (length ($Rest) > 4 and $Rest !~ /[0-9]/) {
            $Rest =~ tr/A-Za-z//cd;
            $Rest =~ tr/aeiou//d;
            $Rest =~ s/(.)\1+/$1/gi;
            $Rest =~ s/ck/k/g;
            $Rest =~ s/ptn/pn/g;
            $Rest =~ s/tng/tg/g;
            $Rest =~ s/thr/tr/g;
            $Rest =~ s/vnt/vt/g;
            $Rest =~ s/ltn/ln/g;
            $Rest =~ s/lb/b/g;
            $Rest =~ s/tt/t/g;
        }

        push @Result, $Abbrev . $Rest;
    }

    return wantarray ? @Result : join " ", @Result;
}

#ScanForTagSubs scans the tree to identify all the tags that are used in the
#document for inclusion in the translation file.
#
#Default actions are provided for a small number of common tags and attributes.
#
#Every element's start tag (with line breaks removed) becomes a key of
#%ElementSubs; the value is the default action chosen below.
#
# NOTE(review): the action strings below look like extraction residue. The
# values were presumably replacement tags (e.g. heading tags for the font
# sizes) that have been stripped, leaving only empty strings and line breaks.
# The final pattern is now empty as well; an empty pattern in Perl re-uses the
# last successfully matched pattern. Restore from the original script before
# relying on these defaults.
sub ScanForTagSubs {
    my $Tree = shift;
    my $MaxPos = 0;
    my @Tags = $Tree->look_down ("_tag", qr/./);

    foreach my $Element (@Tags) {
        my $Tag = $Element->starttag ();
        my $Action = "";

        if    ($Tag =~ /courier/i) {$Action = ""}
        elsif ($Tag =~ /size=7/i)  {$Action = "\n\n"}
        elsif ($Tag =~ /size=6/i)  {$Action = "\n\n"}
        elsif ($Tag =~ /size=5/i)  {$Action = "\n\n"}
        elsif ($Tag =~ /size=4/i)  {$Action = "\n\n"}
        elsif ($Tag =~ /size=3/i)  {$Action = "\n\n"}
        elsif ($Tag =~ /size=2/i)  {$Action = "\n"}
        elsif ($Tag =~ //i)        {$Action = ""}

        # Fold multi-line start tags onto a single line so they can be used
        # as one-line keys in the translation file.
        $Tag =~ s/\G(.*?)[\n\r]+/$1/gs;
        $ElementSubs{"$Tag"} = $Action;
    }
}

#GenerateXlateFile builds the translation file that is used to control document
#conversion.
#
#Writes $TranslateName: a header comment, the #=XlateParams section taken from
#%XlateParams and one line per entry of %ElementSubs. Dies if the file cannot
#be created.
sub GenerateXlateFile ($) {
    my $Tree = shift;

    # Three argument open so that unusual characters in the document name
    # cannot be misread as part of the open mode.
    open (outFile, ">", $TranslateName) or die "Unable to create translation file: $TranslateName ($!)\n";
    print outFile "# The first string of each line is a key and should not be altered.\n";
    print outFile "# The second string is a value that may be altered as required Each\n";
    print outFile "# section describes permissable values.\n";
    print outFile "# Do not alter #= lines!\n";
    print outFile "\n";

    # The following return in %XlateParams
    my @Keys = (
        ["WikiNameRoot", "# Root name for the generated wiki files"],
        ["SamePageHeader", "# All headers below this number generate a new wiki page"],
        ["ParentTopicName", "# Name of the parent wiki page"],
        ["AuthorsWikiName", "# WikiName of user attributed with generating the pages"],
    );

    print outFile "#=XlateParams\n";
    print outFile "# WikiNameRoot must be valid as part of a file name and should be unique in the intended TWiki context.\n";
    for my $key (@Keys) {
        printf outFile "%-20s %-20s %s\n", "\"$key->[0]\",", "\"$XlateParams{$key->[0]}\"", "$key->[1]"
    }

    my $ContextStr;
    my $Action;

    # The following return in @Subs
    # NOTE(review): the statement below is extraction-damaged. It appears to
    # have been a here-document (terminated by "ElementSubs") that wrote the
    # remaining section markers and their explanatory comments; most of its
    # text has been lost and it is kept exactly as found. Restore it from the
    # original script.
    print outFile <\" Replace tags with given tag # Automatic linking for an href can be suppressed by deleting either # or both of the or entries below. ElementSubs

    foreach my $Key (sort keys %ElementSubs) {
        $Action = $ElementSubs{$Key};
        printf outFile "%-20s %s\n", "\"$Key\",", "\"$Action\"";
    }

    close outFile;
}

#Assumes $TranslateName has been set. Parses translation file to extract the
#document conversion parameters.
sub LoadXlateFile {
    # Reads the translation file named by $TranslateName, if it exists, and
    # fills in the conversion tables (%Anchors and %ElementSubs are visibly
    # written below). Returns silently when the file cannot be opened.
    #
    # NOTE(review): this sub is extraction-damaged. The readline operand of
    # the while loop and most of the section parsing code (everything between
    # the comment stripping substitution and the anchor handling) have been
    # lost. What remains is kept exactly as found; restore the sub from the
    # original script.
    my $SkipTags = shift || "";
    my $State = "Searching";
    open (inFile, "<$TranslateName") or return;
    while () {
        chomp;
        s/(^#(?!=).*|(?/gi;
                $Link =~ tr/a-zA-Z0-9_//dc;
                $Link =~ s/^[0-9_]*/LinK/;
                ++$Anchors{"$Link:$Mode"};
            } else {
                $ElementSubs {$Tag} = $Action;
            }
        } else {
            print STDERR "Don't know how to handle $State.\n";
            $State = "Searching";
        }
    }
    close inFile;
}

#PrintHeader prints TWiki topic page file meta headers.
#
#outFile is a file handle.
#parentTopicName is the name of the parent page to this one.
#
# NOTE(review): OutputTWiki ends every page with "</noautolink>" but no opening
# tag is written anywhere in the visible code. The final print below may have
# lost a "<noautolink>" to tag stripping - confirm against the original script.
sub PrintHeader ($$) {
    my $outFile = shift;
    my $ParentTopicName = shift;
    my $AuthorsWikiName = $XlateParams {"AuthorsWikiName"};
    my $now = time;

    print $outFile "%META:TOPICINFO{author=\"$AuthorsWikiName\"".
        " date=\"$now\" format=\"1.0\" version=\"1.2\"}%\n";
    print $outFile "%META:TOPICPARENT{name=\"$ParentTopicName\"}%\n";
    print $outFile "\n";
}

#FinalFixup performs a final pass through the created files to fix up anchor
#links and poorly handled symbol translations.
#
#FinalFixup also tidies up tables by increasing the number of cells in each row
#to match the number in the row containing the greatest number of cells. This
#causes the last cell on each such widened row to span the remaining width of
#the table.
#
#Each file in @WikiFiles is rewritten through a temporary "~<name>" file. A
#file that cannot be read is reported and left untouched; failure to create
#the temporary file is fatal so that the original is never unlinked without a
#replacement.
sub FinalFixup () {
    foreach my $Filename (@WikiFiles) {
        my $In;
        if (! open ($In, "<", $Filename)) {
            print STDERR "Unable to read $Filename for final fix up ($!)\n";
            next;
        }
        my @Lines = <$In>;
        close $In;

        my $LineNum = 0;
        my $TableStart = undef;     # Index of the first row of the current table
        my $TableEnd = undef;       # Index of the last row seen so far
        my $CellCount = 0;          # Greatest number of "|" seen in a row

        foreach my $Line (@Lines) {
            chomp $Line;
            my $IsRow = $Line =~ m/^\|/;

            # Use defined rather than truth: a table may start on line 0.
            $TableStart = $LineNum if $IsRow and ! defined $TableStart;

            if (defined $TableStart and $IsRow) {# Scan table lines
                my $cells = $Line =~ tr/|//;
                $CellCount = $cells if $cells > $CellCount;
                $TableEnd = $LineNum;
            }

            if (defined $TableStart and ! $IsRow) {# End of table
                # Pad short rows with extra cell separators
                foreach my $Row (@Lines [$TableStart .. $TableEnd]) {
                    my $cells = $Row =~ tr/|//;
                    $Row .= "|" x ($CellCount - $cells) if $cells < $CellCount;
                }
                $CellCount = 0;
                $TableStart = undef;
            }

            my $RefPos = index ($Line, '[[#');
            if ($RefPos != -1) {# Fix up the reference
                # Replace "#anchor" with the "Page#anchor" recorded in %Anchors
                my ($Ref) = $Line =~ /\[\[#(.*?)]/g;
                substr $Line, $RefPos + 2, length ($Ref) + 1, $Anchors{$Ref};
            }

            # Fix up symbols
            foreach my $Row (@Subs) {
                my @Pair;
                @Pair [0, 1] = @$Row;
                $Line =~ s/\Q$Pair[0]\E/$Pair[1]/g;
            }
        } continue {++$LineNum;}

        my $Out;
        open ($Out, ">", "~$Filename") or die "Unable to create ~$Filename ($!)\n";
        print $Out join "\n", @Lines;
        close $Out;
        unlink $Filename;
        rename "~$Filename", $Filename;
    }
}

#Handle a table element. Try to prevent table nastiness escaping to the
#remainder of the document!
sub ConvertTable {
    my $this = shift;
    return ConvertTableHTML ($this->as_HTML());
}

#Rebuilds a table from the rows and cells reported by HTML::TableContentParser
#and converts the result with Convert in table mode.
#
# NOTE(review): extraction-damaged. The string literals that wrapped tables,
# rows and cells in markup are now empty, and the second alternative of the
# substitution pattern has been reduced to a line break. Values are kept as
# found; restore them from the original script.
sub ConvertTableHTML {
    my $tableAsHTML = shift;
    my $tp = HTML::TableContentParser->new;
    my $tableCleanHTML = '';

    $tp->parse ($tableAsHTML);
    for my $table (@{$tp->parse($tableAsHTML)}) {
        for my $row (@{$table->{rows}}) {
            $tableCleanHTML .= '';
            for my $cell (@{$row->{cells}}) {
                my $data = $cell->{data} || '';
                $data =~ s/\n|\n//g if defined $data;
                $tableCleanHTML .= "$data";
            }
            $tableCleanHTML .= "\n";
        }
    }

    return Convert ($tableCleanHTML, 1);
}

#Parses the given HTML text with HTML::TreeBuilder and returns the resulting
#tree as delivered by the builder's guts method.
sub BuildTree ($) {
    my $Tree = HTML::TreeBuilder->new ();

    $Tree->ignore_unknown (0);
    $Tree->attr_encoded (0);
    $Tree->parse (shift);
    $Tree->eof ();
    $Tree = $Tree->guts ();
    return $Tree;
}

#Does the HTML to TWiki conversion using the translation tables and parameters
#that have already been read in from the translation file.
#
#Takes the HTML text and an optional table mode flag (ConvertTableHTML passes
#1). Returns the TWiki text, or '' if no tree could be built.
#
#The tree is walked with explicit stacks. The walk first produces an
#intermediate text that still contains a few simple tags; the substitutions
#after the loop turn those into TWiki mark up.
sub Convert ($) {
    my ($html, $tableMode) = @_;
    $tableMode ||= 0;
    my $Tree = BuildTree ($html);

    return '' if ! defined $Tree;

    my $WikiText;
    my $empty_element_map = $Tree->_empty_element_map;
    my(@C) = [$Tree]; # a stack containing lists of children
    # I is a stack of indexes to current position in corresponding lists in @C
    # In each of these, 0 is the active point
    my(@I) = (-1); # initial value must be -1 for each list
    my @Context = ""; # Contains stack of current nodes
    my @AnchorStack;
    my @QueuedAnchors; # Anchors queued for the end of a table
    my $this; # current node
    my $content_r; # child list of $this
    my $TagName;
    my $InOList = 0;
    my $InUList = 0;
    my $ParaCount = 0;
    my $InHeader = 0;

    # Loop over the tree
    while (@C) {
        # Post processing
        # Move to next item in this frame
        if(!defined($I[0]) or ++$I[0] >= @{$C[0]}) {
            $this = $Context [0];
            if (defined $this and ref $this) {
                my $StartTag = $this->starttag ();
                my $Action = $ElementSubs {$StartTag};
                $StartTag = $Action if defined ($Action) and $Action ne "";

# NOTE(review): from here to the "Now queue up content list" comment the code
# is extraction-damaged: tag patterns, tag strings and whole statements have
# been stripped (including whatever popped the stacks and handled text nodes),
# so quotes and braces no longer balance. The text is kept exactly as found,
# including the line breaks inside the damaged strings. Also note that
# "! $StartTag =~ ..." below negates $StartTag before matching because "!"
# binds tighter than "=~" - check the intent when restoring.
if ($StartTag =~ /^" : "

" if ! $InOList and ! $InUList; } elsif ($StartTag =~ /^/gi;
$Link =~ tr/a-zA-Z0-9_//dc;
$Link =~ s/^[0-9_]*/LinK/;
my $Text = shift @AnchorStack;
$Text = 'here' if ! defined $Text;
$WikiText .= "
" if defined $Anchors{"$Link:name"}; } elsif ($StartTag =~ /^/gi;
$Link =~ tr/a-zA-Z0-9_//dc;
$Link =~ s/^[0-9_]*/LinK/;
if (defined $Anchors{"$Link:href"}) { my $Text; $Text .= "
" if ! ($WikiText =~ /
$/); $Text .= "

"; $WikiText .= $Text; } } elsif ($StartTag =~ /^/g if ! $InOList and ! $InUList and ! $tableMode;
chomp $this;
$this =~ s/\xA0/ /gs;
$this =~ s/\x09/<3sp>/gs;
if (@AnchorStack) {
    my $Index = 0;
    while ($Index < @AnchorStack) {
        $AnchorStack[$Index] .= $this;
        ++$Index;
    }
} else {
    $WikiText .= $this;
}
} else {# Process this element
my $StartTag = $this->starttag ();
$StartTag =~ s/[\r\n]*//gs;
# Ignore elements nested in headers except anchors
if ($InHeader and ! $StartTag =~ /^delete_content (); }
elsif ($StartTag =~ /^/) {$WikiText .= ""}
elsif ($StartTag =~ /^/) {$WikiText .= ""}
elsif ($StartTag =~ /^/) {$WikiText .= "<h1>"}
elsif ($StartTag =~ /^(<li[aAiI]?)\b/) {$WikiText .= $InOList ? "$1>" : "<li>";}
elsif ($StartTag =~ /^<a href\b/) {
    unshift @AnchorStack, "";
} elsif ($StartTag =~ /^<ol\b/) {
    $WikiText .= "<br><br>" if ! $InOList and ! $InUList;
    ++$InOList;
} elsif ($StartTag =~ /^<ul\b/) {
    $WikiText .= "<br><br>" if ! $InOList and ! $InUList;
    ++$InUList;
} elsif ($StartTag =~ /^<img\b/) {
    $WikiText .= $StartTag;
}
}

        # Now queue up content list for the current element...
        if( ref $this
            and not ( # ...except for those which
                not($content_r = $this->{'_content'} and @$content_r)
                and # ...have empty content lists
                $this->{'_empty_element'} || $empty_element_map->{$this->{'_tag'} || ''}
                # ...and that don't get post-order callbacks
            )
        ) {
            unshift @Context, $this;
            unshift @I, -1;
            unshift @C, $content_r || [];
        }
    }

    print "Generated marked up TWiki\n" if $Noisy;

    my $sp = qr/(?:\s|<3sp>)/;
    my $face = qr/(?:b|i|code)/;
    my $pre = qr/(?:<ul>|<ol>|<h[1-6]>|<a [a-zA-Z0-9_]+>)/;

    $WikiText =~ s/<br>\s+/<br>/g; # Remove leading spaces
    $WikiText =~ s/\s+<br>/<br>/g; # Remove trailing spaces
    $WikiText =~ s/(<h[1-6]>)(?:<br>)+/$1/g;
    print "Removed spurious white space at line ends\n" if $Noisy;

    # Repeat until a pass makes no further change
    my $Touched;
    do {
        $Touched = 0;
        # Remove multiple blank lines
        $Touched |= $WikiText =~ s/(?:<br>){3,}/<br><br>/g;
        # Remove empty face elements
        $Touched |= $WikiText =~ s/<($face)>((?:<br>|$sp)*)<\/\1>/$2/g;
        # Migrate various tags adjacent to text
        $WikiText =~ s/((?:<br>)+)(<a [^>]*:.*?>)/$2$1/g;
        $Touched |= $WikiText =~ s/(<$face>)((?:<br>|$sp|$pre|\W+)+)/$2$1/g;
        $Touched |= $WikiText =~ s/((?:<br>|$sp|$pre|\W+)+)<\/($face)>/<\/$2>$1/g;
    } while ($Touched);
    print "Removed various empty elements\n" if $Noisy;

    $WikiText =~ s/<br><br>($pre)/<br>$1/g;
    $WikiText =~ s/($pre)($pre)/$1<br>$2/g;
    # Combined faces first, then the single face markers
    $WikiText =~ s/(<\/?code><\/?b>|<\/?b><\/?code>)/==/g;
    $WikiText =~ s/(<\/?i><\/?b>|<\/?b><\/?i>)/__/g;
    $WikiText =~ s/<\/?b>/*/g;
    $WikiText =~ s/<\/?i>/_/g;
    $WikiText =~ s/<\/?code>/=/g;
    $WikiText =~ s/($pre|<br>)(?:$sp)+/$1/g;

    # TWiki list items must be indented by three spaces. The runs of spaces
    # in the following replacements had been collapsed to one space.
    $WikiText =~ s/<br>(?:•|<ul>)/<br>   * /g;
    $WikiText =~ s/<ol>/<br>   1 /g;
    $WikiText =~ s/<ol([a|A|i|I])>/<br>   $1 /g;
    print "Inserted list line prefixes\n" if $Noisy;

    $WikiText =~ s/<3sp>/   /g;
    # The embedded code block builds one "+" per heading level into $^R
    $WikiText =~ s/<h([1-6])>(?{"+"x$1})/<br>---$^R /g;
    print "Inserted header line prefixes\n" if $Noisy;

    $WikiText =~ s/<a ([^>]*):(.*?)>/[[#$1][$2]]/g;
    $WikiText =~ s/<a (.*?)>/#$1/g;
    print "Inserted links\n" if $Noisy;

    # Put in the line breaks
    $WikiText =~ s/<br>/\n/gs;
    $WikiText =~ s/\n   \* \n/\n/gs;    # Drop empty bullet items
    $WikiText =~ s/^\n+//gs;
    print "Restored line breaks\n" if $Noisy;

    return $WikiText;
}

#FileAttachmentMeta returns the %META:FILEATTACHMENT line (newline terminated)
#for an attached image of the given name and size.
sub FileAttachmentMeta {
    my ($ImageName, $Size) = @_;

    return "%META:FILEATTACHMENT{name=\"" . $ImageName
        . "\" attr=\"h\" comment=\"\" date=\"" . time ()
        . "\" path=\"" . $ImageName
        . "\" size=\"" . $Size
        . "\" user=\"" . $XlateParams {"AuthorsWikiName"}
        . "\" version=\"1.1\"}%\n";
}

#OutputTWiki splits the converted text into topic page files.
#
#Headings above the SamePageHeader level start a new page that is linked from
#its parent page; everything else is written to the current page. Images that
#exist on disk are copied alongside the pages and recorded as attachments.
#Anchors are recorded in %Anchors together with the page that holds them, and
#FinalFixup is run once all pages have been written. Dies if a page file
#cannot be created.
sub OutputTWiki ($) {
    my @Lines = split /\n/, shift;
    my @Files;          # Stack of open page handles, current page first
    my @HeaderOffset;   # Heading level that opened each page on the stack
    my @PageTag;        # Page names; the entry after a page is its parent
    my $WikiNameRoot = $XlateParams {"WikiNameRoot"};
    my $SamePageHeader = $XlateParams {"SamePageHeader"};
    my %ImageFiles;     # Image file names already used
    my %AddedImages;    # Size of each image attached to the current page
    my $FirstHeader = 1;

    unshift @PageTag, $XlateParams {"ParentTopicName"};
    unshift @PageTag, "$WikiNameRoot";
    $ParentPageNames {$PageTag[0]} = 1;
    unshift @WikiFiles, ("$PageTag[0].txt");
    open ($Files [0], ">", $WikiFiles[0]) or die "Unable to create $WikiFiles[0] ($!)\n";
    PrintHeader ($Files [0], $PageTag[1]);
    $HeaderOffset [0] = 1;

    foreach my $Line (@Lines) {
        # Fix up cell breaks
        $Line =~ s/<brr?>\|/|/gi;
        $Line =~ s/\|<brr?>/|/gi;
        $Line =~ s/(^<brr?>\s*|\s*<brr?>$)//gi;
        $Line =~ s/<brr>/<br>/gi;

        my ($Plusses) = $Line =~ /^---(\++)/g;

        # Image file processing
        if ($Line =~ /(<img\b.*?src="(.*?)".*?>)/ && -e $2) {
            my $Filename = "$WikiNameRoot$2";

            if (defined $ImageFiles {$Filename}) {$Filename .= "-" . ++$ImageFiles {$Filename};}
            else {$ImageFiles {$Filename} = 1;}

            sysopen inFile, $2, O_BINARY | O_RDONLY;
            sysopen outFile, $Filename, O_BINARY | O_WRONLY | O_CREAT;

            my $Buffer;
            my $Len;

            while ($Len = sysread inFile, $Buffer, 2048) {syswrite outFile, $Buffer, $Len;}
            close inFile;
            close outFile;
            $AddedImages {$Filename} = -s $Filename;

            my $Link = "<img src=\"%ATTACHURLPATH%/$Filename\" alt=\"$Filename\"/>";

            # $-[0] and $1 still refer to the <img> match above
            substr $Line, $-[0], length $1, $Link;
        }

        # Anchor processing
        if ($Line =~ /^#/) {# Anchor
            my ($Tag) = $Line =~ /^#(\w+)/;
            $Anchors {$Tag} = "$PageTag[0]#$Tag";
        }

        if (defined $Plusses) {$Line =~ s/\*(.*)\*/$1/g;}
        if (! defined $Plusses or length ($Plusses) >= $SamePageHeader) {# Non-header line or same page header
            print {$Files [0]} $Line."\n";
            next;
        }

        # Header processing
        $Plusses = length ($Plusses);
        while ($HeaderOffset [0] >= $Plusses and @Files > 1) {# Pop a level
            print {$Files [0]} "</noautolink>\n";
            foreach my $ImageName (sort keys %AddedImages) {
                print {$Files [0]} FileAttachmentMeta ($ImageName, $AddedImages {$ImageName});
            }
            %AddedImages = ();
            close $Files [0];
            shift @Files;
            shift @HeaderOffset;
            shift @PageTag;
        }

        # Now on right page for header
        if ($Plusses == 1 && $FirstHeader or $Plusses >= $SamePageHeader) {# Stay on same page
            $FirstHeader &&= $Plusses != 1;
            print {$Files [0]} $Line."\n";
            next;
        }

        # Push a level
        my $HeaderText = (substr $Line, $Plusses + 3);
        my $Tag = $HeaderText;

        $Tag =~ tr/0-9A-Za-z //cd;
        $Tag =~ tr/a-z/A-Z/;
        $Tag = join "", Abbreviate (split " ", $Tag);

        # Make the page name unique by appending a counter to repeats
        my $PageName = $WikiNameRoot.$Tag;
        if (exists $ParentPageNames{$PageName}) {
            $PageName .= ++$ParentPageNames {$PageName};
        } else {
            $ParentPageNames {$PageName} = 1;
        }

        unshift @PageTag, $PageName;
        unshift @WikiFiles, ("$PageTag[0].txt");
        unshift @HeaderOffset, $Plusses;
        # Leave a link to the new page in the parent page
        print {$Files [0]} "---" . "+" x $Plusses . "[[$PageTag[0]][$HeaderText]]\n";
        unshift @Files, undef;
        open ($Files [0], ">", $WikiFiles[0]) or die "Unable to create $WikiFiles[0] ($!)\n";
        PrintHeader ($Files [0], $PageTag[1]);
        print {$Files [0]} $Line."\n";
        print "$WikiFiles[1] parent of $WikiFiles[0]\n" if $Noisy;
    }

    foreach my $ImageName (sort keys %AddedImages) {
        print {$Files [0]} FileAttachmentMeta ($ImageName, $AddedImages {$ImageName});
    }
    %AddedImages = ();

    # Close the pages that are still open
    do {
        print {$Files [0]} "</noautolink>\n";
        close $Files [0];
        shift @Files;
    } while (@Files);

    FinalFixup ();
    print "Done\n";
}

#HelpAndExit prints any messages passed to it followed by the usage text and
#then exits with status -1.
sub HelpAndExit {
    print shift while scalar @_;
    print "\n";
    print "html2wiki [-n] <source document>\n";
    print "\n";
    print " -n Turn on noisy mode. Lists character and tag/attribute substitutions\n";
    print " as they are found in the first pass and progress in the second pass.\n";
    print " <source document> The document to convert from HTML to TWiki format.\n";
    exit (-1);
}

#SetDefaults sets the default translation parameters using the given wiki name
#root and then calls LoadXlateFile ("ElementSubs ").
sub SetDefaults ($) {
    $XlateParams {"WikiNameRoot"} = shift;
    $XlateParams {"SamePageHeader"} = 4;
    $XlateParams {"ParentTopicName"} = "Main";
    $XlateParams {"AuthorsWikiName"} = "TWikiGuest";
    LoadXlateFile ("ElementSubs ");
}

#The main block processes the command line to retrieve the file name of the
#document to process. It then checks to see if the translation file exists and
#either calls GenerateXlateFile to generate it, or calls LoadXlateFile and
#Convert to perform the conversion.
my $Param = shift;

HelpAndExit if ! defined $Param;
if ($Param =~ /^-n$/) {
    $Noisy = 1;
    $OriginalName = shift;
} else {$OriginalName = $Param;}

HelpAndExit ("Document file name required\n") if ! defined $OriginalName;
HelpAndExit ("Document file <$OriginalName> not found\n") if ! (-e $OriginalName);

# Derive the wiki name root from the document name without its extension
(my $WikiNameRoot = $OriginalName) =~ s/(.*)\..*?$/$1/;
# Dashes and underscores separate words; everything else that is not a space,
# letter or digit is a "nasty" character and is stripped. (The previous single
# tr/// had a misaligned replacement list and remapped letters.)
$WikiNameRoot =~ tr/\-_/  /;
$WikiNameRoot =~ tr/ a-zA-Z0-9//cd;

my @Words;

push @Words, ucfirst $_ foreach split " ", $WikiNameRoot;
$WikiNameRoot = join "", Abbreviate (@Words);
$TranslateName = "$OriginalName.html2wiki";
SetDefaults ($WikiNameRoot);

# Load the whole document as a single line of text
my $html = "";

open (inFile, "<", $OriginalName) or die "Unable to read $OriginalName ($!)\n";
while (<inFile>) {
    chomp;
    # Keep words apart when a line break separated them
    if (($html =~ /[^\s>]$/) and (/^\b/)) {$html .= " ";}
    # NOTE(review): as it stands this replaces a space with a space; the
    # pattern presumably named a non-breaking space before extraction - confirm.
    s/ / /g;
    $html .= $_;
}
close inFile;
$html =~ s/[\r\n]+/ /gs;
$html =~ s/<!--.*?-->//g; # Delete comments
print "Loaded $OriginalName\n" if $Noisy;
print "Parsed html\n" if $Noisy;

# First pass when the translation file is missing or older than the document
if (! (-e $TranslateName) || -M $TranslateName > -M $OriginalName) {
    my $Tree = BuildTree ($html);

    GenerateXlateFile ($Tree);
    print "Translate file has been generated as: $TranslateName";
} else {
    print "Converting to TWiki\n" if $Noisy;
    LoadXlateFile ();
    print "Loaded configuration information\n" if $Noisy;
    OutputTWiki (Convert ($html));

    if (1 == @WikiFiles) {
        print "Conversion is complete. The following file was generated: $WikiFiles[0]\n";
    } else {
        print "Conversion is complete. The following files were generated:\n";
        # print, not printf: a file name must never be used as a format
        print " $_\n" while $_ = pop @WikiFiles;
    }
}