/) {$WikiText .= "<h1>"} elsif ($StartTag =~ /^(<li[aAiI]?)\b/) {$WikiText .= $InOList ? "$1>" : "<li>";} elsif ($StartTag =~ /^<a href\b/) { unshift @AnchorStack, ""; } elsif ($StartTag =~ /^<ol\b/) { $WikiText .= " " if ! $InOList and ! $InUList; ++$InOList; } elsif ($StartTag =~ /^<ul\b/) { $WikiText .= " " if ! $InOList and ! $InUList; ++$InUList; } elsif ($StartTag =~ /^<img\b/) { $WikiText .= $StartTag; } } # Now queue up content list for the current element... if( ref $this and not ( # ...except for those which not($content_r = $this->{'_content'} and @$content_r) and # ...have empty content lists $this->{'_empty_element'} || $empty_element_map->{$this->{'_tag'} || ''} # ...and that don't get post-order callbacks ) ) { unshift @Context, $this; unshift @I, -1; unshift @C, $content_r || []; } } print "Generated marked up TWiki\n" if $Noisy; my $sp = qr/(?:\s|<3sp>)/; my $face = qr/(?:b|i|code)/; my $pre = qr/(?:<ul>|<ol>|<h[1-6]>|<a [a-zA-Z0-9_]+>)/; $WikiText =~ s/ \s+/ /g; # Remove leading spaces $WikiText =~ s/\s+ / /g; # Remove trailing spaces $WikiText =~ s/(<h[1-6]>)(?: )+/$1/g; print "Removed spurious white space at line ends\n" if $Noisy; my $Touched; do { $Touched = 0; # Remove multiple blank lines $Touched |= $WikiText =~ s/(?: ){3,}/ /g; # Remove empty face elements $Touched |= $WikiText =~ s/<($face)>((?: |$sp)*)<\/\1>/$2/g; # Migrate various tags adjacent to text $WikiText =~ s/((?: )+)(<a [^>]*:.*?>)/$2$1/g; $Touched |= $WikiText =~ s/(<$face>)((?: |$sp|$pre|\W+)+)/$2$1/g; $Touched |= $WikiText =~ s/((?: |$sp|$pre|\W+)+)<\/($face)>/<\/$2>$1/g; } while ($Touched); print "Removed various empty elements\n" if $Noisy; $WikiText =~ s/ ($pre)/ $1/g; $WikiText =~ s/($pre)($pre)/$1 $2/g; $WikiText =~ s/(<\/?code><\/?b>|<\/?b><\/?code>)/==/g; $WikiText =~ s/(<\/?i><\/?b>|<\/?b><\/?i>)/__/g; $WikiText =~ s/<\/?b>/*/g; $WikiText =~ s/<\/?i>/_/g; $WikiText =~ s/<\/?code>/=/g; $WikiText =~ s/($pre| )(?:$sp)+/$1/g; $WikiText =~ s/ (?:•|<ul>)/ * /g; $WikiText 
=~ s/<ol>/ 1 /g; $WikiText =~ s/<ol([a|A|i|I])>/ $1 /g; print "Inserted list line prefixes\n" if $Noisy; $WikiText =~ s/<3sp>/ /g; $WikiText =~ s/<h([1-6])>(?{"+"x$1})/ ---$^R /g; print "Inserted header line prefixes\n" if $Noisy; $WikiText =~ s/<a ([^>]*):(.*?)>/[[#$1][$2]]/g; $WikiText =~ s/<a (.*?)>/#$1/g; print "Inserted links\n" if $Noisy; # Put in the line breaks $WikiText =~ s/ /\n/gs; $WikiText =~ s/\n \* \n/\n/gs; $WikiText =~ s/^\n+//gs; print "Restored line breaks\n" if $Noisy; return $WikiText; } sub OutputTWiki ($) { my @Lines = split /\n/, shift; my @Files; my @HeaderOffset; my @PageTag; my $WikiNameRoot = $XlateParams {"WikiNameRoot"}; my $SamePageHeader = $XlateParams {"SamePageHeader"}; my %ImageFiles; my %AddedImages; my $FirstHeader = 1; unshift @PageTag, $XlateParams {"ParentTopicName"}; unshift @PageTag, "$WikiNameRoot"; $ParentPageNames {$PageTag[0]} = 1; unshift @WikiFiles, ("$PageTag[0].txt"); open ($Files [0], ">$WikiFiles[0]"); PrintHeader ($Files [0], $PageTag[1]); $HeaderOffset [0] = 1; foreach my $Line (@Lines) { # Fix up cell breaks $Line =~ s/<brr?>\|/|/gi; $Line =~ s/\|<brr?>/|/gi; $Line =~ s/(^<brr?>\s*|\s*<brr?>$)//gi; $Line =~ s/<brr>/ /gi; my ($Plusses) = $Line =~ /^---(\++)/g; # Image file processing if ($Line =~ /(<img\b.*?src="(.*?)".*?>)/ && -e $2) { my $Filename = "$WikiNameRoot$2"; if (defined $ImageFiles {$Filename}) {$Filename .= "-" . 
++$ImageFiles {$Filename};} else {$ImageFiles {$Filename} = 1;} sysopen inFile, $2, O_BINARY | O_RDONLY; sysopen outFile, $Filename, O_BINARY | O_WRONLY | O_CREAT; my $Buffer; my $Len; while ($Len = sysread inFile, $Buffer, 2048) {syswrite outFile, $Buffer, $Len;} close inFile; close outFile; $AddedImages {$Filename} = -s $Filename; my $Link = "<img src=\"%ATTACHURLPATH%/$Filename\" alt=\"$Filename\"/>"; substr $Line, $-[0], length $1, $Link; } # Anchor processing if ($Line =~ /^#/) {# Anchor my ($Tag) = $Line =~ /^#(\w+)/; $Anchors {$Tag} = "$PageTag[0]#$Tag"; } if (defined $Plusses) {$Line =~ s/\*(.*)\*/$1/g;} if (! defined $Plusses or length ($Plusses) >= $SamePageHeader) {# Non-header line or same page header print {$Files [0]} $Line."\n"; next; } # Header processing $Plusses = length ($Plusses); while ($HeaderOffset [0] >= $Plusses and @Files > 1) {# Pop a level print {$Files [0]} "</noautolink>\n"; foreach my $ImageName (sort keys %AddedImages) { my $MetaLine = "%META:FILEATTACHMENT{name=\""; $MetaLine .= $ImageName; $MetaLine .= "\" attr=\"h\" comment=\"\" date=\""; $MetaLine .= time; $MetaLine .= "\" path=\""; $MetaLine .= $ImageName; $MetaLine .= "\" size=\""; $MetaLine .= $AddedImages {$ImageName}; $MetaLine .= "\" user=\""; $MetaLine .= $XlateParams {"AuthorsWikiName"}; $MetaLine .= "\" version=\"1.1\"}%\n"; print {$Files [0]} $MetaLine; } %AddedImages = (); close $Files [0]; shift @Files; shift @HeaderOffset; shift @PageTag; } # Now on right page for header if ($Plusses == 1 && $FirstHeader or $Plusses >= $SamePageHeader) {# Stay on same page $FirstHeader &&= $Plusses != 1; print {$Files [0]} $Line."\n"; next; } # Push a level my $HeaderText = (substr $Line, $Plusses + 3); my $Tag = $HeaderText; $Tag =~ tr/0-9A-Za-z //cd; $Tag =~ tr/a-z/A-Z/; $Tag = join "", Abbreviate (split " ", $Tag); my $PageName = $WikiNameRoot.$Tag; if (exists $ParentPageNames{$PageName}) { $PageName .= ++$ParentPageNames {$PageName}; } else { $ParentPageNames {$PageName} = 1; } 
unshift @PageTag, $PageName; unshift @WikiFiles, ("$PageTag[0].txt"); unshift @HeaderOffset, $Plusses; print {$Files [0]} "---" . "+" x $Plusses . "[[$PageTag[0]][$HeaderText]]\n"; unshift @Files, undef; open ($Files [0], ">$WikiFiles[0]"); PrintHeader ($Files [0], $PageTag[1]); print {$Files [0]} $Line."\n"; print "$WikiFiles[1] parent of $WikiFiles[0]\n" if $Noisy; } foreach my $ImageName (sort keys %AddedImages) { my $MetaLine = "%META:FILEATTACHMENT{name=\""; $MetaLine .= $ImageName; $MetaLine .= "\" attr=\"h\" comment=\"\" date=\""; $MetaLine .= time; $MetaLine .= "\" path=\""; $MetaLine .= $ImageName; $MetaLine .= "\" size=\""; $MetaLine .= $AddedImages {$ImageName}; $MetaLine .= "\" user=\""; $MetaLine .= $XlateParams {"AuthorsWikiName"}; $MetaLine .= "\" version=\"1.1\"}%\n"; print {$Files [0]} $MetaLine; } %AddedImages = (); do { print {$Files [0]} "</noautolink>\n"; close $Files [0]; shift @Files; } while (@Files); FinalFixup (); print "Done\n"; } sub HelpAndExit { print shift while scalar @_; print "\n"; print "html2wiki [-n] <source document>\n"; print "\n"; print " -n Turn on noisy mode. Lists character and tag/attribute substitutions\n"; print " as they are found in the first pass and progress in the second pass.\n"; print " <source document> The document to convert from HTML to TWiki format.\n"; exit (-1); } sub SetDefaults ($) { $XlateParams {"WikiNameRoot"} = shift; $XlateParams {"SamePageHeader"} = 4; $XlateParams {"ParentTopicName"} = "Main"; $XlateParams {"AuthorsWikiName"} = "TWikiGuest"; LoadXlateFile ("ElementSubs "); } #The main block processes the command line to retreive the file name of the #document to process. It then checks to see if the translation file exists and #either calls GenerateXlateFile to generate it, or calls ParseXlateFile and #Convert to perform the conversion. my $Param = shift; HelpAndExit if ! 
defined $Param; if ($Param =~ /^-n$/) { $Noisy = 1; $OriginalName = shift; } else {$OriginalName = $Param;} HelpAndExit ("Document file name required\n") if ! defined $OriginalName; HelpAndExit ("Document file <$OriginalName> not found\n") if ! (-e $OriginalName); (my $WikiNameRoot = $OriginalName) =~ s/(.*)\..*?$/$1/; $WikiNameRoot =~ tr/ \-_a-zA-Z0-9\x00-\xFF/ a-zA-Z0-9/d; # Strip "nasty" characters my @Words; push @Words, ucfirst $_ foreach split " ", $WikiNameRoot; $WikiNameRoot = join "", Abbreviate (@Words); $TranslateName = "$OriginalName.html2wiki"; SetDefaults ($WikiNameRoot); my $html = ""; open inFile, "<$OriginalName"; while (<inFile>) { chomp; if (($html =~ /[^\s>]$/) and (/^\b/)) {$html .= " ";} s/ / /g; $html .= $_; } close inFile; $html =~ s/[\r\n]+/ /gs; $html =~ s///g; # Delete comments print "Loaded $OriginalName\n" if $Noisy; print "Parsed html\n" if $Noisy; if (! (-e $TranslateName) || -M $TranslateName > -M $OriginalName) { my $Tree = BuildTree ($html); GenerateXlateFile ($Tree); print "Translate file has been generated as: $TranslateName"; } else { print "Converting to TWiki\n" if $Noisy; LoadXlateFile (); OutputTWiki (Convert ($html)); print "Loaded configuration information\n" if $Noisy; if (1 == @WikiFiles) { print "Conversion is complete. The following file was generated: $WikiFiles[0]\n"; } else { print "Conversion is complete. The following files were generated:\n"; printf " $_\n" while $

use warnings; use strict; use utf8; use Fcntl; use HTML::TreeBuilder; use HTML::Entities; use HTML::TableContentParser; =head1 HTML to TWiki text file converter This script takes an HTML document generated by exporting a Word document as HTML and generates TWiki pages suitable for dropping directly into the TWiki folder. Two passes are required by html2wiki. The first pass generates a translation file containing various substitution strings that control behaviour of the conversion process in the second pass. The translation file will require editing before the second pass if you expect good results. Note that the entire document is processed in memory so there may be performance issues when large documents are being processed. Some manual preprocessing of the document to remove tables of contents, tables of figures and indexes will yeald a better result. Generally the search facilities provided by TWiki obviate the need for such tables in any case. Word tables can cause all sorts of grief. Watch where tables are anchored - html2wiki gets confused at times and may either split tables or lose parts of a table if the table is anchored to a heading paragraph. Heading lines in tables can also cause trouble, resulting in the heading cells being put into seperate rows. =head2 Running html2wiki For both passes html2wiki takes the path to the HTML document to be converted on the command line. The first pass generates a translation file in the document's location with the document's name and a .html2wiki extension. The second pass generates a file or files in the document's location with names generated using an abbreviated form of the document's name as a base and incorporating an abbreviated form of the headings text. During either pass html2wiki may generate warning or error messages. =head2 Translation File The translation file consists of a number of sections. 
Each section is a list of lines containing configuration information or comprising a match key, substitution text, substitution parameters and (optional) context text. Any line starting with a # is a comment and is ignored by html2wiki except the special #= section markers. =head3 XlateParams The translate parameters section provides general configuration information. The parameters are commented where necessary and generally comprise the following entries: =head4 "WikiNameRoot" Provides the base for the generated file names. This name must conform to TWiki file naming conventions which follow WikiWord conventions. =head4 "SamePageHeader" Provides the header level below which information is kept in the same topic page. html2wiki will generate new topic pages (new files) for each heading it finds down to this level. =head4 "ParentTopicName" is the WikiWord name of the (parent) topic which contains the link to the first topic page of the generated files. This should always be provided. If it is not then the topic linking information provided by TWiki when browsing the generated pages will be broken. =head4 "AuthorsWikiName" is the WikiWord name of the putative author of the generated files. =head3 Subs The "subs" section allows substitution strings to be provided. The substitutions are performed on the generated TWiki text. =head3 ElementSubs This section allows management of HTML elements by either ignoring them or pretending that they are a different type of element. This can be used to translate elements into

 elements for
example, or to translate particular paragraph styles to heading elements.

Use "-" to have an element ignored. Note that this does not generally ignore the
contents of the element, but will suppress the direct effects of the element. For
example a paragraph element with a particular set of attributes may be ignored so
that it doesn't generate a paragraph break (this may be useful in tables).

=cut


#The following variables are initialised in either the main block
#or in ParseXlateFile. They are used in many places.

my $OriginalName;   # Name of the source (HTML) file
my @WikiFiles;      # Files generated and referenced (ie image files)
my $TranslateName;  # Name of the translation file
my %XlateParams;    # Hash of general translation parameter names and values
my %ElementSubs;    # Control processing for various elements
my %Anchors;        # Active anchors. A name and matching href are required for linking
my @Subs;           # Substitution pairs for the final pass
my %ParentPageNames; # Names that have been used for parent pages
my $Noisy = 0;      # Generate progress messages


# Abbreviate is used to generate abbreviated file names from headings for topic
# pages.
#
# Takes a list of words. For each word the first character is kept exactly as
# given and the remainder is lower-cased. A remainder longer than four
# characters that contains no digits is then squeezed: non-letters and vowels
# are removed, runs of a repeated character are collapsed and a few consonant
# clusters are shortened.
#
# Returns the abbreviated words as a list in list context, or joined with
# single spaces in scalar context.
#
# Undefined and empty arguments are skipped. Every word is processed in a
# private copy, so neither the caller's arguments nor the global $_ are
# modified, and a word such as "0" does not end processing early.

sub Abbreviate
{
my @Result;

foreach my $Word (@_)
  {
  next if ! defined $Word or $Word eq "";

  my $Rest = $Word;                     # Copy: @_ aliases the caller's values
  my $Abbrev = substr $Rest, 0, 1, "";  # Retain first char and preserve its case
  $Rest =~ tr/A-Z/a-z/;

  if (length ($Rest) > 4 and $Rest !~ /[0-9]/)
    {
    $Rest =~ tr/A-Za-z//cd;     # Keep letters only
    $Rest =~ tr/aeiou//d;       # Drop vowels
    $Rest =~ s/(.)\1+/$1/gi;    # Collapse repeated characters
    $Rest =~ s/ck/k/g;
    $Rest =~ s/ptn/pn/g;
    $Rest =~ s/tng/tg/g;
    $Rest =~ s/thr/tr/g;
    $Rest =~ s/vnt/vt/g;
    $Rest =~ s/ltn/ln/g;
    $Rest =~ s/lb/b/g;
    $Rest =~ s/tt/t/g;
    }

  push @Result, $Abbrev . $Rest;
  }

return wantarray ? @Result : join " ", @Result;
}


#ScanForTagSubs scans the tree to identify all the tags that are used in the
#document for inclusion in the translation file.
#
#Default actions are provided for a small number of common tags and attributes.
#
#Tree is the parsed document; every element found by look_down is visited.
#The start tag text of each element (with line breaks removed) becomes a key
#in %ElementSubs and the chosen default action becomes its value.
#
#NOTE(review): several of the action strings below, and the pattern of the
#last elsif, look as if markup text has been stripped from them - confirm
#against the original script before relying on the defaults.

sub ScanForTagSubs
{
my $Tree = shift;
# NOTE(review): $MaxPos is not referenced again in this sub.
my $MaxPos = 0;
my @Tags = $Tree->look_down ("_tag", qr/./);

foreach my $Element (@Tags)
  {
  my $Tag = $Element->starttag ();
  my $Action = "";

  # Choose a default action from the text of the start tag
  if ($Tag =~ /courier/i) {$Action = ""}
  elsif ($Tag =~ /size=7/i) {$Action = ""}
  elsif ($Tag =~ /size=6/i) {$Action = "
"}
  elsif ($Tag =~ /size=5/i) {$Action = "
"}
  elsif ($Tag =~ /size=4/i) {$Action = "
"}
  elsif ($Tag =~ /size=3/i) {$Action = "
"}
  elsif ($Tag =~ /size=2/i) {$Action = "
"}
  elsif ($Tag =~ //i) {$Action = ""}
  # Remove line breaks from the tag text before using it as a key
  $Tag =~ s/\G(.*?)[\n\r]+/$1/gs;
  $ElementSubs{"$Tag"} = $Action;
  }
}


#GenerateXlateFile builds the translation file that is used to control document
#conversion.
#
#The file is written to $TranslateName; the sub dies if it cannot be created.
#It writes a fixed comment header, the #=XlateParams section (one line per
#entry of %XlateParams listed in @Keys) and one line per entry of %ElementSubs
#in sorted key order.
#
#Tree is taken from the argument list but is not otherwise referenced in the
#code visible here.

sub GenerateXlateFile ($)
{
my $Tree = shift;
open (outFile, ">$TranslateName") or die "Unable to create translation file: $TranslateName ($!)\n";
print outFile "# The first string of each line is a key and should not be altered.\n";
print outFile "# The second string is a value that may be altered as required Each\n";
print outFile "# section describes permissable values.\n";
print outFile "# Do not alter #= lines!\n";
print outFile "\n";

# The following return in %XlateParams
# Each entry pairs a parameter name with the comment written after its value
my @Keys = (
  ["WikiNameRoot", "# Root name for the generated wiki files"],
  ["SamePageHeader", "# All headers below this number generate a new wiki page"],
  ["ParentTopicName", "# Name of the parent wiki page"],
  ["AuthorsWikiName", "# WikiName of user attributed with generating the pages"],
  );
print outFile "#=XlateParams\n";
print outFile "# WikiNameRoot must be valid as part of a file name and should be unique in the intended TWiki context.\n";
for my $key (@Keys) {
  printf outFile "%-20s %-20s %s\n", "\"$key->[0]\",", "\"$XlateParams{$key->[0]}\"",  "$key->[1]"
}

my $ContextStr;
my $Action;

# The following return in @Subs
# NOTE(review): the statement below appears truncated - the here-document
# opener and part of its text seem to be missing. Restore it from the
# original script.
print outFile <\"  Replace tags with given tag
# Automatic linking for an href can be suppressed by deleting either
# or both of the  or  entries below.
ElementSubs

foreach my $Key (sort keys %ElementSubs)
  {
  $Action = $ElementSubs{$Key};
  printf outFile "%-20s %s\n", "\"$Key\",", "\"$Action\"";
  }

close outFile;
}


#Assumes $TranslateName has been set. Parses translation file to extract the
#document conversion parameters.
#
#Takes one optional argument (SkipTags) which defaults to the empty string.
#Returns without doing anything if the translation file cannot be opened.
#
#NOTE(review): the read loop header and most of the line parsing code below
#appear to have been lost (text resembling markup looks stripped). The sub
#needs restoring from the original script; the remaining lines show that
#anchor entries are counted in %Anchors and other entries are stored in
#%ElementSubs.

sub LoadXlateFile
{
my $SkipTags = shift || "";
my $State = "Searching";
open (inFile, "<$TranslateName") or return;

while ()
  {
  chomp;
  s/(^#(?!=).*|(?/gi;
      $Link =~ tr/a-zA-Z0-9_//dc;
      $Link =~ s/^[0-9_]*/LinK/;
      ++$Anchors{"$Link:$Mode"};
      }
    else
      {
      $ElementSubs {$Tag} = $Action;
      }
    }
  else
    {
    print STDERR "Don't know how to handle $State.\n";
    $State = "Searching";
    }
  }

close inFile;
}


#PrintHeader writes the TWiki meta lines that start every topic page file.
#
#outFile is the file handle to write to.
#parentTopicName is the name of the topic recorded as this page's parent.
#
#The author is taken from the "AuthorsWikiName" translation parameter and the
#date is the current time. A TOPICINFO line, a TOPICPARENT line and a blank
#line are written.

sub PrintHeader ($$)
{
my ($outFile, $ParentTopicName) = @_;
my $Author = $XlateParams {"AuthorsWikiName"};
my $Stamp = time;

my $TopicInfo = "%META:TOPICINFO{author=\"$Author\""
  . " date=\"$Stamp\" format=\"1.0\" version=\"1.2\"}%\n";
my $TopicParent = "%META:TOPICPARENT{name=\"$ParentTopicName\"}%\n";

print $outFile $TopicInfo;
print $outFile $TopicParent;
print $outFile "\n";
}


#FinalFixup performs a final pass through the created files to fix up anchor
#links and poorly handled symbol translations.
#
#FinalFixup also tidies up tables by increasing the number of cells in each row
#to match the number in the row containing the greatest number of cells. This
#causes the last cell on each such widened row to span the remaining width of
#the table.
#
#Every file named in @WikiFiles is read, fixed up in memory and written back
#through a temporary file named "~<file>". A file that cannot be read or
#rewritten is reported with a warning and left as it was.
#
#Anchor references of the form [[#name] are rewritten to [[<page>#name] using
#%Anchors. References to anchors that are not in %Anchors are left unchanged.
#Each pair in @Subs is applied to every line as a literal text replacement.

sub FinalFixup ()
{
foreach my $Filename (@WikiFiles)
  {
  my $inFile;
  if (! open ($inFile, "<", $Filename))
    {
    warn "Unable to read $Filename for final fixup ($!)\n";
    next;
    }

  my @Lines = <$inFile>;
  close $inFile;

  my $LineNum = 0;
  my $TableStart = undef; # Index of the first row of the table being scanned
  my $CellCount = 0;      # Largest number of | characters seen in that table

  foreach my $Line (@Lines)
    {
    chomp $Line;

    if ($Line =~ m/^\|/)
      {# Scan table lines
      # Test with defined: a table starting on line 0 has a false index
      $TableStart = $LineNum if ! defined $TableStart;
      my $cells = $Line =~ tr/|//;
      $CellCount = $cells if $cells > $CellCount;
      }
    elsif (defined $TableStart)
      {# End of table
      _PadTableRows (\@Lines, $TableStart, $LineNum - 1, $CellCount);
      $CellCount = 0;
      $TableStart = undef;
      }

    # Fix up every anchor reference on the line, keeping unknown ones as is
    $Line =~ s{\[\[#(.*?)(?=\])}{"[[" . (defined $Anchors{$1} ? $Anchors{$1} : "#$1")}ge;

    # Fix up symbols
    foreach my $Row (@Subs)
      {
      my ($From, $To) = @$Row;
      $Line =~ s/\Q$From\E/$To/g;
      }
    }
    continue {++$LineNum;}

  # A table that runs to the end of the file still needs padding
  _PadTableRows (\@Lines, $TableStart, $#Lines, $CellCount) if defined $TableStart;

  my $TempName = "~$Filename";
  my $outFile;
  if (! open ($outFile, ">", $TempName))
    {
    warn "Unable to create $TempName ($!)\n";
    next;
    }

  print {$outFile} join "\n", @Lines;
  if (! close $outFile)
    {
    warn "Unable to write $TempName ($!)\n";
    unlink $TempName;
    next;
    }

  # Remove the old file first: rename does not replace an existing file on
  # every platform
  unlink $Filename;
  rename ($TempName, $Filename)
    or warn "Unable to rename $TempName to $Filename ($!)\n";
  }
}

#_PadTableRows appends | characters to the lines with indexes First to Last of
#the array referenced by Lines so that each has at least CellCount of them.

sub _PadTableRows
{
my ($Lines, $First, $Last, $CellCount) = @_;

foreach my $Row (@$Lines [$First .. $Last])
  {
  my $cells = $Row =~ tr/|//;
  $Row .= "|" x ($CellCount - $cells) if $cells < $CellCount;
  }
}


#Converts a table element. The element is rendered back to HTML text and that
#text is handed to ConvertTableHTML, which keeps table handling separate from
#the conversion of the rest of the document.

sub ConvertTable {
  my ($Element) = @_;
  my ($TableHTML) = $Element->as_HTML ();

  return ConvertTableHTML ($TableHTML);
}

#ConvertTableHTML takes the HTML text of a table, extracts the cell data of
#every row with HTML::TableContentParser, rebuilds a simplified table text
#from that data and returns the result of running Convert on it with the
#table mode flag set.
#
#NOTE(review): the string literals appended to $tableCleanHTML and the
#substitution pattern applied to the cell data look as if markup text has been
#stripped from them - confirm against the original script.

sub ConvertTableHTML {
  my $tableAsHTML = shift;
  my $tp = HTML::TableContentParser->new;
  my $tableCleanHTML = '';
  
  # NOTE(review): the same text is parsed again in the loop header below and
  # the result of this first call is not used - confirm whether it is needed.
  $tp->parse ($tableAsHTML);
  
  for my $table (@{$tp->parse($tableAsHTML)}) {
    for my $row (@{$table->{rows}}) {
      $tableCleanHTML .= '';
      for my $cell (@{$row->{cells}}) {
        # Cells without data are treated as empty text
        my $data = $cell->{data} || '';
        $data =~ s/\n|
//g if defined $data;
        $tableCleanHTML .= "$data";
      }
      $tableCleanHTML .= "\n";
    }
  }
 
return Convert ($tableCleanHTML, 1);
}

#BuildTree parses the HTML text passed as its argument with HTML::TreeBuilder
#(ignore_unknown and attr_encoded both switched off) and returns what the
#builder's guts method yields in scalar context.

sub BuildTree ($)
{
my ($Html) = @_;
my $Builder = HTML::TreeBuilder->new ();

$Builder->ignore_unknown (0);
$Builder->attr_encoded (0);
$Builder->parse ($Html);
$Builder->eof ();

# Scalar assignment on purpose: guts is called in scalar context
my $Guts = $Builder->guts ();
return $Guts;
}


#Does the HTML to TWiki conversion using the translation tables and parameters
#that have already been read in from the translation file.
#
#html is the HTML text to convert. An optional second argument (tableMode,
#default 0) is also read from the argument list; ConvertTableHTML passes 1.
#The sub is declared with a ($) prototype, but the call in ConvertTableHTML
#appears before this definition so the prototype is not applied to it.
#
#Returns the generated TWiki text, or '' if no tree could be built.
#
#The tree is walked with explicit stacks to build an intermediate text that
#still contains pseudo tags, then a series of substitutions turns that text
#into TWiki markup.
#
#NOTE(review): parts of the tree walk below (the post processing branch and
#the start of the element handling) appear to have lost text that resembled
#markup. Those lines are kept exactly as found and need restoring from the
#original script.

sub Convert ($)
{
my ($html, $tableMode) = @_;

$tableMode ||= 0;

my $Tree = BuildTree ($html);
return '' if ! defined $Tree;

my $WikiText;
my $empty_element_map = $Tree->_empty_element_map;

my(@C) = [$Tree]; # a stack containing lists of children
# I is a stack of indexes to current position in corresponding lists in @C
# In each of these, 0 is the active point
my(@I) = (-1); # initial value must be -1 for each list
my @Context = ""; # Contains stack of current nodes
my @AnchorStack;
my @QueuedAnchors; # Anchors queued for the end of a table

my $this;  # current node
my $content_r; # child list of $this
my $TagName;
my $InOList = 0;
my $InUList = 0;
my $ParaCount = 0;
my $InHeader = 0;

# Loop over the tree
while (@C)
  {
  # Post processing
  # Move to next item in this frame
  if(!defined($I[0]) or ++$I[0] >= @{$C[0]})
    {
    $this = $Context [0];
    if (defined $this and ref $this)
      {
      my $StartTag = $this->starttag ();
      # A non-empty entry in %ElementSubs replaces the element's own start tag
      my $Action = $ElementSubs {$StartTag};
      $StartTag = $Action if defined ($Action) and $Action ne "";

      if ($StartTag =~ /^" : "

" if ! $InOList and ! $InUList;
        }
      elsif ($StartTag =~ /^/gi;
        $Link =~ tr/a-zA-Z0-9_//dc;
        $Link =~ s/^[0-9_]*/LinK/;
        my $Text = shift @AnchorStack;
        $Text = 'here' if ! defined $Text;
        $WikiText .= "" if defined $Anchors{"$Link:name"};
        }
      elsif ($StartTag =~ /^/gi;
        $Link =~ tr/a-zA-Z0-9_//dc;
        $Link =~ s/^[0-9_]*/LinK/;

        if (defined $Anchors{"$Link:href"})
          {
          my $Text;
          $Text .= "
" if ! ($WikiText =~ /
$/);
          $Text .= "
";
          $WikiText .= $Text;
          }
        }
      elsif ($StartTag =~ /^/g if ! $InOList and ! $InUList and ! $tableMode;
    chomp $this;
    $this =~ s/\xA0/ /gs;
    $this =~ s/\x09/<3sp>/gs;
    if (@AnchorStack)
      {
      # Text inside anchors is collected on every open anchor instead of
      # being written to the output directly
      my $Index = 0;
      while ($Index < @AnchorStack)
        {
        $AnchorStack[$Index] .= $this;
        ++$Index;
        }
      }
    else
      {
      $WikiText .= $this;
      }
    }
  else
    {# Process this element
    my $StartTag = $this->starttag ();
    $StartTag =~ s/[\r\n]*//gs;

    # Ignore elements nested in headers except anchors
    # NOTE(review): unary ! binds tighter than =~, so "! $StartTag =~ ..."
    # negates $StartTag before matching; "$StartTag !~ ..." was presumably
    # intended - confirm once the pattern is restored.
    if ($InHeader and ! $StartTag =~ /^delete_content ();
      }
    elsif ($StartTag =~ /^/)      {$WikiText .= ""}
    elsif ($StartTag =~ /^/)   {$WikiText .= ""}
    elsif ($StartTag =~ /^/)  {$WikiText .= "<h1>"}
    elsif ($StartTag =~ /^(<li[aAiI]?)\b/) {$WikiText .= $InOList ? "$1>" : "<li>";}
    elsif ($StartTag =~ /^<a href\b/)
      {
      unshift @AnchorStack, "";
      }
    elsif ($StartTag =~ /^<ol\b/)
      {
      $WikiText .= "<br><br>" if ! $InOList and ! $InUList;
      ++$InOList;
      }
    elsif ($StartTag =~ /^<ul\b/)
      {
      $WikiText .= "<br><br>" if ! $InOList and ! $InUList;
      ++$InUList;
      }
    elsif ($StartTag =~ /^<img\b/)
      {
      $WikiText .= $StartTag;
      }
   }

  # Now queue up content list for the current element...
  if(
    ref $this and
    not
      ( # ...except for those which
      not($content_r = $this->{'_content'} and @$content_r) and
          # ...have empty content lists
      $this->{'_empty_element'} || $empty_element_map->{$this->{'_tag'} || ''}
          # ...and that don't get post-order callbacks
      )
    )
    {
    unshift @Context, $this;
    unshift @I, -1;
    unshift @C, $content_r || [];
    }
  }

print "Generated marked up TWiki\n" if $Noisy;

# Pattern fragments shared by the clean up substitutions below
my $sp = qr/(?:\s|<3sp>)/;
my $face = qr/(?:b|i|code)/;
my $pre = qr/(?:<ul>|<ol>|<h[1-6]>|<a [a-zA-Z0-9_]+>)/;

$WikiText =~ s/<br>\s+/<br>/g; # Remove leading spaces
$WikiText =~ s/\s+<br>/<br>/g; # Remove trailing spaces
$WikiText =~ s/(<h[1-6]>)(?:<br>)+/$1/g;
print "Removed spurious white space at line ends\n" if $Noisy;

# Repeat the clean up until a whole pass makes no further change
my $Touched;
do
  {
  $Touched = 0;
  # Remove multiple blank lines
  $Touched |= $WikiText =~ s/(?:<br>){3,}/<br><br>/g;
  # Remove empty face elements
  $Touched |= $WikiText =~ s/<($face)>((?:<br>|$sp)*)<\/\1>/$2/g;
  # Migrate various tags adjacent to text
  $WikiText =~ s/((?:<br>)+)(<a [^>]*:.*?>)/$2$1/g;
  $Touched |= $WikiText =~ s/(<$face>)((?:<br>|$sp|$pre|\W+)+)/$2$1/g;
  $Touched |= $WikiText =~ s/((?:<br>|$sp|$pre|\W+)+)<\/($face)>/<\/$2>$1/g;
  }
while ($Touched);
print "Removed various empty elements\n" if $Noisy;

$WikiText =~ s/<br><br>($pre)/<br>$1/g;
$WikiText =~ s/($pre)($pre)/$1<br>$2/g;
# Replace face tags with TWiki markers; adjacent pairs of tags are done first
$WikiText =~ s/(<\/?code><\/?b>|<\/?b><\/?code>)/==/g;
$WikiText =~ s/(<\/?i><\/?b>|<\/?b><\/?i>)/__/g;
$WikiText =~ s/<\/?b>/*/g;
$WikiText =~ s/<\/?i>/_/g;
$WikiText =~ s/<\/?code>/=/g;
$WikiText =~ s/($pre|<br>)(?:$sp)+/$1/g;
$WikiText =~ s/<br>(?:•|<ul>)/<br>   * /g;
$WikiText =~ s/<ol>/<br>   1 /g;
$WikiText =~ s/<ol([a|A|i|I])>/<br>   $1 /g;
print "Inserted list line prefixes\n" if $Noisy;

$WikiText =~ s/<3sp>/   /g;
# The embedded code block builds one + per heading level; its value is read
# back through $^R in the replacement text
$WikiText =~ s/<h([1-6])>(?{"+"x$1})/<br>---$^R /g;
print "Inserted header line prefixes\n" if $Noisy;

$WikiText =~ s/<a ([^>]*):(.*?)>/[[#$1][$2]]/g;
$WikiText =~ s/<a (.*?)>/#$1/g;
print "Inserted links\n" if $Noisy;

# Put in the line breaks
$WikiText =~ s/<br>/\n/gs;
$WikiText =~ s/\n   \* \n/\n/gs;
$WikiText =~ s/^\n+//gs;

print "Restored line breaks\n" if $Noisy;

return $WikiText;
}


#OutputTWiki splits the converted TWiki text into topic page files.
#
#The single argument is the complete TWiki text. The root topic is named after
#the "WikiNameRoot" translation parameter and has the "ParentTopicName" topic
#as its parent. Header lines (---+, ---++, ...) above the "SamePageHeader"
#level start a new topic file, except for the first level one header which
#stays on the root page. Deeper headers and all other lines are written to the
#current topic.
#
#Image files that exist locally are copied to "<WikiNameRoot><source path>",
#their img tags are rewritten to use %ATTACHURLPATH% and a FILEATTACHMENT meta
#line is written for each when the topic that references them is closed.
#Lines starting with #name are recorded in %Anchors against the current page.
#
#All generated file names are added to the front of @WikiFiles and FinalFixup
#is run once every file has been written. Dies if a topic file cannot be
#created. A failed image copy is reported with a warning and the img tag is
#left unchanged.

sub OutputTWiki ($)
{
my @Lines = split /\n/, shift;
my @Files;        # Stack of open topic file handles, current topic first
my @HeaderOffset; # Header level that opened each topic in @Files
my @PageTag;      # Topic names matching @Files, followed by the root's parent
my $WikiNameRoot = $XlateParams {"WikiNameRoot"};
my $SamePageHeader = $XlateParams {"SamePageHeader"};
my %ImageFiles;   # Times each attachment name has been used
my %AddedImages;  # Attachment name => size for the current topic
my $FirstHeader = 1;

unshift @PageTag, $XlateParams {"ParentTopicName"};
unshift @PageTag, "$WikiNameRoot";
$ParentPageNames {$PageTag[0]} = 1;
unshift @WikiFiles, ("$PageTag[0].txt");

unshift @Files, _OpenTopicFile ($WikiFiles[0]);
PrintHeader ($Files [0], $PageTag[1]);
$HeaderOffset [0] = 1;

foreach my $Line (@Lines)
  {
  # Fix up cell breaks
  $Line =~ s/<brr?>\|/|/gi;
  $Line =~ s/\|<brr?>/|/gi;
  $Line =~ s/(^<brr?>\s*|\s*<brr?>$)//gi;
  $Line =~ s/<brr>/<br>/gi;

  my ($Plusses) = $Line =~ /^---(\++)/;

  # Image file processing
  if ($Line =~ /(<img\b.*?src="(.*?)".*?>)/ && -e $2)
    {
    # Save the match details before anything else can reset them
    my ($ImageTag, $Source, $TagStart) = ($1, $2, $-[0]);
    my $Filename = "$WikiNameRoot$Source";
    if (defined $ImageFiles {$Filename})
      {$Filename .= "-" . ++$ImageFiles {$Filename};}
    else
      {$ImageFiles {$Filename} = 1;}

    if (_CopyImageFile ($Source, $Filename))
      {
      $AddedImages {$Filename} = -s $Filename;
      my $Link = "<img src=\"%ATTACHURLPATH%/$Filename\" alt=\"$Filename\"/>";
      substr $Line, $TagStart, length $ImageTag, $Link;
      }
    }

  # Anchor processing
  if ($Line =~ /^#(\w+)/)
    {# Anchor
    $Anchors {$1} = "$PageTag[0]#$1";
    }

  if (defined $Plusses)
    {$Line =~ s/\*(.*)\*/$1/g;}

  if (! defined $Plusses or length ($Plusses) >= $SamePageHeader)
    {# Non-header line or same page header
    print {$Files [0]} $Line."\n";
    next;
    }

  # Header processing
  $Plusses = length ($Plusses);
  while ($HeaderOffset [0] >= $Plusses and @Files > 1)
    {# Pop a level
    print {$Files [0]} "</noautolink>\n";
    _WriteAttachmentMeta ($Files [0], \%AddedImages);
    %AddedImages = ();
    close $Files [0];
    shift @Files;
    shift @HeaderOffset;
    shift @PageTag;
    }

  # Now on right page for header. Same page headers were written above, so
  # only the first level one header can stay on this page here.
  if ($Plusses == 1 and $FirstHeader)
    {# Stay on same page
    $FirstHeader = 0;
    print {$Files [0]} $Line."\n";
    next;
    }

  # Push a level
  my $HeaderText = (substr $Line, $Plusses + 3);
  my $Tag = $HeaderText;
  $Tag =~ tr/0-9A-Za-z //cd;
  $Tag =~ tr/a-z/A-Z/;
  $Tag = join "", Abbreviate (split " ", $Tag);

  my $PageName = $WikiNameRoot.$Tag;
  if (exists $ParentPageNames{$PageName})
    {
    $PageName .= ++$ParentPageNames {$PageName};
    }
  else
    {
    $ParentPageNames {$PageName} = 1;
    }

  unshift @PageTag, $PageName;
  unshift @WikiFiles, ("$PageTag[0].txt");
  unshift @HeaderOffset, $Plusses;

  # Leave a link to the new topic on the page that contains it
  print {$Files [0]} "---" . "+" x $Plusses . "[[$PageTag[0]][$HeaderText]]\n";

  unshift @Files, _OpenTopicFile ($WikiFiles[0]);
  PrintHeader ($Files [0], $PageTag[1]);
  print {$Files [0]} $Line."\n";
  print "$WikiFiles[1] parent of $WikiFiles[0]\n" if $Noisy;
  }

_WriteAttachmentMeta ($Files [0], \%AddedImages);
%AddedImages = ();

do
  {
  print {$Files [0]} "</noautolink>\n";
  close $Files [0];
  shift @Files;
  }
while (@Files);

FinalFixup ();
print "Done\n";
}

#_OpenTopicFile creates (or truncates) the named topic file and returns a
#handle open for writing. Dies if the file cannot be created.

sub _OpenTopicFile
{
my ($Filename) = @_;

open (my $Handle, ">", $Filename)
  or die "Unable to create topic file: $Filename ($!)\n";
return $Handle;
}

#_WriteAttachmentMeta writes one FILEATTACHMENT meta line to outFile for each
#image in the hash referenced by AddedImages (name => size), in name order.

sub _WriteAttachmentMeta
{
my ($outFile, $AddedImages) = @_;

foreach my $ImageName (sort keys %$AddedImages)
  {
  my $MetaLine = "%META:FILEATTACHMENT{name=\"" . $ImageName
    . "\" attr=\"h\" comment=\"\" date=\"" . time ()
    . "\" path=\"" . $ImageName
    . "\" size=\"" . $AddedImages->{$ImageName}
    . "\" user=\"" . $XlateParams {"AuthorsWikiName"}
    . "\" version=\"1.1\"}%\n";
  print {$outFile} $MetaLine;
  }
}

#_CopyImageFile copies Source to Dest byte for byte, replacing any existing
#Dest. Returns 1 on success; on failure a warning is issued and 0 is returned.

sub _CopyImageFile
{
my ($Source, $Dest) = @_;
my ($In, $Out);

if (! open ($In, "<", $Source))
  {
  warn "Unable to read image file: $Source ($!)\n";
  return 0;
  }

if (! open ($Out, ">", $Dest))
  {
  warn "Unable to create image file: $Dest ($!)\n";
  close $In;
  return 0;
  }

# binmode is available on every platform, unlike the O_BINARY open flag
binmode $In;
binmode $Out;

my $Buffer;
while (read ($In, $Buffer, 65536))
  {print {$Out} $Buffer;}

close $In;
if (! close $Out)
  {
  warn "Unable to write image file: $Dest ($!)\n";
  return 0;
  }

return 1;
}


#HelpAndExit prints any messages passed to it, followed by the command line
#usage text, then ends the script with exit status -1.

sub HelpAndExit
{
my @Usage = (
  "\n",
  "html2wiki [-n] <source document>\n",
  "\n",
  "  -n Turn on noisy mode. Lists character and tag/attribute substitutions\n",
  "     as they are found in the first pass and progress in the second pass.\n",
  "  <source document> The document to convert from HTML to TWiki format.\n",
  );

foreach my $Text (@_, @Usage)
  {print $Text;}

exit (-1);
}


#SetDefaults fills %XlateParams with default values, using its argument as
#the "WikiNameRoot", and then calls LoadXlateFile with "ElementSubs " as its
#argument.

sub SetDefaults ($)
{
my ($Root) = @_;

@XlateParams {"WikiNameRoot", "SamePageHeader", "ParentTopicName", "AuthorsWikiName"}
  = ($Root, 4, "Main", "TWikiGuest");

LoadXlateFile ("ElementSubs ");
}


#The main block processes the command line to retrieve the file name of the
#document to process. If the translation file does not exist, or is older than
#the document, GenerateXlateFile is called to generate it (first pass).
#Otherwise LoadXlateFile, Convert and OutputTWiki perform the conversion
#(second pass).

my $Param = shift;

HelpAndExit if ! defined $Param;

if ($Param =~ /^-n$/)
  {
  $Noisy = 1;
  $OriginalName = shift;
  }
else
  {$OriginalName = $Param;}

HelpAndExit ("Document file name required\n") if ! defined $OriginalName;
HelpAndExit ("Document file <$OriginalName> not found\n") if ! (-e $OriginalName);

# Derive the root wiki name from the document name without its extension
(my $WikiNameRoot = $OriginalName) =~ s/(.*)\..*?$/$1/;
# Space, - and _ become spaces, letters and digits are kept and every other
# character is deleted
$WikiNameRoot =~ tr/ \-_a-zA-Z0-9\x00-\xFF/   a-zA-Z0-9/d; # Strip "nasty" characters
my @Words;
push @Words, ucfirst $_ foreach split " ", $WikiNameRoot;
$WikiNameRoot = join "", Abbreviate (@Words);
$TranslateName = "$OriginalName.html2wiki";
SetDefaults ($WikiNameRoot);

my $html = "";

open (my $DocFile, "<", $OriginalName)
  or HelpAndExit ("Unable to read document file <$OriginalName> ($!)\n");
while (my $Text = <$DocFile>)
  {
  chomp $Text;
  # Keep words apart when a line break separated them in the document
  if (($html =~ /[^\s>]$/) and ($Text =~ /^\b/))
    {$html .= " ";}
  # NOTE(review): as written this replaces a space with a space; presumably
  # the pattern originally named an entity or a non-breaking space - verify
  # against the original script.
  $Text =~ s/ / /g;
  $html .= $Text;
  }
close $DocFile;
$html =~ s/[\r\n]+/ /gs;
$html =~ s/<!--.*?-->//g; # Delete comments

print "Loaded $OriginalName\n" if $Noisy;

print "Parsed html\n" if $Noisy;

# -M gives a file's age, so a larger value means an older file
if (! (-e $TranslateName) || -M $TranslateName > -M $OriginalName)
  {
  my $Tree = BuildTree ($html);
  GenerateXlateFile ($Tree);
  print "Translate file has been generated as: $TranslateName\n";
  }
else
  {
  print "Converting to TWiki\n" if $Noisy;
  LoadXlateFile ();
  print "Loaded configuration information\n" if $Noisy;
  OutputTWiki (Convert ($html));

  if (1 == @WikiFiles)
    {
    print "Conversion is complete. The following file was generated: $WikiFiles[0]\n";
    }
  else
    {
    print "Conversion is complete. The following files were generated:\n";
    # New names are added to the front of @WikiFiles, so list from the end.
    # print is used so that a % in a file name is not taken as a format.
    print "    $_\n" foreach reverse @WikiFiles;
    }
  }

"} elsif ($Tag =~ /size=6/i) {$Action = "

"} elsif ($Tag =~ /size=5/i) {$Action = "

"} elsif ($Tag =~ /size=4/i) {$Action = "

"} elsif ($Tag =~ /size=3/i) {$Action = "

"} elsif ($Tag =~ /size=2/i) {$Action = "