Here's my solution for producing pretty (hand-editable) HTML. Set $dos for CR/LF line endings and $nostyle to remove all style information:
#!/usr/bin/perl
# Clean up HTML saved by Microsoft Word so that it is compact and hand-editable.
#
# Reads the files named on the command line (or STDIN when none are given),
# strips Word-specific markup, re-flows the document to roughly one opening
# tag per line, and prints the result to STDOUT.
use strict;
use warnings;

# User-editable switches.
my $nostyle = 1;    # true: drop every style='...' attribute and every <span> tag
my $dos     = 1;    # true: emit CR/LF line endings instead of bare LF

# clean_word_html($html, $strip_style, $crlf)
#
#   $html        - complete HTML document as a single string
#   $strip_style - true to remove all style='...' attributes and <span> tags
#   $crlf        - true to terminate output lines with "\r\n"
#
# Returns the cleaned document as a string ('' for undefined or empty input).
# The order of the substitutions matters: later steps rely on the whitespace
# and attribute clean-up done by earlier ones.
sub clean_word_html {
    my ($text, $strip_style, $crlf) = @_;
    return '' unless defined $text;

    # Re-brand the generator meta tag.
    $text =~ s/content="Microsoft Word \d+"/content="wordclean.pl"/g;

    # Join the document into one line; line breaks are re-inserted at the end.
    $text =~ s/[\r\n]+/ /g;

    # Office namespace tags such as <o:p> (the tags only, not their content).
    $text =~ s{</?o:[^>]+>}{}g;

    # Comments.  Non-greedy on purpose: the document is a single line here, so
    # a greedy match would delete everything between the first and the last
    # comment, including real content.
    $text =~ s/<!--.*?-->//gs;

    # XML namespace declarations.
    $text =~ s/xmlns(?::\w+)?="[^"]*"//g;

    # Word-private CSS declarations (mso-*).  The value is bounded by the next
    # ';' or "'" so that only the mso declaration itself is removed and
    # neighbouring declarations (e.g. color:red) survive.
    # NOTE(review): assumes style attributes are single-quoted, as the other
    # style patterns in this script do.
    $text =~ s/mso-[\w-]+:[^;']*;?//g;

    # Style attributes left empty by the previous step, then (optionally)
    # every remaining style attribute.
    $text =~ s/style=''//g;
    $text =~ s/style='.+?'//g if $strip_style;

    $text =~ s/<link rel=File-List href="[^"]+">//g;

    # Unquoted class attributes (class=MsoNormal and friends).
    $text =~ s/class=\w+//g;

    # Smart tags, with or without attributes, so that openers such as
    # <st1:place w:st="on"> are removed together with their closers.
    $text =~ s{</?st1:\w+[^>]*>}{}g;

    # Whitespace normalisation.
    $text =~ s/\s+>/>/g;
    $text =~ s/>\s+</></g;
    $text =~ s/\s+/ /g;

    # In no-style mode drop every span tag.  Openers that still carry
    # attributes (e.g. lang=...) are included, otherwise they would be left
    # behind without their closing tag.
    $text =~ s{</?span\b[^>]*>}{}g if $strip_style;

    # Unwrap default-size spans, drop empty spans, unwrap bare spans.
    $text =~ s#<span style='font-size:12.0pt;\s?'>(.+?)</span>#$1#g;
    $text =~ s#<span[^>]*>\s*</span>##g;
    # Non-greedy so sibling spans are unwrapped pair by pair.
    $text =~ s#<span>(.+?)</span>#$1#g;

    # Pretty-print: start a new line before opening tags.
    # NOTE: '.+?' needs at least one character after the first letter of the
    # tag name, so a one-letter tag such as <p> is matched together with
    # whatever follows up to the next '>'.  Kept as is to preserve the
    # established line layout.
    $text =~ s/(<\w.+?>)/\n$1/g;
    $text =~ s/\n<b>/<b>/g;    # keep <b> on the same line as the preceding text

    # Closing tags of structural elements get their own line.
    $text =~ s#</(html|body|head|tr|td|table|div)>#\n</$1>#g;

    # No blank first line when the document starts with <html>.
    $text =~ s#\n<html>#<html>#;

    $text =~ s#\n#\r\n#g if $crlf;

    return $text;
}

# Run as a filter only when executed as a script, so the file can also be
# loaded with require/do without touching STDIN.
unless (caller) {
    # List-context <> with $/ undefined yields one string per input file.
    my $input = do { local $/; join '', <> };
    print clean_word_html($input, $nostyle, $dos);
}
|