in reply to Re: How to clean-up Microsoft Word HTML
in thread How to clean-up Microsoft Word HTML
#!/usr/bin/perl
# wordclean.pl - strip Microsoft Word specific markup from Word-generated HTML.
#
# Usage: perl wordclean.pl word.html > clean.html
#        (input is read from the files named on the command line, or STDIN)
use strict;
use warnings;

# Script-level switches.
my $nostyle = 1;    # true: also drop every style='...' attribute and bare <span> tags
my $dos     = 1;    # true: emit CRLF line endings instead of LF

# clean_word_html($html, nostyle => BOOL, dos => BOOL)
#
# Returns a cleaned copy of $html. An undefined $html is treated as ''.
# Options (both default to true when the key is absent):
#   nostyle - remove all style='...' attributes and all bare <span>/</span> tags
#   dos     - convert every "\n" in the result to "\r\n"
#
# The order of the substitutions matters: later steps rely on the whitespace
# and attribute clean-up done by earlier ones.
sub clean_word_html {
    my ($text, %opt) = @_;
    $text = '' unless defined $text;
    my $strip_style = exists $opt{nostyle} ? $opt{nostyle} : 1;
    my $crlf        = exists $opt{dos}     ? $opt{dos}     : 1;

    # Re-brand the generator meta tag.
    $text =~ s/content="Microsoft Word \d+"/content="wordclean.pl"/g;

    # Collapse every run of line breaks into one space, so that '.' in the
    # patterns below can span what used to be several lines.
    $text =~ s/(?:\r|\n)+/ /g;

    # Office namespace tags such as <o:p> and </o:p>.
    $text =~ s{</?o:.+?>}{}g;

    # Comments. Non-greedy, so that each comment is removed on its own; a
    # greedy match would also delete all content between the first "<!--"
    # and the last "-->" of the document.
    $text =~ s/<!--.*?-->//g;

    # XML namespace declarations.
    $text =~ s/xmlns(?::.+?)?=".+?"//g;

    # mso-* declarations: first one that runs up to a closing quote (the
    # quote is kept), then any that are terminated by ';'.
    $text =~ s/mso-.+?:\s?.+?'/'/g;
    $text =~ s/mso-.+?:\s?.+?;//g;

    # Style attributes left empty, and (optionally) all remaining ones.
    $text =~ s/style=''//g;
    $text =~ s{style='.+?'}{}g if $strip_style;

    $text =~ s/<link rel=File-List href=".+?">//g;
    $text =~ s/class=\w+//g;
    $text =~ s{</?st1:\w+>}{}g;    # smart-tag elements

    # Whitespace normalisation: none before '>', none between tags,
    # single spaces everywhere else.
    $text =~ s/\s+>/>/g;
    $text =~ s/>\s+</></g;
    $text =~ s/\s+/ /g;

    # Span clean-up.
    $text =~ s{</?span>}{}g if $strip_style;
    $text =~ s{<span style='font-size:12.0pt;\s?'>(.+?)</span>}{$1}g;
    $text =~ s{<span[^>]*>\s*</span>}{}g;
    # Unwrap attribute-less spans pair by pair (non-greedy). Repeating the
    # pass lets tags left over from nested spans pair up on the next round;
    # every successful pass shortens $text, so the loop terminates.
    1 while $text =~ s{<span>(.+?)</span>}{$1}g;

    # Layout: put a line break in front of opening tags.
    # NOTE: '.+?' needs at least one character, so a one-letter tag such as
    # <p> is matched together with whatever follows up to the next '>'.
    $text =~ s/(<\w.+?>)/\n$1/g;
    $text =~ s/\n<b>/<b>/g;    # keep <b> inline
    $text =~ s{</(html|body|head|tr|td|table|div)>}{\n</$1>}g;
    $text =~ s/\n<html>/<html>/;    # first occurrence only

    $text =~ s/\n/\r\n/g if $crlf;

    return $text;
}

# Do the I/O only when executed as a program, so the file can also be
# loaded with require/do without consuming input.
unless (caller) {
    my $html = join '', <>;

    # The result already contains explicit "\r\n"; binary mode stops
    # DOS-like platforms from translating the "\n" a second time.
    binmode STDOUT if $dos;

    print clean_word_html($html, nostyle => $nostyle, dos => $dos);
}
|
|---|