#!/usr/bin/perl --
#
# Turn a table of allowed HTML tags and their permitted attributes into a
# comma-separated "tag[attr|attr],tag,..." list.
#
# The script dumps the parsed tag => attributes hash with Data::Dump and
# then prints the list followed by a blank line.
#
# NOTE(review): the tag[attr|attr] syntax looks like the value of TinyMCE's
# "valid_elements" option (see the sample after __END__) -- confirm.

use strict;
use warnings;
use Data::Dump qw/ dd /;

# One row per line: tag name first, then its attributes separated by
# commas and/or whitespace.  "(none)" and "/" stand for "no attributes".
# A digit 1-4 glued to the end of a tag name ("br1", "code2", "h11") is not
# part of the name -- presumably a footnote marker carried over from the
# table this was copied from; confirm against that table.
my $data = q{
Tag Attributes
a href, name, target, class, title, rel
abbr title
b (none)
big (none)
blockquote class, cite, lang, dir
br1 /
c2 (none)
caption class
center (none)
code2 (none)
col align, col, span, width, class
colgroup align, col, span, width, class
dd (none)
del class, datetime, cite
div class
dl (none)
dt (none)
em (none)
font1 size, color, class
h11 class, align
h21 class, align
h3 class, align
h4 class, align
h5 class, align
h6 class, align
hr /
i (none)
ins class, datetime, cite
li value
ol type, start
p align, class
pre1 class
readmore3 title
small (none)
span class, title
spoiler4 (none)
strike (none)
strong (none)
sub (none)
sup (none)
table width, cellpadding, cellspacing, border, bgcolor, class
tbody width, align, valign, colspan, rowspan, bgcolor, height, class
td width, align, valign, colspan, rowspan, bgcolor, height, class
tfoot width, align, valign, colspan, rowspan, bgcolor, height, class
th width, align, valign, colspan, rowspan, bgcolor, height, class
thead width, align, valign, colspan, rowspan, bgcolor, height, class
tr width, align, valign, colspan, bgcolor, height, class
tt class
u (none)
ul type
wbr /
};

# parse_tag_table($table)
#
# Parse a tag/attribute table (format described above $data).
#
#   $table - string holding one row per line
#
# Returns a hash reference mapping each tag name to an array reference of
# its attribute names (empty array for tags without attributes).  Blank
# rows are skipped.  A header row is NOT treated specially; the caller
# removes it.
sub parse_tag_table {
    my ($table) = @_;
    my %attrs_for;

    ROW:
    for my $row ( split /\n/, $table ) {

        # Drop empty tokens (a row with leading whitespace yields one) and
        # the "(none)" / "/" placeholders; whatever comes first is the tag.
        my ( $tag, @atts ) =
            grep { length($_) && $_ !~ m{ \(none\) | / }x }
            split /[\s,]+/, $row;
        next ROW unless defined $tag;

        # Remove one trailing marker digit only.  h1..h6 are real tag
        # names, so they are left alone: "h11" becomes "h1" while "h3"
        # stays "h3".  Deleting every 1-4 would merge h1-h4 into "h".
        $tag =~ s/(?<=.)[1-4]\z// if $tag !~ /\Ah[1-6]\z/;

        $attrs_for{$tag} = \@atts;
    }

    return \%attrs_for;
}

# format_valid_elements($attrs_for)
#
# Render the hash produced by parse_tag_table() as a single string.
#
#   $attrs_for - hash reference: tag name => array reference of attributes
#
# Returns the tags in sorted order joined by ",".  A tag with attributes is
# written as "tag[attr1|attr2]", a tag without any as plain "tag".
sub format_valid_elements {
    my ($attrs_for) = @_;

    return join ',', map {
        my @atts = @{ $attrs_for->{$_} };
        @atts ? $_ . '[' . join( '|', @atts ) . ']' : $_;
    } sort keys %{$attrs_for};
}

my %f = %{ parse_tag_table($data) };
delete $f{Tag};    # the "Tag Attributes" header row is not a tag

dd( \%f );

my $valid_elements = format_valid_elements( \%f );
print $valid_elements, "\n\n";

__END__
Expected output of the script above.  The whitespace of the dump is laid
out by hand and may differ from what Data::Dump actually emits.

{
  a          => ["href", "name", "target", "class", "title", "rel"],
  abbr       => ["title"],
  b          => [],
  big        => [],
  blockquote => ["class", "cite", "lang", "dir"],
  br         => [],
  c          => [],
  caption    => ["class"],
  center     => [],
  code       => [],
  col        => ["align", "col", "span", "width", "class"],
  colgroup   => ["align", "col", "span", "width", "class"],
  dd         => [],
  del        => ["class", "datetime", "cite"],
  div        => ["class"],
  dl         => [],
  dt         => [],
  em         => [],
  font       => ["size", "color", "class"],
  h1         => ["class", "align"],
  h2         => ["class", "align"],
  h3         => ["class", "align"],
  h4         => ["class", "align"],
  h5         => ["class", "align"],
  h6         => ["class", "align"],
  hr         => [],
  i          => [],
  ins        => ["class", "datetime", "cite"],
  li         => ["value"],
  ol         => ["type", "start"],
  p          => ["align", "class"],
  pre        => ["class"],
  readmore   => ["title"],
  small      => [],
  span       => ["class", "title"],
  spoiler    => [],
  strike     => [],
  strong     => [],
  sub        => [],
  sup        => [],
  table      => [
                  "width",
                  "cellpadding",
                  "cellspacing",
                  "border",
                  "bgcolor",
                  "class",
                ],
  tbody      => [
                  "width",
                  "align",
                  "valign",
                  "colspan",
                  "rowspan",
                  "bgcolor",
                  "height",
                  "class",
                ],
  td         => [
                  "width",
                  "align",
                  "valign",
                  "colspan",
                  "rowspan",
                  "bgcolor",
                  "height",
                  "class",
                ],
  tfoot      => [
                  "width",
                  "align",
                  "valign",
                  "colspan",
                  "rowspan",
                  "bgcolor",
                  "height",
                  "class",
                ],
  th         => [
                  "width",
                  "align",
                  "valign",
                  "colspan",
                  "rowspan",
                  "bgcolor",
                  "height",
                  "class",
                ],
  thead      => [
                  "width",
                  "align",
                  "valign",
                  "colspan",
                  "rowspan",
                  "bgcolor",
                  "height",
                  "class",
                ],
  tr         => [
                  "width",
                  "align",
                  "valign",
                  "colspan",
                  "bgcolor",
                  "height",
                  "class",
                ],
  tt         => ["class"],
  u          => [],
  ul         => ["type"],
  wbr        => [],
}
####
valid_elements :
"a[href|name|target|class|title|rel],abbr[title],b,big,blockquote[class|cite|lang|dir],br,c,caption[class],center,code,col[align|col|span|width|class],colgroup[align|col|span|width|class],dd,del[class|datetime|cite],div[class],dl,dt,em,font[size|color|class],h1[class|align],h2[class|align],h3[class|align],h4[class|align],h5[class|align],h6[class|align],hr,i,ins[class|datetime|cite],li[value],ol[type|start],p[align|class],pre[class],readmore[title],small,span[class|title],spoiler,strike,strong,sub,sup,table[width|cellpadding|cellspacing|border|bgcolor|class],tbody[width|align|valign|colspan|rowspan|bgcolor|height|class],td[width|align|valign|colspan|rowspan|bgcolor|height|class],tfoot[width|align|valign|colspan|rowspan|bgcolor|height|class],th[width|align|valign|colspan|rowspan|bgcolor|height|class],thead[width|align|valign|colspan|rowspan|bgcolor|height|class],tr[width|align|valign|colspan|bgcolor|height|class],tt[class],u,ul[type],wbr"