#!/usr/bin/perl -w
#
# Turn every occurrence of a glossary term (the text of a <dt> element) found
# in the <p> and <dd> elements of the document into a link pointing to the
# definition of that term.
#
# The document is read from the __DATA__ section and the result is printed
# on STDOUT.

use strict;
use warnings;

use XML::Twig;

# Normalize a term so that it can be used as an anchor name:
# lower-cased, with every non-word character replaced by an underscore.
sub anchor_name {
    my ($term) = @_;
    my $name = lc $term;
    $name =~ s/\W/_/g;
    return $name;
}

my $t = XML::Twig->new( pretty_print => 'indented' );
$t->parse( \*DATA );

# get the terms
my @dt = $t->descendants('dt');

# Wrap the content of each term in an <a name="..."> element and remember,
# for each term, the anchor name to link to.
# The keys are lower-cased because the matching below is case-insensitive
# and the lookup is done on the lower-cased matched text.
my %name_for;
foreach my $dt (@dt) {
    my $term = lc $dt->text;

    # a term without any word character cannot be matched in a useful way
    # (an empty term would make the regexp match everywhere)
    next unless $term =~ /\w/;

    my $name = anchor_name($term);
    $dt->insert( a => { name => $name } );    # insert the a element in the dt
    $name_for{$term} = $name;
}

# The terms are sorted by decreasing length so 'foo bar' is tried before 'foo'.
my @terms = sort { length($b) <=> length($a) } keys %name_for;

# without terms there is nothing to link (and the regexp would be degenerate)
if (@terms) {

    # Create a (potentially huge!) regexp or'ing all the texts to match.
    # - each term is quotemeta'ed so it is matched literally, even if it
    #   contains regexp metacharacters
    # - the actual word matched needs to be captured
    # - the lookarounds behave like \b for terms that start and end with a
    #   word character, but also work for terms that do not (e.g. 'c++')
    my $alternation = join '|', map { quotemeta $_ } @terms;
    my $regexp      = qr/(?<!\w)($alternation)(?!\w)/i;

    # Now go through the interesting parts of the document and match away!
    # Another option would be to get all text descendants and to 'next'
    # if the context is not right (like a heading or whatever).
    my @new_links;
    foreach my $text ( $t->descendants(qr/^(p|dd)$/) ) {
        next if $text->in_context('a');    # that's where we skip text in links

        # This is the magic method that creates a new link_to_dt element
        # wrapped around the term in the text and returns the list of
        # created elements.
        # NOTE(review): the { return_matched_elt => 1 } option is kept exactly
        # as in the original call; confirm that the installed XML::Twig
        # supports it (its documentation also describes a 'mark' method that
        # returns only the newly created elements).
        push @new_links,
            $text->split( { return_matched_elt => 1 }, $regexp, 'link_to_dt' );
    }

    # now we need to replace those link_to_dt elements with real links
    foreach my $link (@new_links) {
        my $term = lc $link->text;

        # the target is the anchor name recorded for the term; fall back to
        # normalizing the matched text so the href is never left undefined
        my $name = defined $name_for{$term}
                 ? $name_for{$term}
                 : anchor_name($term);

        $link->set_gi('a');                   # turn it into an html link
        $link->set_att( href => "#$name" );
    }
}

# time to output the whole thing
$t->print;

__DATA__

Glossary

foo
1. interj. Term of disgust. 2. [very common] Used very generally as a sample name for absolutely anything, esp. programs and files (esp. scratch files). 3. First on the standard list of metasyntactic variables used in syntax examples. See also bar, baz, qux, quux, corge, grault, garply, waldo, fred, plugh, xyzzy, thud.
bar
1. [very common] The second metasyntactic variable, after foo and before baz. "Suppose we have two functions: FOO and BAR. FOO calls BAR...." 2. Often appended to foo to produce foobar.
toto
French equivalent of foo. See also tata, tutu, titi
foo bar
This one is here just to show that this can get a little tricky!

Text

This para could describe anything really, as it references foo and bar (it actually references foo several times, including my own foo) but also to foo bar.

My own foo

My own foo is usually called toto, although wombat is also cool.

Note that the code is barred from linking... foo or bar as part of a word