sub do_bondagefiles { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments # Search-and-replace, with @ seperating the search part, # the replace part, and the search options. # . These are the HTML comment characters. # .*? matches anything in between the HTML # comment characters. The dot means "any character", # the * means "0 or more of", and the ? means "the # shortest match", instead of the default of the longest. # After the @ is the next argument, the replace string. # It's a single space. So will be # replaced by a single space. # After the next @ is the final argument to the RE, # the options. Options here are g, s, and i. g means # "global"; if you find the same match multiple times, # replace all of them. s means treat newlines as regular # characters, instead of treating them specially. i # means case insensitive search, which doesn't matter, # since all of the characters in the search are symbols, # which don't have a case. s/[\r\n]+/ /gs; s@^.*?(, followed by a word boundary, # followed by the literal string blacktri.gif, # followed by another word boundary, followed by # zero or more of any character, followed by the # end of the string. Replace with an empty string. # Search treats newline as normal characters, and is case # insensitive. s@(]*\bHREF\b)@\n\001\001\001\n$1@gi; # Save this: # * The literal string , # followed by # * the literal string HREF, followed by # * another word boundary. # Replace this with a newline character, three # characters with character code 1, another newline, # and the captured string. # Case treats newlines as normal characters, and is case # insensitive. my @sec1 = split (/\n\001\001\001\n/s); my @sec2 = (); foreach (@sec1) { next if (m/^\s*$/s); s@^\s*]*?\bHREF=\"([^<>\"]+)\"[^<>]*>\s*(.*?)\s*\s*@@i +s || error ("unparsable entry (url) in $url"); # Search for the beginning of the string, followed # by # (taking the shortest match), then another word # boundary character, then the string HREF=" # Save into register 1: # * one or more characters which are none of # <, >, or ". # Then look for a quote, followed by zero or more # characters which are neither < nor >, followed by # a > character, followed by zero or more spaces. # Save into register 2: # * Zero or more characters (shortest match) # followed by zero or more spaces, followed by , # followed by zero or more spaces. # Replace with the empty string. # Search is case-insensitive, and newlines are treated # as regular characters. # * my $eurl = $1; # $1 is register 1 from the above RE. my $title = $2; # $2 is register 2 from the above RE. my $date = ''; my $body = $_; $body =~ s@<[^<>]*>@@g; # lose tags in body push @sec2, ($eurl, $date, $title, $body); } return @sec2; }