in reply to Remove all duplicates after regex capture

This is essentially the same approach haukex uses, but with some different features:

Script remove_dup_lines_1.pl:
use warnings;
use strict;

use Data::Dump qw(dd);

# For every entry in %mycorpus, pull out each "title:*...*" span and print
# only those titles whose normalized form (lower-cased, whitespace runs
# squeezed to one space) occurs exactly once in that entry.  Surviving
# titles are printed in the order in which they appeared.

my %mycorpus = (
    a => "<blah blah blah blah title:*this is text I want 1* blah blah blah",
    b => "blah title:*this is text I do not want* title:*this is text I want one* blah title:*this is text I do not want* blah",
    c => "blah blah title:*this is text I do not want* title:*this is also text I do not want* title:*this is text I want A* title:*This Is Text I Do Not Want* title:*this is ALSO text I DO NOT WANT* extra stuff title:*this is text I want over multiple lines B* more stuff yada title:*this \t \t is\ttext I \t\t\t do not want* title:*this is text I want C* blah",
);

# The same single character opens and closes a title.  It is run through
# quotemeta so that it is matched literally inside the patterns below.
my $close_delim = do {
    my $delim = '*';    # single delimiter character; can be any character
    length($delim) == 1
        or die "bad delimiter '$delim'";
    quotemeta $delim;
};
my $open_delim = $close_delim;

my $rx_intro = qr{ title: $open_delim }xms;
my $rx_outro = qr{ $close_delim }xms;
my $rx_body  = qr{ [^$close_delim]* }xms;
# print "$rx_intro $rx_body $rx_outro \n";

for my $filename (sort keys %mycorpus) {
    my $content = $mycorpus{$filename};

    # normalized title => { title, order, count }
    my %record_for;
    my $position;

    while ($content =~ m{ $rx_intro ($rx_body) $rx_outro }xmsg) {
        my $title = $1;
        my $key   = normalize($title);

        # Every occurrence overwrites title/order and bumps the count;
        # only records with a final count of 1 are reported below.
        my $record = ($record_for{$key} ||= {});
        $record->{title} = $title;
        $record->{order} = ++$position;
        $record->{count}++;
    }
    # dd \%record_for;

    my @seen_once = grep { $_->{count} == 1 } values %record_for;

    for my $record (sort { $a->{order} <=> $b->{order} } @seen_once) {
        print "$filename: '$record->{title}' \n";
    }
}

exit;

# subroutines ######################################################

# Return a canonical form of a title for duplicate detection:
# lower-cased, with each run of spaces/tabs/newlines squeezed to 1 space.
sub normalize {
    my ($string) = @_;

    my $normalized = lc $string;
    $normalized =~ tr{ \t\n}{ }s;    # squeeze spaces/tabs/newlines to 1 space

    return $normalized;
}
Output:
c:\@Work\Perl\monks\Maire>perl remove_dup_lines_1.pl
a: 'this is text I want 1'
b: 'this is text I want one'
c: 'this is text I want A'
c: 'this is text I want over multiple lines B'
c: 'this is text I want C'


Give a man a fish:  <%-{-{-{-<

Replies are listed 'Best First'.
Re^2: Remove all duplicates after regex capture
by Maire (Scribe) on Aug 20, 2018 at 06:31 UTC
    Thank you very much for this (and also for your very clear explanations!).