#! perl -slw
use strict;

## Find a small set of paragraphs that, between them, contain every word
## in a word list (a greedy cover; it is not proven to be minimal).
##
## Usage: 845818 textfile
##   The word list is read from '845818.idx', one word per line.
##   The text is read in paragraph mode from the file(s) named on the
##   command line.
##
## The -l switch on the #! line makes every print() append a newline;
## printf() is unaffected, hence the explicit "\n" in the formats below.

## Load the words list
my @idx;
{
    local( @ARGV ) = '845818.idx';
    @idx = <>;
};
chomp @idx;
print "Words: ", scalar @idx;

## Compile one pattern per word, once, instead of re-interpolating the
## pattern for every word of every paragraph.
## NOTE: only a leading \b is used, so 'cat' also matches 'category';
## this (case-insensitive) prefix matching is deliberately left as it was.
my @wordRe = map { qr/\b\Q$_\E/i } @idx;

$/ = ''; ## Paragraph mode

## hash of bitstrings; keyed by para number;
## each bit represents one word in @idx
my %paras;
while( defined( my $para = <> ) ) {
    $paras{ $. } //= '';                  ## Initialise bitstring
    for my $i ( 0 .. $#wordRe ) {         ## for each word index
        ## If the word is found in the paragraph, set the bit for
        ## this word in the bitstring for this para
        vec( $paras{ $. }, $i, 1 ) = 1 if $para =~ $wordRe[ $i ];
    }
}
print "Paras: ", scalar keys %paras;

## Count the bits set (words found) once per para, rather than
## unpacking both operands again in every sort comparison.
my %wordCount = map { $_ => unpack( '%32b*', $paras{ $_ } ) } keys %paras;

## Order keys (para numbers) by the words found (descending).
## Ties are broken by para number so that the result does not depend
## on the (randomised) order in which keys() returns the hash keys.
my @parasByWordCount = sort {
    $wordCount{ $b } <=> $wordCount{ $a } or $a <=> $b
} keys %paras;

## Start with an empty set and an empty mask. The para with the most
## words is considered first, so it is picked up by the first iteration
## (provided it contains at least one word). Starting empty also copes
## with there being no paragraphs at all.
my @set;
my $mask = '';

## Consider the paras in descending order of word count
for my $next ( @parasByWordCount ) {
    ## If adding (ORing) the next bitstring to the mask makes no
    ## difference, $next contributes nothing new: just discard it
    my $merged = $mask | $paras{ $next };
    next if $merged eq $mask;

    ## $next contains words not yet covered,
    ## so add it to the set and the mask
    push @set, $next;
    $mask = $merged;
}

## Now we have a set of paras that covers all the words that occur
## anywhere in the text.
## Is it minimal? For my artificial testcase it appears to be.
## But I offer no proof. That's for others (probably to discount).

## The report below claims full coverage; say so (on STDERR) if some
## words were never found in any paragraph.
my $covered = unpack '%32b*', $mask;
warn sprintf "Warning: only %d of %d words occur in the text\n",
    $covered, scalar @idx
        if $covered < @idx;

printf "All the words are covered by %d paras:\n", scalar @set;
printf "%3d : %s\n", $_, unpack 'b*', $paras{ $_ } for sort{ $a<=>$b } @set;
printf "All : %s\n", unpack 'b*', $mask;

__END__
c:\test>845818 fannyhill.txt
Words: 599
Paras: 612
All the words are covered by 34 paras:
 62 : 11000010011000001000111000001110000001100100000000000000000001101101000001000000000000000000101100000010001000000000100000011000000000000001000100000000001000000010010000000100110000000001000101000000001110010000010110010001111000000010001100001001011001100000001100100000000000000100000100000000000000000000000010000000001010000011110000101000000110100000001010110001011011000000000000001000000101100010100001000000001011000010000000010000001001010000000100100000001000111000010010010000000000001000010000001000110001010011100001010001000111000000010001000001000011101101110110100100001110100001000
 82 : 11100000000010001010111000110110110101000000001000000110000001101100000000100000000001000000100001101010001110001000001000100000010000000001010110000000001001100000000010100001100000000000000001000100000110011100100111001000000010100000011010001100000000100000010110011000100000000100000001000000001000100000010010000000000000100000111011000010001011100000001101100111000110000100100000000000000000000100010000001000000010000000000000100000000000001000010010100000000000001000000000110010000000011000000000010000110101100100100100100101011000100000001110000010000000101001100001101000001101000111000
139 : 
11100000011010001000111110000110110101100000100000100110001001101000000000001000000010000000010000000000000001000001000000000100010000000001010000000000000000000110000010100000100110011000000101100000000110111000010110010001100011100000001010001000000001110110000110000000100000011000000101010000000100000000000010000000000010010010000000100000000011001001001000101001100100010000000000000000000100000010100000000000000000000000000000000000001011000000000000110000010100001001010000000010000000100001000000001001111111100100100000000011001100000000000111000001010000111001100000100010000100000101100 197 : 11000000000000001000111000000110110001000101000010100001001001101000000100000000100001000000000100000010001000000000010000000000000110000001000000100001000000001100010010000001100010000000000101000000000110011010000110010000000111100010001010001000000000110010001110000000100000000000000101000000001111000000010011000000000000000000101000100000000010101100001100101110000010000100100000010100100100000010100000110100000000000010000000001101001010110000000000000100000000000000000000010100000000000000000000001000110111100000100000000011000110100010101100000000000000001001100100101110000100100110000 248 : 11010001011000001000111110000110110001010011000000000000000001000000000010000000000001001100001000000000000000010000000100000010000110000000100100000000000001100000000011101000110000010000011001001000000110000000010110100001101011101010001000001100010010000000000000000100010000000010001001011000100000000110000010000000000000000110100000000000000010110000001100101001100001000110000011010000000000000010000000010001000000100001000000000010001000000000000000110000100000001000000000000000000000010000000111000001110001100010100000000101010110100000000110000000000000100000000000100011000100000101001 253 : 
1110000001100000001011100011011011000100111100000011010000000110001000000100000010000100000000000111101000100100000000000000000001011000000000000010010000000110001000001000010010000001100000000100010000011101100001111000000000001110001100100000100010000010000000010010000111100000100111001001010000000000000001001000000100011000001011000001000101001110100000110010000100000000010011000000000011010000001010010000110000000010010000000000100100000000100000010000000010000000100001010100001001000000100000000001101111000100101010011000110100011000000000100000010100000010000001110010101100010110 290 : 11000000011000011010111000000110110001000000000000000100000100101100000000001100000011100000100000010011101000000000010000001000000000000000100000000000000001000010000110000000101010010101010001001100000110011000001110000001101011110000001000001100100000100000000000000000000001001100000010011000000010000000000110000001001000011011100000000000000010111000001000100100000110001000000000100100000000010000010000000000000000000000010100000001001010000000100000111000001000011001000100000000101010101000100001000001110100000100100110000011000000100010000110000010100000111000000010100110001100000101010 307 : 11000000000000001000111000110110110101100000101000010110001001111000000000100000000000000000000010000010001000000000011000010010010110100001011000101000001000000100000010000000100010000100000001000000000111000000000110000000000011100000001000001000010010110110010000100000100000010101000011110000100011001000000010010011000010010000100000100000010010101000001100111010001110000100100011000000000100000000001000000000000000000000000000000001001010001000000001100001010000001101010000100110001000000000010000000000110001010010001000001101100000000000001100000010000011111101100000111011000111000111001 326 : 
11010000100000101000111000001000110001000010000000001100000001100000000000000000100001000000000010010000000000000100000000100000010000001001000000101000010000000000000010010001100000000000000100000010100110011100110110000101110010100000001000001100000000110000000000000000010000000101010000010000000010000000000110000000000010000010100010100000000011100000001000101001000000010000000100110000001100000100000001100100000011000100100000000001000000001000000000110000100000001000000000000000001100100100000111100000011100001000100000010001000100000000000000000010000100101001100000101010010100000101100 335 : 11000000000010001000111100111110110111000011001000100000000101100010000001101001000001000000001110000010001000000100001000111000000110011000010001000101100001110000010011000000110110001000010001100010000110010000101111010010000111110011001010001100010011111101110110000000101100110001010111100000000010110000010010010001001010110000111000110000000011111000001100101100001010000101100000000001001101100110000000010000001110100011000100001000010000000011010000100000110000011110010000000011000000000000010111001001111101111100101100001001000111000000001111100100100011111111100100111010011111000010000 383 : 11110100000000101000111111000110110011000001000010000010011001101000000001001001000001100000000000000010001000000000000000101001000110000000100100100100000100000000000111000000100000000000011001100100000111011000010110110000000011100110001100011001010001000001000000100000010001010001010010000001000010000000000010000100010010100000110000000001000011100000001000101101001010000000000010000000000000000000000100000001000000000000000000001010001000000000000000000001000000001001000000000010000000000000000000011011011100101000101000011111100110010000001000000000000000101101110100100011100101000011000 384 : 
11000100011000101000111110000110110001001100000000000100000001111000000000000000001001010000000000000010100001000000000000100000010000000110010000000000010010010010000110000000100000000000000001100000000110000000011111000000000010100110001010001001000010100000000000000100000001000000100001010000000000000100000000000000000010000010100010000001100111100000011001111001011100000100100000100000000000000010100000010000000000000000010000000000001110000000000000100000100000001001000000010000100000101000000101000001110101100011101110011101100000000000000111000000000000101011110100101010000101010001100 385 : 11000000000000001101111101000110110011000101000000100101000001101000000001100000101001000000000001000110001001000000000100000000010000001110000100000101101000000110010010000100101000001000000011000100000110010000000110001010000011100011011010001100110011110001001110100001110000010100000010000000000011010100000010000000010000000010100000100100110010101000001100111110001100010100100000000000100101000110000000110001000000000001000000000000000000001000010100000000010000001001000000000000010000000001000001000001110101100100100110000101010111100000001111101000001010111001110010101000000101000111100 392 : 11010000011000001000111011110110111111000010101000100000000001101100100000000000010001010001000011100110010000000000000000000001000110010001100111000111000000010110010010010101100011000100000000000010000111000000010111000000000010100000001000001000100000110000001000000000110000100000000001010000000010010100000010000000000100000000101110100010010010111100101100111001000111000100000011000101000000000100000000000000000000000000001101000000001100000001010100000000101000101101100000000000010010000000000000000001110101101100101000000011000110101000000111000010000000111100000110101110010101000101001 393 : 
11000010111110001100111000110110110101000011001010001000000001101000100010001000101001000000010001110011101100000000101000001000010100000101000100000110001101100000001010100100100000010100011100000000000110010000010110111000000010100010001000001000010111100010000101000010101000000001010010010100000000010100000011100011110000100011110000100001000011101100011100100111000110010100110000100000000001010100000000001100011000000000000100010001001101001101000000000000001010001101000000000010001010111000010110010000111101110010100110000111010110000011001110000010100000111110010110100000000100000101101 395 : 11100000011011001000111000111110110001100001001010010000100001100011000010000000000011000000000001000010101000010010000000101000010100000101000001000000011101100110100010100000100000010000000011101010011110011010111110110000000010100100001100011000000110110010010101001000010101011001100101110000100000010100010011111110000110100000111000010000010010111101001000101111010111010101110000010100011100010110000000010101000010100001001000100001001100111010001000000000001001001101110000000110001101100000110000001101110101100011100000000111001111000100000110000010000000101001110100101010000101100011100 398 : 11000000011000001001111010000110110001000011001001000000000011110010000010001010001101100000000001010010001000000000000000000000000000000001000000100001000000000010000110000100000000000001011001000100001110000000000110000001110011110000001000101000000010110001011000100000000001010001000010000001001011010000110111100001010010000010100010100000010010101000001011110001001110000100110000000101100111011000000001010010000000000000000000000001001000000000010000100000100110001100010000010000001100000000000110000001110100000011100000000001010111000100001100000000000100111001110110110010100101100011000 404 : 
11000000011100101000111100000110110111000111001011110100100011110000110000000101000001010001110111111010001000010000000001110000010110001101000101000111111001100010000011000000110011010111010001011010000110011100010111110001100010110011101001011100111011110110000111101001100000011000000101010000001010010100000010010111100110110010111011110000000110111100011000111111010111110100000011010000101101001010000000010000100000000111011100010001001010011100000010000100011000001101010010001010000000100010000110001011111101110110101110111101011111010010001111000111100011101101110110101011001101000111101 415 : 1110000000000001100011100000011011000100000000001010001001000111001000000110000100001100000000111000001000000100101010000001000000011100000100000010100110000001001001001010000011001000000100010100100000110101000001111101000110111110000000100010100001111011011111011000000101001011000011000000000011101000000000001001000100000010000000000000000010001011100100100011101100111001010000000000000000110000001110000000100001010000001000001000000000100100000000000011111101011000110001000100000010000001000011000010000111110100110000000010011100000010101101011100010101000010100111011010101000011100 421 : 11101000000011001000111000110110110001010000011100100000010001110000000000100000000001000000110111100010001000001110001000011000000110000001011000000100001000001100000010100000100011011001110000000000001110010000000111010001101011110000101010001000000110111110010100000010101010110001000111000000101110000000010011010011001010010000000011110000000010101000001100101010011111000000100000000011000101000000000010000100000011000110000000001010000000000000000000110000011101001100000001100110101100000000010110001001111001101100100000110111100111101000010111000000000000101001100100101111010100100101001 424 : 
11101000011010001000111000000110110011011000101000100111100101110011000000100100000011000000010010010010001001101000011000110000010110001101111100100000001000010011010110110000110010011000010010000010000110011010000111000001100111110000001110001100010011110101010110000000100110110011100011000000001010001000000010000011000110000010001010101110010010111000001110101001000110001101100011010011011100000011010000000001010000001110111110001100001001010000010000110100011000011101011100100000101000100000111000001001111101101100101110011111100111000010100111000001100010101111100110101110010101000101001 433 : 11001000000011001000111001000110110001100011101011000010000001111001000000100000000010000000010111011010011000000000001000101001010110001101100101100001101101100100001010000100100011100000000010000011010111000000010111001000000010110011001000011100010110110110000110101000100000000001010000000010000010000000010010000010000010110010101000000010000110101000001100111001001110000100100100000011100111000110000010001011001010000000110100001111100000001001000001111010001000001000011010000110100000100000010000101001111001011100101000101101100101101000001111000101000011101101110111100011000111000010000 435 : 11100010011000000000111110000110111001001101100000000000001001010000111000011001000001010000000000000000000010000000000000101100110110100000000000000000010000000000000010010100100001010110000001000100000110000000010110000001100000100000001001001000000000110011000000100000000001001000000000000000000010000100000010000000000001000110100011000000000110101001011100101001100110101110100000010100101110000010001000110000100010010010000000000000001000010000000001100001100000001000000000000000000000100000010000001000110000100010000000000101000000010000000101000010000000101010011010100010100101000100000 442 : 
11010000011000111000111001001111110001000010100001000000001001110000110000001000111001000000100000010000000010000000010000000000110000000001000000100000000010000010000110000100100010010001000101000100000110000000011110000000000011110000001000001101000011100001100000000010000000100001001010000000000010010101000010000001001110100010110110000001101011111010011110111011011101000000110000010100000000000000000100010011000000000010000000010000001000010001110000000001010100001000000100000010110000000000000000000000111100101001101110000101010100010100001110000010000100111011110101101110011101100101100 448 : 1101000001100001100011100000111011000100000100000100000000011110001000100001010001000100000000000000000000000000000001100010100101011000000000000011000000000001110000001000011010000000011000000100101000011000000000011000000111001110001100100000100100001110000001000100001001000001010000000100000000001000011000001000010100000000011010001110000010001011100000100111100101111001000011000001011000010100000000100010000001000001000000000010000000000000000001000010000000000000100000010000000000000010001000000010000111000111001000111100001100011000000000111000001101100011101110010010101000010110 456 : 11100001011011001100111010001110110101100111101010010111000001110000111011010000110001001110110100010011100000000011110000100100010110000000011101000001111001100110000110100000111110000101010101001010000110110010010110010101100111100011011010001101010110100011010110100000110000011100010011000000000110110100100011000101100011110110111000100000111010101001001100111111000111010100101011000001010101000110111010010000000010011011001001011101001111000010100000110100101101001101111000010100000001101001000000101011111101100111101110111011001101100100000111110000100010111101110111111011000111010110000 458 : 
11100000011010001000111010001110110001100010101000001000000001100010110001000000000001011000010000010011001000001011010000100100000111010000000000000100000000000000000010000000100000010100000101001110100110100000001111000000000011110011011110001000100000100110010100000010100000010110010011100000000011101100000010010001011000100000110000000000100010111000101000110101010110010000001000000010000101000010100100010000110000000110001000100010000000001110010000100100110000001000010000000011001000101000100000001010110101000110100110000111000110100010100110000101100000111101110100111010000110001010000 467 : 11000000100011011000111010110110110111000110011010101101000101101100100000000110000000010000100101110010101011000010000000100000011100000001100111000001110010010100000010100100100010000000010101100100000110011100010111000011110011110011001010001100000011110000000100101000100010000001011011010000000010100100010011010001000010000000111000000010001010101000011000111001011111000101101011100010001001000100010000100000000000100011110100010001001000000001000000110101110000001100000001110110000000101000010000101011110101010010101111100111011110100011011100000011001000111011110110101011001101001001100 472 : 11100001000000100000111111000110110001000000001000000000000011100000000000011000000100011100101001000000000000000001000010001100010000000000000000101000000100000000000010000000110001110000000000000010010110011100010111010001100011100100001000001100000001100011000000000000000000001001000010010000011010000000000010000000000010001100100000000000000010101000001111101101010000100101000000010000000101000000000000000000000000000000000000001000001000100000000000100000000000001000000101000010010000100000000000101010111001101000111100010001000000110010001001110000000000111001110110100010100100000111100 499 : 
11000000000011001000111000000110110011101011100000001010010101100000110000000000100001000000100011001110001000000010000000100100000100110100011000100001000010010110000010000000100100001000010001001100000111011010100111001001110010100011001000001110010111110011010001000000000000100101010000000000000110000110000110000000000010101010101000110010010010101100001000100101001111010101100000000000000101000110100000001000000100000010000100001111001100001110010000000000101101001101010001100010110000101000100000000001110101110100101000101001000111000010100100000100010000111001110111101011000101001011000 506 : 11100010011011011010111010001110110001010111101000100010001101100011100000010000010001011000000000010111011110100000000010100101010100101101111000100100010011110011001010100000100000011001010001001110000111011100011111000001111011110011001110011000111010100001110000000010001011011001100010000000011011011000000011000000000010100110100011100000000110111000001101111011000100110100100011000101010001110011000000110100000000010000110000000010001000100000010100100000011000101101000000000010110001101001000100100001111100001011101111011011100110000010111110000010010011111111110111111011000100100100000 517 : 11000000000011111000111011000110110001000010001001000000000001110000001000010010010001000001000000000010000000000000001000000001000000000000000000000000000000010000000010000000100001000000000000000010000110000011010110000001100000010011001000001000000100111000001000100000010000001000000110000000000000000110000010000001000010100100000100000000000010100010001000101101011111000100110100000000000000000010000001010000000000010100000000110001001010000000000100100000100000001001100000000000000000000000000100000001111100111010100100000011010000000100001100000010000000101001111100100010110100100110000 525 : 
11000000000000000000111000000110000011000010001000000100010001000000000000000010100001000000000000000111000010000000000000000000010000000000010000000000001000011000010010000000110001100000010001000100000110000000010111010111101010100011001000001100000001110100000000010000000000010000000011000000001000001100001010000001010010100010100010100000000010101000001000101000011100000101000000110001001000010000000010000000011000000000000100000000001001000000000010100011000011011001100000100010000000100000000000000001110001001100001000010001000101001000001101110010000000101101100010100111000111000001001 557 : 11000001000000000000111010000110110011000000001000000000000001101110001000010000000000010000000000000010001010000000000100000001000000000000100000000000000000000001000111001001100000010000010001110000000110000000000111000001110010100011001011001000110000110000000000100000000100101011010000000010000001000000000010000010000001000110100011000000110010101010111100101101001110000000000011000010001100000000010000100000000000000001000000010001001001000000000000000000000000001000000000000000000000110000000000000001010000000000100000000001000000101010101001000001010010111111100111100010110110100001100 All : 11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111