in reply to Tips on how to perform this regex query

where NOT all, but more than 90% of the small string is included in the bi string

Hm. Your sample does not demonstrate that. The best match is 12.01% (at offset 575).

The simplest (and fastest pure Perl) method is to use bitwise-xor masking at each of the possible offsets and the count the number of nulls (zero-bytes) in the result. Here is code that demonstrates that:

#! perl -slw use strict; my $bigstring="MNRIYSLRYSAVARGFIAVSEFARKCVHKSVRRLCFPVLLLIPVLFSAGSLAGTV +NNELGYQLFRDFAENKGMFRPGATNIAIYNKQGEFVGTLDKAAMPDFSAVDSEIGVATLINPQYIASVK +HNGGYTNVSFGDGENRYNIVDRNNAPSLDFHAPRLDKLVTEVAPTAVTAQGAVAGAYLDKERYPVFYRL +GSGTQYIKDSNGQLTKMGGAYSWLTGGTVGSLSSYQNGEMISTSSGLVFDYKLNGAMPIYGEAGDSGSP +LFAFDTVQNKWVLVGVLTAGNGAGGRGNNWAVIPLDFIGQKFNEDNDAPVTFRTSEGGALEWSFNSSTG +AGALTQGTTTYAMHGQQGNDLNAGKNLIFQGQNGQINLKDSVSQGAGSLTFRDNYTVTTSNGSTWTGAG +IVVDNGVSVNWQVNGVKGDNLHKIGEGTLTVQGTGINEGGLKVGDGKVVLNQQADNKGQVQAFSSVNIA +SGRPTVVLTDERQVNPDTVSWGYRGGTLDVNGNSLTFHQLKAADYGAVLANNVDKRATITLDYALRADK +VALNGWSESGKGTAGNLYKYNNPYTNTTDYFILKQSTYGYFPTDQSSNATWEFVGHSQGDAQKLVADRF +NTAGYLFHGQLKGNLNVDNRLPEGVTGALVMDGAADISGTFTQENGRLTLQGHPVIHAYNTQSVADKLA +ASGDHSVLTQPTSFSQEDWENRSFTFDRLSLKNTDFGLGRNATLNTTIQADNSSVTLGDSRVFIDKNDG +QGTAFTLEEGTSVATKDADKSVFNGTVNLDNQSVLNINDIFNGGIQANNSTVNISSDSAVLGNSTLTST +ALNLNKGANALASQSFVSDGPVNISDATLSLNSRPDEVSHTLLPVYDYAGSWNLKGDDARLNVGPYSML +SGNINVQDKGTVTLGGEGELSPDLTLQNQMLYSLFNGYRNIWSGSLNAPDATVSMTDTQWSMNGNSTAG +NMKLNRTIVGFNGGTSPFTTLTTDNLDAVQSAFVMRTDLNKADKLVINKSATGHDNSIWVNFLKKPSNK +DTLDIPLVSAPEATADNLFRASTRVVGFSDVTPILSVRKEDGKKEWVLDGYQVARNDGQGKAAATFMHI +SYNNFITEVNNLNKRMGDLRDINGEAGTWVRLLNGSGSADGGFTDHYTLLQMGADRKHELGSMDLFTGV +MATYTDTDASADLYSGKTKSWGGGFYASGLFRSGAYFDVIAKYIHNENKYDLNFAGAGKQNFRSHSLYA +GAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNWNDSGMDVSMRRNSVNPLVGRTGVVSGKTFSGKDWS +LTARAGLHYEFDLTDSADVHLKDAAGEHQINGRKDSRMLYGVGLNARFGDNTRLGLEVERSAFGKYNTD +DAINANIRYSF"; my $smallstring="GTMARNDGQGKAAATFMHISYNNFITEVDNLNKRMGDLRDINGEAGTWVRLLN +GSGSADGGFTDHYTLLQMGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSG +AYFDVIAKYIHNENKYDLNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNW +NDSGMDVSMRRNSVNPLVGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRK +DSRMLYGVGLNARFGDNTRLGLEVERSAFGKYNTDDAINANIRYSFLE"; my $lenBig = length $bigstring; my $lenSmall = length $smallstring; for my $o ( 0 .. ( $lenBig - $lenSmall + 1 ) ) { my $masked = substr( $bigstring, $o, $lenSmall ) ^ $smallstring; my $matched = $masked =~ tr[\0][]; if( ( $matched / $lenSmall ) > 0.10 ) { $masked =~ tr[\1-\255][ ]; $masked =~ tr[\0][*]; printf "%.2f%% match at offset %u\n", $matched / $lenSmall * 1 +00, $o; print substr $bigstring, $o, $lenSmall; print $smallstring; print $masked; } }

The output:

C:\test>1070240.pl 10.06% match at offset 50 LAGTVNNELGYQLFRDFAENKGMFRPGATNIAIYNKQGEFVGTLDKAAMPDFSAVDSEIGVATLINPQYI +ASVKHNGGYTNVSFGDGENRYNIVDRNNAPSLDFHAPRLDKLVTEVAPTAVTAQGAVAGAYLDKERYPV +FYRLGSGTQYIKDSNGQLTKMGGAYSWLTGGTVGSLSSYQNGEMISTSSGLVFDYKLNGAMPIYGEAGD +SGSPLFAFDTVQNKWVLVGVLTAGNGAGGRGNNWAVIPLDFIGQKFNEDNDAPVTFRTSEGGALEWSFN +SSTGAGALTQGTTTYAMHGQQGNDLNAGKNL GTMARNDGQGKAAATFMHISYNNFITEVDNLNKRMGDLRDINGEAGTWVRLLNGSGSADGGFTDHYTLLQ +MGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSGAYFDVIAKYIHNENKYD +LNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNWNDSGMDVSMRRNSVNPL +VGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRKDSRMLYGVGLNARFGDN +TRLGLEVERSAFGKYNTDDAINANIRYSFLE * * * * * * * + * * * * * * + * * * * ** * * * + * ** * * * * + * * 10.06% match at offset 143 VDRNNAPSLDFHAPRLDKLVTEVAPTAVTAQGAVAGAYLDKERYPVFYRLGSGTQYIKDSNGQLTKMGGA +YSWLTGGTVGSLSSYQNGEMISTSSGLVFDYKLNGAMPIYGEAGDSGSPLFAFDTVQNKWVLVGVLTAG +NGAGGRGNNWAVIPLDFIGQKFNEDNDAPVTFRTSEGGALEWSFNSSTGAGALTQGTTTYAMHGQQGND +LNAGKNLIFQGQNGQINLKDSVSQGAGSLTFRDNYTVTTSNGSTWTGAGIVVDNGVSVNWQVNGVKGDN +LHKIGEGTLTVQGTGINEGGLKVGDGKVVLN GTMARNDGQGKAAATFMHISYNNFITEVDNLNKRMGDLRDINGEAGTWVRLLNGSGSADGGFTDHYTLLQ +MGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSGAYFDVIAKYIHNENKYD +LNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNWNDSGMDVSMRRNSVNPL +VGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRKDSRMLYGVGLNARFGDN +TRLGLEVERSAFGKYNTDDAINANIRYSFLE * * * * * * + ** * * * ** + * * * * * * * + * * * * * *** + * * * 10.71% match at offset 211 GAYSWLTGGTVGSLSSYQNGEMISTSSGLVFDYKLNGAMPIYGEAGDSGSPLFAFDTVQNKWVLVGVLTA +GNGAGGRGNNWAVIPLDFIGQKFNEDNDAPVTFRTSEGGALEWSFNSSTGAGALTQGTTTYAMHGQQGN +DLNAGKNLIFQGQNGQINLKDSVSQGAGSLTFRDNYTVTTSNGSTWTGAGIVVDNGVSVNWQVNGVKGD +NLHKIGEGTLTVQGTGINEGGLKVGDGKVVLNQQADNKGQVQAFSSVNIASGRPTVVLTDERQVNPDTV +SWGYRGGTLDVNGNSLTFHQLKAADYGAVLA GTMARNDGQGKAAATFMHISYNNFITEVDNLNKRMGDLRDINGEAGTWVRLLNGSGSADGGFTDHYTLLQ +MGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSGAYFDVIAKYIHNENKYD +LNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNWNDSGMDVSMRRNSVNPL +VGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRKDSRMLYGVGLNARFGDN +TRLGLEVERSAFGKYNTDDAINANIRYSFLE * * * **** * * + * * ** * + ** * * * * * * * + * * * ** * + * * * * 11.04% match at offset 227 YQNGEMISTSSGLVFDYKLNGAMPIYGEAGDSGSPLFAFDTVQNKWVLVGVLTAGNGAGGRGNNWAVIPL +DFIGQKFNEDNDAPVTFRTSEGGALEWSFNSSTGAGALTQGTTTYAMHGQQGNDLNAGKNLIFQGQNGQ +INLKDSVSQGAGSLTFRDNYTVTTSNGSTWTGAGIVVDNGVSVNWQVNGVKGDNLHKIGEGTLTVQGTG +INEGGLKVGDGKVVLNQQADNKGQVQAFSSVNIASGRPTVVLTDERQVNPDTVSWGYRGGTLDVNGNSL +TFHQLKAADYGAVLANNVDKRATITLDYALR GTMARNDGQGKAAATFMHISYNNFITEVDNLNKRMGDLRDINGEAGTWVRLLNGSGSADGGFTDHYTLLQ +MGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSGAYFDVIAKYIHNENKYD +LNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNWNDSGMDVSMRRNSVNPL +VGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRKDSRMLYGVGLNARFGDN +TRLGLEVERSAFGKYNTDDAINANIRYSFLE * * * * * * + * * * * * * * + * * * * * * * * + * * * * * * * * +* * * * * 11.69% match at offset 276 GVLTAGNGAGGRGNNWAVIPLDFIGQKFNEDNDAPVTFRTSEGGALEWSFNSSTGAGALTQGTTTYAMHG +QQGNDLNAGKNLIFQGQNGQINLKDSVSQGAGSLTFRDNYTVTTSNGSTWTGAGIVVDNGVSVNWQVNG +VKGDNLHKIGEGTLTVQGTGINEGGLKVGDGKVVLNQQADNKGQVQAFSSVNIASGRPTVVLTDERQVN +PDTVSWGYRGGTLDVNGNSLTFHQLKAADYGAVLANNVDKRATITLDYALRADKVALNGWSESGKGTAG +NLYKYNNPYTNTTDYFILKQSTYGYFPTDQS GTMARNDGQGKAAATFMHISYNNFITEVDNLNKRMGDLRDINGEAGTWVRLLNGSGSADGGFTDHYTLLQ +MGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSGAYFDVIAKYIHNENKYD +LNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNWNDSGMDVSMRRNSVNPL +VGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRKDSRMLYGVGLNARFGDN +TRLGLEVERSAFGKYNTDDAINANIRYSFLE * * * * * * * * * * * * + * * * ** * * + * * * * * * ** + * * * * * * * * + * 11.04% match at offset 310 PVTFRTSEGGALEWSFNSSTGAGALTQGTTTYAMHGQQGNDLNAGKNLIFQGQNGQINLKDSVSQGAGSL +TFRDNYTVTTSNGSTWTGAGIVVDNGVSVNWQVNGVKGDNLHKIGEGTLTVQGTGINEGGLKVGDGKVV +LNQQADNKGQVQAFSSVNIASGRPTVVLTDERQVNPDTVSWGYRGGTLDVNGNSLTFHQLKAADYGAVL +ANNVDKRATITLDYALRADKVALNGWSESGKGTAGNLYKYNNPYTNTTDYFILKQSTYGYFPTDQSSNA +TWEFVGHSQGDAQKLVADRFNTAGYLFHGQL GTMARNDGQGKAAATFMHISYNNFITEVDNLNKRMGDLRDINGEAGTWVRLLNGSGSADGGFTDHYTLLQ +MGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSGAYFDVIAKYIHNENKYD +LNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNWNDSGMDVSMRRNSVNPL +VGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRKDSRMLYGVGLNARFGDN +TRLGLEVERSAFGKYNTDDAINANIRYSFLE * * * * * + * * * * * * * +** * * * *** * * * * * * + * * ** +* * * * 10.06% match at offset 536 DKVALNGWSESGKGTAGNLYKYNNPYTNTTDYFILKQSTYGYFPTDQSSNATWEFVGHSQGDAQKLVADR +FNTAGYLFHGQLKGNLNVDNRLPEGVTGALVMDGAADISGTFTQENGRLTLQGHPVIHAYNTQSVADKL +AASGDHSVLTQPTSFSQEDWENRSFTFDRLSLKNTDFGLGRNATLNTTIQADNSSVTLGDSRVFIDKND +GQGTAFTLEEGTSVATKDADKSVFNGTVNLDNQSVLNINDIFNGGIQANNSTVNISSDSAVLGNSTLTS +TALNLNKGANALASQSFVSDGPVNISDATLS GTMARNDGQGKAAATFMHISYNNFITEVDNLNKRMGDLRDINGEAGTWVRLLNGSGSADGGFTDHYTLLQ +MGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSGAYFDVIAKYIHNENKYD +LNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNWNDSGMDVSMRRNSVNPL +VGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRKDSRMLYGVGLNARFGDN +TRLGLEVERSAFGKYNTDDAINANIRYSFLE * * * * * + * * * * * * * + * * * * * + * * * * * * * +* * * * ** * 12.01% match at offset 575 YGYFPTDQSSNATWEFVGHSQGDAQKLVADRFNTAGYLFHGQLKGNLNVDNRLPEGVTGALVMDGAADIS +GTFTQENGRLTLQGHPVIHAYNTQSVADKLAASGDHSVLTQPTSFSQEDWENRSFTFDRLSLKNTDFGL +GRNATLNTTIQADNSSVTLGDSRVFIDKNDGQGTAFTLEEGTSVATKDADKSVFNGTVNLDNQSVLNIN +DIFNGGIQANNSTVNISSDSAVLGNSTLTSTALNLNKGANALASQSFVSDGPVNISDATLSLNSRPDEV +SHTLLPVYDYAGSWNLKGDDARLNVGPYSML GTMARNDGQGKAAATFMHISYNNFITEVDNLNKRMGDLRDINGEAGTWVRLLNGSGSADGGFTDHYTLLQ +MGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSGAYFDVIAKYIHNENKYD +LNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNWNDSGMDVSMRRNSVNPL +VGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRKDSRMLYGVGLNARFGDN +TRLGLEVERSAFGKYNTDDAINANIRYSFLE * * * * * * * * * * + * * * ** * * * + * * * * * * + * * * * * ** * + * * * * * 10.06% match at offset 813 TALNLNKGANALASQSFVSDGPVNISDATLSLNSRPDEVSHTLLPVYDYAGSWNLKGDDARLNVGPYSML +SGNINVQDKGTVTLGGEGELSPDLTLQNQMLYSLFNGYRNIWSGSLNAPDATVSMTDTQWSMNGNSTAG +NMKLNRTIVGFNGGTSPFTTLTTDNLDAVQSAFVMRTDLNKADKLVINKSATGHDNSIWVNFLKKPSNK +DTLDIPLVSAPEATADNLFRASTRVVGFSDVTPILSVRKEDGKKEWVLDGYQVARNDGQGKAAATFMHI +SYNNFITEVNNLNKRMGDLRDINGEAGTWVR GTMARNDGQGKAAATFMHISYNNFITEVDNLNKRMGDLRDINGEAGTWVRLLNGSGSADGGFTDHYTLLQ +MGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSGAYFDVIAKYIHNENKYD +LNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNWNDSGMDVSMRRNSVNPL +VGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRKDSRMLYGVGLNARFGDN +TRLGLEVERSAFGKYNTDDAINANIRYSFLE * * * * * * + * * * * *** + * ** * + * * * * * * * * * * * + * * * 11.69% match at offset 970 TTLTTDNLDAVQSAFVMRTDLNKADKLVINKSATGHDNSIWVNFLKKPSNKDTLDIPLVSAPEATADNLF +RASTRVVGFSDVTPILSVRKEDGKKEWVLDGYQVARNDGQGKAAATFMHISYNNFITEVNNLNKRMGDL +RDINGEAGTWVRLLNGSGSADGGFTDHYTLLQMGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTK +SWGGGFYASGLFRSGAYFDVIAKYIHNENKYDLNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQ +AELVWGRLQGQTFNWNDSGMDVSMRRNSVNP GTMARNDGQGKAAATFMHISYNNFITEVDNLNKRMGDLRDINGEAGTWVRLLNGSGSADGGFTDHYTLLQ +MGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSGAYFDVIAKYIHNENKYD +LNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNWNDSGMDVSMRRNSVNPL +VGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRKDSRMLYGVGLNARFGDN +TRLGLEVERSAFGKYNTDDAINANIRYSFLE * * * * * * * * + * * * * * * * * + * * * * * * * * * + * * * * * * * + * * * * 10.06% match at offset 1054 ILSVRKEDGKKEWVLDGYQVARNDGQGKAAATFMHISYNNFITEVNNLNKRMGDLRDINGEAGTWVRLLN +GSGSADGGFTDHYTLLQMGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSG +AYFDVIAKYIHNENKYDLNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNW +NDSGMDVSMRRNSVNPLVGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRK +DSRMLYGVGLNARFGDNTRLGLEVERSAFGK GTMARNDGQGKAAATFMHISYNNFITEVDNLNKRMGDLRDINGEAGTWVRLLNGSGSADGGFTDHYTLLQ +MGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSGAYFDVIAKYIHNENKYD +LNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNWNDSGMDVSMRRNSVNPL +VGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRKDSRMLYGVGLNARFGDN +TRLGLEVERSAFGKYNTDDAINANIRYSFLE * * * * * ** + * * ** * * * + * * * ** + ** * * * * * * * + * * *

With the rise and rise of 'Social' network sites: 'Computers are making people easier to use everyday'
Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
"Science is about questioning the status quo. Questioning authority".
In the absence of evidence, opinion is indistinguishable from prejudice.

Replies are listed 'Best First'.
Re^2: Tips on how to perform this regex query
by choroba (Cardinal) on Jan 11, 2014 at 10:47 UTC
    It depends on whether you count deletion and insertion as possible operations, too. But if you do not, xor is a good trick.
    لսႽ† ᥲᥒ⚪⟊Ⴙᘓᖇ Ꮅᘓᖇ⎱ Ⴙᥲ𝇋ƙᘓᖇ
      It depends on whether you count deletion and insertion as possible operations, too.

      Have you found an application where 'edit distance' is actually meaningful?


      With the rise and rise of 'Social' network sites: 'Computers are making people easier to use everyday'
      Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
      "Science is about questioning the status quo. Questioning authority".
      In the absence of evidence, opinion is indistinguishable from prejudice.
        Well, besides DNA matching, maybe typo correcting?
        Update:I have also read some papers on correlation between edit distance and understanding people with speech impediments.
        لսႽ† ᥲᥒ⚪⟊Ⴙᘓᖇ Ꮅᘓᖇ⎱ Ⴙᥲ𝇋ƙᘓᖇ
Re^2: Tips on how to perform this regex query
by Anonymous Monk on Jan 12, 2014 at 00:00 UTC
    my $lenSmall = length $smallstring; for my $o ( 0 .. $lenBig - 1 ) { my $masked = substr( $bigstring, $o, $lenSmall ) ^ $smallstring; my $matched = $masked =~ tr[\0][]; if( ( $matched / $lenSmall ) > 0.90 ) { $masked =~ tr[\1-\255][ ]; $masked =~ tr[\0][*]; printf "%.2f%% match at offset %u\n", $matched / $lenSmall * 1 +00, $o; print substr $bigstring, $o, $lenSmall; print $smallstring; print $masked; } }
    98.05% match at offset 1071 YQVARNDGQGKAAATFMHISYNNFITEVNNLNKRMGDLRDINGEAGTWVRLLNGSGSADGGFTDHYTLLQ +MGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSGAYFDVIAKYIHNENKYD +LNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNWNDSGMDVSMRRNSVNPL +VGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRKDSRMLYGVGLNARFGDN +TRLGLEVERSAFGKYNTDDAINANIRYSF GTMARNDGQGKAAATFMHISYNNFITEVDNLNKRMGDLRDINGEAGTWVRLLNGSGSADGGFTDHYTLLQ +MGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSGAYFDVIAKYIHNENKYD +LNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNWNDSGMDVSMRRNSVNPL +VGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRKDSRMLYGVGLNARFGDN +TRLGLEVERSAFGKYNTDDAINANIRYSFLE ************************* ***************************************** +********************************************************************* +********************************************************************* +********************************************************************* +*****************************

      c:\test\anonymonks.pl Global symbol "$lenBig" requires explicit package name at C:\test\1070 +240.pl line 9. Execution of C:\test\1070240.pl aborted due to compilation errors.
      Correct for that and compare:
      my $bigstring="MNRIYSLRYSA..."; my $smallstring="GTMARNDGQGKAA..."; my $lenBig = length $bigstring; my $lenSmall = length $smallstring; for my $o ( 0 .. ( $lenBig - $lenSmall + 1 ) ) { my $masked = substr( $bigstring, $o, $lenSmall ) ^ $smallstring; my $matched = $masked =~ tr[\0][]; if( ( $matched / $lenSmall ) > 0.095 ) { $masked =~ tr[\1-\255][ ]; $masked =~ tr[\0][*]; printf "%.2f%% match at offset %u\n", $matched / $lenSmall * 1 +00, $o; print substr $bigstring, $o, $lenSmall; print $smallstring; print $masked; } } ]; $otherstring = q[ my $bigstring="MNRIYSLRYSA..."; my $smallstring="GTMARNDGQGKAA..."; my $lenBig = length $bigstring; my $lenSmall = length $smallstring; for my $o ( 0 .. $lenBig - 1 ) { my $masked = substr( $bigstring, $o, $lenSmall ) ^ $smallstring; my $matched = $masked =~ tr[\0][]; if( ( $matched / $lenSmall ) > 0.90 ) { $masked =~ tr[\1-\255][ ]; $masked =~ tr[\0][*]; printf "%.2f%% match at offset %u\n", $matched / $lenSmall * 1 +00, $o; print substr $bigstring, $o, $lenSmall; print $smallstring; print $masked; } } ]; print compare( $firststring, $otherstring ); __END__ 99.4% plagerised

      Original thought is a rare commodity.


      With the rise and rise of 'Social' network sites: 'Computers are making people easier to use everyday'
      Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
      "Science is about questioning the status quo. Questioning authority".
      In the absence of evidence, opinion is indistinguishable from prejudice.
        Original though is a rare commodity.
        s/though/thought/
        plagerised
        patched!
      corrected and improved
      #! perl -slw use strict; my $bigstring="MNRIYSLRYSAVARGFIAVSEFARKCVHKSVRRLCFPVLLLIPVLFSAGSLAGTV +NNELGYQLFRDFAENKGMFRPGATNIAIYNKQGEFVGTLDKAAMPDFSAVDSEIGVATLINPQYIASVK +HNGGYTNVSFGDGENRYNIVDRNNAPSLDFHAPRLDKLVTEVAPTAVTAQGAVAGAYLDKERYPVFYRL +GSGTQYIKDSNGQLTKMGGAYSWLTGGTVGSLSSYQNGEMISTSSGLVFDYKLNGAMPIYGEAGDSGSP +LFAFDTVQNKWVLVGVLTAGNGAGGRGNNWAVIPLDFIGQKFNEDNDAPVTFRTSEGGALEWSFNSSTG +AGALTQGTTTYAMHGQQGNDLNAGKNLIFQGQNGQINLKDSVSQGAGSLTFRDNYTVTTSNGSTWTGAG +IVVDNGVSVNWQVNGVKGDNLHKIGEGTLTVQGTGINEGGLKVGDGKVVLNQQADNKGQVQAFSSVNIA +SGRPTVVLTDERQVNPDTVSWGYRGGTLDVNGNSLTFHQLKAADYGAVLANNVDKRATITLDYALRADK +VALNGWSESGKGTAGNLYKYNNPYTNTTDYFILKQSTYGYFPTDQSSNATWEFVGHSQGDAQKLVADRF +NTAGYLFHGQLKGNLNVDNRLPEGVTGALVMDGAADISGTFTQENGRLTLQGHPVIHAYNTQSVADKLA +ASGDHSVLTQPTSFSQEDWENRSFTFDRLSLKNTDFGLGRNATLNTTIQADNSSVTLGDSRVFIDKNDG +QGTAFTLEEGTSVATKDADKSVFNGTVNLDNQSVLNINDIFNGGIQANNSTVNISSDSAVLGNSTLTST +ALNLNKGANALASQSFVSDGPVNISDATLSLNSRPDEVSHTLLPVYDYAGSWNLKGDDARLNVGPYSML +SGNINVQDKGTVTLGGEGELSPDLTLQNQMLYSLFNGYRNIWSGSLNAPDATVSMTDTQWSMNGNSTAG +NMKLNRTIVGFNGGTSPFTTLTTDNLDAVQSAFVMRTDLNKADKLVINKSATGHDNSIWVNFLKKPSNK +DTLDIPLVSAPEATADNLFRASTRVVGFSDVTPILSVRKEDGKKEWVLDGYQVARNDGQGKAAATFMHI +SYNNFITEVNNLNKRMGDLRDINGEAGTWVRLLNGSGSADGGFTDHYTLLQMGADRKHELGSMDLFTGV +MATYTDTDASADLYSGKTKSWGGGFYASGLFRSGAYFDVIAKYIHNENKYDLNFAGAGKQNFRSHSLYA +GAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNWNDSGMDVSMRRNSVNPLVGRTGVVSGKTFSGKDWS +LTARAGLHYEFDLTDSADVHLKDAAGEHQINGRKDSRMLYGVGLNARFGDNTRLGLEVERSAFGKYNTD +DAINANIRYSF"; my $smallstring="GTMARNDGQGKAAATFMHISYNNFITEVDNLNKRMGDLRDINGEAGTWVRLLN +GSGSADGGFTDHYTLLQMGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSG +AYFDVIAKYIHNENKYDLNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNW +NDSGMDVSMRRNSVNPLVGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRK +DSRMLYGVGLNARFGDNTRLGLEVERSAFGKYNTDDAINANIRYSFLE"; my $lenBig = length $bigstring; my $lenSmall = length $smallstring; my $threshold = 0.9; for my $o ( 0 .. $lenBig - $threshold*$lenSmall ) { my $masked = substr( $bigstring, $o, $lenSmall ) ^ $smallstring; my $matched = $masked =~ tr[\0][]; if( ( $matched / $lenSmall ) > $threshold ) { $masked =~ tr[\1-\255][ ]; $masked =~ tr[\0][*]; printf "%.2f%% match at offset %u\n", $matched / $lenSmall * 1 +00, $o; print substr $bigstring, $o, $lenSmall; print $smallstring; print $masked; } }
      98.05% match at offset 1071 YQVARNDGQGKAAATFMHISYNNFITEVNNLNKRMGDLRDINGEAGTWVRLLNGSGSADGGFTDHYTLLQ +MGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSGAYFDVIAKYIHNENKYD +LNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNWNDSGMDVSMRRNSVNPL +VGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRKDSRMLYGVGLNARFGDN +TRLGLEVERSAFGKYNTDDAINANIRYSF GTMARNDGQGKAAATFMHISYNNFITEVDNLNKRMGDLRDINGEAGTWVRLLNGSGSADGGFTDHYTLLQ +MGADRKHELGSMDLFTGVMATYTDTDASADLYSGKTKSWGGGFYASGLFRSGAYFDVIAKYIHNENKYD +LNFAGAGKQNFRSHSLYAGAEVGYRYHLTDTTFVEPQAELVWGRLQGQTFNWNDSGMDVSMRRNSVNPL +VGRTGVVSGKTFSGKDWSLTARAGLHYEFDLTDSADVHLKDAAGEHQINGRKDSRMLYGVGLNARFGDN +TRLGLEVERSAFGKYNTDDAINANIRYSFLE ************************* ***************************************** +********************************************************************* +********************************************************************* +********************************************************************* +*****************************
        Better
        for my $o ( ($threshold-1)*$lenSmall .. $lenBig - $threshold*$lenSmall + ) {
        But substr needs to handle negative offsets then.

        Left as exercise!