in reply to Re^4: need help with a regex
in thread need help with a regex

I think we're getting somewhere, though wrapping the data in a <readmore> tag would be helpful for later reviewers of the thread. Rather than slurping the file, you can use the newlines to split alternating records. A base level while on the file handle, followed by an if on whether it is a header or not, and then a while(m//g) to iterate over the matches. Since you throw away the word characters you were matching before and after your sequence capture, you can drop those so pos and length will yield your expected result (see jwkrahn's insight).

#!/usr/bin/perl use strict; use warnings; my $header; while (<DATA>) { if (/^(>.*)/s) { $header = $1; } else { while (/([VMFWLCAI]{8,})/mg) { my $sequence = $1; print $header, "contains $sequence at position ", pos() - +length($sequence), "\n"; $header = ""; } } } __DATA__ >P31946 | Homo sapiens (Human). | NCBI_TaxID=9606; | 246 | Name=YWH +AB; MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQK +TERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFYLKMKGDYFRYLSEVASGDN +KQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTL +NEESYKDSTLIMQLLRDNLTLWTSENQGDEGDAGEGEN >P62258 | Homo sapiens (Human). | NCBI_TaxID=9606; | 255 | Name=YWH +AE; MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLSVAYKNVIGARRASWRIISSIEQKE +ENKGGEDKLKMIREYRQMVETELKLICCDILDVLDKHLIPAANTGESKVFYYKMKGDYHRYLAEFATGN +DRKEAAENSLVAYKAASDIAMTELPPTHPIRLGLALNFSVFYYEILNSPDRACRLAKAAFDDAIAELDT +LSEESYKDSTLIMQLLRDNLTLWTSDMQGDGEEQNKEALQDVEDENQ >Q04917 | Homo sapiens (Human). | NCBI_TaxID=9606; | 246 | Name=YWH +AH; Synonyms=YWHA1; MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKT +MADGNEKKLEKVKAYREKIEKELETVCNDVLSLLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVAS +GEKKNSVVEASEAAYKEAFEISKEQMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAEL +DTLNEDSYKDSTLIMQLLRDNLTLWTSDQQDEEAGEGN >P30450 | Homo sapiens (Human). | NCBI_TaxID=9606; | 365 | Name=HLA +-A; Synonyms=HLAA; MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME +PRAPWIEQEGPEYWDRNTRNVKAHSQTDRANLGTLRGYYNQSEDGSHTIQRMYGCDVGPDGRFLRGYQQ +DAYDGKDYIALNEDLRSWTAADMAAQITQRKWETAHEAEQWRAYLEGRCVEWLRRYLENGKETLQRTDA +PKTHMTHHAVSDHEATLRCWALSFYPAEITLTWQRDGEDQTQDTELVETRPAGDGTFQKWASVVVPSGQ +EQRYTCHVQHEGLPKPLTLRWEPSSQPTIPIVGIIAGLVLFGAVIAGAVVAAVMWRRKSSDRKGGSYSQ +AASSDSAQGSDMSLTACKV >Q156A1 | Homo sapiens (Human). | NCBI_TaxID=9606; | 80 | Name=ATXN +8; MQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ +QQQQQQQQQQ >Q9UQB9 | Homo sapiens (Human). | NCBI_TaxID=9606; | 309 | Name=AUR +KC; Synonyms=AIE2, AIK3, ARK3, STK13; MSSPRAVVQLGKAQPAGEELATANQTAQQPSSPAMRRLTVDDFEIGRPLGKGKFGNVYLARLKESHFIVA +LKVLFKSQIEKEGLEHQLRREIEIQAHLQHPNILRLYNYFHDARRVYLILEYAPRGELYKELQKSEKLD +EQRTATIIEELADALTYCHDKKVIHRDIKPENLLLGFRGEVKIADFGWSVHTPSLRRKTMCGTLDYLPP +EMIEGRTYDEKVDLWCIGVLCYELLVGYPPFESASHSETYRRILKVDVRFPLSMPLGARDLISRLLRYQ +PLERLPLAQILKHPWVQAHSRRVLPPCAQMAS >O75366 | Homo sapiens (Human). | NCBI_TaxID=9606; | 819 | Name=AVI +L; MPLTSAFRAVDNDPGIIVWRIEKMELALVPVSAHGNFYEGDCYVILSTRRVASLLSQDIHFWIGKDSSQD +EQSCAAIYTTQLDDYLGGSPVQHREVQYHESDTFRGYFKQGIIYKQGGVASGMKHVETNTYDVKRLLHV +KGKRNIRATEVEMSWDSFNRGDVFLLDLGKVIIQWNGPESNSGERLKAMLLAKDIRDRERGGRAEIGVI +EGDKEAASPELMKVLQDTLGRRSIIKPTVPDEIIDQKQKSTIMLYHISDSAGQLAVTEVATRPLVQDLL +NHDDCYILDQSGTKIYVWKGKGATKAEKQAAMSKALGFIKMKSYPSSTNVETVNDGAESAMFKQLFQKW +SVKDQTMGLGKTFSIGKIAKVFQDKFDVTLLHTKPEVAAQERMVDDGNGKVEVWRIENLELVPVEYQWY +GFFYGGDCYLVLYTYEVNGKPHHILYIWQGRHASQDELAASAYQAVEVDRQFDGAAVQVRVRMGTEPRH +FMAIFKGKLVIFEGGTSRKGNAEPDPPVRLFQIHGNDKSNTKAVEVPAFASSLNSNDVFLLRTQAEHYL +WYGKGSSGDERAMAKELASLLCDGSENTVAEGQEPAEFWDLLGGKTPYANDKRLQQEILDVQSRLFECS +NKTGQFVVTEITDFTQDDLNPTDVMLLDTWDQVFLWIGAEANATEKESALATAQQYLHTHPSGRDPDTP +ILIIKQGFEPPIFTGWFLAWDPNIWSAGKTYEQLKEELGDAAAIMRITADMKNATLSLNSNDSEPKYYP +IAVLLKNQNQELPEDVNPAKKENYLSEQDFVSVFGITRGQFAALPGWKQLQMKKEKGLF >Q9UPA5 | Homo sapiens (Human). | NCBI_TaxID=9606; | 3926 | Name=BS +N; Synonyms=KIAA0434, ZNF231; MGNEVSLEGGAGDGPLPPGGAGPGPGPGPGPGAGKPPSAPAGGGQLPAAGAARSTAVPPVPGPGPGPGPG +PGPGSTSRRLDPKEPLGNQRAASPTPKQASATTPGHESPRETRAQGPAGQEADGPRRTLQVDSRTQRSG +RSPSVSPDRGSTPTSPYSVPQIAPLPSSTLCPICKTSDLTSTPSQPNFNTCTQCHNKVCNQCGFNPNPH +LTQVKEWLCLNCQMQRALGMDMTTAPRSKSQQQLHSPALSPAHSPAKQPLGKPDQERSRGPGGPQPGSR +QAETARATSVPGPAQAAAPPEVGRVSPQPPQPTKPSTAEPRPPAGEAPAKSATAVPAGLGATEQTQEGL +TGKLFGLGASLLTQASTLMSVQPEADTQGQPAPSKGTPKIVFNDASKEAGPKPLGSGPGPGPAPGAKTE +PGARMGPGSGPGALPKTGGTTSPKHGRAEHQAASKAAAKPKTMPKERAICPLCQAELNVGSKSPANYNT +CTTCRLQVCNLCGFNPTPHLVEKTEWLCLNCQTKRLLEGSLGEPTPLPPPTSQQPPVGAPHRASGTSPL +KQKGPQGLGQPSGPLPAKASPLSTKASPLPSKASPQAKPLRASEPSKTPSSVQEKKTRVPTKAEPMPKP +PPETTPTPATPKVKSGVRRAEPATPVVKAVPEAPKGGEAEDLVGKPYSQDASRSPQSLSDTGYSSDGIS +SSQSEITGVVQQEVEQLDSAGVTGPHPPSPSEIHKVGSSMRPLLQAQGLAPSERSKPLSSGTGEEQKQR +PHSLSITPEAFDSDEELEDILEEDEDSAEWRRRREQQDTAESSDDFGSQLRHDYVEDSSEGGLSPLPPQ +PPARAAELTDEDFMRRQILEMSAEEDNLEEDDTATSGRGLAKHGTQKGGPRPRPEPSQEPAALPKRRLP +HNATTGYEELLPEGGSAEATDGSGTLQGGLRRFKTIELNSTGSYGHELDLGQGPDPSLDREPELEMESL +TGSPEDRSRGEHSSTLPASTPSYTSGTSPTSLSSLEEDSDSSPSRRQRLEEAKQQRKARHRSHGPLLPT +IEDSSEEEELREEEELLREQEKMREVEQQRIRSTARKTRRDKEELRAQRRRERSKTPPSNLSPIEDASP +TEELRQAAEMEELHRSSCSEYSPSPSLDSEAEALDGGPSRLYKSGSEYNLPTFMSLYSPTETPSGSSTT +PSSGRPLKSAEEAYEEMMRKAELLQRQQGQAAGARGPHGGPSQPTGPRGLGSFEYQDTTDREYGQAAQP +AAEGTPASLGAAVYEEILQTSQSIVRMRQASSRDLAFAEDKKKEKQFLNAESAYMDPMKQNGGPLTPGT +SPTQLAAPVSFSTPTSSDSSGGRVIPDVRVTQHFAKETQDPLKLHSSPASPSSASKEIGMPFSQGPGTP +ATTAVAPCPAGLPRGYMTPASPAGSERSPSPSSTAHSYGHSPTTANYGSQTEDLPQAPSGLAAAGRAAR +EKPLSASDGEGGTPQPSRAYSYFASSSPPLSPSSPSESPTFSPGKMGPRATAEFSTQTPSPAPASDMPR +SPGAPTPSPMVAQGTQTPHRPSTPRLVWQESSQEAPFMVITLASDASSQTRMVHASASTSPLCSPTETQ +PTTHGYSQTTPPSVSQLPPEPPGPPGFPRVPSAGADGPLALYGWGALPAENISLCRISSVPGTSRVEPG +PRTPGTAVVDLRTAVKPTPIILTDQGMDLTSLAVEARKYGLALDPIPGRQSTAVQPLVINLNAQEHTFL +ATATTVSITMASSVFMAQQKQPVVYGDPYQSRLDFGQGGGSPVCLAQVKQVEQAVQTAPYRSGPRGRPR +EAKFARYNLPNQVAPLARRDVLITQMGTAQSIGLKPGPVPEPGAEPHRATPAELRSHALPGARKPHTVV +VQMGEGTAGTVTTLLPEEPAGALDLTGMRPESQLACCDMVYKLPFGSSCTGTFHPAPSVPEKSMADAAP +PGQSSSPFYGPRDPEPPEPPTYRAQGVVGPGPHEEQRPYPQGLPGRLYSSMSDTNLAEAGLNYHAQRIG +QLFQGPGRDSAMDLSSLKHSYSLGFADGRYLGQGLQYGSVTDLRHPTDLLAHPLPMRRYSSVSNIYSDH +RYGPRGDAVGFQEASLAQYSATTAREISRMCAALNSMDQYGGRHGSGGGGPDLVQYQPQHGPGLSAPQS +LVPLRPGLLGNPTFPEGHPSPGNLAQYGPAAGQGTAVRQLLPSTATVRAADGMIYSTINTPIAATLPIT +TQPASVLRPMVRGGMYRPYASGGITAVPLTSLTRVPMIAPRVPLGPTGLYRYPAPSRFPIASSVPPAEG +PVYLGKPAAAKAPGAGGPSRPEMPVGAAREEPLPTTTPAAIKEAAGAPAPAPLAGQKPPADAAPGGGSG +ALSRPGFEKEEASQEERQRKQQEQLLQLERERVELEKLRQLRLQEELERERVELQRHREEEQLLVQREL +QELQTIKHHVLQQQQEERQAQFALQREQLAQQRLQLEQIQQLQQQLQQQLEEQKQRQKAPFPAACEAPG +RGPPLAAAELAQNGQYWPPLTHAAFIAMAGPEGLGQPREPVLHRGLPSSASDMSLQTEEQWEASRSGIK +KRHSMPRLRDACELESGTEPCVVRRIADSSVQTDDEDGESRYLLSRRRRARRSADCSVQTDDEDSAEWE +QPVRRRRSRLPRHSDSGSDSKHDATASSSSAAATVRAMSSVGIQTISDCSVQTEPDQLPRVSPAIHITA +ATDPKVEIVRYISAPEKTGRGESLACQTEPDGQAQGVAGPQLVGPTAISPYLPGIQIVTPGPLGRFEKK +KPDPLEIGYQAHLPPESLSQLVSRQPPKSPQVLYSPVSPLSPHRLLDTSFASSERLNKAHVSPQKHFTA +DSALRQQTLPRPMKTLQRSLSDPKPLSPTAEESAKERFSLYQHQGGLGSQVSALPPNSLVRKVKRTLPS +PPPEEAHLPLAGQASPQLYAASLLQRGLTGPTTVPATKASLLRELDRDLRLVEHESTKLRKKQAELDEE +EKEIDAKLKYLELGITQRKESLAKDRGGRDYPPLRGLGEHRDYLSDSELNQLRLQGCTTPAGQFVDFPA +TAAAPATPSGPTAFQQPRFQPPAPQYSAGSGGPTQNGFPAHQAPTYPGPSTYPAPAFPPGASYPAEPGL +PNQQAFRPTGHYAGQTPMPTTQSTLFPVPADSRAPLQKPRQTSLADLEQKVPTNYEVIASPVVPMSSAP +SETSYSGPAVSSGYEQGKVPEVPRAGDRGSVSQSPAPTYPSDSHYTSLEQNVPRNYVMIDDISELTKDS +TSTAPDSQRLEPLGPGSSGRPGKEPGEPGVLDGPTLPCCYARGEEESEEDSYDPRGKGGHLRSMESNGR +PASTHYYGDSDYRHGARVEKYGPGPMGPKHPSKSLAPAAISSKRSKHRKQGMEQKISKFSPIEEAKDVE +SDLASYPPPAVSSSLVSRGRKFQDEITYGLKKNVYEQQKYYGMSSRDAVEDDRIYGGSSRSRAPSAYSG +EKLSSHDFSGWGKGYEREREAVERLQKAGPKPSSLSMAHSRVRPPMRSQASEEESPVSPLGRPRPAGGP +LPPGGDTCPQFCSSHSMPDVQEHVKDGPRAHAYKREEGYILDDSHCVVSDSEAYHLGQEETDWFDKPRD +ARSDRFRHHGGHAVSSSSQKRGPARHSYHDYDEPPEEGLWPHDEGGPGRHASAKEHRHGDHGRHSGRHT +GEEPGRRAAKPHARDLGRHEARPHSQPSSAPAMPKKGQPGYPSSAEYSQPSRASSAYHHASDSKKGSRQ +AHSGPAALQSKAEPQAQPQLQGRQAAPGPQQSQSPSSRQIPSGAASRQPQTQQQQQGLGLQPPQQALTQ +ARLQQQSQPTTRGSAPAASQPAGKPQPGPSTATGPQPAGPPRAEQTNGSKGTAKAPQQGRAPQAQPAPG +PGPAGVKAGARPGGTPGAPAGQPGADGESVFSKILPGGAAEQAGKLTEAVSAFGKKFSSFW >Q9NSI6 | Homo sapiens (Human). | NCBI_TaxID=9606; | 2320 | Name=BR +WD1; Synonyms=C21orf107, WDR9; MAEPSSARRPVPLIESELYFLIARYLSAGPCRRAAQVLVQELEQYQLLPKRLDWEGNEHNRSYEELVLSN +KHVAPDHLLQICQRIGPMLDKEIPPSISRVTSLLGAGRQSLLRTAKDCRHTVWKGSAFAALHRGRPPEM +PVNYGSPPNLVEIHRGKQLTGCSTFSTAFPGTMYQHIKMHRRILGHLSAVYCVAFDRTGHRIFTGSDDC +LVKIWSTHNGRLLSTLRGHSAEISDMAVNYENTMIAAGSCDKIIRVWCLRTCAPVAVLQGHTGSITSLQ +FSPMAKGSQRYMVSTGADGTVCFWQWDLESLKFSPRPLKFTEKPRPGVQMLCSSFSVGGMFLATGSTDH +VIRMYFLGFEAPEKIAELESHTDKVDSIQFCNNGDRFLSGSRDGTARIWRFEQLEWRSILLDMATRISG +DLSSEEERFMKPKVTMIAWNQNDSIVVTAVNDHVLKVWNSYTGQLLHNLMGHADEVFVLETHPFDSRIM +LSAGHDGSIFIWDITKGTKMKHYFNMIEGQGHGAVFDCKFSQDGQHFACTDSHGHLLIFGFGCSKPYEK +IPDQMFFHTDYRPLIRDSNNYVLDEQTQQAPHLMPPPFLVDVDGNPHPTKYQRLVPGRENSADEHLIPQ +LGYVATSDGEVIEQIISLQTNDNDERSPESSILDGMIRQLQQQQDQRMGADQDTIPRGLSNGEETPRRG +FRRLSLDIQSPPNIGLRRSGQVEGVRQMHQNAPRSQIATERDLQAWKRRVVVPEVPLGIFRKLEDFRLE +KGEEERNLYIIGRKRKTLQLSHKSDSVVLVSQSRQRTCRRKYPNYGRRNRSWRELSSGNESSSSVRHET +SCDQSEGSGSSEEDEWRSDRKSESYSESSSDSSSRYSDWTADAGINLQPPLRTSCRRRITRFCSSSEDE +ISTENLSPPKRRRKRKKENKPKKENLRRMTPAELANMEHLYEFHPPVWITDTTLRKSPFVPQMGDEVIY +FRQGHEAYIEAVRRNNIYELNPNKEPWRKMDLRDQELVKIVGIRYEVGPPTLCCLKLAFIDPATGKLMD +KSFSIRYHDMPDVIDFLVLRQFYDEARQRNWQSCDRFRSIIDDAWWFGTVLSQEPYQPQYPDSHFQCYI +VRWDNTEIEKLSPWDMEPIPDNVDPPEELGASISVTTDELEKLLYKPQAGEWGQKSRDEECDRIISGID +QLLNLDIAAAFAGPVDLCTYPKYCTVVAYPTDLYTIRMRLVNRFYRRLSALVWEVRYIEHNARTFNEPE +SVIARSAKKITDQLLKFIKNQHCTNISELSNTSENDEQNAEDLDDSDLPKTSSGRRRVHDGKKSIRATN +YVESNWKKQCKELVNLIFQCEDSEPFRQPVDLVEYPDYRDIIDTPMDFGTVRETLDAGNYDSPLEFCKD +IRLIFSNAKAYTPNKRSKIYSMTLRLSALFEEKMKKISSDFKIGQKFNEKLRRSQRFKQRQNCKGDSQP +NKSIRNLKPKRLKSQTKIIPELVGSPTQSTSSRTAYLGTHKTSAGISSGVTSGDSSDSAESSERRKRNR +PITNGSTLSESEVEDSLATSLSSSASSSSEESKESSRARESSSRSGLSRSSNLRVTRTRAAQRKTGPVS +LANGCGRKATRKRVYLSDSDNNSLETGEILKARAGNNRKVLRKCAAVAANKIKLMSDVEENSSSESVCS +GRKLPHRNASAVARKKLLHNSEDEQSLKSEIEEEELKDENQPLPVSSSHTAQSNVDESENRDSESESDL +RVARKNWHANGYKSHTPAPSKTKFLKIESSEEDSKSHDSDHACNRTAGPSTSVQKLKAESISEEADSEP +GRSGGRKYNTFHKNASFFKKTKILSDSEDSESEEQDREDGKCHKMEMNPISGNLNCDPIAMSQCSSDHG +CETDLDSDDDKIEKPNNFMKDSASQDNGLSRKISRKRVCSSDSDSSLQVVKKSSKARTGLLRITRRCAA +TAANKIKLMSDVEDVSLENVHTRSKNGRKKPLHLACTTAKKKLSDCEGSVHCEVPSEQYACEGKPPDPD +SEGSTKVLSQALNGDSDSEDMLNSEHKHRHTNIHKIDAPSKRKSSSVTSSGEDSKSHIPGSETDRTFSS +ESTLAQKATAENNFEVELNYGLRRWNGRRLRTYGKAPFSKTKVIHDSQETAEKEVKRKRSHPELENVKI +SETTGNSKFRPDTSSKSSDLGSVTESDIDCTDNTKTKRRKTKGKAKVVRKEFVPRDREPNTKVRTCMHN +QKDAVQMPSETLKAKMVPEKVPRRCATVAANKIKIMSNLKETISGPENVWIRKSSRKLPHRNASAAAKK +KLLNVYKEDDTTINSESEKELEDINRKMLFLRGFRSWKENAQ >Q96KE9 | Homo sapiens (Human). | NCBI_TaxID=9606; | 485 | Name=BTB +D6; Synonyms=BDPL; MAAELYAPASAAAADLANSNAGAAVGRKAGPRSPPSAPAPAPPPPAPAPPTLGNNHQESPGWRCCRPTLR +ERNALMFNNELMADVHFVVGPPGATRTVPAHKYVLAVGSSVFYAMFYGDLAEVKSEIHIPDVEPAAFLI +LLKYMYSDEIDLEADTVLATLYAAKKYIVPALAKACVNFLETSLEAKNACVLLSQSRLFEEPELTQRCW +EVIDAQAEMALRSEGFCEIDRQTLEIIVTREALNTKEAVVFEAVLNWAEAECKRQGLPITPRNKRHVLG +RALYLVRIPTMTLEEFANGAAQSDILTLEETHSIFLWYTATNKPRLDFPLTKRKGLAPQRCHRFQSSAY +RSNQWRYRGRCDSIQFAVDRRVFIAGLGLYGSSSGKAEYSVKIELKRLGVVLAQNLTKFMSDGSSNTFP +VWFEHPVQVEQDTFYTASAVLDGSELSYFGQEGMTEVQCGKVAFQFQCSSDSTNGTGVQGGQIPELIFY +A >P0C7T9 | Homo sapiens (Human). | NCBI_TaxID=9606; | 278 | Name=BZW +1L1; MENSERNKLAMLTGVLLANGTLNASILNSLYNENLVKEGVSAAFAVKLFKSWINEKDINAVAASLRKVSM +DNRLMELFPANKQSVEHFTKYFTEAGLKELSEYVRNQQTIGARKELQKELQEQMSRGDPFKDIILYVKE +EMKKNNIPEPVVIGIVWSSVMSTVEWNKKEELVAEQAIKHLKQYSPLLAAFTTQGQSELTLLLKIQEYC +YDNIHFMKAFQKIVVLFYKAEVLSEEPILKWYKDAHVAKGKSVFLEQMKKFVEWLKNAEEESESEAEEG +D >Q8IYA2 | Homo sapiens (Human). | NCBI_TaxID=9606; | 1237 | Name=CC +DC144C; MVSWGGEKRGGAEGSPKPAVYATRKTGSVRSQEDQWYLGYPGDQWSSGFSYSWWKNSVGSESKHGEGALD +QPQHDVRLEDLGELHRAARSGDVPGVEHVLVPGDTGVDKRDRKKSIQQLVPEYKEKQTPESLPQNNNPD +WHPTNLTLSDETCQRSKNLKVDDKCPSVSPSMPENQSATKELGQMNLTEREKMDTGVKTSQEPEMAKDC +DREDIPIYPVLPHVQKSEEMRIEQGKLEWKNQLKLVINELKQRFGEIYEKYKIPACPEEEPLLDNSTRG +TDVKDIPFNLTNNIPGCEEEDASEISVSVVFETFPEQKEPSLKNIIHSYYHPYSGSQEHVCQSSSKLHL +HENKLDCDNDNKPGIGHIFSTDKNFHNDASTKKARNPEVVTVEMKEDQEFDLQMTKNMNQNSDSGSTNN +YKSLKPKLENLSSLPPDSDRTSEVYLHEELQQDMQKFKNEVNTLEEEFLALKKENVQLHKEVEEEMEKH +RSNSTELSGTLTDGTTVGNDDDGLNQQIPRKENGEHDRLALKQENEEKRNADMLYNKDSEQLRIKEEEC +GKVVETKQQLKWNLRRLVKELRTVVQERNDAQKQLSEEQDARILQDQILTSKQKELEMAQKKRNPEISH +RHQKEKDLFHENCMLQEEIALLRLEIDTIKNQNKQKEKKYFEDIEVVKEKNDNLQKIIKRNEETLTETI +LQYSGQLNNLTAENKMLNSELENGKENQERLEIEMESYRCRLAAAVHDCDQSQTARDLKLDFQRTRQEW +VRLHDKMKVDMSGLQAKNEILSEKLSNAESKINSLQIQLHNTRDALGRESLILERVQRDLSQTQCQKKE +TEQMYQSKLKKYIAKQESVEERLSQLQSENMLLRQQLDDVHKKANSQEKTISTIQDQFHSAAKNLQAES +EKQILSLQEKNKELMDEYNHLKERMDQCEKEKAGRKIDLTEAQETVPSRCLHLDAENEVLQLQQTLFSM +KAIQKQCETLQKNKKQLKQEVVNLKSYMERNMLERGEAEWHKLLIEERARKEIEEKLNEAILTLQKQAA +VSHEQLAQLREDNTTSIKTQMELTVIDLESEISRIKTSQADFNKTKLERYKELYLEEVKVRESLSNELS +RTNEMIAEVSTQLTVEKEQTRSRSLFTAYATRPVLESPCVGNLNDSEGLNRKHIPRKKRSALKDMESYL +LKMQQKLQNDLTAEVAGSSQTGLHRIPQCSSFSSSSLHLLLCSICQPFFLILQLLLNMNLDPI >A7MBM2 | Homo sapiens (Human). | NCBI_TaxID=9606; | 1401 | Name=DI +SP2; Synonyms=DISPB, KIAA1742; MDGDSSSSSGGSGPAPGPGPEGEQRPEGEPLAPDGGSPDSTQTKAVPPEASPERSCSLHSCPLEDPSSSS +GPPPTTSTLQPVGPSSPLAPAHFTYPRALQEYQGGSSLPGLGDRAALCSHGSSLSPSPAPSQRDGTWKP +PAVQHHVVSVRQERAFQMPKSYSQLIAEWPVAVLMLCLAVIFLCTLAGLLGARLPDFSKPLLGFEPRDT +DIGSKLVVWRALQALTGPRKLLFLSPDLELNSSSSHNTLRPAPRGSAQESAVRPRRMVEPLEDRRQENF +FCGPPEKSYAKLVFMSTSSGSLWNLHAIHSMCRMEQDQIRSHTSFGALCQRTAANQCCPSWSLGNYLAV +LSNRSSCLDTTQADAARTLALLRTCALYYHSGALVPSCLGPGQNKSPRCAQVPTKCSQSSAIYQLLHFL +LDRDFLSPQTTDYQVPSLKYSLLFLPTPKGASLMDIYLDRLATPWGLADNYTSVTGMDLGLKQELLRHF +LVQDTVYPLLALVAIFFGMALYLRSLFLTLMVLLGVLGSLLVAFFLYQVAFRMAYFPFVNLAALLLLSS +VCANHTLIFFDLWRLSKSQLPSGGLAQRVGRTMHHFGYLLLVSGLTTSAAFYASYLSRLPAVRCLALFM +GTAVLVHLALTLVWLPASAVLHERYLARGCARRARGRWEGSAPRRLLLALHRRLRGLRRAAAGTSRLLF +QRLLPCGVIKFRYIWICWFAALAAGGAYIAGVSPRLRLPTLPPPGGQVFRPSHPFERFDAEYRQLFLFE +QLPQGEGGHMPVVLVWGVLPVDTGDPLDPRSNSSLVRDPAFSASGPEAQRWLLALCHRARNQSFFDTLQ +EGWPTLCFVETLQRWMESPSCARLGPDLCCGHSDFPWAPQFFLHCLKMMALEQGPDGTQDLGLRFDAHG +SLAALVLQFQTNFRNSPDYNQTQLFYNEVSHWLAAELGMAPPGLRRGWFTSRLELYSLQHSLSTEPAVV +LGLALALAFATLLLGTWNVPLSLFSVAAVAGTVLLTVGLLVLLEWQLNTAEALFLSASVGLSVDFTVNY +CISYHLCPHPDRLSRVAFSLRQTSCATAVGAAALFAAGVLMLPATVLLYRKLGIILMMVKCVSCGFASF +FFQSLCCFFGPEKNCGQILWPCAHLPWDAGTGDPGGEKAGRPRPGSVGGMPGSCSEQYELQPLARRRSP +SFDTSTATSKLSHRPSVLSEDLQLHDGPCCSRPPPAPASPRELLLDHQAVFSQCPALQTSSPYKQAGPS +PKTRARQDSQGEEAEPLPASPEAPAHSPKAKAADPPDGFCSSASTLEGLSVSDETCLSTSEPSARVPDS +VGVSPDDLDDTGQPVLERGQLNGKRDTLWLALRETVYDPSLPASHHSSLSWKGRGGPGDGSPVVLPNSQ +PDLPDVWLRRPSTHTSGYSS >Q96HU8 | Homo sapiens (Human). | NCBI_TaxID=9606; | 199 | Name=DIR +AS2; MPEQSNDYRVAVFGAGGVGKSSLVLRFVKGTFRESYIPTVEDTYRQVISCDKSICTLQITDTTGSHQFPA +MQRLSISKGHAFILVYSITSRQSLEELKPIYEQICEIKGDVESIPIMLVGNKCDESPSREVQSSEAEAL +ARTWKCAFMETSAKLNHNVKELFQELLNLEKRRTVSLQIDGKKSKQQKRKEKLKGKCVIM >Q8N4W6 | Homo sapiens (Human). | NCBI_TaxID=9606; | 341 | Name=DNA +JC22; MAKGLLVTYALWAVGGPAGLHHLYLGRDSHALLWMLTLGGGGLGWLWEFWKLPSFVAQANRAQGQRQSPR +GVTPPLSPIRFAAQVIVGIYFGLVALISLSSMVNFYIVALPLAVGLGVLLVAAVGNQTSDFKNTLGSAF +LTSPIFYGRPIAILPISVAASITAQRHRRYKALVASEPLSVRLYRLGLAYLAFTGPLAYSALCNTAATL +SYVAETFGSFLNWFSFFPLLGRLMEFVLLLPYRIWRLLMGETGFNSSCFQEWAKLYEFVHSFQDEKRQL +AYQVLGLSEGATNEEIHRSYQELVKVWHPDHNLDQTEEAQRHFLEIQAAYEVLSQPRKPWGSRR

outputs:

>P30450 | Homo sapiens (Human). | NCBI_TaxID=9606; | 365 | Name=HLA +-A; Synonyms=HLAA; contains AVVAAVMW at position 324 >A7MBM2 | Homo sapiens (Human). | NCBI_TaxID=9606; | 1401 | Name=DI +SP2; Synonyms=DISPB, KIAA1742; contains VAVLMLCLAVIFLC at position 169 contains LLALVAIFF at position 492 contains IWICWFAALAA at position 704 contains LALALAFA at position 969

Note you'd omitted 'I' from your original character set.

Replies are listed 'Best First'.
Re^6: need help with a regex
by aquinom (Sexton) on Oct 22, 2010 at 22:50 UTC
    Final code, FYI: Thanks again
    #!/usr/bin/perl use strict; use warnings; my $infile = $ARGV[0]; unless (open (INFILE, "$infile")){ print STDERR "Can't open $infile $!\n"; die; } my ($header, $count , $match) = ('',0,0); print "\n\n"; #This algorithm will work only when the sequence is on one line while (<INFILE>) { chomp; if ($_ =~ />(.*)/) { #a header line $header = $1; $count++; #keep running total of seque +nce number } else { #not a header my $i = 0; while ( $_ =~ /([VILMFWCA]{8,})/g) { my $domain = $1; my $len = length $doma +in; $len--; if ($i == 0){ print "Hydroph +obic strecth found in: $header\n"; $match++; $i++; } print "$domain\n"; print "The match was a +t potistion: ", pos() - $len, "\n";; } if ($i > 0){ print "\n\n"; } } } close INFILE; print "Hydrophobic region(s) found in $match sequences out of $count s +equences\n";