#!/usr/bin/perl use warnings; use strict; use diagnostics; open( INFILE, "<", 'myosin.fasta') or die $!; #open original myosin.fasta for reading open( OUTFILE, ">", 'modifiedHeaders.fasta') or die $!; #open/create new fasta for writing my %headerHash; #create hash for the header my %seq; my %header; #loop through old file while (){ chomp; my $line = $_; # use if/then to seperate headers from seq, while regex to select only species name in header if ($line =~ /^>/){ my $header = $line; if ($header =~ /(>gi.*)\[(.+)\](>gi.*)\[(.+)\]/){ my $headerHash; $headerHash{$1} = $2; # return $headerHash; } else{ my $seq = $line; # return $seq ; print OUTFILE $headerHash, "\n", $seq, "\n"; } # if ($header =~ /(>gi.*)\[(.+)\](>gi.*)\[(.+)\]/){ # my $headerHash = $2;} #print $headerHash # print OUTFILE $headerHash{$2}, "\n", $seq, "\n"; } } #print OUTFILE $headerHash, "\n", $seq, "\n"; #### >gi|115527082|ref|NP_005954.3| myosin-1 [Homo sapiens] >gi|226694176|sp|P12882.3|MYH1_HUMAN RecName: Full=Myosin-1; AltName: Full=Myosin heavy chain 1; AltName: Full=Myosin heavy chain 2x; Short=MyHC-2x; AltName: Full=Myosin heavy chain IIx/d; Short=MyHC-IIx/d; AltName: Full=Myosin heavy chain, skeletal muscle, adult 1 [Homo sapiens] >gi|119610411|gb|EAW90005.1| hCG1986604, isoform CRA_b MSSDSEMAIFGEAAPFLRKSERERIEAQNKPFDAKTSVFVVDPKESFVKATVQSREGGKVTAKTEAGATVTVKDDQVFPM NPPKYDKIEDMAMMTHLHEPAVLYNLKERYAAWMIYTYSGLFCVTVNPYKWLPVYNAEVVTAYRGKKRQEAPPHIFSISD NAYQFMLTDRENQSILITGESGAGKTVNTKRVIQYFATIAVTGEKKKEEVTSGKMQGTLEDQIISANPLLEAFGNAKTVR NDNSSRFGKFIRIHFGTTGKLASADIETYLLEKSRVTFQLKAERSYHIFYQIMSNKKPDLIEMLLITTNPYDYAFVSQGE ITVPSIDDQEELMATDSAIEILGFTSDERVSIYKLTGAVMHYGNMKFKQKQREEQAEPDGTEVADKAAYLQNLNSADLLK ALCYPRVKVGNEYVTKGQTVQQVYNAVGALAKAVYDKMFLWMVTRINQQLDTKQPRQYFIGVLDIAGFEIFDFNSLEQLC INFTNEKLQQFFNHHMFVLEQEEYKKEGIEWTFIDFGMDLAACIELIEKPMGIFSILEEECMFPKATDTSFKNKLYEQHL GKSNNFQKPKPAKGKPEAHFSLIHYAGTVDYNIAGWLDKNKDPLNETVVGLYQKSAMKTLALLFVGATGAEAEAGGGKKG GKKKGSSFQTVSALFRENLNKLMTNLRSTHPHFVRCIIPNETKTPGAMEHELVLHQLRCNGVLEGIRICRKGFPSRILYA #### Homo sapiens MSSDSEMAIFGEAAPFLRKSERERIEAQNKPFDAKTSVFVVDPKESFVKATVQSREGGKVTAKTEAGATVTVKDDQVFPM NPPKYDKIEDMAMMTHLHEPAVLYNLKERYAAWMIYTYSGLFCVTVNPYKWLPVYNAEVVTAYRGKKRQEAPPHIFSISD NAYQFMLTDRENQSILITGESGAGKTVNTKRVIQYFATIAVTGEKKKEEVTSGKMQGTLEDQIISANPLLEAFGNAKTVR NDNSSRFGKFIRIHFGTTGKLASADIETYLLEKSRVTFQLKAERSYHIFYQIMSNKKPDLIEMLLITTNPYDYAFVSQGE ITVPSIDDQEELMATDSAIEILGFTSDERVSIYKLTGAVMHYGNMKFKQKQREEQAEPDGTEVADKAAYLQNLNSADLLK ALCYPRVKVGNEYVTKGQTVQQVYNAVGALAKAVYDKMFLWMVTRINQQLDTKQPRQYFIGVLDIAGFEIFDFNSLEQLC INFTNEKLQQFFNHHMFVLEQEEYKKEGIEWTFIDFGMDLAACIELIEKPMGIFSILEEECMFPKATDTSFKNKLYEQHL GKSNNFQKPKPAKGKPEAHFSLIHYAGTVDYNIAGWLDKNKDPLNETVVGLYQKSAMKTLALLFVGATGAEAEAGGGKKG GKKKGSSFQTVSALFRENLNKLMTNLRSTHPHFVRCIIPNETKTPGAMEHELVLHQLRCNGVLEGIRICRKGFPSRILYA