## Read the list of gene ids to remove and index them in a hash.
my $list_file = '/g/Viruses/prophage_data/emptySeqList_aa.txt';
## Prefer single quotes when you don't need string interpolation,
## e.g., no variables or "\n".
## Store the filehandle in a lexical variable, and always check open:
## if it failed unnoticed, the id hash would stay empty and no line
## would ever be filtered out.
open (my $fh, '<', $list_file)
    or die "Cannot open gene id list '$list_file': $!";
my @lines = <$fh>; # reads entire file in one go
## Slurping is technically bad form,
## but assuming your file isn't too big, it's fine.
close ($fh);
my $text = join ('', @lines); # combines all lines into one string
## Here's where your file format will change the code.
## Assumption: nothing is in the file but gene ids, and each id
## consists of letters, numbers, and underscores (i.e. \w+).
my $geneids_to_remove = {}; # hash reference: gene id => 1
## A /g match in list context returns every captured id, so no
## embedded regex code is needed and $text is left unmodified.
$geneids_to_remove->{$_} = 1 for $text =~ /(\w+)/g;
####
#### Per line ####
## Stream the .ptt file one line at a time and keep every line whose
## leading gene id is not a key of $geneids_to_remove.
my $ptt_file = '/g/Viruses/prophage_data/prophage_region.ptt1';
## Die on failure: after a failed open the loop below would simply
## read nothing and $saved_lines would stay empty.
open ($fh, '<', $ptt_file)
    or die "Cannot open '$ptt_file': $!";
## precompile a regex to capture the geneid on each line
## I assume the gene id is the first thing on each line
my $rx_find_geneid = qr/^(\w+)/;
## I prefer to avoid $_ for clarity
my $saved_lines = '';
while (my $line = <$fh>)
{
    ## Capture the id from this line's own match. In list context a
    ## failed match returns the empty list, leaving $gene_id undef,
    ## so an id from an earlier line can never leak into this one.
    my ($gene_id) = $line =~ $rx_find_geneid;
    ## Keep the line when it has no leading id (nothing to look up)
    ## or when its id is not marked for removal.
    if (! defined $gene_id || ! exists $geneids_to_remove->{$gene_id})
    {
        $saved_lines .= $line;
    }
}
close ($fh);
####
#### One big regex ####
## Alternative to the per-line section: use one or the other, not both.
## The variables are declared again with "my" on purpose, so that this
## section still works if the per-line section is deleted.
## read in file
my $ptt_file = '/g/Viruses/prophage_data/prophage_region.ptt1';
## Die on failure: after a failed open $text would be empty and
## nothing would be saved.
open ($fh, '<', $ptt_file)
    or die "Cannot open '$ptt_file': $!";
@lines = <$fh>;
close ($fh);
$text = join ('', @lines);
## you don't need to precompile this -- it's for clarity
## and in case you ever want to remove these from multiple
## files, i.e., put it in a loop
## Again, I assume the geneid is at the front of the line.
my $saved_lines = '';
## $1 is the whole line including its terminator, $2 the leading id.
##  - \w* (not \w+) lets lines without a leading id match with an
##    empty $2; the empty string is never a key of the id hash, so
##    such lines are kept rather than silently dropped.
##  - .* (not .+) accepts a line that holds nothing but the id.
##  - (?: \n | \z ) also accepts a last line with no trailing newline.
## The code block is the final element of the pattern, so it runs
## exactly once per matched line.
my $rx_rm_lines = qr/
    ( ^ (\w*) .* (?: \n | \z ) )
    (?{
        if (! exists $geneids_to_remove->{$2})
        {
            $saved_lines .= $1;
        }
    })
/xm; # the 'm' modifier enables multiline regex
## run the regex (you can use s/.../$1/g if you
## don't want to destroy the string as you search)
$text =~ s/$rx_rm_lines//g;
####
## write out saved data
## NOTE(review): $outfile is not assigned anywhere in the visible part
## of this script -- set it to the destination path before this point.
defined $outfile
    or die "No output file given: set \$outfile before writing\n";
open ($fh, '>', $outfile)
    or die "Cannot open '$outfile' for writing: $!";
print {$fh} $saved_lines
    or die "Cannot write to '$outfile': $!";
## Buffered write errors (e.g. a full disk) only surface at close,
## so its return value is checked as well.
close ($fh)
    or die "Cannot close '$outfile': $!";