#!/usr/bin/env perl
#
# search a large file for lines containing a regex
#
# Each "name:regex" entry in the __DATA__ section is compiled once.  Every
# line of the input file is then tested against every regex, and a matching
# line is copied to that regex's output file FILESRCH.<n>, where <n> is the
# 1-based position of the entry in __DATA__.  A progress line is printed
# every 100000 input lines, and the per-name match counts are dumped at the
# end.
#
# Usage: perl large_file_regex_search.pl [input_file]
#        input_file defaults to "a_big_file".
#
use strict;
use warnings;
use Data::Dump 'pp';

# Optional first argument overrides the historical hard-coded input name.
my $input_file = @ARGV ? $ARGV[0] : 'a_big_file';

# --- Load the search specs ------------------------------------------------
# Each element of @rexlist is [ compiled regex, name, output handle, path ].
my @rexlist;
my $cnt = 0;
while (my $spec = <DATA>) {
    # Skip blank lines and '#' comment lines.
    next if $spec =~ /^\s*(?:$|#)/;

    # Strip the newline and any other trailing whitespace.
    $spec =~ s/\s+$//;

    # Limit the split to two fields so a regex may itself contain ':'.
    my ($name, $rex) = split /:/, $spec, 2;

    # An entry with no regex would compile to an empty pattern and match
    # every input line, so refuse it loudly instead.
    die "Malformed search spec at DATA line $.: '$spec' (expected name:regex)\n"
        unless defined $rex && length $rex;

    my $regex = qr/$rex/;

    ++$cnt;
    my $out_path = "FILESRCH.$cnt";
    open my $FH, '>', $out_path
        or die "Can't write '$out_path': $!";

    push @rexlist, [ $regex, $name, $FH, $out_path ];
}

# --- Scan the input file --------------------------------------------------
open my $IFH, '<', $input_file
    or die "Can't read '$input_file': $!";

my %cnts;               # match count per spec name
my $lines = 0;          # input lines read so far
my $start = time;

while (my $line = <$IFH>) {
    ++$lines;

    # Progress report: lines read and elapsed wall-clock seconds.
    if ($lines % 100000 == 0) {
        my $secs = time - $start;
        print "$lines: $secs s\n";
    }

    # A line is written to the file of every regex it matches.
    for my $r (@rexlist) {
        my ($rex, $name, $OFH, $out_path) = @$r;
        if ($line =~ $rex) {
            print {$OFH} $line
                or die "Write to '$out_path' failed: $!";
            ++$cnts{$name};
        }
    }
}

close $IFH
    or die "Error closing '$input_file': $!";

# Buffered write errors only surface at close time, so check each handle.
for my $r (@rexlist) {
    my (undef, undef, $OFH, $out_path) = @$r;
    close $OFH
        or die "Error closing '$out_path': $!";
}

print pp(\%cnts);

__DATA__
# Search specs, one per line, as name:regex.  Blank lines and lines starting
# with '#' are ignored by the loader.  Everything after the first ':' is
# used verbatim as the regex (only trailing whitespace is stripped), so the
# quote characters and any leading space below are part of the pattern.
#
# NOTE(review): newRec produced no matches in the sample run recorded below
# (FILESRCH.5 is empty); '20?[4-9]' may be a typo for '200[4-9]' -- confirm.
aNumber:'\d+'
CorporateRecord:'CORPORATE'
null:NULL
oldRec:'200[0-3]-\d\d-\d\d
newRec:'20?[4-9]-\d\d-\d\d
newRec2: '201\d-\d\d-\d\d

####
# Sample run (kept as '#' comment lines so the loader skips it):
#
# $ time perl large_file_regex_search.pl
# 100000: 1 s
# 200000: 2 s
# 300000: 3 s
# 400000: 4 s
# 500000: 5 s
# 600000: 5 s
# 700000: 6 s
# 800000: 7 s
# 900000: 8 s
# 1000000: 10 s
# 1100000: 15 s
# 1200000: 18 s
# 1300000: 20 s
# 1400000: 23 s
# 1500000: 25 s
# 1600000: 29 s
# 1700000: 35 s
# 1800000: 42 s
# 1900000: 47 s
# 2000000: 53 s
# 2100000: 60 s
# 2200000: 66 s
# 2300000: 71 s
# 2400000: 75 s
# 2500000: 81 s
# 2600000: 87 s
# 2700000: 92 s
# 2800000: 98 s
# 2900000: 103 s
# 3000000: 107 s
# 3100000: 113 s
# 3200000: 119 s
# 3300000: 124 s
# 3400000: 129 s
# 3500000: 135 s
# 3600000: 142 s
# 3700000: 151 s
# 3800000: 158 s
# 3900000: 166 s
# 4000000: 173 s
# 4100000: 181 s
# {
#   aNumber => 4140847,
#   CorporateRecord => 149943,
#   newRec2 => 783275,
#   null => 4140847,
#   oldRec => 987898,
# }
#
# real 3m5.660s
# user 1m6.390s
# sys 0m16.875s
# $
# $ ls -al FI*
# -rw-r--r-- 1 Roboticus None 1261 May 30 12:12 FILES.ddl.sql
# -rw-r--r-- 1 Roboticus None 3248770142 Jul 21 08:47 FILESRCH.1
# -rw-r--r-- 1 Roboticus None 116430098 Jul 21 08:47 FILESRCH.2
# -rw-r--r-- 1 Roboticus None 3248770142 Jul 21 08:47 FILESRCH.3
# -rw-r--r-- 1 Roboticus None 769188466 Jul 21 08:47 FILESRCH.4
# -rw-r--r-- 1 Roboticus None 0 Jul 21 08:44 FILESRCH.5
# -rw-r--r-- 1 Roboticus None 613214364 Jul 21 08:47 FILESRCH.6