in reply to Efficient way to handle huge number of records?

The following code will generate the database using BerkeleyDB. I generated the keys, you would read them in from you file.

#!/usr/local/bin/perl -w our $Debug = 0; our $SortLog = 0; use strict; + use BerkeleyDB; our %Session = (); use Time::HiRes qw( gettimeofday ); use Fcntl qw( :flock ); my $DBHome = "/home/FlexBase/"; if ( substr($DBHome,-1) ne "/" ) { $DBHome .= "/"; } if ( ! -d $DBHome ) { mkdir $DBHome or die "DBHome directory not correct!"; } if ( ! ( ( -r _ )&&( -w _ ) ) ) { die "DBHome directory is not valid!" +; } system("rm /home/FlexBase/*"); ## Temporary for testing our $filename = "/home/FlexBase/TestBDB"; use constant CACHE => '1048576'; my $Cachesize = CACHE; our $env = new BerkeleyDB::Env ( -Home => '/home/FlexBase', -Cachesize => CACHE, -Flags => DB_INIT_MPOOL|DB_INIT_CDB|DB_CREATE ) or die "cannot open ENV: $! $BerkeleyDB::Error\n"; our %Pyr = (); our $BLK = 512; our $db1 = tie %Pyr, 'BerkeleyDB::Btree', ( -Filename => "$filename", -Env => $env, # -Pagesize => 4096, ## Use of this makes the Ber +keleyDB operate worst! -Flags => DB_CREATE ); # create some data our $Chr="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123 +456789|"; our $size = length($Chr)-1; our $Rno = 0; our $LockFile = "../tmp/BDBmail_lock"; system("touch $LockFile"); open(LOCK,"+<",$LockFile) || die "Cannot open $LockFile:$!"; # Associate it with the database my $enable = $env->cds_enabled(); if ( $enable ) { print "cds_enabled\n"; } my $ret = 0; my $cnt = 0; my $Tot = 0; my $howmany = 11_000_000; my $ky = ""; my $data = "a" x 80; my $ap +pkey = "-123456789"; # $howmany = 2_000; ## Testing our ( $WR, $RD ); if ( $SortLog ) { my $logFile = "$filename"."_Write"; open($WR,">",$logFile) || die "Cannot open $logFile:$!"; $logFile = "$filename"."_Read"; open($RD,">",$logFile) || die "Cannot open $logFile:$!"; } our $NAME = "BerkeleyDB"; our $kb = "_KB"; our ($pmem1,$pmem2) = &Display_Mem_Usage($$,$NAME,0); print "## Start: VSZ-$pmem1$kb RSS-$pmem2$kb\tBLOCK: $BLK ( $howma +ny )\n"; my ( $cursor, $lock, $wpm, $key, $count, $k ); my $stime = gettimeofday; while ( $cnt < $howmany ) { # my $st = $cnt % 58; # my $chr = chr( 122-$st ); $key = "A-$cnt"; $cnt++; $Tot += length($key)+length($data); flock( LOCK, LOCK_EX ); $lock = $db1->db_cursor(DB_WRITECURSOR); $ret = $db1->db_put($key, $data); undef $lock; flock( LOCK, LOCK_UN ); if ($ret != 0 ) { die "Write failed on BDB3: $ret\n"; } if ( $SortLog ) { print $WR "$key$data\n"; } } flock( LOCK, LOCK_EX ); $lock = $db1->db_cursor(DB_WRITECURSOR); my $strt = chr(1); $data = ""; $ret = $db1->db_put($strt, $data); my $dbend = chr(255); $data = ""; $ret = $db1->db_put($dbend, $dat +a); undef $lock; flock( LOCK, LOCK_UN ); $db1->db_sync(); undef $cursor; my $etime = gettimeofday - $stime; $wpm = int($howmany/$etime); print "Write: $etime $wpm\/sec $cnt\n"; if ( $SortLog ) { close $WR; system("sort -o /home/FlexBase/TestBDB_Write.sort /home/FlexBa +se/TestBDB_Write"); } $stime = gettimeofday; $count = 0; $key = " "; $data = ""; $ret = 0; while ($ret == 0) { flock( LOCK, LOCK_SH ); $cursor = $db1->db_cursor(); $ret = $cursor->c_get($key, $data, DB_SET); $ret = $cursor->c_get($key, $data, DB_NEXT); flock( LOCK, LOCK_UN ); undef $cursor; if ( ord($key) == 1 ) { next; } if ( ord($key) == 255 ) { last; } $0 = "SLock 3 on $key"; $count++; ## if ( $count > 28 ) { last; } if ( $SortLog ) { print $RD "$key$data\n"; } ## print "$key\t$data\t$count\n"; } $db1->db_sync(); $etime = gettimeofday - $stime; $wpm = int($count/$etime); print "ReadNext: $etime $wpm\/sec Total: $count\n"; if ( $SortLog ) { close $RD; system("/usr/bin/sort -o /home/FlexBase/TestBDB_Read.sort /hom +e/FlexBase/TestBDB_Read"); } my ($mem1,$mem2) = &Display_Mem_Usage($$,$NAME,0); my $diff1 = $mem1 - $pmem1; my $diff2 = $mem2 - $pmem2; print "## End: VSZ-$mem1$kb RSS-$mem2$kb Diff:$diff1|$diff2$kb +\tBLOCK: $BLK\n\n"; exit; sub Display_Mem_Usage { # VSZ is size in KBytes of the virtual memory ( VSZ * 1024 ) # RSS is size in pages of real memory ( 1024 * RSS ) my $cpid = shift; my $name = shift; my $from = shift; my $var = ""; my $fh; my $tm = ""; if ( $Debug >= 2 ) { $tm = scalar localtime() . " "; } if ( ! ( kill 0 => $cpid ) ) ## Check that pid is active { return ( -1, -1 ); } my $arg = qq| -o "vsz rssize" -p $cpid|; open ( $fh, "-|", "/bin/ps $arg" ) or die "Prefork: Not open \'$ar +g\': $!"; while (<$fh>) { $var .= $_; } close $fh; my $rno = my @ref = split(/\n/,$var); if ( $rno < 2 ) { return ( -1, -1 ); } my $info = join(" ", split " ", $ref[1]); my ($vmem,$rmem) = ( split(/\ /,$info) ); return ( $vmem , $rmem ); } 1;

And the output would be.

# perl Show11M.plx cds_enabled ## Start: VSZ-10292_KB RSS-4808_KB BLOCK: 512 ( 11000000 ) Write: 302.145446062088 36406/sec 11000000 ReadNext: 340.443499088287 32310/sec Total: 11000000 ## End: VSZ-10292_KB RSS-6264_KB Diff:0|1456_KB BLOCK: 4096

And the directory where the database resides would contain:

-rw-r--r-- 1 24576 Dec 11 08:37 /home/FlexBase/__db.001 -rw-r--r-- 1 245760 Dec 11 08:37 /home/FlexBase/__db.002 -rw-r--r-- 1 1318912 Dec 11 08:37 /home/FlexBase/__db.003 -rw-r--r-- 1 548864 Dec 11 08:37 /home/FlexBase/__db.004 -rw-r--r-- 1 2100948992 Dec 11 08:26 /home/FlexBase/TestBDB -rw-r--r-- 1 989888890 Dec 11 08:34 /home/FlexBase/TestBDB_Read -rw-r--r-- 1 989888890 Dec 11 08:37 /home/FlexBase/TestBDB_Read.sort -rw-r--r-- 1 989888890 Dec 11 08:26 /home/FlexBase/TestBDB_Write -rw-r--r-- 1 989888890 Dec 11 08:29 /home/FlexBase/TestBDB_Write.sor +t

And to test that all the data was loaded:

# md5sum /home/FlexBase/TestBDB_Read /home/FlexBase/TestBDB_Write cd6ae6c532bc918f4ccdda9cc1d3f671 /home/FlexBase/TestBDB_Read.sort cd6ae6c532bc918f4ccdda9cc1d3f671 /home/FlexBase/TestBDB_Write.sor +t

'diff' runs out of memory with files this large. But 'md5sum' has no problem. The extra step to verify the database is loaded is optional, but it is nice to know you have all of the data. I test different databases all the time, so there is a lot of extra code that you can delete. I did more deleting then adding to get what you needed.

Good Luck

"Well done is better than well said." - Benjamin Franklin