Re^2: Reducing memory footprint when doing a lookup of millions of coordinates

Postgres may not be portable than SQLite. Postgres has geometric data type and functions, and if you take begin and end for a box of no height on the x axis, Postgres's box operator will work. For exmaple, Begin=100 and End=200 record is coordinate of ((100,0),(200,0)).

http://www.postgresql.org/docs/8.2/static/functions-geometry.html

Your sample data of 5301598 records took me an hour to load into default installation of Postgres, but it gives me very quick response.

###SQL###

drop table chromosomes;
create table chromosomes(
        score   integer
        ,div            numeric(6,1)
        ,del            numeric(6,1)
        ,ins            numeric(6,1)
        ,q_sequence     text
        --,begin1               integer
        --,end1         integer
        ,pos    box
        ,left_paren     text
        ,repeat1                text
        ,repeat2                text
        ,class_family   text
        ,begin2         text
        ,end2           text
        ,left_paren2   text
        ,ID             integer
);

create index idx_chromosomes_1 on chromosomes(q_sequence);
create index idx_chromosomes_2 on chromosomes using gist(pos);
[download]

###PERL###

use strict;
use warnings;
use DBI;

my($dbh);

system('date');
$dbh = DBI->connect('dbi:Pg:dbname=test1','','',{AutoCommit=>0})
                        or die $dbh->errstr;

#populate($dbh);
select_test($dbh);

$dbh->disconnect;

system('date');

exit 0;

sub populate{

        my($dbh)=@_;
        my($sth,$sql,$cnt);
        $cnt=0;

        $sql="insert into chromosomes values (?,?,?,?,?,?,?,?,?,?,?,?,
+?,?)";
        $sth=$dbh->prepare($sql) or die $dbh->errstr;
        eval{
                open my $fh , "<", "hg19.fa.out" or die $!;
                while(my $line=<$fh>){
                        $cnt++;
                        next if $cnt <= 3;

                        chomp $line;
                        $line =~ s/^\s+//;
                        my @vals=split(/\s+/, $line);
                        #begin and end to box type
                        $vals[5]="(($vals[5],0),($vals[6],0))";
                        splice(@vals,6,1);

                        $sth->execute(@vals);
                }
                close $fh;
                $sth->finish;
        };
        if($@){
                $dbh->rollback;
        }else {
                $dbh->commit;
        }
}
sub select_test{
        my($dbh)=@_;
        my($sth,$sql);

        #SQL for overlaps
        #$sql="select * from chromosomes where pos && box '((10000,0),
+(20000,0))'";
        #SQL for include
        $sql="select * from chromosomes where pos @ box '((10000,0),(2
+0000,0))'";

        $sth=$dbh->prepare($sql) or die $dbh->errstr;
        $sth->execute  or die $dbh->errstr;
        while(my $href= $sth->fetchrow_hashref){
                print $href->{pos} . "\n";
        }
        $sth->finish;
}
[download]

Comment on Re^2: Reducing memory footprint when doing a lookup of millions of coordinates Select or Download Code

Replies are listed 'Best First'.
Re^3: Reducing memory footprint when doing a lookup of millions of coordinates by tospo (Hermit) on Mar 01, 2011 at 13:00 UTC
Interesting, I wasn't aware of these (I'm not using PostgreSQL very much).	[reply]