# keeps a list fed in object

sub get_next_indexpending {
    my $self = shift;

    unless ( defined $self->{pending_queue} and scalar @{ $self->{pending_queue} } ) {

        # Refilling is expensive, so we fetch up to $LIMIT rows at a time and
        # cache them in the object; callers just keep asking for the next one.
        # Querying the db per-file would be excruciatingly slow.  Several
        # approaches were tried; this one works and has been debugged a lot,
        # so change it with care.
        #
        # Multiple indexers *can* receive the same list - that is ok, because
        # only one of them will obtain the lock afterwards.
        #
        # NOTE(review): possible race if more indexers are running than rows
        # fetched (e.g. 51 indexers, LIMIT 50).  A formula such as
        # (number of running indexers * N) = LIMIT may be warranted - TODO confirm.
        my $LIMIT = 50;

        debug("pending queue is empty.. ");

        # make sure it's defined
        $self->{pending_queue} = [];

        if ( defined $self->{gpd_stopflag} and $self->{gpd_stopflag} ) {
            debug("stopflag was raised, no more in pending. Will prepare and execute..");
            return;    # bare return: undef in scalar context, empty list in list context
        }

        debug("will refeed next $LIMIT");

        # $LIMIT is interpolated directly into the SQL because we could not
        # find a way to pass LIMIT/OFFSET as a bind placeholder to a prepared
        # query through this handle.
        # NOTE(review): abs_path is selected but not aggregated under
        # GROUP BY md5sum - relies on loose GROUP BY semantics (SQLite/MySQL);
        # verify against the target database.
        my $gpd = $self->dbh_sth(
            'SELECT abs_path, md5sum FROM files WHERE NOT EXISTS'.
            '(SELECT id FROM md5sum WHERE md5sum.md5sum = files.md5sum LIMIT 1)'.
            "GROUP BY md5sum LIMIT $LIMIT"
        );

        # Fetching the first $LIMIT rows (with no incrementing offset) is
        # always valid: once another indexer commits a file into md5sum, that
        # row no longer matches this query, so subsequent refills skip it.
        # An offset-based scheme was tried and did not work.
        $gpd->execute;

        debug("ok.\nWill iterate through results..");
        while ( my @row = $gpd->fetchrow_array ) {    # WAS USING for!!!
            push @{ $self->{pending_queue} }, \@row;
        }
        debug( sprintf "got [%s]\n", scalar @{ $self->{pending_queue} } );

        # If we received fewer rows than requested, the db has no more
        # pending work - raise the stop flag so we do not keep issuing
        # refill queries that return nothing.
        # (Bug fix: this comparison and its message hardcoded 50; they now
        # use $LIMIT so the fetch size and the exhaustion check cannot drift.)
        if ( scalar @{ $self->{pending_queue} } < $LIMIT ) {
            $self->{gpd_stopflag} = 1;
            debug( sprintf "got less than $LIMIT (got %s), turning on stop flag\n",
                scalar @{ $self->{pending_queue} } );
        }

        scalar @{ $self->{pending_queue} }
            or warn("no more pending files found");
    }

    # Renamed from $a: $a and $b are sort()'s package globals and must not
    # be shadowed by lexicals.
    my $entry = shift @{ $self->{pending_queue} };
    defined $entry or return;

    my ( $abs_path, $md5sum ) = @$entry;
    debug("returning abs path, $md5sum\n");
    $abs_path or die("missing abs path");
    $md5sum   or die("missing md5sum");
    return ( $abs_path, $md5sum );
}

=head3 get_next_indexpending()

No argument.
Returns abs_path, md5sum string for the next file in the queue.
You should attempt to lock afterwards.

Because of the nature of indexing, it can take a long time, and we may be
running multiple indexers, so attempting to lock is needed.

If none are pending, returns undef.
Every time you call get_next_indexpending, it returns a different file.

   while( my ($abs_path,$md5sum) = $self->get_next_indexpending ){
      # lock or next
   }

The md5sum string is the md5 hex sum for the file data at the time the files
table was updated. You should check it again on disk so you know it has not
changed in the meantime, and also, if you are remote indexing, to make sure
the data was not corrupted in transit.

This sub DOES return either those two values OR undef.

=cut