#!/usr/bin/perl
use strict;
use warnings;
use CGI;
use URI::Escape;

##############################################################################
#
# Scans web-server access-log lines for "GET /netacgi/nph-brs?..." requests,
# extracts the database code (parameter "d") and the search expression
# (parameter "s4"), breaks the expression into terms and writes a truncated
# "<db> <terms...>" progress line to STDERR.
#
# Usage: script [file.gz] [logfile ...]
#   If the first argument ends in ".gz" it is decompressed with zcat and fed
#   to STDIN; otherwise input comes from the named files or STDIN via <>.
#
##############################################################################

# Database codes (upper-cased "d" parameter) whose requests are skipped.
my %skip_db = map { $_ => 1 } qw( CHNH CHCA );

# Operator and filler words that are never reported as search terms.
my %stopword = map { $_ => 1 } qw( and not or adj of the for with );

if (@ARGV && $ARGV[0] =~ /\.gz$/i) {
    # List-form pipe open: the file name is passed straight to zcat and is
    # never interpreted by a shell.
    open STDIN, '-|', 'zcat', $ARGV[0]
        or die "cannot run zcat on $ARGV[0]: $!\n";
    shift @ARGV;
}

##############################################################################
## breakquery QUERY
##
## Splits a search expression into its distinct terms.
##
##   QUERY   - the raw search string (already URL-decoded by the caller).
##
## Returns the sorted list of unique terms:
##   * quoted phrases (single or double quotes) are kept whole;
##   * remaining text is split on whitespace and commas;
##   * words shorter than three characters, purely numeric words, words
##     containing anything other than a-z, 0-9 or "-", and stopwords are
##     dropped.
## An undefined QUERY yields an empty list.
sub breakquery {
    my $sz = shift;
    my %res;

    return unless defined $sz;

    $sz = "\L$sz";

    # Remove ".xx." tokens (presumably field qualifiers -- TODO confirm).
    $sz =~ s/\.[a-z]{2}\.//g;

    # Remove "@xx<char><digits>" tokens (presumably field restrictions such
    # as "@yr>1995").  NOTE(review): the single "." only covers a one-character
    # operator, so a form like "@yr>=1995" is not removed -- confirm intent.
    $sz =~ s/\@[a-z]{2}.\d+//g;

    # Pull out quoted phrases first so they survive as single terms.
    while ($sz =~ s/[\"\']([^\"\']+?)[\"\']//) {
        $res{$1}++;
    }

    $sz =~ s/\s{2}/ /g;
    $sz =~ s/[\+\'\"\$\(\)]//g;

    if ($sz) {
        for my $t (split /[\s,]/, $sz) {
            $t =~ s/^\s+//;
            $t =~ s/\s+$//;

            next if $t =~ /^\s*$/;              # empty after trimming
            next if length($t) < 3;             # one- and two-character words
            next if $t !~ /^[a-z0-9\-]+$/;      # unexpected characters
            next if $t =~ /^[0-9]+$/;           # purely numeric
            next if $stopword{$t};              # operators / filler words

            $res{$t}++;
        }
    }

    return sort keys %res;
}

##############################################################################
while (my $line = <>) {
    next unless $line =~ m{GET /netacgi/nph-brs\?([^\s]+)};
    my $query = $1;    # copy the capture before any other regex can clobber it

    my $cgi = CGI->new($query);
    next unless defined $cgi;

    # Database code: must be present, non-blank, exactly four characters and
    # not one of the excluded databases.
    my $db = $cgi->param("d");
    next unless defined $db;
    next if $db =~ /^\s*$/;
    $db = "\U$db";
    # Exact string comparison: the value comes from the request and must not
    # be interpolated into a regular expression.
    next if $skip_db{$db};
    next if length($db) != 4;

    my $s4 = $cgi->param("s4");
    next unless defined $s4;
    next if $s4 =~ /^\s*$/;

    my @terms = breakquery($s4);

    # Progress display: return to column 0 and overwrite with at most 100
    # characters of "<db> <terms>".
    print STDERR "\r";
    printf STDERR "%.100s", "$db " . join(" ", @terms);
    # $db $t
}

# Example access-log entry kept for reference:
#### anx57-105.dialup.emory.edu - - [01/Mar/2001:00:00:21 -0500] "GET /detail/detail.html HTTP/1.0" 200 19308 "http://chid.nih.gov/netacgi/nph-brs?op4=and&op5=and&op6=and&op7=and&op8=and&op9=and&op10=and&d=CHCP&l=20&Sect1=CINK&co3=and&pg4=all&s4=underserved&co4=and&pg5=mj&s5=cervical+cancer&co5=and&pg6=de&s6=&co6=and&pg7=au,cn&s7=&co7=and&pg8=ti&s8=&co8=and&pg9=ac&s9=&co9=and&pg10=so,av&s10=&s1=@YR%3E=1995+or+199X.&co1=and&s3=&co2=and&s2=&Sect2=IMAGE&Sect3=THESOFF&Sect3=PLUROFF&Sect4=HITOFF&p=1&u=/detail/detail.html&r=8&f=G" "Mozilla/4.73 [en] (Win95; U)"