# Problem was: I want no HTML entities such as '&auml;' in the database (which is a utf8 database).
# Solution was to decode them with HTML::Entities qw/decode_entities/;
#!/bin/sh
# Load the DBLP titles into a freshly created single-column table "dblp".
#
# Pipeline:
#   1. zcat   - decompress the DBLP XML dump.
#   2. perl   - for every input line that starts with a <title> element,
#               take the element text, drop a few boilerplate titles, turn
#               HTML entities into real characters, and print one UTF-8
#               encoded title per line (croaks on unencodable characters).
#   3. psql   - drop and recreate the table, then bulk-load the titles with
#               COPY ... FROM STDIN (text format, one row per line).
#
# Titles that contain nested markup or span several lines are not handled
# specially: the match is line based and anchored at the start of the line.
time zcat ~/dl/dblp.uni-trier.de/xml/dblp.xml.gz \
| perl -MEncode -MHTML::Entities -ne '
BEGIN {
    # Boilerplate titles that are not worth storing.
    %skip = map { $_ => 1 } (
        "Home Page", "Editorial.", "Preface.", "Introduction.",
        "Foreword.", "Guest Editorial.", "Book Reviews.",
    );
}
# Only lines that start with a <title> element are of interest.
if ( m{^<title>([^\n]*)</title>} ) {
    my $title = $1;
    $title =~ s{\\}{}g;
    next if $skip{$title};
    $title = decode_entities($title);
    # COPY text format: a backslash starts an escape, a tab separates
    # columns and a line break ends the row. Numeric entities can bring
    # these characters back, so sanitize again after decoding.
    $title =~ s{\\}{}g;
    $title =~ tr/\t\r\n/ /;
    print encode("UTF8", $title . "\n", Encode::FB_CROAK);
}' \
| psql -c "
drop table if exists dblp;
create table dblp (title text);
copy dblp from stdin;
" ;
# Sanity checks after the load.
# Number of titles that ended up in the table.
printf '%s\n' "select count(*) from dblp " | psql ;
# Show up to 40 titles that still contain an ampersand
# (presumably entities that were left undecoded - inspect by hand).
printf '%s\n' "select * from dblp where position ('&' in title) > 0 limit 40" | psql ;