#!/usr/bin/perl -w
#
# Draw a random sample of tweets from the `twitter` table, keep those that
# Lingua::Identify classifies as Russian, Bulgarian or Ukrainian, and write
# them as a JSON array of {text, lang, len} records to twitter-non-en.json.

use strict;
use warnings;

use DBI;
use JSON::XS;
use Lingua::Identify qw(:language_identification);
#use lib qw(/home/corman/perlmodules);
#use SqlSupport;

# Language codes (as returned by langof) that are kept in the sample.
my %wanted_lang = map { $_ => 1 } qw(ru bg uk);

my $dbh   = connectpgdb('****', '****', '****', 'Pg', 'localhost');
my @items = getsqlcol($dbh,
    "select tweet_text from twitter order by random() limit 10000");

# Start from an empty array ref so that an empty result is still encoded
# as "[]" rather than failing on (or serialising) an undefined value.
my $sample = [];

foreach my $tweet (@items) {
    next unless defined $tweet;    # NULL column values cannot be classified

    # Scalar context: a single language code, or undef when the text
    # cannot be identified.
    my $lang = langof($tweet);
    next unless defined $lang && $wanted_lang{$lang};

    push @{$sample}, {
        text => $tweet,
        lang => $lang,
        len  => length($tweet),
    };
}

my $idx = scalar @{$sample};
print "$idx items\n";

# encode_json already returns UTF-8 encoded bytes, so the handle must be
# raw; a :utf8 layer here would encode the data a second time.
open(my $out, '>:raw', 'twitter-non-en.json')
    or die "Can't open output: $!";
print {$out} encode_json($sample)
    or die "Can't write output: $!";
close($out)
    or die "Can't close output: $!";

# connectpgdb($database, $user, $password, $driver, $server)
#
# Open a DBI connection (used with DBD::Pg) to $database on $server,
# port 5432, with AutoCommit and RaiseError enabled and PrintError off.
# Returns the database handle; dies if the connection cannot be made.
sub connectpgdb {
    my ($database, $user, $password, $driver, $server) = @_;

    my $url = "DBI:$driver:dbname=$database;host=$server;port=5432";
    my $dbh = DBI->connect(
        $url, $user, $password,
        { AutoCommit => 1, RaiseError => 1, PrintError => 0 },
    ) or die "connectdb can't connect to psql: $DBI::errstr\n";

    return $dbh;
}

# getsqlcol($dbh, $sqlstatement)
#
# Run $sqlstatement on $dbh and return the first column of every result
# row as a flat list (empty list when there are no rows). Dies if the
# statement cannot be prepared or executed; the explicit checks matter
# for handles created without RaiseError.
sub getsqlcol {
    my ($dbh, $sqlstatement) = @_;

    my $sth = $dbh->prepare($sqlstatement)
        or die "Could not prepare SQL statement: $sqlstatement: "
             . $dbh->errstr;
    $sth->execute
        or die "Could not execute SQL statement: $sqlstatement: "
             . $sth->errstr;

    my @results;
    while (my @row = $sth->fetchrow_array) {
        push @results, $row[0];
    }

    return @results;
}