Endless has asked for the wisdom of the Perl Monks concerning the following question:
# _process_json($jfile)
#
# Reads the JSON file named by $jfile, decodes it, and writes one CSV row
# per not-yet-seen tweet to the file-level handle $out_file.
#
# Row layout: tweet id, creation time in epoch seconds, klout score (empty
# string when absent), screen name, and the quoted tweet text.
#
# Uses and updates state declared outside this sub:
#   %duplicates         - tweet ids already written (only the keys matter)
#   $duplicate_count    - incremented for every skipped duplicate
#   $tweets_file_count  - incremented for every tweet written
#   $out_file           - output filehandle
#
# Dies if the file cannot be opened; decode_json() raises its own error on
# malformed input.
sub _process_json {
    my $jfile = shift;

    # Read the whole file in a single operation.
    # XXX: Might have trouble with larger jsons (10+ mb)
    my $json = do {
        open my $fh, '<', $jfile
            or die "Cannot open '$jfile' for reading: $!";
        local $/;    # Enable 'slurp' mode
        my $content = <$fh>;
        close $fh;
        $content;
    };

    my $json_data = decode_json($json);

    # 'interactions' holds a reference to an array of hashes, one hash per
    # interaction (twitter message). Iterating the dereferenced array
    # directly avoids each() on a reference, which Perl 5.24 removed.
    # A file without the key simply yields no rows.
    my $interactions = $json_data->{'interactions'} // [];

    for my $value (@{$interactions}) {
        my $twitter = $value->{'twitter'};
        my $tweetid = $twitter->{'id'};

        # Skip duplicates
        if (exists $duplicates{$tweetid}) {
            $duplicate_count++;
            next;
        }
        $duplicates{$tweetid} = undef;    # only key existence is tested
        $tweets_file_count++;

        # Dates of form 'Fri, 01 Mar 2013 01:21:14 +0000'
        my $created_at  = epoch_sec($twitter->{'created_at'});
        my $klout       = $value->{'klout'}{'score'} // "";    # Optional in DS jsons
        my $screen_name = $twitter->{'user'}{'screen_name'};
        my $text        = decode_entities($twitter->{'text'});

        # Formatting for the final output
        $text =~ s/\R/\t/g;    # Replace linebreaks with tabs
        $text =~ s/"/""/g;     # Double embedded quotes (CSV escaping)

        print {$out_file}
            join(',', $tweetid, $created_at, $klout, $screen_name, qq{"$text"})
            . "\n";
    }    #END for (each tweet)

    return;
}    #END _process_json

# epoch_sec(date): converts a date string of the exact 31-character form
# 'Fri, 01 Mar 2013 01:21:14 +0000' to seconds since the epoch (UTC).
# Prints a message to stdout and returns 0 when the string does not match.
use Inline C => q@
int epoch_sec(char * date) {
    struct tm tm;
    int tz;

    /* Zero the struct first: strptime is not guaranteed to fill in
       every field of it. */
    memset(&tm, 0, sizeof tm);

    /* The numeric zone offset starts at index 26; the length check runs
       first so that position is only read for well-sized input. */
    if ( strlen(date) != 31
         || strptime(date, "%a, %d %b %Y %T", &tm) == NULL
         || sscanf(date + 26, "%d", &tz) != 1) {
        printf("Invalid date %s\n", date);
        return 0;
    }

    /* tz is +/-HHMM read as a decimal integer; subtract the signed offset
       from the wall-clock time to get UTC. */
    return timegm(&tm)
        - (tz < 0 ? -1 : 1)*(abs(tz)/100*3600 + abs(tz)%100*60);
}
@;
|
---|
Replies are listed 'Best First'. | |
---|---|
Re: Optimize Large, Complex JSON decoding
by Anonymous Monk on Sep 19, 2013 at 00:20 UTC | |
Re: Optimize Large, Complex JSON decoding
by Anonymous Monk on Sep 19, 2013 at 00:54 UTC | |
by Endless (Beadle) on Sep 19, 2013 at 01:30 UTC | |
by Anonymous Monk on Sep 19, 2013 at 03:35 UTC |