in reply to Perl Hashes in C?

Stick to Perl, use the underlying C routines, and the hash capabilities of perl (unless you run out of memory)
Once you binary read a pixel and have 48 bits of data, it can be unpacked to a large integer.
However, for speed, you should not use 3 times a 16 bit value, then shift them together, but use quad values, which only are available if your perl is 64 bit. Check Pack/Unpack Tutorial (aka How the System Stores Data)
But still, in perl, it would be better to unpack to hex strings, which are much shorter (thus faster), and use those as keys in a hash counter, like so:

my %SEEN; my $key; open( FH, '<', "image.raw" ) or die $!; binmode(FH); my $startpos = 10; # skip header seek(FH, $startpos, 0); $/ = \42; # set record size while( <FH> ) { $key = join("", unpack 'H42', $_ ); $SEEN{$key}++; # not sure how to calculate stop } for my $key (keys %SEEN){ print "key $key seen $SEEN{$key} times\n"; } # Add method to go from hex key to numeric rgb values if you want

Replies are listed 'Best First'.
Re^2: Perl Hashes in C?
by Anonymous Monk on Aug 12, 2015 at 05:46 UTC
    I have PerlMagick read the file into a blob
    Unpack the blob into a very long array of unsigned shorts
    Calc the compound, multi-channel values for Yellow, Purple Cyan and White and hash them all
    The array slice makes it a great deal faster than segmenting it into pixels
    $im = Image::Magick->new(size=>$xyr, type=>'TrueColor', depth=>16); $err=$im->Read(filename=>"RGB:$raw"); warn "$err" if $err; # Read RA +W ($blob) = $im->ImageToBlob(); $bl=length $blob; @ushort=unpack("S*", $blob); $blen=length $blob; $ulen=scalar @ushort; printf("CR: $lbl Blob len $blen unpacks to $ulen uint16s -> %d pix + -> %4.6f B/us\n", $blen / 6, $blen/$ulen); for($ii=0; $ii < scalar @ushort; $ii+=3) { ($rr, $gg, $bb) = @ushort[$ii .. $ii+2]; # Array slice # Show full integers in 4Gig range for Yellow, purple, cyan $yy=sprintf("%10d", $rr*$gg); # White=4294967296 (4,294,967,29 +6) 10 dig $pp=sprintf("%10d", $rr*$bb); $cc=sprintf("%10d", $gg*$bb); # (2^16)^3 = 2^48 = 2.815E14, 15 sig figs MAX. # Doubles have 52/53 bits of Significand $tt=sprintf("%15d", $rr*$gg*$bb); # TRUE color, R*G*B??? $c2v2c{r}{$rr}++; # Count Red channel points with this value $c2v2c{g}{$gg}++; # Count Gre channel points with this value $c2v2c{b}{$bb}++; # Count Blu channel points with this value $c2v2c{y}{$yy}++; # Count Blu channel points with this value $c2v2c{p}{$pp}++; # Count Blu channel points with this value $c2v2c{c}{$cc}++; # Count Blu channel points with this value $c2v2c{t}{$tt}++; # Count Blu channel points with this value } $vt=0; # Values Total; sum of all values foreach $chan (@leg) { # sort keys %c2v2c $clr = $leg{$chan}; # y -> Yellow %v2c = %{$c2v2c{$chan}}; # Values -> Count_of_this_value hash $vc = scalar keys %v2c; # Distinct values for this channel $vt += $vc; # Value Count accumulator push @vc, "$clr $vc"; } printf("CR: $lbl Chan Value Counts: %s, tot=$vt\n", join(", ", @vc +)); return $vc; # Last ValueCount should be for T, TrueRGB } # End Count_Rgb().
    Looking at the log file:
    3), R pf-2015.0531-249465.srgb.7360x4912.raw , crop_wh=577,428, blob= +1481736 B -> 6.000000 B/p CR: srgb Blob len 1481736 unpacks to 740868 uint16s -> 246956 pix -> 2 +.000000 B/us CR: srgb Chan Value Counts: Red 18955, Green 16919, Blue 10862, Yellow + 236778, Purple 221564, Cyan 222509, TrueRGB 225558, tot=953145 Distinct RGB colors=225558 = 91.335% -> 1 dup clr / 11.54 pix Elapsed time = 9.67 sec
    IT does 4 of these in almost 10 seconds, but look at the pixel count; < 1/4 MegaPix

    The hash only has to deal with 91% of 1/4 MB or 225,558 226 Thousand unique colors.

    Scale this to 30 MILLION unique colors and the time factor goes from 9 seconds to 9 DAYS!

    I have the C version down to 3 minutes flat with a 7920 Unsorted limit and 2.77 Million unique colors!

      This was the code example from FreeBeerReekingMonk: ----------------------------- my %SEEN; my $key; open( FH, '<', "image.raw" ) or die $!; binmode(FH); my $startpos = 10; # skip header seek(FH, $startpos, 0); $/ = \42; # set record size while( <FH> ) { $key = join("", unpack 'H42', $_ ); $SEEN{$key}++; # not sure how to calculate stop } for my $key (keys %SEEN){ print "key $key seen $SEEN{$key} times\n"; } # Add method to go from hex key to numeric rgb values if you want ================================= I have no idea where the \42 came from. I changed it to a record size=6. Then changed the Unpack from 'H42' to 'H12' 12 hex = 6 bytes. It works: # The RIGHT ANSWER IS: RGB hash has 27645898 distinct colors. sub rgb_hash2() { my %SEEN; my $key; # 27645898 colors 216913920 B 36152320 pixels! $raw="pf-2015.0531-249465.pprgb.7360x4912.raw"; # $fs=-s $raw; # RAW file size in BYTES open( FH, '<', "$raw" ) or die $!; binmode(FH); # RGB 16 BITS/CHANNEL = 48BITs/PIXEL = 6 BYTE QUANTA -> 12 HEX digits $/ = \6; # set record size. REF 2 INT, "\6" == 6 B #$/ = \42; # set record size while( <FH> ) { # Smash each 12 Hex rec into 48 BIT UINT48? #$key = join("", unpack 'S3', $_ ); $key = join("", unpack 'H12', $_ ); $SEEN{$key}++; } close FH; $sk=scalar keys %SEEN; # Scalar Keys. How many COLORS? printf("Seen{%d}, file_size=%2.6fMB, -> %1.6f B/key\n", $sk, $fs/1.0E6, $fs/$sk); return $sk; } I also tried it with 3 Unsigned Shorts too. ===================================================== ===================================================== Monk Ed suggested something at a bit lower level with a sysread into a tiny 4096 * 6 byte buffer and nibbling the buffer substr. If you monkey with the buffer size and try a size not evenly divisible by 6, it breaks. D'oh! =============================================== Ed's ~ code (may be partially mangled by me). %Image = (); keys %Image = 4096 * 128; $rdamt = 4194304 unless $rdamt; # 4 MByte! 4096 * 6; $compraw = -s $file; $pixels = int ( $compraw / 6 ); open ( $in, "<", "$file") or die "$!\n"; while( 1 ) { $size = sysread( $in, $buffer, $rdamt ); if ( $size == 0 ) { last; } while( $buffer ) { if ( length( $buffer ) < 6 ) { last; } $key = substr( $buffer, 0, 6, '' ); $Image{$key}++; } } close $in; $uniq = keys %Image; =============================================== PERFORMANCE SYSREAD with buffer size Event 'Read_Raw_16384' elapsed time = 0.520 min = 10.43%. Event 'Read_Raw_24576' elapsed time = 0.519 min = 10.41%. Event 'Read_Raw_4096' elapsed time = 0.525 min = 10.54%. 10.54% -> Read_Raw_4096 10.43% -> Read_Raw_16384 10.41% -> Read_Raw_24576 << Ed's Magic Number 4096 * 6 Average ~= 0.5 min Record read, unpack to HEX Event 'Read_Raw_Hex0' elapsed time = 0.781 min = 15.68%. Event 'Read_Raw_Hex1' elapsed time = 0.711 min = 14.26%. Event 'Read_Raw_Hex2' elapsed time = 0.704 min = 14.13%. Event 'Read_Raw_Hex3' elapsed time = 0.707 min = 14.18%. 15.68% -> Read_Raw_Hex0 14.26% -> Read_Raw_Hex1 14.18% -> Read_Raw_Hex3 14.13% -> Read_Raw_Hex2 Average ~= 0.7 min Ed's magic number beat out the others tried by an eyelash! Tried Unpacking 3 UShorts also. Seems like it should be faster to unpack 3, 2 byte ints than 12 hex: Event 'Read_Raw_UINT16x3Pack0' elapsed time=0.959 min=25.82% Event 'Read_Raw_UINT16x3Pack1' elapsed time=0.919 min=24.76% Event 'Read_Raw_UINT16x3Pack2' elapsed time=0.918 min=24.73% Event 'Read_Raw_UINT16x3Pack3' elapsed time=0.917 min=24.70% 25.82% -> Read_Raw_UINT16x3Pack0 24.76% -> Read_Raw_UINT16x3Pack1 24.73% -> Read_Raw_UINT16x3Pack2 24.70% -> Read_Raw_UINT16x3Pack3 NO! Hex Unpack is faster that UINT16 on the same data and by a sizable margin, The low level sysreads without buffering Gets the GOLD MEDAL! There was another INLINE C method from BrowserUk that looked perfect, but it used GD with "TRUECOLOR" which on my system means 24 bit and my data are all 16 so I didn't try it. What started as a 2 hour problem has been whittled down to 20 seconds. Ye Ha! Thanks for all the great ideas! Brian ======================================================== Inline C version from BrowserUk: by BrowserUk on Aug 11, 2015 at 20:28 UTC This drops into Inline::C to explore the bitmap and construct the hash +. It takes 23 seconds to process a 125 megapixel image: #! perl -slw use strict; use Inline C => Config => BUILD_NOISY => 1; use Inline C => <<'END_C', NAME => '_1138218', CLEAN_AFTER_BUILD =>0; typedef unsigned __int64 U64; typedef unsigned int U32; typedef unsigned char U8; HV* countColors( SV *img ) { STRLEN l; U32 *rgba = (U32*)SvPVx( img, l ), i; HV* counts = newHV(); l /= 4; for( i = 0; i < l; ++i ) { if( hv_exists( counts, (char*)&rgba[ i ], 4 ) ) { SV **val = hv_fetch( counts, (char*)&rgba[ i ], 4, 0 ); SvIV_set( *val, SvIV( *val ) + 1 ); } else { SV *val = newSViv( 1 ); hv_store( counts, (char*)&rgba[ i ], 4, val, 0 ); } } return counts; } END_C use Time::HiRes qw[ time ]; use Data::Dump qw[ pp ]; use List::Util qw[ sum ]; use GD; GD::Image->trueColor(1); my $i = GD::Image->new( $ARGV[ 0 ] ) or die $!; my $gd = $i->gd; my( undef, $width, $height, undef, undef ) = unpack 'nnnCV', substr( $ +gd, 0, 11, '' ); printf "Width:%u height:%u pixels:%u\n", $width, $height, length( $gd +) / 4; my $start = time; my $href = countColors( $gd ); printf "Took %f seconds\n", time() -$start; printf "Found %u colors in %u pixels\n", scalar( keys %{ $href } ), su +m( values %{ $href } );