/* Prefetch the location of bytevecM[i] using the T0 hint. */
_mm_prefetch(&bytevecM[i], _MM_HINT_T0);
/* Also prefetch the element whose index is i with bit 6 flipped (i ^ 64),
 * i.e. the element 64 positions away inside the same 128-aligned index group.
 * NOTE(review): presumably bytevecM holds 1-byte elements and is at least
 * 64-byte aligned, so this touches the neighbouring cache line -- confirm. */
_mm_prefetch(&bytevecM[i^64], _MM_HINT_T0);
####
/* Round the index down to a multiple of 128 (the mask 0xffffff80 clears the
 * low 7 bits) and prefetch that element using the T0 hint.
 * NOTE(review): the cast to unsigned int drops any bits of i above the
 * width of unsigned int -- confirm i always fits, or the prefetch address
 * will not correspond to bytevecM[i]. */
_mm_prefetch(&bytevecM[(unsigned int)(i) & 0xffffff80], _MM_HINT_T0);
/* Prefetch the element 64 positions past the rounded-down index, i.e. the
 * second half of the same 128-element group. */
_mm_prefetch(&bytevecM[64+((unsigned int)(i) & 0xffffff80)], _MM_HINT_T0);
##
##
; 100 : UNROLL(q8)
; Loop body for one q8 iteration: form an index from the counter, load one
; byte, and skip to the loop increment at 0x140002a0b when that byte is zero.
; The trailing "N.NNNs" figures are presumably per-instruction profiler times.
; NOTE(review): sampling profilers often attribute stall time to a later
; instruction than the one that stalled -- verify before blaming mov/test.
0x1400028e0 Block 178:
; copy the 32-bit counter r9d into eax (writing eax zero-extends into rax)
0x1400028e0 mov eax, r9d 7.217s
; combine the counter with rdi by XOR to form the byte index
0x1400028e3 xor rax, rdi 0.060s
; load the byte at [rax + rsi], zero-extended into r10d
0x1400028e6 movzx r10d, byte ptr [rax+rsi*1] 0.100s
; test whether the loaded byte is zero
0x1400028eb test r10d, r10d 2.508s
; zero byte: jump straight to the loop increment (Block 192)
0x1400028ee jz 0x140002a0b
##
##
; 99 : for (q8 = 14; q8 < 128; ++q8) {
; Loop increment and back-edge: r9d is the counter compared against 128,
; matching the q8 < 128 condition in the source annotation above.
0x140002a0b Block 192:
; ++q8
0x140002a0b inc r9d 7.008s
; compare the counter with 0x80 (128)
0x140002a0e cmp r9d, 0x80 0.690s
; counter < 128 (signed): branch back to the loop body at Block 178
0x140002a15 jl 0x1400028e0