d8 = (d7 ^ q8) * H_PRIME ^ HASH_LEN; d9 = d8 & ~127; d9 %= 1001; if (d9 < 0) d9 += 1001; if (d9 < 500 - 127) continue; if (d9 > 500 + 127) continue; dist = d9 < 500 ? 500 - d9 : d9 - 500; q9 = (d8 & 127) ^ dist; if (q9 == 0 || q9 == 10 || q9 == 13) continue; #### c9 = (c7 ^ q8) * H_PRIME ^ HASH_LEN ^ q9; c9 %= 1001; if (c9 < 0) c9 += 1001; if (c9 != 100) continue; #### #define H_PRIME 1000003 #define HASH_LEN 11 // my_m128_t mimics Intel __m128i type typedef struct my_m128_t my_m128_t; struct my_m128_t { short m128i_i16[8]; }; int inner( const unsigned char* bytevecM, int startval, int endval, int m2, int d2, int c2, int l2, int x2, my_m128_t* ps) { int m3, m4, m5, m6, m7; int d3, d4, d5, d6, d7, d9; int c3, c4, c5, c6, c7, c9; int l3, l4, l5, l6, l7, l9; int x3, x4, x5, x6, x7, x9; int q3, q4, q5, q6, q7, q8, q9; int iret = 0; for (q3 = startval; q3 < endval; ++q3) { if (q3 == 10 || q3 == 13) continue; m3 = (m2 ^ q3) * H_PRIME; d3 = (d2 ^ q3) * H_PRIME; c3 = (c2 ^ q3) * H_PRIME; l3 = (l2 ^ q3) * H_PRIME; x3 = (x2 ^ q3) * H_PRIME; for (q4 = 1; q4 < 128; ++q4) { if (q4 == 10 || q4 == 13) continue; m4 = (m3 ^ q4) * H_PRIME; d4 = (d3 ^ q4) * H_PRIME; c4 = (c3 ^ q4) * H_PRIME; l4 = (l3 ^ q4) * H_PRIME; x4 = (x3 ^ q4) * H_PRIME; for (q5 = 1; q5 < 128; ++q5) { if (q5 == 10 || q5 == 13) continue; m5 = (m4 ^ q5) * H_PRIME; d5 = (d4 ^ q5) * H_PRIME; c5 = (c4 ^ q5) * H_PRIME; l5 = (l4 ^ q5) * H_PRIME; x5 = (x4 ^ q5) * H_PRIME; for (q6 = 1; q6 < 128; ++q6) { if (q6 == 10 || q6 == 13) continue; m6 = (m5 ^ q6) * H_PRIME; d6 = (d5 ^ q6) * H_PRIME; c6 = (c5 ^ q6) * H_PRIME; l6 = (l5 ^ q6) * H_PRIME; x6 = (x5 ^ q6) * H_PRIME; for (q7 = 1; q7 < 128; ++q7) { if (q7 == 10 || q7 == 13) continue; m7 = (m6 ^ q7) * H_PRIME; d7 = (d6 ^ q7) * H_PRIME; c7 = (c6 ^ q7) * H_PRIME; l7 = (l6 ^ q7) * H_PRIME; x7 = (x6 ^ q7) * H_PRIME; for (q8 = 1; q8 < 128; ++q8) { if (q8 == 10 || q8 == 13) continue; q9 = bytevecM[(unsigned int)(m7 ^ q8)]; if (q9 == 0) continue; d9 = (d7 ^ q8) * H_PRIME ^ HASH_LEN ^ q9; d9 %= 1001; if (d9 < 0) d9 += 1001; if (d9 != 500) continue; c9 = (c7 ^ q8) * H_PRIME ^ HASH_LEN ^ q9; c9 %= 1001; if (c9 < 0) c9 += 1001; if (c9 != 100) continue; l9 = (l7 ^ q8) * H_PRIME ^ HASH_LEN ^ q9; l9 %= 1001; if (l9 < 0) l9 += 1001; if (l9 != 50) continue; x9 = (x7 ^ q8) * H_PRIME ^ HASH_LEN ^ q9; x9 %= 1001; if (x9 < 0) x9 += 1001; if (x9 != 10) continue; ps[iret].m128i_i16[0] = q3; ps[iret].m128i_i16[1] = q4; ps[iret].m128i_i16[2] = q5; ps[iret].m128i_i16[3] = q6; ps[iret].m128i_i16[4] = q7; ps[iret].m128i_i16[5] = q8; ps[iret].m128i_i16[6] = q9; ++iret; } } } } } } return iret; } #### _mm_prefetch(&bytevecM[(unsigned int)m7 & 0xffffff80], _MM_HINT_T0); _mm_prefetch(&bytevecM[64+((unsigned int)m7 & 0xffffff80)], _MM_HINT_T0); #### BOOL MySetLockPagesPrivilege(HANDLE hProcess, BOOL bEnable) { struct { DWORD Count; LUID_AND_ATTRIBUTES Privilege[1]; } Info; HANDLE Token; if ( !OpenProcessToken(hProcess, TOKEN_ADJUST_PRIVILEGES, &Token) ) { printf("Cannot open process token.\n"); return FALSE; } Info.Count = 1; if (bEnable) { Info.Privilege[0].Attributes = SE_PRIVILEGE_ENABLED; } else { Info.Privilege[0].Attributes = 0; } if ( !LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &(Info.Privilege[0].Luid)) ) { printf("Cannot get privilege for %s.\n", SE_LOCK_MEMORY_NAME); return FALSE; } if ( !AdjustTokenPrivileges(Token, FALSE, (PTOKEN_PRIVILEGES)&Info, 0, NULL, NULL) ) { printf ("Cannot adjust token privileges (%u)\n", GetLastError()); return FALSE; } // ERROR_NOT_ALL_ASSIGNED // 1300 (0x514) // Not all privileges or groups referenced are assigned to the caller. DWORD dwErr = GetLastError(); printf("AdjustTokenPrivileges dwErr=%u\n", dwErr); if (dwErr != ERROR_SUCCESS) { printf("Cannot enable the SE_LOCK_MEMORY_NAME privilege"); return FALSE; } CloseHandle(Token); return TRUE; } void LargeMemInit() { if ( !MySetLockPagesPrivilege(GetCurrentProcess(), TRUE) ) { fprintf(stderr, "MySetLockPagesPrivilege failed.\n"); exit(1); } } // VirtualAlloc may fail with: ERROR_NO_SYSTEM_RESOURCES = 1450. To fix, reboot. static void* my_big_malloc(size_t s) { DWORD dwErr; void* pMem; pMem = VirtualAlloc( NULL, // Let system decide where to allocate it s, // size of memory block MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, PAGE_READWRITE ); if (pMem == NULL) { // Error 1314 = ERROR_PRIVILEGE_NOT_HELD dwErr = GetLastError(); fprintf(stderr, "VirtualAlloc failed: dwError=%d\n", (int)dwErr); exit(1); } printf("VirtualAlloc ok p=%p\n", pMem); return pMem; } #### #include #include #define H_PRIME 1000003 #define HASH_LEN 11 // my_m128_t mimics Intel __m128i type typedef struct my_m128_t my_m128_t; struct my_m128_t { short m128i_i16[8]; }; int inner( const unsigned char* bytevecM, int startval, int endval, int m2, int d2, int c2, int l2, int x2, my_m128_t* ps) { __m128i s2 = _mm_set_epi32(x2, l2, c2, d2); __m128i hp = _mm_set1_epi32(H_PRIME); __m128i s3, s4, s5, s6, s7; int m3, m4, m5, m6, m7; int d9; int c9; int l9; int x9; int q3, q4, q5, q6, q7, q8, q9; int iret = 0; for (q3 = startval; q3 < endval; ++q3) { if (q3 == 10 || q3 == 13) continue; m3 = (m2 ^ q3) * H_PRIME; s3 = _mm_mullo_epi32(_mm_xor_si128(s2, _mm_set1_epi32(q3)), hp); for (q4 = 1; q4 < 128; ++q4) { if (q4 == 10 || q4 == 13) continue; m4 = (m3 ^ q4) * H_PRIME; s4 = _mm_mullo_epi32(_mm_xor_si128(s3, _mm_set1_epi32(q4)), hp); for (q5 = 1; q5 < 128; ++q5) { if (q5 == 10 || q5 == 13) continue; m5 = (m4 ^ q5) * H_PRIME; s5 = _mm_mullo_epi32(_mm_xor_si128(s4, _mm_set1_epi32(q5)), hp); for (q6 = 1; q6 < 128; ++q6) { if (q6 == 10 || q6 == 13) continue; m6 = (m5 ^ q6) * H_PRIME; s6 = _mm_mullo_epi32(_mm_xor_si128(s5, _mm_set1_epi32(q6)), hp); for (q7 = 1; q7 < 128; ++q7) { if (q7 == 10 || q7 == 13) continue; m7 = (m6 ^ q7) * H_PRIME; s7 = _mm_mullo_epi32(_mm_xor_si128(s6, _mm_set1_epi32(q7)), hp); _mm_prefetch(&bytevecM[(unsigned int)m7 & 0xffffff80], _MM_HINT_T0); _mm_prefetch(&bytevecM[64+((unsigned int)m7 & 0xffffff80)], _MM_HINT_T0); for (q8 = 1; q8 < 128; ++q8) { if (q8 == 10 || q8 == 13) continue; q9 = bytevecM[(unsigned int)(m7 ^ q8)]; if (q9 == 0) continue; d9 = (s7.m128i_i32[0] ^ q8) * H_PRIME ^ HASH_LEN ^ q9; d9 %= 1001; if (d9 < 0) d9 += 1001; if (d9 != 500) continue; c9 = (s7.m128i_i32[1] ^ q8) * H_PRIME ^ HASH_LEN ^ q9; c9 %= 1001; if (c9 < 0) c9 += 1001; if (c9 != 100) continue; l9 = (s7.m128i_i32[2] ^ q8) * H_PRIME ^ HASH_LEN ^ q9; l9 %= 1001; if (l9 < 0) l9 += 1001; if (l9 != 50) continue; x9 = (s7.m128i_i32[3] ^ q8) * H_PRIME ^ HASH_LEN ^ q9; x9 %= 1001; if (x9 < 0) x9 += 1001; if (x9 != 10) continue; ps[iret].m128i_i16[0] = q3; ps[iret].m128i_i16[1] = q4; ps[iret].m128i_i16[2] = q5; ps[iret].m128i_i16[3] = q6; ps[iret].m128i_i16[4] = q7; ps[iret].m128i_i16[5] = q8; ps[iret].m128i_i16[6] = q9; ++iret; } } } } } } return iret; } #### #define UNROLL(qx) \ q9 = bytevecM[(unsigned int)(m7 ^ qx)]; \ if (q9 != 0) { \ d9 = (s7.m128i_i32[0] ^ qx) * H_PRIME ^ HASH_LEN ^ q9; \ d9 %= 1001; if (d9 < 0) d9 += 1001; \ if (d9 == 500) { \ c9 = (s7.m128i_i32[1] ^ qx) * H_PRIME ^ HASH_LEN ^ q9; \ c9 %= 1001; if (c9 < 0) c9 += 1001; \ if (c9 == 100) { \ l9 = (s7.m128i_i32[2] ^ qx) * H_PRIME ^ HASH_LEN ^ q9; \ l9 %= 1001; if (l9 < 0) l9 += 1001; \ if (l9 == 50) { \ x9 = (s7.m128i_i32[3] ^ qx) * H_PRIME ^ HASH_LEN ^ q9; \ x9 %= 1001; if (x9 < 0) x9 += 1001; \ if (x9 == 10) { \ ps[iret].m128i_i16[0] = q3; \ ps[iret].m128i_i16[1] = q4; \ ps[iret].m128i_i16[2] = q5; \ ps[iret].m128i_i16[3] = q6; \ ps[iret].m128i_i16[4] = q7; \ ps[iret].m128i_i16[5] = qx; \ ps[iret].m128i_i16[6] = q9; \ ++iret; \ } \ } \ } \ } \ } #### for (q8 = 1; q8 < 128; ++q8) { if (q8 == 10 || q8 == 13) continue; q9 = bytevecM[(unsigned int)(m7 ^ q8)]; if (q9 == 0) continue; d9 = (s7.m128i_i32[0] ^ q8) * H_PRIME ^ HASH_LEN ^ q9; d9 %= 1001; if (d9 < 0) d9 += 1001; if (d9 != 500) continue; c9 = (s7.m128i_i32[1] ^ q8) * H_PRIME ^ HASH_LEN ^ q9; c9 %= 1001; if (c9 < 0) c9 += 1001; if (c9 != 100) continue; l9 = (s7.m128i_i32[2] ^ q8) * H_PRIME ^ HASH_LEN ^ q9; l9 %= 1001; if (l9 < 0) l9 += 1001; if (l9 != 50) continue; x9 = (s7.m128i_i32[3] ^ q8) * H_PRIME ^ HASH_LEN ^ q9; x9 %= 1001; if (x9 < 0) x9 += 1001; if (x9 != 10) continue; ps[iret].m128i_i16[0] = q3; ps[iret].m128i_i16[1] = q4; ps[iret].m128i_i16[2] = q5; ps[iret].m128i_i16[3] = q6; ps[iret].m128i_i16[4] = q7; ps[iret].m128i_i16[5] = q8; ps[iret].m128i_i16[6] = q9; ++iret; } #### UNROLL(1) UNROLL(2) UNROLL(3) UNROLL(4) UNROLL(5) UNROLL(6) UNROLL(7) UNROLL(8) UNROLL(9) UNROLL(11) UNROLL(12) for (q8 = 14; q8 < 128; ++q8) { UNROLL(q8) } #### UNROLL(1) UNROLL(2) UNROLL(3) ... UNROLL(127)