// Fragment of a search-loop body; the enclosing loop and declarations are not shown here.
// d8 is the d-hash state after mixing in q8 and HASH_LEN but before any final byte q9.
// (`*` binds tighter than `^`, so this is ((d7 ^ q8) * H_PRIME) ^ HASH_LEN.)
d8 = (d7 ^ q8) * H_PRIME ^ HASH_LEN;
// Drop the low 7 bits of d8 ...
d9 = d8 & ~127;
// ... and reduce to 0..1000 (C's % can yield a negative result, hence the fix-up).
d9 %= 1001; if (d9 < 0) d9 += 1001;
// Skip this q8 unless d9 lies within 127 of 500.
if (d9 < 500 - 127) continue;
if (d9 > 500 + 127) continue;
// dist = |d9 - 500|, which is at most 127 after the two checks above.
dist = d9 < 500 ? 500 - d9 : d9 - 500;
// Candidate final byte built from the low 7 bits of d8 and dist.
// NOTE(review): this looks like it derives q9 from the d-hash target (500) directly
// instead of fetching it from a lookup table - confirm against the surrounding text.
q9 = (d8 & 127) ^ dist;
// Reject candidate bytes 0, 10 and 13.
if (q9 == 0 || q9 == 10 || q9 == 13) continue;
####
// Fragment of a search-loop body: the c-hash filter.
// Finish the c hash for this (q8, q9) pair: ((c7 ^ q8) * H_PRIME) ^ HASH_LEN ^ q9.
c9 = (c7 ^ q8) * H_PRIME ^ HASH_LEN ^ q9;
// Reduce to 0..1000 (C's % can yield a negative result, hence the fix-up).
c9 %= 1001; if (c9 < 0) c9 += 1001;
// Keep only candidates whose c hash reduces to exactly 100.
if (c9 != 100) continue;
##
##
#define H_PRIME 1000003
#define HASH_LEN 11
// my_m128_t mimics Intel __m128i type
typedef struct my_m128_t my_m128_t;
struct my_m128_t { short m128i_i16[8]; };

/*
 * One hash round: (h ^ q) * H_PRIME with 32-bit wraparound.
 *
 * The product routinely exceeds INT_MAX. Signed overflow is undefined
 * behavior in C, so the multiply is done in unsigned arithmetic (which is
 * defined to wrap) and converted back to int. That conversion is
 * implementation-defined and keeps the bit pattern on two's-complement
 * compilers.
 */
static int hash_round(int h, int q)
{
    return (int)((unsigned int)(h ^ q) * (unsigned int)H_PRIME);
}

/*
 * Final round for one hash: mixes in q8, then HASH_LEN and q9, and reduces
 * the signed result to the range 0..1000 (a negative C remainder is shifted
 * up by 1001).
 */
static int hash_finish_mod1001(int h7, int q8, int q9)
{
    int v = hash_round(h7, q8) ^ HASH_LEN ^ q9;
    v %= 1001;
    if (v < 0) v += 1001;
    return v;
}

/*
 * Brute-force search over the bytes q3..q8 (plus a table-supplied q9).
 *
 * bytevecM   lookup table giving q9 for index (unsigned int)(m7 ^ q8); a zero
 *            entry means "no candidate". The index is a full 32-bit value, so
 *            the table must be readable at every index the m hash can produce.
 * startval,
 * endval     half-open range [startval, endval) of q3 values to try.
 * m2..x2     the five hash states before q3 is mixed in.
 * ps         output array; each hit stores q3..q9 in m128i_i16[0..6].
 *            No capacity is passed in, so the caller must size it for every
 *            possible hit.
 *
 * q4..q8 each run over 1..127. The values 10 and 13 are skipped for every
 * one of q3..q8.
 *
 * A candidate is a hit when the finished d, c, l and x hashes reduce
 * (mod 1001) to 500, 100, 50 and 10 respectively.
 *
 * Returns the number of hits written to ps.
 */
int inner(
    const unsigned char* bytevecM,
    int startval, int endval,
    int m2, int d2, int c2, int l2, int x2,
    my_m128_t* ps)
{
    int m3, m4, m5, m6, m7;
    int d3, d4, d5, d6, d7;
    int c3, c4, c5, c6, c7;
    int l3, l4, l5, l6, l7;
    int x3, x4, x5, x6, x7;
    int q3, q4, q5, q6, q7, q8, q9;
    int iret = 0;

    for (q3 = startval; q3 < endval; ++q3) {
        if (q3 == 10 || q3 == 13) continue;
        m3 = hash_round(m2, q3);
        d3 = hash_round(d2, q3);
        c3 = hash_round(c2, q3);
        l3 = hash_round(l2, q3);
        x3 = hash_round(x2, q3);
        for (q4 = 1; q4 < 128; ++q4) {
            if (q4 == 10 || q4 == 13) continue;
            m4 = hash_round(m3, q4);
            d4 = hash_round(d3, q4);
            c4 = hash_round(c3, q4);
            l4 = hash_round(l3, q4);
            x4 = hash_round(x3, q4);
            for (q5 = 1; q5 < 128; ++q5) {
                if (q5 == 10 || q5 == 13) continue;
                m5 = hash_round(m4, q5);
                d5 = hash_round(d4, q5);
                c5 = hash_round(c4, q5);
                l5 = hash_round(l4, q5);
                x5 = hash_round(x4, q5);
                for (q6 = 1; q6 < 128; ++q6) {
                    if (q6 == 10 || q6 == 13) continue;
                    m6 = hash_round(m5, q6);
                    d6 = hash_round(d5, q6);
                    c6 = hash_round(c5, q6);
                    l6 = hash_round(l5, q6);
                    x6 = hash_round(x5, q6);
                    for (q7 = 1; q7 < 128; ++q7) {
                        if (q7 == 10 || q7 == 13) continue;
                        m7 = hash_round(m6, q7);
                        d7 = hash_round(d6, q7);
                        c7 = hash_round(c6, q7);
                        l7 = hash_round(l6, q7);
                        x7 = hash_round(x6, q7);
                        for (q8 = 1; q8 < 128; ++q8) {
                            if (q8 == 10 || q8 == 13) continue;
                            /* The table supplies the final byte for this m state. */
                            q9 = bytevecM[(unsigned int)(m7 ^ q8)];
                            if (q9 == 0) continue;
                            /* Filters are evaluated in the order d, c, l, x and stop at the first miss. */
                            if (hash_finish_mod1001(d7, q8, q9) != 500) continue;
                            if (hash_finish_mod1001(c7, q8, q9) != 100) continue;
                            if (hash_finish_mod1001(l7, q8, q9) != 50) continue;
                            if (hash_finish_mod1001(x7, q8, q9) != 10) continue;
                            ps[iret].m128i_i16[0] = (short)q3;
                            ps[iret].m128i_i16[1] = (short)q4;
                            ps[iret].m128i_i16[2] = (short)q5;
                            ps[iret].m128i_i16[3] = (short)q6;
                            ps[iret].m128i_i16[4] = (short)q7;
                            ps[iret].m128i_i16[5] = (short)q8;
                            ps[iret].m128i_i16[6] = (short)q9;
                            ++iret;
                        }
                    }
                }
            }
        }
    }
    return iret;
}
##
##
// Fragment: prefetch the part of bytevecM that the q8 loop is about to read.
// Masking with 0xffffff80 clears the low 7 bits of m7, giving the start of a
// 128-byte-aligned span of table indices; the q8 loop reads bytevecM[m7 ^ q8]
// with q8 < 128, so all of its reads land inside that span.
// The span is requested as two 64-byte halves (offset 0 and offset 64) -
// presumably one prefetch per cache line; confirm the target's line size.
_mm_prefetch(&bytevecM[(unsigned int)m7 & 0xffffff80], _MM_HINT_T0);
_mm_prefetch(&bytevecM[64+((unsigned int)m7 & 0xffffff80)], _MM_HINT_T0);
##
##
/*
 * Enable (bEnable != FALSE) or disable the SE_LOCK_MEMORY_NAME privilege in
 * the access token of hProcess.
 *
 * Returns TRUE only if AdjustTokenPrivileges succeeded AND GetLastError()
 * afterwards is ERROR_SUCCESS; diagnostics are printed to stdout otherwise.
 * The token handle opened here is closed on every path.
 */
BOOL MySetLockPagesPrivilege(HANDLE hProcess, BOOL bEnable)
{
    /* Same shape as TOKEN_PRIVILEGES with exactly one entry, so it can be
       passed to AdjustTokenPrivileges through a cast. */
    struct {
        DWORD Count;
        LUID_AND_ATTRIBUTES Privilege[1];
    } Info;
    HANDLE Token;
    DWORD dwErr;
    BOOL bResult = FALSE;

    if ( !OpenProcessToken(hProcess, TOKEN_ADJUST_PRIVILEGES, &Token) ) {
        printf("Cannot open process token.\n");
        return FALSE;   /* nothing was opened, nothing to close */
    }

    Info.Count = 1;
    Info.Privilege[0].Attributes = bEnable ? SE_PRIVILEGE_ENABLED : 0;

    if ( !LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &(Info.Privilege[0].Luid)) ) {
        printf("Cannot get privilege for %s.\n", SE_LOCK_MEMORY_NAME);
        goto cleanup;
    }

    if ( !AdjustTokenPrivileges(Token, FALSE, (PTOKEN_PRIVILEGES)&Info, 0, NULL, NULL) ) {
        /* DWORD is unsigned long; cast so the argument matches %u. */
        printf ("Cannot adjust token privileges (%u)\n", (unsigned int)GetLastError());
        goto cleanup;
    }

    // ERROR_NOT_ALL_ASSIGNED
    // 1300 (0x514)
    // Not all privileges or groups referenced are assigned to the caller.
    // AdjustTokenPrivileges can return success and still report this through
    // GetLastError(), so the last error is read before any other API call.
    dwErr = GetLastError();
    printf("AdjustTokenPrivileges dwErr=%u\n", (unsigned int)dwErr);
    if (dwErr != ERROR_SUCCESS) {
        printf("Cannot enable the SE_LOCK_MEMORY_NAME privilege");
        goto cleanup;
    }
    bResult = TRUE;

cleanup:
    /* Previously the failure paths returned without closing the token. */
    CloseHandle(Token);
    return bResult;
}
void LargeMemInit()
{
if ( !MySetLockPagesPrivilege(GetCurrentProcess(), TRUE) ) {
fprintf(stderr, "MySetLockPagesPrivilege failed.\n");
exit(1);
}
}
/*
 * Reserve and commit s bytes of read/write memory with MEM_LARGE_PAGES,
 * letting the system choose the address.
 *
 * Never returns NULL: if VirtualAlloc fails, the error code is written to
 * stderr and the process exits with status 1. On success the address is
 * printed to stdout and returned.
 *
 * Failure codes noted by the original author:
 *   1314 = ERROR_PRIVILEGE_NOT_HELD
 *   1450 = ERROR_NO_SYSTEM_RESOURCES (to fix, reboot)
 */
static void* my_big_malloc(size_t s)
{
    void* pBlock = VirtualAlloc(NULL, s,
                                MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES,
                                PAGE_READWRITE);

    if (pBlock == NULL) {
        /* Read the error code before any other call can overwrite it. */
        DWORD dwFailure = GetLastError();
        fprintf(stderr, "VirtualAlloc failed: dwError=%d\n", (int)dwFailure);
        exit(1);
    }

    printf("VirtualAlloc ok p=%p\n", pBlock);
    return pBlock;
}
##
##
#include
#include
#define H_PRIME 1000003
#define HASH_LEN 11
// my_m128_t mimics Intel __m128i type
typedef struct my_m128_t my_m128_t;
struct my_m128_t { short m128i_i16[8]; };

/*
 * Scalar hash round: (h ^ q) * H_PRIME with 32-bit wraparound.
 * The multiply is done in unsigned arithmetic because overflowing a signed
 * int multiply is undefined behavior; converting back to int is
 * implementation-defined and keeps the bit pattern on two's-complement
 * compilers.
 */
static int mix32(int h, int q)
{
    return (int)((unsigned int)(h ^ q) * (unsigned int)H_PRIME);
}

/*
 * Final round for one hash lane: mixes in q8, then HASH_LEN and q9, and
 * reduces the signed result to 0..1000.
 */
static int tail_mod1001(int h7, int q8, int q9)
{
    int v = mix32(h7, q8) ^ HASH_LEN ^ q9;
    v %= 1001;
    if (v < 0) v += 1001;
    return v;
}

/*
 * SSE variant of the brute-force search over q3..q8 (plus a table-supplied q9).
 *
 * The d, c, l and x hash states travel together in one __m128i (32-bit lanes
 * 0, 1, 2, 3 respectively) and are advanced with one XOR plus one
 * _mm_mullo_epi32 per loop level. The m state stays scalar because it is
 * used to index bytevecM.
 *
 * bytevecM   lookup table giving q9 for index (unsigned int)(m7 ^ q8); a zero
 *            entry means "no candidate". The index is a full 32-bit value.
 * startval,
 * endval     half-open range [startval, endval) of q3 values to try.
 * m2..x2     the five hash states before q3 is mixed in.
 * ps         output array; each hit stores q3..q9 in m128i_i16[0..6].
 *            No capacity is passed in, so the caller must size it.
 *
 * q4..q8 each run over 1..127. The values 10 and 13 are skipped for every
 * one of q3..q8. A hit requires the finished d, c, l and x hashes to reduce
 * (mod 1001) to 500, 100, 50 and 10.
 *
 * Returns the number of hits written to ps.
 *
 * NOTE(review): lane access uses the `m128i_i32` member of __m128i, which is
 * specific to Microsoft's headers - confirm before building elsewhere.
 */
int inner(
    const unsigned char* bytevecM,
    int startval, int endval,
    int m2, int d2, int c2, int l2, int x2,
    my_m128_t* ps)
{
    /* _mm_set_epi32 takes the highest lane first, so lane 0 = d2 ... lane 3 = x2. */
    __m128i s2 = _mm_set_epi32(x2, l2, c2, d2);
    __m128i hp = _mm_set1_epi32(H_PRIME);
    __m128i s3, s4, s5, s6, s7;
    int m3, m4, m5, m6, m7;
    int d7, c7, l7, x7;
    int q3, q4, q5, q6, q7, q8, q9;
    int iret = 0;

    for (q3 = startval; q3 < endval; ++q3) {
        if (q3 == 10 || q3 == 13) continue;
        m3 = mix32(m2, q3);
        s3 = _mm_mullo_epi32(_mm_xor_si128(s2, _mm_set1_epi32(q3)), hp);
        for (q4 = 1; q4 < 128; ++q4) {
            if (q4 == 10 || q4 == 13) continue;
            m4 = mix32(m3, q4);
            s4 = _mm_mullo_epi32(_mm_xor_si128(s3, _mm_set1_epi32(q4)), hp);
            for (q5 = 1; q5 < 128; ++q5) {
                if (q5 == 10 || q5 == 13) continue;
                m5 = mix32(m4, q5);
                s5 = _mm_mullo_epi32(_mm_xor_si128(s4, _mm_set1_epi32(q5)), hp);
                for (q6 = 1; q6 < 128; ++q6) {
                    if (q6 == 10 || q6 == 13) continue;
                    m6 = mix32(m5, q6);
                    s6 = _mm_mullo_epi32(_mm_xor_si128(s5, _mm_set1_epi32(q6)), hp);
                    for (q7 = 1; q7 < 128; ++q7) {
                        if (q7 == 10 || q7 == 13) continue;
                        m7 = mix32(m6, q7);
                        s7 = _mm_mullo_epi32(_mm_xor_si128(s6, _mm_set1_epi32(q7)), hp);
                        /* Request the 128-byte-aligned span of the table that the
                           q8 loop below indexes (m7 ^ q8 only changes the low 7 bits). */
                        _mm_prefetch(&bytevecM[(unsigned int)m7 & 0xffffff80], _MM_HINT_T0);
                        _mm_prefetch(&bytevecM[64+((unsigned int)m7 & 0xffffff80)], _MM_HINT_T0);
                        /* The lanes do not change inside the q8 loop; read them once. */
                        d7 = s7.m128i_i32[0];
                        c7 = s7.m128i_i32[1];
                        l7 = s7.m128i_i32[2];
                        x7 = s7.m128i_i32[3];
                        for (q8 = 1; q8 < 128; ++q8) {
                            if (q8 == 10 || q8 == 13) continue;
                            q9 = bytevecM[(unsigned int)(m7 ^ q8)];
                            if (q9 == 0) continue;
                            /* Filters run in the order d, c, l, x and stop at the first miss. */
                            if (tail_mod1001(d7, q8, q9) != 500) continue;
                            if (tail_mod1001(c7, q8, q9) != 100) continue;
                            if (tail_mod1001(l7, q8, q9) != 50) continue;
                            if (tail_mod1001(x7, q8, q9) != 10) continue;
                            ps[iret].m128i_i16[0] = (short)q3;
                            ps[iret].m128i_i16[1] = (short)q4;
                            ps[iret].m128i_i16[2] = (short)q5;
                            ps[iret].m128i_i16[3] = (short)q6;
                            ps[iret].m128i_i16[4] = (short)q7;
                            ps[iret].m128i_i16[5] = (short)q8;
                            ps[iret].m128i_i16[6] = (short)q9;
                            ++iret;
                        }
                    }
                }
            }
        }
    }
    return iret;
}
##
##
/*
 * UNROLL(qx): body of the innermost search loop for one value of q8 (= qx).
 *
 * Expands in place and relies on these names being in scope at the call
 * site: bytevecM, m7, s7 (with an m128i_i32[4] member holding the d, c, l, x
 * states), q3..q7, q9, d9, c9, l9, x9, ps, iret, H_PRIME and HASH_LEN.
 *
 * Steps: fetch q9 from bytevecM[m7 ^ qx]; if it is non-zero, finish the d, c,
 * l and x hashes in turn and require them to reduce (mod 1001) to 500, 100,
 * 50 and 10; on a full match store q3..q7, qx, q9 into ps[iret] and bump iret.
 *
 * Notes:
 *  - qx is evaluated several times; pass a constant or a plain variable.
 *  - The expansion is a pair of complete statements, not a
 *    do { } while (0) block, because it is invoked as `UNROLL(1)` with no
 *    trailing semicolon.
 *  - The multiply is done in unsigned arithmetic: the product routinely
 *    exceeds INT_MAX and signed overflow is undefined behavior. Converting
 *    back to int keeps the bit pattern on two's-complement compilers.
 *  - Only block comments may be used inside the macro body; a // comment
 *    would swallow the continued lines that follow it.
 */
#define UNROLL(qx) \
    q9 = bytevecM[(unsigned int)(m7 ^ (qx))]; \
    if (q9 != 0) { \
        d9 = (int)((unsigned int)(s7.m128i_i32[0] ^ (qx)) * (unsigned int)H_PRIME) ^ HASH_LEN ^ q9; \
        d9 %= 1001; if (d9 < 0) d9 += 1001; \
        if (d9 == 500) { \
            c9 = (int)((unsigned int)(s7.m128i_i32[1] ^ (qx)) * (unsigned int)H_PRIME) ^ HASH_LEN ^ q9; \
            c9 %= 1001; if (c9 < 0) c9 += 1001; \
            if (c9 == 100) { \
                l9 = (int)((unsigned int)(s7.m128i_i32[2] ^ (qx)) * (unsigned int)H_PRIME) ^ HASH_LEN ^ q9; \
                l9 %= 1001; if (l9 < 0) l9 += 1001; \
                if (l9 == 50) { \
                    x9 = (int)((unsigned int)(s7.m128i_i32[3] ^ (qx)) * (unsigned int)H_PRIME) ^ HASH_LEN ^ q9; \
                    x9 %= 1001; if (x9 < 0) x9 += 1001; \
                    if (x9 == 10) { \
                        ps[iret].m128i_i16[0] = q3; \
                        ps[iret].m128i_i16[1] = q4; \
                        ps[iret].m128i_i16[2] = q5; \
                        ps[iret].m128i_i16[3] = q6; \
                        ps[iret].m128i_i16[4] = q7; \
                        ps[iret].m128i_i16[5] = (qx); \
                        ps[iret].m128i_i16[6] = q9; \
                        ++iret; \
                    } \
                } \
            } \
        } \
    }
##
##
// Fragment: the innermost (q8) loop of the SSE search, in its rolled form.
// Relies on m7, s7, q3..q7, bytevecM, ps and iret from the enclosing code (not shown here).
for (q8 = 1; q8 < 128; ++q8) {
// 10 and 13 are never tried.
if (q8 == 10 || q8 == 13) continue;
// The table supplies the final byte for this m state; a zero entry means no candidate.
q9 = bytevecM[(unsigned int)(m7 ^ q8)];
if (q9 == 0) continue;
// Finish each hash as ((lane ^ q8) * H_PRIME) ^ HASH_LEN ^ q9, reduce it to 0..1000,
// and bail out at the first lane that misses its target.
// Lane 0 (d) must reduce to 500.
d9 = (s7.m128i_i32[0] ^ q8) * H_PRIME ^ HASH_LEN ^ q9;
d9 %= 1001; if (d9 < 0) d9 += 1001;
if (d9 != 500) continue;
// Lane 1 (c) must reduce to 100.
c9 = (s7.m128i_i32[1] ^ q8) * H_PRIME ^ HASH_LEN ^ q9;
c9 %= 1001; if (c9 < 0) c9 += 1001;
if (c9 != 100) continue;
// Lane 2 (l) must reduce to 50.
l9 = (s7.m128i_i32[2] ^ q8) * H_PRIME ^ HASH_LEN ^ q9;
l9 %= 1001; if (l9 < 0) l9 += 1001;
if (l9 != 50) continue;
// Lane 3 (x) must reduce to 10.
x9 = (s7.m128i_i32[3] ^ q8) * H_PRIME ^ HASH_LEN ^ q9;
x9 %= 1001; if (x9 < 0) x9 += 1001;
if (x9 != 10) continue;
// All four filters passed: record q3..q9 in the next output slot.
ps[iret].m128i_i16[0] = q3;
ps[iret].m128i_i16[1] = q4;
ps[iret].m128i_i16[2] = q5;
ps[iret].m128i_i16[3] = q6;
ps[iret].m128i_i16[4] = q7;
ps[iret].m128i_i16[5] = q8;
ps[iret].m128i_i16[6] = q9;
++iret;
}
##
##
// Fragment: q8 handling with the first values written out by hand.
// UNROLL is invoked without a trailing semicolon, so its expansion must be
// complete statements on its own.
// q8 = 1..9, 11 and 12 are expanded explicitly; 10 and 13 are simply left out,
// which replaces the `q8 == 10 || q8 == 13` test of the rolled loop.
UNROLL(1)
UNROLL(2)
UNROLL(3)
UNROLL(4)
UNROLL(5)
UNROLL(6)
UNROLL(7)
UNROLL(8)
UNROLL(9)
UNROLL(11)
UNROLL(12)
// From 14 upward no value is excluded, so a plain loop with no skip test covers the rest.
for (q8 = 14; q8 < 128; ++q8) {
UNROLL(q8)
}
##
##
// Fragment: the q8 loop written out as individual UNROLL invocations up to 127.
// The `...` line below is an elision in this excerpt, not C code.
// NOTE(review): the elided part cannot be seen here - presumably 10 and 13 are
// omitted as in the rolled loop; confirm against the full listing.
UNROLL(1)
UNROLL(2)
UNROLL(3)
...
UNROLL(127)