Issue
For my open source project cachegrand we are implementing AARCH64 support and although most of the port is completed we are sorting out a feature to perform an accelerated array search using NEON instructions.
The logic we use is pretty simple:
- in input there is an array of 14 uint32 elements, the value to find and a mask to ignore certain matches
- the code has to find any value that matches a specific uint32
- build a bitmask
- the least significant bits of the bitmask match the begin of the array
- the bitmask is then & with the skip indices mask
- and then the trailing zeros are counted to determine the index of the first occurance
It's a very rare occurance that the skip indices mask is actually used, I would say that 99.9% of the cases will be zero.
I have come up with the following implementation, but I have no experience with ARMv8 NEON instruction and feels a bit clunky, especially so I was wondering if there is a way to make it faster and/or better.
For reference, currently the code is compiled only with GCC.
uint8_t hashtable_mcmp_support_hash_search_armv8a_neon_14(
uint32_t hash,
volatile uint32_t* hashes,
uint32_t skip_indexes_mask) {
uint32x4_t tmp;
uint32_t compacted_result_mask = 0;
uint32_t skip_indexes_mask_inv = ~skip_indexes_mask;
static const int32x4_t shift = {0, 1, 2, 3};
uint32x4_t cmp_vector = vdupq_n_u32(hash);
uint32x4_t ring_vector_0_3 = vld1q_u32((hashtable_hash_half_t*)hashes + 0);
uint32x4_t cmp_vector_0_3 = vceqq_u32(ring_vector_0_3, cmp_vector);
tmp = vshrq_n_u32(cmp_vector_0_3, 31);
compacted_result_mask |= vaddvq_u32(vshlq_u32(tmp, shift)) << 0;
uint32x4_t ring_vector_4_7 = vld1q_u32((hashtable_hash_half_t*)hashes + 4);
uint32x4_t cmp_vector_4_7 = vceqq_u32(ring_vector_4_7, cmp_vector);
tmp = vshrq_n_u32(cmp_vector_4_7, 31);
compacted_result_mask |= vaddvq_u32(vshlq_u32(tmp, shift)) << 4;
uint32x4_t ring_vector_8_11 = vld1q_u32((hashtable_hash_half_t*)hashes + 8);
uint32x4_t cmp_vector_8_11 = vceqq_u32(ring_vector_8_11, cmp_vector);
tmp = vshrq_n_u32(cmp_vector_8_11, 31);
compacted_result_mask |= vaddvq_u32(vshlq_u32(tmp, shift)) << 8;
uint32x4_t ring_vector_10_13 = vld1q_u32((hashtable_hash_half_t*)hashes + 10);
uint32x4_t cmp_vector_10_13 = vceqq_u32(ring_vector_10_13, cmp_vector);
tmp = vshrq_n_u32(cmp_vector_10_13, 31);
compacted_result_mask |= vaddvq_u32(vshlq_u32(tmp, shift)) << 10;
return __builtin_ctz(compacted_result_mask & skip_indexes_mask_inv);
}
Just for reference, here the AVX2 code
static inline uint8_t hashtable_mcmp_support_hash_search_avx2_14(
uint32_t hash,
volatile uint32_t* hashes,
uint32_t skip_indexes_mask) {
uint32_t compacted_result_mask = 0;
uint32_t skip_indexes_mask_inv = ~skip_indexes_mask;
__m256i cmp_vector = _mm256_set1_epi32(hash);
// The second load, load from the 6th uint32 to the 14th uint32, _mm256_loadu_si256 always loads 8 x uint32
for(uint8_t base_index = 0; base_index < 12; base_index += 6) {
__m256i ring_vector = _mm256_loadu_si256((__m256i*) (hashes + base_index));
__m256i result_mask_vector = _mm256_cmpeq_epi32(ring_vector, cmp_vector);
// Uses _mm256_movemask_ps to reduce the bandwidth
compacted_result_mask |= (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(result_mask_vector)) << (base_index);
}
return _tzcnt_u32(compacted_result_mask & skip_indexes_mask_inv);
}
On a side question, do you think it's worth to implement support for SVE2 instructions? Especially taking into account that this is a pretty simple operation and looks like there might not be mandatory support for 256 bits registers (which probably would be the biggest benefit of using SVE2 in this specific context)
Solution
Booleans don't need 32 bits each: shrink them to 8 bits ASAP by vuzp1
and vomovn
prior to doing further operations.
uint8_t hashtable_mcmp_support_hash_search_armv8a_neon_14(
uint32_t hash,
volatile uint32_t* hashes,
uint32_t skip_indexes_mask)
{
uint16x8_t tmp16a, tmp16b;
uint8x8_t tmp8a, tmp8b;
uint32_t tmp;
static const uint8x8_t mask = {1, 2, 4, 8, 16, 32, 64, 128};
uint32x4_t cmp_vector = vdupq_n_u32(hash);
uint32x4x3_t ring_vector_0_11 = vld1q_u32_x3((uint32_t *)hashes);
uint32x4_t ring_vector_10_13 = vld1q_u32((uint32_t *)hashes+10);
ring_vector_0_11.val[0] = vceqq_u32(ring_vector_0_11.val[0], cmp_vector);
ring_vector_0_11.val[1] = vceqq_u32(ring_vector_0_11.val[1], cmp_vector);
ring_vector_0_11.val[2] = vceqq_u32(ring_vector_0_11.val[2], cmp_vector);
ring_vector_10_13 = vceqq_u32(ring_vector_10_13, cmp_vector);
tmp16a = vuzp1q_u16(ring_vector_0_11.val[0], ring_vector_0_11.val[1]);
tmp16b = vuzp1q_u16(ring_vector_0_11.val[2], ring_vector_10_13);
tmp8a = vmovn_u16(tmp16a);
tmp8b = vmovn_u16(tmp16b);
tmp8a = vand_u8(tmp8a, mask);
tmp8b = vand_u8(tmp8b, mask);
tmp = (uint32_t)vaddv_u8(tmp8a) | (uint32_t)(vaddv_u8(tmp8b)<<8);
return __builtin_ctz(tmp &~ skip_indexes_mask);
}
And I don't think sve
will bring a meaningful performance boost since the performance is more or less crippled at the end (vaddv
and especially the transfer to arm registers)
If you are dealing with thousands of 14 entry arrays, you should consider redesigning your function to writing into an 8bit array instead of returning in arm register each and every time. That will eliminate the most time consuming pipeline hazard caused by the Neon to arm transfer.
#include <arm_neon.h>
#include <arm_acle.h>
void hashtable_mcmp_support_hash_search_armv8a_neon_14_b(
uint8_t *pDst,
uint32_t hash,
volatile uint32_t* hashes,
uint32_t skip_indexes_mask, uint32_t number_of_arrays)
{
uint16x8_t tmp16a, tmp16b;
uint16x4_t tmp;
uint8x8_t tmp8a, tmp8b;
static const uint8x8_t mask = {128, 64, 32, 16, 8, 4, 2, 1};
uint32x4_t cmp_vector = vdupq_n_u32(hash);
skip_indexes_mask = __rbit(skip_indexes_mask)>>16;
uint16x4_t index_mask = vdup_n_u16((uint16_t) skip_indexes_mask);
uint32x4x4_t ring_vector;
while (number_of_arrays--)
{
ring_vector = vld1q_u32_x4((uint32_t *)hashes);
hashes += 16;
ring_vector.val[0] = vceqq_u32(ring_vector.val[0], cmp_vector);
ring_vector.val[1] = vceqq_u32(ring_vector.val[1], cmp_vector);
ring_vector.val[2] = vceqq_u32(ring_vector.val[2], cmp_vector);
ring_vector.val[3] = vceqq_u32(ring_vector.val[3], cmp_vector);
tmp16a = vuzp1q_u16(vreinterpretq_u16_u32(ring_vector.val[0]), vreinterpretq_u16_u32(ring_vector.val[1]));
tmp16b = vuzp1q_u16(vreinterpretq_u16_u32(ring_vector.val[2]), vreinterpretq_u16_u32(ring_vector.val[3]));
tmp8a = vmovn_u16(tmp16a);
tmp8b = vmovn_u16(tmp16b);
tmp8a = vand_u8(tmp8a, mask);
tmp8b = vand_u8(tmp8b, mask);
tmp8a[1] = vaddv_u8(tmp8a);
tmp8a[0] = vaddv_u8(tmp8b);
tmp = vbic_u16(vreinterpret_u16_u8(tmp8a), index_mask);
tmp = vclz_u16(tmp);
vst1_lane_u8(pDst++,vreinterpret_u8_u16(tmp), 0);
}
}
Above is an "improved" version
- It assumes the arrays to be in contiguous memory with 8 bytes padding which is perferrable for the cache efficiency unless the memory requirement is a problem.
- Instead of returning an 8bit result, it writes the results into memory directly, avoiding pipeline hazards caused by neon to arm transfer.
- It still suffers from
vaddv
latency(8 cycles). You can unroll the loop so that it processes 2 or even 4 arrays per iteration in order to hide that latency.
Answered By - Jake 'Alquimista' LEE Answer Checked By - Pedro (WPSolving Volunteer)