From b3f6a2415e525b68c7893d36cb7d224e02646872 Mon Sep 17 00:00:00 2001 From: Andrii Mytroshyn Date: Wed, 1 Jan 2025 22:20:39 +0200 Subject: [PATCH] Optimize while loop to use 4 elements instead of 7 for calculation, improving performance on older CPUs --- src/bitops.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/src/bitops.c b/src/bitops.c index 44312c6af..0caaea881 100644 --- a/src/bitops.c +++ b/src/bitops.c @@ -59,41 +59,30 @@ long long redisPopcount(void *s, long count) { goto remain; } - /* Count bits 28 bytes at a time */ + /* Count bits 16 bytes at a time */ p4 = (uint32_t*)p; - while(count>=28) { - uint32_t aux1, aux2, aux3, aux4, aux5, aux6, aux7; + while(count>=16) { + uint32_t aux1, aux2, aux3, aux4; aux1 = *p4++; aux2 = *p4++; aux3 = *p4++; aux4 = *p4++; - aux5 = *p4++; - aux6 = *p4++; - aux7 = *p4++; - count -= 28; + count -= 16; aux1 = aux1 - ((aux1 >> 1) & 0x55555555); aux1 = (aux1 & 0x33333333) + ((aux1 >> 2) & 0x33333333); + aux1 = (aux1 + (aux1 >> 4)) & 0x0F0F0F0F; aux2 = aux2 - ((aux2 >> 1) & 0x55555555); aux2 = (aux2 & 0x33333333) + ((aux2 >> 2) & 0x33333333); + aux2 = (aux2 + (aux2 >> 4)) & 0x0F0F0F0F; aux3 = aux3 - ((aux3 >> 1) & 0x55555555); aux3 = (aux3 & 0x33333333) + ((aux3 >> 2) & 0x33333333); + aux3 = (aux3 + (aux3 >> 4)) & 0x0F0F0F0F; aux4 = aux4 - ((aux4 >> 1) & 0x55555555); aux4 = (aux4 & 0x33333333) + ((aux4 >> 2) & 0x33333333); - aux5 = aux5 - ((aux5 >> 1) & 0x55555555); - aux5 = (aux5 & 0x33333333) + ((aux5 >> 2) & 0x33333333); - aux6 = aux6 - ((aux6 >> 1) & 0x55555555); - aux6 = (aux6 & 0x33333333) + ((aux6 >> 2) & 0x33333333); - aux7 = aux7 - ((aux7 >> 1) & 0x55555555); - aux7 = (aux7 & 0x33333333) + ((aux7 >> 2) & 0x33333333); - bits += ((((aux1 + (aux1 >> 4)) & 0x0F0F0F0F) + - ((aux2 + (aux2 >> 4)) & 0x0F0F0F0F) + - ((aux3 + (aux3 >> 4)) & 0x0F0F0F0F) + - ((aux4 + (aux4 >> 4)) & 0x0F0F0F0F) + - ((aux5 + (aux5 >> 4)) & 0x0F0F0F0F) + - ((aux6 + (aux6 >> 4)) & 0x0F0F0F0F) + - ((aux7 + (aux7 >> 4)) & 0x0F0F0F0F))* 0x01010101) >> 24; + aux4 = (aux4 + (aux4 >> 4)) & 0x0F0F0F0F; + bits += ((aux1 + aux2 + aux3 + aux4)* 0x01010101) >> 24; } p = (unsigned char*)p4;