You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							799 lines
						
					
					
						
							25 KiB
						
					
					
				
			
		
		
		
			
			
			
				
					
				
				
					
				
			
		
		
	
	
							799 lines
						
					
					
						
							25 KiB
						
					
					
				
								/*
							 | 
						|
								 * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit
							 | 
						|
								 * population count) in an array as quickly as possible using
							 | 
						|
								 * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON.
							 | 
						|
								 *
							 | 
						|
								 * Copyright (c) 2016 - 2024, Kim Walisch
							 | 
						|
								 * Copyright (c) 2016 - 2018, Wojciech Muła
							 | 
						|
								 *
							 | 
						|
								 * All rights reserved.
							 | 
						|
								 *
							 | 
						|
								 * Redistribution and use in source and binary forms, with or without
							 | 
						|
								 * modification, are permitted provided that the following conditions are met:
							 | 
						|
								 *
							 | 
						|
								 * 1. Redistributions of source code must retain the above copyright notice, this
							 | 
						|
								 *    list of conditions and the following disclaimer.
							 | 
						|
								 * 2. Redistributions in binary form must reproduce the above copyright notice,
							 | 
						|
								 *    this list of conditions and the following disclaimer in the documentation
							 | 
						|
								 *    and/or other materials provided with the distribution.
							 | 
						|
								 *
							 | 
						|
								 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
							 | 
						|
								 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
							 | 
						|
								 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
							 | 
						|
								 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
							 | 
						|
								 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
							 | 
						|
								 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
							 | 
						|
								 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
							 | 
						|
								 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
							 | 
						|
								 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
							 | 
						|
								 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
							 | 
						|
								 */
							 | 
						|
								
							 | 
						|
								#ifndef LIBPOPCNT_H
							 | 
						|
								#define LIBPOPCNT_H
							 | 
						|
								
							 | 
						|
								#include <stdint.h>
							 | 
						|
								
							 | 
						|
								#ifndef __has_builtin
							 | 
						|
								#define __has_builtin(x) 0
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								#ifndef __has_attribute
							 | 
						|
								#define __has_attribute(x) 0
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								#ifndef __has_include
							 | 
						|
								#define __has_include(x) 0
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								#ifdef __GNUC__
							 | 
						|
								#define LIBPOPCNT_GNUC_PREREQ(x, y) (__GNUC__ > x || (__GNUC__ == x && __GNUC_MINOR__ >= y))
							 | 
						|
								#else
							 | 
						|
								#define LIBPOPCNT_GNUC_PREREQ(x, y) 0
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								#ifdef __clang__
							 | 
						|
								#define LIBPOPCNT_CLANG_PREREQ(x, y) (__clang_major__ > x || (__clang_major__ == x && __clang_minor__ >= y))
							 | 
						|
								#else
							 | 
						|
								#define LIBPOPCNT_CLANG_PREREQ(x, y) 0
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								#if (_MSC_VER < 1900) && !defined(__cplusplus)
							 | 
						|
								#define inline __inline
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								#if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64))
							 | 
						|
								#define LIBPOPCNT_X86_OR_X64
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								#if LIBPOPCNT_GNUC_PREREQ(4, 2) || __has_builtin(__builtin_popcount)
							 | 
						|
								#define LIBPOPCNT_HAVE_BUILTIN_POPCOUNT
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								#if LIBPOPCNT_GNUC_PREREQ(4, 2) || LIBPOPCNT_CLANG_PREREQ(3, 0)
							 | 
						|
								#define LIBPOPCNT_HAVE_ASM_POPCNT
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								#if defined(LIBPOPCNT_X86_OR_X64) && (defined(LIBPOPCNT_HAVE_ASM_POPCNT) || defined(_MSC_VER))
							 | 
						|
								#define LIBPOPCNT_HAVE_POPCNT
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								/* GCC compiler */
							 | 
						|
								#if defined(LIBPOPCNT_X86_OR_X64) && LIBPOPCNT_GNUC_PREREQ(5, 0)
							 | 
						|
								#define LIBPOPCNT_HAVE_AVX2
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								/* GCC compiler */
							 | 
						|
								#if defined(LIBPOPCNT_X86_OR_X64) && LIBPOPCNT_GNUC_PREREQ(11, 0)
							 | 
						|
								#define LIBPOPCNT_HAVE_AVX512
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								/* Clang (Unix-like OSes) */
							 | 
						|
								#if defined(LIBPOPCNT_X86_OR_X64) && !defined(_MSC_VER)
							 | 
						|
								#if LIBPOPCNT_CLANG_PREREQ(3, 8) && __has_attribute(target) \
							 | 
						|
								    && (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000)
							 | 
						|
								#define LIBPOPCNT_HAVE_AVX2
							 | 
						|
								#endif
							 | 
						|
								#if LIBPOPCNT_CLANG_PREREQ(9, 0) && __has_attribute(target) \
							 | 
						|
								    && (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000)
							 | 
						|
								#define LIBPOPCNT_HAVE_AVX512
							 | 
						|
								#endif
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								/* MSVC compatible compilers (Windows) */
							 | 
						|
								#if defined(LIBPOPCNT_X86_OR_X64) && defined(_MSC_VER)
							 | 
						|
								/*
							 | 
						|
								 * There is an LLVM/Clang bug on Windows where function targets
							 | 
						|
								 * for AVX2 and AVX512 fail to compile unless the user compiles
							 | 
						|
								 * using the options /arch:AVX2 and /arch:AVX512.
							 | 
						|
								 * All Clang versions <= 18.0 (from 2024) are affected by this bug.
							 | 
						|
								 * However, I expect this bug will be fixed in near future:
							 | 
						|
								 * https://github.com/llvm/llvm-project/issues/53520
							 | 
						|
								 */
							 | 
						|
								#if defined(__clang__)
							 | 
						|
								#if defined(__AVX2__)
							 | 
						|
								#define LIBPOPCNT_HAVE_AVX2
							 | 
						|
								#endif
							 | 
						|
								#if defined(__AVX512__)
							 | 
						|
								#define LIBPOPCNT_HAVE_AVX2
							 | 
						|
								#define LIBPOPCNT_HAVE_AVX512
							 | 
						|
								#endif
							 | 
						|
								/* MSVC 2017 or later does not require
							 | 
						|
								 * /arch:AVX2 or /arch:AVX512 */
							 | 
						|
								#elif _MSC_VER >= 1910
							 | 
						|
								#define LIBPOPCNT_HAVE_AVX2
							 | 
						|
								#define LIBPOPCNT_HAVE_AVX512
							 | 
						|
								#endif
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								/*
							 | 
						|
								 * Only enable CPUID runtime checks if this is really
							 | 
						|
								 * needed. E.g. do not enable if user has compiled
							 | 
						|
								 * using -march=native on a CPU that supports AVX512.
							 | 
						|
								 */
							 | 
						|
								#if defined(LIBPOPCNT_X86_OR_X64)                                                                                      \
							 | 
						|
								    && (defined(__cplusplus) || defined(_MSC_VER)                                                                      \
							 | 
						|
								        || (LIBPOPCNT_GNUC_PREREQ(4, 2) || __has_builtin(__sync_val_compare_and_swap)))                                \
							 | 
						|
								    && ((defined(LIBPOPCNT_HAVE_AVX512)                                                                                \
							 | 
						|
								         && !(defined(__AVX512__) || (defined(__AVX512F__) && defined(__AVX512BW__) && defined(__AVX512VPOPCNTDQ__)))) \
							 | 
						|
								        || (defined(LIBPOPCNT_HAVE_AVX2) && !defined(__AVX2__)) || (defined(LIBPOPCNT_HAVE_POPCNT) && !defined(__POPCNT__)))
							 | 
						|
								#define LIBPOPCNT_HAVE_CPUID
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								#ifdef __cplusplus
							 | 
						|
								extern "C" {
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								/*
							 | 
						|
								 * This uses fewer arithmetic operations than any other known
							 | 
						|
								 * implementation on machines with fast multiplication.
							 | 
						|
								 * It uses 12 arithmetic operations, one of which is a multiply.
							 | 
						|
								 * http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
							 | 
						|
								 */
							 | 
						|
								static inline uint64_t popcnt64_bitwise(uint64_t x)
							 | 
						|
								{
							 | 
						|
								    uint64_t m1  = 0x5555555555555555ull;
							 | 
						|
								    uint64_t m2  = 0x3333333333333333ull;
							 | 
						|
								    uint64_t m4  = 0x0F0F0F0F0F0F0F0Full;
							 | 
						|
								    uint64_t h01 = 0x0101010101010101ull;
							 | 
						|
								
							 | 
						|
								    x -= (x >> 1) & m1;
							 | 
						|
								    x  = (x & m2) + ((x >> 2) & m2);
							 | 
						|
								    x  = (x + (x >> 4)) & m4;
							 | 
						|
								
							 | 
						|
								    return (x * h01) >> 56;
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								#if defined(LIBPOPCNT_HAVE_ASM_POPCNT) && defined(__x86_64__)
							 | 
						|
								
							 | 
						|
								static inline uint64_t popcnt64(uint64_t x)
							 | 
						|
								{
							 | 
						|
								    __asm__("popcnt %1, %0" : "=r"(x) : "0"(x));
							 | 
						|
								    return x;
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								#elif defined(LIBPOPCNT_HAVE_ASM_POPCNT) && defined(__i386__)
							 | 
						|
								
							 | 
						|
								static inline uint32_t popcnt32(uint32_t x)
							 | 
						|
								{
							 | 
						|
								    __asm__("popcnt %1, %0" : "=r"(x) : "0"(x));
							 | 
						|
								    return x;
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								static inline uint64_t popcnt64(uint64_t x) { return popcnt32((uint32_t)x) + popcnt32((uint32_t)(x >> 32)); }
							 | 
						|
								
							 | 
						|
								#elif defined(_MSC_VER) && defined(_M_X64)
							 | 
						|
								
							 | 
						|
								#include <intrin.h>
							 | 
						|
								
							 | 
						|
								static inline uint64_t popcnt64(uint64_t x) { return __popcnt64(x); }
							 | 
						|
								
							 | 
						|
								#elif defined(_MSC_VER) && defined(_M_IX86)
							 | 
						|
								
							 | 
						|
								#include <intrin.h>
							 | 
						|
								
							 | 
						|
								static inline uint64_t popcnt64(uint64_t x) { return __popcnt((uint32_t)x) + __popcnt((uint32_t)(x >> 32)); }
							 | 
						|
								
							 | 
						|
								/* non x86 CPUs */
							 | 
						|
								#elif defined(LIBPOPCNT_HAVE_BUILTIN_POPCOUNT)
							 | 
						|
								
							 | 
						|
								static inline uint64_t popcnt64(uint64_t x) { return __builtin_popcountll(x); }
							 | 
						|
								
							 | 
						|
								/* no hardware POPCNT,
							 | 
						|
								 * use pure integer algorithm */
							 | 
						|
								#else
							 | 
						|
								
							 | 
						|
								static inline uint64_t popcnt64(uint64_t x) { return popcnt64_bitwise(x); }
							 | 
						|
								
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								#if defined(LIBPOPCNT_HAVE_CPUID)
							 | 
						|
								
							 | 
						|
								#if defined(_MSC_VER)
							 | 
						|
								#include <intrin.h>
							 | 
						|
								#include <immintrin.h>
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								/* CPUID bits documentation: */
							 | 
						|
								/* https://en.wikipedia.org/wiki/CPUID */
							 | 
						|
								
							 | 
						|
								/* %ebx bit flags */
							 | 
						|
								#define LIBPOPCNT_BIT_AVX2     (1 << 5)
							 | 
						|
								#define LIBPOPCNT_BIT_AVX512F  (1 << 16)
							 | 
						|
								#define LIBPOPCNT_BIT_AVX512BW (1 << 30)
							 | 
						|
								
							 | 
						|
								/* %ecx bit flags */
							 | 
						|
								#define LIBPOPCNT_BIT_AVX512_VPOPCNTDQ (1 << 14)
							 | 
						|
								#define LIBPOPCNT_BIT_POPCNT           (1 << 23)
							 | 
						|
								
							 | 
						|
								/* xgetbv bit flags */
							 | 
						|
								#define LIBPOPCNT_XSTATE_SSE (1 << 1)
							 | 
						|
								#define LIBPOPCNT_XSTATE_YMM (1 << 2)
							 | 
						|
								#define LIBPOPCNT_XSTATE_ZMM (7 << 5)
							 | 
						|
								
							 | 
						|
								static inline void run_cpuid(int eax, int ecx, int* abcd)
							 | 
						|
								{
							 | 
						|
								#if defined(_MSC_VER)
							 | 
						|
								    __cpuidex(abcd, eax, ecx);
							 | 
						|
								#else
							 | 
						|
								    int ebx = 0;
							 | 
						|
								    int edx = 0;
							 | 
						|
								
							 | 
						|
								#if defined(__i386__) && defined(__PIC__)
							 | 
						|
								    /* In case of PIC under 32-bit EBX cannot be clobbered */
							 | 
						|
								    __asm__ __volatile__("movl %%ebx, %%edi;"
							 | 
						|
								                         "cpuid;"
							 | 
						|
								                         "xchgl %%ebx, %%edi;"
							 | 
						|
								                         : "+a"(eax), "=D"(ebx), "+c"(ecx), "=d"(edx));
							 | 
						|
								#else
							 | 
						|
								    __asm__ __volatile__("cpuid" : "+a"(eax), "+b"(ebx), "+c"(ecx), "=d"(edx));
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								    abcd[0] = eax;
							 | 
						|
								    abcd[1] = ebx;
							 | 
						|
								    abcd[2] = ecx;
							 | 
						|
								    abcd[3] = edx;
							 | 
						|
								#endif
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								#if defined(LIBPOPCNT_HAVE_AVX2) || defined(LIBPOPCNT_HAVE_AVX512)
							 | 
						|
								
							 | 
						|
								static inline uint64_t get_xcr0(void)
							 | 
						|
								{
							 | 
						|
								#if defined(_MSC_VER)
							 | 
						|
								    return _xgetbv(0);
							 | 
						|
								#else
							 | 
						|
								    uint32_t eax;
							 | 
						|
								    uint32_t edx;
							 | 
						|
								
							 | 
						|
								    __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
							 | 
						|
								    return eax | (((uint64_t)edx) << 32);
							 | 
						|
								#endif
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								static inline int get_cpuid(void)
							 | 
						|
								{
							 | 
						|
								    int flags = 0;
							 | 
						|
								    int abcd[4];
							 | 
						|
								
							 | 
						|
								    run_cpuid(1, 0, abcd);
							 | 
						|
								
							 | 
						|
								    if ((abcd[2] & LIBPOPCNT_BIT_POPCNT) == LIBPOPCNT_BIT_POPCNT) flags |= LIBPOPCNT_BIT_POPCNT;
							 | 
						|
								
							 | 
						|
								#if defined(LIBPOPCNT_HAVE_AVX2) || defined(LIBPOPCNT_HAVE_AVX512)
							 | 
						|
								
							 | 
						|
								    int osxsave_mask = (1 << 27);
							 | 
						|
								
							 | 
						|
								    /* ensure OS supports extended processor state management */
							 | 
						|
								    if ((abcd[2] & osxsave_mask) != osxsave_mask) return 0;
							 | 
						|
								
							 | 
						|
								    uint64_t ymm_mask = LIBPOPCNT_XSTATE_SSE | LIBPOPCNT_XSTATE_YMM;
							 | 
						|
								    uint64_t zmm_mask = LIBPOPCNT_XSTATE_SSE | LIBPOPCNT_XSTATE_YMM | LIBPOPCNT_XSTATE_ZMM;
							 | 
						|
								    uint64_t xcr0     = get_xcr0();
							 | 
						|
								
							 | 
						|
								    if ((xcr0 & ymm_mask) == ymm_mask) {
							 | 
						|
								        run_cpuid(7, 0, abcd);
							 | 
						|
								
							 | 
						|
								        if ((abcd[1] & LIBPOPCNT_BIT_AVX2) == LIBPOPCNT_BIT_AVX2) flags |= LIBPOPCNT_BIT_AVX2;
							 | 
						|
								
							 | 
						|
								        if ((xcr0 & zmm_mask) == zmm_mask) {
							 | 
						|
								            /* If all AVX512 features required by our popcnt_avx512() are supported */
							 | 
						|
								            /* then we add LIBPOPCNT_BIT_AVX512_VPOPCNTDQ to our CPUID flags. */
							 | 
						|
								            if ((abcd[1] & LIBPOPCNT_BIT_AVX512F) == LIBPOPCNT_BIT_AVX512F
							 | 
						|
								                && (abcd[1] & LIBPOPCNT_BIT_AVX512BW) == LIBPOPCNT_BIT_AVX512BW
							 | 
						|
								                && (abcd[2] & LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) == LIBPOPCNT_BIT_AVX512_VPOPCNTDQ)
							 | 
						|
								                flags |= LIBPOPCNT_BIT_AVX512_VPOPCNTDQ;
							 | 
						|
								        }
							 | 
						|
								    }
							 | 
						|
								
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								    return flags;
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								#endif /* cpuid */
							 | 
						|
								
							 | 
						|
								#if defined(LIBPOPCNT_HAVE_AVX2) && __has_include(<immintrin.h>)
							 | 
						|
								
							 | 
						|
								#include <immintrin.h>
							 | 
						|
								
							 | 
						|
								#if __has_attribute(target)
							 | 
						|
								__attribute__((target("avx2")))
							 | 
						|
								#endif
							 | 
						|
								static inline void
							 | 
						|
								    CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c)
							 | 
						|
								{
							 | 
						|
								    __m256i u = _mm256_xor_si256(a, b);
							 | 
						|
								    *h        = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c));
							 | 
						|
								    *l        = _mm256_xor_si256(u, c);
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								#if __has_attribute(target)
							 | 
						|
								__attribute__((target("avx2")))
							 | 
						|
								#endif
							 | 
						|
								static inline __m256i
							 | 
						|
								    popcnt256(__m256i v)
							 | 
						|
								{
							 | 
						|
								    __m256i lookup1 =
							 | 
						|
								        _mm256_setr_epi8(4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8);
							 | 
						|
								
							 | 
						|
								    __m256i lookup2 =
							 | 
						|
								        _mm256_setr_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
							 | 
						|
								
							 | 
						|
								    __m256i low_mask = _mm256_set1_epi8(0x0f);
							 | 
						|
								    __m256i lo       = _mm256_and_si256(v, low_mask);
							 | 
						|
								    __m256i hi       = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);
							 | 
						|
								    __m256i popcnt1  = _mm256_shuffle_epi8(lookup1, lo);
							 | 
						|
								    __m256i popcnt2  = _mm256_shuffle_epi8(lookup2, hi);
							 | 
						|
								
							 | 
						|
								    return _mm256_sad_epu8(popcnt1, popcnt2);
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								/*
							 | 
						|
								 * AVX2 Harley-Seal popcount (4th iteration).
							 | 
						|
								 * The algorithm is based on the paper "Faster Population Counts
							 | 
						|
								 * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and
							 | 
						|
								 * Wojciech Mula (23 Nov 2016).
							 | 
						|
								 * @see https://arxiv.org/abs/1611.07612
							 | 
						|
								 */
							 | 
						|
								#if __has_attribute(target)
							 | 
						|
								__attribute__((target("avx2")))
							 | 
						|
								#endif
							 | 
						|
								static inline uint64_t
							 | 
						|
								    popcnt_avx2(const __m256i* ptr, uint64_t size)
							 | 
						|
								{
							 | 
						|
								    __m256i cnt      = _mm256_setzero_si256();
							 | 
						|
								    __m256i ones     = _mm256_setzero_si256();
							 | 
						|
								    __m256i twos     = _mm256_setzero_si256();
							 | 
						|
								    __m256i fours    = _mm256_setzero_si256();
							 | 
						|
								    __m256i eights   = _mm256_setzero_si256();
							 | 
						|
								    __m256i sixteens = _mm256_setzero_si256();
							 | 
						|
								    __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;
							 | 
						|
								
							 | 
						|
								    uint64_t  i     = 0;
							 | 
						|
								    uint64_t  limit = size - size % 16;
							 | 
						|
								    uint64_t* cnt64;
							 | 
						|
								
							 | 
						|
								    for (; i < limit; i += 16) {
							 | 
						|
								        CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 0), _mm256_loadu_si256(ptr + i + 1));
							 | 
						|
								        CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 2), _mm256_loadu_si256(ptr + i + 3));
							 | 
						|
								        CSA256(&foursA, &twos, twos, twosA, twosB);
							 | 
						|
								        CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 4), _mm256_loadu_si256(ptr + i + 5));
							 | 
						|
								        CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 6), _mm256_loadu_si256(ptr + i + 7));
							 | 
						|
								        CSA256(&foursB, &twos, twos, twosA, twosB);
							 | 
						|
								        CSA256(&eightsA, &fours, fours, foursA, foursB);
							 | 
						|
								        CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 8), _mm256_loadu_si256(ptr + i + 9));
							 | 
						|
								        CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 10), _mm256_loadu_si256(ptr + i + 11));
							 | 
						|
								        CSA256(&foursA, &twos, twos, twosA, twosB);
							 | 
						|
								        CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 12), _mm256_loadu_si256(ptr + i + 13));
							 | 
						|
								        CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 14), _mm256_loadu_si256(ptr + i + 15));
							 | 
						|
								        CSA256(&foursB, &twos, twos, twosA, twosB);
							 | 
						|
								        CSA256(&eightsB, &fours, fours, foursA, foursB);
							 | 
						|
								        CSA256(&sixteens, &eights, eights, eightsA, eightsB);
							 | 
						|
								
							 | 
						|
								        cnt = _mm256_add_epi64(cnt, popcnt256(sixteens));
							 | 
						|
								    }
							 | 
						|
								
							 | 
						|
								    cnt = _mm256_slli_epi64(cnt, 4);
							 | 
						|
								    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(eights), 3));
							 | 
						|
								    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(fours), 2));
							 | 
						|
								    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(twos), 1));
							 | 
						|
								    cnt = _mm256_add_epi64(cnt, popcnt256(ones));
							 | 
						|
								
							 | 
						|
								    for (; i < size; i++) cnt = _mm256_add_epi64(cnt, popcnt256(_mm256_loadu_si256(ptr + i)));
							 | 
						|
								
							 | 
						|
								    cnt64 = (uint64_t*)&cnt;
							 | 
						|
								
							 | 
						|
								    return cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3];
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								#if defined(LIBPOPCNT_HAVE_AVX512) && __has_include(<immintrin.h>)
							 | 
						|
								
							 | 
						|
								#include <immintrin.h>
							 | 
						|
								
							 | 
						|
								#if __has_attribute(target)
							 | 
						|
								__attribute__((target("avx512f,avx512bw,avx512vpopcntdq")))
							 | 
						|
								#endif
							 | 
						|
								static inline uint64_t
							 | 
						|
								    popcnt_avx512(const uint8_t* ptr8, uint64_t size)
							 | 
						|
								{
							 | 
						|
								    __m512i         cnt    = _mm512_setzero_si512();
							 | 
						|
								    const uint64_t* ptr64  = (const uint64_t*)ptr8;
							 | 
						|
								    uint64_t        size64 = size / sizeof(uint64_t);
							 | 
						|
								    uint64_t        i      = 0;
							 | 
						|
								
							 | 
						|
								    for (; i + 32 <= size64; i += 32) {
							 | 
						|
								        __m512i vec0 = _mm512_loadu_epi64(&ptr64[i + 0]);
							 | 
						|
								        __m512i vec1 = _mm512_loadu_epi64(&ptr64[i + 8]);
							 | 
						|
								        __m512i vec2 = _mm512_loadu_epi64(&ptr64[i + 16]);
							 | 
						|
								        __m512i vec3 = _mm512_loadu_epi64(&ptr64[i + 24]);
							 | 
						|
								
							 | 
						|
								        vec0 = _mm512_popcnt_epi64(vec0);
							 | 
						|
								        vec1 = _mm512_popcnt_epi64(vec1);
							 | 
						|
								        vec2 = _mm512_popcnt_epi64(vec2);
							 | 
						|
								        vec3 = _mm512_popcnt_epi64(vec3);
							 | 
						|
								
							 | 
						|
								        cnt = _mm512_add_epi64(cnt, vec0);
							 | 
						|
								        cnt = _mm512_add_epi64(cnt, vec1);
							 | 
						|
								        cnt = _mm512_add_epi64(cnt, vec2);
							 | 
						|
								        cnt = _mm512_add_epi64(cnt, vec3);
							 | 
						|
								    }
							 | 
						|
								
							 | 
						|
								    for (; i + 8 <= size64; i += 8) {
							 | 
						|
								        __m512i vec = _mm512_loadu_epi64(&ptr64[i]);
							 | 
						|
								        vec         = _mm512_popcnt_epi64(vec);
							 | 
						|
								        cnt         = _mm512_add_epi64(cnt, vec);
							 | 
						|
								    }
							 | 
						|
								
							 | 
						|
								    i *= sizeof(uint64_t);
							 | 
						|
								
							 | 
						|
								    /* Process last 63 bytes */
							 | 
						|
								    if (i < size) {
							 | 
						|
								        __mmask64 mask = (__mmask64)(0xffffffffffffffffull >> (i + 64 - size));
							 | 
						|
								        __m512i   vec  = _mm512_maskz_loadu_epi8(mask, &ptr8[i]);
							 | 
						|
								        vec            = _mm512_popcnt_epi64(vec);
							 | 
						|
								        cnt            = _mm512_add_epi64(cnt, vec);
							 | 
						|
								    }
							 | 
						|
								
							 | 
						|
								    return _mm512_reduce_add_epi64(cnt);
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								/* x86 CPUs */
							 | 
						|
								#if defined(LIBPOPCNT_X86_OR_X64)
							 | 
						|
								
							 | 
						|
								/*
							 | 
						|
								 * Count the number of 1 bits in the data array
							 | 
						|
								 * @data: An array
							 | 
						|
								 * @size: Size of data in bytes
							 | 
						|
								 */
							 | 
						|
								static uint64_t popcnt(const void* data, uint64_t size)
							 | 
						|
								{
							 | 
						|
								/*
							 | 
						|
								 * CPUID runtime checks are only enabled if this is needed.
							 | 
						|
								 * E.g. CPUID is disabled when a user compiles his
							 | 
						|
								 * code using -march=native on a CPU with AVX512.
							 | 
						|
								 */
							 | 
						|
								#if defined(LIBPOPCNT_HAVE_CPUID)
							 | 
						|
								#if defined(__cplusplus)
							 | 
						|
								    /* C++11 thread-safe singleton */
							 | 
						|
								    static const int cpuid = get_cpuid();
							 | 
						|
								#else
							 | 
						|
								    static int cpuid_ = -1;
							 | 
						|
								    int        cpuid  = cpuid_;
							 | 
						|
								    if (cpuid == -1) {
							 | 
						|
								        cpuid = get_cpuid();
							 | 
						|
								
							 | 
						|
								#if defined(_MSC_VER)
							 | 
						|
								        _InterlockedCompareExchange(&cpuid_, cpuid, -1);
							 | 
						|
								#else
							 | 
						|
								        __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
							 | 
						|
								#endif
							 | 
						|
								    }
							 | 
						|
								#endif
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								    const uint8_t* ptr = (const uint8_t*)data;
							 | 
						|
								    uint64_t       cnt = 0;
							 | 
						|
								    uint64_t       i   = 0;
							 | 
						|
								
							 | 
						|
								#if defined(LIBPOPCNT_HAVE_AVX512)
							 | 
						|
								#if defined(__AVX512__) || (defined(__AVX512F__) && defined(__AVX512BW__) && defined(__AVX512VPOPCNTDQ__))
							 | 
						|
								    /* For tiny arrays AVX512 is not worth it */
							 | 
						|
								    if (i + 40 <= size)
							 | 
						|
								#else
							 | 
						|
								    if ((cpuid & LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) && i + 40 <= size)
							 | 
						|
								#endif
							 | 
						|
								        return popcnt_avx512(ptr, size);
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								#if defined(LIBPOPCNT_HAVE_AVX2)
							 | 
						|
								#if defined(__AVX2__)
							 | 
						|
								    /* AVX2 requires arrays >= 512 bytes */
							 | 
						|
								    if (i + 512 <= size)
							 | 
						|
								#else
							 | 
						|
								    if ((cpuid & LIBPOPCNT_BIT_AVX2) && i + 512 <= size)
							 | 
						|
								#endif
							 | 
						|
								    {
							 | 
						|
								        const __m256i* ptr256  = (const __m256i*)(ptr + i);
							 | 
						|
								        cnt                   += popcnt_avx2(ptr256, (size - i) / 32);
							 | 
						|
								        i                      = size - size % 32;
							 | 
						|
								    }
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								#if defined(LIBPOPCNT_HAVE_POPCNT)
							 | 
						|
								/*
							 | 
						|
								 * The user has compiled without -mpopcnt.
							 | 
						|
								 * Unfortunately the MSVC compiler does not have
							 | 
						|
								 * a POPCNT macro so we cannot get rid of the
							 | 
						|
								 * runtime check for MSVC.
							 | 
						|
								 */
							 | 
						|
								#if !defined(__POPCNT__)
							 | 
						|
								    if (cpuid & LIBPOPCNT_BIT_POPCNT)
							 | 
						|
								#endif
							 | 
						|
								    {
							 | 
						|
								        if (i + 8 <= size) {
							 | 
						|
								            uintptr_t rem = ((uintptr_t)&ptr[i]) % 8;
							 | 
						|
								
							 | 
						|
								            /* Align &ptr[i] to an 8 byte boundary */
							 | 
						|
								            if (rem != 0) {
							 | 
						|
								                uint64_t val   = 0;
							 | 
						|
								                uint64_t bytes = (uint64_t)(8 - rem % 8);
							 | 
						|
								                bytes          = (bytes <= 7) ? bytes : 7;
							 | 
						|
								                for (uint64_t j = 0; j < bytes; j++) val |= ((uint64_t)ptr[i + j]) << (j * 8);
							 | 
						|
								                cnt += popcnt64(val);
							 | 
						|
								                i   += bytes;
							 | 
						|
								            }
							 | 
						|
								        }
							 | 
						|
								
							 | 
						|
								        for (; i + 8 <= size; i += 8) cnt += popcnt64(*(const uint64_t*)(ptr + i));
							 | 
						|
								
							 | 
						|
								        if (i < size) {
							 | 
						|
								            uint64_t val   = 0;
							 | 
						|
								            uint64_t bytes = (uint64_t)(size - i);
							 | 
						|
								            bytes          = (bytes <= 7) ? bytes : 7;
							 | 
						|
								            for (uint64_t j = 0; j < bytes; j++) val |= ((uint64_t)ptr[i + j]) << (j * 8);
							 | 
						|
								            cnt += popcnt64(val);
							 | 
						|
								        }
							 | 
						|
								
							 | 
						|
								        return cnt;
							 | 
						|
								    }
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								/*
							 | 
						|
								 * This code is used for:
							 | 
						|
								 * 1) Compiler does not support POPCNT.
							 | 
						|
								 * 2) x86 CPU does not support POPCNT (cpuid != POPCNT).
							 | 
						|
								 */
							 | 
						|
								#if !defined(LIBPOPCNT_HAVE_POPCNT) || !defined(__POPCNT__)
							 | 
						|
								
							 | 
						|
								    if (i + 8 <= size) {
							 | 
						|
								        uintptr_t rem = ((uintptr_t)&ptr[i]) % 8;
							 | 
						|
								
							 | 
						|
								        /* Align &ptr[i] to an 8 byte boundary */
							 | 
						|
								        if (rem != 0) {
							 | 
						|
								            uint64_t val   = 0;
							 | 
						|
								            uint64_t bytes = (uint64_t)(8 - rem % 8);
							 | 
						|
								            bytes          = (bytes <= 7) ? bytes : 7;
							 | 
						|
								            for (uint64_t j = 0; j < bytes; j++) val |= ((uint64_t)ptr[i + j]) << (j * 8);
							 | 
						|
								            cnt += popcnt64_bitwise(val);
							 | 
						|
								            i   += bytes;
							 | 
						|
								        }
							 | 
						|
								    }
							 | 
						|
								
							 | 
						|
								    for (; i + 8 <= size; i += 8) cnt += popcnt64_bitwise(*(const uint64_t*)(ptr + i));
							 | 
						|
								
							 | 
						|
								    if (i < size) {
							 | 
						|
								        uint64_t val   = 0;
							 | 
						|
								        uint64_t bytes = (uint64_t)(size - i);
							 | 
						|
								        bytes          = (bytes <= 7) ? bytes : 7;
							 | 
						|
								        for (uint64_t j = 0; j < bytes; j++) val |= ((uint64_t)ptr[i + j]) << (j * 8);
							 | 
						|
								        cnt += popcnt64_bitwise(val);
							 | 
						|
								    }
							 | 
						|
								
							 | 
						|
								    return cnt;
							 | 
						|
								#endif
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								/* Compile with e.g. -march=armv8-a+sve to enable ARM SVE */
							 | 
						|
								#elif defined(__ARM_FEATURE_SVE) && __has_include(<arm_sve.h>)
							 | 
						|
								
							 | 
						|
								#include <arm_sve.h>
							 | 
						|
								
							 | 
						|
								/*
							 | 
						|
								 * Count the number of 1 bits in the data array
							 | 
						|
								 * @data: An array
							 | 
						|
								 * @size: Size of data in bytes
							 | 
						|
								 */
							 | 
						|
								static inline uint64_t popcnt(const void* data, uint64_t size)
							 | 
						|
								{
							 | 
						|
								    uint64_t        i      = 0;
							 | 
						|
								    const uint64_t* ptr64  = (const uint64_t*)data;
							 | 
						|
								    uint64_t        size64 = size / sizeof(uint64_t);
							 | 
						|
								    svuint64_t      vcnt   = svdup_u64(0);
							 | 
						|
								
							 | 
						|
								    for (; i + svcntd() * 4 <= size64; i += svcntd() * 4) {
							 | 
						|
								        svuint64_t vec0 = svld1_u64(svptrue_b64(), &ptr64[i + svcntd() * 0]);
							 | 
						|
								        svuint64_t vec1 = svld1_u64(svptrue_b64(), &ptr64[i + svcntd() * 1]);
							 | 
						|
								        svuint64_t vec2 = svld1_u64(svptrue_b64(), &ptr64[i + svcntd() * 2]);
							 | 
						|
								        svuint64_t vec3 = svld1_u64(svptrue_b64(), &ptr64[i + svcntd() * 3]);
							 | 
						|
								
							 | 
						|
								        vec0 = svcnt_u64_x(svptrue_b64(), vec0);
							 | 
						|
								        vec1 = svcnt_u64_x(svptrue_b64(), vec1);
							 | 
						|
								        vec2 = svcnt_u64_x(svptrue_b64(), vec2);
							 | 
						|
								        vec3 = svcnt_u64_x(svptrue_b64(), vec3);
							 | 
						|
								
							 | 
						|
								        vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec0);
							 | 
						|
								        vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec1);
							 | 
						|
								        vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec2);
							 | 
						|
								        vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec3);
							 | 
						|
								    }
							 | 
						|
								
							 | 
						|
								    svbool_t pg = svwhilelt_b64(i, size64);
							 | 
						|
								
							 | 
						|
								    while (svptest_any(svptrue_b64(), pg)) {
							 | 
						|
								        svuint64_t vec  = svld1_u64(pg, &ptr64[i]);
							 | 
						|
								        vec             = svcnt_u64_z(pg, vec);
							 | 
						|
								        vcnt            = svadd_u64_x(svptrue_b64(), vcnt, vec);
							 | 
						|
								        i              += svcntd();
							 | 
						|
								        pg              = svwhilelt_b64(i, size64);
							 | 
						|
								    }
							 | 
						|
								
							 | 
						|
								    uint64_t cnt   = svaddv_u64(svptrue_b64(), vcnt);
							 | 
						|
								    uint64_t bytes = size % sizeof(uint64_t);
							 | 
						|
								
							 | 
						|
								    if (bytes != 0) {
							 | 
						|
								        i                     = size - bytes;
							 | 
						|
								        const uint8_t* ptr8   = (const uint8_t*)data;
							 | 
						|
								        svbool_t       pg8    = svwhilelt_b8(i, size);
							 | 
						|
								        svuint8_t      vec    = svld1_u8(pg8, &ptr8[i]);
							 | 
						|
								        svuint8_t      vcnt8  = svcnt_u8_z(pg8, vec);
							 | 
						|
								        cnt                  += svaddv_u8(pg8, vcnt8);
							 | 
						|
								    }
							 | 
						|
								
							 | 
						|
								    return cnt;
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								#elif (defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64)) && __has_include(<arm_neon.h>)
							 | 
						|
								
							 | 
						|
								#include <arm_neon.h>
							 | 
						|
								
							 | 
						|
								static inline uint64x2_t vpadalq(uint64x2_t sum, uint8x16_t t) { return vpadalq_u32(sum, vpaddlq_u16(vpaddlq_u8(t))); }
							 | 
						|
								
							 | 
						|
								/*
							 | 
						|
								 * Count the number of 1 bits in the data array
							 | 
						|
								 * @data: An array
							 | 
						|
								 * @size: Size of data in bytes
							 | 
						|
								 */
							 | 
						|
								static inline uint64_t popcnt(const void* data, uint64_t size)
							 | 
						|
								{
							 | 
						|
								    uint64_t       i          = 0;
							 | 
						|
								    uint64_t       cnt        = 0;
							 | 
						|
								    uint64_t       chunk_size = 64;
							 | 
						|
								    const uint8_t* ptr        = (const uint8_t*)data;
							 | 
						|
								
							 | 
						|
								    if (size >= chunk_size) {
							 | 
						|
								        uint64_t   iters = size / chunk_size;
							 | 
						|
								        uint64x2_t sum   = vcombine_u64(vcreate_u64(0), vcreate_u64(0));
							 | 
						|
								        uint8x16_t zero  = vcombine_u8(vcreate_u8(0), vcreate_u8(0));
							 | 
						|
								
							 | 
						|
								        do {
							 | 
						|
								            uint8x16_t t0 = zero;
							 | 
						|
								            uint8x16_t t1 = zero;
							 | 
						|
								            uint8x16_t t2 = zero;
							 | 
						|
								            uint8x16_t t3 = zero;
							 | 
						|
								
							 | 
						|
								            /*
							 | 
						|
								             * After every 31 iterations we need to add the
							 | 
						|
								             * temporary sums (t0, t1, t2, t3) to the total sum.
							 | 
						|
								             * We must ensure that the temporary sums <= 255
							 | 
						|
								             * and 31 * 8 bits = 248 which is OK.
							 | 
						|
								             */
							 | 
						|
								            uint64_t limit = (i + 31 < iters) ? i + 31 : iters;
							 | 
						|
								
							 | 
						|
								            /* Each iteration processes 64 bytes */
							 | 
						|
								            for (; i < limit; i++) {
							 | 
						|
								                uint8x16x4_t input  = vld4q_u8(ptr);
							 | 
						|
								                ptr                += chunk_size;
							 | 
						|
								
							 | 
						|
								                t0 = vaddq_u8(t0, vcntq_u8(input.val[0]));
							 | 
						|
								                t1 = vaddq_u8(t1, vcntq_u8(input.val[1]));
							 | 
						|
								                t2 = vaddq_u8(t2, vcntq_u8(input.val[2]));
							 | 
						|
								                t3 = vaddq_u8(t3, vcntq_u8(input.val[3]));
							 | 
						|
								            }
							 | 
						|
								
							 | 
						|
								            sum = vpadalq(sum, t0);
							 | 
						|
								            sum = vpadalq(sum, t1);
							 | 
						|
								            sum = vpadalq(sum, t2);
							 | 
						|
								            sum = vpadalq(sum, t3);
							 | 
						|
								        } while (i < iters);
							 | 
						|
								
							 | 
						|
								        i     = 0;
							 | 
						|
								        size %= chunk_size;
							 | 
						|
								
							 | 
						|
								        uint64_t tmp[2];
							 | 
						|
								        vst1q_u64(tmp, sum);
							 | 
						|
								        cnt += tmp[0];
							 | 
						|
								        cnt += tmp[1];
							 | 
						|
								    }
							 | 
						|
								
							 | 
						|
								    if (i + 8 <= size) {
							 | 
						|
								        uintptr_t rem = ((uintptr_t)&ptr[i]) % 8;
							 | 
						|
								
							 | 
						|
								        /* Align &ptr[i] to an 8 byte boundary */
							 | 
						|
								        if (rem != 0) {
							 | 
						|
								            uint64_t val   = 0;
							 | 
						|
								            uint64_t bytes = (uint64_t)(8 - rem % 8);
							 | 
						|
								            bytes          = (bytes <= 7) ? bytes : 7;
							 | 
						|
								            for (uint64_t j = 0; j < bytes; j++) val |= ((uint64_t)ptr[i + j]) << (j * 8);
							 | 
						|
								            cnt += popcnt64(val);
							 | 
						|
								            i   += bytes;
							 | 
						|
								        }
							 | 
						|
								    }
							 | 
						|
								
							 | 
						|
								    for (; i + 8 <= size; i += 8) cnt += popcnt64(*(const uint64_t*)(ptr + i));
							 | 
						|
								
							 | 
						|
								    if (i < size) {
							 | 
						|
								        uint64_t val   = 0;
							 | 
						|
								        uint64_t bytes = (uint64_t)(size - i);
							 | 
						|
								        bytes          = (bytes <= 7) ? bytes : 7;
							 | 
						|
								        for (uint64_t j = 0; j < bytes; j++) val |= ((uint64_t)ptr[i + j]) << (j * 8);
							 | 
						|
								        cnt += popcnt64(val);
							 | 
						|
								    }
							 | 
						|
								
							 | 
						|
								    return cnt;
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								/* all other CPUs */
							 | 
						|
								#else
							 | 
						|
								
							 | 
						|
								/*
							 | 
						|
								 * Count the number of 1 bits in the data array
							 | 
						|
								 * @data: An array
							 | 
						|
								 * @size: Size of data in bytes
							 | 
						|
								 */
							 | 
						|
								static inline uint64_t popcnt(const void* data, uint64_t size)
							 | 
						|
								{
							 | 
						|
								    uint64_t       i   = 0;
							 | 
						|
								    uint64_t       cnt = 0;
							 | 
						|
								    const uint8_t* ptr = (const uint8_t*)data;
							 | 
						|
								
							 | 
						|
								    if (i + 8 <= size) {
							 | 
						|
								        uintptr_t rem = ((uintptr_t)&ptr[i]) % 8;
							 | 
						|
								
							 | 
						|
								        /* Align &ptr[i] to an 8 byte boundary */
							 | 
						|
								        if (rem != 0) {
							 | 
						|
								            uint64_t val   = 0;
							 | 
						|
								            uint64_t bytes = (uint64_t)(8 - rem % 8);
							 | 
						|
								            bytes          = (bytes <= 7) ? bytes : 7;
							 | 
						|
								            for (uint64_t j = 0; j < bytes; j++) val |= ((uint64_t)ptr[i + j]) << (j * 8);
							 | 
						|
								            cnt += popcnt64(val);
							 | 
						|
								            i   += bytes;
							 | 
						|
								        }
							 | 
						|
								    }
							 | 
						|
								
							 | 
						|
								    for (; i + 8 <= size; i += 8) cnt += popcnt64(*(const uint64_t*)(ptr + i));
							 | 
						|
								
							 | 
						|
								    if (i < size) {
							 | 
						|
								        uint64_t val   = 0;
							 | 
						|
								        uint64_t bytes = (uint64_t)(size - i);
							 | 
						|
								        bytes          = (bytes <= 7) ? bytes : 7;
							 | 
						|
								        for (uint64_t j = 0; j < bytes; j++) val |= ((uint64_t)ptr[i + j]) << (j * 8);
							 | 
						|
								        cnt += popcnt64(val);
							 | 
						|
								    }
							 | 
						|
								
							 | 
						|
								    return cnt;
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								#ifdef __cplusplus
							 | 
						|
								} /* extern "C" */
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								#endif /* LIBPOPCNT_H */
							 |