@@ -67,62 +67,97 @@ static bool buffer_is_zero_integer(const void *buf, size_t len)
 #if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
 #include <immintrin.h>
 
-/* Note that each of these vectorized functions require len >= 64.  */
+/* Helper for preventing the compiler from reassociating
+   chains of binary vector operations.  */
+#define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1))
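
The barrier added above deserves a brief aside. The empty asm statement names both vectors as read-write SSE register operands ("+x", a GCC/Clang inline-asm constraint, the same one the patch relies on), so the compiler must keep the two accumulator chains alive at that point and cannot reassociate them back into one serial OR chain; the two chains can then issue in parallel. A minimal standalone sketch of the same idiom, not part of the patch; REASSOC_BARRIER and any_nonzero_64 are illustrative names, and the portable _mm_or_si128() intrinsic stands in for the patch's use of the | operator on vectors:

#include <emmintrin.h>

/* Same idiom as SSE_REASSOC_BARRIER: an empty asm with "+x" operands. */
#define REASSOC_BARRIER(v0, v1) asm("" : "+x"(v0), "+x"(v1))

/* Returns nonzero if any byte in a 64-byte, 16-byte-aligned block is set. */
static int any_nonzero_64(const __m128i *p)
{
    __m128i a = p[0], b = p[1];
    REASSOC_BARRIER(a, b);            /* keep a and b as two live chains */
    a = _mm_or_si128(a, p[2]);
    b = _mm_or_si128(b, p[3]);
    REASSOC_BARRIER(a, b);
    a = _mm_or_si128(a, b);
    return _mm_movemask_epi8(_mm_cmpeq_epi8(a, _mm_setzero_si128())) != 0xFFFF;
}
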
 
+/* Note that these vectorized functions may assume len >= 256.  */
 static bool __attribute__((target("sse2")))
 buffer_zero_sse2(const void *buf, size_t len)
 {
-    __m128i t = _mm_loadu_si128(buf);
-    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
-    __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
-    __m128i zero = _mm_setzero_si128();
+    /* Unaligned loads at head/tail.  */
+    __m128i v = *(__m128i_u *)(buf);
+    __m128i w = *(__m128i_u *)(buf + len - 16);
+    /* Align head/tail to 16-byte boundaries.  */
+    const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
+    const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
+    __m128i zero = { 0 };
 
-    /* Loop over 16-byte aligned blocks of 64.  */
-    while (likely(p <= e)) {
-        t = _mm_cmpeq_epi8(t, zero);
-        if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
+    /* Collect a partial block at tail end.  */
+    v |= e[-1]; w |= e[-2];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-3]; w |= e[-4];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-5]; w |= e[-6];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-7]; v |= w;
+
+    /*
+     * Loop over complete 128-byte blocks.
+     * With the head and tail removed, e - p >= 14, so the loop
+     * must iterate at least once.
+     */
+    do {
+        v = _mm_cmpeq_epi8(v, zero);
+        if (unlikely(_mm_movemask_epi8(v) != 0xFFFF)) {
             return false;
         }
-        t = p[-4] | p[-3] | p[-2] | p[-1];
-        p += 4;
-    }
-
-    /* Finish the aligned tail.  */
-    t |= e[-3];
-    t |= e[-2];
-    t |= e[-1];
-
-    /* Finish the unaligned tail.  */
-    t |= _mm_loadu_si128(buf + len - 16);
-
-    return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
+        v = p[0]; w = p[1];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[2]; w |= p[3];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[4]; w |= p[5];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[6]; w |= p[7];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= w;
+        p += 8;
+    } while (p < e - 7);
+
+    return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) == 0xFFFF;
 }
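
Why the do/while above can skip its pre-test: the rewrite leans on the stated len >= 256 precondition. p is aligned down from buf + 16, so p <= buf + 16; e is aligned down from buf + len - 1, so e >= buf + len - 16. Their distance is therefore at least len - 32 >= 224 bytes, i.e. at least 14 aligned 16-byte blocks, which is exactly the "e - p >= 14" the comment asserts, and the loop (8 blocks per iteration, continuing while p < e - 7) must run at least once. A small standalone check of that arithmetic, assuming nothing beyond the documented precondition; check_invariant is illustrative, not part of the patch:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static void check_invariant(const void *buf, size_t len)
{
    assert(len >= 256);                              /* documented precondition */
    uintptr_t p = ((uintptr_t)buf + 16) & ~(uintptr_t)15;
    uintptr_t e = ((uintptr_t)buf + len - 1) & ~(uintptr_t)15;
    /* At least 14 aligned 16-byte blocks lie between head and tail,
       so the do/while body executes at least once. */
    assert((e - p) / 16 >= 14);
}
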
 
 #ifdef CONFIG_AVX2_OPT
 static bool __attribute__((target("avx2")))
 buffer_zero_avx2(const void *buf, size_t len)
 {
-    /* Begin with an unaligned head of 32 bytes.  */
-    __m256i t = _mm256_loadu_si256(buf);
-    __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
-    __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);
+    /* Unaligned loads at head/tail.  */
+    __m256i v = *(__m256i_u *)(buf);
+    __m256i w = *(__m256i_u *)(buf + len - 32);
+    /* Align head/tail to 32-byte boundaries.  */
+    const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32);
+    const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32);
+    __m256i zero = { 0 };
 
-    /* Loop over 32-byte aligned blocks of 128.  */
-    while (p <= e) {
-        if (unlikely(!_mm256_testz_si256(t, t))) {
+    /* Collect a partial block at tail end.  */
+    v |= e[-1]; w |= e[-2];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-3]; w |= e[-4];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-5]; w |= e[-6];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-7]; v |= w;
+
+    /* Loop over complete 256-byte blocks.  */
+    for (; p < e - 7; p += 8) {
+        /* PTEST is not profitable here.  */
+        v = _mm256_cmpeq_epi8(v, zero);
+        if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) {
             return false;
         }
-        t = p[-4] | p[-3] | p[-2] | p[-1];
-        p += 4;
-    } ;
-
-    /* Finish the last block of 128 unaligned.  */
-    t |= _mm256_loadu_si256(buf + len - 4 * 32);
-    t |= _mm256_loadu_si256(buf + len - 3 * 32);
-    t |= _mm256_loadu_si256(buf + len - 2 * 32);
-    t |= _mm256_loadu_si256(buf + len - 1 * 32);
-
-    return _mm256_testz_si256(t, t);
+        v = p[0]; w = p[1];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[2]; w |= p[3];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[4]; w |= p[5];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[6]; w |= p[7];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= w;
+    }
+    return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero)) == 0xFFFFFFFF;
 }
 #endif /* CONFIG_AVX2_OPT */
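
Both variants now carry the "may assume len >= 256" note, so callers have to gate short buffers before dispatching here. A hedged sketch of such a gate, outside this patch: buffer_is_zero_bytewise and buffer_is_zero_example are illustrative names, not QEMU's actual dispatcher, which is not part of this hunk; buffer_zero_sse2 refers to the function above and is assumed to be in the same file.

#include <stdbool.h>
#include <stddef.h>

/* Illustrative scalar fallback for short buffers. */
static bool buffer_is_zero_bytewise(const void *buf, size_t len)
{
    const unsigned char *p = buf;
    for (size_t i = 0; i < len; i++) {
        if (p[i]) {
            return false;
        }
    }
    return true;
}

/* Illustrative wrapper: only hand buffers of 256 bytes or more to the
   vectorized variants above, per their documented assumption. */
static bool buffer_is_zero_example(const void *buf, size_t len)
{
    if (len < 256) {
        return buffer_is_zero_bytewise(buf, len);
    }
    return buffer_zero_sse2(buf, len);    /* or buffer_zero_avx2 */
}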