>>107990626
/**
 * Horizontally sum the four float lanes of an SSE register.
 *
 * The lanes are combined pairwise, so the value returned is
 * (x0 + x1) + (x2 + x3), where x0 is the lowest lane of `target`.
 *
 * Builds with baseline SSE; the SSE3 instruction is used only when the
 * compiler advertises it via __SSE3__.
 *
 * @param target  vector whose four lanes are added together
 * @return        scalar sum of all four lanes
 */
__attribute__((always_inline)) inline float vector_atouf_2026_hadd(__m128 target)
{
    /* Build (x1, x1, x3, x3): each odd lane copied onto the even lane below it. */
#if defined(__SSE3__)
    /* SSE3 offers this lane pattern as a dedicated intrinsic. */
    __m128 odd_lanes = _mm_movehdup_ps(target);
#else
    /* Baseline SSE fallback producing the identical lane pattern, so the
       function still compiles when SSE3 is not enabled. */
    __m128 odd_lanes = _mm_shuffle_ps(target, target, _MM_SHUFFLE(3, 3, 1, 1));
#endif

    /* pair_sums = (x0+x1, x1+x1, x2+x3, x3+x3); only lanes 0 and 2 are used. */
    __m128 pair_sums = _mm_add_ps(target, odd_lanes);

    /* Move the high pair sum (x2+x3) down to lane 0, then do a scalar add
       so lane 0 holds (x0+x1) + (x2+x3). */
    __m128 high_pair = _mm_movehl_ps(pair_sums, pair_sums);
    return _mm_cvtss_f32(_mm_add_ss(pair_sums, high_pair));
}