#line 1 "numpy/core/src/umath/loops_unary_fp.dispatch.c.src"

/*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 **       Changes should be made to the original source (.src) file         **
 *****************************************************************************
 */

#line 1
/*@targets
 ** $maxopt baseline
 ** sse2 sse41
 ** vsx2
 ** neon asimd
 ** vx vxe
 **/
/**
 * Force the use of SSE only on x86, even if AVX2 or AVX512F are enabled
 * through the baseline, since scatter (AVX512F) and gather are very costly
 * for handling non-contiguous memory access compared with SSE for
 * the small operations that this file covers.
 */
#define NPY_SIMD_FORCE_128
#include "numpy/npy_math.h"
#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
/**********************************************************
 ** Scalars
 **********************************************************/
#if !NPY_SIMD_F32
/* Scalar float32 reciprocal: returns 1/a using plain floating-point division. */
NPY_FINLINE float c_recip_f32(float a)
{
    const float one = 1.0f;
    return one / a;
}
/*
 * Scalar float32 absolute value.
 * Values that compare greater than zero are kept as they are; everything
 * else is negated.
 */
NPY_FINLINE float c_abs_f32(float a)
{
    float magnitude;
    if (a > 0) {
        magnitude = a;
    }
    else {
        magnitude = -a;
    }
    /* adding zero clears a -0.0 result */
    return magnitude + 0;
}
/* Scalar float32 square: returns a multiplied by itself. */
NPY_FINLINE float c_square_f32(float a)
{
    const float squared = a * a;
    return squared;
}
#endif // !NPY_SIMD_F32

#if !NPY_SIMD_F64
/* Scalar float64 reciprocal: returns 1/a using plain floating-point division. */
NPY_FINLINE double c_recip_f64(double a)
{
    const double one = 1.0;
    return one / a;
}
/*
 * Scalar float64 absolute value.
 * Values that compare greater than zero are kept as they are; everything
 * else is negated.
 */
NPY_FINLINE double c_abs_f64(double a)
{
    double magnitude;
    if (a > 0) {
        magnitude = a;
    }
    else {
        magnitude = -a;
    }
    /* adding zero clears a -0.0 result */
    return magnitude + 0;
}
/* Scalar float64 square: returns a multiplied by itself. */
NPY_FINLINE double c_square_f64(double a)
{
    const double squared = a * a;
    return squared;
}
#endif // !NPY_SIMD_F64
/**
 * MSVC(32-bit mode) requires a clarified contiguous loop
 * in order to use SSE, otherwise it uses a soft version of square root
 * that doesn't raise a domain error.
 */
#if defined(_MSC_VER) && defined(_M_IX86) && !NPY_SIMD
    #include <emmintrin.h>
    /*
     * Single-precision square root built from SSE intrinsics: the argument
     * is loaded with _mm_load_ss, _mm_sqrt_ss is applied, and the lowest
     * lane of the result is returned.
     */
    NPY_FINLINE float c_sqrt_f32(float _a)
    {
        const __m128 packed = _mm_load_ss(&_a);
        return _mm_cvtss_f32(_mm_sqrt_ss(packed));
    }
    /*
     * Double-precision square root built from SSE2 intrinsics: the argument
     * is loaded with _mm_load_sd, the packed square root (_mm_sqrt_pd) is
     * taken, and the lowest lane of the result is returned.
     */
    NPY_FINLINE double c_sqrt_f64(double _a)
    {
        const __m128d packed = _mm_load_sd(&_a);
        return _mm_cvtsd_f64(_mm_sqrt_pd(packed));
    }
#else
    /*
     * Outside the MSVC 32-bit non-SIMD configuration handled above, the
     * scalar square roots are plain aliases of npy_sqrtf / npy_sqrt.
     */
    #define c_sqrt_f32 npy_sqrtf
    #define c_sqrt_f64 npy_sqrt
#endif

/*
 * Scalar rounding helpers: each c_<op>_f32 / c_<op>_f64 name is an alias of
 * the npy_* routine for the same operation (presumably declared in
 * numpy/npy_math.h, which this file includes).
 */
#define c_ceil_f32 npy_ceilf
#define c_ceil_f64 npy_ceil

#define c_trunc_f32 npy_truncf
#define c_trunc_f64 npy_trunc

#define c_floor_f32 npy_floorf
#define c_floor_f64 npy_floor

#define c_rint_f32 npy_rintf
#define c_rint_f64 npy_rint

/********************************************************************************
 ** Defining the SIMD kernels
 ********************************************************************************/
/** Notes:
 * - avoid the use of libmath to unify fp/domain errors
 *   for both scalars and vectors among all compilers/architectures.
 * - use the intrinsic npyv_load_till_* instead of npyv_load_tillz_*
 *   to fill the remaining lanes with 1.0, to avoid a divide-by-zero fp
 *   exception in reciprocal.
 */
/*
 * Layout tags for the source and destination operands of a kernel. The
 * kernels below compare them in #if directives to pick between the plain
 * load/store intrinsics and the strided (loadn/storen) ones.
 */
#define CONTIG  0
#define NCONTIG 1

#line 101
#if NPY_SIMD_F32
#line 107
#line 112
/*
 * float32 rint kernel: contiguous source, contiguous destination.
 *
 * _src and _dst are accessed as arrays of npyv_lanetype_f32, and len is the
 * number of elements still to be processed. Plain (non-strided) load/store
 * intrinsics are used, yet ssrc and sdst still scale the pointer advance
 * after every step, so presumably callers pass a stride of 1 for both.
 */
static void simd_FLOAT_rint_CONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *in = _src;
    npyv_lanetype_f32 *out = _dst;

    const int lanes = npyv_nlanes_f32;
    const int chunk = lanes * 4;

    /* four vectors per pass; every load is issued before the first store */
    while (len >= chunk) {
        npyv_f32 a0 = npyv_load_f32(in);
        npyv_f32 r0 = npyv_rint_f32(a0);
        npyv_f32 a1 = npyv_load_f32(in + lanes);
        npyv_f32 r1 = npyv_rint_f32(a1);
        npyv_f32 a2 = npyv_load_f32(in + lanes*2);
        npyv_f32 r2 = npyv_rint_f32(a2);
        npyv_f32 a3 = npyv_load_f32(in + lanes*3);
        npyv_f32 r3 = npyv_rint_f32(a3);

        npyv_store_f32(out, r0);
        npyv_store_f32(out + lanes, r1);
        npyv_store_f32(out + lanes*2, r2);
        npyv_store_f32(out + lanes*3, r3);

        len -= chunk;
        in += ssrc*chunk;
        out += sdst*chunk;
    }

    /* one vector per pass */
    while (len >= lanes) {
        npyv_f32 a = npyv_load_f32(in);
        npyv_f32 r = npyv_rint_f32(a);
        npyv_store_f32(out, r);

        len -= lanes;
        in += ssrc*lanes;
        out += sdst*lanes;
    }

    /* fewer elements than one vector remain: partial load and partial store */
    if (len > 0) {
        npyv_f32 a = npyv_load_tillz_f32(in, len);
        npyv_f32 r = npyv_rint_f32(a);
        npyv_store_till_f32(out, len, r);
    }

    npyv_cleanup();
}

#line 112
/*
 * float32 rint kernel: strided source, contiguous destination.
 *
 * _src and _dst are accessed as arrays of npyv_lanetype_f32, and len is the
 * number of elements still to be processed. ssrc is the source stride in
 * elements and is handed to the strided load intrinsics. The destination is
 * written with plain stores, yet sdst still scales the pointer advance, so
 * presumably callers pass an sdst of 1.
 */
static void simd_FLOAT_rint_NCONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *in = _src;
    npyv_lanetype_f32 *out = _dst;

    const int lanes = npyv_nlanes_f32;
    const int chunk = lanes * 4;

    /* four vectors per pass; every load is issued before the first store */
    while (len >= chunk) {
        npyv_f32 a0 = npyv_loadn_f32(in, ssrc);
        npyv_f32 r0 = npyv_rint_f32(a0);
        npyv_f32 a1 = npyv_loadn_f32(in + ssrc*lanes, ssrc);
        npyv_f32 r1 = npyv_rint_f32(a1);
        npyv_f32 a2 = npyv_loadn_f32(in + ssrc*lanes*2, ssrc);
        npyv_f32 r2 = npyv_rint_f32(a2);
        npyv_f32 a3 = npyv_loadn_f32(in + ssrc*lanes*3, ssrc);
        npyv_f32 r3 = npyv_rint_f32(a3);

        npyv_store_f32(out, r0);
        npyv_store_f32(out + lanes, r1);
        npyv_store_f32(out + lanes*2, r2);
        npyv_store_f32(out + lanes*3, r3);

        len -= chunk;
        in += ssrc*chunk;
        out += sdst*chunk;
    }

    /* one vector per pass */
    while (len >= lanes) {
        npyv_f32 a = npyv_loadn_f32(in, ssrc);
        npyv_f32 r = npyv_rint_f32(a);
        npyv_store_f32(out, r);

        len -= lanes;
        in += ssrc*lanes;
        out += sdst*lanes;
    }

    /* fewer elements than one vector remain: partial load and partial store */
    if (len > 0) {
        npyv_f32 a = npyv_loadn_tillz_f32(in, ssrc, len);
        npyv_f32 r = npyv_rint_f32(a);
        npyv_store_till_f32(out, len, r);
    }

    npyv_cleanup();
}

#line 112
/*
 * float32 rint kernel: contiguous source, strided destination.
 *
 * _src and _dst are accessed as arrays of npyv_lanetype_f32, and len is the
 * number of elements still to be processed. sdst is the destination stride
 * in elements and is handed to the strided store intrinsics. The source is
 * read with plain loads, yet ssrc still scales the pointer advance, so
 * presumably callers pass an ssrc of 1.
 */
static void simd_FLOAT_rint_CONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *in = _src;
    npyv_lanetype_f32 *out = _dst;

    const int lanes = npyv_nlanes_f32;
    const int chunk = lanes * 2;

    /* two vectors per pass; both loads are issued before the first store */
    while (len >= chunk) {
        npyv_f32 a0 = npyv_load_f32(in);
        npyv_f32 r0 = npyv_rint_f32(a0);
        npyv_f32 a1 = npyv_load_f32(in + lanes);
        npyv_f32 r1 = npyv_rint_f32(a1);

        npyv_storen_f32(out, sdst, r0);
        npyv_storen_f32(out + sdst*lanes, sdst, r1);

        len -= chunk;
        in += ssrc*chunk;
        out += sdst*chunk;
    }

    /* one vector per pass */
    while (len >= lanes) {
        npyv_f32 a = npyv_load_f32(in);
        npyv_f32 r = npyv_rint_f32(a);
        npyv_storen_f32(out, sdst, r);

        len -= lanes;
        in += ssrc*lanes;
        out += sdst*lanes;
    }

    /* fewer elements than one vector remain: partial load and partial store */
    if (len > 0) {
        npyv_f32 a = npyv_load_tillz_f32(in, len);
        npyv_f32 r = npyv_rint_f32(a);
        npyv_storen_till_f32(out, sdst, len, r);
    }

    npyv_cleanup();
}

#line 112
/*
 * float32 rint kernel: strided source, strided destination.
 *
 * _src and _dst are accessed as arrays of npyv_lanetype_f32, and len is the
 * number of elements still to be processed. ssrc and sdst are strides in
 * elements; they are handed to the strided load/store intrinsics and also
 * scale the pointer advance after every step.
 */
static void simd_FLOAT_rint_NCONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *in = _src;
    npyv_lanetype_f32 *out = _dst;

    const int lanes = npyv_nlanes_f32;
    const int chunk = lanes * 2;

    /* two vectors per pass; both loads are issued before the first store */
    while (len >= chunk) {
        npyv_f32 a0 = npyv_loadn_f32(in, ssrc);
        npyv_f32 r0 = npyv_rint_f32(a0);
        npyv_f32 a1 = npyv_loadn_f32(in + ssrc*lanes, ssrc);
        npyv_f32 r1 = npyv_rint_f32(a1);

        npyv_storen_f32(out, sdst, r0);
        npyv_storen_f32(out + sdst*lanes, sdst, r1);

        len -= chunk;
        in += ssrc*chunk;
        out += sdst*chunk;
    }

    /* one vector per pass */
    while (len >= lanes) {
        npyv_f32 a = npyv_loadn_f32(in, ssrc);
        npyv_f32 r = npyv_rint_f32(a);
        npyv_storen_f32(out, sdst, r);

        len -= lanes;
        in += ssrc*lanes;
        out += sdst*lanes;
    }

    /* fewer elements than one vector remain: partial load and partial store */
    if (len > 0) {
        npyv_f32 a = npyv_loadn_tillz_f32(in, ssrc, len);
        npyv_f32 r = npyv_rint_f32(a);
        npyv_storen_till_f32(out, sdst, len, r);
    }

    npyv_cleanup();
}


#line 107
#line 112
/*
 * float32 floor kernel: contiguous source, contiguous destination.
 *
 * _src and _dst are accessed as arrays of npyv_lanetype_f32, and len is the
 * number of elements still to be processed. Plain (non-strided) load/store
 * intrinsics are used, yet ssrc and sdst still scale the pointer advance
 * after every step, so presumably callers pass a stride of 1 for both.
 */
static void simd_FLOAT_floor_CONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *in = _src;
    npyv_lanetype_f32 *out = _dst;

    const int lanes = npyv_nlanes_f32;
    const int chunk = lanes * 4;

    /* four vectors per pass; every load is issued before the first store */
    while (len >= chunk) {
        npyv_f32 a0 = npyv_load_f32(in);
        npyv_f32 r0 = npyv_floor_f32(a0);
        npyv_f32 a1 = npyv_load_f32(in + lanes);
        npyv_f32 r1 = npyv_floor_f32(a1);
        npyv_f32 a2 = npyv_load_f32(in + lanes*2);
        npyv_f32 r2 = npyv_floor_f32(a2);
        npyv_f32 a3 = npyv_load_f32(in + lanes*3);
        npyv_f32 r3 = npyv_floor_f32(a3);

        npyv_store_f32(out, r0);
        npyv_store_f32(out + lanes, r1);
        npyv_store_f32(out + lanes*2, r2);
        npyv_store_f32(out + lanes*3, r3);

        len -= chunk;
        in += ssrc*chunk;
        out += sdst*chunk;
    }

    /* one vector per pass */
    while (len >= lanes) {
        npyv_f32 a = npyv_load_f32(in);
        npyv_f32 r = npyv_floor_f32(a);
        npyv_store_f32(out, r);

        len -= lanes;
        in += ssrc*lanes;
        out += sdst*lanes;
    }

    /* fewer elements than one vector remain: partial load and partial store */
    if (len > 0) {
        npyv_f32 a = npyv_load_tillz_f32(in, len);
        npyv_f32 r = npyv_floor_f32(a);
        npyv_store_till_f32(out, len, r);
    }

    npyv_cleanup();
}

#line 112
/*
 * float32 floor kernel: strided source, contiguous destination.
 *
 * _src and _dst are accessed as arrays of npyv_lanetype_f32, and len is the
 * number of elements still to be processed. ssrc is the source stride in
 * elements and is handed to the strided load intrinsics. The destination is
 * written with plain stores, yet sdst still scales the pointer advance, so
 * presumably callers pass an sdst of 1.
 */
static void simd_FLOAT_floor_NCONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *in = _src;
    npyv_lanetype_f32 *out = _dst;

    const int lanes = npyv_nlanes_f32;
    const int chunk = lanes * 4;

    /* four vectors per pass; every load is issued before the first store */
    while (len >= chunk) {
        npyv_f32 a0 = npyv_loadn_f32(in, ssrc);
        npyv_f32 r0 = npyv_floor_f32(a0);
        npyv_f32 a1 = npyv_loadn_f32(in + ssrc*lanes, ssrc);
        npyv_f32 r1 = npyv_floor_f32(a1);
        npyv_f32 a2 = npyv_loadn_f32(in + ssrc*lanes*2, ssrc);
        npyv_f32 r2 = npyv_floor_f32(a2);
        npyv_f32 a3 = npyv_loadn_f32(in + ssrc*lanes*3, ssrc);
        npyv_f32 r3 = npyv_floor_f32(a3);

        npyv_store_f32(out, r0);
        npyv_store_f32(out + lanes, r1);
        npyv_store_f32(out + lanes*2, r2);
        npyv_store_f32(out + lanes*3, r3);

        len -= chunk;
        in += ssrc*chunk;
        out += sdst*chunk;
    }

    /* one vector per pass */
    while (len >= lanes) {
        npyv_f32 a = npyv_loadn_f32(in, ssrc);
        npyv_f32 r = npyv_floor_f32(a);
        npyv_store_f32(out, r);

        len -= lanes;
        in += ssrc*lanes;
        out += sdst*lanes;
    }

    /* fewer elements than one vector remain: partial load and partial store */
    if (len > 0) {
        npyv_f32 a = npyv_loadn_tillz_f32(in, ssrc, len);
        npyv_f32 r = npyv_floor_f32(a);
        npyv_store_till_f32(out, len, r);
    }

    npyv_cleanup();
}

#line 112
/*
 * float32 floor kernel: contiguous source, strided destination.
 *
 * _src and _dst are accessed as arrays of npyv_lanetype_f32, and len is the
 * number of elements still to be processed. sdst is the destination stride
 * in elements and is handed to the strided store intrinsics. The source is
 * read with plain loads, yet ssrc still scales the pointer advance, so
 * presumably callers pass an ssrc of 1.
 */
static void simd_FLOAT_floor_CONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *in = _src;
    npyv_lanetype_f32 *out = _dst;

    const int lanes = npyv_nlanes_f32;
    const int chunk = lanes * 2;

    /* two vectors per pass; both loads are issued before the first store */
    while (len >= chunk) {
        npyv_f32 a0 = npyv_load_f32(in);
        npyv_f32 r0 = npyv_floor_f32(a0);
        npyv_f32 a1 = npyv_load_f32(in + lanes);
        npyv_f32 r1 = npyv_floor_f32(a1);

        npyv_storen_f32(out, sdst, r0);
        npyv_storen_f32(out + sdst*lanes, sdst, r1);

        len -= chunk;
        in += ssrc*chunk;
        out += sdst*chunk;
    }

    /* one vector per pass */
    while (len >= lanes) {
        npyv_f32 a = npyv_load_f32(in);
        npyv_f32 r = npyv_floor_f32(a);
        npyv_storen_f32(out, sdst, r);

        len -= lanes;
        in += ssrc*lanes;
        out += sdst*lanes;
    }

    /* fewer elements than one vector remain: partial load and partial store */
    if (len > 0) {
        npyv_f32 a = npyv_load_tillz_f32(in, len);
        npyv_f32 r = npyv_floor_f32(a);
        npyv_storen_till_f32(out, sdst, len, r);
    }

    npyv_cleanup();
}

#line 112
/*
 * float32 floor kernel: strided source, strided destination.
 *
 * _src and _dst are accessed as arrays of npyv_lanetype_f32, and len is the
 * number of elements still to be processed. ssrc and sdst are strides in
 * elements; they are handed to the strided load/store intrinsics and also
 * scale the pointer advance after every step.
 */
static void simd_FLOAT_floor_NCONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *in = _src;
    npyv_lanetype_f32 *out = _dst;

    const int lanes = npyv_nlanes_f32;
    const int chunk = lanes * 2;

    /* two vectors per pass; both loads are issued before the first store */
    while (len >= chunk) {
        npyv_f32 a0 = npyv_loadn_f32(in, ssrc);
        npyv_f32 r0 = npyv_floor_f32(a0);
        npyv_f32 a1 = npyv_loadn_f32(in + ssrc*lanes, ssrc);
        npyv_f32 r1 = npyv_floor_f32(a1);

        npyv_storen_f32(out, sdst, r0);
        npyv_storen_f32(out + sdst*lanes, sdst, r1);

        len -= chunk;
        in += ssrc*chunk;
        out += sdst*chunk;
    }

    /* one vector per pass */
    while (len >= lanes) {
        npyv_f32 a = npyv_loadn_f32(in, ssrc);
        npyv_f32 r = npyv_floor_f32(a);
        npyv_storen_f32(out, sdst, r);

        len -= lanes;
        in += ssrc*lanes;
        out += sdst*lanes;
    }

    /* fewer elements than one vector remain: partial load and partial store */
    if (len > 0) {
        npyv_f32 a = npyv_loadn_tillz_f32(in, ssrc, len);
        npyv_f32 r = npyv_floor_f32(a);
        npyv_storen_till_f32(out, sdst, len, r);
    }

    npyv_cleanup();
}


#line 107
#line 112
/**
 * SIMD kernel: applies npyv_ceil_f32 to `len` f32 elements, reading them
 * with contiguous vector loads from `_src` and writing the results with
 * contiguous vector stores to `_dst`.
 *
 * Both pointers are advanced by their stride (`ssrc` / `sdst`, counted in
 * f32 elements) times the number of elements consumed, while the loads and
 * stores themselves address consecutive lanes.
 * NOTE(review): this presumably expects callers to pass ssrc == sdst == 1;
 * confirm against the dispatching code.
 */
static void simd_FLOAT_ceil_CONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *ip = _src;
    npyv_lanetype_f32 *op = _dst;
    const int lanes = npyv_nlanes_f32;  // elements per vector
    const int block = lanes * 4;        // elements per unrolled pass

    // Unrolled by four: every vector is loaded and rounded before the
    // first store is issued.
    while (len >= block) {
        npyv_f32 in0  = npyv_load_f32(ip + lanes*0);
        npyv_f32 out0 = npyv_ceil_f32(in0);
        npyv_f32 in1  = npyv_load_f32(ip + lanes*1);
        npyv_f32 out1 = npyv_ceil_f32(in1);
        npyv_f32 in2  = npyv_load_f32(ip + lanes*2);
        npyv_f32 out2 = npyv_ceil_f32(in2);
        npyv_f32 in3  = npyv_load_f32(ip + lanes*3);
        npyv_f32 out3 = npyv_ceil_f32(in3);

        npyv_store_f32(op + lanes*0, out0);
        npyv_store_f32(op + lanes*1, out1);
        npyv_store_f32(op + lanes*2, out2);
        npyv_store_f32(op + lanes*3, out3);

        len -= block;
        ip += ssrc*block;
        op += sdst*block;
    }

    // One full vector per pass.
    while (len >= lanes) {
        npyv_f32 in  = npyv_load_f32(ip);
        npyv_f32 out = npyv_ceil_f32(in);
        npyv_store_f32(op, out);

        len -= lanes;
        ip += ssrc*lanes;
        op += sdst*lanes;
    }

    // Fewer than `lanes` elements remain: use the length-limited load/store.
    if (len > 0) {
        npyv_f32 in  = npyv_load_tillz_f32(ip, len);
        npyv_f32 out = npyv_ceil_f32(in);
        npyv_store_till_f32(op, len, out);
    }

    npyv_cleanup();
}

#line 112
/**
 * SIMD kernel: applies npyv_ceil_f32 to `len` f32 elements and writes the
 * results to `_dst` with contiguous vector stores.
 *
 * The load flavour is picked at preprocessing time: a contiguous
 * npyv_load_f32 when `NCONTIG == CONTIG` holds, otherwise the strided
 * npyv_loadn_f32 driven by `ssrc` (stride counted in f32 elements).
 * NOTE(review): presumably NCONTIG != CONTIG so the strided load is the
 * one compiled - confirm at the macro definitions.
 */
static void simd_FLOAT_ceil_NCONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *ip = _src;
    npyv_lanetype_f32 *op = _dst;
    const int lanes = npyv_nlanes_f32;  // elements per vector
    const int block = lanes * 4;        // elements per unrolled pass

    // Unrolled by four: every vector is loaded and rounded before the
    // first store is issued.
    while (len >= block) {
    #if NCONTIG == CONTIG
        npyv_f32 in0  = npyv_load_f32(ip + lanes*0);
        npyv_f32 out0 = npyv_ceil_f32(in0);
        npyv_f32 in1  = npyv_load_f32(ip + lanes*1);
        npyv_f32 out1 = npyv_ceil_f32(in1);
        npyv_f32 in2  = npyv_load_f32(ip + lanes*2);
        npyv_f32 out2 = npyv_ceil_f32(in2);
        npyv_f32 in3  = npyv_load_f32(ip + lanes*3);
        npyv_f32 out3 = npyv_ceil_f32(in3);
    #else
        npyv_f32 in0  = npyv_loadn_f32(ip + ssrc*lanes*0, ssrc);
        npyv_f32 out0 = npyv_ceil_f32(in0);
        npyv_f32 in1  = npyv_loadn_f32(ip + ssrc*lanes*1, ssrc);
        npyv_f32 out1 = npyv_ceil_f32(in1);
        npyv_f32 in2  = npyv_loadn_f32(ip + ssrc*lanes*2, ssrc);
        npyv_f32 out2 = npyv_ceil_f32(in2);
        npyv_f32 in3  = npyv_loadn_f32(ip + ssrc*lanes*3, ssrc);
        npyv_f32 out3 = npyv_ceil_f32(in3);
    #endif

        npyv_store_f32(op + lanes*0, out0);
        npyv_store_f32(op + lanes*1, out1);
        npyv_store_f32(op + lanes*2, out2);
        npyv_store_f32(op + lanes*3, out3);

        len -= block;
        ip += ssrc*block;
        op += sdst*block;
    }

    // One full vector per pass.
    while (len >= lanes) {
    #if NCONTIG == CONTIG
        npyv_f32 in = npyv_load_f32(ip);
    #else
        npyv_f32 in = npyv_loadn_f32(ip, ssrc);
    #endif
        npyv_f32 out = npyv_ceil_f32(in);
        npyv_store_f32(op, out);

        len -= lanes;
        ip += ssrc*lanes;
        op += sdst*lanes;
    }

    // Fewer than `lanes` elements remain: use the length-limited load/store.
    if (len > 0) {
    #if NCONTIG == CONTIG
        npyv_f32 in = npyv_load_tillz_f32(ip, len);
    #else
        npyv_f32 in = npyv_loadn_tillz_f32(ip, ssrc, len);
    #endif
        npyv_f32 out = npyv_ceil_f32(in);
        npyv_store_till_f32(op, len, out);
    }

    npyv_cleanup();
}

#line 112
/**
 * SIMD kernel: applies npyv_ceil_f32 to `len` f32 elements read from
 * `_src` with contiguous vector loads.
 *
 * The store flavour is picked at preprocessing time: a contiguous
 * npyv_store_f32 when `NCONTIG == CONTIG` holds, otherwise the strided
 * npyv_storen_f32 driven by `sdst` (stride counted in f32 elements).
 * NOTE(review): presumably NCONTIG != CONTIG so the strided store is the
 * one compiled - confirm at the macro definitions.
 */
static void simd_FLOAT_ceil_CONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *ip = _src;
    npyv_lanetype_f32 *op = _dst;
    const int lanes = npyv_nlanes_f32;  // elements per vector
    const int block = lanes * 2;        // elements per unrolled pass

    // Unrolled by two: both vectors are loaded and rounded before the
    // first store is issued.
    while (len >= block) {
        npyv_f32 in0  = npyv_load_f32(ip + lanes*0);
        npyv_f32 out0 = npyv_ceil_f32(in0);
        npyv_f32 in1  = npyv_load_f32(ip + lanes*1);
        npyv_f32 out1 = npyv_ceil_f32(in1);

    #if NCONTIG == CONTIG
        npyv_store_f32(op + lanes*0, out0);
        npyv_store_f32(op + lanes*1, out1);
    #else
        npyv_storen_f32(op + sdst*lanes*0, sdst, out0);
        npyv_storen_f32(op + sdst*lanes*1, sdst, out1);
    #endif

        len -= block;
        ip += ssrc*block;
        op += sdst*block;
    }

    // One full vector per pass.
    while (len >= lanes) {
        npyv_f32 in  = npyv_load_f32(ip);
        npyv_f32 out = npyv_ceil_f32(in);
    #if NCONTIG == CONTIG
        npyv_store_f32(op, out);
    #else
        npyv_storen_f32(op, sdst, out);
    #endif

        len -= lanes;
        ip += ssrc*lanes;
        op += sdst*lanes;
    }

    // Fewer than `lanes` elements remain: use the length-limited load/store.
    if (len > 0) {
        npyv_f32 in  = npyv_load_tillz_f32(ip, len);
        npyv_f32 out = npyv_ceil_f32(in);
    #if NCONTIG == CONTIG
        npyv_store_till_f32(op, len, out);
    #else
        npyv_storen_till_f32(op, sdst, len, out);
    #endif
    }

    npyv_cleanup();
}

#line 112
/**
 * SIMD kernel: applies npyv_ceil_f32 to `len` f32 elements going from
 * `_src` to `_dst`.
 *
 * Load and store flavours are both picked at preprocessing time: the
 * contiguous npyv_load_f32 / npyv_store_f32 when `NCONTIG == CONTIG`
 * holds, otherwise the strided npyv_loadn_f32 / npyv_storen_f32 driven by
 * `ssrc` / `sdst` (strides counted in f32 elements).
 * NOTE(review): presumably NCONTIG != CONTIG so the strided variants are
 * the ones compiled - confirm at the macro definitions.
 */
static void simd_FLOAT_ceil_NCONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *ip = _src;
    npyv_lanetype_f32 *op = _dst;
    const int lanes = npyv_nlanes_f32;  // elements per vector
    const int block = lanes * 2;        // elements per unrolled pass

    // Unrolled by two: both vectors are loaded and rounded before the
    // first store is issued.
    while (len >= block) {
    #if NCONTIG == CONTIG
        npyv_f32 in0  = npyv_load_f32(ip + lanes*0);
        npyv_f32 out0 = npyv_ceil_f32(in0);
        npyv_f32 in1  = npyv_load_f32(ip + lanes*1);
        npyv_f32 out1 = npyv_ceil_f32(in1);

        npyv_store_f32(op + lanes*0, out0);
        npyv_store_f32(op + lanes*1, out1);
    #else
        npyv_f32 in0  = npyv_loadn_f32(ip + ssrc*lanes*0, ssrc);
        npyv_f32 out0 = npyv_ceil_f32(in0);
        npyv_f32 in1  = npyv_loadn_f32(ip + ssrc*lanes*1, ssrc);
        npyv_f32 out1 = npyv_ceil_f32(in1);

        npyv_storen_f32(op + sdst*lanes*0, sdst, out0);
        npyv_storen_f32(op + sdst*lanes*1, sdst, out1);
    #endif

        len -= block;
        ip += ssrc*block;
        op += sdst*block;
    }

    // One full vector per pass.
    while (len >= lanes) {
    #if NCONTIG == CONTIG
        npyv_f32 in  = npyv_load_f32(ip);
        npyv_f32 out = npyv_ceil_f32(in);
        npyv_store_f32(op, out);
    #else
        npyv_f32 in  = npyv_loadn_f32(ip, ssrc);
        npyv_f32 out = npyv_ceil_f32(in);
        npyv_storen_f32(op, sdst, out);
    #endif

        len -= lanes;
        ip += ssrc*lanes;
        op += sdst*lanes;
    }

    // Fewer than `lanes` elements remain: use the length-limited load/store.
    if (len > 0) {
    #if NCONTIG == CONTIG
        npyv_f32 in  = npyv_load_tillz_f32(ip, len);
        npyv_f32 out = npyv_ceil_f32(in);
        npyv_store_till_f32(op, len, out);
    #else
        npyv_f32 in  = npyv_loadn_tillz_f32(ip, ssrc, len);
        npyv_f32 out = npyv_ceil_f32(in);
        npyv_storen_till_f32(op, sdst, len, out);
    #endif
    }

    npyv_cleanup();
}


#line 107
#line 112
/**
 * SIMD kernel: applies npyv_trunc_f32 to `len` f32 elements, reading them
 * with contiguous vector loads from `_src` and writing the results with
 * contiguous vector stores to `_dst`.
 *
 * Both pointers are advanced by their stride (`ssrc` / `sdst`, counted in
 * f32 elements) times the number of elements consumed, while the loads and
 * stores themselves address consecutive lanes.
 * NOTE(review): this presumably expects callers to pass ssrc == sdst == 1;
 * confirm against the dispatching code.
 */
static void simd_FLOAT_trunc_CONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *ip = _src;
    npyv_lanetype_f32 *op = _dst;
    const int lanes = npyv_nlanes_f32;  // elements per vector
    const int block = lanes * 4;        // elements per unrolled pass

    // Unrolled by four: every vector is loaded and truncated before the
    // first store is issued.
    while (len >= block) {
        npyv_f32 in0  = npyv_load_f32(ip + lanes*0);
        npyv_f32 out0 = npyv_trunc_f32(in0);
        npyv_f32 in1  = npyv_load_f32(ip + lanes*1);
        npyv_f32 out1 = npyv_trunc_f32(in1);
        npyv_f32 in2  = npyv_load_f32(ip + lanes*2);
        npyv_f32 out2 = npyv_trunc_f32(in2);
        npyv_f32 in3  = npyv_load_f32(ip + lanes*3);
        npyv_f32 out3 = npyv_trunc_f32(in3);

        npyv_store_f32(op + lanes*0, out0);
        npyv_store_f32(op + lanes*1, out1);
        npyv_store_f32(op + lanes*2, out2);
        npyv_store_f32(op + lanes*3, out3);

        len -= block;
        ip += ssrc*block;
        op += sdst*block;
    }

    // One full vector per pass.
    while (len >= lanes) {
        npyv_f32 in  = npyv_load_f32(ip);
        npyv_f32 out = npyv_trunc_f32(in);
        npyv_store_f32(op, out);

        len -= lanes;
        ip += ssrc*lanes;
        op += sdst*lanes;
    }

    // Fewer than `lanes` elements remain: use the length-limited load/store.
    if (len > 0) {
        npyv_f32 in  = npyv_load_tillz_f32(ip, len);
        npyv_f32 out = npyv_trunc_f32(in);
        npyv_store_till_f32(op, len, out);
    }

    npyv_cleanup();
}

#line 112
/**
 * SIMD kernel: applies npyv_trunc_f32 to `len` f32 elements and writes the
 * results to `_dst` with contiguous vector stores.
 *
 * The load flavour is picked at preprocessing time: a contiguous
 * npyv_load_f32 when `NCONTIG == CONTIG` holds, otherwise the strided
 * npyv_loadn_f32 driven by `ssrc` (stride counted in f32 elements).
 * NOTE(review): presumably NCONTIG != CONTIG so the strided load is the
 * one compiled - confirm at the macro definitions.
 */
static void simd_FLOAT_trunc_NCONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *ip = _src;
    npyv_lanetype_f32 *op = _dst;
    const int lanes = npyv_nlanes_f32;  // elements per vector
    const int block = lanes * 4;        // elements per unrolled pass

    // Unrolled by four: every vector is loaded and truncated before the
    // first store is issued.
    while (len >= block) {
    #if NCONTIG == CONTIG
        npyv_f32 in0  = npyv_load_f32(ip + lanes*0);
        npyv_f32 out0 = npyv_trunc_f32(in0);
        npyv_f32 in1  = npyv_load_f32(ip + lanes*1);
        npyv_f32 out1 = npyv_trunc_f32(in1);
        npyv_f32 in2  = npyv_load_f32(ip + lanes*2);
        npyv_f32 out2 = npyv_trunc_f32(in2);
        npyv_f32 in3  = npyv_load_f32(ip + lanes*3);
        npyv_f32 out3 = npyv_trunc_f32(in3);
    #else
        npyv_f32 in0  = npyv_loadn_f32(ip + ssrc*lanes*0, ssrc);
        npyv_f32 out0 = npyv_trunc_f32(in0);
        npyv_f32 in1  = npyv_loadn_f32(ip + ssrc*lanes*1, ssrc);
        npyv_f32 out1 = npyv_trunc_f32(in1);
        npyv_f32 in2  = npyv_loadn_f32(ip + ssrc*lanes*2, ssrc);
        npyv_f32 out2 = npyv_trunc_f32(in2);
        npyv_f32 in3  = npyv_loadn_f32(ip + ssrc*lanes*3, ssrc);
        npyv_f32 out3 = npyv_trunc_f32(in3);
    #endif

        npyv_store_f32(op + lanes*0, out0);
        npyv_store_f32(op + lanes*1, out1);
        npyv_store_f32(op + lanes*2, out2);
        npyv_store_f32(op + lanes*3, out3);

        len -= block;
        ip += ssrc*block;
        op += sdst*block;
    }

    // One full vector per pass.
    while (len >= lanes) {
    #if NCONTIG == CONTIG
        npyv_f32 in = npyv_load_f32(ip);
    #else
        npyv_f32 in = npyv_loadn_f32(ip, ssrc);
    #endif
        npyv_f32 out = npyv_trunc_f32(in);
        npyv_store_f32(op, out);

        len -= lanes;
        ip += ssrc*lanes;
        op += sdst*lanes;
    }

    // Fewer than `lanes` elements remain: use the length-limited load/store.
    if (len > 0) {
    #if NCONTIG == CONTIG
        npyv_f32 in = npyv_load_tillz_f32(ip, len);
    #else
        npyv_f32 in = npyv_loadn_tillz_f32(ip, ssrc, len);
    #endif
        npyv_f32 out = npyv_trunc_f32(in);
        npyv_store_till_f32(op, len, out);
    }

    npyv_cleanup();
}

#line 112
/**
 * SIMD kernel: applies npyv_trunc_f32 to `len` f32 elements read from
 * `_src` with contiguous vector loads.
 *
 * The store flavour is picked at preprocessing time: a contiguous
 * npyv_store_f32 when `NCONTIG == CONTIG` holds, otherwise the strided
 * npyv_storen_f32 driven by `sdst` (stride counted in f32 elements).
 * NOTE(review): presumably NCONTIG != CONTIG so the strided store is the
 * one compiled - confirm at the macro definitions.
 */
static void simd_FLOAT_trunc_CONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *ip = _src;
    npyv_lanetype_f32 *op = _dst;
    const int lanes = npyv_nlanes_f32;  // elements per vector
    const int block = lanes * 2;        // elements per unrolled pass

    // Unrolled by two: both vectors are loaded and truncated before the
    // first store is issued.
    while (len >= block) {
        npyv_f32 in0  = npyv_load_f32(ip + lanes*0);
        npyv_f32 out0 = npyv_trunc_f32(in0);
        npyv_f32 in1  = npyv_load_f32(ip + lanes*1);
        npyv_f32 out1 = npyv_trunc_f32(in1);

    #if NCONTIG == CONTIG
        npyv_store_f32(op + lanes*0, out0);
        npyv_store_f32(op + lanes*1, out1);
    #else
        npyv_storen_f32(op + sdst*lanes*0, sdst, out0);
        npyv_storen_f32(op + sdst*lanes*1, sdst, out1);
    #endif

        len -= block;
        ip += ssrc*block;
        op += sdst*block;
    }

    // One full vector per pass.
    while (len >= lanes) {
        npyv_f32 in  = npyv_load_f32(ip);
        npyv_f32 out = npyv_trunc_f32(in);
    #if NCONTIG == CONTIG
        npyv_store_f32(op, out);
    #else
        npyv_storen_f32(op, sdst, out);
    #endif

        len -= lanes;
        ip += ssrc*lanes;
        op += sdst*lanes;
    }

    // Fewer than `lanes` elements remain: use the length-limited load/store.
    if (len > 0) {
        npyv_f32 in  = npyv_load_tillz_f32(ip, len);
        npyv_f32 out = npyv_trunc_f32(in);
    #if NCONTIG == CONTIG
        npyv_store_till_f32(op, len, out);
    #else
        npyv_storen_till_f32(op, sdst, len, out);
    #endif
    }

    npyv_cleanup();
}

#line 112
/**
 * SIMD kernel: applies npyv_trunc_f32 to `len` f32 elements going from
 * `_src` to `_dst`.
 *
 * Load and store flavours are both picked at preprocessing time: the
 * contiguous npyv_load_f32 / npyv_store_f32 when `NCONTIG == CONTIG`
 * holds, otherwise the strided npyv_loadn_f32 / npyv_storen_f32 driven by
 * `ssrc` / `sdst` (strides counted in f32 elements).
 * NOTE(review): presumably NCONTIG != CONTIG so the strided variants are
 * the ones compiled - confirm at the macro definitions.
 */
static void simd_FLOAT_trunc_NCONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *ip = _src;
    npyv_lanetype_f32 *op = _dst;
    const int lanes = npyv_nlanes_f32;  // elements per vector
    const int block = lanes * 2;        // elements per unrolled pass

    // Unrolled by two: both vectors are loaded and truncated before the
    // first store is issued.
    while (len >= block) {
    #if NCONTIG == CONTIG
        npyv_f32 in0  = npyv_load_f32(ip + lanes*0);
        npyv_f32 out0 = npyv_trunc_f32(in0);
        npyv_f32 in1  = npyv_load_f32(ip + lanes*1);
        npyv_f32 out1 = npyv_trunc_f32(in1);

        npyv_store_f32(op + lanes*0, out0);
        npyv_store_f32(op + lanes*1, out1);
    #else
        npyv_f32 in0  = npyv_loadn_f32(ip + ssrc*lanes*0, ssrc);
        npyv_f32 out0 = npyv_trunc_f32(in0);
        npyv_f32 in1  = npyv_loadn_f32(ip + ssrc*lanes*1, ssrc);
        npyv_f32 out1 = npyv_trunc_f32(in1);

        npyv_storen_f32(op + sdst*lanes*0, sdst, out0);
        npyv_storen_f32(op + sdst*lanes*1, sdst, out1);
    #endif

        len -= block;
        ip += ssrc*block;
        op += sdst*block;
    }

    // One full vector per pass.
    while (len >= lanes) {
    #if NCONTIG == CONTIG
        npyv_f32 in  = npyv_load_f32(ip);
        npyv_f32 out = npyv_trunc_f32(in);
        npyv_store_f32(op, out);
    #else
        npyv_f32 in  = npyv_loadn_f32(ip, ssrc);
        npyv_f32 out = npyv_trunc_f32(in);
        npyv_storen_f32(op, sdst, out);
    #endif

        len -= lanes;
        ip += ssrc*lanes;
        op += sdst*lanes;
    }

    // Fewer than `lanes` elements remain: use the length-limited load/store.
    if (len > 0) {
    #if NCONTIG == CONTIG
        npyv_f32 in  = npyv_load_tillz_f32(ip, len);
        npyv_f32 out = npyv_trunc_f32(in);
        npyv_store_till_f32(op, len, out);
    #else
        npyv_f32 in  = npyv_loadn_tillz_f32(ip, ssrc, len);
        npyv_f32 out = npyv_trunc_f32(in);
        npyv_storen_till_f32(op, sdst, len, out);
    #endif
    }

    npyv_cleanup();
}


#line 107
#line 112
static void simd_FLOAT_sqrt_CONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    /*
     * Square root of `len` f32 elements, read from `_src` and written to `_dst`.
     * Every access is a contiguous vector load/store; after each pass the
     * pointers advance by `ssrc` / `sdst` elements per processed lane.
     */
    const npyv_lanetype_f32 *ip = _src;
          npyv_lanetype_f32 *op = _dst;

    const int lanes = npyv_nlanes_f32;   // elements handled by one vector
    const int chunk = lanes * 4;         // elements handled by one unrolled pass

    // Unrolled passes: four vectors each; all loads/sqrt's precede the stores.
    while (len >= chunk) {
        npyv_f32 a0 = npyv_load_f32(ip + lanes*0);
        npyv_f32 r0 = npyv_sqrt_f32(a0);
        npyv_f32 a1 = npyv_load_f32(ip + lanes*1);
        npyv_f32 r1 = npyv_sqrt_f32(a1);
        npyv_f32 a2 = npyv_load_f32(ip + lanes*2);
        npyv_f32 r2 = npyv_sqrt_f32(a2);
        npyv_f32 a3 = npyv_load_f32(ip + lanes*3);
        npyv_f32 r3 = npyv_sqrt_f32(a3);

        npyv_store_f32(op + lanes*0, r0);
        npyv_store_f32(op + lanes*1, r1);
        npyv_store_f32(op + lanes*2, r2);
        npyv_store_f32(op + lanes*3, r3);

        len -= chunk;
        ip  += ssrc*chunk;
        op  += sdst*chunk;
    }

    // Remaining whole vectors, one per pass.
    while (len >= lanes) {
        npyv_f32 a = npyv_load_f32(ip);
        npyv_f32 r = npyv_sqrt_f32(a);
        npyv_store_f32(op, r);

        len -= lanes;
        ip  += ssrc*lanes;
        op  += sdst*lanes;
    }

    // Leftover of fewer than `lanes` elements: partial load and partial store.
    // NOTE(review): the `tillz` load presumably zero-fills the unused lanes - confirm in simd.h.
    if (len > 0) {
        npyv_f32 a = npyv_load_tillz_f32(ip, len);
        npyv_f32 r = npyv_sqrt_f32(a);
        npyv_store_till_f32(op, len, r);
    }

    npyv_cleanup();
}

#line 112
static void simd_FLOAT_sqrt_NCONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    /*
     * Square root of `len` f32 elements, read from `_src` and written to `_dst`.
     * Source access is picked by the preprocessor: a strided load with stride
     * `ssrc` when NCONTIG differs from CONTIG (both defined outside this
     * function), otherwise a contiguous load. Stores are always contiguous.
     */
    const npyv_lanetype_f32 *ip = _src;
          npyv_lanetype_f32 *op = _dst;

    const int lanes = npyv_nlanes_f32;   // elements handled by one vector
    const int chunk = lanes * 4;         // elements handled by one unrolled pass

    // Unrolled passes: four vectors each; all loads/sqrt's precede the stores.
    while (len >= chunk) {
    #if NCONTIG != CONTIG
        npyv_f32 a0 = npyv_loadn_f32(ip + ssrc*lanes*0, ssrc);
    #else
        npyv_f32 a0 = npyv_load_f32(ip + lanes*0);
    #endif
        npyv_f32 r0 = npyv_sqrt_f32(a0);
    #if NCONTIG != CONTIG
        npyv_f32 a1 = npyv_loadn_f32(ip + ssrc*lanes*1, ssrc);
    #else
        npyv_f32 a1 = npyv_load_f32(ip + lanes*1);
    #endif
        npyv_f32 r1 = npyv_sqrt_f32(a1);
    #if NCONTIG != CONTIG
        npyv_f32 a2 = npyv_loadn_f32(ip + ssrc*lanes*2, ssrc);
    #else
        npyv_f32 a2 = npyv_load_f32(ip + lanes*2);
    #endif
        npyv_f32 r2 = npyv_sqrt_f32(a2);
    #if NCONTIG != CONTIG
        npyv_f32 a3 = npyv_loadn_f32(ip + ssrc*lanes*3, ssrc);
    #else
        npyv_f32 a3 = npyv_load_f32(ip + lanes*3);
    #endif
        npyv_f32 r3 = npyv_sqrt_f32(a3);

        npyv_store_f32(op + lanes*0, r0);
        npyv_store_f32(op + lanes*1, r1);
        npyv_store_f32(op + lanes*2, r2);
        npyv_store_f32(op + lanes*3, r3);

        len -= chunk;
        ip  += ssrc*chunk;
        op  += sdst*chunk;
    }

    // Remaining whole vectors, one per pass.
    while (len >= lanes) {
    #if NCONTIG != CONTIG
        npyv_f32 a = npyv_loadn_f32(ip, ssrc);
    #else
        npyv_f32 a = npyv_load_f32(ip);
    #endif
        npyv_f32 r = npyv_sqrt_f32(a);
        npyv_store_f32(op, r);

        len -= lanes;
        ip  += ssrc*lanes;
        op  += sdst*lanes;
    }

    // Leftover of fewer than `lanes` elements: partial load and partial store.
    // NOTE(review): the `tillz` loads presumably zero-fill the unused lanes - confirm in simd.h.
    if (len > 0) {
    #if NCONTIG != CONTIG
        npyv_f32 a = npyv_loadn_tillz_f32(ip, ssrc, len);
    #else
        npyv_f32 a = npyv_load_tillz_f32(ip, len);
    #endif
        npyv_f32 r = npyv_sqrt_f32(a);
        npyv_store_till_f32(op, len, r);
    }

    npyv_cleanup();
}

#line 112
static void simd_FLOAT_sqrt_CONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    /*
     * Square root of `len` f32 elements, read from `_src` and written to `_dst`.
     * Loads are always contiguous. Destination access is picked by the
     * preprocessor: a strided store with stride `sdst` when NCONTIG differs
     * from CONTIG (both defined outside this function), otherwise a
     * contiguous store.
     */
    const npyv_lanetype_f32 *ip = _src;
          npyv_lanetype_f32 *op = _dst;

    const int lanes = npyv_nlanes_f32;   // elements handled by one vector
    const int chunk = lanes * 2;         // elements handled by one unrolled pass

    // Unrolled passes: two vectors each; both loads/sqrt's precede the stores.
    while (len >= chunk) {
        npyv_f32 a0 = npyv_load_f32(ip + lanes*0);
        npyv_f32 r0 = npyv_sqrt_f32(a0);
        npyv_f32 a1 = npyv_load_f32(ip + lanes*1);
        npyv_f32 r1 = npyv_sqrt_f32(a1);

    #if NCONTIG != CONTIG
        npyv_storen_f32(op + sdst*lanes*0, sdst, r0);
    #else
        npyv_store_f32(op + lanes*0, r0);
    #endif
    #if NCONTIG != CONTIG
        npyv_storen_f32(op + sdst*lanes*1, sdst, r1);
    #else
        npyv_store_f32(op + lanes*1, r1);
    #endif

        len -= chunk;
        ip  += ssrc*chunk;
        op  += sdst*chunk;
    }

    // Remaining whole vectors, one per pass.
    while (len >= lanes) {
        npyv_f32 a = npyv_load_f32(ip);
        npyv_f32 r = npyv_sqrt_f32(a);
    #if NCONTIG != CONTIG
        npyv_storen_f32(op, sdst, r);
    #else
        npyv_store_f32(op, r);
    #endif

        len -= lanes;
        ip  += ssrc*lanes;
        op  += sdst*lanes;
    }

    // Leftover of fewer than `lanes` elements: partial load and partial store.
    // NOTE(review): the `tillz` load presumably zero-fills the unused lanes - confirm in simd.h.
    if (len > 0) {
        npyv_f32 a = npyv_load_tillz_f32(ip, len);
        npyv_f32 r = npyv_sqrt_f32(a);
    #if NCONTIG != CONTIG
        npyv_storen_till_f32(op, sdst, len, r);
    #else
        npyv_store_till_f32(op, len, r);
    #endif
    }

    npyv_cleanup();
}

#line 112
static void simd_FLOAT_sqrt_NCONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    /*
     * Square root of `len` f32 elements, read from `_src` and written to `_dst`.
     * Both access kinds are picked by the preprocessor: strided load/store
     * (strides `ssrc` / `sdst`) when NCONTIG differs from CONTIG (both defined
     * outside this function), otherwise contiguous load/store.
     */
    const npyv_lanetype_f32 *ip = _src;
          npyv_lanetype_f32 *op = _dst;

    const int lanes = npyv_nlanes_f32;   // elements handled by one vector
    const int chunk = lanes * 2;         // elements handled by one unrolled pass

    // Unrolled passes: two vectors each; both loads/sqrt's precede the stores.
    while (len >= chunk) {
    #if NCONTIG != CONTIG
        npyv_f32 a0 = npyv_loadn_f32(ip + ssrc*lanes*0, ssrc);
    #else
        npyv_f32 a0 = npyv_load_f32(ip + lanes*0);
    #endif
        npyv_f32 r0 = npyv_sqrt_f32(a0);
    #if NCONTIG != CONTIG
        npyv_f32 a1 = npyv_loadn_f32(ip + ssrc*lanes*1, ssrc);
    #else
        npyv_f32 a1 = npyv_load_f32(ip + lanes*1);
    #endif
        npyv_f32 r1 = npyv_sqrt_f32(a1);

    #if NCONTIG != CONTIG
        npyv_storen_f32(op + sdst*lanes*0, sdst, r0);
    #else
        npyv_store_f32(op + lanes*0, r0);
    #endif
    #if NCONTIG != CONTIG
        npyv_storen_f32(op + sdst*lanes*1, sdst, r1);
    #else
        npyv_store_f32(op + lanes*1, r1);
    #endif

        len -= chunk;
        ip  += ssrc*chunk;
        op  += sdst*chunk;
    }

    // Remaining whole vectors, one per pass.
    while (len >= lanes) {
    #if NCONTIG != CONTIG
        npyv_f32 a = npyv_loadn_f32(ip, ssrc);
    #else
        npyv_f32 a = npyv_load_f32(ip);
    #endif
        npyv_f32 r = npyv_sqrt_f32(a);
    #if NCONTIG != CONTIG
        npyv_storen_f32(op, sdst, r);
    #else
        npyv_store_f32(op, r);
    #endif

        len -= lanes;
        ip  += ssrc*lanes;
        op  += sdst*lanes;
    }

    // Leftover of fewer than `lanes` elements: partial load and partial store.
    // NOTE(review): the `tillz` loads presumably zero-fill the unused lanes - confirm in simd.h.
    if (len > 0) {
    #if NCONTIG != CONTIG
        npyv_f32 a = npyv_loadn_tillz_f32(ip, ssrc, len);
    #else
        npyv_f32 a = npyv_load_tillz_f32(ip, len);
    #endif
        npyv_f32 r = npyv_sqrt_f32(a);
    #if NCONTIG != CONTIG
        npyv_storen_till_f32(op, sdst, len, r);
    #else
        npyv_store_till_f32(op, len, r);
    #endif
    }

    npyv_cleanup();
}


#line 107
#line 112
static void simd_FLOAT_absolute_CONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    /*
     * Absolute value of `len` f32 elements, read from `_src` and written to
     * `_dst`. Every access is a contiguous vector load/store; after each pass
     * the pointers advance by `ssrc` / `sdst` elements per processed lane.
     */
    const npyv_lanetype_f32 *ip = _src;
          npyv_lanetype_f32 *op = _dst;

    const int lanes = npyv_nlanes_f32;   // elements handled by one vector
    const int chunk = lanes * 4;         // elements handled by one unrolled pass

    // Unrolled passes: four vectors each; all loads/abs's precede the stores.
    while (len >= chunk) {
        npyv_f32 a0 = npyv_load_f32(ip + lanes*0);
        npyv_f32 r0 = npyv_abs_f32(a0);
        npyv_f32 a1 = npyv_load_f32(ip + lanes*1);
        npyv_f32 r1 = npyv_abs_f32(a1);
        npyv_f32 a2 = npyv_load_f32(ip + lanes*2);
        npyv_f32 r2 = npyv_abs_f32(a2);
        npyv_f32 a3 = npyv_load_f32(ip + lanes*3);
        npyv_f32 r3 = npyv_abs_f32(a3);

        npyv_store_f32(op + lanes*0, r0);
        npyv_store_f32(op + lanes*1, r1);
        npyv_store_f32(op + lanes*2, r2);
        npyv_store_f32(op + lanes*3, r3);

        len -= chunk;
        ip  += ssrc*chunk;
        op  += sdst*chunk;
    }

    // Remaining whole vectors, one per pass.
    while (len >= lanes) {
        npyv_f32 a = npyv_load_f32(ip);
        npyv_f32 r = npyv_abs_f32(a);
        npyv_store_f32(op, r);

        len -= lanes;
        ip  += ssrc*lanes;
        op  += sdst*lanes;
    }

    // Leftover of fewer than `lanes` elements: partial load and partial store.
    // NOTE(review): the `tillz` load presumably zero-fills the unused lanes - confirm in simd.h.
    if (len > 0) {
        npyv_f32 a = npyv_load_tillz_f32(ip, len);
        npyv_f32 r = npyv_abs_f32(a);
        npyv_store_till_f32(op, len, r);
    }

    npyv_cleanup();
}

#line 112
static void simd_FLOAT_absolute_NCONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    /*
     * Absolute value of `len` f32 elements, read from `_src` and written to
     * `_dst`. Source access is picked by the preprocessor: a strided load with
     * stride `ssrc` when NCONTIG differs from CONTIG (both defined outside
     * this function), otherwise a contiguous load. Stores are always
     * contiguous.
     */
    const npyv_lanetype_f32 *ip = _src;
          npyv_lanetype_f32 *op = _dst;

    const int lanes = npyv_nlanes_f32;   // elements handled by one vector
    const int chunk = lanes * 4;         // elements handled by one unrolled pass

    // Unrolled passes: four vectors each; all loads/abs's precede the stores.
    while (len >= chunk) {
    #if NCONTIG != CONTIG
        npyv_f32 a0 = npyv_loadn_f32(ip + ssrc*lanes*0, ssrc);
    #else
        npyv_f32 a0 = npyv_load_f32(ip + lanes*0);
    #endif
        npyv_f32 r0 = npyv_abs_f32(a0);
    #if NCONTIG != CONTIG
        npyv_f32 a1 = npyv_loadn_f32(ip + ssrc*lanes*1, ssrc);
    #else
        npyv_f32 a1 = npyv_load_f32(ip + lanes*1);
    #endif
        npyv_f32 r1 = npyv_abs_f32(a1);
    #if NCONTIG != CONTIG
        npyv_f32 a2 = npyv_loadn_f32(ip + ssrc*lanes*2, ssrc);
    #else
        npyv_f32 a2 = npyv_load_f32(ip + lanes*2);
    #endif
        npyv_f32 r2 = npyv_abs_f32(a2);
    #if NCONTIG != CONTIG
        npyv_f32 a3 = npyv_loadn_f32(ip + ssrc*lanes*3, ssrc);
    #else
        npyv_f32 a3 = npyv_load_f32(ip + lanes*3);
    #endif
        npyv_f32 r3 = npyv_abs_f32(a3);

        npyv_store_f32(op + lanes*0, r0);
        npyv_store_f32(op + lanes*1, r1);
        npyv_store_f32(op + lanes*2, r2);
        npyv_store_f32(op + lanes*3, r3);

        len -= chunk;
        ip  += ssrc*chunk;
        op  += sdst*chunk;
    }

    // Remaining whole vectors, one per pass.
    while (len >= lanes) {
    #if NCONTIG != CONTIG
        npyv_f32 a = npyv_loadn_f32(ip, ssrc);
    #else
        npyv_f32 a = npyv_load_f32(ip);
    #endif
        npyv_f32 r = npyv_abs_f32(a);
        npyv_store_f32(op, r);

        len -= lanes;
        ip  += ssrc*lanes;
        op  += sdst*lanes;
    }

    // Leftover of fewer than `lanes` elements: partial load and partial store.
    // NOTE(review): the `tillz` loads presumably zero-fill the unused lanes - confirm in simd.h.
    if (len > 0) {
    #if NCONTIG != CONTIG
        npyv_f32 a = npyv_loadn_tillz_f32(ip, ssrc, len);
    #else
        npyv_f32 a = npyv_load_tillz_f32(ip, len);
    #endif
        npyv_f32 r = npyv_abs_f32(a);
        npyv_store_till_f32(op, len, r);
    }

    npyv_cleanup();
}

#line 112
static void simd_FLOAT_absolute_CONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    /*
     * Absolute value of `len` f32 elements, read from `_src` and written to
     * `_dst`. Loads are always contiguous. Destination access is picked by the
     * preprocessor: a strided store with stride `sdst` when NCONTIG differs
     * from CONTIG (both defined outside this function), otherwise a
     * contiguous store.
     */
    const npyv_lanetype_f32 *ip = _src;
          npyv_lanetype_f32 *op = _dst;

    const int lanes = npyv_nlanes_f32;   // elements handled by one vector
    const int chunk = lanes * 2;         // elements handled by one unrolled pass

    // Unrolled passes: two vectors each; both loads/abs's precede the stores.
    while (len >= chunk) {
        npyv_f32 a0 = npyv_load_f32(ip + lanes*0);
        npyv_f32 r0 = npyv_abs_f32(a0);
        npyv_f32 a1 = npyv_load_f32(ip + lanes*1);
        npyv_f32 r1 = npyv_abs_f32(a1);

    #if NCONTIG != CONTIG
        npyv_storen_f32(op + sdst*lanes*0, sdst, r0);
    #else
        npyv_store_f32(op + lanes*0, r0);
    #endif
    #if NCONTIG != CONTIG
        npyv_storen_f32(op + sdst*lanes*1, sdst, r1);
    #else
        npyv_store_f32(op + lanes*1, r1);
    #endif

        len -= chunk;
        ip  += ssrc*chunk;
        op  += sdst*chunk;
    }

    // Remaining whole vectors, one per pass.
    while (len >= lanes) {
        npyv_f32 a = npyv_load_f32(ip);
        npyv_f32 r = npyv_abs_f32(a);
    #if NCONTIG != CONTIG
        npyv_storen_f32(op, sdst, r);
    #else
        npyv_store_f32(op, r);
    #endif

        len -= lanes;
        ip  += ssrc*lanes;
        op  += sdst*lanes;
    }

    // Leftover of fewer than `lanes` elements: partial load and partial store.
    // NOTE(review): the `tillz` load presumably zero-fills the unused lanes - confirm in simd.h.
    if (len > 0) {
        npyv_f32 a = npyv_load_tillz_f32(ip, len);
        npyv_f32 r = npyv_abs_f32(a);
    #if NCONTIG != CONTIG
        npyv_storen_till_f32(op, sdst, len, r);
    #else
        npyv_store_till_f32(op, len, r);
    #endif
    }

    npyv_cleanup();
}

#line 112
static void simd_FLOAT_absolute_NCONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    /*
     * Absolute value of `len` f32 elements, read from `_src` and written to
     * `_dst`. Both access kinds are picked by the preprocessor: strided
     * load/store (strides `ssrc` / `sdst`) when NCONTIG differs from CONTIG
     * (both defined outside this function), otherwise contiguous load/store.
     */
    const npyv_lanetype_f32 *ip = _src;
          npyv_lanetype_f32 *op = _dst;

    const int lanes = npyv_nlanes_f32;   // elements handled by one vector
    const int chunk = lanes * 2;         // elements handled by one unrolled pass

    // Unrolled passes: two vectors each; both loads/abs's precede the stores.
    while (len >= chunk) {
    #if NCONTIG != CONTIG
        npyv_f32 a0 = npyv_loadn_f32(ip + ssrc*lanes*0, ssrc);
    #else
        npyv_f32 a0 = npyv_load_f32(ip + lanes*0);
    #endif
        npyv_f32 r0 = npyv_abs_f32(a0);
    #if NCONTIG != CONTIG
        npyv_f32 a1 = npyv_loadn_f32(ip + ssrc*lanes*1, ssrc);
    #else
        npyv_f32 a1 = npyv_load_f32(ip + lanes*1);
    #endif
        npyv_f32 r1 = npyv_abs_f32(a1);

    #if NCONTIG != CONTIG
        npyv_storen_f32(op + sdst*lanes*0, sdst, r0);
    #else
        npyv_store_f32(op + lanes*0, r0);
    #endif
    #if NCONTIG != CONTIG
        npyv_storen_f32(op + sdst*lanes*1, sdst, r1);
    #else
        npyv_store_f32(op + lanes*1, r1);
    #endif

        len -= chunk;
        ip  += ssrc*chunk;
        op  += sdst*chunk;
    }

    // Remaining whole vectors, one per pass.
    while (len >= lanes) {
    #if NCONTIG != CONTIG
        npyv_f32 a = npyv_loadn_f32(ip, ssrc);
    #else
        npyv_f32 a = npyv_load_f32(ip);
    #endif
        npyv_f32 r = npyv_abs_f32(a);
    #if NCONTIG != CONTIG
        npyv_storen_f32(op, sdst, r);
    #else
        npyv_store_f32(op, r);
    #endif

        len -= lanes;
        ip  += ssrc*lanes;
        op  += sdst*lanes;
    }

    // Leftover of fewer than `lanes` elements: partial load and partial store.
    // NOTE(review): the `tillz` loads presumably zero-fill the unused lanes - confirm in simd.h.
    if (len > 0) {
    #if NCONTIG != CONTIG
        npyv_f32 a = npyv_loadn_tillz_f32(ip, ssrc, len);
    #else
        npyv_f32 a = npyv_load_tillz_f32(ip, len);
    #endif
        npyv_f32 r = npyv_abs_f32(a);
    #if NCONTIG != CONTIG
        npyv_storen_till_f32(op, sdst, len, r);
    #else
        npyv_store_till_f32(op, len, r);
    #endif
    }

    npyv_cleanup();
}


#line 107
#line 112
/**
 * Apply npyv_square_f32 to `len` f32 lanes read from `_src` and write the
 * results to `_dst`, using unit-stride vector loads and stores.
 *
 * The pointers are nevertheless advanced by `ssrc`/`sdst` times the step
 * (counted in lanes, not bytes), so the accesses only line up when both
 * strides are 1 -- NOTE(review): presumably guaranteed by the caller; confirm.
 */
static void simd_FLOAT_square_CONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *in = _src;
    npyv_lanetype_f32 *out = _dst;

    const int lanes = npyv_nlanes_f32;  // lanes held by one vector
    const int block = lanes * 4;        // lanes consumed by one unrolled pass

    // x4 unrolled pass: all loads, then all squares, then all stores
    while (len >= block) {
        npyv_f32 a0 = npyv_load_f32(in);
        npyv_f32 a1 = npyv_load_f32(in + lanes);
        npyv_f32 a2 = npyv_load_f32(in + lanes*2);
        npyv_f32 a3 = npyv_load_f32(in + lanes*3);

        npyv_f32 r0 = npyv_square_f32(a0);
        npyv_f32 r1 = npyv_square_f32(a1);
        npyv_f32 r2 = npyv_square_f32(a2);
        npyv_f32 r3 = npyv_square_f32(a3);

        npyv_store_f32(out, r0);
        npyv_store_f32(out + lanes, r1);
        npyv_store_f32(out + lanes*2, r2);
        npyv_store_f32(out + lanes*3, r3);

        len -= block;
        in  += ssrc*block;
        out += sdst*block;
    }

    // one full vector at a time
    while (len >= lanes) {
        npyv_f32 a = npyv_load_f32(in);
        npyv_f32 r = npyv_square_f32(a);
        npyv_store_f32(out, r);

        len -= lanes;
        in  += ssrc*lanes;
        out += sdst*lanes;
    }

    // remaining 0 < len < lanes: partial load/store limited to `len` lanes
    // (the `tillz` load presumably zero-fills the unused lanes -- confirm)
    if (len > 0) {
        npyv_f32 a = npyv_load_tillz_f32(in, len);
        npyv_f32 r = npyv_square_f32(a);
        npyv_store_till_f32(out, len, r);
    }

    npyv_cleanup();
}

#line 112
/**
 * Apply npyv_square_f32 to `len` f32 lanes read from `_src` with stride
 * `ssrc` and write the results to `_dst` with unit-stride stores.
 *
 * Strides are counted in lanes, not bytes. The strided loads are selected by
 * `NCONTIG != CONTIG`; both macros are defined outside this block
 * (NOTE(review): presumably to distinct values -- confirm). The destination
 * pointer is still advanced by `sdst` times the step, so `sdst` is expected
 * to be 1 -- NOTE(review): confirm against the caller.
 */
static void simd_FLOAT_square_NCONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *in = _src;
    npyv_lanetype_f32 *out = _dst;

    const int lanes = npyv_nlanes_f32;  // lanes held by one vector
    const int block = lanes * 4;        // lanes consumed by one unrolled pass

    // x4 unrolled pass: all loads, then all squares, then all stores
    while (len >= block) {
    #if NCONTIG != CONTIG
        npyv_f32 a0 = npyv_loadn_f32(in, ssrc);
        npyv_f32 a1 = npyv_loadn_f32(in + ssrc*lanes, ssrc);
        npyv_f32 a2 = npyv_loadn_f32(in + ssrc*lanes*2, ssrc);
        npyv_f32 a3 = npyv_loadn_f32(in + ssrc*lanes*3, ssrc);
    #else
        // kept for the configuration where NCONTIG compares equal to CONTIG
        npyv_f32 a0 = npyv_load_f32(in);
        npyv_f32 a1 = npyv_load_f32(in + lanes);
        npyv_f32 a2 = npyv_load_f32(in + lanes*2);
        npyv_f32 a3 = npyv_load_f32(in + lanes*3);
    #endif

        npyv_f32 r0 = npyv_square_f32(a0);
        npyv_f32 r1 = npyv_square_f32(a1);
        npyv_f32 r2 = npyv_square_f32(a2);
        npyv_f32 r3 = npyv_square_f32(a3);

        npyv_store_f32(out, r0);
        npyv_store_f32(out + lanes, r1);
        npyv_store_f32(out + lanes*2, r2);
        npyv_store_f32(out + lanes*3, r3);

        len -= block;
        in  += ssrc*block;
        out += sdst*block;
    }

    // one full vector at a time
    while (len >= lanes) {
    #if NCONTIG != CONTIG
        npyv_f32 a = npyv_loadn_f32(in, ssrc);
    #else
        npyv_f32 a = npyv_load_f32(in);
    #endif
        npyv_f32 r = npyv_square_f32(a);
        npyv_store_f32(out, r);

        len -= lanes;
        in  += ssrc*lanes;
        out += sdst*lanes;
    }

    // remaining 0 < len < lanes: partial load/store limited to `len` lanes
    // (the `tillz` load presumably zero-fills the unused lanes -- confirm)
    if (len > 0) {
    #if NCONTIG != CONTIG
        npyv_f32 a = npyv_loadn_tillz_f32(in, ssrc, len);
    #else
        npyv_f32 a = npyv_load_tillz_f32(in, len);
    #endif
        npyv_f32 r = npyv_square_f32(a);
        npyv_store_till_f32(out, len, r);
    }

    npyv_cleanup();
}

#line 112
/**
 * Apply npyv_square_f32 to `len` f32 lanes read from `_src` with unit-stride
 * loads and write the results to `_dst` with stride `sdst`.
 *
 * Strides are counted in lanes, not bytes. The strided stores are selected by
 * `NCONTIG != CONTIG`; both macros are defined outside this block
 * (NOTE(review): presumably to distinct values -- confirm). The source
 * pointer is still advanced by `ssrc` times the step, so `ssrc` is expected
 * to be 1 -- NOTE(review): confirm against the caller.
 */
static void simd_FLOAT_square_CONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *in = _src;
    npyv_lanetype_f32 *out = _dst;

    const int lanes = npyv_nlanes_f32;  // lanes held by one vector
    const int block = lanes * 2;        // lanes consumed by one unrolled pass

    // x2 unrolled pass: both loads, then both squares, then both stores
    while (len >= block) {
        npyv_f32 a0 = npyv_load_f32(in);
        npyv_f32 a1 = npyv_load_f32(in + lanes);

        npyv_f32 r0 = npyv_square_f32(a0);
        npyv_f32 r1 = npyv_square_f32(a1);

    #if NCONTIG != CONTIG
        npyv_storen_f32(out, sdst, r0);
        npyv_storen_f32(out + sdst*lanes, sdst, r1);
    #else
        // kept for the configuration where NCONTIG compares equal to CONTIG
        npyv_store_f32(out, r0);
        npyv_store_f32(out + lanes, r1);
    #endif

        len -= block;
        in  += ssrc*block;
        out += sdst*block;
    }

    // one full vector at a time
    while (len >= lanes) {
        npyv_f32 a = npyv_load_f32(in);
        npyv_f32 r = npyv_square_f32(a);
    #if NCONTIG != CONTIG
        npyv_storen_f32(out, sdst, r);
    #else
        npyv_store_f32(out, r);
    #endif

        len -= lanes;
        in  += ssrc*lanes;
        out += sdst*lanes;
    }

    // remaining 0 < len < lanes: partial load/store limited to `len` lanes
    // (the `tillz` load presumably zero-fills the unused lanes -- confirm)
    if (len > 0) {
        npyv_f32 a = npyv_load_tillz_f32(in, len);
        npyv_f32 r = npyv_square_f32(a);
    #if NCONTIG != CONTIG
        npyv_storen_till_f32(out, sdst, len, r);
    #else
        npyv_store_till_f32(out, len, r);
    #endif
    }

    npyv_cleanup();
}

#line 112
/**
 * Apply npyv_square_f32 to `len` f32 lanes read from `_src` with stride
 * `ssrc` and write the results to `_dst` with stride `sdst`.
 *
 * Strides are counted in lanes, not bytes. The strided loads and stores are
 * selected by `NCONTIG != CONTIG`; both macros are defined outside this block
 * (NOTE(review): presumably to distinct values -- confirm).
 */
static void simd_FLOAT_square_NCONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *in = _src;
    npyv_lanetype_f32 *out = _dst;

    const int lanes = npyv_nlanes_f32;  // lanes held by one vector
    const int block = lanes * 2;        // lanes consumed by one unrolled pass

    // x2 unrolled pass: both loads, then both squares, then both stores
    while (len >= block) {
    #if NCONTIG != CONTIG
        npyv_f32 a0 = npyv_loadn_f32(in, ssrc);
        npyv_f32 a1 = npyv_loadn_f32(in + ssrc*lanes, ssrc);
    #else
        // kept for the configuration where NCONTIG compares equal to CONTIG
        npyv_f32 a0 = npyv_load_f32(in);
        npyv_f32 a1 = npyv_load_f32(in + lanes);
    #endif

        npyv_f32 r0 = npyv_square_f32(a0);
        npyv_f32 r1 = npyv_square_f32(a1);

    #if NCONTIG != CONTIG
        npyv_storen_f32(out, sdst, r0);
        npyv_storen_f32(out + sdst*lanes, sdst, r1);
    #else
        npyv_store_f32(out, r0);
        npyv_store_f32(out + lanes, r1);
    #endif

        len -= block;
        in  += ssrc*block;
        out += sdst*block;
    }

    // one full vector at a time
    while (len >= lanes) {
    #if NCONTIG != CONTIG
        npyv_f32 a = npyv_loadn_f32(in, ssrc);
    #else
        npyv_f32 a = npyv_load_f32(in);
    #endif
        npyv_f32 r = npyv_square_f32(a);
    #if NCONTIG != CONTIG
        npyv_storen_f32(out, sdst, r);
    #else
        npyv_store_f32(out, r);
    #endif

        len -= lanes;
        in  += ssrc*lanes;
        out += sdst*lanes;
    }

    // remaining 0 < len < lanes: partial load/store limited to `len` lanes
    // (the `tillz` load presumably zero-fills the unused lanes -- confirm)
    if (len > 0) {
    #if NCONTIG != CONTIG
        npyv_f32 a = npyv_loadn_tillz_f32(in, ssrc, len);
    #else
        npyv_f32 a = npyv_load_tillz_f32(in, len);
    #endif
        npyv_f32 r = npyv_square_f32(a);
    #if NCONTIG != CONTIG
        npyv_storen_till_f32(out, sdst, len, r);
    #else
        npyv_store_till_f32(out, len, r);
    #endif
    }

    npyv_cleanup();
}


#line 107
#line 112
/**
 * Apply npyv_recip_f32 to `len` f32 lanes read from `_src` and write the
 * results to `_dst`, using unit-stride vector loads and stores.
 *
 * The pointers are nevertheless advanced by `ssrc`/`sdst` times the step
 * (counted in lanes, not bytes), so the accesses only line up when both
 * strides are 1 -- NOTE(review): presumably guaranteed by the caller; confirm.
 */
static void simd_FLOAT_reciprocal_CONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *in = _src;
    npyv_lanetype_f32 *out = _dst;

    const int lanes = npyv_nlanes_f32;  // lanes held by one vector
    const int block = lanes * 4;        // lanes consumed by one unrolled pass

    // x4 unrolled pass: all loads, then all reciprocals, then all stores
    while (len >= block) {
        npyv_f32 a0 = npyv_load_f32(in);
        npyv_f32 a1 = npyv_load_f32(in + lanes);
        npyv_f32 a2 = npyv_load_f32(in + lanes*2);
        npyv_f32 a3 = npyv_load_f32(in + lanes*3);

        npyv_f32 r0 = npyv_recip_f32(a0);
        npyv_f32 r1 = npyv_recip_f32(a1);
        npyv_f32 r2 = npyv_recip_f32(a2);
        npyv_f32 r3 = npyv_recip_f32(a3);

        npyv_store_f32(out, r0);
        npyv_store_f32(out + lanes, r1);
        npyv_store_f32(out + lanes*2, r2);
        npyv_store_f32(out + lanes*3, r3);

        len -= block;
        in  += ssrc*block;
        out += sdst*block;
    }

    // one full vector at a time
    while (len >= lanes) {
        npyv_f32 a = npyv_load_f32(in);
        npyv_f32 r = npyv_recip_f32(a);
        npyv_store_f32(out, r);

        len -= lanes;
        in  += ssrc*lanes;
        out += sdst*lanes;
    }

    // remaining 0 < len < lanes: partial load/store limited to `len` lanes.
    // The load passes 1 as its fill argument -- presumably so the unused
    // lanes hold 1 and their reciprocal cannot divide by zero (confirm).
    if (len > 0) {
        npyv_f32 a = npyv_load_till_f32(in, len, 1);
        npyv_f32 r = npyv_recip_f32(a);
        npyv_store_till_f32(out, len, r);
    }

    npyv_cleanup();
}

#line 112
/**
 * Apply npyv_recip_f32 to `len` f32 lanes read from `_src` with stride
 * `ssrc` and write the results to `_dst` with unit-stride stores.
 *
 * Strides are counted in lanes, not bytes. The strided loads are selected by
 * `NCONTIG != CONTIG`; both macros are defined outside this block
 * (NOTE(review): presumably to distinct values -- confirm). The destination
 * pointer is still advanced by `sdst` times the step, so `sdst` is expected
 * to be 1 -- NOTE(review): confirm against the caller.
 */
static void simd_FLOAT_reciprocal_NCONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *in = _src;
    npyv_lanetype_f32 *out = _dst;

    const int lanes = npyv_nlanes_f32;  // lanes held by one vector
    const int block = lanes * 4;        // lanes consumed by one unrolled pass

    // x4 unrolled pass: all loads, then all reciprocals, then all stores
    while (len >= block) {
    #if NCONTIG != CONTIG
        npyv_f32 a0 = npyv_loadn_f32(in, ssrc);
        npyv_f32 a1 = npyv_loadn_f32(in + ssrc*lanes, ssrc);
        npyv_f32 a2 = npyv_loadn_f32(in + ssrc*lanes*2, ssrc);
        npyv_f32 a3 = npyv_loadn_f32(in + ssrc*lanes*3, ssrc);
    #else
        // kept for the configuration where NCONTIG compares equal to CONTIG
        npyv_f32 a0 = npyv_load_f32(in);
        npyv_f32 a1 = npyv_load_f32(in + lanes);
        npyv_f32 a2 = npyv_load_f32(in + lanes*2);
        npyv_f32 a3 = npyv_load_f32(in + lanes*3);
    #endif

        npyv_f32 r0 = npyv_recip_f32(a0);
        npyv_f32 r1 = npyv_recip_f32(a1);
        npyv_f32 r2 = npyv_recip_f32(a2);
        npyv_f32 r3 = npyv_recip_f32(a3);

        npyv_store_f32(out, r0);
        npyv_store_f32(out + lanes, r1);
        npyv_store_f32(out + lanes*2, r2);
        npyv_store_f32(out + lanes*3, r3);

        len -= block;
        in  += ssrc*block;
        out += sdst*block;
    }

    // one full vector at a time
    while (len >= lanes) {
    #if NCONTIG != CONTIG
        npyv_f32 a = npyv_loadn_f32(in, ssrc);
    #else
        npyv_f32 a = npyv_load_f32(in);
    #endif
        npyv_f32 r = npyv_recip_f32(a);
        npyv_store_f32(out, r);

        len -= lanes;
        in  += ssrc*lanes;
        out += sdst*lanes;
    }

    // remaining 0 < len < lanes: partial load/store limited to `len` lanes.
    // The load passes 1 as its fill argument -- presumably so the unused
    // lanes hold 1 and their reciprocal cannot divide by zero (confirm).
    if (len > 0) {
    #if NCONTIG != CONTIG
        npyv_f32 a = npyv_loadn_till_f32(in, ssrc, len, 1);
    #else
        npyv_f32 a = npyv_load_till_f32(in, len, 1);
    #endif
        npyv_f32 r = npyv_recip_f32(a);
        npyv_store_till_f32(out, len, r);
    }

    npyv_cleanup();
}

#line 112
/**
 * Apply npyv_recip_f32 to `len` f32 lanes read from `_src` with unit-stride
 * loads and write the results to `_dst` with stride `sdst`.
 *
 * Strides are counted in lanes, not bytes. The strided stores are selected by
 * `NCONTIG != CONTIG`; both macros are defined outside this block
 * (NOTE(review): presumably to distinct values -- confirm). The source
 * pointer is still advanced by `ssrc` times the step, so `ssrc` is expected
 * to be 1 -- NOTE(review): confirm against the caller.
 */
static void simd_FLOAT_reciprocal_CONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *in = _src;
    npyv_lanetype_f32 *out = _dst;

    const int lanes = npyv_nlanes_f32;  // lanes held by one vector
    const int block = lanes * 2;        // lanes consumed by one unrolled pass

    // x2 unrolled pass: both loads, then both reciprocals, then both stores
    while (len >= block) {
        npyv_f32 a0 = npyv_load_f32(in);
        npyv_f32 a1 = npyv_load_f32(in + lanes);

        npyv_f32 r0 = npyv_recip_f32(a0);
        npyv_f32 r1 = npyv_recip_f32(a1);

    #if NCONTIG != CONTIG
        npyv_storen_f32(out, sdst, r0);
        npyv_storen_f32(out + sdst*lanes, sdst, r1);
    #else
        // kept for the configuration where NCONTIG compares equal to CONTIG
        npyv_store_f32(out, r0);
        npyv_store_f32(out + lanes, r1);
    #endif

        len -= block;
        in  += ssrc*block;
        out += sdst*block;
    }

    // one full vector at a time
    while (len >= lanes) {
        npyv_f32 a = npyv_load_f32(in);
        npyv_f32 r = npyv_recip_f32(a);
    #if NCONTIG != CONTIG
        npyv_storen_f32(out, sdst, r);
    #else
        npyv_store_f32(out, r);
    #endif

        len -= lanes;
        in  += ssrc*lanes;
        out += sdst*lanes;
    }

    // remaining 0 < len < lanes: partial load/store limited to `len` lanes.
    // The load passes 1 as its fill argument -- presumably so the unused
    // lanes hold 1 and their reciprocal cannot divide by zero (confirm).
    if (len > 0) {
        npyv_f32 a = npyv_load_till_f32(in, len, 1);
        npyv_f32 r = npyv_recip_f32(a);
    #if NCONTIG != CONTIG
        npyv_storen_till_f32(out, sdst, len, r);
    #else
        npyv_store_till_f32(out, len, r);
    #endif
    }

    npyv_cleanup();
}

#line 112
/**
 * Apply npyv_recip_f32 to `len` f32 lanes read from `_src` with stride
 * `ssrc` and write the results to `_dst` with stride `sdst`.
 *
 * Strides are counted in lanes, not bytes. The strided loads and stores are
 * selected by `NCONTIG != CONTIG`; both macros are defined outside this block
 * (NOTE(review): presumably to distinct values -- confirm).
 */
static void simd_FLOAT_reciprocal_NCONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *in = _src;
    npyv_lanetype_f32 *out = _dst;

    const int lanes = npyv_nlanes_f32;  // lanes held by one vector
    const int block = lanes * 2;        // lanes consumed by one unrolled pass

    // x2 unrolled pass: both loads, then both reciprocals, then both stores
    while (len >= block) {
    #if NCONTIG != CONTIG
        npyv_f32 a0 = npyv_loadn_f32(in, ssrc);
        npyv_f32 a1 = npyv_loadn_f32(in + ssrc*lanes, ssrc);
    #else
        // kept for the configuration where NCONTIG compares equal to CONTIG
        npyv_f32 a0 = npyv_load_f32(in);
        npyv_f32 a1 = npyv_load_f32(in + lanes);
    #endif

        npyv_f32 r0 = npyv_recip_f32(a0);
        npyv_f32 r1 = npyv_recip_f32(a1);

    #if NCONTIG != CONTIG
        npyv_storen_f32(out, sdst, r0);
        npyv_storen_f32(out + sdst*lanes, sdst, r1);
    #else
        npyv_store_f32(out, r0);
        npyv_store_f32(out + lanes, r1);
    #endif

        len -= block;
        in  += ssrc*block;
        out += sdst*block;
    }

    // one full vector at a time
    while (len >= lanes) {
    #if NCONTIG != CONTIG
        npyv_f32 a = npyv_loadn_f32(in, ssrc);
    #else
        npyv_f32 a = npyv_load_f32(in);
    #endif
        npyv_f32 r = npyv_recip_f32(a);
    #if NCONTIG != CONTIG
        npyv_storen_f32(out, sdst, r);
    #else
        npyv_store_f32(out, r);
    #endif

        len -= lanes;
        in  += ssrc*lanes;
        out += sdst*lanes;
    }

    // remaining 0 < len < lanes: partial load/store limited to `len` lanes.
    // The load passes 1 as its fill argument -- presumably so the unused
    // lanes hold 1 and their reciprocal cannot divide by zero (confirm).
    if (len > 0) {
    #if NCONTIG != CONTIG
        npyv_f32 a = npyv_loadn_till_f32(in, ssrc, len, 1);
    #else
        npyv_f32 a = npyv_load_till_f32(in, len, 1);
    #endif
        npyv_f32 r = npyv_recip_f32(a);
    #if NCONTIG != CONTIG
        npyv_storen_till_f32(out, sdst, len, r);
    #else
        npyv_store_till_f32(out, len, r);
    #endif
    }

    npyv_cleanup();
}


#endif // NPY_SIMD_F32

#line 101
#if NPY_SIMD_F64
#line 107
#line 112
/**
 * Apply npyv_rint_f64 to `len` f64 lanes read from `_src` and write the
 * results to `_dst`, using unit-stride vector loads and stores.
 *
 * The pointers are nevertheless advanced by `ssrc`/`sdst` times the step
 * (counted in lanes, not bytes), so the accesses only line up when both
 * strides are 1 -- NOTE(review): presumably guaranteed by the caller; confirm.
 */
static void simd_DOUBLE_rint_CONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *in = _src;
    npyv_lanetype_f64 *out = _dst;

    const int lanes = npyv_nlanes_f64;  // lanes held by one vector
    const int block = lanes * 4;        // lanes consumed by one unrolled pass

    // x4 unrolled pass: all loads, then all roundings, then all stores
    while (len >= block) {
        npyv_f64 a0 = npyv_load_f64(in);
        npyv_f64 a1 = npyv_load_f64(in + lanes);
        npyv_f64 a2 = npyv_load_f64(in + lanes*2);
        npyv_f64 a3 = npyv_load_f64(in + lanes*3);

        npyv_f64 r0 = npyv_rint_f64(a0);
        npyv_f64 r1 = npyv_rint_f64(a1);
        npyv_f64 r2 = npyv_rint_f64(a2);
        npyv_f64 r3 = npyv_rint_f64(a3);

        npyv_store_f64(out, r0);
        npyv_store_f64(out + lanes, r1);
        npyv_store_f64(out + lanes*2, r2);
        npyv_store_f64(out + lanes*3, r3);

        len -= block;
        in  += ssrc*block;
        out += sdst*block;
    }

    // one full vector at a time
    while (len >= lanes) {
        npyv_f64 a = npyv_load_f64(in);
        npyv_f64 r = npyv_rint_f64(a);
        npyv_store_f64(out, r);

        len -= lanes;
        in  += ssrc*lanes;
        out += sdst*lanes;
    }

    // remaining 0 < len < lanes: partial load/store limited to `len` lanes
    // (the `tillz` load presumably zero-fills the unused lanes -- confirm)
    if (len > 0) {
        npyv_f64 a = npyv_load_tillz_f64(in, len);
        npyv_f64 r = npyv_rint_f64(a);
        npyv_store_till_f64(out, len, r);
    }

    npyv_cleanup();
}

#line 112
/**
 * Applies npyv_rint_f64 to `len` f64 elements read through `_src` and writes
 * the results through `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in elements: they scale arithmetic on
 * lane-typed pointers. Loads go through the strided npyv_loadn_* intrinsics
 * unless NCONTIG and CONTIG compare equal at preprocessing time, whereas
 * stores always go through the contiguous npyv_store_* intrinsics.
 * NOTE(review): the output pointer still advances by `sdst` elements per
 * lane, so callers presumably pass sdst == 1 -- confirm at the call site.
 */
static void simd_DOUBLE_rint_NCONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *in_ptr  = _src;
          npyv_lanetype_f64 *out_ptr = _dst;

    const int nlanes   = npyv_nlanes_f64; // elements per vector
    const int unrolled = nlanes * 4;      // elements per unrolled pass

    // four vectors per pass; every load is issued before the first store
    while (len >= unrolled) {
    #if NCONTIG != CONTIG
        npyv_f64 in0 = npyv_loadn_f64(in_ptr, ssrc);
        npyv_f64 in1 = npyv_loadn_f64(in_ptr + ssrc*nlanes*1, ssrc);
        npyv_f64 in2 = npyv_loadn_f64(in_ptr + ssrc*nlanes*2, ssrc);
        npyv_f64 in3 = npyv_loadn_f64(in_ptr + ssrc*nlanes*3, ssrc);
    #else
        npyv_f64 in0 = npyv_load_f64(in_ptr);
        npyv_f64 in1 = npyv_load_f64(in_ptr + nlanes*1);
        npyv_f64 in2 = npyv_load_f64(in_ptr + nlanes*2);
        npyv_f64 in3 = npyv_load_f64(in_ptr + nlanes*3);
    #endif
        npyv_f64 res0 = npyv_rint_f64(in0);
        npyv_f64 res1 = npyv_rint_f64(in1);
        npyv_f64 res2 = npyv_rint_f64(in2);
        npyv_f64 res3 = npyv_rint_f64(in3);

        npyv_store_f64(out_ptr, res0);
        npyv_store_f64(out_ptr + nlanes*1, res1);
        npyv_store_f64(out_ptr + nlanes*2, res2);
        npyv_store_f64(out_ptr + nlanes*3, res3);

        len     -= unrolled;
        in_ptr  += ssrc*unrolled;
        out_ptr += sdst*unrolled;
    }

    // one full vector per pass
    while (len >= nlanes) {
    #if NCONTIG != CONTIG
        npyv_f64 in0 = npyv_loadn_f64(in_ptr, ssrc);
    #else
        npyv_f64 in0 = npyv_load_f64(in_ptr);
    #endif
        npyv_f64 res0 = npyv_rint_f64(in0);
        npyv_store_f64(out_ptr, res0);

        len     -= nlanes;
        in_ptr  += ssrc*nlanes;
        out_ptr += sdst*nlanes;
    }

    // fewer than one vector left: the *_tillz / *_till intrinsics are handed
    // the remaining element count
    if (len > 0) {
    #if NCONTIG != CONTIG
        npyv_f64 in0 = npyv_loadn_tillz_f64(in_ptr, ssrc, len);
    #else
        npyv_f64 in0 = npyv_load_tillz_f64(in_ptr, len);
    #endif
        npyv_f64 res0 = npyv_rint_f64(in0);
        npyv_store_till_f64(out_ptr, len, res0);
    }

    npyv_cleanup();
}

#line 112
/**
 * Applies npyv_rint_f64 to `len` f64 elements read through `_src` and writes
 * the results through `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in elements: they scale arithmetic on
 * lane-typed pointers. Loads always go through the contiguous npyv_load_*
 * intrinsics, whereas stores go through the strided npyv_storen_* intrinsics
 * unless NCONTIG and CONTIG compare equal at preprocessing time.
 * NOTE(review): the input pointer still advances by `ssrc` elements per
 * lane, so callers presumably pass ssrc == 1 -- confirm at the call site.
 */
static void simd_DOUBLE_rint_CONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *in_ptr  = _src;
          npyv_lanetype_f64 *out_ptr = _dst;

    const int nlanes   = npyv_nlanes_f64; // elements per vector
    const int unrolled = nlanes * 2;      // elements per unrolled pass

    // two vectors per pass; both loads are issued before the first store
    while (len >= unrolled) {
        npyv_f64 in0 = npyv_load_f64(in_ptr);
        npyv_f64 in1 = npyv_load_f64(in_ptr + nlanes*1);

        npyv_f64 res0 = npyv_rint_f64(in0);
        npyv_f64 res1 = npyv_rint_f64(in1);

    #if NCONTIG != CONTIG
        npyv_storen_f64(out_ptr, sdst, res0);
        npyv_storen_f64(out_ptr + sdst*nlanes*1, sdst, res1);
    #else
        npyv_store_f64(out_ptr, res0);
        npyv_store_f64(out_ptr + nlanes*1, res1);
    #endif

        len     -= unrolled;
        in_ptr  += ssrc*unrolled;
        out_ptr += sdst*unrolled;
    }

    // one full vector per pass
    while (len >= nlanes) {
        npyv_f64 in0  = npyv_load_f64(in_ptr);
        npyv_f64 res0 = npyv_rint_f64(in0);
    #if NCONTIG != CONTIG
        npyv_storen_f64(out_ptr, sdst, res0);
    #else
        npyv_store_f64(out_ptr, res0);
    #endif

        len     -= nlanes;
        in_ptr  += ssrc*nlanes;
        out_ptr += sdst*nlanes;
    }

    // fewer than one vector left: the *_tillz / *_till intrinsics are handed
    // the remaining element count
    if (len > 0) {
        npyv_f64 in0  = npyv_load_tillz_f64(in_ptr, len);
        npyv_f64 res0 = npyv_rint_f64(in0);
    #if NCONTIG != CONTIG
        npyv_storen_till_f64(out_ptr, sdst, len, res0);
    #else
        npyv_store_till_f64(out_ptr, len, res0);
    #endif
    }

    npyv_cleanup();
}

#line 112
/**
 * Applies npyv_rint_f64 to `len` f64 elements read through `_src` and writes
 * the results through `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in elements: they scale arithmetic on
 * lane-typed pointers. Both loads and stores go through the strided
 * npyv_loadn_* / npyv_storen_* intrinsics unless NCONTIG and CONTIG compare
 * equal at preprocessing time, in which case the contiguous intrinsics are
 * used instead.
 */
static void simd_DOUBLE_rint_NCONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *in_ptr  = _src;
          npyv_lanetype_f64 *out_ptr = _dst;

    const int nlanes   = npyv_nlanes_f64; // elements per vector
    const int unrolled = nlanes * 2;      // elements per unrolled pass

    // two vectors per pass; both loads are issued before the first store
    while (len >= unrolled) {
    #if NCONTIG != CONTIG
        npyv_f64 in0 = npyv_loadn_f64(in_ptr, ssrc);
        npyv_f64 in1 = npyv_loadn_f64(in_ptr + ssrc*nlanes*1, ssrc);
    #else
        npyv_f64 in0 = npyv_load_f64(in_ptr);
        npyv_f64 in1 = npyv_load_f64(in_ptr + nlanes*1);
    #endif
        npyv_f64 res0 = npyv_rint_f64(in0);
        npyv_f64 res1 = npyv_rint_f64(in1);

    #if NCONTIG != CONTIG
        npyv_storen_f64(out_ptr, sdst, res0);
        npyv_storen_f64(out_ptr + sdst*nlanes*1, sdst, res1);
    #else
        npyv_store_f64(out_ptr, res0);
        npyv_store_f64(out_ptr + nlanes*1, res1);
    #endif

        len     -= unrolled;
        in_ptr  += ssrc*unrolled;
        out_ptr += sdst*unrolled;
    }

    // one full vector per pass
    while (len >= nlanes) {
    #if NCONTIG != CONTIG
        npyv_f64 in0 = npyv_loadn_f64(in_ptr, ssrc);
    #else
        npyv_f64 in0 = npyv_load_f64(in_ptr);
    #endif
        npyv_f64 res0 = npyv_rint_f64(in0);
    #if NCONTIG != CONTIG
        npyv_storen_f64(out_ptr, sdst, res0);
    #else
        npyv_store_f64(out_ptr, res0);
    #endif

        len     -= nlanes;
        in_ptr  += ssrc*nlanes;
        out_ptr += sdst*nlanes;
    }

    // fewer than one vector left: the *_tillz / *_till intrinsics are handed
    // the remaining element count
    if (len > 0) {
    #if NCONTIG != CONTIG
        npyv_f64 in0 = npyv_loadn_tillz_f64(in_ptr, ssrc, len);
    #else
        npyv_f64 in0 = npyv_load_tillz_f64(in_ptr, len);
    #endif
        npyv_f64 res0 = npyv_rint_f64(in0);
    #if NCONTIG != CONTIG
        npyv_storen_till_f64(out_ptr, sdst, len, res0);
    #else
        npyv_store_till_f64(out_ptr, len, res0);
    #endif
    }

    npyv_cleanup();
}


#line 107
#line 112
/**
 * Applies npyv_floor_f64 to `len` f64 elements read through `_src` and writes
 * the results through `_dst`.
 *
 * Every load and store uses the contiguous npyv_load_* / npyv_store_*
 * intrinsics. `ssrc` and `sdst` are strides counted in elements and are only
 * used to advance the two pointers between passes.
 * NOTE(review): since the accesses themselves are contiguous, callers
 * presumably pass ssrc == sdst == 1 -- confirm at the call site.
 */
static void simd_DOUBLE_floor_CONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *in_ptr  = _src;
          npyv_lanetype_f64 *out_ptr = _dst;

    const int nlanes   = npyv_nlanes_f64; // elements per vector
    const int unrolled = nlanes * 4;      // elements per unrolled pass

    // four vectors per pass; every load is issued before the first store
    while (len >= unrolled) {
        npyv_f64 in0 = npyv_load_f64(in_ptr);
        npyv_f64 in1 = npyv_load_f64(in_ptr + nlanes*1);
        npyv_f64 in2 = npyv_load_f64(in_ptr + nlanes*2);
        npyv_f64 in3 = npyv_load_f64(in_ptr + nlanes*3);

        npyv_f64 res0 = npyv_floor_f64(in0);
        npyv_f64 res1 = npyv_floor_f64(in1);
        npyv_f64 res2 = npyv_floor_f64(in2);
        npyv_f64 res3 = npyv_floor_f64(in3);

        npyv_store_f64(out_ptr, res0);
        npyv_store_f64(out_ptr + nlanes*1, res1);
        npyv_store_f64(out_ptr + nlanes*2, res2);
        npyv_store_f64(out_ptr + nlanes*3, res3);

        len     -= unrolled;
        in_ptr  += ssrc*unrolled;
        out_ptr += sdst*unrolled;
    }

    // one full vector per pass
    while (len >= nlanes) {
        npyv_f64 in0  = npyv_load_f64(in_ptr);
        npyv_f64 res0 = npyv_floor_f64(in0);
        npyv_store_f64(out_ptr, res0);

        len     -= nlanes;
        in_ptr  += ssrc*nlanes;
        out_ptr += sdst*nlanes;
    }

    // fewer than one vector left: the *_tillz / *_till intrinsics are handed
    // the remaining element count
    if (len > 0) {
        npyv_f64 in0  = npyv_load_tillz_f64(in_ptr, len);
        npyv_f64 res0 = npyv_floor_f64(in0);
        npyv_store_till_f64(out_ptr, len, res0);
    }

    npyv_cleanup();
}

#line 112
/**
 * Applies npyv_floor_f64 to `len` f64 elements read through `_src` and writes
 * the results through `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in elements: they scale arithmetic on
 * lane-typed pointers. Loads go through the strided npyv_loadn_* intrinsics
 * unless NCONTIG and CONTIG compare equal at preprocessing time, whereas
 * stores always go through the contiguous npyv_store_* intrinsics.
 * NOTE(review): the output pointer still advances by `sdst` elements per
 * lane, so callers presumably pass sdst == 1 -- confirm at the call site.
 */
static void simd_DOUBLE_floor_NCONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *in_ptr  = _src;
          npyv_lanetype_f64 *out_ptr = _dst;

    const int nlanes   = npyv_nlanes_f64; // elements per vector
    const int unrolled = nlanes * 4;      // elements per unrolled pass

    // four vectors per pass; every load is issued before the first store
    while (len >= unrolled) {
    #if NCONTIG != CONTIG
        npyv_f64 in0 = npyv_loadn_f64(in_ptr, ssrc);
        npyv_f64 in1 = npyv_loadn_f64(in_ptr + ssrc*nlanes*1, ssrc);
        npyv_f64 in2 = npyv_loadn_f64(in_ptr + ssrc*nlanes*2, ssrc);
        npyv_f64 in3 = npyv_loadn_f64(in_ptr + ssrc*nlanes*3, ssrc);
    #else
        npyv_f64 in0 = npyv_load_f64(in_ptr);
        npyv_f64 in1 = npyv_load_f64(in_ptr + nlanes*1);
        npyv_f64 in2 = npyv_load_f64(in_ptr + nlanes*2);
        npyv_f64 in3 = npyv_load_f64(in_ptr + nlanes*3);
    #endif
        npyv_f64 res0 = npyv_floor_f64(in0);
        npyv_f64 res1 = npyv_floor_f64(in1);
        npyv_f64 res2 = npyv_floor_f64(in2);
        npyv_f64 res3 = npyv_floor_f64(in3);

        npyv_store_f64(out_ptr, res0);
        npyv_store_f64(out_ptr + nlanes*1, res1);
        npyv_store_f64(out_ptr + nlanes*2, res2);
        npyv_store_f64(out_ptr + nlanes*3, res3);

        len     -= unrolled;
        in_ptr  += ssrc*unrolled;
        out_ptr += sdst*unrolled;
    }

    // one full vector per pass
    while (len >= nlanes) {
    #if NCONTIG != CONTIG
        npyv_f64 in0 = npyv_loadn_f64(in_ptr, ssrc);
    #else
        npyv_f64 in0 = npyv_load_f64(in_ptr);
    #endif
        npyv_f64 res0 = npyv_floor_f64(in0);
        npyv_store_f64(out_ptr, res0);

        len     -= nlanes;
        in_ptr  += ssrc*nlanes;
        out_ptr += sdst*nlanes;
    }

    // fewer than one vector left: the *_tillz / *_till intrinsics are handed
    // the remaining element count
    if (len > 0) {
    #if NCONTIG != CONTIG
        npyv_f64 in0 = npyv_loadn_tillz_f64(in_ptr, ssrc, len);
    #else
        npyv_f64 in0 = npyv_load_tillz_f64(in_ptr, len);
    #endif
        npyv_f64 res0 = npyv_floor_f64(in0);
        npyv_store_till_f64(out_ptr, len, res0);
    }

    npyv_cleanup();
}

#line 112
/**
 * Applies npyv_floor_f64 to `len` f64 elements read through `_src` and writes
 * the results through `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in elements: they scale arithmetic on
 * lane-typed pointers. Loads always go through the contiguous npyv_load_*
 * intrinsics, whereas stores go through the strided npyv_storen_* intrinsics
 * unless NCONTIG and CONTIG compare equal at preprocessing time.
 * NOTE(review): the input pointer still advances by `ssrc` elements per
 * lane, so callers presumably pass ssrc == 1 -- confirm at the call site.
 */
static void simd_DOUBLE_floor_CONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *in_ptr  = _src;
          npyv_lanetype_f64 *out_ptr = _dst;

    const int nlanes   = npyv_nlanes_f64; // elements per vector
    const int unrolled = nlanes * 2;      // elements per unrolled pass

    // two vectors per pass; both loads are issued before the first store
    while (len >= unrolled) {
        npyv_f64 in0 = npyv_load_f64(in_ptr);
        npyv_f64 in1 = npyv_load_f64(in_ptr + nlanes*1);

        npyv_f64 res0 = npyv_floor_f64(in0);
        npyv_f64 res1 = npyv_floor_f64(in1);

    #if NCONTIG != CONTIG
        npyv_storen_f64(out_ptr, sdst, res0);
        npyv_storen_f64(out_ptr + sdst*nlanes*1, sdst, res1);
    #else
        npyv_store_f64(out_ptr, res0);
        npyv_store_f64(out_ptr + nlanes*1, res1);
    #endif

        len     -= unrolled;
        in_ptr  += ssrc*unrolled;
        out_ptr += sdst*unrolled;
    }

    // one full vector per pass
    while (len >= nlanes) {
        npyv_f64 in0  = npyv_load_f64(in_ptr);
        npyv_f64 res0 = npyv_floor_f64(in0);
    #if NCONTIG != CONTIG
        npyv_storen_f64(out_ptr, sdst, res0);
    #else
        npyv_store_f64(out_ptr, res0);
    #endif

        len     -= nlanes;
        in_ptr  += ssrc*nlanes;
        out_ptr += sdst*nlanes;
    }

    // fewer than one vector left: the *_tillz / *_till intrinsics are handed
    // the remaining element count
    if (len > 0) {
        npyv_f64 in0  = npyv_load_tillz_f64(in_ptr, len);
        npyv_f64 res0 = npyv_floor_f64(in0);
    #if NCONTIG != CONTIG
        npyv_storen_till_f64(out_ptr, sdst, len, res0);
    #else
        npyv_store_till_f64(out_ptr, len, res0);
    #endif
    }

    npyv_cleanup();
}

#line 112
/**
 * Applies npyv_floor_f64 to `len` f64 elements read through `_src` and writes
 * the results through `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in elements: they scale arithmetic on
 * lane-typed pointers. Both loads and stores go through the strided
 * npyv_loadn_* / npyv_storen_* intrinsics unless NCONTIG and CONTIG compare
 * equal at preprocessing time, in which case the contiguous intrinsics are
 * used instead.
 */
static void simd_DOUBLE_floor_NCONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *in_ptr  = _src;
          npyv_lanetype_f64 *out_ptr = _dst;

    const int nlanes   = npyv_nlanes_f64; // elements per vector
    const int unrolled = nlanes * 2;      // elements per unrolled pass

    // two vectors per pass; both loads are issued before the first store
    while (len >= unrolled) {
    #if NCONTIG != CONTIG
        npyv_f64 in0 = npyv_loadn_f64(in_ptr, ssrc);
        npyv_f64 in1 = npyv_loadn_f64(in_ptr + ssrc*nlanes*1, ssrc);
    #else
        npyv_f64 in0 = npyv_load_f64(in_ptr);
        npyv_f64 in1 = npyv_load_f64(in_ptr + nlanes*1);
    #endif
        npyv_f64 res0 = npyv_floor_f64(in0);
        npyv_f64 res1 = npyv_floor_f64(in1);

    #if NCONTIG != CONTIG
        npyv_storen_f64(out_ptr, sdst, res0);
        npyv_storen_f64(out_ptr + sdst*nlanes*1, sdst, res1);
    #else
        npyv_store_f64(out_ptr, res0);
        npyv_store_f64(out_ptr + nlanes*1, res1);
    #endif

        len     -= unrolled;
        in_ptr  += ssrc*unrolled;
        out_ptr += sdst*unrolled;
    }

    // one full vector per pass
    while (len >= nlanes) {
    #if NCONTIG != CONTIG
        npyv_f64 in0 = npyv_loadn_f64(in_ptr, ssrc);
    #else
        npyv_f64 in0 = npyv_load_f64(in_ptr);
    #endif
        npyv_f64 res0 = npyv_floor_f64(in0);
    #if NCONTIG != CONTIG
        npyv_storen_f64(out_ptr, sdst, res0);
    #else
        npyv_store_f64(out_ptr, res0);
    #endif

        len     -= nlanes;
        in_ptr  += ssrc*nlanes;
        out_ptr += sdst*nlanes;
    }

    // fewer than one vector left: the *_tillz / *_till intrinsics are handed
    // the remaining element count
    if (len > 0) {
    #if NCONTIG != CONTIG
        npyv_f64 in0 = npyv_loadn_tillz_f64(in_ptr, ssrc, len);
    #else
        npyv_f64 in0 = npyv_load_tillz_f64(in_ptr, len);
    #endif
        npyv_f64 res0 = npyv_floor_f64(in0);
    #if NCONTIG != CONTIG
        npyv_storen_till_f64(out_ptr, sdst, len, res0);
    #else
        npyv_store_till_f64(out_ptr, len, res0);
    #endif
    }

    npyv_cleanup();
}


#line 107
#line 112
/**
 * Applies npyv_ceil_f64 to `len` f64 elements read through `_src` and writes
 * the results through `_dst`.
 *
 * Every load and store uses the contiguous npyv_load_* / npyv_store_*
 * intrinsics. `ssrc` and `sdst` are strides counted in elements and are only
 * used to advance the two pointers between passes.
 * NOTE(review): since the accesses themselves are contiguous, callers
 * presumably pass ssrc == sdst == 1 -- confirm at the call site.
 */
static void simd_DOUBLE_ceil_CONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *in_ptr  = _src;
          npyv_lanetype_f64 *out_ptr = _dst;

    const int nlanes   = npyv_nlanes_f64; // elements per vector
    const int unrolled = nlanes * 4;      // elements per unrolled pass

    // four vectors per pass; every load is issued before the first store
    while (len >= unrolled) {
        npyv_f64 in0 = npyv_load_f64(in_ptr);
        npyv_f64 in1 = npyv_load_f64(in_ptr + nlanes*1);
        npyv_f64 in2 = npyv_load_f64(in_ptr + nlanes*2);
        npyv_f64 in3 = npyv_load_f64(in_ptr + nlanes*3);

        npyv_f64 res0 = npyv_ceil_f64(in0);
        npyv_f64 res1 = npyv_ceil_f64(in1);
        npyv_f64 res2 = npyv_ceil_f64(in2);
        npyv_f64 res3 = npyv_ceil_f64(in3);

        npyv_store_f64(out_ptr, res0);
        npyv_store_f64(out_ptr + nlanes*1, res1);
        npyv_store_f64(out_ptr + nlanes*2, res2);
        npyv_store_f64(out_ptr + nlanes*3, res3);

        len     -= unrolled;
        in_ptr  += ssrc*unrolled;
        out_ptr += sdst*unrolled;
    }

    // one full vector per pass
    while (len >= nlanes) {
        npyv_f64 in0  = npyv_load_f64(in_ptr);
        npyv_f64 res0 = npyv_ceil_f64(in0);
        npyv_store_f64(out_ptr, res0);

        len     -= nlanes;
        in_ptr  += ssrc*nlanes;
        out_ptr += sdst*nlanes;
    }

    // fewer than one vector left: the *_tillz / *_till intrinsics are handed
    // the remaining element count
    if (len > 0) {
        npyv_f64 in0  = npyv_load_tillz_f64(in_ptr, len);
        npyv_f64 res0 = npyv_ceil_f64(in0);
        npyv_store_till_f64(out_ptr, len, res0);
    }

    npyv_cleanup();
}

#line 112
/*
 * Applies npyv_ceil_f64 to `len` f64 elements read through `_src` and writes
 * the results through `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in f64 elements (they scale pointer
 * arithmetic on npyv_lanetype_f64 pointers), not in bytes.
 * The source access is picked at preprocessing time: a plain vector load when
 * NCONTIG == CONTIG, otherwise the strided npyv_loadn_* form; both macros are
 * defined outside this function. The destination always uses the plain store,
 * because the template's `CONTIG == CONTIG` test is a tautology.
 * NOTE(review): the name suffix suggests a strided source and a contiguous
 * destination -- confirm against the dispatching caller.
 */
static void simd_DOUBLE_ceil_NCONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *ip = _src;
    npyv_lanetype_f64 *op = _dst;

    const int nlanes = npyv_nlanes_f64;
    const int unroll = nlanes * 4;

    /* Four vectors per pass; everything is loaded before anything is stored. */
    while (len >= unroll) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_f64(ip);
        npyv_f64 a1 = npyv_load_f64(ip + nlanes);
        npyv_f64 a2 = npyv_load_f64(ip + nlanes*2);
        npyv_f64 a3 = npyv_load_f64(ip + nlanes*3);
    #else
        npyv_f64 a0 = npyv_loadn_f64(ip, ssrc);
        npyv_f64 a1 = npyv_loadn_f64(ip + ssrc*nlanes, ssrc);
        npyv_f64 a2 = npyv_loadn_f64(ip + ssrc*nlanes*2, ssrc);
        npyv_f64 a3 = npyv_loadn_f64(ip + ssrc*nlanes*3, ssrc);
    #endif
        npyv_f64 r0 = npyv_ceil_f64(a0);
        npyv_f64 r1 = npyv_ceil_f64(a1);
        npyv_f64 r2 = npyv_ceil_f64(a2);
        npyv_f64 r3 = npyv_ceil_f64(a3);

        npyv_store_f64(op, r0);
        npyv_store_f64(op + nlanes, r1);
        npyv_store_f64(op + nlanes*2, r2);
        npyv_store_f64(op + nlanes*3, r3);

        len -= unroll;
        ip += ssrc*unroll;
        op += sdst*unroll;
    }

    /* One vector per pass for what the unrolled loop left over. */
    while (len >= nlanes) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_f64(ip);
    #else
        npyv_f64 a0 = npyv_loadn_f64(ip, ssrc);
    #endif
        npyv_f64 r0 = npyv_ceil_f64(a0);
        npyv_store_f64(op, r0);

        len -= nlanes;
        ip += ssrc*nlanes;
        op += sdst*nlanes;
    }

    /*
     * Fewer than one full vector remains: use the length-limited load/store.
     * The `tillz` load presumably zero-fills the unused lanes -- TODO confirm.
     */
    if (len > 0) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_tillz_f64(ip, len);
    #else
        npyv_f64 a0 = npyv_loadn_tillz_f64(ip, ssrc, len);
    #endif
        npyv_f64 r0 = npyv_ceil_f64(a0);
        npyv_store_till_f64(op, len, r0);
    }

    npyv_cleanup();
}

#line 112
/*
 * Applies npyv_ceil_f64 to `len` f64 elements read through `_src` and writes
 * the results through `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in f64 elements (they scale pointer
 * arithmetic on npyv_lanetype_f64 pointers), not in bytes.
 * The source always uses the plain vector load, because the template's
 * `CONTIG == CONTIG` test is a tautology. The destination access is picked at
 * preprocessing time: a plain store when NCONTIG == CONTIG, otherwise the
 * strided npyv_storen_* form; both macros are defined outside this function.
 * NOTE(review): the name suffix suggests a contiguous source and a strided
 * destination -- confirm against the dispatching caller.
 */
static void simd_DOUBLE_ceil_CONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *ip = _src;
    npyv_lanetype_f64 *op = _dst;

    const int nlanes = npyv_nlanes_f64;
    const int unroll = nlanes * 2;

    /* Two vectors per pass; everything is loaded before anything is stored. */
    while (len >= unroll) {
        npyv_f64 a0 = npyv_load_f64(ip);
        npyv_f64 a1 = npyv_load_f64(ip + nlanes);

        npyv_f64 r0 = npyv_ceil_f64(a0);
        npyv_f64 r1 = npyv_ceil_f64(a1);

    #if NCONTIG == CONTIG
        npyv_store_f64(op, r0);
        npyv_store_f64(op + nlanes, r1);
    #else
        npyv_storen_f64(op, sdst, r0);
        npyv_storen_f64(op + sdst*nlanes, sdst, r1);
    #endif

        len -= unroll;
        ip += ssrc*unroll;
        op += sdst*unroll;
    }

    /* One vector per pass for what the unrolled loop left over. */
    while (len >= nlanes) {
        npyv_f64 a0 = npyv_load_f64(ip);
        npyv_f64 r0 = npyv_ceil_f64(a0);
    #if NCONTIG == CONTIG
        npyv_store_f64(op, r0);
    #else
        npyv_storen_f64(op, sdst, r0);
    #endif

        len -= nlanes;
        ip += ssrc*nlanes;
        op += sdst*nlanes;
    }

    /*
     * Fewer than one full vector remains: use the length-limited load/store.
     * The `tillz` load presumably zero-fills the unused lanes -- TODO confirm.
     */
    if (len > 0) {
        npyv_f64 a0 = npyv_load_tillz_f64(ip, len);
        npyv_f64 r0 = npyv_ceil_f64(a0);
    #if NCONTIG == CONTIG
        npyv_store_till_f64(op, len, r0);
    #else
        npyv_storen_till_f64(op, sdst, len, r0);
    #endif
    }

    npyv_cleanup();
}

#line 112
/*
 * Applies npyv_ceil_f64 to `len` f64 elements read through `_src` and writes
 * the results through `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in f64 elements (they scale pointer
 * arithmetic on npyv_lanetype_f64 pointers), not in bytes.
 * Both the source and the destination access are picked at preprocessing
 * time: the plain load/store when NCONTIG == CONTIG, otherwise the strided
 * npyv_loadn_* / npyv_storen_* forms. Both macros are defined outside this
 * function.
 * NOTE(review): the name suffix suggests both sides are strided -- confirm
 * against the dispatching caller.
 */
static void simd_DOUBLE_ceil_NCONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *ip = _src;
    npyv_lanetype_f64 *op = _dst;

    const int nlanes = npyv_nlanes_f64;
    const int unroll = nlanes * 2;

    /* Two vectors per pass; everything is loaded before anything is stored. */
    while (len >= unroll) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_f64(ip);
        npyv_f64 a1 = npyv_load_f64(ip + nlanes);
    #else
        npyv_f64 a0 = npyv_loadn_f64(ip, ssrc);
        npyv_f64 a1 = npyv_loadn_f64(ip + ssrc*nlanes, ssrc);
    #endif
        npyv_f64 r0 = npyv_ceil_f64(a0);
        npyv_f64 r1 = npyv_ceil_f64(a1);

    #if NCONTIG == CONTIG
        npyv_store_f64(op, r0);
        npyv_store_f64(op + nlanes, r1);
    #else
        npyv_storen_f64(op, sdst, r0);
        npyv_storen_f64(op + sdst*nlanes, sdst, r1);
    #endif

        len -= unroll;
        ip += ssrc*unroll;
        op += sdst*unroll;
    }

    /* One vector per pass for what the unrolled loop left over. */
    while (len >= nlanes) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_f64(ip);
    #else
        npyv_f64 a0 = npyv_loadn_f64(ip, ssrc);
    #endif
        npyv_f64 r0 = npyv_ceil_f64(a0);
    #if NCONTIG == CONTIG
        npyv_store_f64(op, r0);
    #else
        npyv_storen_f64(op, sdst, r0);
    #endif

        len -= nlanes;
        ip += ssrc*nlanes;
        op += sdst*nlanes;
    }

    /*
     * Fewer than one full vector remains: use the length-limited load/store.
     * The `tillz` load presumably zero-fills the unused lanes -- TODO confirm.
     */
    if (len > 0) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_tillz_f64(ip, len);
    #else
        npyv_f64 a0 = npyv_loadn_tillz_f64(ip, ssrc, len);
    #endif
        npyv_f64 r0 = npyv_ceil_f64(a0);
    #if NCONTIG == CONTIG
        npyv_store_till_f64(op, len, r0);
    #else
        npyv_storen_till_f64(op, sdst, len, r0);
    #endif
    }

    npyv_cleanup();
}


#line 107
#line 112
/*
 * Applies npyv_trunc_f64 to `len` f64 elements read through `_src` and writes
 * the results through `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in f64 elements; here they only
 * advance the pointers between passes. Every access uses the plain vector
 * load/store, because the template's `CONTIG == CONTIG` tests are
 * tautologies.
 * NOTE(review): the name suffix suggests both sides are contiguous (strides
 * of one element) -- confirm against the dispatching caller.
 */
static void simd_DOUBLE_trunc_CONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *ip = _src;
    npyv_lanetype_f64 *op = _dst;

    const int nlanes = npyv_nlanes_f64;
    const int unroll = nlanes * 4;

    /* Four vectors per pass; everything is loaded before anything is stored. */
    while (len >= unroll) {
        npyv_f64 a0 = npyv_load_f64(ip);
        npyv_f64 a1 = npyv_load_f64(ip + nlanes);
        npyv_f64 a2 = npyv_load_f64(ip + nlanes*2);
        npyv_f64 a3 = npyv_load_f64(ip + nlanes*3);

        npyv_f64 r0 = npyv_trunc_f64(a0);
        npyv_f64 r1 = npyv_trunc_f64(a1);
        npyv_f64 r2 = npyv_trunc_f64(a2);
        npyv_f64 r3 = npyv_trunc_f64(a3);

        npyv_store_f64(op, r0);
        npyv_store_f64(op + nlanes, r1);
        npyv_store_f64(op + nlanes*2, r2);
        npyv_store_f64(op + nlanes*3, r3);

        len -= unroll;
        ip += ssrc*unroll;
        op += sdst*unroll;
    }

    /* One vector per pass for what the unrolled loop left over. */
    while (len >= nlanes) {
        npyv_f64 a0 = npyv_load_f64(ip);
        npyv_f64 r0 = npyv_trunc_f64(a0);
        npyv_store_f64(op, r0);

        len -= nlanes;
        ip += ssrc*nlanes;
        op += sdst*nlanes;
    }

    /*
     * Fewer than one full vector remains: use the length-limited load/store.
     * The `tillz` load presumably zero-fills the unused lanes -- TODO confirm.
     */
    if (len > 0) {
        npyv_f64 a0 = npyv_load_tillz_f64(ip, len);
        npyv_f64 r0 = npyv_trunc_f64(a0);
        npyv_store_till_f64(op, len, r0);
    }

    npyv_cleanup();
}

#line 112
/*
 * Applies npyv_trunc_f64 to `len` f64 elements read through `_src` and writes
 * the results through `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in f64 elements (they scale pointer
 * arithmetic on npyv_lanetype_f64 pointers), not in bytes.
 * The source access is picked at preprocessing time: a plain vector load when
 * NCONTIG == CONTIG, otherwise the strided npyv_loadn_* form; both macros are
 * defined outside this function. The destination always uses the plain store,
 * because the template's `CONTIG == CONTIG` test is a tautology.
 * NOTE(review): the name suffix suggests a strided source and a contiguous
 * destination -- confirm against the dispatching caller.
 */
static void simd_DOUBLE_trunc_NCONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *ip = _src;
    npyv_lanetype_f64 *op = _dst;

    const int nlanes = npyv_nlanes_f64;
    const int unroll = nlanes * 4;

    /* Four vectors per pass; everything is loaded before anything is stored. */
    while (len >= unroll) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_f64(ip);
        npyv_f64 a1 = npyv_load_f64(ip + nlanes);
        npyv_f64 a2 = npyv_load_f64(ip + nlanes*2);
        npyv_f64 a3 = npyv_load_f64(ip + nlanes*3);
    #else
        npyv_f64 a0 = npyv_loadn_f64(ip, ssrc);
        npyv_f64 a1 = npyv_loadn_f64(ip + ssrc*nlanes, ssrc);
        npyv_f64 a2 = npyv_loadn_f64(ip + ssrc*nlanes*2, ssrc);
        npyv_f64 a3 = npyv_loadn_f64(ip + ssrc*nlanes*3, ssrc);
    #endif
        npyv_f64 r0 = npyv_trunc_f64(a0);
        npyv_f64 r1 = npyv_trunc_f64(a1);
        npyv_f64 r2 = npyv_trunc_f64(a2);
        npyv_f64 r3 = npyv_trunc_f64(a3);

        npyv_store_f64(op, r0);
        npyv_store_f64(op + nlanes, r1);
        npyv_store_f64(op + nlanes*2, r2);
        npyv_store_f64(op + nlanes*3, r3);

        len -= unroll;
        ip += ssrc*unroll;
        op += sdst*unroll;
    }

    /* One vector per pass for what the unrolled loop left over. */
    while (len >= nlanes) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_f64(ip);
    #else
        npyv_f64 a0 = npyv_loadn_f64(ip, ssrc);
    #endif
        npyv_f64 r0 = npyv_trunc_f64(a0);
        npyv_store_f64(op, r0);

        len -= nlanes;
        ip += ssrc*nlanes;
        op += sdst*nlanes;
    }

    /*
     * Fewer than one full vector remains: use the length-limited load/store.
     * The `tillz` load presumably zero-fills the unused lanes -- TODO confirm.
     */
    if (len > 0) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_tillz_f64(ip, len);
    #else
        npyv_f64 a0 = npyv_loadn_tillz_f64(ip, ssrc, len);
    #endif
        npyv_f64 r0 = npyv_trunc_f64(a0);
        npyv_store_till_f64(op, len, r0);
    }

    npyv_cleanup();
}

#line 112
/*
 * Applies npyv_trunc_f64 to `len` f64 elements read through `_src` and writes
 * the results through `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in f64 elements (they scale pointer
 * arithmetic on npyv_lanetype_f64 pointers), not in bytes.
 * The source always uses the plain vector load, because the template's
 * `CONTIG == CONTIG` test is a tautology. The destination access is picked at
 * preprocessing time: a plain store when NCONTIG == CONTIG, otherwise the
 * strided npyv_storen_* form; both macros are defined outside this function.
 * NOTE(review): the name suffix suggests a contiguous source and a strided
 * destination -- confirm against the dispatching caller.
 */
static void simd_DOUBLE_trunc_CONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *ip = _src;
    npyv_lanetype_f64 *op = _dst;

    const int nlanes = npyv_nlanes_f64;
    const int unroll = nlanes * 2;

    /* Two vectors per pass; everything is loaded before anything is stored. */
    while (len >= unroll) {
        npyv_f64 a0 = npyv_load_f64(ip);
        npyv_f64 a1 = npyv_load_f64(ip + nlanes);

        npyv_f64 r0 = npyv_trunc_f64(a0);
        npyv_f64 r1 = npyv_trunc_f64(a1);

    #if NCONTIG == CONTIG
        npyv_store_f64(op, r0);
        npyv_store_f64(op + nlanes, r1);
    #else
        npyv_storen_f64(op, sdst, r0);
        npyv_storen_f64(op + sdst*nlanes, sdst, r1);
    #endif

        len -= unroll;
        ip += ssrc*unroll;
        op += sdst*unroll;
    }

    /* One vector per pass for what the unrolled loop left over. */
    while (len >= nlanes) {
        npyv_f64 a0 = npyv_load_f64(ip);
        npyv_f64 r0 = npyv_trunc_f64(a0);
    #if NCONTIG == CONTIG
        npyv_store_f64(op, r0);
    #else
        npyv_storen_f64(op, sdst, r0);
    #endif

        len -= nlanes;
        ip += ssrc*nlanes;
        op += sdst*nlanes;
    }

    /*
     * Fewer than one full vector remains: use the length-limited load/store.
     * The `tillz` load presumably zero-fills the unused lanes -- TODO confirm.
     */
    if (len > 0) {
        npyv_f64 a0 = npyv_load_tillz_f64(ip, len);
        npyv_f64 r0 = npyv_trunc_f64(a0);
    #if NCONTIG == CONTIG
        npyv_store_till_f64(op, len, r0);
    #else
        npyv_storen_till_f64(op, sdst, len, r0);
    #endif
    }

    npyv_cleanup();
}

#line 112
/*
 * Applies npyv_trunc_f64 to `len` f64 elements read through `_src` and writes
 * the results through `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in f64 elements (they scale pointer
 * arithmetic on npyv_lanetype_f64 pointers), not in bytes.
 * Both the source and the destination access are picked at preprocessing
 * time: the plain load/store when NCONTIG == CONTIG, otherwise the strided
 * npyv_loadn_* / npyv_storen_* forms. Both macros are defined outside this
 * function.
 * NOTE(review): the name suffix suggests both sides are strided -- confirm
 * against the dispatching caller.
 */
static void simd_DOUBLE_trunc_NCONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *ip = _src;
    npyv_lanetype_f64 *op = _dst;

    const int nlanes = npyv_nlanes_f64;
    const int unroll = nlanes * 2;

    /* Two vectors per pass; everything is loaded before anything is stored. */
    while (len >= unroll) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_f64(ip);
        npyv_f64 a1 = npyv_load_f64(ip + nlanes);
    #else
        npyv_f64 a0 = npyv_loadn_f64(ip, ssrc);
        npyv_f64 a1 = npyv_loadn_f64(ip + ssrc*nlanes, ssrc);
    #endif
        npyv_f64 r0 = npyv_trunc_f64(a0);
        npyv_f64 r1 = npyv_trunc_f64(a1);

    #if NCONTIG == CONTIG
        npyv_store_f64(op, r0);
        npyv_store_f64(op + nlanes, r1);
    #else
        npyv_storen_f64(op, sdst, r0);
        npyv_storen_f64(op + sdst*nlanes, sdst, r1);
    #endif

        len -= unroll;
        ip += ssrc*unroll;
        op += sdst*unroll;
    }

    /* One vector per pass for what the unrolled loop left over. */
    while (len >= nlanes) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_f64(ip);
    #else
        npyv_f64 a0 = npyv_loadn_f64(ip, ssrc);
    #endif
        npyv_f64 r0 = npyv_trunc_f64(a0);
    #if NCONTIG == CONTIG
        npyv_store_f64(op, r0);
    #else
        npyv_storen_f64(op, sdst, r0);
    #endif

        len -= nlanes;
        ip += ssrc*nlanes;
        op += sdst*nlanes;
    }

    /*
     * Fewer than one full vector remains: use the length-limited load/store.
     * The `tillz` load presumably zero-fills the unused lanes -- TODO confirm.
     */
    if (len > 0) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_tillz_f64(ip, len);
    #else
        npyv_f64 a0 = npyv_loadn_tillz_f64(ip, ssrc, len);
    #endif
        npyv_f64 r0 = npyv_trunc_f64(a0);
    #if NCONTIG == CONTIG
        npyv_store_till_f64(op, len, r0);
    #else
        npyv_storen_till_f64(op, sdst, len, r0);
    #endif
    }

    npyv_cleanup();
}


#line 107
#line 112
/*
 * Applies npyv_sqrt_f64 to `len` f64 elements read through `_src` and writes
 * the results through `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in f64 elements; here they only
 * advance the pointers between passes. Every access uses the plain vector
 * load/store, because the template's `CONTIG == CONTIG` tests are
 * tautologies.
 * NOTE(review): the name suffix suggests both sides are contiguous (strides
 * of one element) -- confirm against the dispatching caller.
 */
static void simd_DOUBLE_sqrt_CONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *ip = _src;
    npyv_lanetype_f64 *op = _dst;

    const int nlanes = npyv_nlanes_f64;
    const int unroll = nlanes * 4;

    /* Four vectors per pass; everything is loaded before anything is stored. */
    while (len >= unroll) {
        npyv_f64 a0 = npyv_load_f64(ip);
        npyv_f64 a1 = npyv_load_f64(ip + nlanes);
        npyv_f64 a2 = npyv_load_f64(ip + nlanes*2);
        npyv_f64 a3 = npyv_load_f64(ip + nlanes*3);

        npyv_f64 r0 = npyv_sqrt_f64(a0);
        npyv_f64 r1 = npyv_sqrt_f64(a1);
        npyv_f64 r2 = npyv_sqrt_f64(a2);
        npyv_f64 r3 = npyv_sqrt_f64(a3);

        npyv_store_f64(op, r0);
        npyv_store_f64(op + nlanes, r1);
        npyv_store_f64(op + nlanes*2, r2);
        npyv_store_f64(op + nlanes*3, r3);

        len -= unroll;
        ip += ssrc*unroll;
        op += sdst*unroll;
    }

    /* One vector per pass for what the unrolled loop left over. */
    while (len >= nlanes) {
        npyv_f64 a0 = npyv_load_f64(ip);
        npyv_f64 r0 = npyv_sqrt_f64(a0);
        npyv_store_f64(op, r0);

        len -= nlanes;
        ip += ssrc*nlanes;
        op += sdst*nlanes;
    }

    /*
     * Fewer than one full vector remains: use the length-limited load/store.
     * The `tillz` load presumably zero-fills the unused lanes -- TODO confirm.
     */
    if (len > 0) {
        npyv_f64 a0 = npyv_load_tillz_f64(ip, len);
        npyv_f64 r0 = npyv_sqrt_f64(a0);
        npyv_store_till_f64(op, len, r0);
    }

    npyv_cleanup();
}

#line 112
/**
 * Applies npyv_sqrt_f64 to `len` doubles read from `_src`, writing the
 * results to `_dst`.
 *
 * `ssrc` and `sdst` scale the advance of the lane-typed pointers, so they
 * are counted in elements rather than bytes. Whether the plain or the
 * strided (`loadn`/`storen`) intrinsics get compiled in is decided by the
 * `#if` comparisons on the NCONTIG/CONTIG tags, whose values are defined
 * outside this function.
 */
static void simd_DOUBLE_sqrt_NCONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *ip = _src;
    npyv_lanetype_f64 *op = _dst;

    const int lanes = npyv_nlanes_f64;
    const int span  = lanes * 4;

    // 4x unrolled pass: every load and computation precedes the first store
    while (len >= span) {
    #if NCONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
        npyv_f64 in1 = npyv_load_f64(ip + lanes);
        npyv_f64 in2 = npyv_load_f64(ip + lanes*2);
        npyv_f64 in3 = npyv_load_f64(ip + lanes*3);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
        npyv_f64 in1 = npyv_loadn_f64(ip + ssrc*lanes, ssrc);
        npyv_f64 in2 = npyv_loadn_f64(ip + ssrc*lanes*2, ssrc);
        npyv_f64 in3 = npyv_loadn_f64(ip + ssrc*lanes*3, ssrc);
    #endif
        npyv_f64 out0 = npyv_sqrt_f64(in0);
        npyv_f64 out1 = npyv_sqrt_f64(in1);
        npyv_f64 out2 = npyv_sqrt_f64(in2);
        npyv_f64 out3 = npyv_sqrt_f64(in3);
    #if CONTIG == CONTIG
        npyv_store_f64(op, out0);
        npyv_store_f64(op + lanes, out1);
        npyv_store_f64(op + lanes*2, out2);
        npyv_store_f64(op + lanes*3, out3);
    #else
        npyv_storen_f64(op, sdst, out0);
        npyv_storen_f64(op + sdst*lanes, sdst, out1);
        npyv_storen_f64(op + sdst*lanes*2, sdst, out2);
        npyv_storen_f64(op + sdst*lanes*3, sdst, out3);
    #endif
        len -= span;
        ip += ssrc*span;
        op += sdst*span;
    }

    // remaining whole vectors, one per pass
    while (len >= lanes) {
    #if NCONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
    #endif
        npyv_f64 out0 = npyv_sqrt_f64(in0);
    #if CONTIG == CONTIG
        npyv_store_f64(op, out0);
    #else
        npyv_storen_f64(op, sdst, out0);
    #endif
        len -= lanes;
        ip += ssrc*lanes;
        op += sdst*lanes;
    }

    // fewer than `lanes` elements left: partial load and partial store
    if (len > 0) {
    #if NCONTIG == CONTIG
        npyv_f64 in0 = npyv_load_tillz_f64(ip, len);
    #else
        npyv_f64 in0 = npyv_loadn_tillz_f64(ip, ssrc, len);
    #endif
        npyv_f64 out0 = npyv_sqrt_f64(in0);
    #if CONTIG == CONTIG
        npyv_store_till_f64(op, len, out0);
    #else
        npyv_storen_till_f64(op, sdst, len, out0);
    #endif
    }

    npyv_cleanup();
}

#line 112
/**
 * Applies npyv_sqrt_f64 to `len` doubles read from `_src`, writing the
 * results to `_dst`.
 *
 * `ssrc` and `sdst` scale the advance of the lane-typed pointers, so they
 * are counted in elements rather than bytes. Whether the plain or the
 * strided (`loadn`/`storen`) intrinsics get compiled in is decided by the
 * `#if` comparisons on the NCONTIG/CONTIG tags, whose values are defined
 * outside this function.
 */
static void simd_DOUBLE_sqrt_CONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *ip = _src;
    npyv_lanetype_f64 *op = _dst;

    const int lanes = npyv_nlanes_f64;
    const int span  = lanes * 2;

    // 2x unrolled pass: every load and computation precedes the first store
    while (len >= span) {
    #if CONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
        npyv_f64 in1 = npyv_load_f64(ip + lanes);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
        npyv_f64 in1 = npyv_loadn_f64(ip + ssrc*lanes, ssrc);
    #endif
        npyv_f64 out0 = npyv_sqrt_f64(in0);
        npyv_f64 out1 = npyv_sqrt_f64(in1);
    #if NCONTIG == CONTIG
        npyv_store_f64(op, out0);
        npyv_store_f64(op + lanes, out1);
    #else
        npyv_storen_f64(op, sdst, out0);
        npyv_storen_f64(op + sdst*lanes, sdst, out1);
    #endif
        len -= span;
        ip += ssrc*span;
        op += sdst*span;
    }

    // remaining whole vectors, one per pass
    while (len >= lanes) {
    #if CONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
    #endif
        npyv_f64 out0 = npyv_sqrt_f64(in0);
    #if NCONTIG == CONTIG
        npyv_store_f64(op, out0);
    #else
        npyv_storen_f64(op, sdst, out0);
    #endif
        len -= lanes;
        ip += ssrc*lanes;
        op += sdst*lanes;
    }

    // fewer than `lanes` elements left: partial load and partial store
    if (len > 0) {
    #if CONTIG == CONTIG
        npyv_f64 in0 = npyv_load_tillz_f64(ip, len);
    #else
        npyv_f64 in0 = npyv_loadn_tillz_f64(ip, ssrc, len);
    #endif
        npyv_f64 out0 = npyv_sqrt_f64(in0);
    #if NCONTIG == CONTIG
        npyv_store_till_f64(op, len, out0);
    #else
        npyv_storen_till_f64(op, sdst, len, out0);
    #endif
    }

    npyv_cleanup();
}

#line 112
/**
 * Applies npyv_sqrt_f64 to `len` doubles read from `_src`, writing the
 * results to `_dst`.
 *
 * `ssrc` and `sdst` scale the advance of the lane-typed pointers, so they
 * are counted in elements rather than bytes. Whether the plain or the
 * strided (`loadn`/`storen`) intrinsics get compiled in is decided by the
 * `#if` comparisons on the NCONTIG/CONTIG tags, whose values are defined
 * outside this function.
 */
static void simd_DOUBLE_sqrt_NCONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *ip = _src;
    npyv_lanetype_f64 *op = _dst;

    const int lanes = npyv_nlanes_f64;
    const int span  = lanes * 2;

    // 2x unrolled pass: every load and computation precedes the first store
    while (len >= span) {
    #if NCONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
        npyv_f64 in1 = npyv_load_f64(ip + lanes);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
        npyv_f64 in1 = npyv_loadn_f64(ip + ssrc*lanes, ssrc);
    #endif
        npyv_f64 out0 = npyv_sqrt_f64(in0);
        npyv_f64 out1 = npyv_sqrt_f64(in1);
    #if NCONTIG == CONTIG
        npyv_store_f64(op, out0);
        npyv_store_f64(op + lanes, out1);
    #else
        npyv_storen_f64(op, sdst, out0);
        npyv_storen_f64(op + sdst*lanes, sdst, out1);
    #endif
        len -= span;
        ip += ssrc*span;
        op += sdst*span;
    }

    // remaining whole vectors, one per pass
    while (len >= lanes) {
    #if NCONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
    #endif
        npyv_f64 out0 = npyv_sqrt_f64(in0);
    #if NCONTIG == CONTIG
        npyv_store_f64(op, out0);
    #else
        npyv_storen_f64(op, sdst, out0);
    #endif
        len -= lanes;
        ip += ssrc*lanes;
        op += sdst*lanes;
    }

    // fewer than `lanes` elements left: partial load and partial store
    if (len > 0) {
    #if NCONTIG == CONTIG
        npyv_f64 in0 = npyv_load_tillz_f64(ip, len);
    #else
        npyv_f64 in0 = npyv_loadn_tillz_f64(ip, ssrc, len);
    #endif
        npyv_f64 out0 = npyv_sqrt_f64(in0);
    #if NCONTIG == CONTIG
        npyv_store_till_f64(op, len, out0);
    #else
        npyv_storen_till_f64(op, sdst, len, out0);
    #endif
    }

    npyv_cleanup();
}


#line 107
#line 112
/**
 * Applies npyv_abs_f64 to `len` doubles read from `_src`, writing the
 * results to `_dst`.
 *
 * `ssrc` and `sdst` scale the advance of the lane-typed pointers, so they
 * are counted in elements rather than bytes. Whether the plain or the
 * strided (`loadn`/`storen`) intrinsics get compiled in is decided by the
 * `#if` comparisons on the CONTIG tag, whose value is defined outside this
 * function.
 */
static void simd_DOUBLE_absolute_CONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *ip = _src;
    npyv_lanetype_f64 *op = _dst;

    const int lanes = npyv_nlanes_f64;
    const int span  = lanes * 4;

    // 4x unrolled pass: every load and computation precedes the first store
    while (len >= span) {
    #if CONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
        npyv_f64 in1 = npyv_load_f64(ip + lanes);
        npyv_f64 in2 = npyv_load_f64(ip + lanes*2);
        npyv_f64 in3 = npyv_load_f64(ip + lanes*3);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
        npyv_f64 in1 = npyv_loadn_f64(ip + ssrc*lanes, ssrc);
        npyv_f64 in2 = npyv_loadn_f64(ip + ssrc*lanes*2, ssrc);
        npyv_f64 in3 = npyv_loadn_f64(ip + ssrc*lanes*3, ssrc);
    #endif
        npyv_f64 out0 = npyv_abs_f64(in0);
        npyv_f64 out1 = npyv_abs_f64(in1);
        npyv_f64 out2 = npyv_abs_f64(in2);
        npyv_f64 out3 = npyv_abs_f64(in3);
    #if CONTIG == CONTIG
        npyv_store_f64(op, out0);
        npyv_store_f64(op + lanes, out1);
        npyv_store_f64(op + lanes*2, out2);
        npyv_store_f64(op + lanes*3, out3);
    #else
        npyv_storen_f64(op, sdst, out0);
        npyv_storen_f64(op + sdst*lanes, sdst, out1);
        npyv_storen_f64(op + sdst*lanes*2, sdst, out2);
        npyv_storen_f64(op + sdst*lanes*3, sdst, out3);
    #endif
        len -= span;
        ip += ssrc*span;
        op += sdst*span;
    }

    // remaining whole vectors, one per pass
    while (len >= lanes) {
    #if CONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
    #endif
        npyv_f64 out0 = npyv_abs_f64(in0);
    #if CONTIG == CONTIG
        npyv_store_f64(op, out0);
    #else
        npyv_storen_f64(op, sdst, out0);
    #endif
        len -= lanes;
        ip += ssrc*lanes;
        op += sdst*lanes;
    }

    // fewer than `lanes` elements left: partial load and partial store
    if (len > 0) {
    #if CONTIG == CONTIG
        npyv_f64 in0 = npyv_load_tillz_f64(ip, len);
    #else
        npyv_f64 in0 = npyv_loadn_tillz_f64(ip, ssrc, len);
    #endif
        npyv_f64 out0 = npyv_abs_f64(in0);
    #if CONTIG == CONTIG
        npyv_store_till_f64(op, len, out0);
    #else
        npyv_storen_till_f64(op, sdst, len, out0);
    #endif
    }

    npyv_cleanup();
}

#line 112
/**
 * Applies npyv_abs_f64 to `len` doubles read from `_src`, writing the
 * results to `_dst`.
 *
 * `ssrc` and `sdst` scale the advance of the lane-typed pointers, so they
 * are counted in elements rather than bytes. Whether the plain or the
 * strided (`loadn`/`storen`) intrinsics get compiled in is decided by the
 * `#if` comparisons on the NCONTIG/CONTIG tags, whose values are defined
 * outside this function.
 */
static void simd_DOUBLE_absolute_NCONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *ip = _src;
    npyv_lanetype_f64 *op = _dst;

    const int lanes = npyv_nlanes_f64;
    const int span  = lanes * 4;

    // 4x unrolled pass: every load and computation precedes the first store
    while (len >= span) {
    #if NCONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
        npyv_f64 in1 = npyv_load_f64(ip + lanes);
        npyv_f64 in2 = npyv_load_f64(ip + lanes*2);
        npyv_f64 in3 = npyv_load_f64(ip + lanes*3);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
        npyv_f64 in1 = npyv_loadn_f64(ip + ssrc*lanes, ssrc);
        npyv_f64 in2 = npyv_loadn_f64(ip + ssrc*lanes*2, ssrc);
        npyv_f64 in3 = npyv_loadn_f64(ip + ssrc*lanes*3, ssrc);
    #endif
        npyv_f64 out0 = npyv_abs_f64(in0);
        npyv_f64 out1 = npyv_abs_f64(in1);
        npyv_f64 out2 = npyv_abs_f64(in2);
        npyv_f64 out3 = npyv_abs_f64(in3);
    #if CONTIG == CONTIG
        npyv_store_f64(op, out0);
        npyv_store_f64(op + lanes, out1);
        npyv_store_f64(op + lanes*2, out2);
        npyv_store_f64(op + lanes*3, out3);
    #else
        npyv_storen_f64(op, sdst, out0);
        npyv_storen_f64(op + sdst*lanes, sdst, out1);
        npyv_storen_f64(op + sdst*lanes*2, sdst, out2);
        npyv_storen_f64(op + sdst*lanes*3, sdst, out3);
    #endif
        len -= span;
        ip += ssrc*span;
        op += sdst*span;
    }

    // remaining whole vectors, one per pass
    while (len >= lanes) {
    #if NCONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
    #endif
        npyv_f64 out0 = npyv_abs_f64(in0);
    #if CONTIG == CONTIG
        npyv_store_f64(op, out0);
    #else
        npyv_storen_f64(op, sdst, out0);
    #endif
        len -= lanes;
        ip += ssrc*lanes;
        op += sdst*lanes;
    }

    // fewer than `lanes` elements left: partial load and partial store
    if (len > 0) {
    #if NCONTIG == CONTIG
        npyv_f64 in0 = npyv_load_tillz_f64(ip, len);
    #else
        npyv_f64 in0 = npyv_loadn_tillz_f64(ip, ssrc, len);
    #endif
        npyv_f64 out0 = npyv_abs_f64(in0);
    #if CONTIG == CONTIG
        npyv_store_till_f64(op, len, out0);
    #else
        npyv_storen_till_f64(op, sdst, len, out0);
    #endif
    }

    npyv_cleanup();
}

#line 112
/**
 * Applies npyv_abs_f64 to `len` doubles read from `_src`, writing the
 * results to `_dst`.
 *
 * `ssrc` and `sdst` scale the advance of the lane-typed pointers, so they
 * are counted in elements rather than bytes. Whether the plain or the
 * strided (`loadn`/`storen`) intrinsics get compiled in is decided by the
 * `#if` comparisons on the NCONTIG/CONTIG tags, whose values are defined
 * outside this function.
 */
static void simd_DOUBLE_absolute_CONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *ip = _src;
    npyv_lanetype_f64 *op = _dst;

    const int lanes = npyv_nlanes_f64;
    const int span  = lanes * 2;

    // 2x unrolled pass: every load and computation precedes the first store
    while (len >= span) {
    #if CONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
        npyv_f64 in1 = npyv_load_f64(ip + lanes);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
        npyv_f64 in1 = npyv_loadn_f64(ip + ssrc*lanes, ssrc);
    #endif
        npyv_f64 out0 = npyv_abs_f64(in0);
        npyv_f64 out1 = npyv_abs_f64(in1);
    #if NCONTIG == CONTIG
        npyv_store_f64(op, out0);
        npyv_store_f64(op + lanes, out1);
    #else
        npyv_storen_f64(op, sdst, out0);
        npyv_storen_f64(op + sdst*lanes, sdst, out1);
    #endif
        len -= span;
        ip += ssrc*span;
        op += sdst*span;
    }

    // remaining whole vectors, one per pass
    while (len >= lanes) {
    #if CONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
    #endif
        npyv_f64 out0 = npyv_abs_f64(in0);
    #if NCONTIG == CONTIG
        npyv_store_f64(op, out0);
    #else
        npyv_storen_f64(op, sdst, out0);
    #endif
        len -= lanes;
        ip += ssrc*lanes;
        op += sdst*lanes;
    }

    // fewer than `lanes` elements left: partial load and partial store
    if (len > 0) {
    #if CONTIG == CONTIG
        npyv_f64 in0 = npyv_load_tillz_f64(ip, len);
    #else
        npyv_f64 in0 = npyv_loadn_tillz_f64(ip, ssrc, len);
    #endif
        npyv_f64 out0 = npyv_abs_f64(in0);
    #if NCONTIG == CONTIG
        npyv_store_till_f64(op, len, out0);
    #else
        npyv_storen_till_f64(op, sdst, len, out0);
    #endif
    }

    npyv_cleanup();
}

#line 112
/**
 * Applies npyv_abs_f64 to `len` doubles read from `_src`, writing the
 * results to `_dst`.
 *
 * `ssrc` and `sdst` scale the advance of the lane-typed pointers, so they
 * are counted in elements rather than bytes. Whether the plain or the
 * strided (`loadn`/`storen`) intrinsics get compiled in is decided by the
 * `#if` comparisons on the NCONTIG/CONTIG tags, whose values are defined
 * outside this function.
 */
static void simd_DOUBLE_absolute_NCONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *ip = _src;
    npyv_lanetype_f64 *op = _dst;

    const int lanes = npyv_nlanes_f64;
    const int span  = lanes * 2;

    // 2x unrolled pass: every load and computation precedes the first store
    while (len >= span) {
    #if NCONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
        npyv_f64 in1 = npyv_load_f64(ip + lanes);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
        npyv_f64 in1 = npyv_loadn_f64(ip + ssrc*lanes, ssrc);
    #endif
        npyv_f64 out0 = npyv_abs_f64(in0);
        npyv_f64 out1 = npyv_abs_f64(in1);
    #if NCONTIG == CONTIG
        npyv_store_f64(op, out0);
        npyv_store_f64(op + lanes, out1);
    #else
        npyv_storen_f64(op, sdst, out0);
        npyv_storen_f64(op + sdst*lanes, sdst, out1);
    #endif
        len -= span;
        ip += ssrc*span;
        op += sdst*span;
    }

    // remaining whole vectors, one per pass
    while (len >= lanes) {
    #if NCONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
    #endif
        npyv_f64 out0 = npyv_abs_f64(in0);
    #if NCONTIG == CONTIG
        npyv_store_f64(op, out0);
    #else
        npyv_storen_f64(op, sdst, out0);
    #endif
        len -= lanes;
        ip += ssrc*lanes;
        op += sdst*lanes;
    }

    // fewer than `lanes` elements left: partial load and partial store
    if (len > 0) {
    #if NCONTIG == CONTIG
        npyv_f64 in0 = npyv_load_tillz_f64(ip, len);
    #else
        npyv_f64 in0 = npyv_loadn_tillz_f64(ip, ssrc, len);
    #endif
        npyv_f64 out0 = npyv_abs_f64(in0);
    #if NCONTIG == CONTIG
        npyv_store_till_f64(op, len, out0);
    #else
        npyv_storen_till_f64(op, sdst, len, out0);
    #endif
    }

    npyv_cleanup();
}


#line 107
#line 112
/**
 * Applies npyv_square_f64 to `len` doubles read from `_src`, writing the
 * results to `_dst`.
 *
 * `ssrc` and `sdst` scale the advance of the lane-typed pointers, so they
 * are counted in elements rather than bytes. Whether the plain or the
 * strided (`loadn`/`storen`) intrinsics get compiled in is decided by the
 * `#if` comparisons on the CONTIG tag, whose value is defined outside this
 * function.
 */
static void simd_DOUBLE_square_CONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *ip = _src;
    npyv_lanetype_f64 *op = _dst;

    const int lanes = npyv_nlanes_f64;
    const int span  = lanes * 4;

    // 4x unrolled pass: every load and computation precedes the first store
    while (len >= span) {
    #if CONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
        npyv_f64 in1 = npyv_load_f64(ip + lanes);
        npyv_f64 in2 = npyv_load_f64(ip + lanes*2);
        npyv_f64 in3 = npyv_load_f64(ip + lanes*3);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
        npyv_f64 in1 = npyv_loadn_f64(ip + ssrc*lanes, ssrc);
        npyv_f64 in2 = npyv_loadn_f64(ip + ssrc*lanes*2, ssrc);
        npyv_f64 in3 = npyv_loadn_f64(ip + ssrc*lanes*3, ssrc);
    #endif
        npyv_f64 out0 = npyv_square_f64(in0);
        npyv_f64 out1 = npyv_square_f64(in1);
        npyv_f64 out2 = npyv_square_f64(in2);
        npyv_f64 out3 = npyv_square_f64(in3);
    #if CONTIG == CONTIG
        npyv_store_f64(op, out0);
        npyv_store_f64(op + lanes, out1);
        npyv_store_f64(op + lanes*2, out2);
        npyv_store_f64(op + lanes*3, out3);
    #else
        npyv_storen_f64(op, sdst, out0);
        npyv_storen_f64(op + sdst*lanes, sdst, out1);
        npyv_storen_f64(op + sdst*lanes*2, sdst, out2);
        npyv_storen_f64(op + sdst*lanes*3, sdst, out3);
    #endif
        len -= span;
        ip += ssrc*span;
        op += sdst*span;
    }

    // remaining whole vectors, one per pass
    while (len >= lanes) {
    #if CONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
    #endif
        npyv_f64 out0 = npyv_square_f64(in0);
    #if CONTIG == CONTIG
        npyv_store_f64(op, out0);
    #else
        npyv_storen_f64(op, sdst, out0);
    #endif
        len -= lanes;
        ip += ssrc*lanes;
        op += sdst*lanes;
    }

    // fewer than `lanes` elements left: partial load and partial store
    if (len > 0) {
    #if CONTIG == CONTIG
        npyv_f64 in0 = npyv_load_tillz_f64(ip, len);
    #else
        npyv_f64 in0 = npyv_loadn_tillz_f64(ip, ssrc, len);
    #endif
        npyv_f64 out0 = npyv_square_f64(in0);
    #if CONTIG == CONTIG
        npyv_store_till_f64(op, len, out0);
    #else
        npyv_storen_till_f64(op, sdst, len, out0);
    #endif
    }

    npyv_cleanup();
}

#line 112
/**
 * Applies npyv_square_f64 to `len` doubles read from `_src`, writing the
 * results to `_dst`.
 *
 * `ssrc` and `sdst` scale the advance of the lane-typed pointers, so they
 * are counted in elements rather than bytes. Whether the plain or the
 * strided (`loadn`/`storen`) intrinsics get compiled in is decided by the
 * `#if` comparisons on the NCONTIG/CONTIG tags, whose values are defined
 * outside this function.
 */
static void simd_DOUBLE_square_NCONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *ip = _src;
    npyv_lanetype_f64 *op = _dst;

    const int lanes = npyv_nlanes_f64;
    const int span  = lanes * 4;

    // 4x unrolled pass: every load and computation precedes the first store
    while (len >= span) {
    #if NCONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
        npyv_f64 in1 = npyv_load_f64(ip + lanes);
        npyv_f64 in2 = npyv_load_f64(ip + lanes*2);
        npyv_f64 in3 = npyv_load_f64(ip + lanes*3);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
        npyv_f64 in1 = npyv_loadn_f64(ip + ssrc*lanes, ssrc);
        npyv_f64 in2 = npyv_loadn_f64(ip + ssrc*lanes*2, ssrc);
        npyv_f64 in3 = npyv_loadn_f64(ip + ssrc*lanes*3, ssrc);
    #endif
        npyv_f64 out0 = npyv_square_f64(in0);
        npyv_f64 out1 = npyv_square_f64(in1);
        npyv_f64 out2 = npyv_square_f64(in2);
        npyv_f64 out3 = npyv_square_f64(in3);
    #if CONTIG == CONTIG
        npyv_store_f64(op, out0);
        npyv_store_f64(op + lanes, out1);
        npyv_store_f64(op + lanes*2, out2);
        npyv_store_f64(op + lanes*3, out3);
    #else
        npyv_storen_f64(op, sdst, out0);
        npyv_storen_f64(op + sdst*lanes, sdst, out1);
        npyv_storen_f64(op + sdst*lanes*2, sdst, out2);
        npyv_storen_f64(op + sdst*lanes*3, sdst, out3);
    #endif
        len -= span;
        ip += ssrc*span;
        op += sdst*span;
    }

    // remaining whole vectors, one per pass
    while (len >= lanes) {
    #if NCONTIG == CONTIG
        npyv_f64 in0 = npyv_load_f64(ip);
    #else
        npyv_f64 in0 = npyv_loadn_f64(ip, ssrc);
    #endif
        npyv_f64 out0 = npyv_square_f64(in0);
    #if CONTIG == CONTIG
        npyv_store_f64(op, out0);
    #else
        npyv_storen_f64(op, sdst, out0);
    #endif
        len -= lanes;
        ip += ssrc*lanes;
        op += sdst*lanes;
    }

    // fewer than `lanes` elements left: partial load and partial store
    if (len > 0) {
    #if NCONTIG == CONTIG
        npyv_f64 in0 = npyv_load_tillz_f64(ip, len);
    #else
        npyv_f64 in0 = npyv_loadn_tillz_f64(ip, ssrc, len);
    #endif
        npyv_f64 out0 = npyv_square_f64(in0);
    #if CONTIG == CONTIG
        npyv_store_till_f64(op, len, out0);
    #else
        npyv_storen_till_f64(op, sdst, len, out0);
    #endif
    }

    npyv_cleanup();
}

#line 112
/*
 * Squares `len` f64 elements read from `_src` and writes the results to `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in f64 elements: after every pass both
 * pointers advance by stride * elements-consumed. The input is always read
 * with the non-strided load intrinsics; the store flavour is picked at
 * preprocessing time from the NCONTIG/CONTIG macros, which are defined outside
 * this block.
 */
static void simd_DOUBLE_square_CONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *in = _src;
    npyv_lanetype_f64 *out = _dst;

    const int lanes = npyv_nlanes_f64;
    const int lanes_x2 = lanes * 2;

    // main loop: two vectors per pass, everything is computed before any store
    while (len >= lanes_x2) {
        npyv_f64 a0 = npyv_load_f64(in);
        npyv_f64 a1 = npyv_load_f64(in + lanes);

        npyv_f64 r0 = npyv_square_f64(a0);
        npyv_f64 r1 = npyv_square_f64(a1);

    #if NCONTIG == CONTIG
        npyv_store_f64(out, r0);
        npyv_store_f64(out + lanes, r1);
    #else
        npyv_storen_f64(out, sdst, r0);
        npyv_storen_f64(out + sdst*lanes, sdst, r1);
    #endif

        len -= lanes_x2;
        in  += ssrc*lanes_x2;
        out += sdst*lanes_x2;
    }

    // one vector per pass
    while (len >= lanes) {
        npyv_f64 a0 = npyv_load_f64(in);
        npyv_f64 r0 = npyv_square_f64(a0);
    #if NCONTIG == CONTIG
        npyv_store_f64(out, r0);
    #else
        npyv_storen_f64(out, sdst, r0);
    #endif

        len -= lanes;
        in  += ssrc*lanes;
        out += sdst*lanes;
    }

    // leftover elements that do not fill a whole vector
    if (len > 0) {
        // NOTE(review): the "tillz" load presumably zero-fills the lanes past
        // `len` -- confirm against the npyv API.
        npyv_f64 a0 = npyv_load_tillz_f64(in, len);
        npyv_f64 r0 = npyv_square_f64(a0);
    #if NCONTIG == CONTIG
        npyv_store_till_f64(out, len, r0);
    #else
        npyv_storen_till_f64(out, sdst, len, r0);
    #endif
    }

    npyv_cleanup();
}

#line 112
/*
 * Squares `len` f64 elements read from `_src` and writes the results to `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in f64 elements: after every pass both
 * pointers advance by stride * elements-consumed. Whether the strided or the
 * non-strided load/store intrinsics are used is decided at preprocessing time
 * from the NCONTIG/CONTIG macros, which are defined outside this block.
 */
static void simd_DOUBLE_square_NCONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *in = _src;
    npyv_lanetype_f64 *out = _dst;

    const int lanes = npyv_nlanes_f64;
    const int lanes_x2 = lanes * 2;

    // main loop: two vectors per pass, everything is computed before any store
    while (len >= lanes_x2) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_f64(in);
        npyv_f64 a1 = npyv_load_f64(in + lanes);
    #else
        npyv_f64 a0 = npyv_loadn_f64(in, ssrc);
        npyv_f64 a1 = npyv_loadn_f64(in + ssrc*lanes, ssrc);
    #endif

        npyv_f64 r0 = npyv_square_f64(a0);
        npyv_f64 r1 = npyv_square_f64(a1);

    #if NCONTIG == CONTIG
        npyv_store_f64(out, r0);
        npyv_store_f64(out + lanes, r1);
    #else
        npyv_storen_f64(out, sdst, r0);
        npyv_storen_f64(out + sdst*lanes, sdst, r1);
    #endif

        len -= lanes_x2;
        in  += ssrc*lanes_x2;
        out += sdst*lanes_x2;
    }

    // one vector per pass
    while (len >= lanes) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_f64(in);
    #else
        npyv_f64 a0 = npyv_loadn_f64(in, ssrc);
    #endif
        npyv_f64 r0 = npyv_square_f64(a0);
    #if NCONTIG == CONTIG
        npyv_store_f64(out, r0);
    #else
        npyv_storen_f64(out, sdst, r0);
    #endif

        len -= lanes;
        in  += ssrc*lanes;
        out += sdst*lanes;
    }

    // leftover elements that do not fill a whole vector
    if (len > 0) {
        // NOTE(review): the "tillz" loads presumably zero-fill the lanes past
        // `len` -- confirm against the npyv API.
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_tillz_f64(in, len);
    #else
        npyv_f64 a0 = npyv_loadn_tillz_f64(in, ssrc, len);
    #endif
        npyv_f64 r0 = npyv_square_f64(a0);
    #if NCONTIG == CONTIG
        npyv_store_till_f64(out, len, r0);
    #else
        npyv_storen_till_f64(out, sdst, len, r0);
    #endif
    }

    npyv_cleanup();
}


#line 107
#line 112
/*
 * Computes the reciprocal of `len` f64 elements read from `_src` and writes
 * the results to `_dst`, using the non-strided load/store intrinsics only.
 *
 * `ssrc` and `sdst` are strides counted in f64 elements; they are used solely
 * to advance the two pointers after every pass.
 */
static void simd_DOUBLE_reciprocal_CONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *in = _src;
    npyv_lanetype_f64 *out = _dst;

    const int lanes = npyv_nlanes_f64;
    const int lanes_x4 = lanes * 4;

    // main loop: four vectors per pass, everything is computed before any store
    while (len >= lanes_x4) {
        npyv_f64 a0 = npyv_load_f64(in);
        npyv_f64 a1 = npyv_load_f64(in + lanes);
        npyv_f64 a2 = npyv_load_f64(in + lanes*2);
        npyv_f64 a3 = npyv_load_f64(in + lanes*3);

        npyv_f64 r0 = npyv_recip_f64(a0);
        npyv_f64 r1 = npyv_recip_f64(a1);
        npyv_f64 r2 = npyv_recip_f64(a2);
        npyv_f64 r3 = npyv_recip_f64(a3);

        npyv_store_f64(out, r0);
        npyv_store_f64(out + lanes, r1);
        npyv_store_f64(out + lanes*2, r2);
        npyv_store_f64(out + lanes*3, r3);

        len -= lanes_x4;
        in  += ssrc*lanes_x4;
        out += sdst*lanes_x4;
    }

    // one vector per pass
    while (len >= lanes) {
        npyv_f64 a0 = npyv_load_f64(in);
        npyv_f64 r0 = npyv_recip_f64(a0);
        npyv_store_f64(out, r0);

        len -= lanes;
        in  += ssrc*lanes;
        out += sdst*lanes;
    }

    // leftover elements that do not fill a whole vector
    if (len > 0) {
        // NOTE(review): the trailing 1 is presumably the fill value for the
        // lanes past `len`, keeping the reciprocal away from a division by
        // zero there -- confirm against the npyv API.
        npyv_f64 a0 = npyv_load_till_f64(in, len, 1);
        npyv_f64 r0 = npyv_recip_f64(a0);
        npyv_store_till_f64(out, len, r0);
    }

    npyv_cleanup();
}

#line 112
/*
 * Computes the reciprocal of `len` f64 elements read from `_src` and writes
 * the results to `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in f64 elements: after every pass both
 * pointers advance by stride * elements-consumed. The output is always written
 * with the non-strided store intrinsics; the load flavour is picked at
 * preprocessing time from the NCONTIG/CONTIG macros, which are defined outside
 * this block.
 */
static void simd_DOUBLE_reciprocal_NCONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *in = _src;
    npyv_lanetype_f64 *out = _dst;

    const int lanes = npyv_nlanes_f64;
    const int lanes_x4 = lanes * 4;

    // main loop: four vectors per pass, everything is computed before any store
    while (len >= lanes_x4) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_f64(in);
        npyv_f64 a1 = npyv_load_f64(in + lanes);
        npyv_f64 a2 = npyv_load_f64(in + lanes*2);
        npyv_f64 a3 = npyv_load_f64(in + lanes*3);
    #else
        npyv_f64 a0 = npyv_loadn_f64(in, ssrc);
        npyv_f64 a1 = npyv_loadn_f64(in + ssrc*lanes, ssrc);
        npyv_f64 a2 = npyv_loadn_f64(in + ssrc*lanes*2, ssrc);
        npyv_f64 a3 = npyv_loadn_f64(in + ssrc*lanes*3, ssrc);
    #endif

        npyv_f64 r0 = npyv_recip_f64(a0);
        npyv_f64 r1 = npyv_recip_f64(a1);
        npyv_f64 r2 = npyv_recip_f64(a2);
        npyv_f64 r3 = npyv_recip_f64(a3);

        npyv_store_f64(out, r0);
        npyv_store_f64(out + lanes, r1);
        npyv_store_f64(out + lanes*2, r2);
        npyv_store_f64(out + lanes*3, r3);

        len -= lanes_x4;
        in  += ssrc*lanes_x4;
        out += sdst*lanes_x4;
    }

    // one vector per pass
    while (len >= lanes) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_f64(in);
    #else
        npyv_f64 a0 = npyv_loadn_f64(in, ssrc);
    #endif
        npyv_f64 r0 = npyv_recip_f64(a0);
        npyv_store_f64(out, r0);

        len -= lanes;
        in  += ssrc*lanes;
        out += sdst*lanes;
    }

    // leftover elements that do not fill a whole vector
    if (len > 0) {
        // NOTE(review): the trailing 1 is presumably the fill value for the
        // lanes past `len`, keeping the reciprocal away from a division by
        // zero there -- confirm against the npyv API.
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_till_f64(in, len, 1);
    #else
        npyv_f64 a0 = npyv_loadn_till_f64(in, ssrc, len, 1);
    #endif
        npyv_f64 r0 = npyv_recip_f64(a0);
        npyv_store_till_f64(out, len, r0);
    }

    npyv_cleanup();
}

#line 112
/*
 * Computes the reciprocal of `len` f64 elements read from `_src` and writes
 * the results to `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in f64 elements: after every pass both
 * pointers advance by stride * elements-consumed. The input is always read
 * with the non-strided load intrinsics; the store flavour is picked at
 * preprocessing time from the NCONTIG/CONTIG macros, which are defined outside
 * this block.
 */
static void simd_DOUBLE_reciprocal_CONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *in = _src;
    npyv_lanetype_f64 *out = _dst;

    const int lanes = npyv_nlanes_f64;
    const int lanes_x2 = lanes * 2;

    // main loop: two vectors per pass, everything is computed before any store
    while (len >= lanes_x2) {
        npyv_f64 a0 = npyv_load_f64(in);
        npyv_f64 a1 = npyv_load_f64(in + lanes);

        npyv_f64 r0 = npyv_recip_f64(a0);
        npyv_f64 r1 = npyv_recip_f64(a1);

    #if NCONTIG == CONTIG
        npyv_store_f64(out, r0);
        npyv_store_f64(out + lanes, r1);
    #else
        npyv_storen_f64(out, sdst, r0);
        npyv_storen_f64(out + sdst*lanes, sdst, r1);
    #endif

        len -= lanes_x2;
        in  += ssrc*lanes_x2;
        out += sdst*lanes_x2;
    }

    // one vector per pass
    while (len >= lanes) {
        npyv_f64 a0 = npyv_load_f64(in);
        npyv_f64 r0 = npyv_recip_f64(a0);
    #if NCONTIG == CONTIG
        npyv_store_f64(out, r0);
    #else
        npyv_storen_f64(out, sdst, r0);
    #endif

        len -= lanes;
        in  += ssrc*lanes;
        out += sdst*lanes;
    }

    // leftover elements that do not fill a whole vector
    if (len > 0) {
        // NOTE(review): the trailing 1 is presumably the fill value for the
        // lanes past `len`, keeping the reciprocal away from a division by
        // zero there -- confirm against the npyv API.
        npyv_f64 a0 = npyv_load_till_f64(in, len, 1);
        npyv_f64 r0 = npyv_recip_f64(a0);
    #if NCONTIG == CONTIG
        npyv_store_till_f64(out, len, r0);
    #else
        npyv_storen_till_f64(out, sdst, len, r0);
    #endif
    }

    npyv_cleanup();
}

#line 112
/*
 * Computes the reciprocal of `len` f64 elements read from `_src` and writes
 * the results to `_dst`.
 *
 * `ssrc` and `sdst` are strides counted in f64 elements: after every pass both
 * pointers advance by stride * elements-consumed. Whether the strided or the
 * non-strided load/store intrinsics are used is decided at preprocessing time
 * from the NCONTIG/CONTIG macros, which are defined outside this block.
 */
static void simd_DOUBLE_reciprocal_NCONTIG_NCONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f64 *in = _src;
    npyv_lanetype_f64 *out = _dst;

    const int lanes = npyv_nlanes_f64;
    const int lanes_x2 = lanes * 2;

    // main loop: two vectors per pass, everything is computed before any store
    while (len >= lanes_x2) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_f64(in);
        npyv_f64 a1 = npyv_load_f64(in + lanes);
    #else
        npyv_f64 a0 = npyv_loadn_f64(in, ssrc);
        npyv_f64 a1 = npyv_loadn_f64(in + ssrc*lanes, ssrc);
    #endif

        npyv_f64 r0 = npyv_recip_f64(a0);
        npyv_f64 r1 = npyv_recip_f64(a1);

    #if NCONTIG == CONTIG
        npyv_store_f64(out, r0);
        npyv_store_f64(out + lanes, r1);
    #else
        npyv_storen_f64(out, sdst, r0);
        npyv_storen_f64(out + sdst*lanes, sdst, r1);
    #endif

        len -= lanes_x2;
        in  += ssrc*lanes_x2;
        out += sdst*lanes_x2;
    }

    // one vector per pass
    while (len >= lanes) {
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_f64(in);
    #else
        npyv_f64 a0 = npyv_loadn_f64(in, ssrc);
    #endif
        npyv_f64 r0 = npyv_recip_f64(a0);
    #if NCONTIG == CONTIG
        npyv_store_f64(out, r0);
    #else
        npyv_storen_f64(out, sdst, r0);
    #endif

        len -= lanes;
        in  += ssrc*lanes;
        out += sdst*lanes;
    }

    // leftover elements that do not fill a whole vector
    if (len > 0) {
        // NOTE(review): the trailing 1 is presumably the fill value for the
        // lanes past `len`, keeping the reciprocal away from a division by
        // zero there -- confirm against the npyv API.
    #if NCONTIG == CONTIG
        npyv_f64 a0 = npyv_load_till_f64(in, len, 1);
    #else
        npyv_f64 a0 = npyv_loadn_till_f64(in, ssrc, len, 1);
    #endif
        npyv_f64 r0 = npyv_recip_f64(a0);
    #if NCONTIG == CONTIG
        npyv_store_till_f64(out, len, r0);
    #else
        npyv_storen_till_f64(out, sdst, len, r0);
    #endif
    }

    npyv_cleanup();
}


#endif // NPY_SIMD_F64


/********************************************************************************
 ** Defining ufunc inner functions
 ********************************************************************************/
#line 201
#line 206
/*
 * Inner loop of the float32 `rint` ufunc.
 *
 * args[0] / args[1] are the input / output buffers, steps[0] / steps[1] their
 * strides in bytes and dimensions[0] the number of elements. When SIMD for
 * f32 is available and the operands qualify, the work is handed to one of the
 * four simd_FLOAT_rint_* kernels selected by which side has a unit stride;
 * otherwise the elements are processed one at a time. No floating-point
 * status clearing is performed for this operation.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_rint)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    const char *src = args[0];
    char *dst = args[1];
    const npy_intp src_step = steps[0];
    const npy_intp dst_step = steps[1];
    npy_intp len = dimensions[0];

#if NPY_SIMD_F32
    const int lsize = sizeof(npyv_lanetype_f32);
    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0));
    // overlapping operands always take the element-by-element loop below
    if (!is_mem_overlap(src, src_step, dst, dst_step, len)) {
        // byte strides -> element strides
        const npy_intp ssrc = src_step / lsize;
        const npy_intp sdst = dst_step / lsize;
        if (npyv_loadable_stride_f32(ssrc) && npyv_storable_stride_f32(sdst)) {
            if (sdst == 1) {
                if (ssrc == 1) {
                    simd_FLOAT_rint_CONTIG_CONTIG(src, 1, dst, 1, len);
                }
                else {
                    simd_FLOAT_rint_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
                }
            }
            else if (ssrc == 1) {
                simd_FLOAT_rint_CONTIG_NCONTIG(src, 1, dst, sdst, len);
            }
            else {
                simd_FLOAT_rint_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
            }
            return;
        }
    }
#endif // NPY_SIMD_F32

    while (len > 0) {
    #if NPY_SIMD_F32
        // run the vector kernel on a single element so that precision and
        // fp/domain errors are the same as on the vectorized path
        simd_FLOAT_rint_CONTIG_CONTIG(src, 0, dst, 0, 1);
    #else
        const npyv_lanetype_f32 value = *(npyv_lanetype_f32*)src;
        *(npyv_lanetype_f32*)dst = c_rint_f32(value);
    #endif
        --len;
        src += src_step;
        dst += dst_step;
    }
}

#line 206
/*
 * Inner loop of the float32 `floor` ufunc.
 *
 * args[0] / args[1] are the input / output buffers, steps[0] / steps[1] their
 * strides in bytes and dimensions[0] the number of elements. When SIMD for
 * f32 is available and the operands qualify, the work is handed to one of the
 * four simd_FLOAT_floor_* kernels selected by which side has a unit stride;
 * otherwise the elements are processed one at a time. No floating-point
 * status clearing is performed for this operation.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_floor)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    const char *src = args[0];
    char *dst = args[1];
    const npy_intp src_step = steps[0];
    const npy_intp dst_step = steps[1];
    npy_intp len = dimensions[0];

#if NPY_SIMD_F32
    const int lsize = sizeof(npyv_lanetype_f32);
    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0));
    // overlapping operands always take the element-by-element loop below
    if (!is_mem_overlap(src, src_step, dst, dst_step, len)) {
        // byte strides -> element strides
        const npy_intp ssrc = src_step / lsize;
        const npy_intp sdst = dst_step / lsize;
        if (npyv_loadable_stride_f32(ssrc) && npyv_storable_stride_f32(sdst)) {
            if (sdst == 1) {
                if (ssrc == 1) {
                    simd_FLOAT_floor_CONTIG_CONTIG(src, 1, dst, 1, len);
                }
                else {
                    simd_FLOAT_floor_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
                }
            }
            else if (ssrc == 1) {
                simd_FLOAT_floor_CONTIG_NCONTIG(src, 1, dst, sdst, len);
            }
            else {
                simd_FLOAT_floor_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
            }
            return;
        }
    }
#endif // NPY_SIMD_F32

    while (len > 0) {
    #if NPY_SIMD_F32
        // run the vector kernel on a single element so that precision and
        // fp/domain errors are the same as on the vectorized path
        simd_FLOAT_floor_CONTIG_CONTIG(src, 0, dst, 0, 1);
    #else
        const npyv_lanetype_f32 value = *(npyv_lanetype_f32*)src;
        *(npyv_lanetype_f32*)dst = c_floor_f32(value);
    #endif
        --len;
        src += src_step;
        dst += dst_step;
    }
}

#line 206
/*
 * Inner loop of the float32 `ceil` ufunc.
 *
 * args[0] / args[1] are the input / output buffers, steps[0] / steps[1] their
 * strides in bytes and dimensions[0] the number of elements. When SIMD for
 * f32 is available and the operands qualify, the work is handed to one of the
 * four simd_FLOAT_ceil_* kernels selected by which side has a unit stride;
 * otherwise the elements are processed one at a time. No floating-point
 * status clearing is performed for this operation.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_ceil)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    const char *src = args[0];
    char *dst = args[1];
    const npy_intp src_step = steps[0];
    const npy_intp dst_step = steps[1];
    npy_intp len = dimensions[0];

#if NPY_SIMD_F32
    const int lsize = sizeof(npyv_lanetype_f32);
    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0));
    // overlapping operands always take the element-by-element loop below
    if (!is_mem_overlap(src, src_step, dst, dst_step, len)) {
        // byte strides -> element strides
        const npy_intp ssrc = src_step / lsize;
        const npy_intp sdst = dst_step / lsize;
        if (npyv_loadable_stride_f32(ssrc) && npyv_storable_stride_f32(sdst)) {
            if (sdst == 1) {
                if (ssrc == 1) {
                    simd_FLOAT_ceil_CONTIG_CONTIG(src, 1, dst, 1, len);
                }
                else {
                    simd_FLOAT_ceil_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
                }
            }
            else if (ssrc == 1) {
                simd_FLOAT_ceil_CONTIG_NCONTIG(src, 1, dst, sdst, len);
            }
            else {
                simd_FLOAT_ceil_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
            }
            return;
        }
    }
#endif // NPY_SIMD_F32

    while (len > 0) {
    #if NPY_SIMD_F32
        // run the vector kernel on a single element so that precision and
        // fp/domain errors are the same as on the vectorized path
        simd_FLOAT_ceil_CONTIG_CONTIG(src, 0, dst, 0, 1);
    #else
        const npyv_lanetype_f32 value = *(npyv_lanetype_f32*)src;
        *(npyv_lanetype_f32*)dst = c_ceil_f32(value);
    #endif
        --len;
        src += src_step;
        dst += dst_step;
    }
}

#line 206
/*
 * Inner loop of the float32 `trunc` ufunc.
 *
 * args[0] / args[1] are the input / output buffers, steps[0] / steps[1] their
 * strides in bytes and dimensions[0] the number of elements. When SIMD for
 * f32 is available and the operands qualify, the work is handed to one of the
 * four simd_FLOAT_trunc_* kernels selected by which side has a unit stride;
 * otherwise the elements are processed one at a time. No floating-point
 * status clearing is performed for this operation.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_trunc)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    const char *src = args[0];
    char *dst = args[1];
    const npy_intp src_step = steps[0];
    const npy_intp dst_step = steps[1];
    npy_intp len = dimensions[0];

#if NPY_SIMD_F32
    const int lsize = sizeof(npyv_lanetype_f32);
    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0));
    // overlapping operands always take the element-by-element loop below
    if (!is_mem_overlap(src, src_step, dst, dst_step, len)) {
        // byte strides -> element strides
        const npy_intp ssrc = src_step / lsize;
        const npy_intp sdst = dst_step / lsize;
        if (npyv_loadable_stride_f32(ssrc) && npyv_storable_stride_f32(sdst)) {
            if (sdst == 1) {
                if (ssrc == 1) {
                    simd_FLOAT_trunc_CONTIG_CONTIG(src, 1, dst, 1, len);
                }
                else {
                    simd_FLOAT_trunc_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
                }
            }
            else if (ssrc == 1) {
                simd_FLOAT_trunc_CONTIG_NCONTIG(src, 1, dst, sdst, len);
            }
            else {
                simd_FLOAT_trunc_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
            }
            return;
        }
    }
#endif // NPY_SIMD_F32

    while (len > 0) {
    #if NPY_SIMD_F32
        // run the vector kernel on a single element so that precision and
        // fp/domain errors are the same as on the vectorized path
        simd_FLOAT_trunc_CONTIG_CONTIG(src, 0, dst, 0, 1);
    #else
        const npyv_lanetype_f32 value = *(npyv_lanetype_f32*)src;
        *(npyv_lanetype_f32*)dst = c_trunc_f32(value);
    #endif
        --len;
        src += src_step;
        dst += dst_step;
    }
}

#line 206
/*
 * Inner loop of the float32 `sqrt` ufunc.
 *
 * args[0] / args[1] are the input / output buffers, steps[0] / steps[1] their
 * strides in bytes and dimensions[0] the number of elements. When SIMD for
 * f32 is available and the operands qualify, the work is handed to one of the
 * four simd_FLOAT_sqrt_* kernels selected by which side has a unit stride;
 * otherwise the elements are processed one at a time. No floating-point
 * status clearing is performed for this operation.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_sqrt)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    const char *src = args[0];
    char *dst = args[1];
    const npy_intp src_step = steps[0];
    const npy_intp dst_step = steps[1];
    npy_intp len = dimensions[0];

#if NPY_SIMD_F32
    const int lsize = sizeof(npyv_lanetype_f32);
    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0));
    // overlapping operands always take the element-by-element loop below
    if (!is_mem_overlap(src, src_step, dst, dst_step, len)) {
        // byte strides -> element strides
        const npy_intp ssrc = src_step / lsize;
        const npy_intp sdst = dst_step / lsize;
        if (npyv_loadable_stride_f32(ssrc) && npyv_storable_stride_f32(sdst)) {
            if (sdst == 1) {
                if (ssrc == 1) {
                    simd_FLOAT_sqrt_CONTIG_CONTIG(src, 1, dst, 1, len);
                }
                else {
                    simd_FLOAT_sqrt_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
                }
            }
            else if (ssrc == 1) {
                simd_FLOAT_sqrt_CONTIG_NCONTIG(src, 1, dst, sdst, len);
            }
            else {
                simd_FLOAT_sqrt_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
            }
            return;
        }
    }
#endif // NPY_SIMD_F32

    while (len > 0) {
    #if NPY_SIMD_F32
        // run the vector kernel on a single element so that precision and
        // fp/domain errors are the same as on the vectorized path
        simd_FLOAT_sqrt_CONTIG_CONTIG(src, 0, dst, 0, 1);
    #else
        const npyv_lanetype_f32 value = *(npyv_lanetype_f32*)src;
        *(npyv_lanetype_f32*)dst = c_sqrt_f32(value);
    #endif
        --len;
        src += src_step;
        dst += dst_step;
    }
}

#line 206
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_absolute)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*
     * FLOAT absolute unary loop: handles dimensions[0] elements, reading from
     * args[0] with byte stride steps[0] and writing to args[1] with byte
     * stride steps[1]. npy_clear_floatstatus_barrier() is called once after
     * the elements have been processed, whichever path was taken.
     */
    const char *ip = args[0];
    char *op = args[1];
    const npy_intp istep = steps[0];
    const npy_intp ostep = steps[1];
    npy_intp count = dimensions[0];
#if NPY_SIMD_F32
    const int lane_bytes = sizeof(npyv_lanetype_f32);
    assert(count <= 1 || (istep % lane_bytes == 0 && ostep % lane_bytes == 0));
    /* strides expressed in lanes rather than bytes */
    const npy_intp ilanes = istep / lane_bytes;
    const npy_intp olanes = ostep / lane_bytes;
    /*
     * The bulk kernels run only when is_mem_overlap() reports no overlap and
     * both strides pass the load/store stride checks; every other case is
     * handled one element at a time below.
     */
    if (!is_mem_overlap(ip, istep, op, ostep, count) &&
        npyv_loadable_stride_f32(ilanes) &&
        npyv_storable_stride_f32(olanes)) {
        const int in_contig = (ilanes == 1);
        const int out_contig = (olanes == 1);
        if (in_contig && out_contig) {
            simd_FLOAT_absolute_CONTIG_CONTIG(ip, 1, op, 1, count);
        }
        else if (out_contig) {
            simd_FLOAT_absolute_NCONTIG_CONTIG(ip, ilanes, op, 1, count);
        }
        else if (in_contig) {
            simd_FLOAT_absolute_CONTIG_NCONTIG(ip, 1, op, olanes, count);
        }
        else {
            simd_FLOAT_absolute_NCONTIG_NCONTIG(ip, ilanes, op, olanes, count);
        }
    }
    else
#endif // NPY_SIMD_F32
    {
        while (count > 0) {
        #if NPY_SIMD_F32
            /* length-1 call into the vector kernel so that scalar and vector
               results share the same precision and fp/domain errors */
            simd_FLOAT_absolute_CONTIG_CONTIG(ip, 0, op, 0, 1);
        #else
            const npyv_lanetype_f32 value = *(const npyv_lanetype_f32*)ip;
            *(npyv_lanetype_f32*)op = c_abs_f32(value);
        #endif
            ip += istep;
            op += ostep;
            --count;
        }
    }
    /* reached from both the bulk and the per-element path */
    npy_clear_floatstatus_barrier((char*)dimensions);
}

#line 206
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_square)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*
     * FLOAT square unary loop: handles dimensions[0] elements, reading from
     * args[0] with byte stride steps[0] and writing to args[1] with byte
     * stride steps[1].
     */
    const char *ip = args[0];
    char *op = args[1];
    const npy_intp istep = steps[0];
    const npy_intp ostep = steps[1];
    npy_intp count = dimensions[0];
#if NPY_SIMD_F32
    const int lane_bytes = sizeof(npyv_lanetype_f32);
    assert(count <= 1 || (istep % lane_bytes == 0 && ostep % lane_bytes == 0));
    /* strides expressed in lanes rather than bytes */
    const npy_intp ilanes = istep / lane_bytes;
    const npy_intp olanes = ostep / lane_bytes;
    /*
     * The bulk kernels run only when is_mem_overlap() reports no overlap and
     * both strides pass the load/store stride checks; every other case is
     * handled one element at a time below.
     */
    if (!is_mem_overlap(ip, istep, op, ostep, count) &&
        npyv_loadable_stride_f32(ilanes) &&
        npyv_storable_stride_f32(olanes)) {
        const int in_contig = (ilanes == 1);
        const int out_contig = (olanes == 1);
        if (in_contig && out_contig) {
            simd_FLOAT_square_CONTIG_CONTIG(ip, 1, op, 1, count);
        }
        else if (out_contig) {
            simd_FLOAT_square_NCONTIG_CONTIG(ip, ilanes, op, 1, count);
        }
        else if (in_contig) {
            simd_FLOAT_square_CONTIG_NCONTIG(ip, 1, op, olanes, count);
        }
        else {
            simd_FLOAT_square_NCONTIG_NCONTIG(ip, ilanes, op, olanes, count);
        }
    }
    else
#endif // NPY_SIMD_F32
    {
        while (count > 0) {
        #if NPY_SIMD_F32
            /* length-1 call into the vector kernel so that scalar and vector
               results share the same precision and fp/domain errors */
            simd_FLOAT_square_CONTIG_CONTIG(ip, 0, op, 0, 1);
        #else
            const npyv_lanetype_f32 value = *(const npyv_lanetype_f32*)ip;
            *(npyv_lanetype_f32*)op = c_square_f32(value);
        #endif
            ip += istep;
            op += ostep;
            --count;
        }
    }
}

#line 206
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_reciprocal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*
     * FLOAT reciprocal unary loop: handles dimensions[0] elements, reading
     * from args[0] with byte stride steps[0] and writing to args[1] with byte
     * stride steps[1].
     */
    const char *ip = args[0];
    char *op = args[1];
    const npy_intp istep = steps[0];
    const npy_intp ostep = steps[1];
    npy_intp count = dimensions[0];
#if NPY_SIMD_F32
    const int lane_bytes = sizeof(npyv_lanetype_f32);
    assert(count <= 1 || (istep % lane_bytes == 0 && ostep % lane_bytes == 0));
    /* strides expressed in lanes rather than bytes */
    const npy_intp ilanes = istep / lane_bytes;
    const npy_intp olanes = ostep / lane_bytes;
    /*
     * The bulk kernels run only when is_mem_overlap() reports no overlap and
     * both strides pass the load/store stride checks; every other case is
     * handled one element at a time below.
     */
    if (!is_mem_overlap(ip, istep, op, ostep, count) &&
        npyv_loadable_stride_f32(ilanes) &&
        npyv_storable_stride_f32(olanes)) {
        const int in_contig = (ilanes == 1);
        const int out_contig = (olanes == 1);
        if (in_contig && out_contig) {
            simd_FLOAT_reciprocal_CONTIG_CONTIG(ip, 1, op, 1, count);
        }
        else if (out_contig) {
            simd_FLOAT_reciprocal_NCONTIG_CONTIG(ip, ilanes, op, 1, count);
        }
        else if (in_contig) {
            simd_FLOAT_reciprocal_CONTIG_NCONTIG(ip, 1, op, olanes, count);
        }
        else {
            simd_FLOAT_reciprocal_NCONTIG_NCONTIG(ip, ilanes, op, olanes, count);
        }
    }
    else
#endif // NPY_SIMD_F32
    {
        while (count > 0) {
        #if NPY_SIMD_F32
            /* length-1 call into the vector kernel so that scalar and vector
               results share the same precision and fp/domain errors */
            simd_FLOAT_reciprocal_CONTIG_CONTIG(ip, 0, op, 0, 1);
        #else
            const npyv_lanetype_f32 value = *(const npyv_lanetype_f32*)ip;
            *(npyv_lanetype_f32*)op = c_recip_f32(value);
        #endif
            ip += istep;
            op += ostep;
            --count;
        }
    }
}


#line 201
#line 206
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_rint)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*
     * DOUBLE rint unary loop: handles dimensions[0] elements, reading from
     * args[0] with byte stride steps[0] and writing to args[1] with byte
     * stride steps[1].
     */
    const char *ip = args[0];
    char *op = args[1];
    const npy_intp istep = steps[0];
    const npy_intp ostep = steps[1];
    npy_intp count = dimensions[0];
#if NPY_SIMD_F64
    const int lane_bytes = sizeof(npyv_lanetype_f64);
    assert(count <= 1 || (istep % lane_bytes == 0 && ostep % lane_bytes == 0));
    /* strides expressed in lanes rather than bytes */
    const npy_intp ilanes = istep / lane_bytes;
    const npy_intp olanes = ostep / lane_bytes;
    /*
     * The bulk kernels run only when is_mem_overlap() reports no overlap and
     * both strides pass the load/store stride checks; every other case is
     * handled one element at a time below.
     */
    if (!is_mem_overlap(ip, istep, op, ostep, count) &&
        npyv_loadable_stride_f64(ilanes) &&
        npyv_storable_stride_f64(olanes)) {
        const int in_contig = (ilanes == 1);
        const int out_contig = (olanes == 1);
        if (in_contig && out_contig) {
            simd_DOUBLE_rint_CONTIG_CONTIG(ip, 1, op, 1, count);
        }
        else if (out_contig) {
            simd_DOUBLE_rint_NCONTIG_CONTIG(ip, ilanes, op, 1, count);
        }
        else if (in_contig) {
            simd_DOUBLE_rint_CONTIG_NCONTIG(ip, 1, op, olanes, count);
        }
        else {
            simd_DOUBLE_rint_NCONTIG_NCONTIG(ip, ilanes, op, olanes, count);
        }
    }
    else
#endif // NPY_SIMD_F64
    {
        while (count > 0) {
        #if NPY_SIMD_F64
            /* length-1 call into the vector kernel so that scalar and vector
               results share the same precision and fp/domain errors */
            simd_DOUBLE_rint_CONTIG_CONTIG(ip, 0, op, 0, 1);
        #else
            const npyv_lanetype_f64 value = *(const npyv_lanetype_f64*)ip;
            *(npyv_lanetype_f64*)op = c_rint_f64(value);
        #endif
            ip += istep;
            op += ostep;
            --count;
        }
    }
}

#line 206
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_floor)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*
     * DOUBLE floor unary loop: handles dimensions[0] elements, reading from
     * args[0] with byte stride steps[0] and writing to args[1] with byte
     * stride steps[1].
     */
    const char *ip = args[0];
    char *op = args[1];
    const npy_intp istep = steps[0];
    const npy_intp ostep = steps[1];
    npy_intp count = dimensions[0];
#if NPY_SIMD_F64
    const int lane_bytes = sizeof(npyv_lanetype_f64);
    assert(count <= 1 || (istep % lane_bytes == 0 && ostep % lane_bytes == 0));
    /* strides expressed in lanes rather than bytes */
    const npy_intp ilanes = istep / lane_bytes;
    const npy_intp olanes = ostep / lane_bytes;
    /*
     * The bulk kernels run only when is_mem_overlap() reports no overlap and
     * both strides pass the load/store stride checks; every other case is
     * handled one element at a time below.
     */
    if (!is_mem_overlap(ip, istep, op, ostep, count) &&
        npyv_loadable_stride_f64(ilanes) &&
        npyv_storable_stride_f64(olanes)) {
        const int in_contig = (ilanes == 1);
        const int out_contig = (olanes == 1);
        if (in_contig && out_contig) {
            simd_DOUBLE_floor_CONTIG_CONTIG(ip, 1, op, 1, count);
        }
        else if (out_contig) {
            simd_DOUBLE_floor_NCONTIG_CONTIG(ip, ilanes, op, 1, count);
        }
        else if (in_contig) {
            simd_DOUBLE_floor_CONTIG_NCONTIG(ip, 1, op, olanes, count);
        }
        else {
            simd_DOUBLE_floor_NCONTIG_NCONTIG(ip, ilanes, op, olanes, count);
        }
    }
    else
#endif // NPY_SIMD_F64
    {
        while (count > 0) {
        #if NPY_SIMD_F64
            /* length-1 call into the vector kernel so that scalar and vector
               results share the same precision and fp/domain errors */
            simd_DOUBLE_floor_CONTIG_CONTIG(ip, 0, op, 0, 1);
        #else
            const npyv_lanetype_f64 value = *(const npyv_lanetype_f64*)ip;
            *(npyv_lanetype_f64*)op = c_floor_f64(value);
        #endif
            ip += istep;
            op += ostep;
            --count;
        }
    }
}

#line 206
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_ceil)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*
     * DOUBLE ceil unary loop: handles dimensions[0] elements, reading from
     * args[0] with byte stride steps[0] and writing to args[1] with byte
     * stride steps[1].
     */
    const char *ip = args[0];
    char *op = args[1];
    const npy_intp istep = steps[0];
    const npy_intp ostep = steps[1];
    npy_intp count = dimensions[0];
#if NPY_SIMD_F64
    const int lane_bytes = sizeof(npyv_lanetype_f64);
    assert(count <= 1 || (istep % lane_bytes == 0 && ostep % lane_bytes == 0));
    /* strides expressed in lanes rather than bytes */
    const npy_intp ilanes = istep / lane_bytes;
    const npy_intp olanes = ostep / lane_bytes;
    /*
     * The bulk kernels run only when is_mem_overlap() reports no overlap and
     * both strides pass the load/store stride checks; every other case is
     * handled one element at a time below.
     */
    if (!is_mem_overlap(ip, istep, op, ostep, count) &&
        npyv_loadable_stride_f64(ilanes) &&
        npyv_storable_stride_f64(olanes)) {
        const int in_contig = (ilanes == 1);
        const int out_contig = (olanes == 1);
        if (in_contig && out_contig) {
            simd_DOUBLE_ceil_CONTIG_CONTIG(ip, 1, op, 1, count);
        }
        else if (out_contig) {
            simd_DOUBLE_ceil_NCONTIG_CONTIG(ip, ilanes, op, 1, count);
        }
        else if (in_contig) {
            simd_DOUBLE_ceil_CONTIG_NCONTIG(ip, 1, op, olanes, count);
        }
        else {
            simd_DOUBLE_ceil_NCONTIG_NCONTIG(ip, ilanes, op, olanes, count);
        }
    }
    else
#endif // NPY_SIMD_F64
    {
        while (count > 0) {
        #if NPY_SIMD_F64
            /* length-1 call into the vector kernel so that scalar and vector
               results share the same precision and fp/domain errors */
            simd_DOUBLE_ceil_CONTIG_CONTIG(ip, 0, op, 0, 1);
        #else
            const npyv_lanetype_f64 value = *(const npyv_lanetype_f64*)ip;
            *(npyv_lanetype_f64*)op = c_ceil_f64(value);
        #endif
            ip += istep;
            op += ostep;
            --count;
        }
    }
}

#line 206
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_trunc)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*
     * DOUBLE trunc unary loop: handles dimensions[0] elements, reading from
     * args[0] with byte stride steps[0] and writing to args[1] with byte
     * stride steps[1].
     */
    const char *ip = args[0];
    char *op = args[1];
    const npy_intp istep = steps[0];
    const npy_intp ostep = steps[1];
    npy_intp count = dimensions[0];
#if NPY_SIMD_F64
    const int lane_bytes = sizeof(npyv_lanetype_f64);
    assert(count <= 1 || (istep % lane_bytes == 0 && ostep % lane_bytes == 0));
    /* strides expressed in lanes rather than bytes */
    const npy_intp ilanes = istep / lane_bytes;
    const npy_intp olanes = ostep / lane_bytes;
    /*
     * The bulk kernels run only when is_mem_overlap() reports no overlap and
     * both strides pass the load/store stride checks; every other case is
     * handled one element at a time below.
     */
    if (!is_mem_overlap(ip, istep, op, ostep, count) &&
        npyv_loadable_stride_f64(ilanes) &&
        npyv_storable_stride_f64(olanes)) {
        const int in_contig = (ilanes == 1);
        const int out_contig = (olanes == 1);
        if (in_contig && out_contig) {
            simd_DOUBLE_trunc_CONTIG_CONTIG(ip, 1, op, 1, count);
        }
        else if (out_contig) {
            simd_DOUBLE_trunc_NCONTIG_CONTIG(ip, ilanes, op, 1, count);
        }
        else if (in_contig) {
            simd_DOUBLE_trunc_CONTIG_NCONTIG(ip, 1, op, olanes, count);
        }
        else {
            simd_DOUBLE_trunc_NCONTIG_NCONTIG(ip, ilanes, op, olanes, count);
        }
    }
    else
#endif // NPY_SIMD_F64
    {
        while (count > 0) {
        #if NPY_SIMD_F64
            /* length-1 call into the vector kernel so that scalar and vector
               results share the same precision and fp/domain errors */
            simd_DOUBLE_trunc_CONTIG_CONTIG(ip, 0, op, 0, 1);
        #else
            const npyv_lanetype_f64 value = *(const npyv_lanetype_f64*)ip;
            *(npyv_lanetype_f64*)op = c_trunc_f64(value);
        #endif
            ip += istep;
            op += ostep;
            --count;
        }
    }
}

#line 206
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_sqrt)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*
     * DOUBLE sqrt unary loop: handles dimensions[0] elements, reading from
     * args[0] with byte stride steps[0] and writing to args[1] with byte
     * stride steps[1].
     */
    const char *ip = args[0];
    char *op = args[1];
    const npy_intp istep = steps[0];
    const npy_intp ostep = steps[1];
    npy_intp count = dimensions[0];
#if NPY_SIMD_F64
    const int lane_bytes = sizeof(npyv_lanetype_f64);
    assert(count <= 1 || (istep % lane_bytes == 0 && ostep % lane_bytes == 0));
    /* strides expressed in lanes rather than bytes */
    const npy_intp ilanes = istep / lane_bytes;
    const npy_intp olanes = ostep / lane_bytes;
    /*
     * The bulk kernels run only when is_mem_overlap() reports no overlap and
     * both strides pass the load/store stride checks; every other case is
     * handled one element at a time below.
     */
    if (!is_mem_overlap(ip, istep, op, ostep, count) &&
        npyv_loadable_stride_f64(ilanes) &&
        npyv_storable_stride_f64(olanes)) {
        const int in_contig = (ilanes == 1);
        const int out_contig = (olanes == 1);
        if (in_contig && out_contig) {
            simd_DOUBLE_sqrt_CONTIG_CONTIG(ip, 1, op, 1, count);
        }
        else if (out_contig) {
            simd_DOUBLE_sqrt_NCONTIG_CONTIG(ip, ilanes, op, 1, count);
        }
        else if (in_contig) {
            simd_DOUBLE_sqrt_CONTIG_NCONTIG(ip, 1, op, olanes, count);
        }
        else {
            simd_DOUBLE_sqrt_NCONTIG_NCONTIG(ip, ilanes, op, olanes, count);
        }
    }
    else
#endif // NPY_SIMD_F64
    {
        while (count > 0) {
        #if NPY_SIMD_F64
            /* length-1 call into the vector kernel so that scalar and vector
               results share the same precision and fp/domain errors */
            simd_DOUBLE_sqrt_CONTIG_CONTIG(ip, 0, op, 0, 1);
        #else
            const npyv_lanetype_f64 value = *(const npyv_lanetype_f64*)ip;
            *(npyv_lanetype_f64*)op = c_sqrt_f64(value);
        #endif
            ip += istep;
            op += ostep;
            --count;
        }
    }
}

#line 206
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_absolute)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*
     * DOUBLE absolute unary loop: handles dimensions[0] elements, reading
     * from args[0] with byte stride steps[0] and writing to args[1] with byte
     * stride steps[1]. npy_clear_floatstatus_barrier() is called once after
     * the elements have been processed, whichever path was taken.
     */
    const char *ip = args[0];
    char *op = args[1];
    const npy_intp istep = steps[0];
    const npy_intp ostep = steps[1];
    npy_intp count = dimensions[0];
#if NPY_SIMD_F64
    const int lane_bytes = sizeof(npyv_lanetype_f64);
    assert(count <= 1 || (istep % lane_bytes == 0 && ostep % lane_bytes == 0));
    /* strides expressed in lanes rather than bytes */
    const npy_intp ilanes = istep / lane_bytes;
    const npy_intp olanes = ostep / lane_bytes;
    /*
     * The bulk kernels run only when is_mem_overlap() reports no overlap and
     * both strides pass the load/store stride checks; every other case is
     * handled one element at a time below.
     */
    if (!is_mem_overlap(ip, istep, op, ostep, count) &&
        npyv_loadable_stride_f64(ilanes) &&
        npyv_storable_stride_f64(olanes)) {
        const int in_contig = (ilanes == 1);
        const int out_contig = (olanes == 1);
        if (in_contig && out_contig) {
            simd_DOUBLE_absolute_CONTIG_CONTIG(ip, 1, op, 1, count);
        }
        else if (out_contig) {
            simd_DOUBLE_absolute_NCONTIG_CONTIG(ip, ilanes, op, 1, count);
        }
        else if (in_contig) {
            simd_DOUBLE_absolute_CONTIG_NCONTIG(ip, 1, op, olanes, count);
        }
        else {
            simd_DOUBLE_absolute_NCONTIG_NCONTIG(ip, ilanes, op, olanes, count);
        }
    }
    else
#endif // NPY_SIMD_F64
    {
        while (count > 0) {
        #if NPY_SIMD_F64
            /* length-1 call into the vector kernel so that scalar and vector
               results share the same precision and fp/domain errors */
            simd_DOUBLE_absolute_CONTIG_CONTIG(ip, 0, op, 0, 1);
        #else
            const npyv_lanetype_f64 value = *(const npyv_lanetype_f64*)ip;
            *(npyv_lanetype_f64*)op = c_abs_f64(value);
        #endif
            ip += istep;
            op += ostep;
            --count;
        }
    }
    /* reached from both the bulk and the per-element path */
    npy_clear_floatstatus_barrier((char*)dimensions);
}

#line 206
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_square)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*
     * DOUBLE square unary loop: handles dimensions[0] elements, reading from
     * args[0] with byte stride steps[0] and writing to args[1] with byte
     * stride steps[1].
     */
    const char *ip = args[0];
    char *op = args[1];
    const npy_intp istep = steps[0];
    const npy_intp ostep = steps[1];
    npy_intp count = dimensions[0];
#if NPY_SIMD_F64
    const int lane_bytes = sizeof(npyv_lanetype_f64);
    assert(count <= 1 || (istep % lane_bytes == 0 && ostep % lane_bytes == 0));
    /* strides expressed in lanes rather than bytes */
    const npy_intp ilanes = istep / lane_bytes;
    const npy_intp olanes = ostep / lane_bytes;
    /*
     * The bulk kernels run only when is_mem_overlap() reports no overlap and
     * both strides pass the load/store stride checks; every other case is
     * handled one element at a time below.
     */
    if (!is_mem_overlap(ip, istep, op, ostep, count) &&
        npyv_loadable_stride_f64(ilanes) &&
        npyv_storable_stride_f64(olanes)) {
        const int in_contig = (ilanes == 1);
        const int out_contig = (olanes == 1);
        if (in_contig && out_contig) {
            simd_DOUBLE_square_CONTIG_CONTIG(ip, 1, op, 1, count);
        }
        else if (out_contig) {
            simd_DOUBLE_square_NCONTIG_CONTIG(ip, ilanes, op, 1, count);
        }
        else if (in_contig) {
            simd_DOUBLE_square_CONTIG_NCONTIG(ip, 1, op, olanes, count);
        }
        else {
            simd_DOUBLE_square_NCONTIG_NCONTIG(ip, ilanes, op, olanes, count);
        }
    }
    else
#endif // NPY_SIMD_F64
    {
        while (count > 0) {
        #if NPY_SIMD_F64
            /* length-1 call into the vector kernel so that scalar and vector
               results share the same precision and fp/domain errors */
            simd_DOUBLE_square_CONTIG_CONTIG(ip, 0, op, 0, 1);
        #else
            const npyv_lanetype_f64 value = *(const npyv_lanetype_f64*)ip;
            *(npyv_lanetype_f64*)op = c_square_f64(value);
        #endif
            ip += istep;
            op += ostep;
            --count;
        }
    }
}

#line 206
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_reciprocal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /*
     * DOUBLE reciprocal unary loop: handles dimensions[0] elements, reading
     * from args[0] with byte stride steps[0] and writing to args[1] with byte
     * stride steps[1].
     */
    const char *ip = args[0];
    char *op = args[1];
    const npy_intp istep = steps[0];
    const npy_intp ostep = steps[1];
    npy_intp count = dimensions[0];
#if NPY_SIMD_F64
    const int lane_bytes = sizeof(npyv_lanetype_f64);
    assert(count <= 1 || (istep % lane_bytes == 0 && ostep % lane_bytes == 0));
    /* strides expressed in lanes rather than bytes */
    const npy_intp ilanes = istep / lane_bytes;
    const npy_intp olanes = ostep / lane_bytes;
    /*
     * The bulk kernels run only when is_mem_overlap() reports no overlap and
     * both strides pass the load/store stride checks; every other case is
     * handled one element at a time below.
     */
    if (!is_mem_overlap(ip, istep, op, ostep, count) &&
        npyv_loadable_stride_f64(ilanes) &&
        npyv_storable_stride_f64(olanes)) {
        const int in_contig = (ilanes == 1);
        const int out_contig = (olanes == 1);
        if (in_contig && out_contig) {
            simd_DOUBLE_reciprocal_CONTIG_CONTIG(ip, 1, op, 1, count);
        }
        else if (out_contig) {
            simd_DOUBLE_reciprocal_NCONTIG_CONTIG(ip, ilanes, op, 1, count);
        }
        else if (in_contig) {
            simd_DOUBLE_reciprocal_CONTIG_NCONTIG(ip, 1, op, olanes, count);
        }
        else {
            simd_DOUBLE_reciprocal_NCONTIG_NCONTIG(ip, ilanes, op, olanes, count);
        }
    }
    else
#endif // NPY_SIMD_F64
    {
        while (count > 0) {
        #if NPY_SIMD_F64
            /* length-1 call into the vector kernel so that scalar and vector
               results share the same precision and fp/domain errors */
            simd_DOUBLE_reciprocal_CONTIG_CONTIG(ip, 0, op, 0, 1);
        #else
            const npyv_lanetype_f64 value = *(const npyv_lanetype_f64*)ip;
            *(npyv_lanetype_f64*)op = c_recip_f64(value);
        #endif
            ip += istep;
            op += ostep;
            --count;
        }
    }
}



