#line 1 "numpy/core/src/umath/loops_unary.dispatch.c.src"

/*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 **       Changes should be made to the original source (.src) file         **
 *****************************************************************************
 */

#line 1
/*@targets
 ** $maxopt baseline
 ** neon asimd
 ** sse2 avx2 avx512_skx
 ** vsx2
 ** vx vxe
 **/

#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "numpy/npy_math.h"
#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

/*******************************************************************************
 ** Scalar ops
 ******************************************************************************/
/*
 * Arithmetic negation of a scalar expression.
 * The argument is wrapped in parentheses so that a compound expression
 * (e.g. `a - b`) is negated as a whole instead of only its first operand.
 */
#define scalar_negative(X) (-(X))

/*******************************************************************************
 ** extra SIMD intrinsics
 ******************************************************************************/

#if NPY_SIMD

#line 36
/*
 * Lane-wise negation of a vector of signed 8-bit lanes.
 * With NPY_HAVE_NEON the vnegq_s8 intrinsic is used; otherwise the result is
 * built from xor/sub as (v ^ -1) - (-1).
 */
static NPY_INLINE npyv_s8
npyv_negative_s8(npyv_s8 v)
{
#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || 8 < 64)
    const npyv_s8 as_signed = npyv_reinterpret_s8_s8(v);
    return npyv_reinterpret_s8_s8(vnegq_s8(as_signed));
#else
    // two's complement: flip every bit, then subtract -1 (i.e. add 1)
    const npyv_s8 all_ones = npyv_setall_s8((npyv_lanetype_s8)-1);
    const npyv_s8 flipped = npyv_xor_s8(v, all_ones);
    return npyv_sub_s8(flipped, all_ones);
#endif
}

#line 36
/*
 * Lane-wise negation of a vector of unsigned 8-bit lanes.
 * With NPY_HAVE_NEON the lanes are reinterpreted as signed, negated with
 * vnegq_s8 and reinterpreted back; otherwise the result is built from
 * xor/sub as (v ^ -1) - (-1).
 */
static NPY_INLINE npyv_u8
npyv_negative_u8(npyv_u8 v)
{
#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || 8 < 64)
    const npyv_s8 as_signed = npyv_reinterpret_s8_u8(v);
    return npyv_reinterpret_u8_s8(vnegq_s8(as_signed));
#else
    // two's complement: flip every bit, then subtract -1 (i.e. add 1)
    const npyv_u8 all_ones = npyv_setall_u8((npyv_lanetype_u8)-1);
    const npyv_u8 flipped = npyv_xor_u8(v, all_ones);
    return npyv_sub_u8(flipped, all_ones);
#endif
}

#line 36
/*
 * Lane-wise negation of a vector of signed 16-bit lanes.
 * With NPY_HAVE_NEON the vnegq_s16 intrinsic is used; otherwise the result
 * is built from xor/sub as (v ^ -1) - (-1).
 */
static NPY_INLINE npyv_s16
npyv_negative_s16(npyv_s16 v)
{
#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || 16 < 64)
    const npyv_s16 as_signed = npyv_reinterpret_s16_s16(v);
    return npyv_reinterpret_s16_s16(vnegq_s16(as_signed));
#else
    // two's complement: flip every bit, then subtract -1 (i.e. add 1)
    const npyv_s16 all_ones = npyv_setall_s16((npyv_lanetype_s16)-1);
    const npyv_s16 flipped = npyv_xor_s16(v, all_ones);
    return npyv_sub_s16(flipped, all_ones);
#endif
}

#line 36
/*
 * Lane-wise negation of a vector of unsigned 16-bit lanes.
 * With NPY_HAVE_NEON the lanes are reinterpreted as signed, negated with
 * vnegq_s16 and reinterpreted back; otherwise the result is built from
 * xor/sub as (v ^ -1) - (-1).
 */
static NPY_INLINE npyv_u16
npyv_negative_u16(npyv_u16 v)
{
#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || 16 < 64)
    const npyv_s16 as_signed = npyv_reinterpret_s16_u16(v);
    return npyv_reinterpret_u16_s16(vnegq_s16(as_signed));
#else
    // two's complement: flip every bit, then subtract -1 (i.e. add 1)
    const npyv_u16 all_ones = npyv_setall_u16((npyv_lanetype_u16)-1);
    const npyv_u16 flipped = npyv_xor_u16(v, all_ones);
    return npyv_sub_u16(flipped, all_ones);
#endif
}

#line 36
/*
 * Lane-wise negation of a vector of signed 32-bit lanes.
 * With NPY_HAVE_NEON the vnegq_s32 intrinsic is used; otherwise the result
 * is built from xor/sub as (v ^ -1) - (-1).
 */
static NPY_INLINE npyv_s32
npyv_negative_s32(npyv_s32 v)
{
#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || 32 < 64)
    const npyv_s32 as_signed = npyv_reinterpret_s32_s32(v);
    return npyv_reinterpret_s32_s32(vnegq_s32(as_signed));
#else
    // two's complement: flip every bit, then subtract -1 (i.e. add 1)
    const npyv_s32 all_ones = npyv_setall_s32((npyv_lanetype_s32)-1);
    const npyv_s32 flipped = npyv_xor_s32(v, all_ones);
    return npyv_sub_s32(flipped, all_ones);
#endif
}

#line 36
/*
 * Lane-wise negation of a vector of unsigned 32-bit lanes.
 * With NPY_HAVE_NEON the lanes are reinterpreted as signed, negated with
 * vnegq_s32 and reinterpreted back; otherwise the result is built from
 * xor/sub as (v ^ -1) - (-1).
 */
static NPY_INLINE npyv_u32
npyv_negative_u32(npyv_u32 v)
{
#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || 32 < 64)
    const npyv_s32 as_signed = npyv_reinterpret_s32_u32(v);
    return npyv_reinterpret_u32_s32(vnegq_s32(as_signed));
#else
    // two's complement: flip every bit, then subtract -1 (i.e. add 1)
    const npyv_u32 all_ones = npyv_setall_u32((npyv_lanetype_u32)-1);
    const npyv_u32 flipped = npyv_xor_u32(v, all_ones);
    return npyv_sub_u32(flipped, all_ones);
#endif
}

#line 36
/*
 * Lane-wise negation of a vector of signed 64-bit lanes.
 * The vnegq_s64 intrinsic is used only when both NPY_HAVE_NEON and
 * __aarch64__ are defined; every other build uses xor/sub,
 * i.e. (v ^ -1) - (-1).
 */
static NPY_INLINE npyv_s64
npyv_negative_s64(npyv_s64 v)
{
#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || 64 < 64)
    const npyv_s64 as_signed = npyv_reinterpret_s64_s64(v);
    return npyv_reinterpret_s64_s64(vnegq_s64(as_signed));
#else
    // two's complement: flip every bit, then subtract -1 (i.e. add 1)
    const npyv_s64 all_ones = npyv_setall_s64((npyv_lanetype_s64)-1);
    const npyv_s64 flipped = npyv_xor_s64(v, all_ones);
    return npyv_sub_s64(flipped, all_ones);
#endif
}

#line 36
/*
 * Lane-wise negation of a vector of unsigned 64-bit lanes.
 * When both NPY_HAVE_NEON and __aarch64__ are defined the lanes are
 * reinterpreted as signed, negated with vnegq_s64 and reinterpreted back;
 * every other build uses xor/sub, i.e. (v ^ -1) - (-1).
 */
static NPY_INLINE npyv_u64
npyv_negative_u64(npyv_u64 v)
{
#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || 64 < 64)
    const npyv_s64 as_signed = npyv_reinterpret_s64_u64(v);
    return npyv_reinterpret_u64_s64(vnegq_s64(as_signed));
#else
    // two's complement: flip every bit, then subtract -1 (i.e. add 1)
    const npyv_u64 all_ones = npyv_setall_u64((npyv_lanetype_u64)-1);
    const npyv_u64 flipped = npyv_xor_u64(v, all_ones);
    return npyv_sub_u64(flipped, all_ones);
#endif
}


#line 54
#if NPY_SIMD_F32
/*
 * Lane-wise negation of a vector of single-precision lanes.
 * With NPY_HAVE_NEON the vnegq_f32 intrinsic is used; otherwise every lane
 * is XOR-ed with -0.f (the sign mask).
 */
static NPY_INLINE npyv_f32
npyv_negative_f32(npyv_f32 v)
{
    npyv_f32 negated;
#if defined(NPY_HAVE_NEON)
    negated = vnegq_f32(v);
#else
    // toggle the sign: v ^ signmask
    const npyv_f32 sign_bits = npyv_setall_f32(-0.f);
    negated = npyv_xor_f32(v, sign_bits);
#endif
    return negated;
}
#endif // NPY_SIMD_F32

#line 54
#if NPY_SIMD_F64
/*
 * Lane-wise negation of a vector of double-precision lanes.
 * With NPY_HAVE_NEON the vnegq_f64 intrinsic is used; otherwise every lane
 * is XOR-ed with -0. (the sign mask).
 */
static NPY_INLINE npyv_f64
npyv_negative_f64(npyv_f64 v)
{
    npyv_f64 negated;
#if defined(NPY_HAVE_NEON)
    negated = vnegq_f64(v);
#else
    // toggle the sign: v ^ signmask
    const npyv_f64 sign_bits = npyv_setall_f64(-0.);
    negated = npyv_xor_f64(v, sign_bits);
#endif
    return negated;
}
#endif // NPY_SIMD_F64


#endif // NPY_SIMD

/********************************************************************************
 ** Defining the SIMD kernels
 ********************************************************************************/
#line 80
#line 85
#if NPY_SIMD
// Select UNROLL, the number of vectors handled per iteration of the unrolled
// loops below. The literal 4 is the requested factor (presumably substituted
// by the template generator -- confirm in the .src file): it is rejected if
// below 1, reduced to 2 when NPY_SIMD is not 128, and used as-is otherwise.
#if 4 < 1
#error "Unroll must be at least 1"
#elif NPY_SIMD != 128 && 4 > 2
// Avoid memory bandwidth bottleneck for larger SIMD
#define UNROLL 2
#else
#define UNROLL 4
#endif
// Contiguous input, contiguous output.
// Writes the negation of each of the `len` elements at `ip` to the matching
// position at `op`, in three passes: UNROLL vectors per iteration, then one
// vector per iteration, then element by element with scalar_negative().
static NPY_INLINE void
simd_unary_cc_negative_s8(const npyv_lanetype_s8 *ip,
                             npyv_lanetype_s8 *op,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_s8;
    const int block = lanes * UNROLL;

    // pass 1: UNROLL vectors per iteration
    while (len >= block) {
#line 108
    #if UNROLL > 0
        npyv_s8 neg_0 = npyv_negative_s8(npyv_load_s8(ip + 0 * lanes));
        npyv_store_s8(op + 0 * lanes, neg_0);
    #endif
#line 108
    #if UNROLL > 1
        npyv_s8 neg_1 = npyv_negative_s8(npyv_load_s8(ip + 1 * lanes));
        npyv_store_s8(op + 1 * lanes, neg_1);
    #endif
#line 108
    #if UNROLL > 2
        npyv_s8 neg_2 = npyv_negative_s8(npyv_load_s8(ip + 2 * lanes));
        npyv_store_s8(op + 2 * lanes, neg_2);
    #endif
#line 108
    #if UNROLL > 3
        npyv_s8 neg_3 = npyv_negative_s8(npyv_load_s8(ip + 3 * lanes));
        npyv_store_s8(op + 3 * lanes, neg_3);
    #endif
#line 108
    #if UNROLL > 4
        npyv_s8 neg_4 = npyv_negative_s8(npyv_load_s8(ip + 4 * lanes));
        npyv_store_s8(op + 4 * lanes, neg_4);
    #endif
#line 108
    #if UNROLL > 5
        npyv_s8 neg_5 = npyv_negative_s8(npyv_load_s8(ip + 5 * lanes));
        npyv_store_s8(op + 5 * lanes, neg_5);
    #endif
#line 108
    #if UNROLL > 6
        npyv_s8 neg_6 = npyv_negative_s8(npyv_load_s8(ip + 6 * lanes));
        npyv_store_s8(op + 6 * lanes, neg_6);
    #endif
#line 108
    #if UNROLL > 7
        npyv_s8 neg_7 = npyv_negative_s8(npyv_load_s8(ip + 7 * lanes));
        npyv_store_s8(op + 7 * lanes, neg_7);
    #endif
#line 108
    #if UNROLL > 8
        npyv_s8 neg_8 = npyv_negative_s8(npyv_load_s8(ip + 8 * lanes));
        npyv_store_s8(op + 8 * lanes, neg_8);
    #endif
#line 108
    #if UNROLL > 9
        npyv_s8 neg_9 = npyv_negative_s8(npyv_load_s8(ip + 9 * lanes));
        npyv_store_s8(op + 9 * lanes, neg_9);
    #endif
#line 108
    #if UNROLL > 10
        npyv_s8 neg_10 = npyv_negative_s8(npyv_load_s8(ip + 10 * lanes));
        npyv_store_s8(op + 10 * lanes, neg_10);
    #endif
#line 108
    #if UNROLL > 11
        npyv_s8 neg_11 = npyv_negative_s8(npyv_load_s8(ip + 11 * lanes));
        npyv_store_s8(op + 11 * lanes, neg_11);
    #endif
#line 108
    #if UNROLL > 12
        npyv_s8 neg_12 = npyv_negative_s8(npyv_load_s8(ip + 12 * lanes));
        npyv_store_s8(op + 12 * lanes, neg_12);
    #endif
#line 108
    #if UNROLL > 13
        npyv_s8 neg_13 = npyv_negative_s8(npyv_load_s8(ip + 13 * lanes));
        npyv_store_s8(op + 13 * lanes, neg_13);
    #endif
#line 108
    #if UNROLL > 14
        npyv_s8 neg_14 = npyv_negative_s8(npyv_load_s8(ip + 14 * lanes));
        npyv_store_s8(op + 14 * lanes, neg_14);
    #endif
#line 108
    #if UNROLL > 15
        npyv_s8 neg_15 = npyv_negative_s8(npyv_load_s8(ip + 15 * lanes));
        npyv_store_s8(op + 15 * lanes, neg_15);
    #endif
        ip += block;
        op += block;
        len -= block;
    }
    // pass 2: one vector per iteration
    while (len >= lanes) {
        npyv_s8 neg = npyv_negative_s8(npyv_load_s8(ip));
        npyv_store_s8(op, neg);
        ip += lanes;
        op += lanes;
        len -= lanes;
    }
    // pass 3: leftover elements, one at a time
    while (len > 0) {
        *op = scalar_negative(*ip);
        ++ip;
        ++op;
        --len;
    }
}

#if 0
// Contiguous input, strided output.
// Reads `len` consecutive elements from `ip` and writes their negation to
// `op`, advancing `op` by `ostride` elements per result. Runs in three
// passes: UNROLL vectors per iteration, one vector per iteration, then
// element by element with scalar_negative().
static NPY_INLINE void
simd_unary_cn_negative_s8(const npyv_lanetype_s8 *ip,
                             npyv_lanetype_s8 *op, npy_intp ostride,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_s8;
    const int block = lanes * UNROLL;

    // pass 1: UNROLL vectors per iteration
    while (len >= block) {
#line 142
    #if UNROLL > 0
        npyv_s8 neg_0 = npyv_negative_s8(npyv_load_s8(ip + 0 * lanes));
        npyv_storen_s8(op + 0 * lanes * ostride, ostride, neg_0);
    #endif
#line 142
    #if UNROLL > 1
        npyv_s8 neg_1 = npyv_negative_s8(npyv_load_s8(ip + 1 * lanes));
        npyv_storen_s8(op + 1 * lanes * ostride, ostride, neg_1);
    #endif
#line 142
    #if UNROLL > 2
        npyv_s8 neg_2 = npyv_negative_s8(npyv_load_s8(ip + 2 * lanes));
        npyv_storen_s8(op + 2 * lanes * ostride, ostride, neg_2);
    #endif
#line 142
    #if UNROLL > 3
        npyv_s8 neg_3 = npyv_negative_s8(npyv_load_s8(ip + 3 * lanes));
        npyv_storen_s8(op + 3 * lanes * ostride, ostride, neg_3);
    #endif
#line 142
    #if UNROLL > 4
        npyv_s8 neg_4 = npyv_negative_s8(npyv_load_s8(ip + 4 * lanes));
        npyv_storen_s8(op + 4 * lanes * ostride, ostride, neg_4);
    #endif
#line 142
    #if UNROLL > 5
        npyv_s8 neg_5 = npyv_negative_s8(npyv_load_s8(ip + 5 * lanes));
        npyv_storen_s8(op + 5 * lanes * ostride, ostride, neg_5);
    #endif
#line 142
    #if UNROLL > 6
        npyv_s8 neg_6 = npyv_negative_s8(npyv_load_s8(ip + 6 * lanes));
        npyv_storen_s8(op + 6 * lanes * ostride, ostride, neg_6);
    #endif
#line 142
    #if UNROLL > 7
        npyv_s8 neg_7 = npyv_negative_s8(npyv_load_s8(ip + 7 * lanes));
        npyv_storen_s8(op + 7 * lanes * ostride, ostride, neg_7);
    #endif
#line 142
    #if UNROLL > 8
        npyv_s8 neg_8 = npyv_negative_s8(npyv_load_s8(ip + 8 * lanes));
        npyv_storen_s8(op + 8 * lanes * ostride, ostride, neg_8);
    #endif
#line 142
    #if UNROLL > 9
        npyv_s8 neg_9 = npyv_negative_s8(npyv_load_s8(ip + 9 * lanes));
        npyv_storen_s8(op + 9 * lanes * ostride, ostride, neg_9);
    #endif
#line 142
    #if UNROLL > 10
        npyv_s8 neg_10 = npyv_negative_s8(npyv_load_s8(ip + 10 * lanes));
        npyv_storen_s8(op + 10 * lanes * ostride, ostride, neg_10);
    #endif
#line 142
    #if UNROLL > 11
        npyv_s8 neg_11 = npyv_negative_s8(npyv_load_s8(ip + 11 * lanes));
        npyv_storen_s8(op + 11 * lanes * ostride, ostride, neg_11);
    #endif
#line 142
    #if UNROLL > 12
        npyv_s8 neg_12 = npyv_negative_s8(npyv_load_s8(ip + 12 * lanes));
        npyv_storen_s8(op + 12 * lanes * ostride, ostride, neg_12);
    #endif
#line 142
    #if UNROLL > 13
        npyv_s8 neg_13 = npyv_negative_s8(npyv_load_s8(ip + 13 * lanes));
        npyv_storen_s8(op + 13 * lanes * ostride, ostride, neg_13);
    #endif
#line 142
    #if UNROLL > 14
        npyv_s8 neg_14 = npyv_negative_s8(npyv_load_s8(ip + 14 * lanes));
        npyv_storen_s8(op + 14 * lanes * ostride, ostride, neg_14);
    #endif
#line 142
    #if UNROLL > 15
        npyv_s8 neg_15 = npyv_negative_s8(npyv_load_s8(ip + 15 * lanes));
        npyv_storen_s8(op + 15 * lanes * ostride, ostride, neg_15);
    #endif
        ip += block;
        op += ostride*block;
        len -= block;
    }
    // pass 2: one vector per iteration
    while (len >= lanes) {
        npyv_s8 neg = npyv_negative_s8(npyv_load_s8(ip));
        npyv_storen_s8(op, ostride, neg);
        ip += lanes;
        op += ostride*lanes;
        len -= lanes;
    }
    // pass 3: leftover elements, one at a time
    while (len > 0) {
        *op = scalar_negative(*ip);
        ++ip;
        op += ostride;
        --len;
    }
}
// Strided input, contiguous output.
// Reads `len` elements from `ip`, advancing by `istride` elements per read,
// and writes their negation to consecutive positions at `op`. Runs in three
// passes: UNROLL vectors per iteration, one vector per iteration, then
// element by element with scalar_negative().
static NPY_INLINE void
simd_unary_nc_negative_s8(const npyv_lanetype_s8 *ip, npy_intp istride,
                             npyv_lanetype_s8 *op,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_s8;
    const int block = lanes * UNROLL;

    // pass 1: UNROLL vectors per iteration
    while (len >= block) {
#line 174
    #if UNROLL > 0
        npyv_s8 neg_0 = npyv_negative_s8(npyv_loadn_s8(ip + 0 * lanes * istride, istride));
        npyv_store_s8(op + 0 * lanes, neg_0);
    #endif
#line 174
    #if UNROLL > 1
        npyv_s8 neg_1 = npyv_negative_s8(npyv_loadn_s8(ip + 1 * lanes * istride, istride));
        npyv_store_s8(op + 1 * lanes, neg_1);
    #endif
#line 174
    #if UNROLL > 2
        npyv_s8 neg_2 = npyv_negative_s8(npyv_loadn_s8(ip + 2 * lanes * istride, istride));
        npyv_store_s8(op + 2 * lanes, neg_2);
    #endif
#line 174
    #if UNROLL > 3
        npyv_s8 neg_3 = npyv_negative_s8(npyv_loadn_s8(ip + 3 * lanes * istride, istride));
        npyv_store_s8(op + 3 * lanes, neg_3);
    #endif
#line 174
    #if UNROLL > 4
        npyv_s8 neg_4 = npyv_negative_s8(npyv_loadn_s8(ip + 4 * lanes * istride, istride));
        npyv_store_s8(op + 4 * lanes, neg_4);
    #endif
#line 174
    #if UNROLL > 5
        npyv_s8 neg_5 = npyv_negative_s8(npyv_loadn_s8(ip + 5 * lanes * istride, istride));
        npyv_store_s8(op + 5 * lanes, neg_5);
    #endif
#line 174
    #if UNROLL > 6
        npyv_s8 neg_6 = npyv_negative_s8(npyv_loadn_s8(ip + 6 * lanes * istride, istride));
        npyv_store_s8(op + 6 * lanes, neg_6);
    #endif
#line 174
    #if UNROLL > 7
        npyv_s8 neg_7 = npyv_negative_s8(npyv_loadn_s8(ip + 7 * lanes * istride, istride));
        npyv_store_s8(op + 7 * lanes, neg_7);
    #endif
#line 174
    #if UNROLL > 8
        npyv_s8 neg_8 = npyv_negative_s8(npyv_loadn_s8(ip + 8 * lanes * istride, istride));
        npyv_store_s8(op + 8 * lanes, neg_8);
    #endif
#line 174
    #if UNROLL > 9
        npyv_s8 neg_9 = npyv_negative_s8(npyv_loadn_s8(ip + 9 * lanes * istride, istride));
        npyv_store_s8(op + 9 * lanes, neg_9);
    #endif
#line 174
    #if UNROLL > 10
        npyv_s8 neg_10 = npyv_negative_s8(npyv_loadn_s8(ip + 10 * lanes * istride, istride));
        npyv_store_s8(op + 10 * lanes, neg_10);
    #endif
#line 174
    #if UNROLL > 11
        npyv_s8 neg_11 = npyv_negative_s8(npyv_loadn_s8(ip + 11 * lanes * istride, istride));
        npyv_store_s8(op + 11 * lanes, neg_11);
    #endif
#line 174
    #if UNROLL > 12
        npyv_s8 neg_12 = npyv_negative_s8(npyv_loadn_s8(ip + 12 * lanes * istride, istride));
        npyv_store_s8(op + 12 * lanes, neg_12);
    #endif
#line 174
    #if UNROLL > 13
        npyv_s8 neg_13 = npyv_negative_s8(npyv_loadn_s8(ip + 13 * lanes * istride, istride));
        npyv_store_s8(op + 13 * lanes, neg_13);
    #endif
#line 174
    #if UNROLL > 14
        npyv_s8 neg_14 = npyv_negative_s8(npyv_loadn_s8(ip + 14 * lanes * istride, istride));
        npyv_store_s8(op + 14 * lanes, neg_14);
    #endif
#line 174
    #if UNROLL > 15
        npyv_s8 neg_15 = npyv_negative_s8(npyv_loadn_s8(ip + 15 * lanes * istride, istride));
        npyv_store_s8(op + 15 * lanes, neg_15);
    #endif
        ip += istride*block;
        op += block;
        len -= block;
    }
    // pass 2: one vector per iteration
    while (len >= lanes) {
        npyv_s8 neg = npyv_negative_s8(npyv_loadn_s8(ip, istride));
        npyv_store_s8(op, neg);
        ip += istride*lanes;
        op += lanes;
        len -= lanes;
    }
    // pass 3: leftover elements, one at a time
    while (len > 0) {
        *op = scalar_negative(*ip);
        ip += istride;
        ++op;
        --len;
    }
}
// non-contiguous input and output
// limit unroll to 2x
#if UNROLL > 2
#undef UNROLL
#define UNROLL 2
#endif
// X86 does better with unrolled scalar for heavy non-contiguous
#ifndef NPY_HAVE_SSE2
// Strided input, strided output.
// Reads `len` elements from `ip` (advancing `istride` elements per read) and
// writes their negation to `op` (advancing `ostride` elements per write).
// Runs in three passes: UNROLL vectors per iteration, one vector per
// iteration, then element by element with scalar_negative().
static NPY_INLINE void
simd_unary_nn_negative_s8(const npyv_lanetype_s8 *ip, npy_intp istride,
                             npyv_lanetype_s8 *op, npy_intp ostride,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_s8;
    const int block = lanes * UNROLL;

    // pass 1: UNROLL vectors per iteration
    while (len >= block) {
#line 213
    #if UNROLL > 0
        npyv_s8 neg_0 = npyv_negative_s8(npyv_loadn_s8(ip + 0 * lanes * istride, istride));
        npyv_storen_s8(op + 0 * lanes * ostride, ostride, neg_0);
    #endif
#line 213
    #if UNROLL > 1
        npyv_s8 neg_1 = npyv_negative_s8(npyv_loadn_s8(ip + 1 * lanes * istride, istride));
        npyv_storen_s8(op + 1 * lanes * ostride, ostride, neg_1);
    #endif
#line 213
    #if UNROLL > 2
        npyv_s8 neg_2 = npyv_negative_s8(npyv_loadn_s8(ip + 2 * lanes * istride, istride));
        npyv_storen_s8(op + 2 * lanes * ostride, ostride, neg_2);
    #endif
#line 213
    #if UNROLL > 3
        npyv_s8 neg_3 = npyv_negative_s8(npyv_loadn_s8(ip + 3 * lanes * istride, istride));
        npyv_storen_s8(op + 3 * lanes * ostride, ostride, neg_3);
    #endif
#line 213
    #if UNROLL > 4
        npyv_s8 neg_4 = npyv_negative_s8(npyv_loadn_s8(ip + 4 * lanes * istride, istride));
        npyv_storen_s8(op + 4 * lanes * ostride, ostride, neg_4);
    #endif
#line 213
    #if UNROLL > 5
        npyv_s8 neg_5 = npyv_negative_s8(npyv_loadn_s8(ip + 5 * lanes * istride, istride));
        npyv_storen_s8(op + 5 * lanes * ostride, ostride, neg_5);
    #endif
#line 213
    #if UNROLL > 6
        npyv_s8 neg_6 = npyv_negative_s8(npyv_loadn_s8(ip + 6 * lanes * istride, istride));
        npyv_storen_s8(op + 6 * lanes * ostride, ostride, neg_6);
    #endif
#line 213
    #if UNROLL > 7
        npyv_s8 neg_7 = npyv_negative_s8(npyv_loadn_s8(ip + 7 * lanes * istride, istride));
        npyv_storen_s8(op + 7 * lanes * ostride, ostride, neg_7);
    #endif
#line 213
    #if UNROLL > 8
        npyv_s8 neg_8 = npyv_negative_s8(npyv_loadn_s8(ip + 8 * lanes * istride, istride));
        npyv_storen_s8(op + 8 * lanes * ostride, ostride, neg_8);
    #endif
#line 213
    #if UNROLL > 9
        npyv_s8 neg_9 = npyv_negative_s8(npyv_loadn_s8(ip + 9 * lanes * istride, istride));
        npyv_storen_s8(op + 9 * lanes * ostride, ostride, neg_9);
    #endif
#line 213
    #if UNROLL > 10
        npyv_s8 neg_10 = npyv_negative_s8(npyv_loadn_s8(ip + 10 * lanes * istride, istride));
        npyv_storen_s8(op + 10 * lanes * ostride, ostride, neg_10);
    #endif
#line 213
    #if UNROLL > 11
        npyv_s8 neg_11 = npyv_negative_s8(npyv_loadn_s8(ip + 11 * lanes * istride, istride));
        npyv_storen_s8(op + 11 * lanes * ostride, ostride, neg_11);
    #endif
#line 213
    #if UNROLL > 12
        npyv_s8 neg_12 = npyv_negative_s8(npyv_loadn_s8(ip + 12 * lanes * istride, istride));
        npyv_storen_s8(op + 12 * lanes * ostride, ostride, neg_12);
    #endif
#line 213
    #if UNROLL > 13
        npyv_s8 neg_13 = npyv_negative_s8(npyv_loadn_s8(ip + 13 * lanes * istride, istride));
        npyv_storen_s8(op + 13 * lanes * ostride, ostride, neg_13);
    #endif
#line 213
    #if UNROLL > 14
        npyv_s8 neg_14 = npyv_negative_s8(npyv_loadn_s8(ip + 14 * lanes * istride, istride));
        npyv_storen_s8(op + 14 * lanes * ostride, ostride, neg_14);
    #endif
#line 213
    #if UNROLL > 15
        npyv_s8 neg_15 = npyv_negative_s8(npyv_loadn_s8(ip + 15 * lanes * istride, istride));
        npyv_storen_s8(op + 15 * lanes * ostride, ostride, neg_15);
    #endif
        ip += istride*block;
        op += ostride*block;
        len -= block;
    }
    // pass 2: one vector per iteration
    while (len >= lanes) {
        npyv_s8 neg = npyv_negative_s8(npyv_loadn_s8(ip, istride));
        npyv_storen_s8(op, ostride, neg);
        ip += istride*lanes;
        op += ostride*lanes;
        len -= lanes;
    }
    // pass 3: leftover elements, one at a time
    while (len > 0) {
        *op = scalar_negative(*ip);
        ip += istride;
        op += ostride;
        --len;
    }
}
#endif // NPY_HAVE_SSE2
#endif // 0
#undef UNROLL
#endif // NPY_SIMD
/*end repeat1**/

#line 80
#line 85
#if NPY_SIMD
// Select UNROLL, the number of vectors handled per iteration of the unrolled
// loops below. The literal 4 is the requested factor (presumably substituted
// by the template generator -- confirm in the .src file): it is rejected if
// below 1, reduced to 2 when NPY_SIMD is not 128, and used as-is otherwise.
#if 4 < 1
#error "Unroll must be at least 1"
#elif NPY_SIMD != 128 && 4 > 2
// Avoid memory bandwidth bottleneck for larger SIMD
#define UNROLL 2
#else
#define UNROLL 4
#endif
// Contiguous input, contiguous output.
// Writes the negation of each of the `len` elements at `ip` to the matching
// position at `op`, in three passes: UNROLL vectors per iteration, then one
// vector per iteration, then element by element with scalar_negative().
static NPY_INLINE void
simd_unary_cc_negative_u8(const npyv_lanetype_u8 *ip,
                             npyv_lanetype_u8 *op,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_u8;
    const int block = lanes * UNROLL;

    // pass 1: UNROLL vectors per iteration
    while (len >= block) {
#line 108
    #if UNROLL > 0
        npyv_u8 neg_0 = npyv_negative_u8(npyv_load_u8(ip + 0 * lanes));
        npyv_store_u8(op + 0 * lanes, neg_0);
    #endif
#line 108
    #if UNROLL > 1
        npyv_u8 neg_1 = npyv_negative_u8(npyv_load_u8(ip + 1 * lanes));
        npyv_store_u8(op + 1 * lanes, neg_1);
    #endif
#line 108
    #if UNROLL > 2
        npyv_u8 neg_2 = npyv_negative_u8(npyv_load_u8(ip + 2 * lanes));
        npyv_store_u8(op + 2 * lanes, neg_2);
    #endif
#line 108
    #if UNROLL > 3
        npyv_u8 neg_3 = npyv_negative_u8(npyv_load_u8(ip + 3 * lanes));
        npyv_store_u8(op + 3 * lanes, neg_3);
    #endif
#line 108
    #if UNROLL > 4
        npyv_u8 neg_4 = npyv_negative_u8(npyv_load_u8(ip + 4 * lanes));
        npyv_store_u8(op + 4 * lanes, neg_4);
    #endif
#line 108
    #if UNROLL > 5
        npyv_u8 neg_5 = npyv_negative_u8(npyv_load_u8(ip + 5 * lanes));
        npyv_store_u8(op + 5 * lanes, neg_5);
    #endif
#line 108
    #if UNROLL > 6
        npyv_u8 neg_6 = npyv_negative_u8(npyv_load_u8(ip + 6 * lanes));
        npyv_store_u8(op + 6 * lanes, neg_6);
    #endif
#line 108
    #if UNROLL > 7
        npyv_u8 neg_7 = npyv_negative_u8(npyv_load_u8(ip + 7 * lanes));
        npyv_store_u8(op + 7 * lanes, neg_7);
    #endif
#line 108
    #if UNROLL > 8
        npyv_u8 neg_8 = npyv_negative_u8(npyv_load_u8(ip + 8 * lanes));
        npyv_store_u8(op + 8 * lanes, neg_8);
    #endif
#line 108
    #if UNROLL > 9
        npyv_u8 neg_9 = npyv_negative_u8(npyv_load_u8(ip + 9 * lanes));
        npyv_store_u8(op + 9 * lanes, neg_9);
    #endif
#line 108
    #if UNROLL > 10
        npyv_u8 neg_10 = npyv_negative_u8(npyv_load_u8(ip + 10 * lanes));
        npyv_store_u8(op + 10 * lanes, neg_10);
    #endif
#line 108
    #if UNROLL > 11
        npyv_u8 neg_11 = npyv_negative_u8(npyv_load_u8(ip + 11 * lanes));
        npyv_store_u8(op + 11 * lanes, neg_11);
    #endif
#line 108
    #if UNROLL > 12
        npyv_u8 neg_12 = npyv_negative_u8(npyv_load_u8(ip + 12 * lanes));
        npyv_store_u8(op + 12 * lanes, neg_12);
    #endif
#line 108
    #if UNROLL > 13
        npyv_u8 neg_13 = npyv_negative_u8(npyv_load_u8(ip + 13 * lanes));
        npyv_store_u8(op + 13 * lanes, neg_13);
    #endif
#line 108
    #if UNROLL > 14
        npyv_u8 neg_14 = npyv_negative_u8(npyv_load_u8(ip + 14 * lanes));
        npyv_store_u8(op + 14 * lanes, neg_14);
    #endif
#line 108
    #if UNROLL > 15
        npyv_u8 neg_15 = npyv_negative_u8(npyv_load_u8(ip + 15 * lanes));
        npyv_store_u8(op + 15 * lanes, neg_15);
    #endif
        ip += block;
        op += block;
        len -= block;
    }
    // pass 2: one vector per iteration
    while (len >= lanes) {
        npyv_u8 neg = npyv_negative_u8(npyv_load_u8(ip));
        npyv_store_u8(op, neg);
        ip += lanes;
        op += lanes;
        len -= lanes;
    }
    // pass 3: leftover elements, one at a time
    while (len > 0) {
        *op = scalar_negative(*ip);
        ++ip;
        ++op;
        --len;
    }
}

#if 0
// Contiguous input, strided output.
// Reads `len` consecutive elements from `ip` and writes their negation to
// `op`, advancing `op` by `ostride` elements per result. Runs in three
// passes: UNROLL vectors per iteration, one vector per iteration, then
// element by element with scalar_negative().
static NPY_INLINE void
simd_unary_cn_negative_u8(const npyv_lanetype_u8 *ip,
                             npyv_lanetype_u8 *op, npy_intp ostride,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_u8;
    const int block = lanes * UNROLL;

    // pass 1: UNROLL vectors per iteration
    while (len >= block) {
#line 142
    #if UNROLL > 0
        npyv_u8 neg_0 = npyv_negative_u8(npyv_load_u8(ip + 0 * lanes));
        npyv_storen_u8(op + 0 * lanes * ostride, ostride, neg_0);
    #endif
#line 142
    #if UNROLL > 1
        npyv_u8 neg_1 = npyv_negative_u8(npyv_load_u8(ip + 1 * lanes));
        npyv_storen_u8(op + 1 * lanes * ostride, ostride, neg_1);
    #endif
#line 142
    #if UNROLL > 2
        npyv_u8 neg_2 = npyv_negative_u8(npyv_load_u8(ip + 2 * lanes));
        npyv_storen_u8(op + 2 * lanes * ostride, ostride, neg_2);
    #endif
#line 142
    #if UNROLL > 3
        npyv_u8 neg_3 = npyv_negative_u8(npyv_load_u8(ip + 3 * lanes));
        npyv_storen_u8(op + 3 * lanes * ostride, ostride, neg_3);
    #endif
#line 142
    #if UNROLL > 4
        npyv_u8 neg_4 = npyv_negative_u8(npyv_load_u8(ip + 4 * lanes));
        npyv_storen_u8(op + 4 * lanes * ostride, ostride, neg_4);
    #endif
#line 142
    #if UNROLL > 5
        npyv_u8 neg_5 = npyv_negative_u8(npyv_load_u8(ip + 5 * lanes));
        npyv_storen_u8(op + 5 * lanes * ostride, ostride, neg_5);
    #endif
#line 142
    #if UNROLL > 6
        npyv_u8 neg_6 = npyv_negative_u8(npyv_load_u8(ip + 6 * lanes));
        npyv_storen_u8(op + 6 * lanes * ostride, ostride, neg_6);
    #endif
#line 142
    #if UNROLL > 7
        npyv_u8 neg_7 = npyv_negative_u8(npyv_load_u8(ip + 7 * lanes));
        npyv_storen_u8(op + 7 * lanes * ostride, ostride, neg_7);
    #endif
#line 142
    #if UNROLL > 8
        npyv_u8 neg_8 = npyv_negative_u8(npyv_load_u8(ip + 8 * lanes));
        npyv_storen_u8(op + 8 * lanes * ostride, ostride, neg_8);
    #endif
#line 142
    #if UNROLL > 9
        npyv_u8 neg_9 = npyv_negative_u8(npyv_load_u8(ip + 9 * lanes));
        npyv_storen_u8(op + 9 * lanes * ostride, ostride, neg_9);
    #endif
#line 142
    #if UNROLL > 10
        npyv_u8 neg_10 = npyv_negative_u8(npyv_load_u8(ip + 10 * lanes));
        npyv_storen_u8(op + 10 * lanes * ostride, ostride, neg_10);
    #endif
#line 142
    #if UNROLL > 11
        npyv_u8 neg_11 = npyv_negative_u8(npyv_load_u8(ip + 11 * lanes));
        npyv_storen_u8(op + 11 * lanes * ostride, ostride, neg_11);
    #endif
#line 142
    #if UNROLL > 12
        npyv_u8 neg_12 = npyv_negative_u8(npyv_load_u8(ip + 12 * lanes));
        npyv_storen_u8(op + 12 * lanes * ostride, ostride, neg_12);
    #endif
#line 142
    #if UNROLL > 13
        npyv_u8 neg_13 = npyv_negative_u8(npyv_load_u8(ip + 13 * lanes));
        npyv_storen_u8(op + 13 * lanes * ostride, ostride, neg_13);
    #endif
#line 142
    #if UNROLL > 14
        npyv_u8 neg_14 = npyv_negative_u8(npyv_load_u8(ip + 14 * lanes));
        npyv_storen_u8(op + 14 * lanes * ostride, ostride, neg_14);
    #endif
#line 142
    #if UNROLL > 15
        npyv_u8 neg_15 = npyv_negative_u8(npyv_load_u8(ip + 15 * lanes));
        npyv_storen_u8(op + 15 * lanes * ostride, ostride, neg_15);
    #endif
        ip += block;
        op += ostride*block;
        len -= block;
    }
    // pass 2: one vector per iteration
    while (len >= lanes) {
        npyv_u8 neg = npyv_negative_u8(npyv_load_u8(ip));
        npyv_storen_u8(op, ostride, neg);
        ip += lanes;
        op += ostride*lanes;
        len -= lanes;
    }
    // pass 3: leftover elements, one at a time
    while (len > 0) {
        *op = scalar_negative(*ip);
        ++ip;
        op += ostride;
        --len;
    }
}
// non-contiguous input, contiguous output
static NPY_INLINE void
simd_unary_nc_negative_u8(const npyv_lanetype_u8 *ip, npy_intp istride,
                             npyv_lanetype_u8 *op,
                             npy_intp len)
{
    const int vstep = npyv_nlanes_u8;
    const int wstep = vstep * UNROLL;

    // unrolled vector loop
    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) {
    #line 174
    #if UNROLL > 0
        npyv_u8 v_0 = npyv_loadn_u8(ip + 0 * vstep * istride, istride);
        npyv_u8 r_0 = npyv_negative_u8(v_0);
        npyv_store_u8(op + 0 * vstep, r_0);
    #endif
    
#line 174
    #if UNROLL > 1
        npyv_u8 v_1 = npyv_loadn_u8(ip + 1 * vstep * istride, istride);
        npyv_u8 r_1 = npyv_negative_u8(v_1);
        npyv_store_u8(op + 1 * vstep, r_1);
    #endif
    
#line 174
    #if UNROLL > 2
        npyv_u8 v_2 = npyv_loadn_u8(ip + 2 * vstep * istride, istride);
        npyv_u8 r_2 = npyv_negative_u8(v_2);
        npyv_store_u8(op + 2 * vstep, r_2);
    #endif
    
#line 174
    #if UNROLL > 3
        npyv_u8 v_3 = npyv_loadn_u8(ip + 3 * vstep * istride, istride);
        npyv_u8 r_3 = npyv_negative_u8(v_3);
        npyv_store_u8(op + 3 * vstep, r_3);
    #endif
    
#line 174
    #if UNROLL > 4
        npyv_u8 v_4 = npyv_loadn_u8(ip + 4 * vstep * istride, istride);
        npyv_u8 r_4 = npyv_negative_u8(v_4);
        npyv_store_u8(op + 4 * vstep, r_4);
    #endif
    
#line 174
    #if UNROLL > 5
        npyv_u8 v_5 = npyv_loadn_u8(ip + 5 * vstep * istride, istride);
        npyv_u8 r_5 = npyv_negative_u8(v_5);
        npyv_store_u8(op + 5 * vstep, r_5);
    #endif
    
#line 174
    #if UNROLL > 6
        npyv_u8 v_6 = npyv_loadn_u8(ip + 6 * vstep * istride, istride);
        npyv_u8 r_6 = npyv_negative_u8(v_6);
        npyv_store_u8(op + 6 * vstep, r_6);
    #endif
    
#line 174
    #if UNROLL > 7
        npyv_u8 v_7 = npyv_loadn_u8(ip + 7 * vstep * istride, istride);
        npyv_u8 r_7 = npyv_negative_u8(v_7);
        npyv_store_u8(op + 7 * vstep, r_7);
    #endif
    
#line 174
    #if UNROLL > 8
        npyv_u8 v_8 = npyv_loadn_u8(ip + 8 * vstep * istride, istride);
        npyv_u8 r_8 = npyv_negative_u8(v_8);
        npyv_store_u8(op + 8 * vstep, r_8);
    #endif
    
#line 174
    #if UNROLL > 9
        npyv_u8 v_9 = npyv_loadn_u8(ip + 9 * vstep * istride, istride);
        npyv_u8 r_9 = npyv_negative_u8(v_9);
        npyv_store_u8(op + 9 * vstep, r_9);
    #endif
    
#line 174
    #if UNROLL > 10
        npyv_u8 v_10 = npyv_loadn_u8(ip + 10 * vstep * istride, istride);
        npyv_u8 r_10 = npyv_negative_u8(v_10);
        npyv_store_u8(op + 10 * vstep, r_10);
    #endif
    
#line 174
    #if UNROLL > 11
        npyv_u8 v_11 = npyv_loadn_u8(ip + 11 * vstep * istride, istride);
        npyv_u8 r_11 = npyv_negative_u8(v_11);
        npyv_store_u8(op + 11 * vstep, r_11);
    #endif
    
#line 174
    #if UNROLL > 12
        npyv_u8 v_12 = npyv_loadn_u8(ip + 12 * vstep * istride, istride);
        npyv_u8 r_12 = npyv_negative_u8(v_12);
        npyv_store_u8(op + 12 * vstep, r_12);
    #endif
    
#line 174
    #if UNROLL > 13
        npyv_u8 v_13 = npyv_loadn_u8(ip + 13 * vstep * istride, istride);
        npyv_u8 r_13 = npyv_negative_u8(v_13);
        npyv_store_u8(op + 13 * vstep, r_13);
    #endif
    
#line 174
    #if UNROLL > 14
        npyv_u8 v_14 = npyv_loadn_u8(ip + 14 * vstep * istride, istride);
        npyv_u8 r_14 = npyv_negative_u8(v_14);
        npyv_store_u8(op + 14 * vstep, r_14);
    #endif
    
#line 174
    #if UNROLL > 15
        npyv_u8 v_15 = npyv_loadn_u8(ip + 15 * vstep * istride, istride);
        npyv_u8 r_15 = npyv_negative_u8(v_15);
        npyv_store_u8(op + 15 * vstep, r_15);
    #endif
    
    }
    // single vector loop
    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) {
        npyv_u8 v = npyv_loadn_u8(ip, istride);
        npyv_u8 r = npyv_negative_u8(v);
        npyv_store_u8(op, r);
    }
    // scalar finish up any remaining iterations
    for (; len > 0; --len, ip += istride, ++op) {
        *op = scalar_negative(*ip);
    }
}
// non-contiguous input and output
// limit unroll to 2x
#if UNROLL > 2
#undef UNROLL
#define UNROLL 2
#endif
// x86 does better with an unrolled scalar loop for heavily non-contiguous
// data, so the SIMD variant below is only built when SSE2 is unavailable.
#ifndef NPY_HAVE_SSE2
// Strided input, strided output (u8 lanes).
//
// Reads `len` elements starting at `ip`, stepping `istride` elements between
// reads, negates each one and writes it to `op`, stepping `ostride` elements
// between writes. Both strides are applied through typed-pointer arithmetic,
// i.e. they count elements rather than bytes.
//
// NOTE(review): the `#endif // 0` that follows this function suggests it sits
// in a disabled region whose opening `#if` is further up the file - confirm
// before relying on it being compiled.
static NPY_INLINE void
simd_unary_nn_negative_u8(const npyv_lanetype_u8 *ip, npy_intp istride,
                          npyv_lanetype_u8 *op, npy_intp ostride,
                          npy_intp len)
{
    const int vstep = npyv_nlanes_u8;
    const int wstep = vstep * UNROLL;

    // Phase 1: UNROLL vectors per pass. UNROLL is a compile-time constant, so
    // the inner loop has a fixed trip count the optimizer can flatten.
    while (len >= wstep) {
        for (int k = 0; k < UNROLL; ++k) {
            const npyv_u8 src = npyv_loadn_u8(ip + k * vstep * istride, istride);
            const npyv_u8 neg = npyv_negative_u8(src);
            npyv_storen_u8(op + k * vstep * ostride, ostride, neg);
        }
        ip  += istride * wstep;
        op  += ostride * wstep;
        len -= wstep;
    }

    // Phase 2: one vector per pass for what no longer fills a full block.
    while (len >= vstep) {
        const npyv_u8 src = npyv_loadn_u8(ip, istride);
        const npyv_u8 neg = npyv_negative_u8(src);
        npyv_storen_u8(op, ostride, neg);
        ip  += istride * vstep;
        op  += ostride * vstep;
        len -= vstep;
    }

    // Phase 3: fewer than vstep elements remain; handle them one by one.
    while (len > 0) {
        *op = scalar_negative(*ip);
        ip += istride;
        op += ostride;
        --len;
    }
}
#endif // NPY_HAVE_SSE2
#endif // 0
#undef UNROLL
#endif // NPY_SIMD
/*end repeat1**/

#line 80
#line 85
#if NPY_SIMD
#if 4 < 1
#error "Unroll must be at least 1"
#elif NPY_SIMD != 128 && 4 > 2
// Avoid memory bandwidth bottleneck for larger SIMD
#define UNROLL 2
#else
#define UNROLL 4
#endif
// Contiguous input, contiguous output (s16 lanes).
//
// Stores the negation of each of the `len` elements at `ip` into the matching
// slot of `op`. The work is split into three phases: blocks of UNROLL
// vectors, then single vectors, then a scalar tail for the leftovers.
static NPY_INLINE void
simd_unary_cc_negative_s16(const npyv_lanetype_s16 *ip,
                           npyv_lanetype_s16 *op,
                           npy_intp len)
{
    const int vstep = npyv_nlanes_s16;
    const int wstep = vstep * UNROLL;

    // Phase 1: UNROLL vectors per pass. UNROLL is a compile-time constant, so
    // the inner loop has a fixed trip count the optimizer can flatten.
    while (len >= wstep) {
        for (int k = 0; k < UNROLL; ++k) {
            const npyv_s16 src = npyv_load_s16(ip + k * vstep);
            const npyv_s16 neg = npyv_negative_s16(src);
            npyv_store_s16(op + k * vstep, neg);
        }
        ip  += wstep;
        op  += wstep;
        len -= wstep;
    }

    // Phase 2: one vector per pass for what no longer fills a full block.
    while (len >= vstep) {
        const npyv_s16 src = npyv_load_s16(ip);
        const npyv_s16 neg = npyv_negative_s16(src);
        npyv_store_s16(op, neg);
        ip  += vstep;
        op  += vstep;
        len -= vstep;
    }

    // Phase 3: fewer than vstep elements remain; handle them one by one.
    while (len > 0) {
        *op = scalar_negative(*ip);
        ++ip;
        ++op;
        --len;
    }
}

#if 0
// Contiguous input, strided output (s16 lanes).
//
// Reads `len` consecutive elements from `ip`, negates them and writes each
// result to `op`, stepping `ostride` elements between writes (the stride is
// applied through typed-pointer arithmetic, so it counts elements).
//
// This definition is enclosed by the `#if 0` directly above it and is
// therefore not compiled at present.
static NPY_INLINE void
simd_unary_cn_negative_s16(const npyv_lanetype_s16 *ip,
                           npyv_lanetype_s16 *op, npy_intp ostride,
                           npy_intp len)
{
    const int vstep = npyv_nlanes_s16;
    const int wstep = vstep * UNROLL;

    // Phase 1: UNROLL vectors per pass. UNROLL is a compile-time constant, so
    // the inner loop has a fixed trip count the optimizer can flatten.
    while (len >= wstep) {
        for (int k = 0; k < UNROLL; ++k) {
            const npyv_s16 src = npyv_load_s16(ip + k * vstep);
            const npyv_s16 neg = npyv_negative_s16(src);
            npyv_storen_s16(op + k * vstep * ostride, ostride, neg);
        }
        ip  += wstep;
        op  += ostride * wstep;
        len -= wstep;
    }

    // Phase 2: one vector per pass for what no longer fills a full block.
    while (len >= vstep) {
        const npyv_s16 src = npyv_load_s16(ip);
        const npyv_s16 neg = npyv_negative_s16(src);
        npyv_storen_s16(op, ostride, neg);
        ip  += vstep;
        op  += ostride * vstep;
        len -= vstep;
    }

    // Phase 3: fewer than vstep elements remain; handle them one by one.
    while (len > 0) {
        *op = scalar_negative(*ip);
        ++ip;
        op += ostride;
        --len;
    }
}
// Strided input, contiguous output (s16 lanes).
//
// Reads `len` elements starting at `ip`, stepping `istride` elements between
// reads (the stride is applied through typed-pointer arithmetic, so it counts
// elements), negates them and writes the results consecutively to `op`.
//
// This definition lies inside an `#if 0` ... `#endif // 0` region of the file
// and is therefore not compiled at present.
static NPY_INLINE void
simd_unary_nc_negative_s16(const npyv_lanetype_s16 *ip, npy_intp istride,
                           npyv_lanetype_s16 *op,
                           npy_intp len)
{
    const int vstep = npyv_nlanes_s16;
    const int wstep = vstep * UNROLL;

    // Phase 1: UNROLL vectors per pass. UNROLL is a compile-time constant, so
    // the inner loop has a fixed trip count the optimizer can flatten.
    while (len >= wstep) {
        for (int k = 0; k < UNROLL; ++k) {
            const npyv_s16 src = npyv_loadn_s16(ip + k * vstep * istride, istride);
            const npyv_s16 neg = npyv_negative_s16(src);
            npyv_store_s16(op + k * vstep, neg);
        }
        ip  += istride * wstep;
        op  += wstep;
        len -= wstep;
    }

    // Phase 2: one vector per pass for what no longer fills a full block.
    while (len >= vstep) {
        const npyv_s16 src = npyv_loadn_s16(ip, istride);
        const npyv_s16 neg = npyv_negative_s16(src);
        npyv_store_s16(op, neg);
        ip  += istride * vstep;
        op  += vstep;
        len -= vstep;
    }

    // Phase 3: fewer than vstep elements remain; handle them one by one.
    while (len > 0) {
        *op = scalar_negative(*ip);
        ip += istride;
        ++op;
        --len;
    }
}
// non-contiguous input and output
// limit unroll to 2x
#if UNROLL > 2
#undef UNROLL
#define UNROLL 2
#endif
// x86 does better with an unrolled scalar loop for heavily non-contiguous
// data, so the SIMD variant below is only built when SSE2 is unavailable.
#ifndef NPY_HAVE_SSE2
// Strided input, strided output (s16 lanes).
//
// Reads `len` elements starting at `ip`, stepping `istride` elements between
// reads, negates each one and writes it to `op`, stepping `ostride` elements
// between writes. Both strides are applied through typed-pointer arithmetic,
// i.e. they count elements rather than bytes.
//
// This definition lies inside an `#if 0` ... `#endif // 0` region of the file
// and is therefore not compiled at present.
static NPY_INLINE void
simd_unary_nn_negative_s16(const npyv_lanetype_s16 *ip, npy_intp istride,
                           npyv_lanetype_s16 *op, npy_intp ostride,
                           npy_intp len)
{
    const int vstep = npyv_nlanes_s16;
    const int wstep = vstep * UNROLL;

    // Phase 1: UNROLL vectors per pass. UNROLL is a compile-time constant, so
    // the inner loop has a fixed trip count the optimizer can flatten.
    while (len >= wstep) {
        for (int k = 0; k < UNROLL; ++k) {
            const npyv_s16 src = npyv_loadn_s16(ip + k * vstep * istride, istride);
            const npyv_s16 neg = npyv_negative_s16(src);
            npyv_storen_s16(op + k * vstep * ostride, ostride, neg);
        }
        ip  += istride * wstep;
        op  += ostride * wstep;
        len -= wstep;
    }

    // Phase 2: one vector per pass for what no longer fills a full block.
    while (len >= vstep) {
        const npyv_s16 src = npyv_loadn_s16(ip, istride);
        const npyv_s16 neg = npyv_negative_s16(src);
        npyv_storen_s16(op, ostride, neg);
        ip  += istride * vstep;
        op  += ostride * vstep;
        len -= vstep;
    }

    // Phase 3: fewer than vstep elements remain; handle them one by one.
    while (len > 0) {
        *op = scalar_negative(*ip);
        ip += istride;
        op += ostride;
        --len;
    }
}
#endif // NPY_HAVE_SSE2
#endif // 0
#undef UNROLL
#endif // NPY_SIMD
/*end repeat1**/

#line 80
#line 85
#if NPY_SIMD
#if 4 < 1
#error "Unroll must be at least 1"
#elif NPY_SIMD != 128 && 4 > 2
// Avoid memory bandwidth bottleneck for larger SIMD
#define UNROLL 2
#else
#define UNROLL 4
#endif
// Contiguous input, contiguous output (u16 lanes).
//
// Stores the negation of each of the `len` elements at `ip` into the matching
// slot of `op`. The work is split into three phases: blocks of UNROLL
// vectors, then single vectors, then a scalar tail for the leftovers.
static NPY_INLINE void
simd_unary_cc_negative_u16(const npyv_lanetype_u16 *ip,
                           npyv_lanetype_u16 *op,
                           npy_intp len)
{
    const int vstep = npyv_nlanes_u16;
    const int wstep = vstep * UNROLL;

    // Phase 1: UNROLL vectors per pass. UNROLL is a compile-time constant, so
    // the inner loop has a fixed trip count the optimizer can flatten.
    while (len >= wstep) {
        for (int k = 0; k < UNROLL; ++k) {
            const npyv_u16 src = npyv_load_u16(ip + k * vstep);
            const npyv_u16 neg = npyv_negative_u16(src);
            npyv_store_u16(op + k * vstep, neg);
        }
        ip  += wstep;
        op  += wstep;
        len -= wstep;
    }

    // Phase 2: one vector per pass for what no longer fills a full block.
    while (len >= vstep) {
        const npyv_u16 src = npyv_load_u16(ip);
        const npyv_u16 neg = npyv_negative_u16(src);
        npyv_store_u16(op, neg);
        ip  += vstep;
        op  += vstep;
        len -= vstep;
    }

    // Phase 3: fewer than vstep elements remain; handle them one by one.
    while (len > 0) {
        *op = scalar_negative(*ip);
        ++ip;
        ++op;
        --len;
    }
}

#if 0
// Contiguous input, strided output (u16 lanes).
//
// Reads `len` consecutive elements from `ip`, negates them and writes each
// result to `op`, stepping `ostride` elements between writes (the stride is
// applied through typed-pointer arithmetic, so it counts elements).
//
// This definition is enclosed by the `#if 0` directly above it and is
// therefore not compiled at present.
static NPY_INLINE void
simd_unary_cn_negative_u16(const npyv_lanetype_u16 *ip,
                           npyv_lanetype_u16 *op, npy_intp ostride,
                           npy_intp len)
{
    const int vstep = npyv_nlanes_u16;
    const int wstep = vstep * UNROLL;

    // Phase 1: UNROLL vectors per pass. UNROLL is a compile-time constant, so
    // the inner loop has a fixed trip count the optimizer can flatten.
    while (len >= wstep) {
        for (int k = 0; k < UNROLL; ++k) {
            const npyv_u16 src = npyv_load_u16(ip + k * vstep);
            const npyv_u16 neg = npyv_negative_u16(src);
            npyv_storen_u16(op + k * vstep * ostride, ostride, neg);
        }
        ip  += wstep;
        op  += ostride * wstep;
        len -= wstep;
    }

    // Phase 2: one vector per pass for what no longer fills a full block.
    while (len >= vstep) {
        const npyv_u16 src = npyv_load_u16(ip);
        const npyv_u16 neg = npyv_negative_u16(src);
        npyv_storen_u16(op, ostride, neg);
        ip  += vstep;
        op  += ostride * vstep;
        len -= vstep;
    }

    // Phase 3: fewer than vstep elements remain; handle them one by one.
    while (len > 0) {
        *op = scalar_negative(*ip);
        ++ip;
        op += ostride;
        --len;
    }
}
// non-contiguous input, contiguous output
/*
 * Negate `len` elements read from a strided input and write the results to
 * a contiguous output.
 *
 * ip      - first input element
 * istride - distance between consecutive input elements, counted in
 *           elements (it is added directly to `ip`)
 * op      - first output element; results are written back to back
 * len     - number of elements to process
 *
 * The work is done in three passes: blocks of UNROLL vectors, then single
 * vectors, then one element at a time for whatever remains.
 *
 * NOTE(review): this definition lies inside a conditional region that is
 * closed by `#endif // 0` further down in the file; the opening directive is
 * not visible from here, so confirm whether this variant is compiled at all.
 */
static NPY_INLINE void
simd_unary_nc_negative_u16(const npyv_lanetype_u16 *ip, npy_intp istride,
                             npyv_lanetype_u16 *op,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_u16;
    const int block = lanes * UNROLL;

    // One step: strided load of vector K relative to the current position,
    // negate it, store it contiguously at the matching output offset.
#define NEG_NC_U16_STEP(K)                                                    \
    do {                                                                      \
        npyv_u16 src_k = npyv_loadn_u16(ip + (K) * lanes * istride, istride); \
        npyv_u16 neg_k = npyv_negative_u16(src_k);                            \
        npyv_store_u16(op + (K) * lanes, neg_k);                              \
    } while (0)

    // Pass 1: UNROLL vectors per iteration; steps >= UNROLL are compiled out.
    while (len >= block) {
    #if UNROLL > 0
        NEG_NC_U16_STEP(0);
    #endif
    #if UNROLL > 1
        NEG_NC_U16_STEP(1);
    #endif
    #if UNROLL > 2
        NEG_NC_U16_STEP(2);
    #endif
    #if UNROLL > 3
        NEG_NC_U16_STEP(3);
    #endif
    #if UNROLL > 4
        NEG_NC_U16_STEP(4);
    #endif
    #if UNROLL > 5
        NEG_NC_U16_STEP(5);
    #endif
    #if UNROLL > 6
        NEG_NC_U16_STEP(6);
    #endif
    #if UNROLL > 7
        NEG_NC_U16_STEP(7);
    #endif
    #if UNROLL > 8
        NEG_NC_U16_STEP(8);
    #endif
    #if UNROLL > 9
        NEG_NC_U16_STEP(9);
    #endif
    #if UNROLL > 10
        NEG_NC_U16_STEP(10);
    #endif
    #if UNROLL > 11
        NEG_NC_U16_STEP(11);
    #endif
    #if UNROLL > 12
        NEG_NC_U16_STEP(12);
    #endif
    #if UNROLL > 13
        NEG_NC_U16_STEP(13);
    #endif
    #if UNROLL > 14
        NEG_NC_U16_STEP(14);
    #endif
    #if UNROLL > 15
        NEG_NC_U16_STEP(15);
    #endif
        len -= block;
        ip += istride * block;
        op += block;
    }

    // Pass 2: one vector at a time.
    while (len >= lanes) {
        NEG_NC_U16_STEP(0);
        len -= lanes;
        ip += istride * lanes;
        op += lanes;
    }
#undef NEG_NC_U16_STEP

    // Pass 3: fewer than one vector left, finish element by element.
    while (len > 0) {
        *op = scalar_negative(*ip);
        ip += istride;
        ++op;
        --len;
    }
}
// non-contiguous input and output
// limit unroll to 2x
// From here until the `#undef UNROLL` further down, UNROLL is at most 2.
#if UNROLL > 2
// A macro has to be removed before it can be given a different value.
#undef UNROLL
#define UNROLL 2
#endif
// X86 does better with unrolled scalar for heavy non-contiguous
#ifndef NPY_HAVE_SSE2
/*
 * Negate `len` elements where both the input and the output are strided.
 *
 * ip      - first input element
 * istride - distance between consecutive input elements, in elements
 * op      - first output element
 * ostride - distance between consecutive output elements, in elements
 * len     - number of elements to process
 *
 * Only compiled when NPY_HAVE_SSE2 is not defined (see the surrounding
 * #ifndef). The directives just above cap UNROLL at 2, so at most the first
 * two steps of the ladder below are active.
 *
 * NOTE(review): this definition lies inside a conditional region that is
 * closed by `#endif // 0` further down in the file; the opening directive is
 * not visible from here, so confirm whether this variant is compiled at all.
 */
static NPY_INLINE void
simd_unary_nn_negative_u16(const npyv_lanetype_u16 *ip, npy_intp istride,
                             npyv_lanetype_u16 *op, npy_intp ostride,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_u16;
    const int block = lanes * UNROLL;

    // One step: strided load of vector K relative to the current position,
    // negate it, strided store at the matching output offset.
#define NEG_NN_U16_STEP(K)                                                    \
    do {                                                                      \
        npyv_u16 src_k = npyv_loadn_u16(ip + (K) * lanes * istride, istride); \
        npyv_u16 neg_k = npyv_negative_u16(src_k);                            \
        npyv_storen_u16(op + (K) * lanes * ostride, ostride, neg_k);          \
    } while (0)

    // Pass 1: UNROLL vectors per iteration; steps >= UNROLL are compiled out.
    while (len >= block) {
    #if UNROLL > 0
        NEG_NN_U16_STEP(0);
    #endif
    #if UNROLL > 1
        NEG_NN_U16_STEP(1);
    #endif
    #if UNROLL > 2
        NEG_NN_U16_STEP(2);
    #endif
    #if UNROLL > 3
        NEG_NN_U16_STEP(3);
    #endif
    #if UNROLL > 4
        NEG_NN_U16_STEP(4);
    #endif
    #if UNROLL > 5
        NEG_NN_U16_STEP(5);
    #endif
    #if UNROLL > 6
        NEG_NN_U16_STEP(6);
    #endif
    #if UNROLL > 7
        NEG_NN_U16_STEP(7);
    #endif
    #if UNROLL > 8
        NEG_NN_U16_STEP(8);
    #endif
    #if UNROLL > 9
        NEG_NN_U16_STEP(9);
    #endif
    #if UNROLL > 10
        NEG_NN_U16_STEP(10);
    #endif
    #if UNROLL > 11
        NEG_NN_U16_STEP(11);
    #endif
    #if UNROLL > 12
        NEG_NN_U16_STEP(12);
    #endif
    #if UNROLL > 13
        NEG_NN_U16_STEP(13);
    #endif
    #if UNROLL > 14
        NEG_NN_U16_STEP(14);
    #endif
    #if UNROLL > 15
        NEG_NN_U16_STEP(15);
    #endif
        len -= block;
        ip += istride * block;
        op += ostride * block;
    }

    // Pass 2: one vector at a time.
    while (len >= lanes) {
        NEG_NN_U16_STEP(0);
        len -= lanes;
        ip += istride * lanes;
        op += ostride * lanes;
    }
#undef NEG_NN_U16_STEP

    // Pass 3: fewer than one vector left, finish element by element.
    while (len > 0) {
        *op = scalar_negative(*ip);
        ip += istride;
        op += ostride;
        --len;
    }
}
#endif // NPY_HAVE_SSE2
#endif // 0
#undef UNROLL
#endif // NPY_SIMD
/*end repeat1**/

#line 80
#line 85
#if NPY_SIMD
// Choose how many vectors the unrolled loops below handle per iteration.
// NOTE(review): the literal 4 looks like an unroll factor substituted by the
// template generator -- confirm against the .src template.
#if 4 < 1
#error "Unroll must be at least 1"
#elif NPY_SIMD != 128 && 4 > 2
// Avoid memory bandwidth bottleneck for larger SIMD
#define UNROLL 2
#else
// Reached when NPY_SIMD is 128: keep the full factor.
#define UNROLL 4
#endif
// contiguous inputs and output.
/*
 * Negate `len` elements from a contiguous input into a contiguous output.
 *
 * ip  - first input element
 * op  - first output element
 * len - number of elements to process
 *
 * The work is done in three passes: blocks of UNROLL vectors, then single
 * vectors, then one element at a time for whatever remains.
 */
static NPY_INLINE void
simd_unary_cc_negative_s32(const npyv_lanetype_s32 *ip,
                             npyv_lanetype_s32 *op,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_s32;
    const int block = lanes * UNROLL;

    // One step: load vector K relative to the current position, negate it,
    // store it at the matching output offset.
#define NEG_CC_S32_STEP(K)                                      \
    do {                                                        \
        npyv_s32 src_k = npyv_load_s32(ip + (K) * lanes);       \
        npyv_s32 neg_k = npyv_negative_s32(src_k);              \
        npyv_store_s32(op + (K) * lanes, neg_k);                \
    } while (0)

    // Pass 1: UNROLL vectors per iteration; steps >= UNROLL are compiled out.
    while (len >= block) {
    #if UNROLL > 0
        NEG_CC_S32_STEP(0);
    #endif
    #if UNROLL > 1
        NEG_CC_S32_STEP(1);
    #endif
    #if UNROLL > 2
        NEG_CC_S32_STEP(2);
    #endif
    #if UNROLL > 3
        NEG_CC_S32_STEP(3);
    #endif
    #if UNROLL > 4
        NEG_CC_S32_STEP(4);
    #endif
    #if UNROLL > 5
        NEG_CC_S32_STEP(5);
    #endif
    #if UNROLL > 6
        NEG_CC_S32_STEP(6);
    #endif
    #if UNROLL > 7
        NEG_CC_S32_STEP(7);
    #endif
    #if UNROLL > 8
        NEG_CC_S32_STEP(8);
    #endif
    #if UNROLL > 9
        NEG_CC_S32_STEP(9);
    #endif
    #if UNROLL > 10
        NEG_CC_S32_STEP(10);
    #endif
    #if UNROLL > 11
        NEG_CC_S32_STEP(11);
    #endif
    #if UNROLL > 12
        NEG_CC_S32_STEP(12);
    #endif
    #if UNROLL > 13
        NEG_CC_S32_STEP(13);
    #endif
    #if UNROLL > 14
        NEG_CC_S32_STEP(14);
    #endif
    #if UNROLL > 15
        NEG_CC_S32_STEP(15);
    #endif
        len -= block;
        ip += block;
        op += block;
    }

    // Pass 2: one vector at a time.
    while (len >= lanes) {
        NEG_CC_S32_STEP(0);
        len -= lanes;
        ip += lanes;
        op += lanes;
    }
#undef NEG_CC_S32_STEP

    // Pass 3: fewer than one vector left, finish element by element.
    // NOTE(review): scalar_negative is a plain unary minus; if
    // npyv_lanetype_s32 is a signed 32-bit integer, negating its minimum
    // value overflows -- confirm that is acceptable here.
    while (len > 0) {
        *op = scalar_negative(*ip);
        ++ip;
        ++op;
        --len;
    }
}

#if 1
// contiguous input, non-contiguous output
/*
 * Negate `len` elements from a contiguous input into a strided output.
 *
 * ip      - first input element; read back to back
 * op      - first output element
 * ostride - distance between consecutive output elements, counted in
 *           elements (it is added directly to `op`)
 * len     - number of elements to process
 *
 * The work is done in three passes: blocks of UNROLL vectors, then single
 * vectors, then one element at a time for whatever remains.
 */
static NPY_INLINE void
simd_unary_cn_negative_s32(const npyv_lanetype_s32 *ip,
                             npyv_lanetype_s32 *op, npy_intp ostride,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_s32;
    const int block = lanes * UNROLL;

    // One step: load vector K relative to the current position, negate it,
    // strided store at the matching output offset.
#define NEG_CN_S32_STEP(K)                                              \
    do {                                                                \
        npyv_s32 src_k = npyv_load_s32(ip + (K) * lanes);               \
        npyv_s32 neg_k = npyv_negative_s32(src_k);                      \
        npyv_storen_s32(op + (K) * lanes * ostride, ostride, neg_k);    \
    } while (0)

    // Pass 1: UNROLL vectors per iteration; steps >= UNROLL are compiled out.
    while (len >= block) {
    #if UNROLL > 0
        NEG_CN_S32_STEP(0);
    #endif
    #if UNROLL > 1
        NEG_CN_S32_STEP(1);
    #endif
    #if UNROLL > 2
        NEG_CN_S32_STEP(2);
    #endif
    #if UNROLL > 3
        NEG_CN_S32_STEP(3);
    #endif
    #if UNROLL > 4
        NEG_CN_S32_STEP(4);
    #endif
    #if UNROLL > 5
        NEG_CN_S32_STEP(5);
    #endif
    #if UNROLL > 6
        NEG_CN_S32_STEP(6);
    #endif
    #if UNROLL > 7
        NEG_CN_S32_STEP(7);
    #endif
    #if UNROLL > 8
        NEG_CN_S32_STEP(8);
    #endif
    #if UNROLL > 9
        NEG_CN_S32_STEP(9);
    #endif
    #if UNROLL > 10
        NEG_CN_S32_STEP(10);
    #endif
    #if UNROLL > 11
        NEG_CN_S32_STEP(11);
    #endif
    #if UNROLL > 12
        NEG_CN_S32_STEP(12);
    #endif
    #if UNROLL > 13
        NEG_CN_S32_STEP(13);
    #endif
    #if UNROLL > 14
        NEG_CN_S32_STEP(14);
    #endif
    #if UNROLL > 15
        NEG_CN_S32_STEP(15);
    #endif
        len -= block;
        ip += block;
        op += ostride * block;
    }

    // Pass 2: one vector at a time.
    while (len >= lanes) {
        NEG_CN_S32_STEP(0);
        len -= lanes;
        ip += lanes;
        op += ostride * lanes;
    }
#undef NEG_CN_S32_STEP

    // Pass 3: fewer than one vector left, finish element by element.
    // NOTE(review): scalar_negative is a plain unary minus; if
    // npyv_lanetype_s32 is a signed 32-bit integer, negating its minimum
    // value overflows -- confirm that is acceptable here.
    while (len > 0) {
        *op = scalar_negative(*ip);
        ++ip;
        op += ostride;
        --len;
    }
}
// non-contiguous input, contiguous output
/*
 * Negate `len` elements read from a strided input and write the results to
 * a contiguous output.
 *
 * ip      - first input element
 * istride - distance between consecutive input elements, counted in
 *           elements (it is added directly to `ip`)
 * op      - first output element; results are written back to back
 * len     - number of elements to process
 *
 * The work is done in three passes: blocks of UNROLL vectors, then single
 * vectors, then one element at a time for whatever remains.
 */
static NPY_INLINE void
simd_unary_nc_negative_s32(const npyv_lanetype_s32 *ip, npy_intp istride,
                             npyv_lanetype_s32 *op,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_s32;
    const int block = lanes * UNROLL;

    // One step: strided load of vector K relative to the current position,
    // negate it, store it contiguously at the matching output offset.
#define NEG_NC_S32_STEP(K)                                                    \
    do {                                                                      \
        npyv_s32 src_k = npyv_loadn_s32(ip + (K) * lanes * istride, istride); \
        npyv_s32 neg_k = npyv_negative_s32(src_k);                            \
        npyv_store_s32(op + (K) * lanes, neg_k);                              \
    } while (0)

    // Pass 1: UNROLL vectors per iteration; steps >= UNROLL are compiled out.
    while (len >= block) {
    #if UNROLL > 0
        NEG_NC_S32_STEP(0);
    #endif
    #if UNROLL > 1
        NEG_NC_S32_STEP(1);
    #endif
    #if UNROLL > 2
        NEG_NC_S32_STEP(2);
    #endif
    #if UNROLL > 3
        NEG_NC_S32_STEP(3);
    #endif
    #if UNROLL > 4
        NEG_NC_S32_STEP(4);
    #endif
    #if UNROLL > 5
        NEG_NC_S32_STEP(5);
    #endif
    #if UNROLL > 6
        NEG_NC_S32_STEP(6);
    #endif
    #if UNROLL > 7
        NEG_NC_S32_STEP(7);
    #endif
    #if UNROLL > 8
        NEG_NC_S32_STEP(8);
    #endif
    #if UNROLL > 9
        NEG_NC_S32_STEP(9);
    #endif
    #if UNROLL > 10
        NEG_NC_S32_STEP(10);
    #endif
    #if UNROLL > 11
        NEG_NC_S32_STEP(11);
    #endif
    #if UNROLL > 12
        NEG_NC_S32_STEP(12);
    #endif
    #if UNROLL > 13
        NEG_NC_S32_STEP(13);
    #endif
    #if UNROLL > 14
        NEG_NC_S32_STEP(14);
    #endif
    #if UNROLL > 15
        NEG_NC_S32_STEP(15);
    #endif
        len -= block;
        ip += istride * block;
        op += block;
    }

    // Pass 2: one vector at a time.
    while (len >= lanes) {
        NEG_NC_S32_STEP(0);
        len -= lanes;
        ip += istride * lanes;
        op += lanes;
    }
#undef NEG_NC_S32_STEP

    // Pass 3: fewer than one vector left, finish element by element.
    // NOTE(review): scalar_negative is a plain unary minus; if
    // npyv_lanetype_s32 is a signed 32-bit integer, negating its minimum
    // value overflows -- confirm that is acceptable here.
    while (len > 0) {
        *op = scalar_negative(*ip);
        ip += istride;
        ++op;
        --len;
    }
}
// non-contiguous input and output
// limit unroll to 2x
// From here until the `#undef UNROLL` further down, UNROLL is at most 2.
#if UNROLL > 2
// A macro has to be removed before it can be given a different value.
#undef UNROLL
#define UNROLL 2
#endif
// X86 does better with unrolled scalar for heavy non-contiguous
#ifndef NPY_HAVE_SSE2
/*
 * Negate `len` elements where both the input and the output are strided.
 *
 * ip      - first input element
 * istride - distance between consecutive input elements, in elements
 * op      - first output element
 * ostride - distance between consecutive output elements, in elements
 * len     - number of elements to process
 *
 * Only compiled when NPY_HAVE_SSE2 is not defined (see the surrounding
 * #ifndef). The directives just above cap UNROLL at 2, so at most the first
 * two steps of the ladder below are active.
 */
static NPY_INLINE void
simd_unary_nn_negative_s32(const npyv_lanetype_s32 *ip, npy_intp istride,
                             npyv_lanetype_s32 *op, npy_intp ostride,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_s32;
    const int block = lanes * UNROLL;

    // One step: strided load of vector K relative to the current position,
    // negate it, strided store at the matching output offset.
#define NEG_NN_S32_STEP(K)                                                    \
    do {                                                                      \
        npyv_s32 src_k = npyv_loadn_s32(ip + (K) * lanes * istride, istride); \
        npyv_s32 neg_k = npyv_negative_s32(src_k);                            \
        npyv_storen_s32(op + (K) * lanes * ostride, ostride, neg_k);          \
    } while (0)

    // Pass 1: UNROLL vectors per iteration; steps >= UNROLL are compiled out.
    while (len >= block) {
    #if UNROLL > 0
        NEG_NN_S32_STEP(0);
    #endif
    #if UNROLL > 1
        NEG_NN_S32_STEP(1);
    #endif
    #if UNROLL > 2
        NEG_NN_S32_STEP(2);
    #endif
    #if UNROLL > 3
        NEG_NN_S32_STEP(3);
    #endif
    #if UNROLL > 4
        NEG_NN_S32_STEP(4);
    #endif
    #if UNROLL > 5
        NEG_NN_S32_STEP(5);
    #endif
    #if UNROLL > 6
        NEG_NN_S32_STEP(6);
    #endif
    #if UNROLL > 7
        NEG_NN_S32_STEP(7);
    #endif
    #if UNROLL > 8
        NEG_NN_S32_STEP(8);
    #endif
    #if UNROLL > 9
        NEG_NN_S32_STEP(9);
    #endif
    #if UNROLL > 10
        NEG_NN_S32_STEP(10);
    #endif
    #if UNROLL > 11
        NEG_NN_S32_STEP(11);
    #endif
    #if UNROLL > 12
        NEG_NN_S32_STEP(12);
    #endif
    #if UNROLL > 13
        NEG_NN_S32_STEP(13);
    #endif
    #if UNROLL > 14
        NEG_NN_S32_STEP(14);
    #endif
    #if UNROLL > 15
        NEG_NN_S32_STEP(15);
    #endif
        len -= block;
        ip += istride * block;
        op += ostride * block;
    }

    // Pass 2: one vector at a time.
    while (len >= lanes) {
        NEG_NN_S32_STEP(0);
        len -= lanes;
        ip += istride * lanes;
        op += ostride * lanes;
    }
#undef NEG_NN_S32_STEP

    // Pass 3: fewer than one vector left, finish element by element.
    // NOTE(review): scalar_negative is a plain unary minus; if
    // npyv_lanetype_s32 is a signed 32-bit integer, negating its minimum
    // value overflows -- confirm that is acceptable here.
    while (len > 0) {
        *op = scalar_negative(*ip);
        ip += istride;
        op += ostride;
        --len;
    }
}
#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD
/*end repeat1**/

#line 80
#line 85
#if NPY_SIMD
// Choose how many vectors the unrolled loops below handle per iteration.
// NOTE(review): the literal 4 looks like an unroll factor substituted by the
// template generator -- confirm against the .src template.
#if 4 < 1
#error "Unroll must be at least 1"
#elif NPY_SIMD != 128 && 4 > 2
// Avoid memory bandwidth bottleneck for larger SIMD
#define UNROLL 2
#else
// Reached when NPY_SIMD is 128: keep the full factor.
#define UNROLL 4
#endif
// contiguous inputs and output.
/*
 * Negate `len` elements from a contiguous input into a contiguous output.
 *
 * ip  - first input element
 * op  - first output element
 * len - number of elements to process
 *
 * The work is done in three passes: blocks of UNROLL vectors, then single
 * vectors, then one element at a time for whatever remains.
 */
static NPY_INLINE void
simd_unary_cc_negative_u32(const npyv_lanetype_u32 *ip,
                             npyv_lanetype_u32 *op,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_u32;
    const int block = lanes * UNROLL;

    // One step: load vector K relative to the current position, negate it,
    // store it at the matching output offset.
#define NEG_CC_U32_STEP(K)                                      \
    do {                                                        \
        npyv_u32 src_k = npyv_load_u32(ip + (K) * lanes);       \
        npyv_u32 neg_k = npyv_negative_u32(src_k);              \
        npyv_store_u32(op + (K) * lanes, neg_k);                \
    } while (0)

    // Pass 1: UNROLL vectors per iteration; steps >= UNROLL are compiled out.
    while (len >= block) {
    #if UNROLL > 0
        NEG_CC_U32_STEP(0);
    #endif
    #if UNROLL > 1
        NEG_CC_U32_STEP(1);
    #endif
    #if UNROLL > 2
        NEG_CC_U32_STEP(2);
    #endif
    #if UNROLL > 3
        NEG_CC_U32_STEP(3);
    #endif
    #if UNROLL > 4
        NEG_CC_U32_STEP(4);
    #endif
    #if UNROLL > 5
        NEG_CC_U32_STEP(5);
    #endif
    #if UNROLL > 6
        NEG_CC_U32_STEP(6);
    #endif
    #if UNROLL > 7
        NEG_CC_U32_STEP(7);
    #endif
    #if UNROLL > 8
        NEG_CC_U32_STEP(8);
    #endif
    #if UNROLL > 9
        NEG_CC_U32_STEP(9);
    #endif
    #if UNROLL > 10
        NEG_CC_U32_STEP(10);
    #endif
    #if UNROLL > 11
        NEG_CC_U32_STEP(11);
    #endif
    #if UNROLL > 12
        NEG_CC_U32_STEP(12);
    #endif
    #if UNROLL > 13
        NEG_CC_U32_STEP(13);
    #endif
    #if UNROLL > 14
        NEG_CC_U32_STEP(14);
    #endif
    #if UNROLL > 15
        NEG_CC_U32_STEP(15);
    #endif
        len -= block;
        ip += block;
        op += block;
    }

    // Pass 2: one vector at a time.
    while (len >= lanes) {
        NEG_CC_U32_STEP(0);
        len -= lanes;
        ip += lanes;
        op += lanes;
    }
#undef NEG_CC_U32_STEP

    // Pass 3: fewer than one vector left, finish element by element.
    while (len > 0) {
        *op = scalar_negative(*ip);
        ++ip;
        ++op;
        --len;
    }
}

#if 1
// contiguous input, non-contiguous output
//
// Negates `len` u32 elements read back-to-back from `ip` and writes each
// result to `op`, advancing `op` by `ostride` elements per result.
static NPY_INLINE void
simd_unary_cn_negative_u32(const npyv_lanetype_u32 *ip,
                             npyv_lanetype_u32 *op, npy_intp ostride,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_u32;
    const int block = lanes * UNROLL;

    // Main pass: UNROLL full vectors per trip. The inner trip count is a
    // compile-time constant, so the compiler is free to unroll it.
    while (len >= block) {
        for (int k = 0; k < UNROLL; ++k) {
            npyv_u32 vec = npyv_load_u32(ip + k * lanes);
            npyv_u32 neg = npyv_negative_u32(vec);
            npyv_storen_u32(op + k * lanes * ostride, ostride, neg);
        }
        len -= block;
        ip += block;
        op += ostride * block;
    }
    // Full vectors that did not fill a whole unrolled trip.
    while (len >= lanes) {
        npyv_u32 vec = npyv_load_u32(ip);
        npyv_u32 neg = npyv_negative_u32(vec);
        npyv_storen_u32(op, ostride, neg);
        len -= lanes;
        ip += lanes;
        op += ostride * lanes;
    }
    // Leftover elements (fewer than one vector), one at a time.
    while (len > 0) {
        *op = scalar_negative(*ip);
        --len;
        ++ip;
        op += ostride;
    }
}
// non-contiguous input, contiguous output
//
// Negates `len` u32 elements gathered from `ip` (consecutive elements are
// `istride` elements apart) and writes the results back-to-back to `op`.
static NPY_INLINE void
simd_unary_nc_negative_u32(const npyv_lanetype_u32 *ip, npy_intp istride,
                             npyv_lanetype_u32 *op,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_u32;
    const int block = lanes * UNROLL;

    // Main pass: UNROLL full vectors per trip. The inner trip count is a
    // compile-time constant, so the compiler is free to unroll it.
    while (len >= block) {
        for (int k = 0; k < UNROLL; ++k) {
            npyv_u32 vec = npyv_loadn_u32(ip + k * lanes * istride, istride);
            npyv_u32 neg = npyv_negative_u32(vec);
            npyv_store_u32(op + k * lanes, neg);
        }
        len -= block;
        ip += istride * block;
        op += block;
    }
    // Full vectors that did not fill a whole unrolled trip.
    while (len >= lanes) {
        npyv_u32 vec = npyv_loadn_u32(ip, istride);
        npyv_u32 neg = npyv_negative_u32(vec);
        npyv_store_u32(op, neg);
        len -= lanes;
        ip += istride * lanes;
        op += lanes;
    }
    // Leftover elements (fewer than one vector), one at a time.
    while (len > 0) {
        *op = scalar_negative(*ip);
        --len;
        ip += istride;
        ++op;
    }
}
// non-contiguous input and output
// limit unroll to 2x
#if UNROLL > 2
#undef UNROLL
#define UNROLL 2
#endif
// X86 does better with unrolled scalar for heavy non-contiguous
#ifndef NPY_HAVE_SSE2
// Negates `len` u32 elements where both sides are strided: consecutive
// inputs are `istride` elements apart and consecutive outputs are `ostride`
// elements apart.
static NPY_INLINE void
simd_unary_nn_negative_u32(const npyv_lanetype_u32 *ip, npy_intp istride,
                             npyv_lanetype_u32 *op, npy_intp ostride,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_u32;
    const int block = lanes * UNROLL;

    // Main pass: UNROLL full vectors per trip. The inner trip count is a
    // compile-time constant, so the compiler is free to unroll it.
    while (len >= block) {
        for (int k = 0; k < UNROLL; ++k) {
            npyv_u32 vec = npyv_loadn_u32(ip + k * lanes * istride, istride);
            npyv_u32 neg = npyv_negative_u32(vec);
            npyv_storen_u32(op + k * lanes * ostride, ostride, neg);
        }
        len -= block;
        ip += istride * block;
        op += ostride * block;
    }
    // Full vectors that did not fill a whole unrolled trip.
    while (len >= lanes) {
        npyv_u32 vec = npyv_loadn_u32(ip, istride);
        npyv_u32 neg = npyv_negative_u32(vec);
        npyv_storen_u32(op, ostride, neg);
        len -= lanes;
        ip += istride * lanes;
        op += ostride * lanes;
    }
    // Leftover elements (fewer than one vector), one at a time.
    while (len > 0) {
        *op = scalar_negative(*ip);
        --len;
        ip += istride;
        op += ostride;
    }
}
#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD
/*end repeat1**/

#line 80
#line 85
#if NPY_SIMD
#if 4 < 1
#error "Unroll must be at least 1"
#elif NPY_SIMD != 128 && 4 > 2
// Avoid memory bandwidth bottleneck for larger SIMD
#define UNROLL 2
#else
#define UNROLL 4
#endif
// contiguous inputs and output.
//
// Negates `len` s64 elements read back-to-back from `ip` and writes the
// results back-to-back to `op`.
static NPY_INLINE void
simd_unary_cc_negative_s64(const npyv_lanetype_s64 *ip,
                             npyv_lanetype_s64 *op,
                             npy_intp len)
{
    const int vstep = npyv_nlanes_s64;
    const int wstep = vstep * UNROLL;

    // unrolled vector loop: UNROLL full vectors per trip. The inner trip
    // count is a compile-time constant, so the compiler is free to unroll it.
    for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
        for (int k = 0; k < UNROLL; ++k) {
            npyv_s64 v = npyv_load_s64(ip + k * vstep);
            npyv_s64 r = npyv_negative_s64(v);
            npyv_store_s64(op + k * vstep, r);
        }
    }
    // single vector loop
    for (; len >= vstep; len -= vstep, ip += vstep, op += vstep) {
        npyv_s64 v = npyv_load_s64(ip);
        npyv_s64 r = npyv_negative_s64(v);
        npyv_store_s64(op, r);
    }
    // scalar finish up any remaining iterations
    for (; len > 0; --len, ++ip, ++op) {
        // Negate in the unsigned lane type: `-x` on the most negative signed
        // value is signed overflow (undefined behaviour in C). Unsigned
        // arithmetic wraps, which presumably matches what the vector lanes
        // produce for that input.
        *op = (npyv_lanetype_s64)(0 - (npyv_lanetype_u64)*ip);
    }
}

#if 1
// contiguous input, non-contiguous output
//
// Negates `len` s64 elements read back-to-back from `ip` and writes each
// result to `op`, advancing `op` by `ostride` elements per result.
static NPY_INLINE void
simd_unary_cn_negative_s64(const npyv_lanetype_s64 *ip,
                             npyv_lanetype_s64 *op, npy_intp ostride,
                             npy_intp len)
{
    const int vstep = npyv_nlanes_s64;
    const int wstep = vstep * UNROLL;

    // unrolled vector loop: UNROLL full vectors per trip. The inner trip
    // count is a compile-time constant, so the compiler is free to unroll it.
    for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) {
        for (int k = 0; k < UNROLL; ++k) {
            npyv_s64 v = npyv_load_s64(ip + k * vstep);
            npyv_s64 r = npyv_negative_s64(v);
            npyv_storen_s64(op + k * vstep * ostride, ostride, r);
        }
    }
    // single vector loop
    for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) {
        npyv_s64 v = npyv_load_s64(ip);
        npyv_s64 r = npyv_negative_s64(v);
        npyv_storen_s64(op, ostride, r);
    }
    // scalar finish up any remaining iterations
    for (; len > 0; --len, ++ip, op += ostride) {
        // Negate in the unsigned lane type: `-x` on the most negative signed
        // value is signed overflow (undefined behaviour in C). Unsigned
        // arithmetic wraps, which presumably matches what the vector lanes
        // produce for that input.
        *op = (npyv_lanetype_s64)(0 - (npyv_lanetype_u64)*ip);
    }
}
// non-contiguous input, contiguous output
//
// Negates `len` s64 elements gathered from `ip` (consecutive elements are
// `istride` elements apart) and writes the results back-to-back to `op`.
static NPY_INLINE void
simd_unary_nc_negative_s64(const npyv_lanetype_s64 *ip, npy_intp istride,
                             npyv_lanetype_s64 *op,
                             npy_intp len)
{
    const int vstep = npyv_nlanes_s64;
    const int wstep = vstep * UNROLL;

    // unrolled vector loop: UNROLL full vectors per trip. The inner trip
    // count is a compile-time constant, so the compiler is free to unroll it.
    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) {
        for (int k = 0; k < UNROLL; ++k) {
            npyv_s64 v = npyv_loadn_s64(ip + k * vstep * istride, istride);
            npyv_s64 r = npyv_negative_s64(v);
            npyv_store_s64(op + k * vstep, r);
        }
    }
    // single vector loop
    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) {
        npyv_s64 v = npyv_loadn_s64(ip, istride);
        npyv_s64 r = npyv_negative_s64(v);
        npyv_store_s64(op, r);
    }
    // scalar finish up any remaining iterations
    for (; len > 0; --len, ip += istride, ++op) {
        // Negate in the unsigned lane type: `-x` on the most negative signed
        // value is signed overflow (undefined behaviour in C). Unsigned
        // arithmetic wraps, which presumably matches what the vector lanes
        // produce for that input.
        *op = (npyv_lanetype_s64)(0 - (npyv_lanetype_u64)*ip);
    }
}
// non-contiguous input and output
// limit unroll to 2x
#if UNROLL > 2
#undef UNROLL
#define UNROLL 2
#endif
// X86 does better with unrolled scalar for heavy non-contiguous
#ifndef NPY_HAVE_SSE2
// Negates `len` s64 elements where both sides are strided: consecutive
// inputs are `istride` elements apart and consecutive outputs are `ostride`
// elements apart.
static NPY_INLINE void
simd_unary_nn_negative_s64(const npyv_lanetype_s64 *ip, npy_intp istride,
                             npyv_lanetype_s64 *op, npy_intp ostride,
                             npy_intp len)
{
    const int vstep = npyv_nlanes_s64;
    const int wstep = vstep * UNROLL;

    // unrolled vector loop: UNROLL full vectors per trip. The inner trip
    // count is a compile-time constant, so the compiler is free to unroll it.
    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
        for (int k = 0; k < UNROLL; ++k) {
            npyv_s64 v = npyv_loadn_s64(ip + k * vstep * istride, istride);
            npyv_s64 r = npyv_negative_s64(v);
            npyv_storen_s64(op + k * vstep * ostride, ostride, r);
        }
    }
    // single vector loop
    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
        npyv_s64 v = npyv_loadn_s64(ip, istride);
        npyv_s64 r = npyv_negative_s64(v);
        npyv_storen_s64(op, ostride, r);
    }
    // scalar finish up any remaining iterations
    for (; len > 0; --len, ip += istride, op += ostride) {
        // Negate in the unsigned lane type: `-x` on the most negative signed
        // value is signed overflow (undefined behaviour in C). Unsigned
        // arithmetic wraps, which presumably matches what the vector lanes
        // produce for that input.
        *op = (npyv_lanetype_s64)(0 - (npyv_lanetype_u64)*ip);
    }
}
#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD
/*end repeat1**/

#line 80
#line 85
#if NPY_SIMD
#if 4 < 1
#error "Unroll must be at least 1"
#elif NPY_SIMD != 128 && 4 > 2
// Avoid memory bandwidth bottleneck for larger SIMD
#define UNROLL 2
#else
#define UNROLL 4
#endif
// contiguous inputs and output.
//
// Negates `len` u64 elements read back-to-back from `ip` and writes the
// results back-to-back to `op`.
static NPY_INLINE void
simd_unary_cc_negative_u64(const npyv_lanetype_u64 *ip,
                             npyv_lanetype_u64 *op,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_u64;
    const int block = lanes * UNROLL;

    // Main pass: UNROLL full vectors per trip. The inner trip count is a
    // compile-time constant, so the compiler is free to unroll it.
    while (len >= block) {
        for (int k = 0; k < UNROLL; ++k) {
            npyv_u64 vec = npyv_load_u64(ip + k * lanes);
            npyv_u64 neg = npyv_negative_u64(vec);
            npyv_store_u64(op + k * lanes, neg);
        }
        len -= block;
        ip += block;
        op += block;
    }
    // Full vectors that did not fill a whole unrolled trip.
    while (len >= lanes) {
        npyv_u64 vec = npyv_load_u64(ip);
        npyv_u64 neg = npyv_negative_u64(vec);
        npyv_store_u64(op, neg);
        len -= lanes;
        ip += lanes;
        op += lanes;
    }
    // Leftover elements (fewer than one vector), one at a time.
    while (len > 0) {
        *op = scalar_negative(*ip);
        --len;
        ++ip;
        ++op;
    }
}

#if 1
// Contiguous input, strided output (u64).
// Negates `len` elements read back-to-back from `ip`; each result is written
// to `op`, which moves forward by `ostride` elements per value.
// The input is consumed in three passes: groups of UNROLL vectors, then one
// vector at a time, then a scalar loop for whatever is left.
static NPY_INLINE void
simd_unary_cn_negative_u64(const npyv_lanetype_u64 *ip,
                             npyv_lanetype_u64 *op, npy_intp ostride,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_u64;
    const int block = lanes * UNROLL;

    // pass 1: UNROLL vectors per iteration, one preprocessor-gated slot each
    while (len >= block) {
    #if UNROLL > 0
        npyv_u64 neg_0 = npyv_negative_u64(npyv_load_u64(ip + 0 * lanes));
        npyv_storen_u64(op + 0 * lanes * ostride, ostride, neg_0);
    #endif
    #if UNROLL > 1
        npyv_u64 neg_1 = npyv_negative_u64(npyv_load_u64(ip + 1 * lanes));
        npyv_storen_u64(op + 1 * lanes * ostride, ostride, neg_1);
    #endif
    #if UNROLL > 2
        npyv_u64 neg_2 = npyv_negative_u64(npyv_load_u64(ip + 2 * lanes));
        npyv_storen_u64(op + 2 * lanes * ostride, ostride, neg_2);
    #endif
    #if UNROLL > 3
        npyv_u64 neg_3 = npyv_negative_u64(npyv_load_u64(ip + 3 * lanes));
        npyv_storen_u64(op + 3 * lanes * ostride, ostride, neg_3);
    #endif
    #if UNROLL > 4
        npyv_u64 neg_4 = npyv_negative_u64(npyv_load_u64(ip + 4 * lanes));
        npyv_storen_u64(op + 4 * lanes * ostride, ostride, neg_4);
    #endif
    #if UNROLL > 5
        npyv_u64 neg_5 = npyv_negative_u64(npyv_load_u64(ip + 5 * lanes));
        npyv_storen_u64(op + 5 * lanes * ostride, ostride, neg_5);
    #endif
    #if UNROLL > 6
        npyv_u64 neg_6 = npyv_negative_u64(npyv_load_u64(ip + 6 * lanes));
        npyv_storen_u64(op + 6 * lanes * ostride, ostride, neg_6);
    #endif
    #if UNROLL > 7
        npyv_u64 neg_7 = npyv_negative_u64(npyv_load_u64(ip + 7 * lanes));
        npyv_storen_u64(op + 7 * lanes * ostride, ostride, neg_7);
    #endif
    #if UNROLL > 8
        npyv_u64 neg_8 = npyv_negative_u64(npyv_load_u64(ip + 8 * lanes));
        npyv_storen_u64(op + 8 * lanes * ostride, ostride, neg_8);
    #endif
    #if UNROLL > 9
        npyv_u64 neg_9 = npyv_negative_u64(npyv_load_u64(ip + 9 * lanes));
        npyv_storen_u64(op + 9 * lanes * ostride, ostride, neg_9);
    #endif
    #if UNROLL > 10
        npyv_u64 neg_10 = npyv_negative_u64(npyv_load_u64(ip + 10 * lanes));
        npyv_storen_u64(op + 10 * lanes * ostride, ostride, neg_10);
    #endif
    #if UNROLL > 11
        npyv_u64 neg_11 = npyv_negative_u64(npyv_load_u64(ip + 11 * lanes));
        npyv_storen_u64(op + 11 * lanes * ostride, ostride, neg_11);
    #endif
    #if UNROLL > 12
        npyv_u64 neg_12 = npyv_negative_u64(npyv_load_u64(ip + 12 * lanes));
        npyv_storen_u64(op + 12 * lanes * ostride, ostride, neg_12);
    #endif
    #if UNROLL > 13
        npyv_u64 neg_13 = npyv_negative_u64(npyv_load_u64(ip + 13 * lanes));
        npyv_storen_u64(op + 13 * lanes * ostride, ostride, neg_13);
    #endif
    #if UNROLL > 14
        npyv_u64 neg_14 = npyv_negative_u64(npyv_load_u64(ip + 14 * lanes));
        npyv_storen_u64(op + 14 * lanes * ostride, ostride, neg_14);
    #endif
    #if UNROLL > 15
        npyv_u64 neg_15 = npyv_negative_u64(npyv_load_u64(ip + 15 * lanes));
        npyv_storen_u64(op + 15 * lanes * ostride, ostride, neg_15);
    #endif
        len -= block;
        ip += block;
        op += ostride*block;
    }
    // pass 2: one vector per iteration
    while (len >= lanes) {
        npyv_u64 neg = npyv_negative_u64(npyv_load_u64(ip));
        npyv_storen_u64(op, ostride, neg);
        len -= lanes;
        ip += lanes;
        op += ostride*lanes;
    }
    // pass 3: leftover elements, one at a time
    while (len > 0) {
        *op = scalar_negative(*ip);
        --len;
        ++ip;
        op += ostride;
    }
}
// Strided input, contiguous output (u64).
// Negates `len` elements gathered from `ip`, which moves forward by `istride`
// elements per value, and writes the results back-to-back into `op`.
// The input is consumed in three passes: groups of UNROLL vectors, then one
// vector at a time, then a scalar loop for whatever is left.
static NPY_INLINE void
simd_unary_nc_negative_u64(const npyv_lanetype_u64 *ip, npy_intp istride,
                             npyv_lanetype_u64 *op,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_u64;
    const int block = lanes * UNROLL;

    // pass 1: UNROLL vectors per iteration, one preprocessor-gated slot each
    while (len >= block) {
    #if UNROLL > 0
        npyv_u64 neg_0 = npyv_negative_u64(npyv_loadn_u64(ip + 0 * lanes * istride, istride));
        npyv_store_u64(op + 0 * lanes, neg_0);
    #endif
    #if UNROLL > 1
        npyv_u64 neg_1 = npyv_negative_u64(npyv_loadn_u64(ip + 1 * lanes * istride, istride));
        npyv_store_u64(op + 1 * lanes, neg_1);
    #endif
    #if UNROLL > 2
        npyv_u64 neg_2 = npyv_negative_u64(npyv_loadn_u64(ip + 2 * lanes * istride, istride));
        npyv_store_u64(op + 2 * lanes, neg_2);
    #endif
    #if UNROLL > 3
        npyv_u64 neg_3 = npyv_negative_u64(npyv_loadn_u64(ip + 3 * lanes * istride, istride));
        npyv_store_u64(op + 3 * lanes, neg_3);
    #endif
    #if UNROLL > 4
        npyv_u64 neg_4 = npyv_negative_u64(npyv_loadn_u64(ip + 4 * lanes * istride, istride));
        npyv_store_u64(op + 4 * lanes, neg_4);
    #endif
    #if UNROLL > 5
        npyv_u64 neg_5 = npyv_negative_u64(npyv_loadn_u64(ip + 5 * lanes * istride, istride));
        npyv_store_u64(op + 5 * lanes, neg_5);
    #endif
    #if UNROLL > 6
        npyv_u64 neg_6 = npyv_negative_u64(npyv_loadn_u64(ip + 6 * lanes * istride, istride));
        npyv_store_u64(op + 6 * lanes, neg_6);
    #endif
    #if UNROLL > 7
        npyv_u64 neg_7 = npyv_negative_u64(npyv_loadn_u64(ip + 7 * lanes * istride, istride));
        npyv_store_u64(op + 7 * lanes, neg_7);
    #endif
    #if UNROLL > 8
        npyv_u64 neg_8 = npyv_negative_u64(npyv_loadn_u64(ip + 8 * lanes * istride, istride));
        npyv_store_u64(op + 8 * lanes, neg_8);
    #endif
    #if UNROLL > 9
        npyv_u64 neg_9 = npyv_negative_u64(npyv_loadn_u64(ip + 9 * lanes * istride, istride));
        npyv_store_u64(op + 9 * lanes, neg_9);
    #endif
    #if UNROLL > 10
        npyv_u64 neg_10 = npyv_negative_u64(npyv_loadn_u64(ip + 10 * lanes * istride, istride));
        npyv_store_u64(op + 10 * lanes, neg_10);
    #endif
    #if UNROLL > 11
        npyv_u64 neg_11 = npyv_negative_u64(npyv_loadn_u64(ip + 11 * lanes * istride, istride));
        npyv_store_u64(op + 11 * lanes, neg_11);
    #endif
    #if UNROLL > 12
        npyv_u64 neg_12 = npyv_negative_u64(npyv_loadn_u64(ip + 12 * lanes * istride, istride));
        npyv_store_u64(op + 12 * lanes, neg_12);
    #endif
    #if UNROLL > 13
        npyv_u64 neg_13 = npyv_negative_u64(npyv_loadn_u64(ip + 13 * lanes * istride, istride));
        npyv_store_u64(op + 13 * lanes, neg_13);
    #endif
    #if UNROLL > 14
        npyv_u64 neg_14 = npyv_negative_u64(npyv_loadn_u64(ip + 14 * lanes * istride, istride));
        npyv_store_u64(op + 14 * lanes, neg_14);
    #endif
    #if UNROLL > 15
        npyv_u64 neg_15 = npyv_negative_u64(npyv_loadn_u64(ip + 15 * lanes * istride, istride));
        npyv_store_u64(op + 15 * lanes, neg_15);
    #endif
        len -= block;
        ip += istride*block;
        op += block;
    }
    // pass 2: one vector per iteration
    while (len >= lanes) {
        npyv_u64 neg = npyv_negative_u64(npyv_loadn_u64(ip, istride));
        npyv_store_u64(op, neg);
        len -= lanes;
        ip += istride*lanes;
        op += lanes;
    }
    // pass 3: leftover elements, one at a time
    while (len > 0) {
        *op = scalar_negative(*ip);
        --len;
        ip += istride;
        ++op;
    }
}
// non-contiguous input and output
// Cap the unroll factor at 2x for the strided-input/strided-output kernel
// that follows. UNROLL keeps this lowered value until it is #undef'd at the
// end of this type's section.
#if UNROLL > 2
#undef UNROLL
#define UNROLL 2
#endif
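// x86 performs better with an unrolled scalar loop when both input and output
// are non-contiguous, so this SIMD kernel is compiled only when
// NPY_HAVE_SSE2 is not defined.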
#ifndef NPY_HAVE_SSE2
// Strided input, strided output (u64).
// Negates `len` elements; `ip` moves forward by `istride` elements and `op`
// by `ostride` elements per value.
// The input is consumed in three passes: groups of UNROLL vectors, then one
// vector at a time, then a scalar loop for whatever is left.
static NPY_INLINE void
simd_unary_nn_negative_u64(const npyv_lanetype_u64 *ip, npy_intp istride,
                             npyv_lanetype_u64 *op, npy_intp ostride,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_u64;
    const int block = lanes * UNROLL;

    // pass 1: UNROLL vectors per iteration, one preprocessor-gated slot each
    while (len >= block) {
    #if UNROLL > 0
        npyv_u64 neg_0 = npyv_negative_u64(npyv_loadn_u64(ip + 0 * lanes * istride, istride));
        npyv_storen_u64(op + 0 * lanes * ostride, ostride, neg_0);
    #endif
    #if UNROLL > 1
        npyv_u64 neg_1 = npyv_negative_u64(npyv_loadn_u64(ip + 1 * lanes * istride, istride));
        npyv_storen_u64(op + 1 * lanes * ostride, ostride, neg_1);
    #endif
    #if UNROLL > 2
        npyv_u64 neg_2 = npyv_negative_u64(npyv_loadn_u64(ip + 2 * lanes * istride, istride));
        npyv_storen_u64(op + 2 * lanes * ostride, ostride, neg_2);
    #endif
    #if UNROLL > 3
        npyv_u64 neg_3 = npyv_negative_u64(npyv_loadn_u64(ip + 3 * lanes * istride, istride));
        npyv_storen_u64(op + 3 * lanes * ostride, ostride, neg_3);
    #endif
    #if UNROLL > 4
        npyv_u64 neg_4 = npyv_negative_u64(npyv_loadn_u64(ip + 4 * lanes * istride, istride));
        npyv_storen_u64(op + 4 * lanes * ostride, ostride, neg_4);
    #endif
    #if UNROLL > 5
        npyv_u64 neg_5 = npyv_negative_u64(npyv_loadn_u64(ip + 5 * lanes * istride, istride));
        npyv_storen_u64(op + 5 * lanes * ostride, ostride, neg_5);
    #endif
    #if UNROLL > 6
        npyv_u64 neg_6 = npyv_negative_u64(npyv_loadn_u64(ip + 6 * lanes * istride, istride));
        npyv_storen_u64(op + 6 * lanes * ostride, ostride, neg_6);
    #endif
    #if UNROLL > 7
        npyv_u64 neg_7 = npyv_negative_u64(npyv_loadn_u64(ip + 7 * lanes * istride, istride));
        npyv_storen_u64(op + 7 * lanes * ostride, ostride, neg_7);
    #endif
    #if UNROLL > 8
        npyv_u64 neg_8 = npyv_negative_u64(npyv_loadn_u64(ip + 8 * lanes * istride, istride));
        npyv_storen_u64(op + 8 * lanes * ostride, ostride, neg_8);
    #endif
    #if UNROLL > 9
        npyv_u64 neg_9 = npyv_negative_u64(npyv_loadn_u64(ip + 9 * lanes * istride, istride));
        npyv_storen_u64(op + 9 * lanes * ostride, ostride, neg_9);
    #endif
    #if UNROLL > 10
        npyv_u64 neg_10 = npyv_negative_u64(npyv_loadn_u64(ip + 10 * lanes * istride, istride));
        npyv_storen_u64(op + 10 * lanes * ostride, ostride, neg_10);
    #endif
    #if UNROLL > 11
        npyv_u64 neg_11 = npyv_negative_u64(npyv_loadn_u64(ip + 11 * lanes * istride, istride));
        npyv_storen_u64(op + 11 * lanes * ostride, ostride, neg_11);
    #endif
    #if UNROLL > 12
        npyv_u64 neg_12 = npyv_negative_u64(npyv_loadn_u64(ip + 12 * lanes * istride, istride));
        npyv_storen_u64(op + 12 * lanes * ostride, ostride, neg_12);
    #endif
    #if UNROLL > 13
        npyv_u64 neg_13 = npyv_negative_u64(npyv_loadn_u64(ip + 13 * lanes * istride, istride));
        npyv_storen_u64(op + 13 * lanes * ostride, ostride, neg_13);
    #endif
    #if UNROLL > 14
        npyv_u64 neg_14 = npyv_negative_u64(npyv_loadn_u64(ip + 14 * lanes * istride, istride));
        npyv_storen_u64(op + 14 * lanes * ostride, ostride, neg_14);
    #endif
    #if UNROLL > 15
        npyv_u64 neg_15 = npyv_negative_u64(npyv_loadn_u64(ip + 15 * lanes * istride, istride));
        npyv_storen_u64(op + 15 * lanes * ostride, ostride, neg_15);
    #endif
        len -= block;
        ip += istride*block;
        op += ostride*block;
    }
    // pass 2: one vector per iteration
    while (len >= lanes) {
        npyv_u64 neg = npyv_negative_u64(npyv_loadn_u64(ip, istride));
        npyv_storen_u64(op, ostride, neg);
        len -= lanes;
        ip += istride*lanes;
        op += ostride*lanes;
    }
    // pass 3: leftover elements, one at a time
    while (len > 0) {
        *op = scalar_negative(*ip);
        --len;
        ip += istride;
        op += ostride;
    }
}
#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD
/*end repeat1**/

#line 80
#line 85
#if NPY_SIMD_F32
// Select the unroll factor for the f32 negative kernels below.
// NOTE(review): the literal 4 is presumably the unroll factor substituted by
// the template generator, which makes the first guard a constant sanity check.
#if 4 < 1
#error "Unroll must be at least 1"
#elif NPY_SIMD != 128 && 4 > 2
// Avoid memory bandwidth bottleneck for larger SIMD
// (taken whenever NPY_SIMD is not 128: two vectors per unrolled pass)
#define UNROLL 2
#else
// NPY_SIMD == 128: keep the full factor
#define UNROLL 4
#endif
// Contiguous input, contiguous output (f32).
// Negates `len` elements read back-to-back from `ip` and writes the results
// back-to-back into `op`.
// The input is consumed in three passes: groups of UNROLL vectors, then one
// vector at a time, then a scalar loop for whatever is left.
static NPY_INLINE void
simd_unary_cc_negative_f32(const npyv_lanetype_f32 *ip,
                             npyv_lanetype_f32 *op,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_f32;
    const int block = lanes * UNROLL;

    // pass 1: UNROLL vectors per iteration, one preprocessor-gated slot each
    while (len >= block) {
    #if UNROLL > 0
        npyv_f32 neg_0 = npyv_negative_f32(npyv_load_f32(ip + 0 * lanes));
        npyv_store_f32(op + 0 * lanes, neg_0);
    #endif
    #if UNROLL > 1
        npyv_f32 neg_1 = npyv_negative_f32(npyv_load_f32(ip + 1 * lanes));
        npyv_store_f32(op + 1 * lanes, neg_1);
    #endif
    #if UNROLL > 2
        npyv_f32 neg_2 = npyv_negative_f32(npyv_load_f32(ip + 2 * lanes));
        npyv_store_f32(op + 2 * lanes, neg_2);
    #endif
    #if UNROLL > 3
        npyv_f32 neg_3 = npyv_negative_f32(npyv_load_f32(ip + 3 * lanes));
        npyv_store_f32(op + 3 * lanes, neg_3);
    #endif
    #if UNROLL > 4
        npyv_f32 neg_4 = npyv_negative_f32(npyv_load_f32(ip + 4 * lanes));
        npyv_store_f32(op + 4 * lanes, neg_4);
    #endif
    #if UNROLL > 5
        npyv_f32 neg_5 = npyv_negative_f32(npyv_load_f32(ip + 5 * lanes));
        npyv_store_f32(op + 5 * lanes, neg_5);
    #endif
    #if UNROLL > 6
        npyv_f32 neg_6 = npyv_negative_f32(npyv_load_f32(ip + 6 * lanes));
        npyv_store_f32(op + 6 * lanes, neg_6);
    #endif
    #if UNROLL > 7
        npyv_f32 neg_7 = npyv_negative_f32(npyv_load_f32(ip + 7 * lanes));
        npyv_store_f32(op + 7 * lanes, neg_7);
    #endif
    #if UNROLL > 8
        npyv_f32 neg_8 = npyv_negative_f32(npyv_load_f32(ip + 8 * lanes));
        npyv_store_f32(op + 8 * lanes, neg_8);
    #endif
    #if UNROLL > 9
        npyv_f32 neg_9 = npyv_negative_f32(npyv_load_f32(ip + 9 * lanes));
        npyv_store_f32(op + 9 * lanes, neg_9);
    #endif
    #if UNROLL > 10
        npyv_f32 neg_10 = npyv_negative_f32(npyv_load_f32(ip + 10 * lanes));
        npyv_store_f32(op + 10 * lanes, neg_10);
    #endif
    #if UNROLL > 11
        npyv_f32 neg_11 = npyv_negative_f32(npyv_load_f32(ip + 11 * lanes));
        npyv_store_f32(op + 11 * lanes, neg_11);
    #endif
    #if UNROLL > 12
        npyv_f32 neg_12 = npyv_negative_f32(npyv_load_f32(ip + 12 * lanes));
        npyv_store_f32(op + 12 * lanes, neg_12);
    #endif
    #if UNROLL > 13
        npyv_f32 neg_13 = npyv_negative_f32(npyv_load_f32(ip + 13 * lanes));
        npyv_store_f32(op + 13 * lanes, neg_13);
    #endif
    #if UNROLL > 14
        npyv_f32 neg_14 = npyv_negative_f32(npyv_load_f32(ip + 14 * lanes));
        npyv_store_f32(op + 14 * lanes, neg_14);
    #endif
    #if UNROLL > 15
        npyv_f32 neg_15 = npyv_negative_f32(npyv_load_f32(ip + 15 * lanes));
        npyv_store_f32(op + 15 * lanes, neg_15);
    #endif
        len -= block;
        ip += block;
        op += block;
    }
    // pass 2: one vector per iteration
    while (len >= lanes) {
        npyv_f32 neg = npyv_negative_f32(npyv_load_f32(ip));
        npyv_store_f32(op, neg);
        len -= lanes;
        ip += lanes;
        op += lanes;
    }
    // pass 3: leftover elements, one at a time
    while (len > 0) {
        *op = scalar_negative(*ip);
        --len;
        ++ip;
        ++op;
    }
}

#if 1
// Contiguous input, strided output (f32).
// Negates `len` elements read back-to-back from `ip`; each result is written
// to `op`, which moves forward by `ostride` elements per value.
// The input is consumed in three passes: groups of UNROLL vectors, then one
// vector at a time, then a scalar loop for whatever is left.
static NPY_INLINE void
simd_unary_cn_negative_f32(const npyv_lanetype_f32 *ip,
                             npyv_lanetype_f32 *op, npy_intp ostride,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_f32;
    const int block = lanes * UNROLL;

    // pass 1: UNROLL vectors per iteration, one preprocessor-gated slot each
    while (len >= block) {
    #if UNROLL > 0
        npyv_f32 neg_0 = npyv_negative_f32(npyv_load_f32(ip + 0 * lanes));
        npyv_storen_f32(op + 0 * lanes * ostride, ostride, neg_0);
    #endif
    #if UNROLL > 1
        npyv_f32 neg_1 = npyv_negative_f32(npyv_load_f32(ip + 1 * lanes));
        npyv_storen_f32(op + 1 * lanes * ostride, ostride, neg_1);
    #endif
    #if UNROLL > 2
        npyv_f32 neg_2 = npyv_negative_f32(npyv_load_f32(ip + 2 * lanes));
        npyv_storen_f32(op + 2 * lanes * ostride, ostride, neg_2);
    #endif
    #if UNROLL > 3
        npyv_f32 neg_3 = npyv_negative_f32(npyv_load_f32(ip + 3 * lanes));
        npyv_storen_f32(op + 3 * lanes * ostride, ostride, neg_3);
    #endif
    #if UNROLL > 4
        npyv_f32 neg_4 = npyv_negative_f32(npyv_load_f32(ip + 4 * lanes));
        npyv_storen_f32(op + 4 * lanes * ostride, ostride, neg_4);
    #endif
    #if UNROLL > 5
        npyv_f32 neg_5 = npyv_negative_f32(npyv_load_f32(ip + 5 * lanes));
        npyv_storen_f32(op + 5 * lanes * ostride, ostride, neg_5);
    #endif
    #if UNROLL > 6
        npyv_f32 neg_6 = npyv_negative_f32(npyv_load_f32(ip + 6 * lanes));
        npyv_storen_f32(op + 6 * lanes * ostride, ostride, neg_6);
    #endif
    #if UNROLL > 7
        npyv_f32 neg_7 = npyv_negative_f32(npyv_load_f32(ip + 7 * lanes));
        npyv_storen_f32(op + 7 * lanes * ostride, ostride, neg_7);
    #endif
    #if UNROLL > 8
        npyv_f32 neg_8 = npyv_negative_f32(npyv_load_f32(ip + 8 * lanes));
        npyv_storen_f32(op + 8 * lanes * ostride, ostride, neg_8);
    #endif
    #if UNROLL > 9
        npyv_f32 neg_9 = npyv_negative_f32(npyv_load_f32(ip + 9 * lanes));
        npyv_storen_f32(op + 9 * lanes * ostride, ostride, neg_9);
    #endif
    #if UNROLL > 10
        npyv_f32 neg_10 = npyv_negative_f32(npyv_load_f32(ip + 10 * lanes));
        npyv_storen_f32(op + 10 * lanes * ostride, ostride, neg_10);
    #endif
    #if UNROLL > 11
        npyv_f32 neg_11 = npyv_negative_f32(npyv_load_f32(ip + 11 * lanes));
        npyv_storen_f32(op + 11 * lanes * ostride, ostride, neg_11);
    #endif
    #if UNROLL > 12
        npyv_f32 neg_12 = npyv_negative_f32(npyv_load_f32(ip + 12 * lanes));
        npyv_storen_f32(op + 12 * lanes * ostride, ostride, neg_12);
    #endif
    #if UNROLL > 13
        npyv_f32 neg_13 = npyv_negative_f32(npyv_load_f32(ip + 13 * lanes));
        npyv_storen_f32(op + 13 * lanes * ostride, ostride, neg_13);
    #endif
    #if UNROLL > 14
        npyv_f32 neg_14 = npyv_negative_f32(npyv_load_f32(ip + 14 * lanes));
        npyv_storen_f32(op + 14 * lanes * ostride, ostride, neg_14);
    #endif
    #if UNROLL > 15
        npyv_f32 neg_15 = npyv_negative_f32(npyv_load_f32(ip + 15 * lanes));
        npyv_storen_f32(op + 15 * lanes * ostride, ostride, neg_15);
    #endif
        len -= block;
        ip += block;
        op += ostride*block;
    }
    // pass 2: one vector per iteration
    while (len >= lanes) {
        npyv_f32 neg = npyv_negative_f32(npyv_load_f32(ip));
        npyv_storen_f32(op, ostride, neg);
        len -= lanes;
        ip += lanes;
        op += ostride*lanes;
    }
    // pass 3: leftover elements, one at a time
    while (len > 0) {
        *op = scalar_negative(*ip);
        --len;
        ++ip;
        op += ostride;
    }
}
// Strided input, contiguous output (f32).
// Negates `len` elements gathered from `ip`, which moves forward by `istride`
// elements per value, and writes the results back-to-back into `op`.
// The input is consumed in three passes: groups of UNROLL vectors, then one
// vector at a time, then a scalar loop for whatever is left.
static NPY_INLINE void
simd_unary_nc_negative_f32(const npyv_lanetype_f32 *ip, npy_intp istride,
                             npyv_lanetype_f32 *op,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_f32;
    const int block = lanes * UNROLL;

    // pass 1: UNROLL vectors per iteration, one preprocessor-gated slot each
    while (len >= block) {
    #if UNROLL > 0
        npyv_f32 neg_0 = npyv_negative_f32(npyv_loadn_f32(ip + 0 * lanes * istride, istride));
        npyv_store_f32(op + 0 * lanes, neg_0);
    #endif
    #if UNROLL > 1
        npyv_f32 neg_1 = npyv_negative_f32(npyv_loadn_f32(ip + 1 * lanes * istride, istride));
        npyv_store_f32(op + 1 * lanes, neg_1);
    #endif
    #if UNROLL > 2
        npyv_f32 neg_2 = npyv_negative_f32(npyv_loadn_f32(ip + 2 * lanes * istride, istride));
        npyv_store_f32(op + 2 * lanes, neg_2);
    #endif
    #if UNROLL > 3
        npyv_f32 neg_3 = npyv_negative_f32(npyv_loadn_f32(ip + 3 * lanes * istride, istride));
        npyv_store_f32(op + 3 * lanes, neg_3);
    #endif
    #if UNROLL > 4
        npyv_f32 neg_4 = npyv_negative_f32(npyv_loadn_f32(ip + 4 * lanes * istride, istride));
        npyv_store_f32(op + 4 * lanes, neg_4);
    #endif
    #if UNROLL > 5
        npyv_f32 neg_5 = npyv_negative_f32(npyv_loadn_f32(ip + 5 * lanes * istride, istride));
        npyv_store_f32(op + 5 * lanes, neg_5);
    #endif
    #if UNROLL > 6
        npyv_f32 neg_6 = npyv_negative_f32(npyv_loadn_f32(ip + 6 * lanes * istride, istride));
        npyv_store_f32(op + 6 * lanes, neg_6);
    #endif
    #if UNROLL > 7
        npyv_f32 neg_7 = npyv_negative_f32(npyv_loadn_f32(ip + 7 * lanes * istride, istride));
        npyv_store_f32(op + 7 * lanes, neg_7);
    #endif
    #if UNROLL > 8
        npyv_f32 neg_8 = npyv_negative_f32(npyv_loadn_f32(ip + 8 * lanes * istride, istride));
        npyv_store_f32(op + 8 * lanes, neg_8);
    #endif
    #if UNROLL > 9
        npyv_f32 neg_9 = npyv_negative_f32(npyv_loadn_f32(ip + 9 * lanes * istride, istride));
        npyv_store_f32(op + 9 * lanes, neg_9);
    #endif
    #if UNROLL > 10
        npyv_f32 neg_10 = npyv_negative_f32(npyv_loadn_f32(ip + 10 * lanes * istride, istride));
        npyv_store_f32(op + 10 * lanes, neg_10);
    #endif
    #if UNROLL > 11
        npyv_f32 neg_11 = npyv_negative_f32(npyv_loadn_f32(ip + 11 * lanes * istride, istride));
        npyv_store_f32(op + 11 * lanes, neg_11);
    #endif
    #if UNROLL > 12
        npyv_f32 neg_12 = npyv_negative_f32(npyv_loadn_f32(ip + 12 * lanes * istride, istride));
        npyv_store_f32(op + 12 * lanes, neg_12);
    #endif
    #if UNROLL > 13
        npyv_f32 neg_13 = npyv_negative_f32(npyv_loadn_f32(ip + 13 * lanes * istride, istride));
        npyv_store_f32(op + 13 * lanes, neg_13);
    #endif
    #if UNROLL > 14
        npyv_f32 neg_14 = npyv_negative_f32(npyv_loadn_f32(ip + 14 * lanes * istride, istride));
        npyv_store_f32(op + 14 * lanes, neg_14);
    #endif
    #if UNROLL > 15
        npyv_f32 neg_15 = npyv_negative_f32(npyv_loadn_f32(ip + 15 * lanes * istride, istride));
        npyv_store_f32(op + 15 * lanes, neg_15);
    #endif
        len -= block;
        ip += istride*block;
        op += block;
    }
    // pass 2: one vector per iteration
    while (len >= lanes) {
        npyv_f32 neg = npyv_negative_f32(npyv_loadn_f32(ip, istride));
        npyv_store_f32(op, neg);
        len -= lanes;
        ip += istride*lanes;
        op += lanes;
    }
    // pass 3: leftover elements, one at a time
    while (len > 0) {
        *op = scalar_negative(*ip);
        --len;
        ip += istride;
        ++op;
    }
}
// non-contiguous input and output
// Cap the unroll factor at 2x for the strided-input/strided-output kernel
// that follows. UNROLL keeps this lowered value until it is #undef'd at the
// end of the f32 section.
#if UNROLL > 2
#undef UNROLL
#define UNROLL 2
#endif
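// x86 performs better with an unrolled scalar loop when both input and output
// are non-contiguous, so this SIMD kernel is compiled only when
// NPY_HAVE_SSE2 is not defined.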
#ifndef NPY_HAVE_SSE2
// Strided input, strided output (f32).
// Negates `len` elements; `ip` moves forward by `istride` elements and `op`
// by `ostride` elements per value.
// The input is consumed in three passes: groups of UNROLL vectors, then one
// vector at a time, then a scalar loop for whatever is left.
static NPY_INLINE void
simd_unary_nn_negative_f32(const npyv_lanetype_f32 *ip, npy_intp istride,
                             npyv_lanetype_f32 *op, npy_intp ostride,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_f32;
    const int block = lanes * UNROLL;

    // pass 1: UNROLL vectors per iteration, one preprocessor-gated slot each
    while (len >= block) {
    #if UNROLL > 0
        npyv_f32 neg_0 = npyv_negative_f32(npyv_loadn_f32(ip + 0 * lanes * istride, istride));
        npyv_storen_f32(op + 0 * lanes * ostride, ostride, neg_0);
    #endif
    #if UNROLL > 1
        npyv_f32 neg_1 = npyv_negative_f32(npyv_loadn_f32(ip + 1 * lanes * istride, istride));
        npyv_storen_f32(op + 1 * lanes * ostride, ostride, neg_1);
    #endif
    #if UNROLL > 2
        npyv_f32 neg_2 = npyv_negative_f32(npyv_loadn_f32(ip + 2 * lanes * istride, istride));
        npyv_storen_f32(op + 2 * lanes * ostride, ostride, neg_2);
    #endif
    #if UNROLL > 3
        npyv_f32 neg_3 = npyv_negative_f32(npyv_loadn_f32(ip + 3 * lanes * istride, istride));
        npyv_storen_f32(op + 3 * lanes * ostride, ostride, neg_3);
    #endif
    #if UNROLL > 4
        npyv_f32 neg_4 = npyv_negative_f32(npyv_loadn_f32(ip + 4 * lanes * istride, istride));
        npyv_storen_f32(op + 4 * lanes * ostride, ostride, neg_4);
    #endif
    #if UNROLL > 5
        npyv_f32 neg_5 = npyv_negative_f32(npyv_loadn_f32(ip + 5 * lanes * istride, istride));
        npyv_storen_f32(op + 5 * lanes * ostride, ostride, neg_5);
    #endif
    #if UNROLL > 6
        npyv_f32 neg_6 = npyv_negative_f32(npyv_loadn_f32(ip + 6 * lanes * istride, istride));
        npyv_storen_f32(op + 6 * lanes * ostride, ostride, neg_6);
    #endif
    #if UNROLL > 7
        npyv_f32 neg_7 = npyv_negative_f32(npyv_loadn_f32(ip + 7 * lanes * istride, istride));
        npyv_storen_f32(op + 7 * lanes * ostride, ostride, neg_7);
    #endif
    #if UNROLL > 8
        npyv_f32 neg_8 = npyv_negative_f32(npyv_loadn_f32(ip + 8 * lanes * istride, istride));
        npyv_storen_f32(op + 8 * lanes * ostride, ostride, neg_8);
    #endif
    #if UNROLL > 9
        npyv_f32 neg_9 = npyv_negative_f32(npyv_loadn_f32(ip + 9 * lanes * istride, istride));
        npyv_storen_f32(op + 9 * lanes * ostride, ostride, neg_9);
    #endif
    #if UNROLL > 10
        npyv_f32 neg_10 = npyv_negative_f32(npyv_loadn_f32(ip + 10 * lanes * istride, istride));
        npyv_storen_f32(op + 10 * lanes * ostride, ostride, neg_10);
    #endif
    #if UNROLL > 11
        npyv_f32 neg_11 = npyv_negative_f32(npyv_loadn_f32(ip + 11 * lanes * istride, istride));
        npyv_storen_f32(op + 11 * lanes * ostride, ostride, neg_11);
    #endif
    #if UNROLL > 12
        npyv_f32 neg_12 = npyv_negative_f32(npyv_loadn_f32(ip + 12 * lanes * istride, istride));
        npyv_storen_f32(op + 12 * lanes * ostride, ostride, neg_12);
    #endif
    #if UNROLL > 13
        npyv_f32 neg_13 = npyv_negative_f32(npyv_loadn_f32(ip + 13 * lanes * istride, istride));
        npyv_storen_f32(op + 13 * lanes * ostride, ostride, neg_13);
    #endif
    #if UNROLL > 14
        npyv_f32 neg_14 = npyv_negative_f32(npyv_loadn_f32(ip + 14 * lanes * istride, istride));
        npyv_storen_f32(op + 14 * lanes * ostride, ostride, neg_14);
    #endif
    #if UNROLL > 15
        npyv_f32 neg_15 = npyv_negative_f32(npyv_loadn_f32(ip + 15 * lanes * istride, istride));
        npyv_storen_f32(op + 15 * lanes * ostride, ostride, neg_15);
    #endif
        len -= block;
        ip += istride*block;
        op += ostride*block;
    }
    // pass 2: one vector per iteration
    while (len >= lanes) {
        npyv_f32 neg = npyv_negative_f32(npyv_loadn_f32(ip, istride));
        npyv_storen_f32(op, ostride, neg);
        len -= lanes;
        ip += istride*lanes;
        op += ostride*lanes;
    }
    // pass 3: leftover elements, one at a time
    while (len > 0) {
        *op = scalar_negative(*ip);
        --len;
        ip += istride;
        op += ostride;
    }
}
#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD_F32
/*end repeat1**/

#line 80
#line 85
#if NPY_SIMD_F64
// Unroll factor for the f64 kernels: 4x on 128-bit SIMD, capped at 2x on
// any other width so that memory bandwidth does not become the bottleneck.
#if NPY_SIMD == 128
#define UNROLL 4
#else
#define UNROLL 2
#endif
#if UNROLL < 1
#error "Unroll must be at least 1"
#endif
/*
 * Negate `len` f64 elements, reading from contiguous `ip` and writing to
 * contiguous `op`.
 *
 * Work is done in three phases: UNROLL vectors per pass, then one vector per
 * pass, then a scalar tail for the leftover elements.
 */
static NPY_INLINE void
simd_unary_cc_negative_f64(const npyv_lanetype_f64 *ip,
                             npyv_lanetype_f64 *op,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_f64;
    const int block = lanes * UNROLL;

    // phase 1: UNROLL vectors per pass (constant trip count inner loop)
    while (len >= block) {
        for (int u = 0; u < UNROLL; ++u) {
            npyv_f64 src = npyv_load_f64(ip + u * lanes);
            npyv_f64 neg = npyv_negative_f64(src);
            npyv_store_f64(op + u * lanes, neg);
        }
        ip += block;
        op += block;
        len -= block;
    }
    // phase 2: one vector per pass
    while (len >= lanes) {
        npyv_f64 src = npyv_load_f64(ip);
        npyv_f64 neg = npyv_negative_f64(src);
        npyv_store_f64(op, neg);
        ip += lanes;
        op += lanes;
        len -= lanes;
    }
    // phase 3: scalar tail, fewer than one vector left
    for (npy_intp i = 0; i < len; ++i) {
        op[i] = scalar_negative(ip[i]);
    }
}

#if 1
/*
 * Negate `len` f64 elements, reading from contiguous `ip` and scattering the
 * results to `op` with a stride of `ostride` elements.
 *
 * Same three phases as the contiguous kernel: UNROLL vectors per pass, one
 * vector per pass, scalar tail.
 */
static NPY_INLINE void
simd_unary_cn_negative_f64(const npyv_lanetype_f64 *ip,
                             npyv_lanetype_f64 *op, npy_intp ostride,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_f64;
    const int block = lanes * UNROLL;

    // phase 1: UNROLL vectors per pass
    while (len >= block) {
        for (int u = 0; u < UNROLL; ++u) {
            npyv_f64 src = npyv_load_f64(ip + u * lanes);
            npyv_f64 neg = npyv_negative_f64(src);
            npyv_storen_f64(op + u * lanes * ostride, ostride, neg);
        }
        ip += block;
        op += ostride * block;
        len -= block;
    }
    // phase 2: one vector per pass
    while (len >= lanes) {
        npyv_f64 src = npyv_load_f64(ip);
        npyv_f64 neg = npyv_negative_f64(src);
        npyv_storen_f64(op, ostride, neg);
        ip += lanes;
        op += ostride * lanes;
        len -= lanes;
    }
    // phase 3: scalar tail
    while (len > 0) {
        *op = scalar_negative(*ip);
        ++ip;
        op += ostride;
        --len;
    }
}
/*
 * Negate `len` f64 elements, gathering the inputs from `ip` with a stride of
 * `istride` elements and writing the results to contiguous `op`.
 *
 * Same three phases as the contiguous kernel: UNROLL vectors per pass, one
 * vector per pass, scalar tail.
 */
static NPY_INLINE void
simd_unary_nc_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
                             npyv_lanetype_f64 *op,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_f64;
    const int block = lanes * UNROLL;

    // phase 1: UNROLL vectors per pass
    while (len >= block) {
        for (int u = 0; u < UNROLL; ++u) {
            npyv_f64 src = npyv_loadn_f64(ip + u * lanes * istride, istride);
            npyv_f64 neg = npyv_negative_f64(src);
            npyv_store_f64(op + u * lanes, neg);
        }
        ip += istride * block;
        op += block;
        len -= block;
    }
    // phase 2: one vector per pass
    while (len >= lanes) {
        npyv_f64 src = npyv_loadn_f64(ip, istride);
        npyv_f64 neg = npyv_negative_f64(src);
        npyv_store_f64(op, neg);
        ip += istride * lanes;
        op += lanes;
        len -= lanes;
    }
    // phase 3: scalar tail
    while (len > 0) {
        *op = scalar_negative(*ip);
        ip += istride;
        ++op;
        --len;
    }
}
// Fully strided variant: neither the input nor the output is contiguous.
// The unroll factor is capped at 2x for this kernel.
#if UNROLL > 2
#undef UNROLL
#define UNROLL 2
#endif
// Skipped when SSE2 is available: x86 does better with the unrolled scalar
// loop for heavy non-contiguous access.
#ifndef NPY_HAVE_SSE2
/*
 * Negate `len` f64 elements, gathering from `ip` with a stride of `istride`
 * elements and scattering to `op` with a stride of `ostride` elements.
 *
 * Three phases: UNROLL vectors per pass, one vector per pass, scalar tail.
 */
static NPY_INLINE void
simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
                             npyv_lanetype_f64 *op, npy_intp ostride,
                             npy_intp len)
{
    const int lanes = npyv_nlanes_f64;
    const int block = lanes * UNROLL;

    // phase 1: UNROLL vectors per pass
    while (len >= block) {
        for (int u = 0; u < UNROLL; ++u) {
            npyv_f64 src = npyv_loadn_f64(ip + u * lanes * istride, istride);
            npyv_f64 neg = npyv_negative_f64(src);
            npyv_storen_f64(op + u * lanes * ostride, ostride, neg);
        }
        ip += istride * block;
        op += ostride * block;
        len -= block;
    }
    // phase 2: one vector per pass
    while (len >= lanes) {
        npyv_f64 src = npyv_loadn_f64(ip, istride);
        npyv_f64 neg = npyv_negative_f64(src);
        npyv_storen_f64(op, ostride, neg);
        ip += istride * lanes;
        op += ostride * lanes;
        len -= lanes;
    }
    // phase 3: scalar tail
    while (len > 0) {
        *op = scalar_negative(*ip);
        ip += istride;
        op += ostride;
        --len;
    }
}
#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD_F64
/*end repeat1**/


/********************************************************************************
 ** Defining ufunc inner functions
 ********************************************************************************/
#line 257
// Select the unsigned universal-intrinsics suffix matching the bit width of
// the C `byte` type. TO_SIMD_SFX stays undefined when SIMD is unavailable or
// the width is not one of 8/16/32/64.
#undef TO_SIMD_SFX
#if NPY_SIMD
    #if NPY_BITSOF_BYTE == 8
        #define TO_SIMD_SFX(X) X##_u8
    #elif NPY_BITSOF_BYTE == 16
        #define TO_SIMD_SFX(X) X##_u16
    #elif NPY_BITSOF_BYTE == 32
        #define TO_SIMD_SFX(X) X##_u32
    #elif NPY_BITSOF_BYTE == 64
        #define TO_SIMD_SFX(X) X##_u64
    #endif
#endif

#line 283
/*
 * ufunc inner loop: element-wise negation of npy_ubyte.
 *
 * args[0]/args[1] are the input/output buffers, steps[0]/steps[1] their byte
 * strides and dimensions[0] the element count.
 *
 * When TO_SIMD_SFX is defined and the operands are contiguous and do not
 * overlap, the contiguous SIMD kernel does all the work. The template
 * disables strided SIMD dispatch for this type, so every other layout is
 * handled by the scalar loops below.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip = args[0];
    char *op = args[1];
    npy_intp istep = steps[0];
    npy_intp ostep = steps[1];
    npy_intp len = dimensions[0];
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // no overlap and operands are contiguous -> vectorized path
    if (!is_mem_overlap(ip, istep, op, ostep, len) &&
            (IS_UNARY_CONT(npy_ubyte, npy_ubyte))) {
        TO_SIMD_SFX(simd_unary_cc_negative)((STYPE*)ip, (STYPE*)op, len);
        goto clear;
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Scalar path, unrolled 8x. That factor performed best on
     *  - Apple M1 Native / arm64
     *  - Apple M1 Rosetta / SSE42
     *  - iMacPro / AVX512
     */
    #define UNROLL 8
    while (len >= UNROLL) {
        // read-then-write per element, in index order
        for (int u = 0; u < UNROLL; ++u) {
            const npy_ubyte in = *((const npy_ubyte *)(ip + u * istep));
            *((npy_ubyte *)(op + u * ostep)) = scalar_negative(in);
        }
        ip += istep*UNROLL;
        op += ostep*UNROLL;
        len -= UNROLL;
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // remaining elements, one at a time
    while (len > 0) {
        *((npy_ubyte *)op) = scalar_negative(*(const npy_ubyte *)ip);
        ip += istep;
        op += ostep;
        --len;
    }
#ifdef TO_SIMD_SFX
clear:
    npyv_cleanup();
#endif
}

#line 257
// Select the unsigned universal-intrinsics suffix matching the bit width of
// the C `short` type. TO_SIMD_SFX stays undefined when SIMD is unavailable or
// the width is not one of 8/16/32/64.
#undef TO_SIMD_SFX
#if NPY_SIMD
    #if NPY_BITSOF_SHORT == 8
        #define TO_SIMD_SFX(X) X##_u8
    #elif NPY_BITSOF_SHORT == 16
        #define TO_SIMD_SFX(X) X##_u16
    #elif NPY_BITSOF_SHORT == 32
        #define TO_SIMD_SFX(X) X##_u32
    #elif NPY_BITSOF_SHORT == 64
        #define TO_SIMD_SFX(X) X##_u64
    #endif
#endif

#line 283
/*
 * ufunc inner loop: element-wise negation of npy_ushort.
 *
 * args[0]/args[1] are the input/output buffers, steps[0]/steps[1] their byte
 * strides and dimensions[0] the element count.
 *
 * When TO_SIMD_SFX is defined and the operands are contiguous and do not
 * overlap, the contiguous SIMD kernel does all the work. The template
 * disables strided SIMD dispatch for this type, so every other layout is
 * handled by the scalar loops below.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip = args[0];
    char *op = args[1];
    npy_intp istep = steps[0];
    npy_intp ostep = steps[1];
    npy_intp len = dimensions[0];
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // no overlap and operands are contiguous -> vectorized path
    if (!is_mem_overlap(ip, istep, op, ostep, len) &&
            (IS_UNARY_CONT(npy_ushort, npy_ushort))) {
        TO_SIMD_SFX(simd_unary_cc_negative)((STYPE*)ip, (STYPE*)op, len);
        goto clear;
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Scalar path, unrolled 8x. That factor performed best on
     *  - Apple M1 Native / arm64
     *  - Apple M1 Rosetta / SSE42
     *  - iMacPro / AVX512
     */
    #define UNROLL 8
    while (len >= UNROLL) {
        // read-then-write per element, in index order
        for (int u = 0; u < UNROLL; ++u) {
            const npy_ushort in = *((const npy_ushort *)(ip + u * istep));
            *((npy_ushort *)(op + u * ostep)) = scalar_negative(in);
        }
        ip += istep*UNROLL;
        op += ostep*UNROLL;
        len -= UNROLL;
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // remaining elements, one at a time
    while (len > 0) {
        *((npy_ushort *)op) = scalar_negative(*(const npy_ushort *)ip);
        ip += istep;
        op += ostep;
        --len;
    }
#ifdef TO_SIMD_SFX
clear:
    npyv_cleanup();
#endif
}

#line 257
// Select the unsigned universal-intrinsics suffix matching the bit width of
// the C `int` type. TO_SIMD_SFX stays undefined when SIMD is unavailable or
// the width is not one of 8/16/32/64.
#undef TO_SIMD_SFX
#if NPY_SIMD
    #if NPY_BITSOF_INT == 8
        #define TO_SIMD_SFX(X) X##_u8
    #elif NPY_BITSOF_INT == 16
        #define TO_SIMD_SFX(X) X##_u16
    #elif NPY_BITSOF_INT == 32
        #define TO_SIMD_SFX(X) X##_u32
    #elif NPY_BITSOF_INT == 64
        #define TO_SIMD_SFX(X) X##_u64
    #endif
#endif

#line 283
/*
 * Inner loop for `negative` on npy_uint.
 *
 * Reads dimensions[0] elements starting at args[0] and writes the negated
 * value of each to args[1]; steps[0] / steps[1] are the input / output
 * strides in bytes.
 *
 * When a SIMD lane type is available (TO_SIMD_SFX defined) and the operands
 * do not overlap, the work is handed to one of the simd_unary_*_negative
 * kernels selected by contiguity. Every other case falls through to an
 * UNROLL-way unrolled scalar loop followed by a scalar tail loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip = args[0], *op = args[1];
    npy_intp istep = steps[0], ostep = steps[1],
             len = dimensions[0];
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (!is_mem_overlap(ip, istep, op, ostep, len)) {
        if (IS_UNARY_CONT(npy_uint, npy_uint)) {
            // no overlap and operands are contiguous
            TO_SIMD_SFX(simd_unary_cc_negative)(
                (STYPE*)ip, (STYPE*)op, len
            );
            goto clear;
        }
    #if 1
        // Convert byte strides to element strides. sizeof() yields an
        // unsigned size_t, so the divisor is cast to npy_intp to keep the
        // division signed; otherwise a negative byte stride would be
        // converted to unsigned and produce a huge positive element stride.
        const npy_intp istride = istep / (npy_intp)sizeof(STYPE);
        const npy_intp ostride = ostep / (npy_intp)sizeof(STYPE);
        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
            TO_SIMD_SFX(npyv_storable_stride)(ostride))
        {
            if (istride == 1 && ostride != 1) {
                // contiguous input, non-contiguous output
                TO_SIMD_SFX(simd_unary_cn_negative)(
                    (STYPE*)ip, (STYPE*)op, ostride, len
                );
                goto clear;
            }
            else if (istride != 1 && ostride == 1) {
                // non-contiguous input, contiguous output
                TO_SIMD_SFX(simd_unary_nc_negative)(
                    (STYPE*)ip, istride, (STYPE*)op, len
                );
                goto clear;
            }
        // X86 does better with unrolled scalar for heavy non-contiguous
        #ifndef NPY_HAVE_SSE2
            else if (istride != 1 && ostride != 1) {
                // non-contiguous input and output
                TO_SIMD_SFX(simd_unary_nn_negative)(
                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
                );
                goto clear;
            }
        #endif
        }
    #endif // 1
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * scalar unrolls
     * 8x unroll performed best on
     *  - Apple M1 Native / arm64
     *  - Apple M1 Rosetta / SSE42
     *  - iMacPro / AVX512
     */
    #define UNROLL 8
    // Each step below is compiled in only while its index is < UNROLL, so the
    // steps numbered 8..15 are inactive with the current setting.
    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
    #line 347
    #if UNROLL > 0
        const npy_uint in_0 = *((const npy_uint *)(ip + 0 * istep));
        *((npy_uint *)(op + 0 * ostep)) = scalar_negative(in_0);
    #endif
    
#line 347
    #if UNROLL > 1
        const npy_uint in_1 = *((const npy_uint *)(ip + 1 * istep));
        *((npy_uint *)(op + 1 * ostep)) = scalar_negative(in_1);
    #endif
    
#line 347
    #if UNROLL > 2
        const npy_uint in_2 = *((const npy_uint *)(ip + 2 * istep));
        *((npy_uint *)(op + 2 * ostep)) = scalar_negative(in_2);
    #endif
    
#line 347
    #if UNROLL > 3
        const npy_uint in_3 = *((const npy_uint *)(ip + 3 * istep));
        *((npy_uint *)(op + 3 * ostep)) = scalar_negative(in_3);
    #endif
    
#line 347
    #if UNROLL > 4
        const npy_uint in_4 = *((const npy_uint *)(ip + 4 * istep));
        *((npy_uint *)(op + 4 * ostep)) = scalar_negative(in_4);
    #endif
    
#line 347
    #if UNROLL > 5
        const npy_uint in_5 = *((const npy_uint *)(ip + 5 * istep));
        *((npy_uint *)(op + 5 * ostep)) = scalar_negative(in_5);
    #endif
    
#line 347
    #if UNROLL > 6
        const npy_uint in_6 = *((const npy_uint *)(ip + 6 * istep));
        *((npy_uint *)(op + 6 * ostep)) = scalar_negative(in_6);
    #endif
    
#line 347
    #if UNROLL > 7
        const npy_uint in_7 = *((const npy_uint *)(ip + 7 * istep));
        *((npy_uint *)(op + 7 * ostep)) = scalar_negative(in_7);
    #endif
    
#line 347
    #if UNROLL > 8
        const npy_uint in_8 = *((const npy_uint *)(ip + 8 * istep));
        *((npy_uint *)(op + 8 * ostep)) = scalar_negative(in_8);
    #endif
    
#line 347
    #if UNROLL > 9
        const npy_uint in_9 = *((const npy_uint *)(ip + 9 * istep));
        *((npy_uint *)(op + 9 * ostep)) = scalar_negative(in_9);
    #endif
    
#line 347
    #if UNROLL > 10
        const npy_uint in_10 = *((const npy_uint *)(ip + 10 * istep));
        *((npy_uint *)(op + 10 * ostep)) = scalar_negative(in_10);
    #endif
    
#line 347
    #if UNROLL > 11
        const npy_uint in_11 = *((const npy_uint *)(ip + 11 * istep));
        *((npy_uint *)(op + 11 * ostep)) = scalar_negative(in_11);
    #endif
    
#line 347
    #if UNROLL > 12
        const npy_uint in_12 = *((const npy_uint *)(ip + 12 * istep));
        *((npy_uint *)(op + 12 * ostep)) = scalar_negative(in_12);
    #endif
    
#line 347
    #if UNROLL > 13
        const npy_uint in_13 = *((const npy_uint *)(ip + 13 * istep));
        *((npy_uint *)(op + 13 * ostep)) = scalar_negative(in_13);
    #endif
    
#line 347
    #if UNROLL > 14
        const npy_uint in_14 = *((const npy_uint *)(ip + 14 * istep));
        *((npy_uint *)(op + 14 * ostep)) = scalar_negative(in_14);
    #endif
    
#line 347
    #if UNROLL > 15
        const npy_uint in_15 = *((const npy_uint *)(ip + 15 * istep));
        *((npy_uint *)(op + 15 * ostep)) = scalar_negative(in_15);
    #endif
    
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // scalar tail: whatever the unrolled loop left over (or everything when
    // NPY_DISABLE_OPTIMIZATION is defined)
    for (; len > 0; --len, ip += istep, op += ostep) {
        *((npy_uint *)op) = scalar_negative(*(const npy_uint *)ip);
    }
#ifdef TO_SIMD_SFX
clear:
    npyv_cleanup();
#endif
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

#line 257
#undef TO_SIMD_SFX
/*
 * Select the universal-intrinsics suffix used by the ULONG_negative loop that
 * follows: the unsigned lane type whose width equals NPY_BITSOF_LONG.
 * TO_SIMD_SFX is left undefined when NPY_SIMD is off or the width is not one
 * of 8/16/32/64, which compiles out every `#ifdef TO_SIMD_SFX` section.
 */
#if NPY_SIMD && NPY_BITSOF_LONG == 8
    #define TO_SIMD_SFX(X) X##_u8
#elif NPY_SIMD && NPY_BITSOF_LONG == 16
    #define TO_SIMD_SFX(X) X##_u16
#elif NPY_SIMD && NPY_BITSOF_LONG == 32
    #define TO_SIMD_SFX(X) X##_u32
#elif NPY_SIMD && NPY_BITSOF_LONG == 64
    #define TO_SIMD_SFX(X) X##_u64
#endif

#line 283
/*
 * Inner loop for `negative` on npy_ulong.
 *
 * Reads dimensions[0] elements starting at args[0] and writes the negated
 * value of each to args[1]; steps[0] / steps[1] are the input / output
 * strides in bytes.
 *
 * When a SIMD lane type is available (TO_SIMD_SFX defined) and the operands
 * do not overlap, the work is handed to one of the simd_unary_*_negative
 * kernels selected by contiguity. Every other case falls through to an
 * UNROLL-way unrolled scalar loop followed by a scalar tail loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip = args[0], *op = args[1];
    npy_intp istep = steps[0], ostep = steps[1],
             len = dimensions[0];
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (!is_mem_overlap(ip, istep, op, ostep, len)) {
        if (IS_UNARY_CONT(npy_ulong, npy_ulong)) {
            // no overlap and operands are contiguous
            TO_SIMD_SFX(simd_unary_cc_negative)(
                (STYPE*)ip, (STYPE*)op, len
            );
            goto clear;
        }
    #if 1
        // Convert byte strides to element strides. sizeof() yields an
        // unsigned size_t, so the divisor is cast to npy_intp to keep the
        // division signed; otherwise a negative byte stride would be
        // converted to unsigned and produce a huge positive element stride.
        const npy_intp istride = istep / (npy_intp)sizeof(STYPE);
        const npy_intp ostride = ostep / (npy_intp)sizeof(STYPE);
        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
            TO_SIMD_SFX(npyv_storable_stride)(ostride))
        {
            if (istride == 1 && ostride != 1) {
                // contiguous input, non-contiguous output
                TO_SIMD_SFX(simd_unary_cn_negative)(
                    (STYPE*)ip, (STYPE*)op, ostride, len
                );
                goto clear;
            }
            else if (istride != 1 && ostride == 1) {
                // non-contiguous input, contiguous output
                TO_SIMD_SFX(simd_unary_nc_negative)(
                    (STYPE*)ip, istride, (STYPE*)op, len
                );
                goto clear;
            }
        // X86 does better with unrolled scalar for heavy non-contiguous
        #ifndef NPY_HAVE_SSE2
            else if (istride != 1 && ostride != 1) {
                // non-contiguous input and output
                TO_SIMD_SFX(simd_unary_nn_negative)(
                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
                );
                goto clear;
            }
        #endif
        }
    #endif // 1
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * scalar unrolls
     * 8x unroll performed best on
     *  - Apple M1 Native / arm64
     *  - Apple M1 Rosetta / SSE42
     *  - iMacPro / AVX512
     */
    #define UNROLL 8
    // Each step below is compiled in only while its index is < UNROLL, so the
    // steps numbered 8..15 are inactive with the current setting.
    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
    #line 347
    #if UNROLL > 0
        const npy_ulong in_0 = *((const npy_ulong *)(ip + 0 * istep));
        *((npy_ulong *)(op + 0 * ostep)) = scalar_negative(in_0);
    #endif
    
#line 347
    #if UNROLL > 1
        const npy_ulong in_1 = *((const npy_ulong *)(ip + 1 * istep));
        *((npy_ulong *)(op + 1 * ostep)) = scalar_negative(in_1);
    #endif
    
#line 347
    #if UNROLL > 2
        const npy_ulong in_2 = *((const npy_ulong *)(ip + 2 * istep));
        *((npy_ulong *)(op + 2 * ostep)) = scalar_negative(in_2);
    #endif
    
#line 347
    #if UNROLL > 3
        const npy_ulong in_3 = *((const npy_ulong *)(ip + 3 * istep));
        *((npy_ulong *)(op + 3 * ostep)) = scalar_negative(in_3);
    #endif
    
#line 347
    #if UNROLL > 4
        const npy_ulong in_4 = *((const npy_ulong *)(ip + 4 * istep));
        *((npy_ulong *)(op + 4 * ostep)) = scalar_negative(in_4);
    #endif
    
#line 347
    #if UNROLL > 5
        const npy_ulong in_5 = *((const npy_ulong *)(ip + 5 * istep));
        *((npy_ulong *)(op + 5 * ostep)) = scalar_negative(in_5);
    #endif
    
#line 347
    #if UNROLL > 6
        const npy_ulong in_6 = *((const npy_ulong *)(ip + 6 * istep));
        *((npy_ulong *)(op + 6 * ostep)) = scalar_negative(in_6);
    #endif
    
#line 347
    #if UNROLL > 7
        const npy_ulong in_7 = *((const npy_ulong *)(ip + 7 * istep));
        *((npy_ulong *)(op + 7 * ostep)) = scalar_negative(in_7);
    #endif
    
#line 347
    #if UNROLL > 8
        const npy_ulong in_8 = *((const npy_ulong *)(ip + 8 * istep));
        *((npy_ulong *)(op + 8 * ostep)) = scalar_negative(in_8);
    #endif
    
#line 347
    #if UNROLL > 9
        const npy_ulong in_9 = *((const npy_ulong *)(ip + 9 * istep));
        *((npy_ulong *)(op + 9 * ostep)) = scalar_negative(in_9);
    #endif
    
#line 347
    #if UNROLL > 10
        const npy_ulong in_10 = *((const npy_ulong *)(ip + 10 * istep));
        *((npy_ulong *)(op + 10 * ostep)) = scalar_negative(in_10);
    #endif
    
#line 347
    #if UNROLL > 11
        const npy_ulong in_11 = *((const npy_ulong *)(ip + 11 * istep));
        *((npy_ulong *)(op + 11 * ostep)) = scalar_negative(in_11);
    #endif
    
#line 347
    #if UNROLL > 12
        const npy_ulong in_12 = *((const npy_ulong *)(ip + 12 * istep));
        *((npy_ulong *)(op + 12 * ostep)) = scalar_negative(in_12);
    #endif
    
#line 347
    #if UNROLL > 13
        const npy_ulong in_13 = *((const npy_ulong *)(ip + 13 * istep));
        *((npy_ulong *)(op + 13 * ostep)) = scalar_negative(in_13);
    #endif
    
#line 347
    #if UNROLL > 14
        const npy_ulong in_14 = *((const npy_ulong *)(ip + 14 * istep));
        *((npy_ulong *)(op + 14 * ostep)) = scalar_negative(in_14);
    #endif
    
#line 347
    #if UNROLL > 15
        const npy_ulong in_15 = *((const npy_ulong *)(ip + 15 * istep));
        *((npy_ulong *)(op + 15 * ostep)) = scalar_negative(in_15);
    #endif
    
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // scalar tail: whatever the unrolled loop left over (or everything when
    // NPY_DISABLE_OPTIMIZATION is defined)
    for (; len > 0; --len, ip += istep, op += ostep) {
        *((npy_ulong *)op) = scalar_negative(*(const npy_ulong *)ip);
    }
#ifdef TO_SIMD_SFX
clear:
    npyv_cleanup();
#endif
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

#line 257
#undef TO_SIMD_SFX
/*
 * Select the universal-intrinsics suffix used by the ULONGLONG_negative loop
 * that follows: the unsigned lane type whose width equals
 * NPY_BITSOF_LONGLONG. TO_SIMD_SFX is left undefined when NPY_SIMD is off or
 * the width is not one of 8/16/32/64, which compiles out every
 * `#ifdef TO_SIMD_SFX` section.
 */
#if NPY_SIMD && NPY_BITSOF_LONGLONG == 8
    #define TO_SIMD_SFX(X) X##_u8
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16
    #define TO_SIMD_SFX(X) X##_u16
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32
    #define TO_SIMD_SFX(X) X##_u32
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64
    #define TO_SIMD_SFX(X) X##_u64
#endif

#line 283
/*
 * Inner loop for `negative` on npy_ulonglong.
 *
 * Reads dimensions[0] elements starting at args[0] and writes the negated
 * value of each to args[1]; steps[0] / steps[1] are the input / output
 * strides in bytes.
 *
 * When a SIMD lane type is available (TO_SIMD_SFX defined) and the operands
 * do not overlap, the work is handed to one of the simd_unary_*_negative
 * kernels selected by contiguity. Every other case falls through to an
 * UNROLL-way unrolled scalar loop followed by a scalar tail loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip = args[0], *op = args[1];
    npy_intp istep = steps[0], ostep = steps[1],
             len = dimensions[0];
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (!is_mem_overlap(ip, istep, op, ostep, len)) {
        if (IS_UNARY_CONT(npy_ulonglong, npy_ulonglong)) {
            // no overlap and operands are contiguous
            TO_SIMD_SFX(simd_unary_cc_negative)(
                (STYPE*)ip, (STYPE*)op, len
            );
            goto clear;
        }
    #if 1
        // Convert byte strides to element strides. sizeof() yields an
        // unsigned size_t, so the divisor is cast to npy_intp to keep the
        // division signed; otherwise a negative byte stride would be
        // converted to unsigned and produce a huge positive element stride.
        const npy_intp istride = istep / (npy_intp)sizeof(STYPE);
        const npy_intp ostride = ostep / (npy_intp)sizeof(STYPE);
        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
            TO_SIMD_SFX(npyv_storable_stride)(ostride))
        {
            if (istride == 1 && ostride != 1) {
                // contiguous input, non-contiguous output
                TO_SIMD_SFX(simd_unary_cn_negative)(
                    (STYPE*)ip, (STYPE*)op, ostride, len
                );
                goto clear;
            }
            else if (istride != 1 && ostride == 1) {
                // non-contiguous input, contiguous output
                TO_SIMD_SFX(simd_unary_nc_negative)(
                    (STYPE*)ip, istride, (STYPE*)op, len
                );
                goto clear;
            }
        // X86 does better with unrolled scalar for heavy non-contiguous
        #ifndef NPY_HAVE_SSE2
            else if (istride != 1 && ostride != 1) {
                // non-contiguous input and output
                TO_SIMD_SFX(simd_unary_nn_negative)(
                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
                );
                goto clear;
            }
        #endif
        }
    #endif // 1
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * scalar unrolls
     * 8x unroll performed best on
     *  - Apple M1 Native / arm64
     *  - Apple M1 Rosetta / SSE42
     *  - iMacPro / AVX512
     */
    #define UNROLL 8
    // Each step below is compiled in only while its index is < UNROLL, so the
    // steps numbered 8..15 are inactive with the current setting.
    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
    #line 347
    #if UNROLL > 0
        const npy_ulonglong in_0 = *((const npy_ulonglong *)(ip + 0 * istep));
        *((npy_ulonglong *)(op + 0 * ostep)) = scalar_negative(in_0);
    #endif
    
#line 347
    #if UNROLL > 1
        const npy_ulonglong in_1 = *((const npy_ulonglong *)(ip + 1 * istep));
        *((npy_ulonglong *)(op + 1 * ostep)) = scalar_negative(in_1);
    #endif
    
#line 347
    #if UNROLL > 2
        const npy_ulonglong in_2 = *((const npy_ulonglong *)(ip + 2 * istep));
        *((npy_ulonglong *)(op + 2 * ostep)) = scalar_negative(in_2);
    #endif
    
#line 347
    #if UNROLL > 3
        const npy_ulonglong in_3 = *((const npy_ulonglong *)(ip + 3 * istep));
        *((npy_ulonglong *)(op + 3 * ostep)) = scalar_negative(in_3);
    #endif
    
#line 347
    #if UNROLL > 4
        const npy_ulonglong in_4 = *((const npy_ulonglong *)(ip + 4 * istep));
        *((npy_ulonglong *)(op + 4 * ostep)) = scalar_negative(in_4);
    #endif
    
#line 347
    #if UNROLL > 5
        const npy_ulonglong in_5 = *((const npy_ulonglong *)(ip + 5 * istep));
        *((npy_ulonglong *)(op + 5 * ostep)) = scalar_negative(in_5);
    #endif
    
#line 347
    #if UNROLL > 6
        const npy_ulonglong in_6 = *((const npy_ulonglong *)(ip + 6 * istep));
        *((npy_ulonglong *)(op + 6 * ostep)) = scalar_negative(in_6);
    #endif
    
#line 347
    #if UNROLL > 7
        const npy_ulonglong in_7 = *((const npy_ulonglong *)(ip + 7 * istep));
        *((npy_ulonglong *)(op + 7 * ostep)) = scalar_negative(in_7);
    #endif
    
#line 347
    #if UNROLL > 8
        const npy_ulonglong in_8 = *((const npy_ulonglong *)(ip + 8 * istep));
        *((npy_ulonglong *)(op + 8 * ostep)) = scalar_negative(in_8);
    #endif
    
#line 347
    #if UNROLL > 9
        const npy_ulonglong in_9 = *((const npy_ulonglong *)(ip + 9 * istep));
        *((npy_ulonglong *)(op + 9 * ostep)) = scalar_negative(in_9);
    #endif
    
#line 347
    #if UNROLL > 10
        const npy_ulonglong in_10 = *((const npy_ulonglong *)(ip + 10 * istep));
        *((npy_ulonglong *)(op + 10 * ostep)) = scalar_negative(in_10);
    #endif
    
#line 347
    #if UNROLL > 11
        const npy_ulonglong in_11 = *((const npy_ulonglong *)(ip + 11 * istep));
        *((npy_ulonglong *)(op + 11 * ostep)) = scalar_negative(in_11);
    #endif
    
#line 347
    #if UNROLL > 12
        const npy_ulonglong in_12 = *((const npy_ulonglong *)(ip + 12 * istep));
        *((npy_ulonglong *)(op + 12 * ostep)) = scalar_negative(in_12);
    #endif
    
#line 347
    #if UNROLL > 13
        const npy_ulonglong in_13 = *((const npy_ulonglong *)(ip + 13 * istep));
        *((npy_ulonglong *)(op + 13 * ostep)) = scalar_negative(in_13);
    #endif
    
#line 347
    #if UNROLL > 14
        const npy_ulonglong in_14 = *((const npy_ulonglong *)(ip + 14 * istep));
        *((npy_ulonglong *)(op + 14 * ostep)) = scalar_negative(in_14);
    #endif
    
#line 347
    #if UNROLL > 15
        const npy_ulonglong in_15 = *((const npy_ulonglong *)(ip + 15 * istep));
        *((npy_ulonglong *)(op + 15 * ostep)) = scalar_negative(in_15);
    #endif
    
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // scalar tail: whatever the unrolled loop left over (or everything when
    // NPY_DISABLE_OPTIMIZATION is defined)
    for (; len > 0; --len, ip += istep, op += ostep) {
        *((npy_ulonglong *)op) = scalar_negative(*(const npy_ulonglong *)ip);
    }
#ifdef TO_SIMD_SFX
clear:
    npyv_cleanup();
#endif
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

#line 257
#undef TO_SIMD_SFX
/*
 * Select the universal-intrinsics suffix used by the BYTE_negative loop that
 * follows: the signed lane type whose width equals NPY_BITSOF_BYTE.
 * TO_SIMD_SFX is left undefined when NPY_SIMD is off or the width is not one
 * of 8/16/32/64, which compiles out every `#ifdef TO_SIMD_SFX` section.
 */
#if NPY_SIMD && NPY_BITSOF_BYTE == 8
    #define TO_SIMD_SFX(X) X##_s8
#elif NPY_SIMD && NPY_BITSOF_BYTE == 16
    #define TO_SIMD_SFX(X) X##_s16
#elif NPY_SIMD && NPY_BITSOF_BYTE == 32
    #define TO_SIMD_SFX(X) X##_s32
#elif NPY_SIMD && NPY_BITSOF_BYTE == 64
    #define TO_SIMD_SFX(X) X##_s64
#endif

#line 283
/*
 * Inner loop for `negative` on npy_byte.
 *
 * Negates dimensions[0] elements read from args[0] into args[1], stepping
 * the input by steps[0] bytes and the output by steps[1] bytes per element.
 *
 * Only the contiguous, non-overlapping case is sent to a SIMD kernel; the
 * strided SIMD dispatch used by the wider-integer loops in this file is
 * compiled out for this type. Everything else runs through the unrolled
 * scalar loop and the scalar tail.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip = args[0];
    char *op = args[1];
    npy_intp istep = steps[0];
    npy_intp ostep = steps[1];
    npy_intp len = dimensions[0];
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // vector path: operands must not overlap and must both be contiguous
    if (!is_mem_overlap(ip, istep, op, ostep, len) &&
        (IS_UNARY_CONT(npy_byte, npy_byte)))
    {
        TO_SIMD_SFX(simd_unary_cc_negative)((STYPE*)ip, (STYPE*)op, len);
        goto clear;
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Scalar path, unrolled UNROLL times per iteration.
     * 8x unroll performed best on
     *  - Apple M1 Native / arm64
     *  - Apple M1 Rosetta / SSE42
     *  - iMacPro / AVX512
     * Step k is compiled in only while k < UNROLL; within an iteration each
     * element is loaded and stored before the next element is touched.
     */
    #define UNROLL 8
    while (len >= UNROLL) {
    #line 347
    #if UNROLL > 0
        *((npy_byte *)(op + 0 * ostep)) = scalar_negative(*((const npy_byte *)(ip + 0 * istep)));
    #endif

#line 347
    #if UNROLL > 1
        *((npy_byte *)(op + 1 * ostep)) = scalar_negative(*((const npy_byte *)(ip + 1 * istep)));
    #endif

#line 347
    #if UNROLL > 2
        *((npy_byte *)(op + 2 * ostep)) = scalar_negative(*((const npy_byte *)(ip + 2 * istep)));
    #endif

#line 347
    #if UNROLL > 3
        *((npy_byte *)(op + 3 * ostep)) = scalar_negative(*((const npy_byte *)(ip + 3 * istep)));
    #endif

#line 347
    #if UNROLL > 4
        *((npy_byte *)(op + 4 * ostep)) = scalar_negative(*((const npy_byte *)(ip + 4 * istep)));
    #endif

#line 347
    #if UNROLL > 5
        *((npy_byte *)(op + 5 * ostep)) = scalar_negative(*((const npy_byte *)(ip + 5 * istep)));
    #endif

#line 347
    #if UNROLL > 6
        *((npy_byte *)(op + 6 * ostep)) = scalar_negative(*((const npy_byte *)(ip + 6 * istep)));
    #endif

#line 347
    #if UNROLL > 7
        *((npy_byte *)(op + 7 * ostep)) = scalar_negative(*((const npy_byte *)(ip + 7 * istep)));
    #endif

#line 347
    #if UNROLL > 8
        *((npy_byte *)(op + 8 * ostep)) = scalar_negative(*((const npy_byte *)(ip + 8 * istep)));
    #endif

#line 347
    #if UNROLL > 9
        *((npy_byte *)(op + 9 * ostep)) = scalar_negative(*((const npy_byte *)(ip + 9 * istep)));
    #endif

#line 347
    #if UNROLL > 10
        *((npy_byte *)(op + 10 * ostep)) = scalar_negative(*((const npy_byte *)(ip + 10 * istep)));
    #endif

#line 347
    #if UNROLL > 11
        *((npy_byte *)(op + 11 * ostep)) = scalar_negative(*((const npy_byte *)(ip + 11 * istep)));
    #endif

#line 347
    #if UNROLL > 12
        *((npy_byte *)(op + 12 * ostep)) = scalar_negative(*((const npy_byte *)(ip + 12 * istep)));
    #endif

#line 347
    #if UNROLL > 13
        *((npy_byte *)(op + 13 * ostep)) = scalar_negative(*((const npy_byte *)(ip + 13 * istep)));
    #endif

#line 347
    #if UNROLL > 14
        *((npy_byte *)(op + 14 * ostep)) = scalar_negative(*((const npy_byte *)(ip + 14 * istep)));
    #endif

#line 347
    #if UNROLL > 15
        *((npy_byte *)(op + 15 * ostep)) = scalar_negative(*((const npy_byte *)(ip + 15 * istep)));
    #endif

        // advance past the elements handled in this iteration
        len -= UNROLL;
        ip += istep*UNROLL;
        op += ostep*UNROLL;
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // remaining elements, one at a time
    while (len > 0) {
        *((npy_byte *)op) = scalar_negative(*(const npy_byte *)ip);
        --len;
        ip += istep;
        op += ostep;
    }
#ifdef TO_SIMD_SFX
clear:
    npyv_cleanup();
#endif
}

#line 257
#undef TO_SIMD_SFX
/*
 * Select the universal-intrinsics suffix used by the SHORT_negative loop that
 * follows: the signed lane type whose width equals NPY_BITSOF_SHORT.
 * TO_SIMD_SFX is left undefined when NPY_SIMD is off or the width is not one
 * of 8/16/32/64, which compiles out every `#ifdef TO_SIMD_SFX` section.
 */
#if NPY_SIMD && NPY_BITSOF_SHORT == 8
    #define TO_SIMD_SFX(X) X##_s8
#elif NPY_SIMD && NPY_BITSOF_SHORT == 16
    #define TO_SIMD_SFX(X) X##_s16
#elif NPY_SIMD && NPY_BITSOF_SHORT == 32
    #define TO_SIMD_SFX(X) X##_s32
#elif NPY_SIMD && NPY_BITSOF_SHORT == 64
    #define TO_SIMD_SFX(X) X##_s64
#endif

/*
 * Inner loop of `negative` for npy_short.
 *
 * Reads dimensions[0] elements starting at args[0] and stores the negated
 * value of each one starting at args[1].  steps[0] and steps[1] are the
 * distances, in bytes, between consecutive input and output elements.
 *
 * This instantiation has no strided SIMD kernels: the vector path is taken
 * only when the operands are contiguous and do not overlap; every other
 * layout is handled by the scalar loops.
 */
#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *src = args[0];
    char *dst = args[1];
    const npy_intp src_step = steps[0];
    const npy_intp dst_step = steps[1];
    npy_intp count = dimensions[0];

#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // vector kernel: requires no overlap and contiguous operands
    if (!is_mem_overlap(src, src_step, dst, dst_step, count) &&
        IS_UNARY_CONT(npy_short, npy_short))
    {
        TO_SIMD_SFX(simd_unary_cc_negative)((STYPE*)src, (STYPE*)dst, count);
        goto simd_done;
    }
#endif // TO_SIMD_SFX

#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Scalar loop, unrolled by hand.  An unroll factor of 8 performed
     * best on
     *  - Apple M1 Native / arm64
     *  - Apple M1 Rosetta / SSE42
     *  - iMacPro / AVX512
     */
    #define UNROLL 8
    #if UNROLL != 8
        #error "the loop body below is written out for exactly 8 elements"
    #endif
    for (; count >= UNROLL; count -= UNROLL, src += src_step*UNROLL, dst += dst_step*UNROLL) {
        // each element is loaded and stored before the next one is touched
        const npy_short v0 = *((const npy_short *)(src + 0 * src_step));
        *((npy_short *)(dst + 0 * dst_step)) = scalar_negative(v0);
        const npy_short v1 = *((const npy_short *)(src + 1 * src_step));
        *((npy_short *)(dst + 1 * dst_step)) = scalar_negative(v1);
        const npy_short v2 = *((const npy_short *)(src + 2 * src_step));
        *((npy_short *)(dst + 2 * dst_step)) = scalar_negative(v2);
        const npy_short v3 = *((const npy_short *)(src + 3 * src_step));
        *((npy_short *)(dst + 3 * dst_step)) = scalar_negative(v3);
        const npy_short v4 = *((const npy_short *)(src + 4 * src_step));
        *((npy_short *)(dst + 4 * dst_step)) = scalar_negative(v4);
        const npy_short v5 = *((const npy_short *)(src + 5 * src_step));
        *((npy_short *)(dst + 5 * dst_step)) = scalar_negative(v5);
        const npy_short v6 = *((const npy_short *)(src + 6 * src_step));
        *((npy_short *)(dst + 6 * dst_step)) = scalar_negative(v6);
        const npy_short v7 = *((const npy_short *)(src + 7 * src_step));
        *((npy_short *)(dst + 7 * dst_step)) = scalar_negative(v7);
    }
#endif // NPY_DISABLE_OPTIMIZATION

    // remaining elements (all of them when the unrolled loop is disabled)
    while (count > 0) {
        const npy_short v = *(const npy_short *)src;
        *((npy_short *)dst) = scalar_negative(v);
        src += src_step;
        dst += dst_step;
        --count;
    }
#ifdef TO_SIMD_SFX
simd_done:
    npyv_cleanup();
#endif
}

#line 257
/*
 * Select the SIMD suffix for the INT_negative loop that follows.
 * TO_SIMD_SFX(X) appends the signed-integer lane suffix whose width equals
 * NPY_BITSOF_INT to X.  It stays undefined when NPY_SIMD is zero or the
 * width is none of 8/16/32/64; the loop tests it with #ifdef.
 */
#undef TO_SIMD_SFX
#if NPY_SIMD
    #if NPY_BITSOF_INT == 8
        #define TO_SIMD_SFX(X) X##_s8
    #elif NPY_BITSOF_INT == 16
        #define TO_SIMD_SFX(X) X##_s16
    #elif NPY_BITSOF_INT == 32
        #define TO_SIMD_SFX(X) X##_s32
    #elif NPY_BITSOF_INT == 64
        #define TO_SIMD_SFX(X) X##_s64
    #endif
#endif

/*
 * Inner loop of `negative` for npy_int.
 *
 * Reads dimensions[0] elements starting at args[0] and stores the negated
 * value of each one starting at args[1].  steps[0] and steps[1] are the
 * distances, in bytes, between consecutive input and output elements.
 *
 * When TO_SIMD_SFX is defined and the operands do not overlap, the work is
 * handed to one of the simd_unary_{cc,cn,nc,nn}_negative kernels, chosen by
 * which side is contiguous.  Every other case runs the scalar loops.
 */
#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *src = args[0];
    char *dst = args[1];
    const npy_intp src_step = steps[0];
    const npy_intp dst_step = steps[1];
    npy_intp count = dimensions[0];

#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (!is_mem_overlap(src, src_step, dst, dst_step, count)) {
        if (IS_UNARY_CONT(npy_int, npy_int)) {
            // no overlap and both operands are contiguous
            TO_SIMD_SFX(simd_unary_cc_negative)(
                (STYPE*)src, (STYPE*)dst, count
            );
            goto simd_done;
        }
        /*
         * Express the byte steps as lane strides.  sizeof() has an unsigned
         * type, so it is converted to npy_intp before dividing; otherwise a
         * negative step would itself be converted to unsigned and come out
         * as a huge positive stride.
         */
        const npy_intp lane_size = (npy_intp)sizeof(STYPE);
        const npy_intp istride = src_step / lane_size;
        const npy_intp ostride = dst_step / lane_size;
        // a step that is not a whole number of lanes cannot be described by
        // a lane stride; such layouts are left to the scalar loops
        const int whole_lanes = istride * lane_size == src_step &&
                                ostride * lane_size == dst_step;
        if (whole_lanes &&
            TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
            TO_SIMD_SFX(npyv_storable_stride)(ostride))
        {
            const int in_contig = (istride == 1);
            const int out_contig = (ostride == 1);
            if (in_contig && !out_contig) {
                // contiguous input, non-contiguous output
                TO_SIMD_SFX(simd_unary_cn_negative)(
                    (STYPE*)src, (STYPE*)dst, ostride, count
                );
                goto simd_done;
            }
            if (!in_contig && out_contig) {
                // non-contiguous input, contiguous output
                TO_SIMD_SFX(simd_unary_nc_negative)(
                    (STYPE*)src, istride, (STYPE*)dst, count
                );
                goto simd_done;
            }
        // X86 does better with unrolled scalar for heavy non-contiguous
        #ifndef NPY_HAVE_SSE2
            if (!in_contig && !out_contig) {
                // non-contiguous input and output
                TO_SIMD_SFX(simd_unary_nn_negative)(
                    (STYPE*)src, istride, (STYPE*)dst, ostride, count
                );
                goto simd_done;
            }
        #endif
        }
    }
#endif // TO_SIMD_SFX

#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Scalar loop, unrolled by hand.  An unroll factor of 8 performed
     * best on
     *  - Apple M1 Native / arm64
     *  - Apple M1 Rosetta / SSE42
     *  - iMacPro / AVX512
     */
    #define UNROLL 8
    #if UNROLL != 8
        #error "the loop body below is written out for exactly 8 elements"
    #endif
    for (; count >= UNROLL; count -= UNROLL, src += src_step*UNROLL, dst += dst_step*UNROLL) {
        // each element is loaded and stored before the next one is touched
        const npy_int v0 = *((const npy_int *)(src + 0 * src_step));
        *((npy_int *)(dst + 0 * dst_step)) = scalar_negative(v0);
        const npy_int v1 = *((const npy_int *)(src + 1 * src_step));
        *((npy_int *)(dst + 1 * dst_step)) = scalar_negative(v1);
        const npy_int v2 = *((const npy_int *)(src + 2 * src_step));
        *((npy_int *)(dst + 2 * dst_step)) = scalar_negative(v2);
        const npy_int v3 = *((const npy_int *)(src + 3 * src_step));
        *((npy_int *)(dst + 3 * dst_step)) = scalar_negative(v3);
        const npy_int v4 = *((const npy_int *)(src + 4 * src_step));
        *((npy_int *)(dst + 4 * dst_step)) = scalar_negative(v4);
        const npy_int v5 = *((const npy_int *)(src + 5 * src_step));
        *((npy_int *)(dst + 5 * dst_step)) = scalar_negative(v5);
        const npy_int v6 = *((const npy_int *)(src + 6 * src_step));
        *((npy_int *)(dst + 6 * dst_step)) = scalar_negative(v6);
        const npy_int v7 = *((const npy_int *)(src + 7 * src_step));
        *((npy_int *)(dst + 7 * dst_step)) = scalar_negative(v7);
    }
#endif // NPY_DISABLE_OPTIMIZATION

    // remaining elements (all of them when the unrolled loop is disabled)
    while (count > 0) {
        const npy_int v = *(const npy_int *)src;
        *((npy_int *)dst) = scalar_negative(v);
        src += src_step;
        dst += dst_step;
        --count;
    }
#ifdef TO_SIMD_SFX
simd_done:
    npyv_cleanup();
#endif
}

#line 257
/*
 * Select the SIMD suffix for the LONG_negative loop that follows.
 * TO_SIMD_SFX(X) appends the signed-integer lane suffix whose width equals
 * NPY_BITSOF_LONG to X.  It stays undefined when NPY_SIMD is zero or the
 * width is none of 8/16/32/64; the loop tests it with #ifdef.
 */
#undef TO_SIMD_SFX
#if NPY_SIMD
    #if NPY_BITSOF_LONG == 8
        #define TO_SIMD_SFX(X) X##_s8
    #elif NPY_BITSOF_LONG == 16
        #define TO_SIMD_SFX(X) X##_s16
    #elif NPY_BITSOF_LONG == 32
        #define TO_SIMD_SFX(X) X##_s32
    #elif NPY_BITSOF_LONG == 64
        #define TO_SIMD_SFX(X) X##_s64
    #endif
#endif

/*
 * Inner loop of `negative` for npy_long.
 *
 * Reads dimensions[0] elements starting at args[0] and stores the negated
 * value of each one starting at args[1].  steps[0] and steps[1] are the
 * distances, in bytes, between consecutive input and output elements.
 *
 * When TO_SIMD_SFX is defined and the operands do not overlap, the work is
 * handed to one of the simd_unary_{cc,cn,nc,nn}_negative kernels, chosen by
 * which side is contiguous.  Every other case runs the scalar loops.
 */
#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *src = args[0];
    char *dst = args[1];
    const npy_intp src_step = steps[0];
    const npy_intp dst_step = steps[1];
    npy_intp count = dimensions[0];

#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (!is_mem_overlap(src, src_step, dst, dst_step, count)) {
        if (IS_UNARY_CONT(npy_long, npy_long)) {
            // no overlap and both operands are contiguous
            TO_SIMD_SFX(simd_unary_cc_negative)(
                (STYPE*)src, (STYPE*)dst, count
            );
            goto simd_done;
        }
        /*
         * Express the byte steps as lane strides.  sizeof() has an unsigned
         * type, so it is converted to npy_intp before dividing; otherwise a
         * negative step would itself be converted to unsigned and come out
         * as a huge positive stride.
         */
        const npy_intp lane_size = (npy_intp)sizeof(STYPE);
        const npy_intp istride = src_step / lane_size;
        const npy_intp ostride = dst_step / lane_size;
        // a step that is not a whole number of lanes cannot be described by
        // a lane stride; such layouts are left to the scalar loops
        const int whole_lanes = istride * lane_size == src_step &&
                                ostride * lane_size == dst_step;
        if (whole_lanes &&
            TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
            TO_SIMD_SFX(npyv_storable_stride)(ostride))
        {
            const int in_contig = (istride == 1);
            const int out_contig = (ostride == 1);
            if (in_contig && !out_contig) {
                // contiguous input, non-contiguous output
                TO_SIMD_SFX(simd_unary_cn_negative)(
                    (STYPE*)src, (STYPE*)dst, ostride, count
                );
                goto simd_done;
            }
            if (!in_contig && out_contig) {
                // non-contiguous input, contiguous output
                TO_SIMD_SFX(simd_unary_nc_negative)(
                    (STYPE*)src, istride, (STYPE*)dst, count
                );
                goto simd_done;
            }
        // X86 does better with unrolled scalar for heavy non-contiguous
        #ifndef NPY_HAVE_SSE2
            if (!in_contig && !out_contig) {
                // non-contiguous input and output
                TO_SIMD_SFX(simd_unary_nn_negative)(
                    (STYPE*)src, istride, (STYPE*)dst, ostride, count
                );
                goto simd_done;
            }
        #endif
        }
    }
#endif // TO_SIMD_SFX

#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Scalar loop, unrolled by hand.  An unroll factor of 8 performed
     * best on
     *  - Apple M1 Native / arm64
     *  - Apple M1 Rosetta / SSE42
     *  - iMacPro / AVX512
     */
    #define UNROLL 8
    #if UNROLL != 8
        #error "the loop body below is written out for exactly 8 elements"
    #endif
    for (; count >= UNROLL; count -= UNROLL, src += src_step*UNROLL, dst += dst_step*UNROLL) {
        // each element is loaded and stored before the next one is touched
        const npy_long v0 = *((const npy_long *)(src + 0 * src_step));
        *((npy_long *)(dst + 0 * dst_step)) = scalar_negative(v0);
        const npy_long v1 = *((const npy_long *)(src + 1 * src_step));
        *((npy_long *)(dst + 1 * dst_step)) = scalar_negative(v1);
        const npy_long v2 = *((const npy_long *)(src + 2 * src_step));
        *((npy_long *)(dst + 2 * dst_step)) = scalar_negative(v2);
        const npy_long v3 = *((const npy_long *)(src + 3 * src_step));
        *((npy_long *)(dst + 3 * dst_step)) = scalar_negative(v3);
        const npy_long v4 = *((const npy_long *)(src + 4 * src_step));
        *((npy_long *)(dst + 4 * dst_step)) = scalar_negative(v4);
        const npy_long v5 = *((const npy_long *)(src + 5 * src_step));
        *((npy_long *)(dst + 5 * dst_step)) = scalar_negative(v5);
        const npy_long v6 = *((const npy_long *)(src + 6 * src_step));
        *((npy_long *)(dst + 6 * dst_step)) = scalar_negative(v6);
        const npy_long v7 = *((const npy_long *)(src + 7 * src_step));
        *((npy_long *)(dst + 7 * dst_step)) = scalar_negative(v7);
    }
#endif // NPY_DISABLE_OPTIMIZATION

    // remaining elements (all of them when the unrolled loop is disabled)
    while (count > 0) {
        const npy_long v = *(const npy_long *)src;
        *((npy_long *)dst) = scalar_negative(v);
        src += src_step;
        dst += dst_step;
        --count;
    }
#ifdef TO_SIMD_SFX
simd_done:
    npyv_cleanup();
#endif
}

#line 257
/*
 * Select the SIMD suffix for the LONGLONG_negative loop that follows.
 * TO_SIMD_SFX(X) appends the signed-integer lane suffix whose width equals
 * NPY_BITSOF_LONGLONG to X.  It stays undefined when NPY_SIMD is zero or
 * the width is none of 8/16/32/64; the loop tests it with #ifdef.
 */
#undef TO_SIMD_SFX
#if NPY_SIMD
    #if NPY_BITSOF_LONGLONG == 8
        #define TO_SIMD_SFX(X) X##_s8
    #elif NPY_BITSOF_LONGLONG == 16
        #define TO_SIMD_SFX(X) X##_s16
    #elif NPY_BITSOF_LONGLONG == 32
        #define TO_SIMD_SFX(X) X##_s32
    #elif NPY_BITSOF_LONGLONG == 64
        #define TO_SIMD_SFX(X) X##_s64
    #endif
#endif

/*
 * Inner loop of `negative` for npy_longlong.
 *
 * Reads dimensions[0] elements starting at args[0] and stores the negated
 * value of each one starting at args[1].  steps[0] and steps[1] are the
 * distances, in bytes, between consecutive input and output elements.
 *
 * When TO_SIMD_SFX is defined and the operands do not overlap, the work is
 * handed to one of the simd_unary_{cc,cn,nc,nn}_negative kernels, chosen by
 * which side is contiguous.  Every other case runs the scalar loops.
 */
#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *src = args[0];
    char *dst = args[1];
    const npy_intp src_step = steps[0];
    const npy_intp dst_step = steps[1];
    npy_intp count = dimensions[0];

#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (!is_mem_overlap(src, src_step, dst, dst_step, count)) {
        if (IS_UNARY_CONT(npy_longlong, npy_longlong)) {
            // no overlap and both operands are contiguous
            TO_SIMD_SFX(simd_unary_cc_negative)(
                (STYPE*)src, (STYPE*)dst, count
            );
            goto simd_done;
        }
        /*
         * Express the byte steps as lane strides.  sizeof() has an unsigned
         * type, so it is converted to npy_intp before dividing; otherwise a
         * negative step would itself be converted to unsigned and come out
         * as a huge positive stride.
         */
        const npy_intp lane_size = (npy_intp)sizeof(STYPE);
        const npy_intp istride = src_step / lane_size;
        const npy_intp ostride = dst_step / lane_size;
        // a step that is not a whole number of lanes cannot be described by
        // a lane stride; such layouts are left to the scalar loops
        const int whole_lanes = istride * lane_size == src_step &&
                                ostride * lane_size == dst_step;
        if (whole_lanes &&
            TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
            TO_SIMD_SFX(npyv_storable_stride)(ostride))
        {
            const int in_contig = (istride == 1);
            const int out_contig = (ostride == 1);
            if (in_contig && !out_contig) {
                // contiguous input, non-contiguous output
                TO_SIMD_SFX(simd_unary_cn_negative)(
                    (STYPE*)src, (STYPE*)dst, ostride, count
                );
                goto simd_done;
            }
            if (!in_contig && out_contig) {
                // non-contiguous input, contiguous output
                TO_SIMD_SFX(simd_unary_nc_negative)(
                    (STYPE*)src, istride, (STYPE*)dst, count
                );
                goto simd_done;
            }
        // X86 does better with unrolled scalar for heavy non-contiguous
        #ifndef NPY_HAVE_SSE2
            if (!in_contig && !out_contig) {
                // non-contiguous input and output
                TO_SIMD_SFX(simd_unary_nn_negative)(
                    (STYPE*)src, istride, (STYPE*)dst, ostride, count
                );
                goto simd_done;
            }
        #endif
        }
    }
#endif // TO_SIMD_SFX

#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Scalar loop, unrolled by hand.  An unroll factor of 8 performed
     * best on
     *  - Apple M1 Native / arm64
     *  - Apple M1 Rosetta / SSE42
     *  - iMacPro / AVX512
     */
    #define UNROLL 8
    #if UNROLL != 8
        #error "the loop body below is written out for exactly 8 elements"
    #endif
    for (; count >= UNROLL; count -= UNROLL, src += src_step*UNROLL, dst += dst_step*UNROLL) {
        // each element is loaded and stored before the next one is touched
        const npy_longlong v0 = *((const npy_longlong *)(src + 0 * src_step));
        *((npy_longlong *)(dst + 0 * dst_step)) = scalar_negative(v0);
        const npy_longlong v1 = *((const npy_longlong *)(src + 1 * src_step));
        *((npy_longlong *)(dst + 1 * dst_step)) = scalar_negative(v1);
        const npy_longlong v2 = *((const npy_longlong *)(src + 2 * src_step));
        *((npy_longlong *)(dst + 2 * dst_step)) = scalar_negative(v2);
        const npy_longlong v3 = *((const npy_longlong *)(src + 3 * src_step));
        *((npy_longlong *)(dst + 3 * dst_step)) = scalar_negative(v3);
        const npy_longlong v4 = *((const npy_longlong *)(src + 4 * src_step));
        *((npy_longlong *)(dst + 4 * dst_step)) = scalar_negative(v4);
        const npy_longlong v5 = *((const npy_longlong *)(src + 5 * src_step));
        *((npy_longlong *)(dst + 5 * dst_step)) = scalar_negative(v5);
        const npy_longlong v6 = *((const npy_longlong *)(src + 6 * src_step));
        *((npy_longlong *)(dst + 6 * dst_step)) = scalar_negative(v6);
        const npy_longlong v7 = *((const npy_longlong *)(src + 7 * src_step));
        *((npy_longlong *)(dst + 7 * dst_step)) = scalar_negative(v7);
    }
#endif // NPY_DISABLE_OPTIMIZATION

    // remaining elements (all of them when the unrolled loop is disabled)
    while (count > 0) {
        const npy_longlong v = *(const npy_longlong *)src;
        *((npy_longlong *)dst) = scalar_negative(v);
        src += src_step;
        dst += dst_step;
        --count;
    }
#ifdef TO_SIMD_SFX
simd_done:
    npyv_cleanup();
#endif
}

#line 257
/*
 * Select the SIMD suffix for the FLOAT_negative loop that follows.
 * TO_SIMD_SFX(X) appends the floating-point lane suffix whose width equals
 * NPY_BITSOF_FLOAT to X.  The 32- and 64-bit suffixes are provided only
 * when NPY_SIMD_F32 / NPY_SIMD_F64 is non-zero.  The macro stays undefined
 * when NPY_SIMD is zero or no case matches; the loop tests it with #ifdef.
 */
#undef TO_SIMD_SFX
#if NPY_SIMD
    #if NPY_BITSOF_FLOAT == 8
        #define TO_SIMD_SFX(X) X##_f8
    #elif NPY_BITSOF_FLOAT == 16
        #define TO_SIMD_SFX(X) X##_f16
    #elif NPY_BITSOF_FLOAT == 32 && NPY_SIMD_F32
        #define TO_SIMD_SFX(X) X##_f32
    #elif NPY_BITSOF_FLOAT == 64 && NPY_SIMD_F64
        #define TO_SIMD_SFX(X) X##_f64
    #endif
#endif

/*
 * Inner loop of `negative` for npy_float.
 *
 * Reads dimensions[0] elements starting at args[0] and stores the negated
 * value of each one starting at args[1].  steps[0] and steps[1] are the
 * distances, in bytes, between consecutive input and output elements.
 *
 * When TO_SIMD_SFX is defined and the operands do not overlap, the work is
 * handed to one of the simd_unary_{cc,cn,nc,nn}_negative kernels, chosen by
 * which side is contiguous.  Every other case runs the scalar loops.
 * npy_clear_floatstatus_barrier() is called on every path before returning.
 */
#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *src = args[0];
    char *dst = args[1];
    const npy_intp src_step = steps[0];
    const npy_intp dst_step = steps[1];
    npy_intp count = dimensions[0];

#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (!is_mem_overlap(src, src_step, dst, dst_step, count)) {
        if (IS_UNARY_CONT(npy_float, npy_float)) {
            // no overlap and both operands are contiguous
            TO_SIMD_SFX(simd_unary_cc_negative)(
                (STYPE*)src, (STYPE*)dst, count
            );
            goto simd_done;
        }
        /*
         * Express the byte steps as lane strides.  sizeof() has an unsigned
         * type, so it is converted to npy_intp before dividing; otherwise a
         * negative step would itself be converted to unsigned and come out
         * as a huge positive stride.
         */
        const npy_intp lane_size = (npy_intp)sizeof(STYPE);
        const npy_intp istride = src_step / lane_size;
        const npy_intp ostride = dst_step / lane_size;
        // a step that is not a whole number of lanes cannot be described by
        // a lane stride; such layouts are left to the scalar loops
        const int whole_lanes = istride * lane_size == src_step &&
                                ostride * lane_size == dst_step;
        if (whole_lanes &&
            TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
            TO_SIMD_SFX(npyv_storable_stride)(ostride))
        {
            const int in_contig = (istride == 1);
            const int out_contig = (ostride == 1);
            if (in_contig && !out_contig) {
                // contiguous input, non-contiguous output
                TO_SIMD_SFX(simd_unary_cn_negative)(
                    (STYPE*)src, (STYPE*)dst, ostride, count
                );
                goto simd_done;
            }
            if (!in_contig && out_contig) {
                // non-contiguous input, contiguous output
                TO_SIMD_SFX(simd_unary_nc_negative)(
                    (STYPE*)src, istride, (STYPE*)dst, count
                );
                goto simd_done;
            }
        // X86 does better with unrolled scalar for heavy non-contiguous
        #ifndef NPY_HAVE_SSE2
            if (!in_contig && !out_contig) {
                // non-contiguous input and output
                TO_SIMD_SFX(simd_unary_nn_negative)(
                    (STYPE*)src, istride, (STYPE*)dst, ostride, count
                );
                goto simd_done;
            }
        #endif
        }
    }
#endif // TO_SIMD_SFX

#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Scalar loop, unrolled by hand.  An unroll factor of 8 performed
     * best on
     *  - Apple M1 Native / arm64
     *  - Apple M1 Rosetta / SSE42
     *  - iMacPro / AVX512
     */
    #define UNROLL 8
    #if UNROLL != 8
        #error "the loop body below is written out for exactly 8 elements"
    #endif
    for (; count >= UNROLL; count -= UNROLL, src += src_step*UNROLL, dst += dst_step*UNROLL) {
        // each element is loaded and stored before the next one is touched
        const npy_float v0 = *((const npy_float *)(src + 0 * src_step));
        *((npy_float *)(dst + 0 * dst_step)) = scalar_negative(v0);
        const npy_float v1 = *((const npy_float *)(src + 1 * src_step));
        *((npy_float *)(dst + 1 * dst_step)) = scalar_negative(v1);
        const npy_float v2 = *((const npy_float *)(src + 2 * src_step));
        *((npy_float *)(dst + 2 * dst_step)) = scalar_negative(v2);
        const npy_float v3 = *((const npy_float *)(src + 3 * src_step));
        *((npy_float *)(dst + 3 * dst_step)) = scalar_negative(v3);
        const npy_float v4 = *((const npy_float *)(src + 4 * src_step));
        *((npy_float *)(dst + 4 * dst_step)) = scalar_negative(v4);
        const npy_float v5 = *((const npy_float *)(src + 5 * src_step));
        *((npy_float *)(dst + 5 * dst_step)) = scalar_negative(v5);
        const npy_float v6 = *((const npy_float *)(src + 6 * src_step));
        *((npy_float *)(dst + 6 * dst_step)) = scalar_negative(v6);
        const npy_float v7 = *((const npy_float *)(src + 7 * src_step));
        *((npy_float *)(dst + 7 * dst_step)) = scalar_negative(v7);
    }
#endif // NPY_DISABLE_OPTIMIZATION

    // remaining elements (all of them when the unrolled loop is disabled)
    while (count > 0) {
        const npy_float v = *(const npy_float *)src;
        *((npy_float *)dst) = scalar_negative(v);
        src += src_step;
        dst += dst_step;
        --count;
    }
#ifdef TO_SIMD_SFX
simd_done:
    npyv_cleanup();
#endif
    // NOTE(review): presumably discards floating-point status flags raised
    // while negating; confirm against the helper's definition
    npy_clear_floatstatus_barrier((char*)dimensions);
}

#line 257
/*
 * Choose the universal-intrinsics suffix used for npy_double.
 *
 * TO_SIMD_SFX(X) pastes the chosen suffix onto X.  It stays undefined when
 * NPY_SIMD is zero, when NPY_BITSOF_DOUBLE matches none of the widths listed
 * here, or when the width is 32/64 bits and NPY_SIMD_F32/NPY_SIMD_F64 is
 * zero.
 */
#undef TO_SIMD_SFX
#if NPY_SIMD
    #if NPY_BITSOF_DOUBLE == 8
        #define TO_SIMD_SFX(X) X##_f8
    #elif NPY_BITSOF_DOUBLE == 16
        #define TO_SIMD_SFX(X) X##_f16
    #elif NPY_BITSOF_DOUBLE == 32 && NPY_SIMD_F32
        #define TO_SIMD_SFX(X) X##_f32
    #elif NPY_BITSOF_DOUBLE == 64 && NPY_SIMD_F64
        #define TO_SIMD_SFX(X) X##_f64
    #endif
#endif

#line 283
/*
 * Unary inner loop: stores the negation of every npy_double read from the
 * input operand into the output operand.
 *
 * args       - args[0] is the input base pointer, args[1] the output one
 * dimensions - dimensions[0] is the number of elements to process
 * steps      - steps[0] / steps[1] are the input / output steps in bytes
 * func       - unused
 *
 * When TO_SIMD_SFX is defined and is_mem_overlap() reports no overlap, the
 * work is handed to one of the simd_unary_{cc,cn,nc,nn}_negative kernels.
 * Every other case runs the unrolled scalar loop followed by a scalar tail,
 * both of which advance the pointers by the raw byte steps.
 * npy_clear_floatstatus_barrier() is called on every exit path.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip = args[0], *op = args[1];
    npy_intp istep = steps[0], ostep = steps[1],
             len = dimensions[0];
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (!is_mem_overlap(ip, istep, op, ostep, len)) {
        if (IS_UNARY_CONT(npy_double, npy_double)) {
            // no overlap and operands are contiguous
            TO_SIMD_SFX(simd_unary_cc_negative)(
                (STYPE*)ip, (STYPE*)op, len
            );
            goto clear;
        }
    #if 1
        // Keep the lane size signed: dividing the signed byte step by the
        // unsigned result of sizeof would convert a negative step to a huge
        // unsigned value before the division.
        const npy_intp lane_bytes = (npy_intp)sizeof(STYPE);
        const npy_intp istride = istep / lane_bytes;
        const npy_intp ostride = ostep / lane_bytes;
        // The strided kernels count in lanes, so only use them when both
        // byte steps are whole multiples of the lane size; anything else is
        // left to the byte-stepping scalar loops below.
        if (istep % lane_bytes == 0 && ostep % lane_bytes == 0 &&
            TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
            TO_SIMD_SFX(npyv_storable_stride)(ostride))
        {
            if (istride == 1 && ostride != 1) {
                // contiguous input, non-contiguous output
                TO_SIMD_SFX(simd_unary_cn_negative)(
                    (STYPE*)ip, (STYPE*)op, ostride, len
                );
                goto clear;
            }
            else if (istride != 1 && ostride == 1) {
                // non-contiguous input, contiguous output
                TO_SIMD_SFX(simd_unary_nc_negative)(
                    (STYPE*)ip, istride, (STYPE*)op, len
                );
                goto clear;
            }
        // X86 does better with unrolled scalar for heavy non-contiguous
        #ifndef NPY_HAVE_SSE2
            else if (istride != 1 && ostride != 1) {
                // non-contiguous input and output
                TO_SIMD_SFX(simd_unary_nn_negative)(
                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
                );
                goto clear;
            }
        #endif
        }
    #endif // 1
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * scalar unrolls
     * 8x unroll performed best on
     *  - Apple M1 Native / arm64
     *  - Apple M1 Rosetta / SSE42
     *  - iMacPro / AVX512
     */
    #define UNROLL 8
    // Each guarded copy below handles one element of the unrolled group;
    // copies whose index is >= UNROLL are compiled out.
    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
    #line 347
    #if UNROLL > 0
        const npy_double in_0 = *((const npy_double *)(ip + 0 * istep));
        *((npy_double *)(op + 0 * ostep)) = scalar_negative(in_0);
    #endif
    
#line 347
    #if UNROLL > 1
        const npy_double in_1 = *((const npy_double *)(ip + 1 * istep));
        *((npy_double *)(op + 1 * ostep)) = scalar_negative(in_1);
    #endif
    
#line 347
    #if UNROLL > 2
        const npy_double in_2 = *((const npy_double *)(ip + 2 * istep));
        *((npy_double *)(op + 2 * ostep)) = scalar_negative(in_2);
    #endif
    
#line 347
    #if UNROLL > 3
        const npy_double in_3 = *((const npy_double *)(ip + 3 * istep));
        *((npy_double *)(op + 3 * ostep)) = scalar_negative(in_3);
    #endif
    
#line 347
    #if UNROLL > 4
        const npy_double in_4 = *((const npy_double *)(ip + 4 * istep));
        *((npy_double *)(op + 4 * ostep)) = scalar_negative(in_4);
    #endif
    
#line 347
    #if UNROLL > 5
        const npy_double in_5 = *((const npy_double *)(ip + 5 * istep));
        *((npy_double *)(op + 5 * ostep)) = scalar_negative(in_5);
    #endif
    
#line 347
    #if UNROLL > 6
        const npy_double in_6 = *((const npy_double *)(ip + 6 * istep));
        *((npy_double *)(op + 6 * ostep)) = scalar_negative(in_6);
    #endif
    
#line 347
    #if UNROLL > 7
        const npy_double in_7 = *((const npy_double *)(ip + 7 * istep));
        *((npy_double *)(op + 7 * ostep)) = scalar_negative(in_7);
    #endif
    
#line 347
    #if UNROLL > 8
        const npy_double in_8 = *((const npy_double *)(ip + 8 * istep));
        *((npy_double *)(op + 8 * ostep)) = scalar_negative(in_8);
    #endif
    
#line 347
    #if UNROLL > 9
        const npy_double in_9 = *((const npy_double *)(ip + 9 * istep));
        *((npy_double *)(op + 9 * ostep)) = scalar_negative(in_9);
    #endif
    
#line 347
    #if UNROLL > 10
        const npy_double in_10 = *((const npy_double *)(ip + 10 * istep));
        *((npy_double *)(op + 10 * ostep)) = scalar_negative(in_10);
    #endif
    
#line 347
    #if UNROLL > 11
        const npy_double in_11 = *((const npy_double *)(ip + 11 * istep));
        *((npy_double *)(op + 11 * ostep)) = scalar_negative(in_11);
    #endif
    
#line 347
    #if UNROLL > 12
        const npy_double in_12 = *((const npy_double *)(ip + 12 * istep));
        *((npy_double *)(op + 12 * ostep)) = scalar_negative(in_12);
    #endif
    
#line 347
    #if UNROLL > 13
        const npy_double in_13 = *((const npy_double *)(ip + 13 * istep));
        *((npy_double *)(op + 13 * ostep)) = scalar_negative(in_13);
    #endif
    
#line 347
    #if UNROLL > 14
        const npy_double in_14 = *((const npy_double *)(ip + 14 * istep));
        *((npy_double *)(op + 14 * ostep)) = scalar_negative(in_14);
    #endif
    
#line 347
    #if UNROLL > 15
        const npy_double in_15 = *((const npy_double *)(ip + 15 * istep));
        *((npy_double *)(op + 15 * ostep)) = scalar_negative(in_15);
    #endif
    
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // scalar tail: whatever the unrolled loop (if compiled in) left over
    for (; len > 0; --len, ip += istep, op += ostep) {
        *((npy_double *)op) = scalar_negative(*(const npy_double *)ip);
    }
#ifdef TO_SIMD_SFX
clear:
    npyv_cleanup();
#endif
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

#line 257
/*
 * Choose the universal-intrinsics suffix used for npy_longdouble.
 *
 * TO_SIMD_SFX(X) pastes the chosen suffix onto X.  It stays undefined when
 * NPY_SIMD is zero, when NPY_BITSOF_LONGDOUBLE matches none of the widths
 * listed here, or when the width is 32/64 bits and
 * NPY_SIMD_F32/NPY_SIMD_F64 is zero.
 */
#undef TO_SIMD_SFX
#if NPY_SIMD
    #if NPY_BITSOF_LONGDOUBLE == 8
        #define TO_SIMD_SFX(X) X##_f8
    #elif NPY_BITSOF_LONGDOUBLE == 16
        #define TO_SIMD_SFX(X) X##_f16
    #elif NPY_BITSOF_LONGDOUBLE == 32 && NPY_SIMD_F32
        #define TO_SIMD_SFX(X) X##_f32
    #elif NPY_BITSOF_LONGDOUBLE == 64 && NPY_SIMD_F64
        #define TO_SIMD_SFX(X) X##_f64
    #endif
#endif

#line 283
/*
 * Unary inner loop: stores the negation of every npy_longdouble read from
 * the input operand into the output operand.
 *
 * args       - args[0] is the input base pointer, args[1] the output one
 * dimensions - dimensions[0] is the number of elements to process
 * steps      - steps[0] / steps[1] are the input / output steps in bytes
 * func       - unused
 *
 * When TO_SIMD_SFX is defined and is_mem_overlap() reports no overlap, the
 * work is handed to one of the simd_unary_{cc,cn,nc,nn}_negative kernels.
 * Every other case runs the unrolled scalar loop followed by a scalar tail,
 * both of which advance the pointers by the raw byte steps.
 * npy_clear_floatstatus_barrier() is called on every exit path.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip = args[0], *op = args[1];
    npy_intp istep = steps[0], ostep = steps[1],
             len = dimensions[0];
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (!is_mem_overlap(ip, istep, op, ostep, len)) {
        if (IS_UNARY_CONT(npy_longdouble, npy_longdouble)) {
            // no overlap and operands are contiguous
            TO_SIMD_SFX(simd_unary_cc_negative)(
                (STYPE*)ip, (STYPE*)op, len
            );
            goto clear;
        }
    #if 1
        // Keep the lane size signed: dividing the signed byte step by the
        // unsigned result of sizeof would convert a negative step to a huge
        // unsigned value before the division.
        const npy_intp lane_bytes = (npy_intp)sizeof(STYPE);
        const npy_intp istride = istep / lane_bytes;
        const npy_intp ostride = ostep / lane_bytes;
        // The strided kernels count in lanes, so only use them when both
        // byte steps are whole multiples of the lane size; anything else is
        // left to the byte-stepping scalar loops below.
        if (istep % lane_bytes == 0 && ostep % lane_bytes == 0 &&
            TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
            TO_SIMD_SFX(npyv_storable_stride)(ostride))
        {
            if (istride == 1 && ostride != 1) {
                // contiguous input, non-contiguous output
                TO_SIMD_SFX(simd_unary_cn_negative)(
                    (STYPE*)ip, (STYPE*)op, ostride, len
                );
                goto clear;
            }
            else if (istride != 1 && ostride == 1) {
                // non-contiguous input, contiguous output
                TO_SIMD_SFX(simd_unary_nc_negative)(
                    (STYPE*)ip, istride, (STYPE*)op, len
                );
                goto clear;
            }
        // X86 does better with unrolled scalar for heavy non-contiguous
        #ifndef NPY_HAVE_SSE2
            else if (istride != 1 && ostride != 1) {
                // non-contiguous input and output
                TO_SIMD_SFX(simd_unary_nn_negative)(
                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
                );
                goto clear;
            }
        #endif
        }
    #endif // 1
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * scalar unrolls
     * 8x unroll performed best on
     *  - Apple M1 Native / arm64
     *  - Apple M1 Rosetta / SSE42
     *  - iMacPro / AVX512
     */
    #define UNROLL 8
    // Each guarded copy below handles one element of the unrolled group;
    // copies whose index is >= UNROLL are compiled out.
    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
    #line 347
    #if UNROLL > 0
        const npy_longdouble in_0 = *((const npy_longdouble *)(ip + 0 * istep));
        *((npy_longdouble *)(op + 0 * ostep)) = scalar_negative(in_0);
    #endif
    
#line 347
    #if UNROLL > 1
        const npy_longdouble in_1 = *((const npy_longdouble *)(ip + 1 * istep));
        *((npy_longdouble *)(op + 1 * ostep)) = scalar_negative(in_1);
    #endif
    
#line 347
    #if UNROLL > 2
        const npy_longdouble in_2 = *((const npy_longdouble *)(ip + 2 * istep));
        *((npy_longdouble *)(op + 2 * ostep)) = scalar_negative(in_2);
    #endif
    
#line 347
    #if UNROLL > 3
        const npy_longdouble in_3 = *((const npy_longdouble *)(ip + 3 * istep));
        *((npy_longdouble *)(op + 3 * ostep)) = scalar_negative(in_3);
    #endif
    
#line 347
    #if UNROLL > 4
        const npy_longdouble in_4 = *((const npy_longdouble *)(ip + 4 * istep));
        *((npy_longdouble *)(op + 4 * ostep)) = scalar_negative(in_4);
    #endif
    
#line 347
    #if UNROLL > 5
        const npy_longdouble in_5 = *((const npy_longdouble *)(ip + 5 * istep));
        *((npy_longdouble *)(op + 5 * ostep)) = scalar_negative(in_5);
    #endif
    
#line 347
    #if UNROLL > 6
        const npy_longdouble in_6 = *((const npy_longdouble *)(ip + 6 * istep));
        *((npy_longdouble *)(op + 6 * ostep)) = scalar_negative(in_6);
    #endif
    
#line 347
    #if UNROLL > 7
        const npy_longdouble in_7 = *((const npy_longdouble *)(ip + 7 * istep));
        *((npy_longdouble *)(op + 7 * ostep)) = scalar_negative(in_7);
    #endif
    
#line 347
    #if UNROLL > 8
        const npy_longdouble in_8 = *((const npy_longdouble *)(ip + 8 * istep));
        *((npy_longdouble *)(op + 8 * ostep)) = scalar_negative(in_8);
    #endif
    
#line 347
    #if UNROLL > 9
        const npy_longdouble in_9 = *((const npy_longdouble *)(ip + 9 * istep));
        *((npy_longdouble *)(op + 9 * ostep)) = scalar_negative(in_9);
    #endif
    
#line 347
    #if UNROLL > 10
        const npy_longdouble in_10 = *((const npy_longdouble *)(ip + 10 * istep));
        *((npy_longdouble *)(op + 10 * ostep)) = scalar_negative(in_10);
    #endif
    
#line 347
    #if UNROLL > 11
        const npy_longdouble in_11 = *((const npy_longdouble *)(ip + 11 * istep));
        *((npy_longdouble *)(op + 11 * ostep)) = scalar_negative(in_11);
    #endif
    
#line 347
    #if UNROLL > 12
        const npy_longdouble in_12 = *((const npy_longdouble *)(ip + 12 * istep));
        *((npy_longdouble *)(op + 12 * ostep)) = scalar_negative(in_12);
    #endif
    
#line 347
    #if UNROLL > 13
        const npy_longdouble in_13 = *((const npy_longdouble *)(ip + 13 * istep));
        *((npy_longdouble *)(op + 13 * ostep)) = scalar_negative(in_13);
    #endif
    
#line 347
    #if UNROLL > 14
        const npy_longdouble in_14 = *((const npy_longdouble *)(ip + 14 * istep));
        *((npy_longdouble *)(op + 14 * ostep)) = scalar_negative(in_14);
    #endif
    
#line 347
    #if UNROLL > 15
        const npy_longdouble in_15 = *((const npy_longdouble *)(ip + 15 * istep));
        *((npy_longdouble *)(op + 15 * ostep)) = scalar_negative(in_15);
    #endif
    
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // scalar tail: whatever the unrolled loop (if compiled in) left over
    for (; len > 0; --len, ip += istep, op += ostep) {
        *((npy_longdouble *)op) = scalar_negative(*(const npy_longdouble *)ip);
    }
#ifdef TO_SIMD_SFX
clear:
    npyv_cleanup();
#endif
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


// Remove NEGATIVE_CONTIG_ONLY so it is not visible to code after this point.
// NOTE(review): the macro's definition lies outside this excerpt; it
// presumably configures the negative kernels above -- confirm.
#undef NEGATIVE_CONTIG_ONLY

