#line 1 "numpy/core/src/umath/loops_minmax.dispatch.c.src"

/*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 **       Changes should be made to the original source (.src) file         **
 *****************************************************************************
 */

#line 1
/*@targets
 ** $maxopt baseline
 ** neon asimd
 ** sse2 avx2 avx512_skx
 ** vsx2
 ** vx vxe
 **/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

/*******************************************************************************
 ** Scalar intrinsics
 ******************************************************************************/
// signed/unsigned int
// Every macro argument is parenthesized so that compound expressions
// (e.g. `x & mask`) expand with the intended precedence. Arguments may
// still be evaluated more than once, so avoid side effects in them.
#define scalar_max_i(A, B) (((A) > (B)) ? (A) : (B))
#define scalar_min_i(A, B) (((A) < (B)) ? (A) : (B))
// fp, propagates NaNs:
// a NaN in A is returned through the npy_isnan() test; a NaN in B makes
// the comparison false, so B (the NaN) is returned.
#define scalar_max(A, B) (((A) >= (B) || npy_isnan(A)) ? (A) : (B))
#define scalar_max_f scalar_max
#define scalar_max_d scalar_max
#define scalar_max_l scalar_max
#define scalar_min(A, B) (((A) <= (B) || npy_isnan(A)) ? (A) : (B))
#define scalar_min_f scalar_min
#define scalar_min_d scalar_min
#define scalar_min_l scalar_min
// fp, ignores NaNs (C99 fmax/fmin family)
#define scalar_maxp_f fmaxf
#define scalar_maxp_d fmax
#define scalar_maxp_l fmaxl
#define scalar_minp_f fminf
#define scalar_minp_d fmin
#define scalar_minp_l fminl

// Special-case optimization for the NaN-propagating fp scalar variants,
// since C99 provides no NaN-propagating max/min.
#ifndef NPY_DISABLE_OPTIMIZATION
#line 52
#line 56
#ifdef NPY_HAVE_SSE2
#undef scalar_max_f
// SSE replacement for the generic NaN-propagating scalar_max macro
// (single precision).
NPY_FINLINE npy_float scalar_max_f(npy_float a, npy_float b) {
    const __m128 lhs = _mm_set_ss(a);
    const __m128 rhs = _mm_set_ss(b);
    // The x86 max instruction yields its second operand when a NaN is
    // involved, so a NaN in `b` already comes through here; a NaN in `a`
    // is restored by the select below.
    const __m128 hw_max = _mm_max_ss(lhs, rhs);
    // Low lane: all-ones when `a` is not NaN, all-zeros when it is.
    const __m128 a_ordered = _mm_cmpord_ss(lhs, lhs);
    #ifdef NPY_HAVE_SSE41
    const __m128 picked = _mm_blendv_ps(lhs, hw_max, a_ordered);
    #else
    // Bitwise select without blendv: lhs ^ ((lhs ^ hw_max) & a_ordered).
    const __m128 delta = _mm_and_ps(_mm_xor_ps(lhs, hw_max), a_ordered);
    const __m128 picked = _mm_xor_ps(lhs, delta);
    #endif
    return _mm_cvtss_f32(picked);
}
#endif // SSE2
#ifdef __aarch64__
#undef scalar_max_f
// AArch64 replacement for the generic NaN-propagating scalar_max macro
// (single precision): a single scalar `fmax` issued through inline assembly.
NPY_FINLINE npy_float scalar_max_f(npy_float a, npy_float b) {
    npy_float result = 0;
    __asm(
        // `%s` prints the operand as its single-precision register name.
        "fmax %s[result], %s[a], %s[b]"
        : [result] "=w" (result)        // output, FP/SIMD register
        : [a] "w" (a), [b] "w" (b)      // inputs, FP/SIMD registers
    );
    return result;
}
#endif // __aarch64__

#line 56
#ifdef NPY_HAVE_SSE2
#undef scalar_min_f
// SSE replacement for the generic NaN-propagating scalar_min macro
// (single precision).
NPY_FINLINE npy_float scalar_min_f(npy_float a, npy_float b) {
    const __m128 lhs = _mm_set_ss(a);
    const __m128 rhs = _mm_set_ss(b);
    // The x86 min instruction yields its second operand when a NaN is
    // involved, so a NaN in `b` already comes through here; a NaN in `a`
    // is restored by the select below.
    const __m128 hw_min = _mm_min_ss(lhs, rhs);
    // Low lane: all-ones when `a` is not NaN, all-zeros when it is.
    const __m128 a_ordered = _mm_cmpord_ss(lhs, lhs);
    #ifdef NPY_HAVE_SSE41
    const __m128 picked = _mm_blendv_ps(lhs, hw_min, a_ordered);
    #else
    // Bitwise select without blendv: lhs ^ ((lhs ^ hw_min) & a_ordered).
    const __m128 delta = _mm_and_ps(_mm_xor_ps(lhs, hw_min), a_ordered);
    const __m128 picked = _mm_xor_ps(lhs, delta);
    #endif
    return _mm_cvtss_f32(picked);
}
#endif // SSE2
#ifdef __aarch64__
#undef scalar_min_f
// AArch64 replacement for the generic NaN-propagating scalar_min macro
// (single precision): a single scalar `fmin` issued through inline assembly.
NPY_FINLINE npy_float scalar_min_f(npy_float a, npy_float b) {
    npy_float result = 0;
    __asm(
        // `%s` prints the operand as its single-precision register name.
        "fmin %s[result], %s[a], %s[b]"
        : [result] "=w" (result)        // output, FP/SIMD register
        : [a] "w" (a), [b] "w" (b)      // inputs, FP/SIMD registers
    );
    return result;
}
#endif // __aarch64__


#line 52
#line 56
#ifdef NPY_HAVE_SSE2
#undef scalar_max_d
// SSE2 replacement for the generic NaN-propagating scalar_max macro
// (double precision).
NPY_FINLINE npy_double scalar_max_d(npy_double a, npy_double b) {
    const __m128d lhs = _mm_set_sd(a);
    const __m128d rhs = _mm_set_sd(b);
    // The x86 max instruction yields its second operand when a NaN is
    // involved, so a NaN in `b` already comes through here; a NaN in `a`
    // is restored by the select below.
    const __m128d hw_max = _mm_max_sd(lhs, rhs);
    // Low lane: all-ones when `a` is not NaN, all-zeros when it is.
    const __m128d a_ordered = _mm_cmpord_sd(lhs, lhs);
    #ifdef NPY_HAVE_SSE41
    const __m128d picked = _mm_blendv_pd(lhs, hw_max, a_ordered);
    #else
    // Bitwise select without blendv: lhs ^ ((lhs ^ hw_max) & a_ordered).
    const __m128d delta = _mm_and_pd(_mm_xor_pd(lhs, hw_max), a_ordered);
    const __m128d picked = _mm_xor_pd(lhs, delta);
    #endif
    return _mm_cvtsd_f64(picked);
}
#endif // SSE2
#ifdef __aarch64__
#undef scalar_max_d
// AArch64 replacement for the generic NaN-propagating scalar_max macro
// (double precision): a single scalar `fmax` issued through inline assembly.
NPY_FINLINE npy_double scalar_max_d(npy_double a, npy_double b) {
    npy_double result = 0;
    __asm(
        // `%d` prints the operand as its double-precision register name.
        "fmax %d[result], %d[a], %d[b]"
        : [result] "=w" (result)        // output, FP/SIMD register
        : [a] "w" (a), [b] "w" (b)      // inputs, FP/SIMD registers
    );
    return result;
}
#endif // __aarch64__

#line 56
#ifdef NPY_HAVE_SSE2
#undef scalar_min_d
// SSE2 replacement for the generic NaN-propagating scalar_min macro
// (double precision).
NPY_FINLINE npy_double scalar_min_d(npy_double a, npy_double b) {
    const __m128d lhs = _mm_set_sd(a);
    const __m128d rhs = _mm_set_sd(b);
    // The x86 min instruction yields its second operand when a NaN is
    // involved, so a NaN in `b` already comes through here; a NaN in `a`
    // is restored by the select below.
    const __m128d hw_min = _mm_min_sd(lhs, rhs);
    // Low lane: all-ones when `a` is not NaN, all-zeros when it is.
    const __m128d a_ordered = _mm_cmpord_sd(lhs, lhs);
    #ifdef NPY_HAVE_SSE41
    const __m128d picked = _mm_blendv_pd(lhs, hw_min, a_ordered);
    #else
    // Bitwise select without blendv: lhs ^ ((lhs ^ hw_min) & a_ordered).
    const __m128d delta = _mm_and_pd(_mm_xor_pd(lhs, hw_min), a_ordered);
    const __m128d picked = _mm_xor_pd(lhs, delta);
    #endif
    return _mm_cvtsd_f64(picked);
}
#endif // SSE2
#ifdef __aarch64__
#undef scalar_min_d
// AArch64 replacement for the generic NaN-propagating scalar_min macro
// (double precision): a single scalar `fmin` issued through inline assembly.
NPY_FINLINE npy_double scalar_min_d(npy_double a, npy_double b) {
    npy_double result = 0;
    __asm(
        // `%d` prints the operand as its double-precision register name.
        "fmin %d[result], %d[a], %d[b]"
        : [result] "=w" (result)        // output, FP/SIMD register
        : [a] "w" (a), [b] "w" (b)      // inputs, FP/SIMD registers
    );
    return result;
}
#endif // __aarch64__


#endif // NPY_DISABLE_OPTIMIZATION
// When long double has the same bit width as double, route the long double
// scalar helpers (`_l`) to the double ones (`_d`).
#if NPY_BITSOF_DOUBLE == NPY_BITSOF_LONGDOUBLE
#line 92
    #undef scalar_max_l
    #define scalar_max_l scalar_max_d

#line 92
    #undef scalar_min_l
    #define scalar_min_l scalar_min_d

#line 92
    #undef scalar_maxp_l
    #define scalar_maxp_l scalar_maxp_d

#line 92
    #undef scalar_minp_l
    #define scalar_minp_l scalar_minp_d

#endif // NPY_BITSOF_DOUBLE == NPY_BITSOF_LONGDOUBLE

/*******************************************************************************
 ** Defining the SIMD kernels
 ******************************************************************************/
#line 106
#line 110
#define SCALAR_OP scalar_max_i
#if NPY_SIMD && (!0 || (0 && 0))

#if 0 && !0
    #define V_INTRIN npyv_maxn_s8 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxn_s8
#else
    #define V_INTRIN npyv_max_s8
    #define V_REDUCE_INTRIN npyv_reduce_max_s8
#endif

// Contiguous reduction: folds the `len` elements starting at `ip` together
// with the incoming value of op1[0], using the V_INTRIN / V_REDUCE_INTRIN /
// SCALAR_OP macros defined just above, and stores the result in op1[0].
// A `len` below 1 leaves op1[0] untouched.
static inline void
simd_reduce_c_max_s8(const npyv_lanetype_s8 *ip, npyv_lanetype_s8 *op1, npy_intp len)
{
    if (len <= 0) {
        return;
    }
    const int lanes = npyv_nlanes_s8;
    const int block = 8 * lanes;
    // Seed every lane with the current output so it takes part in the fold.
    npyv_s8 vacc = npyv_setall_s8(op1[0]);

    // Bulk part: eight vectors per trip, merged pairwise before touching vacc.
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_s8 a0 = npyv_load_s8(ip);
        npyv_s8 a1 = npyv_load_s8(ip + lanes);
        npyv_s8 a2 = npyv_load_s8(ip + 2 * lanes);
        npyv_s8 a3 = npyv_load_s8(ip + 3 * lanes);
        npyv_s8 a4 = npyv_load_s8(ip + 4 * lanes);
        npyv_s8 a5 = npyv_load_s8(ip + 5 * lanes);
        npyv_s8 a6 = npyv_load_s8(ip + 6 * lanes);
        npyv_s8 a7 = npyv_load_s8(ip + 7 * lanes);

        npyv_s8 p01 = V_INTRIN(a0, a1);
        npyv_s8 p23 = V_INTRIN(a2, a3);
        npyv_s8 p45 = V_INTRIN(a4, a5);
        npyv_s8 p67 = V_INTRIN(a6, a7);
        npyv_s8 q03 = V_INTRIN(p01, p23);
        npyv_s8 q47 = V_INTRIN(p45, p67);
        vacc = V_INTRIN(vacc, V_INTRIN(q03, q47));

        ip += block;
        len -= block;
    }
    // One vector at a time while a full vector is still available.
    while (len >= lanes) {
        vacc = V_INTRIN(vacc, npyv_load_s8(ip));
        ip += lanes;
        len -= lanes;
    }
    // Collapse the lanes to one value, then fold in the leftover scalars.
    npyv_lanetype_s8 folded = V_REDUCE_INTRIN(vacc);
    while (len > 0) {
        const npyv_lanetype_s8 cur = *ip;
        folded = SCALAR_OP(folded, cur);
        ++ip;
        --len;
    }
    op1[0] = folded;
}

// Contiguous element-wise kernel: op1[i] = OP(ip1[i], ip2[i]) for every
// i in [0, len), with OP supplied by the V_INTRIN (vector part) and
// SCALAR_OP (scalar tail) macros defined just above.
static inline void
simd_binary_ccc_max_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2,
                                     npyv_lanetype_s8 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_s8;
#if NPY_SIMD_WIDTH == 128
    // 6 vectors per trip: this unroll was chosen for best results on Apple M1.
    const int unroll = 6;
#else
    // 2 vectors per trip, to avoid a memory bandwidth bottleneck.
    const int unroll = 2;
#endif
    const int chunk = unroll * lanes;
    npy_intp pos = 0;

    // Unrolled part. Within one trip every load is issued before any store.
    while (pos + chunk <= len) {
        const npyv_lanetype_s8 *lhs = ip1 + pos;
        const npyv_lanetype_s8 *rhs = ip2 + pos;
        npyv_lanetype_s8 *dst = op1 + pos;

        npyv_s8 x0 = npyv_load_s8(lhs);
        npyv_s8 x1 = npyv_load_s8(lhs + lanes);
        npyv_s8 y0 = npyv_load_s8(rhs);
        npyv_s8 y1 = npyv_load_s8(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s8 x2 = npyv_load_s8(lhs + 2 * lanes);
        npyv_s8 x3 = npyv_load_s8(lhs + 3 * lanes);
        npyv_s8 x4 = npyv_load_s8(lhs + 4 * lanes);
        npyv_s8 x5 = npyv_load_s8(lhs + 5 * lanes);
        npyv_s8 y2 = npyv_load_s8(rhs + 2 * lanes);
        npyv_s8 y3 = npyv_load_s8(rhs + 3 * lanes);
        npyv_s8 y4 = npyv_load_s8(rhs + 4 * lanes);
        npyv_s8 y5 = npyv_load_s8(rhs + 5 * lanes);
        npyv_s8 r2 = V_INTRIN(x2, y2);
        npyv_s8 r3 = V_INTRIN(x3, y3);
        npyv_s8 r4 = V_INTRIN(x4, y4);
        npyv_s8 r5 = V_INTRIN(x5, y5);
    #endif
        npyv_s8 r0 = V_INTRIN(x0, y0);
        npyv_s8 r1 = V_INTRIN(x1, y1);

        npyv_store_s8(dst, r0);
        npyv_store_s8(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_s8(dst + 2 * lanes, r2);
        npyv_store_s8(dst + 3 * lanes, r3);
        npyv_store_s8(dst + 4 * lanes, r4);
        npyv_store_s8(dst + 5 * lanes, r5);
    #endif
        pos += chunk;
    }
    // Whole vectors that did not fill an unrolled trip.
    while (pos + lanes <= len) {
        npyv_s8 x = npyv_load_s8(ip1 + pos);
        npyv_s8 y = npyv_load_s8(ip2 + pos);
        npyv_s8 r = V_INTRIN(x, y);
        npyv_store_s8(op1 + pos, r);
        pos += lanes;
    }
    // Scalar tail for the elements that do not fill a vector.
    while (pos < len) {
        const npyv_lanetype_s8 x = ip1[pos];
        const npyv_lanetype_s8 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
// Strided element-wise kernel: applies the operation to `len` element pairs,
// stepping ip1/ip2/op1 by sip1/sip2/sop1 elements (strides are applied
// directly to the typed pointers, so they count elements, not bytes).
// NOTE: the enclosing `#if 0 && !defined(NPY_HAVE_NEON)` compiles this
// function out for this lane type.
static inline void
simd_binary_max_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1,
                           const npyv_lanetype_s8 *ip2, npy_intp sip2,
                                 npyv_lanetype_s8 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_s8;
    // Vector part: an operand with stride 1 uses the contiguous access,
    // any other stride goes through the strided loadn/storen access.
    while (len >= lanes) {
        npyv_s8 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_s8(ip1, sip1);
        } else {
            lhs = npyv_load_s8(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_s8(ip2, sip2);
        } else {
            rhs = npyv_load_s8(ip2);
        }
        npyv_s8 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_s8(op1, sop1, res);
        } else {
            npyv_store_s8(op1, res);
        }
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
        len -= lanes;
    }
    // Scalar tail for the elements that do not fill a vector.
    while (len > 0) {
        const npyv_lanetype_s8 x = *ip1;
        const npyv_lanetype_s8 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
        --len;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_min_i
#if NPY_SIMD && (!0 || (0 && 0))

#if 0 && !0
    #define V_INTRIN npyv_minn_s8 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minn_s8
#else
    #define V_INTRIN npyv_min_s8
    #define V_REDUCE_INTRIN npyv_reduce_min_s8
#endif

// Contiguous reduction: folds the `len` elements starting at `ip` together
// with the incoming value of op1[0], using the V_INTRIN / V_REDUCE_INTRIN /
// SCALAR_OP macros defined just above, and stores the result in op1[0].
// A `len` below 1 leaves op1[0] untouched.
static inline void
simd_reduce_c_min_s8(const npyv_lanetype_s8 *ip, npyv_lanetype_s8 *op1, npy_intp len)
{
    if (len <= 0) {
        return;
    }
    const int lanes = npyv_nlanes_s8;
    const int block = 8 * lanes;
    // Seed every lane with the current output so it takes part in the fold.
    npyv_s8 vacc = npyv_setall_s8(op1[0]);

    // Bulk part: eight vectors per trip, merged pairwise before touching vacc.
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_s8 a0 = npyv_load_s8(ip);
        npyv_s8 a1 = npyv_load_s8(ip + lanes);
        npyv_s8 a2 = npyv_load_s8(ip + 2 * lanes);
        npyv_s8 a3 = npyv_load_s8(ip + 3 * lanes);
        npyv_s8 a4 = npyv_load_s8(ip + 4 * lanes);
        npyv_s8 a5 = npyv_load_s8(ip + 5 * lanes);
        npyv_s8 a6 = npyv_load_s8(ip + 6 * lanes);
        npyv_s8 a7 = npyv_load_s8(ip + 7 * lanes);

        npyv_s8 p01 = V_INTRIN(a0, a1);
        npyv_s8 p23 = V_INTRIN(a2, a3);
        npyv_s8 p45 = V_INTRIN(a4, a5);
        npyv_s8 p67 = V_INTRIN(a6, a7);
        npyv_s8 q03 = V_INTRIN(p01, p23);
        npyv_s8 q47 = V_INTRIN(p45, p67);
        vacc = V_INTRIN(vacc, V_INTRIN(q03, q47));

        ip += block;
        len -= block;
    }
    // One vector at a time while a full vector is still available.
    while (len >= lanes) {
        vacc = V_INTRIN(vacc, npyv_load_s8(ip));
        ip += lanes;
        len -= lanes;
    }
    // Collapse the lanes to one value, then fold in the leftover scalars.
    npyv_lanetype_s8 folded = V_REDUCE_INTRIN(vacc);
    while (len > 0) {
        const npyv_lanetype_s8 cur = *ip;
        folded = SCALAR_OP(folded, cur);
        ++ip;
        --len;
    }
    op1[0] = folded;
}

// Contiguous element-wise kernel: op1[i] = OP(ip1[i], ip2[i]) for every
// i in [0, len), with OP supplied by the V_INTRIN (vector part) and
// SCALAR_OP (scalar tail) macros defined just above.
static inline void
simd_binary_ccc_min_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2,
                                     npyv_lanetype_s8 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_s8;
#if NPY_SIMD_WIDTH == 128
    // 6 vectors per trip: this unroll was chosen for best results on Apple M1.
    const int unroll = 6;
#else
    // 2 vectors per trip, to avoid a memory bandwidth bottleneck.
    const int unroll = 2;
#endif
    const int chunk = unroll * lanes;
    npy_intp pos = 0;

    // Unrolled part. Within one trip every load is issued before any store.
    while (pos + chunk <= len) {
        const npyv_lanetype_s8 *lhs = ip1 + pos;
        const npyv_lanetype_s8 *rhs = ip2 + pos;
        npyv_lanetype_s8 *dst = op1 + pos;

        npyv_s8 x0 = npyv_load_s8(lhs);
        npyv_s8 x1 = npyv_load_s8(lhs + lanes);
        npyv_s8 y0 = npyv_load_s8(rhs);
        npyv_s8 y1 = npyv_load_s8(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s8 x2 = npyv_load_s8(lhs + 2 * lanes);
        npyv_s8 x3 = npyv_load_s8(lhs + 3 * lanes);
        npyv_s8 x4 = npyv_load_s8(lhs + 4 * lanes);
        npyv_s8 x5 = npyv_load_s8(lhs + 5 * lanes);
        npyv_s8 y2 = npyv_load_s8(rhs + 2 * lanes);
        npyv_s8 y3 = npyv_load_s8(rhs + 3 * lanes);
        npyv_s8 y4 = npyv_load_s8(rhs + 4 * lanes);
        npyv_s8 y5 = npyv_load_s8(rhs + 5 * lanes);
        npyv_s8 r2 = V_INTRIN(x2, y2);
        npyv_s8 r3 = V_INTRIN(x3, y3);
        npyv_s8 r4 = V_INTRIN(x4, y4);
        npyv_s8 r5 = V_INTRIN(x5, y5);
    #endif
        npyv_s8 r0 = V_INTRIN(x0, y0);
        npyv_s8 r1 = V_INTRIN(x1, y1);

        npyv_store_s8(dst, r0);
        npyv_store_s8(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_s8(dst + 2 * lanes, r2);
        npyv_store_s8(dst + 3 * lanes, r3);
        npyv_store_s8(dst + 4 * lanes, r4);
        npyv_store_s8(dst + 5 * lanes, r5);
    #endif
        pos += chunk;
    }
    // Whole vectors that did not fill an unrolled trip.
    while (pos + lanes <= len) {
        npyv_s8 x = npyv_load_s8(ip1 + pos);
        npyv_s8 y = npyv_load_s8(ip2 + pos);
        npyv_s8 r = V_INTRIN(x, y);
        npyv_store_s8(op1 + pos, r);
        pos += lanes;
    }
    // Scalar tail for the elements that do not fill a vector.
    while (pos < len) {
        const npyv_lanetype_s8 x = ip1[pos];
        const npyv_lanetype_s8 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
// Strided element-wise kernel: applies the operation to `len` element pairs,
// stepping ip1/ip2/op1 by sip1/sip2/sop1 elements (strides are applied
// directly to the typed pointers, so they count elements, not bytes).
// NOTE: the enclosing `#if 0 && !defined(NPY_HAVE_NEON)` compiles this
// function out for this lane type.
static inline void
simd_binary_min_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1,
                           const npyv_lanetype_s8 *ip2, npy_intp sip2,
                                 npyv_lanetype_s8 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_s8;
    // Vector part: an operand with stride 1 uses the contiguous access,
    // any other stride goes through the strided loadn/storen access.
    while (len >= lanes) {
        npyv_s8 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_s8(ip1, sip1);
        } else {
            lhs = npyv_load_s8(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_s8(ip2, sip2);
        } else {
            rhs = npyv_load_s8(ip2);
        }
        npyv_s8 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_s8(op1, sop1, res);
        } else {
            npyv_store_s8(op1, res);
        }
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
        len -= lanes;
    }
    // Scalar tail for the elements that do not fill a vector.
    while (len > 0) {
        const npyv_lanetype_s8 x = *ip1;
        const npyv_lanetype_s8 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
        --len;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_maxp_i
#if NPY_SIMD && (!1 || (0 && 1))

#if 0 && !1
    #define V_INTRIN npyv_maxpn_s8 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxpn_s8
#else
    #define V_INTRIN npyv_maxp_s8
    #define V_REDUCE_INTRIN npyv_reduce_maxp_s8
#endif

// Contiguous reduction: folds the `len` elements starting at `ip` together
// with the incoming value of op1[0], using the V_INTRIN / V_REDUCE_INTRIN /
// SCALAR_OP macros defined just above, and stores the result in op1[0].
// A `len` below 1 leaves op1[0] untouched.
// NOTE: the enclosing `#if NPY_SIMD && (!1 || (0 && 1))` is always false,
// so this instantiation is never compiled.
static inline void
simd_reduce_c_maxp_s8(const npyv_lanetype_s8 *ip, npyv_lanetype_s8 *op1, npy_intp len)
{
    if (len <= 0) {
        return;
    }
    const int lanes = npyv_nlanes_s8;
    const int block = 8 * lanes;
    // Seed every lane with the current output so it takes part in the fold.
    npyv_s8 vacc = npyv_setall_s8(op1[0]);

    // Bulk part: eight vectors per trip, merged pairwise before touching vacc.
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_s8 a0 = npyv_load_s8(ip);
        npyv_s8 a1 = npyv_load_s8(ip + lanes);
        npyv_s8 a2 = npyv_load_s8(ip + 2 * lanes);
        npyv_s8 a3 = npyv_load_s8(ip + 3 * lanes);
        npyv_s8 a4 = npyv_load_s8(ip + 4 * lanes);
        npyv_s8 a5 = npyv_load_s8(ip + 5 * lanes);
        npyv_s8 a6 = npyv_load_s8(ip + 6 * lanes);
        npyv_s8 a7 = npyv_load_s8(ip + 7 * lanes);

        npyv_s8 p01 = V_INTRIN(a0, a1);
        npyv_s8 p23 = V_INTRIN(a2, a3);
        npyv_s8 p45 = V_INTRIN(a4, a5);
        npyv_s8 p67 = V_INTRIN(a6, a7);
        npyv_s8 q03 = V_INTRIN(p01, p23);
        npyv_s8 q47 = V_INTRIN(p45, p67);
        vacc = V_INTRIN(vacc, V_INTRIN(q03, q47));

        ip += block;
        len -= block;
    }
    // One vector at a time while a full vector is still available.
    while (len >= lanes) {
        vacc = V_INTRIN(vacc, npyv_load_s8(ip));
        ip += lanes;
        len -= lanes;
    }
    // Collapse the lanes to one value, then fold in the leftover scalars.
    npyv_lanetype_s8 folded = V_REDUCE_INTRIN(vacc);
    while (len > 0) {
        const npyv_lanetype_s8 cur = *ip;
        folded = SCALAR_OP(folded, cur);
        ++ip;
        --len;
    }
    op1[0] = folded;
}

// Contiguous element-wise kernel: op1[i] = OP(ip1[i], ip2[i]) for every
// i in [0, len), with OP supplied by the V_INTRIN (vector part) and
// SCALAR_OP (scalar tail) macros defined just above.
// NOTE: the enclosing `#if NPY_SIMD && (!1 || (0 && 1))` is always false,
// so this instantiation is never compiled.
static inline void
simd_binary_ccc_maxp_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2,
                                     npyv_lanetype_s8 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_s8;
#if NPY_SIMD_WIDTH == 128
    // 6 vectors per trip: this unroll was chosen for best results on Apple M1.
    const int unroll = 6;
#else
    // 2 vectors per trip, to avoid a memory bandwidth bottleneck.
    const int unroll = 2;
#endif
    const int chunk = unroll * lanes;
    npy_intp pos = 0;

    // Unrolled part. Within one trip every load is issued before any store.
    while (pos + chunk <= len) {
        const npyv_lanetype_s8 *lhs = ip1 + pos;
        const npyv_lanetype_s8 *rhs = ip2 + pos;
        npyv_lanetype_s8 *dst = op1 + pos;

        npyv_s8 x0 = npyv_load_s8(lhs);
        npyv_s8 x1 = npyv_load_s8(lhs + lanes);
        npyv_s8 y0 = npyv_load_s8(rhs);
        npyv_s8 y1 = npyv_load_s8(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s8 x2 = npyv_load_s8(lhs + 2 * lanes);
        npyv_s8 x3 = npyv_load_s8(lhs + 3 * lanes);
        npyv_s8 x4 = npyv_load_s8(lhs + 4 * lanes);
        npyv_s8 x5 = npyv_load_s8(lhs + 5 * lanes);
        npyv_s8 y2 = npyv_load_s8(rhs + 2 * lanes);
        npyv_s8 y3 = npyv_load_s8(rhs + 3 * lanes);
        npyv_s8 y4 = npyv_load_s8(rhs + 4 * lanes);
        npyv_s8 y5 = npyv_load_s8(rhs + 5 * lanes);
        npyv_s8 r2 = V_INTRIN(x2, y2);
        npyv_s8 r3 = V_INTRIN(x3, y3);
        npyv_s8 r4 = V_INTRIN(x4, y4);
        npyv_s8 r5 = V_INTRIN(x5, y5);
    #endif
        npyv_s8 r0 = V_INTRIN(x0, y0);
        npyv_s8 r1 = V_INTRIN(x1, y1);

        npyv_store_s8(dst, r0);
        npyv_store_s8(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_s8(dst + 2 * lanes, r2);
        npyv_store_s8(dst + 3 * lanes, r3);
        npyv_store_s8(dst + 4 * lanes, r4);
        npyv_store_s8(dst + 5 * lanes, r5);
    #endif
        pos += chunk;
    }
    // Whole vectors that did not fill an unrolled trip.
    while (pos + lanes <= len) {
        npyv_s8 x = npyv_load_s8(ip1 + pos);
        npyv_s8 y = npyv_load_s8(ip2 + pos);
        npyv_s8 r = V_INTRIN(x, y);
        npyv_store_s8(op1 + pos, r);
        pos += lanes;
    }
    // Scalar tail for the elements that do not fill a vector.
    while (pos < len) {
        const npyv_lanetype_s8 x = ip1[pos];
        const npyv_lanetype_s8 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
// Strided element-wise kernel: applies the operation to `len` element pairs,
// stepping ip1/ip2/op1 by sip1/sip2/sop1 elements (strides are applied
// directly to the typed pointers, so they count elements, not bytes).
// NOTE: the enclosing `#if 0 && !defined(NPY_HAVE_NEON)` compiles this
// function out for this lane type.
static inline void
simd_binary_maxp_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1,
                           const npyv_lanetype_s8 *ip2, npy_intp sip2,
                                 npyv_lanetype_s8 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_s8;
    // Vector part: an operand with stride 1 uses the contiguous access,
    // any other stride goes through the strided loadn/storen access.
    while (len >= lanes) {
        npyv_s8 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_s8(ip1, sip1);
        } else {
            lhs = npyv_load_s8(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_s8(ip2, sip2);
        } else {
            rhs = npyv_load_s8(ip2);
        }
        npyv_s8 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_s8(op1, sop1, res);
        } else {
            npyv_store_s8(op1, res);
        }
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
        len -= lanes;
    }
    // Scalar tail for the elements that do not fill a vector.
    while (len > 0) {
        const npyv_lanetype_s8 x = *ip1;
        const npyv_lanetype_s8 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
        --len;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_minp_i
#if NPY_SIMD && (!1 || (0 && 1))

#if 0 && !1
    #define V_INTRIN npyv_minpn_s8 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minpn_s8
#else
    #define V_INTRIN npyv_minp_s8
    #define V_REDUCE_INTRIN npyv_reduce_minp_s8
#endif

// Contiguous reduction: folds the `len` elements starting at `ip` together
// with the incoming value of op1[0], using the V_INTRIN / V_REDUCE_INTRIN /
// SCALAR_OP macros defined just above, and stores the result in op1[0].
// A `len` below 1 leaves op1[0] untouched.
// NOTE: the enclosing `#if NPY_SIMD && (!1 || (0 && 1))` is always false,
// so this instantiation is never compiled.
static inline void
simd_reduce_c_minp_s8(const npyv_lanetype_s8 *ip, npyv_lanetype_s8 *op1, npy_intp len)
{
    if (len <= 0) {
        return;
    }
    const int lanes = npyv_nlanes_s8;
    const int block = 8 * lanes;
    // Seed every lane with the current output so it takes part in the fold.
    npyv_s8 vacc = npyv_setall_s8(op1[0]);

    // Bulk part: eight vectors per trip, merged pairwise before touching vacc.
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_s8 a0 = npyv_load_s8(ip);
        npyv_s8 a1 = npyv_load_s8(ip + lanes);
        npyv_s8 a2 = npyv_load_s8(ip + 2 * lanes);
        npyv_s8 a3 = npyv_load_s8(ip + 3 * lanes);
        npyv_s8 a4 = npyv_load_s8(ip + 4 * lanes);
        npyv_s8 a5 = npyv_load_s8(ip + 5 * lanes);
        npyv_s8 a6 = npyv_load_s8(ip + 6 * lanes);
        npyv_s8 a7 = npyv_load_s8(ip + 7 * lanes);

        npyv_s8 p01 = V_INTRIN(a0, a1);
        npyv_s8 p23 = V_INTRIN(a2, a3);
        npyv_s8 p45 = V_INTRIN(a4, a5);
        npyv_s8 p67 = V_INTRIN(a6, a7);
        npyv_s8 q03 = V_INTRIN(p01, p23);
        npyv_s8 q47 = V_INTRIN(p45, p67);
        vacc = V_INTRIN(vacc, V_INTRIN(q03, q47));

        ip += block;
        len -= block;
    }
    // One vector at a time while a full vector is still available.
    while (len >= lanes) {
        vacc = V_INTRIN(vacc, npyv_load_s8(ip));
        ip += lanes;
        len -= lanes;
    }
    // Collapse the lanes to one value, then fold in the leftover scalars.
    npyv_lanetype_s8 folded = V_REDUCE_INTRIN(vacc);
    while (len > 0) {
        const npyv_lanetype_s8 cur = *ip;
        folded = SCALAR_OP(folded, cur);
        ++ip;
        --len;
    }
    op1[0] = folded;
}

// Contiguous element-wise kernel: op1[i] = OP(ip1[i], ip2[i]) for every
// i in [0, len), with OP supplied by the V_INTRIN (vector part) and
// SCALAR_OP (scalar tail) macros defined just above.
// NOTE: the enclosing `#if NPY_SIMD && (!1 || (0 && 1))` is always false,
// so this instantiation is never compiled.
static inline void
simd_binary_ccc_minp_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2,
                                     npyv_lanetype_s8 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_s8;
#if NPY_SIMD_WIDTH == 128
    // 6 vectors per trip: this unroll was chosen for best results on Apple M1.
    const int unroll = 6;
#else
    // 2 vectors per trip, to avoid a memory bandwidth bottleneck.
    const int unroll = 2;
#endif
    const int chunk = unroll * lanes;
    npy_intp pos = 0;

    // Unrolled part. Within one trip every load is issued before any store.
    while (pos + chunk <= len) {
        const npyv_lanetype_s8 *lhs = ip1 + pos;
        const npyv_lanetype_s8 *rhs = ip2 + pos;
        npyv_lanetype_s8 *dst = op1 + pos;

        npyv_s8 x0 = npyv_load_s8(lhs);
        npyv_s8 x1 = npyv_load_s8(lhs + lanes);
        npyv_s8 y0 = npyv_load_s8(rhs);
        npyv_s8 y1 = npyv_load_s8(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s8 x2 = npyv_load_s8(lhs + 2 * lanes);
        npyv_s8 x3 = npyv_load_s8(lhs + 3 * lanes);
        npyv_s8 x4 = npyv_load_s8(lhs + 4 * lanes);
        npyv_s8 x5 = npyv_load_s8(lhs + 5 * lanes);
        npyv_s8 y2 = npyv_load_s8(rhs + 2 * lanes);
        npyv_s8 y3 = npyv_load_s8(rhs + 3 * lanes);
        npyv_s8 y4 = npyv_load_s8(rhs + 4 * lanes);
        npyv_s8 y5 = npyv_load_s8(rhs + 5 * lanes);
        npyv_s8 r2 = V_INTRIN(x2, y2);
        npyv_s8 r3 = V_INTRIN(x3, y3);
        npyv_s8 r4 = V_INTRIN(x4, y4);
        npyv_s8 r5 = V_INTRIN(x5, y5);
    #endif
        npyv_s8 r0 = V_INTRIN(x0, y0);
        npyv_s8 r1 = V_INTRIN(x1, y1);

        npyv_store_s8(dst, r0);
        npyv_store_s8(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_s8(dst + 2 * lanes, r2);
        npyv_store_s8(dst + 3 * lanes, r3);
        npyv_store_s8(dst + 4 * lanes, r4);
        npyv_store_s8(dst + 5 * lanes, r5);
    #endif
        pos += chunk;
    }
    // Whole vectors that did not fill an unrolled trip.
    while (pos + lanes <= len) {
        npyv_s8 x = npyv_load_s8(ip1 + pos);
        npyv_s8 y = npyv_load_s8(ip2 + pos);
        npyv_s8 r = V_INTRIN(x, y);
        npyv_store_s8(op1 + pos, r);
        pos += lanes;
    }
    // Scalar tail for the elements that do not fill a vector.
    while (pos < len) {
        const npyv_lanetype_s8 x = ip1[pos];
        const npyv_lanetype_s8 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
// Strided element-wise kernel: applies the operation to `len` element pairs,
// stepping ip1/ip2/op1 by sip1/sip2/sop1 elements (strides are applied
// directly to the typed pointers, so they count elements, not bytes).
// NOTE: the enclosing `#if 0 && !defined(NPY_HAVE_NEON)` compiles this
// function out for this lane type.
static inline void
simd_binary_minp_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1,
                           const npyv_lanetype_s8 *ip2, npy_intp sip2,
                                 npyv_lanetype_s8 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_s8;
    // Vector part: an operand with stride 1 uses the contiguous access,
    // any other stride goes through the strided loadn/storen access.
    while (len >= lanes) {
        npyv_s8 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_s8(ip1, sip1);
        } else {
            lhs = npyv_load_s8(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_s8(ip2, sip2);
        } else {
            rhs = npyv_load_s8(ip2);
        }
        npyv_s8 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_s8(op1, sop1, res);
        } else {
            npyv_store_s8(op1, res);
        }
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
        len -= lanes;
    }
    // Scalar tail for the elements that do not fill a vector.
    while (len > 0) {
        const npyv_lanetype_s8 x = *ip1;
        const npyv_lanetype_s8 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
        --len;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP


#line 106
#line 110
#define SCALAR_OP scalar_max_i
#if NPY_SIMD && (!0 || (0 && 0))

#if 0 && !0
    #define V_INTRIN npyv_maxn_u8 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxn_u8
#else
    #define V_INTRIN npyv_max_u8
    #define V_REDUCE_INTRIN npyv_reduce_max_u8
#endif

// Contiguous reduction: folds the `len` elements starting at `ip` together
// with the incoming value of op1[0], using the V_INTRIN / V_REDUCE_INTRIN /
// SCALAR_OP macros defined just above, and stores the result in op1[0].
// A `len` below 1 leaves op1[0] untouched.
static inline void
simd_reduce_c_max_u8(const npyv_lanetype_u8 *ip, npyv_lanetype_u8 *op1, npy_intp len)
{
    if (len <= 0) {
        return;
    }
    const int lanes = npyv_nlanes_u8;
    const int block = 8 * lanes;
    // Seed every lane with the current output so it takes part in the fold.
    npyv_u8 vacc = npyv_setall_u8(op1[0]);

    // Bulk part: eight vectors per trip, merged pairwise before touching vacc.
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_u8 a0 = npyv_load_u8(ip);
        npyv_u8 a1 = npyv_load_u8(ip + lanes);
        npyv_u8 a2 = npyv_load_u8(ip + 2 * lanes);
        npyv_u8 a3 = npyv_load_u8(ip + 3 * lanes);
        npyv_u8 a4 = npyv_load_u8(ip + 4 * lanes);
        npyv_u8 a5 = npyv_load_u8(ip + 5 * lanes);
        npyv_u8 a6 = npyv_load_u8(ip + 6 * lanes);
        npyv_u8 a7 = npyv_load_u8(ip + 7 * lanes);

        npyv_u8 p01 = V_INTRIN(a0, a1);
        npyv_u8 p23 = V_INTRIN(a2, a3);
        npyv_u8 p45 = V_INTRIN(a4, a5);
        npyv_u8 p67 = V_INTRIN(a6, a7);
        npyv_u8 q03 = V_INTRIN(p01, p23);
        npyv_u8 q47 = V_INTRIN(p45, p67);
        vacc = V_INTRIN(vacc, V_INTRIN(q03, q47));

        ip += block;
        len -= block;
    }
    // One vector at a time while a full vector is still available.
    while (len >= lanes) {
        vacc = V_INTRIN(vacc, npyv_load_u8(ip));
        ip += lanes;
        len -= lanes;
    }
    // Collapse the lanes to one value, then fold in the leftover scalars.
    npyv_lanetype_u8 folded = V_REDUCE_INTRIN(vacc);
    while (len > 0) {
        const npyv_lanetype_u8 cur = *ip;
        folded = SCALAR_OP(folded, cur);
        ++ip;
        --len;
    }
    op1[0] = folded;
}

// Contiguous element-wise kernel: op1[i] = OP(ip1[i], ip2[i]) for every
// i in [0, len), with OP supplied by the V_INTRIN (vector part) and
// SCALAR_OP (scalar tail) macros defined just above.
static inline void
simd_binary_ccc_max_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2,
                                     npyv_lanetype_u8 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_u8;
#if NPY_SIMD_WIDTH == 128
    // 6 vectors per trip: this unroll was chosen for best results on Apple M1.
    const int unroll = 6;
#else
    // 2 vectors per trip, to avoid a memory bandwidth bottleneck.
    const int unroll = 2;
#endif
    const int chunk = unroll * lanes;
    npy_intp pos = 0;

    // Unrolled part. Within one trip every load is issued before any store.
    while (pos + chunk <= len) {
        const npyv_lanetype_u8 *lhs = ip1 + pos;
        const npyv_lanetype_u8 *rhs = ip2 + pos;
        npyv_lanetype_u8 *dst = op1 + pos;

        npyv_u8 x0 = npyv_load_u8(lhs);
        npyv_u8 x1 = npyv_load_u8(lhs + lanes);
        npyv_u8 y0 = npyv_load_u8(rhs);
        npyv_u8 y1 = npyv_load_u8(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u8 x2 = npyv_load_u8(lhs + 2 * lanes);
        npyv_u8 x3 = npyv_load_u8(lhs + 3 * lanes);
        npyv_u8 x4 = npyv_load_u8(lhs + 4 * lanes);
        npyv_u8 x5 = npyv_load_u8(lhs + 5 * lanes);
        npyv_u8 y2 = npyv_load_u8(rhs + 2 * lanes);
        npyv_u8 y3 = npyv_load_u8(rhs + 3 * lanes);
        npyv_u8 y4 = npyv_load_u8(rhs + 4 * lanes);
        npyv_u8 y5 = npyv_load_u8(rhs + 5 * lanes);
        npyv_u8 r2 = V_INTRIN(x2, y2);
        npyv_u8 r3 = V_INTRIN(x3, y3);
        npyv_u8 r4 = V_INTRIN(x4, y4);
        npyv_u8 r5 = V_INTRIN(x5, y5);
    #endif
        npyv_u8 r0 = V_INTRIN(x0, y0);
        npyv_u8 r1 = V_INTRIN(x1, y1);

        npyv_store_u8(dst, r0);
        npyv_store_u8(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_u8(dst + 2 * lanes, r2);
        npyv_store_u8(dst + 3 * lanes, r3);
        npyv_store_u8(dst + 4 * lanes, r4);
        npyv_store_u8(dst + 5 * lanes, r5);
    #endif
        pos += chunk;
    }
    // Whole vectors that did not fill an unrolled trip.
    while (pos + lanes <= len) {
        npyv_u8 x = npyv_load_u8(ip1 + pos);
        npyv_u8 y = npyv_load_u8(ip2 + pos);
        npyv_u8 r = V_INTRIN(x, y);
        npyv_store_u8(op1 + pos, r);
        pos += lanes;
    }
    // Scalar tail for the elements that do not fill a vector.
    while (pos < len) {
        const npyv_lanetype_u8 x = ip1[pos];
        const npyv_lanetype_u8 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
// Element-wise V_INTRIN / SCALAR_OP over strided u8 operands.
// The strides sip1/sip2/sop1 are added straight to the typed pointers, so
// they are counted in elements; a stride of 1 picks the contiguous
// load/store instead of the strided one.
// This definition sits inside an `#if 0 && ...` region and is not compiled.
static inline void
simd_binary_max_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1,
                           const npyv_lanetype_u8 *ip2, npy_intp sip2,
                                 npyv_lanetype_u8 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_u8;

    // vector part: one full vector per iteration
    while (len >= lanes) {
        npyv_u8 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_u8(ip1, sip1);
        } else {
            lhs = npyv_load_u8(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_u8(ip2, sip2);
        } else {
            rhs = npyv_load_u8(ip2);
        }
        npyv_u8 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_u8(op1, sop1, res);
        } else {
            npyv_store_u8(op1, res);
        }
        len -= lanes;
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
    }
    // scalar part: whatever did not fill a whole vector
    while (len > 0) {
        const npyv_lanetype_u8 x = *ip1;
        const npyv_lanetype_u8 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_min_i
#if NPY_SIMD && (!0 || (0 && 0))

#if 0 && !0
    #define V_INTRIN npyv_minn_u8 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minn_u8
#else
    #define V_INTRIN npyv_min_u8
    #define V_REDUCE_INTRIN npyv_reduce_min_u8
#endif

// Reduction over a contiguous u8 input: folds the `len` elements starting at
// `ip` into op1[0] using V_INTRIN for vectors and SCALAR_OP for the tail.
// op1[0] is read first and seeds the accumulator, so the value already stored
// there takes part in the result. Returns without touching op1 when len < 1.
static inline void
simd_reduce_c_min_u8(const npyv_lanetype_u8 *ip, npyv_lanetype_u8 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_u8;
    const int block = lanes*8;
    // broadcast the current output value into every lane
    npyv_u8 vacc = npyv_setall_u8(op1[0]);

    // wide loop: eight vectors per iteration, combined as a balanced tree
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_u8 a0 = npyv_load_u8(ip + lanes * 0);
        npyv_u8 a1 = npyv_load_u8(ip + lanes * 1);
        npyv_u8 a2 = npyv_load_u8(ip + lanes * 2);
        npyv_u8 a3 = npyv_load_u8(ip + lanes * 3);
        npyv_u8 a4 = npyv_load_u8(ip + lanes * 4);
        npyv_u8 a5 = npyv_load_u8(ip + lanes * 5);
        npyv_u8 a6 = npyv_load_u8(ip + lanes * 6);
        npyv_u8 a7 = npyv_load_u8(ip + lanes * 7);

        npyv_u8 lower = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
        npyv_u8 upper = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
        vacc = V_INTRIN(vacc, V_INTRIN(lower, upper));

        ip += block;
        len -= block;
    }
    // narrow loop: one vector per iteration
    while (len >= lanes) {
        vacc = V_INTRIN(vacc, npyv_load_u8(ip));
        ip += lanes;
        len -= lanes;
    }
    // collapse the lanes to a scalar, then fold in the leftover elements
    npyv_lanetype_u8 result = V_REDUCE_INTRIN(vacc);
    while (len > 0) {
        const npyv_lanetype_u8 item = *ip;
        result = SCALAR_OP(result, item);
        ++ip;
        --len;
    }
    op1[0] = result;
}

// Element-wise operation on two contiguous u8 inputs with a contiguous
// output: op1[k] = op(ip1[k], ip2[k]) for every k in [0, len), where op is
// V_INTRIN for whole vectors and SCALAR_OP for the remaining elements.
static inline void
simd_binary_ccc_min_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2,
                                     npyv_lanetype_u8 *op1, npy_intp len)
{
#if NPY_SIMD_WIDTH == 128
    // 6x unrolling gave the best results on Apple M1
    const int unroll = 6;
#else
    // small unroll factor to avoid a memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int lanes = npyv_nlanes_u8;
    const int chunk = unroll * lanes;

    npy_intp pos = 0;

    // unrolled loop: every load of a chunk is issued before its first store
    while ((pos + chunk) <= len) {
        const npyv_lanetype_u8 *lhs = ip1 + pos;
        const npyv_lanetype_u8 *rhs = ip2 + pos;
        npyv_lanetype_u8 *dst = op1 + pos;

        npyv_u8 a0 = npyv_load_u8(lhs + 0 * lanes);
        npyv_u8 a1 = npyv_load_u8(lhs + 1 * lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u8 a2 = npyv_load_u8(lhs + 2 * lanes);
        npyv_u8 a3 = npyv_load_u8(lhs + 3 * lanes);
        npyv_u8 a4 = npyv_load_u8(lhs + 4 * lanes);
        npyv_u8 a5 = npyv_load_u8(lhs + 5 * lanes);
    #endif
        npyv_u8 b0 = npyv_load_u8(rhs + 0 * lanes);
        npyv_u8 b1 = npyv_load_u8(rhs + 1 * lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u8 b2 = npyv_load_u8(rhs + 2 * lanes);
        npyv_u8 b3 = npyv_load_u8(rhs + 3 * lanes);
        npyv_u8 b4 = npyv_load_u8(rhs + 4 * lanes);
        npyv_u8 b5 = npyv_load_u8(rhs + 5 * lanes);
    #endif
        npyv_u8 r0 = V_INTRIN(a0, b0);
        npyv_u8 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_u8 r2 = V_INTRIN(a2, b2);
        npyv_u8 r3 = V_INTRIN(a3, b3);
        npyv_u8 r4 = V_INTRIN(a4, b4);
        npyv_u8 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_u8(dst + 0 * lanes, r0);
        npyv_store_u8(dst + 1 * lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_u8(dst + 2 * lanes, r2);
        npyv_store_u8(dst + 3 * lanes, r3);
        npyv_store_u8(dst + 4 * lanes, r4);
        npyv_store_u8(dst + 5 * lanes, r5);
    #endif
        pos += chunk;
    }
    // single-vector loop for what is left of the vectorizable part
    while ((pos + lanes) <= len) {
        npyv_u8 a = npyv_load_u8(ip1 + pos);
        npyv_u8 b = npyv_load_u8(ip2 + pos);
        npyv_u8 r = V_INTRIN(a, b);
        npyv_store_u8(op1 + pos, r);
        pos += lanes;
    }
    // scalar loop for the final partial vector
    while (pos < len) {
        const npyv_lanetype_u8 x = ip1[pos];
        const npyv_lanetype_u8 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
// Element-wise V_INTRIN / SCALAR_OP over strided u8 operands.
// The strides sip1/sip2/sop1 are added straight to the typed pointers, so
// they are counted in elements; a stride of 1 picks the contiguous
// load/store instead of the strided one.
// This definition sits inside an `#if 0 && ...` region and is not compiled.
static inline void
simd_binary_min_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1,
                           const npyv_lanetype_u8 *ip2, npy_intp sip2,
                                 npyv_lanetype_u8 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_u8;

    // vector part: one full vector per iteration
    while (len >= lanes) {
        npyv_u8 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_u8(ip1, sip1);
        } else {
            lhs = npyv_load_u8(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_u8(ip2, sip2);
        } else {
            rhs = npyv_load_u8(ip2);
        }
        npyv_u8 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_u8(op1, sop1, res);
        } else {
            npyv_store_u8(op1, res);
        }
        len -= lanes;
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
    }
    // scalar part: whatever did not fill a whole vector
    while (len > 0) {
        const npyv_lanetype_u8 x = *ip1;
        const npyv_lanetype_u8 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_maxp_i
#if NPY_SIMD && (!1 || (0 && 1))

#if 0 && !1
    #define V_INTRIN npyv_maxpn_u8 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxpn_u8
#else
    #define V_INTRIN npyv_maxp_u8
    #define V_REDUCE_INTRIN npyv_reduce_maxp_u8
#endif

// Reduction over a contiguous u8 input: folds the `len` elements starting at
// `ip` into op1[0] using V_INTRIN for vectors and SCALAR_OP for the tail.
// op1[0] is read first and seeds the accumulator, so the value already stored
// there takes part in the result. Returns without touching op1 when len < 1.
// NOTE: the enclosing `#if NPY_SIMD && (...)` guard evaluates to false for
// this expansion, so this function is not compiled.
static inline void
simd_reduce_c_maxp_u8(const npyv_lanetype_u8 *ip, npyv_lanetype_u8 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_u8;
    const int block = lanes*8;
    // broadcast the current output value into every lane
    npyv_u8 vacc = npyv_setall_u8(op1[0]);

    // wide loop: eight vectors per iteration, combined as a balanced tree
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_u8 a0 = npyv_load_u8(ip + lanes * 0);
        npyv_u8 a1 = npyv_load_u8(ip + lanes * 1);
        npyv_u8 a2 = npyv_load_u8(ip + lanes * 2);
        npyv_u8 a3 = npyv_load_u8(ip + lanes * 3);
        npyv_u8 a4 = npyv_load_u8(ip + lanes * 4);
        npyv_u8 a5 = npyv_load_u8(ip + lanes * 5);
        npyv_u8 a6 = npyv_load_u8(ip + lanes * 6);
        npyv_u8 a7 = npyv_load_u8(ip + lanes * 7);

        npyv_u8 lower = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
        npyv_u8 upper = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
        vacc = V_INTRIN(vacc, V_INTRIN(lower, upper));

        ip += block;
        len -= block;
    }
    // narrow loop: one vector per iteration
    while (len >= lanes) {
        vacc = V_INTRIN(vacc, npyv_load_u8(ip));
        ip += lanes;
        len -= lanes;
    }
    // collapse the lanes to a scalar, then fold in the leftover elements
    npyv_lanetype_u8 result = V_REDUCE_INTRIN(vacc);
    while (len > 0) {
        const npyv_lanetype_u8 item = *ip;
        result = SCALAR_OP(result, item);
        ++ip;
        --len;
    }
    op1[0] = result;
}

// Element-wise operation on two contiguous u8 inputs with a contiguous
// output: op1[k] = op(ip1[k], ip2[k]) for every k in [0, len), where op is
// V_INTRIN for whole vectors and SCALAR_OP for the remaining elements.
// NOTE: the enclosing `#if NPY_SIMD && (...)` guard evaluates to false for
// this expansion, so this function is not compiled.
static inline void
simd_binary_ccc_maxp_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2,
                                     npyv_lanetype_u8 *op1, npy_intp len)
{
#if NPY_SIMD_WIDTH == 128
    // 6x unrolling gave the best results on Apple M1
    const int unroll = 6;
#else
    // small unroll factor to avoid a memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int lanes = npyv_nlanes_u8;
    const int chunk = unroll * lanes;

    npy_intp pos = 0;

    // unrolled loop: every load of a chunk is issued before its first store
    while ((pos + chunk) <= len) {
        const npyv_lanetype_u8 *lhs = ip1 + pos;
        const npyv_lanetype_u8 *rhs = ip2 + pos;
        npyv_lanetype_u8 *dst = op1 + pos;

        npyv_u8 a0 = npyv_load_u8(lhs + 0 * lanes);
        npyv_u8 a1 = npyv_load_u8(lhs + 1 * lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u8 a2 = npyv_load_u8(lhs + 2 * lanes);
        npyv_u8 a3 = npyv_load_u8(lhs + 3 * lanes);
        npyv_u8 a4 = npyv_load_u8(lhs + 4 * lanes);
        npyv_u8 a5 = npyv_load_u8(lhs + 5 * lanes);
    #endif
        npyv_u8 b0 = npyv_load_u8(rhs + 0 * lanes);
        npyv_u8 b1 = npyv_load_u8(rhs + 1 * lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u8 b2 = npyv_load_u8(rhs + 2 * lanes);
        npyv_u8 b3 = npyv_load_u8(rhs + 3 * lanes);
        npyv_u8 b4 = npyv_load_u8(rhs + 4 * lanes);
        npyv_u8 b5 = npyv_load_u8(rhs + 5 * lanes);
    #endif
        npyv_u8 r0 = V_INTRIN(a0, b0);
        npyv_u8 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_u8 r2 = V_INTRIN(a2, b2);
        npyv_u8 r3 = V_INTRIN(a3, b3);
        npyv_u8 r4 = V_INTRIN(a4, b4);
        npyv_u8 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_u8(dst + 0 * lanes, r0);
        npyv_store_u8(dst + 1 * lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_u8(dst + 2 * lanes, r2);
        npyv_store_u8(dst + 3 * lanes, r3);
        npyv_store_u8(dst + 4 * lanes, r4);
        npyv_store_u8(dst + 5 * lanes, r5);
    #endif
        pos += chunk;
    }
    // single-vector loop for what is left of the vectorizable part
    while ((pos + lanes) <= len) {
        npyv_u8 a = npyv_load_u8(ip1 + pos);
        npyv_u8 b = npyv_load_u8(ip2 + pos);
        npyv_u8 r = V_INTRIN(a, b);
        npyv_store_u8(op1 + pos, r);
        pos += lanes;
    }
    // scalar loop for the final partial vector
    while (pos < len) {
        const npyv_lanetype_u8 x = ip1[pos];
        const npyv_lanetype_u8 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
// Element-wise V_INTRIN / SCALAR_OP over strided u8 operands.
// The strides sip1/sip2/sop1 are added straight to the typed pointers, so
// they are counted in elements; a stride of 1 picks the contiguous
// load/store instead of the strided one.
// This definition sits inside an `#if 0 && ...` region and is not compiled.
static inline void
simd_binary_maxp_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1,
                           const npyv_lanetype_u8 *ip2, npy_intp sip2,
                                 npyv_lanetype_u8 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_u8;

    // vector part: one full vector per iteration
    while (len >= lanes) {
        npyv_u8 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_u8(ip1, sip1);
        } else {
            lhs = npyv_load_u8(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_u8(ip2, sip2);
        } else {
            rhs = npyv_load_u8(ip2);
        }
        npyv_u8 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_u8(op1, sop1, res);
        } else {
            npyv_store_u8(op1, res);
        }
        len -= lanes;
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
    }
    // scalar part: whatever did not fill a whole vector
    while (len > 0) {
        const npyv_lanetype_u8 x = *ip1;
        const npyv_lanetype_u8 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_minp_i
#if NPY_SIMD && (!1 || (0 && 1))

#if 0 && !1
    #define V_INTRIN npyv_minpn_u8 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minpn_u8
#else
    #define V_INTRIN npyv_minp_u8
    #define V_REDUCE_INTRIN npyv_reduce_minp_u8
#endif

// Reduction over a contiguous u8 input: folds the `len` elements starting at
// `ip` into op1[0] using V_INTRIN for vectors and SCALAR_OP for the tail.
// op1[0] is read first and seeds the accumulator, so the value already stored
// there takes part in the result. Returns without touching op1 when len < 1.
// NOTE: the enclosing `#if NPY_SIMD && (...)` guard evaluates to false for
// this expansion, so this function is not compiled.
static inline void
simd_reduce_c_minp_u8(const npyv_lanetype_u8 *ip, npyv_lanetype_u8 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_u8;
    const int block = lanes*8;
    // broadcast the current output value into every lane
    npyv_u8 vacc = npyv_setall_u8(op1[0]);

    // wide loop: eight vectors per iteration, combined as a balanced tree
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_u8 a0 = npyv_load_u8(ip + lanes * 0);
        npyv_u8 a1 = npyv_load_u8(ip + lanes * 1);
        npyv_u8 a2 = npyv_load_u8(ip + lanes * 2);
        npyv_u8 a3 = npyv_load_u8(ip + lanes * 3);
        npyv_u8 a4 = npyv_load_u8(ip + lanes * 4);
        npyv_u8 a5 = npyv_load_u8(ip + lanes * 5);
        npyv_u8 a6 = npyv_load_u8(ip + lanes * 6);
        npyv_u8 a7 = npyv_load_u8(ip + lanes * 7);

        npyv_u8 lower = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
        npyv_u8 upper = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
        vacc = V_INTRIN(vacc, V_INTRIN(lower, upper));

        ip += block;
        len -= block;
    }
    // narrow loop: one vector per iteration
    while (len >= lanes) {
        vacc = V_INTRIN(vacc, npyv_load_u8(ip));
        ip += lanes;
        len -= lanes;
    }
    // collapse the lanes to a scalar, then fold in the leftover elements
    npyv_lanetype_u8 result = V_REDUCE_INTRIN(vacc);
    while (len > 0) {
        const npyv_lanetype_u8 item = *ip;
        result = SCALAR_OP(result, item);
        ++ip;
        --len;
    }
    op1[0] = result;
}

// Element-wise operation on two contiguous u8 inputs with a contiguous
// output: op1[k] = op(ip1[k], ip2[k]) for every k in [0, len), where op is
// V_INTRIN for whole vectors and SCALAR_OP for the remaining elements.
// NOTE: the enclosing `#if NPY_SIMD && (...)` guard evaluates to false for
// this expansion, so this function is not compiled.
static inline void
simd_binary_ccc_minp_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2,
                                     npyv_lanetype_u8 *op1, npy_intp len)
{
#if NPY_SIMD_WIDTH == 128
    // 6x unrolling gave the best results on Apple M1
    const int unroll = 6;
#else
    // small unroll factor to avoid a memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int lanes = npyv_nlanes_u8;
    const int chunk = unroll * lanes;

    npy_intp pos = 0;

    // unrolled loop: every load of a chunk is issued before its first store
    while ((pos + chunk) <= len) {
        const npyv_lanetype_u8 *lhs = ip1 + pos;
        const npyv_lanetype_u8 *rhs = ip2 + pos;
        npyv_lanetype_u8 *dst = op1 + pos;

        npyv_u8 a0 = npyv_load_u8(lhs + 0 * lanes);
        npyv_u8 a1 = npyv_load_u8(lhs + 1 * lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u8 a2 = npyv_load_u8(lhs + 2 * lanes);
        npyv_u8 a3 = npyv_load_u8(lhs + 3 * lanes);
        npyv_u8 a4 = npyv_load_u8(lhs + 4 * lanes);
        npyv_u8 a5 = npyv_load_u8(lhs + 5 * lanes);
    #endif
        npyv_u8 b0 = npyv_load_u8(rhs + 0 * lanes);
        npyv_u8 b1 = npyv_load_u8(rhs + 1 * lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u8 b2 = npyv_load_u8(rhs + 2 * lanes);
        npyv_u8 b3 = npyv_load_u8(rhs + 3 * lanes);
        npyv_u8 b4 = npyv_load_u8(rhs + 4 * lanes);
        npyv_u8 b5 = npyv_load_u8(rhs + 5 * lanes);
    #endif
        npyv_u8 r0 = V_INTRIN(a0, b0);
        npyv_u8 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_u8 r2 = V_INTRIN(a2, b2);
        npyv_u8 r3 = V_INTRIN(a3, b3);
        npyv_u8 r4 = V_INTRIN(a4, b4);
        npyv_u8 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_u8(dst + 0 * lanes, r0);
        npyv_store_u8(dst + 1 * lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_u8(dst + 2 * lanes, r2);
        npyv_store_u8(dst + 3 * lanes, r3);
        npyv_store_u8(dst + 4 * lanes, r4);
        npyv_store_u8(dst + 5 * lanes, r5);
    #endif
        pos += chunk;
    }
    // single-vector loop for what is left of the vectorizable part
    while ((pos + lanes) <= len) {
        npyv_u8 a = npyv_load_u8(ip1 + pos);
        npyv_u8 b = npyv_load_u8(ip2 + pos);
        npyv_u8 r = V_INTRIN(a, b);
        npyv_store_u8(op1 + pos, r);
        pos += lanes;
    }
    // scalar loop for the final partial vector
    while (pos < len) {
        const npyv_lanetype_u8 x = ip1[pos];
        const npyv_lanetype_u8 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
// Element-wise V_INTRIN / SCALAR_OP over strided u8 operands.
// The strides sip1/sip2/sop1 are added straight to the typed pointers, so
// they are counted in elements; a stride of 1 picks the contiguous
// load/store instead of the strided one.
// This definition sits inside an `#if 0 && ...` region and is not compiled.
static inline void
simd_binary_minp_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1,
                           const npyv_lanetype_u8 *ip2, npy_intp sip2,
                                 npyv_lanetype_u8 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_u8;

    // vector part: one full vector per iteration
    while (len >= lanes) {
        npyv_u8 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_u8(ip1, sip1);
        } else {
            lhs = npyv_load_u8(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_u8(ip2, sip2);
        } else {
            rhs = npyv_load_u8(ip2);
        }
        npyv_u8 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_u8(op1, sop1, res);
        } else {
            npyv_store_u8(op1, res);
        }
        len -= lanes;
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
    }
    // scalar part: whatever did not fill a whole vector
    while (len > 0) {
        const npyv_lanetype_u8 x = *ip1;
        const npyv_lanetype_u8 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP


#line 106
#line 110
#define SCALAR_OP scalar_max_i
#if NPY_SIMD && (!0 || (0 && 0))

#if 0 && !0
    #define V_INTRIN npyv_maxn_s16 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxn_s16
#else
    #define V_INTRIN npyv_max_s16
    #define V_REDUCE_INTRIN npyv_reduce_max_s16
#endif

// Reduction over a contiguous s16 input: folds the `len` elements starting at
// `ip` into op1[0] using V_INTRIN for vectors and SCALAR_OP for the tail.
// op1[0] is read first and seeds the accumulator, so the value already stored
// there takes part in the result. Returns without touching op1 when len < 1.
static inline void
simd_reduce_c_max_s16(const npyv_lanetype_s16 *ip, npyv_lanetype_s16 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_s16;
    const int block = lanes*8;
    // broadcast the current output value into every lane
    npyv_s16 vacc = npyv_setall_s16(op1[0]);

    // wide loop: eight vectors per iteration, combined as a balanced tree
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_s16 a0 = npyv_load_s16(ip + lanes * 0);
        npyv_s16 a1 = npyv_load_s16(ip + lanes * 1);
        npyv_s16 a2 = npyv_load_s16(ip + lanes * 2);
        npyv_s16 a3 = npyv_load_s16(ip + lanes * 3);
        npyv_s16 a4 = npyv_load_s16(ip + lanes * 4);
        npyv_s16 a5 = npyv_load_s16(ip + lanes * 5);
        npyv_s16 a6 = npyv_load_s16(ip + lanes * 6);
        npyv_s16 a7 = npyv_load_s16(ip + lanes * 7);

        npyv_s16 lower = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
        npyv_s16 upper = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
        vacc = V_INTRIN(vacc, V_INTRIN(lower, upper));

        ip += block;
        len -= block;
    }
    // narrow loop: one vector per iteration
    while (len >= lanes) {
        vacc = V_INTRIN(vacc, npyv_load_s16(ip));
        ip += lanes;
        len -= lanes;
    }
    // collapse the lanes to a scalar, then fold in the leftover elements
    npyv_lanetype_s16 result = V_REDUCE_INTRIN(vacc);
    while (len > 0) {
        const npyv_lanetype_s16 item = *ip;
        result = SCALAR_OP(result, item);
        ++ip;
        --len;
    }
    op1[0] = result;
}

// Element-wise operation on two contiguous s16 inputs with a contiguous
// output: op1[k] = op(ip1[k], ip2[k]) for every k in [0, len), where op is
// V_INTRIN for whole vectors and SCALAR_OP for the remaining elements.
static inline void
simd_binary_ccc_max_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *ip2,
                                     npyv_lanetype_s16 *op1, npy_intp len)
{
#if NPY_SIMD_WIDTH == 128
    // 6x unrolling gave the best results on Apple M1
    const int unroll = 6;
#else
    // small unroll factor to avoid a memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int lanes = npyv_nlanes_s16;
    const int chunk = unroll * lanes;

    npy_intp pos = 0;

    // unrolled loop: every load of a chunk is issued before its first store
    while ((pos + chunk) <= len) {
        const npyv_lanetype_s16 *lhs = ip1 + pos;
        const npyv_lanetype_s16 *rhs = ip2 + pos;
        npyv_lanetype_s16 *dst = op1 + pos;

        npyv_s16 a0 = npyv_load_s16(lhs + 0 * lanes);
        npyv_s16 a1 = npyv_load_s16(lhs + 1 * lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s16 a2 = npyv_load_s16(lhs + 2 * lanes);
        npyv_s16 a3 = npyv_load_s16(lhs + 3 * lanes);
        npyv_s16 a4 = npyv_load_s16(lhs + 4 * lanes);
        npyv_s16 a5 = npyv_load_s16(lhs + 5 * lanes);
    #endif
        npyv_s16 b0 = npyv_load_s16(rhs + 0 * lanes);
        npyv_s16 b1 = npyv_load_s16(rhs + 1 * lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s16 b2 = npyv_load_s16(rhs + 2 * lanes);
        npyv_s16 b3 = npyv_load_s16(rhs + 3 * lanes);
        npyv_s16 b4 = npyv_load_s16(rhs + 4 * lanes);
        npyv_s16 b5 = npyv_load_s16(rhs + 5 * lanes);
    #endif
        npyv_s16 r0 = V_INTRIN(a0, b0);
        npyv_s16 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_s16 r2 = V_INTRIN(a2, b2);
        npyv_s16 r3 = V_INTRIN(a3, b3);
        npyv_s16 r4 = V_INTRIN(a4, b4);
        npyv_s16 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_s16(dst + 0 * lanes, r0);
        npyv_store_s16(dst + 1 * lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_s16(dst + 2 * lanes, r2);
        npyv_store_s16(dst + 3 * lanes, r3);
        npyv_store_s16(dst + 4 * lanes, r4);
        npyv_store_s16(dst + 5 * lanes, r5);
    #endif
        pos += chunk;
    }
    // single-vector loop for what is left of the vectorizable part
    while ((pos + lanes) <= len) {
        npyv_s16 a = npyv_load_s16(ip1 + pos);
        npyv_s16 b = npyv_load_s16(ip2 + pos);
        npyv_s16 r = V_INTRIN(a, b);
        npyv_store_s16(op1 + pos, r);
        pos += lanes;
    }
    // scalar loop for the final partial vector
    while (pos < len) {
        const npyv_lanetype_s16 x = ip1[pos];
        const npyv_lanetype_s16 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
// Element-wise V_INTRIN / SCALAR_OP over strided s16 operands.
// The strides sip1/sip2/sop1 are added straight to the typed pointers, so
// they are counted in elements; a stride of 1 picks the contiguous
// load/store instead of the strided one.
// This definition sits inside an `#if 0 && ...` region and is not compiled.
static inline void
simd_binary_max_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1,
                           const npyv_lanetype_s16 *ip2, npy_intp sip2,
                                 npyv_lanetype_s16 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_s16;

    // vector part: one full vector per iteration
    while (len >= lanes) {
        npyv_s16 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_s16(ip1, sip1);
        } else {
            lhs = npyv_load_s16(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_s16(ip2, sip2);
        } else {
            rhs = npyv_load_s16(ip2);
        }
        npyv_s16 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_s16(op1, sop1, res);
        } else {
            npyv_store_s16(op1, res);
        }
        len -= lanes;
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
    }
    // scalar part: whatever did not fill a whole vector
    while (len > 0) {
        const npyv_lanetype_s16 x = *ip1;
        const npyv_lanetype_s16 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_min_i
#if NPY_SIMD && (!0 || (0 && 0))

#if 0 && !0
    #define V_INTRIN npyv_minn_s16 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minn_s16
#else
    #define V_INTRIN npyv_min_s16
    #define V_REDUCE_INTRIN npyv_reduce_min_s16
#endif

// Reduction over a contiguous s16 input: folds the `len` elements starting at
// `ip` into op1[0] using V_INTRIN for vectors and SCALAR_OP for the tail.
// op1[0] is read first and seeds the accumulator, so the value already stored
// there takes part in the result. Returns without touching op1 when len < 1.
static inline void
simd_reduce_c_min_s16(const npyv_lanetype_s16 *ip, npyv_lanetype_s16 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_s16;
    const int block = lanes*8;
    // broadcast the current output value into every lane
    npyv_s16 vacc = npyv_setall_s16(op1[0]);

    // wide loop: eight vectors per iteration, combined as a balanced tree
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_s16 a0 = npyv_load_s16(ip + lanes * 0);
        npyv_s16 a1 = npyv_load_s16(ip + lanes * 1);
        npyv_s16 a2 = npyv_load_s16(ip + lanes * 2);
        npyv_s16 a3 = npyv_load_s16(ip + lanes * 3);
        npyv_s16 a4 = npyv_load_s16(ip + lanes * 4);
        npyv_s16 a5 = npyv_load_s16(ip + lanes * 5);
        npyv_s16 a6 = npyv_load_s16(ip + lanes * 6);
        npyv_s16 a7 = npyv_load_s16(ip + lanes * 7);

        npyv_s16 lower = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
        npyv_s16 upper = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
        vacc = V_INTRIN(vacc, V_INTRIN(lower, upper));

        ip += block;
        len -= block;
    }
    // narrow loop: one vector per iteration
    while (len >= lanes) {
        vacc = V_INTRIN(vacc, npyv_load_s16(ip));
        ip += lanes;
        len -= lanes;
    }
    // collapse the lanes to a scalar, then fold in the leftover elements
    npyv_lanetype_s16 result = V_REDUCE_INTRIN(vacc);
    while (len > 0) {
        const npyv_lanetype_s16 item = *ip;
        result = SCALAR_OP(result, item);
        ++ip;
        --len;
    }
    op1[0] = result;
}

// Element-wise operation on two contiguous s16 inputs with a contiguous
// output: op1[k] = op(ip1[k], ip2[k]) for every k in [0, len), where op is
// V_INTRIN for whole vectors and SCALAR_OP for the remaining elements.
static inline void
simd_binary_ccc_min_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *ip2,
                                     npyv_lanetype_s16 *op1, npy_intp len)
{
#if NPY_SIMD_WIDTH == 128
    // 6x unrolling gave the best results on Apple M1
    const int unroll = 6;
#else
    // small unroll factor to avoid a memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int lanes = npyv_nlanes_s16;
    const int chunk = unroll * lanes;

    npy_intp pos = 0;

    // unrolled loop: every load of a chunk is issued before its first store
    while ((pos + chunk) <= len) {
        const npyv_lanetype_s16 *lhs = ip1 + pos;
        const npyv_lanetype_s16 *rhs = ip2 + pos;
        npyv_lanetype_s16 *dst = op1 + pos;

        npyv_s16 a0 = npyv_load_s16(lhs + 0 * lanes);
        npyv_s16 a1 = npyv_load_s16(lhs + 1 * lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s16 a2 = npyv_load_s16(lhs + 2 * lanes);
        npyv_s16 a3 = npyv_load_s16(lhs + 3 * lanes);
        npyv_s16 a4 = npyv_load_s16(lhs + 4 * lanes);
        npyv_s16 a5 = npyv_load_s16(lhs + 5 * lanes);
    #endif
        npyv_s16 b0 = npyv_load_s16(rhs + 0 * lanes);
        npyv_s16 b1 = npyv_load_s16(rhs + 1 * lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s16 b2 = npyv_load_s16(rhs + 2 * lanes);
        npyv_s16 b3 = npyv_load_s16(rhs + 3 * lanes);
        npyv_s16 b4 = npyv_load_s16(rhs + 4 * lanes);
        npyv_s16 b5 = npyv_load_s16(rhs + 5 * lanes);
    #endif
        npyv_s16 r0 = V_INTRIN(a0, b0);
        npyv_s16 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_s16 r2 = V_INTRIN(a2, b2);
        npyv_s16 r3 = V_INTRIN(a3, b3);
        npyv_s16 r4 = V_INTRIN(a4, b4);
        npyv_s16 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_s16(dst + 0 * lanes, r0);
        npyv_store_s16(dst + 1 * lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_s16(dst + 2 * lanes, r2);
        npyv_store_s16(dst + 3 * lanes, r3);
        npyv_store_s16(dst + 4 * lanes, r4);
        npyv_store_s16(dst + 5 * lanes, r5);
    #endif
        pos += chunk;
    }
    // single-vector loop for what is left of the vectorizable part
    while ((pos + lanes) <= len) {
        npyv_s16 a = npyv_load_s16(ip1 + pos);
        npyv_s16 b = npyv_load_s16(ip2 + pos);
        npyv_s16 r = V_INTRIN(a, b);
        npyv_store_s16(op1 + pos, r);
        pos += lanes;
    }
    // scalar loop for the final partial vector
    while (pos < len) {
        const npyv_lanetype_s16 x = ip1[pos];
        const npyv_lanetype_s16 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
// Element-wise V_INTRIN / SCALAR_OP over strided s16 operands.
// The strides sip1/sip2/sop1 are added straight to the typed pointers, so
// they are counted in elements; a stride of 1 picks the contiguous
// load/store instead of the strided one.
// This definition sits inside an `#if 0 && ...` region and is not compiled.
static inline void
simd_binary_min_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1,
                           const npyv_lanetype_s16 *ip2, npy_intp sip2,
                                 npyv_lanetype_s16 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_s16;

    // vector part: one full vector per iteration
    while (len >= lanes) {
        npyv_s16 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_s16(ip1, sip1);
        } else {
            lhs = npyv_load_s16(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_s16(ip2, sip2);
        } else {
            rhs = npyv_load_s16(ip2);
        }
        npyv_s16 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_s16(op1, sop1, res);
        } else {
            npyv_store_s16(op1, res);
        }
        len -= lanes;
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
    }
    // scalar part: whatever did not fill a whole vector
    while (len > 0) {
        const npyv_lanetype_s16 x = *ip1;
        const npyv_lanetype_s16 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_maxp_i
#if NPY_SIMD && (!1 || (0 && 1))

#if 0 && !1
    #define V_INTRIN npyv_maxpn_s16 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxpn_s16
#else
    #define V_INTRIN npyv_maxp_s16
    #define V_REDUCE_INTRIN npyv_reduce_maxp_s16
#endif

// Reduction over a contiguous s16 input: folds the `len` elements starting at
// `ip` into op1[0] using V_INTRIN for vectors and SCALAR_OP for the tail.
// op1[0] is read first and seeds the accumulator, so the value already stored
// there takes part in the result. Returns without touching op1 when len < 1.
// NOTE: the enclosing `#if NPY_SIMD && (...)` guard evaluates to false for
// this expansion, so this function is not compiled.
static inline void
simd_reduce_c_maxp_s16(const npyv_lanetype_s16 *ip, npyv_lanetype_s16 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_s16;
    const int block = lanes*8;
    // broadcast the current output value into every lane
    npyv_s16 vacc = npyv_setall_s16(op1[0]);

    // wide loop: eight vectors per iteration, combined as a balanced tree
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_s16 a0 = npyv_load_s16(ip + lanes * 0);
        npyv_s16 a1 = npyv_load_s16(ip + lanes * 1);
        npyv_s16 a2 = npyv_load_s16(ip + lanes * 2);
        npyv_s16 a3 = npyv_load_s16(ip + lanes * 3);
        npyv_s16 a4 = npyv_load_s16(ip + lanes * 4);
        npyv_s16 a5 = npyv_load_s16(ip + lanes * 5);
        npyv_s16 a6 = npyv_load_s16(ip + lanes * 6);
        npyv_s16 a7 = npyv_load_s16(ip + lanes * 7);

        npyv_s16 lower = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
        npyv_s16 upper = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
        vacc = V_INTRIN(vacc, V_INTRIN(lower, upper));

        ip += block;
        len -= block;
    }
    // narrow loop: one vector per iteration
    while (len >= lanes) {
        vacc = V_INTRIN(vacc, npyv_load_s16(ip));
        ip += lanes;
        len -= lanes;
    }
    // collapse the lanes to a scalar, then fold in the leftover elements
    npyv_lanetype_s16 result = V_REDUCE_INTRIN(vacc);
    while (len > 0) {
        const npyv_lanetype_s16 item = *ip;
        result = SCALAR_OP(result, item);
        ++ip;
        --len;
    }
    op1[0] = result;
}

// Element-wise binary op over contiguous buffers:
// op1[k] = op(ip1[k], ip2[k]) for every k in [0, len), with V_INTRIN doing
// the vector work and SCALAR_OP the remainder.
static inline void
simd_binary_ccc_maxp_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *ip2,
                                     npyv_lanetype_s16 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_s16;
#if NPY_SIMD_WIDTH == 128
    // 128-bit vectors: six per pass, the unroll factor that gave the best
    // results on Apple M1
    const int block = 6 * lanes;
#else
    // wider vectors: two per pass, to avoid a memory bandwidth bottleneck
    const int block = 2 * lanes;
#endif
    npy_intp pos = 0;

    // Unrolled loop; every load of a pass is issued before its first store.
    while (pos + block <= len) {
        const npyv_lanetype_s16 *lhs = ip1 + pos;
        const npyv_lanetype_s16 *rhs = ip2 + pos;
        npyv_lanetype_s16 *dst = op1 + pos;

        npyv_s16 a0 = npyv_load_s16(lhs);
        npyv_s16 a1 = npyv_load_s16(lhs + lanes);
        npyv_s16 b0 = npyv_load_s16(rhs);
        npyv_s16 b1 = npyv_load_s16(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s16 a2 = npyv_load_s16(lhs + 2 * lanes);
        npyv_s16 a3 = npyv_load_s16(lhs + 3 * lanes);
        npyv_s16 a4 = npyv_load_s16(lhs + 4 * lanes);
        npyv_s16 a5 = npyv_load_s16(lhs + 5 * lanes);
        npyv_s16 b2 = npyv_load_s16(rhs + 2 * lanes);
        npyv_s16 b3 = npyv_load_s16(rhs + 3 * lanes);
        npyv_s16 b4 = npyv_load_s16(rhs + 4 * lanes);
        npyv_s16 b5 = npyv_load_s16(rhs + 5 * lanes);
    #endif
        npyv_s16 r0 = V_INTRIN(a0, b0);
        npyv_s16 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_s16 r2 = V_INTRIN(a2, b2);
        npyv_s16 r3 = V_INTRIN(a3, b3);
        npyv_s16 r4 = V_INTRIN(a4, b4);
        npyv_s16 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_s16(dst, r0);
        npyv_store_s16(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_s16(dst + 2 * lanes, r2);
        npyv_store_s16(dst + 3 * lanes, r3);
        npyv_store_s16(dst + 4 * lanes, r4);
        npyv_store_s16(dst + 5 * lanes, r5);
    #endif
        pos += block;
    }
    // Single-vector loop for what the unrolled loop left behind.
    while (pos + lanes <= len) {
        npyv_s16 a = npyv_load_s16(ip1 + pos);
        npyv_s16 b = npyv_load_s16(ip2 + pos);
        npyv_s16 r = V_INTRIN(a, b);
        npyv_store_s16(op1 + pos, r);
        pos += lanes;
    }
    // Scalar tail: fewer elements than one vector remain.
    while (pos < len) {
        const npyv_lanetype_s16 x = ip1[pos];
        const npyv_lanetype_s16 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// Arm (NEON) is excluded by the guard above because unrolled scalars are
// faster than non-contiguous vector load/store there. The leading `0 &&` in
// that guard also means this instantiation is never compiled.
//
// Strided element-wise binary op: walks ip1/ip2/op1 with the element strides
// sip1/sip2/sop1 and writes op(*ip1, *ip2) to *op1 for `len` elements.
static inline void
simd_binary_maxp_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1,
                           const npyv_lanetype_s16 *ip2, npy_intp sip2,
                                 npyv_lanetype_s16 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_s16;
    while (len >= lanes) {
        npyv_s16 lhs, rhs;
        // A stride of 1 takes the contiguous load/store; anything else goes
        // through the strided (loadn/storen) variants.
        if (sip1 != 1) {
            lhs = npyv_loadn_s16(ip1, sip1);
        } else {
            lhs = npyv_load_s16(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_s16(ip2, sip2);
        } else {
            rhs = npyv_load_s16(ip2);
        }
        npyv_s16 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_s16(op1, sop1, res);
        } else {
            npyv_store_s16(op1, res);
        }
        len -= lanes;
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
    }
    // Scalar tail: fewer elements than one vector remain.
    while (len > 0) {
        const npyv_lanetype_s16 x = *ip1;
        const npyv_lanetype_s16 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_minp_i
#if NPY_SIMD && (!1 || (0 && 1))

#if 0 && !1
    #define V_INTRIN npyv_minpn_s16 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minpn_s16
#else
    #define V_INTRIN npyv_minp_s16
    #define V_REDUCE_INTRIN npyv_reduce_minp_s16
#endif

// Contiguous reduction: folds `len` elements starting at `ip` into op1[0]
// using V_INTRIN / V_REDUCE_INTRIN for the vector part and SCALAR_OP for the
// leftovers. The accumulator is seeded from the value already held in
// op1[0]; a call with len < 1 leaves op1[0] untouched.
// Note: the enclosing `#if NPY_SIMD && (!1 || (0 && 1))` is always false, so
// this instantiation is never compiled.
static inline void
simd_reduce_c_minp_s16(const npyv_lanetype_s16 *ip, npyv_lanetype_s16 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_s16;
    const int block = lanes * 8;
    npyv_s16 vacc = npyv_setall_s16(*op1);
    // Main loop: eight vectors per pass, combined pairwise before they are
    // merged into the accumulator.
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_s16 a0 = npyv_load_s16(ip);
        npyv_s16 a1 = npyv_load_s16(ip + lanes);
        npyv_s16 a2 = npyv_load_s16(ip + lanes * 2);
        npyv_s16 a3 = npyv_load_s16(ip + lanes * 3);
        npyv_s16 a4 = npyv_load_s16(ip + lanes * 4);
        npyv_s16 a5 = npyv_load_s16(ip + lanes * 5);
        npyv_s16 a6 = npyv_load_s16(ip + lanes * 6);
        npyv_s16 a7 = npyv_load_s16(ip + lanes * 7);

        npyv_s16 q0 = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
        npyv_s16 q1 = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
        vacc = V_INTRIN(vacc, V_INTRIN(q0, q1));

        ip += block;
        len -= block;
    }
    // One vector at a time while a full vector is still available.
    while (len >= lanes) {
        npyv_s16 v = npyv_load_s16(ip);
        vacc = V_INTRIN(vacc, v);
        ip += lanes;
        len -= lanes;
    }
    // Collapse the lanes, then fold in the elements that did not fill a vector.
    npyv_lanetype_s16 result = V_REDUCE_INTRIN(vacc);
    for (npy_intp k = 0; k < len; ++k) {
        const npyv_lanetype_s16 item = ip[k];
        result = SCALAR_OP(result, item);
    }
    *op1 = result;
}

// Element-wise binary op over contiguous buffers:
// op1[k] = op(ip1[k], ip2[k]) for every k in [0, len), with V_INTRIN doing
// the vector work and SCALAR_OP the remainder.
// Note: the enclosing `#if NPY_SIMD && (!1 || (0 && 1))` is always false, so
// this instantiation is never compiled.
static inline void
simd_binary_ccc_minp_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *ip2,
                                     npyv_lanetype_s16 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_s16;
#if NPY_SIMD_WIDTH == 128
    // 128-bit vectors: six per pass, the unroll factor that gave the best
    // results on Apple M1
    const int block = 6 * lanes;
#else
    // wider vectors: two per pass, to avoid a memory bandwidth bottleneck
    const int block = 2 * lanes;
#endif
    npy_intp pos = 0;

    // Unrolled loop; every load of a pass is issued before its first store.
    while (pos + block <= len) {
        const npyv_lanetype_s16 *lhs = ip1 + pos;
        const npyv_lanetype_s16 *rhs = ip2 + pos;
        npyv_lanetype_s16 *dst = op1 + pos;

        npyv_s16 a0 = npyv_load_s16(lhs);
        npyv_s16 a1 = npyv_load_s16(lhs + lanes);
        npyv_s16 b0 = npyv_load_s16(rhs);
        npyv_s16 b1 = npyv_load_s16(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s16 a2 = npyv_load_s16(lhs + 2 * lanes);
        npyv_s16 a3 = npyv_load_s16(lhs + 3 * lanes);
        npyv_s16 a4 = npyv_load_s16(lhs + 4 * lanes);
        npyv_s16 a5 = npyv_load_s16(lhs + 5 * lanes);
        npyv_s16 b2 = npyv_load_s16(rhs + 2 * lanes);
        npyv_s16 b3 = npyv_load_s16(rhs + 3 * lanes);
        npyv_s16 b4 = npyv_load_s16(rhs + 4 * lanes);
        npyv_s16 b5 = npyv_load_s16(rhs + 5 * lanes);
    #endif
        npyv_s16 r0 = V_INTRIN(a0, b0);
        npyv_s16 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_s16 r2 = V_INTRIN(a2, b2);
        npyv_s16 r3 = V_INTRIN(a3, b3);
        npyv_s16 r4 = V_INTRIN(a4, b4);
        npyv_s16 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_s16(dst, r0);
        npyv_store_s16(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_s16(dst + 2 * lanes, r2);
        npyv_store_s16(dst + 3 * lanes, r3);
        npyv_store_s16(dst + 4 * lanes, r4);
        npyv_store_s16(dst + 5 * lanes, r5);
    #endif
        pos += block;
    }
    // Single-vector loop for what the unrolled loop left behind.
    while (pos + lanes <= len) {
        npyv_s16 a = npyv_load_s16(ip1 + pos);
        npyv_s16 b = npyv_load_s16(ip2 + pos);
        npyv_s16 r = V_INTRIN(a, b);
        npyv_store_s16(op1 + pos, r);
        pos += lanes;
    }
    // Scalar tail: fewer elements than one vector remain.
    while (pos < len) {
        const npyv_lanetype_s16 x = ip1[pos];
        const npyv_lanetype_s16 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// Arm (NEON) is excluded by the guard above because unrolled scalars are
// faster than non-contiguous vector load/store there. The leading `0 &&` in
// that guard also means this instantiation is never compiled.
//
// Strided element-wise binary op: walks ip1/ip2/op1 with the element strides
// sip1/sip2/sop1 and writes op(*ip1, *ip2) to *op1 for `len` elements.
static inline void
simd_binary_minp_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1,
                           const npyv_lanetype_s16 *ip2, npy_intp sip2,
                                 npyv_lanetype_s16 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_s16;
    while (len >= lanes) {
        npyv_s16 lhs, rhs;
        // A stride of 1 takes the contiguous load/store; anything else goes
        // through the strided (loadn/storen) variants.
        if (sip1 != 1) {
            lhs = npyv_loadn_s16(ip1, sip1);
        } else {
            lhs = npyv_load_s16(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_s16(ip2, sip2);
        } else {
            rhs = npyv_load_s16(ip2);
        }
        npyv_s16 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_s16(op1, sop1, res);
        } else {
            npyv_store_s16(op1, res);
        }
        len -= lanes;
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
    }
    // Scalar tail: fewer elements than one vector remain.
    while (len > 0) {
        const npyv_lanetype_s16 x = *ip1;
        const npyv_lanetype_s16 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP


#line 106
#line 110
#define SCALAR_OP scalar_max_i
#if NPY_SIMD && (!0 || (0 && 0))

#if 0 && !0
    #define V_INTRIN npyv_maxn_u16 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxn_u16
#else
    #define V_INTRIN npyv_max_u16
    #define V_REDUCE_INTRIN npyv_reduce_max_u16
#endif

// Contiguous maximum reduction: folds `len` elements starting at `ip` into
// op1[0], using V_INTRIN / V_REDUCE_INTRIN for the vector part and SCALAR_OP
// for the leftovers. The accumulator is seeded from the value already held
// in op1[0]; a call with len < 1 leaves op1[0] untouched.
static inline void
simd_reduce_c_max_u16(const npyv_lanetype_u16 *ip, npyv_lanetype_u16 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_u16;
    const int block = lanes * 8;
    npyv_u16 vacc = npyv_setall_u16(*op1);
    // Main loop: eight vectors per pass, combined pairwise before they are
    // merged into the accumulator.
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_u16 a0 = npyv_load_u16(ip);
        npyv_u16 a1 = npyv_load_u16(ip + lanes);
        npyv_u16 a2 = npyv_load_u16(ip + lanes * 2);
        npyv_u16 a3 = npyv_load_u16(ip + lanes * 3);
        npyv_u16 a4 = npyv_load_u16(ip + lanes * 4);
        npyv_u16 a5 = npyv_load_u16(ip + lanes * 5);
        npyv_u16 a6 = npyv_load_u16(ip + lanes * 6);
        npyv_u16 a7 = npyv_load_u16(ip + lanes * 7);

        npyv_u16 q0 = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
        npyv_u16 q1 = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
        vacc = V_INTRIN(vacc, V_INTRIN(q0, q1));

        ip += block;
        len -= block;
    }
    // One vector at a time while a full vector is still available.
    while (len >= lanes) {
        npyv_u16 v = npyv_load_u16(ip);
        vacc = V_INTRIN(vacc, v);
        ip += lanes;
        len -= lanes;
    }
    // Collapse the lanes, then fold in the elements that did not fill a vector.
    npyv_lanetype_u16 result = V_REDUCE_INTRIN(vacc);
    for (npy_intp k = 0; k < len; ++k) {
        const npyv_lanetype_u16 item = ip[k];
        result = SCALAR_OP(result, item);
    }
    *op1 = result;
}

// Element-wise maximum over contiguous buffers:
// op1[k] = max(ip1[k], ip2[k]) for every k in [0, len), with V_INTRIN doing
// the vector work and SCALAR_OP the remainder.
static inline void
simd_binary_ccc_max_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *ip2,
                                     npyv_lanetype_u16 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_u16;
#if NPY_SIMD_WIDTH == 128
    // 128-bit vectors: six per pass, the unroll factor that gave the best
    // results on Apple M1
    const int block = 6 * lanes;
#else
    // wider vectors: two per pass, to avoid a memory bandwidth bottleneck
    const int block = 2 * lanes;
#endif
    npy_intp pos = 0;

    // Unrolled loop; every load of a pass is issued before its first store.
    while (pos + block <= len) {
        const npyv_lanetype_u16 *lhs = ip1 + pos;
        const npyv_lanetype_u16 *rhs = ip2 + pos;
        npyv_lanetype_u16 *dst = op1 + pos;

        npyv_u16 a0 = npyv_load_u16(lhs);
        npyv_u16 a1 = npyv_load_u16(lhs + lanes);
        npyv_u16 b0 = npyv_load_u16(rhs);
        npyv_u16 b1 = npyv_load_u16(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u16 a2 = npyv_load_u16(lhs + 2 * lanes);
        npyv_u16 a3 = npyv_load_u16(lhs + 3 * lanes);
        npyv_u16 a4 = npyv_load_u16(lhs + 4 * lanes);
        npyv_u16 a5 = npyv_load_u16(lhs + 5 * lanes);
        npyv_u16 b2 = npyv_load_u16(rhs + 2 * lanes);
        npyv_u16 b3 = npyv_load_u16(rhs + 3 * lanes);
        npyv_u16 b4 = npyv_load_u16(rhs + 4 * lanes);
        npyv_u16 b5 = npyv_load_u16(rhs + 5 * lanes);
    #endif
        npyv_u16 r0 = V_INTRIN(a0, b0);
        npyv_u16 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_u16 r2 = V_INTRIN(a2, b2);
        npyv_u16 r3 = V_INTRIN(a3, b3);
        npyv_u16 r4 = V_INTRIN(a4, b4);
        npyv_u16 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_u16(dst, r0);
        npyv_store_u16(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_u16(dst + 2 * lanes, r2);
        npyv_store_u16(dst + 3 * lanes, r3);
        npyv_store_u16(dst + 4 * lanes, r4);
        npyv_store_u16(dst + 5 * lanes, r5);
    #endif
        pos += block;
    }
    // Single-vector loop for what the unrolled loop left behind.
    while (pos + lanes <= len) {
        npyv_u16 a = npyv_load_u16(ip1 + pos);
        npyv_u16 b = npyv_load_u16(ip2 + pos);
        npyv_u16 r = V_INTRIN(a, b);
        npyv_store_u16(op1 + pos, r);
        pos += lanes;
    }
    // Scalar tail: fewer elements than one vector remain.
    while (pos < len) {
        const npyv_lanetype_u16 x = ip1[pos];
        const npyv_lanetype_u16 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// Arm (NEON) is excluded by the guard above because unrolled scalars are
// faster than non-contiguous vector load/store there. The leading `0 &&` in
// that guard also means this instantiation is never compiled.
//
// Strided element-wise maximum: walks ip1/ip2/op1 with the element strides
// sip1/sip2/sop1 and writes max(*ip1, *ip2) to *op1 for `len` elements.
static inline void
simd_binary_max_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1,
                           const npyv_lanetype_u16 *ip2, npy_intp sip2,
                                 npyv_lanetype_u16 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_u16;
    while (len >= lanes) {
        npyv_u16 lhs, rhs;
        // A stride of 1 takes the contiguous load/store; anything else goes
        // through the strided (loadn/storen) variants.
        if (sip1 != 1) {
            lhs = npyv_loadn_u16(ip1, sip1);
        } else {
            lhs = npyv_load_u16(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_u16(ip2, sip2);
        } else {
            rhs = npyv_load_u16(ip2);
        }
        npyv_u16 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_u16(op1, sop1, res);
        } else {
            npyv_store_u16(op1, res);
        }
        len -= lanes;
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
    }
    // Scalar tail: fewer elements than one vector remain.
    while (len > 0) {
        const npyv_lanetype_u16 x = *ip1;
        const npyv_lanetype_u16 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_min_i
#if NPY_SIMD && (!0 || (0 && 0))

#if 0 && !0
    #define V_INTRIN npyv_minn_u16 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minn_u16
#else
    #define V_INTRIN npyv_min_u16
    #define V_REDUCE_INTRIN npyv_reduce_min_u16
#endif

// Contiguous minimum reduction: folds `len` elements starting at `ip` into
// op1[0], using V_INTRIN / V_REDUCE_INTRIN for the vector part and SCALAR_OP
// for the leftovers. The accumulator is seeded from the value already held
// in op1[0]; a call with len < 1 leaves op1[0] untouched.
static inline void
simd_reduce_c_min_u16(const npyv_lanetype_u16 *ip, npyv_lanetype_u16 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_u16;
    const int block = lanes * 8;
    npyv_u16 vacc = npyv_setall_u16(*op1);
    // Main loop: eight vectors per pass, combined pairwise before they are
    // merged into the accumulator.
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_u16 a0 = npyv_load_u16(ip);
        npyv_u16 a1 = npyv_load_u16(ip + lanes);
        npyv_u16 a2 = npyv_load_u16(ip + lanes * 2);
        npyv_u16 a3 = npyv_load_u16(ip + lanes * 3);
        npyv_u16 a4 = npyv_load_u16(ip + lanes * 4);
        npyv_u16 a5 = npyv_load_u16(ip + lanes * 5);
        npyv_u16 a6 = npyv_load_u16(ip + lanes * 6);
        npyv_u16 a7 = npyv_load_u16(ip + lanes * 7);

        npyv_u16 q0 = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
        npyv_u16 q1 = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
        vacc = V_INTRIN(vacc, V_INTRIN(q0, q1));

        ip += block;
        len -= block;
    }
    // One vector at a time while a full vector is still available.
    while (len >= lanes) {
        npyv_u16 v = npyv_load_u16(ip);
        vacc = V_INTRIN(vacc, v);
        ip += lanes;
        len -= lanes;
    }
    // Collapse the lanes, then fold in the elements that did not fill a vector.
    npyv_lanetype_u16 result = V_REDUCE_INTRIN(vacc);
    for (npy_intp k = 0; k < len; ++k) {
        const npyv_lanetype_u16 item = ip[k];
        result = SCALAR_OP(result, item);
    }
    *op1 = result;
}

// Element-wise minimum over contiguous buffers:
// op1[k] = min(ip1[k], ip2[k]) for every k in [0, len), with V_INTRIN doing
// the vector work and SCALAR_OP the remainder.
static inline void
simd_binary_ccc_min_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *ip2,
                                     npyv_lanetype_u16 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_u16;
#if NPY_SIMD_WIDTH == 128
    // 128-bit vectors: six per pass, the unroll factor that gave the best
    // results on Apple M1
    const int block = 6 * lanes;
#else
    // wider vectors: two per pass, to avoid a memory bandwidth bottleneck
    const int block = 2 * lanes;
#endif
    npy_intp pos = 0;

    // Unrolled loop; every load of a pass is issued before its first store.
    while (pos + block <= len) {
        const npyv_lanetype_u16 *lhs = ip1 + pos;
        const npyv_lanetype_u16 *rhs = ip2 + pos;
        npyv_lanetype_u16 *dst = op1 + pos;

        npyv_u16 a0 = npyv_load_u16(lhs);
        npyv_u16 a1 = npyv_load_u16(lhs + lanes);
        npyv_u16 b0 = npyv_load_u16(rhs);
        npyv_u16 b1 = npyv_load_u16(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u16 a2 = npyv_load_u16(lhs + 2 * lanes);
        npyv_u16 a3 = npyv_load_u16(lhs + 3 * lanes);
        npyv_u16 a4 = npyv_load_u16(lhs + 4 * lanes);
        npyv_u16 a5 = npyv_load_u16(lhs + 5 * lanes);
        npyv_u16 b2 = npyv_load_u16(rhs + 2 * lanes);
        npyv_u16 b3 = npyv_load_u16(rhs + 3 * lanes);
        npyv_u16 b4 = npyv_load_u16(rhs + 4 * lanes);
        npyv_u16 b5 = npyv_load_u16(rhs + 5 * lanes);
    #endif
        npyv_u16 r0 = V_INTRIN(a0, b0);
        npyv_u16 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_u16 r2 = V_INTRIN(a2, b2);
        npyv_u16 r3 = V_INTRIN(a3, b3);
        npyv_u16 r4 = V_INTRIN(a4, b4);
        npyv_u16 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_u16(dst, r0);
        npyv_store_u16(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_u16(dst + 2 * lanes, r2);
        npyv_store_u16(dst + 3 * lanes, r3);
        npyv_store_u16(dst + 4 * lanes, r4);
        npyv_store_u16(dst + 5 * lanes, r5);
    #endif
        pos += block;
    }
    // Single-vector loop for what the unrolled loop left behind.
    while (pos + lanes <= len) {
        npyv_u16 a = npyv_load_u16(ip1 + pos);
        npyv_u16 b = npyv_load_u16(ip2 + pos);
        npyv_u16 r = V_INTRIN(a, b);
        npyv_store_u16(op1 + pos, r);
        pos += lanes;
    }
    // Scalar tail: fewer elements than one vector remain.
    while (pos < len) {
        const npyv_lanetype_u16 x = ip1[pos];
        const npyv_lanetype_u16 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// Arm (NEON) is excluded by the guard above because unrolled scalars are
// faster than non-contiguous vector load/store there. The leading `0 &&` in
// that guard also means this instantiation is never compiled.
//
// Strided element-wise minimum: walks ip1/ip2/op1 with the element strides
// sip1/sip2/sop1 and writes min(*ip1, *ip2) to *op1 for `len` elements.
static inline void
simd_binary_min_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1,
                           const npyv_lanetype_u16 *ip2, npy_intp sip2,
                                 npyv_lanetype_u16 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_u16;
    while (len >= lanes) {
        npyv_u16 lhs, rhs;
        // A stride of 1 takes the contiguous load/store; anything else goes
        // through the strided (loadn/storen) variants.
        if (sip1 != 1) {
            lhs = npyv_loadn_u16(ip1, sip1);
        } else {
            lhs = npyv_load_u16(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_u16(ip2, sip2);
        } else {
            rhs = npyv_load_u16(ip2);
        }
        npyv_u16 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_u16(op1, sop1, res);
        } else {
            npyv_store_u16(op1, res);
        }
        len -= lanes;
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
    }
    // Scalar tail: fewer elements than one vector remain.
    while (len > 0) {
        const npyv_lanetype_u16 x = *ip1;
        const npyv_lanetype_u16 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_maxp_i
#if NPY_SIMD && (!1 || (0 && 1))

#if 0 && !1
    #define V_INTRIN npyv_maxpn_u16 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxpn_u16
#else
    #define V_INTRIN npyv_maxp_u16
    #define V_REDUCE_INTRIN npyv_reduce_maxp_u16
#endif

// Contiguous reduction: folds `len` elements starting at `ip` into op1[0]
// using V_INTRIN / V_REDUCE_INTRIN for the vector part and SCALAR_OP for the
// leftovers. The accumulator is seeded from the value already held in
// op1[0]; a call with len < 1 leaves op1[0] untouched.
// Note: the enclosing `#if NPY_SIMD && (!1 || (0 && 1))` is always false, so
// this instantiation is never compiled.
static inline void
simd_reduce_c_maxp_u16(const npyv_lanetype_u16 *ip, npyv_lanetype_u16 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_u16;
    const int block = lanes * 8;
    npyv_u16 vacc = npyv_setall_u16(*op1);
    // Main loop: eight vectors per pass, combined pairwise before they are
    // merged into the accumulator.
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_u16 a0 = npyv_load_u16(ip);
        npyv_u16 a1 = npyv_load_u16(ip + lanes);
        npyv_u16 a2 = npyv_load_u16(ip + lanes * 2);
        npyv_u16 a3 = npyv_load_u16(ip + lanes * 3);
        npyv_u16 a4 = npyv_load_u16(ip + lanes * 4);
        npyv_u16 a5 = npyv_load_u16(ip + lanes * 5);
        npyv_u16 a6 = npyv_load_u16(ip + lanes * 6);
        npyv_u16 a7 = npyv_load_u16(ip + lanes * 7);

        npyv_u16 q0 = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
        npyv_u16 q1 = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
        vacc = V_INTRIN(vacc, V_INTRIN(q0, q1));

        ip += block;
        len -= block;
    }
    // One vector at a time while a full vector is still available.
    while (len >= lanes) {
        npyv_u16 v = npyv_load_u16(ip);
        vacc = V_INTRIN(vacc, v);
        ip += lanes;
        len -= lanes;
    }
    // Collapse the lanes, then fold in the elements that did not fill a vector.
    npyv_lanetype_u16 result = V_REDUCE_INTRIN(vacc);
    for (npy_intp k = 0; k < len; ++k) {
        const npyv_lanetype_u16 item = ip[k];
        result = SCALAR_OP(result, item);
    }
    *op1 = result;
}

// Element-wise binary op over contiguous buffers:
// op1[k] = op(ip1[k], ip2[k]) for every k in [0, len), with V_INTRIN doing
// the vector work and SCALAR_OP the remainder.
// Note: the enclosing `#if NPY_SIMD && (!1 || (0 && 1))` is always false, so
// this instantiation is never compiled.
static inline void
simd_binary_ccc_maxp_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *ip2,
                                     npyv_lanetype_u16 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_u16;
#if NPY_SIMD_WIDTH == 128
    // 128-bit vectors: six per pass, the unroll factor that gave the best
    // results on Apple M1
    const int block = 6 * lanes;
#else
    // wider vectors: two per pass, to avoid a memory bandwidth bottleneck
    const int block = 2 * lanes;
#endif
    npy_intp pos = 0;

    // Unrolled loop; every load of a pass is issued before its first store.
    while (pos + block <= len) {
        const npyv_lanetype_u16 *lhs = ip1 + pos;
        const npyv_lanetype_u16 *rhs = ip2 + pos;
        npyv_lanetype_u16 *dst = op1 + pos;

        npyv_u16 a0 = npyv_load_u16(lhs);
        npyv_u16 a1 = npyv_load_u16(lhs + lanes);
        npyv_u16 b0 = npyv_load_u16(rhs);
        npyv_u16 b1 = npyv_load_u16(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u16 a2 = npyv_load_u16(lhs + 2 * lanes);
        npyv_u16 a3 = npyv_load_u16(lhs + 3 * lanes);
        npyv_u16 a4 = npyv_load_u16(lhs + 4 * lanes);
        npyv_u16 a5 = npyv_load_u16(lhs + 5 * lanes);
        npyv_u16 b2 = npyv_load_u16(rhs + 2 * lanes);
        npyv_u16 b3 = npyv_load_u16(rhs + 3 * lanes);
        npyv_u16 b4 = npyv_load_u16(rhs + 4 * lanes);
        npyv_u16 b5 = npyv_load_u16(rhs + 5 * lanes);
    #endif
        npyv_u16 r0 = V_INTRIN(a0, b0);
        npyv_u16 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_u16 r2 = V_INTRIN(a2, b2);
        npyv_u16 r3 = V_INTRIN(a3, b3);
        npyv_u16 r4 = V_INTRIN(a4, b4);
        npyv_u16 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_u16(dst, r0);
        npyv_store_u16(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_u16(dst + 2 * lanes, r2);
        npyv_store_u16(dst + 3 * lanes, r3);
        npyv_store_u16(dst + 4 * lanes, r4);
        npyv_store_u16(dst + 5 * lanes, r5);
    #endif
        pos += block;
    }
    // Single-vector loop for what the unrolled loop left behind.
    while (pos + lanes <= len) {
        npyv_u16 a = npyv_load_u16(ip1 + pos);
        npyv_u16 b = npyv_load_u16(ip2 + pos);
        npyv_u16 r = V_INTRIN(a, b);
        npyv_store_u16(op1 + pos, r);
        pos += lanes;
    }
    // Scalar tail: fewer elements than one vector remain.
    while (pos < len) {
        const npyv_lanetype_u16 x = ip1[pos];
        const npyv_lanetype_u16 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// Arm (NEON) is excluded by the guard above because unrolled scalars are
// faster than non-contiguous vector load/store there. The leading `0 &&` in
// that guard also means this instantiation is never compiled.
//
// Strided element-wise binary op: walks ip1/ip2/op1 with the element strides
// sip1/sip2/sop1 and writes op(*ip1, *ip2) to *op1 for `len` elements.
static inline void
simd_binary_maxp_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1,
                           const npyv_lanetype_u16 *ip2, npy_intp sip2,
                                 npyv_lanetype_u16 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_u16;
    while (len >= lanes) {
        npyv_u16 lhs, rhs;
        // A stride of 1 takes the contiguous load/store; anything else goes
        // through the strided (loadn/storen) variants.
        if (sip1 != 1) {
            lhs = npyv_loadn_u16(ip1, sip1);
        } else {
            lhs = npyv_load_u16(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_u16(ip2, sip2);
        } else {
            rhs = npyv_load_u16(ip2);
        }
        npyv_u16 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_u16(op1, sop1, res);
        } else {
            npyv_store_u16(op1, res);
        }
        len -= lanes;
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
    }
    // Scalar tail: fewer elements than one vector remain.
    while (len > 0) {
        const npyv_lanetype_u16 x = *ip1;
        const npyv_lanetype_u16 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_minp_i
#if NPY_SIMD && (!1 || (0 && 1))

#if 0 && !1
    #define V_INTRIN npyv_minpn_u16 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minpn_u16
#else
    #define V_INTRIN npyv_minp_u16
    #define V_REDUCE_INTRIN npyv_reduce_minp_u16
#endif

// Contiguous reduction: folds `len` elements starting at `ip` into op1[0]
// using V_INTRIN / V_REDUCE_INTRIN for the vector part and SCALAR_OP for the
// leftovers. The accumulator is seeded from the value already held in
// op1[0]; a call with len < 1 leaves op1[0] untouched.
// Note: the enclosing `#if NPY_SIMD && (!1 || (0 && 1))` is always false, so
// this instantiation is never compiled.
static inline void
simd_reduce_c_minp_u16(const npyv_lanetype_u16 *ip, npyv_lanetype_u16 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_u16;
    const int block = lanes * 8;
    npyv_u16 vacc = npyv_setall_u16(*op1);
    // Main loop: eight vectors per pass, combined pairwise before they are
    // merged into the accumulator.
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_u16 a0 = npyv_load_u16(ip);
        npyv_u16 a1 = npyv_load_u16(ip + lanes);
        npyv_u16 a2 = npyv_load_u16(ip + lanes * 2);
        npyv_u16 a3 = npyv_load_u16(ip + lanes * 3);
        npyv_u16 a4 = npyv_load_u16(ip + lanes * 4);
        npyv_u16 a5 = npyv_load_u16(ip + lanes * 5);
        npyv_u16 a6 = npyv_load_u16(ip + lanes * 6);
        npyv_u16 a7 = npyv_load_u16(ip + lanes * 7);

        npyv_u16 q0 = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
        npyv_u16 q1 = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
        vacc = V_INTRIN(vacc, V_INTRIN(q0, q1));

        ip += block;
        len -= block;
    }
    // One vector at a time while a full vector is still available.
    while (len >= lanes) {
        npyv_u16 v = npyv_load_u16(ip);
        vacc = V_INTRIN(vacc, v);
        ip += lanes;
        len -= lanes;
    }
    // Collapse the lanes, then fold in the elements that did not fill a vector.
    npyv_lanetype_u16 result = V_REDUCE_INTRIN(vacc);
    for (npy_intp k = 0; k < len; ++k) {
        const npyv_lanetype_u16 item = ip[k];
        result = SCALAR_OP(result, item);
    }
    *op1 = result;
}

// Element-wise binary op over contiguous buffers:
// op1[k] = op(ip1[k], ip2[k]) for every k in [0, len), with V_INTRIN doing
// the vector work and SCALAR_OP the remainder.
// Note: the enclosing `#if NPY_SIMD && (!1 || (0 && 1))` is always false, so
// this instantiation is never compiled.
static inline void
simd_binary_ccc_minp_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *ip2,
                                     npyv_lanetype_u16 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_u16;
#if NPY_SIMD_WIDTH == 128
    // 128-bit vectors: six per pass, the unroll factor that gave the best
    // results on Apple M1
    const int block = 6 * lanes;
#else
    // wider vectors: two per pass, to avoid a memory bandwidth bottleneck
    const int block = 2 * lanes;
#endif
    npy_intp pos = 0;

    // Unrolled loop; every load of a pass is issued before its first store.
    while (pos + block <= len) {
        const npyv_lanetype_u16 *lhs = ip1 + pos;
        const npyv_lanetype_u16 *rhs = ip2 + pos;
        npyv_lanetype_u16 *dst = op1 + pos;

        npyv_u16 a0 = npyv_load_u16(lhs);
        npyv_u16 a1 = npyv_load_u16(lhs + lanes);
        npyv_u16 b0 = npyv_load_u16(rhs);
        npyv_u16 b1 = npyv_load_u16(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u16 a2 = npyv_load_u16(lhs + 2 * lanes);
        npyv_u16 a3 = npyv_load_u16(lhs + 3 * lanes);
        npyv_u16 a4 = npyv_load_u16(lhs + 4 * lanes);
        npyv_u16 a5 = npyv_load_u16(lhs + 5 * lanes);
        npyv_u16 b2 = npyv_load_u16(rhs + 2 * lanes);
        npyv_u16 b3 = npyv_load_u16(rhs + 3 * lanes);
        npyv_u16 b4 = npyv_load_u16(rhs + 4 * lanes);
        npyv_u16 b5 = npyv_load_u16(rhs + 5 * lanes);
    #endif
        npyv_u16 r0 = V_INTRIN(a0, b0);
        npyv_u16 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_u16 r2 = V_INTRIN(a2, b2);
        npyv_u16 r3 = V_INTRIN(a3, b3);
        npyv_u16 r4 = V_INTRIN(a4, b4);
        npyv_u16 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_u16(dst, r0);
        npyv_store_u16(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_u16(dst + 2 * lanes, r2);
        npyv_store_u16(dst + 3 * lanes, r3);
        npyv_store_u16(dst + 4 * lanes, r4);
        npyv_store_u16(dst + 5 * lanes, r5);
    #endif
        pos += block;
    }
    // Single-vector loop for what the unrolled loop left behind.
    while (pos + lanes <= len) {
        npyv_u16 a = npyv_load_u16(ip1 + pos);
        npyv_u16 b = npyv_load_u16(ip2 + pos);
        npyv_u16 r = V_INTRIN(a, b);
        npyv_store_u16(op1 + pos, r);
        pos += lanes;
    }
    // Scalar tail: fewer elements than one vector remain.
    while (pos < len) {
        const npyv_lanetype_u16 x = ip1[pos];
        const npyv_lanetype_u16 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// Arm (NEON) is excluded by the guard above because unrolled scalars are
// faster than non-contiguous vector load/store there. The leading `0 &&` in
// that guard also means this instantiation is never compiled.
//
// Strided element-wise binary op: walks ip1/ip2/op1 with the element strides
// sip1/sip2/sop1 and writes op(*ip1, *ip2) to *op1 for `len` elements.
static inline void
simd_binary_minp_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1,
                           const npyv_lanetype_u16 *ip2, npy_intp sip2,
                                 npyv_lanetype_u16 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_u16;
    while (len >= lanes) {
        npyv_u16 lhs, rhs;
        // A stride of 1 takes the contiguous load/store; anything else goes
        // through the strided (loadn/storen) variants.
        if (sip1 != 1) {
            lhs = npyv_loadn_u16(ip1, sip1);
        } else {
            lhs = npyv_load_u16(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_u16(ip2, sip2);
        } else {
            rhs = npyv_load_u16(ip2);
        }
        npyv_u16 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_u16(op1, sop1, res);
        } else {
            npyv_store_u16(op1, res);
        }
        len -= lanes;
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
    }
    // Scalar tail: fewer elements than one vector remain.
    while (len > 0) {
        const npyv_lanetype_u16 x = *ip1;
        const npyv_lanetype_u16 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP


#line 106
#line 110
#define SCALAR_OP scalar_max_i
#if NPY_SIMD && (!0 || (0 && 0))

// Pick the vector intrinsics for the signed 32-bit "max" instantiation.
// The condition below is the constant 0 (`0 && !0`), so the #else branch is
// the one that takes effect.
#if 0 && !0
    #define V_INTRIN npyv_maxn_s32 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxn_s32
#else
    // V_INTRIN: element-wise vector operation used by the loops below;
    // V_REDUCE_INTRIN: collapses one vector into a single lane value.
    #define V_INTRIN npyv_max_s32
    #define V_REDUCE_INTRIN npyv_reduce_max_s32
#endif

/*
 * Reduction over a contiguous input (maximum for this instantiation).
 *
 * Folds the `len` elements starting at `ip` into op1[0]. The accumulator is
 * seeded from the value already stored in op1[0], whole vectors are combined
 * with V_INTRIN, the lanes are collapsed with V_REDUCE_INTRIN, and leftover
 * elements go through SCALAR_OP. Returns without touching op1 when len < 1.
 */
static inline void
simd_reduce_c_max_s32(const npyv_lanetype_s32 *ip, npyv_lanetype_s32 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_s32;
    // the wide loop below consumes eight vectors per pass
    const int block = lanes * 8;
    // every lane starts out holding the current output value
    npyv_s32 best = npyv_setall_s32(op1[0]);
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_s32 a0 = npyv_load_s32(ip);
        npyv_s32 a1 = npyv_load_s32(ip + lanes);
        npyv_s32 a2 = npyv_load_s32(ip + lanes * 2);
        npyv_s32 a3 = npyv_load_s32(ip + lanes * 3);
        npyv_s32 a4 = npyv_load_s32(ip + lanes * 4);
        npyv_s32 a5 = npyv_load_s32(ip + lanes * 5);
        npyv_s32 a6 = npyv_load_s32(ip + lanes * 6);
        npyv_s32 a7 = npyv_load_s32(ip + lanes * 7);

        // pairwise tree over the eight vectors, then fold into the accumulator
        npyv_s32 lower = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
        npyv_s32 upper = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
        best = V_INTRIN(best, V_INTRIN(lower, upper));

        len -= block;
        ip += block;
    }
    // one vector at a time
    while (len >= lanes) {
        best = V_INTRIN(best, npyv_load_s32(ip));
        len -= lanes;
        ip += lanes;
    }
    npyv_lanetype_s32 result = V_REDUCE_INTRIN(best);
    // elements that do not fill a whole vector
    while (len > 0) {
        const npyv_lanetype_s32 cur = *ip;
        result = SCALAR_OP(result, cur);
        --len;
        ++ip;
    }
    op1[0] = result;
}

/*
 * Binary loop over contiguous inputs and output (maximum for this
 * instantiation): op1[k] = op(ip1[k], ip2[k]) for k in [0, len).
 *
 * Runs an unrolled multi-vector loop first, then one vector at a time, then
 * SCALAR_OP for whatever is left.
 */
static inline void
simd_binary_ccc_max_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *ip2,
                                     npyv_lanetype_s32 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_s32;
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int chunk = unroll * lanes;
    npy_intp pos = 0;

    while (pos + chunk <= len) {
        const npyv_lanetype_s32 *lhs = ip1 + pos;
        const npyv_lanetype_s32 *rhs = ip2 + pos;
        npyv_lanetype_s32 *dst = op1 + pos;
        // every load of the pass is issued before the first store
        npyv_s32 a0 = npyv_load_s32(lhs);
        npyv_s32 b0 = npyv_load_s32(rhs);
        npyv_s32 a1 = npyv_load_s32(lhs + lanes);
        npyv_s32 b1 = npyv_load_s32(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s32 a2 = npyv_load_s32(lhs + lanes * 2);
        npyv_s32 b2 = npyv_load_s32(rhs + lanes * 2);
        npyv_s32 a3 = npyv_load_s32(lhs + lanes * 3);
        npyv_s32 b3 = npyv_load_s32(rhs + lanes * 3);
        npyv_s32 a4 = npyv_load_s32(lhs + lanes * 4);
        npyv_s32 b4 = npyv_load_s32(rhs + lanes * 4);
        npyv_s32 a5 = npyv_load_s32(lhs + lanes * 5);
        npyv_s32 b5 = npyv_load_s32(rhs + lanes * 5);
    #endif
        npyv_s32 r0 = V_INTRIN(a0, b0);
        npyv_s32 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_s32 r2 = V_INTRIN(a2, b2);
        npyv_s32 r3 = V_INTRIN(a3, b3);
        npyv_s32 r4 = V_INTRIN(a4, b4);
        npyv_s32 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_s32(dst, r0);
        npyv_store_s32(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_s32(dst + lanes * 2, r2);
        npyv_store_s32(dst + lanes * 3, r3);
        npyv_store_s32(dst + lanes * 4, r4);
        npyv_store_s32(dst + lanes * 5, r5);
    #endif
        pos += chunk;
    }
    // one vector at a time
    while (pos + lanes <= len) {
        npyv_s32 a = npyv_load_s32(ip1 + pos);
        npyv_s32 b = npyv_load_s32(ip2 + pos);
        npyv_s32 r = V_INTRIN(a, b);
        npyv_store_s32(op1 + pos, r);
        pos += lanes;
    }
    // elements that do not fill a whole vector
    while (pos < len) {
        const npyv_lanetype_s32 x = ip1[pos];
        const npyv_lanetype_s32 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
/*
 * Strided binary loop: *op1 = op(*ip1, *ip2) for `len` elements, where the
 * operation is whatever V_INTRIN / SCALAR_OP expand to for this
 * instantiation. sip1, sip2 and sop1 are strides counted in elements. An
 * operand with stride 1 goes through the contiguous vector load/store, any
 * other stride through npyv_loadn / npyv_storen. Elements that do not fill a
 * whole vector are handled one at a time.
 */
static inline void
simd_binary_max_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1,
                           const npyv_lanetype_s32 *ip2, npy_intp sip2,
                                 npyv_lanetype_s32 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_s32;
    while (len >= lanes) {
        npyv_s32 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_s32(ip1, sip1);
        } else {
            lhs = npyv_load_s32(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_s32(ip2, sip2);
        } else {
            rhs = npyv_load_s32(ip2);
        }
        npyv_s32 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_s32(op1, sop1, res);
        } else {
            npyv_store_s32(op1, res);
        }
        // advance by one full vector worth of elements on every operand
        len -= lanes;
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
    }
    // scalar tail
    while (len > 0) {
        const npyv_lanetype_s32 x = *ip1;
        const npyv_lanetype_s32 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_min_i
#if NPY_SIMD && (!0 || (0 && 0))

// Pick the vector intrinsics for the signed 32-bit "min" instantiation.
// The condition below is the constant 0 (`0 && !0`), so the #else branch is
// the one that takes effect.
#if 0 && !0
    #define V_INTRIN npyv_minn_s32 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minn_s32
#else
    // V_INTRIN: element-wise vector operation used by the loops below;
    // V_REDUCE_INTRIN: collapses one vector into a single lane value.
    #define V_INTRIN npyv_min_s32
    #define V_REDUCE_INTRIN npyv_reduce_min_s32
#endif

/*
 * Reduction over a contiguous input (minimum for this instantiation).
 *
 * Folds the `len` elements starting at `ip` into op1[0]. The accumulator is
 * seeded from the value already stored in op1[0], whole vectors are combined
 * with V_INTRIN, the lanes are collapsed with V_REDUCE_INTRIN, and leftover
 * elements go through SCALAR_OP. Returns without touching op1 when len < 1.
 */
static inline void
simd_reduce_c_min_s32(const npyv_lanetype_s32 *ip, npyv_lanetype_s32 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_s32;
    // the wide loop below consumes eight vectors per pass
    const int block = lanes * 8;
    // every lane starts out holding the current output value
    npyv_s32 best = npyv_setall_s32(op1[0]);
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_s32 a0 = npyv_load_s32(ip);
        npyv_s32 a1 = npyv_load_s32(ip + lanes);
        npyv_s32 a2 = npyv_load_s32(ip + lanes * 2);
        npyv_s32 a3 = npyv_load_s32(ip + lanes * 3);
        npyv_s32 a4 = npyv_load_s32(ip + lanes * 4);
        npyv_s32 a5 = npyv_load_s32(ip + lanes * 5);
        npyv_s32 a6 = npyv_load_s32(ip + lanes * 6);
        npyv_s32 a7 = npyv_load_s32(ip + lanes * 7);

        // pairwise tree over the eight vectors, then fold into the accumulator
        npyv_s32 lower = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
        npyv_s32 upper = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
        best = V_INTRIN(best, V_INTRIN(lower, upper));

        len -= block;
        ip += block;
    }
    // one vector at a time
    while (len >= lanes) {
        best = V_INTRIN(best, npyv_load_s32(ip));
        len -= lanes;
        ip += lanes;
    }
    npyv_lanetype_s32 result = V_REDUCE_INTRIN(best);
    // elements that do not fill a whole vector
    while (len > 0) {
        const npyv_lanetype_s32 cur = *ip;
        result = SCALAR_OP(result, cur);
        --len;
        ++ip;
    }
    op1[0] = result;
}

/*
 * Binary loop over contiguous inputs and output (minimum for this
 * instantiation): op1[k] = op(ip1[k], ip2[k]) for k in [0, len).
 *
 * Runs an unrolled multi-vector loop first, then one vector at a time, then
 * SCALAR_OP for whatever is left.
 */
static inline void
simd_binary_ccc_min_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *ip2,
                                     npyv_lanetype_s32 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_s32;
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int chunk = unroll * lanes;
    npy_intp pos = 0;

    while (pos + chunk <= len) {
        const npyv_lanetype_s32 *lhs = ip1 + pos;
        const npyv_lanetype_s32 *rhs = ip2 + pos;
        npyv_lanetype_s32 *dst = op1 + pos;
        // every load of the pass is issued before the first store
        npyv_s32 a0 = npyv_load_s32(lhs);
        npyv_s32 b0 = npyv_load_s32(rhs);
        npyv_s32 a1 = npyv_load_s32(lhs + lanes);
        npyv_s32 b1 = npyv_load_s32(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s32 a2 = npyv_load_s32(lhs + lanes * 2);
        npyv_s32 b2 = npyv_load_s32(rhs + lanes * 2);
        npyv_s32 a3 = npyv_load_s32(lhs + lanes * 3);
        npyv_s32 b3 = npyv_load_s32(rhs + lanes * 3);
        npyv_s32 a4 = npyv_load_s32(lhs + lanes * 4);
        npyv_s32 b4 = npyv_load_s32(rhs + lanes * 4);
        npyv_s32 a5 = npyv_load_s32(lhs + lanes * 5);
        npyv_s32 b5 = npyv_load_s32(rhs + lanes * 5);
    #endif
        npyv_s32 r0 = V_INTRIN(a0, b0);
        npyv_s32 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_s32 r2 = V_INTRIN(a2, b2);
        npyv_s32 r3 = V_INTRIN(a3, b3);
        npyv_s32 r4 = V_INTRIN(a4, b4);
        npyv_s32 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_s32(dst, r0);
        npyv_store_s32(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_s32(dst + lanes * 2, r2);
        npyv_store_s32(dst + lanes * 3, r3);
        npyv_store_s32(dst + lanes * 4, r4);
        npyv_store_s32(dst + lanes * 5, r5);
    #endif
        pos += chunk;
    }
    // one vector at a time
    while (pos + lanes <= len) {
        npyv_s32 a = npyv_load_s32(ip1 + pos);
        npyv_s32 b = npyv_load_s32(ip2 + pos);
        npyv_s32 r = V_INTRIN(a, b);
        npyv_store_s32(op1 + pos, r);
        pos += lanes;
    }
    // elements that do not fill a whole vector
    while (pos < len) {
        const npyv_lanetype_s32 x = ip1[pos];
        const npyv_lanetype_s32 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
/*
 * Strided binary loop: *op1 = op(*ip1, *ip2) for `len` elements, where the
 * operation is whatever V_INTRIN / SCALAR_OP expand to for this
 * instantiation. sip1, sip2 and sop1 are strides counted in elements. An
 * operand with stride 1 goes through the contiguous vector load/store, any
 * other stride through npyv_loadn / npyv_storen. Elements that do not fill a
 * whole vector are handled one at a time.
 */
static inline void
simd_binary_min_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1,
                           const npyv_lanetype_s32 *ip2, npy_intp sip2,
                                 npyv_lanetype_s32 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_s32;
    while (len >= lanes) {
        npyv_s32 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_s32(ip1, sip1);
        } else {
            lhs = npyv_load_s32(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_s32(ip2, sip2);
        } else {
            rhs = npyv_load_s32(ip2);
        }
        npyv_s32 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_s32(op1, sop1, res);
        } else {
            npyv_store_s32(op1, res);
        }
        // advance by one full vector worth of elements on every operand
        len -= lanes;
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
    }
    // scalar tail
    while (len > 0) {
        const npyv_lanetype_s32 x = *ip1;
        const npyv_lanetype_s32 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_maxp_i
#if NPY_SIMD && (!1 || (0 && 1))

// Pick the vector intrinsics for the signed 32-bit "maxp" instantiation.
// The condition below is the constant 0 (`0 && !1`), so the #else branch is
// the one that would take effect.
#if 0 && !1
    #define V_INTRIN npyv_maxpn_s32 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxpn_s32
#else
    // V_INTRIN: element-wise vector operation used by the loops below;
    // V_REDUCE_INTRIN: collapses one vector into a single lane value.
    #define V_INTRIN npyv_maxp_s32
    #define V_REDUCE_INTRIN npyv_reduce_maxp_s32
#endif

/*
 * Reduction over a contiguous input ("maxp" instantiation).
 *
 * Folds the `len` elements starting at `ip` into op1[0]. The accumulator is
 * seeded from the value already stored in op1[0], whole vectors are combined
 * with V_INTRIN, the lanes are collapsed with V_REDUCE_INTRIN, and leftover
 * elements go through SCALAR_OP. Returns without touching op1 when len < 1.
 */
static inline void
simd_reduce_c_maxp_s32(const npyv_lanetype_s32 *ip, npyv_lanetype_s32 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_s32;
    // the wide loop below consumes eight vectors per pass
    const int block = lanes * 8;
    // every lane starts out holding the current output value
    npyv_s32 best = npyv_setall_s32(op1[0]);
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_s32 a0 = npyv_load_s32(ip);
        npyv_s32 a1 = npyv_load_s32(ip + lanes);
        npyv_s32 a2 = npyv_load_s32(ip + lanes * 2);
        npyv_s32 a3 = npyv_load_s32(ip + lanes * 3);
        npyv_s32 a4 = npyv_load_s32(ip + lanes * 4);
        npyv_s32 a5 = npyv_load_s32(ip + lanes * 5);
        npyv_s32 a6 = npyv_load_s32(ip + lanes * 6);
        npyv_s32 a7 = npyv_load_s32(ip + lanes * 7);

        // pairwise tree over the eight vectors, then fold into the accumulator
        npyv_s32 lower = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
        npyv_s32 upper = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
        best = V_INTRIN(best, V_INTRIN(lower, upper));

        len -= block;
        ip += block;
    }
    // one vector at a time
    while (len >= lanes) {
        best = V_INTRIN(best, npyv_load_s32(ip));
        len -= lanes;
        ip += lanes;
    }
    npyv_lanetype_s32 result = V_REDUCE_INTRIN(best);
    // elements that do not fill a whole vector
    while (len > 0) {
        const npyv_lanetype_s32 cur = *ip;
        result = SCALAR_OP(result, cur);
        --len;
        ++ip;
    }
    op1[0] = result;
}

/*
 * Binary loop over contiguous inputs and output ("maxp" instantiation):
 * op1[k] = op(ip1[k], ip2[k]) for k in [0, len).
 *
 * Runs an unrolled multi-vector loop first, then one vector at a time, then
 * SCALAR_OP for whatever is left.
 */
static inline void
simd_binary_ccc_maxp_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *ip2,
                                     npyv_lanetype_s32 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_s32;
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int chunk = unroll * lanes;
    npy_intp pos = 0;

    while (pos + chunk <= len) {
        const npyv_lanetype_s32 *lhs = ip1 + pos;
        const npyv_lanetype_s32 *rhs = ip2 + pos;
        npyv_lanetype_s32 *dst = op1 + pos;
        // every load of the pass is issued before the first store
        npyv_s32 a0 = npyv_load_s32(lhs);
        npyv_s32 b0 = npyv_load_s32(rhs);
        npyv_s32 a1 = npyv_load_s32(lhs + lanes);
        npyv_s32 b1 = npyv_load_s32(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s32 a2 = npyv_load_s32(lhs + lanes * 2);
        npyv_s32 b2 = npyv_load_s32(rhs + lanes * 2);
        npyv_s32 a3 = npyv_load_s32(lhs + lanes * 3);
        npyv_s32 b3 = npyv_load_s32(rhs + lanes * 3);
        npyv_s32 a4 = npyv_load_s32(lhs + lanes * 4);
        npyv_s32 b4 = npyv_load_s32(rhs + lanes * 4);
        npyv_s32 a5 = npyv_load_s32(lhs + lanes * 5);
        npyv_s32 b5 = npyv_load_s32(rhs + lanes * 5);
    #endif
        npyv_s32 r0 = V_INTRIN(a0, b0);
        npyv_s32 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_s32 r2 = V_INTRIN(a2, b2);
        npyv_s32 r3 = V_INTRIN(a3, b3);
        npyv_s32 r4 = V_INTRIN(a4, b4);
        npyv_s32 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_s32(dst, r0);
        npyv_store_s32(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_s32(dst + lanes * 2, r2);
        npyv_store_s32(dst + lanes * 3, r3);
        npyv_store_s32(dst + lanes * 4, r4);
        npyv_store_s32(dst + lanes * 5, r5);
    #endif
        pos += chunk;
    }
    // one vector at a time
    while (pos + lanes <= len) {
        npyv_s32 a = npyv_load_s32(ip1 + pos);
        npyv_s32 b = npyv_load_s32(ip2 + pos);
        npyv_s32 r = V_INTRIN(a, b);
        npyv_store_s32(op1 + pos, r);
        pos += lanes;
    }
    // elements that do not fill a whole vector
    while (pos < len) {
        const npyv_lanetype_s32 x = ip1[pos];
        const npyv_lanetype_s32 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
/*
 * Strided binary loop: *op1 = op(*ip1, *ip2) for `len` elements, where the
 * operation is whatever V_INTRIN / SCALAR_OP expand to for this
 * instantiation. sip1, sip2 and sop1 are strides counted in elements. An
 * operand with stride 1 goes through the contiguous vector load/store, any
 * other stride through npyv_loadn / npyv_storen. Elements that do not fill a
 * whole vector are handled one at a time.
 */
static inline void
simd_binary_maxp_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1,
                           const npyv_lanetype_s32 *ip2, npy_intp sip2,
                                 npyv_lanetype_s32 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_s32;
    while (len >= lanes) {
        npyv_s32 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_s32(ip1, sip1);
        } else {
            lhs = npyv_load_s32(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_s32(ip2, sip2);
        } else {
            rhs = npyv_load_s32(ip2);
        }
        npyv_s32 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_s32(op1, sop1, res);
        } else {
            npyv_store_s32(op1, res);
        }
        // advance by one full vector worth of elements on every operand
        len -= lanes;
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
    }
    // scalar tail
    while (len > 0) {
        const npyv_lanetype_s32 x = *ip1;
        const npyv_lanetype_s32 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_minp_i
#if NPY_SIMD && (!1 || (0 && 1))

// Pick the vector intrinsics for the signed 32-bit "minp" instantiation.
// The condition below is the constant 0 (`0 && !1`), so the #else branch is
// the one that would take effect.
#if 0 && !1
    #define V_INTRIN npyv_minpn_s32 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minpn_s32
#else
    // V_INTRIN: element-wise vector operation used by the loops below;
    // V_REDUCE_INTRIN: collapses one vector into a single lane value.
    #define V_INTRIN npyv_minp_s32
    #define V_REDUCE_INTRIN npyv_reduce_minp_s32
#endif

/*
 * Reduction over a contiguous input ("minp" instantiation).
 *
 * Folds the `len` elements starting at `ip` into op1[0]. The accumulator is
 * seeded from the value already stored in op1[0], whole vectors are combined
 * with V_INTRIN, the lanes are collapsed with V_REDUCE_INTRIN, and leftover
 * elements go through SCALAR_OP. Returns without touching op1 when len < 1.
 */
static inline void
simd_reduce_c_minp_s32(const npyv_lanetype_s32 *ip, npyv_lanetype_s32 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_s32;
    // the wide loop below consumes eight vectors per pass
    const int block = lanes * 8;
    // every lane starts out holding the current output value
    npyv_s32 best = npyv_setall_s32(op1[0]);
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_s32 a0 = npyv_load_s32(ip);
        npyv_s32 a1 = npyv_load_s32(ip + lanes);
        npyv_s32 a2 = npyv_load_s32(ip + lanes * 2);
        npyv_s32 a3 = npyv_load_s32(ip + lanes * 3);
        npyv_s32 a4 = npyv_load_s32(ip + lanes * 4);
        npyv_s32 a5 = npyv_load_s32(ip + lanes * 5);
        npyv_s32 a6 = npyv_load_s32(ip + lanes * 6);
        npyv_s32 a7 = npyv_load_s32(ip + lanes * 7);

        // pairwise tree over the eight vectors, then fold into the accumulator
        npyv_s32 lower = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
        npyv_s32 upper = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
        best = V_INTRIN(best, V_INTRIN(lower, upper));

        len -= block;
        ip += block;
    }
    // one vector at a time
    while (len >= lanes) {
        best = V_INTRIN(best, npyv_load_s32(ip));
        len -= lanes;
        ip += lanes;
    }
    npyv_lanetype_s32 result = V_REDUCE_INTRIN(best);
    // elements that do not fill a whole vector
    while (len > 0) {
        const npyv_lanetype_s32 cur = *ip;
        result = SCALAR_OP(result, cur);
        --len;
        ++ip;
    }
    op1[0] = result;
}

/*
 * Binary loop over contiguous inputs and output ("minp" instantiation):
 * op1[k] = op(ip1[k], ip2[k]) for k in [0, len).
 *
 * Runs an unrolled multi-vector loop first, then one vector at a time, then
 * SCALAR_OP for whatever is left.
 */
static inline void
simd_binary_ccc_minp_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *ip2,
                                     npyv_lanetype_s32 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_s32;
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int chunk = unroll * lanes;
    npy_intp pos = 0;

    while (pos + chunk <= len) {
        const npyv_lanetype_s32 *lhs = ip1 + pos;
        const npyv_lanetype_s32 *rhs = ip2 + pos;
        npyv_lanetype_s32 *dst = op1 + pos;
        // every load of the pass is issued before the first store
        npyv_s32 a0 = npyv_load_s32(lhs);
        npyv_s32 b0 = npyv_load_s32(rhs);
        npyv_s32 a1 = npyv_load_s32(lhs + lanes);
        npyv_s32 b1 = npyv_load_s32(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s32 a2 = npyv_load_s32(lhs + lanes * 2);
        npyv_s32 b2 = npyv_load_s32(rhs + lanes * 2);
        npyv_s32 a3 = npyv_load_s32(lhs + lanes * 3);
        npyv_s32 b3 = npyv_load_s32(rhs + lanes * 3);
        npyv_s32 a4 = npyv_load_s32(lhs + lanes * 4);
        npyv_s32 b4 = npyv_load_s32(rhs + lanes * 4);
        npyv_s32 a5 = npyv_load_s32(lhs + lanes * 5);
        npyv_s32 b5 = npyv_load_s32(rhs + lanes * 5);
    #endif
        npyv_s32 r0 = V_INTRIN(a0, b0);
        npyv_s32 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_s32 r2 = V_INTRIN(a2, b2);
        npyv_s32 r3 = V_INTRIN(a3, b3);
        npyv_s32 r4 = V_INTRIN(a4, b4);
        npyv_s32 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_s32(dst, r0);
        npyv_store_s32(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_s32(dst + lanes * 2, r2);
        npyv_store_s32(dst + lanes * 3, r3);
        npyv_store_s32(dst + lanes * 4, r4);
        npyv_store_s32(dst + lanes * 5, r5);
    #endif
        pos += chunk;
    }
    // one vector at a time
    while (pos + lanes <= len) {
        npyv_s32 a = npyv_load_s32(ip1 + pos);
        npyv_s32 b = npyv_load_s32(ip2 + pos);
        npyv_s32 r = V_INTRIN(a, b);
        npyv_store_s32(op1 + pos, r);
        pos += lanes;
    }
    // elements that do not fill a whole vector
    while (pos < len) {
        const npyv_lanetype_s32 x = ip1[pos];
        const npyv_lanetype_s32 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
/*
 * Strided binary loop: *op1 = op(*ip1, *ip2) for `len` elements, where the
 * operation is whatever V_INTRIN / SCALAR_OP expand to for this
 * instantiation. sip1, sip2 and sop1 are strides counted in elements. An
 * operand with stride 1 goes through the contiguous vector load/store, any
 * other stride through npyv_loadn / npyv_storen. Elements that do not fill a
 * whole vector are handled one at a time.
 */
static inline void
simd_binary_minp_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1,
                           const npyv_lanetype_s32 *ip2, npy_intp sip2,
                                 npyv_lanetype_s32 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_s32;
    while (len >= lanes) {
        npyv_s32 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_s32(ip1, sip1);
        } else {
            lhs = npyv_load_s32(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_s32(ip2, sip2);
        } else {
            rhs = npyv_load_s32(ip2);
        }
        npyv_s32 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_s32(op1, sop1, res);
        } else {
            npyv_store_s32(op1, res);
        }
        // advance by one full vector worth of elements on every operand
        len -= lanes;
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
    }
    // scalar tail
    while (len > 0) {
        const npyv_lanetype_s32 x = *ip1;
        const npyv_lanetype_s32 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP


#line 106
#line 110
#define SCALAR_OP scalar_max_i
#if NPY_SIMD && (!0 || (0 && 0))

// Pick the vector intrinsics for the unsigned 32-bit "max" instantiation.
// The condition below is the constant 0 (`0 && !0`), so the #else branch is
// the one that takes effect.
#if 0 && !0
    #define V_INTRIN npyv_maxn_u32 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxn_u32
#else
    // V_INTRIN: element-wise vector operation used by the loops below;
    // V_REDUCE_INTRIN: collapses one vector into a single lane value.
    #define V_INTRIN npyv_max_u32
    #define V_REDUCE_INTRIN npyv_reduce_max_u32
#endif

/*
 * Reduction over a contiguous input (maximum for this instantiation).
 *
 * Folds the `len` elements starting at `ip` into op1[0]. The accumulator is
 * seeded from the value already stored in op1[0], whole vectors are combined
 * with V_INTRIN, the lanes are collapsed with V_REDUCE_INTRIN, and leftover
 * elements go through SCALAR_OP. Returns without touching op1 when len < 1.
 */
static inline void
simd_reduce_c_max_u32(const npyv_lanetype_u32 *ip, npyv_lanetype_u32 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_u32;
    // the wide loop below consumes eight vectors per pass
    const int block = lanes * 8;
    // every lane starts out holding the current output value
    npyv_u32 best = npyv_setall_u32(op1[0]);
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_u32 a0 = npyv_load_u32(ip);
        npyv_u32 a1 = npyv_load_u32(ip + lanes);
        npyv_u32 a2 = npyv_load_u32(ip + lanes * 2);
        npyv_u32 a3 = npyv_load_u32(ip + lanes * 3);
        npyv_u32 a4 = npyv_load_u32(ip + lanes * 4);
        npyv_u32 a5 = npyv_load_u32(ip + lanes * 5);
        npyv_u32 a6 = npyv_load_u32(ip + lanes * 6);
        npyv_u32 a7 = npyv_load_u32(ip + lanes * 7);

        // pairwise tree over the eight vectors, then fold into the accumulator
        npyv_u32 lower = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
        npyv_u32 upper = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
        best = V_INTRIN(best, V_INTRIN(lower, upper));

        len -= block;
        ip += block;
    }
    // one vector at a time
    while (len >= lanes) {
        best = V_INTRIN(best, npyv_load_u32(ip));
        len -= lanes;
        ip += lanes;
    }
    npyv_lanetype_u32 result = V_REDUCE_INTRIN(best);
    // elements that do not fill a whole vector
    while (len > 0) {
        const npyv_lanetype_u32 cur = *ip;
        result = SCALAR_OP(result, cur);
        --len;
        ++ip;
    }
    op1[0] = result;
}

/*
 * Binary loop over contiguous inputs and output (maximum for this
 * instantiation): op1[k] = op(ip1[k], ip2[k]) for k in [0, len).
 *
 * Runs an unrolled multi-vector loop first, then one vector at a time, then
 * SCALAR_OP for whatever is left.
 */
static inline void
simd_binary_ccc_max_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *ip2,
                                     npyv_lanetype_u32 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_u32;
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int chunk = unroll * lanes;
    npy_intp pos = 0;

    while (pos + chunk <= len) {
        const npyv_lanetype_u32 *lhs = ip1 + pos;
        const npyv_lanetype_u32 *rhs = ip2 + pos;
        npyv_lanetype_u32 *dst = op1 + pos;
        // every load of the pass is issued before the first store
        npyv_u32 a0 = npyv_load_u32(lhs);
        npyv_u32 b0 = npyv_load_u32(rhs);
        npyv_u32 a1 = npyv_load_u32(lhs + lanes);
        npyv_u32 b1 = npyv_load_u32(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u32 a2 = npyv_load_u32(lhs + lanes * 2);
        npyv_u32 b2 = npyv_load_u32(rhs + lanes * 2);
        npyv_u32 a3 = npyv_load_u32(lhs + lanes * 3);
        npyv_u32 b3 = npyv_load_u32(rhs + lanes * 3);
        npyv_u32 a4 = npyv_load_u32(lhs + lanes * 4);
        npyv_u32 b4 = npyv_load_u32(rhs + lanes * 4);
        npyv_u32 a5 = npyv_load_u32(lhs + lanes * 5);
        npyv_u32 b5 = npyv_load_u32(rhs + lanes * 5);
    #endif
        npyv_u32 r0 = V_INTRIN(a0, b0);
        npyv_u32 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_u32 r2 = V_INTRIN(a2, b2);
        npyv_u32 r3 = V_INTRIN(a3, b3);
        npyv_u32 r4 = V_INTRIN(a4, b4);
        npyv_u32 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_u32(dst, r0);
        npyv_store_u32(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_u32(dst + lanes * 2, r2);
        npyv_store_u32(dst + lanes * 3, r3);
        npyv_store_u32(dst + lanes * 4, r4);
        npyv_store_u32(dst + lanes * 5, r5);
    #endif
        pos += chunk;
    }
    // one vector at a time
    while (pos + lanes <= len) {
        npyv_u32 a = npyv_load_u32(ip1 + pos);
        npyv_u32 b = npyv_load_u32(ip2 + pos);
        npyv_u32 r = V_INTRIN(a, b);
        npyv_store_u32(op1 + pos, r);
        pos += lanes;
    }
    // elements that do not fill a whole vector
    while (pos < len) {
        const npyv_lanetype_u32 x = ip1[pos];
        const npyv_lanetype_u32 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
/*
 * Strided binary loop: *op1 = op(*ip1, *ip2) for `len` elements, where the
 * operation is whatever V_INTRIN / SCALAR_OP expand to for this
 * instantiation. sip1, sip2 and sop1 are strides counted in elements. An
 * operand with stride 1 goes through the contiguous vector load/store, any
 * other stride through npyv_loadn / npyv_storen. Elements that do not fill a
 * whole vector are handled one at a time.
 */
static inline void
simd_binary_max_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1,
                           const npyv_lanetype_u32 *ip2, npy_intp sip2,
                                 npyv_lanetype_u32 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_u32;
    while (len >= lanes) {
        npyv_u32 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_u32(ip1, sip1);
        } else {
            lhs = npyv_load_u32(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_u32(ip2, sip2);
        } else {
            rhs = npyv_load_u32(ip2);
        }
        npyv_u32 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_u32(op1, sop1, res);
        } else {
            npyv_store_u32(op1, res);
        }
        // advance by one full vector worth of elements on every operand
        len -= lanes;
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
    }
    // scalar tail
    while (len > 0) {
        const npyv_lanetype_u32 x = *ip1;
        const npyv_lanetype_u32 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_min_i
#if NPY_SIMD && (!0 || (0 && 0))

// Pick the vector intrinsics for the unsigned 32-bit "min" instantiation.
// The condition below is the constant 0 (`0 && !0`), so the #else branch is
// the one that takes effect.
#if 0 && !0
    #define V_INTRIN npyv_minn_u32 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minn_u32
#else
    // V_INTRIN: element-wise vector operation used by the loops below;
    // V_REDUCE_INTRIN: collapses one vector into a single lane value.
    #define V_INTRIN npyv_min_u32
    #define V_REDUCE_INTRIN npyv_reduce_min_u32
#endif

/*
 * Reduction over a contiguous input (minimum for this instantiation).
 *
 * Folds the `len` elements starting at `ip` into op1[0]. The accumulator is
 * seeded from the value already stored in op1[0], whole vectors are combined
 * with V_INTRIN, the lanes are collapsed with V_REDUCE_INTRIN, and leftover
 * elements go through SCALAR_OP. Returns without touching op1 when len < 1.
 */
static inline void
simd_reduce_c_min_u32(const npyv_lanetype_u32 *ip, npyv_lanetype_u32 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_u32;
    // the wide loop below consumes eight vectors per pass
    const int block = lanes * 8;
    // every lane starts out holding the current output value
    npyv_u32 best = npyv_setall_u32(op1[0]);
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_u32 a0 = npyv_load_u32(ip);
        npyv_u32 a1 = npyv_load_u32(ip + lanes);
        npyv_u32 a2 = npyv_load_u32(ip + lanes * 2);
        npyv_u32 a3 = npyv_load_u32(ip + lanes * 3);
        npyv_u32 a4 = npyv_load_u32(ip + lanes * 4);
        npyv_u32 a5 = npyv_load_u32(ip + lanes * 5);
        npyv_u32 a6 = npyv_load_u32(ip + lanes * 6);
        npyv_u32 a7 = npyv_load_u32(ip + lanes * 7);

        // pairwise tree over the eight vectors, then fold into the accumulator
        npyv_u32 lower = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
        npyv_u32 upper = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
        best = V_INTRIN(best, V_INTRIN(lower, upper));

        len -= block;
        ip += block;
    }
    // one vector at a time
    while (len >= lanes) {
        best = V_INTRIN(best, npyv_load_u32(ip));
        len -= lanes;
        ip += lanes;
    }
    npyv_lanetype_u32 result = V_REDUCE_INTRIN(best);
    // elements that do not fill a whole vector
    while (len > 0) {
        const npyv_lanetype_u32 cur = *ip;
        result = SCALAR_OP(result, cur);
        --len;
        ++ip;
    }
    op1[0] = result;
}

/*
 * Binary loop over contiguous inputs and output (minimum for this
 * instantiation): op1[k] = op(ip1[k], ip2[k]) for k in [0, len).
 *
 * Runs an unrolled multi-vector loop first, then one vector at a time, then
 * SCALAR_OP for whatever is left.
 */
static inline void
simd_binary_ccc_min_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *ip2,
                                     npyv_lanetype_u32 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_u32;
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int chunk = unroll * lanes;
    npy_intp pos = 0;

    while (pos + chunk <= len) {
        const npyv_lanetype_u32 *lhs = ip1 + pos;
        const npyv_lanetype_u32 *rhs = ip2 + pos;
        npyv_lanetype_u32 *dst = op1 + pos;
        // every load of the pass is issued before the first store
        npyv_u32 a0 = npyv_load_u32(lhs);
        npyv_u32 b0 = npyv_load_u32(rhs);
        npyv_u32 a1 = npyv_load_u32(lhs + lanes);
        npyv_u32 b1 = npyv_load_u32(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u32 a2 = npyv_load_u32(lhs + lanes * 2);
        npyv_u32 b2 = npyv_load_u32(rhs + lanes * 2);
        npyv_u32 a3 = npyv_load_u32(lhs + lanes * 3);
        npyv_u32 b3 = npyv_load_u32(rhs + lanes * 3);
        npyv_u32 a4 = npyv_load_u32(lhs + lanes * 4);
        npyv_u32 b4 = npyv_load_u32(rhs + lanes * 4);
        npyv_u32 a5 = npyv_load_u32(lhs + lanes * 5);
        npyv_u32 b5 = npyv_load_u32(rhs + lanes * 5);
    #endif
        npyv_u32 r0 = V_INTRIN(a0, b0);
        npyv_u32 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_u32 r2 = V_INTRIN(a2, b2);
        npyv_u32 r3 = V_INTRIN(a3, b3);
        npyv_u32 r4 = V_INTRIN(a4, b4);
        npyv_u32 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_u32(dst, r0);
        npyv_store_u32(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_u32(dst + lanes * 2, r2);
        npyv_store_u32(dst + lanes * 3, r3);
        npyv_store_u32(dst + lanes * 4, r4);
        npyv_store_u32(dst + lanes * 5, r5);
    #endif
        pos += chunk;
    }
    // one vector at a time
    while (pos + lanes <= len) {
        npyv_u32 a = npyv_load_u32(ip1 + pos);
        npyv_u32 b = npyv_load_u32(ip2 + pos);
        npyv_u32 r = V_INTRIN(a, b);
        npyv_store_u32(op1 + pos, r);
        pos += lanes;
    }
    // elements that do not fill a whole vector
    while (pos < len) {
        const npyv_lanetype_u32 x = ip1[pos];
        const npyv_lanetype_u32 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
/*
 * Element-wise binary kernel for strided operands.
 *
 * `sip1`, `sip2` and `sop1` advance their pointers in units of elements.
 * Full vectors are combined with V_INTRIN, the remainder with SCALAR_OP.
 * Unrolled scalars are faster than non-contiguous vector load/store on Arm,
 * hence the NEON exclusion in the guard around this function (the leading
 * `0 &&` of that guard currently compiles this function out).
 */
static inline void
simd_binary_min_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1,
                           const npyv_lanetype_u32 *ip2, npy_intp sip2,
                                 npyv_lanetype_u32 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_u32;

    // Vector stage: one full vector per trip.
    while (len >= lanes) {
        npyv_u32 va, vb;
        // A unit stride takes the contiguous load, anything else the strided one.
        if (sip1 != 1) {
            va = npyv_loadn_u32(ip1, sip1);
        } else {
            va = npyv_load_u32(ip1);
        }
        if (sip2 != 1) {
            vb = npyv_loadn_u32(ip2, sip2);
        } else {
            vb = npyv_load_u32(ip2);
        }
        npyv_u32 vr = V_INTRIN(va, vb);
        if (sop1 != 1) {
            npyv_storen_u32(op1, sop1, vr);
        } else {
            npyv_store_u32(op1, vr);
        }
        len -= lanes;
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
    }
    // Scalar stage: whatever did not fill a vector.
    while (len > 0) {
        const npyv_lanetype_u32 x = *ip1;
        const npyv_lanetype_u32 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
// Scalar operation applied to the elements the vector loops leave over.
#define SCALAR_OP scalar_maxp_i
// The constant part of this condition, (!1 || (0 && 1)), is 0, so everything
// up to the matching "#endif // simd_chk ..." is compiled out regardless of
// the value of NPY_SIMD.
#if NPY_SIMD && (!1 || (0 && 1))

// `0 && !1` is 0: the #else branch below provides the definitions.
#if 0 && !1
    #define V_INTRIN npyv_maxpn_u32 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxpn_u32
#else
    #define V_INTRIN npyv_maxp_u32
    #define V_REDUCE_INTRIN npyv_reduce_maxp_u32
#endif

/*
 * Contiguous reduction: folds the `len` elements at `ip` into `op1[0]`.
 *
 * The value already stored in `op1[0]` seeds the accumulator, so it takes
 * part in the result. Vectors are combined with V_INTRIN, the vector
 * accumulator is collapsed with V_REDUCE_INTRIN, and the elements left over
 * once less than a full vector remains are folded in with SCALAR_OP.
 * Returns without touching `op1` when `len` is less than 1.
 */
static inline void
simd_reduce_c_maxp_u32(const npyv_lanetype_u32 *ip, npyv_lanetype_u32 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_u32;
    const int block = lanes * 8;
    // Broadcast the current output value into every accumulator lane.
    npyv_u32 vacc = npyv_setall_u32(op1[0]);

    // Wide pass: eight vectors per trip, combined pairwise as a tree.
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_u32 a0 = npyv_load_u32(ip);
        npyv_u32 a1 = npyv_load_u32(ip + lanes);
        npyv_u32 a2 = npyv_load_u32(ip + lanes * 2);
        npyv_u32 a3 = npyv_load_u32(ip + lanes * 3);
        npyv_u32 a4 = npyv_load_u32(ip + lanes * 4);
        npyv_u32 a5 = npyv_load_u32(ip + lanes * 5);
        npyv_u32 a6 = npyv_load_u32(ip + lanes * 6);
        npyv_u32 a7 = npyv_load_u32(ip + lanes * 7);

        npyv_u32 p01 = V_INTRIN(a0, a1);
        npyv_u32 p23 = V_INTRIN(a2, a3);
        npyv_u32 p45 = V_INTRIN(a4, a5);
        npyv_u32 p67 = V_INTRIN(a6, a7);
        npyv_u32 q03 = V_INTRIN(p01, p23);
        npyv_u32 q47 = V_INTRIN(p45, p67);
        vacc = V_INTRIN(vacc, V_INTRIN(q03, q47));

        ip += block;
        len -= block;
    }
    // Narrow pass: one vector per trip.
    while (len >= lanes) {
        npyv_u32 chunk = npyv_load_u32(ip);
        vacc = V_INTRIN(vacc, chunk);
        ip += lanes;
        len -= lanes;
    }
    // Collapse the lanes, then fold in the leftovers one element at a time.
    npyv_lanetype_u32 result = V_REDUCE_INTRIN(vacc);
    for (npy_intp k = 0; k < len; ++k) {
        const npyv_lanetype_u32 item = ip[k];
        result = SCALAR_OP(result, item);
    }
    op1[0] = result;
}

/*
 * Element-wise binary kernel for contiguous inputs and output: for every
 * k in [0, len), op1[k] receives the combination of ip1[k] and ip2[k].
 *
 * Three stages: an unrolled vector loop (6 vectors per trip when
 * NPY_SIMD_WIDTH is 128, otherwise 2), a single-vector loop, and a scalar
 * loop for the remainder. Vector stages use V_INTRIN, the scalar stage
 * uses SCALAR_OP.
 */
static inline void
simd_binary_ccc_maxp_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *ip2,
                                     npyv_lanetype_u32 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_u32;
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int span = unroll * lanes;
    npy_intp pos = 0;

    // Unrolled stage. All loads of a trip come before its first store.
    while (pos + span <= len) {
        const npyv_lanetype_u32 *lhs = ip1 + pos;
        const npyv_lanetype_u32 *rhs = ip2 + pos;
        npyv_lanetype_u32 *dst = op1 + pos;

        npyv_u32 a0 = npyv_load_u32(lhs);
        npyv_u32 a1 = npyv_load_u32(lhs + lanes);
        npyv_u32 b0 = npyv_load_u32(rhs);
        npyv_u32 b1 = npyv_load_u32(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u32 a2 = npyv_load_u32(lhs + 2 * lanes);
        npyv_u32 a3 = npyv_load_u32(lhs + 3 * lanes);
        npyv_u32 a4 = npyv_load_u32(lhs + 4 * lanes);
        npyv_u32 a5 = npyv_load_u32(lhs + 5 * lanes);
        npyv_u32 b2 = npyv_load_u32(rhs + 2 * lanes);
        npyv_u32 b3 = npyv_load_u32(rhs + 3 * lanes);
        npyv_u32 b4 = npyv_load_u32(rhs + 4 * lanes);
        npyv_u32 b5 = npyv_load_u32(rhs + 5 * lanes);
    #endif

        npyv_u32 r0 = V_INTRIN(a0, b0);
        npyv_u32 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_u32 r2 = V_INTRIN(a2, b2);
        npyv_u32 r3 = V_INTRIN(a3, b3);
        npyv_u32 r4 = V_INTRIN(a4, b4);
        npyv_u32 r5 = V_INTRIN(a5, b5);
    #endif

        npyv_store_u32(dst, r0);
        npyv_store_u32(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_u32(dst + 2 * lanes, r2);
        npyv_store_u32(dst + 3 * lanes, r3);
        npyv_store_u32(dst + 4 * lanes, r4);
        npyv_store_u32(dst + 5 * lanes, r5);
    #endif
        pos += span;
    }
    // Single-vector stage.
    while (pos + lanes <= len) {
        npyv_u32 a = npyv_load_u32(ip1 + pos);
        npyv_u32 b = npyv_load_u32(ip2 + pos);
        npyv_u32 r = V_INTRIN(a, b);
        npyv_store_u32(op1 + pos, r);
        pos += lanes;
    }
    // Scalar stage: finish up any remaining elements.
    while (pos < len) {
        const npyv_lanetype_u32 x = ip1[pos];
        const npyv_lanetype_u32 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
/*
 * Element-wise binary kernel for strided operands.
 *
 * `sip1`, `sip2` and `sop1` advance their pointers in units of elements.
 * Full vectors are combined with V_INTRIN, the remainder with SCALAR_OP.
 * Unrolled scalars are faster than non-contiguous vector load/store on Arm,
 * hence the NEON exclusion in the guard around this function (the leading
 * `0 &&` of that guard currently compiles this function out).
 */
static inline void
simd_binary_maxp_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1,
                           const npyv_lanetype_u32 *ip2, npy_intp sip2,
                                 npyv_lanetype_u32 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_u32;

    // Vector stage: one full vector per trip.
    while (len >= lanes) {
        npyv_u32 va, vb;
        // A unit stride takes the contiguous load, anything else the strided one.
        if (sip1 != 1) {
            va = npyv_loadn_u32(ip1, sip1);
        } else {
            va = npyv_load_u32(ip1);
        }
        if (sip2 != 1) {
            vb = npyv_loadn_u32(ip2, sip2);
        } else {
            vb = npyv_load_u32(ip2);
        }
        npyv_u32 vr = V_INTRIN(va, vb);
        if (sop1 != 1) {
            npyv_storen_u32(op1, sop1, vr);
        } else {
            npyv_store_u32(op1, vr);
        }
        len -= lanes;
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
    }
    // Scalar stage: whatever did not fill a vector.
    while (len > 0) {
        const npyv_lanetype_u32 x = *ip1;
        const npyv_lanetype_u32 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
// Scalar operation applied to the elements the vector loops leave over.
#define SCALAR_OP scalar_minp_i
// The constant part of this condition, (!1 || (0 && 1)), is 0, so everything
// up to the matching "#endif // simd_chk ..." is compiled out regardless of
// the value of NPY_SIMD.
#if NPY_SIMD && (!1 || (0 && 1))

// `0 && !1` is 0: the #else branch below provides the definitions.
#if 0 && !1
    #define V_INTRIN npyv_minpn_u32 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minpn_u32
#else
    #define V_INTRIN npyv_minp_u32
    #define V_REDUCE_INTRIN npyv_reduce_minp_u32
#endif

/*
 * Contiguous reduction: folds the `len` elements at `ip` into `op1[0]`.
 *
 * The value already stored in `op1[0]` seeds the accumulator, so it takes
 * part in the result. Vectors are combined with V_INTRIN, the vector
 * accumulator is collapsed with V_REDUCE_INTRIN, and the elements left over
 * once less than a full vector remains are folded in with SCALAR_OP.
 * Returns without touching `op1` when `len` is less than 1.
 */
static inline void
simd_reduce_c_minp_u32(const npyv_lanetype_u32 *ip, npyv_lanetype_u32 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_u32;
    const int block = lanes * 8;
    // Broadcast the current output value into every accumulator lane.
    npyv_u32 vacc = npyv_setall_u32(op1[0]);

    // Wide pass: eight vectors per trip, combined pairwise as a tree.
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_u32 a0 = npyv_load_u32(ip);
        npyv_u32 a1 = npyv_load_u32(ip + lanes);
        npyv_u32 a2 = npyv_load_u32(ip + lanes * 2);
        npyv_u32 a3 = npyv_load_u32(ip + lanes * 3);
        npyv_u32 a4 = npyv_load_u32(ip + lanes * 4);
        npyv_u32 a5 = npyv_load_u32(ip + lanes * 5);
        npyv_u32 a6 = npyv_load_u32(ip + lanes * 6);
        npyv_u32 a7 = npyv_load_u32(ip + lanes * 7);

        npyv_u32 p01 = V_INTRIN(a0, a1);
        npyv_u32 p23 = V_INTRIN(a2, a3);
        npyv_u32 p45 = V_INTRIN(a4, a5);
        npyv_u32 p67 = V_INTRIN(a6, a7);
        npyv_u32 q03 = V_INTRIN(p01, p23);
        npyv_u32 q47 = V_INTRIN(p45, p67);
        vacc = V_INTRIN(vacc, V_INTRIN(q03, q47));

        ip += block;
        len -= block;
    }
    // Narrow pass: one vector per trip.
    while (len >= lanes) {
        npyv_u32 chunk = npyv_load_u32(ip);
        vacc = V_INTRIN(vacc, chunk);
        ip += lanes;
        len -= lanes;
    }
    // Collapse the lanes, then fold in the leftovers one element at a time.
    npyv_lanetype_u32 result = V_REDUCE_INTRIN(vacc);
    for (npy_intp k = 0; k < len; ++k) {
        const npyv_lanetype_u32 item = ip[k];
        result = SCALAR_OP(result, item);
    }
    op1[0] = result;
}

/*
 * Element-wise binary kernel for contiguous inputs and output: for every
 * k in [0, len), op1[k] receives the combination of ip1[k] and ip2[k].
 *
 * Three stages: an unrolled vector loop (6 vectors per trip when
 * NPY_SIMD_WIDTH is 128, otherwise 2), a single-vector loop, and a scalar
 * loop for the remainder. Vector stages use V_INTRIN, the scalar stage
 * uses SCALAR_OP.
 */
static inline void
simd_binary_ccc_minp_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *ip2,
                                     npyv_lanetype_u32 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_u32;
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int span = unroll * lanes;
    npy_intp pos = 0;

    // Unrolled stage. All loads of a trip come before its first store.
    while (pos + span <= len) {
        const npyv_lanetype_u32 *lhs = ip1 + pos;
        const npyv_lanetype_u32 *rhs = ip2 + pos;
        npyv_lanetype_u32 *dst = op1 + pos;

        npyv_u32 a0 = npyv_load_u32(lhs);
        npyv_u32 a1 = npyv_load_u32(lhs + lanes);
        npyv_u32 b0 = npyv_load_u32(rhs);
        npyv_u32 b1 = npyv_load_u32(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u32 a2 = npyv_load_u32(lhs + 2 * lanes);
        npyv_u32 a3 = npyv_load_u32(lhs + 3 * lanes);
        npyv_u32 a4 = npyv_load_u32(lhs + 4 * lanes);
        npyv_u32 a5 = npyv_load_u32(lhs + 5 * lanes);
        npyv_u32 b2 = npyv_load_u32(rhs + 2 * lanes);
        npyv_u32 b3 = npyv_load_u32(rhs + 3 * lanes);
        npyv_u32 b4 = npyv_load_u32(rhs + 4 * lanes);
        npyv_u32 b5 = npyv_load_u32(rhs + 5 * lanes);
    #endif

        npyv_u32 r0 = V_INTRIN(a0, b0);
        npyv_u32 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_u32 r2 = V_INTRIN(a2, b2);
        npyv_u32 r3 = V_INTRIN(a3, b3);
        npyv_u32 r4 = V_INTRIN(a4, b4);
        npyv_u32 r5 = V_INTRIN(a5, b5);
    #endif

        npyv_store_u32(dst, r0);
        npyv_store_u32(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_u32(dst + 2 * lanes, r2);
        npyv_store_u32(dst + 3 * lanes, r3);
        npyv_store_u32(dst + 4 * lanes, r4);
        npyv_store_u32(dst + 5 * lanes, r5);
    #endif
        pos += span;
    }
    // Single-vector stage.
    while (pos + lanes <= len) {
        npyv_u32 a = npyv_load_u32(ip1 + pos);
        npyv_u32 b = npyv_load_u32(ip2 + pos);
        npyv_u32 r = V_INTRIN(a, b);
        npyv_store_u32(op1 + pos, r);
        pos += lanes;
    }
    // Scalar stage: finish up any remaining elements.
    while (pos < len) {
        const npyv_lanetype_u32 x = ip1[pos];
        const npyv_lanetype_u32 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
/*
 * Element-wise binary kernel for strided operands.
 *
 * `sip1`, `sip2` and `sop1` advance their pointers in units of elements.
 * Full vectors are combined with V_INTRIN, the remainder with SCALAR_OP.
 * Unrolled scalars are faster than non-contiguous vector load/store on Arm,
 * hence the NEON exclusion in the guard around this function (the leading
 * `0 &&` of that guard currently compiles this function out).
 */
static inline void
simd_binary_minp_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1,
                           const npyv_lanetype_u32 *ip2, npy_intp sip2,
                                 npyv_lanetype_u32 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_u32;

    // Vector stage: one full vector per trip.
    while (len >= lanes) {
        npyv_u32 va, vb;
        // A unit stride takes the contiguous load, anything else the strided one.
        if (sip1 != 1) {
            va = npyv_loadn_u32(ip1, sip1);
        } else {
            va = npyv_load_u32(ip1);
        }
        if (sip2 != 1) {
            vb = npyv_loadn_u32(ip2, sip2);
        } else {
            vb = npyv_load_u32(ip2);
        }
        npyv_u32 vr = V_INTRIN(va, vb);
        if (sop1 != 1) {
            npyv_storen_u32(op1, sop1, vr);
        } else {
            npyv_store_u32(op1, vr);
        }
        len -= lanes;
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
    }
    // Scalar stage: whatever did not fill a vector.
    while (len > 0) {
        const npyv_lanetype_u32 x = *ip1;
        const npyv_lanetype_u32 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP


#line 106
#line 110
// Scalar operation applied to the elements the vector loops leave over.
#define SCALAR_OP scalar_max_i
// (!0 || (0 && 0)) is 1, so this section is compiled whenever NPY_SIMD is
// non-zero; it is closed by the "#endif // simd_chk ..." further down.
#if NPY_SIMD && (!0 || (0 && 0))

// `0 && !0` is 0: the #else branch below provides the definitions.
#if 0 && !0
    #define V_INTRIN npyv_maxn_s64 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxn_s64
#else
    #define V_INTRIN npyv_max_s64
    #define V_REDUCE_INTRIN npyv_reduce_max_s64
#endif

/*
 * Contiguous reduction: folds the `len` elements at `ip` into `op1[0]`.
 *
 * The value already stored in `op1[0]` seeds the accumulator, so it takes
 * part in the result. Vectors are combined with V_INTRIN, the vector
 * accumulator is collapsed with V_REDUCE_INTRIN, and the elements left over
 * once less than a full vector remains are folded in with SCALAR_OP.
 * Returns without touching `op1` when `len` is less than 1.
 */
static inline void
simd_reduce_c_max_s64(const npyv_lanetype_s64 *ip, npyv_lanetype_s64 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_s64;
    const int block = lanes * 8;
    // Broadcast the current output value into every accumulator lane.
    npyv_s64 vacc = npyv_setall_s64(op1[0]);

    // Wide pass: eight vectors per trip, combined pairwise as a tree.
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_s64 a0 = npyv_load_s64(ip);
        npyv_s64 a1 = npyv_load_s64(ip + lanes);
        npyv_s64 a2 = npyv_load_s64(ip + lanes * 2);
        npyv_s64 a3 = npyv_load_s64(ip + lanes * 3);
        npyv_s64 a4 = npyv_load_s64(ip + lanes * 4);
        npyv_s64 a5 = npyv_load_s64(ip + lanes * 5);
        npyv_s64 a6 = npyv_load_s64(ip + lanes * 6);
        npyv_s64 a7 = npyv_load_s64(ip + lanes * 7);

        npyv_s64 p01 = V_INTRIN(a0, a1);
        npyv_s64 p23 = V_INTRIN(a2, a3);
        npyv_s64 p45 = V_INTRIN(a4, a5);
        npyv_s64 p67 = V_INTRIN(a6, a7);
        npyv_s64 q03 = V_INTRIN(p01, p23);
        npyv_s64 q47 = V_INTRIN(p45, p67);
        vacc = V_INTRIN(vacc, V_INTRIN(q03, q47));

        ip += block;
        len -= block;
    }
    // Narrow pass: one vector per trip.
    while (len >= lanes) {
        npyv_s64 chunk = npyv_load_s64(ip);
        vacc = V_INTRIN(vacc, chunk);
        ip += lanes;
        len -= lanes;
    }
    // Collapse the lanes, then fold in the leftovers one element at a time.
    npyv_lanetype_s64 result = V_REDUCE_INTRIN(vacc);
    for (npy_intp k = 0; k < len; ++k) {
        const npyv_lanetype_s64 item = ip[k];
        result = SCALAR_OP(result, item);
    }
    op1[0] = result;
}

/*
 * Element-wise binary kernel for contiguous inputs and output: for every
 * k in [0, len), op1[k] receives the combination of ip1[k] and ip2[k].
 *
 * Three stages: an unrolled vector loop (6 vectors per trip when
 * NPY_SIMD_WIDTH is 128, otherwise 2), a single-vector loop, and a scalar
 * loop for the remainder. Vector stages use V_INTRIN, the scalar stage
 * uses SCALAR_OP.
 */
static inline void
simd_binary_ccc_max_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *ip2,
                                     npyv_lanetype_s64 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_s64;
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int span = unroll * lanes;
    npy_intp pos = 0;

    // Unrolled stage. All loads of a trip come before its first store.
    while (pos + span <= len) {
        const npyv_lanetype_s64 *lhs = ip1 + pos;
        const npyv_lanetype_s64 *rhs = ip2 + pos;
        npyv_lanetype_s64 *dst = op1 + pos;

        npyv_s64 a0 = npyv_load_s64(lhs);
        npyv_s64 a1 = npyv_load_s64(lhs + lanes);
        npyv_s64 b0 = npyv_load_s64(rhs);
        npyv_s64 b1 = npyv_load_s64(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s64 a2 = npyv_load_s64(lhs + 2 * lanes);
        npyv_s64 a3 = npyv_load_s64(lhs + 3 * lanes);
        npyv_s64 a4 = npyv_load_s64(lhs + 4 * lanes);
        npyv_s64 a5 = npyv_load_s64(lhs + 5 * lanes);
        npyv_s64 b2 = npyv_load_s64(rhs + 2 * lanes);
        npyv_s64 b3 = npyv_load_s64(rhs + 3 * lanes);
        npyv_s64 b4 = npyv_load_s64(rhs + 4 * lanes);
        npyv_s64 b5 = npyv_load_s64(rhs + 5 * lanes);
    #endif

        npyv_s64 r0 = V_INTRIN(a0, b0);
        npyv_s64 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_s64 r2 = V_INTRIN(a2, b2);
        npyv_s64 r3 = V_INTRIN(a3, b3);
        npyv_s64 r4 = V_INTRIN(a4, b4);
        npyv_s64 r5 = V_INTRIN(a5, b5);
    #endif

        npyv_store_s64(dst, r0);
        npyv_store_s64(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_s64(dst + 2 * lanes, r2);
        npyv_store_s64(dst + 3 * lanes, r3);
        npyv_store_s64(dst + 4 * lanes, r4);
        npyv_store_s64(dst + 5 * lanes, r5);
    #endif
        pos += span;
    }
    // Single-vector stage.
    while (pos + lanes <= len) {
        npyv_s64 a = npyv_load_s64(ip1 + pos);
        npyv_s64 b = npyv_load_s64(ip2 + pos);
        npyv_s64 r = V_INTRIN(a, b);
        npyv_store_s64(op1 + pos, r);
        pos += lanes;
    }
    // Scalar stage: finish up any remaining elements.
    while (pos < len) {
        const npyv_lanetype_s64 x = ip1[pos];
        const npyv_lanetype_s64 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
/*
 * Element-wise binary kernel for strided operands.
 *
 * `sip1`, `sip2` and `sop1` advance their pointers in units of elements.
 * Full vectors are combined with V_INTRIN, the remainder with SCALAR_OP.
 * Unrolled scalars are faster than non-contiguous vector load/store on Arm,
 * hence the NEON exclusion in the guard around this function (the leading
 * `0 &&` of that guard currently compiles this function out).
 */
static inline void
simd_binary_max_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1,
                           const npyv_lanetype_s64 *ip2, npy_intp sip2,
                                 npyv_lanetype_s64 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_s64;

    // Vector stage: one full vector per trip.
    while (len >= lanes) {
        npyv_s64 va, vb;
        // A unit stride takes the contiguous load, anything else the strided one.
        if (sip1 != 1) {
            va = npyv_loadn_s64(ip1, sip1);
        } else {
            va = npyv_load_s64(ip1);
        }
        if (sip2 != 1) {
            vb = npyv_loadn_s64(ip2, sip2);
        } else {
            vb = npyv_load_s64(ip2);
        }
        npyv_s64 vr = V_INTRIN(va, vb);
        if (sop1 != 1) {
            npyv_storen_s64(op1, sop1, vr);
        } else {
            npyv_store_s64(op1, vr);
        }
        len -= lanes;
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
    }
    // Scalar stage: whatever did not fill a vector.
    while (len > 0) {
        const npyv_lanetype_s64 x = *ip1;
        const npyv_lanetype_s64 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
// Scalar operation applied to the elements the vector loops leave over.
#define SCALAR_OP scalar_min_i
// (!0 || (0 && 0)) is 1, so this section is compiled whenever NPY_SIMD is
// non-zero; it is closed by the "#endif // simd_chk ..." further down.
#if NPY_SIMD && (!0 || (0 && 0))

// `0 && !0` is 0: the #else branch below provides the definitions.
#if 0 && !0
    #define V_INTRIN npyv_minn_s64 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minn_s64
#else
    #define V_INTRIN npyv_min_s64
    #define V_REDUCE_INTRIN npyv_reduce_min_s64
#endif

/*
 * Contiguous reduction: folds the `len` elements at `ip` into `op1[0]`.
 *
 * The value already stored in `op1[0]` seeds the accumulator, so it takes
 * part in the result. Vectors are combined with V_INTRIN, the vector
 * accumulator is collapsed with V_REDUCE_INTRIN, and the elements left over
 * once less than a full vector remains are folded in with SCALAR_OP.
 * Returns without touching `op1` when `len` is less than 1.
 */
static inline void
simd_reduce_c_min_s64(const npyv_lanetype_s64 *ip, npyv_lanetype_s64 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_s64;
    const int block = lanes * 8;
    // Broadcast the current output value into every accumulator lane.
    npyv_s64 vacc = npyv_setall_s64(op1[0]);

    // Wide pass: eight vectors per trip, combined pairwise as a tree.
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_s64 a0 = npyv_load_s64(ip);
        npyv_s64 a1 = npyv_load_s64(ip + lanes);
        npyv_s64 a2 = npyv_load_s64(ip + lanes * 2);
        npyv_s64 a3 = npyv_load_s64(ip + lanes * 3);
        npyv_s64 a4 = npyv_load_s64(ip + lanes * 4);
        npyv_s64 a5 = npyv_load_s64(ip + lanes * 5);
        npyv_s64 a6 = npyv_load_s64(ip + lanes * 6);
        npyv_s64 a7 = npyv_load_s64(ip + lanes * 7);

        npyv_s64 p01 = V_INTRIN(a0, a1);
        npyv_s64 p23 = V_INTRIN(a2, a3);
        npyv_s64 p45 = V_INTRIN(a4, a5);
        npyv_s64 p67 = V_INTRIN(a6, a7);
        npyv_s64 q03 = V_INTRIN(p01, p23);
        npyv_s64 q47 = V_INTRIN(p45, p67);
        vacc = V_INTRIN(vacc, V_INTRIN(q03, q47));

        ip += block;
        len -= block;
    }
    // Narrow pass: one vector per trip.
    while (len >= lanes) {
        npyv_s64 chunk = npyv_load_s64(ip);
        vacc = V_INTRIN(vacc, chunk);
        ip += lanes;
        len -= lanes;
    }
    // Collapse the lanes, then fold in the leftovers one element at a time.
    npyv_lanetype_s64 result = V_REDUCE_INTRIN(vacc);
    for (npy_intp k = 0; k < len; ++k) {
        const npyv_lanetype_s64 item = ip[k];
        result = SCALAR_OP(result, item);
    }
    op1[0] = result;
}

/*
 * Element-wise binary kernel for contiguous inputs and output: for every
 * k in [0, len), op1[k] receives the combination of ip1[k] and ip2[k].
 *
 * Three stages: an unrolled vector loop (6 vectors per trip when
 * NPY_SIMD_WIDTH is 128, otherwise 2), a single-vector loop, and a scalar
 * loop for the remainder. Vector stages use V_INTRIN, the scalar stage
 * uses SCALAR_OP.
 */
static inline void
simd_binary_ccc_min_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *ip2,
                                     npyv_lanetype_s64 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_s64;
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int span = unroll * lanes;
    npy_intp pos = 0;

    // Unrolled stage. All loads of a trip come before its first store.
    while (pos + span <= len) {
        const npyv_lanetype_s64 *lhs = ip1 + pos;
        const npyv_lanetype_s64 *rhs = ip2 + pos;
        npyv_lanetype_s64 *dst = op1 + pos;

        npyv_s64 a0 = npyv_load_s64(lhs);
        npyv_s64 a1 = npyv_load_s64(lhs + lanes);
        npyv_s64 b0 = npyv_load_s64(rhs);
        npyv_s64 b1 = npyv_load_s64(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s64 a2 = npyv_load_s64(lhs + 2 * lanes);
        npyv_s64 a3 = npyv_load_s64(lhs + 3 * lanes);
        npyv_s64 a4 = npyv_load_s64(lhs + 4 * lanes);
        npyv_s64 a5 = npyv_load_s64(lhs + 5 * lanes);
        npyv_s64 b2 = npyv_load_s64(rhs + 2 * lanes);
        npyv_s64 b3 = npyv_load_s64(rhs + 3 * lanes);
        npyv_s64 b4 = npyv_load_s64(rhs + 4 * lanes);
        npyv_s64 b5 = npyv_load_s64(rhs + 5 * lanes);
    #endif

        npyv_s64 r0 = V_INTRIN(a0, b0);
        npyv_s64 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_s64 r2 = V_INTRIN(a2, b2);
        npyv_s64 r3 = V_INTRIN(a3, b3);
        npyv_s64 r4 = V_INTRIN(a4, b4);
        npyv_s64 r5 = V_INTRIN(a5, b5);
    #endif

        npyv_store_s64(dst, r0);
        npyv_store_s64(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_s64(dst + 2 * lanes, r2);
        npyv_store_s64(dst + 3 * lanes, r3);
        npyv_store_s64(dst + 4 * lanes, r4);
        npyv_store_s64(dst + 5 * lanes, r5);
    #endif
        pos += span;
    }
    // Single-vector stage.
    while (pos + lanes <= len) {
        npyv_s64 a = npyv_load_s64(ip1 + pos);
        npyv_s64 b = npyv_load_s64(ip2 + pos);
        npyv_s64 r = V_INTRIN(a, b);
        npyv_store_s64(op1 + pos, r);
        pos += lanes;
    }
    // Scalar stage: finish up any remaining elements.
    while (pos < len) {
        const npyv_lanetype_s64 x = ip1[pos];
        const npyv_lanetype_s64 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
/*
 * Element-wise binary kernel for strided operands.
 *
 * `sip1`, `sip2` and `sop1` advance their pointers in units of elements.
 * Full vectors are combined with V_INTRIN, the remainder with SCALAR_OP.
 * Unrolled scalars are faster than non-contiguous vector load/store on Arm,
 * hence the NEON exclusion in the guard around this function (the leading
 * `0 &&` of that guard currently compiles this function out).
 */
static inline void
simd_binary_min_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1,
                           const npyv_lanetype_s64 *ip2, npy_intp sip2,
                                 npyv_lanetype_s64 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_s64;

    // Vector stage: one full vector per trip.
    while (len >= lanes) {
        npyv_s64 va, vb;
        // A unit stride takes the contiguous load, anything else the strided one.
        if (sip1 != 1) {
            va = npyv_loadn_s64(ip1, sip1);
        } else {
            va = npyv_load_s64(ip1);
        }
        if (sip2 != 1) {
            vb = npyv_loadn_s64(ip2, sip2);
        } else {
            vb = npyv_load_s64(ip2);
        }
        npyv_s64 vr = V_INTRIN(va, vb);
        if (sop1 != 1) {
            npyv_storen_s64(op1, sop1, vr);
        } else {
            npyv_store_s64(op1, vr);
        }
        len -= lanes;
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
    }
    // Scalar stage: whatever did not fill a vector.
    while (len > 0) {
        const npyv_lanetype_s64 x = *ip1;
        const npyv_lanetype_s64 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
// Scalar operation applied to the elements the vector loops leave over.
#define SCALAR_OP scalar_maxp_i
// The constant part of this condition, (!1 || (0 && 1)), is 0, so everything
// up to the matching "#endif // simd_chk ..." is compiled out regardless of
// the value of NPY_SIMD.
#if NPY_SIMD && (!1 || (0 && 1))

// `0 && !1` is 0: the #else branch below provides the definitions.
#if 0 && !1
    #define V_INTRIN npyv_maxpn_s64 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxpn_s64
#else
    #define V_INTRIN npyv_maxp_s64
    #define V_REDUCE_INTRIN npyv_reduce_maxp_s64
#endif

/*
 * Contiguous reduction: folds the `len` elements at `ip` into `op1[0]`.
 *
 * The value already stored in `op1[0]` seeds the accumulator, so it takes
 * part in the result. Vectors are combined with V_INTRIN, the vector
 * accumulator is collapsed with V_REDUCE_INTRIN, and the elements left over
 * once less than a full vector remains are folded in with SCALAR_OP.
 * Returns without touching `op1` when `len` is less than 1.
 */
static inline void
simd_reduce_c_maxp_s64(const npyv_lanetype_s64 *ip, npyv_lanetype_s64 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_s64;
    const int block = lanes * 8;
    // Broadcast the current output value into every accumulator lane.
    npyv_s64 vacc = npyv_setall_s64(op1[0]);

    // Wide pass: eight vectors per trip, combined pairwise as a tree.
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_s64 a0 = npyv_load_s64(ip);
        npyv_s64 a1 = npyv_load_s64(ip + lanes);
        npyv_s64 a2 = npyv_load_s64(ip + lanes * 2);
        npyv_s64 a3 = npyv_load_s64(ip + lanes * 3);
        npyv_s64 a4 = npyv_load_s64(ip + lanes * 4);
        npyv_s64 a5 = npyv_load_s64(ip + lanes * 5);
        npyv_s64 a6 = npyv_load_s64(ip + lanes * 6);
        npyv_s64 a7 = npyv_load_s64(ip + lanes * 7);

        npyv_s64 p01 = V_INTRIN(a0, a1);
        npyv_s64 p23 = V_INTRIN(a2, a3);
        npyv_s64 p45 = V_INTRIN(a4, a5);
        npyv_s64 p67 = V_INTRIN(a6, a7);
        npyv_s64 q03 = V_INTRIN(p01, p23);
        npyv_s64 q47 = V_INTRIN(p45, p67);
        vacc = V_INTRIN(vacc, V_INTRIN(q03, q47));

        ip += block;
        len -= block;
    }
    // Narrow pass: one vector per trip.
    while (len >= lanes) {
        npyv_s64 chunk = npyv_load_s64(ip);
        vacc = V_INTRIN(vacc, chunk);
        ip += lanes;
        len -= lanes;
    }
    // Collapse the lanes, then fold in the leftovers one element at a time.
    npyv_lanetype_s64 result = V_REDUCE_INTRIN(vacc);
    for (npy_intp k = 0; k < len; ++k) {
        const npyv_lanetype_s64 item = ip[k];
        result = SCALAR_OP(result, item);
    }
    op1[0] = result;
}

/*
 * Element-wise binary kernel for contiguous inputs and output: for every
 * k in [0, len), op1[k] receives the combination of ip1[k] and ip2[k].
 *
 * Three stages: an unrolled vector loop (6 vectors per trip when
 * NPY_SIMD_WIDTH is 128, otherwise 2), a single-vector loop, and a scalar
 * loop for the remainder. Vector stages use V_INTRIN, the scalar stage
 * uses SCALAR_OP.
 */
static inline void
simd_binary_ccc_maxp_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *ip2,
                                     npyv_lanetype_s64 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_s64;
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int span = unroll * lanes;
    npy_intp pos = 0;

    // Unrolled stage. All loads of a trip come before its first store.
    while (pos + span <= len) {
        const npyv_lanetype_s64 *lhs = ip1 + pos;
        const npyv_lanetype_s64 *rhs = ip2 + pos;
        npyv_lanetype_s64 *dst = op1 + pos;

        npyv_s64 a0 = npyv_load_s64(lhs);
        npyv_s64 a1 = npyv_load_s64(lhs + lanes);
        npyv_s64 b0 = npyv_load_s64(rhs);
        npyv_s64 b1 = npyv_load_s64(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s64 a2 = npyv_load_s64(lhs + 2 * lanes);
        npyv_s64 a3 = npyv_load_s64(lhs + 3 * lanes);
        npyv_s64 a4 = npyv_load_s64(lhs + 4 * lanes);
        npyv_s64 a5 = npyv_load_s64(lhs + 5 * lanes);
        npyv_s64 b2 = npyv_load_s64(rhs + 2 * lanes);
        npyv_s64 b3 = npyv_load_s64(rhs + 3 * lanes);
        npyv_s64 b4 = npyv_load_s64(rhs + 4 * lanes);
        npyv_s64 b5 = npyv_load_s64(rhs + 5 * lanes);
    #endif

        npyv_s64 r0 = V_INTRIN(a0, b0);
        npyv_s64 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_s64 r2 = V_INTRIN(a2, b2);
        npyv_s64 r3 = V_INTRIN(a3, b3);
        npyv_s64 r4 = V_INTRIN(a4, b4);
        npyv_s64 r5 = V_INTRIN(a5, b5);
    #endif

        npyv_store_s64(dst, r0);
        npyv_store_s64(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_s64(dst + 2 * lanes, r2);
        npyv_store_s64(dst + 3 * lanes, r3);
        npyv_store_s64(dst + 4 * lanes, r4);
        npyv_store_s64(dst + 5 * lanes, r5);
    #endif
        pos += span;
    }
    // Single-vector stage.
    while (pos + lanes <= len) {
        npyv_s64 a = npyv_load_s64(ip1 + pos);
        npyv_s64 b = npyv_load_s64(ip2 + pos);
        npyv_s64 r = V_INTRIN(a, b);
        npyv_store_s64(op1 + pos, r);
        pos += lanes;
    }
    // Scalar stage: finish up any remaining elements.
    while (pos < len) {
        const npyv_lanetype_s64 x = ip1[pos];
        const npyv_lanetype_s64 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
/*
 * Element-wise binary kernel for strided operands.
 *
 * `sip1`, `sip2` and `sop1` advance their pointers in units of elements.
 * Full vectors are combined with V_INTRIN, the remainder with SCALAR_OP.
 * Unrolled scalars are faster than non-contiguous vector load/store on Arm,
 * hence the NEON exclusion in the guard around this function (the leading
 * `0 &&` of that guard currently compiles this function out).
 */
static inline void
simd_binary_maxp_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1,
                           const npyv_lanetype_s64 *ip2, npy_intp sip2,
                                 npyv_lanetype_s64 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_s64;

    // Vector stage: one full vector per trip.
    while (len >= lanes) {
        npyv_s64 va, vb;
        // A unit stride takes the contiguous load, anything else the strided one.
        if (sip1 != 1) {
            va = npyv_loadn_s64(ip1, sip1);
        } else {
            va = npyv_load_s64(ip1);
        }
        if (sip2 != 1) {
            vb = npyv_loadn_s64(ip2, sip2);
        } else {
            vb = npyv_load_s64(ip2);
        }
        npyv_s64 vr = V_INTRIN(va, vb);
        if (sop1 != 1) {
            npyv_storen_s64(op1, sop1, vr);
        } else {
            npyv_store_s64(op1, vr);
        }
        len -= lanes;
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
    }
    // Scalar stage: whatever did not fill a vector.
    while (len > 0) {
        const npyv_lanetype_s64 x = *ip1;
        const npyv_lanetype_s64 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
// Scalar operation applied to the elements the vector loops leave over.
#define SCALAR_OP scalar_minp_i
// The constant part of this condition, (!1 || (0 && 1)), is 0, so everything
// up to the matching "#endif // simd_chk ..." is compiled out regardless of
// the value of NPY_SIMD.
#if NPY_SIMD && (!1 || (0 && 1))

// `0 && !1` is 0: the #else branch below provides the definitions.
#if 0 && !1
    #define V_INTRIN npyv_minpn_s64 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minpn_s64
#else
    #define V_INTRIN npyv_minp_s64
    #define V_REDUCE_INTRIN npyv_reduce_minp_s64
#endif

/*
 * Contiguous reduction: folds the `len` elements at `ip` into `op1[0]`.
 *
 * The value already stored in `op1[0]` seeds the accumulator, so it takes
 * part in the result. Vectors are combined with V_INTRIN, the vector
 * accumulator is collapsed with V_REDUCE_INTRIN, and the elements left over
 * once less than a full vector remains are folded in with SCALAR_OP.
 * Returns without touching `op1` when `len` is less than 1.
 */
static inline void
simd_reduce_c_minp_s64(const npyv_lanetype_s64 *ip, npyv_lanetype_s64 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_s64;
    const int block = lanes * 8;
    // Broadcast the current output value into every accumulator lane.
    npyv_s64 vacc = npyv_setall_s64(op1[0]);

    // Wide pass: eight vectors per trip, combined pairwise as a tree.
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_s64 a0 = npyv_load_s64(ip);
        npyv_s64 a1 = npyv_load_s64(ip + lanes);
        npyv_s64 a2 = npyv_load_s64(ip + lanes * 2);
        npyv_s64 a3 = npyv_load_s64(ip + lanes * 3);
        npyv_s64 a4 = npyv_load_s64(ip + lanes * 4);
        npyv_s64 a5 = npyv_load_s64(ip + lanes * 5);
        npyv_s64 a6 = npyv_load_s64(ip + lanes * 6);
        npyv_s64 a7 = npyv_load_s64(ip + lanes * 7);

        npyv_s64 p01 = V_INTRIN(a0, a1);
        npyv_s64 p23 = V_INTRIN(a2, a3);
        npyv_s64 p45 = V_INTRIN(a4, a5);
        npyv_s64 p67 = V_INTRIN(a6, a7);
        npyv_s64 q03 = V_INTRIN(p01, p23);
        npyv_s64 q47 = V_INTRIN(p45, p67);
        vacc = V_INTRIN(vacc, V_INTRIN(q03, q47));

        ip += block;
        len -= block;
    }
    // Narrow pass: one vector per trip.
    while (len >= lanes) {
        npyv_s64 chunk = npyv_load_s64(ip);
        vacc = V_INTRIN(vacc, chunk);
        ip += lanes;
        len -= lanes;
    }
    // Collapse the lanes, then fold in the leftovers one element at a time.
    npyv_lanetype_s64 result = V_REDUCE_INTRIN(vacc);
    for (npy_intp k = 0; k < len; ++k) {
        const npyv_lanetype_s64 item = ip[k];
        result = SCALAR_OP(result, item);
    }
    op1[0] = result;
}

// contiguous inputs and output.
/*
 * Element-wise operation over two contiguous inputs into a contiguous
 * output: op1[k] is computed from ip1[k] and ip2[k] for every k in
 * [0, len).  Vectors go through V_INTRIN, the leftover elements through
 * SCALAR_OP.  Within one unrolled pass all loads are issued before any
 * store.
 */
static inline void
simd_binary_ccc_minp_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *ip2,
                                     npyv_lanetype_s64 *op1, npy_intp len)
{
#if NPY_SIMD_WIDTH == 128
    // 6 vectors per pass: chosen for best results on Apple M1
    const int unroll = 6;
#else
    // 2 vectors per pass: avoids a memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int lanes = npyv_nlanes_s64;
    const int chunk = unroll * lanes;
    npy_intp pos = 0;

    // unrolled pass
    while (pos + chunk <= len) {
        const npyv_lanetype_s64 *lhs = ip1 + pos;
        const npyv_lanetype_s64 *rhs = ip2 + pos;
        npyv_lanetype_s64 *dst = op1 + pos;

        npyv_s64 a0 = npyv_load_s64(lhs);
        npyv_s64 a1 = npyv_load_s64(lhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s64 a2 = npyv_load_s64(lhs + 2 * lanes);
        npyv_s64 a3 = npyv_load_s64(lhs + 3 * lanes);
        npyv_s64 a4 = npyv_load_s64(lhs + 4 * lanes);
        npyv_s64 a5 = npyv_load_s64(lhs + 5 * lanes);
    #endif
        npyv_s64 b0 = npyv_load_s64(rhs);
        npyv_s64 b1 = npyv_load_s64(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_s64 b2 = npyv_load_s64(rhs + 2 * lanes);
        npyv_s64 b3 = npyv_load_s64(rhs + 3 * lanes);
        npyv_s64 b4 = npyv_load_s64(rhs + 4 * lanes);
        npyv_s64 b5 = npyv_load_s64(rhs + 5 * lanes);
    #endif
        npyv_s64 r0 = V_INTRIN(a0, b0);
        npyv_s64 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_s64 r2 = V_INTRIN(a2, b2);
        npyv_s64 r3 = V_INTRIN(a3, b3);
        npyv_s64 r4 = V_INTRIN(a4, b4);
        npyv_s64 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_s64(dst, r0);
        npyv_store_s64(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_s64(dst + 2 * lanes, r2);
        npyv_store_s64(dst + 3 * lanes, r3);
        npyv_store_s64(dst + 4 * lanes, r4);
        npyv_store_s64(dst + 5 * lanes, r5);
    #endif
        pos += chunk;
    }
    // single-vector pass
    while (pos + lanes <= len) {
        npyv_s64 a = npyv_load_s64(&ip1[pos]);
        npyv_s64 b = npyv_load_s64(&ip2[pos]);
        npyv_s64 r = V_INTRIN(a, b);
        npyv_store_s64(&op1[pos], r);
        pos += lanes;
    }
    // scalar tail
    while (pos < len) {
        const npyv_lanetype_s64 x = ip1[pos];
        const npyv_lanetype_s64 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
/*
 * Strided element-wise operation: reads `len` elements from ip1 and ip2
 * (strides sip1 / sip2, counted in elements) and writes the results to op1
 * (stride sop1).  A stride of one uses the contiguous load/store; any other
 * stride uses the npyv_loadn / npyv_storen variants.  Elements that do not
 * fill a whole vector are handled with SCALAR_OP.
 */
static inline void
simd_binary_minp_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1,
                           const npyv_lanetype_s64 *ip2, npy_intp sip2,
                                 npyv_lanetype_s64 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_s64;

    // vector part: one full vector of results per pass
    while (len >= lanes) {
        npyv_s64 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_s64(ip1, sip1);
        } else {
            lhs = npyv_load_s64(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_s64(ip2, sip2);
        } else {
            rhs = npyv_load_s64(ip2);
        }
        npyv_s64 out = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_s64(op1, sop1, out);
        } else {
            npyv_store_s64(op1, out);
        }
        len -= lanes;
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
    }
    // scalar part for the remaining elements
    while (len > 0) {
        const npyv_lanetype_s64 x = *ip1;
        const npyv_lanetype_s64 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP


#line 106
#line 110
#define SCALAR_OP scalar_max_i
#if NPY_SIMD && (!0 || (0 && 0))

#if 0 && !0
    #define V_INTRIN npyv_maxn_u64 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxn_u64
#else
    #define V_INTRIN npyv_max_u64
    #define V_REDUCE_INTRIN npyv_reduce_max_u64
#endif

// contiguous input.
/*
 * Contiguous reduction: combines the `len` elements starting at `ip` with
 * the value already stored in op1[0] and writes the result back to op1[0].
 * Full vectors are merged with V_INTRIN, collapsed to a single lane value
 * with V_REDUCE_INTRIN, and the leftover elements go through SCALAR_OP.
 * Returns without touching op1 when `len` is smaller than one.
 */
static inline void
simd_reduce_c_max_u64(const npyv_lanetype_u64 *ip, npyv_lanetype_u64 *op1, npy_intp len)
{
    if (len <= 0) {
        return;
    }
    const int lanes = npyv_nlanes_u64;
    const int block = 8 * lanes;
    // start every lane from the current output value
    npyv_u64 best = npyv_setall_u64(op1[0]);

    // eight vectors per pass, merged pairwise
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_u64 a0 = npyv_load_u64(ip);
        npyv_u64 a1 = npyv_load_u64(ip + lanes);
        npyv_u64 a2 = npyv_load_u64(ip + 2 * lanes);
        npyv_u64 a3 = npyv_load_u64(ip + 3 * lanes);
        npyv_u64 a4 = npyv_load_u64(ip + 4 * lanes);
        npyv_u64 a5 = npyv_load_u64(ip + 5 * lanes);
        npyv_u64 a6 = npyv_load_u64(ip + 6 * lanes);
        npyv_u64 a7 = npyv_load_u64(ip + 7 * lanes);

        npyv_u64 p01 = V_INTRIN(a0, a1);
        npyv_u64 p23 = V_INTRIN(a2, a3);
        npyv_u64 p45 = V_INTRIN(a4, a5);
        npyv_u64 p67 = V_INTRIN(a6, a7);
        npyv_u64 lower = V_INTRIN(p01, p23);
        npyv_u64 upper = V_INTRIN(p45, p67);
        best = V_INTRIN(best, V_INTRIN(lower, upper));

        len -= block;
        ip += block;
    }
    // one vector at a time
    while (len >= lanes) {
        best = V_INTRIN(best, npyv_load_u64(ip));
        len -= lanes;
        ip += lanes;
    }
    // collapse the lanes, then handle the scalar tail
    npyv_lanetype_u64 result = V_REDUCE_INTRIN(best);
    while (len > 0) {
        const npyv_lanetype_u64 next = *ip;
        result = SCALAR_OP(result, next);
        ++ip;
        --len;
    }
    op1[0] = result;
}

// contiguous inputs and output.
/*
 * Element-wise operation over two contiguous inputs into a contiguous
 * output: op1[k] is computed from ip1[k] and ip2[k] for every k in
 * [0, len).  Vectors go through V_INTRIN, the leftover elements through
 * SCALAR_OP.  Within one unrolled pass all loads are issued before any
 * store.
 */
static inline void
simd_binary_ccc_max_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *ip2,
                                     npyv_lanetype_u64 *op1, npy_intp len)
{
#if NPY_SIMD_WIDTH == 128
    // 6 vectors per pass: chosen for best results on Apple M1
    const int unroll = 6;
#else
    // 2 vectors per pass: avoids a memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int lanes = npyv_nlanes_u64;
    const int chunk = unroll * lanes;
    npy_intp pos = 0;

    // unrolled pass
    while (pos + chunk <= len) {
        const npyv_lanetype_u64 *lhs = ip1 + pos;
        const npyv_lanetype_u64 *rhs = ip2 + pos;
        npyv_lanetype_u64 *dst = op1 + pos;

        npyv_u64 a0 = npyv_load_u64(lhs);
        npyv_u64 a1 = npyv_load_u64(lhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u64 a2 = npyv_load_u64(lhs + 2 * lanes);
        npyv_u64 a3 = npyv_load_u64(lhs + 3 * lanes);
        npyv_u64 a4 = npyv_load_u64(lhs + 4 * lanes);
        npyv_u64 a5 = npyv_load_u64(lhs + 5 * lanes);
    #endif
        npyv_u64 b0 = npyv_load_u64(rhs);
        npyv_u64 b1 = npyv_load_u64(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u64 b2 = npyv_load_u64(rhs + 2 * lanes);
        npyv_u64 b3 = npyv_load_u64(rhs + 3 * lanes);
        npyv_u64 b4 = npyv_load_u64(rhs + 4 * lanes);
        npyv_u64 b5 = npyv_load_u64(rhs + 5 * lanes);
    #endif
        npyv_u64 r0 = V_INTRIN(a0, b0);
        npyv_u64 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_u64 r2 = V_INTRIN(a2, b2);
        npyv_u64 r3 = V_INTRIN(a3, b3);
        npyv_u64 r4 = V_INTRIN(a4, b4);
        npyv_u64 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_u64(dst, r0);
        npyv_store_u64(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_u64(dst + 2 * lanes, r2);
        npyv_store_u64(dst + 3 * lanes, r3);
        npyv_store_u64(dst + 4 * lanes, r4);
        npyv_store_u64(dst + 5 * lanes, r5);
    #endif
        pos += chunk;
    }
    // single-vector pass
    while (pos + lanes <= len) {
        npyv_u64 a = npyv_load_u64(&ip1[pos]);
        npyv_u64 b = npyv_load_u64(&ip2[pos]);
        npyv_u64 r = V_INTRIN(a, b);
        npyv_store_u64(&op1[pos], r);
        pos += lanes;
    }
    // scalar tail
    while (pos < len) {
        const npyv_lanetype_u64 x = ip1[pos];
        const npyv_lanetype_u64 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
/*
 * Strided element-wise operation: reads `len` elements from ip1 and ip2
 * (strides sip1 / sip2, counted in elements) and writes the results to op1
 * (stride sop1).  A stride of one uses the contiguous load/store; any other
 * stride uses the npyv_loadn / npyv_storen variants.  Elements that do not
 * fill a whole vector are handled with SCALAR_OP.
 */
static inline void
simd_binary_max_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1,
                           const npyv_lanetype_u64 *ip2, npy_intp sip2,
                                 npyv_lanetype_u64 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_u64;

    // vector part: one full vector of results per pass
    while (len >= lanes) {
        npyv_u64 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_u64(ip1, sip1);
        } else {
            lhs = npyv_load_u64(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_u64(ip2, sip2);
        } else {
            rhs = npyv_load_u64(ip2);
        }
        npyv_u64 out = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_u64(op1, sop1, out);
        } else {
            npyv_store_u64(op1, out);
        }
        len -= lanes;
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
    }
    // scalar part for the remaining elements
    while (len > 0) {
        const npyv_lanetype_u64 x = *ip1;
        const npyv_lanetype_u64 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_min_i
#if NPY_SIMD && (!0 || (0 && 0))

#if 0 && !0
    #define V_INTRIN npyv_minn_u64 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minn_u64
#else
    #define V_INTRIN npyv_min_u64
    #define V_REDUCE_INTRIN npyv_reduce_min_u64
#endif

// contiguous input.
/*
 * Contiguous reduction: combines the `len` elements starting at `ip` with
 * the value already stored in op1[0] and writes the result back to op1[0].
 * Full vectors are merged with V_INTRIN, collapsed to a single lane value
 * with V_REDUCE_INTRIN, and the leftover elements go through SCALAR_OP.
 * Returns without touching op1 when `len` is smaller than one.
 */
static inline void
simd_reduce_c_min_u64(const npyv_lanetype_u64 *ip, npyv_lanetype_u64 *op1, npy_intp len)
{
    if (len <= 0) {
        return;
    }
    const int lanes = npyv_nlanes_u64;
    const int block = 8 * lanes;
    // start every lane from the current output value
    npyv_u64 best = npyv_setall_u64(op1[0]);

    // eight vectors per pass, merged pairwise
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_u64 a0 = npyv_load_u64(ip);
        npyv_u64 a1 = npyv_load_u64(ip + lanes);
        npyv_u64 a2 = npyv_load_u64(ip + 2 * lanes);
        npyv_u64 a3 = npyv_load_u64(ip + 3 * lanes);
        npyv_u64 a4 = npyv_load_u64(ip + 4 * lanes);
        npyv_u64 a5 = npyv_load_u64(ip + 5 * lanes);
        npyv_u64 a6 = npyv_load_u64(ip + 6 * lanes);
        npyv_u64 a7 = npyv_load_u64(ip + 7 * lanes);

        npyv_u64 p01 = V_INTRIN(a0, a1);
        npyv_u64 p23 = V_INTRIN(a2, a3);
        npyv_u64 p45 = V_INTRIN(a4, a5);
        npyv_u64 p67 = V_INTRIN(a6, a7);
        npyv_u64 lower = V_INTRIN(p01, p23);
        npyv_u64 upper = V_INTRIN(p45, p67);
        best = V_INTRIN(best, V_INTRIN(lower, upper));

        len -= block;
        ip += block;
    }
    // one vector at a time
    while (len >= lanes) {
        best = V_INTRIN(best, npyv_load_u64(ip));
        len -= lanes;
        ip += lanes;
    }
    // collapse the lanes, then handle the scalar tail
    npyv_lanetype_u64 result = V_REDUCE_INTRIN(best);
    while (len > 0) {
        const npyv_lanetype_u64 next = *ip;
        result = SCALAR_OP(result, next);
        ++ip;
        --len;
    }
    op1[0] = result;
}

// contiguous inputs and output.
/*
 * Element-wise operation over two contiguous inputs into a contiguous
 * output: op1[k] is computed from ip1[k] and ip2[k] for every k in
 * [0, len).  Vectors go through V_INTRIN, the leftover elements through
 * SCALAR_OP.  Within one unrolled pass all loads are issued before any
 * store.
 */
static inline void
simd_binary_ccc_min_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *ip2,
                                     npyv_lanetype_u64 *op1, npy_intp len)
{
#if NPY_SIMD_WIDTH == 128
    // 6 vectors per pass: chosen for best results on Apple M1
    const int unroll = 6;
#else
    // 2 vectors per pass: avoids a memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int lanes = npyv_nlanes_u64;
    const int chunk = unroll * lanes;
    npy_intp pos = 0;

    // unrolled pass
    while (pos + chunk <= len) {
        const npyv_lanetype_u64 *lhs = ip1 + pos;
        const npyv_lanetype_u64 *rhs = ip2 + pos;
        npyv_lanetype_u64 *dst = op1 + pos;

        npyv_u64 a0 = npyv_load_u64(lhs);
        npyv_u64 a1 = npyv_load_u64(lhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u64 a2 = npyv_load_u64(lhs + 2 * lanes);
        npyv_u64 a3 = npyv_load_u64(lhs + 3 * lanes);
        npyv_u64 a4 = npyv_load_u64(lhs + 4 * lanes);
        npyv_u64 a5 = npyv_load_u64(lhs + 5 * lanes);
    #endif
        npyv_u64 b0 = npyv_load_u64(rhs);
        npyv_u64 b1 = npyv_load_u64(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u64 b2 = npyv_load_u64(rhs + 2 * lanes);
        npyv_u64 b3 = npyv_load_u64(rhs + 3 * lanes);
        npyv_u64 b4 = npyv_load_u64(rhs + 4 * lanes);
        npyv_u64 b5 = npyv_load_u64(rhs + 5 * lanes);
    #endif
        npyv_u64 r0 = V_INTRIN(a0, b0);
        npyv_u64 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_u64 r2 = V_INTRIN(a2, b2);
        npyv_u64 r3 = V_INTRIN(a3, b3);
        npyv_u64 r4 = V_INTRIN(a4, b4);
        npyv_u64 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_u64(dst, r0);
        npyv_store_u64(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_u64(dst + 2 * lanes, r2);
        npyv_store_u64(dst + 3 * lanes, r3);
        npyv_store_u64(dst + 4 * lanes, r4);
        npyv_store_u64(dst + 5 * lanes, r5);
    #endif
        pos += chunk;
    }
    // single-vector pass
    while (pos + lanes <= len) {
        npyv_u64 a = npyv_load_u64(&ip1[pos]);
        npyv_u64 b = npyv_load_u64(&ip2[pos]);
        npyv_u64 r = V_INTRIN(a, b);
        npyv_store_u64(&op1[pos], r);
        pos += lanes;
    }
    // scalar tail
    while (pos < len) {
        const npyv_lanetype_u64 x = ip1[pos];
        const npyv_lanetype_u64 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
/*
 * Strided element-wise operation: reads `len` elements from ip1 and ip2
 * (strides sip1 / sip2, counted in elements) and writes the results to op1
 * (stride sop1).  A stride of one uses the contiguous load/store; any other
 * stride uses the npyv_loadn / npyv_storen variants.  Elements that do not
 * fill a whole vector are handled with SCALAR_OP.
 */
static inline void
simd_binary_min_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1,
                           const npyv_lanetype_u64 *ip2, npy_intp sip2,
                                 npyv_lanetype_u64 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_u64;

    // vector part: one full vector of results per pass
    while (len >= lanes) {
        npyv_u64 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_u64(ip1, sip1);
        } else {
            lhs = npyv_load_u64(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_u64(ip2, sip2);
        } else {
            rhs = npyv_load_u64(ip2);
        }
        npyv_u64 out = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_u64(op1, sop1, out);
        } else {
            npyv_store_u64(op1, out);
        }
        len -= lanes;
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
    }
    // scalar part for the remaining elements
    while (len > 0) {
        const npyv_lanetype_u64 x = *ip1;
        const npyv_lanetype_u64 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_maxp_i
#if NPY_SIMD && (!1 || (0 && 1))

#if 0 && !1
    #define V_INTRIN npyv_maxpn_u64 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxpn_u64
#else
    #define V_INTRIN npyv_maxp_u64
    #define V_REDUCE_INTRIN npyv_reduce_maxp_u64
#endif

// contiguous input.
/*
 * Contiguous reduction: combines the `len` elements starting at `ip` with
 * the value already stored in op1[0] and writes the result back to op1[0].
 * Full vectors are merged with V_INTRIN, collapsed to a single lane value
 * with V_REDUCE_INTRIN, and the leftover elements go through SCALAR_OP.
 * Returns without touching op1 when `len` is smaller than one.
 */
static inline void
simd_reduce_c_maxp_u64(const npyv_lanetype_u64 *ip, npyv_lanetype_u64 *op1, npy_intp len)
{
    if (len <= 0) {
        return;
    }
    const int lanes = npyv_nlanes_u64;
    const int block = 8 * lanes;
    // start every lane from the current output value
    npyv_u64 best = npyv_setall_u64(op1[0]);

    // eight vectors per pass, merged pairwise
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_u64 a0 = npyv_load_u64(ip);
        npyv_u64 a1 = npyv_load_u64(ip + lanes);
        npyv_u64 a2 = npyv_load_u64(ip + 2 * lanes);
        npyv_u64 a3 = npyv_load_u64(ip + 3 * lanes);
        npyv_u64 a4 = npyv_load_u64(ip + 4 * lanes);
        npyv_u64 a5 = npyv_load_u64(ip + 5 * lanes);
        npyv_u64 a6 = npyv_load_u64(ip + 6 * lanes);
        npyv_u64 a7 = npyv_load_u64(ip + 7 * lanes);

        npyv_u64 p01 = V_INTRIN(a0, a1);
        npyv_u64 p23 = V_INTRIN(a2, a3);
        npyv_u64 p45 = V_INTRIN(a4, a5);
        npyv_u64 p67 = V_INTRIN(a6, a7);
        npyv_u64 lower = V_INTRIN(p01, p23);
        npyv_u64 upper = V_INTRIN(p45, p67);
        best = V_INTRIN(best, V_INTRIN(lower, upper));

        len -= block;
        ip += block;
    }
    // one vector at a time
    while (len >= lanes) {
        best = V_INTRIN(best, npyv_load_u64(ip));
        len -= lanes;
        ip += lanes;
    }
    // collapse the lanes, then handle the scalar tail
    npyv_lanetype_u64 result = V_REDUCE_INTRIN(best);
    while (len > 0) {
        const npyv_lanetype_u64 next = *ip;
        result = SCALAR_OP(result, next);
        ++ip;
        --len;
    }
    op1[0] = result;
}

// contiguous inputs and output.
/*
 * Element-wise operation over two contiguous inputs into a contiguous
 * output: op1[k] is computed from ip1[k] and ip2[k] for every k in
 * [0, len).  Vectors go through V_INTRIN, the leftover elements through
 * SCALAR_OP.  Within one unrolled pass all loads are issued before any
 * store.
 */
static inline void
simd_binary_ccc_maxp_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *ip2,
                                     npyv_lanetype_u64 *op1, npy_intp len)
{
#if NPY_SIMD_WIDTH == 128
    // 6 vectors per pass: chosen for best results on Apple M1
    const int unroll = 6;
#else
    // 2 vectors per pass: avoids a memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int lanes = npyv_nlanes_u64;
    const int chunk = unroll * lanes;
    npy_intp pos = 0;

    // unrolled pass
    while (pos + chunk <= len) {
        const npyv_lanetype_u64 *lhs = ip1 + pos;
        const npyv_lanetype_u64 *rhs = ip2 + pos;
        npyv_lanetype_u64 *dst = op1 + pos;

        npyv_u64 a0 = npyv_load_u64(lhs);
        npyv_u64 a1 = npyv_load_u64(lhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u64 a2 = npyv_load_u64(lhs + 2 * lanes);
        npyv_u64 a3 = npyv_load_u64(lhs + 3 * lanes);
        npyv_u64 a4 = npyv_load_u64(lhs + 4 * lanes);
        npyv_u64 a5 = npyv_load_u64(lhs + 5 * lanes);
    #endif
        npyv_u64 b0 = npyv_load_u64(rhs);
        npyv_u64 b1 = npyv_load_u64(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u64 b2 = npyv_load_u64(rhs + 2 * lanes);
        npyv_u64 b3 = npyv_load_u64(rhs + 3 * lanes);
        npyv_u64 b4 = npyv_load_u64(rhs + 4 * lanes);
        npyv_u64 b5 = npyv_load_u64(rhs + 5 * lanes);
    #endif
        npyv_u64 r0 = V_INTRIN(a0, b0);
        npyv_u64 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_u64 r2 = V_INTRIN(a2, b2);
        npyv_u64 r3 = V_INTRIN(a3, b3);
        npyv_u64 r4 = V_INTRIN(a4, b4);
        npyv_u64 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_u64(dst, r0);
        npyv_store_u64(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_u64(dst + 2 * lanes, r2);
        npyv_store_u64(dst + 3 * lanes, r3);
        npyv_store_u64(dst + 4 * lanes, r4);
        npyv_store_u64(dst + 5 * lanes, r5);
    #endif
        pos += chunk;
    }
    // single-vector pass
    while (pos + lanes <= len) {
        npyv_u64 a = npyv_load_u64(&ip1[pos]);
        npyv_u64 b = npyv_load_u64(&ip2[pos]);
        npyv_u64 r = V_INTRIN(a, b);
        npyv_store_u64(&op1[pos], r);
        pos += lanes;
    }
    // scalar tail
    while (pos < len) {
        const npyv_lanetype_u64 x = ip1[pos];
        const npyv_lanetype_u64 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
/*
 * Strided element-wise operation: reads `len` elements from ip1 and ip2
 * (strides sip1 / sip2, counted in elements) and writes the results to op1
 * (stride sop1).  A stride of one uses the contiguous load/store; any other
 * stride uses the npyv_loadn / npyv_storen variants.  Elements that do not
 * fill a whole vector are handled with SCALAR_OP.
 */
static inline void
simd_binary_maxp_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1,
                           const npyv_lanetype_u64 *ip2, npy_intp sip2,
                                 npyv_lanetype_u64 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_u64;

    // vector part: one full vector of results per pass
    while (len >= lanes) {
        npyv_u64 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_u64(ip1, sip1);
        } else {
            lhs = npyv_load_u64(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_u64(ip2, sip2);
        } else {
            rhs = npyv_load_u64(ip2);
        }
        npyv_u64 out = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_u64(op1, sop1, out);
        } else {
            npyv_store_u64(op1, out);
        }
        len -= lanes;
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
    }
    // scalar part for the remaining elements
    while (len > 0) {
        const npyv_lanetype_u64 x = *ip1;
        const npyv_lanetype_u64 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_minp_i
#if NPY_SIMD && (!1 || (0 && 1))

#if 0 && !1
    #define V_INTRIN npyv_minpn_u64 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minpn_u64
#else
    #define V_INTRIN npyv_minp_u64
    #define V_REDUCE_INTRIN npyv_reduce_minp_u64
#endif

// contiguous input.
/*
 * Contiguous reduction: combines the `len` elements starting at `ip` with
 * the value already stored in op1[0] and writes the result back to op1[0].
 * Full vectors are merged with V_INTRIN, collapsed to a single lane value
 * with V_REDUCE_INTRIN, and the leftover elements go through SCALAR_OP.
 * Returns without touching op1 when `len` is smaller than one.
 */
static inline void
simd_reduce_c_minp_u64(const npyv_lanetype_u64 *ip, npyv_lanetype_u64 *op1, npy_intp len)
{
    if (len <= 0) {
        return;
    }
    const int lanes = npyv_nlanes_u64;
    const int block = 8 * lanes;
    // start every lane from the current output value
    npyv_u64 best = npyv_setall_u64(op1[0]);

    // eight vectors per pass, merged pairwise
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_u64 a0 = npyv_load_u64(ip);
        npyv_u64 a1 = npyv_load_u64(ip + lanes);
        npyv_u64 a2 = npyv_load_u64(ip + 2 * lanes);
        npyv_u64 a3 = npyv_load_u64(ip + 3 * lanes);
        npyv_u64 a4 = npyv_load_u64(ip + 4 * lanes);
        npyv_u64 a5 = npyv_load_u64(ip + 5 * lanes);
        npyv_u64 a6 = npyv_load_u64(ip + 6 * lanes);
        npyv_u64 a7 = npyv_load_u64(ip + 7 * lanes);

        npyv_u64 p01 = V_INTRIN(a0, a1);
        npyv_u64 p23 = V_INTRIN(a2, a3);
        npyv_u64 p45 = V_INTRIN(a4, a5);
        npyv_u64 p67 = V_INTRIN(a6, a7);
        npyv_u64 lower = V_INTRIN(p01, p23);
        npyv_u64 upper = V_INTRIN(p45, p67);
        best = V_INTRIN(best, V_INTRIN(lower, upper));

        len -= block;
        ip += block;
    }
    // one vector at a time
    while (len >= lanes) {
        best = V_INTRIN(best, npyv_load_u64(ip));
        len -= lanes;
        ip += lanes;
    }
    // collapse the lanes, then handle the scalar tail
    npyv_lanetype_u64 result = V_REDUCE_INTRIN(best);
    while (len > 0) {
        const npyv_lanetype_u64 next = *ip;
        result = SCALAR_OP(result, next);
        ++ip;
        --len;
    }
    op1[0] = result;
}

// contiguous inputs and output.
/*
 * Element-wise operation over two contiguous inputs into a contiguous
 * output: op1[k] is computed from ip1[k] and ip2[k] for every k in
 * [0, len).  Vectors go through V_INTRIN, the leftover elements through
 * SCALAR_OP.  Within one unrolled pass all loads are issued before any
 * store.
 */
static inline void
simd_binary_ccc_minp_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *ip2,
                                     npyv_lanetype_u64 *op1, npy_intp len)
{
#if NPY_SIMD_WIDTH == 128
    // 6 vectors per pass: chosen for best results on Apple M1
    const int unroll = 6;
#else
    // 2 vectors per pass: avoids a memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int lanes = npyv_nlanes_u64;
    const int chunk = unroll * lanes;
    npy_intp pos = 0;

    // unrolled pass
    while (pos + chunk <= len) {
        const npyv_lanetype_u64 *lhs = ip1 + pos;
        const npyv_lanetype_u64 *rhs = ip2 + pos;
        npyv_lanetype_u64 *dst = op1 + pos;

        npyv_u64 a0 = npyv_load_u64(lhs);
        npyv_u64 a1 = npyv_load_u64(lhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u64 a2 = npyv_load_u64(lhs + 2 * lanes);
        npyv_u64 a3 = npyv_load_u64(lhs + 3 * lanes);
        npyv_u64 a4 = npyv_load_u64(lhs + 4 * lanes);
        npyv_u64 a5 = npyv_load_u64(lhs + 5 * lanes);
    #endif
        npyv_u64 b0 = npyv_load_u64(rhs);
        npyv_u64 b1 = npyv_load_u64(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_u64 b2 = npyv_load_u64(rhs + 2 * lanes);
        npyv_u64 b3 = npyv_load_u64(rhs + 3 * lanes);
        npyv_u64 b4 = npyv_load_u64(rhs + 4 * lanes);
        npyv_u64 b5 = npyv_load_u64(rhs + 5 * lanes);
    #endif
        npyv_u64 r0 = V_INTRIN(a0, b0);
        npyv_u64 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_u64 r2 = V_INTRIN(a2, b2);
        npyv_u64 r3 = V_INTRIN(a3, b3);
        npyv_u64 r4 = V_INTRIN(a4, b4);
        npyv_u64 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_u64(dst, r0);
        npyv_store_u64(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_u64(dst + 2 * lanes, r2);
        npyv_store_u64(dst + 3 * lanes, r3);
        npyv_store_u64(dst + 4 * lanes, r4);
        npyv_store_u64(dst + 5 * lanes, r5);
    #endif
        pos += chunk;
    }
    // single-vector pass
    while (pos + lanes <= len) {
        npyv_u64 a = npyv_load_u64(&ip1[pos]);
        npyv_u64 b = npyv_load_u64(&ip2[pos]);
        npyv_u64 r = V_INTRIN(a, b);
        npyv_store_u64(&op1[pos], r);
        pos += lanes;
    }
    // scalar tail
    while (pos < len) {
        const npyv_lanetype_u64 x = ip1[pos];
        const npyv_lanetype_u64 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 0 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
/*
 * Strided element-wise operation: reads `len` elements from ip1 and ip2
 * (strides sip1 / sip2, counted in elements) and writes the results to op1
 * (stride sop1).  A stride of one uses the contiguous load/store; any other
 * stride uses the npyv_loadn / npyv_storen variants.  Elements that do not
 * fill a whole vector are handled with SCALAR_OP.
 */
static inline void
simd_binary_minp_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1,
                           const npyv_lanetype_u64 *ip2, npy_intp sip2,
                                 npyv_lanetype_u64 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_u64;

    // vector part: one full vector of results per pass
    while (len >= lanes) {
        npyv_u64 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_u64(ip1, sip1);
        } else {
            lhs = npyv_load_u64(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_u64(ip2, sip2);
        } else {
            rhs = npyv_load_u64(ip2);
        }
        npyv_u64 out = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_u64(op1, sop1, out);
        } else {
            npyv_store_u64(op1, out);
        }
        len -= lanes;
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
    }
    // scalar part for the remaining elements
    while (len > 0) {
        const npyv_lanetype_u64 x = *ip1;
        const npyv_lanetype_u64 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP


#line 106
#line 110
#define SCALAR_OP scalar_max_f
#if NPY_SIMD_F32 && (!0 || (1 && 0))

#if 1 && !0
    #define V_INTRIN npyv_maxn_f32 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxn_f32
#else
    #define V_INTRIN npyv_max_f32
    #define V_REDUCE_INTRIN npyv_reduce_max_f32
#endif

// contiguous input.
/*
 * Contiguous reduction: combines the `len` elements starting at `ip` with
 * the value already stored in op1[0] and writes the result back to op1[0].
 * Full vectors are merged with V_INTRIN, collapsed to a single lane value
 * with V_REDUCE_INTRIN, and the leftover elements go through SCALAR_OP.
 * Returns without touching op1 when `len` is smaller than one.
 */
static inline void
simd_reduce_c_max_f32(const npyv_lanetype_f32 *ip, npyv_lanetype_f32 *op1, npy_intp len)
{
    if (len <= 0) {
        return;
    }
    const int lanes = npyv_nlanes_f32;
    const int block = 8 * lanes;
    // start every lane from the current output value
    npyv_f32 best = npyv_setall_f32(op1[0]);

    // eight vectors per pass, merged pairwise
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_f32 a0 = npyv_load_f32(ip);
        npyv_f32 a1 = npyv_load_f32(ip + lanes);
        npyv_f32 a2 = npyv_load_f32(ip + 2 * lanes);
        npyv_f32 a3 = npyv_load_f32(ip + 3 * lanes);
        npyv_f32 a4 = npyv_load_f32(ip + 4 * lanes);
        npyv_f32 a5 = npyv_load_f32(ip + 5 * lanes);
        npyv_f32 a6 = npyv_load_f32(ip + 6 * lanes);
        npyv_f32 a7 = npyv_load_f32(ip + 7 * lanes);

        npyv_f32 p01 = V_INTRIN(a0, a1);
        npyv_f32 p23 = V_INTRIN(a2, a3);
        npyv_f32 p45 = V_INTRIN(a4, a5);
        npyv_f32 p67 = V_INTRIN(a6, a7);
        npyv_f32 lower = V_INTRIN(p01, p23);
        npyv_f32 upper = V_INTRIN(p45, p67);
        best = V_INTRIN(best, V_INTRIN(lower, upper));

        len -= block;
        ip += block;
    }
    // one vector at a time
    while (len >= lanes) {
        best = V_INTRIN(best, npyv_load_f32(ip));
        len -= lanes;
        ip += lanes;
    }
    // collapse the lanes, then handle the scalar tail
    npyv_lanetype_f32 result = V_REDUCE_INTRIN(best);
    while (len > 0) {
        const npyv_lanetype_f32 next = *ip;
        result = SCALAR_OP(result, next);
        ++ip;
        --len;
    }
    op1[0] = result;
}

// contiguous inputs and output.
/*
 * Element-wise operation over two contiguous inputs into a contiguous
 * output: op1[k] is computed from ip1[k] and ip2[k] for every k in
 * [0, len).  Vectors go through V_INTRIN, the leftover elements through
 * SCALAR_OP.  Within one unrolled pass all loads are issued before any
 * store.
 */
static inline void
simd_binary_ccc_max_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *ip2,
                                     npyv_lanetype_f32 *op1, npy_intp len)
{
#if NPY_SIMD_WIDTH == 128
    // 6 vectors per pass: chosen for best results on Apple M1
    const int unroll = 6;
#else
    // 2 vectors per pass: avoids a memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int lanes = npyv_nlanes_f32;
    const int chunk = unroll * lanes;
    npy_intp pos = 0;

    // unrolled pass
    while (pos + chunk <= len) {
        const npyv_lanetype_f32 *lhs = ip1 + pos;
        const npyv_lanetype_f32 *rhs = ip2 + pos;
        npyv_lanetype_f32 *dst = op1 + pos;

        npyv_f32 a0 = npyv_load_f32(lhs);
        npyv_f32 a1 = npyv_load_f32(lhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_f32 a2 = npyv_load_f32(lhs + 2 * lanes);
        npyv_f32 a3 = npyv_load_f32(lhs + 3 * lanes);
        npyv_f32 a4 = npyv_load_f32(lhs + 4 * lanes);
        npyv_f32 a5 = npyv_load_f32(lhs + 5 * lanes);
    #endif
        npyv_f32 b0 = npyv_load_f32(rhs);
        npyv_f32 b1 = npyv_load_f32(rhs + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_f32 b2 = npyv_load_f32(rhs + 2 * lanes);
        npyv_f32 b3 = npyv_load_f32(rhs + 3 * lanes);
        npyv_f32 b4 = npyv_load_f32(rhs + 4 * lanes);
        npyv_f32 b5 = npyv_load_f32(rhs + 5 * lanes);
    #endif
        npyv_f32 r0 = V_INTRIN(a0, b0);
        npyv_f32 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_f32 r2 = V_INTRIN(a2, b2);
        npyv_f32 r3 = V_INTRIN(a3, b3);
        npyv_f32 r4 = V_INTRIN(a4, b4);
        npyv_f32 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_f32(dst, r0);
        npyv_store_f32(dst + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_f32(dst + 2 * lanes, r2);
        npyv_store_f32(dst + 3 * lanes, r3);
        npyv_store_f32(dst + 4 * lanes, r4);
        npyv_store_f32(dst + 5 * lanes, r5);
    #endif
        pos += chunk;
    }
    // single-vector pass
    while (pos + lanes <= len) {
        npyv_f32 a = npyv_load_f32(&ip1[pos]);
        npyv_f32 b = npyv_load_f32(&ip2[pos]);
        npyv_f32 r = V_INTRIN(a, b);
        npyv_store_f32(&op1[pos], r);
        pos += lanes;
    }
    // scalar tail
    while (pos < len) {
        const npyv_lanetype_f32 x = ip1[pos];
        const npyv_lanetype_f32 y = ip2[pos];
        op1[pos] = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 1 && !defined(NPY_HAVE_NEON)
// unroll scalars faster than non-contiguous vector load/store on Arm
/*
 * Strided element-wise operation: reads `len` elements from ip1 and ip2
 * (strides sip1 / sip2, counted in elements) and writes the results to op1
 * (stride sop1).  A stride of one uses the contiguous load/store; any other
 * stride uses the npyv_loadn / npyv_storen variants.  Elements that do not
 * fill a whole vector are handled with SCALAR_OP.
 */
static inline void
simd_binary_max_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1,
                           const npyv_lanetype_f32 *ip2, npy_intp sip2,
                                 npyv_lanetype_f32 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_f32;

    // vector part: one full vector of results per pass
    while (len >= lanes) {
        npyv_f32 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_f32(ip1, sip1);
        } else {
            lhs = npyv_load_f32(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_f32(ip2, sip2);
        } else {
            rhs = npyv_load_f32(ip2);
        }
        npyv_f32 out = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_f32(op1, sop1, out);
        } else {
            npyv_store_f32(op1, out);
        }
        len -= lanes;
        ip1 += sip1 * lanes;
        ip2 += sip2 * lanes;
        op1 += sop1 * lanes;
    }
    // scalar part for the remaining elements
    while (len > 0) {
        const npyv_lanetype_f32 x = *ip1;
        const npyv_lanetype_f32 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_min_f
#if NPY_SIMD_F32 && (!0 || (1 && 0))

#if 1 && !0
    #define V_INTRIN npyv_minn_f32 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minn_f32
#else
    #define V_INTRIN npyv_min_f32
    #define V_REDUCE_INTRIN npyv_reduce_min_f32
#endif

/*
 * Reduction over a contiguous buffer (instantiation: min, f32).
 *
 * Folds the `len` elements starting at `ip` into `op1[0]`; the value already
 * stored in `op1[0]` seeds the accumulator. Returns without touching `op1`
 * when `len` is below 1.
 *
 * V_INTRIN / V_REDUCE_INTRIN are npyv_minn_f32 / npyv_reduce_minn_f32 here
 * (flagged "propagates NaNs" where the macro is defined); SCALAR_OP is
 * scalar_min_f.
 */
static inline void
simd_reduce_c_min_f32(const npyv_lanetype_f32 *ip, npyv_lanetype_f32 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_f32;
    const int block = lanes*8;
    // every lane starts from the current output value
    npyv_f32 vacc = npyv_setall_f32(op1[0]);

    // main loop: eight vectors per pass, combined pairwise as a tree
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_f32 a0 = npyv_load_f32(ip);
        npyv_f32 a1 = npyv_load_f32(ip + lanes);
        npyv_f32 a2 = npyv_load_f32(ip + lanes * 2);
        npyv_f32 a3 = npyv_load_f32(ip + lanes * 3);
        npyv_f32 a4 = npyv_load_f32(ip + lanes * 4);
        npyv_f32 a5 = npyv_load_f32(ip + lanes * 5);
        npyv_f32 a6 = npyv_load_f32(ip + lanes * 6);
        npyv_f32 a7 = npyv_load_f32(ip + lanes * 7);

        npyv_f32 p0 = V_INTRIN(a0, a1);
        npyv_f32 p1 = V_INTRIN(a2, a3);
        npyv_f32 p2 = V_INTRIN(a4, a5);
        npyv_f32 p3 = V_INTRIN(a6, a7);
        npyv_f32 q0 = V_INTRIN(p0, p1);
        npyv_f32 q1 = V_INTRIN(p2, p3);
        npyv_f32 blk = V_INTRIN(q0, q1);
        vacc = V_INTRIN(vacc, blk);

        ip  += block;
        len -= block;
    }
    // leftover whole vectors, one at a time
    while (len >= lanes) {
        npyv_f32 chunk = npyv_load_f32(ip);
        vacc = V_INTRIN(vacc, chunk);
        ip  += lanes;
        len -= lanes;
    }
    // collapse the lanes, then fold in the scalar tail
    npyv_lanetype_f32 result = V_REDUCE_INTRIN(vacc);
    for (npy_intp k = 0; k < len; ++k) {
        const npyv_lanetype_f32 item = ip[k];
        result = SCALAR_OP(result, item);
    }
    op1[0] = result;
}

/*
 * Element-wise binary kernel for contiguous inputs and output
 * (instantiation: min, f32).
 *
 * Combines ip1[k] with ip2[k] into op1[k] for the `len` elements, using
 * V_INTRIN (npyv_minn_f32 here) on whole vectors and SCALAR_OP (scalar_min_f)
 * on the elements that do not fill a vector.
 */
static inline void
simd_binary_ccc_min_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *ip2,
                                     npyv_lanetype_f32 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_f32;
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int span = unroll * lanes;

    // unrolled body: every load is issued before the first store
    while (len >= span) {
        npyv_f32 a0 = npyv_load_f32(ip1);
        npyv_f32 a1 = npyv_load_f32(ip1 + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_f32 a2 = npyv_load_f32(ip1 + 2 * lanes);
        npyv_f32 a3 = npyv_load_f32(ip1 + 3 * lanes);
        npyv_f32 a4 = npyv_load_f32(ip1 + 4 * lanes);
        npyv_f32 a5 = npyv_load_f32(ip1 + 5 * lanes);
    #endif
        npyv_f32 b0 = npyv_load_f32(ip2);
        npyv_f32 b1 = npyv_load_f32(ip2 + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_f32 b2 = npyv_load_f32(ip2 + 2 * lanes);
        npyv_f32 b3 = npyv_load_f32(ip2 + 3 * lanes);
        npyv_f32 b4 = npyv_load_f32(ip2 + 4 * lanes);
        npyv_f32 b5 = npyv_load_f32(ip2 + 5 * lanes);
    #endif
        npyv_f32 r0 = V_INTRIN(a0, b0);
        npyv_f32 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_f32 r2 = V_INTRIN(a2, b2);
        npyv_f32 r3 = V_INTRIN(a3, b3);
        npyv_f32 r4 = V_INTRIN(a4, b4);
        npyv_f32 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_f32(op1, r0);
        npyv_store_f32(op1 + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_f32(op1 + 2 * lanes, r2);
        npyv_store_f32(op1 + 3 * lanes, r3);
        npyv_store_f32(op1 + 4 * lanes, r4);
        npyv_store_f32(op1 + 5 * lanes, r5);
    #endif
        ip1 += span;
        ip2 += span;
        op1 += span;
        len -= span;
    }
    // remaining whole vectors, one per pass
    while (len >= lanes) {
        npyv_f32 a = npyv_load_f32(ip1);
        npyv_f32 b = npyv_load_f32(ip2);
        npyv_f32 r = V_INTRIN(a, b);
        npyv_store_f32(op1, r);
        ip1 += lanes;
        ip2 += lanes;
        op1 += lanes;
        len -= lanes;
    }
    // scalar tail
    for (; len > 0; --len, ++ip1, ++ip2, ++op1) {
        const npyv_lanetype_f32 x = *ip1;
        const npyv_lanetype_f32 y = *ip2;
        *op1 = SCALAR_OP(x, y);
    }
}
// non-contiguous for float 32/64-bit memory access
#if 1 && !defined(NPY_HAVE_NEON)
// Not built for NEON: on Arm, unrolled scalar code is faster than non-contiguous vector load/store
/*
 * Element-wise binary kernel for strided operands (instantiation: min, f32).
 *
 * sip1, sip2 and sop1 are strides counted in elements: the typed pointers
 * are advanced by them directly. A stride of exactly 1 uses the contiguous
 * load/store; any other value goes through npyv_loadn_f32 / npyv_storen_f32.
 * V_INTRIN is npyv_minn_f32 here; SCALAR_OP (scalar_min_f) handles the
 * elements that do not fill a whole vector.
 */
static inline void
simd_binary_min_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1,
                           const npyv_lanetype_f32 *ip2, npy_intp sip2,
                                 npyv_lanetype_f32 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_f32;

    // whole vectors
    while (len >= lanes) {
        npyv_f32 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_f32(ip1, sip1);
        } else {
            lhs = npyv_load_f32(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_f32(ip2, sip2);
        } else {
            rhs = npyv_load_f32(ip2);
        }
        npyv_f32 out = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_f32(op1, sop1, out);
        } else {
            npyv_store_f32(op1, out);
        }
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
        len -= lanes;
    }
    // scalar tail
    while (len > 0) {
        const npyv_lanetype_f32 x = *ip1;
        const npyv_lanetype_f32 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
        --len;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_maxp_f
#if NPY_SIMD_F32 && (!1 || (1 && 1))

#if 1 && !1
    #define V_INTRIN npyv_maxpn_f32 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxpn_f32
#else
    #define V_INTRIN npyv_maxp_f32
    #define V_REDUCE_INTRIN npyv_reduce_maxp_f32
#endif

/*
 * Reduction over a contiguous buffer (instantiation: maxp, f32).
 *
 * Folds the `len` elements starting at `ip` into `op1[0]`; the value already
 * stored in `op1[0]` seeds the accumulator. Returns without touching `op1`
 * when `len` is below 1.
 *
 * V_INTRIN / V_REDUCE_INTRIN are npyv_maxp_f32 / npyv_reduce_maxp_f32 here;
 * SCALAR_OP is scalar_maxp_f.
 */
static inline void
simd_reduce_c_maxp_f32(const npyv_lanetype_f32 *ip, npyv_lanetype_f32 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_f32;
    const int block = lanes*8;
    // every lane starts from the current output value
    npyv_f32 vacc = npyv_setall_f32(op1[0]);

    // main loop: eight vectors per pass, combined pairwise as a tree
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_f32 a0 = npyv_load_f32(ip);
        npyv_f32 a1 = npyv_load_f32(ip + lanes);
        npyv_f32 a2 = npyv_load_f32(ip + lanes * 2);
        npyv_f32 a3 = npyv_load_f32(ip + lanes * 3);
        npyv_f32 a4 = npyv_load_f32(ip + lanes * 4);
        npyv_f32 a5 = npyv_load_f32(ip + lanes * 5);
        npyv_f32 a6 = npyv_load_f32(ip + lanes * 6);
        npyv_f32 a7 = npyv_load_f32(ip + lanes * 7);

        npyv_f32 p0 = V_INTRIN(a0, a1);
        npyv_f32 p1 = V_INTRIN(a2, a3);
        npyv_f32 p2 = V_INTRIN(a4, a5);
        npyv_f32 p3 = V_INTRIN(a6, a7);
        npyv_f32 q0 = V_INTRIN(p0, p1);
        npyv_f32 q1 = V_INTRIN(p2, p3);
        npyv_f32 blk = V_INTRIN(q0, q1);
        vacc = V_INTRIN(vacc, blk);

        ip  += block;
        len -= block;
    }
    // leftover whole vectors, one at a time
    while (len >= lanes) {
        npyv_f32 chunk = npyv_load_f32(ip);
        vacc = V_INTRIN(vacc, chunk);
        ip  += lanes;
        len -= lanes;
    }
    // collapse the lanes, then fold in the scalar tail
    npyv_lanetype_f32 result = V_REDUCE_INTRIN(vacc);
    for (npy_intp k = 0; k < len; ++k) {
        const npyv_lanetype_f32 item = ip[k];
        result = SCALAR_OP(result, item);
    }
    op1[0] = result;
}

/*
 * Element-wise binary kernel for contiguous inputs and output
 * (instantiation: maxp, f32).
 *
 * Combines ip1[k] with ip2[k] into op1[k] for the `len` elements, using
 * V_INTRIN (npyv_maxp_f32 here) on whole vectors and SCALAR_OP
 * (scalar_maxp_f) on the elements that do not fill a vector.
 */
static inline void
simd_binary_ccc_maxp_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *ip2,
                                     npyv_lanetype_f32 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_f32;
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int span = unroll * lanes;

    // unrolled body: every load is issued before the first store
    while (len >= span) {
        npyv_f32 a0 = npyv_load_f32(ip1);
        npyv_f32 a1 = npyv_load_f32(ip1 + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_f32 a2 = npyv_load_f32(ip1 + 2 * lanes);
        npyv_f32 a3 = npyv_load_f32(ip1 + 3 * lanes);
        npyv_f32 a4 = npyv_load_f32(ip1 + 4 * lanes);
        npyv_f32 a5 = npyv_load_f32(ip1 + 5 * lanes);
    #endif
        npyv_f32 b0 = npyv_load_f32(ip2);
        npyv_f32 b1 = npyv_load_f32(ip2 + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_f32 b2 = npyv_load_f32(ip2 + 2 * lanes);
        npyv_f32 b3 = npyv_load_f32(ip2 + 3 * lanes);
        npyv_f32 b4 = npyv_load_f32(ip2 + 4 * lanes);
        npyv_f32 b5 = npyv_load_f32(ip2 + 5 * lanes);
    #endif
        npyv_f32 r0 = V_INTRIN(a0, b0);
        npyv_f32 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_f32 r2 = V_INTRIN(a2, b2);
        npyv_f32 r3 = V_INTRIN(a3, b3);
        npyv_f32 r4 = V_INTRIN(a4, b4);
        npyv_f32 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_f32(op1, r0);
        npyv_store_f32(op1 + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_f32(op1 + 2 * lanes, r2);
        npyv_store_f32(op1 + 3 * lanes, r3);
        npyv_store_f32(op1 + 4 * lanes, r4);
        npyv_store_f32(op1 + 5 * lanes, r5);
    #endif
        ip1 += span;
        ip2 += span;
        op1 += span;
        len -= span;
    }
    // remaining whole vectors, one per pass
    while (len >= lanes) {
        npyv_f32 a = npyv_load_f32(ip1);
        npyv_f32 b = npyv_load_f32(ip2);
        npyv_f32 r = V_INTRIN(a, b);
        npyv_store_f32(op1, r);
        ip1 += lanes;
        ip2 += lanes;
        op1 += lanes;
        len -= lanes;
    }
    // scalar tail
    for (; len > 0; --len, ++ip1, ++ip2, ++op1) {
        const npyv_lanetype_f32 x = *ip1;
        const npyv_lanetype_f32 y = *ip2;
        *op1 = SCALAR_OP(x, y);
    }
}
// non-contiguous for float 32/64-bit memory access
#if 1 && !defined(NPY_HAVE_NEON)
// Not built for NEON: on Arm, unrolled scalar code is faster than non-contiguous vector load/store
/*
 * Element-wise binary kernel for strided operands (instantiation: maxp, f32).
 *
 * sip1, sip2 and sop1 are strides counted in elements: the typed pointers
 * are advanced by them directly. A stride of exactly 1 uses the contiguous
 * load/store; any other value goes through npyv_loadn_f32 / npyv_storen_f32.
 * V_INTRIN is npyv_maxp_f32 here; SCALAR_OP (scalar_maxp_f) handles the
 * elements that do not fill a whole vector.
 */
static inline void
simd_binary_maxp_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1,
                           const npyv_lanetype_f32 *ip2, npy_intp sip2,
                                 npyv_lanetype_f32 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_f32;

    // whole vectors
    while (len >= lanes) {
        npyv_f32 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_f32(ip1, sip1);
        } else {
            lhs = npyv_load_f32(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_f32(ip2, sip2);
        } else {
            rhs = npyv_load_f32(ip2);
        }
        npyv_f32 out = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_f32(op1, sop1, out);
        } else {
            npyv_store_f32(op1, out);
        }
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
        len -= lanes;
    }
    // scalar tail
    while (len > 0) {
        const npyv_lanetype_f32 x = *ip1;
        const npyv_lanetype_f32 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
        --len;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_minp_f
#if NPY_SIMD_F32 && (!1 || (1 && 1))

#if 1 && !1
    #define V_INTRIN npyv_minpn_f32 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minpn_f32
#else
    #define V_INTRIN npyv_minp_f32
    #define V_REDUCE_INTRIN npyv_reduce_minp_f32
#endif

/*
 * Reduction over a contiguous buffer (instantiation: minp, f32).
 *
 * Folds the `len` elements starting at `ip` into `op1[0]`; the value already
 * stored in `op1[0]` seeds the accumulator. Returns without touching `op1`
 * when `len` is below 1.
 *
 * V_INTRIN / V_REDUCE_INTRIN are npyv_minp_f32 / npyv_reduce_minp_f32 here;
 * SCALAR_OP is scalar_minp_f.
 */
static inline void
simd_reduce_c_minp_f32(const npyv_lanetype_f32 *ip, npyv_lanetype_f32 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_f32;
    const int block = lanes*8;
    // every lane starts from the current output value
    npyv_f32 vacc = npyv_setall_f32(op1[0]);

    // main loop: eight vectors per pass, combined pairwise as a tree
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_f32 a0 = npyv_load_f32(ip);
        npyv_f32 a1 = npyv_load_f32(ip + lanes);
        npyv_f32 a2 = npyv_load_f32(ip + lanes * 2);
        npyv_f32 a3 = npyv_load_f32(ip + lanes * 3);
        npyv_f32 a4 = npyv_load_f32(ip + lanes * 4);
        npyv_f32 a5 = npyv_load_f32(ip + lanes * 5);
        npyv_f32 a6 = npyv_load_f32(ip + lanes * 6);
        npyv_f32 a7 = npyv_load_f32(ip + lanes * 7);

        npyv_f32 p0 = V_INTRIN(a0, a1);
        npyv_f32 p1 = V_INTRIN(a2, a3);
        npyv_f32 p2 = V_INTRIN(a4, a5);
        npyv_f32 p3 = V_INTRIN(a6, a7);
        npyv_f32 q0 = V_INTRIN(p0, p1);
        npyv_f32 q1 = V_INTRIN(p2, p3);
        npyv_f32 blk = V_INTRIN(q0, q1);
        vacc = V_INTRIN(vacc, blk);

        ip  += block;
        len -= block;
    }
    // leftover whole vectors, one at a time
    while (len >= lanes) {
        npyv_f32 chunk = npyv_load_f32(ip);
        vacc = V_INTRIN(vacc, chunk);
        ip  += lanes;
        len -= lanes;
    }
    // collapse the lanes, then fold in the scalar tail
    npyv_lanetype_f32 result = V_REDUCE_INTRIN(vacc);
    for (npy_intp k = 0; k < len; ++k) {
        const npyv_lanetype_f32 item = ip[k];
        result = SCALAR_OP(result, item);
    }
    op1[0] = result;
}

/*
 * Element-wise binary kernel for contiguous inputs and output
 * (instantiation: minp, f32).
 *
 * Combines ip1[k] with ip2[k] into op1[k] for the `len` elements, using
 * V_INTRIN (npyv_minp_f32 here) on whole vectors and SCALAR_OP
 * (scalar_minp_f) on the elements that do not fill a vector.
 */
static inline void
simd_binary_ccc_minp_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *ip2,
                                     npyv_lanetype_f32 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_f32;
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int span = unroll * lanes;

    // unrolled body: every load is issued before the first store
    while (len >= span) {
        npyv_f32 a0 = npyv_load_f32(ip1);
        npyv_f32 a1 = npyv_load_f32(ip1 + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_f32 a2 = npyv_load_f32(ip1 + 2 * lanes);
        npyv_f32 a3 = npyv_load_f32(ip1 + 3 * lanes);
        npyv_f32 a4 = npyv_load_f32(ip1 + 4 * lanes);
        npyv_f32 a5 = npyv_load_f32(ip1 + 5 * lanes);
    #endif
        npyv_f32 b0 = npyv_load_f32(ip2);
        npyv_f32 b1 = npyv_load_f32(ip2 + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_f32 b2 = npyv_load_f32(ip2 + 2 * lanes);
        npyv_f32 b3 = npyv_load_f32(ip2 + 3 * lanes);
        npyv_f32 b4 = npyv_load_f32(ip2 + 4 * lanes);
        npyv_f32 b5 = npyv_load_f32(ip2 + 5 * lanes);
    #endif
        npyv_f32 r0 = V_INTRIN(a0, b0);
        npyv_f32 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_f32 r2 = V_INTRIN(a2, b2);
        npyv_f32 r3 = V_INTRIN(a3, b3);
        npyv_f32 r4 = V_INTRIN(a4, b4);
        npyv_f32 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_f32(op1, r0);
        npyv_store_f32(op1 + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_f32(op1 + 2 * lanes, r2);
        npyv_store_f32(op1 + 3 * lanes, r3);
        npyv_store_f32(op1 + 4 * lanes, r4);
        npyv_store_f32(op1 + 5 * lanes, r5);
    #endif
        ip1 += span;
        ip2 += span;
        op1 += span;
        len -= span;
    }
    // remaining whole vectors, one per pass
    while (len >= lanes) {
        npyv_f32 a = npyv_load_f32(ip1);
        npyv_f32 b = npyv_load_f32(ip2);
        npyv_f32 r = V_INTRIN(a, b);
        npyv_store_f32(op1, r);
        ip1 += lanes;
        ip2 += lanes;
        op1 += lanes;
        len -= lanes;
    }
    // scalar tail
    for (; len > 0; --len, ++ip1, ++ip2, ++op1) {
        const npyv_lanetype_f32 x = *ip1;
        const npyv_lanetype_f32 y = *ip2;
        *op1 = SCALAR_OP(x, y);
    }
}
// non-contiguous for float 32/64-bit memory access
#if 1 && !defined(NPY_HAVE_NEON)
// Not built for NEON: on Arm, unrolled scalar code is faster than non-contiguous vector load/store
/*
 * Element-wise binary kernel for strided operands (instantiation: minp, f32).
 *
 * sip1, sip2 and sop1 are strides counted in elements: the typed pointers
 * are advanced by them directly. A stride of exactly 1 uses the contiguous
 * load/store; any other value goes through npyv_loadn_f32 / npyv_storen_f32.
 * V_INTRIN is npyv_minp_f32 here; SCALAR_OP (scalar_minp_f) handles the
 * elements that do not fill a whole vector.
 */
static inline void
simd_binary_minp_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1,
                           const npyv_lanetype_f32 *ip2, npy_intp sip2,
                                 npyv_lanetype_f32 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_f32;

    // whole vectors
    while (len >= lanes) {
        npyv_f32 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_f32(ip1, sip1);
        } else {
            lhs = npyv_load_f32(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_f32(ip2, sip2);
        } else {
            rhs = npyv_load_f32(ip2);
        }
        npyv_f32 out = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_f32(op1, sop1, out);
        } else {
            npyv_store_f32(op1, out);
        }
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
        len -= lanes;
    }
    // scalar tail
    while (len > 0) {
        const npyv_lanetype_f32 x = *ip1;
        const npyv_lanetype_f32 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
        --len;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP


#line 106
#line 110
#define SCALAR_OP scalar_max_d
#if NPY_SIMD_F64 && (!0 || (1 && 0))

#if 1 && !0
    #define V_INTRIN npyv_maxn_f64 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxn_f64
#else
    #define V_INTRIN npyv_max_f64
    #define V_REDUCE_INTRIN npyv_reduce_max_f64
#endif

/*
 * Reduction over a contiguous buffer (instantiation: max, f64).
 *
 * Folds the `len` elements starting at `ip` into `op1[0]`; the value already
 * stored in `op1[0]` seeds the accumulator. Returns without touching `op1`
 * when `len` is below 1.
 *
 * V_INTRIN / V_REDUCE_INTRIN are npyv_maxn_f64 / npyv_reduce_maxn_f64 here
 * (flagged "propagates NaNs" where the macro is defined); SCALAR_OP is
 * scalar_max_d.
 */
static inline void
simd_reduce_c_max_f64(const npyv_lanetype_f64 *ip, npyv_lanetype_f64 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_f64;
    const int block = lanes*8;
    // every lane starts from the current output value
    npyv_f64 vacc = npyv_setall_f64(op1[0]);

    // main loop: eight vectors per pass, combined pairwise as a tree
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_f64 a0 = npyv_load_f64(ip);
        npyv_f64 a1 = npyv_load_f64(ip + lanes);
        npyv_f64 a2 = npyv_load_f64(ip + lanes * 2);
        npyv_f64 a3 = npyv_load_f64(ip + lanes * 3);
        npyv_f64 a4 = npyv_load_f64(ip + lanes * 4);
        npyv_f64 a5 = npyv_load_f64(ip + lanes * 5);
        npyv_f64 a6 = npyv_load_f64(ip + lanes * 6);
        npyv_f64 a7 = npyv_load_f64(ip + lanes * 7);

        npyv_f64 p0 = V_INTRIN(a0, a1);
        npyv_f64 p1 = V_INTRIN(a2, a3);
        npyv_f64 p2 = V_INTRIN(a4, a5);
        npyv_f64 p3 = V_INTRIN(a6, a7);
        npyv_f64 q0 = V_INTRIN(p0, p1);
        npyv_f64 q1 = V_INTRIN(p2, p3);
        npyv_f64 blk = V_INTRIN(q0, q1);
        vacc = V_INTRIN(vacc, blk);

        ip  += block;
        len -= block;
    }
    // leftover whole vectors, one at a time
    while (len >= lanes) {
        npyv_f64 chunk = npyv_load_f64(ip);
        vacc = V_INTRIN(vacc, chunk);
        ip  += lanes;
        len -= lanes;
    }
    // collapse the lanes, then fold in the scalar tail
    npyv_lanetype_f64 result = V_REDUCE_INTRIN(vacc);
    for (npy_intp k = 0; k < len; ++k) {
        const npyv_lanetype_f64 item = ip[k];
        result = SCALAR_OP(result, item);
    }
    op1[0] = result;
}

/*
 * Element-wise binary kernel for contiguous inputs and output
 * (instantiation: max, f64).
 *
 * Combines ip1[k] with ip2[k] into op1[k] for the `len` elements, using
 * V_INTRIN (npyv_maxn_f64 here) on whole vectors and SCALAR_OP (scalar_max_d)
 * on the elements that do not fill a vector.
 */
static inline void
simd_binary_ccc_max_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *ip2,
                                     npyv_lanetype_f64 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_f64;
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int span = unroll * lanes;

    // unrolled body: every load is issued before the first store
    while (len >= span) {
        npyv_f64 a0 = npyv_load_f64(ip1);
        npyv_f64 a1 = npyv_load_f64(ip1 + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_f64 a2 = npyv_load_f64(ip1 + 2 * lanes);
        npyv_f64 a3 = npyv_load_f64(ip1 + 3 * lanes);
        npyv_f64 a4 = npyv_load_f64(ip1 + 4 * lanes);
        npyv_f64 a5 = npyv_load_f64(ip1 + 5 * lanes);
    #endif
        npyv_f64 b0 = npyv_load_f64(ip2);
        npyv_f64 b1 = npyv_load_f64(ip2 + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_f64 b2 = npyv_load_f64(ip2 + 2 * lanes);
        npyv_f64 b3 = npyv_load_f64(ip2 + 3 * lanes);
        npyv_f64 b4 = npyv_load_f64(ip2 + 4 * lanes);
        npyv_f64 b5 = npyv_load_f64(ip2 + 5 * lanes);
    #endif
        npyv_f64 r0 = V_INTRIN(a0, b0);
        npyv_f64 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_f64 r2 = V_INTRIN(a2, b2);
        npyv_f64 r3 = V_INTRIN(a3, b3);
        npyv_f64 r4 = V_INTRIN(a4, b4);
        npyv_f64 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_f64(op1, r0);
        npyv_store_f64(op1 + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_f64(op1 + 2 * lanes, r2);
        npyv_store_f64(op1 + 3 * lanes, r3);
        npyv_store_f64(op1 + 4 * lanes, r4);
        npyv_store_f64(op1 + 5 * lanes, r5);
    #endif
        ip1 += span;
        ip2 += span;
        op1 += span;
        len -= span;
    }
    // remaining whole vectors, one per pass
    while (len >= lanes) {
        npyv_f64 a = npyv_load_f64(ip1);
        npyv_f64 b = npyv_load_f64(ip2);
        npyv_f64 r = V_INTRIN(a, b);
        npyv_store_f64(op1, r);
        ip1 += lanes;
        ip2 += lanes;
        op1 += lanes;
        len -= lanes;
    }
    // scalar tail
    for (; len > 0; --len, ++ip1, ++ip2, ++op1) {
        const npyv_lanetype_f64 x = *ip1;
        const npyv_lanetype_f64 y = *ip2;
        *op1 = SCALAR_OP(x, y);
    }
}
// non-contiguous for float 32/64-bit memory access
#if 1 && !defined(NPY_HAVE_NEON)
// Not built for NEON: on Arm, unrolled scalar code is faster than non-contiguous vector load/store
/*
 * Element-wise binary kernel for strided operands (instantiation: max, f64).
 *
 * sip1, sip2 and sop1 are strides counted in elements: the typed pointers
 * are advanced by them directly. A stride of exactly 1 uses the contiguous
 * load/store; any other value goes through npyv_loadn_f64 / npyv_storen_f64.
 * V_INTRIN is npyv_maxn_f64 here; SCALAR_OP (scalar_max_d) handles the
 * elements that do not fill a whole vector.
 */
static inline void
simd_binary_max_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
                           const npyv_lanetype_f64 *ip2, npy_intp sip2,
                                 npyv_lanetype_f64 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_f64;

    // whole vectors
    while (len >= lanes) {
        npyv_f64 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_f64(ip1, sip1);
        } else {
            lhs = npyv_load_f64(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_f64(ip2, sip2);
        } else {
            rhs = npyv_load_f64(ip2);
        }
        npyv_f64 out = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_f64(op1, sop1, out);
        } else {
            npyv_store_f64(op1, out);
        }
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
        len -= lanes;
    }
    // scalar tail
    while (len > 0) {
        const npyv_lanetype_f64 x = *ip1;
        const npyv_lanetype_f64 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
        --len;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_min_d
#if NPY_SIMD_F64 && (!0 || (1 && 0))

#if 1 && !0
    #define V_INTRIN npyv_minn_f64 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minn_f64
#else
    #define V_INTRIN npyv_min_f64
    #define V_REDUCE_INTRIN npyv_reduce_min_f64
#endif

/*
 * Reduction over a contiguous buffer (instantiation: min, f64).
 *
 * Folds the `len` elements starting at `ip` into `op1[0]`; the value already
 * stored in `op1[0]` seeds the accumulator. Returns without touching `op1`
 * when `len` is below 1.
 *
 * V_INTRIN / V_REDUCE_INTRIN are npyv_minn_f64 / npyv_reduce_minn_f64 here
 * (flagged "propagates NaNs" where the macro is defined); SCALAR_OP is
 * scalar_min_d.
 */
static inline void
simd_reduce_c_min_f64(const npyv_lanetype_f64 *ip, npyv_lanetype_f64 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_f64;
    const int block = lanes*8;
    // every lane starts from the current output value
    npyv_f64 vacc = npyv_setall_f64(op1[0]);

    // main loop: eight vectors per pass, combined pairwise as a tree
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_f64 a0 = npyv_load_f64(ip);
        npyv_f64 a1 = npyv_load_f64(ip + lanes);
        npyv_f64 a2 = npyv_load_f64(ip + lanes * 2);
        npyv_f64 a3 = npyv_load_f64(ip + lanes * 3);
        npyv_f64 a4 = npyv_load_f64(ip + lanes * 4);
        npyv_f64 a5 = npyv_load_f64(ip + lanes * 5);
        npyv_f64 a6 = npyv_load_f64(ip + lanes * 6);
        npyv_f64 a7 = npyv_load_f64(ip + lanes * 7);

        npyv_f64 p0 = V_INTRIN(a0, a1);
        npyv_f64 p1 = V_INTRIN(a2, a3);
        npyv_f64 p2 = V_INTRIN(a4, a5);
        npyv_f64 p3 = V_INTRIN(a6, a7);
        npyv_f64 q0 = V_INTRIN(p0, p1);
        npyv_f64 q1 = V_INTRIN(p2, p3);
        npyv_f64 blk = V_INTRIN(q0, q1);
        vacc = V_INTRIN(vacc, blk);

        ip  += block;
        len -= block;
    }
    // leftover whole vectors, one at a time
    while (len >= lanes) {
        npyv_f64 chunk = npyv_load_f64(ip);
        vacc = V_INTRIN(vacc, chunk);
        ip  += lanes;
        len -= lanes;
    }
    // collapse the lanes, then fold in the scalar tail
    npyv_lanetype_f64 result = V_REDUCE_INTRIN(vacc);
    for (npy_intp k = 0; k < len; ++k) {
        const npyv_lanetype_f64 item = ip[k];
        result = SCALAR_OP(result, item);
    }
    op1[0] = result;
}

/*
 * Element-wise binary kernel for contiguous inputs and output
 * (instantiation: min, f64).
 *
 * Combines ip1[k] with ip2[k] into op1[k] for the `len` elements, using
 * V_INTRIN (npyv_minn_f64 here) on whole vectors and SCALAR_OP (scalar_min_d)
 * on the elements that do not fill a vector.
 */
static inline void
simd_binary_ccc_min_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *ip2,
                                     npyv_lanetype_f64 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_f64;
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int span = unroll * lanes;

    // unrolled body: every load is issued before the first store
    while (len >= span) {
        npyv_f64 a0 = npyv_load_f64(ip1);
        npyv_f64 a1 = npyv_load_f64(ip1 + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_f64 a2 = npyv_load_f64(ip1 + 2 * lanes);
        npyv_f64 a3 = npyv_load_f64(ip1 + 3 * lanes);
        npyv_f64 a4 = npyv_load_f64(ip1 + 4 * lanes);
        npyv_f64 a5 = npyv_load_f64(ip1 + 5 * lanes);
    #endif
        npyv_f64 b0 = npyv_load_f64(ip2);
        npyv_f64 b1 = npyv_load_f64(ip2 + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_f64 b2 = npyv_load_f64(ip2 + 2 * lanes);
        npyv_f64 b3 = npyv_load_f64(ip2 + 3 * lanes);
        npyv_f64 b4 = npyv_load_f64(ip2 + 4 * lanes);
        npyv_f64 b5 = npyv_load_f64(ip2 + 5 * lanes);
    #endif
        npyv_f64 r0 = V_INTRIN(a0, b0);
        npyv_f64 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_f64 r2 = V_INTRIN(a2, b2);
        npyv_f64 r3 = V_INTRIN(a3, b3);
        npyv_f64 r4 = V_INTRIN(a4, b4);
        npyv_f64 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_f64(op1, r0);
        npyv_store_f64(op1 + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_f64(op1 + 2 * lanes, r2);
        npyv_store_f64(op1 + 3 * lanes, r3);
        npyv_store_f64(op1 + 4 * lanes, r4);
        npyv_store_f64(op1 + 5 * lanes, r5);
    #endif
        ip1 += span;
        ip2 += span;
        op1 += span;
        len -= span;
    }
    // remaining whole vectors, one per pass
    while (len >= lanes) {
        npyv_f64 a = npyv_load_f64(ip1);
        npyv_f64 b = npyv_load_f64(ip2);
        npyv_f64 r = V_INTRIN(a, b);
        npyv_store_f64(op1, r);
        ip1 += lanes;
        ip2 += lanes;
        op1 += lanes;
        len -= lanes;
    }
    // scalar tail
    for (; len > 0; --len, ++ip1, ++ip2, ++op1) {
        const npyv_lanetype_f64 x = *ip1;
        const npyv_lanetype_f64 y = *ip2;
        *op1 = SCALAR_OP(x, y);
    }
}
// non-contiguous for float 32/64-bit memory access
#if 1 && !defined(NPY_HAVE_NEON)
// Not built for NEON: on Arm, unrolled scalar code is faster than non-contiguous vector load/store
/*
 * Element-wise binary kernel for strided operands (instantiation: min, f64).
 *
 * sip1, sip2 and sop1 are strides counted in elements: the typed pointers
 * are advanced by them directly. A stride of exactly 1 uses the contiguous
 * load/store; any other value goes through npyv_loadn_f64 / npyv_storen_f64.
 * V_INTRIN is npyv_minn_f64 here; SCALAR_OP (scalar_min_d) handles the
 * elements that do not fill a whole vector.
 */
static inline void
simd_binary_min_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
                           const npyv_lanetype_f64 *ip2, npy_intp sip2,
                                 npyv_lanetype_f64 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_f64;

    // whole vectors
    while (len >= lanes) {
        npyv_f64 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_f64(ip1, sip1);
        } else {
            lhs = npyv_load_f64(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_f64(ip2, sip2);
        } else {
            rhs = npyv_load_f64(ip2);
        }
        npyv_f64 out = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_f64(op1, sop1, out);
        } else {
            npyv_store_f64(op1, out);
        }
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
        len -= lanes;
    }
    // scalar tail
    while (len > 0) {
        const npyv_lanetype_f64 x = *ip1;
        const npyv_lanetype_f64 y = *ip2;
        *op1 = SCALAR_OP(x, y);
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
        --len;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_maxp_d
#if NPY_SIMD_F64 && (!1 || (1 && 1))

#if 1 && !1
    #define V_INTRIN npyv_maxpn_f64 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_maxpn_f64
#else
    #define V_INTRIN npyv_maxp_f64
    #define V_REDUCE_INTRIN npyv_reduce_maxp_f64
#endif

/*
 * Reduction over a contiguous buffer (instantiation: maxp, f64).
 *
 * Folds the `len` elements starting at `ip` into `op1[0]`; the value already
 * stored in `op1[0]` seeds the accumulator. Returns without touching `op1`
 * when `len` is below 1.
 *
 * V_INTRIN / V_REDUCE_INTRIN are npyv_maxp_f64 / npyv_reduce_maxp_f64 here;
 * SCALAR_OP is scalar_maxp_d.
 */
static inline void
simd_reduce_c_maxp_f64(const npyv_lanetype_f64 *ip, npyv_lanetype_f64 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_f64;
    const int block = lanes*8;
    // every lane starts from the current output value
    npyv_f64 vacc = npyv_setall_f64(op1[0]);

    // main loop: eight vectors per pass, combined pairwise as a tree
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_f64 a0 = npyv_load_f64(ip);
        npyv_f64 a1 = npyv_load_f64(ip + lanes);
        npyv_f64 a2 = npyv_load_f64(ip + lanes * 2);
        npyv_f64 a3 = npyv_load_f64(ip + lanes * 3);
        npyv_f64 a4 = npyv_load_f64(ip + lanes * 4);
        npyv_f64 a5 = npyv_load_f64(ip + lanes * 5);
        npyv_f64 a6 = npyv_load_f64(ip + lanes * 6);
        npyv_f64 a7 = npyv_load_f64(ip + lanes * 7);

        npyv_f64 p0 = V_INTRIN(a0, a1);
        npyv_f64 p1 = V_INTRIN(a2, a3);
        npyv_f64 p2 = V_INTRIN(a4, a5);
        npyv_f64 p3 = V_INTRIN(a6, a7);
        npyv_f64 q0 = V_INTRIN(p0, p1);
        npyv_f64 q1 = V_INTRIN(p2, p3);
        npyv_f64 blk = V_INTRIN(q0, q1);
        vacc = V_INTRIN(vacc, blk);

        ip  += block;
        len -= block;
    }
    // leftover whole vectors, one at a time
    while (len >= lanes) {
        npyv_f64 chunk = npyv_load_f64(ip);
        vacc = V_INTRIN(vacc, chunk);
        ip  += lanes;
        len -= lanes;
    }
    // collapse the lanes, then fold in the scalar tail
    npyv_lanetype_f64 result = V_REDUCE_INTRIN(vacc);
    for (npy_intp k = 0; k < len; ++k) {
        const npyv_lanetype_f64 item = ip[k];
        result = SCALAR_OP(result, item);
    }
    op1[0] = result;
}

/*
 * Element-wise binary kernel for contiguous inputs and output
 * (instantiation: maxp, f64).
 *
 * Combines ip1[k] with ip2[k] into op1[k] for the `len` elements, using
 * V_INTRIN (npyv_maxp_f64 here) on whole vectors and SCALAR_OP
 * (scalar_maxp_d) on the elements that do not fill a vector.
 */
static inline void
simd_binary_ccc_maxp_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *ip2,
                                     npyv_lanetype_f64 *op1, npy_intp len)
{
    const int lanes = npyv_nlanes_f64;
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int span = unroll * lanes;

    // unrolled body: every load is issued before the first store
    while (len >= span) {
        npyv_f64 a0 = npyv_load_f64(ip1);
        npyv_f64 a1 = npyv_load_f64(ip1 + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_f64 a2 = npyv_load_f64(ip1 + 2 * lanes);
        npyv_f64 a3 = npyv_load_f64(ip1 + 3 * lanes);
        npyv_f64 a4 = npyv_load_f64(ip1 + 4 * lanes);
        npyv_f64 a5 = npyv_load_f64(ip1 + 5 * lanes);
    #endif
        npyv_f64 b0 = npyv_load_f64(ip2);
        npyv_f64 b1 = npyv_load_f64(ip2 + lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_f64 b2 = npyv_load_f64(ip2 + 2 * lanes);
        npyv_f64 b3 = npyv_load_f64(ip2 + 3 * lanes);
        npyv_f64 b4 = npyv_load_f64(ip2 + 4 * lanes);
        npyv_f64 b5 = npyv_load_f64(ip2 + 5 * lanes);
    #endif
        npyv_f64 r0 = V_INTRIN(a0, b0);
        npyv_f64 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_f64 r2 = V_INTRIN(a2, b2);
        npyv_f64 r3 = V_INTRIN(a3, b3);
        npyv_f64 r4 = V_INTRIN(a4, b4);
        npyv_f64 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_f64(op1, r0);
        npyv_store_f64(op1 + lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_f64(op1 + 2 * lanes, r2);
        npyv_store_f64(op1 + 3 * lanes, r3);
        npyv_store_f64(op1 + 4 * lanes, r4);
        npyv_store_f64(op1 + 5 * lanes, r5);
    #endif
        ip1 += span;
        ip2 += span;
        op1 += span;
        len -= span;
    }
    // remaining whole vectors, one per pass
    while (len >= lanes) {
        npyv_f64 a = npyv_load_f64(ip1);
        npyv_f64 b = npyv_load_f64(ip2);
        npyv_f64 r = V_INTRIN(a, b);
        npyv_store_f64(op1, r);
        ip1 += lanes;
        ip2 += lanes;
        op1 += lanes;
        len -= lanes;
    }
    // scalar tail
    for (; len > 0; --len, ++ip1, ++ip2, ++op1) {
        const npyv_lanetype_f64 x = *ip1;
        const npyv_lanetype_f64 y = *ip2;
        *op1 = SCALAR_OP(x, y);
    }
}
// non-contiguous for float 32/64-bit memory access
#if 1 && !defined(NPY_HAVE_NEON)
// On Arm, unrolled scalar code is faster than non-contiguous vector
// load/store, so this strided kernel is not built for NEON targets.
/*
 * Strided binary kernel for float64 lanes.
 *
 * Computes op1[k*sop1] = op(ip1[k*sip1], ip2[k*sip2]) for k in [0, len),
 * where `op` is V_INTRIN for whole vectors and SCALAR_OP for the leftover
 * elements.  Strides are counted in elements (the typed pointers are
 * advanced by them directly).  An operand whose stride is exactly 1 uses
 * the contiguous load/store, any other stride uses the loadn/storen form.
 */
static inline void
simd_binary_maxp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
                           const npyv_lanetype_f64 *ip2, npy_intp sip2,
                                 npyv_lanetype_f64 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_f64;

    // Vector part: one full vector of `lanes` elements per pass.
    while (len >= lanes) {
        npyv_f64 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_f64(ip1, sip1);
        } else {
            lhs = npyv_load_f64(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_f64(ip2, sip2);
        } else {
            rhs = npyv_load_f64(ip2);
        }
        npyv_f64 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_f64(op1, sop1, res);
        } else {
            npyv_store_f64(op1, res);
        }
        len -= lanes;
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
    }

    // Scalar part: fewer than `lanes` elements are left.
    while (len > 0) {
        const npyv_lanetype_f64 lhs = *ip1;
        const npyv_lanetype_f64 rhs = *ip2;
        *op1 = SCALAR_OP(lhs, rhs);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP

#line 110
#define SCALAR_OP scalar_minp_d
#if NPY_SIMD_F64 && (!1 || (1 && 1))

// Select the vector intrinsics used by the float64 `minp` kernels below.
// `1 && !1` is constant-false in this generated instantiation, so the #else
// branch (npyv_minp_f64 / npyv_reduce_minp_f64) is the one that takes effect.
#if 1 && !1
    #define V_INTRIN npyv_minpn_f64 // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_minpn_f64
#else
    // NOTE(review): presumably the NaN-ignoring variants, matching the
    // scalar_minp_* helpers that the top of the file labels "ignores NaNs";
    // confirm against the npyv headers.
    #define V_INTRIN npyv_minp_f64
    #define V_REDUCE_INTRIN npyv_reduce_minp_f64
#endif

/*
 * Reduction over a contiguous float64 input.
 *
 * Folds the `len` values starting at `ip` into op1[0] using V_INTRIN for
 * whole vectors, V_REDUCE_INTRIN to collapse the vector accumulator, and
 * SCALAR_OP for the leftover elements.  op1[0] is both the starting value
 * and the destination.  Nothing is read or written when len < 1.
 */
static inline void
simd_reduce_c_minp_f64(const npyv_lanetype_f64 *ip, npyv_lanetype_f64 *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int lanes = npyv_nlanes_f64;
    const int block = lanes*8;
    // Every lane of the accumulator starts from the current output value.
    npyv_f64 acc = npyv_setall_f64(op1[0]);

    // Main loop: eight vectors per pass, combined as a balanced tree.
    while (len >= block) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + block, 0, 3);
    #endif
        npyv_f64 a0 = npyv_load_f64(ip);
        npyv_f64 a1 = npyv_load_f64(ip + lanes);
        npyv_f64 a2 = npyv_load_f64(ip + 2 * lanes);
        npyv_f64 a3 = npyv_load_f64(ip + 3 * lanes);
        npyv_f64 a4 = npyv_load_f64(ip + 4 * lanes);
        npyv_f64 a5 = npyv_load_f64(ip + 5 * lanes);
        npyv_f64 a6 = npyv_load_f64(ip + 6 * lanes);
        npyv_f64 a7 = npyv_load_f64(ip + 7 * lanes);

        npyv_f64 p01 = V_INTRIN(a0, a1);
        npyv_f64 p23 = V_INTRIN(a2, a3);
        npyv_f64 p45 = V_INTRIN(a4, a5);
        npyv_f64 p67 = V_INTRIN(a6, a7);

        npyv_f64 q0123 = V_INTRIN(p01, p23);
        npyv_f64 q4567 = V_INTRIN(p45, p67);
        acc = V_INTRIN(acc, V_INTRIN(q0123, q4567));

        len -= block;
        ip += block;
    }

    // One vector at a time for what does not fill a whole block.
    while (len >= lanes) {
        acc = V_INTRIN(acc, npyv_load_f64(ip));
        len -= lanes;
        ip += lanes;
    }

    // Collapse the lanes, then fold in the remaining scalars.
    npyv_lanetype_f64 result = V_REDUCE_INTRIN(acc);
    while (len > 0) {
        const npyv_lanetype_f64 next = *ip;
        result = SCALAR_OP(result, next);
        --len;
        ++ip;
    }
    op1[0] = result;
}

/*
 * Element-wise binary kernel for contiguous float64 inputs and output:
 * op1[k] = op(ip1[k], ip2[k]) for k in [0, len).
 *
 * Three stages: an unrolled vector loop (6 vectors when NPY_SIMD_WIDTH is
 * 128, otherwise 2), a single-vector loop, and a scalar loop that uses
 * SCALAR_OP for the last few elements.
 */
static inline void
simd_binary_ccc_minp_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *ip2,
                                     npyv_lanetype_f64 *op1, npy_intp len)
{
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int unroll = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int unroll = 2;
#endif
    const int lanes = npyv_nlanes_f64;
    const int chunk = unroll * lanes;

    npy_intp pos = 0;

    // Stage 1: `unroll` vectors per pass; every load precedes every store.
    while ((pos + chunk) <= len) {
        const npyv_lanetype_f64 *lhs = ip1 + pos;
        const npyv_lanetype_f64 *rhs = ip2 + pos;
        npyv_lanetype_f64 *dst = op1 + pos;

        npyv_f64 a0 = npyv_load_f64(lhs + 0 * lanes);
        npyv_f64 a1 = npyv_load_f64(lhs + 1 * lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_f64 a2 = npyv_load_f64(lhs + 2 * lanes);
        npyv_f64 a3 = npyv_load_f64(lhs + 3 * lanes);
        npyv_f64 a4 = npyv_load_f64(lhs + 4 * lanes);
        npyv_f64 a5 = npyv_load_f64(lhs + 5 * lanes);
    #endif
        npyv_f64 b0 = npyv_load_f64(rhs + 0 * lanes);
        npyv_f64 b1 = npyv_load_f64(rhs + 1 * lanes);
    #if NPY_SIMD_WIDTH == 128
        npyv_f64 b2 = npyv_load_f64(rhs + 2 * lanes);
        npyv_f64 b3 = npyv_load_f64(rhs + 3 * lanes);
        npyv_f64 b4 = npyv_load_f64(rhs + 4 * lanes);
        npyv_f64 b5 = npyv_load_f64(rhs + 5 * lanes);
    #endif
        npyv_f64 r0 = V_INTRIN(a0, b0);
        npyv_f64 r1 = V_INTRIN(a1, b1);
    #if NPY_SIMD_WIDTH == 128
        npyv_f64 r2 = V_INTRIN(a2, b2);
        npyv_f64 r3 = V_INTRIN(a3, b3);
        npyv_f64 r4 = V_INTRIN(a4, b4);
        npyv_f64 r5 = V_INTRIN(a5, b5);
    #endif
        npyv_store_f64(dst + 0 * lanes, r0);
        npyv_store_f64(dst + 1 * lanes, r1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_f64(dst + 2 * lanes, r2);
        npyv_store_f64(dst + 3 * lanes, r3);
        npyv_store_f64(dst + 4 * lanes, r4);
        npyv_store_f64(dst + 5 * lanes, r5);
    #endif
        pos += chunk;
    }

    // Stage 2: one vector per pass.
    while ((pos + lanes) <= len) {
        npyv_f64 a = npyv_load_f64(&ip1[pos]);
        npyv_f64 b = npyv_load_f64(&ip2[pos]);
        npyv_f64 r = V_INTRIN(a, b);
        npyv_store_f64(&op1[pos], r);
        pos += lanes;
    }

    // Stage 3: scalar code for the elements that do not fill a vector.
    while (pos < len) {
        const npyv_lanetype_f64 x = *(ip1 + pos);
        const npyv_lanetype_f64 y = *(ip2 + pos);
        *(op1 + pos) = SCALAR_OP(x, y);
        ++pos;
    }
}
// non-contiguous for float 32/64-bit memory access
#if 1 && !defined(NPY_HAVE_NEON)
// On Arm, unrolled scalar code is faster than non-contiguous vector
// load/store, so this strided kernel is not built for NEON targets.
/*
 * Strided binary kernel for float64 lanes.
 *
 * Computes op1[k*sop1] = op(ip1[k*sip1], ip2[k*sip2]) for k in [0, len),
 * where `op` is V_INTRIN for whole vectors and SCALAR_OP for the leftover
 * elements.  Strides are counted in elements (the typed pointers are
 * advanced by them directly).  An operand whose stride is exactly 1 uses
 * the contiguous load/store, any other stride uses the loadn/storen form.
 */
static inline void
simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
                           const npyv_lanetype_f64 *ip2, npy_intp sip2,
                                 npyv_lanetype_f64 *op1, npy_intp sop1,
                                 npy_intp len)
{
    const int lanes = npyv_nlanes_f64;

    // Vector part: one full vector of `lanes` elements per pass.
    while (len >= lanes) {
        npyv_f64 lhs, rhs;
        if (sip1 != 1) {
            lhs = npyv_loadn_f64(ip1, sip1);
        } else {
            lhs = npyv_load_f64(ip1);
        }
        if (sip2 != 1) {
            rhs = npyv_loadn_f64(ip2, sip2);
        } else {
            rhs = npyv_load_f64(ip2);
        }
        npyv_f64 res = V_INTRIN(lhs, rhs);
        if (sop1 != 1) {
            npyv_storen_f64(op1, sop1, res);
        } else {
            npyv_store_f64(op1, res);
        }
        len -= lanes;
        ip1 += sip1*lanes;
        ip2 += sip2*lanes;
        op1 += sop1*lanes;
    }

    // Scalar part: fewer than `lanes` elements are left.
    while (len > 0) {
        const npyv_lanetype_f64 lhs = *ip1;
        const npyv_lanetype_f64 rhs = *ip2;
        *op1 = SCALAR_OP(lhs, rhs);
        --len;
        ip1 += sip1;
        ip2 += sip2;
        op1 += sop1;
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN

#endif // simd_chk && (!fp_only || (is_fp && fp_only))

#undef SCALAR_OP



/*******************************************************************************
 ** Defining ufunc inner functions
 ******************************************************************************/
#line 294
// Choose the SIMD suffix used by the ufunc loops that follow.
// TO_SIMD_SFX(X) pastes a lane suffix onto X.  The branch is picked by
// comparing NPY_BITSOF_BYTE with 8/16/32/64 and requires NPY_SIMD to be set.
// If no branch matches, TO_SIMD_SFX stays undefined and the loops below,
// which test it with #ifdef, leave their SIMD paths out.
//
// In every branch of this instantiation the leading `#if 0` (floating-point
// suffix, together with its nested F32/F64 availability checks) is disabled
// and `#elif 1` selects the unsigned suffix `_uN`; the signed `_sN` #else is
// therefore never reached.
#undef TO_SIMD_SFX
#if 0
#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 8
    #if 0
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i

/*
 * Inner loop of the `maximum` ufunc for npy_ubyte operands.
 * SCALAR_OP is scalar_max_i for this instantiation.
 *
 * Parameters
 *   args        args[0] and args[1] are the input pointers, args[2] is the
 *               output pointer.
 *   dimensions  dimensions[0] is the number of elements to process.
 *   steps       steps[0], steps[1], steps[2] are the byte strides of the
 *               first input, the second input and the output.
 *   func        unused.
 *
 * Flow
 *   - When TO_SIMD_SFX is defined, a SIMD kernel handles a contiguous
 *     reduction, or a fully contiguous binary operation whose inputs do not
 *     overlap the output; that kernel does all the work and the function
 *     jumps to clear_fp.
 *   - Otherwise, unless NPY_DISABLE_OPTIMIZATION is defined, manually
 *     unrolled scalar code handles as many elements as it can (8-way for
 *     reductions, 4-way otherwise).
 *   - A plain scalar loop finishes whatever is left.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_maximum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Number of elements already handled by the unrolled scalar code.
    npy_intp i = 0;
    // SIMD fast paths: each one that fires jumps straight to clear_fp and
    // skips all of the scalar code below.
#ifdef TO_SIMD_SFX
    // STYPE is the lane type carrying the suffix selected by TO_SIMD_SFX.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // NOTE(review): IS_BINARY_REDUCE and IS_BINARY_CONT are defined outside
    // this chunk; presumably they test args/steps for the reduction layout
    // and for contiguous operands -- confirm in fast_loop_macros.h.
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_ubyte)) {
            TO_SIMD_SFX(simd_reduce_c_max)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_ubyte, npy_ubyte)) {
            TO_SIMD_SFX(simd_binary_ccc_max)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing "&& 0" makes the condition below constant-false, so the
    // strided SIMD path is never compiled for this type.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_max)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // Only taken when at least eight elements are available: the first
        // eight values of ip2 seed eight independent partial results, further
        // groups of eight are folded into them, and the partials are then
        // combined pairwise and merged into *op1.
        if((i+elemPerLoop) <= len){
            npy_ubyte m0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
            npy_ubyte m1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
            npy_ubyte m2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
            npy_ubyte m3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
            npy_ubyte m4 = *((npy_ubyte *)(ip2 + (i + 4) * is2));
            npy_ubyte m5 = *((npy_ubyte *)(ip2 + (i + 5) * is2));
            npy_ubyte m6 = *((npy_ubyte *)(ip2 + (i + 6) * is2));
            npy_ubyte m7 = *((npy_ubyte *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_ubyte v0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
                npy_ubyte v1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
                npy_ubyte v2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
                npy_ubyte v3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
                npy_ubyte v4 = *((npy_ubyte *)(ip2 + (i + 4) * is2));
                npy_ubyte v5 = *((npy_ubyte *)(ip2 + (i + 5) * is2));
                npy_ubyte v6 = *((npy_ubyte *)(ip2 + (i + 6) * is2));
                npy_ubyte v7 = *((npy_ubyte *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

             *((npy_ubyte *)op1) = SCALAR_OP(*((npy_ubyte *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1));
            npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
            *((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1));
            npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
            *((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1));
            npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
            *((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1));
            npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
            *((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Move the pointers past the `i` elements handled above, then finish the
    // remainder one element at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_ubyte in1 = *(npy_ubyte *)ip1;
        const npy_ubyte in2 = *(npy_ubyte *)ip2;
        *((npy_ubyte *)op1) = SCALAR_OP(in1, in2);
    }
    // Landing point of the SIMD fast paths.  The label exists only when those
    // paths (and their gotos) are compiled in.
#ifdef TO_SIMD_SFX
clear_fp:
    npyv_cleanup();
#endif
    // Compiled out (#if 0) in this instantiation.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place `maximum` for npy_ubyte.
 *
 * For each of the dimensions[0] (index, value) pairs this performs
 *     target[index] = SCALAR_OP(target[index], value)
 * where the target array starts at args[0], the indices (npy_intp) at
 * args[1] and the values at args[2].  steps[0..2] are the matching byte
 * strides; steps[3] is added to negative indices to wrap them.
 * `context` and `func` are unused.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_maximum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count; ++k) {
        const npy_intp raw = *(const npy_intp *)idx_ptr;
        // Negative indices count from the end.
        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
        npy_ubyte *slot = (npy_ubyte *)(base + base_stride * pos);
        const npy_ubyte current = *slot;
        const npy_ubyte incoming = *(const npy_ubyte *)val_ptr;
        *slot = SCALAR_OP(current, incoming);
        idx_ptr += idx_stride;
        val_ptr += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i

/*
 * Inner loop of the `minimum` ufunc for npy_ubyte operands.
 * SCALAR_OP is scalar_min_i for this instantiation.
 *
 * Parameters
 *   args        args[0] and args[1] are the input pointers, args[2] is the
 *               output pointer.
 *   dimensions  dimensions[0] is the number of elements to process.
 *   steps       steps[0], steps[1], steps[2] are the byte strides of the
 *               first input, the second input and the output.
 *   func        unused.
 *
 * Flow
 *   - When TO_SIMD_SFX is defined, a SIMD kernel handles a contiguous
 *     reduction, or a fully contiguous binary operation whose inputs do not
 *     overlap the output; that kernel does all the work and the function
 *     jumps to clear_fp.
 *   - Otherwise, unless NPY_DISABLE_OPTIMIZATION is defined, manually
 *     unrolled scalar code handles as many elements as it can (8-way for
 *     reductions, 4-way otherwise).
 *   - A plain scalar loop finishes whatever is left.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_minimum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Number of elements already handled by the unrolled scalar code.
    npy_intp i = 0;
    // SIMD fast paths: each one that fires jumps straight to clear_fp and
    // skips all of the scalar code below.
#ifdef TO_SIMD_SFX
    // STYPE is the lane type carrying the suffix selected by TO_SIMD_SFX.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // NOTE(review): IS_BINARY_REDUCE and IS_BINARY_CONT are defined outside
    // this chunk; presumably they test args/steps for the reduction layout
    // and for contiguous operands -- confirm in fast_loop_macros.h.
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_ubyte)) {
            TO_SIMD_SFX(simd_reduce_c_min)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_ubyte, npy_ubyte)) {
            TO_SIMD_SFX(simd_binary_ccc_min)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing "&& 0" makes the condition below constant-false, so the
    // strided SIMD path is never compiled for this type.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_min)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // Only taken when at least eight elements are available: the first
        // eight values of ip2 seed eight independent partial results, further
        // groups of eight are folded into them, and the partials are then
        // combined pairwise and merged into *op1.
        if((i+elemPerLoop) <= len){
            npy_ubyte m0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
            npy_ubyte m1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
            npy_ubyte m2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
            npy_ubyte m3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
            npy_ubyte m4 = *((npy_ubyte *)(ip2 + (i + 4) * is2));
            npy_ubyte m5 = *((npy_ubyte *)(ip2 + (i + 5) * is2));
            npy_ubyte m6 = *((npy_ubyte *)(ip2 + (i + 6) * is2));
            npy_ubyte m7 = *((npy_ubyte *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_ubyte v0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
                npy_ubyte v1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
                npy_ubyte v2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
                npy_ubyte v3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
                npy_ubyte v4 = *((npy_ubyte *)(ip2 + (i + 4) * is2));
                npy_ubyte v5 = *((npy_ubyte *)(ip2 + (i + 5) * is2));
                npy_ubyte v6 = *((npy_ubyte *)(ip2 + (i + 6) * is2));
                npy_ubyte v7 = *((npy_ubyte *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

             *((npy_ubyte *)op1) = SCALAR_OP(*((npy_ubyte *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1));
            npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
            *((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1));
            npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
            *((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1));
            npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
            *((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1));
            npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
            *((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Move the pointers past the `i` elements handled above, then finish the
    // remainder one element at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_ubyte in1 = *(npy_ubyte *)ip1;
        const npy_ubyte in2 = *(npy_ubyte *)ip2;
        *((npy_ubyte *)op1) = SCALAR_OP(in1, in2);
    }
    // Landing point of the SIMD fast paths.  The label exists only when those
    // paths (and their gotos) are compiled in.
#ifdef TO_SIMD_SFX
clear_fp:
    npyv_cleanup();
#endif
    // Compiled out (#if 0) in this instantiation.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place `minimum` for npy_ubyte.
 *
 * For each of the dimensions[0] (index, value) pairs this performs
 *     target[index] = SCALAR_OP(target[index], value)
 * where the target array starts at args[0], the indices (npy_intp) at
 * args[1] and the values at args[2].  steps[0..2] are the matching byte
 * strides; steps[3] is added to negative indices to wrap them.
 * `context` and `func` are unused.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_minimum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count; ++k) {
        const npy_intp raw = *(const npy_intp *)idx_ptr;
        // Negative indices count from the end.
        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
        npy_ubyte *slot = (npy_ubyte *)(base + base_stride * pos);
        const npy_ubyte current = *slot;
        const npy_ubyte incoming = *(const npy_ubyte *)val_ptr;
        *slot = SCALAR_OP(current, incoming);
        idx_ptr += idx_stride;
        val_ptr += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i

/*
 * Inner loop of the `fmax` ufunc for npy_ubyte operands.
 *
 * The enclosing guard `#if !1 || (0 && 1)` is constant-false, so this
 * function is never compiled in this generated file.
 * NOTE(review): SCALAR_OP is set to scalar_maxp_i here, and no such helper
 * appears among the scalar helpers at the top of the file -- harmless only
 * because the block is compiled out.
 *
 * Parameters
 *   args        args[0] and args[1] are the input pointers, args[2] is the
 *               output pointer.
 *   dimensions  dimensions[0] is the number of elements to process.
 *   steps       steps[0], steps[1], steps[2] are the byte strides of the
 *               first input, the second input and the output.
 *   func        unused.
 *
 * Flow
 *   - When TO_SIMD_SFX is defined, a SIMD kernel handles a contiguous
 *     reduction, or a fully contiguous binary operation whose inputs do not
 *     overlap the output; that kernel does all the work and the function
 *     jumps to clear_fp.
 *   - Otherwise, unless NPY_DISABLE_OPTIMIZATION is defined, manually
 *     unrolled scalar code handles as many elements as it can (8-way for
 *     reductions, 4-way otherwise).
 *   - A plain scalar loop finishes whatever is left.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_fmax)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Number of elements already handled by the unrolled scalar code.
    npy_intp i = 0;
    // SIMD fast paths: each one that fires jumps straight to clear_fp and
    // skips all of the scalar code below.
#ifdef TO_SIMD_SFX
    // STYPE is the lane type carrying the suffix selected by TO_SIMD_SFX.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // NOTE(review): IS_BINARY_REDUCE and IS_BINARY_CONT are defined outside
    // this chunk; presumably they test args/steps for the reduction layout
    // and for contiguous operands -- confirm in fast_loop_macros.h.
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_ubyte)) {
            TO_SIMD_SFX(simd_reduce_c_maxp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_ubyte, npy_ubyte)) {
            TO_SIMD_SFX(simd_binary_ccc_maxp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing "&& 0" makes the condition below constant-false, so the
    // strided SIMD path is never compiled for this type.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_maxp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // Only taken when at least eight elements are available: the first
        // eight values of ip2 seed eight independent partial results, further
        // groups of eight are folded into them, and the partials are then
        // combined pairwise and merged into *op1.
        if((i+elemPerLoop) <= len){
            npy_ubyte m0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
            npy_ubyte m1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
            npy_ubyte m2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
            npy_ubyte m3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
            npy_ubyte m4 = *((npy_ubyte *)(ip2 + (i + 4) * is2));
            npy_ubyte m5 = *((npy_ubyte *)(ip2 + (i + 5) * is2));
            npy_ubyte m6 = *((npy_ubyte *)(ip2 + (i + 6) * is2));
            npy_ubyte m7 = *((npy_ubyte *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_ubyte v0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
                npy_ubyte v1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
                npy_ubyte v2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
                npy_ubyte v3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
                npy_ubyte v4 = *((npy_ubyte *)(ip2 + (i + 4) * is2));
                npy_ubyte v5 = *((npy_ubyte *)(ip2 + (i + 5) * is2));
                npy_ubyte v6 = *((npy_ubyte *)(ip2 + (i + 6) * is2));
                npy_ubyte v7 = *((npy_ubyte *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

             *((npy_ubyte *)op1) = SCALAR_OP(*((npy_ubyte *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1));
            npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
            *((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1));
            npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
            *((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1));
            npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
            *((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1));
            npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
            *((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Move the pointers past the `i` elements handled above, then finish the
    // remainder one element at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_ubyte in1 = *(npy_ubyte *)ip1;
        const npy_ubyte in2 = *(npy_ubyte *)ip2;
        *((npy_ubyte *)op1) = SCALAR_OP(in1, in2);
    }
    // Landing point of the SIMD fast paths.  The label exists only when those
    // paths (and their gotos) are compiled in.
#ifdef TO_SIMD_SFX
clear_fp:
    npyv_cleanup();
#endif
    // Compiled out (#if 0) in this instantiation.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place `fmax` for npy_ubyte.
 *
 * The enclosing guard `#if !1 || (0 && 1)` is constant-false, so this
 * function is never compiled in this generated file.
 *
 * For each of the dimensions[0] (index, value) pairs this performs
 *     target[index] = SCALAR_OP(target[index], value)
 * where the target array starts at args[0], the indices (npy_intp) at
 * args[1] and the values at args[2].  steps[0..2] are the matching byte
 * strides; steps[3] is added to negative indices to wrap them.
 * `context` and `func` are unused.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmax_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count; ++k) {
        const npy_intp raw = *(const npy_intp *)idx_ptr;
        // Negative indices count from the end.
        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
        npy_ubyte *slot = (npy_ubyte *)(base + base_stride * pos);
        const npy_ubyte current = *slot;
        const npy_ubyte incoming = *(const npy_ubyte *)val_ptr;
        *slot = SCALAR_OP(current, incoming);
        idx_ptr += idx_stride;
        val_ptr += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i

/*
 * Inner loop of the `fmin` ufunc for npy_ubyte operands.
 *
 * The enclosing guard `#if !1 || (0 && 1)` is constant-false, so this
 * function is never compiled in this generated file.
 * NOTE(review): SCALAR_OP is set to scalar_minp_i here, and no such helper
 * appears among the scalar helpers at the top of the file -- harmless only
 * because the block is compiled out.
 *
 * Parameters
 *   args        args[0] and args[1] are the input pointers, args[2] is the
 *               output pointer.
 *   dimensions  dimensions[0] is the number of elements to process.
 *   steps       steps[0], steps[1], steps[2] are the byte strides of the
 *               first input, the second input and the output.
 *   func        unused.
 *
 * Flow
 *   - When TO_SIMD_SFX is defined, a SIMD kernel handles a contiguous
 *     reduction, or a fully contiguous binary operation whose inputs do not
 *     overlap the output; that kernel does all the work and the function
 *     jumps to clear_fp.
 *   - Otherwise, unless NPY_DISABLE_OPTIMIZATION is defined, manually
 *     unrolled scalar code handles as many elements as it can (8-way for
 *     reductions, 4-way otherwise).
 *   - A plain scalar loop finishes whatever is left.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_fmin)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Number of elements already handled by the unrolled scalar code.
    npy_intp i = 0;
    // SIMD fast paths: each one that fires jumps straight to clear_fp and
    // skips all of the scalar code below.
#ifdef TO_SIMD_SFX
    // STYPE is the lane type carrying the suffix selected by TO_SIMD_SFX.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // NOTE(review): IS_BINARY_REDUCE and IS_BINARY_CONT are defined outside
    // this chunk; presumably they test args/steps for the reduction layout
    // and for contiguous operands -- confirm in fast_loop_macros.h.
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_ubyte)) {
            TO_SIMD_SFX(simd_reduce_c_minp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_ubyte, npy_ubyte)) {
            TO_SIMD_SFX(simd_binary_ccc_minp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing "&& 0" makes the condition below constant-false, so the
    // strided SIMD path is never compiled for this type.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_minp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // Only taken when at least eight elements are available: the first
        // eight values of ip2 seed eight independent partial results, further
        // groups of eight are folded into them, and the partials are then
        // combined pairwise and merged into *op1.
        if((i+elemPerLoop) <= len){
            npy_ubyte m0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
            npy_ubyte m1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
            npy_ubyte m2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
            npy_ubyte m3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
            npy_ubyte m4 = *((npy_ubyte *)(ip2 + (i + 4) * is2));
            npy_ubyte m5 = *((npy_ubyte *)(ip2 + (i + 5) * is2));
            npy_ubyte m6 = *((npy_ubyte *)(ip2 + (i + 6) * is2));
            npy_ubyte m7 = *((npy_ubyte *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_ubyte v0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
                npy_ubyte v1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
                npy_ubyte v2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
                npy_ubyte v3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
                npy_ubyte v4 = *((npy_ubyte *)(ip2 + (i + 4) * is2));
                npy_ubyte v5 = *((npy_ubyte *)(ip2 + (i + 5) * is2));
                npy_ubyte v6 = *((npy_ubyte *)(ip2 + (i + 6) * is2));
                npy_ubyte v7 = *((npy_ubyte *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

             *((npy_ubyte *)op1) = SCALAR_OP(*((npy_ubyte *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1));
            npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
            *((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1));
            npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
            *((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1));
            npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
            *((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1));
            npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
            *((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Move the pointers past the `i` elements handled above, then finish the
    // remainder one element at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_ubyte in1 = *(npy_ubyte *)ip1;
        const npy_ubyte in2 = *(npy_ubyte *)ip2;
        *((npy_ubyte *)op1) = SCALAR_OP(in1, in2);
    }
    // Landing point of the SIMD fast paths.  The label exists only when those
    // paths (and their gotos) are compiled in.
#ifdef TO_SIMD_SFX
clear_fp:
    npyv_cleanup();
#endif
    // Compiled out (#if 0) in this instantiation.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place update for npy_ubyte data.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] (byte stride steps[1]) and a npy_ubyte value from args[2] (byte
 * stride steps[2]).  A negative index is offset by steps[3] before use.
 * The element of args[0] addressed by that index (byte stride steps[0]) is
 * then replaced by SCALAR_OP(element, value), using whichever SCALAR_OP
 * macro is active at this point of the file.
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmin_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    // offset added to negative indices (taken from steps[3])
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k;

    for (k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        npy_ubyte *slot;
        npy_ubyte current, operand;

        if (pos < 0) {
            pos += wrap;
        }
        slot = (npy_ubyte *)(base + base_stride * pos);

        // read both operands before the store, then write back in place
        current = *slot;
        operand = *(const npy_ubyte *)val_ptr;
        *slot = SCALAR_OP(current, operand);

        idx_ptr += idx_stride;
        val_ptr += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)


/*
 * Select the SIMD name suffix used by the USHORT loops that follow.
 *
 * TO_SIMD_SFX is first undefined, then (only when NPY_SIMD is non-zero)
 * redefined according to NPY_BITSOF_SHORT: 8, 16, 32 or 64 bits map to the
 * suffixes _u8, _u16, _u32 and _u64 respectively.  In every branch the
 * leading `#if 0` disables the _fNN suffix and `#elif 1` picks the unsigned
 * one, so the trailing _sNN alternative is never reached.
 *
 * When no branch matches, TO_SIMD_SFX stays undefined; the loop bodies below
 * test it with `#ifdef TO_SIMD_SFX` before using any SIMD helper.
 */
#line 294
#undef TO_SIMD_SFX
#if 0
#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 8
    #if 0
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i

/*
 * Inner loop computing the maximum of npy_ushort operands; SCALAR_OP is
 * scalar_max_i for this instantiation.
 *
 * args[0] and args[1] are the input base pointers and args[2] the output
 * base pointer.  steps[0..2] are the matching strides in bytes (they are
 * applied to char pointers) and dimensions[0] is the element count.
 *
 * Paths, tried in order:
 *   1. With TO_SIMD_SFX defined: when IS_BINARY_REDUCE holds and the second
 *      input is contiguous, simd_reduce_c_max handles everything; otherwise,
 *      when neither input overlaps the output and IS_BINARY_CONT holds,
 *      simd_binary_ccc_max handles everything.  The strided SIMD variant is
 *      compiled out by the `&& 0` in its guard.
 *   2. Unless NPY_DISABLE_OPTIMIZATION is defined: unrolled scalar loops
 *      (eight accumulators when IS_BINARY_REDUCE holds, four elements per
 *      pass otherwise).
 *   3. A one-element-at-a-time loop for whatever is left.
 *
 * NOTE(review): IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are
 * defined outside this part of the file; their exact conditions should be
 * checked there.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_maximum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already handled by the unrolled scalar loops
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    // STYPE is npyv_lanetype with the suffix selected by TO_SIMD_SFX
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_ushort)) {
            TO_SIMD_SFX(simd_reduce_c_max)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_ushort, npy_ushort)) {
            TO_SIMD_SFX(simd_binary_ccc_max)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // NOTE: the trailing `&& 0` makes this guard always false, so the
    // strided SIMD path below is never compiled in this instantiation.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_max)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators from the first eight
            // elements of the second input
            npy_ushort m0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
            npy_ushort m1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
            npy_ushort m2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
            npy_ushort m3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
            npy_ushort m4 = *((npy_ushort *)(ip2 + (i + 4) * is2));
            npy_ushort m5 = *((npy_ushort *)(ip2 + (i + 5) * is2));
            npy_ushort m6 = *((npy_ushort *)(ip2 + (i + 6) * is2));
            npy_ushort m7 = *((npy_ushort *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold each further group of eight elements into the accumulators
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_ushort v0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
                npy_ushort v1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
                npy_ushort v2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
                npy_ushort v3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
                npy_ushort v4 = *((npy_ushort *)(ip2 + (i + 4) * is2));
                npy_ushort v5 = *((npy_ushort *)(ip2 + (i + 5) * is2));
                npy_ushort v6 = *((npy_ushort *)(ip2 + (i + 6) * is2));
                npy_ushort v7 = *((npy_ushort *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // pairwise-combine the eight accumulators down to m0
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge with the value already held in the output element
             *((npy_ushort *)op1) = SCALAR_OP(*((npy_ushort *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1));
            npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
            *((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1));
            npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
            *((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1));
            npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
            *((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1));
            npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
            *((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // move the base pointers past the elements consumed above, then finish
    // the remaining elements one at a time
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_ushort in1 = *(npy_ushort *)ip1;
        const npy_ushort in2 = *(npy_ushort *)ip2;
        *((npy_ushort *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
    // jump target of the SIMD paths; the scalar paths also fall through here
clear_fp:
    npyv_cleanup();
#endif
    // floating-point status clearing is disabled for this instantiation
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place maximum for npy_ushort data (SCALAR_OP is scalar_max_i).
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] (byte stride steps[1]) and a npy_ushort value from args[2] (byte
 * stride steps[2]).  A negative index is offset by steps[3] before use.
 * The element of args[0] addressed by that index (byte stride steps[0]) is
 * then replaced by SCALAR_OP(element, value).
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_maximum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    // offset added to negative indices (taken from steps[3])
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k;

    for (k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        npy_ushort *slot;
        npy_ushort current, operand;

        if (pos < 0) {
            pos += wrap;
        }
        slot = (npy_ushort *)(base + base_stride * pos);

        // read both operands before the store, then write back in place
        current = *slot;
        operand = *(const npy_ushort *)val_ptr;
        *slot = SCALAR_OP(current, operand);

        idx_ptr += idx_stride;
        val_ptr += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i

/*
 * Inner loop computing the minimum of npy_ushort operands; SCALAR_OP is
 * scalar_min_i for this instantiation.
 *
 * args[0] and args[1] are the input base pointers and args[2] the output
 * base pointer.  steps[0..2] are the matching strides in bytes (they are
 * applied to char pointers) and dimensions[0] is the element count.
 *
 * Paths, tried in order:
 *   1. With TO_SIMD_SFX defined: when IS_BINARY_REDUCE holds and the second
 *      input is contiguous, simd_reduce_c_min handles everything; otherwise,
 *      when neither input overlaps the output and IS_BINARY_CONT holds,
 *      simd_binary_ccc_min handles everything.  The strided SIMD variant is
 *      compiled out by the `&& 0` in its guard.
 *   2. Unless NPY_DISABLE_OPTIMIZATION is defined: unrolled scalar loops
 *      (eight accumulators when IS_BINARY_REDUCE holds, four elements per
 *      pass otherwise).
 *   3. A one-element-at-a-time loop for whatever is left.
 *
 * NOTE(review): IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are
 * defined outside this part of the file; their exact conditions should be
 * checked there.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_minimum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already handled by the unrolled scalar loops
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    // STYPE is npyv_lanetype with the suffix selected by TO_SIMD_SFX
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_ushort)) {
            TO_SIMD_SFX(simd_reduce_c_min)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_ushort, npy_ushort)) {
            TO_SIMD_SFX(simd_binary_ccc_min)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // NOTE: the trailing `&& 0` makes this guard always false, so the
    // strided SIMD path below is never compiled in this instantiation.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_min)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators from the first eight
            // elements of the second input
            npy_ushort m0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
            npy_ushort m1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
            npy_ushort m2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
            npy_ushort m3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
            npy_ushort m4 = *((npy_ushort *)(ip2 + (i + 4) * is2));
            npy_ushort m5 = *((npy_ushort *)(ip2 + (i + 5) * is2));
            npy_ushort m6 = *((npy_ushort *)(ip2 + (i + 6) * is2));
            npy_ushort m7 = *((npy_ushort *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold each further group of eight elements into the accumulators
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_ushort v0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
                npy_ushort v1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
                npy_ushort v2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
                npy_ushort v3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
                npy_ushort v4 = *((npy_ushort *)(ip2 + (i + 4) * is2));
                npy_ushort v5 = *((npy_ushort *)(ip2 + (i + 5) * is2));
                npy_ushort v6 = *((npy_ushort *)(ip2 + (i + 6) * is2));
                npy_ushort v7 = *((npy_ushort *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // pairwise-combine the eight accumulators down to m0
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge with the value already held in the output element
             *((npy_ushort *)op1) = SCALAR_OP(*((npy_ushort *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1));
            npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
            *((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1));
            npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
            *((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1));
            npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
            *((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1));
            npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
            *((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // move the base pointers past the elements consumed above, then finish
    // the remaining elements one at a time
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_ushort in1 = *(npy_ushort *)ip1;
        const npy_ushort in2 = *(npy_ushort *)ip2;
        *((npy_ushort *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
    // jump target of the SIMD paths; the scalar paths also fall through here
clear_fp:
    npyv_cleanup();
#endif
    // floating-point status clearing is disabled for this instantiation
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place minimum for npy_ushort data (SCALAR_OP is scalar_min_i).
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] (byte stride steps[1]) and a npy_ushort value from args[2] (byte
 * stride steps[2]).  A negative index is offset by steps[3] before use.
 * The element of args[0] addressed by that index (byte stride steps[0]) is
 * then replaced by SCALAR_OP(element, value).
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_minimum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    // offset added to negative indices (taken from steps[3])
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k;

    for (k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        npy_ushort *slot;
        npy_ushort current, operand;

        if (pos < 0) {
            pos += wrap;
        }
        slot = (npy_ushort *)(base + base_stride * pos);

        // read both operands before the store, then write back in place
        current = *slot;
        operand = *(const npy_ushort *)val_ptr;
        *slot = SCALAR_OP(current, operand);

        idx_ptr += idx_stride;
        val_ptr += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i

/*
 * fmax-flavoured inner loop for npy_ushort operands.
 *
 * NOTE: the enclosing guard `#if !1 || (0 && 1)` evaluates to false, so
 * this definition is excluded from compilation.  SCALAR_OP is set to
 * scalar_maxp_i for it, a name that is not among the scalar helpers defined
 * at the top of the file.
 *
 * The body mirrors the other loops generated from the same template:
 * args[0]/args[1] are the inputs and args[2] the output, steps[0..2] are
 * byte strides, dimensions[0] is the element count.  It would try the SIMD
 * helpers (simd_reduce_c_maxp / simd_binary_ccc_maxp) first, then unrolled
 * scalar loops, then a one-element-at-a-time loop for the remainder.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_fmax)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already handled by the unrolled scalar loops
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    // STYPE is npyv_lanetype with the suffix selected by TO_SIMD_SFX
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_ushort)) {
            TO_SIMD_SFX(simd_reduce_c_maxp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_ushort, npy_ushort)) {
            TO_SIMD_SFX(simd_binary_ccc_maxp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // NOTE: the trailing `&& 0` makes this guard always false, so the
    // strided SIMD path below is never compiled in this instantiation.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_maxp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators from the first eight
            // elements of the second input
            npy_ushort m0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
            npy_ushort m1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
            npy_ushort m2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
            npy_ushort m3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
            npy_ushort m4 = *((npy_ushort *)(ip2 + (i + 4) * is2));
            npy_ushort m5 = *((npy_ushort *)(ip2 + (i + 5) * is2));
            npy_ushort m6 = *((npy_ushort *)(ip2 + (i + 6) * is2));
            npy_ushort m7 = *((npy_ushort *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold each further group of eight elements into the accumulators
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_ushort v0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
                npy_ushort v1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
                npy_ushort v2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
                npy_ushort v3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
                npy_ushort v4 = *((npy_ushort *)(ip2 + (i + 4) * is2));
                npy_ushort v5 = *((npy_ushort *)(ip2 + (i + 5) * is2));
                npy_ushort v6 = *((npy_ushort *)(ip2 + (i + 6) * is2));
                npy_ushort v7 = *((npy_ushort *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // pairwise-combine the eight accumulators down to m0
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge with the value already held in the output element
             *((npy_ushort *)op1) = SCALAR_OP(*((npy_ushort *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1));
            npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
            *((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1));
            npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
            *((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1));
            npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
            *((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1));
            npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
            *((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // move the base pointers past the elements consumed above, then finish
    // the remaining elements one at a time
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_ushort in1 = *(npy_ushort *)ip1;
        const npy_ushort in2 = *(npy_ushort *)ip2;
        *((npy_ushort *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
    // jump target of the SIMD paths; the scalar paths also fall through here
clear_fp:
    npyv_cleanup();
#endif
    // floating-point status clearing is disabled for this instantiation
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place fmax for npy_ushort data.
 *
 * NOTE: the enclosing guard `#if !1 || (0 && 1)` evaluates to false, so
 * this definition is excluded from compilation.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] (byte stride steps[1]) and a npy_ushort value from args[2] (byte
 * stride steps[2]).  A negative index is offset by steps[3] before use.
 * The element of args[0] addressed by that index (byte stride steps[0]) is
 * then replaced by SCALAR_OP(element, value).
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmax_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    // offset added to negative indices (taken from steps[3])
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k;

    for (k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        npy_ushort *slot;
        npy_ushort current, operand;

        if (pos < 0) {
            pos += wrap;
        }
        slot = (npy_ushort *)(base + base_stride * pos);

        // read both operands before the store, then write back in place
        current = *slot;
        operand = *(const npy_ushort *)val_ptr;
        *slot = SCALAR_OP(current, operand);

        idx_ptr += idx_stride;
        val_ptr += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i

/*
 * fmin-flavoured inner loop for npy_ushort operands.
 *
 * NOTE: the enclosing guard `#if !1 || (0 && 1)` evaluates to false, so
 * this definition is excluded from compilation.  SCALAR_OP is set to
 * scalar_minp_i for it, a name that is not among the scalar helpers defined
 * at the top of the file.
 *
 * The body mirrors the other loops generated from the same template:
 * args[0]/args[1] are the inputs and args[2] the output, steps[0..2] are
 * byte strides, dimensions[0] is the element count.  It would try the SIMD
 * helpers (simd_reduce_c_minp / simd_binary_ccc_minp) first, then unrolled
 * scalar loops, then a one-element-at-a-time loop for the remainder.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_fmin)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already handled by the unrolled scalar loops
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    // STYPE is npyv_lanetype with the suffix selected by TO_SIMD_SFX
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_ushort)) {
            TO_SIMD_SFX(simd_reduce_c_minp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_ushort, npy_ushort)) {
            TO_SIMD_SFX(simd_binary_ccc_minp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // NOTE: the trailing `&& 0` makes this guard always false, so the
    // strided SIMD path below is never compiled in this instantiation.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_minp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators from the first eight
            // elements of the second input
            npy_ushort m0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
            npy_ushort m1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
            npy_ushort m2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
            npy_ushort m3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
            npy_ushort m4 = *((npy_ushort *)(ip2 + (i + 4) * is2));
            npy_ushort m5 = *((npy_ushort *)(ip2 + (i + 5) * is2));
            npy_ushort m6 = *((npy_ushort *)(ip2 + (i + 6) * is2));
            npy_ushort m7 = *((npy_ushort *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold each further group of eight elements into the accumulators
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_ushort v0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
                npy_ushort v1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
                npy_ushort v2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
                npy_ushort v3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
                npy_ushort v4 = *((npy_ushort *)(ip2 + (i + 4) * is2));
                npy_ushort v5 = *((npy_ushort *)(ip2 + (i + 5) * is2));
                npy_ushort v6 = *((npy_ushort *)(ip2 + (i + 6) * is2));
                npy_ushort v7 = *((npy_ushort *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // pairwise-combine the eight accumulators down to m0
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge with the value already held in the output element
             *((npy_ushort *)op1) = SCALAR_OP(*((npy_ushort *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1));
            npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
            *((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1));
            npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
            *((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1));
            npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
            *((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1));
            npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
            *((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // move the base pointers past the elements consumed above, then finish
    // the remaining elements one at a time
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_ushort in1 = *(npy_ushort *)ip1;
        const npy_ushort in2 = *(npy_ushort *)ip2;
        *((npy_ushort *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
    // jump target of the SIMD paths; the scalar paths also fall through here
clear_fp:
    npyv_cleanup();
#endif
    // floating-point status clearing is disabled for this instantiation
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place fmin for npy_ushort data.
 *
 * NOTE: the enclosing guard `#if !1 || (0 && 1)` evaluates to false, so
 * this definition is excluded from compilation.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] (byte stride steps[1]) and a npy_ushort value from args[2] (byte
 * stride steps[2]).  A negative index is offset by steps[3] before use.
 * The element of args[0] addressed by that index (byte stride steps[0]) is
 * then replaced by SCALAR_OP(element, value).
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmin_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    // offset added to negative indices (taken from steps[3])
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k;

    for (k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        npy_ushort *slot;
        npy_ushort current, operand;

        if (pos < 0) {
            pos += wrap;
        }
        slot = (npy_ushort *)(base + base_stride * pos);

        // read both operands before the store, then write back in place
        current = *slot;
        operand = *(const npy_ushort *)val_ptr;
        *slot = SCALAR_OP(current, operand);

        idx_ptr += idx_stride;
        val_ptr += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)


/*
 * Select the SIMD name suffix used by the UINT loops that follow.
 *
 * TO_SIMD_SFX is first undefined, then (only when NPY_SIMD is non-zero)
 * redefined according to NPY_BITSOF_INT: 8, 16, 32 or 64 bits map to the
 * suffixes _u8, _u16, _u32 and _u64 respectively.  In every branch the
 * leading `#if 0` disables the _fNN suffix and `#elif 1` picks the unsigned
 * one, so the trailing _sNN alternative is never reached.
 *
 * When no branch matches, TO_SIMD_SFX stays undefined; the loop bodies below
 * test it with `#ifdef TO_SIMD_SFX` before using any SIMD helper.
 */
#line 294
#undef TO_SIMD_SFX
#if 0
#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 8
    #if 0
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i

/*
 * Inner loop applying SCALAR_OP element-wise to npy_uint data.  SCALAR_OP is
 * scalar_max_i at this point of the file, i.e. it yields the larger of its
 * two arguments.
 *
 * args       - args[0] and args[1] are the input pointers, args[2] the output.
 * dimensions - dimensions[0] is the element count.
 * steps      - byte strides of args[0], args[1] and args[2], in that order.
 * func       - unused.
 *
 * Three tiers are tried in order: SIMD kernels (only when TO_SIMD_SFX is
 * defined), hand-unrolled scalar loops (unless NPY_DISABLE_OPTIMIZATION is
 * defined), and a plain scalar loop that finishes whatever is left.
 * NOTE(review): IS_BINARY_REDUCE / IS_BINARY_CONT / is_mem_overlap are
 * defined outside this chunk; the reduce branches only read ip2 and fold
 * into *op1, so IS_BINARY_REDUCE presumably means "output aliases the first
 * input with zero stride" - confirm in fast_loop_macros.h.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_maximum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Count of elements already handled; shared by the scalar tiers below.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE becomes npyv_lanetype with the suffix selected by TO_SIMD_SFX.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_uint)) {
            TO_SIMD_SFX(simd_reduce_c_max)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            // The kernel was handed all `len` elements; skip the scalar tiers.
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_uint, npy_uint)) {
            TO_SIMD_SFX(simd_binary_ccc_max)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            // The kernel was handed all `len` elements; skip the scalar tiers.
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing `&& 0` keeps this strided SIMD path compiled out here.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_max)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // With fewer than 8 elements this block is skipped entirely and the
        // tail loop at the bottom of the function does all the work.
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first eight
            // elements of ip2.
            npy_uint m0 = *((npy_uint *)(ip2 + (i + 0) * is2));
            npy_uint m1 = *((npy_uint *)(ip2 + (i + 1) * is2));
            npy_uint m2 = *((npy_uint *)(ip2 + (i + 2) * is2));
            npy_uint m3 = *((npy_uint *)(ip2 + (i + 3) * is2));
            npy_uint m4 = *((npy_uint *)(ip2 + (i + 4) * is2));
            npy_uint m5 = *((npy_uint *)(ip2 + (i + 5) * is2));
            npy_uint m6 = *((npy_uint *)(ip2 + (i + 6) * is2));
            npy_uint m7 = *((npy_uint *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold every further full group of eight into the matching
            // accumulator.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_uint v0 = *((npy_uint *)(ip2 + (i + 0) * is2));
                npy_uint v1 = *((npy_uint *)(ip2 + (i + 1) * is2));
                npy_uint v2 = *((npy_uint *)(ip2 + (i + 2) * is2));
                npy_uint v3 = *((npy_uint *)(ip2 + (i + 3) * is2));
                npy_uint v4 = *((npy_uint *)(ip2 + (i + 4) * is2));
                npy_uint v5 = *((npy_uint *)(ip2 + (i + 5) * is2));
                npy_uint v6 = *((npy_uint *)(ip2 + (i + 6) * is2));
                npy_uint v7 = *((npy_uint *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight accumulators pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge the partial result with the value already stored at op1.
             *((npy_uint *)op1) = SCALAR_OP(*((npy_uint *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1));
            npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2));
            *((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1));
            npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2));
            *((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1));
            npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2));
            *((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1));
            npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2));
            *((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // Move the pointers past the i elements handled above, then finish the
    // remainder one element at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_uint in1 = *(npy_uint *)ip1;
        const npy_uint in2 = *(npy_uint *)ip2;
        *((npy_uint *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Target of the SIMD early exits; the scalar tiers fall through to it too.
clear_fp:
    npyv_cleanup();
#endif
// Template residue: the floating-point status reset is compiled out here.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed variant of the npy_uint maximum loop: for each (index, value) pair
 * the addressed element of the first operand is replaced by
 * SCALAR_OP(element, value).
 *
 * args       - args[0] base of the array being updated, args[1] the indices
 *              (read as npy_intp), args[2] the values (read as npy_uint).
 * dimensions - dimensions[0] is the number of (index, value) pairs.
 * steps      - steps[0] byte stride between elements of args[0], steps[1] and
 *              steps[2] byte strides of the index and value streams, steps[3]
 *              the amount added to negative indices to wrap them (presumably
 *              the length of the indexed axis - confirm against the caller).
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_maximum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_cursor = args[1];
    const char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k;

    for (k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)idx_cursor;
        npy_uint *slot;
        npy_uint rhs;

        // Negative indices count from the end.
        if (pos < 0) {
            pos += wrap;
        }
        slot = (npy_uint *)(base + elem_stride * pos);
        rhs = *(const npy_uint *)val_cursor;
        *slot = SCALAR_OP(*slot, rhs);

        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i

/*
 * Inner loop applying SCALAR_OP element-wise to npy_uint data.  SCALAR_OP is
 * scalar_min_i at this point of the file, i.e. it yields the smaller of its
 * two arguments.
 *
 * args       - args[0] and args[1] are the input pointers, args[2] the output.
 * dimensions - dimensions[0] is the element count.
 * steps      - byte strides of args[0], args[1] and args[2], in that order.
 * func       - unused.
 *
 * Three tiers are tried in order: SIMD kernels (only when TO_SIMD_SFX is
 * defined), hand-unrolled scalar loops (unless NPY_DISABLE_OPTIMIZATION is
 * defined), and a plain scalar loop that finishes whatever is left.
 * NOTE(review): IS_BINARY_REDUCE / IS_BINARY_CONT / is_mem_overlap are
 * defined outside this chunk; the reduce branches only read ip2 and fold
 * into *op1, so IS_BINARY_REDUCE presumably means "output aliases the first
 * input with zero stride" - confirm in fast_loop_macros.h.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_minimum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Count of elements already handled; shared by the scalar tiers below.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE becomes npyv_lanetype with the suffix selected by TO_SIMD_SFX.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_uint)) {
            TO_SIMD_SFX(simd_reduce_c_min)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            // The kernel was handed all `len` elements; skip the scalar tiers.
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_uint, npy_uint)) {
            TO_SIMD_SFX(simd_binary_ccc_min)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            // The kernel was handed all `len` elements; skip the scalar tiers.
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing `&& 0` keeps this strided SIMD path compiled out here.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_min)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // With fewer than 8 elements this block is skipped entirely and the
        // tail loop at the bottom of the function does all the work.
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first eight
            // elements of ip2.
            npy_uint m0 = *((npy_uint *)(ip2 + (i + 0) * is2));
            npy_uint m1 = *((npy_uint *)(ip2 + (i + 1) * is2));
            npy_uint m2 = *((npy_uint *)(ip2 + (i + 2) * is2));
            npy_uint m3 = *((npy_uint *)(ip2 + (i + 3) * is2));
            npy_uint m4 = *((npy_uint *)(ip2 + (i + 4) * is2));
            npy_uint m5 = *((npy_uint *)(ip2 + (i + 5) * is2));
            npy_uint m6 = *((npy_uint *)(ip2 + (i + 6) * is2));
            npy_uint m7 = *((npy_uint *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold every further full group of eight into the matching
            // accumulator.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_uint v0 = *((npy_uint *)(ip2 + (i + 0) * is2));
                npy_uint v1 = *((npy_uint *)(ip2 + (i + 1) * is2));
                npy_uint v2 = *((npy_uint *)(ip2 + (i + 2) * is2));
                npy_uint v3 = *((npy_uint *)(ip2 + (i + 3) * is2));
                npy_uint v4 = *((npy_uint *)(ip2 + (i + 4) * is2));
                npy_uint v5 = *((npy_uint *)(ip2 + (i + 5) * is2));
                npy_uint v6 = *((npy_uint *)(ip2 + (i + 6) * is2));
                npy_uint v7 = *((npy_uint *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight accumulators pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge the partial result with the value already stored at op1.
             *((npy_uint *)op1) = SCALAR_OP(*((npy_uint *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1));
            npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2));
            *((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1));
            npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2));
            *((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1));
            npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2));
            *((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1));
            npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2));
            *((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // Move the pointers past the i elements handled above, then finish the
    // remainder one element at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_uint in1 = *(npy_uint *)ip1;
        const npy_uint in2 = *(npy_uint *)ip2;
        *((npy_uint *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Target of the SIMD early exits; the scalar tiers fall through to it too.
clear_fp:
    npyv_cleanup();
#endif
// Template residue: the floating-point status reset is compiled out here.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed variant of the npy_uint minimum loop: for each (index, value) pair
 * the addressed element of the first operand is replaced by
 * SCALAR_OP(element, value).
 *
 * args       - args[0] base of the array being updated, args[1] the indices
 *              (read as npy_intp), args[2] the values (read as npy_uint).
 * dimensions - dimensions[0] is the number of (index, value) pairs.
 * steps      - steps[0] byte stride between elements of args[0], steps[1] and
 *              steps[2] byte strides of the index and value streams, steps[3]
 *              the amount added to negative indices to wrap them (presumably
 *              the length of the indexed axis - confirm against the caller).
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_minimum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_cursor = args[1];
    const char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k;

    for (k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)idx_cursor;
        npy_uint *slot;
        npy_uint rhs;

        // Negative indices count from the end.
        if (pos < 0) {
            pos += wrap;
        }
        slot = (npy_uint *)(base + elem_stride * pos);
        rhs = *(const npy_uint *)val_cursor;
        *slot = SCALAR_OP(*slot, rhs);

        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i

/*
 * Template expansion of the "fmax" loop for npy_uint.
 *
 * NOTE: the enclosing guard `#if !1 || (0 && 1)` evaluates to 0, so this
 * function is never compiled for this type; it is kept only as residue of
 * the template.  SCALAR_OP would be scalar_maxp_i here; no definition of
 * that name is visible in the part of the file reviewed - TODO confirm.
 *
 * args       - args[0] and args[1] are the input pointers, args[2] the output.
 * dimensions - dimensions[0] is the element count.
 * steps      - byte strides of args[0], args[1] and args[2], in that order.
 * func       - unused.
 *
 * The structure mirrors the enabled loops: SIMD kernels (only when
 * TO_SIMD_SFX is defined), hand-unrolled scalar loops (unless
 * NPY_DISABLE_OPTIMIZATION is defined), then a plain scalar tail loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_fmax)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Count of elements already handled; shared by the scalar tiers below.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE becomes npyv_lanetype with the suffix selected by TO_SIMD_SFX.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_uint)) {
            TO_SIMD_SFX(simd_reduce_c_maxp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            // The kernel was handed all `len` elements; skip the scalar tiers.
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_uint, npy_uint)) {
            TO_SIMD_SFX(simd_binary_ccc_maxp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            // The kernel was handed all `len` elements; skip the scalar tiers.
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing `&& 0` keeps this strided SIMD path compiled out here.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_maxp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // With fewer than 8 elements this block is skipped entirely and the
        // tail loop at the bottom of the function does all the work.
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first eight
            // elements of ip2.
            npy_uint m0 = *((npy_uint *)(ip2 + (i + 0) * is2));
            npy_uint m1 = *((npy_uint *)(ip2 + (i + 1) * is2));
            npy_uint m2 = *((npy_uint *)(ip2 + (i + 2) * is2));
            npy_uint m3 = *((npy_uint *)(ip2 + (i + 3) * is2));
            npy_uint m4 = *((npy_uint *)(ip2 + (i + 4) * is2));
            npy_uint m5 = *((npy_uint *)(ip2 + (i + 5) * is2));
            npy_uint m6 = *((npy_uint *)(ip2 + (i + 6) * is2));
            npy_uint m7 = *((npy_uint *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold every further full group of eight into the matching
            // accumulator.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_uint v0 = *((npy_uint *)(ip2 + (i + 0) * is2));
                npy_uint v1 = *((npy_uint *)(ip2 + (i + 1) * is2));
                npy_uint v2 = *((npy_uint *)(ip2 + (i + 2) * is2));
                npy_uint v3 = *((npy_uint *)(ip2 + (i + 3) * is2));
                npy_uint v4 = *((npy_uint *)(ip2 + (i + 4) * is2));
                npy_uint v5 = *((npy_uint *)(ip2 + (i + 5) * is2));
                npy_uint v6 = *((npy_uint *)(ip2 + (i + 6) * is2));
                npy_uint v7 = *((npy_uint *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight accumulators pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge the partial result with the value already stored at op1.
             *((npy_uint *)op1) = SCALAR_OP(*((npy_uint *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1));
            npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2));
            *((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1));
            npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2));
            *((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1));
            npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2));
            *((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1));
            npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2));
            *((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // Move the pointers past the i elements handled above, then finish the
    // remainder one element at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_uint in1 = *(npy_uint *)ip1;
        const npy_uint in2 = *(npy_uint *)ip2;
        *((npy_uint *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Target of the SIMD early exits; the scalar tiers fall through to it too.
clear_fp:
    npyv_cleanup();
#endif
// Template residue: the floating-point status reset is compiled out here.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed variant of the npy_uint "fmax" loop: for each (index, value) pair
 * the addressed element of the first operand is replaced by
 * SCALAR_OP(element, value).
 *
 * NOTE: the enclosing guard `#if !1 || (0 && 1)` evaluates to 0, so this
 * function is never compiled for this type.
 *
 * args       - args[0] base of the array being updated, args[1] the indices
 *              (read as npy_intp), args[2] the values (read as npy_uint).
 * dimensions - dimensions[0] is the number of (index, value) pairs.
 * steps      - steps[0] byte stride between elements of args[0], steps[1] and
 *              steps[2] byte strides of the index and value streams, steps[3]
 *              the amount added to negative indices to wrap them (presumably
 *              the length of the indexed axis - confirm against the caller).
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmax_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_cursor = args[1];
    const char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k;

    for (k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)idx_cursor;
        npy_uint *slot;
        npy_uint rhs;

        // Negative indices count from the end.
        if (pos < 0) {
            pos += wrap;
        }
        slot = (npy_uint *)(base + elem_stride * pos);
        rhs = *(const npy_uint *)val_cursor;
        *slot = SCALAR_OP(*slot, rhs);

        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i

/*
 * Template expansion of the "fmin" loop for npy_uint.
 *
 * NOTE: the enclosing guard `#if !1 || (0 && 1)` evaluates to 0, so this
 * function is never compiled for this type; it is kept only as residue of
 * the template.  SCALAR_OP would be scalar_minp_i here; no definition of
 * that name is visible in the part of the file reviewed - TODO confirm.
 *
 * args       - args[0] and args[1] are the input pointers, args[2] the output.
 * dimensions - dimensions[0] is the element count.
 * steps      - byte strides of args[0], args[1] and args[2], in that order.
 * func       - unused.
 *
 * The structure mirrors the enabled loops: SIMD kernels (only when
 * TO_SIMD_SFX is defined), hand-unrolled scalar loops (unless
 * NPY_DISABLE_OPTIMIZATION is defined), then a plain scalar tail loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_fmin)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Count of elements already handled; shared by the scalar tiers below.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE becomes npyv_lanetype with the suffix selected by TO_SIMD_SFX.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_uint)) {
            TO_SIMD_SFX(simd_reduce_c_minp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            // The kernel was handed all `len` elements; skip the scalar tiers.
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_uint, npy_uint)) {
            TO_SIMD_SFX(simd_binary_ccc_minp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            // The kernel was handed all `len` elements; skip the scalar tiers.
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing `&& 0` keeps this strided SIMD path compiled out here.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_minp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // With fewer than 8 elements this block is skipped entirely and the
        // tail loop at the bottom of the function does all the work.
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first eight
            // elements of ip2.
            npy_uint m0 = *((npy_uint *)(ip2 + (i + 0) * is2));
            npy_uint m1 = *((npy_uint *)(ip2 + (i + 1) * is2));
            npy_uint m2 = *((npy_uint *)(ip2 + (i + 2) * is2));
            npy_uint m3 = *((npy_uint *)(ip2 + (i + 3) * is2));
            npy_uint m4 = *((npy_uint *)(ip2 + (i + 4) * is2));
            npy_uint m5 = *((npy_uint *)(ip2 + (i + 5) * is2));
            npy_uint m6 = *((npy_uint *)(ip2 + (i + 6) * is2));
            npy_uint m7 = *((npy_uint *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold every further full group of eight into the matching
            // accumulator.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_uint v0 = *((npy_uint *)(ip2 + (i + 0) * is2));
                npy_uint v1 = *((npy_uint *)(ip2 + (i + 1) * is2));
                npy_uint v2 = *((npy_uint *)(ip2 + (i + 2) * is2));
                npy_uint v3 = *((npy_uint *)(ip2 + (i + 3) * is2));
                npy_uint v4 = *((npy_uint *)(ip2 + (i + 4) * is2));
                npy_uint v5 = *((npy_uint *)(ip2 + (i + 5) * is2));
                npy_uint v6 = *((npy_uint *)(ip2 + (i + 6) * is2));
                npy_uint v7 = *((npy_uint *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight accumulators pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge the partial result with the value already stored at op1.
             *((npy_uint *)op1) = SCALAR_OP(*((npy_uint *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1));
            npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2));
            *((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1));
            npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2));
            *((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1));
            npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2));
            *((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1));
            npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2));
            *((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // Move the pointers past the i elements handled above, then finish the
    // remainder one element at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_uint in1 = *(npy_uint *)ip1;
        const npy_uint in2 = *(npy_uint *)ip2;
        *((npy_uint *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Target of the SIMD early exits; the scalar tiers fall through to it too.
clear_fp:
    npyv_cleanup();
#endif
// Template residue: the floating-point status reset is compiled out here.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed variant of the npy_uint "fmin" loop: for each (index, value) pair
 * the addressed element of the first operand is replaced by
 * SCALAR_OP(element, value).
 *
 * NOTE: the enclosing guard `#if !1 || (0 && 1)` evaluates to 0, so this
 * function is never compiled for this type.
 *
 * args       - args[0] base of the array being updated, args[1] the indices
 *              (read as npy_intp), args[2] the values (read as npy_uint).
 * dimensions - dimensions[0] is the number of (index, value) pairs.
 * steps      - steps[0] byte stride between elements of args[0], steps[1] and
 *              steps[2] byte strides of the index and value streams, steps[3]
 *              the amount added to negative indices to wrap them (presumably
 *              the length of the indexed axis - confirm against the caller).
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmin_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_cursor = args[1];
    const char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k;

    for (k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)idx_cursor;
        npy_uint *slot;
        npy_uint rhs;

        // Negative indices count from the end.
        if (pos < 0) {
            pos += wrap;
        }
        slot = (npy_uint *)(base + elem_stride * pos);
        rhs = *(const npy_uint *)val_cursor;
        *slot = SCALAR_OP(*slot, rhs);

        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)


#line 294
/*
 * Select the SIMD suffix used by the npy_ulong loops that follow.
 * The first branch whose bit width equals NPY_BITSOF_LONG (and for which
 * NPY_SIMD is non-zero) defines TO_SIMD_SFX(X) so that it pastes the unsigned
 * suffix (_u8/_u16/_u32/_u64) onto X.  The constant `#if 0` / `#elif 1` tests
 * are template residue: the floating-point variant is compiled out, the
 * unsigned variant is always taken, and the signed `#else` is therefore
 * unreachable.  If no branch matches, TO_SIMD_SFX stays undefined and the
 * loops below skip their SIMD sections (they are wrapped in
 * `#ifdef TO_SIMD_SFX`).
 */
#undef TO_SIMD_SFX
#if 0
#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 8
    #if 0
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i

/*
 * Inner loop applying SCALAR_OP element-wise to npy_ulong data.  SCALAR_OP is
 * scalar_max_i at this point of the file, i.e. it yields the larger of its
 * two arguments.
 *
 * args       - args[0] and args[1] are the input pointers, args[2] the output.
 * dimensions - dimensions[0] is the element count.
 * steps      - byte strides of args[0], args[1] and args[2], in that order.
 * func       - unused.
 *
 * Three tiers are tried in order: SIMD kernels (only when TO_SIMD_SFX is
 * defined), hand-unrolled scalar loops (unless NPY_DISABLE_OPTIMIZATION is
 * defined), and a plain scalar loop that finishes whatever is left.
 * NOTE(review): IS_BINARY_REDUCE / IS_BINARY_CONT / is_mem_overlap are
 * defined outside this chunk; the reduce branches only read ip2 and fold
 * into *op1, so IS_BINARY_REDUCE presumably means "output aliases the first
 * input with zero stride" - confirm in fast_loop_macros.h.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_maximum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Count of elements already handled; shared by the scalar tiers below.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE becomes npyv_lanetype with the suffix selected by TO_SIMD_SFX.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_ulong)) {
            TO_SIMD_SFX(simd_reduce_c_max)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            // The kernel was handed all `len` elements; skip the scalar tiers.
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_ulong, npy_ulong)) {
            TO_SIMD_SFX(simd_binary_ccc_max)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            // The kernel was handed all `len` elements; skip the scalar tiers.
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing `&& 0` keeps this strided SIMD path compiled out here.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_max)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // With fewer than 8 elements this block is skipped entirely and the
        // tail loop at the bottom of the function does all the work.
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first eight
            // elements of ip2.
            npy_ulong m0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
            npy_ulong m1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
            npy_ulong m2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
            npy_ulong m3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
            npy_ulong m4 = *((npy_ulong *)(ip2 + (i + 4) * is2));
            npy_ulong m5 = *((npy_ulong *)(ip2 + (i + 5) * is2));
            npy_ulong m6 = *((npy_ulong *)(ip2 + (i + 6) * is2));
            npy_ulong m7 = *((npy_ulong *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold every further full group of eight into the matching
            // accumulator.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_ulong v0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
                npy_ulong v1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
                npy_ulong v2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
                npy_ulong v3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
                npy_ulong v4 = *((npy_ulong *)(ip2 + (i + 4) * is2));
                npy_ulong v5 = *((npy_ulong *)(ip2 + (i + 5) * is2));
                npy_ulong v6 = *((npy_ulong *)(ip2 + (i + 6) * is2));
                npy_ulong v7 = *((npy_ulong *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight accumulators pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge the partial result with the value already stored at op1.
             *((npy_ulong *)op1) = SCALAR_OP(*((npy_ulong *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1));
            npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
            *((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1));
            npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
            *((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1));
            npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
            *((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1));
            npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
            *((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // Move the pointers past the i elements handled above, then finish the
    // remainder one element at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_ulong in1 = *(npy_ulong *)ip1;
        const npy_ulong in2 = *(npy_ulong *)ip2;
        *((npy_ulong *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Target of the SIMD early exits; the scalar tiers fall through to it too.
clear_fp:
    npyv_cleanup();
#endif
// Template residue: the floating-point status reset is compiled out here.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place maximum for npy_ulong.
 *
 * For each of the dimensions[0] entries, reads an npy_intp index from args[1]
 * and an npy_ulong value from args[2], then replaces the element of args[0]
 * selected by that index with SCALAR_OP(element, value).
 *
 * steps[0..2] are the byte strides of the target, index and value buffers.
 * steps[3] is added to negative indices before use (presumably the length of
 * the indexed axis -- confirm against the caller).
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_maximum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const target_base = args[0];
    const char *index_cursor = args[1];
    const char *value_cursor = args[2];
    const npy_intp target_stride = steps[0];
    const npy_intp index_stride = steps[1];
    const npy_intp value_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k;

    for (k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)index_cursor;
        // Negative indices count from the end.
        if (pos < 0) {
            pos += wrap;
        }

        npy_ulong *slot = (npy_ulong *)(target_base + target_stride * pos);
        const npy_ulong current = *slot;
        const npy_ulong candidate = *(const npy_ulong *)value_cursor;
        *slot = SCALAR_OP(current, candidate);

        index_cursor += index_stride;
        value_cursor += value_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i

/*
 * Element-wise minimum inner loop for npy_ulong.
 *
 * args[0] and args[1] are the input pointers and args[2] is the output
 * pointer; steps[0..2] are the matching byte strides and dimensions[0] is the
 * number of elements.
 *
 * Strategy, in order:
 *   1. When TO_SIMD_SFX is defined: a SIMD kernel for a contiguous reduction,
 *      or for non-overlapping contiguous binary operands.  Those paths jump
 *      straight to the clear_fp label.
 *   2. Unless NPY_DISABLE_OPTIMIZATION is defined: manually unrolled scalar
 *      code (8 independent accumulators for a reduction, 4 elements per
 *      iteration otherwise).
 *   3. A plain scalar loop for every element not handled above.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_minimum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Number of elements already consumed by one of the unrolled paths.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_ulong)) {
            TO_SIMD_SFX(simd_reduce_c_min)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_ulong, npy_ulong)) {
            TO_SIMD_SFX(simd_binary_ccc_min)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // NOTE: the trailing "&& 0" makes this condition false on every target,
    // so the strided SIMD path below is never compiled for this type.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_min)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators with the first eight
            // elements read through ip2.
            npy_ulong m0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
            npy_ulong m1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
            npy_ulong m2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
            npy_ulong m3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
            npy_ulong m4 = *((npy_ulong *)(ip2 + (i + 4) * is2));
            npy_ulong m5 = *((npy_ulong *)(ip2 + (i + 5) * is2));
            npy_ulong m6 = *((npy_ulong *)(ip2 + (i + 6) * is2));
            npy_ulong m7 = *((npy_ulong *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold each further full group of eight into the accumulators.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_ulong v0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
                npy_ulong v1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
                npy_ulong v2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
                npy_ulong v3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
                npy_ulong v4 = *((npy_ulong *)(ip2 + (i + 4) * is2));
                npy_ulong v5 = *((npy_ulong *)(ip2 + (i + 5) * is2));
                npy_ulong v6 = *((npy_ulong *)(ip2 + (i + 6) * is2));
                npy_ulong v7 = *((npy_ulong *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight accumulators pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge the partial result with the value already stored at op1.
             *((npy_ulong *)op1) = SCALAR_OP(*((npy_ulong *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1));
            npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
            *((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1));
            npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
            *((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1));
            npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
            *((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1));
            npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
            *((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Skip the i elements already handled, then finish one element at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_ulong in1 = *(npy_ulong *)ip1;
        const npy_ulong in2 = *(npy_ulong *)ip2;
        *((npy_ulong *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Target of the SIMD fast paths above; the scalar paths fall through to it.
clear_fp:
    npyv_cleanup();
#endif
// The floating-point status reset below is compiled out (`#if 0`) here.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place minimum for npy_ulong.
 *
 * For each of the dimensions[0] entries, reads an npy_intp index from args[1]
 * and an npy_ulong value from args[2], then replaces the element of args[0]
 * selected by that index with SCALAR_OP(element, value).
 *
 * steps[0..2] are the byte strides of the target, index and value buffers.
 * steps[3] is added to negative indices before use (presumably the length of
 * the indexed axis -- confirm against the caller).
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_minimum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const target_base = args[0];
    const char *index_cursor = args[1];
    const char *value_cursor = args[2];
    const npy_intp target_stride = steps[0];
    const npy_intp index_stride = steps[1];
    const npy_intp value_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k;

    for (k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)index_cursor;
        // Negative indices count from the end.
        if (pos < 0) {
            pos += wrap;
        }

        npy_ulong *slot = (npy_ulong *)(target_base + target_stride * pos);
        const npy_ulong current = *slot;
        const npy_ulong candidate = *(const npy_ulong *)value_cursor;
        *slot = SCALAR_OP(current, candidate);

        index_cursor += index_stride;
        value_cursor += value_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i

/*
 * Element-wise fmax inner loop for npy_ulong.
 *
 * NOTE: the enclosing `#if !1 || (0 && 1)` guard is always false, so this
 * definition is never compiled for this type; it is only an artifact of the
 * template expansion.
 *
 * args[0] and args[1] are the input pointers and args[2] is the output
 * pointer; steps[0..2] are the matching byte strides and dimensions[0] is the
 * number of elements.
 *
 * Strategy, in order:
 *   1. When TO_SIMD_SFX is defined: a SIMD kernel for a contiguous reduction,
 *      or for non-overlapping contiguous binary operands.  Those paths jump
 *      straight to the clear_fp label.
 *   2. Unless NPY_DISABLE_OPTIMIZATION is defined: manually unrolled scalar
 *      code (8 independent accumulators for a reduction, 4 elements per
 *      iteration otherwise).
 *   3. A plain scalar loop for every element not handled above.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_fmax)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Number of elements already consumed by one of the unrolled paths.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_ulong)) {
            TO_SIMD_SFX(simd_reduce_c_maxp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_ulong, npy_ulong)) {
            TO_SIMD_SFX(simd_binary_ccc_maxp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // NOTE: the trailing "&& 0" makes this condition false on every target,
    // so the strided SIMD path below is never compiled for this type.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_maxp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators with the first eight
            // elements read through ip2.
            npy_ulong m0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
            npy_ulong m1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
            npy_ulong m2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
            npy_ulong m3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
            npy_ulong m4 = *((npy_ulong *)(ip2 + (i + 4) * is2));
            npy_ulong m5 = *((npy_ulong *)(ip2 + (i + 5) * is2));
            npy_ulong m6 = *((npy_ulong *)(ip2 + (i + 6) * is2));
            npy_ulong m7 = *((npy_ulong *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold each further full group of eight into the accumulators.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_ulong v0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
                npy_ulong v1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
                npy_ulong v2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
                npy_ulong v3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
                npy_ulong v4 = *((npy_ulong *)(ip2 + (i + 4) * is2));
                npy_ulong v5 = *((npy_ulong *)(ip2 + (i + 5) * is2));
                npy_ulong v6 = *((npy_ulong *)(ip2 + (i + 6) * is2));
                npy_ulong v7 = *((npy_ulong *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight accumulators pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge the partial result with the value already stored at op1.
             *((npy_ulong *)op1) = SCALAR_OP(*((npy_ulong *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1));
            npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
            *((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1));
            npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
            *((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1));
            npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
            *((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1));
            npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
            *((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Skip the i elements already handled, then finish one element at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_ulong in1 = *(npy_ulong *)ip1;
        const npy_ulong in2 = *(npy_ulong *)ip2;
        *((npy_ulong *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Target of the SIMD fast paths above; the scalar paths fall through to it.
clear_fp:
    npyv_cleanup();
#endif
// The floating-point status reset below is compiled out (`#if 0`) here.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place fmax for npy_ulong.
 *
 * NOTE: the enclosing `#if !1 || (0 && 1)` guard is always false, so this
 * definition is never compiled for this type.
 *
 * For each of the dimensions[0] entries, reads an npy_intp index from args[1]
 * and an npy_ulong value from args[2], then replaces the element of args[0]
 * selected by that index with SCALAR_OP(element, value).
 *
 * steps[0..2] are the byte strides of the target, index and value buffers.
 * steps[3] is added to negative indices before use (presumably the length of
 * the indexed axis -- confirm against the caller).
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmax_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const target_base = args[0];
    const char *index_cursor = args[1];
    const char *value_cursor = args[2];
    const npy_intp target_stride = steps[0];
    const npy_intp index_stride = steps[1];
    const npy_intp value_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k;

    for (k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)index_cursor;
        // Negative indices count from the end.
        if (pos < 0) {
            pos += wrap;
        }

        npy_ulong *slot = (npy_ulong *)(target_base + target_stride * pos);
        const npy_ulong current = *slot;
        const npy_ulong candidate = *(const npy_ulong *)value_cursor;
        *slot = SCALAR_OP(current, candidate);

        index_cursor += index_stride;
        value_cursor += value_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i

/*
 * Element-wise fmin inner loop for npy_ulong.
 *
 * NOTE: the enclosing `#if !1 || (0 && 1)` guard is always false, so this
 * definition is never compiled for this type; it is only an artifact of the
 * template expansion.
 *
 * args[0] and args[1] are the input pointers and args[2] is the output
 * pointer; steps[0..2] are the matching byte strides and dimensions[0] is the
 * number of elements.
 *
 * Strategy, in order:
 *   1. When TO_SIMD_SFX is defined: a SIMD kernel for a contiguous reduction,
 *      or for non-overlapping contiguous binary operands.  Those paths jump
 *      straight to the clear_fp label.
 *   2. Unless NPY_DISABLE_OPTIMIZATION is defined: manually unrolled scalar
 *      code (8 independent accumulators for a reduction, 4 elements per
 *      iteration otherwise).
 *   3. A plain scalar loop for every element not handled above.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_fmin)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Number of elements already consumed by one of the unrolled paths.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_ulong)) {
            TO_SIMD_SFX(simd_reduce_c_minp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_ulong, npy_ulong)) {
            TO_SIMD_SFX(simd_binary_ccc_minp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // NOTE: the trailing "&& 0" makes this condition false on every target,
    // so the strided SIMD path below is never compiled for this type.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_minp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators with the first eight
            // elements read through ip2.
            npy_ulong m0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
            npy_ulong m1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
            npy_ulong m2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
            npy_ulong m3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
            npy_ulong m4 = *((npy_ulong *)(ip2 + (i + 4) * is2));
            npy_ulong m5 = *((npy_ulong *)(ip2 + (i + 5) * is2));
            npy_ulong m6 = *((npy_ulong *)(ip2 + (i + 6) * is2));
            npy_ulong m7 = *((npy_ulong *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold each further full group of eight into the accumulators.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_ulong v0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
                npy_ulong v1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
                npy_ulong v2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
                npy_ulong v3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
                npy_ulong v4 = *((npy_ulong *)(ip2 + (i + 4) * is2));
                npy_ulong v5 = *((npy_ulong *)(ip2 + (i + 5) * is2));
                npy_ulong v6 = *((npy_ulong *)(ip2 + (i + 6) * is2));
                npy_ulong v7 = *((npy_ulong *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight accumulators pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge the partial result with the value already stored at op1.
             *((npy_ulong *)op1) = SCALAR_OP(*((npy_ulong *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1));
            npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
            *((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1));
            npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
            *((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1));
            npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
            *((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1));
            npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
            *((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Skip the i elements already handled, then finish one element at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_ulong in1 = *(npy_ulong *)ip1;
        const npy_ulong in2 = *(npy_ulong *)ip2;
        *((npy_ulong *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Target of the SIMD fast paths above; the scalar paths fall through to it.
clear_fp:
    npyv_cleanup();
#endif
// The floating-point status reset below is compiled out (`#if 0`) here.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place fmin for npy_ulong.
 *
 * NOTE: the enclosing `#if !1 || (0 && 1)` guard is always false, so this
 * definition is never compiled for this type.
 *
 * For each of the dimensions[0] entries, reads an npy_intp index from args[1]
 * and an npy_ulong value from args[2], then replaces the element of args[0]
 * selected by that index with SCALAR_OP(element, value).
 *
 * steps[0..2] are the byte strides of the target, index and value buffers.
 * steps[3] is added to negative indices before use (presumably the length of
 * the indexed axis -- confirm against the caller).
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmin_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const target_base = args[0];
    const char *index_cursor = args[1];
    const char *value_cursor = args[2];
    const npy_intp target_stride = steps[0];
    const npy_intp index_stride = steps[1];
    const npy_intp value_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k;

    for (k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)index_cursor;
        // Negative indices count from the end.
        if (pos < 0) {
            pos += wrap;
        }

        npy_ulong *slot = (npy_ulong *)(target_base + target_stride * pos);
        const npy_ulong current = *slot;
        const npy_ulong candidate = *(const npy_ulong *)value_cursor;
        *slot = SCALAR_OP(current, candidate);

        index_cursor += index_stride;
        value_cursor += value_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)


/*
 * Select the SIMD suffix used by the unsigned long long loops that follow.
 *
 * TO_SIMD_SFX(X) pastes `_u<bits>` onto X, where <bits> is the branch that
 * matches NPY_BITSOF_LONGLONG (8, 16, 32 or 64).  Inside every branch the
 * floating-point alternative is disabled (`#if 0`) and the unsigned one is
 * enabled (`#elif 1`), so the signed `_s<bits>` alternative is never reached.
 *
 * The macro stays undefined when NPY_SIMD is false or no width matches; the
 * loops below guard their SIMD sections with `#ifdef TO_SIMD_SFX`.
 */
#line 294
#undef TO_SIMD_SFX
#if 0
#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8
    #if 0
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i

/*
 * Element-wise maximum inner loop for npy_ulonglong.
 *
 * args[0] and args[1] are the input pointers and args[2] is the output
 * pointer; steps[0..2] are the matching byte strides and dimensions[0] is the
 * number of elements.
 *
 * Strategy, in order:
 *   1. When TO_SIMD_SFX is defined: a SIMD kernel for a contiguous reduction,
 *      or for non-overlapping contiguous binary operands.  Those paths jump
 *      straight to the clear_fp label.
 *   2. Unless NPY_DISABLE_OPTIMIZATION is defined: manually unrolled scalar
 *      code (8 independent accumulators for a reduction, 4 elements per
 *      iteration otherwise).
 *   3. A plain scalar loop for every element not handled above.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_maximum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Number of elements already consumed by one of the unrolled paths.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_ulonglong)) {
            TO_SIMD_SFX(simd_reduce_c_max)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_ulonglong, npy_ulonglong)) {
            TO_SIMD_SFX(simd_binary_ccc_max)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // NOTE: the trailing "&& 0" makes this condition false on every target,
    // so the strided SIMD path below is never compiled for this type.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_max)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators with the first eight
            // elements read through ip2.
            npy_ulonglong m0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
            npy_ulonglong m1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
            npy_ulonglong m2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
            npy_ulonglong m3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
            npy_ulonglong m4 = *((npy_ulonglong *)(ip2 + (i + 4) * is2));
            npy_ulonglong m5 = *((npy_ulonglong *)(ip2 + (i + 5) * is2));
            npy_ulonglong m6 = *((npy_ulonglong *)(ip2 + (i + 6) * is2));
            npy_ulonglong m7 = *((npy_ulonglong *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold each further full group of eight into the accumulators.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_ulonglong v0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
                npy_ulonglong v1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
                npy_ulonglong v2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
                npy_ulonglong v3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
                npy_ulonglong v4 = *((npy_ulonglong *)(ip2 + (i + 4) * is2));
                npy_ulonglong v5 = *((npy_ulonglong *)(ip2 + (i + 5) * is2));
                npy_ulonglong v6 = *((npy_ulonglong *)(ip2 + (i + 6) * is2));
                npy_ulonglong v7 = *((npy_ulonglong *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight accumulators pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge the partial result with the value already stored at op1.
             *((npy_ulonglong *)op1) = SCALAR_OP(*((npy_ulonglong *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1));
            npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
            *((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1));
            npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
            *((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1));
            npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
            *((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1));
            npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
            *((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Skip the i elements already handled, then finish one element at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_ulonglong in1 = *(npy_ulonglong *)ip1;
        const npy_ulonglong in2 = *(npy_ulonglong *)ip2;
        *((npy_ulonglong *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Target of the SIMD fast paths above; the scalar paths fall through to it.
clear_fp:
    npyv_cleanup();
#endif
// The floating-point status reset below is compiled out (`#if 0`) here.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place maximum for npy_ulonglong.
 *
 * For each of the dimensions[0] entries, reads an npy_intp index from args[1]
 * and an npy_ulonglong value from args[2], then replaces the element of
 * args[0] selected by that index with SCALAR_OP(element, value).
 *
 * steps[0..2] are the byte strides of the target, index and value buffers.
 * steps[3] is added to negative indices before use (presumably the length of
 * the indexed axis -- confirm against the caller).
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_maximum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const target_base = args[0];
    const char *index_cursor = args[1];
    const char *value_cursor = args[2];
    const npy_intp target_stride = steps[0];
    const npy_intp index_stride = steps[1];
    const npy_intp value_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k;

    for (k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)index_cursor;
        // Negative indices count from the end.
        if (pos < 0) {
            pos += wrap;
        }

        npy_ulonglong *slot = (npy_ulonglong *)(target_base + target_stride * pos);
        const npy_ulonglong current = *slot;
        const npy_ulonglong candidate = *(const npy_ulonglong *)value_cursor;
        *slot = SCALAR_OP(current, candidate);

        index_cursor += index_stride;
        value_cursor += value_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

/*
 * Generated section: ULONGLONG minimum.
 *
 * The guard below is constant-true (`!0`), so this section is always
 * compiled.  The matching #endif further down labels the template condition
 * as `!fp_only || (is_fp && fp_only)`.
 */
#line 321
#if !0 || (0 && 0)
// Per-element operation shared by both loops of this section.  scalar_min_i is
// the integer macro `((A < B) ? A : B)` from the top of this file; note that
// it may evaluate each of its arguments twice.
#define SCALAR_OP scalar_min_i

/*
 * Binary inner loop: stores SCALAR_OP(in1, in2) for npy_ulonglong operands.
 *
 *   args[0], args[1]  input pointers (ip1, ip2)
 *   args[2]           output pointer (op1)
 *   dimensions[0]     number of elements (len)
 *   steps[0..2]       byte strides of ip1, ip2 and op1
 *   func              unused
 *
 * Strategy, in order:
 *   1. If TO_SIMD_SFX is defined, try the contiguous SIMD kernels (reduce or
 *      binary); when one is taken, control jumps to clear_fp.
 *   2. Unless NPY_DISABLE_OPTIMIZATION is defined, run an unrolled scalar
 *      loop (8 elements per step when reducing, 4 otherwise).
 *   3. Finish the remaining elements one at a time.
 *
 * NOTE(review): IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are
 * defined outside this chunk; the reduce branches below only make sense if
 * IS_BINARY_REDUCE implies that op1 aliases ip1 with a zero stride -- confirm
 * against fast_loop_macros.h.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_minimum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    // Strides are in bytes: they are added directly to the char pointers.
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Count of elements already handled; the tail loop resumes from here.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // SIMD fast paths.  A kernel that is taken receives the full length `len`
    // and then jumps to clear_fp, bypassing all scalar code below.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_ulonglong)) {
            TO_SIMD_SFX(simd_reduce_c_min)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_ulonglong, npy_ulonglong)) {
            TO_SIMD_SFX(simd_binary_ccc_min)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing `&& 0` makes this condition constant-false, so the strided
    // SIMD kernel below is compiled out in this instantiation.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_min)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // i is still 0 here, so this is entered only when len >= 8; shorter
        // inputs are left entirely to the tail loop.
        if((i+elemPerLoop) <= len){
            // Seed eight independent running results from the first eight
            // elements of ip2.
            npy_ulonglong m0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
            npy_ulonglong m1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
            npy_ulonglong m2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
            npy_ulonglong m3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
            npy_ulonglong m4 = *((npy_ulonglong *)(ip2 + (i + 4) * is2));
            npy_ulonglong m5 = *((npy_ulonglong *)(ip2 + (i + 5) * is2));
            npy_ulonglong m6 = *((npy_ulonglong *)(ip2 + (i + 6) * is2));
            npy_ulonglong m7 = *((npy_ulonglong *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold each further group of eight elements into m0..m7.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_ulonglong v0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
                npy_ulonglong v1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
                npy_ulonglong v2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
                npy_ulonglong v3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
                npy_ulonglong v4 = *((npy_ulonglong *)(ip2 + (i + 4) * is2));
                npy_ulonglong v5 = *((npy_ulonglong *)(ip2 + (i + 5) * is2));
                npy_ulonglong v6 = *((npy_ulonglong *)(ip2 + (i + 6) * is2));
                npy_ulonglong v7 = *((npy_ulonglong *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight partial results pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge with the value already stored at op1.
             *((npy_ulonglong *)op1) = SCALAR_OP(*((npy_ulonglong *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1));
            npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
            *((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1));
            npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
            *((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1));
            npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
            *((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1));
            npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
            *((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // Skip past the elements consumed by the unrolled loops above.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // Scalar tail.  This is also the only loop that runs when
    // NPY_DISABLE_OPTIMIZATION is defined and no SIMD path was taken.
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_ulonglong in1 = *(npy_ulonglong *)ip1;
        const npy_ulonglong in2 = *(npy_ulonglong *)ip2;
        *((npy_ulonglong *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
clear_fp:
    // Target of the SIMD fast paths; the scalar path falls through to it too.
    npyv_cleanup();
#endif
#if 0
    // Compiled out (`#if 0`) in this instantiation.
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed variant of the ULONGLONG minimum loop.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] (byte stride steps[1]) and a value from args[2] (byte stride
 * steps[2]).  The element of args[0] at that index (element stride steps[0],
 * in bytes) is replaced by SCALAR_OP(element, value).  A negative index is
 * wrapped once by adding steps[3].  Always returns 0.
 *
 * NOTE(review): no range check is done on the index here; presumably the
 * caller guarantees it is valid -- confirm.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_minimum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp extent = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp done = 0;

    while (done < count) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        npy_ulonglong *slot;

        // Negative positions count from the end of the indexed axis.
        pos = (pos < 0) ? (pos + extent) : pos;
        slot = (npy_ulonglong *)(base + base_stride * pos);

        {
            const npy_ulonglong current = *slot;
            const npy_ulonglong candidate = *(const npy_ulonglong *)val_ptr;
            *slot = SCALAR_OP(current, candidate);
        }

        idx_ptr += idx_stride;
        val_ptr += val_stride;
        ++done;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

/*
 * Generated section: ULONGLONG fmax.
 *
 * The guard below is constant-false (`!1 || (0 && 1)`), so everything up to
 * the matching #endif (labelled `!fp_only || (is_fp && fp_only)`) is skipped
 * by the preprocessor and never compiled for this type.
 */
#line 321
#if !1 || (0 && 1)
// NOTE(review): no definition of scalar_maxp_i is visible in the preamble of
// this file (only the _f/_d/_l variants appear there); that is harmless only
// because this section is compiled out -- confirm before ever enabling it.
#define SCALAR_OP scalar_maxp_i

/*
 * Binary inner loop: stores SCALAR_OP(in1, in2) for npy_ulonglong operands.
 * Dead code in this instantiation because of the constant-false guard that
 * encloses it.
 *
 *   args[0], args[1]  input pointers (ip1, ip2)
 *   args[2]           output pointer (op1)
 *   dimensions[0]     number of elements (len)
 *   steps[0..2]       byte strides of ip1, ip2 and op1
 *   func              unused
 *
 * Strategy, in order:
 *   1. If TO_SIMD_SFX is defined, try the contiguous SIMD kernels (reduce or
 *      binary); when one is taken, control jumps to clear_fp.
 *   2. Unless NPY_DISABLE_OPTIMIZATION is defined, run an unrolled scalar
 *      loop (8 elements per step when reducing, 4 otherwise).
 *   3. Finish the remaining elements one at a time.
 *
 * NOTE(review): IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are
 * defined outside this chunk; the reduce branches below only make sense if
 * IS_BINARY_REDUCE implies that op1 aliases ip1 with a zero stride -- confirm
 * against fast_loop_macros.h.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmax)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    // Strides are in bytes: they are added directly to the char pointers.
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Count of elements already handled; the tail loop resumes from here.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // SIMD fast paths.  A kernel that is taken receives the full length `len`
    // and then jumps to clear_fp, bypassing all scalar code below.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_ulonglong)) {
            TO_SIMD_SFX(simd_reduce_c_maxp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_ulonglong, npy_ulonglong)) {
            TO_SIMD_SFX(simd_binary_ccc_maxp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing `&& 0` makes this condition constant-false, so the strided
    // SIMD kernel below is compiled out in this instantiation.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_maxp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // i is still 0 here, so this is entered only when len >= 8; shorter
        // inputs are left entirely to the tail loop.
        if((i+elemPerLoop) <= len){
            // Seed eight independent running results from the first eight
            // elements of ip2.
            npy_ulonglong m0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
            npy_ulonglong m1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
            npy_ulonglong m2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
            npy_ulonglong m3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
            npy_ulonglong m4 = *((npy_ulonglong *)(ip2 + (i + 4) * is2));
            npy_ulonglong m5 = *((npy_ulonglong *)(ip2 + (i + 5) * is2));
            npy_ulonglong m6 = *((npy_ulonglong *)(ip2 + (i + 6) * is2));
            npy_ulonglong m7 = *((npy_ulonglong *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold each further group of eight elements into m0..m7.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_ulonglong v0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
                npy_ulonglong v1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
                npy_ulonglong v2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
                npy_ulonglong v3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
                npy_ulonglong v4 = *((npy_ulonglong *)(ip2 + (i + 4) * is2));
                npy_ulonglong v5 = *((npy_ulonglong *)(ip2 + (i + 5) * is2));
                npy_ulonglong v6 = *((npy_ulonglong *)(ip2 + (i + 6) * is2));
                npy_ulonglong v7 = *((npy_ulonglong *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight partial results pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge with the value already stored at op1.
             *((npy_ulonglong *)op1) = SCALAR_OP(*((npy_ulonglong *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1));
            npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
            *((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1));
            npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
            *((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1));
            npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
            *((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1));
            npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
            *((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // Skip past the elements consumed by the unrolled loops above.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // Scalar tail.  This is also the only loop that runs when
    // NPY_DISABLE_OPTIMIZATION is defined and no SIMD path was taken.
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_ulonglong in1 = *(npy_ulonglong *)ip1;
        const npy_ulonglong in2 = *(npy_ulonglong *)ip2;
        *((npy_ulonglong *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
clear_fp:
    // Target of the SIMD fast paths; the scalar path falls through to it too.
    npyv_cleanup();
#endif
#if 0
    // Compiled out (`#if 0`) in this instantiation.
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed variant of the ULONGLONG fmax loop.
 *
 * This function sits inside a `#if !1 || (0 && 1)` region, which is
 * constant-false, so it is not compiled in this instantiation.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] (byte stride steps[1]) and a value from args[2] (byte stride
 * steps[2]).  The element of args[0] at that index (element stride steps[0],
 * in bytes) is replaced by SCALAR_OP(element, value).  A negative index is
 * wrapped once by adding steps[3].  Always returns 0.
 *
 * NOTE(review): no range check is done on the index here; presumably the
 * caller guarantees it is valid -- confirm.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmax_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp extent = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp done = 0;

    while (done < count) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        npy_ulonglong *slot;

        // Negative positions count from the end of the indexed axis.
        pos = (pos < 0) ? (pos + extent) : pos;
        slot = (npy_ulonglong *)(base + base_stride * pos);

        {
            const npy_ulonglong current = *slot;
            const npy_ulonglong candidate = *(const npy_ulonglong *)val_ptr;
            *slot = SCALAR_OP(current, candidate);
        }

        idx_ptr += idx_stride;
        val_ptr += val_stride;
        ++done;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

/*
 * Generated section: ULONGLONG fmin.
 *
 * The guard below is constant-false (`!1 || (0 && 1)`), so everything up to
 * the matching #endif (labelled `!fp_only || (is_fp && fp_only)`) is skipped
 * by the preprocessor and never compiled for this type.
 */
#line 321
#if !1 || (0 && 1)
// NOTE(review): no definition of scalar_minp_i is visible in the preamble of
// this file (only the _f/_d/_l variants appear there); that is harmless only
// because this section is compiled out -- confirm before ever enabling it.
#define SCALAR_OP scalar_minp_i

/*
 * Binary inner loop: stores SCALAR_OP(in1, in2) for npy_ulonglong operands.
 * Dead code in this instantiation because of the constant-false guard that
 * encloses it.
 *
 *   args[0], args[1]  input pointers (ip1, ip2)
 *   args[2]           output pointer (op1)
 *   dimensions[0]     number of elements (len)
 *   steps[0..2]       byte strides of ip1, ip2 and op1
 *   func              unused
 *
 * Strategy, in order:
 *   1. If TO_SIMD_SFX is defined, try the contiguous SIMD kernels (reduce or
 *      binary); when one is taken, control jumps to clear_fp.
 *   2. Unless NPY_DISABLE_OPTIMIZATION is defined, run an unrolled scalar
 *      loop (8 elements per step when reducing, 4 otherwise).
 *   3. Finish the remaining elements one at a time.
 *
 * NOTE(review): IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are
 * defined outside this chunk; the reduce branches below only make sense if
 * IS_BINARY_REDUCE implies that op1 aliases ip1 with a zero stride -- confirm
 * against fast_loop_macros.h.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    // Strides are in bytes: they are added directly to the char pointers.
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Count of elements already handled; the tail loop resumes from here.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // SIMD fast paths.  A kernel that is taken receives the full length `len`
    // and then jumps to clear_fp, bypassing all scalar code below.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_ulonglong)) {
            TO_SIMD_SFX(simd_reduce_c_minp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_ulonglong, npy_ulonglong)) {
            TO_SIMD_SFX(simd_binary_ccc_minp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing `&& 0` makes this condition constant-false, so the strided
    // SIMD kernel below is compiled out in this instantiation.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_minp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // i is still 0 here, so this is entered only when len >= 8; shorter
        // inputs are left entirely to the tail loop.
        if((i+elemPerLoop) <= len){
            // Seed eight independent running results from the first eight
            // elements of ip2.
            npy_ulonglong m0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
            npy_ulonglong m1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
            npy_ulonglong m2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
            npy_ulonglong m3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
            npy_ulonglong m4 = *((npy_ulonglong *)(ip2 + (i + 4) * is2));
            npy_ulonglong m5 = *((npy_ulonglong *)(ip2 + (i + 5) * is2));
            npy_ulonglong m6 = *((npy_ulonglong *)(ip2 + (i + 6) * is2));
            npy_ulonglong m7 = *((npy_ulonglong *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold each further group of eight elements into m0..m7.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_ulonglong v0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
                npy_ulonglong v1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
                npy_ulonglong v2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
                npy_ulonglong v3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
                npy_ulonglong v4 = *((npy_ulonglong *)(ip2 + (i + 4) * is2));
                npy_ulonglong v5 = *((npy_ulonglong *)(ip2 + (i + 5) * is2));
                npy_ulonglong v6 = *((npy_ulonglong *)(ip2 + (i + 6) * is2));
                npy_ulonglong v7 = *((npy_ulonglong *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight partial results pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge with the value already stored at op1.
             *((npy_ulonglong *)op1) = SCALAR_OP(*((npy_ulonglong *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1));
            npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
            *((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1));
            npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
            *((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1));
            npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
            *((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1));
            npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
            *((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // Skip past the elements consumed by the unrolled loops above.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // Scalar tail.  This is also the only loop that runs when
    // NPY_DISABLE_OPTIMIZATION is defined and no SIMD path was taken.
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_ulonglong in1 = *(npy_ulonglong *)ip1;
        const npy_ulonglong in2 = *(npy_ulonglong *)ip2;
        *((npy_ulonglong *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
clear_fp:
    // Target of the SIMD fast paths; the scalar path falls through to it too.
    npyv_cleanup();
#endif
#if 0
    // Compiled out (`#if 0`) in this instantiation.
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed variant of the ULONGLONG fmin loop.
 *
 * This function sits inside a `#if !1 || (0 && 1)` region, which is
 * constant-false, so it is not compiled in this instantiation.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] (byte stride steps[1]) and a value from args[2] (byte stride
 * steps[2]).  The element of args[0] at that index (element stride steps[0],
 * in bytes) is replaced by SCALAR_OP(element, value).  A negative index is
 * wrapped once by adding steps[3].  Always returns 0.
 *
 * NOTE(review): no range check is done on the index here; presumably the
 * caller guarantees it is valid -- confirm.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp extent = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp done = 0;

    while (done < count) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        npy_ulonglong *slot;

        // Negative positions count from the end of the indexed axis.
        pos = (pos < 0) ? (pos + extent) : pos;
        slot = (npy_ulonglong *)(base + base_stride * pos);

        {
            const npy_ulonglong current = *slot;
            const npy_ulonglong candidate = *(const npy_ulonglong *)val_ptr;
            *slot = SCALAR_OP(current, candidate);
        }

        idx_ptr += idx_stride;
        val_ptr += val_stride;
        ++done;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)


/*
 * Select the SIMD name suffix used by the BYTE loops that follow.
 *
 * Any earlier TO_SIMD_SFX is dropped first.  The macro is then redefined only
 * when NPY_SIMD is non-zero and NPY_BITSOF_BYTE is one of 8/16/32/64; in that
 * case TO_SIMD_SFX(X) pastes `_s<bits>` onto X.  In every branch the `_f<bits>`
 * and `_u<bits>` alternatives are disabled by constant-0 conditions.
 * If no branch matches, TO_SIMD_SFX stays undefined and the
 * `#ifdef TO_SIMD_SFX` sections of the loops are skipped.
 *
 * NOTE(review): `_s`/`_u`/`_f` presumably denote signed, unsigned and
 * floating-point lanes -- confirm against the SIMD headers.
 */
#line 294
#undef TO_SIMD_SFX
// The empty `#if 0` only anchors the #elif chain below.
#if 0
#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 8
    #if 0
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u8
    #else
        // Only reachable branch for an 8-bit BYTE.
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u16
    #else
        // Only reachable branch for a 16-bit BYTE.
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u32
    #else
        // Only reachable branch for a 32-bit BYTE.
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u64
    #else
        // Only reachable branch for a 64-bit BYTE.
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

/*
 * Generated section: BYTE maximum.
 *
 * The guard below is constant-true (`!0`), so this section is always
 * compiled.  The matching #endif further down labels the template condition
 * as `!fp_only || (is_fp && fp_only)`.
 */
#line 321
#if !0 || (0 && 0)
// Per-element operation shared by both loops of this section.  scalar_max_i is
// the integer macro `((A > B) ? A : B)` from the top of this file; note that
// it may evaluate each of its arguments twice.
#define SCALAR_OP scalar_max_i

/*
 * Binary inner loop: stores SCALAR_OP(in1, in2) for npy_byte operands.
 *
 *   args[0], args[1]  input pointers (ip1, ip2)
 *   args[2]           output pointer (op1)
 *   dimensions[0]     number of elements (len)
 *   steps[0..2]       byte strides of ip1, ip2 and op1
 *   func              unused
 *
 * Strategy, in order:
 *   1. If TO_SIMD_SFX is defined, try the contiguous SIMD kernels (reduce or
 *      binary); when one is taken, control jumps to clear_fp.
 *   2. Unless NPY_DISABLE_OPTIMIZATION is defined, run an unrolled scalar
 *      loop (8 elements per step when reducing, 4 otherwise).
 *   3. Finish the remaining elements one at a time.
 *
 * NOTE(review): IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are
 * defined outside this chunk; the reduce branches below only make sense if
 * IS_BINARY_REDUCE implies that op1 aliases ip1 with a zero stride -- confirm
 * against fast_loop_macros.h.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_maximum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    // Strides are in bytes: they are added directly to the char pointers.
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Count of elements already handled; the tail loop resumes from here.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // SIMD fast paths.  A kernel that is taken receives the full length `len`
    // and then jumps to clear_fp, bypassing all scalar code below.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_byte)) {
            TO_SIMD_SFX(simd_reduce_c_max)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_byte, npy_byte)) {
            TO_SIMD_SFX(simd_binary_ccc_max)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing `&& 0` makes this condition constant-false, so the strided
    // SIMD kernel below is compiled out in this instantiation.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_max)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // i is still 0 here, so this is entered only when len >= 8; shorter
        // inputs are left entirely to the tail loop.
        if((i+elemPerLoop) <= len){
            // Seed eight independent running results from the first eight
            // elements of ip2.
            npy_byte m0 = *((npy_byte *)(ip2 + (i + 0) * is2));
            npy_byte m1 = *((npy_byte *)(ip2 + (i + 1) * is2));
            npy_byte m2 = *((npy_byte *)(ip2 + (i + 2) * is2));
            npy_byte m3 = *((npy_byte *)(ip2 + (i + 3) * is2));
            npy_byte m4 = *((npy_byte *)(ip2 + (i + 4) * is2));
            npy_byte m5 = *((npy_byte *)(ip2 + (i + 5) * is2));
            npy_byte m6 = *((npy_byte *)(ip2 + (i + 6) * is2));
            npy_byte m7 = *((npy_byte *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold each further group of eight elements into m0..m7.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_byte v0 = *((npy_byte *)(ip2 + (i + 0) * is2));
                npy_byte v1 = *((npy_byte *)(ip2 + (i + 1) * is2));
                npy_byte v2 = *((npy_byte *)(ip2 + (i + 2) * is2));
                npy_byte v3 = *((npy_byte *)(ip2 + (i + 3) * is2));
                npy_byte v4 = *((npy_byte *)(ip2 + (i + 4) * is2));
                npy_byte v5 = *((npy_byte *)(ip2 + (i + 5) * is2));
                npy_byte v6 = *((npy_byte *)(ip2 + (i + 6) * is2));
                npy_byte v7 = *((npy_byte *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight partial results pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge with the value already stored at op1.
             *((npy_byte *)op1) = SCALAR_OP(*((npy_byte *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1));
            npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2));
            *((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1));
            npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2));
            *((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1));
            npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2));
            *((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1));
            npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2));
            *((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // Skip past the elements consumed by the unrolled loops above.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // Scalar tail.  This is also the only loop that runs when
    // NPY_DISABLE_OPTIMIZATION is defined and no SIMD path was taken.
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_byte in1 = *(npy_byte *)ip1;
        const npy_byte in2 = *(npy_byte *)ip2;
        *((npy_byte *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
clear_fp:
    // Target of the SIMD fast paths; the scalar path falls through to it too.
    npyv_cleanup();
#endif
#if 0
    // Compiled out (`#if 0`) in this instantiation.
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed variant of the BYTE maximum loop.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] (byte stride steps[1]) and a value from args[2] (byte stride
 * steps[2]).  The element of args[0] at that index (element stride steps[0],
 * in bytes) is replaced by SCALAR_OP(element, value).  A negative index is
 * wrapped once by adding steps[3].  Always returns 0.
 *
 * NOTE(review): no range check is done on the index here; presumably the
 * caller guarantees it is valid -- confirm.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_maximum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp extent = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp done = 0;

    while (done < count) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        npy_byte *slot;

        // Negative positions count from the end of the indexed axis.
        pos = (pos < 0) ? (pos + extent) : pos;
        slot = (npy_byte *)(base + base_stride * pos);

        {
            const npy_byte current = *slot;
            const npy_byte candidate = *(const npy_byte *)val_ptr;
            *slot = SCALAR_OP(current, candidate);
        }

        idx_ptr += idx_stride;
        val_ptr += val_stride;
        ++done;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

/*
 * Generated section: BYTE minimum.
 *
 * The guard below is constant-true (`!0`), so this section is always
 * compiled.  The matching #endif further down labels the template condition
 * as `!fp_only || (is_fp && fp_only)`.
 */
#line 321
#if !0 || (0 && 0)
// Per-element operation shared by both loops of this section.  scalar_min_i is
// the integer macro `((A < B) ? A : B)` from the top of this file; note that
// it may evaluate each of its arguments twice.
#define SCALAR_OP scalar_min_i

/*
 * Binary inner loop: stores SCALAR_OP(in1, in2) for npy_byte operands.
 *
 *   args[0], args[1]  input pointers (ip1, ip2)
 *   args[2]           output pointer (op1)
 *   dimensions[0]     number of elements (len)
 *   steps[0..2]       byte strides of ip1, ip2 and op1
 *   func              unused
 *
 * Strategy, in order:
 *   1. If TO_SIMD_SFX is defined, try the contiguous SIMD kernels (reduce or
 *      binary); when one is taken, control jumps to clear_fp.
 *   2. Unless NPY_DISABLE_OPTIMIZATION is defined, run an unrolled scalar
 *      loop (8 elements per step when reducing, 4 otherwise).
 *   3. Finish the remaining elements one at a time.
 *
 * NOTE(review): IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are
 * defined outside this chunk; the reduce branches below only make sense if
 * IS_BINARY_REDUCE implies that op1 aliases ip1 with a zero stride -- confirm
 * against fast_loop_macros.h.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_minimum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    // Strides are in bytes: they are added directly to the char pointers.
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Count of elements already handled; the tail loop resumes from here.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // SIMD fast paths.  A kernel that is taken receives the full length `len`
    // and then jumps to clear_fp, bypassing all scalar code below.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_byte)) {
            TO_SIMD_SFX(simd_reduce_c_min)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_byte, npy_byte)) {
            TO_SIMD_SFX(simd_binary_ccc_min)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing `&& 0` makes this condition constant-false, so the strided
    // SIMD kernel below is compiled out in this instantiation.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_min)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // i is still 0 here, so this is entered only when len >= 8; shorter
        // inputs are left entirely to the tail loop.
        if((i+elemPerLoop) <= len){
            // Seed eight independent running results from the first eight
            // elements of ip2.
            npy_byte m0 = *((npy_byte *)(ip2 + (i + 0) * is2));
            npy_byte m1 = *((npy_byte *)(ip2 + (i + 1) * is2));
            npy_byte m2 = *((npy_byte *)(ip2 + (i + 2) * is2));
            npy_byte m3 = *((npy_byte *)(ip2 + (i + 3) * is2));
            npy_byte m4 = *((npy_byte *)(ip2 + (i + 4) * is2));
            npy_byte m5 = *((npy_byte *)(ip2 + (i + 5) * is2));
            npy_byte m6 = *((npy_byte *)(ip2 + (i + 6) * is2));
            npy_byte m7 = *((npy_byte *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold each further group of eight elements into m0..m7.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_byte v0 = *((npy_byte *)(ip2 + (i + 0) * is2));
                npy_byte v1 = *((npy_byte *)(ip2 + (i + 1) * is2));
                npy_byte v2 = *((npy_byte *)(ip2 + (i + 2) * is2));
                npy_byte v3 = *((npy_byte *)(ip2 + (i + 3) * is2));
                npy_byte v4 = *((npy_byte *)(ip2 + (i + 4) * is2));
                npy_byte v5 = *((npy_byte *)(ip2 + (i + 5) * is2));
                npy_byte v6 = *((npy_byte *)(ip2 + (i + 6) * is2));
                npy_byte v7 = *((npy_byte *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight partial results pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge with the value already stored at op1.
             *((npy_byte *)op1) = SCALAR_OP(*((npy_byte *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1));
            npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2));
            *((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1));
            npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2));
            *((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1));
            npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2));
            *((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1));
            npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2));
            *((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // Skip past the elements consumed by the unrolled loops above.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // Scalar tail.  This is also the only loop that runs when
    // NPY_DISABLE_OPTIMIZATION is defined and no SIMD path was taken.
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_byte in1 = *(npy_byte *)ip1;
        const npy_byte in2 = *(npy_byte *)ip2;
        *((npy_byte *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
clear_fp:
    // Target of the SIMD fast paths; the scalar path falls through to it too.
    npyv_cleanup();
#endif
#if 0
    // Compiled out (`#if 0`) in this instantiation.
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * BYTE_minimum_indexed: for each of the dimensions[0] (index, value) pairs,
 * replace the npy_byte element of args[0] selected by the index with
 * SCALAR_OP(element, value).
 *
 * args[0] is the base of the data updated in place, args[1] the stream of
 * npy_intp indices and args[2] the stream of npy_byte values.  steps[0..2]
 * are the matching byte strides; steps[3] is added to negative indices so
 * that they wrap around.  No range check is performed on the index here.
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_minimum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_cursor = args[1];
    const char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp extent = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count; ++k) {
        /* Negative indices count from the end. */
        npy_intp pos = *(const npy_intp *)idx_cursor;
        if (pos < 0) {
            pos += extent;
        }

        npy_byte *slot = (npy_byte *)(base + elem_stride * pos);
        const npy_byte operand = *(const npy_byte *)val_cursor;
        *slot = SCALAR_OP(*slot, operand);

        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i

/*
 * BYTE_fmax inner loop over npy_byte operands: out = SCALAR_OP(in1, in2).
 *
 * NOTE: the `#if !1 || (0 && 1)` guard in front of this definition is
 * constant-false in this expansion, so the function is compiled out.
 *
 * args       - args[0] and args[1] are the inputs, args[2] is the output.
 * dimensions - dimensions[0] is the number of elements to process.
 * steps      - byte strides of args[0], args[1] and args[2], in that order.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_fmax)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Number of elements already consumed by the unrolled paths below.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // Vector fast paths: each one handles all `len` elements and then jumps
    // straight to clear_fp, skipping the scalar code.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // NOTE(review): IS_BINARY_REDUCE is defined outside this chunk; the code
    // guarded by it folds the ip2 values into the single value at op1.
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_byte)) {
            TO_SIMD_SFX(simd_reduce_c_maxp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_byte, npy_byte)) {
            TO_SIMD_SFX(simd_binary_ccc_maxp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing "&& 0" keeps this strided vector branch disabled everywhere.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_maxp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // With fewer than 8 elements nothing happens here and the tail loop
        // at the bottom of the function handles everything.
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first eight values.
            npy_byte m0 = *((npy_byte *)(ip2 + (i + 0) * is2));
            npy_byte m1 = *((npy_byte *)(ip2 + (i + 1) * is2));
            npy_byte m2 = *((npy_byte *)(ip2 + (i + 2) * is2));
            npy_byte m3 = *((npy_byte *)(ip2 + (i + 3) * is2));
            npy_byte m4 = *((npy_byte *)(ip2 + (i + 4) * is2));
            npy_byte m5 = *((npy_byte *)(ip2 + (i + 5) * is2));
            npy_byte m6 = *((npy_byte *)(ip2 + (i + 6) * is2));
            npy_byte m7 = *((npy_byte *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold every further full group of eight values into the accumulators.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_byte v0 = *((npy_byte *)(ip2 + (i + 0) * is2));
                npy_byte v1 = *((npy_byte *)(ip2 + (i + 1) * is2));
                npy_byte v2 = *((npy_byte *)(ip2 + (i + 2) * is2));
                npy_byte v3 = *((npy_byte *)(ip2 + (i + 3) * is2));
                npy_byte v4 = *((npy_byte *)(ip2 + (i + 4) * is2));
                npy_byte v5 = *((npy_byte *)(ip2 + (i + 5) * is2));
                npy_byte v6 = *((npy_byte *)(ip2 + (i + 6) * is2));
                npy_byte v7 = *((npy_byte *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Collapse the eight accumulators pairwise into m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge the partial result with the value currently stored at op1.
             *((npy_byte *)op1) = SCALAR_OP(*((npy_byte *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1));
            npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2));
            *((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1));
            npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2));
            *((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1));
            npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2));
            *((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1));
            npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2));
            *((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Skip past what the unrolled code consumed, then finish the remaining
    // elements one at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_byte in1 = *(npy_byte *)ip1;
        const npy_byte in2 = *(npy_byte *)ip2;
        *((npy_byte *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Reached by fall-through and by the goto in every vector fast path.
clear_fp:
    npyv_cleanup();
#endif
// Constant-false in this expansion: the float-status barrier is not emitted.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * BYTE_fmax_indexed: for each of the dimensions[0] (index, value) pairs,
 * replace the npy_byte element of args[0] selected by the index with
 * SCALAR_OP(element, value).
 *
 * args[0] is the base of the data updated in place, args[1] the stream of
 * npy_intp indices and args[2] the stream of npy_byte values.  steps[0..2]
 * are the matching byte strides; steps[3] is added to negative indices so
 * that they wrap around.  No range check is performed on the index here.
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmax_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_cursor = args[1];
    const char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp extent = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count; ++k) {
        /* Negative indices count from the end. */
        npy_intp pos = *(const npy_intp *)idx_cursor;
        if (pos < 0) {
            pos += extent;
        }

        npy_byte *slot = (npy_byte *)(base + elem_stride * pos);
        const npy_byte operand = *(const npy_byte *)val_cursor;
        *slot = SCALAR_OP(*slot, operand);

        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i

/*
 * BYTE_fmin inner loop over npy_byte operands: out = SCALAR_OP(in1, in2).
 *
 * NOTE: the `#if !1 || (0 && 1)` guard in front of this definition is
 * constant-false in this expansion, so the function is compiled out.
 *
 * args       - args[0] and args[1] are the inputs, args[2] is the output.
 * dimensions - dimensions[0] is the number of elements to process.
 * steps      - byte strides of args[0], args[1] and args[2], in that order.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_fmin)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Number of elements already consumed by the unrolled paths below.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // Vector fast paths: each one handles all `len` elements and then jumps
    // straight to clear_fp, skipping the scalar code.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // NOTE(review): IS_BINARY_REDUCE is defined outside this chunk; the code
    // guarded by it folds the ip2 values into the single value at op1.
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_byte)) {
            TO_SIMD_SFX(simd_reduce_c_minp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_byte, npy_byte)) {
            TO_SIMD_SFX(simd_binary_ccc_minp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing "&& 0" keeps this strided vector branch disabled everywhere.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_minp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // With fewer than 8 elements nothing happens here and the tail loop
        // at the bottom of the function handles everything.
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first eight values.
            npy_byte m0 = *((npy_byte *)(ip2 + (i + 0) * is2));
            npy_byte m1 = *((npy_byte *)(ip2 + (i + 1) * is2));
            npy_byte m2 = *((npy_byte *)(ip2 + (i + 2) * is2));
            npy_byte m3 = *((npy_byte *)(ip2 + (i + 3) * is2));
            npy_byte m4 = *((npy_byte *)(ip2 + (i + 4) * is2));
            npy_byte m5 = *((npy_byte *)(ip2 + (i + 5) * is2));
            npy_byte m6 = *((npy_byte *)(ip2 + (i + 6) * is2));
            npy_byte m7 = *((npy_byte *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold every further full group of eight values into the accumulators.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_byte v0 = *((npy_byte *)(ip2 + (i + 0) * is2));
                npy_byte v1 = *((npy_byte *)(ip2 + (i + 1) * is2));
                npy_byte v2 = *((npy_byte *)(ip2 + (i + 2) * is2));
                npy_byte v3 = *((npy_byte *)(ip2 + (i + 3) * is2));
                npy_byte v4 = *((npy_byte *)(ip2 + (i + 4) * is2));
                npy_byte v5 = *((npy_byte *)(ip2 + (i + 5) * is2));
                npy_byte v6 = *((npy_byte *)(ip2 + (i + 6) * is2));
                npy_byte v7 = *((npy_byte *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Collapse the eight accumulators pairwise into m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge the partial result with the value currently stored at op1.
             *((npy_byte *)op1) = SCALAR_OP(*((npy_byte *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1));
            npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2));
            *((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1));
            npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2));
            *((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1));
            npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2));
            *((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1));
            npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2));
            *((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Skip past what the unrolled code consumed, then finish the remaining
    // elements one at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_byte in1 = *(npy_byte *)ip1;
        const npy_byte in2 = *(npy_byte *)ip2;
        *((npy_byte *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Reached by fall-through and by the goto in every vector fast path.
clear_fp:
    npyv_cleanup();
#endif
// Constant-false in this expansion: the float-status barrier is not emitted.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * BYTE_fmin_indexed: for each of the dimensions[0] (index, value) pairs,
 * replace the npy_byte element of args[0] selected by the index with
 * SCALAR_OP(element, value).
 *
 * args[0] is the base of the data updated in place, args[1] the stream of
 * npy_intp indices and args[2] the stream of npy_byte values.  steps[0..2]
 * are the matching byte strides; steps[3] is added to negative indices so
 * that they wrap around.  No range check is performed on the index here.
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmin_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_cursor = args[1];
    const char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp extent = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count; ++k) {
        /* Negative indices count from the end. */
        npy_intp pos = *(const npy_intp *)idx_cursor;
        if (pos < 0) {
            pos += extent;
        }

        npy_byte *slot = (npy_byte *)(base + elem_stride * pos);
        const npy_byte operand = *(const npy_byte *)val_cursor;
        *slot = SCALAR_OP(*slot, operand);

        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)


#line 294
/*
 * Choose the suffix that TO_SIMD_SFX(X) appends to X for the SHORT loops
 * that follow, keyed on NPY_BITSOF_SHORT (8/16/32/64 -> _s8/_s16/_s32/_s64).
 * In every rung the inner `#if 0` and `#elif 0` branches are constant-false,
 * so only the `#else` (signed, _sNN) definition can take effect.
 * If NPY_SIMD is 0 or no width matches, TO_SIMD_SFX stays undefined and the
 * `#ifdef TO_SIMD_SFX` sections of the loops below are skipped.
 */
#undef TO_SIMD_SFX
#if 0
#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 8
    // Disabled branch (constant 0).
    #if 0
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    // Disabled branch (constant 0).
    #elif 0
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 16
    // Disabled branch (constant 0).
    #if 0
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    // Disabled branch (constant 0).
    #elif 0
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 32
    // Disabled branch (constant 0).
    #if 0
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    // Disabled branch (constant 0).
    #elif 0
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 64
    // Disabled branch (constant 0).
    #if 0
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    // Disabled branch (constant 0).
    #elif 0
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i

/*
 * SHORT_maximum inner loop over npy_short operands:
 * out = SCALAR_OP(in1, in2).
 *
 * args       - args[0] and args[1] are the inputs, args[2] is the output.
 * dimensions - dimensions[0] is the number of elements to process.
 * steps      - byte strides of args[0], args[1] and args[2], in that order.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_maximum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Number of elements already consumed by the unrolled paths below.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // Vector fast paths: each one handles all `len` elements and then jumps
    // straight to clear_fp, skipping the scalar code.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // NOTE(review): IS_BINARY_REDUCE is defined outside this chunk; the code
    // guarded by it folds the ip2 values into the single value at op1.
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_short)) {
            TO_SIMD_SFX(simd_reduce_c_max)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_short, npy_short)) {
            TO_SIMD_SFX(simd_binary_ccc_max)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing "&& 0" keeps this strided vector branch disabled everywhere.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_max)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // With fewer than 8 elements nothing happens here and the tail loop
        // at the bottom of the function handles everything.
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first eight values.
            npy_short m0 = *((npy_short *)(ip2 + (i + 0) * is2));
            npy_short m1 = *((npy_short *)(ip2 + (i + 1) * is2));
            npy_short m2 = *((npy_short *)(ip2 + (i + 2) * is2));
            npy_short m3 = *((npy_short *)(ip2 + (i + 3) * is2));
            npy_short m4 = *((npy_short *)(ip2 + (i + 4) * is2));
            npy_short m5 = *((npy_short *)(ip2 + (i + 5) * is2));
            npy_short m6 = *((npy_short *)(ip2 + (i + 6) * is2));
            npy_short m7 = *((npy_short *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold every further full group of eight values into the accumulators.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_short v0 = *((npy_short *)(ip2 + (i + 0) * is2));
                npy_short v1 = *((npy_short *)(ip2 + (i + 1) * is2));
                npy_short v2 = *((npy_short *)(ip2 + (i + 2) * is2));
                npy_short v3 = *((npy_short *)(ip2 + (i + 3) * is2));
                npy_short v4 = *((npy_short *)(ip2 + (i + 4) * is2));
                npy_short v5 = *((npy_short *)(ip2 + (i + 5) * is2));
                npy_short v6 = *((npy_short *)(ip2 + (i + 6) * is2));
                npy_short v7 = *((npy_short *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Collapse the eight accumulators pairwise into m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge the partial result with the value currently stored at op1.
             *((npy_short *)op1) = SCALAR_OP(*((npy_short *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1));
            npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2));
            *((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1));
            npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2));
            *((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1));
            npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2));
            *((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1));
            npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2));
            *((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Skip past what the unrolled code consumed, then finish the remaining
    // elements one at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_short in1 = *(npy_short *)ip1;
        const npy_short in2 = *(npy_short *)ip2;
        *((npy_short *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Reached by fall-through and by the goto in every vector fast path.
clear_fp:
    npyv_cleanup();
#endif
// Constant-false in this expansion: the float-status barrier is not emitted.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * SHORT_maximum_indexed: for each of the dimensions[0] (index, value) pairs,
 * replace the npy_short element of args[0] selected by the index with
 * SCALAR_OP(element, value).
 *
 * args[0] is the base of the data updated in place, args[1] the stream of
 * npy_intp indices and args[2] the stream of npy_short values.  steps[0..2]
 * are the matching byte strides; steps[3] is added to negative indices so
 * that they wrap around.  No range check is performed on the index here.
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_maximum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_cursor = args[1];
    const char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp extent = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count; ++k) {
        /* Negative indices count from the end. */
        npy_intp pos = *(const npy_intp *)idx_cursor;
        if (pos < 0) {
            pos += extent;
        }

        npy_short *slot = (npy_short *)(base + elem_stride * pos);
        const npy_short operand = *(const npy_short *)val_cursor;
        *slot = SCALAR_OP(*slot, operand);

        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i

/*
 * SHORT_minimum inner loop over npy_short operands:
 * out = SCALAR_OP(in1, in2).
 *
 * args       - args[0] and args[1] are the inputs, args[2] is the output.
 * dimensions - dimensions[0] is the number of elements to process.
 * steps      - byte strides of args[0], args[1] and args[2], in that order.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_minimum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Number of elements already consumed by the unrolled paths below.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // Vector fast paths: each one handles all `len` elements and then jumps
    // straight to clear_fp, skipping the scalar code.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // NOTE(review): IS_BINARY_REDUCE is defined outside this chunk; the code
    // guarded by it folds the ip2 values into the single value at op1.
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_short)) {
            TO_SIMD_SFX(simd_reduce_c_min)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_short, npy_short)) {
            TO_SIMD_SFX(simd_binary_ccc_min)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing "&& 0" keeps this strided vector branch disabled everywhere.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_min)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // With fewer than 8 elements nothing happens here and the tail loop
        // at the bottom of the function handles everything.
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first eight values.
            npy_short m0 = *((npy_short *)(ip2 + (i + 0) * is2));
            npy_short m1 = *((npy_short *)(ip2 + (i + 1) * is2));
            npy_short m2 = *((npy_short *)(ip2 + (i + 2) * is2));
            npy_short m3 = *((npy_short *)(ip2 + (i + 3) * is2));
            npy_short m4 = *((npy_short *)(ip2 + (i + 4) * is2));
            npy_short m5 = *((npy_short *)(ip2 + (i + 5) * is2));
            npy_short m6 = *((npy_short *)(ip2 + (i + 6) * is2));
            npy_short m7 = *((npy_short *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold every further full group of eight values into the accumulators.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_short v0 = *((npy_short *)(ip2 + (i + 0) * is2));
                npy_short v1 = *((npy_short *)(ip2 + (i + 1) * is2));
                npy_short v2 = *((npy_short *)(ip2 + (i + 2) * is2));
                npy_short v3 = *((npy_short *)(ip2 + (i + 3) * is2));
                npy_short v4 = *((npy_short *)(ip2 + (i + 4) * is2));
                npy_short v5 = *((npy_short *)(ip2 + (i + 5) * is2));
                npy_short v6 = *((npy_short *)(ip2 + (i + 6) * is2));
                npy_short v7 = *((npy_short *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Collapse the eight accumulators pairwise into m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge the partial result with the value currently stored at op1.
             *((npy_short *)op1) = SCALAR_OP(*((npy_short *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1));
            npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2));
            *((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1));
            npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2));
            *((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1));
            npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2));
            *((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1));
            npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2));
            *((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Skip past what the unrolled code consumed, then finish the remaining
    // elements one at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_short in1 = *(npy_short *)ip1;
        const npy_short in2 = *(npy_short *)ip2;
        *((npy_short *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Reached by fall-through and by the goto in every vector fast path.
clear_fp:
    npyv_cleanup();
#endif
// Constant-false in this expansion: the float-status barrier is not emitted.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * SHORT_minimum_indexed: for each of the dimensions[0] (index, value) pairs,
 * replace the npy_short element of args[0] selected by the index with
 * SCALAR_OP(element, value).
 *
 * args[0] is the base of the data updated in place, args[1] the stream of
 * npy_intp indices and args[2] the stream of npy_short values.  steps[0..2]
 * are the matching byte strides; steps[3] is added to negative indices so
 * that they wrap around.  No range check is performed on the index here.
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_minimum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_cursor = args[1];
    const char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp extent = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count; ++k) {
        /* Negative indices count from the end. */
        npy_intp pos = *(const npy_intp *)idx_cursor;
        if (pos < 0) {
            pos += extent;
        }

        npy_short *slot = (npy_short *)(base + elem_stride * pos);
        const npy_short operand = *(const npy_short *)val_cursor;
        *slot = SCALAR_OP(*slot, operand);

        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i

/*
 * SHORT_fmax inner loop over npy_short operands: out = SCALAR_OP(in1, in2).
 *
 * NOTE: the `#if !1 || (0 && 1)` guard in front of this definition is
 * constant-false in this expansion, so the function is compiled out.
 *
 * args       - args[0] and args[1] are the inputs, args[2] is the output.
 * dimensions - dimensions[0] is the number of elements to process.
 * steps      - byte strides of args[0], args[1] and args[2], in that order.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_fmax)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Number of elements already consumed by the unrolled paths below.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // Vector fast paths: each one handles all `len` elements and then jumps
    // straight to clear_fp, skipping the scalar code.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // NOTE(review): IS_BINARY_REDUCE is defined outside this chunk; the code
    // guarded by it folds the ip2 values into the single value at op1.
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_short)) {
            TO_SIMD_SFX(simd_reduce_c_maxp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_short, npy_short)) {
            TO_SIMD_SFX(simd_binary_ccc_maxp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing "&& 0" keeps this strided vector branch disabled everywhere.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_maxp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // With fewer than 8 elements nothing happens here and the tail loop
        // at the bottom of the function handles everything.
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first eight values.
            npy_short m0 = *((npy_short *)(ip2 + (i + 0) * is2));
            npy_short m1 = *((npy_short *)(ip2 + (i + 1) * is2));
            npy_short m2 = *((npy_short *)(ip2 + (i + 2) * is2));
            npy_short m3 = *((npy_short *)(ip2 + (i + 3) * is2));
            npy_short m4 = *((npy_short *)(ip2 + (i + 4) * is2));
            npy_short m5 = *((npy_short *)(ip2 + (i + 5) * is2));
            npy_short m6 = *((npy_short *)(ip2 + (i + 6) * is2));
            npy_short m7 = *((npy_short *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold every further full group of eight values into the accumulators.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_short v0 = *((npy_short *)(ip2 + (i + 0) * is2));
                npy_short v1 = *((npy_short *)(ip2 + (i + 1) * is2));
                npy_short v2 = *((npy_short *)(ip2 + (i + 2) * is2));
                npy_short v3 = *((npy_short *)(ip2 + (i + 3) * is2));
                npy_short v4 = *((npy_short *)(ip2 + (i + 4) * is2));
                npy_short v5 = *((npy_short *)(ip2 + (i + 5) * is2));
                npy_short v6 = *((npy_short *)(ip2 + (i + 6) * is2));
                npy_short v7 = *((npy_short *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Collapse the eight accumulators pairwise into m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge the partial result with the value currently stored at op1.
             *((npy_short *)op1) = SCALAR_OP(*((npy_short *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1));
            npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2));
            *((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1));
            npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2));
            *((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1));
            npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2));
            *((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1));
            npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2));
            *((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Skip past what the unrolled code consumed, then finish the remaining
    // elements one at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_short in1 = *(npy_short *)ip1;
        const npy_short in2 = *(npy_short *)ip2;
        *((npy_short *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Reached by fall-through and by the goto in every vector fast path.
clear_fp:
    npyv_cleanup();
#endif
// Constant-false in this expansion: the float-status barrier is not emitted.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed variant of the SHORT_fmax loop.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] (byte stride steps[1]) and an npy_short value from args[2] (byte
 * stride steps[2]).  A negative index is wrapped once by adding steps[3].
 * The npy_short located at args[0] + steps[0] * index is then replaced by
 * SCALAR_OP(element, value).
 *
 * No range check is performed on the index in this function.
 * NOTE(review): presumably the caller validates indices -- confirm.
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmax_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k = 0;

    while (k < count) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        npy_short *slot;

        // a negative index counts from the end: shift it by the wrap length
        pos = (pos < 0) ? pos + wrap : pos;
        slot = (npy_short *)(base + elem_stride * pos);
        *slot = SCALAR_OP(*slot, *(npy_short *)val_cursor);

        idx_cursor += idx_stride;
        val_cursor += val_stride;
        ++k;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i

/*
 * Inner loop for SHORT_fmin: stores SCALAR_OP(in1, in2) for dimensions[0]
 * npy_short elements.  SCALAR_OP is scalar_minp_i for this instantiation.
 *
 * NOTE: this definition sits inside `#if !1 || (0 && 1)`, which evaluates to
 * 0, so the preprocessor drops it.
 *
 *   args[0], args[1]  input pointers (ip1, ip2)
 *   args[2]           output pointer (op1)
 *   dimensions[0]     element count (len)
 *   steps[0..2]       byte strides of ip1, ip2 and op1
 *
 * Paths, tried in order:
 *   1. (TO_SIMD_SFX defined) a contiguous reduction, or a contiguous binary
 *      operation whose inputs do not overlap the output, is handed to the
 *      simd_* helpers and control jumps to clear_fp.
 *   2. (NPY_DISABLE_OPTIMIZATION not defined) manually unrolled scalar loops.
 *   3. A plain scalar loop that finishes whatever elements are left.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_fmin)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already processed by the unrolled paths
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_short)) {
            TO_SIMD_SFX(simd_reduce_c_minp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_short, npy_short)) {
            TO_SIMD_SFX(simd_binary_ccc_minp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing `&& 0` makes this condition constant-false, so the strided
    // SIMD branch below is never compiled for this instantiation.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_minp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first eight
            // elements of ip2.
            npy_short m0 = *((npy_short *)(ip2 + (i + 0) * is2));
            npy_short m1 = *((npy_short *)(ip2 + (i + 1) * is2));
            npy_short m2 = *((npy_short *)(ip2 + (i + 2) * is2));
            npy_short m3 = *((npy_short *)(ip2 + (i + 3) * is2));
            npy_short m4 = *((npy_short *)(ip2 + (i + 4) * is2));
            npy_short m5 = *((npy_short *)(ip2 + (i + 5) * is2));
            npy_short m6 = *((npy_short *)(ip2 + (i + 6) * is2));
            npy_short m7 = *((npy_short *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Consume further full groups of eight, one element per
            // accumulator.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_short v0 = *((npy_short *)(ip2 + (i + 0) * is2));
                npy_short v1 = *((npy_short *)(ip2 + (i + 1) * is2));
                npy_short v2 = *((npy_short *)(ip2 + (i + 2) * is2));
                npy_short v3 = *((npy_short *)(ip2 + (i + 3) * is2));
                npy_short v4 = *((npy_short *)(ip2 + (i + 4) * is2));
                npy_short v5 = *((npy_short *)(ip2 + (i + 5) * is2));
                npy_short v6 = *((npy_short *)(ip2 + (i + 6) * is2));
                npy_short v7 = *((npy_short *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Pairwise-combine the eight partial results down into m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Fold the partial result into the value already stored at op1.
             *((npy_short *)op1) = SCALAR_OP(*((npy_short *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1));
            npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2));
            *((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1));
            npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2));
            *((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1));
            npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2));
            *((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1));
            npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2));
            *((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Skip the pointers past the i elements handled above (i is still 0 if no
    // unrolled loop ran), then finish the remainder one element at a time.
    // NOTE(review): on the reduce path this tail reads the running result
    // through ip1, which presumably aliases op1 with a zero stride -- confirm
    // against the IS_BINARY_REDUCE definition.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_short in1 = *(npy_short *)ip1;
        const npy_short in2 = *(npy_short *)ip2;
        *((npy_short *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Landing point for the SIMD paths; the scalar paths fall through to it too.
clear_fp:
    npyv_cleanup();
#endif
// Constant-false guard: the float-status barrier is compiled out here.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed variant of the SHORT_fmin loop.
 *
 * NOTE: this definition sits inside `#if !1 || (0 && 1)`, which evaluates to
 * 0, so the preprocessor drops it.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] (byte stride steps[1]) and an npy_short value from args[2] (byte
 * stride steps[2]).  A negative index is wrapped once by adding steps[3].
 * The npy_short located at args[0] + steps[0] * index is then replaced by
 * SCALAR_OP(element, value).
 *
 * No range check is performed on the index in this function.
 * NOTE(review): presumably the caller validates indices -- confirm.
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmin_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k = 0;

    while (k < count) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        npy_short *slot;

        // a negative index counts from the end: shift it by the wrap length
        pos = (pos < 0) ? pos + wrap : pos;
        slot = (npy_short *)(base + elem_stride * pos);
        *slot = SCALAR_OP(*slot, *(npy_short *)val_cursor);

        idx_cursor += idx_stride;
        val_cursor += val_stride;
        ++k;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)


/*
 * Select the SIMD suffix macro for the npy_int loops that follow.
 *
 * TO_SIMD_SFX is first undefined, then redefined to append a signed-integer
 * lane suffix (_s8, _s16, _s32 or _s64) matching NPY_BITSOF_INT, provided
 * NPY_SIMD is non-zero.  Inside each width branch the `#if 0` / `#elif 0`
 * alternatives (the _f<N> and _u<N> suffixes) are constant-false, so only the
 * signed `#else` arm is live.  If NPY_SIMD is zero or no width matches,
 * TO_SIMD_SFX stays undefined and the loops skip their `#ifdef TO_SIMD_SFX`
 * sections.
 */
#line 294
#undef TO_SIMD_SFX
#if 0
#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 8
    #if 0
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i

/*
 * Inner loop for INT_maximum: stores SCALAR_OP(in1, in2) for dimensions[0]
 * npy_int elements.  SCALAR_OP is scalar_max_i for this instantiation.
 *
 *   args[0], args[1]  input pointers (ip1, ip2)
 *   args[2]           output pointer (op1)
 *   dimensions[0]     element count (len)
 *   steps[0..2]       byte strides of ip1, ip2 and op1
 *
 * Paths, tried in order:
 *   1. (TO_SIMD_SFX defined) a contiguous reduction, or a contiguous binary
 *      operation whose inputs do not overlap the output, is handed to the
 *      simd_* helpers and control jumps to clear_fp.
 *   2. (NPY_DISABLE_OPTIMIZATION not defined) manually unrolled scalar loops.
 *   3. A plain scalar loop that finishes whatever elements are left.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_maximum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already processed by the unrolled paths
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_int)) {
            TO_SIMD_SFX(simd_reduce_c_max)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_int, npy_int)) {
            TO_SIMD_SFX(simd_binary_ccc_max)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing `&& 0` makes this condition constant-false, so the strided
    // SIMD branch below is never compiled for this instantiation.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_max)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first eight
            // elements of ip2.
            npy_int m0 = *((npy_int *)(ip2 + (i + 0) * is2));
            npy_int m1 = *((npy_int *)(ip2 + (i + 1) * is2));
            npy_int m2 = *((npy_int *)(ip2 + (i + 2) * is2));
            npy_int m3 = *((npy_int *)(ip2 + (i + 3) * is2));
            npy_int m4 = *((npy_int *)(ip2 + (i + 4) * is2));
            npy_int m5 = *((npy_int *)(ip2 + (i + 5) * is2));
            npy_int m6 = *((npy_int *)(ip2 + (i + 6) * is2));
            npy_int m7 = *((npy_int *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Consume further full groups of eight, one element per
            // accumulator.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_int v0 = *((npy_int *)(ip2 + (i + 0) * is2));
                npy_int v1 = *((npy_int *)(ip2 + (i + 1) * is2));
                npy_int v2 = *((npy_int *)(ip2 + (i + 2) * is2));
                npy_int v3 = *((npy_int *)(ip2 + (i + 3) * is2));
                npy_int v4 = *((npy_int *)(ip2 + (i + 4) * is2));
                npy_int v5 = *((npy_int *)(ip2 + (i + 5) * is2));
                npy_int v6 = *((npy_int *)(ip2 + (i + 6) * is2));
                npy_int v7 = *((npy_int *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Pairwise-combine the eight partial results down into m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Fold the partial result into the value already stored at op1.
             *((npy_int *)op1) = SCALAR_OP(*((npy_int *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1));
            npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2));
            *((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1));
            npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2));
            *((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1));
            npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2));
            *((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1));
            npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2));
            *((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Skip the pointers past the i elements handled above (i is still 0 if no
    // unrolled loop ran), then finish the remainder one element at a time.
    // NOTE(review): on the reduce path this tail reads the running result
    // through ip1, which presumably aliases op1 with a zero stride -- confirm
    // against the IS_BINARY_REDUCE definition.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_int in1 = *(npy_int *)ip1;
        const npy_int in2 = *(npy_int *)ip2;
        *((npy_int *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Landing point for the SIMD paths; the scalar paths fall through to it too.
clear_fp:
    npyv_cleanup();
#endif
// Constant-false guard: the float-status barrier is compiled out here.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed variant of the INT_maximum loop.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] (byte stride steps[1]) and an npy_int value from args[2] (byte
 * stride steps[2]).  A negative index is wrapped once by adding steps[3].
 * The npy_int located at args[0] + steps[0] * index is then replaced by
 * SCALAR_OP(element, value), i.e. the larger of the two.
 *
 * No range check is performed on the index in this function.
 * NOTE(review): presumably the caller validates indices -- confirm.
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_maximum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k = 0;

    while (k < count) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        npy_int *slot;

        // a negative index counts from the end: shift it by the wrap length
        pos = (pos < 0) ? pos + wrap : pos;
        slot = (npy_int *)(base + elem_stride * pos);
        *slot = SCALAR_OP(*slot, *(npy_int *)val_cursor);

        idx_cursor += idx_stride;
        val_cursor += val_stride;
        ++k;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i

/*
 * Inner loop for INT_minimum: stores SCALAR_OP(in1, in2) for dimensions[0]
 * npy_int elements.  SCALAR_OP is scalar_min_i for this instantiation.
 *
 *   args[0], args[1]  input pointers (ip1, ip2)
 *   args[2]           output pointer (op1)
 *   dimensions[0]     element count (len)
 *   steps[0..2]       byte strides of ip1, ip2 and op1
 *
 * Paths, tried in order:
 *   1. (TO_SIMD_SFX defined) a contiguous reduction, or a contiguous binary
 *      operation whose inputs do not overlap the output, is handed to the
 *      simd_* helpers and control jumps to clear_fp.
 *   2. (NPY_DISABLE_OPTIMIZATION not defined) manually unrolled scalar loops.
 *   3. A plain scalar loop that finishes whatever elements are left.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_minimum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already processed by the unrolled paths
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_int)) {
            TO_SIMD_SFX(simd_reduce_c_min)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_int, npy_int)) {
            TO_SIMD_SFX(simd_binary_ccc_min)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing `&& 0` makes this condition constant-false, so the strided
    // SIMD branch below is never compiled for this instantiation.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_min)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first eight
            // elements of ip2.
            npy_int m0 = *((npy_int *)(ip2 + (i + 0) * is2));
            npy_int m1 = *((npy_int *)(ip2 + (i + 1) * is2));
            npy_int m2 = *((npy_int *)(ip2 + (i + 2) * is2));
            npy_int m3 = *((npy_int *)(ip2 + (i + 3) * is2));
            npy_int m4 = *((npy_int *)(ip2 + (i + 4) * is2));
            npy_int m5 = *((npy_int *)(ip2 + (i + 5) * is2));
            npy_int m6 = *((npy_int *)(ip2 + (i + 6) * is2));
            npy_int m7 = *((npy_int *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Consume further full groups of eight, one element per
            // accumulator.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_int v0 = *((npy_int *)(ip2 + (i + 0) * is2));
                npy_int v1 = *((npy_int *)(ip2 + (i + 1) * is2));
                npy_int v2 = *((npy_int *)(ip2 + (i + 2) * is2));
                npy_int v3 = *((npy_int *)(ip2 + (i + 3) * is2));
                npy_int v4 = *((npy_int *)(ip2 + (i + 4) * is2));
                npy_int v5 = *((npy_int *)(ip2 + (i + 5) * is2));
                npy_int v6 = *((npy_int *)(ip2 + (i + 6) * is2));
                npy_int v7 = *((npy_int *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Pairwise-combine the eight partial results down into m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Fold the partial result into the value already stored at op1.
             *((npy_int *)op1) = SCALAR_OP(*((npy_int *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1));
            npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2));
            *((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1));
            npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2));
            *((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1));
            npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2));
            *((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1));
            npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2));
            *((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Skip the pointers past the i elements handled above (i is still 0 if no
    // unrolled loop ran), then finish the remainder one element at a time.
    // NOTE(review): on the reduce path this tail reads the running result
    // through ip1, which presumably aliases op1 with a zero stride -- confirm
    // against the IS_BINARY_REDUCE definition.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_int in1 = *(npy_int *)ip1;
        const npy_int in2 = *(npy_int *)ip2;
        *((npy_int *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Landing point for the SIMD paths; the scalar paths fall through to it too.
clear_fp:
    npyv_cleanup();
#endif
// Constant-false guard: the float-status barrier is compiled out here.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed variant of the INT_minimum loop.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] (byte stride steps[1]) and an npy_int value from args[2] (byte
 * stride steps[2]).  A negative index is wrapped once by adding steps[3].
 * The npy_int located at args[0] + steps[0] * index is then replaced by
 * SCALAR_OP(element, value), i.e. the smaller of the two.
 *
 * No range check is performed on the index in this function.
 * NOTE(review): presumably the caller validates indices -- confirm.
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_minimum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k = 0;

    while (k < count) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        npy_int *slot;

        // a negative index counts from the end: shift it by the wrap length
        pos = (pos < 0) ? pos + wrap : pos;
        slot = (npy_int *)(base + elem_stride * pos);
        *slot = SCALAR_OP(*slot, *(npy_int *)val_cursor);

        idx_cursor += idx_stride;
        val_cursor += val_stride;
        ++k;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i

/*
 * Inner loop for INT_fmax: stores SCALAR_OP(in1, in2) for dimensions[0]
 * npy_int elements.  SCALAR_OP is scalar_maxp_i for this instantiation.
 *
 * NOTE: this definition sits inside `#if !1 || (0 && 1)`, which evaluates to
 * 0, so the preprocessor drops it.
 *
 *   args[0], args[1]  input pointers (ip1, ip2)
 *   args[2]           output pointer (op1)
 *   dimensions[0]     element count (len)
 *   steps[0..2]       byte strides of ip1, ip2 and op1
 *
 * Paths, tried in order:
 *   1. (TO_SIMD_SFX defined) a contiguous reduction, or a contiguous binary
 *      operation whose inputs do not overlap the output, is handed to the
 *      simd_* helpers and control jumps to clear_fp.
 *   2. (NPY_DISABLE_OPTIMIZATION not defined) manually unrolled scalar loops.
 *   3. A plain scalar loop that finishes whatever elements are left.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_fmax)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already processed by the unrolled paths
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_int)) {
            TO_SIMD_SFX(simd_reduce_c_maxp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_int, npy_int)) {
            TO_SIMD_SFX(simd_binary_ccc_maxp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // The trailing `&& 0` makes this condition constant-false, so the strided
    // SIMD branch below is never compiled for this instantiation.
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_maxp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first eight
            // elements of ip2.
            npy_int m0 = *((npy_int *)(ip2 + (i + 0) * is2));
            npy_int m1 = *((npy_int *)(ip2 + (i + 1) * is2));
            npy_int m2 = *((npy_int *)(ip2 + (i + 2) * is2));
            npy_int m3 = *((npy_int *)(ip2 + (i + 3) * is2));
            npy_int m4 = *((npy_int *)(ip2 + (i + 4) * is2));
            npy_int m5 = *((npy_int *)(ip2 + (i + 5) * is2));
            npy_int m6 = *((npy_int *)(ip2 + (i + 6) * is2));
            npy_int m7 = *((npy_int *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Consume further full groups of eight, one element per
            // accumulator.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_int v0 = *((npy_int *)(ip2 + (i + 0) * is2));
                npy_int v1 = *((npy_int *)(ip2 + (i + 1) * is2));
                npy_int v2 = *((npy_int *)(ip2 + (i + 2) * is2));
                npy_int v3 = *((npy_int *)(ip2 + (i + 3) * is2));
                npy_int v4 = *((npy_int *)(ip2 + (i + 4) * is2));
                npy_int v5 = *((npy_int *)(ip2 + (i + 5) * is2));
                npy_int v6 = *((npy_int *)(ip2 + (i + 6) * is2));
                npy_int v7 = *((npy_int *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Pairwise-combine the eight partial results down into m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Fold the partial result into the value already stored at op1.
             *((npy_int *)op1) = SCALAR_OP(*((npy_int *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1));
            npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2));
            *((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1));
            npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2));
            *((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1));
            npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2));
            *((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1));
            npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2));
            *((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Skip the pointers past the i elements handled above (i is still 0 if no
    // unrolled loop ran), then finish the remainder one element at a time.
    // NOTE(review): on the reduce path this tail reads the running result
    // through ip1, which presumably aliases op1 with a zero stride -- confirm
    // against the IS_BINARY_REDUCE definition.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_int in1 = *(npy_int *)ip1;
        const npy_int in2 = *(npy_int *)ip2;
        *((npy_int *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Landing point for the SIMD paths; the scalar paths fall through to it too.
clear_fp:
    npyv_cleanup();
#endif
// Constant-false guard: the float-status barrier is compiled out here.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed variant of the INT_fmax loop.
 *
 * NOTE: this definition sits inside `#if !1 || (0 && 1)`, which evaluates to
 * 0, so the preprocessor drops it.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] (byte stride steps[1]) and an npy_int value from args[2] (byte
 * stride steps[2]).  A negative index is wrapped once by adding steps[3].
 * The npy_int located at args[0] + steps[0] * index is then replaced by
 * SCALAR_OP(element, value).
 *
 * No range check is performed on the index in this function.
 * NOTE(review): presumably the caller validates indices -- confirm.
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmax_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k = 0;

    while (k < count) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        npy_int *slot;

        // a negative index counts from the end: shift it by the wrap length
        pos = (pos < 0) ? pos + wrap : pos;
        slot = (npy_int *)(base + elem_stride * pos);
        *slot = SCALAR_OP(*slot, *(npy_int *)val_cursor);

        idx_cursor += idx_stride;
        val_cursor += val_stride;
        ++k;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i

/*
 * Binary loop named INT_fmin over npy_int operands; every element pair is
 * combined with SCALAR_OP (defined as scalar_minp_i for this instantiation).
 *
 * args[0] and args[1] are the input operands and args[2] is the output;
 * steps[0..2] are the matching byte strides and dimensions[0] is the element
 * count. The last parameter is unused.
 *
 * Three tiers are tried in order:
 *   1. if TO_SIMD_SFX is defined, contiguous reduce / contiguous binary
 *      layouts are handed to the simd_* helpers and control jumps straight
 *      to the cleanup label;
 *   2. unless NPY_DISABLE_OPTIMIZATION is defined, hand-unrolled scalar loops
 *      consume whole groups of 8 (reduce) or 4 (element-wise) elements;
 *   3. a plain strided loop finishes whatever is left from index i onward.
 *
 * NOTE: this instantiation sits under the constant-false guard
 * `#if !1 || (0 && 1)` opened just above, so the preprocessor discards it.
 * NOTE(review): IS_BINARY_REDUCE / IS_BINARY_CONT are not defined in this
 * file (the include comment points at fast_loop_macros.h); presumably they
 * inspect args/steps directly -- confirm before renaming any local here.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_fmin)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // count of elements already processed; the tail loop resumes from here
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE expands to npyv_lanetype with the suffix selected by TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_int)) {
            TO_SIMD_SFX(simd_reduce_c_minp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_int, npy_int)) {
            TO_SIMD_SFX(simd_binary_ccc_minp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // (the trailing `&& 0` compiles this strided SIMD path out entirely here)
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_minp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // only entered when at least one full group of 8 elements exists
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators from the first eight
            // elements of the second operand
            npy_int m0 = *((npy_int *)(ip2 + (i + 0) * is2));
            npy_int m1 = *((npy_int *)(ip2 + (i + 1) * is2));
            npy_int m2 = *((npy_int *)(ip2 + (i + 2) * is2));
            npy_int m3 = *((npy_int *)(ip2 + (i + 3) * is2));
            npy_int m4 = *((npy_int *)(ip2 + (i + 4) * is2));
            npy_int m5 = *((npy_int *)(ip2 + (i + 5) * is2));
            npy_int m6 = *((npy_int *)(ip2 + (i + 6) * is2));
            npy_int m7 = *((npy_int *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold every further full group of 8 into the matching accumulator
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_int v0 = *((npy_int *)(ip2 + (i + 0) * is2));
                npy_int v1 = *((npy_int *)(ip2 + (i + 1) * is2));
                npy_int v2 = *((npy_int *)(ip2 + (i + 2) * is2));
                npy_int v3 = *((npy_int *)(ip2 + (i + 3) * is2));
                npy_int v4 = *((npy_int *)(ip2 + (i + 4) * is2));
                npy_int v5 = *((npy_int *)(ip2 + (i + 5) * is2));
                npy_int v6 = *((npy_int *)(ip2 + (i + 6) * is2));
                npy_int v7 = *((npy_int *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // combine the eight accumulators pairwise down to m0
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge with the value already stored in the output slot
             *((npy_int *)op1) = SCALAR_OP(*((npy_int *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1));
            npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2));
            *((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1));
            npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2));
            *((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1));
            npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2));
            *((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1));
            npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2));
            *((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // advance all three cursors by the i elements consumed above (i may be 0)
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // one-element-at-a-time loop for whatever remains
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_int in1 = *(npy_int *)ip1;
        const npy_int in2 = *(npy_int *)ip2;
        *((npy_int *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// landing point for the SIMD fast paths, which skip the scalar code above
clear_fp:
    npyv_cleanup();
#endif
// the float-status reset below is compiled out (`#if 0`) in this instantiation
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place update for INT_fmin: for each of the dimensions[0]
 * entries, read an npy_intp index from args[1] and an npy_int operand from
 * args[2], then overwrite the addressed element of args[0] with
 * SCALAR_OP(element, operand).
 *
 * steps[0..2] are the byte strides of the target, index and operand streams.
 * steps[3] is added to negative indices before use; no other range check is
 * performed here. Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmin_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const target = args[0];
    char *idx_cursor = args[1];
    char *operand_cursor = args[2];
    const npy_intp target_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp operand_stride = steps[2];
    const npy_intp count = dimensions[0];
    const npy_intp wrap = steps[3];
    npy_intp done = 0;

    while (done < count) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        // shift negative indices by steps[3]
        if (pos < 0) {
            pos += wrap;
        }

        npy_int *slot = (npy_int *)(target + target_stride * pos);
        const npy_int current = *slot;
        const npy_int operand = *(npy_int *)operand_cursor;
        *slot = SCALAR_OP(current, operand);

        idx_cursor += idx_stride;
        operand_cursor += operand_stride;
        ++done;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)


// Select the suffix that TO_SIMD_SFX(X) pastes onto X for the npy_long loops
// that follow. The choice is keyed on NPY_BITSOF_LONG and every branch also
// requires NPY_SIMD to be non-zero; if no branch matches, TO_SIMD_SFX stays
// undefined after the #undef below and the loops skip their
// `#ifdef TO_SIMD_SFX` sections.
//
// Inside each width branch the `#if 0` (float, _f<bits>) and `#elif 0`
// (unsigned, _u<bits>) arms are constant-false, so only the signed
// `_s<bits>` definition in the `#else` arm can take effect.
#line 294
#undef TO_SIMD_SFX
#if 0
#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 8
    #if 0
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i

/*
 * Binary loop named LONG_maximum over npy_long operands; every element pair
 * is combined with SCALAR_OP (defined as scalar_max_i for this instantiation).
 *
 * args[0] and args[1] are the input operands and args[2] is the output;
 * steps[0..2] are the matching byte strides and dimensions[0] is the element
 * count. The last parameter is unused.
 *
 * Three tiers are tried in order:
 *   1. if TO_SIMD_SFX is defined, contiguous reduce / contiguous binary
 *      layouts are handed to the simd_* helpers and control jumps straight
 *      to the cleanup label;
 *   2. unless NPY_DISABLE_OPTIMIZATION is defined, hand-unrolled scalar loops
 *      consume whole groups of 8 (reduce) or 4 (element-wise) elements;
 *   3. a plain strided loop finishes whatever is left from index i onward.
 *
 * NOTE(review): IS_BINARY_REDUCE / IS_BINARY_CONT are not defined in this
 * file (the include comment points at fast_loop_macros.h); presumably they
 * inspect args/steps directly -- confirm before renaming any local here.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_maximum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // count of elements already processed; the tail loop resumes from here
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE expands to npyv_lanetype with the suffix selected by TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_long)) {
            TO_SIMD_SFX(simd_reduce_c_max)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_long, npy_long)) {
            TO_SIMD_SFX(simd_binary_ccc_max)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // (the trailing `&& 0` compiles this strided SIMD path out entirely here)
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_max)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // only entered when at least one full group of 8 elements exists
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators from the first eight
            // elements of the second operand
            npy_long m0 = *((npy_long *)(ip2 + (i + 0) * is2));
            npy_long m1 = *((npy_long *)(ip2 + (i + 1) * is2));
            npy_long m2 = *((npy_long *)(ip2 + (i + 2) * is2));
            npy_long m3 = *((npy_long *)(ip2 + (i + 3) * is2));
            npy_long m4 = *((npy_long *)(ip2 + (i + 4) * is2));
            npy_long m5 = *((npy_long *)(ip2 + (i + 5) * is2));
            npy_long m6 = *((npy_long *)(ip2 + (i + 6) * is2));
            npy_long m7 = *((npy_long *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold every further full group of 8 into the matching accumulator
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_long v0 = *((npy_long *)(ip2 + (i + 0) * is2));
                npy_long v1 = *((npy_long *)(ip2 + (i + 1) * is2));
                npy_long v2 = *((npy_long *)(ip2 + (i + 2) * is2));
                npy_long v3 = *((npy_long *)(ip2 + (i + 3) * is2));
                npy_long v4 = *((npy_long *)(ip2 + (i + 4) * is2));
                npy_long v5 = *((npy_long *)(ip2 + (i + 5) * is2));
                npy_long v6 = *((npy_long *)(ip2 + (i + 6) * is2));
                npy_long v7 = *((npy_long *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // combine the eight accumulators pairwise down to m0
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge with the value already stored in the output slot
             *((npy_long *)op1) = SCALAR_OP(*((npy_long *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1));
            npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2));
            *((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1));
            npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2));
            *((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1));
            npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2));
            *((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1));
            npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2));
            *((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // advance all three cursors by the i elements consumed above (i may be 0)
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // one-element-at-a-time loop for whatever remains
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_long in1 = *(npy_long *)ip1;
        const npy_long in2 = *(npy_long *)ip2;
        *((npy_long *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// landing point for the SIMD fast paths, which skip the scalar code above
clear_fp:
    npyv_cleanup();
#endif
// the float-status reset below is compiled out (`#if 0`) in this instantiation
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place update for LONG_maximum: for each of the dimensions[0]
 * entries, read an npy_intp index from args[1] and an npy_long operand from
 * args[2], then overwrite the addressed element of args[0] with
 * SCALAR_OP(element, operand).
 *
 * steps[0..2] are the byte strides of the target, index and operand streams.
 * steps[3] is added to negative indices before use; no other range check is
 * performed here. Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_maximum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const target = args[0];
    char *idx_cursor = args[1];
    char *operand_cursor = args[2];
    const npy_intp target_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp operand_stride = steps[2];
    const npy_intp count = dimensions[0];
    const npy_intp wrap = steps[3];
    npy_intp done = 0;

    while (done < count) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        // shift negative indices by steps[3]
        if (pos < 0) {
            pos += wrap;
        }

        npy_long *slot = (npy_long *)(target + target_stride * pos);
        const npy_long current = *slot;
        const npy_long operand = *(npy_long *)operand_cursor;
        *slot = SCALAR_OP(current, operand);

        idx_cursor += idx_stride;
        operand_cursor += operand_stride;
        ++done;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i

/*
 * Binary loop named LONG_minimum over npy_long operands; every element pair
 * is combined with SCALAR_OP (defined as scalar_min_i for this instantiation).
 *
 * args[0] and args[1] are the input operands and args[2] is the output;
 * steps[0..2] are the matching byte strides and dimensions[0] is the element
 * count. The last parameter is unused.
 *
 * Three tiers are tried in order:
 *   1. if TO_SIMD_SFX is defined, contiguous reduce / contiguous binary
 *      layouts are handed to the simd_* helpers and control jumps straight
 *      to the cleanup label;
 *   2. unless NPY_DISABLE_OPTIMIZATION is defined, hand-unrolled scalar loops
 *      consume whole groups of 8 (reduce) or 4 (element-wise) elements;
 *   3. a plain strided loop finishes whatever is left from index i onward.
 *
 * NOTE(review): IS_BINARY_REDUCE / IS_BINARY_CONT are not defined in this
 * file (the include comment points at fast_loop_macros.h); presumably they
 * inspect args/steps directly -- confirm before renaming any local here.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_minimum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // count of elements already processed; the tail loop resumes from here
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE expands to npyv_lanetype with the suffix selected by TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_long)) {
            TO_SIMD_SFX(simd_reduce_c_min)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_long, npy_long)) {
            TO_SIMD_SFX(simd_binary_ccc_min)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // (the trailing `&& 0` compiles this strided SIMD path out entirely here)
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_min)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // only entered when at least one full group of 8 elements exists
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators from the first eight
            // elements of the second operand
            npy_long m0 = *((npy_long *)(ip2 + (i + 0) * is2));
            npy_long m1 = *((npy_long *)(ip2 + (i + 1) * is2));
            npy_long m2 = *((npy_long *)(ip2 + (i + 2) * is2));
            npy_long m3 = *((npy_long *)(ip2 + (i + 3) * is2));
            npy_long m4 = *((npy_long *)(ip2 + (i + 4) * is2));
            npy_long m5 = *((npy_long *)(ip2 + (i + 5) * is2));
            npy_long m6 = *((npy_long *)(ip2 + (i + 6) * is2));
            npy_long m7 = *((npy_long *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold every further full group of 8 into the matching accumulator
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_long v0 = *((npy_long *)(ip2 + (i + 0) * is2));
                npy_long v1 = *((npy_long *)(ip2 + (i + 1) * is2));
                npy_long v2 = *((npy_long *)(ip2 + (i + 2) * is2));
                npy_long v3 = *((npy_long *)(ip2 + (i + 3) * is2));
                npy_long v4 = *((npy_long *)(ip2 + (i + 4) * is2));
                npy_long v5 = *((npy_long *)(ip2 + (i + 5) * is2));
                npy_long v6 = *((npy_long *)(ip2 + (i + 6) * is2));
                npy_long v7 = *((npy_long *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // combine the eight accumulators pairwise down to m0
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge with the value already stored in the output slot
             *((npy_long *)op1) = SCALAR_OP(*((npy_long *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1));
            npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2));
            *((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1));
            npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2));
            *((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1));
            npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2));
            *((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1));
            npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2));
            *((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // advance all three cursors by the i elements consumed above (i may be 0)
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // one-element-at-a-time loop for whatever remains
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_long in1 = *(npy_long *)ip1;
        const npy_long in2 = *(npy_long *)ip2;
        *((npy_long *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// landing point for the SIMD fast paths, which skip the scalar code above
clear_fp:
    npyv_cleanup();
#endif
// the float-status reset below is compiled out (`#if 0`) in this instantiation
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place update for LONG_minimum: for each of the dimensions[0]
 * entries, read an npy_intp index from args[1] and an npy_long operand from
 * args[2], then overwrite the addressed element of args[0] with
 * SCALAR_OP(element, operand).
 *
 * steps[0..2] are the byte strides of the target, index and operand streams.
 * steps[3] is added to negative indices before use; no other range check is
 * performed here. Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_minimum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const target = args[0];
    char *idx_cursor = args[1];
    char *operand_cursor = args[2];
    const npy_intp target_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp operand_stride = steps[2];
    const npy_intp count = dimensions[0];
    const npy_intp wrap = steps[3];
    npy_intp done = 0;

    while (done < count) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        // shift negative indices by steps[3]
        if (pos < 0) {
            pos += wrap;
        }

        npy_long *slot = (npy_long *)(target + target_stride * pos);
        const npy_long current = *slot;
        const npy_long operand = *(npy_long *)operand_cursor;
        *slot = SCALAR_OP(current, operand);

        idx_cursor += idx_stride;
        operand_cursor += operand_stride;
        ++done;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i

/*
 * Binary loop named LONG_fmax over npy_long operands; every element pair is
 * combined with SCALAR_OP (defined as scalar_maxp_i for this instantiation).
 *
 * args[0] and args[1] are the input operands and args[2] is the output;
 * steps[0..2] are the matching byte strides and dimensions[0] is the element
 * count. The last parameter is unused.
 *
 * Three tiers are tried in order:
 *   1. if TO_SIMD_SFX is defined, contiguous reduce / contiguous binary
 *      layouts are handed to the simd_* helpers and control jumps straight
 *      to the cleanup label;
 *   2. unless NPY_DISABLE_OPTIMIZATION is defined, hand-unrolled scalar loops
 *      consume whole groups of 8 (reduce) or 4 (element-wise) elements;
 *   3. a plain strided loop finishes whatever is left from index i onward.
 *
 * NOTE: this instantiation sits under the constant-false guard
 * `#if !1 || (0 && 1)` opened just above, so the preprocessor discards it.
 * NOTE(review): IS_BINARY_REDUCE / IS_BINARY_CONT are not defined in this
 * file (the include comment points at fast_loop_macros.h); presumably they
 * inspect args/steps directly -- confirm before renaming any local here.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_fmax)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // count of elements already processed; the tail loop resumes from here
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE expands to npyv_lanetype with the suffix selected by TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_long)) {
            TO_SIMD_SFX(simd_reduce_c_maxp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_long, npy_long)) {
            TO_SIMD_SFX(simd_binary_ccc_maxp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // (the trailing `&& 0` compiles this strided SIMD path out entirely here)
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_maxp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // only entered when at least one full group of 8 elements exists
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators from the first eight
            // elements of the second operand
            npy_long m0 = *((npy_long *)(ip2 + (i + 0) * is2));
            npy_long m1 = *((npy_long *)(ip2 + (i + 1) * is2));
            npy_long m2 = *((npy_long *)(ip2 + (i + 2) * is2));
            npy_long m3 = *((npy_long *)(ip2 + (i + 3) * is2));
            npy_long m4 = *((npy_long *)(ip2 + (i + 4) * is2));
            npy_long m5 = *((npy_long *)(ip2 + (i + 5) * is2));
            npy_long m6 = *((npy_long *)(ip2 + (i + 6) * is2));
            npy_long m7 = *((npy_long *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold every further full group of 8 into the matching accumulator
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_long v0 = *((npy_long *)(ip2 + (i + 0) * is2));
                npy_long v1 = *((npy_long *)(ip2 + (i + 1) * is2));
                npy_long v2 = *((npy_long *)(ip2 + (i + 2) * is2));
                npy_long v3 = *((npy_long *)(ip2 + (i + 3) * is2));
                npy_long v4 = *((npy_long *)(ip2 + (i + 4) * is2));
                npy_long v5 = *((npy_long *)(ip2 + (i + 5) * is2));
                npy_long v6 = *((npy_long *)(ip2 + (i + 6) * is2));
                npy_long v7 = *((npy_long *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // combine the eight accumulators pairwise down to m0
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge with the value already stored in the output slot
             *((npy_long *)op1) = SCALAR_OP(*((npy_long *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1));
            npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2));
            *((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1));
            npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2));
            *((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1));
            npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2));
            *((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1));
            npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2));
            *((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // advance all three cursors by the i elements consumed above (i may be 0)
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // one-element-at-a-time loop for whatever remains
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_long in1 = *(npy_long *)ip1;
        const npy_long in2 = *(npy_long *)ip2;
        *((npy_long *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// landing point for the SIMD fast paths, which skip the scalar code above
clear_fp:
    npyv_cleanup();
#endif
// the float-status reset below is compiled out (`#if 0`) in this instantiation
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place update for LONG_fmax: for each of the dimensions[0]
 * entries, read an npy_intp index from args[1] and an npy_long operand from
 * args[2], then overwrite the addressed element of args[0] with
 * SCALAR_OP(element, operand).
 *
 * steps[0..2] are the byte strides of the target, index and operand streams.
 * steps[3] is added to negative indices before use; no other range check is
 * performed here. Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmax_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const target = args[0];
    char *idx_cursor = args[1];
    char *operand_cursor = args[2];
    const npy_intp target_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp operand_stride = steps[2];
    const npy_intp count = dimensions[0];
    const npy_intp wrap = steps[3];
    npy_intp done = 0;

    while (done < count) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        // shift negative indices by steps[3]
        if (pos < 0) {
            pos += wrap;
        }

        npy_long *slot = (npy_long *)(target + target_stride * pos);
        const npy_long current = *slot;
        const npy_long operand = *(npy_long *)operand_cursor;
        *slot = SCALAR_OP(current, operand);

        idx_cursor += idx_stride;
        operand_cursor += operand_stride;
        ++done;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i

/*
 * Binary loop named LONG_fmin over npy_long operands; every element pair is
 * combined with SCALAR_OP (defined as scalar_minp_i for this instantiation).
 *
 * args[0] and args[1] are the input operands and args[2] is the output;
 * steps[0..2] are the matching byte strides and dimensions[0] is the element
 * count. The last parameter is unused.
 *
 * Three tiers are tried in order:
 *   1. if TO_SIMD_SFX is defined, contiguous reduce / contiguous binary
 *      layouts are handed to the simd_* helpers and control jumps straight
 *      to the cleanup label;
 *   2. unless NPY_DISABLE_OPTIMIZATION is defined, hand-unrolled scalar loops
 *      consume whole groups of 8 (reduce) or 4 (element-wise) elements;
 *   3. a plain strided loop finishes whatever is left from index i onward.
 *
 * NOTE: this instantiation sits under the constant-false guard
 * `#if !1 || (0 && 1)` opened just above, so the preprocessor discards it.
 * NOTE(review): IS_BINARY_REDUCE / IS_BINARY_CONT are not defined in this
 * file (the include comment points at fast_loop_macros.h); presumably they
 * inspect args/steps directly -- confirm before renaming any local here.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_fmin)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // count of elements already processed; the tail loop resumes from here
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE expands to npyv_lanetype with the suffix selected by TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_long)) {
            TO_SIMD_SFX(simd_reduce_c_minp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_long, npy_long)) {
            TO_SIMD_SFX(simd_binary_ccc_minp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // (the trailing `&& 0` compiles this strided SIMD path out entirely here)
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_minp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // only entered when at least one full group of 8 elements exists
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators from the first eight
            // elements of the second operand
            npy_long m0 = *((npy_long *)(ip2 + (i + 0) * is2));
            npy_long m1 = *((npy_long *)(ip2 + (i + 1) * is2));
            npy_long m2 = *((npy_long *)(ip2 + (i + 2) * is2));
            npy_long m3 = *((npy_long *)(ip2 + (i + 3) * is2));
            npy_long m4 = *((npy_long *)(ip2 + (i + 4) * is2));
            npy_long m5 = *((npy_long *)(ip2 + (i + 5) * is2));
            npy_long m6 = *((npy_long *)(ip2 + (i + 6) * is2));
            npy_long m7 = *((npy_long *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold every further full group of 8 into the matching accumulator
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_long v0 = *((npy_long *)(ip2 + (i + 0) * is2));
                npy_long v1 = *((npy_long *)(ip2 + (i + 1) * is2));
                npy_long v2 = *((npy_long *)(ip2 + (i + 2) * is2));
                npy_long v3 = *((npy_long *)(ip2 + (i + 3) * is2));
                npy_long v4 = *((npy_long *)(ip2 + (i + 4) * is2));
                npy_long v5 = *((npy_long *)(ip2 + (i + 5) * is2));
                npy_long v6 = *((npy_long *)(ip2 + (i + 6) * is2));
                npy_long v7 = *((npy_long *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // combine the eight accumulators pairwise down to m0
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge with the value already stored in the output slot
             *((npy_long *)op1) = SCALAR_OP(*((npy_long *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1));
            npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2));
            *((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1));
            npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2));
            *((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1));
            npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2));
            *((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1));
            npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2));
            *((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // advance all three cursors by the i elements consumed above (i may be 0)
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // one-element-at-a-time loop for whatever remains
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_long in1 = *(npy_long *)ip1;
        const npy_long in2 = *(npy_long *)ip2;
        *((npy_long *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// landing point for the SIMD fast paths, which skip the scalar code above
clear_fp:
    npyv_cleanup();
#endif
// the float-status reset below is compiled out (`#if 0`) in this instantiation
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place update for LONG_fmin: for each of the dimensions[0]
 * entries, read an npy_intp index from args[1] and an npy_long operand from
 * args[2], then overwrite the addressed element of args[0] with
 * SCALAR_OP(element, operand).
 *
 * steps[0..2] are the byte strides of the target, index and operand streams.
 * steps[3] is added to negative indices before use; no other range check is
 * performed here. Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmin_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const target = args[0];
    char *idx_cursor = args[1];
    char *operand_cursor = args[2];
    const npy_intp target_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp operand_stride = steps[2];
    const npy_intp count = dimensions[0];
    const npy_intp wrap = steps[3];
    npy_intp done = 0;

    while (done < count) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        // shift negative indices by steps[3]
        if (pos < 0) {
            pos += wrap;
        }

        npy_long *slot = (npy_long *)(target + target_stride * pos);
        const npy_long current = *slot;
        const npy_long operand = *(npy_long *)operand_cursor;
        *slot = SCALAR_OP(current, operand);

        idx_cursor += idx_stride;
        operand_cursor += operand_stride;
        ++done;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)


#line 294
/*
 * Select the SIMD suffix macro used by the npy_longlong loops that follow.
 *
 * TO_SIMD_SFX(X) pastes a lane suffix onto X.  Any previous definition is
 * dropped first; the macro is then (re)defined only when NPY_SIMD is true and
 * NPY_BITSOF_LONGLONG equals one of 8/16/32/64.  Otherwise it stays undefined,
 * and the loop bodies test `#ifdef TO_SIMD_SFX` before touching SIMD kernels.
 *
 * In every width branch the inner `#if 0` (float suffix) and `#elif 0`
 * (unsigned suffix) arms are disabled, so the signed `_sNN` suffix in the
 * final `#else` arm is the one that gets defined.
 */
#undef TO_SIMD_SFX
// always-false first arm so that each generated width check can be an #elif
#if 0
#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8
    #if 0
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i

/*
 * Binary inner loop for npy_longlong that combines elements with SCALAR_OP
 * (defined as scalar_max_i immediately before this function, i.e. the larger
 * of its two operands).
 *
 * args[0] and args[1] are the two inputs and args[2] is the output;
 * steps[0..2] are the matching byte strides and dimensions[0] is the number
 * of elements to process.
 *
 * Three tiers are tried in order:
 *   1. SIMD kernels, compiled in only when TO_SIMD_SFX is defined;
 *   2. unrolled scalar loops, unless NPY_DISABLE_OPTIMIZATION is defined;
 *   3. a plain scalar loop over whatever elements are left.
 *
 * NOTE(review): IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are
 * provided by included headers and are not visible here -- confirm their exact
 * conditions there.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_maximum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already consumed by the unrolled scalar tier
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE expands to npyv_lanetype with the TO_SIMD_SFX suffix appended
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_longlong)) {
            // the kernel is handed all `len` elements, so skip the scalar tiers
            TO_SIMD_SFX(simd_reduce_c_max)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_longlong, npy_longlong)) {
            TO_SIMD_SFX(simd_binary_ccc_max)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // (the trailing `&& 0` keeps this strided SIMD branch compiled out for
    // this type regardless of NPY_HAVE_NEON)
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_max)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // with fewer than eight elements nothing happens here; the tail loop
        // at the end of the function processes them instead
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators from the first eight
            // elements of the second operand
            npy_longlong m0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
            npy_longlong m1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
            npy_longlong m2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
            npy_longlong m3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
            npy_longlong m4 = *((npy_longlong *)(ip2 + (i + 4) * is2));
            npy_longlong m5 = *((npy_longlong *)(ip2 + (i + 5) * is2));
            npy_longlong m6 = *((npy_longlong *)(ip2 + (i + 6) * is2));
            npy_longlong m7 = *((npy_longlong *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold each further group of eight into the accumulators
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_longlong v0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
                npy_longlong v1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
                npy_longlong v2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
                npy_longlong v3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
                npy_longlong v4 = *((npy_longlong *)(ip2 + (i + 4) * is2));
                npy_longlong v5 = *((npy_longlong *)(ip2 + (i + 5) * is2));
                npy_longlong v6 = *((npy_longlong *)(ip2 + (i + 6) * is2));
                npy_longlong v7 = *((npy_longlong *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // pairwise tree reduction of the eight accumulators into m0
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge with the value already stored at the output location
             *((npy_longlong *)op1) = SCALAR_OP(*((npy_longlong *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1));
            npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
            *((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1));
            npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
            *((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1));
            npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
            *((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1));
            npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
            *((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // advance all three pointers past the i elements consumed above
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // scalar tail: one element per iteration for everything that is left
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_longlong in1 = *(npy_longlong *)ip1;
        const npy_longlong in2 = *(npy_longlong *)ip2;
        *((npy_longlong *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// the label exists only when the SIMD tier (and its gotos) is compiled in
clear_fp:
    npyv_cleanup();
#endif
// floating-point status clearing is disabled for this instantiation
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed variant of the LONGLONG maximum loop.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] and an npy_longlong operand from args[2]; the npy_longlong element
 * of args[0] selected by that index is overwritten with
 * SCALAR_OP(element, operand).  SCALAR_OP is scalar_max_i in this section,
 * so the larger of the two values is kept.  steps[0..2] are the byte strides
 * applied to args[0..2], and steps[3] is added to any negative index before
 * it is used.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_maximum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_cursor = args[1];
    const char *val_cursor = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining;

    for (remaining = dimensions[0]; remaining > 0; --remaining) {
        const npy_intp raw = *(const npy_intp *)idx_cursor;
        // negative indices are shifted by steps[3]
        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
        npy_longlong *const slot = (npy_longlong *)(base + base_stride * pos);
        const npy_longlong stored = *slot;
        const npy_longlong operand = *(const npy_longlong *)val_cursor;

        *slot = SCALAR_OP(stored, operand);
        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i

/*
 * Binary inner loop for npy_longlong that combines elements with SCALAR_OP
 * (defined as scalar_min_i immediately before this function, i.e. the smaller
 * of its two operands).
 *
 * args[0] and args[1] are the two inputs and args[2] is the output;
 * steps[0..2] are the matching byte strides and dimensions[0] is the number
 * of elements to process.
 *
 * Three tiers are tried in order:
 *   1. SIMD kernels, compiled in only when TO_SIMD_SFX is defined;
 *   2. unrolled scalar loops, unless NPY_DISABLE_OPTIMIZATION is defined;
 *   3. a plain scalar loop over whatever elements are left.
 *
 * NOTE(review): IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are
 * provided by included headers and are not visible here -- confirm their exact
 * conditions there.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_minimum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already consumed by the unrolled scalar tier
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE expands to npyv_lanetype with the TO_SIMD_SFX suffix appended
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_longlong)) {
            // the kernel is handed all `len` elements, so skip the scalar tiers
            TO_SIMD_SFX(simd_reduce_c_min)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_longlong, npy_longlong)) {
            TO_SIMD_SFX(simd_binary_ccc_min)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // (the trailing `&& 0` keeps this strided SIMD branch compiled out for
    // this type regardless of NPY_HAVE_NEON)
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_min)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // with fewer than eight elements nothing happens here; the tail loop
        // at the end of the function processes them instead
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators from the first eight
            // elements of the second operand
            npy_longlong m0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
            npy_longlong m1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
            npy_longlong m2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
            npy_longlong m3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
            npy_longlong m4 = *((npy_longlong *)(ip2 + (i + 4) * is2));
            npy_longlong m5 = *((npy_longlong *)(ip2 + (i + 5) * is2));
            npy_longlong m6 = *((npy_longlong *)(ip2 + (i + 6) * is2));
            npy_longlong m7 = *((npy_longlong *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold each further group of eight into the accumulators
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_longlong v0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
                npy_longlong v1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
                npy_longlong v2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
                npy_longlong v3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
                npy_longlong v4 = *((npy_longlong *)(ip2 + (i + 4) * is2));
                npy_longlong v5 = *((npy_longlong *)(ip2 + (i + 5) * is2));
                npy_longlong v6 = *((npy_longlong *)(ip2 + (i + 6) * is2));
                npy_longlong v7 = *((npy_longlong *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // pairwise tree reduction of the eight accumulators into m0
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge with the value already stored at the output location
             *((npy_longlong *)op1) = SCALAR_OP(*((npy_longlong *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1));
            npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
            *((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1));
            npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
            *((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1));
            npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
            *((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1));
            npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
            *((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // advance all three pointers past the i elements consumed above
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // scalar tail: one element per iteration for everything that is left
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_longlong in1 = *(npy_longlong *)ip1;
        const npy_longlong in2 = *(npy_longlong *)ip2;
        *((npy_longlong *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// the label exists only when the SIMD tier (and its gotos) is compiled in
clear_fp:
    npyv_cleanup();
#endif
// floating-point status clearing is disabled for this instantiation
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed variant of the LONGLONG minimum loop.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] and an npy_longlong operand from args[2]; the npy_longlong element
 * of args[0] selected by that index is overwritten with
 * SCALAR_OP(element, operand).  SCALAR_OP is scalar_min_i in this section,
 * so the smaller of the two values is kept.  steps[0..2] are the byte strides
 * applied to args[0..2], and steps[3] is added to any negative index before
 * it is used.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_minimum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_cursor = args[1];
    const char *val_cursor = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining;

    for (remaining = dimensions[0]; remaining > 0; --remaining) {
        const npy_intp raw = *(const npy_intp *)idx_cursor;
        // negative indices are shifted by steps[3]
        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
        npy_longlong *const slot = (npy_longlong *)(base + base_stride * pos);
        const npy_longlong stored = *slot;
        const npy_longlong operand = *(const npy_longlong *)val_cursor;

        *slot = SCALAR_OP(stored, operand);
        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i

/*
 * Binary inner loop for npy_longlong that combines elements with SCALAR_OP
 * (defined as scalar_maxp_i immediately before this function).
 *
 * NOTE: the `#if !1 || (0 && 1)` guard that opens this section evaluates to
 * false, so this instantiation is never compiled.
 *
 * args[0] and args[1] are the two inputs and args[2] is the output;
 * steps[0..2] are the matching byte strides and dimensions[0] is the number
 * of elements to process.
 *
 * Three tiers are tried in order:
 *   1. SIMD kernels, compiled in only when TO_SIMD_SFX is defined;
 *   2. unrolled scalar loops, unless NPY_DISABLE_OPTIMIZATION is defined;
 *   3. a plain scalar loop over whatever elements are left.
 *
 * NOTE(review): IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are
 * provided by included headers and are not visible here -- confirm their exact
 * conditions there.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_fmax)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already consumed by the unrolled scalar tier
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE expands to npyv_lanetype with the TO_SIMD_SFX suffix appended
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_longlong)) {
            // the kernel is handed all `len` elements, so skip the scalar tiers
            TO_SIMD_SFX(simd_reduce_c_maxp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_longlong, npy_longlong)) {
            TO_SIMD_SFX(simd_binary_ccc_maxp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // (the trailing `&& 0` keeps this strided SIMD branch compiled out for
    // this type regardless of NPY_HAVE_NEON)
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_maxp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // with fewer than eight elements nothing happens here; the tail loop
        // at the end of the function processes them instead
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators from the first eight
            // elements of the second operand
            npy_longlong m0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
            npy_longlong m1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
            npy_longlong m2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
            npy_longlong m3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
            npy_longlong m4 = *((npy_longlong *)(ip2 + (i + 4) * is2));
            npy_longlong m5 = *((npy_longlong *)(ip2 + (i + 5) * is2));
            npy_longlong m6 = *((npy_longlong *)(ip2 + (i + 6) * is2));
            npy_longlong m7 = *((npy_longlong *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold each further group of eight into the accumulators
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_longlong v0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
                npy_longlong v1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
                npy_longlong v2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
                npy_longlong v3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
                npy_longlong v4 = *((npy_longlong *)(ip2 + (i + 4) * is2));
                npy_longlong v5 = *((npy_longlong *)(ip2 + (i + 5) * is2));
                npy_longlong v6 = *((npy_longlong *)(ip2 + (i + 6) * is2));
                npy_longlong v7 = *((npy_longlong *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // pairwise tree reduction of the eight accumulators into m0
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge with the value already stored at the output location
             *((npy_longlong *)op1) = SCALAR_OP(*((npy_longlong *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1));
            npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
            *((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1));
            npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
            *((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1));
            npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
            *((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1));
            npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
            *((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // advance all three pointers past the i elements consumed above
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // scalar tail: one element per iteration for everything that is left
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_longlong in1 = *(npy_longlong *)ip1;
        const npy_longlong in2 = *(npy_longlong *)ip2;
        *((npy_longlong *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// the label exists only when the SIMD tier (and its gotos) is compiled in
clear_fp:
    npyv_cleanup();
#endif
// floating-point status clearing is disabled for this instantiation
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed variant of the LONGLONG fmax loop.
 *
 * NOTE: the `#if !1 || (0 && 1)` guard that opens this section evaluates to
 * false, so this instantiation is never compiled.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] and an npy_longlong operand from args[2]; the npy_longlong element
 * of args[0] selected by that index is overwritten with
 * SCALAR_OP(element, operand) (SCALAR_OP is scalar_maxp_i in this section).
 * steps[0..2] are the byte strides applied to args[0..2], and steps[3] is
 * added to any negative index before it is used.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmax_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_cursor = args[1];
    const char *val_cursor = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining;

    for (remaining = dimensions[0]; remaining > 0; --remaining) {
        const npy_intp raw = *(const npy_intp *)idx_cursor;
        // negative indices are shifted by steps[3]
        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
        npy_longlong *const slot = (npy_longlong *)(base + base_stride * pos);
        const npy_longlong stored = *slot;
        const npy_longlong operand = *(const npy_longlong *)val_cursor;

        *slot = SCALAR_OP(stored, operand);
        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i

/*
 * Binary inner loop for npy_longlong that combines elements with SCALAR_OP
 * (defined as scalar_minp_i immediately before this function).
 *
 * NOTE: the `#if !1 || (0 && 1)` guard that opens this section evaluates to
 * false, so this instantiation is never compiled.
 *
 * args[0] and args[1] are the two inputs and args[2] is the output;
 * steps[0..2] are the matching byte strides and dimensions[0] is the number
 * of elements to process.
 *
 * Three tiers are tried in order:
 *   1. SIMD kernels, compiled in only when TO_SIMD_SFX is defined;
 *   2. unrolled scalar loops, unless NPY_DISABLE_OPTIMIZATION is defined;
 *   3. a plain scalar loop over whatever elements are left.
 *
 * NOTE(review): IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are
 * provided by included headers and are not visible here -- confirm their exact
 * conditions there.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already consumed by the unrolled scalar tier
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE expands to npyv_lanetype with the TO_SIMD_SFX suffix appended
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_longlong)) {
            // the kernel is handed all `len` elements, so skip the scalar tiers
            TO_SIMD_SFX(simd_reduce_c_minp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_longlong, npy_longlong)) {
            TO_SIMD_SFX(simd_binary_ccc_minp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    // (the trailing `&& 0` keeps this strided SIMD branch compiled out for
    // this type regardless of NPY_HAVE_NEON)
    #if !defined(NPY_HAVE_NEON) && 0
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_minp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // with fewer than eight elements nothing happens here; the tail loop
        // at the end of the function processes them instead
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators from the first eight
            // elements of the second operand
            npy_longlong m0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
            npy_longlong m1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
            npy_longlong m2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
            npy_longlong m3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
            npy_longlong m4 = *((npy_longlong *)(ip2 + (i + 4) * is2));
            npy_longlong m5 = *((npy_longlong *)(ip2 + (i + 5) * is2));
            npy_longlong m6 = *((npy_longlong *)(ip2 + (i + 6) * is2));
            npy_longlong m7 = *((npy_longlong *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold each further group of eight into the accumulators
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_longlong v0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
                npy_longlong v1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
                npy_longlong v2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
                npy_longlong v3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
                npy_longlong v4 = *((npy_longlong *)(ip2 + (i + 4) * is2));
                npy_longlong v5 = *((npy_longlong *)(ip2 + (i + 5) * is2));
                npy_longlong v6 = *((npy_longlong *)(ip2 + (i + 6) * is2));
                npy_longlong v7 = *((npy_longlong *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // pairwise tree reduction of the eight accumulators into m0
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge with the value already stored at the output location
             *((npy_longlong *)op1) = SCALAR_OP(*((npy_longlong *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1));
            npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
            *((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1));
            npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
            *((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1));
            npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
            *((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1));
            npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
            *((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // advance all three pointers past the i elements consumed above
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // scalar tail: one element per iteration for everything that is left
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_longlong in1 = *(npy_longlong *)ip1;
        const npy_longlong in2 = *(npy_longlong *)ip2;
        *((npy_longlong *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// the label exists only when the SIMD tier (and its gotos) is compiled in
clear_fp:
    npyv_cleanup();
#endif
// floating-point status clearing is disabled for this instantiation
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed variant of the LONGLONG fmin loop.
 *
 * NOTE: the `#if !1 || (0 && 1)` guard that opens this section evaluates to
 * false, so this instantiation is never compiled.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] and an npy_longlong operand from args[2]; the npy_longlong element
 * of args[0] selected by that index is overwritten with
 * SCALAR_OP(element, operand) (SCALAR_OP is scalar_minp_i in this section).
 * steps[0..2] are the byte strides applied to args[0..2], and steps[3] is
 * added to any negative index before it is used.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_cursor = args[1];
    const char *val_cursor = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining;

    for (remaining = dimensions[0]; remaining > 0; --remaining) {
        const npy_intp raw = *(const npy_intp *)idx_cursor;
        // negative indices are shifted by steps[3]
        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
        npy_longlong *const slot = (npy_longlong *)(base + base_stride * pos);
        const npy_longlong stored = *slot;
        const npy_longlong operand = *(const npy_longlong *)val_cursor;

        *slot = SCALAR_OP(stored, operand);
        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)


#line 294
/*
 * Select the SIMD suffix macro used by the npy_float loops that follow.
 *
 * TO_SIMD_SFX(X) pastes a lane suffix onto X.  Any previous definition is
 * dropped first; the macro is then (re)defined only when NPY_SIMD is true and
 * NPY_BITSOF_FLOAT equals one of 8/16/32/64.  Otherwise it stays undefined,
 * and the loop bodies test `#ifdef TO_SIMD_SFX` before touching SIMD kernels.
 *
 * In every width branch the inner `#if 1` arm is the active one: it defines
 * the float `_fNN` suffix and then removes it again when NPY_BITSOF_FLOAT is
 * 32 and NPY_SIMD_F32 is false, or is 64 and NPY_SIMD_F64 is false.  The
 * unsigned and signed arms after it are therefore never taken.
 */
#undef TO_SIMD_SFX
// always-false first arm so that each generated width check can be an #elif
#if 0
#line 299
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 8
    #if 1
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_FLOAT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_FLOAT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 16
    #if 1
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_FLOAT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_FLOAT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 32
    #if 1
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_FLOAT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_FLOAT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 64
    #if 1
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_FLOAT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_FLOAT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_max_f

/*
 * Inner loop for the npy_float `maximum` kernel. SCALAR_OP is scalar_max_f
 * at this point of the file, i.e. the NaN-propagating maximum.
 *
 * args       - args[0] and args[1] are the two inputs, args[2] is the output.
 * dimensions - dimensions[0] is the number of elements to process.
 * steps      - byte strides for args[0], args[1] and args[2] respectively.
 *
 * Strategy, in order of preference:
 *   1. SIMD kernels (only compiled when TO_SIMD_SFX is defined): contiguous
 *      reduce, contiguous binary and, except on NEON builds, strided binary.
 *   2. Unrolled scalar code (unless NPY_DISABLE_OPTIMIZATION is defined).
 *   3. A plain scalar loop, which also finishes whatever the unrolled code
 *      left over.
 * npy_clear_floatstatus_barrier() is called before returning.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_maximum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already consumed by the unrolled scalar code
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // NOTE(review): IS_BINARY_REDUCE presumably comes from fast_loop_macros.h
    // and detects the reduction layout (output aliasing the first input);
    // confirm against that header.
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_float)) {
            TO_SIMD_SFX(simd_reduce_c_max)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_float, npy_float)) {
            TO_SIMD_SFX(simd_binary_ccc_max)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    #if !defined(NPY_HAVE_NEON) && 1
        // byte strides are converted to element strides for the SIMD helpers
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_max)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // only entered when at least one full group of eight inputs exists
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators with the first eight inputs
            npy_float m0 = *((npy_float *)(ip2 + (i + 0) * is2));
            npy_float m1 = *((npy_float *)(ip2 + (i + 1) * is2));
            npy_float m2 = *((npy_float *)(ip2 + (i + 2) * is2));
            npy_float m3 = *((npy_float *)(ip2 + (i + 3) * is2));
            npy_float m4 = *((npy_float *)(ip2 + (i + 4) * is2));
            npy_float m5 = *((npy_float *)(ip2 + (i + 5) * is2));
            npy_float m6 = *((npy_float *)(ip2 + (i + 6) * is2));
            npy_float m7 = *((npy_float *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold every further full group of eight into the accumulators
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_float v0 = *((npy_float *)(ip2 + (i + 0) * is2));
                npy_float v1 = *((npy_float *)(ip2 + (i + 1) * is2));
                npy_float v2 = *((npy_float *)(ip2 + (i + 2) * is2));
                npy_float v3 = *((npy_float *)(ip2 + (i + 3) * is2));
                npy_float v4 = *((npy_float *)(ip2 + (i + 4) * is2));
                npy_float v5 = *((npy_float *)(ip2 + (i + 5) * is2));
                npy_float v6 = *((npy_float *)(ip2 + (i + 6) * is2));
                npy_float v7 = *((npy_float *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // pairwise tree: 8 -> 4 -> 2 -> 1 accumulators
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge the partial result with the value already stored at op1
             *((npy_float *)op1) = SCALAR_OP(*((npy_float *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1));
            npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2));
            *((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1));
            npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2));
            *((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1));
            npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2));
            *((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1));
            npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2));
            *((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // skip the pointers past everything the unrolled code already handled
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // one-element-at-a-time loop for the remaining (or all) elements
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_float in1 = *(npy_float *)ip1;
        const npy_float in2 = *(npy_float *)ip2;
        *((npy_float *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// jump target of the SIMD paths above; the scalar path falls through to it
clear_fp:
    npyv_cleanup();
#endif
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place variant of the npy_float `maximum` loop.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] and a value from args[2]; the element of args[0] located at that
 * index (byte stride steps[0]) is replaced by SCALAR_OP(element, value).
 * Negative indices are wrapped by adding steps[3].
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_maximum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];

    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    // steps[3] carries the extent used to wrap negative indices
    const npy_intp extent = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        if (pos < 0) {
            pos += extent;
        }

        npy_float *slot = (npy_float *)(base + base_stride * pos);
        const npy_float current = *slot;
        const npy_float operand = *(const npy_float *)val_ptr;
        *slot = SCALAR_OP(current, operand);

        idx_ptr += idx_stride;
        val_ptr += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_min_f

/*
 * Inner loop for the npy_float `minimum` kernel. SCALAR_OP is scalar_min_f
 * at this point of the file, i.e. the NaN-propagating minimum.
 *
 * args       - args[0] and args[1] are the two inputs, args[2] is the output.
 * dimensions - dimensions[0] is the number of elements to process.
 * steps      - byte strides for args[0], args[1] and args[2] respectively.
 *
 * Strategy, in order of preference:
 *   1. SIMD kernels (only compiled when TO_SIMD_SFX is defined): contiguous
 *      reduce, contiguous binary and, except on NEON builds, strided binary.
 *   2. Unrolled scalar code (unless NPY_DISABLE_OPTIMIZATION is defined).
 *   3. A plain scalar loop, which also finishes whatever the unrolled code
 *      left over.
 * npy_clear_floatstatus_barrier() is called before returning.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_minimum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already consumed by the unrolled scalar code
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // NOTE(review): IS_BINARY_REDUCE presumably comes from fast_loop_macros.h
    // and detects the reduction layout (output aliasing the first input);
    // confirm against that header.
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_float)) {
            TO_SIMD_SFX(simd_reduce_c_min)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_float, npy_float)) {
            TO_SIMD_SFX(simd_binary_ccc_min)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    #if !defined(NPY_HAVE_NEON) && 1
        // byte strides are converted to element strides for the SIMD helpers
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_min)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // only entered when at least one full group of eight inputs exists
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators with the first eight inputs
            npy_float m0 = *((npy_float *)(ip2 + (i + 0) * is2));
            npy_float m1 = *((npy_float *)(ip2 + (i + 1) * is2));
            npy_float m2 = *((npy_float *)(ip2 + (i + 2) * is2));
            npy_float m3 = *((npy_float *)(ip2 + (i + 3) * is2));
            npy_float m4 = *((npy_float *)(ip2 + (i + 4) * is2));
            npy_float m5 = *((npy_float *)(ip2 + (i + 5) * is2));
            npy_float m6 = *((npy_float *)(ip2 + (i + 6) * is2));
            npy_float m7 = *((npy_float *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold every further full group of eight into the accumulators
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_float v0 = *((npy_float *)(ip2 + (i + 0) * is2));
                npy_float v1 = *((npy_float *)(ip2 + (i + 1) * is2));
                npy_float v2 = *((npy_float *)(ip2 + (i + 2) * is2));
                npy_float v3 = *((npy_float *)(ip2 + (i + 3) * is2));
                npy_float v4 = *((npy_float *)(ip2 + (i + 4) * is2));
                npy_float v5 = *((npy_float *)(ip2 + (i + 5) * is2));
                npy_float v6 = *((npy_float *)(ip2 + (i + 6) * is2));
                npy_float v7 = *((npy_float *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // pairwise tree: 8 -> 4 -> 2 -> 1 accumulators
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge the partial result with the value already stored at op1
             *((npy_float *)op1) = SCALAR_OP(*((npy_float *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1));
            npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2));
            *((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1));
            npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2));
            *((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1));
            npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2));
            *((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1));
            npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2));
            *((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // skip the pointers past everything the unrolled code already handled
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // one-element-at-a-time loop for the remaining (or all) elements
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_float in1 = *(npy_float *)ip1;
        const npy_float in2 = *(npy_float *)ip2;
        *((npy_float *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// jump target of the SIMD paths above; the scalar path falls through to it
clear_fp:
    npyv_cleanup();
#endif
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place variant of the npy_float `minimum` loop.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] and a value from args[2]; the element of args[0] located at that
 * index (byte stride steps[0]) is replaced by SCALAR_OP(element, value).
 * Negative indices are wrapped by adding steps[3].
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_minimum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];

    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    // steps[3] carries the extent used to wrap negative indices
    const npy_intp extent = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        if (pos < 0) {
            pos += extent;
        }

        npy_float *slot = (npy_float *)(base + base_stride * pos);
        const npy_float current = *slot;
        const npy_float operand = *(const npy_float *)val_ptr;
        *slot = SCALAR_OP(current, operand);

        idx_ptr += idx_stride;
        val_ptr += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_maxp_f

/*
 * Inner loop for the npy_float `fmax` kernel. SCALAR_OP is scalar_maxp_f
 * at this point of the file, which maps to fmaxf (the NaN-ignoring maximum).
 *
 * args       - args[0] and args[1] are the two inputs, args[2] is the output.
 * dimensions - dimensions[0] is the number of elements to process.
 * steps      - byte strides for args[0], args[1] and args[2] respectively.
 *
 * Strategy, in order of preference:
 *   1. SIMD kernels (only compiled when TO_SIMD_SFX is defined): contiguous
 *      reduce, contiguous binary and, except on NEON builds, strided binary.
 *   2. Unrolled scalar code (unless NPY_DISABLE_OPTIMIZATION is defined).
 *   3. A plain scalar loop, which also finishes whatever the unrolled code
 *      left over.
 * npy_clear_floatstatus_barrier() is called before returning.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_fmax)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already consumed by the unrolled scalar code
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // NOTE(review): IS_BINARY_REDUCE presumably comes from fast_loop_macros.h
    // and detects the reduction layout (output aliasing the first input);
    // confirm against that header.
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_float)) {
            TO_SIMD_SFX(simd_reduce_c_maxp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_float, npy_float)) {
            TO_SIMD_SFX(simd_binary_ccc_maxp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    #if !defined(NPY_HAVE_NEON) && 1
        // byte strides are converted to element strides for the SIMD helpers
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_maxp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // only entered when at least one full group of eight inputs exists
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators with the first eight inputs
            npy_float m0 = *((npy_float *)(ip2 + (i + 0) * is2));
            npy_float m1 = *((npy_float *)(ip2 + (i + 1) * is2));
            npy_float m2 = *((npy_float *)(ip2 + (i + 2) * is2));
            npy_float m3 = *((npy_float *)(ip2 + (i + 3) * is2));
            npy_float m4 = *((npy_float *)(ip2 + (i + 4) * is2));
            npy_float m5 = *((npy_float *)(ip2 + (i + 5) * is2));
            npy_float m6 = *((npy_float *)(ip2 + (i + 6) * is2));
            npy_float m7 = *((npy_float *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold every further full group of eight into the accumulators
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_float v0 = *((npy_float *)(ip2 + (i + 0) * is2));
                npy_float v1 = *((npy_float *)(ip2 + (i + 1) * is2));
                npy_float v2 = *((npy_float *)(ip2 + (i + 2) * is2));
                npy_float v3 = *((npy_float *)(ip2 + (i + 3) * is2));
                npy_float v4 = *((npy_float *)(ip2 + (i + 4) * is2));
                npy_float v5 = *((npy_float *)(ip2 + (i + 5) * is2));
                npy_float v6 = *((npy_float *)(ip2 + (i + 6) * is2));
                npy_float v7 = *((npy_float *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // pairwise tree: 8 -> 4 -> 2 -> 1 accumulators
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge the partial result with the value already stored at op1
             *((npy_float *)op1) = SCALAR_OP(*((npy_float *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1));
            npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2));
            *((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1));
            npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2));
            *((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1));
            npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2));
            *((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1));
            npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2));
            *((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // skip the pointers past everything the unrolled code already handled
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // one-element-at-a-time loop for the remaining (or all) elements
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_float in1 = *(npy_float *)ip1;
        const npy_float in2 = *(npy_float *)ip2;
        *((npy_float *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// jump target of the SIMD paths above; the scalar path falls through to it
clear_fp:
    npyv_cleanup();
#endif
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place variant of the npy_float `fmax` loop.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] and a value from args[2]; the element of args[0] located at that
 * index (byte stride steps[0]) is replaced by SCALAR_OP(element, value).
 * Negative indices are wrapped by adding steps[3].
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmax_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];

    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    // steps[3] carries the extent used to wrap negative indices
    const npy_intp extent = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        if (pos < 0) {
            pos += extent;
        }

        npy_float *slot = (npy_float *)(base + base_stride * pos);
        const npy_float current = *slot;
        const npy_float operand = *(const npy_float *)val_ptr;
        *slot = SCALAR_OP(current, operand);

        idx_ptr += idx_stride;
        val_ptr += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_minp_f

/*
 * Inner loop for the npy_float `fmin` kernel. SCALAR_OP is scalar_minp_f
 * at this point of the file, which maps to fminf (the NaN-ignoring minimum).
 *
 * args       - args[0] and args[1] are the two inputs, args[2] is the output.
 * dimensions - dimensions[0] is the number of elements to process.
 * steps      - byte strides for args[0], args[1] and args[2] respectively.
 *
 * Strategy, in order of preference:
 *   1. SIMD kernels (only compiled when TO_SIMD_SFX is defined): contiguous
 *      reduce, contiguous binary and, except on NEON builds, strided binary.
 *   2. Unrolled scalar code (unless NPY_DISABLE_OPTIMIZATION is defined).
 *   3. A plain scalar loop, which also finishes whatever the unrolled code
 *      left over.
 * npy_clear_floatstatus_barrier() is called before returning.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_fmin)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already consumed by the unrolled scalar code
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // NOTE(review): IS_BINARY_REDUCE presumably comes from fast_loop_macros.h
    // and detects the reduction layout (output aliasing the first input);
    // confirm against that header.
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_float)) {
            TO_SIMD_SFX(simd_reduce_c_minp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_float, npy_float)) {
            TO_SIMD_SFX(simd_binary_ccc_minp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    #if !defined(NPY_HAVE_NEON) && 1
        // byte strides are converted to element strides for the SIMD helpers
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_minp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // only entered when at least one full group of eight inputs exists
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators with the first eight inputs
            npy_float m0 = *((npy_float *)(ip2 + (i + 0) * is2));
            npy_float m1 = *((npy_float *)(ip2 + (i + 1) * is2));
            npy_float m2 = *((npy_float *)(ip2 + (i + 2) * is2));
            npy_float m3 = *((npy_float *)(ip2 + (i + 3) * is2));
            npy_float m4 = *((npy_float *)(ip2 + (i + 4) * is2));
            npy_float m5 = *((npy_float *)(ip2 + (i + 5) * is2));
            npy_float m6 = *((npy_float *)(ip2 + (i + 6) * is2));
            npy_float m7 = *((npy_float *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold every further full group of eight into the accumulators
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_float v0 = *((npy_float *)(ip2 + (i + 0) * is2));
                npy_float v1 = *((npy_float *)(ip2 + (i + 1) * is2));
                npy_float v2 = *((npy_float *)(ip2 + (i + 2) * is2));
                npy_float v3 = *((npy_float *)(ip2 + (i + 3) * is2));
                npy_float v4 = *((npy_float *)(ip2 + (i + 4) * is2));
                npy_float v5 = *((npy_float *)(ip2 + (i + 5) * is2));
                npy_float v6 = *((npy_float *)(ip2 + (i + 6) * is2));
                npy_float v7 = *((npy_float *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // pairwise tree: 8 -> 4 -> 2 -> 1 accumulators
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge the partial result with the value already stored at op1
             *((npy_float *)op1) = SCALAR_OP(*((npy_float *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1));
            npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2));
            *((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1));
            npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2));
            *((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1));
            npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2));
            *((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1));
            npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2));
            *((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // skip the pointers past everything the unrolled code already handled
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // one-element-at-a-time loop for the remaining (or all) elements
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_float in1 = *(npy_float *)ip1;
        const npy_float in2 = *(npy_float *)ip2;
        *((npy_float *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// jump target of the SIMD paths above; the scalar path falls through to it
clear_fp:
    npyv_cleanup();
#endif
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place variant of the npy_float `fmin` loop.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] and a value from args[2]; the element of args[0] located at that
 * index (byte stride steps[0]) is replaced by SCALAR_OP(element, value).
 * Negative indices are wrapped by adding steps[3].
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmin_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];

    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    // steps[3] carries the extent used to wrap negative indices
    const npy_intp extent = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count; ++k) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        if (pos < 0) {
            pos += extent;
        }

        npy_float *slot = (npy_float *)(base + base_stride * pos);
        const npy_float current = *slot;
        const npy_float operand = *(const npy_float *)val_ptr;
        *slot = SCALAR_OP(current, operand);

        idx_ptr += idx_stride;
        val_ptr += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)


#line 294
/*
 * Pick the universal-intrinsics suffix helper used by the npy_double loops.
 * At most one `#elif` arm below is selected, by NPY_BITSOF_DOUBLE, and only
 * when NPY_SIMD is non-zero. Within an arm the `#if 1` branch is the live
 * one, so TO_SIMD_SFX(X) pastes a floating-point suffix such as X##_f64;
 * the `#elif 0` / `#else` integer suffixes are never compiled here.
 * The macro is undefined again when the matching NPY_SIMD_F32/NPY_SIMD_F64
 * flag is zero, which compiles out the `#ifdef TO_SIMD_SFX` sections of the
 * loops that follow.
 */
#undef TO_SIMD_SFX
#if 0
#line 299
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 8
    #if 1
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 16
    #if 1
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 32
    #if 1
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 64
    #if 1
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_max_d

/*
 * Inner loop for the npy_double `maximum` kernel. SCALAR_OP is scalar_max_d
 * at this point of the file, i.e. the NaN-propagating maximum.
 *
 * args       - args[0] and args[1] are the two inputs, args[2] is the output.
 * dimensions - dimensions[0] is the number of elements to process.
 * steps      - byte strides for args[0], args[1] and args[2] respectively.
 *
 * Strategy, in order of preference:
 *   1. SIMD kernels (only compiled when TO_SIMD_SFX is defined): contiguous
 *      reduce, contiguous binary and, except on NEON builds, strided binary.
 *   2. Unrolled scalar code (unless NPY_DISABLE_OPTIMIZATION is defined).
 *   3. A plain scalar loop, which also finishes whatever the unrolled code
 *      left over.
 * npy_clear_floatstatus_barrier() is called before returning.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_maximum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already consumed by the unrolled scalar code
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    // NOTE(review): IS_BINARY_REDUCE presumably comes from fast_loop_macros.h
    // and detects the reduction layout (output aliasing the first input);
    // confirm against that header.
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_double)) {
            TO_SIMD_SFX(simd_reduce_c_max)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_double, npy_double)) {
            TO_SIMD_SFX(simd_binary_ccc_max)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    #if !defined(NPY_HAVE_NEON) && 1
        // byte strides are converted to element strides for the SIMD helpers
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_max)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // only entered when at least one full group of eight inputs exists
        if((i+elemPerLoop) <= len){
            // seed eight independent accumulators with the first eight inputs
            npy_double m0 = *((npy_double *)(ip2 + (i + 0) * is2));
            npy_double m1 = *((npy_double *)(ip2 + (i + 1) * is2));
            npy_double m2 = *((npy_double *)(ip2 + (i + 2) * is2));
            npy_double m3 = *((npy_double *)(ip2 + (i + 3) * is2));
            npy_double m4 = *((npy_double *)(ip2 + (i + 4) * is2));
            npy_double m5 = *((npy_double *)(ip2 + (i + 5) * is2));
            npy_double m6 = *((npy_double *)(ip2 + (i + 6) * is2));
            npy_double m7 = *((npy_double *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // fold every further full group of eight into the accumulators
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_double v0 = *((npy_double *)(ip2 + (i + 0) * is2));
                npy_double v1 = *((npy_double *)(ip2 + (i + 1) * is2));
                npy_double v2 = *((npy_double *)(ip2 + (i + 2) * is2));
                npy_double v3 = *((npy_double *)(ip2 + (i + 3) * is2));
                npy_double v4 = *((npy_double *)(ip2 + (i + 4) * is2));
                npy_double v5 = *((npy_double *)(ip2 + (i + 5) * is2));
                npy_double v6 = *((npy_double *)(ip2 + (i + 6) * is2));
                npy_double v7 = *((npy_double *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // pairwise tree: 8 -> 4 -> 2 -> 1 accumulators
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // merge the partial result with the value already stored at op1
             *((npy_double *)op1) = SCALAR_OP(*((npy_double *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1));
            npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2));
            *((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1));
            npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2));
            *((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1));
            npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2));
            *((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1));
            npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2));
            *((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // skip the pointers past everything the unrolled code already handled
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // one-element-at-a-time loop for the remaining (or all) elements
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_double in1 = *(npy_double *)ip1;
        const npy_double in2 = *(npy_double *)ip2;
        *((npy_double *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// jump target of the SIMD paths above; the scalar path falls through to it
clear_fp:
    npyv_cleanup();
#endif
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place update for npy_double using the current SCALAR_OP.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] and a value from args[2]; the element of args[0] selected by that
 * index (byte stride steps[0]) is replaced by SCALAR_OP(element, value).
 * Negative indices are wrapped once by adding steps[3]; no other bounds
 * checking happens here.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_maximum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];

    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count;
            ++k, idx_ptr += idx_stride, val_ptr += val_stride) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        // a negative index counts from the end
        pos = (pos < 0) ? pos + wrap : pos;

        npy_double *slot = (npy_double *)(base + base_stride * pos);
        const npy_double current = *slot;
        const npy_double incoming = *(const npy_double *)val_ptr;
        *slot = SCALAR_OP(current, incoming);
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_min_d

/*
 * Inner loop computing the element-wise minimum of two npy_double operands.
 *
 * args[0]/args[1] are the inputs and args[2] the output, steps[0..2] are
 * their strides in bytes, and dimensions[0] is the element count.  The
 * scalar paths apply SCALAR_OP, which is scalar_min_d for this instance
 * (the NaN-propagating variant per the macro comments at the top of the
 * file).
 *
 * The loop tries, in order:
 *   1. SIMD kernels, only when TO_SIMD_SFX is defined;
 *   2. manually unrolled scalar loops, unless NPY_DISABLE_OPTIMIZATION is
 *      defined;
 *   3. a plain scalar loop over whatever elements are left.
 * Before returning it calls npyv_cleanup() (SIMD builds only) and
 * npy_clear_floatstatus_barrier().
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_minimum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Count of elements already handled; the scalar unrolls advance it and
    // the tail loop resumes from it.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE expands to npyv_lanetype_<suffix> for the active SIMD suffix.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        // Only ip2 and op1 are handed to the kernel.  A reduce with any other
        // ip2 stride falls through to the scalar code below.
        if (is2 == sizeof(npy_double)) {
            TO_SIMD_SFX(simd_reduce_c_min)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_double, npy_double)) {
            TO_SIMD_SFX(simd_binary_ccc_min)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    #if !defined(NPY_HAVE_NEON) && 1
        // Strided SIMD path: byte strides are converted to element strides
        // and must pass the loadable/storable checks first.
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_min)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // Requires at least one full group of 8 elements; shorter inputs are
        // left entirely to the tail loop (i stays 0).
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first group.
            npy_double m0 = *((npy_double *)(ip2 + (i + 0) * is2));
            npy_double m1 = *((npy_double *)(ip2 + (i + 1) * is2));
            npy_double m2 = *((npy_double *)(ip2 + (i + 2) * is2));
            npy_double m3 = *((npy_double *)(ip2 + (i + 3) * is2));
            npy_double m4 = *((npy_double *)(ip2 + (i + 4) * is2));
            npy_double m5 = *((npy_double *)(ip2 + (i + 5) * is2));
            npy_double m6 = *((npy_double *)(ip2 + (i + 6) * is2));
            npy_double m7 = *((npy_double *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold every further full group of 8 into the accumulators.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_double v0 = *((npy_double *)(ip2 + (i + 0) * is2));
                npy_double v1 = *((npy_double *)(ip2 + (i + 1) * is2));
                npy_double v2 = *((npy_double *)(ip2 + (i + 2) * is2));
                npy_double v3 = *((npy_double *)(ip2 + (i + 3) * is2));
                npy_double v4 = *((npy_double *)(ip2 + (i + 4) * is2));
                npy_double v5 = *((npy_double *)(ip2 + (i + 5) * is2));
                npy_double v6 = *((npy_double *)(ip2 + (i + 6) * is2));
                npy_double v7 = *((npy_double *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight accumulators pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge with the value already stored in the output.
             *((npy_double *)op1) = SCALAR_OP(*((npy_double *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1));
            npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2));
            *((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1));
            npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2));
            *((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1));
            npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2));
            *((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1));
            npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2));
            *((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Skip past the i elements handled above, then finish one at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_double in1 = *(npy_double *)ip1;
        const npy_double in2 = *(npy_double *)ip2;
        *((npy_double *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Target of the gotos in the SIMD section; the scalar paths fall through here.
clear_fp:
    npyv_cleanup();
#endif
// The `#if 1` below is a template-expanded condition.
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place minimum for npy_double (SCALAR_OP is scalar_min_d here).
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] and a value from args[2]; the element of args[0] selected by that
 * index (byte stride steps[0]) is replaced by SCALAR_OP(element, value).
 * Negative indices are wrapped once by adding steps[3]; no other bounds
 * checking happens here.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_minimum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];

    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count;
            ++k, idx_ptr += idx_stride, val_ptr += val_stride) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        // a negative index counts from the end
        pos = (pos < 0) ? pos + wrap : pos;

        npy_double *slot = (npy_double *)(base + base_stride * pos);
        const npy_double current = *slot;
        const npy_double incoming = *(const npy_double *)val_ptr;
        *slot = SCALAR_OP(current, incoming);
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_maxp_d

/*
 * Inner loop computing the element-wise fmax of two npy_double operands.
 *
 * args[0]/args[1] are the inputs and args[2] the output, steps[0..2] are
 * their strides in bytes, and dimensions[0] is the element count.  The
 * scalar paths apply SCALAR_OP, which is scalar_maxp_d for this instance
 * (the NaN-ignoring variant per the macro comments at the top of the file).
 *
 * The loop tries, in order:
 *   1. SIMD kernels, only when TO_SIMD_SFX is defined;
 *   2. manually unrolled scalar loops, unless NPY_DISABLE_OPTIMIZATION is
 *      defined;
 *   3. a plain scalar loop over whatever elements are left.
 * Before returning it calls npyv_cleanup() (SIMD builds only) and
 * npy_clear_floatstatus_barrier().
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_fmax)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Count of elements already handled; the scalar unrolls advance it and
    // the tail loop resumes from it.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE expands to npyv_lanetype_<suffix> for the active SIMD suffix.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        // Only ip2 and op1 are handed to the kernel.  A reduce with any other
        // ip2 stride falls through to the scalar code below.
        if (is2 == sizeof(npy_double)) {
            TO_SIMD_SFX(simd_reduce_c_maxp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_double, npy_double)) {
            TO_SIMD_SFX(simd_binary_ccc_maxp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    #if !defined(NPY_HAVE_NEON) && 1
        // Strided SIMD path: byte strides are converted to element strides
        // and must pass the loadable/storable checks first.
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_maxp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // Requires at least one full group of 8 elements; shorter inputs are
        // left entirely to the tail loop (i stays 0).
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first group.
            npy_double m0 = *((npy_double *)(ip2 + (i + 0) * is2));
            npy_double m1 = *((npy_double *)(ip2 + (i + 1) * is2));
            npy_double m2 = *((npy_double *)(ip2 + (i + 2) * is2));
            npy_double m3 = *((npy_double *)(ip2 + (i + 3) * is2));
            npy_double m4 = *((npy_double *)(ip2 + (i + 4) * is2));
            npy_double m5 = *((npy_double *)(ip2 + (i + 5) * is2));
            npy_double m6 = *((npy_double *)(ip2 + (i + 6) * is2));
            npy_double m7 = *((npy_double *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold every further full group of 8 into the accumulators.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_double v0 = *((npy_double *)(ip2 + (i + 0) * is2));
                npy_double v1 = *((npy_double *)(ip2 + (i + 1) * is2));
                npy_double v2 = *((npy_double *)(ip2 + (i + 2) * is2));
                npy_double v3 = *((npy_double *)(ip2 + (i + 3) * is2));
                npy_double v4 = *((npy_double *)(ip2 + (i + 4) * is2));
                npy_double v5 = *((npy_double *)(ip2 + (i + 5) * is2));
                npy_double v6 = *((npy_double *)(ip2 + (i + 6) * is2));
                npy_double v7 = *((npy_double *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight accumulators pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge with the value already stored in the output.
             *((npy_double *)op1) = SCALAR_OP(*((npy_double *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1));
            npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2));
            *((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1));
            npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2));
            *((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1));
            npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2));
            *((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1));
            npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2));
            *((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Skip past the i elements handled above, then finish one at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_double in1 = *(npy_double *)ip1;
        const npy_double in2 = *(npy_double *)ip2;
        *((npy_double *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Target of the gotos in the SIMD section; the scalar paths fall through here.
clear_fp:
    npyv_cleanup();
#endif
// The `#if 1` below is a template-expanded condition.
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place fmax for npy_double (SCALAR_OP is scalar_maxp_d here).
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] and a value from args[2]; the element of args[0] selected by that
 * index (byte stride steps[0]) is replaced by SCALAR_OP(element, value).
 * Negative indices are wrapped once by adding steps[3]; no other bounds
 * checking happens here.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmax_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];

    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count;
            ++k, idx_ptr += idx_stride, val_ptr += val_stride) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        // a negative index counts from the end
        pos = (pos < 0) ? pos + wrap : pos;

        npy_double *slot = (npy_double *)(base + base_stride * pos);
        const npy_double current = *slot;
        const npy_double incoming = *(const npy_double *)val_ptr;
        *slot = SCALAR_OP(current, incoming);
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_minp_d

/*
 * Inner loop computing the element-wise fmin of two npy_double operands.
 *
 * args[0]/args[1] are the inputs and args[2] the output, steps[0..2] are
 * their strides in bytes, and dimensions[0] is the element count.  The
 * scalar paths apply SCALAR_OP, which is scalar_minp_d for this instance
 * (the NaN-ignoring variant per the macro comments at the top of the file).
 *
 * The loop tries, in order:
 *   1. SIMD kernels, only when TO_SIMD_SFX is defined;
 *   2. manually unrolled scalar loops, unless NPY_DISABLE_OPTIMIZATION is
 *      defined;
 *   3. a plain scalar loop over whatever elements are left.
 * Before returning it calls npyv_cleanup() (SIMD builds only) and
 * npy_clear_floatstatus_barrier().
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Count of elements already handled; the scalar unrolls advance it and
    // the tail loop resumes from it.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE expands to npyv_lanetype_<suffix> for the active SIMD suffix.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        // Only ip2 and op1 are handed to the kernel.  A reduce with any other
        // ip2 stride falls through to the scalar code below.
        if (is2 == sizeof(npy_double)) {
            TO_SIMD_SFX(simd_reduce_c_minp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_double, npy_double)) {
            TO_SIMD_SFX(simd_binary_ccc_minp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    #if !defined(NPY_HAVE_NEON) && 1
        // Strided SIMD path: byte strides are converted to element strides
        // and must pass the loadable/storable checks first.
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_minp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // Requires at least one full group of 8 elements; shorter inputs are
        // left entirely to the tail loop (i stays 0).
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first group.
            npy_double m0 = *((npy_double *)(ip2 + (i + 0) * is2));
            npy_double m1 = *((npy_double *)(ip2 + (i + 1) * is2));
            npy_double m2 = *((npy_double *)(ip2 + (i + 2) * is2));
            npy_double m3 = *((npy_double *)(ip2 + (i + 3) * is2));
            npy_double m4 = *((npy_double *)(ip2 + (i + 4) * is2));
            npy_double m5 = *((npy_double *)(ip2 + (i + 5) * is2));
            npy_double m6 = *((npy_double *)(ip2 + (i + 6) * is2));
            npy_double m7 = *((npy_double *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold every further full group of 8 into the accumulators.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_double v0 = *((npy_double *)(ip2 + (i + 0) * is2));
                npy_double v1 = *((npy_double *)(ip2 + (i + 1) * is2));
                npy_double v2 = *((npy_double *)(ip2 + (i + 2) * is2));
                npy_double v3 = *((npy_double *)(ip2 + (i + 3) * is2));
                npy_double v4 = *((npy_double *)(ip2 + (i + 4) * is2));
                npy_double v5 = *((npy_double *)(ip2 + (i + 5) * is2));
                npy_double v6 = *((npy_double *)(ip2 + (i + 6) * is2));
                npy_double v7 = *((npy_double *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight accumulators pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge with the value already stored in the output.
             *((npy_double *)op1) = SCALAR_OP(*((npy_double *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1));
            npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2));
            *((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1));
            npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2));
            *((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1));
            npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2));
            *((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1));
            npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2));
            *((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Skip past the i elements handled above, then finish one at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_double in1 = *(npy_double *)ip1;
        const npy_double in2 = *(npy_double *)ip2;
        *((npy_double *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Target of the gotos in the SIMD section; the scalar paths fall through here.
clear_fp:
    npyv_cleanup();
#endif
// The `#if 1` below is a template-expanded condition.
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place fmin for npy_double (SCALAR_OP is scalar_minp_d here).
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] and a value from args[2]; the element of args[0] selected by that
 * index (byte stride steps[0]) is replaced by SCALAR_OP(element, value).
 * Negative indices are wrapped once by adding steps[3]; no other bounds
 * checking happens here.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];

    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count;
            ++k, idx_ptr += idx_stride, val_ptr += val_stride) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        // a negative index counts from the end
        pos = (pos < 0) ? pos + wrap : pos;

        npy_double *slot = (npy_double *)(base + base_stride * pos);
        const npy_double current = *slot;
        const npy_double incoming = *(const npy_double *)val_ptr;
        *slot = SCALAR_OP(current, incoming);
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)


#line 294
/*
 * Select the SIMD suffix macro TO_SIMD_SFX for the npy_longdouble loops.
 *
 * The chain below is a template expansion over lane widths 8/16/32/64; the
 * branch whose width equals NPY_BITSOF_LONGDOUBLE (and only when NPY_SIMD is
 * non-zero) defines TO_SIMD_SFX(X) as X##_f<width>.  Inside each branch the
 * macro is undefined again when the width is 32 or 64 and the corresponding
 * NPY_SIMD_F32 / NPY_SIMD_F64 flag is zero.  The `#if 1 / #elif 0 / #else`
 * ladders are expanded template conditions; only the `_f` arm is active.
 *
 * If no branch matches, TO_SIMD_SFX stays undefined and the loops that follow
 * skip their `#ifdef TO_SIMD_SFX` sections.
 * NOTE(review): the 8/16/32-bit branches presumably never match an actual
 * long double width and exist only because of the template's size loop.
 */
#undef TO_SIMD_SFX
#if 0
#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 8
    #if 1
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_LONGDOUBLE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGDOUBLE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 16
    #if 1
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_LONGDOUBLE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGDOUBLE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 32
    #if 1
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_LONGDOUBLE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGDOUBLE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 64
    #if 1
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_LONGDOUBLE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGDOUBLE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_max_l

/*
 * Inner loop computing the element-wise maximum of two npy_longdouble
 * operands.
 *
 * args[0]/args[1] are the inputs and args[2] the output, steps[0..2] are
 * their strides in bytes, and dimensions[0] is the element count.  The
 * scalar paths apply SCALAR_OP, which is scalar_max_l for this instance
 * (the NaN-propagating variant per the macro comments at the top of the
 * file).
 *
 * The loop tries, in order:
 *   1. SIMD kernels, only when TO_SIMD_SFX is defined;
 *   2. manually unrolled scalar loops, unless NPY_DISABLE_OPTIMIZATION is
 *      defined;
 *   3. a plain scalar loop over whatever elements are left.
 * Before returning it calls npyv_cleanup() (SIMD builds only) and
 * npy_clear_floatstatus_barrier().
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_maximum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // Count of elements already handled; the scalar unrolls advance it and
    // the tail loop resumes from it.
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // STYPE expands to npyv_lanetype_<suffix> for the active SIMD suffix.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        // Only ip2 and op1 are handed to the kernel.  A reduce with any other
        // ip2 stride falls through to the scalar code below.
        if (is2 == sizeof(npy_longdouble)) {
            TO_SIMD_SFX(simd_reduce_c_max)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_longdouble, npy_longdouble)) {
            TO_SIMD_SFX(simd_binary_ccc_max)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // unroll scalars faster than non-contiguous vector load/store on Arm
    #if !defined(NPY_HAVE_NEON) && 1
        // Strided SIMD path: byte strides are converted to element strides
        // and must pass the loadable/storable checks first.
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_max)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // Requires at least one full group of 8 elements; shorter inputs are
        // left entirely to the tail loop (i stays 0).
        if((i+elemPerLoop) <= len){
            // Seed eight independent accumulators from the first group.
            npy_longdouble m0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
            npy_longdouble m1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
            npy_longdouble m2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
            npy_longdouble m3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
            npy_longdouble m4 = *((npy_longdouble *)(ip2 + (i + 4) * is2));
            npy_longdouble m5 = *((npy_longdouble *)(ip2 + (i + 5) * is2));
            npy_longdouble m6 = *((npy_longdouble *)(ip2 + (i + 6) * is2));
            npy_longdouble m7 = *((npy_longdouble *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Fold every further full group of 8 into the accumulators.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_longdouble v0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
                npy_longdouble v1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
                npy_longdouble v2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
                npy_longdouble v3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
                npy_longdouble v4 = *((npy_longdouble *)(ip2 + (i + 4) * is2));
                npy_longdouble v5 = *((npy_longdouble *)(ip2 + (i + 5) * is2));
                npy_longdouble v6 = *((npy_longdouble *)(ip2 + (i + 6) * is2));
                npy_longdouble v7 = *((npy_longdouble *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Combine the eight accumulators pairwise down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Merge with the value already stored in the output.
             *((npy_longdouble *)op1) = SCALAR_OP(*((npy_longdouble *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1));
            npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
            *((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1));
            npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
            *((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1));
            npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
            *((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1));
            npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
            *((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION
    // Skip past the i elements handled above, then finish one at a time.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_longdouble in1 = *(npy_longdouble *)ip1;
        const npy_longdouble in2 = *(npy_longdouble *)ip2;
        *((npy_longdouble *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// Target of the gotos in the SIMD section; the scalar paths fall through here.
clear_fp:
    npyv_cleanup();
#endif
// The `#if 1` below is a template-expanded condition.
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place maximum for npy_longdouble (SCALAR_OP is scalar_max_l
 * here).
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from
 * args[1] and a value from args[2]; the element of args[0] selected by that
 * index (byte stride steps[0]) is replaced by SCALAR_OP(element, value).
 * Negative indices are wrapped once by adding steps[3]; no other bounds
 * checking happens here.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_maximum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];

    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count;
            ++k, idx_ptr += idx_stride, val_ptr += val_stride) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        // a negative index counts from the end
        pos = (pos < 0) ? pos + wrap : pos;

        npy_longdouble *slot = (npy_longdouble *)(base + base_stride * pos);
        const npy_longdouble current = *slot;
        const npy_longdouble incoming = *(const npy_longdouble *)val_ptr;
        *slot = SCALAR_OP(current, incoming);
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_min_l

/*
 * Inner loop of the long double `minimum` ufunc:
 *     out[k] = SCALAR_OP(in1[k], in2[k])
 * SCALAR_OP is defined as scalar_min_l immediately above this function (the
 * NaN-propagating variant according to the definitions at the top of the
 * file).
 *
 *   args       - args[0] and args[1] are the input byte pointers, args[2] is
 *                the output byte pointer
 *   dimensions - dimensions[0] is the number of elements to process
 *   steps      - byte strides for args[0], args[1] and args[2]
 *
 * Code paths, tried in this order:
 *   1. SIMD kernels, compiled only when TO_SIMD_SFX is defined.
 *   2. Manually unrolled scalar loops, compiled unless
 *      NPY_DISABLE_OPTIMIZATION is defined.
 *   3. A plain scalar loop that processes whatever elements remain.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_minimum)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already consumed; the tail loop resumes from here
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // NOTE(review): whether TO_SIMD_SFX is defined for long double is decided
    // outside this function -- confirm before relying on this branch.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_longdouble)) {
            TO_SIMD_SFX(simd_reduce_c_min)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_longdouble, npy_longdouble)) {
            TO_SIMD_SFX(simd_binary_ccc_min)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // Strided vector path. Skipped on NEON builds because unrolled scalars
    // are faster than non-contiguous vector load/store on Arm.
    #if !defined(NPY_HAVE_NEON) && 1
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_min)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // With fewer than 8 elements nothing happens here (i stays 0) and the
        // tail loop below does all the work.
        if((i+elemPerLoop) <= len){
            // Eight independent running results, seeded from the first eight
            // elements of the second operand.
            npy_longdouble m0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
            npy_longdouble m1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
            npy_longdouble m2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
            npy_longdouble m3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
            npy_longdouble m4 = *((npy_longdouble *)(ip2 + (i + 4) * is2));
            npy_longdouble m5 = *((npy_longdouble *)(ip2 + (i + 5) * is2));
            npy_longdouble m6 = *((npy_longdouble *)(ip2 + (i + 6) * is2));
            npy_longdouble m7 = *((npy_longdouble *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Consume further full groups of eight, one element per lane.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_longdouble v0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
                npy_longdouble v1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
                npy_longdouble v2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
                npy_longdouble v3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
                npy_longdouble v4 = *((npy_longdouble *)(ip2 + (i + 4) * is2));
                npy_longdouble v5 = *((npy_longdouble *)(ip2 + (i + 5) * is2));
                npy_longdouble v6 = *((npy_longdouble *)(ip2 + (i + 6) * is2));
                npy_longdouble v7 = *((npy_longdouble *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Pairwise combine the eight partial results down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Fold the combined result into the value already stored at op1.
             *((npy_longdouble *)op1) = SCALAR_OP(*((npy_longdouble *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1));
            npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
            *((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1));
            npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
            *((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1));
            npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
            *((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1));
            npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
            *((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // Skip past the i elements handled by the unrolled code above.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // Tail loop: one element per iteration for everything that is left.
    // NOTE(review): in the reduce case this presumably relies on ip1 and op1
    // both addressing the accumulator -- confirm against IS_BINARY_REDUCE.
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_longdouble in1 = *(npy_longdouble *)ip1;
        const npy_longdouble in2 = *(npy_longdouble *)ip2;
        *((npy_longdouble *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// The label exists only in SIMD builds; the scalar paths fall through to it.
clear_fp:
    npyv_cleanup();
#endif
#if 1
    // Clear the floating-point status flags before returning.
    // NOTE(review): presumably so comparisons involving NaN do not surface as
    // FP exceptions to the caller -- confirm.
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place variant of the long double `minimum` loop.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from the
 * stream at args[1] and a long double from the stream at args[2]; the element
 * at args[0] + steps[0] * index is replaced by SCALAR_OP(element, value).
 *
 *   steps[0] - byte stride of the target array
 *   steps[1] - byte stride of the index stream
 *   steps[2] - byte stride of the value stream
 *   steps[3] - added to negative indices (presumably the length of the
 *              indexed axis -- confirm with the caller)
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_minimum_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k = 0;

    while (k < count) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        npy_longdouble *slot;
        npy_longdouble current;
        npy_longdouble incoming;

        // negative indices are offset by steps[3]
        pos = (pos < 0) ? (pos + wrap) : pos;

        slot = (npy_longdouble *)(base + base_stride * pos);
        current = *slot;
        incoming = *(npy_longdouble *)val_cursor;
        *slot = SCALAR_OP(current, incoming);

        k++;
        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_maxp_l

/*
 * Inner loop of the long double `fmax` ufunc:
 *     out[k] = SCALAR_OP(in1[k], in2[k])
 * SCALAR_OP is defined as scalar_maxp_l immediately above this function
 * (mapped to fmaxl, the NaN-ignoring variant, by the definitions at the top
 * of the file).
 *
 *   args       - args[0] and args[1] are the input byte pointers, args[2] is
 *                the output byte pointer
 *   dimensions - dimensions[0] is the number of elements to process
 *   steps      - byte strides for args[0], args[1] and args[2]
 *
 * Code paths, tried in this order:
 *   1. SIMD kernels, compiled only when TO_SIMD_SFX is defined.
 *   2. Manually unrolled scalar loops, compiled unless
 *      NPY_DISABLE_OPTIMIZATION is defined.
 *   3. A plain scalar loop that processes whatever elements remain.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_fmax)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already consumed; the tail loop resumes from here
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // NOTE(review): whether TO_SIMD_SFX is defined for long double is decided
    // outside this function -- confirm before relying on this branch.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_longdouble)) {
            TO_SIMD_SFX(simd_reduce_c_maxp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_longdouble, npy_longdouble)) {
            TO_SIMD_SFX(simd_binary_ccc_maxp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // Strided vector path. Skipped on NEON builds because unrolled scalars
    // are faster than non-contiguous vector load/store on Arm.
    #if !defined(NPY_HAVE_NEON) && 1
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_maxp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // With fewer than 8 elements nothing happens here (i stays 0) and the
        // tail loop below does all the work.
        if((i+elemPerLoop) <= len){
            // Eight independent running results, seeded from the first eight
            // elements of the second operand.
            npy_longdouble m0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
            npy_longdouble m1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
            npy_longdouble m2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
            npy_longdouble m3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
            npy_longdouble m4 = *((npy_longdouble *)(ip2 + (i + 4) * is2));
            npy_longdouble m5 = *((npy_longdouble *)(ip2 + (i + 5) * is2));
            npy_longdouble m6 = *((npy_longdouble *)(ip2 + (i + 6) * is2));
            npy_longdouble m7 = *((npy_longdouble *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Consume further full groups of eight, one element per lane.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_longdouble v0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
                npy_longdouble v1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
                npy_longdouble v2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
                npy_longdouble v3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
                npy_longdouble v4 = *((npy_longdouble *)(ip2 + (i + 4) * is2));
                npy_longdouble v5 = *((npy_longdouble *)(ip2 + (i + 5) * is2));
                npy_longdouble v6 = *((npy_longdouble *)(ip2 + (i + 6) * is2));
                npy_longdouble v7 = *((npy_longdouble *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Pairwise combine the eight partial results down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Fold the combined result into the value already stored at op1.
             *((npy_longdouble *)op1) = SCALAR_OP(*((npy_longdouble *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1));
            npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
            *((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1));
            npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
            *((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1));
            npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
            *((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1));
            npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
            *((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // Skip past the i elements handled by the unrolled code above.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // Tail loop: one element per iteration for everything that is left.
    // NOTE(review): in the reduce case this presumably relies on ip1 and op1
    // both addressing the accumulator -- confirm against IS_BINARY_REDUCE.
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_longdouble in1 = *(npy_longdouble *)ip1;
        const npy_longdouble in2 = *(npy_longdouble *)ip2;
        *((npy_longdouble *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// The label exists only in SIMD builds; the scalar paths fall through to it.
clear_fp:
    npyv_cleanup();
#endif
#if 1
    // Clear the floating-point status flags before returning.
    // NOTE(review): presumably so comparisons involving NaN do not surface as
    // FP exceptions to the caller -- confirm.
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place variant of the long double `fmax` loop.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from the
 * stream at args[1] and a long double from the stream at args[2]; the element
 * at args[0] + steps[0] * index is replaced by SCALAR_OP(element, value).
 *
 *   steps[0] - byte stride of the target array
 *   steps[1] - byte stride of the index stream
 *   steps[2] - byte stride of the value stream
 *   steps[3] - added to negative indices (presumably the length of the
 *              indexed axis -- confirm with the caller)
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_fmax_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k = 0;

    while (k < count) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        npy_longdouble *slot;
        npy_longdouble current;
        npy_longdouble incoming;

        // negative indices are offset by steps[3]
        pos = (pos < 0) ? (pos + wrap) : pos;

        slot = (npy_longdouble *)(base + base_stride * pos);
        current = *slot;
        incoming = *(npy_longdouble *)val_cursor;
        *slot = SCALAR_OP(current, incoming);

        k++;
        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)

#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_minp_l

/*
 * Inner loop of the long double `fmin` ufunc:
 *     out[k] = SCALAR_OP(in1[k], in2[k])
 * SCALAR_OP is defined as scalar_minp_l immediately above this function
 * (mapped to fminl, the NaN-ignoring variant, by the definitions at the top
 * of the file).
 *
 *   args       - args[0] and args[1] are the input byte pointers, args[2] is
 *                the output byte pointer
 *   dimensions - dimensions[0] is the number of elements to process
 *   steps      - byte strides for args[0], args[1] and args[2]
 *
 * Code paths, tried in this order:
 *   1. SIMD kernels, compiled only when TO_SIMD_SFX is defined.
 *   2. Manually unrolled scalar loops, compiled unless
 *      NPY_DISABLE_OPTIMIZATION is defined.
 *   3. A plain scalar loop that processes whatever elements remain.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_fmin)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    // number of elements already consumed; the tail loop resumes from here
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    // NOTE(review): whether TO_SIMD_SFX is defined for long double is decided
    // outside this function -- confirm before relying on this branch.
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(npy_longdouble)) {
            TO_SIMD_SFX(simd_reduce_c_minp)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
        !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(npy_longdouble, npy_longdouble)) {
            TO_SIMD_SFX(simd_binary_ccc_minp)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    // Strided vector path. Skipped on NEON builds because unrolled scalars
    // are faster than non-contiguous vector load/store on Arm.
    #if !defined(NPY_HAVE_NEON) && 1
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_minp)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        // With fewer than 8 elements nothing happens here (i stays 0) and the
        // tail loop below does all the work.
        if((i+elemPerLoop) <= len){
            // Eight independent running results, seeded from the first eight
            // elements of the second operand.
            npy_longdouble m0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
            npy_longdouble m1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
            npy_longdouble m2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
            npy_longdouble m3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
            npy_longdouble m4 = *((npy_longdouble *)(ip2 + (i + 4) * is2));
            npy_longdouble m5 = *((npy_longdouble *)(ip2 + (i + 5) * is2));
            npy_longdouble m6 = *((npy_longdouble *)(ip2 + (i + 6) * is2));
            npy_longdouble m7 = *((npy_longdouble *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            // Consume further full groups of eight, one element per lane.
            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
                npy_longdouble v0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
                npy_longdouble v1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
                npy_longdouble v2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
                npy_longdouble v3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
                npy_longdouble v4 = *((npy_longdouble *)(ip2 + (i + 4) * is2));
                npy_longdouble v5 = *((npy_longdouble *)(ip2 + (i + 5) * is2));
                npy_longdouble v6 = *((npy_longdouble *)(ip2 + (i + 6) * is2));
                npy_longdouble v7 = *((npy_longdouble *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            // Pairwise combine the eight partial results down to m0.
            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            // Fold the combined result into the value already stored at op1.
             *((npy_longdouble *)op1) = SCALAR_OP(*((npy_longdouble *)op1), m0);
        }
    } else{
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which makes the
             * assumption that previous iterations have finished before next
             * iteration.  For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */

            #line 431
            npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1));
            npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
            *((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
            
#line 431
            npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1));
            npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
            *((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
            
#line 431
            npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1));
            npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
            *((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
            
#line 431
            npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1));
            npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
            *((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
            
        }
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    // Skip past the i elements handled by the unrolled code above.
    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    // Tail loop: one element per iteration for everything that is left.
    // NOTE(review): in the reduce case this presumably relies on ip1 and op1
    // both addressing the accumulator -- confirm against IS_BINARY_REDUCE.
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const npy_longdouble in1 = *(npy_longdouble *)ip1;
        const npy_longdouble in2 = *(npy_longdouble *)ip2;
        *((npy_longdouble *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
// The label exists only in SIMD builds; the scalar paths fall through to it.
clear_fp:
    npyv_cleanup();
#endif
#if 1
    // Clear the floating-point status flags before returning.
    // NOTE(review): presumably so comparisons involving NaN do not surface as
    // FP exceptions to the caller -- confirm.
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


/*
 * Indexed in-place variant of the long double `fmin` loop.
 *
 * For each of the dimensions[0] entries, an npy_intp index is read from the
 * stream at args[1] and a long double from the stream at args[2]; the element
 * at args[0] + steps[0] * index is replaced by SCALAR_OP(element, value).
 *
 *   steps[0] - byte stride of the target array
 *   steps[1] - byte stride of the index stream
 *   steps[2] - byte stride of the value stream
 *   steps[3] - added to negative indices (presumably the length of the
 *              indexed axis -- confirm with the caller)
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_fmin_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    const npy_intp count = dimensions[0];
    npy_intp k = 0;

    while (k < count) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        npy_longdouble *slot;
        npy_longdouble current;
        npy_longdouble incoming;

        // negative indices are offset by steps[3]
        pos = (pos < 0) ? (pos + wrap) : pos;

        slot = (npy_longdouble *)(base + base_stride * pos);
        current = *slot;
        incoming = *(npy_longdouble *)val_cursor;
        *slot = SCALAR_OP(current, incoming);

        k++;
        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)




