#line 1 "numpy/core/src/multiarray/argfunc.dispatch.c.src"

/*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 **       Changes should be made to the original source (.src) file         **
 *****************************************************************************
 */

#line 1
/* -*- c -*- */
/*@targets
 ** $maxopt baseline
 ** sse2 sse42 xop avx2 avx512_skx
 ** vsx2
 ** neon asimd
 ** vx vxe
 **/

#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "numpy/npy_math.h"

#include "arraytypes.h"

/* Smaller of two values; either argument may be evaluated twice, so avoid side effects. */
#define MIN(a, b) ((a) < (b) ? (a) : (b))

#if NPY_SIMD
#if NPY_SIMD > 512 || NPY_SIMD < 0
    #error "the following 8/16-bit argmax kernel isn't applicable for larger SIMD"
    // TODO: add a special loop for larger SIMD widths,
    // e.g. dropping the x4 unroll should be numerically safe up to 2048-bit SIMD,
    // or alternatively expand the indices to 32|64-bit vectors (slower).
#endif
#line 32
#line 37
/*
 * Index of the first occurrence of the maximum among the `len` elements at
 * `ip` (u8 lanes).
 *
 * Whole groups of four vectors are scanned with SIMD. Positions are kept in
 * u8 lanes as two parts: the offset inside a group and the ordinal of the
 * group inside the current block. A block spans at most NPY_MAX_UINT8 groups
 * so the ordinal fits in a lane; after each block the lanes are folded into
 * the scalar result. Elements past the last whole group go through a scalar
 * loop.
 *
 * `ip[0]` is read unconditionally, so `len` is presumably always >= 1.
 */
static inline npy_intp
simd_argmax_u8(npyv_lanetype_u8 *ip, npy_intp len)
{
    const int vstep = npyv_nlanes_u8;
    const int wstep = vstep*4;
    npyv_lanetype_u8 best = ip[0];
    npy_intp best_pos = 0;
    npy_intp i = 0;

    // lane offsets 0 .. wstep-1, split across the four vectors of a group
    npyv_lanetype_u8 offsets[npyv_nlanes_u8*4];
    for (int k = 0; k < wstep; ++k) {
        offsets[k] = k;
    }
    const npyv_u8 off_a = npyv_load_u8(offsets);
    const npyv_u8 off_b = npyv_load_u8(offsets + vstep);
    const npyv_u8 off_c = npyv_load_u8(offsets + vstep*2);
    const npyv_u8 off_d = npyv_load_u8(offsets + vstep*3);

    // largest number of elements a single block may cover
    const npy_intp block_cap = (NPY_MAX_UINT8*wstep) & -wstep;
    const npy_intp vec_len = len & -wstep;
    while (i < vec_len) {
        const npy_intp block_start = i;
        const npy_intp block_end = i + MIN(vec_len - i, block_cap);
        npyv_u8 vbest = npyv_setall_u8(best);
        npyv_u8 vbest_off = npyv_zero_u8();
        npyv_u8 vbest_grp = npyv_zero_u8();

        for (npy_intp grp = 0; i < block_end; ++grp, i += wstep) {
            npyv_u8 vgrp = npyv_setall_u8((npyv_lanetype_u8)grp);
            npyv_u8 a = npyv_load_u8(ip + i);
            npyv_u8 b = npyv_load_u8(ip + i + vstep);
            npyv_u8 c = npyv_load_u8(ip + i + vstep*2);
            npyv_u8 d = npyv_load_u8(ip + i + vstep*3);

            // strict comparisons keep the earlier element on ties
            npyv_b8 take_b = npyv_cmpgt_u8(b, a);
            npyv_b8 take_d = npyv_cmpgt_u8(d, c);
            npyv_u8 top_ab = npyv_select_u8(take_b, b, a);
            npyv_u8 top_cd = npyv_select_u8(take_d, d, c);
            npyv_u8 off_ab = npyv_select_u8(take_b, off_b, off_a);
            npyv_u8 off_cd = npyv_select_u8(take_d, off_d, off_c);

            npyv_b8 take_cd = npyv_cmpgt_u8(top_cd, top_ab);
            npyv_u8 top = npyv_select_u8(take_cd, top_cd, top_ab);
            npyv_u8 off = npyv_select_u8(take_cd, off_cd, off_ab);

            // fold the group winner into the per-lane accumulators
            npyv_b8 improved = npyv_cmpgt_u8(top, vbest);
            vbest = npyv_select_u8(improved, top, vbest);
            vbest_off = npyv_select_u8(improved, off, vbest_off);
            vbest_grp = npyv_select_u8(improved, vgrp, vbest_grp);
        }

        // spill the accumulators and rebuild the absolute position of each lane
        npyv_lanetype_u8 lane_val[npyv_nlanes_u8];
        npyv_lanetype_u8 lane_off[npyv_nlanes_u8];
        npyv_lanetype_u8 lane_grp[npyv_nlanes_u8];
        npy_intp lane_pos[npyv_nlanes_u8];
        npyv_store_u8(lane_val, vbest);
        npyv_store_u8(lane_off, vbest_off);
        npyv_store_u8(lane_grp, vbest_grp);
        for (int k = 0; k < vstep; ++k) {
            lane_pos[k] = block_start + (npy_intp)lane_grp[k]*wstep + lane_off[k];
        }

        // first pass: adopt any lane that strictly beats the running result
        for (int k = 0; k < vstep; ++k) {
            if (lane_val[k] > best) {
                best = lane_val[k];
                best_pos = lane_pos[k];
            }
        }
        // second pass: among lanes tied with the result, keep the lowest position
        for (int k = 0; k < vstep; ++k) {
            if (lane_val[k] == best && lane_pos[k] < best_pos) {
                best_pos = lane_pos[k];
            }
        }
    }

    // scalar loop for the elements past the last whole group
    while (i < len) {
        npyv_lanetype_u8 cur = ip[i];
        if (cur > best) {
            best = cur;
            best_pos = i;
        }
        ++i;
    }
    return best_pos;
}

#line 37
/*
 * Index of the first occurrence of the minimum among the `len` elements at
 * `ip` (u8 lanes).
 *
 * Whole groups of four vectors are scanned with SIMD. Positions are kept in
 * u8 lanes as two parts: the offset inside a group and the ordinal of the
 * group inside the current block. A block spans at most NPY_MAX_UINT8 groups
 * so the ordinal fits in a lane; after each block the lanes are folded into
 * the scalar result. Elements past the last whole group go through a scalar
 * loop.
 *
 * `ip[0]` is read unconditionally, so `len` is presumably always >= 1.
 */
static inline npy_intp
simd_argmin_u8(npyv_lanetype_u8 *ip, npy_intp len)
{
    const int vstep = npyv_nlanes_u8;
    const int wstep = vstep*4;
    npyv_lanetype_u8 best = ip[0];
    npy_intp best_pos = 0;
    npy_intp i = 0;

    // lane offsets 0 .. wstep-1, split across the four vectors of a group
    npyv_lanetype_u8 offsets[npyv_nlanes_u8*4];
    for (int k = 0; k < wstep; ++k) {
        offsets[k] = k;
    }
    const npyv_u8 off_a = npyv_load_u8(offsets);
    const npyv_u8 off_b = npyv_load_u8(offsets + vstep);
    const npyv_u8 off_c = npyv_load_u8(offsets + vstep*2);
    const npyv_u8 off_d = npyv_load_u8(offsets + vstep*3);

    // largest number of elements a single block may cover
    const npy_intp block_cap = (NPY_MAX_UINT8*wstep) & -wstep;
    const npy_intp vec_len = len & -wstep;
    while (i < vec_len) {
        const npy_intp block_start = i;
        const npy_intp block_end = i + MIN(vec_len - i, block_cap);
        npyv_u8 vbest = npyv_setall_u8(best);
        npyv_u8 vbest_off = npyv_zero_u8();
        npyv_u8 vbest_grp = npyv_zero_u8();

        for (npy_intp grp = 0; i < block_end; ++grp, i += wstep) {
            npyv_u8 vgrp = npyv_setall_u8((npyv_lanetype_u8)grp);
            npyv_u8 a = npyv_load_u8(ip + i);
            npyv_u8 b = npyv_load_u8(ip + i + vstep);
            npyv_u8 c = npyv_load_u8(ip + i + vstep*2);
            npyv_u8 d = npyv_load_u8(ip + i + vstep*3);

            // strict comparisons keep the earlier element on ties
            npyv_b8 take_b = npyv_cmplt_u8(b, a);
            npyv_b8 take_d = npyv_cmplt_u8(d, c);
            npyv_u8 low_ab = npyv_select_u8(take_b, b, a);
            npyv_u8 low_cd = npyv_select_u8(take_d, d, c);
            npyv_u8 off_ab = npyv_select_u8(take_b, off_b, off_a);
            npyv_u8 off_cd = npyv_select_u8(take_d, off_d, off_c);

            npyv_b8 take_cd = npyv_cmplt_u8(low_cd, low_ab);
            npyv_u8 low = npyv_select_u8(take_cd, low_cd, low_ab);
            npyv_u8 off = npyv_select_u8(take_cd, off_cd, off_ab);

            // fold the group winner into the per-lane accumulators
            npyv_b8 improved = npyv_cmplt_u8(low, vbest);
            vbest = npyv_select_u8(improved, low, vbest);
            vbest_off = npyv_select_u8(improved, off, vbest_off);
            vbest_grp = npyv_select_u8(improved, vgrp, vbest_grp);
        }

        // spill the accumulators and rebuild the absolute position of each lane
        npyv_lanetype_u8 lane_val[npyv_nlanes_u8];
        npyv_lanetype_u8 lane_off[npyv_nlanes_u8];
        npyv_lanetype_u8 lane_grp[npyv_nlanes_u8];
        npy_intp lane_pos[npyv_nlanes_u8];
        npyv_store_u8(lane_val, vbest);
        npyv_store_u8(lane_off, vbest_off);
        npyv_store_u8(lane_grp, vbest_grp);
        for (int k = 0; k < vstep; ++k) {
            lane_pos[k] = block_start + (npy_intp)lane_grp[k]*wstep + lane_off[k];
        }

        // first pass: adopt any lane that is strictly below the running result
        for (int k = 0; k < vstep; ++k) {
            if (lane_val[k] < best) {
                best = lane_val[k];
                best_pos = lane_pos[k];
            }
        }
        // second pass: among lanes tied with the result, keep the lowest position
        for (int k = 0; k < vstep; ++k) {
            if (lane_val[k] == best && lane_pos[k] < best_pos) {
                best_pos = lane_pos[k];
            }
        }
    }

    // scalar loop for the elements past the last whole group
    while (i < len) {
        npyv_lanetype_u8 cur = ip[i];
        if (cur < best) {
            best = cur;
            best_pos = i;
        }
        ++i;
    }
    return best_pos;
}


#line 32
#line 37
/*
 * Index of the first occurrence of the maximum among the `len` elements at
 * `ip` (s8 lanes).
 *
 * Whole groups of four vectors are scanned with SIMD. Values are compared as
 * s8 while positions are kept in u8 lanes as two parts: the offset inside a
 * group and the ordinal of the group inside the current block. A block spans
 * at most NPY_MAX_UINT8 groups so the ordinal fits in a lane; after each
 * block the lanes are folded into the scalar result. Elements past the last
 * whole group go through a scalar loop.
 *
 * `ip[0]` is read unconditionally, so `len` is presumably always >= 1.
 */
static inline npy_intp
simd_argmax_s8(npyv_lanetype_s8 *ip, npy_intp len)
{
    const int vstep = npyv_nlanes_s8;
    const int wstep = vstep*4;
    npyv_lanetype_s8 best = ip[0];
    npy_intp best_pos = 0;
    npy_intp i = 0;

    // lane offsets 0 .. wstep-1, split across the four vectors of a group
    npyv_lanetype_u8 offsets[npyv_nlanes_s8*4];
    for (int k = 0; k < wstep; ++k) {
        offsets[k] = k;
    }
    const npyv_u8 off_a = npyv_load_u8(offsets);
    const npyv_u8 off_b = npyv_load_u8(offsets + vstep);
    const npyv_u8 off_c = npyv_load_u8(offsets + vstep*2);
    const npyv_u8 off_d = npyv_load_u8(offsets + vstep*3);

    // largest number of elements a single block may cover
    const npy_intp block_cap = (NPY_MAX_UINT8*wstep) & -wstep;
    const npy_intp vec_len = len & -wstep;
    while (i < vec_len) {
        const npy_intp block_start = i;
        const npy_intp block_end = i + MIN(vec_len - i, block_cap);
        npyv_s8 vbest = npyv_setall_s8(best);
        npyv_u8 vbest_off = npyv_zero_u8();
        npyv_u8 vbest_grp = npyv_zero_u8();

        for (npy_intp grp = 0; i < block_end; ++grp, i += wstep) {
            npyv_u8 vgrp = npyv_setall_u8((npyv_lanetype_u8)grp);
            npyv_s8 a = npyv_load_s8(ip + i);
            npyv_s8 b = npyv_load_s8(ip + i + vstep);
            npyv_s8 c = npyv_load_s8(ip + i + vstep*2);
            npyv_s8 d = npyv_load_s8(ip + i + vstep*3);

            // strict comparisons keep the earlier element on ties
            npyv_b8 take_b = npyv_cmpgt_s8(b, a);
            npyv_b8 take_d = npyv_cmpgt_s8(d, c);
            npyv_s8 top_ab = npyv_select_s8(take_b, b, a);
            npyv_s8 top_cd = npyv_select_s8(take_d, d, c);
            npyv_u8 off_ab = npyv_select_u8(take_b, off_b, off_a);
            npyv_u8 off_cd = npyv_select_u8(take_d, off_d, off_c);

            npyv_b8 take_cd = npyv_cmpgt_s8(top_cd, top_ab);
            npyv_s8 top = npyv_select_s8(take_cd, top_cd, top_ab);
            npyv_u8 off = npyv_select_u8(take_cd, off_cd, off_ab);

            // fold the group winner into the per-lane accumulators
            npyv_b8 improved = npyv_cmpgt_s8(top, vbest);
            vbest = npyv_select_s8(improved, top, vbest);
            vbest_off = npyv_select_u8(improved, off, vbest_off);
            vbest_grp = npyv_select_u8(improved, vgrp, vbest_grp);
        }

        // spill the accumulators and rebuild the absolute position of each lane
        npyv_lanetype_s8 lane_val[npyv_nlanes_s8];
        npyv_lanetype_u8 lane_off[npyv_nlanes_s8];
        npyv_lanetype_u8 lane_grp[npyv_nlanes_s8];
        npy_intp lane_pos[npyv_nlanes_s8];
        npyv_store_s8(lane_val, vbest);
        npyv_store_u8(lane_off, vbest_off);
        npyv_store_u8(lane_grp, vbest_grp);
        for (int k = 0; k < vstep; ++k) {
            lane_pos[k] = block_start + (npy_intp)lane_grp[k]*wstep + lane_off[k];
        }

        // first pass: adopt any lane that strictly beats the running result
        for (int k = 0; k < vstep; ++k) {
            if (lane_val[k] > best) {
                best = lane_val[k];
                best_pos = lane_pos[k];
            }
        }
        // second pass: among lanes tied with the result, keep the lowest position
        for (int k = 0; k < vstep; ++k) {
            if (lane_val[k] == best && lane_pos[k] < best_pos) {
                best_pos = lane_pos[k];
            }
        }
    }

    // scalar loop for the elements past the last whole group
    while (i < len) {
        npyv_lanetype_s8 cur = ip[i];
        if (cur > best) {
            best = cur;
            best_pos = i;
        }
        ++i;
    }
    return best_pos;
}

#line 37
/*
 * Index of the first occurrence of the minimum among the `len` elements at
 * `ip` (s8 lanes).
 *
 * Whole groups of four vectors are scanned with SIMD. Values are compared as
 * s8 while positions are kept in u8 lanes as two parts: the offset inside a
 * group and the ordinal of the group inside the current block. A block spans
 * at most NPY_MAX_UINT8 groups so the ordinal fits in a lane; after each
 * block the lanes are folded into the scalar result. Elements past the last
 * whole group go through a scalar loop.
 *
 * `ip[0]` is read unconditionally, so `len` is presumably always >= 1.
 */
static inline npy_intp
simd_argmin_s8(npyv_lanetype_s8 *ip, npy_intp len)
{
    const int vstep = npyv_nlanes_s8;
    const int wstep = vstep*4;
    npyv_lanetype_s8 best = ip[0];
    npy_intp best_pos = 0;
    npy_intp i = 0;

    // lane offsets 0 .. wstep-1, split across the four vectors of a group
    npyv_lanetype_u8 offsets[npyv_nlanes_s8*4];
    for (int k = 0; k < wstep; ++k) {
        offsets[k] = k;
    }
    const npyv_u8 off_a = npyv_load_u8(offsets);
    const npyv_u8 off_b = npyv_load_u8(offsets + vstep);
    const npyv_u8 off_c = npyv_load_u8(offsets + vstep*2);
    const npyv_u8 off_d = npyv_load_u8(offsets + vstep*3);

    // largest number of elements a single block may cover
    const npy_intp block_cap = (NPY_MAX_UINT8*wstep) & -wstep;
    const npy_intp vec_len = len & -wstep;
    while (i < vec_len) {
        const npy_intp block_start = i;
        const npy_intp block_end = i + MIN(vec_len - i, block_cap);
        npyv_s8 vbest = npyv_setall_s8(best);
        npyv_u8 vbest_off = npyv_zero_u8();
        npyv_u8 vbest_grp = npyv_zero_u8();

        for (npy_intp grp = 0; i < block_end; ++grp, i += wstep) {
            npyv_u8 vgrp = npyv_setall_u8((npyv_lanetype_u8)grp);
            npyv_s8 a = npyv_load_s8(ip + i);
            npyv_s8 b = npyv_load_s8(ip + i + vstep);
            npyv_s8 c = npyv_load_s8(ip + i + vstep*2);
            npyv_s8 d = npyv_load_s8(ip + i + vstep*3);

            // strict comparisons keep the earlier element on ties
            npyv_b8 take_b = npyv_cmplt_s8(b, a);
            npyv_b8 take_d = npyv_cmplt_s8(d, c);
            npyv_s8 low_ab = npyv_select_s8(take_b, b, a);
            npyv_s8 low_cd = npyv_select_s8(take_d, d, c);
            npyv_u8 off_ab = npyv_select_u8(take_b, off_b, off_a);
            npyv_u8 off_cd = npyv_select_u8(take_d, off_d, off_c);

            npyv_b8 take_cd = npyv_cmplt_s8(low_cd, low_ab);
            npyv_s8 low = npyv_select_s8(take_cd, low_cd, low_ab);
            npyv_u8 off = npyv_select_u8(take_cd, off_cd, off_ab);

            // fold the group winner into the per-lane accumulators
            npyv_b8 improved = npyv_cmplt_s8(low, vbest);
            vbest = npyv_select_s8(improved, low, vbest);
            vbest_off = npyv_select_u8(improved, off, vbest_off);
            vbest_grp = npyv_select_u8(improved, vgrp, vbest_grp);
        }

        // spill the accumulators and rebuild the absolute position of each lane
        npyv_lanetype_s8 lane_val[npyv_nlanes_s8];
        npyv_lanetype_u8 lane_off[npyv_nlanes_s8];
        npyv_lanetype_u8 lane_grp[npyv_nlanes_s8];
        npy_intp lane_pos[npyv_nlanes_s8];
        npyv_store_s8(lane_val, vbest);
        npyv_store_u8(lane_off, vbest_off);
        npyv_store_u8(lane_grp, vbest_grp);
        for (int k = 0; k < vstep; ++k) {
            lane_pos[k] = block_start + (npy_intp)lane_grp[k]*wstep + lane_off[k];
        }

        // first pass: adopt any lane that is strictly below the running result
        for (int k = 0; k < vstep; ++k) {
            if (lane_val[k] < best) {
                best = lane_val[k];
                best_pos = lane_pos[k];
            }
        }
        // second pass: among lanes tied with the result, keep the lowest position
        for (int k = 0; k < vstep; ++k) {
            if (lane_val[k] == best && lane_pos[k] < best_pos) {
                best_pos = lane_pos[k];
            }
        }
    }

    // scalar loop for the elements past the last whole group
    while (i < len) {
        npyv_lanetype_s8 cur = ip[i];
        if (cur < best) {
            best = cur;
            best_pos = i;
        }
        ++i;
    }
    return best_pos;
}


#line 32
#line 37
/*
 * Index of the first occurrence of the maximum among the `len` elements at
 * `ip` (u16 lanes).
 *
 * Whole groups of four vectors are scanned with SIMD. Positions are kept in
 * u16 lanes as two parts: the offset inside a group and the ordinal of the
 * group inside the current block. A block spans at most NPY_MAX_UINT16 groups
 * so the ordinal fits in a lane; after each block the lanes are folded into
 * the scalar result. Elements past the last whole group go through a scalar
 * loop.
 *
 * `ip[0]` is read unconditionally, so `len` is presumably always >= 1.
 */
static inline npy_intp
simd_argmax_u16(npyv_lanetype_u16 *ip, npy_intp len)
{
    const int vstep = npyv_nlanes_u16;
    const int wstep = vstep*4;
    npyv_lanetype_u16 best = ip[0];
    npy_intp best_pos = 0;
    npy_intp i = 0;

    // lane offsets 0 .. wstep-1, split across the four vectors of a group
    npyv_lanetype_u16 offsets[npyv_nlanes_u16*4];
    for (int k = 0; k < wstep; ++k) {
        offsets[k] = k;
    }
    const npyv_u16 off_a = npyv_load_u16(offsets);
    const npyv_u16 off_b = npyv_load_u16(offsets + vstep);
    const npyv_u16 off_c = npyv_load_u16(offsets + vstep*2);
    const npyv_u16 off_d = npyv_load_u16(offsets + vstep*3);

    // largest number of elements a single block may cover
    const npy_intp block_cap = (NPY_MAX_UINT16*wstep) & -wstep;
    const npy_intp vec_len = len & -wstep;
    while (i < vec_len) {
        const npy_intp block_start = i;
        const npy_intp block_end = i + MIN(vec_len - i, block_cap);
        npyv_u16 vbest = npyv_setall_u16(best);
        npyv_u16 vbest_off = npyv_zero_u16();
        npyv_u16 vbest_grp = npyv_zero_u16();

        for (npy_intp grp = 0; i < block_end; ++grp, i += wstep) {
            npyv_u16 vgrp = npyv_setall_u16((npyv_lanetype_u16)grp);
            npyv_u16 a = npyv_load_u16(ip + i);
            npyv_u16 b = npyv_load_u16(ip + i + vstep);
            npyv_u16 c = npyv_load_u16(ip + i + vstep*2);
            npyv_u16 d = npyv_load_u16(ip + i + vstep*3);

            // strict comparisons keep the earlier element on ties
            npyv_b16 take_b = npyv_cmpgt_u16(b, a);
            npyv_b16 take_d = npyv_cmpgt_u16(d, c);
            npyv_u16 top_ab = npyv_select_u16(take_b, b, a);
            npyv_u16 top_cd = npyv_select_u16(take_d, d, c);
            npyv_u16 off_ab = npyv_select_u16(take_b, off_b, off_a);
            npyv_u16 off_cd = npyv_select_u16(take_d, off_d, off_c);

            npyv_b16 take_cd = npyv_cmpgt_u16(top_cd, top_ab);
            npyv_u16 top = npyv_select_u16(take_cd, top_cd, top_ab);
            npyv_u16 off = npyv_select_u16(take_cd, off_cd, off_ab);

            // fold the group winner into the per-lane accumulators
            npyv_b16 improved = npyv_cmpgt_u16(top, vbest);
            vbest = npyv_select_u16(improved, top, vbest);
            vbest_off = npyv_select_u16(improved, off, vbest_off);
            vbest_grp = npyv_select_u16(improved, vgrp, vbest_grp);
        }

        // spill the accumulators and rebuild the absolute position of each lane
        npyv_lanetype_u16 lane_val[npyv_nlanes_u16];
        npyv_lanetype_u16 lane_off[npyv_nlanes_u16];
        npyv_lanetype_u16 lane_grp[npyv_nlanes_u16];
        npy_intp lane_pos[npyv_nlanes_u16];
        npyv_store_u16(lane_val, vbest);
        npyv_store_u16(lane_off, vbest_off);
        npyv_store_u16(lane_grp, vbest_grp);
        for (int k = 0; k < vstep; ++k) {
            lane_pos[k] = block_start + (npy_intp)lane_grp[k]*wstep + lane_off[k];
        }

        // first pass: adopt any lane that strictly beats the running result
        for (int k = 0; k < vstep; ++k) {
            if (lane_val[k] > best) {
                best = lane_val[k];
                best_pos = lane_pos[k];
            }
        }
        // second pass: among lanes tied with the result, keep the lowest position
        for (int k = 0; k < vstep; ++k) {
            if (lane_val[k] == best && lane_pos[k] < best_pos) {
                best_pos = lane_pos[k];
            }
        }
    }

    // scalar loop for the elements past the last whole group
    while (i < len) {
        npyv_lanetype_u16 cur = ip[i];
        if (cur > best) {
            best = cur;
            best_pos = i;
        }
        ++i;
    }
    return best_pos;
}

#line 37
/*
 * Index of the first occurrence of the minimum among the `len` elements at
 * `ip` (u16 lanes).
 *
 * Whole groups of four vectors are scanned with SIMD. Positions are kept in
 * u16 lanes as two parts: the offset inside a group and the ordinal of the
 * group inside the current block. A block spans at most NPY_MAX_UINT16 groups
 * so the ordinal fits in a lane; after each block the lanes are folded into
 * the scalar result. Elements past the last whole group go through a scalar
 * loop.
 *
 * `ip[0]` is read unconditionally, so `len` is presumably always >= 1.
 */
static inline npy_intp
simd_argmin_u16(npyv_lanetype_u16 *ip, npy_intp len)
{
    const int vstep = npyv_nlanes_u16;
    const int wstep = vstep*4;
    npyv_lanetype_u16 best = ip[0];
    npy_intp best_pos = 0;
    npy_intp i = 0;

    // lane offsets 0 .. wstep-1, split across the four vectors of a group
    npyv_lanetype_u16 offsets[npyv_nlanes_u16*4];
    for (int k = 0; k < wstep; ++k) {
        offsets[k] = k;
    }
    const npyv_u16 off_a = npyv_load_u16(offsets);
    const npyv_u16 off_b = npyv_load_u16(offsets + vstep);
    const npyv_u16 off_c = npyv_load_u16(offsets + vstep*2);
    const npyv_u16 off_d = npyv_load_u16(offsets + vstep*3);

    // largest number of elements a single block may cover
    const npy_intp block_cap = (NPY_MAX_UINT16*wstep) & -wstep;
    const npy_intp vec_len = len & -wstep;
    while (i < vec_len) {
        const npy_intp block_start = i;
        const npy_intp block_end = i + MIN(vec_len - i, block_cap);
        npyv_u16 vbest = npyv_setall_u16(best);
        npyv_u16 vbest_off = npyv_zero_u16();
        npyv_u16 vbest_grp = npyv_zero_u16();

        for (npy_intp grp = 0; i < block_end; ++grp, i += wstep) {
            npyv_u16 vgrp = npyv_setall_u16((npyv_lanetype_u16)grp);
            npyv_u16 a = npyv_load_u16(ip + i);
            npyv_u16 b = npyv_load_u16(ip + i + vstep);
            npyv_u16 c = npyv_load_u16(ip + i + vstep*2);
            npyv_u16 d = npyv_load_u16(ip + i + vstep*3);

            // strict comparisons keep the earlier element on ties
            npyv_b16 take_b = npyv_cmplt_u16(b, a);
            npyv_b16 take_d = npyv_cmplt_u16(d, c);
            npyv_u16 low_ab = npyv_select_u16(take_b, b, a);
            npyv_u16 low_cd = npyv_select_u16(take_d, d, c);
            npyv_u16 off_ab = npyv_select_u16(take_b, off_b, off_a);
            npyv_u16 off_cd = npyv_select_u16(take_d, off_d, off_c);

            npyv_b16 take_cd = npyv_cmplt_u16(low_cd, low_ab);
            npyv_u16 low = npyv_select_u16(take_cd, low_cd, low_ab);
            npyv_u16 off = npyv_select_u16(take_cd, off_cd, off_ab);

            // fold the group winner into the per-lane accumulators
            npyv_b16 improved = npyv_cmplt_u16(low, vbest);
            vbest = npyv_select_u16(improved, low, vbest);
            vbest_off = npyv_select_u16(improved, off, vbest_off);
            vbest_grp = npyv_select_u16(improved, vgrp, vbest_grp);
        }

        // spill the accumulators and rebuild the absolute position of each lane
        npyv_lanetype_u16 lane_val[npyv_nlanes_u16];
        npyv_lanetype_u16 lane_off[npyv_nlanes_u16];
        npyv_lanetype_u16 lane_grp[npyv_nlanes_u16];
        npy_intp lane_pos[npyv_nlanes_u16];
        npyv_store_u16(lane_val, vbest);
        npyv_store_u16(lane_off, vbest_off);
        npyv_store_u16(lane_grp, vbest_grp);
        for (int k = 0; k < vstep; ++k) {
            lane_pos[k] = block_start + (npy_intp)lane_grp[k]*wstep + lane_off[k];
        }

        // first pass: adopt any lane that is strictly below the running result
        for (int k = 0; k < vstep; ++k) {
            if (lane_val[k] < best) {
                best = lane_val[k];
                best_pos = lane_pos[k];
            }
        }
        // second pass: among lanes tied with the result, keep the lowest position
        for (int k = 0; k < vstep; ++k) {
            if (lane_val[k] == best && lane_pos[k] < best_pos) {
                best_pos = lane_pos[k];
            }
        }
    }

    // scalar loop for the elements past the last whole group
    while (i < len) {
        npyv_lanetype_u16 cur = ip[i];
        if (cur < best) {
            best = cur;
            best_pos = i;
        }
        ++i;
    }
    return best_pos;
}


#line 32
#line 37
/*
 * Index of the first occurrence of the maximum among the `len` elements at
 * `ip` (s16 lanes).
 *
 * Whole groups of four vectors are scanned with SIMD. Values are compared as
 * s16 while positions are kept in u16 lanes as two parts: the offset inside a
 * group and the ordinal of the group inside the current block. A block spans
 * at most NPY_MAX_UINT16 groups so the ordinal fits in a lane; after each
 * block the lanes are folded into the scalar result. Elements past the last
 * whole group go through a scalar loop.
 *
 * `ip[0]` is read unconditionally, so `len` is presumably always >= 1.
 */
static inline npy_intp
simd_argmax_s16(npyv_lanetype_s16 *ip, npy_intp len)
{
    const int vstep = npyv_nlanes_s16;
    const int wstep = vstep*4;
    npyv_lanetype_s16 best = ip[0];
    npy_intp best_pos = 0;
    npy_intp i = 0;

    // lane offsets 0 .. wstep-1, split across the four vectors of a group
    npyv_lanetype_u16 offsets[npyv_nlanes_s16*4];
    for (int k = 0; k < wstep; ++k) {
        offsets[k] = k;
    }
    const npyv_u16 off_a = npyv_load_u16(offsets);
    const npyv_u16 off_b = npyv_load_u16(offsets + vstep);
    const npyv_u16 off_c = npyv_load_u16(offsets + vstep*2);
    const npyv_u16 off_d = npyv_load_u16(offsets + vstep*3);

    // largest number of elements a single block may cover
    const npy_intp block_cap = (NPY_MAX_UINT16*wstep) & -wstep;
    const npy_intp vec_len = len & -wstep;
    while (i < vec_len) {
        const npy_intp block_start = i;
        const npy_intp block_end = i + MIN(vec_len - i, block_cap);
        npyv_s16 vbest = npyv_setall_s16(best);
        npyv_u16 vbest_off = npyv_zero_u16();
        npyv_u16 vbest_grp = npyv_zero_u16();

        for (npy_intp grp = 0; i < block_end; ++grp, i += wstep) {
            npyv_u16 vgrp = npyv_setall_u16((npyv_lanetype_u16)grp);
            npyv_s16 a = npyv_load_s16(ip + i);
            npyv_s16 b = npyv_load_s16(ip + i + vstep);
            npyv_s16 c = npyv_load_s16(ip + i + vstep*2);
            npyv_s16 d = npyv_load_s16(ip + i + vstep*3);

            // strict comparisons keep the earlier element on ties
            npyv_b16 take_b = npyv_cmpgt_s16(b, a);
            npyv_b16 take_d = npyv_cmpgt_s16(d, c);
            npyv_s16 top_ab = npyv_select_s16(take_b, b, a);
            npyv_s16 top_cd = npyv_select_s16(take_d, d, c);
            npyv_u16 off_ab = npyv_select_u16(take_b, off_b, off_a);
            npyv_u16 off_cd = npyv_select_u16(take_d, off_d, off_c);

            npyv_b16 take_cd = npyv_cmpgt_s16(top_cd, top_ab);
            npyv_s16 top = npyv_select_s16(take_cd, top_cd, top_ab);
            npyv_u16 off = npyv_select_u16(take_cd, off_cd, off_ab);

            // fold the group winner into the per-lane accumulators
            npyv_b16 improved = npyv_cmpgt_s16(top, vbest);
            vbest = npyv_select_s16(improved, top, vbest);
            vbest_off = npyv_select_u16(improved, off, vbest_off);
            vbest_grp = npyv_select_u16(improved, vgrp, vbest_grp);
        }

        // spill the accumulators and rebuild the absolute position of each lane
        npyv_lanetype_s16 lane_val[npyv_nlanes_s16];
        npyv_lanetype_u16 lane_off[npyv_nlanes_s16];
        npyv_lanetype_u16 lane_grp[npyv_nlanes_s16];
        npy_intp lane_pos[npyv_nlanes_s16];
        npyv_store_s16(lane_val, vbest);
        npyv_store_u16(lane_off, vbest_off);
        npyv_store_u16(lane_grp, vbest_grp);
        for (int k = 0; k < vstep; ++k) {
            lane_pos[k] = block_start + (npy_intp)lane_grp[k]*wstep + lane_off[k];
        }

        // first pass: adopt any lane that strictly beats the running result
        for (int k = 0; k < vstep; ++k) {
            if (lane_val[k] > best) {
                best = lane_val[k];
                best_pos = lane_pos[k];
            }
        }
        // second pass: among lanes tied with the result, keep the lowest position
        for (int k = 0; k < vstep; ++k) {
            if (lane_val[k] == best && lane_pos[k] < best_pos) {
                best_pos = lane_pos[k];
            }
        }
    }

    // scalar loop for the elements past the last whole group
    while (i < len) {
        npyv_lanetype_s16 cur = ip[i];
        if (cur > best) {
            best = cur;
            best_pos = i;
        }
        ++i;
    }
    return best_pos;
}

#line 37
/*
 * Index of the first occurrence of the minimum among the `len` elements at
 * `ip` (s16 lanes).
 *
 * Whole groups of four vectors are scanned with SIMD. Values are compared as
 * s16 while positions are kept in u16 lanes as two parts: the offset inside a
 * group and the ordinal of the group inside the current block. A block spans
 * at most NPY_MAX_UINT16 groups so the ordinal fits in a lane; after each
 * block the lanes are folded into the scalar result. Elements past the last
 * whole group go through a scalar loop.
 *
 * `ip[0]` is read unconditionally, so `len` is presumably always >= 1.
 */
static inline npy_intp
simd_argmin_s16(npyv_lanetype_s16 *ip, npy_intp len)
{
    const int vstep = npyv_nlanes_s16;
    const int wstep = vstep*4;
    npyv_lanetype_s16 best = ip[0];
    npy_intp best_pos = 0;
    npy_intp i = 0;

    // lane offsets 0 .. wstep-1, split across the four vectors of a group
    npyv_lanetype_u16 offsets[npyv_nlanes_s16*4];
    for (int k = 0; k < wstep; ++k) {
        offsets[k] = k;
    }
    const npyv_u16 off_a = npyv_load_u16(offsets);
    const npyv_u16 off_b = npyv_load_u16(offsets + vstep);
    const npyv_u16 off_c = npyv_load_u16(offsets + vstep*2);
    const npyv_u16 off_d = npyv_load_u16(offsets + vstep*3);

    // largest number of elements a single block may cover
    const npy_intp block_cap = (NPY_MAX_UINT16*wstep) & -wstep;
    const npy_intp vec_len = len & -wstep;
    while (i < vec_len) {
        const npy_intp block_start = i;
        const npy_intp block_end = i + MIN(vec_len - i, block_cap);
        npyv_s16 vbest = npyv_setall_s16(best);
        npyv_u16 vbest_off = npyv_zero_u16();
        npyv_u16 vbest_grp = npyv_zero_u16();

        for (npy_intp grp = 0; i < block_end; ++grp, i += wstep) {
            npyv_u16 vgrp = npyv_setall_u16((npyv_lanetype_u16)grp);
            npyv_s16 a = npyv_load_s16(ip + i);
            npyv_s16 b = npyv_load_s16(ip + i + vstep);
            npyv_s16 c = npyv_load_s16(ip + i + vstep*2);
            npyv_s16 d = npyv_load_s16(ip + i + vstep*3);

            // strict comparisons keep the earlier element on ties
            npyv_b16 take_b = npyv_cmplt_s16(b, a);
            npyv_b16 take_d = npyv_cmplt_s16(d, c);
            npyv_s16 low_ab = npyv_select_s16(take_b, b, a);
            npyv_s16 low_cd = npyv_select_s16(take_d, d, c);
            npyv_u16 off_ab = npyv_select_u16(take_b, off_b, off_a);
            npyv_u16 off_cd = npyv_select_u16(take_d, off_d, off_c);

            npyv_b16 take_cd = npyv_cmplt_s16(low_cd, low_ab);
            npyv_s16 low = npyv_select_s16(take_cd, low_cd, low_ab);
            npyv_u16 off = npyv_select_u16(take_cd, off_cd, off_ab);

            // fold the group winner into the per-lane accumulators
            npyv_b16 improved = npyv_cmplt_s16(low, vbest);
            vbest = npyv_select_s16(improved, low, vbest);
            vbest_off = npyv_select_u16(improved, off, vbest_off);
            vbest_grp = npyv_select_u16(improved, vgrp, vbest_grp);
        }

        // spill the accumulators and rebuild the absolute position of each lane
        npyv_lanetype_s16 lane_val[npyv_nlanes_s16];
        npyv_lanetype_u16 lane_off[npyv_nlanes_s16];
        npyv_lanetype_u16 lane_grp[npyv_nlanes_s16];
        npy_intp lane_pos[npyv_nlanes_s16];
        npyv_store_s16(lane_val, vbest);
        npyv_store_u16(lane_off, vbest_off);
        npyv_store_u16(lane_grp, vbest_grp);
        for (int k = 0; k < vstep; ++k) {
            lane_pos[k] = block_start + (npy_intp)lane_grp[k]*wstep + lane_off[k];
        }

        // first pass: adopt any lane that is strictly below the running result
        for (int k = 0; k < vstep; ++k) {
            if (lane_val[k] < best) {
                best = lane_val[k];
                best_pos = lane_pos[k];
            }
        }
        // second pass: among lanes tied with the result, keep the lowest position
        for (int k = 0; k < vstep; ++k) {
            if (lane_val[k] == best && lane_pos[k] < best_pos) {
                best_pos = lane_pos[k];
            }
        }
    }

    // scalar loop for the elements past the last whole group
    while (i < len) {
        npyv_lanetype_s16 cur = ip[i];
        if (cur < best) {
            best = cur;
            best_pos = i;
        }
        ++i;
    }
    return best_pos;
}


#endif

#line 129
#if NPY_SIMD
#line 136
/*
 * Index of the first occurrence of the maximum among the `len` elements at
 * `ip` (u32 lanes).
 *
 * Inputs shorter than four vectors are handled by the scalar loop alone.
 * Otherwise the data is scanned four vectors at a time, then one vector at a
 * time, with absolute positions carried in u32 lanes; the lanes are then
 * folded into a scalar result and the scalar loop finishes whatever is left.
 * The vectorised span is capped at NPY_MAX_UINT32 elements so the in-lane
 * position arithmetic cannot wrap.
 *
 * `ip[0]` is read unconditionally, so `len` is presumably always >= 1.
 */
static inline npy_intp
simd_argmax_u32(npyv_lanetype_u32 *ip, npy_intp len)
{
    const int vstep = npyv_nlanes_u32;
    const int wstep = vstep*4;
    npyv_lanetype_u32 best = ip[0];
    npy_intp best_pos = 0;
    npy_intp i = 0;

    // small arrays are faster through the scalar loop, so skip the vector path
    if (len >= wstep) {
        // positions are 32-bit in the vector path: limit how far it may go and
        // leave anything beyond that to the scalar loop
        npy_intp vec_len = len;
        if (vec_len > NPY_MAX_UINT32) {
            vec_len = NPY_MAX_UINT32;
        }

        // lane offsets 0 .. wstep-1, split across the four vectors of a group
        npyv_lanetype_u32 offsets[npyv_nlanes_u32*4];
        for (int k = 0; k < wstep; ++k) {
            offsets[k] = k;
        }
        const npyv_u32 off_a = npyv_load_u32(offsets);
        const npyv_u32 off_b = npyv_load_u32(offsets + vstep);
        const npyv_u32 off_c = npyv_load_u32(offsets + vstep*2);
        const npyv_u32 off_d = npyv_load_u32(offsets + vstep*3);

        // per-lane running maximum and the position it was found at
        npyv_u32 vbest_pos = npyv_zero_u32();
        npyv_u32 vbest = npyv_setall_u32(best);

        // four vectors per iteration
        const npy_intp wide_end = vec_len & -wstep;
        while (i < wide_end) {
            npyv_u32 base = npyv_setall_u32((npyv_lanetype_u32)i);
            npyv_u32 a = npyv_load_u32(ip + i);
            npyv_u32 b = npyv_load_u32(ip + i + vstep);
            npyv_u32 c = npyv_load_u32(ip + i + vstep*2);
            npyv_u32 d = npyv_load_u32(ip + i + vstep*3);

            // strict comparisons keep the earlier element on ties
            npyv_b32 take_b = npyv_cmpgt_u32(b, a);
            npyv_b32 take_d = npyv_cmpgt_u32(d, c);
            npyv_u32 top_ab = npyv_select_u32(take_b, b, a);
            npyv_u32 top_cd = npyv_select_u32(take_d, d, c);
            npyv_u32 off_ab = npyv_select_u32(take_b, off_b, off_a);
            npyv_u32 off_cd = npyv_select_u32(take_d, off_d, off_c);

            npyv_b32 take_cd = npyv_cmpgt_u32(top_cd, top_ab);
            npyv_u32 top = npyv_select_u32(take_cd, top_cd, top_ab);
            npyv_u32 off = npyv_select_u32(take_cd, off_cd, off_ab);

            npyv_b32 improved = npyv_cmpgt_u32(top, vbest);
            vbest = npyv_select_u32(improved, top, vbest);
            vbest_pos = npyv_select_u32(improved, npyv_add_u32(base, off), vbest_pos);
            i += wstep;
        }

        // remaining whole vectors, one per iteration
        const npy_intp single_end = vec_len & -vstep;
        while (i < single_end) {
            npyv_u32 base = npyv_setall_u32((npyv_lanetype_u32)i);
            npyv_u32 a = npyv_load_u32(ip + i);
            npyv_b32 improved = npyv_cmpgt_u32(a, vbest);
            vbest = npyv_select_u32(improved, a, vbest);
            vbest_pos = npyv_select_u32(improved, npyv_add_u32(base, off_a), vbest_pos);
            i += vstep;
        }

        // spill the accumulators and fold the lanes into the scalar result
        npyv_lanetype_u32 lane_val[npyv_nlanes_u32];
        npyv_lanetype_u32 lane_pos[npyv_nlanes_u32];
        npyv_store_u32(lane_pos, vbest_pos);
        npyv_store_u32(lane_val, vbest);

        best = lane_val[0];
        best_pos = lane_pos[0];
        for (int k = 1; k < vstep; ++k) {
            if (lane_val[k] > best) {
                best = lane_val[k];
                best_pos = (npy_intp)lane_pos[k];
            }
        }
        // among lanes tied with the result, keep the lowest position
        for (int k = 0; k < vstep; ++k) {
            if (lane_val[k] == best && (npy_intp)lane_pos[k] < best_pos) {
                best_pos = lane_pos[k];
            }
        }
    }

    // scalar loop: short inputs, plus whatever the vector path left over
    while (i < len) {
        npyv_lanetype_u32 cur = ip[i];
        if (cur > best) {
            best = cur;
            best_pos = i;
        }
        ++i;
    }
    return best_pos;
}

#line 136
/*
 * Index of the first occurrence of the smallest element of ip[0..len).
 * Note that ip[0] is read before len is looked at, so callers presumably
 * guarantee len >= 1.
 */
static inline npy_intp
simd_argmin_u32(npyv_lanetype_u32 *ip, npy_intp len)
{
    npyv_lanetype_u32 best = *ip;
    npy_intp best_idx = 0, pos = 0;
    const int lanes = npyv_nlanes_u32;
    const int block = lanes*4;

    // arrays shorter than one unrolled block are left to the scalar loop
    if (len >= block) {
        // lane indices are 32 bits wide: cap the vectorized range so that
        // position + lane offset can never wrap around
        npy_intp vlen = len;
        if (vlen > NPY_MAX_UINT32) {
            vlen = NPY_MAX_UINT32;
        }
        // element offsets 0 .. block-1, split into one vector per load
        npyv_lanetype_u32 offsets[npyv_nlanes_u32*4];
        for (int k = 0; k < block; ++k) {
            offsets[k] = k;
        }
        const npyv_u32 off_0 = npyv_load_u32(offsets);
        const npyv_u32 off_1 = npyv_load_u32(offsets + lanes);
        const npyv_u32 off_2 = npyv_load_u32(offsets + lanes*2);
        const npyv_u32 off_3 = npyv_load_u32(offsets + lanes*3);
        // per-lane running minimum and the index it was first seen at
        npyv_u32 v_best = npyv_setall_u32(best);
        npyv_u32 v_best_idx = npyv_zero_u32();

        // main loop, four vectors per iteration
        const npy_intp end_block = vlen & -block;
        while (pos < end_block) {
            const npyv_u32 base = npyv_setall_u32((npyv_lanetype_u32)pos);
            const npyv_u32 v0 = npyv_load_u32(ip + pos);
            const npyv_u32 v1 = npyv_load_u32(ip + pos + lanes);
            const npyv_u32 v2 = npyv_load_u32(ip + pos + lanes*2);
            const npyv_u32 v3 = npyv_load_u32(ip + pos + lanes*3);

            // strict comparisons: on a tie the operand with the lower
            // index is the one that is kept
            const npyv_b32 pick_1 = npyv_cmplt_u32(v1, v0);
            const npyv_u32 val_01 = npyv_select_u32(pick_1, v1, v0);
            const npyv_u32 off_01 = npyv_select_u32(pick_1, off_1, off_0);

            const npyv_b32 pick_3 = npyv_cmplt_u32(v3, v2);
            const npyv_u32 val_23 = npyv_select_u32(pick_3, v3, v2);
            const npyv_u32 off_23 = npyv_select_u32(pick_3, off_3, off_2);

            const npyv_b32 pick_23 = npyv_cmplt_u32(val_23, val_01);
            const npyv_u32 val = npyv_select_u32(pick_23, val_23, val_01);
            const npyv_u32 off = npyv_select_u32(pick_23, off_23, off_01);

            const npyv_b32 better = npyv_cmplt_u32(val, v_best);
            v_best = npyv_select_u32(better, val, v_best);
            v_best_idx = npyv_select_u32(better, npyv_add_u32(base, off), v_best_idx);

            pos += block;
        }
        // remaining whole vectors, one at a time
        const npy_intp end_vec = vlen & -lanes;
        while (pos < end_vec) {
            const npyv_u32 base = npyv_setall_u32((npyv_lanetype_u32)pos);
            const npyv_u32 v0 = npyv_load_u32(ip + pos);
            const npyv_b32 better = npyv_cmplt_u32(v0, v_best);
            v_best = npyv_select_u32(better, v0, v_best);
            v_best_idx = npyv_select_u32(better, npyv_add_u32(base, off_0), v_best_idx);

            pos += lanes;
        }

        // horizontal reduction; equal values resolve to the lowest index
        npyv_lanetype_u32 lane_val[npyv_nlanes_u32];
        npyv_lanetype_u32 lane_idx[npyv_nlanes_u32];
        npyv_store_u32(lane_val, v_best);
        npyv_store_u32(lane_idx, v_best_idx);

        best = lane_val[0];
        best_idx = lane_idx[0];
        for (int k = 1; k < lanes; ++k) {
            const npy_intp k_idx = (npy_intp)lane_idx[k];
            if (lane_val[k] < best) {
                best = lane_val[k];
                best_idx = k_idx;
            }
            else if (lane_val[k] == best && k_idx < best_idx) {
                best_idx = k_idx;
            }
        }
    }

    // scalar pass over whatever the vector path did not consume
    for (; pos < len; ++pos) {
        const npyv_lanetype_u32 cur = ip[pos];
        if (cur < best) {
            best = cur;
            best_idx = pos;
        }
    }
    return best_idx;
}

#endif // chk_simd

#line 129
#if NPY_SIMD
#line 136
/*
 * Index of the first occurrence of the largest element of ip[0..len).
 * Note that ip[0] is read before len is looked at, so callers presumably
 * guarantee len >= 1.
 */
static inline npy_intp
simd_argmax_s32(npyv_lanetype_s32 *ip, npy_intp len)
{
    npyv_lanetype_s32 best = *ip;
    npy_intp best_idx = 0, pos = 0;
    const int lanes = npyv_nlanes_s32;
    const int block = lanes*4;

    // arrays shorter than one unrolled block are left to the scalar loop
    if (len >= block) {
        // lane indices are 32 bits wide: cap the vectorized range so that
        // position + lane offset can never wrap around
        npy_intp vlen = len;
        if (vlen > NPY_MAX_UINT32) {
            vlen = NPY_MAX_UINT32;
        }
        // element offsets 0 .. block-1, split into one vector per load
        npyv_lanetype_u32 offsets[npyv_nlanes_s32*4];
        for (int k = 0; k < block; ++k) {
            offsets[k] = k;
        }
        const npyv_u32 off_0 = npyv_load_u32(offsets);
        const npyv_u32 off_1 = npyv_load_u32(offsets + lanes);
        const npyv_u32 off_2 = npyv_load_u32(offsets + lanes*2);
        const npyv_u32 off_3 = npyv_load_u32(offsets + lanes*3);
        // per-lane running maximum and the index it was first seen at
        npyv_s32 v_best = npyv_setall_s32(best);
        npyv_u32 v_best_idx = npyv_zero_u32();

        // main loop, four vectors per iteration
        const npy_intp end_block = vlen & -block;
        while (pos < end_block) {
            const npyv_u32 base = npyv_setall_u32((npyv_lanetype_u32)pos);
            const npyv_s32 v0 = npyv_load_s32(ip + pos);
            const npyv_s32 v1 = npyv_load_s32(ip + pos + lanes);
            const npyv_s32 v2 = npyv_load_s32(ip + pos + lanes*2);
            const npyv_s32 v3 = npyv_load_s32(ip + pos + lanes*3);

            // strict comparisons: on a tie the operand with the lower
            // index is the one that is kept
            const npyv_b32 pick_1 = npyv_cmpgt_s32(v1, v0);
            const npyv_s32 val_01 = npyv_select_s32(pick_1, v1, v0);
            const npyv_u32 off_01 = npyv_select_u32(pick_1, off_1, off_0);

            const npyv_b32 pick_3 = npyv_cmpgt_s32(v3, v2);
            const npyv_s32 val_23 = npyv_select_s32(pick_3, v3, v2);
            const npyv_u32 off_23 = npyv_select_u32(pick_3, off_3, off_2);

            const npyv_b32 pick_23 = npyv_cmpgt_s32(val_23, val_01);
            const npyv_s32 val = npyv_select_s32(pick_23, val_23, val_01);
            const npyv_u32 off = npyv_select_u32(pick_23, off_23, off_01);

            const npyv_b32 better = npyv_cmpgt_s32(val, v_best);
            v_best = npyv_select_s32(better, val, v_best);
            v_best_idx = npyv_select_u32(better, npyv_add_u32(base, off), v_best_idx);

            pos += block;
        }
        // remaining whole vectors, one at a time
        const npy_intp end_vec = vlen & -lanes;
        while (pos < end_vec) {
            const npyv_u32 base = npyv_setall_u32((npyv_lanetype_u32)pos);
            const npyv_s32 v0 = npyv_load_s32(ip + pos);
            const npyv_b32 better = npyv_cmpgt_s32(v0, v_best);
            v_best = npyv_select_s32(better, v0, v_best);
            v_best_idx = npyv_select_u32(better, npyv_add_u32(base, off_0), v_best_idx);

            pos += lanes;
        }

        // horizontal reduction; equal values resolve to the lowest index
        npyv_lanetype_s32 lane_val[npyv_nlanes_s32];
        npyv_lanetype_u32 lane_idx[npyv_nlanes_s32];
        npyv_store_s32(lane_val, v_best);
        npyv_store_u32(lane_idx, v_best_idx);

        best = lane_val[0];
        best_idx = lane_idx[0];
        for (int k = 1; k < lanes; ++k) {
            const npy_intp k_idx = (npy_intp)lane_idx[k];
            if (lane_val[k] > best) {
                best = lane_val[k];
                best_idx = k_idx;
            }
            else if (lane_val[k] == best && k_idx < best_idx) {
                best_idx = k_idx;
            }
        }
    }

    // scalar pass over whatever the vector path did not consume
    for (; pos < len; ++pos) {
        const npyv_lanetype_s32 cur = ip[pos];
        if (cur > best) {
            best = cur;
            best_idx = pos;
        }
    }
    return best_idx;
}

#line 136
/*
 * Index of the first occurrence of the smallest element of ip[0..len).
 * Note that ip[0] is read before len is looked at, so callers presumably
 * guarantee len >= 1.
 */
static inline npy_intp
simd_argmin_s32(npyv_lanetype_s32 *ip, npy_intp len)
{
    npyv_lanetype_s32 best = *ip;
    npy_intp best_idx = 0, pos = 0;
    const int lanes = npyv_nlanes_s32;
    const int block = lanes*4;

    // arrays shorter than one unrolled block are left to the scalar loop
    if (len >= block) {
        // lane indices are 32 bits wide: cap the vectorized range so that
        // position + lane offset can never wrap around
        npy_intp vlen = len;
        if (vlen > NPY_MAX_UINT32) {
            vlen = NPY_MAX_UINT32;
        }
        // element offsets 0 .. block-1, split into one vector per load
        npyv_lanetype_u32 offsets[npyv_nlanes_s32*4];
        for (int k = 0; k < block; ++k) {
            offsets[k] = k;
        }
        const npyv_u32 off_0 = npyv_load_u32(offsets);
        const npyv_u32 off_1 = npyv_load_u32(offsets + lanes);
        const npyv_u32 off_2 = npyv_load_u32(offsets + lanes*2);
        const npyv_u32 off_3 = npyv_load_u32(offsets + lanes*3);
        // per-lane running minimum and the index it was first seen at
        npyv_s32 v_best = npyv_setall_s32(best);
        npyv_u32 v_best_idx = npyv_zero_u32();

        // main loop, four vectors per iteration
        const npy_intp end_block = vlen & -block;
        while (pos < end_block) {
            const npyv_u32 base = npyv_setall_u32((npyv_lanetype_u32)pos);
            const npyv_s32 v0 = npyv_load_s32(ip + pos);
            const npyv_s32 v1 = npyv_load_s32(ip + pos + lanes);
            const npyv_s32 v2 = npyv_load_s32(ip + pos + lanes*2);
            const npyv_s32 v3 = npyv_load_s32(ip + pos + lanes*3);

            // strict comparisons: on a tie the operand with the lower
            // index is the one that is kept
            const npyv_b32 pick_1 = npyv_cmplt_s32(v1, v0);
            const npyv_s32 val_01 = npyv_select_s32(pick_1, v1, v0);
            const npyv_u32 off_01 = npyv_select_u32(pick_1, off_1, off_0);

            const npyv_b32 pick_3 = npyv_cmplt_s32(v3, v2);
            const npyv_s32 val_23 = npyv_select_s32(pick_3, v3, v2);
            const npyv_u32 off_23 = npyv_select_u32(pick_3, off_3, off_2);

            const npyv_b32 pick_23 = npyv_cmplt_s32(val_23, val_01);
            const npyv_s32 val = npyv_select_s32(pick_23, val_23, val_01);
            const npyv_u32 off = npyv_select_u32(pick_23, off_23, off_01);

            const npyv_b32 better = npyv_cmplt_s32(val, v_best);
            v_best = npyv_select_s32(better, val, v_best);
            v_best_idx = npyv_select_u32(better, npyv_add_u32(base, off), v_best_idx);

            pos += block;
        }
        // remaining whole vectors, one at a time
        const npy_intp end_vec = vlen & -lanes;
        while (pos < end_vec) {
            const npyv_u32 base = npyv_setall_u32((npyv_lanetype_u32)pos);
            const npyv_s32 v0 = npyv_load_s32(ip + pos);
            const npyv_b32 better = npyv_cmplt_s32(v0, v_best);
            v_best = npyv_select_s32(better, v0, v_best);
            v_best_idx = npyv_select_u32(better, npyv_add_u32(base, off_0), v_best_idx);

            pos += lanes;
        }

        // horizontal reduction; equal values resolve to the lowest index
        npyv_lanetype_s32 lane_val[npyv_nlanes_s32];
        npyv_lanetype_u32 lane_idx[npyv_nlanes_s32];
        npyv_store_s32(lane_val, v_best);
        npyv_store_u32(lane_idx, v_best_idx);

        best = lane_val[0];
        best_idx = lane_idx[0];
        for (int k = 1; k < lanes; ++k) {
            const npy_intp k_idx = (npy_intp)lane_idx[k];
            if (lane_val[k] < best) {
                best = lane_val[k];
                best_idx = k_idx;
            }
            else if (lane_val[k] == best && k_idx < best_idx) {
                best_idx = k_idx;
            }
        }
    }

    // scalar pass over whatever the vector path did not consume
    for (; pos < len; ++pos) {
        const npyv_lanetype_s32 cur = ip[pos];
        if (cur < best) {
            best = cur;
            best_idx = pos;
        }
    }
    return best_idx;
}

#endif // chk_simd

#line 129
#if NPY_SIMD
#line 136
/*
 * Index of the first occurrence of the largest element of ip[0..len).
 * Note that ip[0] is read before len is looked at, so callers presumably
 * guarantee len >= 1.
 */
static inline npy_intp
simd_argmax_u64(npyv_lanetype_u64 *ip, npy_intp len)
{
    npyv_lanetype_u64 best = *ip;
    npy_intp best_idx = 0, pos = 0;
    const int lanes = npyv_nlanes_u64;
    const int block = lanes*4;

    // arrays shorter than one unrolled block are left to the scalar loop
    if (len >= block) {
        // index lanes are 64 bits wide here, so no cap is applied to the
        // vectorized range (unlike the 32-bit kernels)

        // element offsets 0 .. block-1, split into one vector per load
        npyv_lanetype_u64 offsets[npyv_nlanes_u64*4];
        for (int k = 0; k < block; ++k) {
            offsets[k] = k;
        }
        const npyv_u64 off_0 = npyv_load_u64(offsets);
        const npyv_u64 off_1 = npyv_load_u64(offsets + lanes);
        const npyv_u64 off_2 = npyv_load_u64(offsets + lanes*2);
        const npyv_u64 off_3 = npyv_load_u64(offsets + lanes*3);
        // per-lane running maximum and the index it was first seen at
        npyv_u64 v_best = npyv_setall_u64(best);
        npyv_u64 v_best_idx = npyv_zero_u64();

        // main loop, four vectors per iteration
        const npy_intp end_block = len & -block;
        while (pos < end_block) {
            const npyv_u64 base = npyv_setall_u64((npyv_lanetype_u64)pos);
            const npyv_u64 v0 = npyv_load_u64(ip + pos);
            const npyv_u64 v1 = npyv_load_u64(ip + pos + lanes);
            const npyv_u64 v2 = npyv_load_u64(ip + pos + lanes*2);
            const npyv_u64 v3 = npyv_load_u64(ip + pos + lanes*3);

            // strict comparisons: on a tie the operand with the lower
            // index is the one that is kept
            const npyv_b64 pick_1 = npyv_cmpgt_u64(v1, v0);
            const npyv_u64 val_01 = npyv_select_u64(pick_1, v1, v0);
            const npyv_u64 off_01 = npyv_select_u64(pick_1, off_1, off_0);

            const npyv_b64 pick_3 = npyv_cmpgt_u64(v3, v2);
            const npyv_u64 val_23 = npyv_select_u64(pick_3, v3, v2);
            const npyv_u64 off_23 = npyv_select_u64(pick_3, off_3, off_2);

            const npyv_b64 pick_23 = npyv_cmpgt_u64(val_23, val_01);
            const npyv_u64 val = npyv_select_u64(pick_23, val_23, val_01);
            const npyv_u64 off = npyv_select_u64(pick_23, off_23, off_01);

            const npyv_b64 better = npyv_cmpgt_u64(val, v_best);
            v_best = npyv_select_u64(better, val, v_best);
            v_best_idx = npyv_select_u64(better, npyv_add_u64(base, off), v_best_idx);

            pos += block;
        }
        // remaining whole vectors, one at a time
        const npy_intp end_vec = len & -lanes;
        while (pos < end_vec) {
            const npyv_u64 base = npyv_setall_u64((npyv_lanetype_u64)pos);
            const npyv_u64 v0 = npyv_load_u64(ip + pos);
            const npyv_b64 better = npyv_cmpgt_u64(v0, v_best);
            v_best = npyv_select_u64(better, v0, v_best);
            v_best_idx = npyv_select_u64(better, npyv_add_u64(base, off_0), v_best_idx);

            pos += lanes;
        }

        // horizontal reduction; equal values resolve to the lowest index
        npyv_lanetype_u64 lane_val[npyv_nlanes_u64];
        npyv_lanetype_u64 lane_idx[npyv_nlanes_u64];
        npyv_store_u64(lane_val, v_best);
        npyv_store_u64(lane_idx, v_best_idx);

        best = lane_val[0];
        best_idx = lane_idx[0];
        for (int k = 1; k < lanes; ++k) {
            const npy_intp k_idx = (npy_intp)lane_idx[k];
            if (lane_val[k] > best) {
                best = lane_val[k];
                best_idx = k_idx;
            }
            else if (lane_val[k] == best && k_idx < best_idx) {
                best_idx = k_idx;
            }
        }
    }

    // scalar pass over whatever the vector path did not consume
    for (; pos < len; ++pos) {
        const npyv_lanetype_u64 cur = ip[pos];
        if (cur > best) {
            best = cur;
            best_idx = pos;
        }
    }
    return best_idx;
}

#line 136
/*
 * Index of the first occurrence of the smallest element of ip[0..len).
 * Note that ip[0] is read before len is looked at, so callers presumably
 * guarantee len >= 1.
 */
static inline npy_intp
simd_argmin_u64(npyv_lanetype_u64 *ip, npy_intp len)
{
    npyv_lanetype_u64 best = *ip;
    npy_intp best_idx = 0, pos = 0;
    const int lanes = npyv_nlanes_u64;
    const int block = lanes*4;

    // arrays shorter than one unrolled block are left to the scalar loop
    if (len >= block) {
        // index lanes are 64 bits wide here, so no cap is applied to the
        // vectorized range (unlike the 32-bit kernels)

        // element offsets 0 .. block-1, split into one vector per load
        npyv_lanetype_u64 offsets[npyv_nlanes_u64*4];
        for (int k = 0; k < block; ++k) {
            offsets[k] = k;
        }
        const npyv_u64 off_0 = npyv_load_u64(offsets);
        const npyv_u64 off_1 = npyv_load_u64(offsets + lanes);
        const npyv_u64 off_2 = npyv_load_u64(offsets + lanes*2);
        const npyv_u64 off_3 = npyv_load_u64(offsets + lanes*3);
        // per-lane running minimum and the index it was first seen at
        npyv_u64 v_best = npyv_setall_u64(best);
        npyv_u64 v_best_idx = npyv_zero_u64();

        // main loop, four vectors per iteration
        const npy_intp end_block = len & -block;
        while (pos < end_block) {
            const npyv_u64 base = npyv_setall_u64((npyv_lanetype_u64)pos);
            const npyv_u64 v0 = npyv_load_u64(ip + pos);
            const npyv_u64 v1 = npyv_load_u64(ip + pos + lanes);
            const npyv_u64 v2 = npyv_load_u64(ip + pos + lanes*2);
            const npyv_u64 v3 = npyv_load_u64(ip + pos + lanes*3);

            // strict comparisons: on a tie the operand with the lower
            // index is the one that is kept
            const npyv_b64 pick_1 = npyv_cmplt_u64(v1, v0);
            const npyv_u64 val_01 = npyv_select_u64(pick_1, v1, v0);
            const npyv_u64 off_01 = npyv_select_u64(pick_1, off_1, off_0);

            const npyv_b64 pick_3 = npyv_cmplt_u64(v3, v2);
            const npyv_u64 val_23 = npyv_select_u64(pick_3, v3, v2);
            const npyv_u64 off_23 = npyv_select_u64(pick_3, off_3, off_2);

            const npyv_b64 pick_23 = npyv_cmplt_u64(val_23, val_01);
            const npyv_u64 val = npyv_select_u64(pick_23, val_23, val_01);
            const npyv_u64 off = npyv_select_u64(pick_23, off_23, off_01);

            const npyv_b64 better = npyv_cmplt_u64(val, v_best);
            v_best = npyv_select_u64(better, val, v_best);
            v_best_idx = npyv_select_u64(better, npyv_add_u64(base, off), v_best_idx);

            pos += block;
        }
        // remaining whole vectors, one at a time
        const npy_intp end_vec = len & -lanes;
        while (pos < end_vec) {
            const npyv_u64 base = npyv_setall_u64((npyv_lanetype_u64)pos);
            const npyv_u64 v0 = npyv_load_u64(ip + pos);
            const npyv_b64 better = npyv_cmplt_u64(v0, v_best);
            v_best = npyv_select_u64(better, v0, v_best);
            v_best_idx = npyv_select_u64(better, npyv_add_u64(base, off_0), v_best_idx);

            pos += lanes;
        }

        // horizontal reduction; equal values resolve to the lowest index
        npyv_lanetype_u64 lane_val[npyv_nlanes_u64];
        npyv_lanetype_u64 lane_idx[npyv_nlanes_u64];
        npyv_store_u64(lane_val, v_best);
        npyv_store_u64(lane_idx, v_best_idx);

        best = lane_val[0];
        best_idx = lane_idx[0];
        for (int k = 1; k < lanes; ++k) {
            const npy_intp k_idx = (npy_intp)lane_idx[k];
            if (lane_val[k] < best) {
                best = lane_val[k];
                best_idx = k_idx;
            }
            else if (lane_val[k] == best && k_idx < best_idx) {
                best_idx = k_idx;
            }
        }
    }

    // scalar pass over whatever the vector path did not consume
    for (; pos < len; ++pos) {
        const npyv_lanetype_u64 cur = ip[pos];
        if (cur < best) {
            best = cur;
            best_idx = pos;
        }
    }
    return best_idx;
}

#endif // chk_simd

#line 129
#if NPY_SIMD
#line 136
/*
 * Index of the first occurrence of the largest element of ip[0..len).
 * Note that ip[0] is read before len is looked at, so callers presumably
 * guarantee len >= 1.
 */
static inline npy_intp
simd_argmax_s64(npyv_lanetype_s64 *ip, npy_intp len)
{
    npyv_lanetype_s64 best = *ip;
    npy_intp best_idx = 0, pos = 0;
    const int lanes = npyv_nlanes_s64;
    const int block = lanes*4;

    // arrays shorter than one unrolled block are left to the scalar loop
    if (len >= block) {
        // index lanes are 64 bits wide here, so no cap is applied to the
        // vectorized range (unlike the 32-bit kernels)

        // element offsets 0 .. block-1, split into one vector per load
        npyv_lanetype_u64 offsets[npyv_nlanes_s64*4];
        for (int k = 0; k < block; ++k) {
            offsets[k] = k;
        }
        const npyv_u64 off_0 = npyv_load_u64(offsets);
        const npyv_u64 off_1 = npyv_load_u64(offsets + lanes);
        const npyv_u64 off_2 = npyv_load_u64(offsets + lanes*2);
        const npyv_u64 off_3 = npyv_load_u64(offsets + lanes*3);
        // per-lane running maximum and the index it was first seen at
        npyv_s64 v_best = npyv_setall_s64(best);
        npyv_u64 v_best_idx = npyv_zero_u64();

        // main loop, four vectors per iteration
        const npy_intp end_block = len & -block;
        while (pos < end_block) {
            const npyv_u64 base = npyv_setall_u64((npyv_lanetype_u64)pos);
            const npyv_s64 v0 = npyv_load_s64(ip + pos);
            const npyv_s64 v1 = npyv_load_s64(ip + pos + lanes);
            const npyv_s64 v2 = npyv_load_s64(ip + pos + lanes*2);
            const npyv_s64 v3 = npyv_load_s64(ip + pos + lanes*3);

            // strict comparisons: on a tie the operand with the lower
            // index is the one that is kept
            const npyv_b64 pick_1 = npyv_cmpgt_s64(v1, v0);
            const npyv_s64 val_01 = npyv_select_s64(pick_1, v1, v0);
            const npyv_u64 off_01 = npyv_select_u64(pick_1, off_1, off_0);

            const npyv_b64 pick_3 = npyv_cmpgt_s64(v3, v2);
            const npyv_s64 val_23 = npyv_select_s64(pick_3, v3, v2);
            const npyv_u64 off_23 = npyv_select_u64(pick_3, off_3, off_2);

            const npyv_b64 pick_23 = npyv_cmpgt_s64(val_23, val_01);
            const npyv_s64 val = npyv_select_s64(pick_23, val_23, val_01);
            const npyv_u64 off = npyv_select_u64(pick_23, off_23, off_01);

            const npyv_b64 better = npyv_cmpgt_s64(val, v_best);
            v_best = npyv_select_s64(better, val, v_best);
            v_best_idx = npyv_select_u64(better, npyv_add_u64(base, off), v_best_idx);

            pos += block;
        }
        // remaining whole vectors, one at a time
        const npy_intp end_vec = len & -lanes;
        while (pos < end_vec) {
            const npyv_u64 base = npyv_setall_u64((npyv_lanetype_u64)pos);
            const npyv_s64 v0 = npyv_load_s64(ip + pos);
            const npyv_b64 better = npyv_cmpgt_s64(v0, v_best);
            v_best = npyv_select_s64(better, v0, v_best);
            v_best_idx = npyv_select_u64(better, npyv_add_u64(base, off_0), v_best_idx);

            pos += lanes;
        }

        // horizontal reduction; equal values resolve to the lowest index
        npyv_lanetype_s64 lane_val[npyv_nlanes_s64];
        npyv_lanetype_u64 lane_idx[npyv_nlanes_s64];
        npyv_store_s64(lane_val, v_best);
        npyv_store_u64(lane_idx, v_best_idx);

        best = lane_val[0];
        best_idx = lane_idx[0];
        for (int k = 1; k < lanes; ++k) {
            const npy_intp k_idx = (npy_intp)lane_idx[k];
            if (lane_val[k] > best) {
                best = lane_val[k];
                best_idx = k_idx;
            }
            else if (lane_val[k] == best && k_idx < best_idx) {
                best_idx = k_idx;
            }
        }
    }

    // scalar pass over whatever the vector path did not consume
    for (; pos < len; ++pos) {
        const npyv_lanetype_s64 cur = ip[pos];
        if (cur > best) {
            best = cur;
            best_idx = pos;
        }
    }
    return best_idx;
}

#line 136
/*
 * Return the index of the smallest of the `len` signed 64-bit values at
 * `ip`. When the minimum occurs more than once, the lowest index wins.
 *
 * Inputs shorter than four vectors go straight to the scalar loop. Longer
 * inputs are scanned four vectors per iteration, then one vector per
 * iteration, tracking a per-lane minimum together with the index it came
 * from; the lanes are then reduced to one value/index pair and the scalar
 * loop finishes whatever elements are left.
 *
 * `ip[0]` is read before `len` is looked at, so `len` is assumed to be at
 * least 1.
 */
static inline npy_intp
simd_argmin_s64(npyv_lanetype_s64 *ip, npy_intp len)
{
    // running best value and its index, seeded with the first element
    npyv_lanetype_s64 s_acc = *ip;
    npy_intp ret_idx = 0, i = 0;
    // vstep: lanes per vector; wstep: elements consumed per unrolled iteration
    const int vstep = npyv_nlanes_s64;
    const int wstep = vstep*4;
    // loop by a scalar will perform better for small arrays
    if (len < wstep) {
        goto scalar_loop;
    }
    npy_intp len0 = len;
    // guard against wraparound vector addition for 32-bit indices
    // in case of the array length is larger than 16gb
    // (compiled out here: this variant keeps its indices in 64-bit lanes)
#if 0
    if (len0 > NPY_MAX_UINT32) {
        len0 = NPY_MAX_UINT32;
    }
#endif
    // create index for vector indices:
    // d_vindices holds 0..wstep-1 and vindices_k is its k-th vector-sized slice
    npyv_lanetype_u64 d_vindices[npyv_nlanes_s64*4];
    for (int vi = 0; vi < wstep; ++vi) {
        d_vindices[vi] = vi;
    }
    const npyv_u64 vindices_0 = npyv_load_u64(d_vindices);
    const npyv_u64 vindices_1 = npyv_load_u64(d_vindices + vstep);
    const npyv_u64 vindices_2 = npyv_load_u64(d_vindices + vstep*2);
    const npyv_u64 vindices_3 = npyv_load_u64(d_vindices + vstep*3);
    // initialize vector accumulator for lowest values and its indexes
    npyv_u64 acc_indices = npyv_zero_u64();
    npyv_s64 acc = npyv_setall_s64(s_acc);
    // unrolled loop, four vectors (a, b, c, d) per iteration;
    // `len0 & -wstep` rounds len0 down to a multiple of wstep
    // (relies on wstep being a power of two)
    for (npy_intp n = len0 & -wstep; i < n; i += wstep) {
        // base index of this block, broadcast to every lane
        npyv_u64 vi = npyv_setall_u64((npyv_lanetype_u64)i);
        npyv_s64 a = npyv_load_s64(ip + i);
        npyv_s64 b = npyv_load_s64(ip + i + vstep);
        npyv_s64 c = npyv_load_s64(ip + i + vstep*2);
        npyv_s64 d = npyv_load_s64(ip + i + vstep*3);

        // reverse to put lowest index first in case of matched values:
        // the strict `<` keeps the earlier vector whenever two lanes tie
        npyv_b64 m_ba = npyv_cmplt_s64(b, a);
        npyv_b64 m_dc = npyv_cmplt_s64(d, c);
        npyv_s64  x_ba = npyv_select_s64(m_ba, b, a);
        npyv_s64  x_dc = npyv_select_s64(m_dc, d, c);
        npyv_b64 m_dcba = npyv_cmplt_s64(x_dc, x_ba);
        npyv_s64  x_dcba = npyv_select_s64(m_dcba, x_dc, x_ba);

        // per-lane offset, relative to the block start, of the value picked above
        npyv_u64 idx_ba = npyv_select_u64(m_ba, vindices_1, vindices_0);
        npyv_u64 idx_dc = npyv_select_u64(m_dc, vindices_3, vindices_2);
        npyv_u64 idx_dcba = npyv_select_u64(m_dcba, idx_dc, idx_ba);
        // replace the accumulated value/index only in lanes that strictly improve
        npyv_b64 m_acc = npyv_cmplt_s64(x_dcba, acc);
        acc = npyv_select_s64(m_acc, x_dcba, acc);
        acc_indices = npyv_select_u64(m_acc, npyv_add_u64(vi, idx_dcba), acc_indices);

    // NaN early exit of the floating-point variants; compiled out for this
    // integer variant
    #if 0
        npyv_b64 nnan_a = npyv_notnan_s64(a);
        npyv_b64 nnan_b = npyv_notnan_s64(b);
        npyv_b64 nnan_c = npyv_notnan_s64(c);
        npyv_b64 nnan_d = npyv_notnan_s64(d);
        npyv_b64 nnan_ab = npyv_and_b64(nnan_a, nnan_b);
        npyv_b64 nnan_cd = npyv_and_b64(nnan_c, nnan_d);
        npy_uint64 nnan = npyv_tobits_b64(npyv_and_b64(nnan_ab, nnan_cd));
        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
            npy_uint64 nnan_4[4];
            nnan_4[0] = npyv_tobits_b64(nnan_a);
            nnan_4[1] = npyv_tobits_b64(nnan_b);
            nnan_4[2] = npyv_tobits_b64(nnan_c);
            nnan_4[3] = npyv_tobits_b64(nnan_d);
            for (int ni = 0; ni < 4; ++ni) {
                for (int vi = 0; vi < vstep; ++vi) {
                    if (!((nnan_4[ni] >> vi) & 1)) {
                        return i + ni*vstep + vi;
                    }
                }
            }
        }
    #endif
    }
    // leftover whole vectors, one per iteration
    for (npy_intp n = len0 & -vstep; i < n; i += vstep) {
        npyv_u64 vi = npyv_setall_u64((npyv_lanetype_u64)i);
        npyv_s64 a = npyv_load_s64(ip + i);
        npyv_b64 m_acc = npyv_cmplt_s64(a, acc);
        acc = npyv_select_s64(m_acc, a, acc);
        acc_indices = npyv_select_u64(m_acc, npyv_add_u64(vi, vindices_0), acc_indices);
    // NaN early exit; compiled out for this integer variant
    #if 0
        npyv_b64 nnan_a = npyv_notnan_s64(a);
        npy_uint64 nnan = npyv_tobits_b64(nnan_a);
        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
            for (int vi = 0; vi < vstep; ++vi) {
                if (!((nnan >> vi) & 1)) {
                    return i + vi;
                }
            }
        }
    #endif
    }

    // reduce: spill both accumulators to memory and pick the best lane
    npyv_lanetype_s64 dacc[npyv_nlanes_s64];
    npyv_lanetype_u64 dacc_i[npyv_nlanes_s64];
    npyv_store_u64(dacc_i, acc_indices);
    npyv_store_s64(dacc, acc);

    s_acc = dacc[0];
    ret_idx = dacc_i[0];
    for (int vi = 1; vi < vstep; ++vi) {
        if (dacc[vi] < s_acc) {
            s_acc = dacc[vi];
            ret_idx = (npy_intp)dacc_i[vi];
        }
    }
    // get the lowest index in case of matched values
    for (int vi = 0; vi < vstep; ++vi) {
        if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) {
            ret_idx = dacc_i[vi];
        }
    }
    // scalar path: the whole input when len < wstep, otherwise the elements
    // left over after the vector loops
scalar_loop:
    for (; i < len; ++i) {
        npyv_lanetype_s64 a = ip[i];
    #if 0
        if (!(a >= s_acc)) {  // negated, for correct nan handling
    #else
        if (a < s_acc) {
    #endif
            s_acc = a;
            ret_idx = i;
        #if 0
            if (npy_isnan(s_acc)) {
                // nan encountered, it's treated as the minimum; stop searching
                return ret_idx;
            }
        #endif
        }
    }
    return ret_idx;
}

#endif // chk_simd

#line 129
#if NPY_SIMD_F32
#line 136
/*
 * Return the index of the largest of the `len` single-precision values at
 * `ip`. When the maximum occurs more than once, the lowest index wins.
 * A NaN ends the search: the index of the first NaN found is returned.
 *
 * Inputs shorter than four vectors go straight to the scalar loop. Longer
 * inputs are scanned four vectors per iteration, then one vector per
 * iteration, tracking a per-lane maximum together with the index it came
 * from; the lanes are then reduced to one value/index pair and the scalar
 * loop finishes whatever elements are left.
 *
 * `ip[0]` is read before `len` is looked at, so `len` is assumed to be at
 * least 1.
 */
static inline npy_intp
simd_argmax_f32(npyv_lanetype_f32 *ip, npy_intp len)
{
    // running best value and its index, seeded with the first element
    npyv_lanetype_f32 s_acc = *ip;
    npy_intp ret_idx = 0, i = 0;
    // vstep: lanes per vector; wstep: elements consumed per unrolled iteration
    const int vstep = npyv_nlanes_f32;
    const int wstep = vstep*4;
    // loop by a scalar will perform better for small arrays
    if (len < wstep) {
        goto scalar_loop;
    }
    npy_intp len0 = len;
    // guard against wraparound vector addition for 32-bit indices
    // in case of the array length is larger than 16gb
    // (only the vector loops are bounded by len0; elements past the clamp are
    // still visited by the scalar loop, which runs up to len)
#if 1
    if (len0 > NPY_MAX_UINT32) {
        len0 = NPY_MAX_UINT32;
    }
#endif
    // create index for vector indices:
    // d_vindices holds 0..wstep-1 and vindices_k is its k-th vector-sized slice
    npyv_lanetype_u32 d_vindices[npyv_nlanes_f32*4];
    for (int vi = 0; vi < wstep; ++vi) {
        d_vindices[vi] = vi;
    }
    const npyv_u32 vindices_0 = npyv_load_u32(d_vindices);
    const npyv_u32 vindices_1 = npyv_load_u32(d_vindices + vstep);
    const npyv_u32 vindices_2 = npyv_load_u32(d_vindices + vstep*2);
    const npyv_u32 vindices_3 = npyv_load_u32(d_vindices + vstep*3);
    // initialize vector accumulator for highest values and its indexes
    npyv_u32 acc_indices = npyv_zero_u32();
    npyv_f32 acc = npyv_setall_f32(s_acc);
    // unrolled loop, four vectors (a, b, c, d) per iteration;
    // `len0 & -wstep` rounds len0 down to a multiple of wstep
    // (relies on wstep being a power of two)
    for (npy_intp n = len0 & -wstep; i < n; i += wstep) {
        // base index of this block, broadcast to every lane
        npyv_u32 vi = npyv_setall_u32((npyv_lanetype_u32)i);
        npyv_f32 a = npyv_load_f32(ip + i);
        npyv_f32 b = npyv_load_f32(ip + i + vstep);
        npyv_f32 c = npyv_load_f32(ip + i + vstep*2);
        npyv_f32 d = npyv_load_f32(ip + i + vstep*3);

        // reverse to put lowest index first in case of matched values:
        // the strict `>` keeps the earlier vector whenever two lanes tie
        npyv_b32 m_ba = npyv_cmpgt_f32(b, a);
        npyv_b32 m_dc = npyv_cmpgt_f32(d, c);
        npyv_f32  x_ba = npyv_select_f32(m_ba, b, a);
        npyv_f32  x_dc = npyv_select_f32(m_dc, d, c);
        npyv_b32 m_dcba = npyv_cmpgt_f32(x_dc, x_ba);
        npyv_f32  x_dcba = npyv_select_f32(m_dcba, x_dc, x_ba);

        // per-lane offset, relative to the block start, of the value picked above
        npyv_u32 idx_ba = npyv_select_u32(m_ba, vindices_1, vindices_0);
        npyv_u32 idx_dc = npyv_select_u32(m_dc, vindices_3, vindices_2);
        npyv_u32 idx_dcba = npyv_select_u32(m_dcba, idx_dc, idx_ba);
        // replace the accumulated value/index only in lanes that strictly improve
        npyv_b32 m_acc = npyv_cmpgt_f32(x_dcba, acc);
        acc = npyv_select_f32(m_acc, x_dcba, acc);
        acc_indices = npyv_select_u32(m_acc, npyv_add_u32(vi, idx_dcba), acc_indices);

    // NaN handling: if any of the four vectors holds a NaN, return the index
    // of the first NaN in this block right away.
    // (npyv_notnan_* / npyv_tobits_* are defined elsewhere; presumably bit k
    // of the result is set when lane k is not NaN)
    #if 1
        npyv_b32 nnan_a = npyv_notnan_f32(a);
        npyv_b32 nnan_b = npyv_notnan_f32(b);
        npyv_b32 nnan_c = npyv_notnan_f32(c);
        npyv_b32 nnan_d = npyv_notnan_f32(d);
        npyv_b32 nnan_ab = npyv_and_b32(nnan_a, nnan_b);
        npyv_b32 nnan_cd = npyv_and_b32(nnan_c, nnan_d);
        npy_uint64 nnan = npyv_tobits_b32(npyv_and_b32(nnan_ab, nnan_cd));
        // the low vstep bits are all set only when no lane of a..d is NaN
        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
            npy_uint64 nnan_4[4];
            nnan_4[0] = npyv_tobits_b32(nnan_a);
            nnan_4[1] = npyv_tobits_b32(nnan_b);
            nnan_4[2] = npyv_tobits_b32(nnan_c);
            nnan_4[3] = npyv_tobits_b32(nnan_d);
            // walk the vectors and lanes in memory order to find the first NaN
            // NOTE(review): the lane counter `vi` below shadows the vector `vi`
            // declared at the top of this loop body
            for (int ni = 0; ni < 4; ++ni) {
                for (int vi = 0; vi < vstep; ++vi) {
                    if (!((nnan_4[ni] >> vi) & 1)) {
                        return i + ni*vstep + vi;
                    }
                }
            }
        }
    #endif
    }
    // leftover whole vectors, one per iteration
    for (npy_intp n = len0 & -vstep; i < n; i += vstep) {
        npyv_u32 vi = npyv_setall_u32((npyv_lanetype_u32)i);
        npyv_f32 a = npyv_load_f32(ip + i);
        npyv_b32 m_acc = npyv_cmpgt_f32(a, acc);
        acc = npyv_select_f32(m_acc, a, acc);
        acc_indices = npyv_select_u32(m_acc, npyv_add_u32(vi, vindices_0), acc_indices);
    // same NaN early exit as above, for a single vector
    #if 1
        npyv_b32 nnan_a = npyv_notnan_f32(a);
        npy_uint64 nnan = npyv_tobits_b32(nnan_a);
        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
            for (int vi = 0; vi < vstep; ++vi) {
                if (!((nnan >> vi) & 1)) {
                    return i + vi;
                }
            }
        }
    #endif
    }

    // reduce: spill both accumulators to memory and pick the best lane
    npyv_lanetype_f32 dacc[npyv_nlanes_f32];
    npyv_lanetype_u32 dacc_i[npyv_nlanes_f32];
    npyv_store_u32(dacc_i, acc_indices);
    npyv_store_f32(dacc, acc);

    s_acc = dacc[0];
    ret_idx = dacc_i[0];
    for (int vi = 1; vi < vstep; ++vi) {
        if (dacc[vi] > s_acc) {
            s_acc = dacc[vi];
            ret_idx = (npy_intp)dacc_i[vi];
        }
    }
    // get the lowest index in case of matched values
    for (int vi = 0; vi < vstep; ++vi) {
        if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) {
            ret_idx = dacc_i[vi];
        }
    }
    // scalar path: the whole input when len < wstep, otherwise the elements
    // left over after the vector loops
scalar_loop:
    for (; i < len; ++i) {
        npyv_lanetype_f32 a = ip[i];
    #if 1
        if (!(a <= s_acc)) {  // negated, for correct nan handling
    #else
        if (a > s_acc) {
    #endif
            s_acc = a;
            ret_idx = i;
        #if 1
            if (npy_isnan(s_acc)) {
                // nan encountered, it's maximal
                return ret_idx;
            }
        #endif
        }
    }
    return ret_idx;
}

#line 136
/*
 * Return the index of the smallest of the `len` single-precision values at
 * `ip`. When the minimum occurs more than once, the lowest index wins.
 * A NaN ends the search: the index of the first NaN found is returned.
 *
 * Inputs shorter than four vectors go straight to the scalar loop. Longer
 * inputs are scanned four vectors per iteration, then one vector per
 * iteration, tracking a per-lane minimum together with the index it came
 * from; the lanes are then reduced to one value/index pair and the scalar
 * loop finishes whatever elements are left.
 *
 * `ip[0]` is read before `len` is looked at, so `len` is assumed to be at
 * least 1.
 */
static inline npy_intp
simd_argmin_f32(npyv_lanetype_f32 *ip, npy_intp len)
{
    // running best value and its index, seeded with the first element
    npyv_lanetype_f32 s_acc = *ip;
    npy_intp ret_idx = 0, i = 0;
    // vstep: lanes per vector; wstep: elements consumed per unrolled iteration
    const int vstep = npyv_nlanes_f32;
    const int wstep = vstep*4;
    // loop by a scalar will perform better for small arrays
    if (len < wstep) {
        goto scalar_loop;
    }
    npy_intp len0 = len;
    // guard against wraparound vector addition for 32-bit indices
    // in case of the array length is larger than 16gb
    // (only the vector loops are bounded by len0; elements past the clamp are
    // still visited by the scalar loop, which runs up to len)
#if 1
    if (len0 > NPY_MAX_UINT32) {
        len0 = NPY_MAX_UINT32;
    }
#endif
    // create index for vector indices:
    // d_vindices holds 0..wstep-1 and vindices_k is its k-th vector-sized slice
    npyv_lanetype_u32 d_vindices[npyv_nlanes_f32*4];
    for (int vi = 0; vi < wstep; ++vi) {
        d_vindices[vi] = vi;
    }
    const npyv_u32 vindices_0 = npyv_load_u32(d_vindices);
    const npyv_u32 vindices_1 = npyv_load_u32(d_vindices + vstep);
    const npyv_u32 vindices_2 = npyv_load_u32(d_vindices + vstep*2);
    const npyv_u32 vindices_3 = npyv_load_u32(d_vindices + vstep*3);
    // initialize vector accumulator for lowest values and its indexes
    npyv_u32 acc_indices = npyv_zero_u32();
    npyv_f32 acc = npyv_setall_f32(s_acc);
    // unrolled loop, four vectors (a, b, c, d) per iteration;
    // `len0 & -wstep` rounds len0 down to a multiple of wstep
    // (relies on wstep being a power of two)
    for (npy_intp n = len0 & -wstep; i < n; i += wstep) {
        // base index of this block, broadcast to every lane
        npyv_u32 vi = npyv_setall_u32((npyv_lanetype_u32)i);
        npyv_f32 a = npyv_load_f32(ip + i);
        npyv_f32 b = npyv_load_f32(ip + i + vstep);
        npyv_f32 c = npyv_load_f32(ip + i + vstep*2);
        npyv_f32 d = npyv_load_f32(ip + i + vstep*3);

        // reverse to put lowest index first in case of matched values:
        // the strict `<` keeps the earlier vector whenever two lanes tie
        npyv_b32 m_ba = npyv_cmplt_f32(b, a);
        npyv_b32 m_dc = npyv_cmplt_f32(d, c);
        npyv_f32  x_ba = npyv_select_f32(m_ba, b, a);
        npyv_f32  x_dc = npyv_select_f32(m_dc, d, c);
        npyv_b32 m_dcba = npyv_cmplt_f32(x_dc, x_ba);
        npyv_f32  x_dcba = npyv_select_f32(m_dcba, x_dc, x_ba);

        // per-lane offset, relative to the block start, of the value picked above
        npyv_u32 idx_ba = npyv_select_u32(m_ba, vindices_1, vindices_0);
        npyv_u32 idx_dc = npyv_select_u32(m_dc, vindices_3, vindices_2);
        npyv_u32 idx_dcba = npyv_select_u32(m_dcba, idx_dc, idx_ba);
        // replace the accumulated value/index only in lanes that strictly improve
        npyv_b32 m_acc = npyv_cmplt_f32(x_dcba, acc);
        acc = npyv_select_f32(m_acc, x_dcba, acc);
        acc_indices = npyv_select_u32(m_acc, npyv_add_u32(vi, idx_dcba), acc_indices);

    // NaN handling: if any of the four vectors holds a NaN, return the index
    // of the first NaN in this block right away.
    // (npyv_notnan_* / npyv_tobits_* are defined elsewhere; presumably bit k
    // of the result is set when lane k is not NaN)
    #if 1
        npyv_b32 nnan_a = npyv_notnan_f32(a);
        npyv_b32 nnan_b = npyv_notnan_f32(b);
        npyv_b32 nnan_c = npyv_notnan_f32(c);
        npyv_b32 nnan_d = npyv_notnan_f32(d);
        npyv_b32 nnan_ab = npyv_and_b32(nnan_a, nnan_b);
        npyv_b32 nnan_cd = npyv_and_b32(nnan_c, nnan_d);
        npy_uint64 nnan = npyv_tobits_b32(npyv_and_b32(nnan_ab, nnan_cd));
        // the low vstep bits are all set only when no lane of a..d is NaN
        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
            npy_uint64 nnan_4[4];
            nnan_4[0] = npyv_tobits_b32(nnan_a);
            nnan_4[1] = npyv_tobits_b32(nnan_b);
            nnan_4[2] = npyv_tobits_b32(nnan_c);
            nnan_4[3] = npyv_tobits_b32(nnan_d);
            // walk the vectors and lanes in memory order to find the first NaN
            // NOTE(review): the lane counter `vi` below shadows the vector `vi`
            // declared at the top of this loop body
            for (int ni = 0; ni < 4; ++ni) {
                for (int vi = 0; vi < vstep; ++vi) {
                    if (!((nnan_4[ni] >> vi) & 1)) {
                        return i + ni*vstep + vi;
                    }
                }
            }
        }
    #endif
    }
    // leftover whole vectors, one per iteration
    for (npy_intp n = len0 & -vstep; i < n; i += vstep) {
        npyv_u32 vi = npyv_setall_u32((npyv_lanetype_u32)i);
        npyv_f32 a = npyv_load_f32(ip + i);
        npyv_b32 m_acc = npyv_cmplt_f32(a, acc);
        acc = npyv_select_f32(m_acc, a, acc);
        acc_indices = npyv_select_u32(m_acc, npyv_add_u32(vi, vindices_0), acc_indices);
    // same NaN early exit as above, for a single vector
    #if 1
        npyv_b32 nnan_a = npyv_notnan_f32(a);
        npy_uint64 nnan = npyv_tobits_b32(nnan_a);
        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
            for (int vi = 0; vi < vstep; ++vi) {
                if (!((nnan >> vi) & 1)) {
                    return i + vi;
                }
            }
        }
    #endif
    }

    // reduce: spill both accumulators to memory and pick the best lane
    npyv_lanetype_f32 dacc[npyv_nlanes_f32];
    npyv_lanetype_u32 dacc_i[npyv_nlanes_f32];
    npyv_store_u32(dacc_i, acc_indices);
    npyv_store_f32(dacc, acc);

    s_acc = dacc[0];
    ret_idx = dacc_i[0];
    for (int vi = 1; vi < vstep; ++vi) {
        if (dacc[vi] < s_acc) {
            s_acc = dacc[vi];
            ret_idx = (npy_intp)dacc_i[vi];
        }
    }
    // get the lowest index in case of matched values
    for (int vi = 0; vi < vstep; ++vi) {
        if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) {
            ret_idx = dacc_i[vi];
        }
    }
    // scalar path: the whole input when len < wstep, otherwise the elements
    // left over after the vector loops
scalar_loop:
    for (; i < len; ++i) {
        npyv_lanetype_f32 a = ip[i];
    #if 1
        if (!(a >= s_acc)) {  // negated, for correct nan handling
    #else
        if (a < s_acc) {
    #endif
            s_acc = a;
            ret_idx = i;
        #if 1
            if (npy_isnan(s_acc)) {
                // nan encountered, it's treated as the minimum; stop searching
                return ret_idx;
            }
        #endif
        }
    }
    return ret_idx;
}

#endif // chk_simd

#line 129
#if NPY_SIMD_F64
#line 136
/*
 * Return the index of the largest of the `len` double-precision values at
 * `ip`. When the maximum occurs more than once, the lowest index wins.
 * A NaN ends the search: the index of the first NaN found is returned.
 *
 * Inputs shorter than four vectors go straight to the scalar loop. Longer
 * inputs are scanned four vectors per iteration, then one vector per
 * iteration, tracking a per-lane maximum together with the index it came
 * from; the lanes are then reduced to one value/index pair and the scalar
 * loop finishes whatever elements are left.
 *
 * `ip[0]` is read before `len` is looked at, so `len` is assumed to be at
 * least 1.
 */
static inline npy_intp
simd_argmax_f64(npyv_lanetype_f64 *ip, npy_intp len)
{
    // running best value and its index, seeded with the first element
    npyv_lanetype_f64 s_acc = *ip;
    npy_intp ret_idx = 0, i = 0;
    // vstep: lanes per vector; wstep: elements consumed per unrolled iteration
    const int vstep = npyv_nlanes_f64;
    const int wstep = vstep*4;
    // loop by a scalar will perform better for small arrays
    if (len < wstep) {
        goto scalar_loop;
    }
    npy_intp len0 = len;
    // guard against wraparound vector addition for 32-bit indices
    // in case of the array length is larger than 16gb
    // (compiled out here: this variant keeps its indices in 64-bit lanes)
#if 0
    if (len0 > NPY_MAX_UINT32) {
        len0 = NPY_MAX_UINT32;
    }
#endif
    // create index for vector indices:
    // d_vindices holds 0..wstep-1 and vindices_k is its k-th vector-sized slice
    npyv_lanetype_u64 d_vindices[npyv_nlanes_f64*4];
    for (int vi = 0; vi < wstep; ++vi) {
        d_vindices[vi] = vi;
    }
    const npyv_u64 vindices_0 = npyv_load_u64(d_vindices);
    const npyv_u64 vindices_1 = npyv_load_u64(d_vindices + vstep);
    const npyv_u64 vindices_2 = npyv_load_u64(d_vindices + vstep*2);
    const npyv_u64 vindices_3 = npyv_load_u64(d_vindices + vstep*3);
    // initialize vector accumulator for highest values and its indexes
    npyv_u64 acc_indices = npyv_zero_u64();
    npyv_f64 acc = npyv_setall_f64(s_acc);
    // unrolled loop, four vectors (a, b, c, d) per iteration;
    // `len0 & -wstep` rounds len0 down to a multiple of wstep
    // (relies on wstep being a power of two)
    for (npy_intp n = len0 & -wstep; i < n; i += wstep) {
        // base index of this block, broadcast to every lane
        npyv_u64 vi = npyv_setall_u64((npyv_lanetype_u64)i);
        npyv_f64 a = npyv_load_f64(ip + i);
        npyv_f64 b = npyv_load_f64(ip + i + vstep);
        npyv_f64 c = npyv_load_f64(ip + i + vstep*2);
        npyv_f64 d = npyv_load_f64(ip + i + vstep*3);

        // reverse to put lowest index first in case of matched values:
        // the strict `>` keeps the earlier vector whenever two lanes tie
        npyv_b64 m_ba = npyv_cmpgt_f64(b, a);
        npyv_b64 m_dc = npyv_cmpgt_f64(d, c);
        npyv_f64  x_ba = npyv_select_f64(m_ba, b, a);
        npyv_f64  x_dc = npyv_select_f64(m_dc, d, c);
        npyv_b64 m_dcba = npyv_cmpgt_f64(x_dc, x_ba);
        npyv_f64  x_dcba = npyv_select_f64(m_dcba, x_dc, x_ba);

        // per-lane offset, relative to the block start, of the value picked above
        npyv_u64 idx_ba = npyv_select_u64(m_ba, vindices_1, vindices_0);
        npyv_u64 idx_dc = npyv_select_u64(m_dc, vindices_3, vindices_2);
        npyv_u64 idx_dcba = npyv_select_u64(m_dcba, idx_dc, idx_ba);
        // replace the accumulated value/index only in lanes that strictly improve
        npyv_b64 m_acc = npyv_cmpgt_f64(x_dcba, acc);
        acc = npyv_select_f64(m_acc, x_dcba, acc);
        acc_indices = npyv_select_u64(m_acc, npyv_add_u64(vi, idx_dcba), acc_indices);

    // NaN handling: if any of the four vectors holds a NaN, return the index
    // of the first NaN in this block right away.
    // (npyv_notnan_* / npyv_tobits_* are defined elsewhere; presumably bit k
    // of the result is set when lane k is not NaN)
    #if 1
        npyv_b64 nnan_a = npyv_notnan_f64(a);
        npyv_b64 nnan_b = npyv_notnan_f64(b);
        npyv_b64 nnan_c = npyv_notnan_f64(c);
        npyv_b64 nnan_d = npyv_notnan_f64(d);
        npyv_b64 nnan_ab = npyv_and_b64(nnan_a, nnan_b);
        npyv_b64 nnan_cd = npyv_and_b64(nnan_c, nnan_d);
        npy_uint64 nnan = npyv_tobits_b64(npyv_and_b64(nnan_ab, nnan_cd));
        // the low vstep bits are all set only when no lane of a..d is NaN
        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
            npy_uint64 nnan_4[4];
            nnan_4[0] = npyv_tobits_b64(nnan_a);
            nnan_4[1] = npyv_tobits_b64(nnan_b);
            nnan_4[2] = npyv_tobits_b64(nnan_c);
            nnan_4[3] = npyv_tobits_b64(nnan_d);
            // walk the vectors and lanes in memory order to find the first NaN
            // NOTE(review): the lane counter `vi` below shadows the vector `vi`
            // declared at the top of this loop body
            for (int ni = 0; ni < 4; ++ni) {
                for (int vi = 0; vi < vstep; ++vi) {
                    if (!((nnan_4[ni] >> vi) & 1)) {
                        return i + ni*vstep + vi;
                    }
                }
            }
        }
    #endif
    }
    // leftover whole vectors, one per iteration
    for (npy_intp n = len0 & -vstep; i < n; i += vstep) {
        npyv_u64 vi = npyv_setall_u64((npyv_lanetype_u64)i);
        npyv_f64 a = npyv_load_f64(ip + i);
        npyv_b64 m_acc = npyv_cmpgt_f64(a, acc);
        acc = npyv_select_f64(m_acc, a, acc);
        acc_indices = npyv_select_u64(m_acc, npyv_add_u64(vi, vindices_0), acc_indices);
    // same NaN early exit as above, for a single vector
    #if 1
        npyv_b64 nnan_a = npyv_notnan_f64(a);
        npy_uint64 nnan = npyv_tobits_b64(nnan_a);
        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
            for (int vi = 0; vi < vstep; ++vi) {
                if (!((nnan >> vi) & 1)) {
                    return i + vi;
                }
            }
        }
    #endif
    }

    // reduce: spill both accumulators to memory and pick the best lane
    npyv_lanetype_f64 dacc[npyv_nlanes_f64];
    npyv_lanetype_u64 dacc_i[npyv_nlanes_f64];
    npyv_store_u64(dacc_i, acc_indices);
    npyv_store_f64(dacc, acc);

    s_acc = dacc[0];
    ret_idx = dacc_i[0];
    for (int vi = 1; vi < vstep; ++vi) {
        if (dacc[vi] > s_acc) {
            s_acc = dacc[vi];
            ret_idx = (npy_intp)dacc_i[vi];
        }
    }
    // get the lowest index in case of matched values
    for (int vi = 0; vi < vstep; ++vi) {
        if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) {
            ret_idx = dacc_i[vi];
        }
    }
    // scalar path: the whole input when len < wstep, otherwise the elements
    // left over after the vector loops
scalar_loop:
    for (; i < len; ++i) {
        npyv_lanetype_f64 a = ip[i];
    #if 1
        if (!(a <= s_acc)) {  // negated, for correct nan handling
    #else
        if (a > s_acc) {
    #endif
            s_acc = a;
            ret_idx = i;
        #if 1
            if (npy_isnan(s_acc)) {
                // nan encountered, it's maximal
                return ret_idx;
            }
        #endif
        }
    }
    return ret_idx;
}

#line 136
/*
 * Return the index of the smallest of the `len` double-precision values at
 * `ip`. When the minimum occurs more than once, the lowest index wins.
 * A NaN ends the search: the index of the first NaN found is returned.
 *
 * Inputs shorter than four vectors go straight to the scalar loop. Longer
 * inputs are scanned four vectors per iteration, then one vector per
 * iteration, tracking a per-lane minimum together with the index it came
 * from; the lanes are then reduced to one value/index pair and the scalar
 * loop finishes whatever elements are left.
 *
 * `ip[0]` is read before `len` is looked at, so `len` is assumed to be at
 * least 1.
 */
static inline npy_intp
simd_argmin_f64(npyv_lanetype_f64 *ip, npy_intp len)
{
    // running best value and its index, seeded with the first element
    npyv_lanetype_f64 s_acc = *ip;
    npy_intp ret_idx = 0, i = 0;
    // vstep: lanes per vector; wstep: elements consumed per unrolled iteration
    const int vstep = npyv_nlanes_f64;
    const int wstep = vstep*4;
    // loop by a scalar will perform better for small arrays
    if (len < wstep) {
        goto scalar_loop;
    }
    npy_intp len0 = len;
    // guard against wraparound vector addition for 32-bit indices
    // in case of the array length is larger than 16gb
    // (compiled out here: this variant keeps its indices in 64-bit lanes)
#if 0
    if (len0 > NPY_MAX_UINT32) {
        len0 = NPY_MAX_UINT32;
    }
#endif
    // create index for vector indices:
    // d_vindices holds 0..wstep-1 and vindices_k is its k-th vector-sized slice
    npyv_lanetype_u64 d_vindices[npyv_nlanes_f64*4];
    for (int vi = 0; vi < wstep; ++vi) {
        d_vindices[vi] = vi;
    }
    const npyv_u64 vindices_0 = npyv_load_u64(d_vindices);
    const npyv_u64 vindices_1 = npyv_load_u64(d_vindices + vstep);
    const npyv_u64 vindices_2 = npyv_load_u64(d_vindices + vstep*2);
    const npyv_u64 vindices_3 = npyv_load_u64(d_vindices + vstep*3);
    // initialize vector accumulator for lowest values and its indexes
    npyv_u64 acc_indices = npyv_zero_u64();
    npyv_f64 acc = npyv_setall_f64(s_acc);
    // unrolled loop, four vectors (a, b, c, d) per iteration;
    // `len0 & -wstep` rounds len0 down to a multiple of wstep
    // (relies on wstep being a power of two)
    for (npy_intp n = len0 & -wstep; i < n; i += wstep) {
        // base index of this block, broadcast to every lane
        npyv_u64 vi = npyv_setall_u64((npyv_lanetype_u64)i);
        npyv_f64 a = npyv_load_f64(ip + i);
        npyv_f64 b = npyv_load_f64(ip + i + vstep);
        npyv_f64 c = npyv_load_f64(ip + i + vstep*2);
        npyv_f64 d = npyv_load_f64(ip + i + vstep*3);

        // reverse to put lowest index first in case of matched values:
        // the strict `<` keeps the earlier vector whenever two lanes tie
        npyv_b64 m_ba = npyv_cmplt_f64(b, a);
        npyv_b64 m_dc = npyv_cmplt_f64(d, c);
        npyv_f64  x_ba = npyv_select_f64(m_ba, b, a);
        npyv_f64  x_dc = npyv_select_f64(m_dc, d, c);
        npyv_b64 m_dcba = npyv_cmplt_f64(x_dc, x_ba);
        npyv_f64  x_dcba = npyv_select_f64(m_dcba, x_dc, x_ba);

        // per-lane offset, relative to the block start, of the value picked above
        npyv_u64 idx_ba = npyv_select_u64(m_ba, vindices_1, vindices_0);
        npyv_u64 idx_dc = npyv_select_u64(m_dc, vindices_3, vindices_2);
        npyv_u64 idx_dcba = npyv_select_u64(m_dcba, idx_dc, idx_ba);
        // replace the accumulated value/index only in lanes that strictly improve
        npyv_b64 m_acc = npyv_cmplt_f64(x_dcba, acc);
        acc = npyv_select_f64(m_acc, x_dcba, acc);
        acc_indices = npyv_select_u64(m_acc, npyv_add_u64(vi, idx_dcba), acc_indices);

    // NaN handling: if any of the four vectors holds a NaN, return the index
    // of the first NaN in this block right away.
    // (npyv_notnan_* / npyv_tobits_* are defined elsewhere; presumably bit k
    // of the result is set when lane k is not NaN)
    #if 1
        npyv_b64 nnan_a = npyv_notnan_f64(a);
        npyv_b64 nnan_b = npyv_notnan_f64(b);
        npyv_b64 nnan_c = npyv_notnan_f64(c);
        npyv_b64 nnan_d = npyv_notnan_f64(d);
        npyv_b64 nnan_ab = npyv_and_b64(nnan_a, nnan_b);
        npyv_b64 nnan_cd = npyv_and_b64(nnan_c, nnan_d);
        npy_uint64 nnan = npyv_tobits_b64(npyv_and_b64(nnan_ab, nnan_cd));
        // the low vstep bits are all set only when no lane of a..d is NaN
        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
            npy_uint64 nnan_4[4];
            nnan_4[0] = npyv_tobits_b64(nnan_a);
            nnan_4[1] = npyv_tobits_b64(nnan_b);
            nnan_4[2] = npyv_tobits_b64(nnan_c);
            nnan_4[3] = npyv_tobits_b64(nnan_d);
            // walk the vectors and lanes in memory order to find the first NaN
            // NOTE(review): the lane counter `vi` below shadows the vector `vi`
            // declared at the top of this loop body
            for (int ni = 0; ni < 4; ++ni) {
                for (int vi = 0; vi < vstep; ++vi) {
                    if (!((nnan_4[ni] >> vi) & 1)) {
                        return i + ni*vstep + vi;
                    }
                }
            }
        }
    #endif
    }
    // leftover whole vectors, one per iteration
    for (npy_intp n = len0 & -vstep; i < n; i += vstep) {
        npyv_u64 vi = npyv_setall_u64((npyv_lanetype_u64)i);
        npyv_f64 a = npyv_load_f64(ip + i);
        npyv_b64 m_acc = npyv_cmplt_f64(a, acc);
        acc = npyv_select_f64(m_acc, a, acc);
        acc_indices = npyv_select_u64(m_acc, npyv_add_u64(vi, vindices_0), acc_indices);
    // same NaN early exit as above, for a single vector
    #if 1
        npyv_b64 nnan_a = npyv_notnan_f64(a);
        npy_uint64 nnan = npyv_tobits_b64(nnan_a);
        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
            for (int vi = 0; vi < vstep; ++vi) {
                if (!((nnan >> vi) & 1)) {
                    return i + vi;
                }
            }
        }
    #endif
    }

    // reduce: spill both accumulators to memory and pick the best lane
    npyv_lanetype_f64 dacc[npyv_nlanes_f64];
    npyv_lanetype_u64 dacc_i[npyv_nlanes_f64];
    npyv_store_u64(dacc_i, acc_indices);
    npyv_store_f64(dacc, acc);

    s_acc = dacc[0];
    ret_idx = dacc_i[0];
    for (int vi = 1; vi < vstep; ++vi) {
        if (dacc[vi] < s_acc) {
            s_acc = dacc[vi];
            ret_idx = (npy_intp)dacc_i[vi];
        }
    }
    // get the lowest index in case of matched values
    for (int vi = 0; vi < vstep; ++vi) {
        if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) {
            ret_idx = dacc_i[vi];
        }
    }
    // scalar path: the whole input when len < wstep, otherwise the elements
    // left over after the vector loops
scalar_loop:
    for (; i < len; ++i) {
        npyv_lanetype_f64 a = ip[i];
    #if 1
        if (!(a >= s_acc)) {  // negated, for correct nan handling
    #else
        if (a < s_acc) {
    #endif
            s_acc = a;
            ret_idx = i;
        #if 1
            if (npy_isnan(s_acc)) {
                // nan encountered, it's treated as the minimum; stop searching
                return ret_idx;
            }
        #endif
        }
    }
    return ret_idx;
}

#endif // chk_simd


#line 291
#undef TO_SIMD_SFX
/*
 * Pick the kernel suffix for the unsigned byte type: TO_SIMD_SFX(X) pastes
 * the unsigned lane suffix whose width equals NPY_BITSOF_BYTE onto X.
 * The macro is left undefined when NPY_SIMD is zero or when the width is
 * not one of 8/16/32/64.
 */
#if NPY_SIMD
    #if NPY_BITSOF_BYTE == 8
        #define TO_SIMD_SFX(X) X##_u8
    #elif NPY_BITSOF_BYTE == 16
        #define TO_SIMD_SFX(X) X##_u16
    #elif NPY_BITSOF_BYTE == 32
        #define TO_SIMD_SFX(X) X##_u32
    #elif NPY_BITSOF_BYTE == 64
        #define TO_SIMD_SFX(X) X##_u64
    #endif
#endif

#line 318
/*
 * argmax over `n` npy_ubyte values starting at `ip`.
 *
 * The resulting index is stored in `*mindx`; the function always returns 0.
 * When TO_SIMD_SFX is defined the work is delegated to the matching
 * simd_argmax_* kernel; otherwise a scalar scan keeps the first occurrence
 * of the largest value. `ip[0]` is read unconditionally by the scalar scan.
 * The array-object argument is unused.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_argmax)
(npy_ubyte *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifndef TO_SIMD_SFX
    // scalar fallback: strict comparison, so an equal later value never
    // replaces the current best
    npy_ubyte best = ip[0];
    npy_intp pos;

    *mindx = 0;
    for (pos = 1; pos < n; ++pos) {
        const npy_ubyte cur = ip[pos];
        if (cur > best) {
            best = cur;
            *mindx = pos;
        }
    }
#else
    *mindx = TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    npyv_cleanup();
#endif // TO_SIMD_SFX
    return 0;
}

#line 318
/*
 * argmin over `n` npy_ubyte values starting at `ip`.
 *
 * The resulting index is stored in `*mindx`; the function always returns 0.
 * When TO_SIMD_SFX is defined the work is delegated to the matching
 * simd_argmin_* kernel; otherwise a scalar scan keeps the first occurrence
 * of the smallest value. `ip[0]` is read unconditionally by the scalar scan.
 * The array-object argument is unused.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_argmin)
(npy_ubyte *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifndef TO_SIMD_SFX
    // scalar fallback: strict comparison, so an equal later value never
    // replaces the current best
    npy_ubyte best = ip[0];
    npy_intp pos;

    *mindx = 0;
    for (pos = 1; pos < n; ++pos) {
        const npy_ubyte cur = ip[pos];
        if (cur < best) {
            best = cur;
            *mindx = pos;
        }
    }
#else
    *mindx = TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    npyv_cleanup();
#endif // TO_SIMD_SFX
    return 0;
}


#line 291
#undef TO_SIMD_SFX
/*
 * Pick the kernel suffix for the unsigned short type: TO_SIMD_SFX(X) pastes
 * the unsigned lane suffix whose width equals NPY_BITSOF_SHORT onto X.
 * The macro is left undefined when NPY_SIMD is zero or when the width is
 * not one of 8/16/32/64.
 */
#if NPY_SIMD
    #if NPY_BITSOF_SHORT == 8
        #define TO_SIMD_SFX(X) X##_u8
    #elif NPY_BITSOF_SHORT == 16
        #define TO_SIMD_SFX(X) X##_u16
    #elif NPY_BITSOF_SHORT == 32
        #define TO_SIMD_SFX(X) X##_u32
    #elif NPY_BITSOF_SHORT == 64
        #define TO_SIMD_SFX(X) X##_u64
    #endif
#endif

#line 318
/*
 * argmax over `n` npy_ushort values starting at `ip`.
 *
 * The resulting index is stored in `*mindx`; the function always returns 0.
 * When TO_SIMD_SFX is defined the work is delegated to the matching
 * simd_argmax_* kernel; otherwise a scalar scan keeps the first occurrence
 * of the largest value. `ip[0]` is read unconditionally by the scalar scan.
 * The array-object argument is unused.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_argmax)
(npy_ushort *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifndef TO_SIMD_SFX
    // scalar fallback: strict comparison, so an equal later value never
    // replaces the current best
    npy_ushort best = ip[0];
    npy_intp pos;

    *mindx = 0;
    for (pos = 1; pos < n; ++pos) {
        const npy_ushort cur = ip[pos];
        if (cur > best) {
            best = cur;
            *mindx = pos;
        }
    }
#else
    *mindx = TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    npyv_cleanup();
#endif // TO_SIMD_SFX
    return 0;
}

#line 318
/*
 * argmin over `n` npy_ushort values starting at `ip`.
 *
 * The resulting index is stored in `*mindx`; the function always returns 0.
 * When TO_SIMD_SFX is defined the work is delegated to the matching
 * simd_argmin_* kernel; otherwise a scalar scan keeps the first occurrence
 * of the smallest value. `ip[0]` is read unconditionally by the scalar scan.
 * The array-object argument is unused.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_argmin)
(npy_ushort *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifndef TO_SIMD_SFX
    // scalar fallback: strict comparison, so an equal later value never
    // replaces the current best
    npy_ushort best = ip[0];
    npy_intp pos;

    *mindx = 0;
    for (pos = 1; pos < n; ++pos) {
        const npy_ushort cur = ip[pos];
        if (cur < best) {
            best = cur;
            *mindx = pos;
        }
    }
#else
    *mindx = TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    npyv_cleanup();
#endif // TO_SIMD_SFX
    return 0;
}


#line 291
#undef TO_SIMD_SFX
/*
 * Pick the kernel suffix for the unsigned int type: TO_SIMD_SFX(X) pastes
 * the unsigned lane suffix whose width equals NPY_BITSOF_INT onto X.
 * The macro is left undefined when NPY_SIMD is zero or when the width is
 * not one of 8/16/32/64.
 */
#if NPY_SIMD
    #if NPY_BITSOF_INT == 8
        #define TO_SIMD_SFX(X) X##_u8
    #elif NPY_BITSOF_INT == 16
        #define TO_SIMD_SFX(X) X##_u16
    #elif NPY_BITSOF_INT == 32
        #define TO_SIMD_SFX(X) X##_u32
    #elif NPY_BITSOF_INT == 64
        #define TO_SIMD_SFX(X) X##_u64
    #endif
#endif

#line 318
/*
 * Argmax over `n` npy_uint values starting at `ip`; the index is stored in
 * `*mindx` and the function always returns 0. The `aip` argument is unused.
 *
 * With TO_SIMD_SFX defined the work is delegated to the matching simd_argmax
 * kernel; otherwise a scalar scan keeps the first index holding the largest
 * value (ties do not move the index). The scalar scan reads `ip[0]`
 * unconditionally.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_argmax)
(npy_uint *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_uint largest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_uint cur = ip[pos];
        if (cur <= largest) {
            continue;  // not a new maximum; the earlier index wins ties
        }
        largest = cur;
        *mindx = pos;
    }
#endif // TO_SIMD_SFX
    return 0;
}

#line 318
/*
 * Argmin over `n` npy_uint values starting at `ip`; the index is stored in
 * `*mindx` and the function always returns 0. The `aip` argument is unused.
 *
 * With TO_SIMD_SFX defined the work is delegated to the matching simd_argmin
 * kernel; otherwise a scalar scan keeps the first index holding the smallest
 * value (ties do not move the index). The scalar scan reads `ip[0]`
 * unconditionally.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_argmin)
(npy_uint *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_uint smallest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_uint cur = ip[pos];
        if (cur >= smallest) {
            continue;  // not a new minimum; the earlier index wins ties
        }
        smallest = cur;
        *mindx = pos;
    }
#endif // TO_SIMD_SFX
    return 0;
}


#line 291
// Choose the SIMD suffix for the ULONG kernels that follow.
// One arm per lane width (8/16/32/64) is tested against NPY_BITSOF_LONG and
// requires NPY_SIMD; the "#elif 1" inside each arm selects the unsigned (_uN)
// suffix. If no arm matches, TO_SIMD_SFX stays undefined and the kernels,
// which test it with #ifdef, compile their scalar loop instead.
#undef TO_SIMD_SFX
#if 0
#line 296
#elif NPY_SIMD && NPY_BITSOF_LONG == 8
    // Inner chain: float suffix (#if), unsigned suffix (#elif), signed suffix (#else).
    #if 0
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_LONG == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_LONG == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_LONG == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 318
/*
 * Argmax over `n` npy_ulong values starting at `ip`; the index is stored in
 * `*mindx` and the function always returns 0. The `aip` argument is unused.
 *
 * With TO_SIMD_SFX defined the work is delegated to the matching simd_argmax
 * kernel; otherwise a scalar scan keeps the first index holding the largest
 * value (ties do not move the index). The scalar scan reads `ip[0]`
 * unconditionally.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_argmax)
(npy_ulong *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_ulong largest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_ulong cur = ip[pos];
        if (cur <= largest) {
            continue;  // not a new maximum; the earlier index wins ties
        }
        largest = cur;
        *mindx = pos;
    }
#endif // TO_SIMD_SFX
    return 0;
}

#line 318
/*
 * Argmin over `n` npy_ulong values starting at `ip`; the index is stored in
 * `*mindx` and the function always returns 0. The `aip` argument is unused.
 *
 * With TO_SIMD_SFX defined the work is delegated to the matching simd_argmin
 * kernel; otherwise a scalar scan keeps the first index holding the smallest
 * value (ties do not move the index). The scalar scan reads `ip[0]`
 * unconditionally.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_argmin)
(npy_ulong *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_ulong smallest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_ulong cur = ip[pos];
        if (cur >= smallest) {
            continue;  // not a new minimum; the earlier index wins ties
        }
        smallest = cur;
        *mindx = pos;
    }
#endif // TO_SIMD_SFX
    return 0;
}


#line 291
// Choose the SIMD suffix for the ULONGLONG kernels that follow.
// One arm per lane width (8/16/32/64) is tested against NPY_BITSOF_LONGLONG
// and requires NPY_SIMD; the "#elif 1" inside each arm selects the unsigned
// (_uN) suffix. If no arm matches, TO_SIMD_SFX stays undefined and the
// kernels, which test it with #ifdef, compile their scalar loop instead.
#undef TO_SIMD_SFX
#if 0
#line 296
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8
    // Inner chain: float suffix (#if), unsigned suffix (#elif), signed suffix (#else).
    #if 0
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 1
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 318
/*
 * Argmax over `n` npy_ulonglong values starting at `ip`; the index is stored
 * in `*mindx` and the function always returns 0. The `aip` argument is unused.
 *
 * With TO_SIMD_SFX defined the work is delegated to the matching simd_argmax
 * kernel; otherwise a scalar scan keeps the first index holding the largest
 * value (ties do not move the index). The scalar scan reads `ip[0]`
 * unconditionally.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_argmax)
(npy_ulonglong *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_ulonglong largest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_ulonglong cur = ip[pos];
        if (cur <= largest) {
            continue;  // not a new maximum; the earlier index wins ties
        }
        largest = cur;
        *mindx = pos;
    }
#endif // TO_SIMD_SFX
    return 0;
}

#line 318
/*
 * Argmin over `n` npy_ulonglong values starting at `ip`; the index is stored
 * in `*mindx` and the function always returns 0. The `aip` argument is unused.
 *
 * With TO_SIMD_SFX defined the work is delegated to the matching simd_argmin
 * kernel; otherwise a scalar scan keeps the first index holding the smallest
 * value (ties do not move the index). The scalar scan reads `ip[0]`
 * unconditionally.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_argmin)
(npy_ulonglong *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_ulonglong smallest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_ulonglong cur = ip[pos];
        if (cur >= smallest) {
            continue;  // not a new minimum; the earlier index wins ties
        }
        smallest = cur;
        *mindx = pos;
    }
#endif // TO_SIMD_SFX
    return 0;
}


#line 291
// Choose the SIMD suffix for the BYTE kernels that follow.
// One arm per lane width (8/16/32/64) is tested against NPY_BITSOF_BYTE and
// requires NPY_SIMD; inside each arm the "#if 0" and "#elif 0" tests both
// fail, so the "#else" selects the signed (_sN) suffix. If no arm matches,
// TO_SIMD_SFX stays undefined and the kernels, which test it with #ifdef,
// compile their scalar loop instead.
#undef TO_SIMD_SFX
#if 0
#line 296
#elif NPY_SIMD && NPY_BITSOF_BYTE == 8
    // Inner chain: float suffix (#if), unsigned suffix (#elif), signed suffix (#else).
    #if 0
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_BYTE == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_BYTE == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_BYTE == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 318
/*
 * Argmax over `n` npy_byte values starting at `ip`; the index is stored in
 * `*mindx` and the function always returns 0. The `aip` argument is unused.
 *
 * With TO_SIMD_SFX defined the work is delegated to the matching simd_argmax
 * kernel; otherwise a scalar scan keeps the first index holding the largest
 * value (ties do not move the index). The scalar scan reads `ip[0]`
 * unconditionally.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_argmax)
(npy_byte *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_byte largest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_byte cur = ip[pos];
        if (cur <= largest) {
            continue;  // not a new maximum; the earlier index wins ties
        }
        largest = cur;
        *mindx = pos;
    }
#endif // TO_SIMD_SFX
    return 0;
}

#line 318
/*
 * Argmin over `n` npy_byte values starting at `ip`; the index is stored in
 * `*mindx` and the function always returns 0. The `aip` argument is unused.
 *
 * With TO_SIMD_SFX defined the work is delegated to the matching simd_argmin
 * kernel; otherwise a scalar scan keeps the first index holding the smallest
 * value (ties do not move the index). The scalar scan reads `ip[0]`
 * unconditionally.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_argmin)
(npy_byte *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_byte smallest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_byte cur = ip[pos];
        if (cur >= smallest) {
            continue;  // not a new minimum; the earlier index wins ties
        }
        smallest = cur;
        *mindx = pos;
    }
#endif // TO_SIMD_SFX
    return 0;
}


#line 291
// Choose the SIMD suffix for the SHORT kernels that follow.
// One arm per lane width (8/16/32/64) is tested against NPY_BITSOF_SHORT and
// requires NPY_SIMD; inside each arm the "#if 0" and "#elif 0" tests both
// fail, so the "#else" selects the signed (_sN) suffix. If no arm matches,
// TO_SIMD_SFX stays undefined and the kernels, which test it with #ifdef,
// compile their scalar loop instead.
#undef TO_SIMD_SFX
#if 0
#line 296
#elif NPY_SIMD && NPY_BITSOF_SHORT == 8
    // Inner chain: float suffix (#if), unsigned suffix (#elif), signed suffix (#else).
    #if 0
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_SHORT == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_SHORT == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_SHORT == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 318
/*
 * Argmax over `n` npy_short values starting at `ip`; the index is stored in
 * `*mindx` and the function always returns 0. The `aip` argument is unused.
 *
 * With TO_SIMD_SFX defined the work is delegated to the matching simd_argmax
 * kernel; otherwise a scalar scan keeps the first index holding the largest
 * value (ties do not move the index). The scalar scan reads `ip[0]`
 * unconditionally.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_argmax)
(npy_short *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_short largest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_short cur = ip[pos];
        if (cur <= largest) {
            continue;  // not a new maximum; the earlier index wins ties
        }
        largest = cur;
        *mindx = pos;
    }
#endif // TO_SIMD_SFX
    return 0;
}

#line 318
/*
 * Argmin over `n` npy_short values starting at `ip`; the index is stored in
 * `*mindx` and the function always returns 0. The `aip` argument is unused.
 *
 * With TO_SIMD_SFX defined the work is delegated to the matching simd_argmin
 * kernel; otherwise a scalar scan keeps the first index holding the smallest
 * value (ties do not move the index). The scalar scan reads `ip[0]`
 * unconditionally.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_argmin)
(npy_short *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_short smallest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_short cur = ip[pos];
        if (cur >= smallest) {
            continue;  // not a new minimum; the earlier index wins ties
        }
        smallest = cur;
        *mindx = pos;
    }
#endif // TO_SIMD_SFX
    return 0;
}


#line 291
// Choose the SIMD suffix for the INT kernels that follow.
// One arm per lane width (8/16/32/64) is tested against NPY_BITSOF_INT and
// requires NPY_SIMD; inside each arm the "#if 0" and "#elif 0" tests both
// fail, so the "#else" selects the signed (_sN) suffix. If no arm matches,
// TO_SIMD_SFX stays undefined and the kernels, which test it with #ifdef,
// compile their scalar loop instead.
#undef TO_SIMD_SFX
#if 0
#line 296
#elif NPY_SIMD && NPY_BITSOF_INT == 8
    // Inner chain: float suffix (#if), unsigned suffix (#elif), signed suffix (#else).
    #if 0
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_INT == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_INT == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_INT == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 318
/*
 * Argmax over `n` npy_int values starting at `ip`; the index is stored in
 * `*mindx` and the function always returns 0. The `aip` argument is unused.
 *
 * With TO_SIMD_SFX defined the work is delegated to the matching simd_argmax
 * kernel; otherwise a scalar scan keeps the first index holding the largest
 * value (ties do not move the index). The scalar scan reads `ip[0]`
 * unconditionally.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_argmax)
(npy_int *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_int largest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_int cur = ip[pos];
        if (cur <= largest) {
            continue;  // not a new maximum; the earlier index wins ties
        }
        largest = cur;
        *mindx = pos;
    }
#endif // TO_SIMD_SFX
    return 0;
}

#line 318
/*
 * Argmin over `n` npy_int values starting at `ip`; the index is stored in
 * `*mindx` and the function always returns 0. The `aip` argument is unused.
 *
 * With TO_SIMD_SFX defined the work is delegated to the matching simd_argmin
 * kernel; otherwise a scalar scan keeps the first index holding the smallest
 * value (ties do not move the index). The scalar scan reads `ip[0]`
 * unconditionally.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_argmin)
(npy_int *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_int smallest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_int cur = ip[pos];
        if (cur >= smallest) {
            continue;  // not a new minimum; the earlier index wins ties
        }
        smallest = cur;
        *mindx = pos;
    }
#endif // TO_SIMD_SFX
    return 0;
}


#line 291
// Choose the SIMD suffix for the LONG kernels that follow.
// One arm per lane width (8/16/32/64) is tested against NPY_BITSOF_LONG and
// requires NPY_SIMD; inside each arm the "#if 0" and "#elif 0" tests both
// fail, so the "#else" selects the signed (_sN) suffix. If no arm matches,
// TO_SIMD_SFX stays undefined and the kernels, which test it with #ifdef,
// compile their scalar loop instead.
#undef TO_SIMD_SFX
#if 0
#line 296
#elif NPY_SIMD && NPY_BITSOF_LONG == 8
    // Inner chain: float suffix (#if), unsigned suffix (#elif), signed suffix (#else).
    #if 0
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_LONG == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_LONG == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_LONG == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 318
/*
 * Argmax over `n` npy_long values starting at `ip`; the index is stored in
 * `*mindx` and the function always returns 0. The `aip` argument is unused.
 *
 * With TO_SIMD_SFX defined the work is delegated to the matching simd_argmax
 * kernel; otherwise a scalar scan keeps the first index holding the largest
 * value (ties do not move the index). The scalar scan reads `ip[0]`
 * unconditionally.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_argmax)
(npy_long *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_long largest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_long cur = ip[pos];
        if (cur <= largest) {
            continue;  // not a new maximum; the earlier index wins ties
        }
        largest = cur;
        *mindx = pos;
    }
#endif // TO_SIMD_SFX
    return 0;
}

#line 318
/*
 * Argmin over `n` npy_long values starting at `ip`; the index is stored in
 * `*mindx` and the function always returns 0. The `aip` argument is unused.
 *
 * With TO_SIMD_SFX defined the work is delegated to the matching simd_argmin
 * kernel; otherwise a scalar scan keeps the first index holding the smallest
 * value (ties do not move the index). The scalar scan reads `ip[0]`
 * unconditionally.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_argmin)
(npy_long *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_long smallest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_long cur = ip[pos];
        if (cur >= smallest) {
            continue;  // not a new minimum; the earlier index wins ties
        }
        smallest = cur;
        *mindx = pos;
    }
#endif // TO_SIMD_SFX
    return 0;
}


#line 291
// Choose the SIMD suffix for the LONGLONG kernels that follow.
// One arm per lane width (8/16/32/64) is tested against NPY_BITSOF_LONGLONG
// and requires NPY_SIMD; inside each arm the "#if 0" and "#elif 0" tests both
// fail, so the "#else" selects the signed (_sN) suffix. If no arm matches,
// TO_SIMD_SFX stays undefined and the kernels, which test it with #ifdef,
// compile their scalar loop instead.
#undef TO_SIMD_SFX
#if 0
#line 296
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8
    // Inner chain: float suffix (#if), unsigned suffix (#elif), signed suffix (#else).
    #if 0
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 318
/*
 * Argmax over `n` npy_longlong values starting at `ip`; the index is stored
 * in `*mindx` and the function always returns 0. The `aip` argument is unused.
 *
 * With TO_SIMD_SFX defined the work is delegated to the matching simd_argmax
 * kernel; otherwise a scalar scan keeps the first index holding the largest
 * value (ties do not move the index). The scalar scan reads `ip[0]`
 * unconditionally.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_argmax)
(npy_longlong *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_longlong largest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_longlong cur = ip[pos];
        if (cur <= largest) {
            continue;  // not a new maximum; the earlier index wins ties
        }
        largest = cur;
        *mindx = pos;
    }
#endif // TO_SIMD_SFX
    return 0;
}

#line 318
/*
 * Argmin over `n` npy_longlong values starting at `ip`; the index is stored
 * in `*mindx` and the function always returns 0. The `aip` argument is unused.
 *
 * With TO_SIMD_SFX defined the work is delegated to the matching simd_argmin
 * kernel; otherwise a scalar scan keeps the first index holding the smallest
 * value (ties do not move the index). The scalar scan reads `ip[0]`
 * unconditionally.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_argmin)
(npy_longlong *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_longlong smallest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_longlong cur = ip[pos];
        if (cur >= smallest) {
            continue;  // not a new minimum; the earlier index wins ties
        }
        smallest = cur;
        *mindx = pos;
    }
#endif // TO_SIMD_SFX
    return 0;
}


#line 291
// Choose the SIMD suffix for the FLOAT kernels that follow.
// One arm per lane width (8/16/32/64) is tested against NPY_BITSOF_FLOAT and
// requires NPY_SIMD; the "#if 1" inside each arm selects the floating-point
// (_fN) suffix and then removes it again when the width is 64 without
// NPY_SIMD_F64 or 32 without NPY_SIMD_F32. If TO_SIMD_SFX ends up undefined,
// the kernels, which test it with #ifdef, compile their scalar loop instead.
#undef TO_SIMD_SFX
#if 0
#line 296
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 8
    // Inner chain: float suffix (#if), unsigned suffix (#elif), signed suffix (#else).
    #if 1
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_FLOAT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_FLOAT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 16
    #if 1
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_FLOAT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_FLOAT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 32
    #if 1
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_FLOAT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_FLOAT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 64
    #if 1
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_FLOAT == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_FLOAT == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 318
/*
 * NaN-aware argmax over `n` npy_float values starting at `ip`; the index is
 * stored in `*mindx` and the function always returns 0. The `aip` argument is
 * unused.
 *
 * A NaN in `ip[0]` yields index 0 immediately. Otherwise, with TO_SIMD_SFX
 * defined the work is delegated to the matching simd_argmax kernel; without
 * it a scalar scan keeps the first index holding the largest value and stops
 * at the first NaN it meets, reporting that position.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_argmax)
(npy_float *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
    // A leading NaN settles the answer at index 0.
    if (npy_isnan(ip[0])) {
        *mindx = 0;
        return 0;
    }
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_float largest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_float cur = ip[pos];
        if (cur <= largest) {
            continue;  // ordered and not larger; the earlier index wins ties
        }
        // Reached when cur is larger, or when the comparison is false
        // because cur is NaN.
        largest = cur;
        *mindx = pos;
        if (npy_isnan(largest)) {
            break;  // nan encountered, it's maximal|minimal
        }
    }
#endif // TO_SIMD_SFX
    return 0;
}

#line 318
/*
 * NaN-aware argmin over `n` npy_float values starting at `ip`; the index is
 * stored in `*mindx` and the function always returns 0. The `aip` argument is
 * unused.
 *
 * A NaN in `ip[0]` yields index 0 immediately. Otherwise, with TO_SIMD_SFX
 * defined the work is delegated to the matching simd_argmin kernel; without
 * it a scalar scan keeps the first index holding the smallest value and stops
 * at the first NaN it meets, reporting that position.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_argmin)
(npy_float *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
    // A leading NaN settles the answer at index 0.
    if (npy_isnan(ip[0])) {
        *mindx = 0;
        return 0;
    }
#ifdef TO_SIMD_SFX
    const npy_intp simd_idx =
        TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    *mindx = simd_idx;
    npyv_cleanup();
#else
    npy_float smallest = ip[0];
    *mindx = 0;

    for (npy_intp pos = 1; pos < n; ++pos) {
        const npy_float cur = ip[pos];
        if (cur >= smallest) {
            continue;  // ordered and not smaller; the earlier index wins ties
        }
        // Reached when cur is smaller, or when the comparison is false
        // because cur is NaN.
        smallest = cur;
        *mindx = pos;
        if (npy_isnan(smallest)) {
            break;  // nan encountered, it's maximal|minimal
        }
    }
#endif // TO_SIMD_SFX
    return 0;
}


#line 291
// Choose the SIMD suffix for the DOUBLE kernels that follow.
// One arm per lane width (8/16/32/64) is tested against NPY_BITSOF_DOUBLE and
// requires NPY_SIMD; the "#if 1" inside each arm selects the floating-point
// (_fN) suffix and then removes it again when the width is 64 without
// NPY_SIMD_F64 or 32 without NPY_SIMD_F32. If TO_SIMD_SFX ends up undefined,
// the kernels, which test it with #ifdef, compile their scalar loop instead.
#undef TO_SIMD_SFX
#if 0
#line 296
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 8
    // Inner chain: float suffix (#if), unsigned suffix (#elif), signed suffix (#else).
    #if 1
        #define TO_SIMD_SFX(X) X##_f8
        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u8
    #else
        #define TO_SIMD_SFX(X) X##_s8
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 16
    #if 1
        #define TO_SIMD_SFX(X) X##_f16
        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u16
    #else
        #define TO_SIMD_SFX(X) X##_s16
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 32
    #if 1
        #define TO_SIMD_SFX(X) X##_f32
        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u32
    #else
        #define TO_SIMD_SFX(X) X##_s32
    #endif

#line 296
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 64
    #if 1
        #define TO_SIMD_SFX(X) X##_f64
        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif 0
        #define TO_SIMD_SFX(X) X##_u64
    #else
        #define TO_SIMD_SFX(X) X##_s64
    #endif

#endif

#line 318
/*
 * DOUBLE_argmax: store in *mindx the index of the maximum among the n
 * doubles starting at ip. Always returns 0.
 *
 * NaN is treated as the maximum: a NaN in ip[0] yields index 0 immediately,
 * and the scalar loop stops at the first NaN it meets. In the scalar loop a
 * later element only replaces the current best when it is strictly greater
 * (or NaN), so ties keep the earliest index. ip[0] is read unconditionally;
 * callers presumably guarantee n >= 1.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_argmax)
(npy_double *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
    // A leading NaN already decides the result.
    if (npy_isnan(ip[0])) {
        *mindx = 0;
        return 0;
    }
#ifdef TO_SIMD_SFX
    // Hand the whole buffer to the vector kernel for the selected lane type.
    TO_SIMD_SFX(npyv_lanetype) *lanes = (TO_SIMD_SFX(npyv_lanetype)*)ip;
    *mindx = TO_SIMD_SFX(simd_argmax)(lanes, n);
    npyv_cleanup();
#else
    npy_double best = ip[0];
    npy_intp best_idx = 0;

    for (npy_intp k = 1; k < n; ++k) {
        const npy_double cur = ip[k];
        // `cur <= best` is false for NaN, so a NaN falls through and wins.
        if (cur <= best) {
            continue;
        }
        best = cur;
        best_idx = k;
        if (npy_isnan(cur)) {
            // nothing can beat a NaN; stop scanning
            break;
        }
    }
    *mindx = best_idx;
#endif // TO_SIMD_SFX
    return 0;
}

#line 318
/*
 * DOUBLE_argmin: store in *mindx the index of the minimum among the n
 * doubles starting at ip. Always returns 0.
 *
 * NaN is treated as the minimum: a NaN in ip[0] yields index 0 immediately,
 * and the scalar loop stops at the first NaN it meets. In the scalar loop a
 * later element only replaces the current best when it is strictly smaller
 * (or NaN), so ties keep the earliest index. ip[0] is read unconditionally;
 * callers presumably guarantee n >= 1.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_argmin)
(npy_double *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
    // A leading NaN already decides the result.
    if (npy_isnan(ip[0])) {
        *mindx = 0;
        return 0;
    }
#ifdef TO_SIMD_SFX
    // Hand the whole buffer to the vector kernel for the selected lane type.
    TO_SIMD_SFX(npyv_lanetype) *lanes = (TO_SIMD_SFX(npyv_lanetype)*)ip;
    *mindx = TO_SIMD_SFX(simd_argmin)(lanes, n);
    npyv_cleanup();
#else
    npy_double best = ip[0];
    npy_intp best_idx = 0;

    for (npy_intp k = 1; k < n; ++k) {
        const npy_double cur = ip[k];
        // `cur >= best` is false for NaN, so a NaN falls through and wins.
        if (cur >= best) {
            continue;
        }
        best = cur;
        best_idx = k;
        if (npy_isnan(cur)) {
            // nothing can beat a NaN; stop scanning
            break;
        }
    }
    *mindx = best_idx;
#endif // TO_SIMD_SFX
    return 0;
}


#line 291
/*
 * Pick the SIMD lane suffix used for npy_longdouble.
 *
 * TO_SIMD_SFX(X) pastes a floating-point lane suffix whose bit width equals
 * NPY_BITSOF_LONGDOUBLE onto X. It stays undefined when NPY_SIMD is off,
 * when the width is not one of 8/16/32/64, or when the matching 32/64-bit
 * float lanes are not available (NPY_SIMD_F32 / NPY_SIMD_F64 evaluate to 0);
 * code that tests `#ifdef TO_SIMD_SFX` then takes its non-SIMD branch.
 */
#undef TO_SIMD_SFX
#if NPY_SIMD
    #if NPY_BITSOF_LONGDOUBLE == 8
        #define TO_SIMD_SFX(X) X##_f8
    #elif NPY_BITSOF_LONGDOUBLE == 16
        #define TO_SIMD_SFX(X) X##_f16
    #elif NPY_BITSOF_LONGDOUBLE == 32 && NPY_SIMD_F32
        #define TO_SIMD_SFX(X) X##_f32
    #elif NPY_BITSOF_LONGDOUBLE == 64 && NPY_SIMD_F64
        #define TO_SIMD_SFX(X) X##_f64
    #endif
#endif

#line 318
/*
 * LONGDOUBLE_argmax: store in *mindx the index of the maximum among the n
 * long doubles starting at ip. Always returns 0.
 *
 * NaN is treated as the maximum: a NaN in ip[0] yields index 0 immediately,
 * and the scalar loop stops at the first NaN it meets. In the scalar loop a
 * later element only replaces the current best when it is strictly greater
 * (or NaN), so ties keep the earliest index. ip[0] is read unconditionally;
 * callers presumably guarantee n >= 1.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_argmax)
(npy_longdouble *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
    // A leading NaN already decides the result.
    if (npy_isnan(ip[0])) {
        *mindx = 0;
        return 0;
    }
#ifdef TO_SIMD_SFX
    // Hand the whole buffer to the vector kernel for the selected lane type.
    TO_SIMD_SFX(npyv_lanetype) *lanes = (TO_SIMD_SFX(npyv_lanetype)*)ip;
    *mindx = TO_SIMD_SFX(simd_argmax)(lanes, n);
    npyv_cleanup();
#else
    npy_longdouble best = ip[0];
    npy_intp best_idx = 0;

    for (npy_intp k = 1; k < n; ++k) {
        const npy_longdouble cur = ip[k];
        // `cur <= best` is false for NaN, so a NaN falls through and wins.
        if (cur <= best) {
            continue;
        }
        best = cur;
        best_idx = k;
        if (npy_isnan(cur)) {
            // nothing can beat a NaN; stop scanning
            break;
        }
    }
    *mindx = best_idx;
#endif // TO_SIMD_SFX
    return 0;
}

#line 318
/*
 * LONGDOUBLE_argmin: store in *mindx the index of the minimum among the n
 * long doubles starting at ip. Always returns 0.
 *
 * NaN is treated as the minimum: a NaN in ip[0] yields index 0 immediately,
 * and the scalar loop stops at the first NaN it meets. In the scalar loop a
 * later element only replaces the current best when it is strictly smaller
 * (or NaN), so ties keep the earliest index. ip[0] is read unconditionally;
 * callers presumably guarantee n >= 1.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_argmin)
(npy_longdouble *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
    // A leading NaN already decides the result.
    if (npy_isnan(ip[0])) {
        *mindx = 0;
        return 0;
    }
#ifdef TO_SIMD_SFX
    // Hand the whole buffer to the vector kernel for the selected lane type.
    TO_SIMD_SFX(npyv_lanetype) *lanes = (TO_SIMD_SFX(npyv_lanetype)*)ip;
    *mindx = TO_SIMD_SFX(simd_argmin)(lanes, n);
    npyv_cleanup();
#else
    npy_longdouble best = ip[0];
    npy_intp best_idx = 0;

    for (npy_intp k = 1; k < n; ++k) {
        const npy_longdouble cur = ip[k];
        // `cur >= best` is false for NaN, so a NaN falls through and wins.
        if (cur >= best) {
            continue;
        }
        best = cur;
        best_idx = k;
        if (npy_isnan(cur)) {
            // nothing can beat a NaN; stop scanning
            break;
        }
    }
    *mindx = best_idx;
#endif // TO_SIMD_SFX
    return 0;
}



/*
 * BOOL_argmax: store in *mindx the index of the first non-zero byte among
 * the len booleans at ip, or 0 when every byte is zero. Always returns 0.
 *
 * With SIMD enabled, the input is first scanned in blocks of four vectors;
 * the scan stops at the first block that contains a non-zero byte and the
 * scalar loop then pinpoints the exact position starting from that block.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax)
(npy_bool *ip, npy_intp len, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))

{
    npy_intp pos = 0;
#if NPY_SIMD
    const int lanes = npyv_nlanes_u8;
    const int block = lanes * 4;
    // Largest multiple of the block size that fits in len.
    const npy_intp simd_end = len & -block;
    const npyv_u8 vzero = npyv_zero_u8();

    for (; pos < simd_end; pos += block) {
        npy_bool *chunk = ip + pos;
        // Per-lane "is zero" masks for each of the four vectors.
        npyv_b8 z0 = npyv_cmpeq_u8(npyv_load_u8(chunk + lanes*0), vzero);
        npyv_b8 z1 = npyv_cmpeq_u8(npyv_load_u8(chunk + lanes*1), vzero);
        npyv_b8 z2 = npyv_cmpeq_u8(npyv_load_u8(chunk + lanes*2), vzero);
        npyv_b8 z3 = npyv_cmpeq_u8(npyv_load_u8(chunk + lanes*3), vzero);
        // A lane stays set only if it is zero in all four vectors.
        npyv_b8 z01 = npyv_and_b8(z0, z1);
        npyv_b8 z23 = npyv_and_b8(z2, z3);
        npy_uint64 bits = npyv_tobits_b8(npyv_and_b8(z01, z23));
        // Any cleared bit means the block holds a non-zero byte.
    #if NPY_SIMD == 512
        const int has_nonzero = (bits != NPY_MAX_UINT64);
    #else
        const int has_nonzero = ((npy_int64)bits != ((1LL << lanes) - 1));
    #endif
        if (has_nonzero) {
            break;
        }
    }
    npyv_cleanup();
#endif // NPY_SIMD
    // Scalar scan: covers the tail and locates the hit inside a flagged block.
    while (pos < len && !ip[pos]) {
        ++pos;
    }
    *mindx = (pos < len) ? pos : 0;
    return 0;
}

