#line 1 "numpy/core/src/umath/loops_comparison.dispatch.c.src"

/*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 **       Changes should be made to the original source (.src) file         **
 *****************************************************************************
 */

#line 1
/*@targets
 ** $maxopt baseline
 ** sse2 sse42 avx2 avx512f avx512_skx
 ** vsx2 vsx3
 ** neon
 ** vx vxe
 **/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

/********************************************************************************
 ** Defining the SIMD kernels
 ********************************************************************************/
#line 28
#line 35
#if NPY_SIMD && !((1 || 0) && 0)
/*
 * Element-wise `in1 == in2` on unsigned 8-bit data.
 *
 * args[0], args[1]: the two input buffers, read as consecutive elements.
 * args[2]:          the output buffer, one byte written per element.
 * len:              number of elements to process.
 *
 * With 8-bit lanes a single compare already fills a whole result vector, so
 * the mask is converted and stored directly without any packing step. The
 * mask is ANDed with 0x01 before the store, which is expected to yield the
 * same 0/1 bytes that the scalar tail writes.
 */
static void simd_binary_equal_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *lhs = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *rhs = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const int lanes       = npyv_nlanes_u8;

    // Vector body: one full vector of elements per iteration.
    while (len >= lanes) {
        npyv_u8 va   = npyv_load_u8(lhs);
        npyv_u8 vb   = npyv_load_u8(rhs);
        npyv_b8 mask = npyv_cmpeq_u8(va, vb);
        npyv_u8 bits = npyv_cvt_u8_b8(mask);
        npyv_store_u8(out, npyv_and_u8(bits, one));

        lhs += lanes;
        rhs += lanes;
        out += lanes;
        len -= lanes;
    }

    // Scalar tail for the remaining (fewer than `lanes`) elements.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] == rhs[i];
    }
}

/*
 * `scalar == in2[i]` on unsigned 8-bit data, with the scalar as the first
 * operand.
 *
 * args[0]: address of the single scalar value (read once, up front).
 * args[1]: the input buffer, read as consecutive elements.
 * args[2]: the output buffer, one byte written per element.
 * len:     number of elements to process.
 *
 * The compare mask is ANDed with 0x01 before the store, which is expected to
 * yield the same 0/1 bytes that the scalar tail writes.
 */
static void simd_binary_scalar1_equal_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *in    = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int lanes         = npyv_nlanes_u8;

    // Vector body: compare the broadcast scalar against one vector of input.
    while (len >= lanes) {
        npyv_u8 vin  = npyv_load_u8(in);
        npyv_b8 mask = npyv_cmpeq_u8(vscalar, vin);
        npyv_u8 bits = npyv_cvt_u8_b8(mask);
        npyv_store_u8(out, npyv_and_u8(bits, one));

        in  += lanes;
        out += lanes;
        len -= lanes;
    }

    // Scalar tail for the remaining (fewer than `lanes`) elements.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = scalar == in[i];
    }
}

/*
 * `in1[i] == scalar` on unsigned 8-bit data, with the scalar as the second
 * operand.
 *
 * args[0]: the input buffer, read as consecutive elements.
 * args[1]: address of the single scalar value (read once, up front).
 * args[2]: the output buffer, one byte written per element.
 * len:     number of elements to process.
 *
 * The compare mask is ANDed with 0x01 before the store, which is expected to
 * yield the same 0/1 bytes that the scalar tail writes.
 */
static void simd_binary_scalar2_equal_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *in    = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int lanes         = npyv_nlanes_u8;

    // Vector body: compare one vector of input against the broadcast scalar.
    while (len >= lanes) {
        npyv_u8 vin  = npyv_load_u8(in);
        npyv_b8 mask = npyv_cmpeq_u8(vin, vscalar);
        npyv_u8 bits = npyv_cvt_u8_b8(mask);
        npyv_store_u8(out, npyv_and_u8(bits, one));

        in  += lanes;
        out += lanes;
        len -= lanes;
    }

    // Scalar tail for the remaining (fewer than `lanes`) elements.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] == scalar;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 1) && 0)
/*
 * Element-wise `in1 != in2` on unsigned 8-bit data.
 *
 * args[0], args[1]: the two input buffers, read as consecutive elements.
 * args[2]:          the output buffer, one byte written per element.
 * len:              number of elements to process.
 *
 * With 8-bit lanes a single compare already fills a whole result vector, so
 * the mask is converted and stored directly without any packing step. The
 * mask is ANDed with 0x01 before the store, which is expected to yield the
 * same 0/1 bytes that the scalar tail writes.
 */
static void simd_binary_not_equal_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *lhs = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *rhs = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const int lanes       = npyv_nlanes_u8;

    // Vector body: one full vector of elements per iteration.
    while (len >= lanes) {
        npyv_u8 va   = npyv_load_u8(lhs);
        npyv_u8 vb   = npyv_load_u8(rhs);
        npyv_b8 mask = npyv_cmpneq_u8(va, vb);
        npyv_u8 bits = npyv_cvt_u8_b8(mask);
        npyv_store_u8(out, npyv_and_u8(bits, one));

        lhs += lanes;
        rhs += lanes;
        out += lanes;
        len -= lanes;
    }

    // Scalar tail for the remaining (fewer than `lanes`) elements.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] != rhs[i];
    }
}

/*
 * `scalar != in2[i]` on unsigned 8-bit data, with the scalar as the first
 * operand.
 *
 * args[0]: address of the single scalar value (read once, up front).
 * args[1]: the input buffer, read as consecutive elements.
 * args[2]: the output buffer, one byte written per element.
 * len:     number of elements to process.
 *
 * The compare mask is ANDed with 0x01 before the store, which is expected to
 * yield the same 0/1 bytes that the scalar tail writes.
 */
static void simd_binary_scalar1_not_equal_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *in    = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int lanes         = npyv_nlanes_u8;

    // Vector body: compare the broadcast scalar against one vector of input.
    while (len >= lanes) {
        npyv_u8 vin  = npyv_load_u8(in);
        npyv_b8 mask = npyv_cmpneq_u8(vscalar, vin);
        npyv_u8 bits = npyv_cvt_u8_b8(mask);
        npyv_store_u8(out, npyv_and_u8(bits, one));

        in  += lanes;
        out += lanes;
        len -= lanes;
    }

    // Scalar tail for the remaining (fewer than `lanes`) elements.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = scalar != in[i];
    }
}

/*
 * `in1[i] != scalar` on unsigned 8-bit data, with the scalar as the second
 * operand.
 *
 * args[0]: the input buffer, read as consecutive elements.
 * args[1]: address of the single scalar value (read once, up front).
 * args[2]: the output buffer, one byte written per element.
 * len:     number of elements to process.
 *
 * The compare mask is ANDed with 0x01 before the store, which is expected to
 * yield the same 0/1 bytes that the scalar tail writes.
 */
static void simd_binary_scalar2_not_equal_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *in    = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int lanes         = npyv_nlanes_u8;

    // Vector body: compare one vector of input against the broadcast scalar.
    while (len >= lanes) {
        npyv_u8 vin  = npyv_load_u8(in);
        npyv_b8 mask = npyv_cmpneq_u8(vin, vscalar);
        npyv_u8 bits = npyv_cvt_u8_b8(mask);
        npyv_store_u8(out, npyv_and_u8(bits, one));

        in  += lanes;
        out += lanes;
        len -= lanes;
    }

    // Scalar tail for the remaining (fewer than `lanes`) elements.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] != scalar;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 0) && 0)
/*
 * Element-wise `in1 < in2` on unsigned 8-bit data.
 *
 * args[0], args[1]: the two input buffers, read as consecutive elements.
 * args[2]:          the output buffer, one byte written per element.
 * len:              number of elements to process.
 *
 * With 8-bit lanes a single compare already fills a whole result vector, so
 * the mask is converted and stored directly without any packing step. The
 * mask is ANDed with 0x01 before the store, which is expected to yield the
 * same 0/1 bytes that the scalar tail writes.
 */
static void simd_binary_less_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *lhs = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *rhs = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const int lanes       = npyv_nlanes_u8;

    // Vector body: one full vector of elements per iteration.
    while (len >= lanes) {
        npyv_u8 va   = npyv_load_u8(lhs);
        npyv_u8 vb   = npyv_load_u8(rhs);
        npyv_b8 mask = npyv_cmplt_u8(va, vb);
        npyv_u8 bits = npyv_cvt_u8_b8(mask);
        npyv_store_u8(out, npyv_and_u8(bits, one));

        lhs += lanes;
        rhs += lanes;
        out += lanes;
        len -= lanes;
    }

    // Scalar tail for the remaining (fewer than `lanes`) elements.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] < rhs[i];
    }
}

/*
 * `scalar < in2[i]` on unsigned 8-bit data, with the scalar as the first
 * operand.
 *
 * args[0]: address of the single scalar value (read once, up front).
 * args[1]: the input buffer, read as consecutive elements.
 * args[2]: the output buffer, one byte written per element.
 * len:     number of elements to process.
 *
 * The compare mask is ANDed with 0x01 before the store, which is expected to
 * yield the same 0/1 bytes that the scalar tail writes.
 */
static void simd_binary_scalar1_less_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *in    = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int lanes         = npyv_nlanes_u8;

    // Vector body: compare the broadcast scalar against one vector of input.
    while (len >= lanes) {
        npyv_u8 vin  = npyv_load_u8(in);
        npyv_b8 mask = npyv_cmplt_u8(vscalar, vin);
        npyv_u8 bits = npyv_cvt_u8_b8(mask);
        npyv_store_u8(out, npyv_and_u8(bits, one));

        in  += lanes;
        out += lanes;
        len -= lanes;
    }

    // Scalar tail for the remaining (fewer than `lanes`) elements.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = scalar < in[i];
    }
}

/*
 * `in1[i] < scalar` on unsigned 8-bit data, with the scalar as the second
 * operand.
 *
 * args[0]: the input buffer, read as consecutive elements.
 * args[1]: address of the single scalar value (read once, up front).
 * args[2]: the output buffer, one byte written per element.
 * len:     number of elements to process.
 *
 * The compare mask is ANDed with 0x01 before the store, which is expected to
 * yield the same 0/1 bytes that the scalar tail writes.
 */
static void simd_binary_scalar2_less_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *in    = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int lanes         = npyv_nlanes_u8;

    // Vector body: compare one vector of input against the broadcast scalar.
    while (len >= lanes) {
        npyv_u8 vin  = npyv_load_u8(in);
        npyv_b8 mask = npyv_cmplt_u8(vin, vscalar);
        npyv_u8 bits = npyv_cvt_u8_b8(mask);
        npyv_store_u8(out, npyv_and_u8(bits, one));

        in  += lanes;
        out += lanes;
        len -= lanes;
    }

    // Scalar tail for the remaining (fewer than `lanes`) elements.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] < scalar;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 0) && 0)
/*
 * Element-wise `in1 <= in2` on unsigned 8-bit data.
 *
 * args[0], args[1]: the two input buffers, read as consecutive elements.
 * args[2]:          the output buffer, one byte written per element.
 * len:              number of elements to process.
 *
 * With 8-bit lanes a single compare already fills a whole result vector, so
 * the mask is converted and stored directly without any packing step. The
 * mask is ANDed with 0x01 before the store, which is expected to yield the
 * same 0/1 bytes that the scalar tail writes.
 */
static void simd_binary_less_equal_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *lhs = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *rhs = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const int lanes       = npyv_nlanes_u8;

    // Vector body: one full vector of elements per iteration.
    while (len >= lanes) {
        npyv_u8 va   = npyv_load_u8(lhs);
        npyv_u8 vb   = npyv_load_u8(rhs);
        npyv_b8 mask = npyv_cmple_u8(va, vb);
        npyv_u8 bits = npyv_cvt_u8_b8(mask);
        npyv_store_u8(out, npyv_and_u8(bits, one));

        lhs += lanes;
        rhs += lanes;
        out += lanes;
        len -= lanes;
    }

    // Scalar tail for the remaining (fewer than `lanes`) elements.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] <= rhs[i];
    }
}

/*
 * `scalar <= in2[i]` on unsigned 8-bit data, with the scalar as the first
 * operand.
 *
 * args[0]: address of the single scalar value (read once, up front).
 * args[1]: the input buffer, read as consecutive elements.
 * args[2]: the output buffer, one byte written per element.
 * len:     number of elements to process.
 *
 * The compare mask is ANDed with 0x01 before the store, which is expected to
 * yield the same 0/1 bytes that the scalar tail writes.
 */
static void simd_binary_scalar1_less_equal_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *in    = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int lanes         = npyv_nlanes_u8;

    // Vector body: compare the broadcast scalar against one vector of input.
    while (len >= lanes) {
        npyv_u8 vin  = npyv_load_u8(in);
        npyv_b8 mask = npyv_cmple_u8(vscalar, vin);
        npyv_u8 bits = npyv_cvt_u8_b8(mask);
        npyv_store_u8(out, npyv_and_u8(bits, one));

        in  += lanes;
        out += lanes;
        len -= lanes;
    }

    // Scalar tail for the remaining (fewer than `lanes`) elements.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = scalar <= in[i];
    }
}

/*
 * `in1[i] <= scalar` on unsigned 8-bit data, with the scalar as the second
 * operand.
 *
 * args[0]: the input buffer, read as consecutive elements.
 * args[1]: address of the single scalar value (read once, up front).
 * args[2]: the output buffer, one byte written per element.
 * len:     number of elements to process.
 *
 * The compare mask is ANDed with 0x01 before the store, which is expected to
 * yield the same 0/1 bytes that the scalar tail writes.
 */
static void simd_binary_scalar2_less_equal_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *in    = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int lanes         = npyv_nlanes_u8;

    // Vector body: compare one vector of input against the broadcast scalar.
    while (len >= lanes) {
        npyv_u8 vin  = npyv_load_u8(in);
        npyv_b8 mask = npyv_cmple_u8(vin, vscalar);
        npyv_u8 bits = npyv_cvt_u8_b8(mask);
        npyv_store_u8(out, npyv_and_u8(bits, one));

        in  += lanes;
        out += lanes;
        len -= lanes;
    }

    // Scalar tail for the remaining (fewer than `lanes`) elements.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] <= scalar;
    }
}
#endif



#line 28
#line 35
#if NPY_SIMD && !((1 || 0) && 1)
/*
 * Element-wise `in1 == in2` on signed 8-bit data.
 *
 * args[0], args[1]: the two input buffers, read as consecutive s8 elements.
 * args[2]:          the output buffer, one unsigned byte written per element.
 * len:              number of elements to process.
 *
 * With 8-bit lanes a single compare already fills a whole result vector, so
 * the mask is converted and stored directly without any packing step. The
 * mask is ANDed with 0x01 before the store, which is expected to yield the
 * same 0/1 bytes that the scalar tail writes.
 */
static void simd_binary_equal_s8(char **args, npy_intp len)
{
    npyv_lanetype_s8 *lhs = (npyv_lanetype_s8 *) args[0];
    npyv_lanetype_s8 *rhs = (npyv_lanetype_s8 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const int lanes       = npyv_nlanes_u8;

    // Vector body: one full vector of elements per iteration.
    while (len >= lanes) {
        npyv_s8 va   = npyv_load_s8(lhs);
        npyv_s8 vb   = npyv_load_s8(rhs);
        npyv_b8 mask = npyv_cmpeq_s8(va, vb);
        npyv_u8 bits = npyv_cvt_u8_b8(mask);
        npyv_store_u8(out, npyv_and_u8(bits, one));

        lhs += lanes;
        rhs += lanes;
        out += lanes;
        len -= lanes;
    }

    // Scalar tail for the remaining (fewer than `lanes`) elements.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] == rhs[i];
    }
}

/*
 * `scalar == in2[i]` on signed 8-bit data, with the scalar as the first
 * operand.
 *
 * args[0]: address of the single s8 scalar value (read once, up front).
 * args[1]: the input buffer, read as consecutive s8 elements.
 * args[2]: the output buffer, one unsigned byte written per element.
 * len:     number of elements to process.
 *
 * The compare mask is ANDed with 0x01 before the store, which is expected to
 * yield the same 0/1 bytes that the scalar tail writes.
 */
static void simd_binary_scalar1_equal_s8(char **args, npy_intp len)
{
    npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[0];
    npyv_lanetype_s8 *in    = (npyv_lanetype_s8 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_s8 vscalar   = npyv_setall_s8(scalar);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int lanes         = npyv_nlanes_u8;

    // Vector body: compare the broadcast scalar against one vector of input.
    while (len >= lanes) {
        npyv_s8 vin  = npyv_load_s8(in);
        npyv_b8 mask = npyv_cmpeq_s8(vscalar, vin);
        npyv_u8 bits = npyv_cvt_u8_b8(mask);
        npyv_store_u8(out, npyv_and_u8(bits, one));

        in  += lanes;
        out += lanes;
        len -= lanes;
    }

    // Scalar tail for the remaining (fewer than `lanes`) elements.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = scalar == in[i];
    }
}

static void simd_binary_scalar2_equal_s8(char **args, npy_intp len)
{
    /*
     * Computes `in[i] == scalar` for `len` signed 8-bit elements.
     *   args[0]: input array (left operand)
     *   args[1]: pointer to the scalar (right operand)
     *   args[2]: output array, one npyv_lanetype_u8 per element
     */
    npyv_lanetype_s8 *in       = (npyv_lanetype_s8 *) args[0];
    const npyv_lanetype_s8 rhs = *(npyv_lanetype_s8 *) args[1];
    npyv_lanetype_u8 *out      = (npyv_lanetype_u8 *) args[2];
    const npyv_s8 vrhs         = npyv_setall_s8(rhs);
    const npyv_u8 one          = npyv_setall_u8(0x1);
    const int step             = npyv_nlanes_u8;

    // Vector part: a single compare produces one output vector per pass.
    while (len >= step) {
        npyv_s8 vlhs = npyv_load_s8(in);
        npyv_b8 hit  = npyv_cmpeq_s8(vlhs, vrhs);
        // Turn the boolean vector into u8 lanes, then AND with 0x1.
        npyv_u8 res  = npyv_cvt_u8_b8(hit);
        npyv_store_u8(out, npyv_and_u8(res, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for whatever did not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] == rhs;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 1) && 1)
static void simd_binary_not_equal_s8(char **args, npy_intp len)
{
    /*
     * Computes `lhs[i] != rhs[i]` for `len` signed 8-bit elements.
     *   args[0], args[1]: the two input arrays
     *   args[2]:          output array, one npyv_lanetype_u8 per element
     */
    npyv_lanetype_s8 *lhs = (npyv_lanetype_s8 *) args[0];
    npyv_lanetype_s8 *rhs = (npyv_lanetype_s8 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const int step        = npyv_nlanes_u8;

    // Vector part: a single compare produces one output vector per pass.
    while (len >= step) {
        npyv_s8 va   = npyv_load_s8(lhs);
        npyv_s8 vb   = npyv_load_s8(rhs);
        npyv_b8 diff = npyv_cmpneq_s8(va, vb);
        // Turn the boolean vector into u8 lanes, then AND with 0x1.
        npyv_u8 res  = npyv_cvt_u8_b8(diff);
        npyv_store_u8(out, npyv_and_u8(res, one));

        lhs += step;
        rhs += step;
        out += step;
        len -= step;
    }

    // Scalar tail for whatever did not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] != rhs[i];
    }
}

static void simd_binary_scalar1_not_equal_s8(char **args, npy_intp len)
{
    /*
     * Computes `scalar != in[i]` for `len` signed 8-bit elements.
     *   args[0]: pointer to the scalar (left operand)
     *   args[1]: input array (right operand)
     *   args[2]: output array, one npyv_lanetype_u8 per element
     */
    const npyv_lanetype_s8 lhs = *(npyv_lanetype_s8 *) args[0];
    npyv_lanetype_s8 *in       = (npyv_lanetype_s8 *) args[1];
    npyv_lanetype_u8 *out      = (npyv_lanetype_u8 *) args[2];
    const npyv_s8 vlhs         = npyv_setall_s8(lhs);
    const npyv_u8 one          = npyv_setall_u8(0x1);
    const int step             = npyv_nlanes_u8;

    // Vector part: a single compare produces one output vector per pass.
    while (len >= step) {
        npyv_s8 vrhs = npyv_load_s8(in);
        npyv_b8 diff = npyv_cmpneq_s8(vlhs, vrhs);
        // Turn the boolean vector into u8 lanes, then AND with 0x1.
        npyv_u8 res  = npyv_cvt_u8_b8(diff);
        npyv_store_u8(out, npyv_and_u8(res, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for whatever did not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs != in[i];
    }
}

static void simd_binary_scalar2_not_equal_s8(char **args, npy_intp len)
{
    /*
     * Computes `in[i] != scalar` for `len` signed 8-bit elements.
     *   args[0]: input array (left operand)
     *   args[1]: pointer to the scalar (right operand)
     *   args[2]: output array, one npyv_lanetype_u8 per element
     */
    npyv_lanetype_s8 *in       = (npyv_lanetype_s8 *) args[0];
    const npyv_lanetype_s8 rhs = *(npyv_lanetype_s8 *) args[1];
    npyv_lanetype_u8 *out      = (npyv_lanetype_u8 *) args[2];
    const npyv_s8 vrhs         = npyv_setall_s8(rhs);
    const npyv_u8 one          = npyv_setall_u8(0x1);
    const int step             = npyv_nlanes_u8;

    // Vector part: a single compare produces one output vector per pass.
    while (len >= step) {
        npyv_s8 vlhs = npyv_load_s8(in);
        npyv_b8 diff = npyv_cmpneq_s8(vlhs, vrhs);
        // Turn the boolean vector into u8 lanes, then AND with 0x1.
        npyv_u8 res  = npyv_cvt_u8_b8(diff);
        npyv_store_u8(out, npyv_and_u8(res, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for whatever did not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] != rhs;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 0) && 1)
static void simd_binary_less_s8(char **args, npy_intp len)
{
    /*
     * Computes `lhs[i] < rhs[i]` for `len` signed 8-bit elements.
     *   args[0], args[1]: the two input arrays
     *   args[2]:          output array, one npyv_lanetype_u8 per element
     */
    npyv_lanetype_s8 *lhs = (npyv_lanetype_s8 *) args[0];
    npyv_lanetype_s8 *rhs = (npyv_lanetype_s8 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const int step        = npyv_nlanes_u8;

    // Vector part: a single compare produces one output vector per pass.
    while (len >= step) {
        npyv_s8 va  = npyv_load_s8(lhs);
        npyv_s8 vb  = npyv_load_s8(rhs);
        npyv_b8 lt  = npyv_cmplt_s8(va, vb);
        // Turn the boolean vector into u8 lanes, then AND with 0x1.
        npyv_u8 res = npyv_cvt_u8_b8(lt);
        npyv_store_u8(out, npyv_and_u8(res, one));

        lhs += step;
        rhs += step;
        out += step;
        len -= step;
    }

    // Scalar tail for whatever did not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] < rhs[i];
    }
}

static void simd_binary_scalar1_less_s8(char **args, npy_intp len)
{
    /*
     * Computes `scalar < in[i]` for `len` signed 8-bit elements.
     *   args[0]: pointer to the scalar (left operand)
     *   args[1]: input array (right operand)
     *   args[2]: output array, one npyv_lanetype_u8 per element
     */
    const npyv_lanetype_s8 lhs = *(npyv_lanetype_s8 *) args[0];
    npyv_lanetype_s8 *in       = (npyv_lanetype_s8 *) args[1];
    npyv_lanetype_u8 *out      = (npyv_lanetype_u8 *) args[2];
    const npyv_s8 vlhs         = npyv_setall_s8(lhs);
    const npyv_u8 one          = npyv_setall_u8(0x1);
    const int step             = npyv_nlanes_u8;

    // Vector part: a single compare produces one output vector per pass.
    while (len >= step) {
        npyv_s8 vrhs = npyv_load_s8(in);
        npyv_b8 lt   = npyv_cmplt_s8(vlhs, vrhs);
        // Turn the boolean vector into u8 lanes, then AND with 0x1.
        npyv_u8 res  = npyv_cvt_u8_b8(lt);
        npyv_store_u8(out, npyv_and_u8(res, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for whatever did not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs < in[i];
    }
}

static void simd_binary_scalar2_less_s8(char **args, npy_intp len)
{
    /*
     * Computes `in[i] < scalar` for `len` signed 8-bit elements.
     *   args[0]: input array (left operand)
     *   args[1]: pointer to the scalar (right operand)
     *   args[2]: output array, one npyv_lanetype_u8 per element
     */
    npyv_lanetype_s8 *in       = (npyv_lanetype_s8 *) args[0];
    const npyv_lanetype_s8 rhs = *(npyv_lanetype_s8 *) args[1];
    npyv_lanetype_u8 *out      = (npyv_lanetype_u8 *) args[2];
    const npyv_s8 vrhs         = npyv_setall_s8(rhs);
    const npyv_u8 one          = npyv_setall_u8(0x1);
    const int step             = npyv_nlanes_u8;

    // Vector part: a single compare produces one output vector per pass.
    while (len >= step) {
        npyv_s8 vlhs = npyv_load_s8(in);
        npyv_b8 lt   = npyv_cmplt_s8(vlhs, vrhs);
        // Turn the boolean vector into u8 lanes, then AND with 0x1.
        npyv_u8 res  = npyv_cvt_u8_b8(lt);
        npyv_store_u8(out, npyv_and_u8(res, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for whatever did not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] < rhs;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 0) && 1)
static void simd_binary_less_equal_s8(char **args, npy_intp len)
{
    /*
     * Computes `lhs[i] <= rhs[i]` for `len` signed 8-bit elements.
     *   args[0], args[1]: the two input arrays
     *   args[2]:          output array, one npyv_lanetype_u8 per element
     */
    npyv_lanetype_s8 *lhs = (npyv_lanetype_s8 *) args[0];
    npyv_lanetype_s8 *rhs = (npyv_lanetype_s8 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const int step        = npyv_nlanes_u8;

    // Vector part: a single compare produces one output vector per pass.
    while (len >= step) {
        npyv_s8 va  = npyv_load_s8(lhs);
        npyv_s8 vb  = npyv_load_s8(rhs);
        npyv_b8 le  = npyv_cmple_s8(va, vb);
        // Turn the boolean vector into u8 lanes, then AND with 0x1.
        npyv_u8 res = npyv_cvt_u8_b8(le);
        npyv_store_u8(out, npyv_and_u8(res, one));

        lhs += step;
        rhs += step;
        out += step;
        len -= step;
    }

    // Scalar tail for whatever did not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] <= rhs[i];
    }
}

static void simd_binary_scalar1_less_equal_s8(char **args, npy_intp len)
{
    /*
     * Computes `scalar <= in[i]` for `len` signed 8-bit elements.
     *   args[0]: pointer to the scalar (left operand)
     *   args[1]: input array (right operand)
     *   args[2]: output array, one npyv_lanetype_u8 per element
     */
    const npyv_lanetype_s8 lhs = *(npyv_lanetype_s8 *) args[0];
    npyv_lanetype_s8 *in       = (npyv_lanetype_s8 *) args[1];
    npyv_lanetype_u8 *out      = (npyv_lanetype_u8 *) args[2];
    const npyv_s8 vlhs         = npyv_setall_s8(lhs);
    const npyv_u8 one          = npyv_setall_u8(0x1);
    const int step             = npyv_nlanes_u8;

    // Vector part: a single compare produces one output vector per pass.
    while (len >= step) {
        npyv_s8 vrhs = npyv_load_s8(in);
        npyv_b8 le   = npyv_cmple_s8(vlhs, vrhs);
        // Turn the boolean vector into u8 lanes, then AND with 0x1.
        npyv_u8 res  = npyv_cvt_u8_b8(le);
        npyv_store_u8(out, npyv_and_u8(res, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for whatever did not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs <= in[i];
    }
}

static void simd_binary_scalar2_less_equal_s8(char **args, npy_intp len)
{
    /*
     * Computes `in[i] <= scalar` for `len` signed 8-bit elements.
     *   args[0]: input array (left operand)
     *   args[1]: pointer to the scalar (right operand)
     *   args[2]: output array, one npyv_lanetype_u8 per element
     */
    npyv_lanetype_s8 *in       = (npyv_lanetype_s8 *) args[0];
    const npyv_lanetype_s8 rhs = *(npyv_lanetype_s8 *) args[1];
    npyv_lanetype_u8 *out      = (npyv_lanetype_u8 *) args[2];
    const npyv_s8 vrhs         = npyv_setall_s8(rhs);
    const npyv_u8 one          = npyv_setall_u8(0x1);
    const int step             = npyv_nlanes_u8;

    // Vector part: a single compare produces one output vector per pass.
    while (len >= step) {
        npyv_s8 vlhs = npyv_load_s8(in);
        npyv_b8 le   = npyv_cmple_s8(vlhs, vrhs);
        // Turn the boolean vector into u8 lanes, then AND with 0x1.
        npyv_u8 res  = npyv_cvt_u8_b8(le);
        npyv_store_u8(out, npyv_and_u8(res, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for whatever did not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] <= rhs;
    }
}
#endif



#line 28
#line 35
#if NPY_SIMD && !((1 || 0) && 0)
static void simd_binary_equal_u16(char **args, npy_intp len)
{
    /*
     * Computes `lhs[i] == rhs[i]` for `len` unsigned 16-bit elements.
     *   args[0], args[1]: the two input arrays
     *   args[2]:          output array, one npyv_lanetype_u8 per element
     */
    npyv_lanetype_u16 *lhs = (npyv_lanetype_u16 *) args[0];
    npyv_lanetype_u16 *rhs = (npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int step         = npyv_nlanes_u8;

    // Vector part: two u16 compares are packed into one u8 output vector.
    while (len >= step) {
        npyv_u16 a_lo = npyv_load_u16(lhs);
        npyv_u16 a_hi = npyv_load_u16(lhs + npyv_nlanes_u16);
        npyv_u16 b_lo = npyv_load_u16(rhs);
        npyv_u16 b_hi = npyv_load_u16(rhs + npyv_nlanes_u16);
        npyv_b16 m_lo = npyv_cmpeq_u16(a_lo, b_lo);
        npyv_b16 m_hi = npyv_cmpeq_u16(a_hi, b_hi);

        // Narrow both boolean vectors into one, convert to u8, AND with 0x1.
        npyv_u8 res = npyv_cvt_u8_b8(npyv_pack_b8_b16(m_lo, m_hi));
        npyv_store_u8(out, npyv_and_u8(res, one));

        lhs += step;
        rhs += step;
        out += step;
        len -= step;
    }

    // Scalar tail for whatever did not fill a whole output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] == rhs[i];
    }
}

static void simd_binary_scalar1_equal_u16(char **args, npy_intp len)
{
    /*
     * Computes `scalar == in[i]` for `len` unsigned 16-bit elements.
     *   args[0]: pointer to the scalar (left operand)
     *   args[1]: input array (right operand)
     *   args[2]: output array, one npyv_lanetype_u8 per element
     */
    const npyv_lanetype_u16 lhs = *(npyv_lanetype_u16 *) args[0];
    npyv_lanetype_u16 *in       = (npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
    const npyv_u16 vlhs         = npyv_setall_u16(lhs);
    const npyv_u8 one           = npyv_setall_u8(0x1);
    const int step              = npyv_nlanes_u8;

    // Vector part: two u16 compares are packed into one u8 output vector.
    while (len >= step) {
        npyv_u16 r_lo = npyv_load_u16(in);
        npyv_u16 r_hi = npyv_load_u16(in + npyv_nlanes_u16);
        npyv_b16 m_lo = npyv_cmpeq_u16(vlhs, r_lo);
        npyv_b16 m_hi = npyv_cmpeq_u16(vlhs, r_hi);

        // Narrow both boolean vectors into one, convert to u8, AND with 0x1.
        npyv_u8 res = npyv_cvt_u8_b8(npyv_pack_b8_b16(m_lo, m_hi));
        npyv_store_u8(out, npyv_and_u8(res, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for whatever did not fill a whole output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs == in[i];
    }
}

static void simd_binary_scalar2_equal_u16(char **args, npy_intp len)
{
    /*
     * Computes `in[i] == scalar` for `len` unsigned 16-bit elements.
     *   args[0]: input array (left operand)
     *   args[1]: pointer to the scalar (right operand)
     *   args[2]: output array, one npyv_lanetype_u8 per element
     */
    npyv_lanetype_u16 *in       = (npyv_lanetype_u16 *) args[0];
    const npyv_lanetype_u16 rhs = *(npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
    const npyv_u16 vrhs         = npyv_setall_u16(rhs);
    const npyv_u8 one           = npyv_setall_u8(0x1);
    const int step              = npyv_nlanes_u8;

    // Vector part: two u16 compares are packed into one u8 output vector.
    while (len >= step) {
        npyv_u16 l_lo = npyv_load_u16(in);
        npyv_u16 l_hi = npyv_load_u16(in + npyv_nlanes_u16);
        npyv_b16 m_lo = npyv_cmpeq_u16(l_lo, vrhs);
        npyv_b16 m_hi = npyv_cmpeq_u16(l_hi, vrhs);

        // Narrow both boolean vectors into one, convert to u8, AND with 0x1.
        npyv_u8 res = npyv_cvt_u8_b8(npyv_pack_b8_b16(m_lo, m_hi));
        npyv_store_u8(out, npyv_and_u8(res, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for whatever did not fill a whole output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] == rhs;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 1) && 0)
static void simd_binary_not_equal_u16(char **args, npy_intp len)
{
    /*
     * Computes `lhs[i] != rhs[i]` for `len` unsigned 16-bit elements.
     *   args[0], args[1]: the two input arrays
     *   args[2]:          output array, one npyv_lanetype_u8 per element
     */
    npyv_lanetype_u16 *lhs = (npyv_lanetype_u16 *) args[0];
    npyv_lanetype_u16 *rhs = (npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int step         = npyv_nlanes_u8;

    // Vector part: two u16 compares are packed into one u8 output vector.
    while (len >= step) {
        npyv_u16 a_lo = npyv_load_u16(lhs);
        npyv_u16 a_hi = npyv_load_u16(lhs + npyv_nlanes_u16);
        npyv_u16 b_lo = npyv_load_u16(rhs);
        npyv_u16 b_hi = npyv_load_u16(rhs + npyv_nlanes_u16);
        npyv_b16 m_lo = npyv_cmpneq_u16(a_lo, b_lo);
        npyv_b16 m_hi = npyv_cmpneq_u16(a_hi, b_hi);

        // Narrow both boolean vectors into one, convert to u8, AND with 0x1.
        npyv_u8 res = npyv_cvt_u8_b8(npyv_pack_b8_b16(m_lo, m_hi));
        npyv_store_u8(out, npyv_and_u8(res, one));

        lhs += step;
        rhs += step;
        out += step;
        len -= step;
    }

    // Scalar tail for whatever did not fill a whole output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] != rhs[i];
    }
}

static void simd_binary_scalar1_not_equal_u16(char **args, npy_intp len)
{
    /*
     * Computes `scalar != in[i]` for `len` unsigned 16-bit elements.
     *   args[0]: pointer to the scalar (left operand)
     *   args[1]: input array (right operand)
     *   args[2]: output array, one npyv_lanetype_u8 per element
     */
    const npyv_lanetype_u16 lhs = *(npyv_lanetype_u16 *) args[0];
    npyv_lanetype_u16 *in       = (npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
    const npyv_u16 vlhs         = npyv_setall_u16(lhs);
    const npyv_u8 one           = npyv_setall_u8(0x1);
    const int step              = npyv_nlanes_u8;

    // Vector part: two u16 compares are packed into one u8 output vector.
    while (len >= step) {
        npyv_u16 r_lo = npyv_load_u16(in);
        npyv_u16 r_hi = npyv_load_u16(in + npyv_nlanes_u16);
        npyv_b16 m_lo = npyv_cmpneq_u16(vlhs, r_lo);
        npyv_b16 m_hi = npyv_cmpneq_u16(vlhs, r_hi);

        // Narrow both boolean vectors into one, convert to u8, AND with 0x1.
        npyv_u8 res = npyv_cvt_u8_b8(npyv_pack_b8_b16(m_lo, m_hi));
        npyv_store_u8(out, npyv_and_u8(res, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for whatever did not fill a whole output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs != in[i];
    }
}

/*
 * Writes `src[i] != scalar` for i in [0, len) into a byte array.
 *
 * args[0] points to the contiguous npyv_lanetype_u16 left-hand array,
 * args[1] to the single npyv_lanetype_u16 right-hand operand and
 * args[2] to the npyv_lanetype_u8 output. Each output byte is 0 or 1.
 */
static void simd_binary_scalar2_not_equal_u16(char **args, npy_intp len)
{
    npyv_lanetype_u16 *lhs          = (npyv_lanetype_u16 *) args[0];
    const npyv_lanetype_u16 rhs_val = *(npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u8 *out           = (npyv_lanetype_u8 *) args[2];
    const npyv_u16 rhs_vec          = npyv_setall_u16(rhs_val);
    const npyv_u8 ones              = npyv_setall_u8(0x1);
    const int chunk                 = npyv_nlanes_u8;

    // Vector path: one pass handles 'chunk' elements, read as two 16-bit
    // vectors whose comparison masks are packed into a single byte vector.
    while (len >= chunk) {
        npyv_u16 lhs0 = npyv_load_u16(lhs);
        npyv_u16 lhs1 = npyv_load_u16(lhs + npyv_nlanes_u16);
        npyv_b16 m0   = npyv_cmpneq_u16(lhs0, rhs_vec);
        npyv_b16 m1   = npyv_cmpneq_u16(lhs1, rhs_vec);
        // AND with 0x1 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(bytes, ones));

        len -= chunk;
        lhs += chunk;
        out += chunk;
    }

    // Scalar path for the elements left over after the vector loop.
    while (len > 0) {
        *out = *lhs != rhs_val;
        --len;
        ++lhs;
        ++out;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 0) && 0)
/*
 * Writes `src1[i] < src2[i]` for i in [0, len) into a byte array.
 *
 * args[0] and args[1] point to the contiguous npyv_lanetype_u16 operand
 * arrays, args[2] to the npyv_lanetype_u8 output. Each output byte is
 * 0 or 1.
 */
static void simd_binary_less_u16(char **args, npy_intp len)
{
    npyv_lanetype_u16 *lhs = (npyv_lanetype_u16 *) args[0];
    npyv_lanetype_u16 *rhs = (npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 ones     = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;

    // Vector path: one pass handles 'chunk' elements per operand, read as
    // two 16-bit vectors whose masks are packed into a single byte vector.
    while (len >= chunk) {
        npyv_u16 lhs0 = npyv_load_u16(lhs);
        npyv_u16 rhs0 = npyv_load_u16(rhs);
        npyv_u16 lhs1 = npyv_load_u16(lhs + npyv_nlanes_u16);
        npyv_u16 rhs1 = npyv_load_u16(rhs + npyv_nlanes_u16);
        npyv_b16 m0   = npyv_cmplt_u16(lhs0, rhs0);
        npyv_b16 m1   = npyv_cmplt_u16(lhs1, rhs1);
        // AND with 0x1 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(bytes, ones));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar path for the elements left over after the vector loop.
    while (len > 0) {
        *out = *lhs < *rhs;
        --len;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/*
 * Writes `scalar < src[i]` for i in [0, len) into a byte array.
 *
 * args[0] points to the single npyv_lanetype_u16 left-hand operand,
 * args[1] to the contiguous npyv_lanetype_u16 right-hand array and
 * args[2] to the npyv_lanetype_u8 output. Each output byte is 0 or 1.
 */
static void simd_binary_scalar1_less_u16(char **args, npy_intp len)
{
    const npyv_lanetype_u16 lhs_val = *(npyv_lanetype_u16 *) args[0];
    npyv_lanetype_u16 *rhs          = (npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u8 *out           = (npyv_lanetype_u8 *) args[2];
    const npyv_u16 lhs_vec          = npyv_setall_u16(lhs_val);
    const npyv_u8 ones              = npyv_setall_u8(0x1);
    const int chunk                 = npyv_nlanes_u8;

    // Vector path: one pass handles 'chunk' elements, read as two 16-bit
    // vectors whose comparison masks are packed into a single byte vector.
    while (len >= chunk) {
        npyv_u16 rhs0 = npyv_load_u16(rhs);
        npyv_u16 rhs1 = npyv_load_u16(rhs + npyv_nlanes_u16);
        npyv_b16 m0   = npyv_cmplt_u16(lhs_vec, rhs0);
        npyv_b16 m1   = npyv_cmplt_u16(lhs_vec, rhs1);
        // AND with 0x1 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(bytes, ones));

        len -= chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar path for the elements left over after the vector loop.
    while (len > 0) {
        *out = lhs_val < *rhs;
        --len;
        ++rhs;
        ++out;
    }
}

/*
 * Writes `src[i] < scalar` for i in [0, len) into a byte array.
 *
 * args[0] points to the contiguous npyv_lanetype_u16 left-hand array,
 * args[1] to the single npyv_lanetype_u16 right-hand operand and
 * args[2] to the npyv_lanetype_u8 output. Each output byte is 0 or 1.
 */
static void simd_binary_scalar2_less_u16(char **args, npy_intp len)
{
    npyv_lanetype_u16 *lhs          = (npyv_lanetype_u16 *) args[0];
    const npyv_lanetype_u16 rhs_val = *(npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u8 *out           = (npyv_lanetype_u8 *) args[2];
    const npyv_u16 rhs_vec          = npyv_setall_u16(rhs_val);
    const npyv_u8 ones              = npyv_setall_u8(0x1);
    const int chunk                 = npyv_nlanes_u8;

    // Vector path: one pass handles 'chunk' elements, read as two 16-bit
    // vectors whose comparison masks are packed into a single byte vector.
    while (len >= chunk) {
        npyv_u16 lhs0 = npyv_load_u16(lhs);
        npyv_u16 lhs1 = npyv_load_u16(lhs + npyv_nlanes_u16);
        npyv_b16 m0   = npyv_cmplt_u16(lhs0, rhs_vec);
        npyv_b16 m1   = npyv_cmplt_u16(lhs1, rhs_vec);
        // AND with 0x1 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(bytes, ones));

        len -= chunk;
        lhs += chunk;
        out += chunk;
    }

    // Scalar path for the elements left over after the vector loop.
    while (len > 0) {
        *out = *lhs < rhs_val;
        --len;
        ++lhs;
        ++out;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 0) && 0)
/*
 * Writes `src1[i] <= src2[i]` for i in [0, len) into a byte array.
 *
 * args[0] and args[1] point to the contiguous npyv_lanetype_u16 operand
 * arrays, args[2] to the npyv_lanetype_u8 output. Each output byte is
 * 0 or 1.
 */
static void simd_binary_less_equal_u16(char **args, npy_intp len)
{
    npyv_lanetype_u16 *lhs = (npyv_lanetype_u16 *) args[0];
    npyv_lanetype_u16 *rhs = (npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 ones     = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;

    // Vector path: one pass handles 'chunk' elements per operand, read as
    // two 16-bit vectors whose masks are packed into a single byte vector.
    while (len >= chunk) {
        npyv_u16 lhs0 = npyv_load_u16(lhs);
        npyv_u16 rhs0 = npyv_load_u16(rhs);
        npyv_u16 lhs1 = npyv_load_u16(lhs + npyv_nlanes_u16);
        npyv_u16 rhs1 = npyv_load_u16(rhs + npyv_nlanes_u16);
        npyv_b16 m0   = npyv_cmple_u16(lhs0, rhs0);
        npyv_b16 m1   = npyv_cmple_u16(lhs1, rhs1);
        // AND with 0x1 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(bytes, ones));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar path for the elements left over after the vector loop.
    while (len > 0) {
        *out = *lhs <= *rhs;
        --len;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/*
 * Writes `scalar <= src[i]` for i in [0, len) into a byte array.
 *
 * args[0] points to the single npyv_lanetype_u16 left-hand operand,
 * args[1] to the contiguous npyv_lanetype_u16 right-hand array and
 * args[2] to the npyv_lanetype_u8 output. Each output byte is 0 or 1.
 */
static void simd_binary_scalar1_less_equal_u16(char **args, npy_intp len)
{
    const npyv_lanetype_u16 lhs_val = *(npyv_lanetype_u16 *) args[0];
    npyv_lanetype_u16 *rhs          = (npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u8 *out           = (npyv_lanetype_u8 *) args[2];
    const npyv_u16 lhs_vec          = npyv_setall_u16(lhs_val);
    const npyv_u8 ones              = npyv_setall_u8(0x1);
    const int chunk                 = npyv_nlanes_u8;

    // Vector path: one pass handles 'chunk' elements, read as two 16-bit
    // vectors whose comparison masks are packed into a single byte vector.
    while (len >= chunk) {
        npyv_u16 rhs0 = npyv_load_u16(rhs);
        npyv_u16 rhs1 = npyv_load_u16(rhs + npyv_nlanes_u16);
        npyv_b16 m0   = npyv_cmple_u16(lhs_vec, rhs0);
        npyv_b16 m1   = npyv_cmple_u16(lhs_vec, rhs1);
        // AND with 0x1 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(bytes, ones));

        len -= chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar path for the elements left over after the vector loop.
    while (len > 0) {
        *out = lhs_val <= *rhs;
        --len;
        ++rhs;
        ++out;
    }
}

/*
 * Writes `src[i] <= scalar` for i in [0, len) into a byte array.
 *
 * args[0] points to the contiguous npyv_lanetype_u16 left-hand array,
 * args[1] to the single npyv_lanetype_u16 right-hand operand and
 * args[2] to the npyv_lanetype_u8 output. Each output byte is 0 or 1.
 */
static void simd_binary_scalar2_less_equal_u16(char **args, npy_intp len)
{
    npyv_lanetype_u16 *lhs          = (npyv_lanetype_u16 *) args[0];
    const npyv_lanetype_u16 rhs_val = *(npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u8 *out           = (npyv_lanetype_u8 *) args[2];
    const npyv_u16 rhs_vec          = npyv_setall_u16(rhs_val);
    const npyv_u8 ones              = npyv_setall_u8(0x1);
    const int chunk                 = npyv_nlanes_u8;

    // Vector path: one pass handles 'chunk' elements, read as two 16-bit
    // vectors whose comparison masks are packed into a single byte vector.
    while (len >= chunk) {
        npyv_u16 lhs0 = npyv_load_u16(lhs);
        npyv_u16 lhs1 = npyv_load_u16(lhs + npyv_nlanes_u16);
        npyv_b16 m0   = npyv_cmple_u16(lhs0, rhs_vec);
        npyv_b16 m1   = npyv_cmple_u16(lhs1, rhs_vec);
        // AND with 0x1 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(bytes, ones));

        len -= chunk;
        lhs += chunk;
        out += chunk;
    }

    // Scalar path for the elements left over after the vector loop.
    while (len > 0) {
        *out = *lhs <= rhs_val;
        --len;
        ++lhs;
        ++out;
    }
}
#endif



#line 28
#line 35
#if NPY_SIMD && !((1 || 0) && 1)
/*
 * Writes `src1[i] == src2[i]` for i in [0, len) into a byte array.
 *
 * args[0] and args[1] point to the contiguous npyv_lanetype_s16 operand
 * arrays, args[2] to the npyv_lanetype_u8 output. Each output byte is
 * 0 or 1.
 *
 * NOTE: the enclosing guard `NPY_SIMD && !((1 || 0) && 1)` is always
 * false, so this definition is removed by the preprocessor.
 */
static void simd_binary_equal_s16(char **args, npy_intp len)
{
    npyv_lanetype_s16 *lhs = (npyv_lanetype_s16 *) args[0];
    npyv_lanetype_s16 *rhs = (npyv_lanetype_s16 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 ones     = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;

    // Vector path: one pass handles 'chunk' elements per operand, read as
    // two 16-bit vectors whose masks are packed into a single byte vector.
    while (len >= chunk) {
        npyv_s16 lhs0 = npyv_load_s16(lhs);
        npyv_s16 rhs0 = npyv_load_s16(rhs);
        npyv_s16 lhs1 = npyv_load_s16(lhs + npyv_nlanes_s16);
        npyv_s16 rhs1 = npyv_load_s16(rhs + npyv_nlanes_s16);
        npyv_b16 m0   = npyv_cmpeq_s16(lhs0, rhs0);
        npyv_b16 m1   = npyv_cmpeq_s16(lhs1, rhs1);
        // AND with 0x1 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(bytes, ones));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar path for the elements left over after the vector loop.
    while (len > 0) {
        *out = *lhs == *rhs;
        --len;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/*
 * Writes `scalar == src[i]` for i in [0, len) into a byte array.
 *
 * args[0] points to the single npyv_lanetype_s16 left-hand operand,
 * args[1] to the contiguous npyv_lanetype_s16 right-hand array and
 * args[2] to the npyv_lanetype_u8 output. Each output byte is 0 or 1.
 *
 * NOTE: the enclosing guard `NPY_SIMD && !((1 || 0) && 1)` is always
 * false, so this definition is removed by the preprocessor.
 */
static void simd_binary_scalar1_equal_s16(char **args, npy_intp len)
{
    const npyv_lanetype_s16 lhs_val = *(npyv_lanetype_s16 *) args[0];
    npyv_lanetype_s16 *rhs          = (npyv_lanetype_s16 *) args[1];
    npyv_lanetype_u8 *out           = (npyv_lanetype_u8 *) args[2];
    const npyv_s16 lhs_vec          = npyv_setall_s16(lhs_val);
    const npyv_u8 ones              = npyv_setall_u8(0x1);
    const int chunk                 = npyv_nlanes_u8;

    // Vector path: one pass handles 'chunk' elements, read as two 16-bit
    // vectors whose comparison masks are packed into a single byte vector.
    while (len >= chunk) {
        npyv_s16 rhs0 = npyv_load_s16(rhs);
        npyv_s16 rhs1 = npyv_load_s16(rhs + npyv_nlanes_s16);
        npyv_b16 m0   = npyv_cmpeq_s16(lhs_vec, rhs0);
        npyv_b16 m1   = npyv_cmpeq_s16(lhs_vec, rhs1);
        // AND with 0x1 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(bytes, ones));

        len -= chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar path for the elements left over after the vector loop.
    while (len > 0) {
        *out = lhs_val == *rhs;
        --len;
        ++rhs;
        ++out;
    }
}

/*
 * Writes `src[i] == scalar` for i in [0, len) into a byte array.
 *
 * args[0] points to the contiguous npyv_lanetype_s16 left-hand array,
 * args[1] to the single npyv_lanetype_s16 right-hand operand and
 * args[2] to the npyv_lanetype_u8 output. Each output byte is 0 or 1.
 *
 * NOTE: the enclosing guard `NPY_SIMD && !((1 || 0) && 1)` is always
 * false, so this definition is removed by the preprocessor.
 */
static void simd_binary_scalar2_equal_s16(char **args, npy_intp len)
{
    npyv_lanetype_s16 *lhs          = (npyv_lanetype_s16 *) args[0];
    const npyv_lanetype_s16 rhs_val = *(npyv_lanetype_s16 *) args[1];
    npyv_lanetype_u8 *out           = (npyv_lanetype_u8 *) args[2];
    const npyv_s16 rhs_vec          = npyv_setall_s16(rhs_val);
    const npyv_u8 ones              = npyv_setall_u8(0x1);
    const int chunk                 = npyv_nlanes_u8;

    // Vector path: one pass handles 'chunk' elements, read as two 16-bit
    // vectors whose comparison masks are packed into a single byte vector.
    while (len >= chunk) {
        npyv_s16 lhs0 = npyv_load_s16(lhs);
        npyv_s16 lhs1 = npyv_load_s16(lhs + npyv_nlanes_s16);
        npyv_b16 m0   = npyv_cmpeq_s16(lhs0, rhs_vec);
        npyv_b16 m1   = npyv_cmpeq_s16(lhs1, rhs_vec);
        // AND with 0x1 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(bytes, ones));

        len -= chunk;
        lhs += chunk;
        out += chunk;
    }

    // Scalar path for the elements left over after the vector loop.
    while (len > 0) {
        *out = *lhs == rhs_val;
        --len;
        ++lhs;
        ++out;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 1) && 1)
/*
 * Writes `src1[i] != src2[i]` for i in [0, len) into a byte array.
 *
 * args[0] and args[1] point to the contiguous npyv_lanetype_s16 operand
 * arrays, args[2] to the npyv_lanetype_u8 output. Each output byte is
 * 0 or 1.
 *
 * NOTE: the enclosing guard `NPY_SIMD && !((0 || 1) && 1)` is always
 * false, so this definition is removed by the preprocessor.
 */
static void simd_binary_not_equal_s16(char **args, npy_intp len)
{
    npyv_lanetype_s16 *lhs = (npyv_lanetype_s16 *) args[0];
    npyv_lanetype_s16 *rhs = (npyv_lanetype_s16 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 ones     = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;

    // Vector path: one pass handles 'chunk' elements per operand, read as
    // two 16-bit vectors whose masks are packed into a single byte vector.
    while (len >= chunk) {
        npyv_s16 lhs0 = npyv_load_s16(lhs);
        npyv_s16 rhs0 = npyv_load_s16(rhs);
        npyv_s16 lhs1 = npyv_load_s16(lhs + npyv_nlanes_s16);
        npyv_s16 rhs1 = npyv_load_s16(rhs + npyv_nlanes_s16);
        npyv_b16 m0   = npyv_cmpneq_s16(lhs0, rhs0);
        npyv_b16 m1   = npyv_cmpneq_s16(lhs1, rhs1);
        // AND with 0x1 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(bytes, ones));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar path for the elements left over after the vector loop.
    while (len > 0) {
        *out = *lhs != *rhs;
        --len;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/*
 * Writes `scalar != src[i]` for i in [0, len) into a byte array.
 *
 * args[0] points to the single npyv_lanetype_s16 left-hand operand,
 * args[1] to the contiguous npyv_lanetype_s16 right-hand array and
 * args[2] to the npyv_lanetype_u8 output. Each output byte is 0 or 1.
 *
 * NOTE: the enclosing guard `NPY_SIMD && !((0 || 1) && 1)` is always
 * false, so this definition is removed by the preprocessor.
 */
static void simd_binary_scalar1_not_equal_s16(char **args, npy_intp len)
{
    const npyv_lanetype_s16 lhs_val = *(npyv_lanetype_s16 *) args[0];
    npyv_lanetype_s16 *rhs          = (npyv_lanetype_s16 *) args[1];
    npyv_lanetype_u8 *out           = (npyv_lanetype_u8 *) args[2];
    const npyv_s16 lhs_vec          = npyv_setall_s16(lhs_val);
    const npyv_u8 ones              = npyv_setall_u8(0x1);
    const int chunk                 = npyv_nlanes_u8;

    // Vector path: one pass handles 'chunk' elements, read as two 16-bit
    // vectors whose comparison masks are packed into a single byte vector.
    while (len >= chunk) {
        npyv_s16 rhs0 = npyv_load_s16(rhs);
        npyv_s16 rhs1 = npyv_load_s16(rhs + npyv_nlanes_s16);
        npyv_b16 m0   = npyv_cmpneq_s16(lhs_vec, rhs0);
        npyv_b16 m1   = npyv_cmpneq_s16(lhs_vec, rhs1);
        // AND with 0x1 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(bytes, ones));

        len -= chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar path for the elements left over after the vector loop.
    while (len > 0) {
        *out = lhs_val != *rhs;
        --len;
        ++rhs;
        ++out;
    }
}

/*
 * Writes `src[i] != scalar` for i in [0, len) into a byte array.
 *
 * args[0] points to the contiguous npyv_lanetype_s16 left-hand array,
 * args[1] to the single npyv_lanetype_s16 right-hand operand and
 * args[2] to the npyv_lanetype_u8 output. Each output byte is 0 or 1.
 *
 * NOTE: the enclosing guard `NPY_SIMD && !((0 || 1) && 1)` is always
 * false, so this definition is removed by the preprocessor.
 */
static void simd_binary_scalar2_not_equal_s16(char **args, npy_intp len)
{
    npyv_lanetype_s16 *lhs          = (npyv_lanetype_s16 *) args[0];
    const npyv_lanetype_s16 rhs_val = *(npyv_lanetype_s16 *) args[1];
    npyv_lanetype_u8 *out           = (npyv_lanetype_u8 *) args[2];
    const npyv_s16 rhs_vec          = npyv_setall_s16(rhs_val);
    const npyv_u8 ones              = npyv_setall_u8(0x1);
    const int chunk                 = npyv_nlanes_u8;

    // Vector path: one pass handles 'chunk' elements, read as two 16-bit
    // vectors whose comparison masks are packed into a single byte vector.
    while (len >= chunk) {
        npyv_s16 lhs0 = npyv_load_s16(lhs);
        npyv_s16 lhs1 = npyv_load_s16(lhs + npyv_nlanes_s16);
        npyv_b16 m0   = npyv_cmpneq_s16(lhs0, rhs_vec);
        npyv_b16 m1   = npyv_cmpneq_s16(lhs1, rhs_vec);
        // AND with 0x1 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(bytes, ones));

        len -= chunk;
        lhs += chunk;
        out += chunk;
    }

    // Scalar path for the elements left over after the vector loop.
    while (len > 0) {
        *out = *lhs != rhs_val;
        --len;
        ++lhs;
        ++out;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 0) && 1)
/*
 * Writes `src1[i] < src2[i]` for i in [0, len) into a byte array.
 *
 * args[0] and args[1] point to the contiguous npyv_lanetype_s16 operand
 * arrays, args[2] to the npyv_lanetype_u8 output. Each output byte is
 * 0 or 1.
 */
static void simd_binary_less_s16(char **args, npy_intp len)
{
    npyv_lanetype_s16 *lhs = (npyv_lanetype_s16 *) args[0];
    npyv_lanetype_s16 *rhs = (npyv_lanetype_s16 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 ones     = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;

    // Vector path: one pass handles 'chunk' elements per operand, read as
    // two 16-bit vectors whose masks are packed into a single byte vector.
    while (len >= chunk) {
        npyv_s16 lhs0 = npyv_load_s16(lhs);
        npyv_s16 rhs0 = npyv_load_s16(rhs);
        npyv_s16 lhs1 = npyv_load_s16(lhs + npyv_nlanes_s16);
        npyv_s16 rhs1 = npyv_load_s16(rhs + npyv_nlanes_s16);
        npyv_b16 m0   = npyv_cmplt_s16(lhs0, rhs0);
        npyv_b16 m1   = npyv_cmplt_s16(lhs1, rhs1);
        // AND with 0x1 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(bytes, ones));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar path for the elements left over after the vector loop.
    while (len > 0) {
        *out = *lhs < *rhs;
        --len;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/*
 * Element-wise `scalar < rhs[i]` where the left operand is a single s16
 * value and the right operand is an s16 array read with unit stride.
 * One result byte is written per element.
 *
 *   args[0] - pointer to the scalar left operand
 *   args[1] - the input array
 *   args[2] - the output byte array
 *   len     - number of elements to process
 */
static void simd_binary_scalar1_less_s16(char **args, npy_intp len)
{
    const npyv_lanetype_s16 lhs = *(npyv_lanetype_s16 *) args[0];
    npyv_lanetype_s16 *rhs      = (npyv_lanetype_s16 *) args[1];
    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
    const npyv_s16 lhs_v        = npyv_setall_s16(lhs);
    const npyv_u8 one           = npyv_setall_u8(0x1);
    const int step              = npyv_nlanes_u8;

    // Vector body: two 16-bit compares against the broadcast scalar are
    // packed into a single 8-bit vector per pass.
    while (len >= step) {
        npyv_s16 r0 = npyv_load_s16(rhs);
        npyv_s16 r1 = npyv_load_s16(rhs + npyv_nlanes_s16);
        npyv_b16 m0 = npyv_cmplt_s16(lhs_v, r0);
        npyv_b16 m1 = npyv_cmplt_s16(lhs_v, r1);

        // AND the converted mask with 0x1 before storing it.
        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(packed, one));

        len -= step;
        rhs += step;
        out += step;
    }

    // Scalar tail: whatever did not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs < rhs[i];
    }
}

/*
 * Element-wise `lhs[i] < scalar` where the left operand is an s16 array
 * read with unit stride and the right operand is a single s16 value.
 * One result byte is written per element.
 *
 *   args[0] - the input array
 *   args[1] - pointer to the scalar right operand
 *   args[2] - the output byte array
 *   len     - number of elements to process
 */
static void simd_binary_scalar2_less_s16(char **args, npy_intp len)
{
    npyv_lanetype_s16 *lhs      = (npyv_lanetype_s16 *) args[0];
    const npyv_lanetype_s16 rhs = *(npyv_lanetype_s16 *) args[1];
    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
    const npyv_s16 rhs_v        = npyv_setall_s16(rhs);
    const npyv_u8 one           = npyv_setall_u8(0x1);
    const int step              = npyv_nlanes_u8;

    // Vector body: two 16-bit compares against the broadcast scalar are
    // packed into a single 8-bit vector per pass.
    while (len >= step) {
        npyv_s16 l0 = npyv_load_s16(lhs);
        npyv_s16 l1 = npyv_load_s16(lhs + npyv_nlanes_s16);
        npyv_b16 m0 = npyv_cmplt_s16(l0, rhs_v);
        npyv_b16 m1 = npyv_cmplt_s16(l1, rhs_v);

        // AND the converted mask with 0x1 before storing it.
        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(packed, one));

        len -= step;
        lhs += step;
        out += step;
    }

    // Scalar tail: whatever did not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] < rhs;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 0) && 1)
/*
 * Element-wise `lhs <= rhs` over two s16 arrays read with unit stride.
 * One result byte is written per element.
 *
 *   args[0], args[1] - the two input arrays
 *   args[2]          - the output byte array
 *   len              - number of elements to process
 */
static void simd_binary_less_equal_s16(char **args, npy_intp len)
{
    npyv_lanetype_s16 *lhs = (npyv_lanetype_s16 *) args[0];
    npyv_lanetype_s16 *rhs = (npyv_lanetype_s16 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int step         = npyv_nlanes_u8;

    // Vector body: two 16-bit compares per pass are packed into a single
    // 8-bit vector, so 'step' result bytes are stored at once.
    while (len >= step) {
        npyv_s16 l0 = npyv_load_s16(lhs);
        npyv_s16 r0 = npyv_load_s16(rhs);
        npyv_s16 l1 = npyv_load_s16(lhs + npyv_nlanes_s16);
        npyv_s16 r1 = npyv_load_s16(rhs + npyv_nlanes_s16);
        npyv_b16 m0 = npyv_cmple_s16(l0, r0);
        npyv_b16 m1 = npyv_cmple_s16(l1, r1);

        // AND the converted mask with 0x1 before storing it.
        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(packed, one));

        len -= step;
        lhs += step;
        rhs += step;
        out += step;
    }

    // Scalar tail: whatever did not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] <= rhs[i];
    }
}

/*
 * Element-wise `scalar <= rhs[i]` where the left operand is a single s16
 * value and the right operand is an s16 array read with unit stride.
 * One result byte is written per element.
 *
 *   args[0] - pointer to the scalar left operand
 *   args[1] - the input array
 *   args[2] - the output byte array
 *   len     - number of elements to process
 */
static void simd_binary_scalar1_less_equal_s16(char **args, npy_intp len)
{
    const npyv_lanetype_s16 lhs = *(npyv_lanetype_s16 *) args[0];
    npyv_lanetype_s16 *rhs      = (npyv_lanetype_s16 *) args[1];
    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
    const npyv_s16 lhs_v        = npyv_setall_s16(lhs);
    const npyv_u8 one           = npyv_setall_u8(0x1);
    const int step              = npyv_nlanes_u8;

    // Vector body: two 16-bit compares against the broadcast scalar are
    // packed into a single 8-bit vector per pass.
    while (len >= step) {
        npyv_s16 r0 = npyv_load_s16(rhs);
        npyv_s16 r1 = npyv_load_s16(rhs + npyv_nlanes_s16);
        npyv_b16 m0 = npyv_cmple_s16(lhs_v, r0);
        npyv_b16 m1 = npyv_cmple_s16(lhs_v, r1);

        // AND the converted mask with 0x1 before storing it.
        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(packed, one));

        len -= step;
        rhs += step;
        out += step;
    }

    // Scalar tail: whatever did not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs <= rhs[i];
    }
}

/*
 * Element-wise `lhs[i] <= scalar` where the left operand is an s16 array
 * read with unit stride and the right operand is a single s16 value.
 * One result byte is written per element.
 *
 *   args[0] - the input array
 *   args[1] - pointer to the scalar right operand
 *   args[2] - the output byte array
 *   len     - number of elements to process
 */
static void simd_binary_scalar2_less_equal_s16(char **args, npy_intp len)
{
    npyv_lanetype_s16 *lhs      = (npyv_lanetype_s16 *) args[0];
    const npyv_lanetype_s16 rhs = *(npyv_lanetype_s16 *) args[1];
    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
    const npyv_s16 rhs_v        = npyv_setall_s16(rhs);
    const npyv_u8 one           = npyv_setall_u8(0x1);
    const int step              = npyv_nlanes_u8;

    // Vector body: two 16-bit compares against the broadcast scalar are
    // packed into a single 8-bit vector per pass.
    while (len >= step) {
        npyv_s16 l0 = npyv_load_s16(lhs);
        npyv_s16 l1 = npyv_load_s16(lhs + npyv_nlanes_s16);
        npyv_b16 m0 = npyv_cmple_s16(l0, rhs_v);
        npyv_b16 m1 = npyv_cmple_s16(l1, rhs_v);

        // AND the converted mask with 0x1 before storing it.
        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
        npyv_store_u8(out, npyv_and_u8(packed, one));

        len -= step;
        lhs += step;
        out += step;
    }

    // Scalar tail: whatever did not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] <= rhs;
    }
}
#endif



#line 28
#line 35
#if NPY_SIMD && !((1 || 0) && 0)
/*
 * Element-wise `lhs == rhs` over two u32 arrays read with unit stride.
 * One result byte is written per element.
 *
 *   args[0], args[1] - the two input arrays
 *   args[2]          - the output byte array
 *   len              - number of elements to process
 */
static void simd_binary_equal_u32(char **args, npy_intp len)
{
    npyv_lanetype_u32 *lhs = (npyv_lanetype_u32 *) args[0];
    npyv_lanetype_u32 *rhs = (npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int step         = npyv_nlanes_u8;

    // Vector body: four 32-bit compares per pass are packed into a single
    // 8-bit vector, so 'step' result bytes are stored at once.
    while (len >= step) {
        npyv_u32 l0 = npyv_load_u32(lhs);
        npyv_u32 r0 = npyv_load_u32(rhs);
        npyv_u32 l1 = npyv_load_u32(lhs + npyv_nlanes_u32);
        npyv_u32 r1 = npyv_load_u32(rhs + npyv_nlanes_u32);
        npyv_u32 l2 = npyv_load_u32(lhs + npyv_nlanes_u32 * 2);
        npyv_u32 r2 = npyv_load_u32(rhs + npyv_nlanes_u32 * 2);
        npyv_u32 l3 = npyv_load_u32(lhs + npyv_nlanes_u32 * 3);
        npyv_u32 r3 = npyv_load_u32(rhs + npyv_nlanes_u32 * 3);
        npyv_b32 m0 = npyv_cmpeq_u32(l0, r0);
        npyv_b32 m1 = npyv_cmpeq_u32(l1, r1);
        npyv_b32 m2 = npyv_cmpeq_u32(l2, r2);
        npyv_b32 m3 = npyv_cmpeq_u32(l3, r3);

        // AND the converted mask with 0x1 before storing it.
        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
        npyv_store_u8(out, npyv_and_u8(packed, one));

        len -= step;
        lhs += step;
        rhs += step;
        out += step;
    }

    // Scalar tail: whatever did not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] == rhs[i];
    }
}

/*
 * Element-wise `scalar == rhs[i]` where the left operand is a single u32
 * value and the right operand is a u32 array read with unit stride.
 * One result byte is written per element.
 *
 *   args[0] - pointer to the scalar left operand
 *   args[1] - the input array
 *   args[2] - the output byte array
 *   len     - number of elements to process
 */
static void simd_binary_scalar1_equal_u32(char **args, npy_intp len)
{
    const npyv_lanetype_u32 lhs = *(npyv_lanetype_u32 *) args[0];
    npyv_lanetype_u32 *rhs      = (npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
    const npyv_u32 lhs_v        = npyv_setall_u32(lhs);
    const npyv_u8 one           = npyv_setall_u8(0x1);
    const int step              = npyv_nlanes_u8;

    // Vector body: four 32-bit compares against the broadcast scalar are
    // packed into a single 8-bit vector per pass.
    while (len >= step) {
        npyv_u32 r0 = npyv_load_u32(rhs);
        npyv_u32 r1 = npyv_load_u32(rhs + npyv_nlanes_u32);
        npyv_u32 r2 = npyv_load_u32(rhs + npyv_nlanes_u32 * 2);
        npyv_u32 r3 = npyv_load_u32(rhs + npyv_nlanes_u32 * 3);
        npyv_b32 m0 = npyv_cmpeq_u32(lhs_v, r0);
        npyv_b32 m1 = npyv_cmpeq_u32(lhs_v, r1);
        npyv_b32 m2 = npyv_cmpeq_u32(lhs_v, r2);
        npyv_b32 m3 = npyv_cmpeq_u32(lhs_v, r3);

        // AND the converted mask with 0x1 before storing it.
        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
        npyv_store_u8(out, npyv_and_u8(packed, one));

        len -= step;
        rhs += step;
        out += step;
    }

    // Scalar tail: whatever did not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs == rhs[i];
    }
}

/*
 * Element-wise `lhs[i] == scalar` where the left operand is a u32 array
 * read with unit stride and the right operand is a single u32 value.
 * One result byte is written per element.
 *
 *   args[0] - the input array
 *   args[1] - pointer to the scalar right operand
 *   args[2] - the output byte array
 *   len     - number of elements to process
 */
static void simd_binary_scalar2_equal_u32(char **args, npy_intp len)
{
    npyv_lanetype_u32 *lhs      = (npyv_lanetype_u32 *) args[0];
    const npyv_lanetype_u32 rhs = *(npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
    const npyv_u32 rhs_v        = npyv_setall_u32(rhs);
    const npyv_u8 one           = npyv_setall_u8(0x1);
    const int step              = npyv_nlanes_u8;

    // Vector body: four 32-bit compares against the broadcast scalar are
    // packed into a single 8-bit vector per pass.
    while (len >= step) {
        npyv_u32 l0 = npyv_load_u32(lhs);
        npyv_u32 l1 = npyv_load_u32(lhs + npyv_nlanes_u32);
        npyv_u32 l2 = npyv_load_u32(lhs + npyv_nlanes_u32 * 2);
        npyv_u32 l3 = npyv_load_u32(lhs + npyv_nlanes_u32 * 3);
        npyv_b32 m0 = npyv_cmpeq_u32(l0, rhs_v);
        npyv_b32 m1 = npyv_cmpeq_u32(l1, rhs_v);
        npyv_b32 m2 = npyv_cmpeq_u32(l2, rhs_v);
        npyv_b32 m3 = npyv_cmpeq_u32(l3, rhs_v);

        // AND the converted mask with 0x1 before storing it.
        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
        npyv_store_u8(out, npyv_and_u8(packed, one));

        len -= step;
        lhs += step;
        out += step;
    }

    // Scalar tail: whatever did not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] == rhs;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 1) && 0)
/*
 * Element-wise `lhs != rhs` over two u32 arrays read with unit stride.
 * One result byte is written per element.
 *
 *   args[0], args[1] - the two input arrays
 *   args[2]          - the output byte array
 *   len              - number of elements to process
 */
static void simd_binary_not_equal_u32(char **args, npy_intp len)
{
    npyv_lanetype_u32 *lhs = (npyv_lanetype_u32 *) args[0];
    npyv_lanetype_u32 *rhs = (npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int step         = npyv_nlanes_u8;

    // Vector body: four 32-bit compares per pass are packed into a single
    // 8-bit vector, so 'step' result bytes are stored at once.
    while (len >= step) {
        npyv_u32 l0 = npyv_load_u32(lhs);
        npyv_u32 r0 = npyv_load_u32(rhs);
        npyv_u32 l1 = npyv_load_u32(lhs + npyv_nlanes_u32);
        npyv_u32 r1 = npyv_load_u32(rhs + npyv_nlanes_u32);
        npyv_u32 l2 = npyv_load_u32(lhs + npyv_nlanes_u32 * 2);
        npyv_u32 r2 = npyv_load_u32(rhs + npyv_nlanes_u32 * 2);
        npyv_u32 l3 = npyv_load_u32(lhs + npyv_nlanes_u32 * 3);
        npyv_u32 r3 = npyv_load_u32(rhs + npyv_nlanes_u32 * 3);
        npyv_b32 m0 = npyv_cmpneq_u32(l0, r0);
        npyv_b32 m1 = npyv_cmpneq_u32(l1, r1);
        npyv_b32 m2 = npyv_cmpneq_u32(l2, r2);
        npyv_b32 m3 = npyv_cmpneq_u32(l3, r3);

        // AND the converted mask with 0x1 before storing it.
        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
        npyv_store_u8(out, npyv_and_u8(packed, one));

        len -= step;
        lhs += step;
        rhs += step;
        out += step;
    }

    // Scalar tail: whatever did not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] != rhs[i];
    }
}

/*
 * Element-wise `scalar != rhs[i]` where the left operand is a single u32
 * value and the right operand is a u32 array read with unit stride.
 * One result byte is written per element.
 *
 *   args[0] - pointer to the scalar left operand
 *   args[1] - the input array
 *   args[2] - the output byte array
 *   len     - number of elements to process
 */
static void simd_binary_scalar1_not_equal_u32(char **args, npy_intp len)
{
    const npyv_lanetype_u32 lhs = *(npyv_lanetype_u32 *) args[0];
    npyv_lanetype_u32 *rhs      = (npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
    const npyv_u32 lhs_v        = npyv_setall_u32(lhs);
    const npyv_u8 one           = npyv_setall_u8(0x1);
    const int step              = npyv_nlanes_u8;

    // Vector body: four 32-bit compares against the broadcast scalar are
    // packed into a single 8-bit vector per pass.
    while (len >= step) {
        npyv_u32 r0 = npyv_load_u32(rhs);
        npyv_u32 r1 = npyv_load_u32(rhs + npyv_nlanes_u32);
        npyv_u32 r2 = npyv_load_u32(rhs + npyv_nlanes_u32 * 2);
        npyv_u32 r3 = npyv_load_u32(rhs + npyv_nlanes_u32 * 3);
        npyv_b32 m0 = npyv_cmpneq_u32(lhs_v, r0);
        npyv_b32 m1 = npyv_cmpneq_u32(lhs_v, r1);
        npyv_b32 m2 = npyv_cmpneq_u32(lhs_v, r2);
        npyv_b32 m3 = npyv_cmpneq_u32(lhs_v, r3);

        // AND the converted mask with 0x1 before storing it.
        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
        npyv_store_u8(out, npyv_and_u8(packed, one));

        len -= step;
        rhs += step;
        out += step;
    }

    // Scalar tail: whatever did not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs != rhs[i];
    }
}

/*
 * Element-wise `lhs[i] != scalar` where the left operand is a u32 array
 * read with unit stride and the right operand is a single u32 value.
 * One result byte is written per element.
 *
 *   args[0] - the input array
 *   args[1] - pointer to the scalar right operand
 *   args[2] - the output byte array
 *   len     - number of elements to process
 */
static void simd_binary_scalar2_not_equal_u32(char **args, npy_intp len)
{
    npyv_lanetype_u32 *lhs      = (npyv_lanetype_u32 *) args[0];
    const npyv_lanetype_u32 rhs = *(npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
    const npyv_u32 rhs_v        = npyv_setall_u32(rhs);
    const npyv_u8 one           = npyv_setall_u8(0x1);
    const int step              = npyv_nlanes_u8;

    // Vector body: four 32-bit compares against the broadcast scalar are
    // packed into a single 8-bit vector per pass.
    while (len >= step) {
        npyv_u32 l0 = npyv_load_u32(lhs);
        npyv_u32 l1 = npyv_load_u32(lhs + npyv_nlanes_u32);
        npyv_u32 l2 = npyv_load_u32(lhs + npyv_nlanes_u32 * 2);
        npyv_u32 l3 = npyv_load_u32(lhs + npyv_nlanes_u32 * 3);
        npyv_b32 m0 = npyv_cmpneq_u32(l0, rhs_v);
        npyv_b32 m1 = npyv_cmpneq_u32(l1, rhs_v);
        npyv_b32 m2 = npyv_cmpneq_u32(l2, rhs_v);
        npyv_b32 m3 = npyv_cmpneq_u32(l3, rhs_v);

        // AND the converted mask with 0x1 before storing it.
        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
        npyv_store_u8(out, npyv_and_u8(packed, one));

        len -= step;
        lhs += step;
        out += step;
    }

    // Scalar tail: whatever did not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] != rhs;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 0) && 0)
/*
 * Element-wise `lhs < rhs` over two u32 arrays read with unit stride.
 * One result byte is written per element.
 *
 *   args[0], args[1] - the two input arrays
 *   args[2]          - the output byte array
 *   len              - number of elements to process
 */
static void simd_binary_less_u32(char **args, npy_intp len)
{
    npyv_lanetype_u32 *lhs = (npyv_lanetype_u32 *) args[0];
    npyv_lanetype_u32 *rhs = (npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int step         = npyv_nlanes_u8;

    // Vector body: four 32-bit compares per pass are packed into a single
    // 8-bit vector, so 'step' result bytes are stored at once.
    while (len >= step) {
        npyv_u32 l0 = npyv_load_u32(lhs);
        npyv_u32 r0 = npyv_load_u32(rhs);
        npyv_u32 l1 = npyv_load_u32(lhs + npyv_nlanes_u32);
        npyv_u32 r1 = npyv_load_u32(rhs + npyv_nlanes_u32);
        npyv_u32 l2 = npyv_load_u32(lhs + npyv_nlanes_u32 * 2);
        npyv_u32 r2 = npyv_load_u32(rhs + npyv_nlanes_u32 * 2);
        npyv_u32 l3 = npyv_load_u32(lhs + npyv_nlanes_u32 * 3);
        npyv_u32 r3 = npyv_load_u32(rhs + npyv_nlanes_u32 * 3);
        npyv_b32 m0 = npyv_cmplt_u32(l0, r0);
        npyv_b32 m1 = npyv_cmplt_u32(l1, r1);
        npyv_b32 m2 = npyv_cmplt_u32(l2, r2);
        npyv_b32 m3 = npyv_cmplt_u32(l3, r3);

        // AND the converted mask with 0x1 before storing it.
        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
        npyv_store_u8(out, npyv_and_u8(packed, one));

        len -= step;
        lhs += step;
        rhs += step;
        out += step;
    }

    // Scalar tail: whatever did not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] < rhs[i];
    }
}

/*
 * Element-wise `scalar < rhs[i]` where the left operand is a single u32
 * value and the right operand is a u32 array read with unit stride.
 * One result byte is written per element.
 *
 *   args[0] - pointer to the scalar left operand
 *   args[1] - the input array
 *   args[2] - the output byte array
 *   len     - number of elements to process
 */
static void simd_binary_scalar1_less_u32(char **args, npy_intp len)
{
    const npyv_lanetype_u32 lhs = *(npyv_lanetype_u32 *) args[0];
    npyv_lanetype_u32 *rhs      = (npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
    const npyv_u32 lhs_v        = npyv_setall_u32(lhs);
    const npyv_u8 one           = npyv_setall_u8(0x1);
    const int step              = npyv_nlanes_u8;

    // Vector body: four 32-bit compares against the broadcast scalar are
    // packed into a single 8-bit vector per pass.
    while (len >= step) {
        npyv_u32 r0 = npyv_load_u32(rhs);
        npyv_u32 r1 = npyv_load_u32(rhs + npyv_nlanes_u32);
        npyv_u32 r2 = npyv_load_u32(rhs + npyv_nlanes_u32 * 2);
        npyv_u32 r3 = npyv_load_u32(rhs + npyv_nlanes_u32 * 3);
        npyv_b32 m0 = npyv_cmplt_u32(lhs_v, r0);
        npyv_b32 m1 = npyv_cmplt_u32(lhs_v, r1);
        npyv_b32 m2 = npyv_cmplt_u32(lhs_v, r2);
        npyv_b32 m3 = npyv_cmplt_u32(lhs_v, r3);

        // AND the converted mask with 0x1 before storing it.
        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
        npyv_store_u8(out, npyv_and_u8(packed, one));

        len -= step;
        rhs += step;
        out += step;
    }

    // Scalar tail: whatever did not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs < rhs[i];
    }
}

/*
 * Element-wise `lhs[i] < scalar` where the left operand is a u32 array
 * read with unit stride and the right operand is a single u32 value.
 * One result byte is written per element.
 *
 *   args[0] - the input array
 *   args[1] - pointer to the scalar right operand
 *   args[2] - the output byte array
 *   len     - number of elements to process
 */
static void simd_binary_scalar2_less_u32(char **args, npy_intp len)
{
    npyv_lanetype_u32 *lhs      = (npyv_lanetype_u32 *) args[0];
    const npyv_lanetype_u32 rhs = *(npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
    const npyv_u32 rhs_v        = npyv_setall_u32(rhs);
    const npyv_u8 one           = npyv_setall_u8(0x1);
    const int step              = npyv_nlanes_u8;

    // Vector body: four 32-bit compares against the broadcast scalar are
    // packed into a single 8-bit vector per pass.
    while (len >= step) {
        npyv_u32 l0 = npyv_load_u32(lhs);
        npyv_u32 l1 = npyv_load_u32(lhs + npyv_nlanes_u32);
        npyv_u32 l2 = npyv_load_u32(lhs + npyv_nlanes_u32 * 2);
        npyv_u32 l3 = npyv_load_u32(lhs + npyv_nlanes_u32 * 3);
        npyv_b32 m0 = npyv_cmplt_u32(l0, rhs_v);
        npyv_b32 m1 = npyv_cmplt_u32(l1, rhs_v);
        npyv_b32 m2 = npyv_cmplt_u32(l2, rhs_v);
        npyv_b32 m3 = npyv_cmplt_u32(l3, rhs_v);

        // AND the converted mask with 0x1 before storing it.
        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
        npyv_store_u8(out, npyv_and_u8(packed, one));

        len -= step;
        lhs += step;
        out += step;
    }

    // Scalar tail: whatever did not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] < rhs;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 0) && 0)
/*
 * Element-wise `lhs <= rhs` over two arrays of u32 elements, storing the
 * outcome as one byte per element.
 *
 * args[0]: left-hand input array.
 * args[1]: right-hand input array.
 * args[2]: output bytes (1 where the comparison holds, otherwise 0).
 * len:     number of elements to process.
 */
static void simd_binary_less_equal_u32(char **args, npy_intp len)
{
    npyv_lanetype_u32 *lhs = (npyv_lanetype_u32 *) args[0];
    npyv_lanetype_u32 *rhs = (npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const int step         = npyv_nlanes_u8;
    const npyv_u8 one      = npyv_setall_u8(0x1);

    // Vector part: every pass consumes 'step' elements by loading four
    // u32 vectors per input and produces a single u8 vector of results.
    while (len >= step) {
        npyv_u32 x1 = npyv_load_u32(lhs);
        npyv_u32 y1 = npyv_load_u32(rhs);
        npyv_u32 x2 = npyv_load_u32(lhs + npyv_nlanes_u32);
        npyv_u32 y2 = npyv_load_u32(rhs + npyv_nlanes_u32);
        npyv_u32 x3 = npyv_load_u32(lhs + npyv_nlanes_u32 * 2);
        npyv_u32 y3 = npyv_load_u32(rhs + npyv_nlanes_u32 * 2);
        npyv_u32 x4 = npyv_load_u32(lhs + npyv_nlanes_u32 * 3);
        npyv_u32 y4 = npyv_load_u32(rhs + npyv_nlanes_u32 * 3);

        npyv_b32 m1 = npyv_cmple_u32(x1, y1);
        npyv_b32 m2 = npyv_cmple_u32(x2, y2);
        npyv_b32 m3 = npyv_cmple_u32(x3, y3);
        npyv_b32 m4 = npyv_cmple_u32(x4, y4);

        // Narrow the four 32-bit masks into one byte vector and keep only
        // bit 0 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        lhs += step;
        rhs += step;
        out += step;
        len -= step;
    }

    // Scalar tail for the elements that do not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] <= rhs[i];
    }
}

/*
 * Evaluates `value <= element` for a single u32 value against every u32
 * element of an array and stores the outcome as one byte per element.
 *
 * args[0]: pointer to the value used as the left-hand operand.
 * args[1]: input array.
 * args[2]: output bytes (1 where the comparison holds, otherwise 0).
 * len:     number of elements to process.
 */
static void simd_binary_scalar1_less_equal_u32(char **args, npy_intp len)
{
    npyv_lanetype_u32 lhs = *(npyv_lanetype_u32 *) args[0];
    npyv_lanetype_u32 *in = (npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const int step        = npyv_nlanes_u8;
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const npyv_u32 vlhs   = npyv_setall_u32(lhs);

    // Vector part: every pass consumes 'step' elements by loading four
    // u32 vectors and produces a single u8 vector of results.
    while (len >= step) {
        npyv_u32 y1 = npyv_load_u32(in);
        npyv_u32 y2 = npyv_load_u32(in + npyv_nlanes_u32);
        npyv_u32 y3 = npyv_load_u32(in + npyv_nlanes_u32 * 2);
        npyv_u32 y4 = npyv_load_u32(in + npyv_nlanes_u32 * 3);

        npyv_b32 m1 = npyv_cmple_u32(vlhs, y1);
        npyv_b32 m2 = npyv_cmple_u32(vlhs, y2);
        npyv_b32 m3 = npyv_cmple_u32(vlhs, y3);
        npyv_b32 m4 = npyv_cmple_u32(vlhs, y4);

        // Narrow the four 32-bit masks into one byte vector and keep only
        // bit 0 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for the elements that do not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs <= in[i];
    }
}

/*
 * Evaluates `element <= value` for every u32 element of an array against a
 * single u32 value and stores the outcome as one byte per element.
 *
 * args[0]: input array.
 * args[1]: pointer to the value used as the right-hand operand.
 * args[2]: output bytes (1 where the comparison holds, otherwise 0).
 * len:     number of elements to process.
 */
static void simd_binary_scalar2_less_equal_u32(char **args, npy_intp len)
{
    npyv_lanetype_u32 *in = (npyv_lanetype_u32 *) args[0];
    npyv_lanetype_u32 rhs = *(npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const int step        = npyv_nlanes_u8;
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const npyv_u32 vrhs   = npyv_setall_u32(rhs);

    // Vector part: every pass consumes 'step' elements by loading four
    // u32 vectors and produces a single u8 vector of results.
    while (len >= step) {
        npyv_u32 x1 = npyv_load_u32(in);
        npyv_u32 x2 = npyv_load_u32(in + npyv_nlanes_u32);
        npyv_u32 x3 = npyv_load_u32(in + npyv_nlanes_u32 * 2);
        npyv_u32 x4 = npyv_load_u32(in + npyv_nlanes_u32 * 3);

        npyv_b32 m1 = npyv_cmple_u32(x1, vrhs);
        npyv_b32 m2 = npyv_cmple_u32(x2, vrhs);
        npyv_b32 m3 = npyv_cmple_u32(x3, vrhs);
        npyv_b32 m4 = npyv_cmple_u32(x4, vrhs);

        // Narrow the four 32-bit masks into one byte vector and keep only
        // bit 0 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for the elements that do not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] <= rhs;
    }
}
#endif



#line 28
#line 35
#if NPY_SIMD && !((1 || 0) && 1)
/*
 * Element-wise `lhs == rhs` over two arrays of s32 elements, storing the
 * outcome as one byte per element.
 *
 * args[0]: left-hand input array.
 * args[1]: right-hand input array.
 * args[2]: output bytes (1 where the comparison holds, otherwise 0).
 * len:     number of elements to process.
 *
 * NOTE: the enclosing preprocessor guard is constant-false for this
 * instantiation, so this function is excluded from the build.
 */
static void simd_binary_equal_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 *lhs = (npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 *rhs = (npyv_lanetype_s32 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const int step         = npyv_nlanes_u8;
    const npyv_u8 one      = npyv_setall_u8(0x1);

    // Vector part: every pass consumes 'step' elements by loading four
    // s32 vectors per input and produces a single u8 vector of results.
    while (len >= step) {
        npyv_s32 x1 = npyv_load_s32(lhs);
        npyv_s32 y1 = npyv_load_s32(rhs);
        npyv_s32 x2 = npyv_load_s32(lhs + npyv_nlanes_s32);
        npyv_s32 y2 = npyv_load_s32(rhs + npyv_nlanes_s32);
        npyv_s32 x3 = npyv_load_s32(lhs + npyv_nlanes_s32 * 2);
        npyv_s32 y3 = npyv_load_s32(rhs + npyv_nlanes_s32 * 2);
        npyv_s32 x4 = npyv_load_s32(lhs + npyv_nlanes_s32 * 3);
        npyv_s32 y4 = npyv_load_s32(rhs + npyv_nlanes_s32 * 3);

        npyv_b32 m1 = npyv_cmpeq_s32(x1, y1);
        npyv_b32 m2 = npyv_cmpeq_s32(x2, y2);
        npyv_b32 m3 = npyv_cmpeq_s32(x3, y3);
        npyv_b32 m4 = npyv_cmpeq_s32(x4, y4);

        // Narrow the four 32-bit masks into one byte vector and keep only
        // bit 0 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        lhs += step;
        rhs += step;
        out += step;
        len -= step;
    }

    // Scalar tail for the elements that do not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] == rhs[i];
    }
}

/*
 * Evaluates `value == element` for a single s32 value against every s32
 * element of an array and stores the outcome as one byte per element.
 *
 * args[0]: pointer to the value used as the left-hand operand.
 * args[1]: input array.
 * args[2]: output bytes (1 where the comparison holds, otherwise 0).
 * len:     number of elements to process.
 *
 * NOTE: the enclosing preprocessor guard is constant-false for this
 * instantiation, so this function is excluded from the build.
 */
static void simd_binary_scalar1_equal_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 lhs = *(npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 *in = (npyv_lanetype_s32 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const int step        = npyv_nlanes_u8;
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const npyv_s32 vlhs   = npyv_setall_s32(lhs);

    // Vector part: every pass consumes 'step' elements by loading four
    // s32 vectors and produces a single u8 vector of results.
    while (len >= step) {
        npyv_s32 y1 = npyv_load_s32(in);
        npyv_s32 y2 = npyv_load_s32(in + npyv_nlanes_s32);
        npyv_s32 y3 = npyv_load_s32(in + npyv_nlanes_s32 * 2);
        npyv_s32 y4 = npyv_load_s32(in + npyv_nlanes_s32 * 3);

        npyv_b32 m1 = npyv_cmpeq_s32(vlhs, y1);
        npyv_b32 m2 = npyv_cmpeq_s32(vlhs, y2);
        npyv_b32 m3 = npyv_cmpeq_s32(vlhs, y3);
        npyv_b32 m4 = npyv_cmpeq_s32(vlhs, y4);

        // Narrow the four 32-bit masks into one byte vector and keep only
        // bit 0 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for the elements that do not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs == in[i];
    }
}

/*
 * Evaluates `element == value` for every s32 element of an array against a
 * single s32 value and stores the outcome as one byte per element.
 *
 * args[0]: input array.
 * args[1]: pointer to the value used as the right-hand operand.
 * args[2]: output bytes (1 where the comparison holds, otherwise 0).
 * len:     number of elements to process.
 *
 * NOTE: the enclosing preprocessor guard is constant-false for this
 * instantiation, so this function is excluded from the build.
 */
static void simd_binary_scalar2_equal_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 *in = (npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 rhs = *(npyv_lanetype_s32 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const int step        = npyv_nlanes_u8;
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const npyv_s32 vrhs   = npyv_setall_s32(rhs);

    // Vector part: every pass consumes 'step' elements by loading four
    // s32 vectors and produces a single u8 vector of results.
    while (len >= step) {
        npyv_s32 x1 = npyv_load_s32(in);
        npyv_s32 x2 = npyv_load_s32(in + npyv_nlanes_s32);
        npyv_s32 x3 = npyv_load_s32(in + npyv_nlanes_s32 * 2);
        npyv_s32 x4 = npyv_load_s32(in + npyv_nlanes_s32 * 3);

        npyv_b32 m1 = npyv_cmpeq_s32(x1, vrhs);
        npyv_b32 m2 = npyv_cmpeq_s32(x2, vrhs);
        npyv_b32 m3 = npyv_cmpeq_s32(x3, vrhs);
        npyv_b32 m4 = npyv_cmpeq_s32(x4, vrhs);

        // Narrow the four 32-bit masks into one byte vector and keep only
        // bit 0 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for the elements that do not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] == rhs;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 1) && 1)
/*
 * Element-wise `lhs != rhs` over two arrays of s32 elements, storing the
 * outcome as one byte per element.
 *
 * args[0]: left-hand input array.
 * args[1]: right-hand input array.
 * args[2]: output bytes (1 where the comparison holds, otherwise 0).
 * len:     number of elements to process.
 *
 * NOTE: the enclosing preprocessor guard is constant-false for this
 * instantiation, so this function is excluded from the build.
 */
static void simd_binary_not_equal_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 *lhs = (npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 *rhs = (npyv_lanetype_s32 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const int step         = npyv_nlanes_u8;
    const npyv_u8 one      = npyv_setall_u8(0x1);

    // Vector part: every pass consumes 'step' elements by loading four
    // s32 vectors per input and produces a single u8 vector of results.
    while (len >= step) {
        npyv_s32 x1 = npyv_load_s32(lhs);
        npyv_s32 y1 = npyv_load_s32(rhs);
        npyv_s32 x2 = npyv_load_s32(lhs + npyv_nlanes_s32);
        npyv_s32 y2 = npyv_load_s32(rhs + npyv_nlanes_s32);
        npyv_s32 x3 = npyv_load_s32(lhs + npyv_nlanes_s32 * 2);
        npyv_s32 y3 = npyv_load_s32(rhs + npyv_nlanes_s32 * 2);
        npyv_s32 x4 = npyv_load_s32(lhs + npyv_nlanes_s32 * 3);
        npyv_s32 y4 = npyv_load_s32(rhs + npyv_nlanes_s32 * 3);

        npyv_b32 m1 = npyv_cmpneq_s32(x1, y1);
        npyv_b32 m2 = npyv_cmpneq_s32(x2, y2);
        npyv_b32 m3 = npyv_cmpneq_s32(x3, y3);
        npyv_b32 m4 = npyv_cmpneq_s32(x4, y4);

        // Narrow the four 32-bit masks into one byte vector and keep only
        // bit 0 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        lhs += step;
        rhs += step;
        out += step;
        len -= step;
    }

    // Scalar tail for the elements that do not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] != rhs[i];
    }
}

/*
 * Evaluates `value != element` for a single s32 value against every s32
 * element of an array and stores the outcome as one byte per element.
 *
 * args[0]: pointer to the value used as the left-hand operand.
 * args[1]: input array.
 * args[2]: output bytes (1 where the comparison holds, otherwise 0).
 * len:     number of elements to process.
 *
 * NOTE: the enclosing preprocessor guard is constant-false for this
 * instantiation, so this function is excluded from the build.
 */
static void simd_binary_scalar1_not_equal_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 lhs = *(npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 *in = (npyv_lanetype_s32 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const int step        = npyv_nlanes_u8;
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const npyv_s32 vlhs   = npyv_setall_s32(lhs);

    // Vector part: every pass consumes 'step' elements by loading four
    // s32 vectors and produces a single u8 vector of results.
    while (len >= step) {
        npyv_s32 y1 = npyv_load_s32(in);
        npyv_s32 y2 = npyv_load_s32(in + npyv_nlanes_s32);
        npyv_s32 y3 = npyv_load_s32(in + npyv_nlanes_s32 * 2);
        npyv_s32 y4 = npyv_load_s32(in + npyv_nlanes_s32 * 3);

        npyv_b32 m1 = npyv_cmpneq_s32(vlhs, y1);
        npyv_b32 m2 = npyv_cmpneq_s32(vlhs, y2);
        npyv_b32 m3 = npyv_cmpneq_s32(vlhs, y3);
        npyv_b32 m4 = npyv_cmpneq_s32(vlhs, y4);

        // Narrow the four 32-bit masks into one byte vector and keep only
        // bit 0 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for the elements that do not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs != in[i];
    }
}

/*
 * Evaluates `element != value` for every s32 element of an array against a
 * single s32 value and stores the outcome as one byte per element.
 *
 * args[0]: input array.
 * args[1]: pointer to the value used as the right-hand operand.
 * args[2]: output bytes (1 where the comparison holds, otherwise 0).
 * len:     number of elements to process.
 *
 * NOTE: the enclosing preprocessor guard is constant-false for this
 * instantiation, so this function is excluded from the build.
 */
static void simd_binary_scalar2_not_equal_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 *in = (npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 rhs = *(npyv_lanetype_s32 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const int step        = npyv_nlanes_u8;
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const npyv_s32 vrhs   = npyv_setall_s32(rhs);

    // Vector part: every pass consumes 'step' elements by loading four
    // s32 vectors and produces a single u8 vector of results.
    while (len >= step) {
        npyv_s32 x1 = npyv_load_s32(in);
        npyv_s32 x2 = npyv_load_s32(in + npyv_nlanes_s32);
        npyv_s32 x3 = npyv_load_s32(in + npyv_nlanes_s32 * 2);
        npyv_s32 x4 = npyv_load_s32(in + npyv_nlanes_s32 * 3);

        npyv_b32 m1 = npyv_cmpneq_s32(x1, vrhs);
        npyv_b32 m2 = npyv_cmpneq_s32(x2, vrhs);
        npyv_b32 m3 = npyv_cmpneq_s32(x3, vrhs);
        npyv_b32 m4 = npyv_cmpneq_s32(x4, vrhs);

        // Narrow the four 32-bit masks into one byte vector and keep only
        // bit 0 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for the elements that do not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] != rhs;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 0) && 1)
/*
 * Element-wise `lhs < rhs` over two arrays of s32 elements, storing the
 * outcome as one byte per element.
 *
 * args[0]: left-hand input array.
 * args[1]: right-hand input array.
 * args[2]: output bytes (1 where the comparison holds, otherwise 0).
 * len:     number of elements to process.
 */
static void simd_binary_less_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 *lhs = (npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 *rhs = (npyv_lanetype_s32 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const int step         = npyv_nlanes_u8;
    const npyv_u8 one      = npyv_setall_u8(0x1);

    // Vector part: every pass consumes 'step' elements by loading four
    // s32 vectors per input and produces a single u8 vector of results.
    while (len >= step) {
        npyv_s32 x1 = npyv_load_s32(lhs);
        npyv_s32 y1 = npyv_load_s32(rhs);
        npyv_s32 x2 = npyv_load_s32(lhs + npyv_nlanes_s32);
        npyv_s32 y2 = npyv_load_s32(rhs + npyv_nlanes_s32);
        npyv_s32 x3 = npyv_load_s32(lhs + npyv_nlanes_s32 * 2);
        npyv_s32 y3 = npyv_load_s32(rhs + npyv_nlanes_s32 * 2);
        npyv_s32 x4 = npyv_load_s32(lhs + npyv_nlanes_s32 * 3);
        npyv_s32 y4 = npyv_load_s32(rhs + npyv_nlanes_s32 * 3);

        npyv_b32 m1 = npyv_cmplt_s32(x1, y1);
        npyv_b32 m2 = npyv_cmplt_s32(x2, y2);
        npyv_b32 m3 = npyv_cmplt_s32(x3, y3);
        npyv_b32 m4 = npyv_cmplt_s32(x4, y4);

        // Narrow the four 32-bit masks into one byte vector and keep only
        // bit 0 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        lhs += step;
        rhs += step;
        out += step;
        len -= step;
    }

    // Scalar tail for the elements that do not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] < rhs[i];
    }
}

/*
 * Evaluates `value < element` for a single s32 value against every s32
 * element of an array and stores the outcome as one byte per element.
 *
 * args[0]: pointer to the value used as the left-hand operand.
 * args[1]: input array.
 * args[2]: output bytes (1 where the comparison holds, otherwise 0).
 * len:     number of elements to process.
 */
static void simd_binary_scalar1_less_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 lhs = *(npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 *in = (npyv_lanetype_s32 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const int step        = npyv_nlanes_u8;
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const npyv_s32 vlhs   = npyv_setall_s32(lhs);

    // Vector part: every pass consumes 'step' elements by loading four
    // s32 vectors and produces a single u8 vector of results.
    while (len >= step) {
        npyv_s32 y1 = npyv_load_s32(in);
        npyv_s32 y2 = npyv_load_s32(in + npyv_nlanes_s32);
        npyv_s32 y3 = npyv_load_s32(in + npyv_nlanes_s32 * 2);
        npyv_s32 y4 = npyv_load_s32(in + npyv_nlanes_s32 * 3);

        npyv_b32 m1 = npyv_cmplt_s32(vlhs, y1);
        npyv_b32 m2 = npyv_cmplt_s32(vlhs, y2);
        npyv_b32 m3 = npyv_cmplt_s32(vlhs, y3);
        npyv_b32 m4 = npyv_cmplt_s32(vlhs, y4);

        // Narrow the four 32-bit masks into one byte vector and keep only
        // bit 0 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for the elements that do not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs < in[i];
    }
}

/*
 * Evaluates `element < value` for every s32 element of an array against a
 * single s32 value and stores the outcome as one byte per element.
 *
 * args[0]: input array.
 * args[1]: pointer to the value used as the right-hand operand.
 * args[2]: output bytes (1 where the comparison holds, otherwise 0).
 * len:     number of elements to process.
 */
static void simd_binary_scalar2_less_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 *in = (npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 rhs = *(npyv_lanetype_s32 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const int step        = npyv_nlanes_u8;
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const npyv_s32 vrhs   = npyv_setall_s32(rhs);

    // Vector part: every pass consumes 'step' elements by loading four
    // s32 vectors and produces a single u8 vector of results.
    while (len >= step) {
        npyv_s32 x1 = npyv_load_s32(in);
        npyv_s32 x2 = npyv_load_s32(in + npyv_nlanes_s32);
        npyv_s32 x3 = npyv_load_s32(in + npyv_nlanes_s32 * 2);
        npyv_s32 x4 = npyv_load_s32(in + npyv_nlanes_s32 * 3);

        npyv_b32 m1 = npyv_cmplt_s32(x1, vrhs);
        npyv_b32 m2 = npyv_cmplt_s32(x2, vrhs);
        npyv_b32 m3 = npyv_cmplt_s32(x3, vrhs);
        npyv_b32 m4 = npyv_cmplt_s32(x4, vrhs);

        // Narrow the four 32-bit masks into one byte vector and keep only
        // bit 0 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for the elements that do not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] < rhs;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 0) && 1)
/*
 * Element-wise `lhs <= rhs` over two arrays of s32 elements, storing the
 * outcome as one byte per element.
 *
 * args[0]: left-hand input array.
 * args[1]: right-hand input array.
 * args[2]: output bytes (1 where the comparison holds, otherwise 0).
 * len:     number of elements to process.
 */
static void simd_binary_less_equal_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 *lhs = (npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 *rhs = (npyv_lanetype_s32 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const int step         = npyv_nlanes_u8;
    const npyv_u8 one      = npyv_setall_u8(0x1);

    // Vector part: every pass consumes 'step' elements by loading four
    // s32 vectors per input and produces a single u8 vector of results.
    while (len >= step) {
        npyv_s32 x1 = npyv_load_s32(lhs);
        npyv_s32 y1 = npyv_load_s32(rhs);
        npyv_s32 x2 = npyv_load_s32(lhs + npyv_nlanes_s32);
        npyv_s32 y2 = npyv_load_s32(rhs + npyv_nlanes_s32);
        npyv_s32 x3 = npyv_load_s32(lhs + npyv_nlanes_s32 * 2);
        npyv_s32 y3 = npyv_load_s32(rhs + npyv_nlanes_s32 * 2);
        npyv_s32 x4 = npyv_load_s32(lhs + npyv_nlanes_s32 * 3);
        npyv_s32 y4 = npyv_load_s32(rhs + npyv_nlanes_s32 * 3);

        npyv_b32 m1 = npyv_cmple_s32(x1, y1);
        npyv_b32 m2 = npyv_cmple_s32(x2, y2);
        npyv_b32 m3 = npyv_cmple_s32(x3, y3);
        npyv_b32 m4 = npyv_cmple_s32(x4, y4);

        // Narrow the four 32-bit masks into one byte vector and keep only
        // bit 0 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        lhs += step;
        rhs += step;
        out += step;
        len -= step;
    }

    // Scalar tail for the elements that do not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] <= rhs[i];
    }
}

/*
 * Evaluates `value <= element` for a single s32 value against every s32
 * element of an array and stores the outcome as one byte per element.
 *
 * args[0]: pointer to the value used as the left-hand operand.
 * args[1]: input array.
 * args[2]: output bytes (1 where the comparison holds, otherwise 0).
 * len:     number of elements to process.
 */
static void simd_binary_scalar1_less_equal_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 lhs = *(npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 *in = (npyv_lanetype_s32 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const int step        = npyv_nlanes_u8;
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const npyv_s32 vlhs   = npyv_setall_s32(lhs);

    // Vector part: every pass consumes 'step' elements by loading four
    // s32 vectors and produces a single u8 vector of results.
    while (len >= step) {
        npyv_s32 y1 = npyv_load_s32(in);
        npyv_s32 y2 = npyv_load_s32(in + npyv_nlanes_s32);
        npyv_s32 y3 = npyv_load_s32(in + npyv_nlanes_s32 * 2);
        npyv_s32 y4 = npyv_load_s32(in + npyv_nlanes_s32 * 3);

        npyv_b32 m1 = npyv_cmple_s32(vlhs, y1);
        npyv_b32 m2 = npyv_cmple_s32(vlhs, y2);
        npyv_b32 m3 = npyv_cmple_s32(vlhs, y3);
        npyv_b32 m4 = npyv_cmple_s32(vlhs, y4);

        // Narrow the four 32-bit masks into one byte vector and keep only
        // bit 0 so that every stored byte is either 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += step;
        out += step;
        len -= step;
    }

    // Scalar tail for the elements that do not fill a whole vector pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs <= in[i];
    }
}

/*
 * Contiguous `array <= scalar` for `npyv_lanetype_s32` elements.
 *
 * args[0] points at the left-hand input, args[1] at the single right-hand
 * value and args[2] at the byte output; `len` is the element count. Every
 * output byte is stored as 0 or 1.
 */
static void simd_binary_scalar2_less_equal_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 *lhs = (npyv_lanetype_s32 *) args[0];
    const npyv_lanetype_s32 rhs_value = *(npyv_lanetype_s32 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_s32 rhs_vec = npyv_setall_s32(rhs_value);
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;   // elements handled per pass
    const int lanes        = npyv_nlanes_s32;  // elements per input vector

    // Each pass compares four input vectors and packs the four masks into
    // one u8 vector.
    while (len >= chunk) {
        npyv_s32 x1 = npyv_load_s32(lhs + lanes * 0);
        npyv_b32 m1 = npyv_cmple_s32(x1, rhs_vec);
        npyv_s32 x2 = npyv_load_s32(lhs + lanes * 1);
        npyv_b32 m2 = npyv_cmple_s32(x2, rhs_vec);
        npyv_s32 x3 = npyv_load_s32(lhs + lanes * 2);
        npyv_s32 x4 = npyv_load_s32(lhs + lanes * 3);
        npyv_b32 m3 = npyv_cmple_s32(x3, rhs_vec);
        npyv_b32 m4 = npyv_cmple_s32(x4, rhs_vec);

        npyv_b8 packed = npyv_pack_b8_b32(m1, m2, m3, m4);
        npyv_u8 bytes  = npyv_cvt_u8_b8(packed);
        // AND with 0x1 so each stored byte is 0 or 1.
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        lhs += chunk;
        out += chunk;
    }

    // Scalar tail for the elements that do not fill a whole pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] <= rhs_value;
    }
}
#endif



#line 28
#line 35
#if NPY_SIMD && !((1 || 0) && 0)
/*
 * Contiguous `array == array` for `npyv_lanetype_u64` elements.
 *
 * args[0] and args[1] point at the two inputs and args[2] at the byte
 * output; `len` is the element count. Every output byte is stored as 0 or 1.
 */
static void simd_binary_equal_u64(char **args, npy_intp len)
{
    npyv_lanetype_u64 *lhs = (npyv_lanetype_u64 *) args[0];
    npyv_lanetype_u64 *rhs = (npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;   // elements handled per pass
    const int lanes        = npyv_nlanes_u64;  // elements per input vector

    // Each pass compares eight vector pairs and packs the eight masks into
    // one u8 vector.
    while (len >= chunk) {
        npyv_u64 x1 = npyv_load_u64(lhs + lanes * 0);
        npyv_u64 y1 = npyv_load_u64(rhs + lanes * 0);
        npyv_b64 m1 = npyv_cmpeq_u64(x1, y1);
        npyv_u64 x2 = npyv_load_u64(lhs + lanes * 1);
        npyv_u64 y2 = npyv_load_u64(rhs + lanes * 1);
        npyv_b64 m2 = npyv_cmpeq_u64(x2, y2);
        npyv_u64 x3 = npyv_load_u64(lhs + lanes * 2);
        npyv_u64 y3 = npyv_load_u64(rhs + lanes * 2);
        npyv_u64 x4 = npyv_load_u64(lhs + lanes * 3);
        npyv_u64 y4 = npyv_load_u64(rhs + lanes * 3);
        npyv_b64 m3 = npyv_cmpeq_u64(x3, y3);
        npyv_b64 m4 = npyv_cmpeq_u64(x4, y4);
        npyv_u64 x5 = npyv_load_u64(lhs + lanes * 4);
        npyv_u64 y5 = npyv_load_u64(rhs + lanes * 4);
        npyv_u64 x6 = npyv_load_u64(lhs + lanes * 5);
        npyv_u64 y6 = npyv_load_u64(rhs + lanes * 5);
        npyv_u64 x7 = npyv_load_u64(lhs + lanes * 6);
        npyv_u64 y7 = npyv_load_u64(rhs + lanes * 6);
        npyv_u64 x8 = npyv_load_u64(lhs + lanes * 7);
        npyv_u64 y8 = npyv_load_u64(rhs + lanes * 7);
        npyv_b64 m5 = npyv_cmpeq_u64(x5, y5);
        npyv_b64 m6 = npyv_cmpeq_u64(x6, y6);
        npyv_b64 m7 = npyv_cmpeq_u64(x7, y7);
        npyv_b64 m8 = npyv_cmpeq_u64(x8, y8);

        npyv_b8 packed = npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8);
        npyv_u8 bytes  = npyv_cvt_u8_b8(packed);
        // AND with 0x1 so each stored byte is 0 or 1.
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar tail for the elements that do not fill a whole pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] == rhs[i];
    }
}

/*
 * Contiguous `scalar == array` for `npyv_lanetype_u64` elements.
 *
 * args[0] points at the single left-hand value, args[1] at the right-hand
 * input and args[2] at the byte output; `len` is the element count. Every
 * output byte is stored as 0 or 1.
 */
static void simd_binary_scalar1_equal_u64(char **args, npy_intp len)
{
    const npyv_lanetype_u64 lhs_value = *(npyv_lanetype_u64 *) args[0];
    npyv_lanetype_u64 *rhs = (npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u64 lhs_vec = npyv_setall_u64(lhs_value);
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;   // elements handled per pass
    const int lanes        = npyv_nlanes_u64;  // elements per input vector

    // Each pass compares eight input vectors and packs the eight masks into
    // one u8 vector.
    while (len >= chunk) {
        npyv_u64 y1 = npyv_load_u64(rhs + lanes * 0);
        npyv_b64 m1 = npyv_cmpeq_u64(lhs_vec, y1);
        npyv_u64 y2 = npyv_load_u64(rhs + lanes * 1);
        npyv_b64 m2 = npyv_cmpeq_u64(lhs_vec, y2);
        npyv_u64 y3 = npyv_load_u64(rhs + lanes * 2);
        npyv_u64 y4 = npyv_load_u64(rhs + lanes * 3);
        npyv_b64 m3 = npyv_cmpeq_u64(lhs_vec, y3);
        npyv_b64 m4 = npyv_cmpeq_u64(lhs_vec, y4);
        npyv_u64 y5 = npyv_load_u64(rhs + lanes * 4);
        npyv_u64 y6 = npyv_load_u64(rhs + lanes * 5);
        npyv_u64 y7 = npyv_load_u64(rhs + lanes * 6);
        npyv_u64 y8 = npyv_load_u64(rhs + lanes * 7);
        npyv_b64 m5 = npyv_cmpeq_u64(lhs_vec, y5);
        npyv_b64 m6 = npyv_cmpeq_u64(lhs_vec, y6);
        npyv_b64 m7 = npyv_cmpeq_u64(lhs_vec, y7);
        npyv_b64 m8 = npyv_cmpeq_u64(lhs_vec, y8);

        npyv_b8 packed = npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8);
        npyv_u8 bytes  = npyv_cvt_u8_b8(packed);
        // AND with 0x1 so each stored byte is 0 or 1.
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar tail for the elements that do not fill a whole pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs_value == rhs[i];
    }
}

/*
 * Contiguous `array == scalar` for `npyv_lanetype_u64` elements.
 *
 * args[0] points at the left-hand input, args[1] at the single right-hand
 * value and args[2] at the byte output; `len` is the element count. Every
 * output byte is stored as 0 or 1.
 */
static void simd_binary_scalar2_equal_u64(char **args, npy_intp len)
{
    npyv_lanetype_u64 *lhs = (npyv_lanetype_u64 *) args[0];
    const npyv_lanetype_u64 rhs_value = *(npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u64 rhs_vec = npyv_setall_u64(rhs_value);
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;   // elements handled per pass
    const int lanes        = npyv_nlanes_u64;  // elements per input vector

    // Each pass compares eight input vectors and packs the eight masks into
    // one u8 vector.
    while (len >= chunk) {
        npyv_u64 x1 = npyv_load_u64(lhs + lanes * 0);
        npyv_b64 m1 = npyv_cmpeq_u64(x1, rhs_vec);
        npyv_u64 x2 = npyv_load_u64(lhs + lanes * 1);
        npyv_b64 m2 = npyv_cmpeq_u64(x2, rhs_vec);
        npyv_u64 x3 = npyv_load_u64(lhs + lanes * 2);
        npyv_u64 x4 = npyv_load_u64(lhs + lanes * 3);
        npyv_b64 m3 = npyv_cmpeq_u64(x3, rhs_vec);
        npyv_b64 m4 = npyv_cmpeq_u64(x4, rhs_vec);
        npyv_u64 x5 = npyv_load_u64(lhs + lanes * 4);
        npyv_u64 x6 = npyv_load_u64(lhs + lanes * 5);
        npyv_u64 x7 = npyv_load_u64(lhs + lanes * 6);
        npyv_u64 x8 = npyv_load_u64(lhs + lanes * 7);
        npyv_b64 m5 = npyv_cmpeq_u64(x5, rhs_vec);
        npyv_b64 m6 = npyv_cmpeq_u64(x6, rhs_vec);
        npyv_b64 m7 = npyv_cmpeq_u64(x7, rhs_vec);
        npyv_b64 m8 = npyv_cmpeq_u64(x8, rhs_vec);

        npyv_b8 packed = npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8);
        npyv_u8 bytes  = npyv_cvt_u8_b8(packed);
        // AND with 0x1 so each stored byte is 0 or 1.
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        lhs += chunk;
        out += chunk;
    }

    // Scalar tail for the elements that do not fill a whole pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] == rhs_value;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 1) && 0)
/*
 * Contiguous `array != array` for `npyv_lanetype_u64` elements.
 *
 * args[0] and args[1] point at the two inputs and args[2] at the byte
 * output; `len` is the element count. Every output byte is stored as 0 or 1.
 */
static void simd_binary_not_equal_u64(char **args, npy_intp len)
{
    npyv_lanetype_u64 *lhs = (npyv_lanetype_u64 *) args[0];
    npyv_lanetype_u64 *rhs = (npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;   // elements handled per pass
    const int lanes        = npyv_nlanes_u64;  // elements per input vector

    // Each pass compares eight vector pairs and packs the eight masks into
    // one u8 vector.
    while (len >= chunk) {
        npyv_u64 x1 = npyv_load_u64(lhs + lanes * 0);
        npyv_u64 y1 = npyv_load_u64(rhs + lanes * 0);
        npyv_b64 m1 = npyv_cmpneq_u64(x1, y1);
        npyv_u64 x2 = npyv_load_u64(lhs + lanes * 1);
        npyv_u64 y2 = npyv_load_u64(rhs + lanes * 1);
        npyv_b64 m2 = npyv_cmpneq_u64(x2, y2);
        npyv_u64 x3 = npyv_load_u64(lhs + lanes * 2);
        npyv_u64 y3 = npyv_load_u64(rhs + lanes * 2);
        npyv_u64 x4 = npyv_load_u64(lhs + lanes * 3);
        npyv_u64 y4 = npyv_load_u64(rhs + lanes * 3);
        npyv_b64 m3 = npyv_cmpneq_u64(x3, y3);
        npyv_b64 m4 = npyv_cmpneq_u64(x4, y4);
        npyv_u64 x5 = npyv_load_u64(lhs + lanes * 4);
        npyv_u64 y5 = npyv_load_u64(rhs + lanes * 4);
        npyv_u64 x6 = npyv_load_u64(lhs + lanes * 5);
        npyv_u64 y6 = npyv_load_u64(rhs + lanes * 5);
        npyv_u64 x7 = npyv_load_u64(lhs + lanes * 6);
        npyv_u64 y7 = npyv_load_u64(rhs + lanes * 6);
        npyv_u64 x8 = npyv_load_u64(lhs + lanes * 7);
        npyv_u64 y8 = npyv_load_u64(rhs + lanes * 7);
        npyv_b64 m5 = npyv_cmpneq_u64(x5, y5);
        npyv_b64 m6 = npyv_cmpneq_u64(x6, y6);
        npyv_b64 m7 = npyv_cmpneq_u64(x7, y7);
        npyv_b64 m8 = npyv_cmpneq_u64(x8, y8);

        npyv_b8 packed = npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8);
        npyv_u8 bytes  = npyv_cvt_u8_b8(packed);
        // AND with 0x1 so each stored byte is 0 or 1.
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar tail for the elements that do not fill a whole pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] != rhs[i];
    }
}

/*
 * Contiguous `scalar != array` for `npyv_lanetype_u64` elements.
 *
 * args[0] points at the single left-hand value, args[1] at the right-hand
 * input and args[2] at the byte output; `len` is the element count. Every
 * output byte is stored as 0 or 1.
 */
static void simd_binary_scalar1_not_equal_u64(char **args, npy_intp len)
{
    const npyv_lanetype_u64 lhs_value = *(npyv_lanetype_u64 *) args[0];
    npyv_lanetype_u64 *rhs = (npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u64 lhs_vec = npyv_setall_u64(lhs_value);
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;   // elements handled per pass
    const int lanes        = npyv_nlanes_u64;  // elements per input vector

    // Each pass compares eight input vectors and packs the eight masks into
    // one u8 vector.
    while (len >= chunk) {
        npyv_u64 y1 = npyv_load_u64(rhs + lanes * 0);
        npyv_b64 m1 = npyv_cmpneq_u64(lhs_vec, y1);
        npyv_u64 y2 = npyv_load_u64(rhs + lanes * 1);
        npyv_b64 m2 = npyv_cmpneq_u64(lhs_vec, y2);
        npyv_u64 y3 = npyv_load_u64(rhs + lanes * 2);
        npyv_u64 y4 = npyv_load_u64(rhs + lanes * 3);
        npyv_b64 m3 = npyv_cmpneq_u64(lhs_vec, y3);
        npyv_b64 m4 = npyv_cmpneq_u64(lhs_vec, y4);
        npyv_u64 y5 = npyv_load_u64(rhs + lanes * 4);
        npyv_u64 y6 = npyv_load_u64(rhs + lanes * 5);
        npyv_u64 y7 = npyv_load_u64(rhs + lanes * 6);
        npyv_u64 y8 = npyv_load_u64(rhs + lanes * 7);
        npyv_b64 m5 = npyv_cmpneq_u64(lhs_vec, y5);
        npyv_b64 m6 = npyv_cmpneq_u64(lhs_vec, y6);
        npyv_b64 m7 = npyv_cmpneq_u64(lhs_vec, y7);
        npyv_b64 m8 = npyv_cmpneq_u64(lhs_vec, y8);

        npyv_b8 packed = npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8);
        npyv_u8 bytes  = npyv_cvt_u8_b8(packed);
        // AND with 0x1 so each stored byte is 0 or 1.
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar tail for the elements that do not fill a whole pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs_value != rhs[i];
    }
}

/*
 * Contiguous `array != scalar` for `npyv_lanetype_u64` elements.
 *
 * args[0] points at the left-hand input, args[1] at the single right-hand
 * value and args[2] at the byte output; `len` is the element count. Every
 * output byte is stored as 0 or 1.
 */
static void simd_binary_scalar2_not_equal_u64(char **args, npy_intp len)
{
    npyv_lanetype_u64 *lhs = (npyv_lanetype_u64 *) args[0];
    const npyv_lanetype_u64 rhs_value = *(npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u64 rhs_vec = npyv_setall_u64(rhs_value);
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;   // elements handled per pass
    const int lanes        = npyv_nlanes_u64;  // elements per input vector

    // Each pass compares eight input vectors and packs the eight masks into
    // one u8 vector.
    while (len >= chunk) {
        npyv_u64 x1 = npyv_load_u64(lhs + lanes * 0);
        npyv_b64 m1 = npyv_cmpneq_u64(x1, rhs_vec);
        npyv_u64 x2 = npyv_load_u64(lhs + lanes * 1);
        npyv_b64 m2 = npyv_cmpneq_u64(x2, rhs_vec);
        npyv_u64 x3 = npyv_load_u64(lhs + lanes * 2);
        npyv_u64 x4 = npyv_load_u64(lhs + lanes * 3);
        npyv_b64 m3 = npyv_cmpneq_u64(x3, rhs_vec);
        npyv_b64 m4 = npyv_cmpneq_u64(x4, rhs_vec);
        npyv_u64 x5 = npyv_load_u64(lhs + lanes * 4);
        npyv_u64 x6 = npyv_load_u64(lhs + lanes * 5);
        npyv_u64 x7 = npyv_load_u64(lhs + lanes * 6);
        npyv_u64 x8 = npyv_load_u64(lhs + lanes * 7);
        npyv_b64 m5 = npyv_cmpneq_u64(x5, rhs_vec);
        npyv_b64 m6 = npyv_cmpneq_u64(x6, rhs_vec);
        npyv_b64 m7 = npyv_cmpneq_u64(x7, rhs_vec);
        npyv_b64 m8 = npyv_cmpneq_u64(x8, rhs_vec);

        npyv_b8 packed = npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8);
        npyv_u8 bytes  = npyv_cvt_u8_b8(packed);
        // AND with 0x1 so each stored byte is 0 or 1.
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        lhs += chunk;
        out += chunk;
    }

    // Scalar tail for the elements that do not fill a whole pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] != rhs_value;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 0) && 0)
/*
 * Contiguous `array < array` for `npyv_lanetype_u64` elements.
 *
 * args[0] and args[1] point at the two inputs and args[2] at the byte
 * output; `len` is the element count. Every output byte is stored as 0 or 1.
 */
static void simd_binary_less_u64(char **args, npy_intp len)
{
    npyv_lanetype_u64 *lhs = (npyv_lanetype_u64 *) args[0];
    npyv_lanetype_u64 *rhs = (npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;   // elements handled per pass
    const int lanes        = npyv_nlanes_u64;  // elements per input vector

    // Each pass compares eight vector pairs and packs the eight masks into
    // one u8 vector.
    while (len >= chunk) {
        npyv_u64 x1 = npyv_load_u64(lhs + lanes * 0);
        npyv_u64 y1 = npyv_load_u64(rhs + lanes * 0);
        npyv_b64 m1 = npyv_cmplt_u64(x1, y1);
        npyv_u64 x2 = npyv_load_u64(lhs + lanes * 1);
        npyv_u64 y2 = npyv_load_u64(rhs + lanes * 1);
        npyv_b64 m2 = npyv_cmplt_u64(x2, y2);
        npyv_u64 x3 = npyv_load_u64(lhs + lanes * 2);
        npyv_u64 y3 = npyv_load_u64(rhs + lanes * 2);
        npyv_u64 x4 = npyv_load_u64(lhs + lanes * 3);
        npyv_u64 y4 = npyv_load_u64(rhs + lanes * 3);
        npyv_b64 m3 = npyv_cmplt_u64(x3, y3);
        npyv_b64 m4 = npyv_cmplt_u64(x4, y4);
        npyv_u64 x5 = npyv_load_u64(lhs + lanes * 4);
        npyv_u64 y5 = npyv_load_u64(rhs + lanes * 4);
        npyv_u64 x6 = npyv_load_u64(lhs + lanes * 5);
        npyv_u64 y6 = npyv_load_u64(rhs + lanes * 5);
        npyv_u64 x7 = npyv_load_u64(lhs + lanes * 6);
        npyv_u64 y7 = npyv_load_u64(rhs + lanes * 6);
        npyv_u64 x8 = npyv_load_u64(lhs + lanes * 7);
        npyv_u64 y8 = npyv_load_u64(rhs + lanes * 7);
        npyv_b64 m5 = npyv_cmplt_u64(x5, y5);
        npyv_b64 m6 = npyv_cmplt_u64(x6, y6);
        npyv_b64 m7 = npyv_cmplt_u64(x7, y7);
        npyv_b64 m8 = npyv_cmplt_u64(x8, y8);

        npyv_b8 packed = npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8);
        npyv_u8 bytes  = npyv_cvt_u8_b8(packed);
        // AND with 0x1 so each stored byte is 0 or 1.
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar tail for the elements that do not fill a whole pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] < rhs[i];
    }
}

/*
 * Contiguous `scalar < array` for `npyv_lanetype_u64` elements.
 *
 * args[0] points at the single left-hand value, args[1] at the right-hand
 * input and args[2] at the byte output; `len` is the element count. Every
 * output byte is stored as 0 or 1.
 */
static void simd_binary_scalar1_less_u64(char **args, npy_intp len)
{
    const npyv_lanetype_u64 lhs_value = *(npyv_lanetype_u64 *) args[0];
    npyv_lanetype_u64 *rhs = (npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u64 lhs_vec = npyv_setall_u64(lhs_value);
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;   // elements handled per pass
    const int lanes        = npyv_nlanes_u64;  // elements per input vector

    // Each pass compares eight input vectors and packs the eight masks into
    // one u8 vector.
    while (len >= chunk) {
        npyv_u64 y1 = npyv_load_u64(rhs + lanes * 0);
        npyv_b64 m1 = npyv_cmplt_u64(lhs_vec, y1);
        npyv_u64 y2 = npyv_load_u64(rhs + lanes * 1);
        npyv_b64 m2 = npyv_cmplt_u64(lhs_vec, y2);
        npyv_u64 y3 = npyv_load_u64(rhs + lanes * 2);
        npyv_u64 y4 = npyv_load_u64(rhs + lanes * 3);
        npyv_b64 m3 = npyv_cmplt_u64(lhs_vec, y3);
        npyv_b64 m4 = npyv_cmplt_u64(lhs_vec, y4);
        npyv_u64 y5 = npyv_load_u64(rhs + lanes * 4);
        npyv_u64 y6 = npyv_load_u64(rhs + lanes * 5);
        npyv_u64 y7 = npyv_load_u64(rhs + lanes * 6);
        npyv_u64 y8 = npyv_load_u64(rhs + lanes * 7);
        npyv_b64 m5 = npyv_cmplt_u64(lhs_vec, y5);
        npyv_b64 m6 = npyv_cmplt_u64(lhs_vec, y6);
        npyv_b64 m7 = npyv_cmplt_u64(lhs_vec, y7);
        npyv_b64 m8 = npyv_cmplt_u64(lhs_vec, y8);

        npyv_b8 packed = npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8);
        npyv_u8 bytes  = npyv_cvt_u8_b8(packed);
        // AND with 0x1 so each stored byte is 0 or 1.
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar tail for the elements that do not fill a whole pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs_value < rhs[i];
    }
}

/*
 * Contiguous `array < scalar` for `npyv_lanetype_u64` elements.
 *
 * args[0] points at the left-hand input, args[1] at the single right-hand
 * value and args[2] at the byte output; `len` is the element count. Every
 * output byte is stored as 0 or 1.
 */
static void simd_binary_scalar2_less_u64(char **args, npy_intp len)
{
    npyv_lanetype_u64 *lhs = (npyv_lanetype_u64 *) args[0];
    const npyv_lanetype_u64 rhs_value = *(npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u64 rhs_vec = npyv_setall_u64(rhs_value);
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;   // elements handled per pass
    const int lanes        = npyv_nlanes_u64;  // elements per input vector

    // Each pass compares eight input vectors and packs the eight masks into
    // one u8 vector.
    while (len >= chunk) {
        npyv_u64 x1 = npyv_load_u64(lhs + lanes * 0);
        npyv_b64 m1 = npyv_cmplt_u64(x1, rhs_vec);
        npyv_u64 x2 = npyv_load_u64(lhs + lanes * 1);
        npyv_b64 m2 = npyv_cmplt_u64(x2, rhs_vec);
        npyv_u64 x3 = npyv_load_u64(lhs + lanes * 2);
        npyv_u64 x4 = npyv_load_u64(lhs + lanes * 3);
        npyv_b64 m3 = npyv_cmplt_u64(x3, rhs_vec);
        npyv_b64 m4 = npyv_cmplt_u64(x4, rhs_vec);
        npyv_u64 x5 = npyv_load_u64(lhs + lanes * 4);
        npyv_u64 x6 = npyv_load_u64(lhs + lanes * 5);
        npyv_u64 x7 = npyv_load_u64(lhs + lanes * 6);
        npyv_u64 x8 = npyv_load_u64(lhs + lanes * 7);
        npyv_b64 m5 = npyv_cmplt_u64(x5, rhs_vec);
        npyv_b64 m6 = npyv_cmplt_u64(x6, rhs_vec);
        npyv_b64 m7 = npyv_cmplt_u64(x7, rhs_vec);
        npyv_b64 m8 = npyv_cmplt_u64(x8, rhs_vec);

        npyv_b8 packed = npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8);
        npyv_u8 bytes  = npyv_cvt_u8_b8(packed);
        // AND with 0x1 so each stored byte is 0 or 1.
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        lhs += chunk;
        out += chunk;
    }

    // Scalar tail for the elements that do not fill a whole pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] < rhs_value;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 0) && 0)
/*
 * Contiguous `array <= array` for `npyv_lanetype_u64` elements.
 *
 * args[0] and args[1] point at the two inputs and args[2] at the byte
 * output; `len` is the element count. Every output byte is stored as 0 or 1.
 */
static void simd_binary_less_equal_u64(char **args, npy_intp len)
{
    npyv_lanetype_u64 *lhs = (npyv_lanetype_u64 *) args[0];
    npyv_lanetype_u64 *rhs = (npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;   // elements handled per pass
    const int lanes        = npyv_nlanes_u64;  // elements per input vector

    // Each pass compares eight vector pairs and packs the eight masks into
    // one u8 vector.
    while (len >= chunk) {
        npyv_u64 x1 = npyv_load_u64(lhs + lanes * 0);
        npyv_u64 y1 = npyv_load_u64(rhs + lanes * 0);
        npyv_b64 m1 = npyv_cmple_u64(x1, y1);
        npyv_u64 x2 = npyv_load_u64(lhs + lanes * 1);
        npyv_u64 y2 = npyv_load_u64(rhs + lanes * 1);
        npyv_b64 m2 = npyv_cmple_u64(x2, y2);
        npyv_u64 x3 = npyv_load_u64(lhs + lanes * 2);
        npyv_u64 y3 = npyv_load_u64(rhs + lanes * 2);
        npyv_u64 x4 = npyv_load_u64(lhs + lanes * 3);
        npyv_u64 y4 = npyv_load_u64(rhs + lanes * 3);
        npyv_b64 m3 = npyv_cmple_u64(x3, y3);
        npyv_b64 m4 = npyv_cmple_u64(x4, y4);
        npyv_u64 x5 = npyv_load_u64(lhs + lanes * 4);
        npyv_u64 y5 = npyv_load_u64(rhs + lanes * 4);
        npyv_u64 x6 = npyv_load_u64(lhs + lanes * 5);
        npyv_u64 y6 = npyv_load_u64(rhs + lanes * 5);
        npyv_u64 x7 = npyv_load_u64(lhs + lanes * 6);
        npyv_u64 y7 = npyv_load_u64(rhs + lanes * 6);
        npyv_u64 x8 = npyv_load_u64(lhs + lanes * 7);
        npyv_u64 y8 = npyv_load_u64(rhs + lanes * 7);
        npyv_b64 m5 = npyv_cmple_u64(x5, y5);
        npyv_b64 m6 = npyv_cmple_u64(x6, y6);
        npyv_b64 m7 = npyv_cmple_u64(x7, y7);
        npyv_b64 m8 = npyv_cmple_u64(x8, y8);

        npyv_b8 packed = npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8);
        npyv_u8 bytes  = npyv_cvt_u8_b8(packed);
        // AND with 0x1 so each stored byte is 0 or 1.
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar tail for the elements that do not fill a whole pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] <= rhs[i];
    }
}

/*
 * Contiguous `scalar <= array` for `npyv_lanetype_u64` elements.
 *
 * args[0] points at the single left-hand value, args[1] at the right-hand
 * input and args[2] at the byte output; `len` is the element count. Every
 * output byte is stored as 0 or 1.
 */
static void simd_binary_scalar1_less_equal_u64(char **args, npy_intp len)
{
    const npyv_lanetype_u64 lhs_value = *(npyv_lanetype_u64 *) args[0];
    npyv_lanetype_u64 *rhs = (npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u64 lhs_vec = npyv_setall_u64(lhs_value);
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;   // elements handled per pass
    const int lanes        = npyv_nlanes_u64;  // elements per input vector

    // Each pass compares eight input vectors and packs the eight masks into
    // one u8 vector.
    while (len >= chunk) {
        npyv_u64 y1 = npyv_load_u64(rhs + lanes * 0);
        npyv_b64 m1 = npyv_cmple_u64(lhs_vec, y1);
        npyv_u64 y2 = npyv_load_u64(rhs + lanes * 1);
        npyv_b64 m2 = npyv_cmple_u64(lhs_vec, y2);
        npyv_u64 y3 = npyv_load_u64(rhs + lanes * 2);
        npyv_u64 y4 = npyv_load_u64(rhs + lanes * 3);
        npyv_b64 m3 = npyv_cmple_u64(lhs_vec, y3);
        npyv_b64 m4 = npyv_cmple_u64(lhs_vec, y4);
        npyv_u64 y5 = npyv_load_u64(rhs + lanes * 4);
        npyv_u64 y6 = npyv_load_u64(rhs + lanes * 5);
        npyv_u64 y7 = npyv_load_u64(rhs + lanes * 6);
        npyv_u64 y8 = npyv_load_u64(rhs + lanes * 7);
        npyv_b64 m5 = npyv_cmple_u64(lhs_vec, y5);
        npyv_b64 m6 = npyv_cmple_u64(lhs_vec, y6);
        npyv_b64 m7 = npyv_cmple_u64(lhs_vec, y7);
        npyv_b64 m8 = npyv_cmple_u64(lhs_vec, y8);

        npyv_b8 packed = npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8);
        npyv_u8 bytes  = npyv_cvt_u8_b8(packed);
        // AND with 0x1 so each stored byte is 0 or 1.
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar tail for the elements that do not fill a whole pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs_value <= rhs[i];
    }
}

/*
 * Contiguous `array <= scalar` for `npyv_lanetype_u64` elements.
 *
 * args[0] points at the left-hand input, args[1] at the single right-hand
 * value and args[2] at the byte output; `len` is the element count. Every
 * output byte is stored as 0 or 1.
 */
static void simd_binary_scalar2_less_equal_u64(char **args, npy_intp len)
{
    npyv_lanetype_u64 *lhs = (npyv_lanetype_u64 *) args[0];
    const npyv_lanetype_u64 rhs_value = *(npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u64 rhs_vec = npyv_setall_u64(rhs_value);
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;   // elements handled per pass
    const int lanes        = npyv_nlanes_u64;  // elements per input vector

    // Each pass compares eight input vectors and packs the eight masks into
    // one u8 vector.
    while (len >= chunk) {
        npyv_u64 x1 = npyv_load_u64(lhs + lanes * 0);
        npyv_b64 m1 = npyv_cmple_u64(x1, rhs_vec);
        npyv_u64 x2 = npyv_load_u64(lhs + lanes * 1);
        npyv_b64 m2 = npyv_cmple_u64(x2, rhs_vec);
        npyv_u64 x3 = npyv_load_u64(lhs + lanes * 2);
        npyv_u64 x4 = npyv_load_u64(lhs + lanes * 3);
        npyv_b64 m3 = npyv_cmple_u64(x3, rhs_vec);
        npyv_b64 m4 = npyv_cmple_u64(x4, rhs_vec);
        npyv_u64 x5 = npyv_load_u64(lhs + lanes * 4);
        npyv_u64 x6 = npyv_load_u64(lhs + lanes * 5);
        npyv_u64 x7 = npyv_load_u64(lhs + lanes * 6);
        npyv_u64 x8 = npyv_load_u64(lhs + lanes * 7);
        npyv_b64 m5 = npyv_cmple_u64(x5, rhs_vec);
        npyv_b64 m6 = npyv_cmple_u64(x6, rhs_vec);
        npyv_b64 m7 = npyv_cmple_u64(x7, rhs_vec);
        npyv_b64 m8 = npyv_cmple_u64(x8, rhs_vec);

        npyv_b8 packed = npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8);
        npyv_u8 bytes  = npyv_cvt_u8_b8(packed);
        // AND with 0x1 so each stored byte is 0 or 1.
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        lhs += chunk;
        out += chunk;
    }

    // Scalar tail for the elements that do not fill a whole pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] <= rhs_value;
    }
}
#endif



#line 28
#line 35
#if NPY_SIMD && !((1 || 0) && 1)
/*
 * Contiguous `array == array` for `npyv_lanetype_s64` elements.
 *
 * args[0] and args[1] point at the two inputs and args[2] at the byte
 * output; `len` is the element count. Every output byte is stored as 0 or 1.
 */
static void simd_binary_equal_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 *lhs = (npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 *rhs = (npyv_lanetype_s64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;   // elements handled per pass
    const int lanes        = npyv_nlanes_s64;  // elements per input vector

    // Each pass compares eight vector pairs and packs the eight masks into
    // one u8 vector.
    while (len >= chunk) {
        npyv_s64 x1 = npyv_load_s64(lhs + lanes * 0);
        npyv_s64 y1 = npyv_load_s64(rhs + lanes * 0);
        npyv_b64 m1 = npyv_cmpeq_s64(x1, y1);
        npyv_s64 x2 = npyv_load_s64(lhs + lanes * 1);
        npyv_s64 y2 = npyv_load_s64(rhs + lanes * 1);
        npyv_b64 m2 = npyv_cmpeq_s64(x2, y2);
        npyv_s64 x3 = npyv_load_s64(lhs + lanes * 2);
        npyv_s64 y3 = npyv_load_s64(rhs + lanes * 2);
        npyv_s64 x4 = npyv_load_s64(lhs + lanes * 3);
        npyv_s64 y4 = npyv_load_s64(rhs + lanes * 3);
        npyv_b64 m3 = npyv_cmpeq_s64(x3, y3);
        npyv_b64 m4 = npyv_cmpeq_s64(x4, y4);
        npyv_s64 x5 = npyv_load_s64(lhs + lanes * 4);
        npyv_s64 y5 = npyv_load_s64(rhs + lanes * 4);
        npyv_s64 x6 = npyv_load_s64(lhs + lanes * 5);
        npyv_s64 y6 = npyv_load_s64(rhs + lanes * 5);
        npyv_s64 x7 = npyv_load_s64(lhs + lanes * 6);
        npyv_s64 y7 = npyv_load_s64(rhs + lanes * 6);
        npyv_s64 x8 = npyv_load_s64(lhs + lanes * 7);
        npyv_s64 y8 = npyv_load_s64(rhs + lanes * 7);
        npyv_b64 m5 = npyv_cmpeq_s64(x5, y5);
        npyv_b64 m6 = npyv_cmpeq_s64(x6, y6);
        npyv_b64 m7 = npyv_cmpeq_s64(x7, y7);
        npyv_b64 m8 = npyv_cmpeq_s64(x8, y8);

        npyv_b8 packed = npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8);
        npyv_u8 bytes  = npyv_cvt_u8_b8(packed);
        // AND with 0x1 so each stored byte is 0 or 1.
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar tail for the elements that do not fill a whole pass.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] == rhs[i];
    }
}

/*
 * Contiguous `scalar == array[i]` for signed 64-bit lanes.
 *
 * args[0] points at the single npyv_lanetype_s64 left operand, args[1] at
 * the npyv_lanetype_s64 array, and args[2] at the npyv_lanetype_u8 output.
 * `len` is the number of elements; each result byte is masked down to 0/1.
 */
static void simd_binary_scalar1_equal_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 value = *(npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 *in   = (npyv_lanetype_s64 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_s64 splat    = npyv_setall_s64(value);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int out_lanes     = npyv_nlanes_u8;
    const int in_lanes      = npyv_nlanes_s64;

    // Vector body: eight s64 vectors are compared and packed into one u8 vector.
    while (len >= out_lanes) {
        npyv_s64 y0 = npyv_load_s64(in + 0 * in_lanes);
        npyv_s64 y1 = npyv_load_s64(in + 1 * in_lanes);
        npyv_s64 y2 = npyv_load_s64(in + 2 * in_lanes);
        npyv_s64 y3 = npyv_load_s64(in + 3 * in_lanes);
        npyv_s64 y4 = npyv_load_s64(in + 4 * in_lanes);
        npyv_s64 y5 = npyv_load_s64(in + 5 * in_lanes);
        npyv_s64 y6 = npyv_load_s64(in + 6 * in_lanes);
        npyv_s64 y7 = npyv_load_s64(in + 7 * in_lanes);

        npyv_b64 m0 = npyv_cmpeq_s64(splat, y0);
        npyv_b64 m1 = npyv_cmpeq_s64(splat, y1);
        npyv_b64 m2 = npyv_cmpeq_s64(splat, y2);
        npyv_b64 m3 = npyv_cmpeq_s64(splat, y3);
        npyv_b64 m4 = npyv_cmpeq_s64(splat, y4);
        npyv_b64 m5 = npyv_cmpeq_s64(splat, y5);
        npyv_b64 m6 = npyv_cmpeq_s64(splat, y6);
        npyv_b64 m7 = npyv_cmpeq_s64(splat, y7);

        npyv_u8 bytes =
            npyv_cvt_u8_b8(npyv_pack_b8_b64(m0, m1, m2, m3, m4, m5, m6, m7));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += out_lanes;
        out += out_lanes;
        len -= out_lanes;
    }

    // Scalar tail for whatever does not fill a complete output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = value == in[i];
    }
}

/*
 * Contiguous `array[i] == scalar` for signed 64-bit lanes.
 *
 * args[0] points at the npyv_lanetype_s64 array, args[1] at the single
 * npyv_lanetype_s64 right operand, and args[2] at the npyv_lanetype_u8
 * output. `len` is the number of elements; each result byte is masked
 * down to 0/1.
 */
static void simd_binary_scalar2_equal_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 *in   = (npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 value = *(npyv_lanetype_s64 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_s64 splat    = npyv_setall_s64(value);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int out_lanes     = npyv_nlanes_u8;
    const int in_lanes      = npyv_nlanes_s64;

    // Vector body: eight s64 vectors are compared and packed into one u8 vector.
    while (len >= out_lanes) {
        npyv_s64 x0 = npyv_load_s64(in + 0 * in_lanes);
        npyv_s64 x1 = npyv_load_s64(in + 1 * in_lanes);
        npyv_s64 x2 = npyv_load_s64(in + 2 * in_lanes);
        npyv_s64 x3 = npyv_load_s64(in + 3 * in_lanes);
        npyv_s64 x4 = npyv_load_s64(in + 4 * in_lanes);
        npyv_s64 x5 = npyv_load_s64(in + 5 * in_lanes);
        npyv_s64 x6 = npyv_load_s64(in + 6 * in_lanes);
        npyv_s64 x7 = npyv_load_s64(in + 7 * in_lanes);

        npyv_b64 m0 = npyv_cmpeq_s64(x0, splat);
        npyv_b64 m1 = npyv_cmpeq_s64(x1, splat);
        npyv_b64 m2 = npyv_cmpeq_s64(x2, splat);
        npyv_b64 m3 = npyv_cmpeq_s64(x3, splat);
        npyv_b64 m4 = npyv_cmpeq_s64(x4, splat);
        npyv_b64 m5 = npyv_cmpeq_s64(x5, splat);
        npyv_b64 m6 = npyv_cmpeq_s64(x6, splat);
        npyv_b64 m7 = npyv_cmpeq_s64(x7, splat);

        npyv_u8 bytes =
            npyv_cvt_u8_b8(npyv_pack_b8_b64(m0, m1, m2, m3, m4, m5, m6, m7));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += out_lanes;
        out += out_lanes;
        len -= out_lanes;
    }

    // Scalar tail for whatever does not fill a complete output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] == value;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 1) && 1)
/*
 * Contiguous `lhs[i] != rhs[i]` for signed 64-bit lanes.
 *
 * args[0] and args[1] are read as npyv_lanetype_s64 arrays and args[2] is
 * written as an npyv_lanetype_u8 array; `len` is the element count. Each
 * result byte is masked down to 0/1.
 *
 * NOTE: the preprocessor guard enclosing this definition,
 * `NPY_SIMD && !((0 || 1) && 1)`, always evaluates to 0, so this function
 * is never compiled in this instantiation.
 */
static void simd_binary_not_equal_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 *lhs = (npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 *rhs = (npyv_lanetype_s64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int out_lanes    = npyv_nlanes_u8;
    const int in_lanes     = npyv_nlanes_s64;

    // Vector body: eight s64 vectors per operand yield one u8 result vector.
    while (len >= out_lanes) {
        npyv_s64 x0 = npyv_load_s64(lhs + 0 * in_lanes);
        npyv_s64 x1 = npyv_load_s64(lhs + 1 * in_lanes);
        npyv_s64 x2 = npyv_load_s64(lhs + 2 * in_lanes);
        npyv_s64 x3 = npyv_load_s64(lhs + 3 * in_lanes);
        npyv_s64 x4 = npyv_load_s64(lhs + 4 * in_lanes);
        npyv_s64 x5 = npyv_load_s64(lhs + 5 * in_lanes);
        npyv_s64 x6 = npyv_load_s64(lhs + 6 * in_lanes);
        npyv_s64 x7 = npyv_load_s64(lhs + 7 * in_lanes);

        npyv_s64 y0 = npyv_load_s64(rhs + 0 * in_lanes);
        npyv_s64 y1 = npyv_load_s64(rhs + 1 * in_lanes);
        npyv_s64 y2 = npyv_load_s64(rhs + 2 * in_lanes);
        npyv_s64 y3 = npyv_load_s64(rhs + 3 * in_lanes);
        npyv_s64 y4 = npyv_load_s64(rhs + 4 * in_lanes);
        npyv_s64 y5 = npyv_load_s64(rhs + 5 * in_lanes);
        npyv_s64 y6 = npyv_load_s64(rhs + 6 * in_lanes);
        npyv_s64 y7 = npyv_load_s64(rhs + 7 * in_lanes);

        npyv_b64 m0 = npyv_cmpneq_s64(x0, y0);
        npyv_b64 m1 = npyv_cmpneq_s64(x1, y1);
        npyv_b64 m2 = npyv_cmpneq_s64(x2, y2);
        npyv_b64 m3 = npyv_cmpneq_s64(x3, y3);
        npyv_b64 m4 = npyv_cmpneq_s64(x4, y4);
        npyv_b64 m5 = npyv_cmpneq_s64(x5, y5);
        npyv_b64 m6 = npyv_cmpneq_s64(x6, y6);
        npyv_b64 m7 = npyv_cmpneq_s64(x7, y7);

        // Narrow the eight 64-bit masks into one byte-wide vector.
        npyv_u8 bytes =
            npyv_cvt_u8_b8(npyv_pack_b8_b64(m0, m1, m2, m3, m4, m5, m6, m7));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        lhs += out_lanes;
        rhs += out_lanes;
        out += out_lanes;
        len -= out_lanes;
    }

    // Scalar tail for whatever does not fill a complete output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] != rhs[i];
    }
}

/*
 * Contiguous `scalar != array[i]` for signed 64-bit lanes.
 *
 * args[0] points at the single npyv_lanetype_s64 left operand, args[1] at
 * the npyv_lanetype_s64 array, and args[2] at the npyv_lanetype_u8 output.
 * `len` is the number of elements; each result byte is masked down to 0/1.
 *
 * NOTE: the preprocessor guard enclosing this definition,
 * `NPY_SIMD && !((0 || 1) && 1)`, always evaluates to 0, so this function
 * is never compiled in this instantiation.
 */
static void simd_binary_scalar1_not_equal_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 value = *(npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 *in   = (npyv_lanetype_s64 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_s64 splat    = npyv_setall_s64(value);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int out_lanes     = npyv_nlanes_u8;
    const int in_lanes      = npyv_nlanes_s64;

    // Vector body: eight s64 vectors are compared and packed into one u8 vector.
    while (len >= out_lanes) {
        npyv_s64 y0 = npyv_load_s64(in + 0 * in_lanes);
        npyv_s64 y1 = npyv_load_s64(in + 1 * in_lanes);
        npyv_s64 y2 = npyv_load_s64(in + 2 * in_lanes);
        npyv_s64 y3 = npyv_load_s64(in + 3 * in_lanes);
        npyv_s64 y4 = npyv_load_s64(in + 4 * in_lanes);
        npyv_s64 y5 = npyv_load_s64(in + 5 * in_lanes);
        npyv_s64 y6 = npyv_load_s64(in + 6 * in_lanes);
        npyv_s64 y7 = npyv_load_s64(in + 7 * in_lanes);

        npyv_b64 m0 = npyv_cmpneq_s64(splat, y0);
        npyv_b64 m1 = npyv_cmpneq_s64(splat, y1);
        npyv_b64 m2 = npyv_cmpneq_s64(splat, y2);
        npyv_b64 m3 = npyv_cmpneq_s64(splat, y3);
        npyv_b64 m4 = npyv_cmpneq_s64(splat, y4);
        npyv_b64 m5 = npyv_cmpneq_s64(splat, y5);
        npyv_b64 m6 = npyv_cmpneq_s64(splat, y6);
        npyv_b64 m7 = npyv_cmpneq_s64(splat, y7);

        npyv_u8 bytes =
            npyv_cvt_u8_b8(npyv_pack_b8_b64(m0, m1, m2, m3, m4, m5, m6, m7));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += out_lanes;
        out += out_lanes;
        len -= out_lanes;
    }

    // Scalar tail for whatever does not fill a complete output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = value != in[i];
    }
}

/*
 * Contiguous `array[i] != scalar` for signed 64-bit lanes.
 *
 * args[0] points at the npyv_lanetype_s64 array, args[1] at the single
 * npyv_lanetype_s64 right operand, and args[2] at the npyv_lanetype_u8
 * output. `len` is the number of elements; each result byte is masked
 * down to 0/1.
 *
 * NOTE: the preprocessor guard enclosing this definition,
 * `NPY_SIMD && !((0 || 1) && 1)`, always evaluates to 0, so this function
 * is never compiled in this instantiation.
 */
static void simd_binary_scalar2_not_equal_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 *in   = (npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 value = *(npyv_lanetype_s64 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_s64 splat    = npyv_setall_s64(value);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int out_lanes     = npyv_nlanes_u8;
    const int in_lanes      = npyv_nlanes_s64;

    // Vector body: eight s64 vectors are compared and packed into one u8 vector.
    while (len >= out_lanes) {
        npyv_s64 x0 = npyv_load_s64(in + 0 * in_lanes);
        npyv_s64 x1 = npyv_load_s64(in + 1 * in_lanes);
        npyv_s64 x2 = npyv_load_s64(in + 2 * in_lanes);
        npyv_s64 x3 = npyv_load_s64(in + 3 * in_lanes);
        npyv_s64 x4 = npyv_load_s64(in + 4 * in_lanes);
        npyv_s64 x5 = npyv_load_s64(in + 5 * in_lanes);
        npyv_s64 x6 = npyv_load_s64(in + 6 * in_lanes);
        npyv_s64 x7 = npyv_load_s64(in + 7 * in_lanes);

        npyv_b64 m0 = npyv_cmpneq_s64(x0, splat);
        npyv_b64 m1 = npyv_cmpneq_s64(x1, splat);
        npyv_b64 m2 = npyv_cmpneq_s64(x2, splat);
        npyv_b64 m3 = npyv_cmpneq_s64(x3, splat);
        npyv_b64 m4 = npyv_cmpneq_s64(x4, splat);
        npyv_b64 m5 = npyv_cmpneq_s64(x5, splat);
        npyv_b64 m6 = npyv_cmpneq_s64(x6, splat);
        npyv_b64 m7 = npyv_cmpneq_s64(x7, splat);

        npyv_u8 bytes =
            npyv_cvt_u8_b8(npyv_pack_b8_b64(m0, m1, m2, m3, m4, m5, m6, m7));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += out_lanes;
        out += out_lanes;
        len -= out_lanes;
    }

    // Scalar tail for whatever does not fill a complete output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] != value;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 0) && 1)
/*
 * Contiguous `lhs[i] < rhs[i]` for signed 64-bit lanes.
 *
 * args[0] and args[1] are read as npyv_lanetype_s64 arrays and args[2] is
 * written as an npyv_lanetype_u8 array; `len` is the element count. Each
 * result byte is masked down to 0/1.
 */
static void simd_binary_less_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 *lhs = (npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 *rhs = (npyv_lanetype_s64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int out_lanes    = npyv_nlanes_u8;
    const int in_lanes     = npyv_nlanes_s64;

    // Vector body: eight s64 vectors per operand yield one u8 result vector.
    while (len >= out_lanes) {
        npyv_s64 x0 = npyv_load_s64(lhs + 0 * in_lanes);
        npyv_s64 x1 = npyv_load_s64(lhs + 1 * in_lanes);
        npyv_s64 x2 = npyv_load_s64(lhs + 2 * in_lanes);
        npyv_s64 x3 = npyv_load_s64(lhs + 3 * in_lanes);
        npyv_s64 x4 = npyv_load_s64(lhs + 4 * in_lanes);
        npyv_s64 x5 = npyv_load_s64(lhs + 5 * in_lanes);
        npyv_s64 x6 = npyv_load_s64(lhs + 6 * in_lanes);
        npyv_s64 x7 = npyv_load_s64(lhs + 7 * in_lanes);

        npyv_s64 y0 = npyv_load_s64(rhs + 0 * in_lanes);
        npyv_s64 y1 = npyv_load_s64(rhs + 1 * in_lanes);
        npyv_s64 y2 = npyv_load_s64(rhs + 2 * in_lanes);
        npyv_s64 y3 = npyv_load_s64(rhs + 3 * in_lanes);
        npyv_s64 y4 = npyv_load_s64(rhs + 4 * in_lanes);
        npyv_s64 y5 = npyv_load_s64(rhs + 5 * in_lanes);
        npyv_s64 y6 = npyv_load_s64(rhs + 6 * in_lanes);
        npyv_s64 y7 = npyv_load_s64(rhs + 7 * in_lanes);

        npyv_b64 m0 = npyv_cmplt_s64(x0, y0);
        npyv_b64 m1 = npyv_cmplt_s64(x1, y1);
        npyv_b64 m2 = npyv_cmplt_s64(x2, y2);
        npyv_b64 m3 = npyv_cmplt_s64(x3, y3);
        npyv_b64 m4 = npyv_cmplt_s64(x4, y4);
        npyv_b64 m5 = npyv_cmplt_s64(x5, y5);
        npyv_b64 m6 = npyv_cmplt_s64(x6, y6);
        npyv_b64 m7 = npyv_cmplt_s64(x7, y7);

        // Narrow the eight 64-bit masks into one byte-wide vector.
        npyv_u8 bytes =
            npyv_cvt_u8_b8(npyv_pack_b8_b64(m0, m1, m2, m3, m4, m5, m6, m7));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        lhs += out_lanes;
        rhs += out_lanes;
        out += out_lanes;
        len -= out_lanes;
    }

    // Scalar tail for whatever does not fill a complete output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] < rhs[i];
    }
}

/*
 * Contiguous `scalar < array[i]` for signed 64-bit lanes.
 *
 * args[0] points at the single npyv_lanetype_s64 left operand, args[1] at
 * the npyv_lanetype_s64 array, and args[2] at the npyv_lanetype_u8 output.
 * `len` is the number of elements; each result byte is masked down to 0/1.
 */
static void simd_binary_scalar1_less_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 value = *(npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 *in   = (npyv_lanetype_s64 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_s64 splat    = npyv_setall_s64(value);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int out_lanes     = npyv_nlanes_u8;
    const int in_lanes      = npyv_nlanes_s64;

    // Vector body: eight s64 vectors are compared and packed into one u8 vector.
    while (len >= out_lanes) {
        npyv_s64 y0 = npyv_load_s64(in + 0 * in_lanes);
        npyv_s64 y1 = npyv_load_s64(in + 1 * in_lanes);
        npyv_s64 y2 = npyv_load_s64(in + 2 * in_lanes);
        npyv_s64 y3 = npyv_load_s64(in + 3 * in_lanes);
        npyv_s64 y4 = npyv_load_s64(in + 4 * in_lanes);
        npyv_s64 y5 = npyv_load_s64(in + 5 * in_lanes);
        npyv_s64 y6 = npyv_load_s64(in + 6 * in_lanes);
        npyv_s64 y7 = npyv_load_s64(in + 7 * in_lanes);

        npyv_b64 m0 = npyv_cmplt_s64(splat, y0);
        npyv_b64 m1 = npyv_cmplt_s64(splat, y1);
        npyv_b64 m2 = npyv_cmplt_s64(splat, y2);
        npyv_b64 m3 = npyv_cmplt_s64(splat, y3);
        npyv_b64 m4 = npyv_cmplt_s64(splat, y4);
        npyv_b64 m5 = npyv_cmplt_s64(splat, y5);
        npyv_b64 m6 = npyv_cmplt_s64(splat, y6);
        npyv_b64 m7 = npyv_cmplt_s64(splat, y7);

        npyv_u8 bytes =
            npyv_cvt_u8_b8(npyv_pack_b8_b64(m0, m1, m2, m3, m4, m5, m6, m7));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += out_lanes;
        out += out_lanes;
        len -= out_lanes;
    }

    // Scalar tail for whatever does not fill a complete output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = value < in[i];
    }
}

/*
 * Contiguous `array[i] < scalar` for signed 64-bit lanes.
 *
 * args[0] points at the npyv_lanetype_s64 array, args[1] at the single
 * npyv_lanetype_s64 right operand, and args[2] at the npyv_lanetype_u8
 * output. `len` is the number of elements; each result byte is masked
 * down to 0/1.
 */
static void simd_binary_scalar2_less_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 *in   = (npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 value = *(npyv_lanetype_s64 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_s64 splat    = npyv_setall_s64(value);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int out_lanes     = npyv_nlanes_u8;
    const int in_lanes      = npyv_nlanes_s64;

    // Vector body: eight s64 vectors are compared and packed into one u8 vector.
    while (len >= out_lanes) {
        npyv_s64 x0 = npyv_load_s64(in + 0 * in_lanes);
        npyv_s64 x1 = npyv_load_s64(in + 1 * in_lanes);
        npyv_s64 x2 = npyv_load_s64(in + 2 * in_lanes);
        npyv_s64 x3 = npyv_load_s64(in + 3 * in_lanes);
        npyv_s64 x4 = npyv_load_s64(in + 4 * in_lanes);
        npyv_s64 x5 = npyv_load_s64(in + 5 * in_lanes);
        npyv_s64 x6 = npyv_load_s64(in + 6 * in_lanes);
        npyv_s64 x7 = npyv_load_s64(in + 7 * in_lanes);

        npyv_b64 m0 = npyv_cmplt_s64(x0, splat);
        npyv_b64 m1 = npyv_cmplt_s64(x1, splat);
        npyv_b64 m2 = npyv_cmplt_s64(x2, splat);
        npyv_b64 m3 = npyv_cmplt_s64(x3, splat);
        npyv_b64 m4 = npyv_cmplt_s64(x4, splat);
        npyv_b64 m5 = npyv_cmplt_s64(x5, splat);
        npyv_b64 m6 = npyv_cmplt_s64(x6, splat);
        npyv_b64 m7 = npyv_cmplt_s64(x7, splat);

        npyv_u8 bytes =
            npyv_cvt_u8_b8(npyv_pack_b8_b64(m0, m1, m2, m3, m4, m5, m6, m7));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += out_lanes;
        out += out_lanes;
        len -= out_lanes;
    }

    // Scalar tail for whatever does not fill a complete output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] < value;
    }
}
#endif


#line 35
#if NPY_SIMD && !((0 || 0) && 1)
/*
 * Contiguous `lhs[i] <= rhs[i]` for signed 64-bit lanes.
 *
 * args[0] and args[1] are read as npyv_lanetype_s64 arrays and args[2] is
 * written as an npyv_lanetype_u8 array; `len` is the element count. Each
 * result byte is masked down to 0/1.
 */
static void simd_binary_less_equal_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 *lhs = (npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 *rhs = (npyv_lanetype_s64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int out_lanes    = npyv_nlanes_u8;
    const int in_lanes     = npyv_nlanes_s64;

    // Vector body: eight s64 vectors per operand yield one u8 result vector.
    while (len >= out_lanes) {
        npyv_s64 x0 = npyv_load_s64(lhs + 0 * in_lanes);
        npyv_s64 x1 = npyv_load_s64(lhs + 1 * in_lanes);
        npyv_s64 x2 = npyv_load_s64(lhs + 2 * in_lanes);
        npyv_s64 x3 = npyv_load_s64(lhs + 3 * in_lanes);
        npyv_s64 x4 = npyv_load_s64(lhs + 4 * in_lanes);
        npyv_s64 x5 = npyv_load_s64(lhs + 5 * in_lanes);
        npyv_s64 x6 = npyv_load_s64(lhs + 6 * in_lanes);
        npyv_s64 x7 = npyv_load_s64(lhs + 7 * in_lanes);

        npyv_s64 y0 = npyv_load_s64(rhs + 0 * in_lanes);
        npyv_s64 y1 = npyv_load_s64(rhs + 1 * in_lanes);
        npyv_s64 y2 = npyv_load_s64(rhs + 2 * in_lanes);
        npyv_s64 y3 = npyv_load_s64(rhs + 3 * in_lanes);
        npyv_s64 y4 = npyv_load_s64(rhs + 4 * in_lanes);
        npyv_s64 y5 = npyv_load_s64(rhs + 5 * in_lanes);
        npyv_s64 y6 = npyv_load_s64(rhs + 6 * in_lanes);
        npyv_s64 y7 = npyv_load_s64(rhs + 7 * in_lanes);

        npyv_b64 m0 = npyv_cmple_s64(x0, y0);
        npyv_b64 m1 = npyv_cmple_s64(x1, y1);
        npyv_b64 m2 = npyv_cmple_s64(x2, y2);
        npyv_b64 m3 = npyv_cmple_s64(x3, y3);
        npyv_b64 m4 = npyv_cmple_s64(x4, y4);
        npyv_b64 m5 = npyv_cmple_s64(x5, y5);
        npyv_b64 m6 = npyv_cmple_s64(x6, y6);
        npyv_b64 m7 = npyv_cmple_s64(x7, y7);

        // Narrow the eight 64-bit masks into one byte-wide vector.
        npyv_u8 bytes =
            npyv_cvt_u8_b8(npyv_pack_b8_b64(m0, m1, m2, m3, m4, m5, m6, m7));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        lhs += out_lanes;
        rhs += out_lanes;
        out += out_lanes;
        len -= out_lanes;
    }

    // Scalar tail for whatever does not fill a complete output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] <= rhs[i];
    }
}

/*
 * Contiguous `scalar <= array[i]` for signed 64-bit lanes.
 *
 * args[0] points at the single npyv_lanetype_s64 left operand, args[1] at
 * the npyv_lanetype_s64 array, and args[2] at the npyv_lanetype_u8 output.
 * `len` is the number of elements; each result byte is masked down to 0/1.
 */
static void simd_binary_scalar1_less_equal_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 value = *(npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 *in   = (npyv_lanetype_s64 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_s64 splat    = npyv_setall_s64(value);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int out_lanes     = npyv_nlanes_u8;
    const int in_lanes      = npyv_nlanes_s64;

    // Vector body: eight s64 vectors are compared and packed into one u8 vector.
    while (len >= out_lanes) {
        npyv_s64 y0 = npyv_load_s64(in + 0 * in_lanes);
        npyv_s64 y1 = npyv_load_s64(in + 1 * in_lanes);
        npyv_s64 y2 = npyv_load_s64(in + 2 * in_lanes);
        npyv_s64 y3 = npyv_load_s64(in + 3 * in_lanes);
        npyv_s64 y4 = npyv_load_s64(in + 4 * in_lanes);
        npyv_s64 y5 = npyv_load_s64(in + 5 * in_lanes);
        npyv_s64 y6 = npyv_load_s64(in + 6 * in_lanes);
        npyv_s64 y7 = npyv_load_s64(in + 7 * in_lanes);

        npyv_b64 m0 = npyv_cmple_s64(splat, y0);
        npyv_b64 m1 = npyv_cmple_s64(splat, y1);
        npyv_b64 m2 = npyv_cmple_s64(splat, y2);
        npyv_b64 m3 = npyv_cmple_s64(splat, y3);
        npyv_b64 m4 = npyv_cmple_s64(splat, y4);
        npyv_b64 m5 = npyv_cmple_s64(splat, y5);
        npyv_b64 m6 = npyv_cmple_s64(splat, y6);
        npyv_b64 m7 = npyv_cmple_s64(splat, y7);

        npyv_u8 bytes =
            npyv_cvt_u8_b8(npyv_pack_b8_b64(m0, m1, m2, m3, m4, m5, m6, m7));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += out_lanes;
        out += out_lanes;
        len -= out_lanes;
    }

    // Scalar tail for whatever does not fill a complete output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = value <= in[i];
    }
}

/*
 * Contiguous `array[i] <= scalar` for signed 64-bit lanes.
 *
 * args[0] points at the npyv_lanetype_s64 array, args[1] at the single
 * npyv_lanetype_s64 right operand, and args[2] at the npyv_lanetype_u8
 * output. `len` is the number of elements; each result byte is masked
 * down to 0/1.
 */
static void simd_binary_scalar2_less_equal_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 *in   = (npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 value = *(npyv_lanetype_s64 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_s64 splat    = npyv_setall_s64(value);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int out_lanes     = npyv_nlanes_u8;
    const int in_lanes      = npyv_nlanes_s64;

    // Vector body: eight s64 vectors are compared and packed into one u8 vector.
    while (len >= out_lanes) {
        npyv_s64 x0 = npyv_load_s64(in + 0 * in_lanes);
        npyv_s64 x1 = npyv_load_s64(in + 1 * in_lanes);
        npyv_s64 x2 = npyv_load_s64(in + 2 * in_lanes);
        npyv_s64 x3 = npyv_load_s64(in + 3 * in_lanes);
        npyv_s64 x4 = npyv_load_s64(in + 4 * in_lanes);
        npyv_s64 x5 = npyv_load_s64(in + 5 * in_lanes);
        npyv_s64 x6 = npyv_load_s64(in + 6 * in_lanes);
        npyv_s64 x7 = npyv_load_s64(in + 7 * in_lanes);

        npyv_b64 m0 = npyv_cmple_s64(x0, splat);
        npyv_b64 m1 = npyv_cmple_s64(x1, splat);
        npyv_b64 m2 = npyv_cmple_s64(x2, splat);
        npyv_b64 m3 = npyv_cmple_s64(x3, splat);
        npyv_b64 m4 = npyv_cmple_s64(x4, splat);
        npyv_b64 m5 = npyv_cmple_s64(x5, splat);
        npyv_b64 m6 = npyv_cmple_s64(x6, splat);
        npyv_b64 m7 = npyv_cmple_s64(x7, splat);

        npyv_u8 bytes =
            npyv_cvt_u8_b8(npyv_pack_b8_b64(m0, m1, m2, m3, m4, m5, m6, m7));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += out_lanes;
        out += out_lanes;
        len -= out_lanes;
    }

    // Scalar tail for whatever does not fill a complete output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] <= value;
    }
}
#endif



#line 28
#line 35
#if NPY_SIMD_F32 && !((1 || 0) && 0)
/*
 * Contiguous `lhs[i] == rhs[i]` for 32-bit float lanes.
 *
 * args[0] and args[1] are read as npyv_lanetype_f32 arrays and args[2] is
 * written as an npyv_lanetype_u8 array; `len` is the element count. Each
 * result byte is masked down to 0/1.
 */
static void simd_binary_equal_f32(char **args, npy_intp len)
{
    npyv_lanetype_f32 *lhs = (npyv_lanetype_f32 *) args[0];
    npyv_lanetype_f32 *rhs = (npyv_lanetype_f32 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int out_lanes    = npyv_nlanes_u8;
    const int in_lanes     = npyv_nlanes_f32;

    // Vector body: four f32 vectors per operand yield one u8 result vector.
    while (len >= out_lanes) {
        npyv_f32 x0 = npyv_load_f32(lhs + 0 * in_lanes);
        npyv_f32 x1 = npyv_load_f32(lhs + 1 * in_lanes);
        npyv_f32 x2 = npyv_load_f32(lhs + 2 * in_lanes);
        npyv_f32 x3 = npyv_load_f32(lhs + 3 * in_lanes);

        npyv_f32 y0 = npyv_load_f32(rhs + 0 * in_lanes);
        npyv_f32 y1 = npyv_load_f32(rhs + 1 * in_lanes);
        npyv_f32 y2 = npyv_load_f32(rhs + 2 * in_lanes);
        npyv_f32 y3 = npyv_load_f32(rhs + 3 * in_lanes);

        npyv_b32 m0 = npyv_cmpeq_f32(x0, y0);
        npyv_b32 m1 = npyv_cmpeq_f32(x1, y1);
        npyv_b32 m2 = npyv_cmpeq_f32(x2, y2);
        npyv_b32 m3 = npyv_cmpeq_f32(x3, y3);

        // Narrow the four 32-bit masks into one byte-wide vector.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        lhs += out_lanes;
        rhs += out_lanes;
        out += out_lanes;
        len -= out_lanes;
    }

    // Scalar tail for whatever does not fill a complete output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] == rhs[i];
    }
}

/*
 * Contiguous `scalar == array[i]` for 32-bit float lanes.
 *
 * args[0] points at the single npyv_lanetype_f32 left operand, args[1] at
 * the npyv_lanetype_f32 array, and args[2] at the npyv_lanetype_u8 output.
 * `len` is the number of elements; each result byte is masked down to 0/1.
 */
static void simd_binary_scalar1_equal_f32(char **args, npy_intp len)
{
    npyv_lanetype_f32 value = *(npyv_lanetype_f32 *) args[0];
    npyv_lanetype_f32 *in   = (npyv_lanetype_f32 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_f32 splat    = npyv_setall_f32(value);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int out_lanes     = npyv_nlanes_u8;
    const int in_lanes      = npyv_nlanes_f32;

    // Vector body: four f32 vectors are compared and packed into one u8 vector.
    while (len >= out_lanes) {
        npyv_f32 y0 = npyv_load_f32(in + 0 * in_lanes);
        npyv_f32 y1 = npyv_load_f32(in + 1 * in_lanes);
        npyv_f32 y2 = npyv_load_f32(in + 2 * in_lanes);
        npyv_f32 y3 = npyv_load_f32(in + 3 * in_lanes);

        npyv_b32 m0 = npyv_cmpeq_f32(splat, y0);
        npyv_b32 m1 = npyv_cmpeq_f32(splat, y1);
        npyv_b32 m2 = npyv_cmpeq_f32(splat, y2);
        npyv_b32 m3 = npyv_cmpeq_f32(splat, y3);

        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += out_lanes;
        out += out_lanes;
        len -= out_lanes;
    }

    // Scalar tail for whatever does not fill a complete output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = value == in[i];
    }
}

/*
 * Contiguous `array[i] == scalar` for 32-bit float lanes.
 *
 * args[0] points at the npyv_lanetype_f32 array, args[1] at the single
 * npyv_lanetype_f32 right operand, and args[2] at the npyv_lanetype_u8
 * output. `len` is the number of elements; each result byte is masked
 * down to 0/1.
 */
static void simd_binary_scalar2_equal_f32(char **args, npy_intp len)
{
    npyv_lanetype_f32 *in   = (npyv_lanetype_f32 *) args[0];
    npyv_lanetype_f32 value = *(npyv_lanetype_f32 *) args[1];
    npyv_lanetype_u8 *out   = (npyv_lanetype_u8 *) args[2];
    const npyv_f32 splat    = npyv_setall_f32(value);
    const npyv_u8 one       = npyv_setall_u8(0x1);
    const int out_lanes     = npyv_nlanes_u8;
    const int in_lanes      = npyv_nlanes_f32;

    // Vector body: four f32 vectors are compared and packed into one u8 vector.
    while (len >= out_lanes) {
        npyv_f32 x0 = npyv_load_f32(in + 0 * in_lanes);
        npyv_f32 x1 = npyv_load_f32(in + 1 * in_lanes);
        npyv_f32 x2 = npyv_load_f32(in + 2 * in_lanes);
        npyv_f32 x3 = npyv_load_f32(in + 3 * in_lanes);

        npyv_b32 m0 = npyv_cmpeq_f32(x0, splat);
        npyv_b32 m1 = npyv_cmpeq_f32(x1, splat);
        npyv_b32 m2 = npyv_cmpeq_f32(x2, splat);
        npyv_b32 m3 = npyv_cmpeq_f32(x3, splat);

        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        in  += out_lanes;
        out += out_lanes;
        len -= out_lanes;
    }

    // Scalar tail for whatever does not fill a complete output vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] == value;
    }
}
#endif


#line 35
#if NPY_SIMD_F32 && !((0 || 1) && 0)
/*
 * Element-wise `!=` over two unit-stride float32 inputs.
 *
 *   args[0], args[1] : the two float32 operands
 *   args[2]          : output, one byte per element (1 = true, 0 = false)
 *   len              : number of elements to compare
 */
static void simd_binary_not_equal_f32(char **args, npy_intp len)
{
    npyv_lanetype_f32 *lhs = (npyv_lanetype_f32 *) args[0];
    npyv_lanetype_f32 *rhs = (npyv_lanetype_f32 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;

    // Every pass emits one full u8 vector; four f32 vectors are loaded
    // from each input to produce it.
    while (len >= chunk) {
        npyv_f32 x1 = npyv_load_f32(lhs + npyv_nlanes_f32 * 0);
        npyv_f32 y1 = npyv_load_f32(rhs + npyv_nlanes_f32 * 0);
        npyv_f32 x2 = npyv_load_f32(lhs + npyv_nlanes_f32 * 1);
        npyv_f32 y2 = npyv_load_f32(rhs + npyv_nlanes_f32 * 1);
        npyv_f32 x3 = npyv_load_f32(lhs + npyv_nlanes_f32 * 2);
        npyv_f32 y3 = npyv_load_f32(rhs + npyv_nlanes_f32 * 2);
        npyv_f32 x4 = npyv_load_f32(lhs + npyv_nlanes_f32 * 3);
        npyv_f32 y4 = npyv_load_f32(rhs + npyv_nlanes_f32 * 3);

        npyv_b32 m1 = npyv_cmpneq_f32(x1, y1);
        npyv_b32 m2 = npyv_cmpneq_f32(x2, y2);
        npyv_b32 m3 = npyv_cmpneq_f32(x3, y3);
        npyv_b32 m4 = npyv_cmpneq_f32(x4, y4);

        // Combine the four 32-bit lane masks into a single byte vector,
        // then keep only the lowest bit so each byte is 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar pass over the elements that do not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] != rhs[i];
    }
}

/*
 * Element-wise `scalar != x[i]` where the first operand is a single float32.
 *
 *   args[0] : points at the float32 scalar (left operand)
 *   args[1] : unit-stride float32 input (right operand)
 *   args[2] : output, one byte per element (1 = true, 0 = false)
 *   len     : number of elements to compare
 */
static void simd_binary_scalar1_not_equal_f32(char **args, npy_intp len)
{
    npyv_lanetype_f32 scalar = *(npyv_lanetype_f32 *) args[0];
    npyv_lanetype_f32 *in    = (npyv_lanetype_f32 *) args[1];
    npyv_lanetype_u8 *out    = (npyv_lanetype_u8 *) args[2];
    const npyv_f32 lhs       = npyv_setall_f32(scalar);
    const npyv_u8 one        = npyv_setall_u8(0x1);
    const int chunk          = npyv_nlanes_u8;

    // Every pass emits one full u8 vector built from four f32 loads.
    while (len >= chunk) {
        npyv_f32 y1 = npyv_load_f32(in + npyv_nlanes_f32 * 0);
        npyv_f32 y2 = npyv_load_f32(in + npyv_nlanes_f32 * 1);
        npyv_f32 y3 = npyv_load_f32(in + npyv_nlanes_f32 * 2);
        npyv_f32 y4 = npyv_load_f32(in + npyv_nlanes_f32 * 3);

        npyv_b32 m1 = npyv_cmpneq_f32(lhs, y1);
        npyv_b32 m2 = npyv_cmpneq_f32(lhs, y2);
        npyv_b32 m3 = npyv_cmpneq_f32(lhs, y3);
        npyv_b32 m4 = npyv_cmpneq_f32(lhs, y4);

        // Combine the four lane masks into bytes and keep only bit 0.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        in  += chunk;
        out += chunk;
    }

    // Scalar pass over the elements that do not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = scalar != in[i];
    }
}

/*
 * Element-wise `x[i] != scalar` where the second operand is a single float32.
 *
 *   args[0] : unit-stride float32 input (left operand)
 *   args[1] : points at the float32 scalar (right operand)
 *   args[2] : output, one byte per element (1 = true, 0 = false)
 *   len     : number of elements to compare
 */
static void simd_binary_scalar2_not_equal_f32(char **args, npy_intp len)
{
    npyv_lanetype_f32 *in    = (npyv_lanetype_f32 *) args[0];
    npyv_lanetype_f32 scalar = *(npyv_lanetype_f32 *) args[1];
    npyv_lanetype_u8 *out    = (npyv_lanetype_u8 *) args[2];
    const npyv_f32 rhs       = npyv_setall_f32(scalar);
    const npyv_u8 one        = npyv_setall_u8(0x1);
    const int chunk          = npyv_nlanes_u8;

    // Every pass emits one full u8 vector built from four f32 loads.
    while (len >= chunk) {
        npyv_f32 x1 = npyv_load_f32(in + npyv_nlanes_f32 * 0);
        npyv_f32 x2 = npyv_load_f32(in + npyv_nlanes_f32 * 1);
        npyv_f32 x3 = npyv_load_f32(in + npyv_nlanes_f32 * 2);
        npyv_f32 x4 = npyv_load_f32(in + npyv_nlanes_f32 * 3);

        npyv_b32 m1 = npyv_cmpneq_f32(x1, rhs);
        npyv_b32 m2 = npyv_cmpneq_f32(x2, rhs);
        npyv_b32 m3 = npyv_cmpneq_f32(x3, rhs);
        npyv_b32 m4 = npyv_cmpneq_f32(x4, rhs);

        // Combine the four lane masks into bytes and keep only bit 0.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        in  += chunk;
        out += chunk;
    }

    // Scalar pass over the elements that do not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] != scalar;
    }
}
#endif


#line 35
#if NPY_SIMD_F32 && !((0 || 0) && 0)
/*
 * Element-wise `<` over two unit-stride float32 inputs.
 *
 *   args[0], args[1] : the two float32 operands (left, right)
 *   args[2]          : output, one byte per element (1 = true, 0 = false)
 *   len              : number of elements to compare
 */
static void simd_binary_less_f32(char **args, npy_intp len)
{
    npyv_lanetype_f32 *lhs = (npyv_lanetype_f32 *) args[0];
    npyv_lanetype_f32 *rhs = (npyv_lanetype_f32 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;

    // Every pass emits one full u8 vector; four f32 vectors are loaded
    // from each input to produce it.
    while (len >= chunk) {
        npyv_f32 x1 = npyv_load_f32(lhs + npyv_nlanes_f32 * 0);
        npyv_f32 y1 = npyv_load_f32(rhs + npyv_nlanes_f32 * 0);
        npyv_f32 x2 = npyv_load_f32(lhs + npyv_nlanes_f32 * 1);
        npyv_f32 y2 = npyv_load_f32(rhs + npyv_nlanes_f32 * 1);
        npyv_f32 x3 = npyv_load_f32(lhs + npyv_nlanes_f32 * 2);
        npyv_f32 y3 = npyv_load_f32(rhs + npyv_nlanes_f32 * 2);
        npyv_f32 x4 = npyv_load_f32(lhs + npyv_nlanes_f32 * 3);
        npyv_f32 y4 = npyv_load_f32(rhs + npyv_nlanes_f32 * 3);

        npyv_b32 m1 = npyv_cmplt_f32(x1, y1);
        npyv_b32 m2 = npyv_cmplt_f32(x2, y2);
        npyv_b32 m3 = npyv_cmplt_f32(x3, y3);
        npyv_b32 m4 = npyv_cmplt_f32(x4, y4);

        // Combine the four 32-bit lane masks into a single byte vector,
        // then keep only the lowest bit so each byte is 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar pass over the elements that do not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] < rhs[i];
    }
}

/*
 * Element-wise `scalar < x[i]` where the first operand is a single float32.
 *
 *   args[0] : points at the float32 scalar (left operand)
 *   args[1] : unit-stride float32 input (right operand)
 *   args[2] : output, one byte per element (1 = true, 0 = false)
 *   len     : number of elements to compare
 */
static void simd_binary_scalar1_less_f32(char **args, npy_intp len)
{
    npyv_lanetype_f32 scalar = *(npyv_lanetype_f32 *) args[0];
    npyv_lanetype_f32 *in    = (npyv_lanetype_f32 *) args[1];
    npyv_lanetype_u8 *out    = (npyv_lanetype_u8 *) args[2];
    const npyv_f32 lhs       = npyv_setall_f32(scalar);
    const npyv_u8 one        = npyv_setall_u8(0x1);
    const int chunk          = npyv_nlanes_u8;

    // Every pass emits one full u8 vector built from four f32 loads.
    while (len >= chunk) {
        npyv_f32 y1 = npyv_load_f32(in + npyv_nlanes_f32 * 0);
        npyv_f32 y2 = npyv_load_f32(in + npyv_nlanes_f32 * 1);
        npyv_f32 y3 = npyv_load_f32(in + npyv_nlanes_f32 * 2);
        npyv_f32 y4 = npyv_load_f32(in + npyv_nlanes_f32 * 3);

        npyv_b32 m1 = npyv_cmplt_f32(lhs, y1);
        npyv_b32 m2 = npyv_cmplt_f32(lhs, y2);
        npyv_b32 m3 = npyv_cmplt_f32(lhs, y3);
        npyv_b32 m4 = npyv_cmplt_f32(lhs, y4);

        // Combine the four lane masks into bytes and keep only bit 0.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        in  += chunk;
        out += chunk;
    }

    // Scalar pass over the elements that do not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = scalar < in[i];
    }
}

/*
 * Element-wise `x[i] < scalar` where the second operand is a single float32.
 *
 *   args[0] : unit-stride float32 input (left operand)
 *   args[1] : points at the float32 scalar (right operand)
 *   args[2] : output, one byte per element (1 = true, 0 = false)
 *   len     : number of elements to compare
 */
static void simd_binary_scalar2_less_f32(char **args, npy_intp len)
{
    npyv_lanetype_f32 *in    = (npyv_lanetype_f32 *) args[0];
    npyv_lanetype_f32 scalar = *(npyv_lanetype_f32 *) args[1];
    npyv_lanetype_u8 *out    = (npyv_lanetype_u8 *) args[2];
    const npyv_f32 rhs       = npyv_setall_f32(scalar);
    const npyv_u8 one        = npyv_setall_u8(0x1);
    const int chunk          = npyv_nlanes_u8;

    // Every pass emits one full u8 vector built from four f32 loads.
    while (len >= chunk) {
        npyv_f32 x1 = npyv_load_f32(in + npyv_nlanes_f32 * 0);
        npyv_f32 x2 = npyv_load_f32(in + npyv_nlanes_f32 * 1);
        npyv_f32 x3 = npyv_load_f32(in + npyv_nlanes_f32 * 2);
        npyv_f32 x4 = npyv_load_f32(in + npyv_nlanes_f32 * 3);

        npyv_b32 m1 = npyv_cmplt_f32(x1, rhs);
        npyv_b32 m2 = npyv_cmplt_f32(x2, rhs);
        npyv_b32 m3 = npyv_cmplt_f32(x3, rhs);
        npyv_b32 m4 = npyv_cmplt_f32(x4, rhs);

        // Combine the four lane masks into bytes and keep only bit 0.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        in  += chunk;
        out += chunk;
    }

    // Scalar pass over the elements that do not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] < scalar;
    }
}
#endif


#line 35
#if NPY_SIMD_F32 && !((0 || 0) && 0)
/*
 * Element-wise `<=` over two unit-stride float32 inputs.
 *
 *   args[0], args[1] : the two float32 operands (left, right)
 *   args[2]          : output, one byte per element (1 = true, 0 = false)
 *   len              : number of elements to compare
 */
static void simd_binary_less_equal_f32(char **args, npy_intp len)
{
    npyv_lanetype_f32 *lhs = (npyv_lanetype_f32 *) args[0];
    npyv_lanetype_f32 *rhs = (npyv_lanetype_f32 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;

    // Every pass emits one full u8 vector; four f32 vectors are loaded
    // from each input to produce it.
    while (len >= chunk) {
        npyv_f32 x1 = npyv_load_f32(lhs + npyv_nlanes_f32 * 0);
        npyv_f32 y1 = npyv_load_f32(rhs + npyv_nlanes_f32 * 0);
        npyv_f32 x2 = npyv_load_f32(lhs + npyv_nlanes_f32 * 1);
        npyv_f32 y2 = npyv_load_f32(rhs + npyv_nlanes_f32 * 1);
        npyv_f32 x3 = npyv_load_f32(lhs + npyv_nlanes_f32 * 2);
        npyv_f32 y3 = npyv_load_f32(rhs + npyv_nlanes_f32 * 2);
        npyv_f32 x4 = npyv_load_f32(lhs + npyv_nlanes_f32 * 3);
        npyv_f32 y4 = npyv_load_f32(rhs + npyv_nlanes_f32 * 3);

        npyv_b32 m1 = npyv_cmple_f32(x1, y1);
        npyv_b32 m2 = npyv_cmple_f32(x2, y2);
        npyv_b32 m3 = npyv_cmple_f32(x3, y3);
        npyv_b32 m4 = npyv_cmple_f32(x4, y4);

        // Combine the four 32-bit lane masks into a single byte vector,
        // then keep only the lowest bit so each byte is 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar pass over the elements that do not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] <= rhs[i];
    }
}

/*
 * Element-wise `scalar <= x[i]` where the first operand is a single float32.
 *
 *   args[0] : points at the float32 scalar (left operand)
 *   args[1] : unit-stride float32 input (right operand)
 *   args[2] : output, one byte per element (1 = true, 0 = false)
 *   len     : number of elements to compare
 */
static void simd_binary_scalar1_less_equal_f32(char **args, npy_intp len)
{
    npyv_lanetype_f32 scalar = *(npyv_lanetype_f32 *) args[0];
    npyv_lanetype_f32 *in    = (npyv_lanetype_f32 *) args[1];
    npyv_lanetype_u8 *out    = (npyv_lanetype_u8 *) args[2];
    const npyv_f32 lhs       = npyv_setall_f32(scalar);
    const npyv_u8 one        = npyv_setall_u8(0x1);
    const int chunk          = npyv_nlanes_u8;

    // Every pass emits one full u8 vector built from four f32 loads.
    while (len >= chunk) {
        npyv_f32 y1 = npyv_load_f32(in + npyv_nlanes_f32 * 0);
        npyv_f32 y2 = npyv_load_f32(in + npyv_nlanes_f32 * 1);
        npyv_f32 y3 = npyv_load_f32(in + npyv_nlanes_f32 * 2);
        npyv_f32 y4 = npyv_load_f32(in + npyv_nlanes_f32 * 3);

        npyv_b32 m1 = npyv_cmple_f32(lhs, y1);
        npyv_b32 m2 = npyv_cmple_f32(lhs, y2);
        npyv_b32 m3 = npyv_cmple_f32(lhs, y3);
        npyv_b32 m4 = npyv_cmple_f32(lhs, y4);

        // Combine the four lane masks into bytes and keep only bit 0.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        in  += chunk;
        out += chunk;
    }

    // Scalar pass over the elements that do not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = scalar <= in[i];
    }
}

/*
 * Element-wise `x[i] <= scalar` where the second operand is a single float32.
 *
 *   args[0] : unit-stride float32 input (left operand)
 *   args[1] : points at the float32 scalar (right operand)
 *   args[2] : output, one byte per element (1 = true, 0 = false)
 *   len     : number of elements to compare
 */
static void simd_binary_scalar2_less_equal_f32(char **args, npy_intp len)
{
    npyv_lanetype_f32 *in    = (npyv_lanetype_f32 *) args[0];
    npyv_lanetype_f32 scalar = *(npyv_lanetype_f32 *) args[1];
    npyv_lanetype_u8 *out    = (npyv_lanetype_u8 *) args[2];
    const npyv_f32 rhs       = npyv_setall_f32(scalar);
    const npyv_u8 one        = npyv_setall_u8(0x1);
    const int chunk          = npyv_nlanes_u8;

    // Every pass emits one full u8 vector built from four f32 loads.
    while (len >= chunk) {
        npyv_f32 x1 = npyv_load_f32(in + npyv_nlanes_f32 * 0);
        npyv_f32 x2 = npyv_load_f32(in + npyv_nlanes_f32 * 1);
        npyv_f32 x3 = npyv_load_f32(in + npyv_nlanes_f32 * 2);
        npyv_f32 x4 = npyv_load_f32(in + npyv_nlanes_f32 * 3);

        npyv_b32 m1 = npyv_cmple_f32(x1, rhs);
        npyv_b32 m2 = npyv_cmple_f32(x2, rhs);
        npyv_b32 m3 = npyv_cmple_f32(x3, rhs);
        npyv_b32 m4 = npyv_cmple_f32(x4, rhs);

        // Combine the four lane masks into bytes and keep only bit 0.
        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        in  += chunk;
        out += chunk;
    }

    // Scalar pass over the elements that do not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] <= scalar;
    }
}
#endif



#line 28
#line 35
#if NPY_SIMD_F64 && !((1 || 0) && 0)
/*
 * Element-wise `==` over two unit-stride float64 inputs.
 *
 *   args[0], args[1] : the two float64 operands
 *   args[2]          : output, one byte per element (1 = true, 0 = false)
 *   len              : number of elements to compare
 */
static void simd_binary_equal_f64(char **args, npy_intp len)
{
    npyv_lanetype_f64 *lhs = (npyv_lanetype_f64 *) args[0];
    npyv_lanetype_f64 *rhs = (npyv_lanetype_f64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;

    // Every pass emits one full u8 vector; eight f64 vectors are loaded
    // from each input to produce it.
    while (len >= chunk) {
        npyv_f64 x1 = npyv_load_f64(lhs + npyv_nlanes_f64 * 0);
        npyv_f64 y1 = npyv_load_f64(rhs + npyv_nlanes_f64 * 0);
        npyv_f64 x2 = npyv_load_f64(lhs + npyv_nlanes_f64 * 1);
        npyv_f64 y2 = npyv_load_f64(rhs + npyv_nlanes_f64 * 1);
        npyv_f64 x3 = npyv_load_f64(lhs + npyv_nlanes_f64 * 2);
        npyv_f64 y3 = npyv_load_f64(rhs + npyv_nlanes_f64 * 2);
        npyv_f64 x4 = npyv_load_f64(lhs + npyv_nlanes_f64 * 3);
        npyv_f64 y4 = npyv_load_f64(rhs + npyv_nlanes_f64 * 3);
        npyv_f64 x5 = npyv_load_f64(lhs + npyv_nlanes_f64 * 4);
        npyv_f64 y5 = npyv_load_f64(rhs + npyv_nlanes_f64 * 4);
        npyv_f64 x6 = npyv_load_f64(lhs + npyv_nlanes_f64 * 5);
        npyv_f64 y6 = npyv_load_f64(rhs + npyv_nlanes_f64 * 5);
        npyv_f64 x7 = npyv_load_f64(lhs + npyv_nlanes_f64 * 6);
        npyv_f64 y7 = npyv_load_f64(rhs + npyv_nlanes_f64 * 6);
        npyv_f64 x8 = npyv_load_f64(lhs + npyv_nlanes_f64 * 7);
        npyv_f64 y8 = npyv_load_f64(rhs + npyv_nlanes_f64 * 7);

        npyv_b64 m1 = npyv_cmpeq_f64(x1, y1);
        npyv_b64 m2 = npyv_cmpeq_f64(x2, y2);
        npyv_b64 m3 = npyv_cmpeq_f64(x3, y3);
        npyv_b64 m4 = npyv_cmpeq_f64(x4, y4);
        npyv_b64 m5 = npyv_cmpeq_f64(x5, y5);
        npyv_b64 m6 = npyv_cmpeq_f64(x6, y6);
        npyv_b64 m7 = npyv_cmpeq_f64(x7, y7);
        npyv_b64 m8 = npyv_cmpeq_f64(x8, y8);

        // Combine the eight 64-bit lane masks into a single byte vector,
        // then keep only the lowest bit so each byte is 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(
            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar pass over the elements that do not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] == rhs[i];
    }
}

/*
 * Element-wise `scalar == x[i]` where the first operand is a single float64.
 *
 *   args[0] : points at the float64 scalar (left operand)
 *   args[1] : unit-stride float64 input (right operand)
 *   args[2] : output, one byte per element (1 = true, 0 = false)
 *   len     : number of elements to compare
 */
static void simd_binary_scalar1_equal_f64(char **args, npy_intp len)
{
    npyv_lanetype_f64 scalar = *(npyv_lanetype_f64 *) args[0];
    npyv_lanetype_f64 *in    = (npyv_lanetype_f64 *) args[1];
    npyv_lanetype_u8 *out    = (npyv_lanetype_u8 *) args[2];
    const npyv_f64 lhs       = npyv_setall_f64(scalar);
    const npyv_u8 one        = npyv_setall_u8(0x1);
    const int chunk          = npyv_nlanes_u8;

    // Every pass emits one full u8 vector built from eight f64 loads.
    while (len >= chunk) {
        npyv_f64 y1 = npyv_load_f64(in + npyv_nlanes_f64 * 0);
        npyv_f64 y2 = npyv_load_f64(in + npyv_nlanes_f64 * 1);
        npyv_f64 y3 = npyv_load_f64(in + npyv_nlanes_f64 * 2);
        npyv_f64 y4 = npyv_load_f64(in + npyv_nlanes_f64 * 3);
        npyv_f64 y5 = npyv_load_f64(in + npyv_nlanes_f64 * 4);
        npyv_f64 y6 = npyv_load_f64(in + npyv_nlanes_f64 * 5);
        npyv_f64 y7 = npyv_load_f64(in + npyv_nlanes_f64 * 6);
        npyv_f64 y8 = npyv_load_f64(in + npyv_nlanes_f64 * 7);

        npyv_b64 m1 = npyv_cmpeq_f64(lhs, y1);
        npyv_b64 m2 = npyv_cmpeq_f64(lhs, y2);
        npyv_b64 m3 = npyv_cmpeq_f64(lhs, y3);
        npyv_b64 m4 = npyv_cmpeq_f64(lhs, y4);
        npyv_b64 m5 = npyv_cmpeq_f64(lhs, y5);
        npyv_b64 m6 = npyv_cmpeq_f64(lhs, y6);
        npyv_b64 m7 = npyv_cmpeq_f64(lhs, y7);
        npyv_b64 m8 = npyv_cmpeq_f64(lhs, y8);

        // Combine the eight lane masks into bytes and keep only bit 0.
        npyv_u8 bytes = npyv_cvt_u8_b8(
            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        in  += chunk;
        out += chunk;
    }

    // Scalar pass over the elements that do not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = scalar == in[i];
    }
}

/*
 * Element-wise `x[i] == scalar` where the second operand is a single float64.
 *
 *   args[0] : unit-stride float64 input (left operand)
 *   args[1] : points at the float64 scalar (right operand)
 *   args[2] : output, one byte per element (1 = true, 0 = false)
 *   len     : number of elements to compare
 */
static void simd_binary_scalar2_equal_f64(char **args, npy_intp len)
{
    npyv_lanetype_f64 *in    = (npyv_lanetype_f64 *) args[0];
    npyv_lanetype_f64 scalar = *(npyv_lanetype_f64 *) args[1];
    npyv_lanetype_u8 *out    = (npyv_lanetype_u8 *) args[2];
    const npyv_f64 rhs       = npyv_setall_f64(scalar);
    const npyv_u8 one        = npyv_setall_u8(0x1);
    const int chunk          = npyv_nlanes_u8;

    // Every pass emits one full u8 vector built from eight f64 loads.
    while (len >= chunk) {
        npyv_f64 x1 = npyv_load_f64(in + npyv_nlanes_f64 * 0);
        npyv_f64 x2 = npyv_load_f64(in + npyv_nlanes_f64 * 1);
        npyv_f64 x3 = npyv_load_f64(in + npyv_nlanes_f64 * 2);
        npyv_f64 x4 = npyv_load_f64(in + npyv_nlanes_f64 * 3);
        npyv_f64 x5 = npyv_load_f64(in + npyv_nlanes_f64 * 4);
        npyv_f64 x6 = npyv_load_f64(in + npyv_nlanes_f64 * 5);
        npyv_f64 x7 = npyv_load_f64(in + npyv_nlanes_f64 * 6);
        npyv_f64 x8 = npyv_load_f64(in + npyv_nlanes_f64 * 7);

        npyv_b64 m1 = npyv_cmpeq_f64(x1, rhs);
        npyv_b64 m2 = npyv_cmpeq_f64(x2, rhs);
        npyv_b64 m3 = npyv_cmpeq_f64(x3, rhs);
        npyv_b64 m4 = npyv_cmpeq_f64(x4, rhs);
        npyv_b64 m5 = npyv_cmpeq_f64(x5, rhs);
        npyv_b64 m6 = npyv_cmpeq_f64(x6, rhs);
        npyv_b64 m7 = npyv_cmpeq_f64(x7, rhs);
        npyv_b64 m8 = npyv_cmpeq_f64(x8, rhs);

        // Combine the eight lane masks into bytes and keep only bit 0.
        npyv_u8 bytes = npyv_cvt_u8_b8(
            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        in  += chunk;
        out += chunk;
    }

    // Scalar pass over the elements that do not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] == scalar;
    }
}
#endif


#line 35
#if NPY_SIMD_F64 && !((0 || 1) && 0)
/*
 * Element-wise `!=` over two unit-stride float64 inputs.
 *
 *   args[0], args[1] : the two float64 operands
 *   args[2]          : output, one byte per element (1 = true, 0 = false)
 *   len              : number of elements to compare
 */
static void simd_binary_not_equal_f64(char **args, npy_intp len)
{
    npyv_lanetype_f64 *lhs = (npyv_lanetype_f64 *) args[0];
    npyv_lanetype_f64 *rhs = (npyv_lanetype_f64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    const int chunk        = npyv_nlanes_u8;

    // Every pass emits one full u8 vector; eight f64 vectors are loaded
    // from each input to produce it.
    while (len >= chunk) {
        npyv_f64 x1 = npyv_load_f64(lhs + npyv_nlanes_f64 * 0);
        npyv_f64 y1 = npyv_load_f64(rhs + npyv_nlanes_f64 * 0);
        npyv_f64 x2 = npyv_load_f64(lhs + npyv_nlanes_f64 * 1);
        npyv_f64 y2 = npyv_load_f64(rhs + npyv_nlanes_f64 * 1);
        npyv_f64 x3 = npyv_load_f64(lhs + npyv_nlanes_f64 * 2);
        npyv_f64 y3 = npyv_load_f64(rhs + npyv_nlanes_f64 * 2);
        npyv_f64 x4 = npyv_load_f64(lhs + npyv_nlanes_f64 * 3);
        npyv_f64 y4 = npyv_load_f64(rhs + npyv_nlanes_f64 * 3);
        npyv_f64 x5 = npyv_load_f64(lhs + npyv_nlanes_f64 * 4);
        npyv_f64 y5 = npyv_load_f64(rhs + npyv_nlanes_f64 * 4);
        npyv_f64 x6 = npyv_load_f64(lhs + npyv_nlanes_f64 * 5);
        npyv_f64 y6 = npyv_load_f64(rhs + npyv_nlanes_f64 * 5);
        npyv_f64 x7 = npyv_load_f64(lhs + npyv_nlanes_f64 * 6);
        npyv_f64 y7 = npyv_load_f64(rhs + npyv_nlanes_f64 * 6);
        npyv_f64 x8 = npyv_load_f64(lhs + npyv_nlanes_f64 * 7);
        npyv_f64 y8 = npyv_load_f64(rhs + npyv_nlanes_f64 * 7);

        npyv_b64 m1 = npyv_cmpneq_f64(x1, y1);
        npyv_b64 m2 = npyv_cmpneq_f64(x2, y2);
        npyv_b64 m3 = npyv_cmpneq_f64(x3, y3);
        npyv_b64 m4 = npyv_cmpneq_f64(x4, y4);
        npyv_b64 m5 = npyv_cmpneq_f64(x5, y5);
        npyv_b64 m6 = npyv_cmpneq_f64(x6, y6);
        npyv_b64 m7 = npyv_cmpneq_f64(x7, y7);
        npyv_b64 m8 = npyv_cmpneq_f64(x8, y8);

        // Combine the eight 64-bit lane masks into a single byte vector,
        // then keep only the lowest bit so each byte is 0 or 1.
        npyv_u8 bytes = npyv_cvt_u8_b8(
            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Scalar pass over the elements that do not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = lhs[i] != rhs[i];
    }
}

/*
 * Element-wise `scalar != x[i]` where the first operand is a single float64.
 *
 *   args[0] : points at the float64 scalar (left operand)
 *   args[1] : unit-stride float64 input (right operand)
 *   args[2] : output, one byte per element (1 = true, 0 = false)
 *   len     : number of elements to compare
 */
static void simd_binary_scalar1_not_equal_f64(char **args, npy_intp len)
{
    npyv_lanetype_f64 scalar = *(npyv_lanetype_f64 *) args[0];
    npyv_lanetype_f64 *in    = (npyv_lanetype_f64 *) args[1];
    npyv_lanetype_u8 *out    = (npyv_lanetype_u8 *) args[2];
    const npyv_f64 lhs       = npyv_setall_f64(scalar);
    const npyv_u8 one        = npyv_setall_u8(0x1);
    const int chunk          = npyv_nlanes_u8;

    // Every pass emits one full u8 vector built from eight f64 loads.
    while (len >= chunk) {
        npyv_f64 y1 = npyv_load_f64(in + npyv_nlanes_f64 * 0);
        npyv_f64 y2 = npyv_load_f64(in + npyv_nlanes_f64 * 1);
        npyv_f64 y3 = npyv_load_f64(in + npyv_nlanes_f64 * 2);
        npyv_f64 y4 = npyv_load_f64(in + npyv_nlanes_f64 * 3);
        npyv_f64 y5 = npyv_load_f64(in + npyv_nlanes_f64 * 4);
        npyv_f64 y6 = npyv_load_f64(in + npyv_nlanes_f64 * 5);
        npyv_f64 y7 = npyv_load_f64(in + npyv_nlanes_f64 * 6);
        npyv_f64 y8 = npyv_load_f64(in + npyv_nlanes_f64 * 7);

        npyv_b64 m1 = npyv_cmpneq_f64(lhs, y1);
        npyv_b64 m2 = npyv_cmpneq_f64(lhs, y2);
        npyv_b64 m3 = npyv_cmpneq_f64(lhs, y3);
        npyv_b64 m4 = npyv_cmpneq_f64(lhs, y4);
        npyv_b64 m5 = npyv_cmpneq_f64(lhs, y5);
        npyv_b64 m6 = npyv_cmpneq_f64(lhs, y6);
        npyv_b64 m7 = npyv_cmpneq_f64(lhs, y7);
        npyv_b64 m8 = npyv_cmpneq_f64(lhs, y8);

        // Combine the eight lane masks into bytes and keep only bit 0.
        npyv_u8 bytes = npyv_cvt_u8_b8(
            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        in  += chunk;
        out += chunk;
    }

    // Scalar pass over the elements that do not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = scalar != in[i];
    }
}

/*
 * Element-wise `x[i] != scalar` where the second operand is a single float64.
 *
 *   args[0] : unit-stride float64 input (left operand)
 *   args[1] : points at the float64 scalar (right operand)
 *   args[2] : output, one byte per element (1 = true, 0 = false)
 *   len     : number of elements to compare
 */
static void simd_binary_scalar2_not_equal_f64(char **args, npy_intp len)
{
    npyv_lanetype_f64 *in    = (npyv_lanetype_f64 *) args[0];
    npyv_lanetype_f64 scalar = *(npyv_lanetype_f64 *) args[1];
    npyv_lanetype_u8 *out    = (npyv_lanetype_u8 *) args[2];
    const npyv_f64 rhs       = npyv_setall_f64(scalar);
    const npyv_u8 one        = npyv_setall_u8(0x1);
    const int chunk          = npyv_nlanes_u8;

    // Every pass emits one full u8 vector built from eight f64 loads.
    while (len >= chunk) {
        npyv_f64 x1 = npyv_load_f64(in + npyv_nlanes_f64 * 0);
        npyv_f64 x2 = npyv_load_f64(in + npyv_nlanes_f64 * 1);
        npyv_f64 x3 = npyv_load_f64(in + npyv_nlanes_f64 * 2);
        npyv_f64 x4 = npyv_load_f64(in + npyv_nlanes_f64 * 3);
        npyv_f64 x5 = npyv_load_f64(in + npyv_nlanes_f64 * 4);
        npyv_f64 x6 = npyv_load_f64(in + npyv_nlanes_f64 * 5);
        npyv_f64 x7 = npyv_load_f64(in + npyv_nlanes_f64 * 6);
        npyv_f64 x8 = npyv_load_f64(in + npyv_nlanes_f64 * 7);

        npyv_b64 m1 = npyv_cmpneq_f64(x1, rhs);
        npyv_b64 m2 = npyv_cmpneq_f64(x2, rhs);
        npyv_b64 m3 = npyv_cmpneq_f64(x3, rhs);
        npyv_b64 m4 = npyv_cmpneq_f64(x4, rhs);
        npyv_b64 m5 = npyv_cmpneq_f64(x5, rhs);
        npyv_b64 m6 = npyv_cmpneq_f64(x6, rhs);
        npyv_b64 m7 = npyv_cmpneq_f64(x7, rhs);
        npyv_b64 m8 = npyv_cmpneq_f64(x8, rhs);

        // Combine the eight lane masks into bytes and keep only bit 0.
        npyv_u8 bytes = npyv_cvt_u8_b8(
            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        in  += chunk;
        out += chunk;
    }

    // Scalar pass over the elements that do not fill a whole vector.
    for (npy_intp i = 0; i < len; ++i) {
        out[i] = in[i] != scalar;
    }
}
#endif


#line 35
#if NPY_SIMD_F64 && !((0 || 0) && 0)
/*
 * Computes `src1[i] < src2[i]` over `len` contiguous float64 elements.
 *
 * args[0] and args[1] are read as the two float64 input arrays and args[2]
 * as the byte output array.  Each output byte is stored as 0 or 1.
 */
static void simd_binary_less_f64(char **args, npy_intp len)
{
    npyv_lanetype_f64 *lhs = (npyv_lanetype_f64 *) args[0];
    npyv_lanetype_f64 *rhs = (npyv_lanetype_f64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    // Each pass fills one u8 vector of results, so it advances by the u8 lane
    // count.  NOTE(review): assumes npyv_nlanes_u8 == 8 * npyv_nlanes_f64.
    const int chunk        = npyv_nlanes_u8;

    while (len >= chunk) {
        const npyv_b64 m1 = npyv_cmplt_f64(npyv_load_f64(lhs + npyv_nlanes_f64 * 0),
                                           npyv_load_f64(rhs + npyv_nlanes_f64 * 0));
        const npyv_b64 m2 = npyv_cmplt_f64(npyv_load_f64(lhs + npyv_nlanes_f64 * 1),
                                           npyv_load_f64(rhs + npyv_nlanes_f64 * 1));
        const npyv_b64 m3 = npyv_cmplt_f64(npyv_load_f64(lhs + npyv_nlanes_f64 * 2),
                                           npyv_load_f64(rhs + npyv_nlanes_f64 * 2));
        const npyv_b64 m4 = npyv_cmplt_f64(npyv_load_f64(lhs + npyv_nlanes_f64 * 3),
                                           npyv_load_f64(rhs + npyv_nlanes_f64 * 3));
        const npyv_b64 m5 = npyv_cmplt_f64(npyv_load_f64(lhs + npyv_nlanes_f64 * 4),
                                           npyv_load_f64(rhs + npyv_nlanes_f64 * 4));
        const npyv_b64 m6 = npyv_cmplt_f64(npyv_load_f64(lhs + npyv_nlanes_f64 * 5),
                                           npyv_load_f64(rhs + npyv_nlanes_f64 * 5));
        const npyv_b64 m7 = npyv_cmplt_f64(npyv_load_f64(lhs + npyv_nlanes_f64 * 6),
                                           npyv_load_f64(rhs + npyv_nlanes_f64 * 6));
        const npyv_b64 m8 = npyv_cmplt_f64(npyv_load_f64(lhs + npyv_nlanes_f64 * 7),
                                           npyv_load_f64(rhs + npyv_nlanes_f64 * 7));

        // Pack the eight 64-bit masks into one byte mask, then keep only the
        // lowest bit of every lane so the stored bytes are 0 or 1.
        const npyv_u8 bytes =
            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Leftover elements that do not fill a whole vector.
    while (len > 0) {
        const npyv_lanetype_f64 x = *lhs;
        const npyv_lanetype_f64 y = *rhs;
        *out = x < y;
        --len;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/*
 * Computes `scalar < src[i]` over `len` contiguous float64 elements.
 *
 * args[0] is read as a pointer to the single float64 left-hand operand,
 * args[1] as the float64 array, and args[2] as the byte output array.  Each
 * output byte is stored as 0 or 1.
 */
static void simd_binary_scalar1_less_f64(char **args, npy_intp len)
{
    const npyv_lanetype_f64 lhs = *(npyv_lanetype_f64 *) args[0];
    npyv_lanetype_f64 *in       = (npyv_lanetype_f64 *) args[1];
    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
    const npyv_f64 vlhs         = npyv_setall_f64(lhs);
    const npyv_u8 one           = npyv_setall_u8(0x1);
    // Each pass fills one u8 vector of results, so it advances by the u8 lane
    // count.  NOTE(review): assumes npyv_nlanes_u8 == 8 * npyv_nlanes_f64.
    const int chunk             = npyv_nlanes_u8;

    while (len >= chunk) {
        const npyv_b64 m1 = npyv_cmplt_f64(vlhs, npyv_load_f64(in + npyv_nlanes_f64 * 0));
        const npyv_b64 m2 = npyv_cmplt_f64(vlhs, npyv_load_f64(in + npyv_nlanes_f64 * 1));
        const npyv_b64 m3 = npyv_cmplt_f64(vlhs, npyv_load_f64(in + npyv_nlanes_f64 * 2));
        const npyv_b64 m4 = npyv_cmplt_f64(vlhs, npyv_load_f64(in + npyv_nlanes_f64 * 3));
        const npyv_b64 m5 = npyv_cmplt_f64(vlhs, npyv_load_f64(in + npyv_nlanes_f64 * 4));
        const npyv_b64 m6 = npyv_cmplt_f64(vlhs, npyv_load_f64(in + npyv_nlanes_f64 * 5));
        const npyv_b64 m7 = npyv_cmplt_f64(vlhs, npyv_load_f64(in + npyv_nlanes_f64 * 6));
        const npyv_b64 m8 = npyv_cmplt_f64(vlhs, npyv_load_f64(in + npyv_nlanes_f64 * 7));

        // Pack the eight 64-bit masks into one byte mask, then keep only the
        // lowest bit of every lane so the stored bytes are 0 or 1.
        const npyv_u8 bytes =
            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        in  += chunk;
        out += chunk;
    }

    // Leftover elements that do not fill a whole vector.
    while (len > 0) {
        const npyv_lanetype_f64 rhs = *in;
        *out = lhs < rhs;
        --len;
        ++in;
        ++out;
    }
}

/*
 * Computes `src[i] < scalar` over `len` contiguous float64 elements.
 *
 * args[0] is read as the float64 array, args[1] as a pointer to the single
 * float64 right-hand operand, and args[2] as the byte output array.  Each
 * output byte is stored as 0 or 1.
 */
static void simd_binary_scalar2_less_f64(char **args, npy_intp len)
{
    npyv_lanetype_f64 *in       = (npyv_lanetype_f64 *) args[0];
    const npyv_lanetype_f64 rhs = *(npyv_lanetype_f64 *) args[1];
    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
    const npyv_f64 vrhs         = npyv_setall_f64(rhs);
    const npyv_u8 one           = npyv_setall_u8(0x1);
    // Each pass fills one u8 vector of results, so it advances by the u8 lane
    // count.  NOTE(review): assumes npyv_nlanes_u8 == 8 * npyv_nlanes_f64.
    const int chunk             = npyv_nlanes_u8;

    while (len >= chunk) {
        const npyv_b64 m1 = npyv_cmplt_f64(npyv_load_f64(in + npyv_nlanes_f64 * 0), vrhs);
        const npyv_b64 m2 = npyv_cmplt_f64(npyv_load_f64(in + npyv_nlanes_f64 * 1), vrhs);
        const npyv_b64 m3 = npyv_cmplt_f64(npyv_load_f64(in + npyv_nlanes_f64 * 2), vrhs);
        const npyv_b64 m4 = npyv_cmplt_f64(npyv_load_f64(in + npyv_nlanes_f64 * 3), vrhs);
        const npyv_b64 m5 = npyv_cmplt_f64(npyv_load_f64(in + npyv_nlanes_f64 * 4), vrhs);
        const npyv_b64 m6 = npyv_cmplt_f64(npyv_load_f64(in + npyv_nlanes_f64 * 5), vrhs);
        const npyv_b64 m7 = npyv_cmplt_f64(npyv_load_f64(in + npyv_nlanes_f64 * 6), vrhs);
        const npyv_b64 m8 = npyv_cmplt_f64(npyv_load_f64(in + npyv_nlanes_f64 * 7), vrhs);

        // Pack the eight 64-bit masks into one byte mask, then keep only the
        // lowest bit of every lane so the stored bytes are 0 or 1.
        const npyv_u8 bytes =
            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        in  += chunk;
        out += chunk;
    }

    // Leftover elements that do not fill a whole vector.
    while (len > 0) {
        const npyv_lanetype_f64 lhs = *in;
        *out = lhs < rhs;
        --len;
        ++in;
        ++out;
    }
}
#endif


#line 35
#if NPY_SIMD_F64 && !((0 || 0) && 0)
/*
 * Computes `src1[i] <= src2[i]` over `len` contiguous float64 elements.
 *
 * args[0] and args[1] are read as the two float64 input arrays and args[2]
 * as the byte output array.  Each output byte is stored as 0 or 1.
 */
static void simd_binary_less_equal_f64(char **args, npy_intp len)
{
    npyv_lanetype_f64 *lhs = (npyv_lanetype_f64 *) args[0];
    npyv_lanetype_f64 *rhs = (npyv_lanetype_f64 *) args[1];
    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 one      = npyv_setall_u8(0x1);
    // Each pass fills one u8 vector of results, so it advances by the u8 lane
    // count.  NOTE(review): assumes npyv_nlanes_u8 == 8 * npyv_nlanes_f64.
    const int chunk        = npyv_nlanes_u8;

    while (len >= chunk) {
        const npyv_b64 m1 = npyv_cmple_f64(npyv_load_f64(lhs + npyv_nlanes_f64 * 0),
                                           npyv_load_f64(rhs + npyv_nlanes_f64 * 0));
        const npyv_b64 m2 = npyv_cmple_f64(npyv_load_f64(lhs + npyv_nlanes_f64 * 1),
                                           npyv_load_f64(rhs + npyv_nlanes_f64 * 1));
        const npyv_b64 m3 = npyv_cmple_f64(npyv_load_f64(lhs + npyv_nlanes_f64 * 2),
                                           npyv_load_f64(rhs + npyv_nlanes_f64 * 2));
        const npyv_b64 m4 = npyv_cmple_f64(npyv_load_f64(lhs + npyv_nlanes_f64 * 3),
                                           npyv_load_f64(rhs + npyv_nlanes_f64 * 3));
        const npyv_b64 m5 = npyv_cmple_f64(npyv_load_f64(lhs + npyv_nlanes_f64 * 4),
                                           npyv_load_f64(rhs + npyv_nlanes_f64 * 4));
        const npyv_b64 m6 = npyv_cmple_f64(npyv_load_f64(lhs + npyv_nlanes_f64 * 5),
                                           npyv_load_f64(rhs + npyv_nlanes_f64 * 5));
        const npyv_b64 m7 = npyv_cmple_f64(npyv_load_f64(lhs + npyv_nlanes_f64 * 6),
                                           npyv_load_f64(rhs + npyv_nlanes_f64 * 6));
        const npyv_b64 m8 = npyv_cmple_f64(npyv_load_f64(lhs + npyv_nlanes_f64 * 7),
                                           npyv_load_f64(rhs + npyv_nlanes_f64 * 7));

        // Pack the eight 64-bit masks into one byte mask, then keep only the
        // lowest bit of every lane so the stored bytes are 0 or 1.
        const npyv_u8 bytes =
            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Leftover elements that do not fill a whole vector.
    while (len > 0) {
        const npyv_lanetype_f64 x = *lhs;
        const npyv_lanetype_f64 y = *rhs;
        *out = x <= y;
        --len;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/*
 * Computes `scalar <= src[i]` over `len` contiguous float64 elements.
 *
 * args[0] is read as a pointer to the single float64 left-hand operand,
 * args[1] as the float64 array, and args[2] as the byte output array.  Each
 * output byte is stored as 0 or 1.
 */
static void simd_binary_scalar1_less_equal_f64(char **args, npy_intp len)
{
    const npyv_lanetype_f64 lhs = *(npyv_lanetype_f64 *) args[0];
    npyv_lanetype_f64 *in       = (npyv_lanetype_f64 *) args[1];
    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
    const npyv_f64 vlhs         = npyv_setall_f64(lhs);
    const npyv_u8 one           = npyv_setall_u8(0x1);
    // Each pass fills one u8 vector of results, so it advances by the u8 lane
    // count.  NOTE(review): assumes npyv_nlanes_u8 == 8 * npyv_nlanes_f64.
    const int chunk             = npyv_nlanes_u8;

    while (len >= chunk) {
        const npyv_b64 m1 = npyv_cmple_f64(vlhs, npyv_load_f64(in + npyv_nlanes_f64 * 0));
        const npyv_b64 m2 = npyv_cmple_f64(vlhs, npyv_load_f64(in + npyv_nlanes_f64 * 1));
        const npyv_b64 m3 = npyv_cmple_f64(vlhs, npyv_load_f64(in + npyv_nlanes_f64 * 2));
        const npyv_b64 m4 = npyv_cmple_f64(vlhs, npyv_load_f64(in + npyv_nlanes_f64 * 3));
        const npyv_b64 m5 = npyv_cmple_f64(vlhs, npyv_load_f64(in + npyv_nlanes_f64 * 4));
        const npyv_b64 m6 = npyv_cmple_f64(vlhs, npyv_load_f64(in + npyv_nlanes_f64 * 5));
        const npyv_b64 m7 = npyv_cmple_f64(vlhs, npyv_load_f64(in + npyv_nlanes_f64 * 6));
        const npyv_b64 m8 = npyv_cmple_f64(vlhs, npyv_load_f64(in + npyv_nlanes_f64 * 7));

        // Pack the eight 64-bit masks into one byte mask, then keep only the
        // lowest bit of every lane so the stored bytes are 0 or 1.
        const npyv_u8 bytes =
            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        in  += chunk;
        out += chunk;
    }

    // Leftover elements that do not fill a whole vector.
    while (len > 0) {
        const npyv_lanetype_f64 rhs = *in;
        *out = lhs <= rhs;
        --len;
        ++in;
        ++out;
    }
}

/*
 * Computes `src[i] <= scalar` over `len` contiguous float64 elements.
 *
 * args[0] is read as the float64 array, args[1] as a pointer to the single
 * float64 right-hand operand, and args[2] as the byte output array.  Each
 * output byte is stored as 0 or 1.
 */
static void simd_binary_scalar2_less_equal_f64(char **args, npy_intp len)
{
    npyv_lanetype_f64 *in       = (npyv_lanetype_f64 *) args[0];
    const npyv_lanetype_f64 rhs = *(npyv_lanetype_f64 *) args[1];
    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
    const npyv_f64 vrhs         = npyv_setall_f64(rhs);
    const npyv_u8 one           = npyv_setall_u8(0x1);
    // Each pass fills one u8 vector of results, so it advances by the u8 lane
    // count.  NOTE(review): assumes npyv_nlanes_u8 == 8 * npyv_nlanes_f64.
    const int chunk             = npyv_nlanes_u8;

    while (len >= chunk) {
        const npyv_b64 m1 = npyv_cmple_f64(npyv_load_f64(in + npyv_nlanes_f64 * 0), vrhs);
        const npyv_b64 m2 = npyv_cmple_f64(npyv_load_f64(in + npyv_nlanes_f64 * 1), vrhs);
        const npyv_b64 m3 = npyv_cmple_f64(npyv_load_f64(in + npyv_nlanes_f64 * 2), vrhs);
        const npyv_b64 m4 = npyv_cmple_f64(npyv_load_f64(in + npyv_nlanes_f64 * 3), vrhs);
        const npyv_b64 m5 = npyv_cmple_f64(npyv_load_f64(in + npyv_nlanes_f64 * 4), vrhs);
        const npyv_b64 m6 = npyv_cmple_f64(npyv_load_f64(in + npyv_nlanes_f64 * 5), vrhs);
        const npyv_b64 m7 = npyv_cmple_f64(npyv_load_f64(in + npyv_nlanes_f64 * 6), vrhs);
        const npyv_b64 m8 = npyv_cmple_f64(npyv_load_f64(in + npyv_nlanes_f64 * 7), vrhs);

        // Pack the eight 64-bit masks into one byte mask, then keep only the
        // lowest bit of every lane so the stored bytes are 0 or 1.
        const npyv_u8 bytes =
            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
        npyv_store_u8(out, npyv_and_u8(bytes, one));

        len -= chunk;
        in  += chunk;
        out += chunk;
    }

    // Leftover elements that do not fill a whole vector.
    while (len > 0) {
        const npyv_lanetype_f64 lhs = *in;
        *out = lhs <= rhs;
        --len;
        ++in;
        ++out;
    }
}
#endif




#line 220

#if NPY_SIMD
/*
 * Boolean `src1[i] == src2[i]` over `len` contiguous bytes.
 *
 * args[0] and args[1] are read as the byte input arrays and args[2] as the
 * byte output array.  Any non-zero input byte is treated as true; each
 * output byte is stored as 0 or 1.
 */
static void simd_binary_equal_b8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *lhs = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *rhs = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 zero    = npyv_setall_u8(0x0);
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const int chunk       = npyv_nlanes_u8;

    while (len >= chunk) {
        // Comparing with zero flags the lanes that hold *false*, so both
        // masks are the logical negation of their inputs.  Equality is
        // unaffected by negating both sides.
        const npyv_b8 lhs_false = npyv_cmpeq_u8(npyv_load_u8(lhs), zero);
        const npyv_b8 rhs_false = npyv_cmpeq_u8(npyv_load_u8(rhs), zero);
        const npyv_b8 result    = npyv_xnor_b8(lhs_false, rhs_false);
        // Keep only the lowest bit of every mask lane: stored bytes are 0/1.
        npyv_store_u8(out, npyv_and_u8(npyv_cvt_u8_b8(result), one));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Leftover elements that do not fill a whole vector.
    while (len > 0) {
        const npyv_lanetype_u8 x = (*lhs != 0);
        const npyv_lanetype_u8 y = (*rhs != 0);
        *out = (x == y);
        --len;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/*
 * Boolean `scalar == src[i]` over `len` contiguous bytes.
 *
 * args[0] is read as a pointer to the single left-hand boolean, args[1] as
 * the byte input array and args[2] as the byte output array.  Any non-zero
 * byte (scalar or element) is treated as true; each output byte is stored
 * as 0 or 1.
 */
static void simd_binary_scalar1_equal_b8(char **args, npy_intp len)
{
    // Collapse the scalar to 0/1 once.  The vector body only asks "is it
    // zero?" and the tail normalises every element, so comparing a raw byte
    // such as 2 in the tail would disagree with the vectorised prefix.
    const npyv_lanetype_u8 scalar = (*(npyv_lanetype_u8 *) args[0] != 0);
    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *dst   = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vzero     = npyv_setall_u8(0x0);
    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
    // Mask lanes are set where the operand is false (== 0).
    const npyv_b8 a         = npyv_cmpeq_u8(vscalar, vzero);
    const npyv_u8 truemask  = npyv_setall_u8(0x1);
    const int vstep         = npyv_nlanes_u8;

    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
        npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
        // Equality is unaffected by both masks being negated.
        npyv_b8 c = npyv_xnor_b8(a, b);
        // Keep only the lowest bit of every mask lane: stored bytes are 0/1.
        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
    }

    // Leftover elements that do not fill a whole vector.
    for (; len > 0; --len, ++src, ++dst) {
        const npyv_lanetype_u8 b = *src != 0;
        *dst = scalar == b;
    }
}

/*
 * Boolean `src[i] == scalar` over `len` contiguous bytes.
 *
 * args[0] is read as the byte input array, args[1] as a pointer to the
 * single right-hand boolean and args[2] as the byte output array.  Any
 * non-zero byte (scalar or element) is treated as true; each output byte is
 * stored as 0 or 1.
 */
static void simd_binary_scalar2_equal_b8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[0];
    // Collapse the scalar to 0/1 once.  The vector body only asks "is it
    // zero?" and the tail normalises every element, so comparing a raw byte
    // such as 2 in the tail would disagree with the vectorised prefix.
    const npyv_lanetype_u8 scalar = (*(npyv_lanetype_u8 *) args[1] != 0);
    npyv_lanetype_u8 *dst   = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vzero     = npyv_setall_u8(0x0);
    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
    // Mask lanes are set where the operand is false (== 0).
    const npyv_b8 b         = npyv_cmpeq_u8(vscalar, vzero);
    const npyv_u8 truemask  = npyv_setall_u8(0x1);
    const int vstep         = npyv_nlanes_u8;

    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
        npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
        // Equality is unaffected by both masks being negated.
        npyv_b8 c = npyv_xnor_b8(a, b);
        // Keep only the lowest bit of every mask lane: stored bytes are 0/1.
        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
    }

    // Leftover elements that do not fill a whole vector.
    for (; len > 0; --len, ++src, ++dst) {
        const npyv_lanetype_u8 a = *src != 0;
        *dst = a == scalar;
    }
}
#endif

#line 220

#if NPY_SIMD
/*
 * Boolean `src1[i] != src2[i]` over `len` contiguous bytes.
 *
 * args[0] and args[1] are read as the byte input arrays and args[2] as the
 * byte output array.  Any non-zero input byte is treated as true; each
 * output byte is stored as 0 or 1.
 */
static void simd_binary_not_equal_b8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *lhs = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *rhs = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 zero    = npyv_setall_u8(0x0);
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const int chunk       = npyv_nlanes_u8;

    while (len >= chunk) {
        // Comparing with zero flags the lanes that hold *false*, so both
        // masks are the logical negation of their inputs.  Inequality is
        // unaffected by negating both sides.
        const npyv_b8 lhs_false = npyv_cmpeq_u8(npyv_load_u8(lhs), zero);
        const npyv_b8 rhs_false = npyv_cmpeq_u8(npyv_load_u8(rhs), zero);
        const npyv_b8 result    = npyv_xor_b8(lhs_false, rhs_false);
        // Keep only the lowest bit of every mask lane: stored bytes are 0/1.
        npyv_store_u8(out, npyv_and_u8(npyv_cvt_u8_b8(result), one));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Leftover elements that do not fill a whole vector.
    while (len > 0) {
        const npyv_lanetype_u8 x = (*lhs != 0);
        const npyv_lanetype_u8 y = (*rhs != 0);
        *out = (x != y);
        --len;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/*
 * Boolean `scalar != src[i]` over `len` contiguous bytes.
 *
 * args[0] is read as a pointer to the single left-hand boolean, args[1] as
 * the byte input array and args[2] as the byte output array.  Any non-zero
 * byte (scalar or element) is treated as true; each output byte is stored
 * as 0 or 1.
 */
static void simd_binary_scalar1_not_equal_b8(char **args, npy_intp len)
{
    // Collapse the scalar to 0/1 once.  The vector body only asks "is it
    // zero?" and the tail normalises every element, so comparing a raw byte
    // such as 2 in the tail would disagree with the vectorised prefix.
    const npyv_lanetype_u8 scalar = (*(npyv_lanetype_u8 *) args[0] != 0);
    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *dst   = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vzero     = npyv_setall_u8(0x0);
    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
    // Mask lanes are set where the operand is false (== 0).
    const npyv_b8 a         = npyv_cmpeq_u8(vscalar, vzero);
    const npyv_u8 truemask  = npyv_setall_u8(0x1);
    const int vstep         = npyv_nlanes_u8;

    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
        npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
        // Inequality is unaffected by both masks being negated.
        npyv_b8 c = npyv_xor_b8(a, b);
        // Keep only the lowest bit of every mask lane: stored bytes are 0/1.
        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
    }

    // Leftover elements that do not fill a whole vector.
    for (; len > 0; --len, ++src, ++dst) {
        const npyv_lanetype_u8 b = *src != 0;
        *dst = scalar != b;
    }
}

/*
 * Boolean `src[i] != scalar` over `len` contiguous bytes.
 *
 * args[0] is read as the byte input array, args[1] as a pointer to the
 * single right-hand boolean and args[2] as the byte output array.  Any
 * non-zero byte (scalar or element) is treated as true; each output byte is
 * stored as 0 or 1.
 */
static void simd_binary_scalar2_not_equal_b8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[0];
    // Collapse the scalar to 0/1 once.  The vector body only asks "is it
    // zero?" and the tail normalises every element, so comparing a raw byte
    // such as 2 in the tail would disagree with the vectorised prefix.
    const npyv_lanetype_u8 scalar = (*(npyv_lanetype_u8 *) args[1] != 0);
    npyv_lanetype_u8 *dst   = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vzero     = npyv_setall_u8(0x0);
    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
    // Mask lanes are set where the operand is false (== 0).
    const npyv_b8 b         = npyv_cmpeq_u8(vscalar, vzero);
    const npyv_u8 truemask  = npyv_setall_u8(0x1);
    const int vstep         = npyv_nlanes_u8;

    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
        npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
        // Inequality is unaffected by both masks being negated.
        npyv_b8 c = npyv_xor_b8(a, b);
        // Keep only the lowest bit of every mask lane: stored bytes are 0/1.
        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
    }

    // Leftover elements that do not fill a whole vector.
    for (; len > 0; --len, ++src, ++dst) {
        const npyv_lanetype_u8 a = *src != 0;
        *dst = a != scalar;
    }
}
#endif

#line 220

#if NPY_SIMD
/*
 * Boolean `src1[i] < src2[i]` over `len` contiguous bytes.
 *
 * args[0] and args[1] are read as the byte input arrays and args[2] as the
 * byte output array.  Any non-zero input byte is treated as true; each
 * output byte is stored as 0 or 1.
 */
static void simd_binary_less_b8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *lhs = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *rhs = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 zero    = npyv_setall_u8(0x0);
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const int chunk       = npyv_nlanes_u8;

    while (len >= chunk) {
        // Comparing with zero flags the lanes that hold *false*.
        const npyv_b8 lhs_false = npyv_cmpeq_u8(npyv_load_u8(lhs), zero);
        const npyv_b8 rhs_false = npyv_cmpeq_u8(npyv_load_u8(rhs), zero);
        // false < true: left operand is false and right operand is not.
        // NOTE(review): relies on npyv_andc_b8(x, y) meaning x & ~y - confirm.
        const npyv_b8 result    = npyv_andc_b8(lhs_false, rhs_false);
        // Keep only the lowest bit of every mask lane: stored bytes are 0/1.
        npyv_store_u8(out, npyv_and_u8(npyv_cvt_u8_b8(result), one));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Leftover elements that do not fill a whole vector.
    while (len > 0) {
        const npyv_lanetype_u8 x = (*lhs != 0);
        const npyv_lanetype_u8 y = (*rhs != 0);
        *out = (x < y);
        --len;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/*
 * Boolean `scalar < src[i]` over `len` contiguous bytes.
 *
 * args[0] is read as a pointer to the single left-hand boolean, args[1] as
 * the byte input array and args[2] as the byte output array.  Each output
 * byte is stored as 0 or 1.
 */
static void simd_binary_scalar1_less_b8(char **args, npy_intp len)
{
    // Reducing the scalar to 0/1 leaves every result unchanged here: against
    // an element that is 0 or 1, only a zero scalar can compare less.
    const npyv_lanetype_u8 lhs = (*(npyv_lanetype_u8 *) args[0] != 0);
    npyv_lanetype_u8 *in       = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out      = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 zero         = npyv_setall_u8(0x0);
    const npyv_u8 one          = npyv_setall_u8(0x1);
    // Mask lanes are set where the operand is false (== 0).
    const npyv_b8 lhs_false    = npyv_cmpeq_u8(npyv_setall_u8(lhs), zero);
    const int chunk            = npyv_nlanes_u8;

    while (len >= chunk) {
        const npyv_b8 rhs_false = npyv_cmpeq_u8(npyv_load_u8(in), zero);
        // false < true: the scalar is false and the element is not.
        // NOTE(review): relies on npyv_andc_b8(x, y) meaning x & ~y - confirm.
        const npyv_b8 result    = npyv_andc_b8(lhs_false, rhs_false);
        // Keep only the lowest bit of every mask lane: stored bytes are 0/1.
        npyv_store_u8(out, npyv_and_u8(npyv_cvt_u8_b8(result), one));

        len -= chunk;
        in  += chunk;
        out += chunk;
    }

    // Leftover elements that do not fill a whole vector.
    while (len > 0) {
        const npyv_lanetype_u8 rhs = (*in != 0);
        *out = (lhs < rhs);
        --len;
        ++in;
        ++out;
    }
}

/*
 * Boolean `src[i] < scalar` over `len` contiguous bytes.
 *
 * args[0] is read as the byte input array, args[1] as a pointer to the
 * single right-hand boolean and args[2] as the byte output array.  Any
 * non-zero byte (scalar or element) is treated as true; each output byte is
 * stored as 0 or 1.
 */
static void simd_binary_scalar2_less_b8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[0];
    // Collapse the scalar to 0/1 once.  The vector body only asks "is it
    // zero?", whereas a raw scalar such as 2 would make the tail report
    // `1 < 2` (true < true) as true.
    const npyv_lanetype_u8 scalar = (*(npyv_lanetype_u8 *) args[1] != 0);
    npyv_lanetype_u8 *dst   = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vzero     = npyv_setall_u8(0x0);
    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
    // Mask lanes are set where the operand is false (== 0).
    const npyv_b8 b         = npyv_cmpeq_u8(vscalar, vzero);
    const npyv_u8 truemask  = npyv_setall_u8(0x1);
    const int vstep         = npyv_nlanes_u8;

    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
        npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
        // false < true: the element is false and the scalar is not.
        // NOTE(review): relies on npyv_andc_b8(x, y) meaning x & ~y - confirm.
        npyv_b8 c = npyv_andc_b8(a, b);
        // Keep only the lowest bit of every mask lane: stored bytes are 0/1.
        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
    }

    // Leftover elements that do not fill a whole vector.
    for (; len > 0; --len, ++src, ++dst) {
        const npyv_lanetype_u8 a = *src != 0;
        *dst = a < scalar;
    }
}
#endif

#line 220

#if NPY_SIMD
/*
 * Boolean `src1[i] <= src2[i]` over `len` contiguous bytes.
 *
 * args[0] and args[1] are read as the byte input arrays and args[2] as the
 * byte output array.  Any non-zero input byte is treated as true; each
 * output byte is stored as 0 or 1.
 */
static void simd_binary_less_equal_b8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *lhs = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *rhs = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 zero    = npyv_setall_u8(0x0);
    const npyv_u8 one     = npyv_setall_u8(0x1);
    const int chunk       = npyv_nlanes_u8;

    while (len >= chunk) {
        // Comparing with zero flags the lanes that hold *false*.
        const npyv_b8 lhs_false = npyv_cmpeq_u8(npyv_load_u8(lhs), zero);
        const npyv_b8 rhs_false = npyv_cmpeq_u8(npyv_load_u8(rhs), zero);
        // x <= y holds when x is false or y is true.
        // NOTE(review): relies on npyv_orc_b8(x, y) meaning x | ~y - confirm.
        const npyv_b8 result    = npyv_orc_b8(lhs_false, rhs_false);
        // Keep only the lowest bit of every mask lane: stored bytes are 0/1.
        npyv_store_u8(out, npyv_and_u8(npyv_cvt_u8_b8(result), one));

        len -= chunk;
        lhs += chunk;
        rhs += chunk;
        out += chunk;
    }

    // Leftover elements that do not fill a whole vector.
    while (len > 0) {
        const npyv_lanetype_u8 x = (*lhs != 0);
        const npyv_lanetype_u8 y = (*rhs != 0);
        *out = (x <= y);
        --len;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/*
 * Boolean `scalar <= src[i]` over `len` contiguous bytes.
 *
 * args[0] is read as a pointer to the single left-hand boolean, args[1] as
 * the byte input array and args[2] as the byte output array.  Any non-zero
 * byte (scalar or element) is treated as true; each output byte is stored
 * as 0 or 1.
 */
static void simd_binary_scalar1_less_equal_b8(char **args, npy_intp len)
{
    // Collapse the scalar to 0/1 once.  The vector body only asks "is it
    // zero?", whereas a raw scalar such as 2 would make the tail report
    // `2 <= 1` (true <= true) as false.
    const npyv_lanetype_u8 scalar = (*(npyv_lanetype_u8 *) args[0] != 0);
    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *dst   = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vzero     = npyv_setall_u8(0x0);
    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
    // Mask lanes are set where the operand is false (== 0).
    const npyv_b8 a         = npyv_cmpeq_u8(vscalar, vzero);
    const npyv_u8 truemask  = npyv_setall_u8(0x1);
    const int vstep         = npyv_nlanes_u8;

    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
        npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
        // x <= y holds when x is false or y is true.
        // NOTE(review): relies on npyv_orc_b8(x, y) meaning x | ~y - confirm.
        npyv_b8 c = npyv_orc_b8(a, b);
        // Keep only the lowest bit of every mask lane: stored bytes are 0/1.
        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
    }

    // Leftover elements that do not fill a whole vector.
    for (; len > 0; --len, ++src, ++dst) {
        const npyv_lanetype_u8 b = *src != 0;
        *dst = scalar <= b;
    }
}

/*
 * Boolean `src[i] <= scalar` over `len` contiguous bytes.
 *
 * args[0] is read as the byte input array, args[1] as a pointer to the
 * single right-hand boolean and args[2] as the byte output array.  Each
 * output byte is stored as 0 or 1.
 */
static void simd_binary_scalar2_less_equal_b8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *in       = (npyv_lanetype_u8 *) args[0];
    // Reducing the scalar to 0/1 leaves every result unchanged here: an
    // element that is 0 or 1 is <= the scalar unless the element is 1 and
    // the scalar is zero.
    const npyv_lanetype_u8 rhs = (*(npyv_lanetype_u8 *) args[1] != 0);
    npyv_lanetype_u8 *out      = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 zero         = npyv_setall_u8(0x0);
    const npyv_u8 one          = npyv_setall_u8(0x1);
    // Mask lanes are set where the operand is false (== 0).
    const npyv_b8 rhs_false    = npyv_cmpeq_u8(npyv_setall_u8(rhs), zero);
    const int chunk            = npyv_nlanes_u8;

    while (len >= chunk) {
        const npyv_b8 lhs_false = npyv_cmpeq_u8(npyv_load_u8(in), zero);
        // x <= y holds when x is false or y is true.
        // NOTE(review): relies on npyv_orc_b8(x, y) meaning x | ~y - confirm.
        const npyv_b8 result    = npyv_orc_b8(lhs_false, rhs_false);
        // Keep only the lowest bit of every mask lane: stored bytes are 0/1.
        npyv_store_u8(out, npyv_and_u8(npyv_cvt_u8_b8(result), one));

        len -= chunk;
        in  += chunk;
        out += chunk;
    }

    // Leftover elements that do not fill a whole vector.
    while (len > 0) {
        const npyv_lanetype_u8 lhs = (*in != 0);
        *out = (lhs <= rhs);
        --len;
        ++in;
        ++out;
    }
}
#endif


#line 304
#line 310
#if !((1 || 0) && 0)
/*
 * Boolean `equal` driver: picks a contiguous SIMD kernel when the operand
 * layout allows it and otherwise runs the generic strided loop.
 *
 * args[0]/args[1] are the inputs and args[2] the output; dimensions[0] is
 * the element count and steps[] holds the matching strides.
 */
static inline void
run_binary_simd_equal_b8(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    // The kernels are only tried when is_mem_overlap() reports false for
    // both input/output pairs.
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first operand is a scalar */
        if (IS_BINARY_CONT_S1(npy_ubyte, npy_bool)) {
            simd_binary_scalar1_equal_b8(args, dimensions[0]);
            return;
        }
        /* second operand is a scalar */
        if (IS_BINARY_CONT_S2(npy_ubyte, npy_bool)) {
            simd_binary_scalar2_equal_b8(args, dimensions[0]);
            return;
        }
        /* both operands are arrays */
        if (IS_BINARY_CONT(npy_ubyte, npy_bool)) {
            simd_binary_equal_b8(args, dimensions[0]);
            return;
        }
    }
#endif

    // Generic fallback; BINARY_LOOP supplies ip1/ip2/op1 for each element.
    BINARY_LOOP {
        // Any non-zero byte counts as true.
        const npy_bool lhs = (*(npy_bool *)ip1 != 0);
        const npy_bool rhs = (*(npy_bool *)ip2 != 0);
        *(npy_bool *)op1 = (lhs == rhs);
    }
}
#endif

#line 310
#if !((0 || 1) && 0)
/*
 * Boolean `not_equal` driver: picks a contiguous SIMD kernel when the
 * operand layout allows it and otherwise runs the generic strided loop.
 *
 * args[0]/args[1] are the inputs and args[2] the output; dimensions[0] is
 * the element count and steps[] holds the matching strides.
 */
static inline void
run_binary_simd_not_equal_b8(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    // The kernels are only tried when is_mem_overlap() reports false for
    // both input/output pairs.
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first operand is a scalar */
        if (IS_BINARY_CONT_S1(npy_ubyte, npy_bool)) {
            simd_binary_scalar1_not_equal_b8(args, dimensions[0]);
            return;
        }
        /* second operand is a scalar */
        if (IS_BINARY_CONT_S2(npy_ubyte, npy_bool)) {
            simd_binary_scalar2_not_equal_b8(args, dimensions[0]);
            return;
        }
        /* both operands are arrays */
        if (IS_BINARY_CONT(npy_ubyte, npy_bool)) {
            simd_binary_not_equal_b8(args, dimensions[0]);
            return;
        }
    }
#endif

    // Generic fallback; BINARY_LOOP supplies ip1/ip2/op1 for each element.
    BINARY_LOOP {
        // Any non-zero byte counts as true.
        const npy_bool lhs = (*(npy_bool *)ip1 != 0);
        const npy_bool rhs = (*(npy_bool *)ip2 != 0);
        *(npy_bool *)op1 = (lhs != rhs);
    }
}
#endif

#line 310
#if !((0 || 0) && 0)
/*
 * Boolean `less` driver: picks a contiguous SIMD kernel when the operand
 * layout allows it and otherwise runs the generic strided loop.
 *
 * args[0]/args[1] are the inputs and args[2] the output; dimensions[0] is
 * the element count and steps[] holds the matching strides.
 */
static inline void
run_binary_simd_less_b8(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    // The kernels are only tried when is_mem_overlap() reports false for
    // both input/output pairs.
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first operand is a scalar */
        if (IS_BINARY_CONT_S1(npy_ubyte, npy_bool)) {
            simd_binary_scalar1_less_b8(args, dimensions[0]);
            return;
        }
        /* second operand is a scalar */
        if (IS_BINARY_CONT_S2(npy_ubyte, npy_bool)) {
            simd_binary_scalar2_less_b8(args, dimensions[0]);
            return;
        }
        /* both operands are arrays */
        if (IS_BINARY_CONT(npy_ubyte, npy_bool)) {
            simd_binary_less_b8(args, dimensions[0]);
            return;
        }
    }
#endif

    // Generic fallback; BINARY_LOOP supplies ip1/ip2/op1 for each element.
    BINARY_LOOP {
        // Any non-zero byte counts as true.
        const npy_bool lhs = (*(npy_bool *)ip1 != 0);
        const npy_bool rhs = (*(npy_bool *)ip2 != 0);
        *(npy_bool *)op1 = (lhs < rhs);
    }
}
#endif

#line 310
#if !((0 || 0) && 0)
/*
 * Boolean `less_equal` driver: picks a contiguous SIMD kernel when the
 * operand layout allows it and otherwise runs the generic strided loop.
 *
 * args[0]/args[1] are the inputs and args[2] the output; dimensions[0] is
 * the element count and steps[] holds the matching strides.
 */
static inline void
run_binary_simd_less_equal_b8(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    // The kernels are only tried when is_mem_overlap() reports false for
    // both input/output pairs.
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first operand is a scalar */
        if (IS_BINARY_CONT_S1(npy_ubyte, npy_bool)) {
            simd_binary_scalar1_less_equal_b8(args, dimensions[0]);
            return;
        }
        /* second operand is a scalar */
        if (IS_BINARY_CONT_S2(npy_ubyte, npy_bool)) {
            simd_binary_scalar2_less_equal_b8(args, dimensions[0]);
            return;
        }
        /* both operands are arrays */
        if (IS_BINARY_CONT(npy_ubyte, npy_bool)) {
            simd_binary_less_equal_b8(args, dimensions[0]);
            return;
        }
    }
#endif

    // Generic fallback; BINARY_LOOP supplies ip1/ip2/op1 for each element.
    BINARY_LOOP {
        // Any non-zero byte counts as true.
        const npy_bool lhs = (*(npy_bool *)ip1 != 0);
        const npy_bool rhs = (*(npy_bool *)ip2 != 0);
        *(npy_bool *)op1 = (lhs <= rhs);
    }
}
#endif


#line 304
#line 310
#if !((1 || 0) && 0)
/*
 * Unsigned-byte `equal` driver: picks a contiguous SIMD kernel when the
 * operand layout allows it and otherwise runs the generic strided loop.
 *
 * args[0]/args[1] are the inputs and args[2] the output; dimensions[0] is
 * the element count and steps[] holds the matching strides.
 */
static inline void
run_binary_simd_equal_u8(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    // The kernels are only tried when is_mem_overlap() reports false for
    // both input/output pairs.
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first operand is a scalar */
        if (IS_BINARY_CONT_S1(npy_ubyte, npy_bool)) {
            simd_binary_scalar1_equal_u8(args, dimensions[0]);
            return;
        }
        /* second operand is a scalar */
        if (IS_BINARY_CONT_S2(npy_ubyte, npy_bool)) {
            simd_binary_scalar2_equal_u8(args, dimensions[0]);
            return;
        }
        /* both operands are arrays */
        if (IS_BINARY_CONT(npy_ubyte, npy_bool)) {
            simd_binary_equal_u8(args, dimensions[0]);
            return;
        }
    }
#endif

    // Generic fallback; BINARY_LOOP supplies ip1/ip2/op1 for each element.
    BINARY_LOOP {
        const npy_ubyte lhs = *(npy_ubyte *)ip1;
        const npy_ubyte rhs = *(npy_ubyte *)ip2;
        *(npy_bool *)op1 = (lhs == rhs);
    }
}
#endif

#line 310
#if !((0 || 1) && 0)
/*
 * Unsigned-byte `not_equal` driver: picks a contiguous SIMD kernel when the
 * operand layout allows it and otherwise runs the generic strided loop.
 *
 * args[0]/args[1] are the inputs and args[2] the output; dimensions[0] is
 * the element count and steps[] holds the matching strides.
 */
static inline void
run_binary_simd_not_equal_u8(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    // The kernels are only tried when is_mem_overlap() reports false for
    // both input/output pairs.
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first operand is a scalar */
        if (IS_BINARY_CONT_S1(npy_ubyte, npy_bool)) {
            simd_binary_scalar1_not_equal_u8(args, dimensions[0]);
            return;
        }
        /* second operand is a scalar */
        if (IS_BINARY_CONT_S2(npy_ubyte, npy_bool)) {
            simd_binary_scalar2_not_equal_u8(args, dimensions[0]);
            return;
        }
        /* both operands are arrays */
        if (IS_BINARY_CONT(npy_ubyte, npy_bool)) {
            simd_binary_not_equal_u8(args, dimensions[0]);
            return;
        }
    }
#endif

    // Generic fallback; BINARY_LOOP supplies ip1/ip2/op1 for each element.
    BINARY_LOOP {
        const npy_ubyte lhs = *(npy_ubyte *)ip1;
        const npy_ubyte rhs = *(npy_ubyte *)ip2;
        *(npy_bool *)op1 = (lhs != rhs);
    }
}
#endif

#line 310
#if !((0 || 0) && 0)
/*
 * Stores (a < b) as npy_bool for each pair of npy_ubyte inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_less_u8(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_ubyte, npy_bool)) {
            simd_binary_scalar1_less_u8(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_ubyte, npy_bool)) {
            simd_binary_scalar2_less_u8(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_ubyte, npy_bool)) {
            simd_binary_less_u8(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_ubyte lhs = *(npy_ubyte *)ip1;
        const npy_ubyte rhs = *(npy_ubyte *)ip2;
        *(npy_bool *)op1 = (lhs < rhs);
    }
}
#endif

#line 310
#if !((0 || 0) && 0)
/*
 * Stores (a <= b) as npy_bool for each pair of npy_ubyte inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_less_equal_u8(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_ubyte, npy_bool)) {
            simd_binary_scalar1_less_equal_u8(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_ubyte, npy_bool)) {
            simd_binary_scalar2_less_equal_u8(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_ubyte, npy_bool)) {
            simd_binary_less_equal_u8(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_ubyte lhs = *(npy_ubyte *)ip1;
        const npy_ubyte rhs = *(npy_ubyte *)ip2;
        *(npy_bool *)op1 = (lhs <= rhs);
    }
}
#endif


#line 304
#line 310
#if !((1 || 0) && 1)
/*
 * Stores (a == b) as npy_bool for each pair of npy_byte inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 *
 * NOTE(review): the enclosing guard `#if !((1 || 0) && 1)` evaluates to 0,
 * so this instantiation is compiled out; presumably signed equality is
 * served by the unsigned variant -- confirm in the dispatch code.
 */
static inline void
run_binary_simd_equal_s8(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_byte, npy_bool)) {
            simd_binary_scalar1_equal_s8(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_byte, npy_bool)) {
            simd_binary_scalar2_equal_s8(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_byte, npy_bool)) {
            simd_binary_equal_s8(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_byte lhs = *(npy_byte *)ip1;
        const npy_byte rhs = *(npy_byte *)ip2;
        *(npy_bool *)op1 = (lhs == rhs);
    }
}
#endif

#line 310
#if !((0 || 1) && 1)
/*
 * Stores (a != b) as npy_bool for each pair of npy_byte inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 *
 * NOTE(review): the enclosing guard `#if !((0 || 1) && 1)` evaluates to 0,
 * so this instantiation is compiled out; presumably signed inequality is
 * served by the unsigned variant -- confirm in the dispatch code.
 */
static inline void
run_binary_simd_not_equal_s8(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_byte, npy_bool)) {
            simd_binary_scalar1_not_equal_s8(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_byte, npy_bool)) {
            simd_binary_scalar2_not_equal_s8(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_byte, npy_bool)) {
            simd_binary_not_equal_s8(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_byte lhs = *(npy_byte *)ip1;
        const npy_byte rhs = *(npy_byte *)ip2;
        *(npy_bool *)op1 = (lhs != rhs);
    }
}
#endif

#line 310
#if !((0 || 0) && 1)
/*
 * Stores (a < b) as npy_bool for each pair of npy_byte inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_less_s8(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_byte, npy_bool)) {
            simd_binary_scalar1_less_s8(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_byte, npy_bool)) {
            simd_binary_scalar2_less_s8(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_byte, npy_bool)) {
            simd_binary_less_s8(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_byte lhs = *(npy_byte *)ip1;
        const npy_byte rhs = *(npy_byte *)ip2;
        *(npy_bool *)op1 = (lhs < rhs);
    }
}
#endif

#line 310
#if !((0 || 0) && 1)
/*
 * Stores (a <= b) as npy_bool for each pair of npy_byte inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_less_equal_s8(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_byte, npy_bool)) {
            simd_binary_scalar1_less_equal_s8(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_byte, npy_bool)) {
            simd_binary_scalar2_less_equal_s8(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_byte, npy_bool)) {
            simd_binary_less_equal_s8(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_byte lhs = *(npy_byte *)ip1;
        const npy_byte rhs = *(npy_byte *)ip2;
        *(npy_bool *)op1 = (lhs <= rhs);
    }
}
#endif


#line 304
#line 310
#if !((1 || 0) && 0)
/*
 * Stores (a == b) as npy_bool for each pair of npy_ushort inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_equal_u16(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_ushort, npy_bool)) {
            simd_binary_scalar1_equal_u16(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_ushort, npy_bool)) {
            simd_binary_scalar2_equal_u16(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_ushort, npy_bool)) {
            simd_binary_equal_u16(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_ushort lhs = *(npy_ushort *)ip1;
        const npy_ushort rhs = *(npy_ushort *)ip2;
        *(npy_bool *)op1 = (lhs == rhs);
    }
}
#endif

#line 310
#if !((0 || 1) && 0)
/*
 * Stores (a != b) as npy_bool for each pair of npy_ushort inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_not_equal_u16(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_ushort, npy_bool)) {
            simd_binary_scalar1_not_equal_u16(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_ushort, npy_bool)) {
            simd_binary_scalar2_not_equal_u16(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_ushort, npy_bool)) {
            simd_binary_not_equal_u16(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_ushort lhs = *(npy_ushort *)ip1;
        const npy_ushort rhs = *(npy_ushort *)ip2;
        *(npy_bool *)op1 = (lhs != rhs);
    }
}
#endif

#line 310
#if !((0 || 0) && 0)
/*
 * Stores (a < b) as npy_bool for each pair of npy_ushort inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_less_u16(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_ushort, npy_bool)) {
            simd_binary_scalar1_less_u16(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_ushort, npy_bool)) {
            simd_binary_scalar2_less_u16(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_ushort, npy_bool)) {
            simd_binary_less_u16(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_ushort lhs = *(npy_ushort *)ip1;
        const npy_ushort rhs = *(npy_ushort *)ip2;
        *(npy_bool *)op1 = (lhs < rhs);
    }
}
#endif

#line 310
#if !((0 || 0) && 0)
/*
 * Stores (a <= b) as npy_bool for each pair of npy_ushort inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_less_equal_u16(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_ushort, npy_bool)) {
            simd_binary_scalar1_less_equal_u16(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_ushort, npy_bool)) {
            simd_binary_scalar2_less_equal_u16(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_ushort, npy_bool)) {
            simd_binary_less_equal_u16(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_ushort lhs = *(npy_ushort *)ip1;
        const npy_ushort rhs = *(npy_ushort *)ip2;
        *(npy_bool *)op1 = (lhs <= rhs);
    }
}
#endif


#line 304
#line 310
#if !((1 || 0) && 1)
/*
 * Stores (a == b) as npy_bool for each pair of npy_short inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 *
 * NOTE(review): the enclosing guard `#if !((1 || 0) && 1)` evaluates to 0,
 * so this instantiation is compiled out; presumably signed equality is
 * served by the unsigned variant -- confirm in the dispatch code.
 */
static inline void
run_binary_simd_equal_s16(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_short, npy_bool)) {
            simd_binary_scalar1_equal_s16(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_short, npy_bool)) {
            simd_binary_scalar2_equal_s16(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_short, npy_bool)) {
            simd_binary_equal_s16(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_short lhs = *(npy_short *)ip1;
        const npy_short rhs = *(npy_short *)ip2;
        *(npy_bool *)op1 = (lhs == rhs);
    }
}
#endif

#line 310
#if !((0 || 1) && 1)
/*
 * Stores (a != b) as npy_bool for each pair of npy_short inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 *
 * NOTE(review): the enclosing guard `#if !((0 || 1) && 1)` evaluates to 0,
 * so this instantiation is compiled out; presumably signed inequality is
 * served by the unsigned variant -- confirm in the dispatch code.
 */
static inline void
run_binary_simd_not_equal_s16(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_short, npy_bool)) {
            simd_binary_scalar1_not_equal_s16(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_short, npy_bool)) {
            simd_binary_scalar2_not_equal_s16(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_short, npy_bool)) {
            simd_binary_not_equal_s16(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_short lhs = *(npy_short *)ip1;
        const npy_short rhs = *(npy_short *)ip2;
        *(npy_bool *)op1 = (lhs != rhs);
    }
}
#endif

#line 310
#if !((0 || 0) && 1)
/*
 * Stores (a < b) as npy_bool for each pair of npy_short inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_less_s16(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_short, npy_bool)) {
            simd_binary_scalar1_less_s16(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_short, npy_bool)) {
            simd_binary_scalar2_less_s16(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_short, npy_bool)) {
            simd_binary_less_s16(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_short lhs = *(npy_short *)ip1;
        const npy_short rhs = *(npy_short *)ip2;
        *(npy_bool *)op1 = (lhs < rhs);
    }
}
#endif

#line 310
#if !((0 || 0) && 1)
/*
 * Stores (a <= b) as npy_bool for each pair of npy_short inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_less_equal_s16(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_short, npy_bool)) {
            simd_binary_scalar1_less_equal_s16(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_short, npy_bool)) {
            simd_binary_scalar2_less_equal_s16(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_short, npy_bool)) {
            simd_binary_less_equal_s16(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_short lhs = *(npy_short *)ip1;
        const npy_short rhs = *(npy_short *)ip2;
        *(npy_bool *)op1 = (lhs <= rhs);
    }
}
#endif


#line 304
#line 310
#if !((1 || 0) && 0)
/*
 * Stores (a == b) as npy_bool for each pair of npy_uint inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_equal_u32(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_uint, npy_bool)) {
            simd_binary_scalar1_equal_u32(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_uint, npy_bool)) {
            simd_binary_scalar2_equal_u32(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_uint, npy_bool)) {
            simd_binary_equal_u32(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_uint lhs = *(npy_uint *)ip1;
        const npy_uint rhs = *(npy_uint *)ip2;
        *(npy_bool *)op1 = (lhs == rhs);
    }
}
#endif

#line 310
#if !((0 || 1) && 0)
/*
 * Stores (a != b) as npy_bool for each pair of npy_uint inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_not_equal_u32(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_uint, npy_bool)) {
            simd_binary_scalar1_not_equal_u32(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_uint, npy_bool)) {
            simd_binary_scalar2_not_equal_u32(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_uint, npy_bool)) {
            simd_binary_not_equal_u32(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_uint lhs = *(npy_uint *)ip1;
        const npy_uint rhs = *(npy_uint *)ip2;
        *(npy_bool *)op1 = (lhs != rhs);
    }
}
#endif

#line 310
#if !((0 || 0) && 0)
/*
 * Stores (a < b) as npy_bool for each pair of npy_uint inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_less_u32(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_uint, npy_bool)) {
            simd_binary_scalar1_less_u32(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_uint, npy_bool)) {
            simd_binary_scalar2_less_u32(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_uint, npy_bool)) {
            simd_binary_less_u32(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_uint lhs = *(npy_uint *)ip1;
        const npy_uint rhs = *(npy_uint *)ip2;
        *(npy_bool *)op1 = (lhs < rhs);
    }
}
#endif

#line 310
#if !((0 || 0) && 0)
/*
 * Stores (a <= b) as npy_bool for each pair of npy_uint inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_less_equal_u32(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_uint, npy_bool)) {
            simd_binary_scalar1_less_equal_u32(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_uint, npy_bool)) {
            simd_binary_scalar2_less_equal_u32(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_uint, npy_bool)) {
            simd_binary_less_equal_u32(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_uint lhs = *(npy_uint *)ip1;
        const npy_uint rhs = *(npy_uint *)ip2;
        *(npy_bool *)op1 = (lhs <= rhs);
    }
}
#endif


#line 304
#line 310
#if !((1 || 0) && 1)
/*
 * Stores (a == b) as npy_bool for each pair of npy_int inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 *
 * NOTE(review): the enclosing guard `#if !((1 || 0) && 1)` evaluates to 0,
 * so this instantiation is compiled out; presumably signed equality is
 * served by the unsigned variant -- confirm in the dispatch code.
 */
static inline void
run_binary_simd_equal_s32(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_int, npy_bool)) {
            simd_binary_scalar1_equal_s32(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_int, npy_bool)) {
            simd_binary_scalar2_equal_s32(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_int, npy_bool)) {
            simd_binary_equal_s32(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_int lhs = *(npy_int *)ip1;
        const npy_int rhs = *(npy_int *)ip2;
        *(npy_bool *)op1 = (lhs == rhs);
    }
}
#endif

#line 310
#if !((0 || 1) && 1)
/*
 * Stores (a != b) as npy_bool for each pair of npy_int inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 *
 * NOTE(review): the enclosing guard `#if !((0 || 1) && 1)` evaluates to 0,
 * so this instantiation is compiled out; presumably signed inequality is
 * served by the unsigned variant -- confirm in the dispatch code.
 */
static inline void
run_binary_simd_not_equal_s32(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_int, npy_bool)) {
            simd_binary_scalar1_not_equal_s32(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_int, npy_bool)) {
            simd_binary_scalar2_not_equal_s32(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_int, npy_bool)) {
            simd_binary_not_equal_s32(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_int lhs = *(npy_int *)ip1;
        const npy_int rhs = *(npy_int *)ip2;
        *(npy_bool *)op1 = (lhs != rhs);
    }
}
#endif

#line 310
#if !((0 || 0) && 1)
/*
 * Stores (a < b) as npy_bool for each pair of npy_int inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_less_s32(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_int, npy_bool)) {
            simd_binary_scalar1_less_s32(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_int, npy_bool)) {
            simd_binary_scalar2_less_s32(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_int, npy_bool)) {
            simd_binary_less_s32(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_int lhs = *(npy_int *)ip1;
        const npy_int rhs = *(npy_int *)ip2;
        *(npy_bool *)op1 = (lhs < rhs);
    }
}
#endif

#line 310
#if !((0 || 0) && 1)
/*
 * Stores (a <= b) as npy_bool for each pair of npy_int inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_less_equal_s32(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_int, npy_bool)) {
            simd_binary_scalar1_less_equal_s32(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_int, npy_bool)) {
            simd_binary_scalar2_less_equal_s32(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_int, npy_bool)) {
            simd_binary_less_equal_s32(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_int lhs = *(npy_int *)ip1;
        const npy_int rhs = *(npy_int *)ip2;
        *(npy_bool *)op1 = (lhs <= rhs);
    }
}
#endif


#line 304
#line 310
#if !((1 || 0) && 0)
/*
 * Stores (a == b) as npy_bool for each pair of npy_ulonglong inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_equal_u64(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_ulonglong, npy_bool)) {
            simd_binary_scalar1_equal_u64(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_ulonglong, npy_bool)) {
            simd_binary_scalar2_equal_u64(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_ulonglong, npy_bool)) {
            simd_binary_equal_u64(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_ulonglong lhs = *(npy_ulonglong *)ip1;
        const npy_ulonglong rhs = *(npy_ulonglong *)ip2;
        *(npy_bool *)op1 = (lhs == rhs);
    }
}
#endif

#line 310
#if !((0 || 1) && 0)
/*
 * Stores (a != b) as npy_bool for each pair of npy_ulonglong inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_not_equal_u64(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_ulonglong, npy_bool)) {
            simd_binary_scalar1_not_equal_u64(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_ulonglong, npy_bool)) {
            simd_binary_scalar2_not_equal_u64(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_ulonglong, npy_bool)) {
            simd_binary_not_equal_u64(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_ulonglong lhs = *(npy_ulonglong *)ip1;
        const npy_ulonglong rhs = *(npy_ulonglong *)ip2;
        *(npy_bool *)op1 = (lhs != rhs);
    }
}
#endif

#line 310
#if !((0 || 0) && 0)
/*
 * Stores (a < b) as npy_bool for each pair of npy_ulonglong inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_less_u64(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_ulonglong, npy_bool)) {
            simd_binary_scalar1_less_u64(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_ulonglong, npy_bool)) {
            simd_binary_scalar2_less_u64(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_ulonglong, npy_bool)) {
            simd_binary_less_u64(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_ulonglong lhs = *(npy_ulonglong *)ip1;
        const npy_ulonglong rhs = *(npy_ulonglong *)ip2;
        *(npy_bool *)op1 = (lhs < rhs);
    }
}
#endif

#line 310
#if !((0 || 0) && 0)
/*
 * Stores (a <= b) as npy_bool for each pair of npy_ulonglong inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 */
static inline void
run_binary_simd_less_equal_u64(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_ulonglong, npy_bool)) {
            simd_binary_scalar1_less_equal_u64(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_ulonglong, npy_bool)) {
            simd_binary_scalar2_less_equal_u64(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_ulonglong, npy_bool)) {
            simd_binary_less_equal_u64(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_ulonglong lhs = *(npy_ulonglong *)ip1;
        const npy_ulonglong rhs = *(npy_ulonglong *)ip2;
        *(npy_bool *)op1 = (lhs <= rhs);
    }
}
#endif


#line 304
#line 310
#if !((1 || 0) && 1)
/*
 * Stores (a == b) as npy_bool for each pair of npy_longlong inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 *
 * NOTE(review): the enclosing guard `#if !((1 || 0) && 1)` evaluates to 0,
 * so this instantiation is compiled out; presumably signed equality is
 * served by the unsigned variant -- confirm in the dispatch code.
 */
static inline void
run_binary_simd_equal_s64(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_longlong, npy_bool)) {
            simd_binary_scalar1_equal_s64(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_longlong, npy_bool)) {
            simd_binary_scalar2_equal_s64(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_longlong, npy_bool)) {
            simd_binary_equal_s64(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_longlong lhs = *(npy_longlong *)ip1;
        const npy_longlong rhs = *(npy_longlong *)ip2;
        *(npy_bool *)op1 = (lhs == rhs);
    }
}
#endif

#line 310
#if !((0 || 1) && 1)
/*
 * Stores (a != b) as npy_bool for each pair of npy_longlong inputs.
 *
 * When NPY_SIMD is set and is_mem_overlap() reports no overlap between
 * either input (args[0], args[1]) and args[2], one of three SIMD kernels is
 * picked by the IS_BINARY_CONT* layout checks and handed dimensions[0] as
 * its length. Everything else falls through to the BINARY_LOOP body at the
 * bottom, which takes ip1, ip2 and op1 from that macro.
 *
 * NOTE(review): the enclosing guard `#if !((0 || 1) && 1)` evaluates to 0,
 * so this instantiation is compiled out; presumably signed inequality is
 * served by the unsigned variant -- confirm in the dispatch code.
 */
static inline void
run_binary_simd_not_equal_s64(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_longlong, npy_bool)) {
            simd_binary_scalar1_not_equal_s64(args, dimensions[0]);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_longlong, npy_bool)) {
            simd_binary_scalar2_not_equal_s64(args, dimensions[0]);
            return;
        }
        /* general layout accepted by IS_BINARY_CONT */
        if (IS_BINARY_CONT(npy_longlong, npy_bool)) {
            simd_binary_not_equal_s64(args, dimensions[0]);
            return;
        }
    }
#endif

    /* Fallback: compare one element pair per iteration. */
    BINARY_LOOP {
        const npy_longlong lhs = *(npy_longlong *)ip1;
        const npy_longlong rhs = *(npy_longlong *)ip2;
        *(npy_bool *)op1 = (lhs != rhs);
    }
}
#endif

#line 310
#if !((0 || 0) && 1)
/*
 * Element-wise `a < b` over npy_longlong operands, storing npy_bool results.
 *
 * With NPY_SIMD enabled, and provided is_mem_overlap() reports that neither
 * args[0] nor args[1] overlaps args[2] over dimensions[0] elements, the work
 * is handed to the simd_binary_*less_s64 kernel matching the stride layout
 * (IS_BINARY_CONT_S1, IS_BINARY_CONT_S2 or IS_BINARY_CONT). Every other case
 * falls through to the generic BINARY_LOOP.
 */
static inline void
run_binary_simd_less_s64(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        const npy_intp count = dimensions[0];

        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_longlong, npy_bool)) {
            simd_binary_scalar1_less_s64(args, count);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_longlong, npy_bool)) {
            simd_binary_scalar2_less_s64(args, count);
            return;
        }
        /* plain IS_BINARY_CONT layout, no scalar argument */
        if (IS_BINARY_CONT(npy_longlong, npy_bool)) {
            simd_binary_less_s64(args, count);
            return;
        }
    }
#endif

    /* generic fallback via BINARY_LOOP */
    BINARY_LOOP {
        const npy_longlong lhs = *(npy_longlong *)ip1;
        const npy_longlong rhs = *(npy_longlong *)ip2;
        *((npy_bool *)op1) = lhs < rhs;
    }
}
#endif

#line 310
#if !((0 || 0) && 1)
/*
 * Element-wise `a <= b` over npy_longlong operands, storing npy_bool results.
 *
 * With NPY_SIMD enabled, and provided is_mem_overlap() reports that neither
 * args[0] nor args[1] overlaps args[2] over dimensions[0] elements, the work
 * is handed to the simd_binary_*less_equal_s64 kernel matching the stride
 * layout (IS_BINARY_CONT_S1, IS_BINARY_CONT_S2 or IS_BINARY_CONT). Every
 * other case falls through to the generic BINARY_LOOP.
 */
static inline void
run_binary_simd_less_equal_s64(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        const npy_intp count = dimensions[0];

        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_longlong, npy_bool)) {
            simd_binary_scalar1_less_equal_s64(args, count);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_longlong, npy_bool)) {
            simd_binary_scalar2_less_equal_s64(args, count);
            return;
        }
        /* plain IS_BINARY_CONT layout, no scalar argument */
        if (IS_BINARY_CONT(npy_longlong, npy_bool)) {
            simd_binary_less_equal_s64(args, count);
            return;
        }
    }
#endif

    /* generic fallback via BINARY_LOOP */
    BINARY_LOOP {
        const npy_longlong lhs = *(npy_longlong *)ip1;
        const npy_longlong rhs = *(npy_longlong *)ip2;
        *((npy_bool *)op1) = lhs <= rhs;
    }
}
#endif


#line 304
#line 310
#if !((1 || 0) && 0)
/*
 * Element-wise `a == b` over npy_float operands, storing npy_bool results.
 *
 * With NPY_SIMD_F32 enabled, and provided is_mem_overlap() reports that
 * neither args[0] nor args[1] overlaps args[2] over dimensions[0] elements,
 * the work is handed to the simd_binary_*equal_f32 kernel matching the stride
 * layout (IS_BINARY_CONT_S1, IS_BINARY_CONT_S2 or IS_BINARY_CONT). Every
 * other case falls through to the generic BINARY_LOOP.
 */
static inline void
run_binary_simd_equal_f32(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD_F32
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        const npy_intp count = dimensions[0];

        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_float, npy_bool)) {
            simd_binary_scalar1_equal_f32(args, count);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_float, npy_bool)) {
            simd_binary_scalar2_equal_f32(args, count);
            return;
        }
        /* plain IS_BINARY_CONT layout, no scalar argument */
        if (IS_BINARY_CONT(npy_float, npy_bool)) {
            simd_binary_equal_f32(args, count);
            return;
        }
    }
#endif

    /* generic fallback via BINARY_LOOP */
    BINARY_LOOP {
        const npy_float lhs = *(npy_float *)ip1;
        const npy_float rhs = *(npy_float *)ip2;
        *((npy_bool *)op1) = lhs == rhs;
    }
}
#endif

#line 310
#if !((0 || 1) && 0)
/*
 * Element-wise `a != b` over npy_float operands, storing npy_bool results.
 *
 * With NPY_SIMD_F32 enabled, and provided is_mem_overlap() reports that
 * neither args[0] nor args[1] overlaps args[2] over dimensions[0] elements,
 * the work is handed to the simd_binary_*not_equal_f32 kernel matching the
 * stride layout (IS_BINARY_CONT_S1, IS_BINARY_CONT_S2 or IS_BINARY_CONT).
 * Every other case falls through to the generic BINARY_LOOP.
 */
static inline void
run_binary_simd_not_equal_f32(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD_F32
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        const npy_intp count = dimensions[0];

        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_float, npy_bool)) {
            simd_binary_scalar1_not_equal_f32(args, count);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_float, npy_bool)) {
            simd_binary_scalar2_not_equal_f32(args, count);
            return;
        }
        /* plain IS_BINARY_CONT layout, no scalar argument */
        if (IS_BINARY_CONT(npy_float, npy_bool)) {
            simd_binary_not_equal_f32(args, count);
            return;
        }
    }
#endif

    /* generic fallback via BINARY_LOOP */
    BINARY_LOOP {
        const npy_float lhs = *(npy_float *)ip1;
        const npy_float rhs = *(npy_float *)ip2;
        *((npy_bool *)op1) = lhs != rhs;
    }
}
#endif

#line 310
#if !((0 || 0) && 0)
/*
 * Element-wise `a < b` over npy_float operands, storing npy_bool results.
 *
 * With NPY_SIMD_F32 enabled, and provided is_mem_overlap() reports that
 * neither args[0] nor args[1] overlaps args[2] over dimensions[0] elements,
 * the work is handed to the simd_binary_*less_f32 kernel matching the stride
 * layout (IS_BINARY_CONT_S1, IS_BINARY_CONT_S2 or IS_BINARY_CONT). Every
 * other case falls through to the generic BINARY_LOOP.
 */
static inline void
run_binary_simd_less_f32(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD_F32
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        const npy_intp count = dimensions[0];

        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_float, npy_bool)) {
            simd_binary_scalar1_less_f32(args, count);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_float, npy_bool)) {
            simd_binary_scalar2_less_f32(args, count);
            return;
        }
        /* plain IS_BINARY_CONT layout, no scalar argument */
        if (IS_BINARY_CONT(npy_float, npy_bool)) {
            simd_binary_less_f32(args, count);
            return;
        }
    }
#endif

    /* generic fallback via BINARY_LOOP */
    BINARY_LOOP {
        const npy_float lhs = *(npy_float *)ip1;
        const npy_float rhs = *(npy_float *)ip2;
        *((npy_bool *)op1) = lhs < rhs;
    }
}
#endif

#line 310
#if !((0 || 0) && 0)
/*
 * Element-wise `a <= b` over npy_float operands, storing npy_bool results.
 *
 * With NPY_SIMD_F32 enabled, and provided is_mem_overlap() reports that
 * neither args[0] nor args[1] overlaps args[2] over dimensions[0] elements,
 * the work is handed to the simd_binary_*less_equal_f32 kernel matching the
 * stride layout (IS_BINARY_CONT_S1, IS_BINARY_CONT_S2 or IS_BINARY_CONT).
 * Every other case falls through to the generic BINARY_LOOP.
 */
static inline void
run_binary_simd_less_equal_f32(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD_F32
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        const npy_intp count = dimensions[0];

        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_float, npy_bool)) {
            simd_binary_scalar1_less_equal_f32(args, count);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_float, npy_bool)) {
            simd_binary_scalar2_less_equal_f32(args, count);
            return;
        }
        /* plain IS_BINARY_CONT layout, no scalar argument */
        if (IS_BINARY_CONT(npy_float, npy_bool)) {
            simd_binary_less_equal_f32(args, count);
            return;
        }
    }
#endif

    /* generic fallback via BINARY_LOOP */
    BINARY_LOOP {
        const npy_float lhs = *(npy_float *)ip1;
        const npy_float rhs = *(npy_float *)ip2;
        *((npy_bool *)op1) = lhs <= rhs;
    }
}
#endif


#line 304
#line 310
#if !((1 || 0) && 0)
/*
 * Element-wise `a == b` over npy_double operands, storing npy_bool results.
 *
 * With NPY_SIMD_F64 enabled, and provided is_mem_overlap() reports that
 * neither args[0] nor args[1] overlaps args[2] over dimensions[0] elements,
 * the work is handed to the simd_binary_*equal_f64 kernel matching the stride
 * layout (IS_BINARY_CONT_S1, IS_BINARY_CONT_S2 or IS_BINARY_CONT). Every
 * other case falls through to the generic BINARY_LOOP.
 */
static inline void
run_binary_simd_equal_f64(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD_F64
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        const npy_intp count = dimensions[0];

        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_double, npy_bool)) {
            simd_binary_scalar1_equal_f64(args, count);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_double, npy_bool)) {
            simd_binary_scalar2_equal_f64(args, count);
            return;
        }
        /* plain IS_BINARY_CONT layout, no scalar argument */
        if (IS_BINARY_CONT(npy_double, npy_bool)) {
            simd_binary_equal_f64(args, count);
            return;
        }
    }
#endif

    /* generic fallback via BINARY_LOOP */
    BINARY_LOOP {
        const npy_double lhs = *(npy_double *)ip1;
        const npy_double rhs = *(npy_double *)ip2;
        *((npy_bool *)op1) = lhs == rhs;
    }
}
#endif

#line 310
#if !((0 || 1) && 0)
/*
 * Element-wise `a != b` over npy_double operands, storing npy_bool results.
 *
 * With NPY_SIMD_F64 enabled, and provided is_mem_overlap() reports that
 * neither args[0] nor args[1] overlaps args[2] over dimensions[0] elements,
 * the work is handed to the simd_binary_*not_equal_f64 kernel matching the
 * stride layout (IS_BINARY_CONT_S1, IS_BINARY_CONT_S2 or IS_BINARY_CONT).
 * Every other case falls through to the generic BINARY_LOOP.
 */
static inline void
run_binary_simd_not_equal_f64(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD_F64
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        const npy_intp count = dimensions[0];

        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_double, npy_bool)) {
            simd_binary_scalar1_not_equal_f64(args, count);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_double, npy_bool)) {
            simd_binary_scalar2_not_equal_f64(args, count);
            return;
        }
        /* plain IS_BINARY_CONT layout, no scalar argument */
        if (IS_BINARY_CONT(npy_double, npy_bool)) {
            simd_binary_not_equal_f64(args, count);
            return;
        }
    }
#endif

    /* generic fallback via BINARY_LOOP */
    BINARY_LOOP {
        const npy_double lhs = *(npy_double *)ip1;
        const npy_double rhs = *(npy_double *)ip2;
        *((npy_bool *)op1) = lhs != rhs;
    }
}
#endif

#line 310
#if !((0 || 0) && 0)
/*
 * Element-wise `a < b` over npy_double operands, storing npy_bool results.
 *
 * With NPY_SIMD_F64 enabled, and provided is_mem_overlap() reports that
 * neither args[0] nor args[1] overlaps args[2] over dimensions[0] elements,
 * the work is handed to the simd_binary_*less_f64 kernel matching the stride
 * layout (IS_BINARY_CONT_S1, IS_BINARY_CONT_S2 or IS_BINARY_CONT). Every
 * other case falls through to the generic BINARY_LOOP.
 */
static inline void
run_binary_simd_less_f64(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD_F64
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        const npy_intp count = dimensions[0];

        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_double, npy_bool)) {
            simd_binary_scalar1_less_f64(args, count);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_double, npy_bool)) {
            simd_binary_scalar2_less_f64(args, count);
            return;
        }
        /* plain IS_BINARY_CONT layout, no scalar argument */
        if (IS_BINARY_CONT(npy_double, npy_bool)) {
            simd_binary_less_f64(args, count);
            return;
        }
    }
#endif

    /* generic fallback via BINARY_LOOP */
    BINARY_LOOP {
        const npy_double lhs = *(npy_double *)ip1;
        const npy_double rhs = *(npy_double *)ip2;
        *((npy_bool *)op1) = lhs < rhs;
    }
}
#endif

#line 310
#if !((0 || 0) && 0)
/*
 * Element-wise `a <= b` over npy_double operands, storing npy_bool results.
 *
 * With NPY_SIMD_F64 enabled, and provided is_mem_overlap() reports that
 * neither args[0] nor args[1] overlaps args[2] over dimensions[0] elements,
 * the work is handed to the simd_binary_*less_equal_f64 kernel matching the
 * stride layout (IS_BINARY_CONT_S1, IS_BINARY_CONT_S2 or IS_BINARY_CONT).
 * Every other case falls through to the generic BINARY_LOOP.
 */
static inline void
run_binary_simd_less_equal_f64(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD_F64
    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
        const npy_intp count = dimensions[0];

        /* first argument is a scalar */
        if (IS_BINARY_CONT_S1(npy_double, npy_bool)) {
            simd_binary_scalar1_less_equal_f64(args, count);
            return;
        }
        /* second argument is a scalar */
        if (IS_BINARY_CONT_S2(npy_double, npy_bool)) {
            simd_binary_scalar2_less_equal_f64(args, count);
            return;
        }
        /* plain IS_BINARY_CONT layout, no scalar argument */
        if (IS_BINARY_CONT(npy_double, npy_bool)) {
            simd_binary_less_equal_f64(args, count);
            return;
        }
    }
#endif

    /* generic fallback via BINARY_LOOP */
    BINARY_LOOP {
        const npy_double lhs = *(npy_double *)ip1;
        const npy_double rhs = *(npy_double *)ip2;
        *((npy_bool *)op1) = lhs <= rhs;
    }
}
#endif



/********************************************************************************
 ** Defining ufunc inner functions
 ********************************************************************************/

/*
 * In order to reduce the size of the binary generated from this source, the
 * following rules are applied: 1) each data type implements its function
 * 'greater' as a call to the function 'less' but with the arguments swapped,
 * the same applies to the function 'greater_equal', which is implemented
 * with a call to the function 'less_equal', and 2) for the integer datatypes
 * of the same size (e.g. 8-bit), a single kernel of the functions 'equal' and
 * 'not_equal' is used to implement both signed and unsigned types.
 */

#line 372
/*
 * Kernel-suffix selection for the UBYTE loops that follow. The first width N
 * in {8, 16, 32, 64} equal to NPY_BITSOF_BYTE is used:
 *   TO_SIMD_UTYPE(X) -> X##_uN
 *   TO_SIMD_SFX(X)   -> X##_uN  (the signed X##_sN alternative is disabled)
 * If no width matches, both macros are left undefined.
 */
#undef TO_SIMD_SFX
#undef TO_SIMD_UTYPE
#if 0
#line 378
#elif NPY_BITSOF_BYTE == 8
    #define TO_SIMD_UTYPE(X) X##_u8
    #if 0
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 378
#elif NPY_BITSOF_BYTE == 16
    #define TO_SIMD_UTYPE(X) X##_u16
    #if 0
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 378
#elif NPY_BITSOF_BYTE == 32
    #define TO_SIMD_UTYPE(X) X##_u32
    #if 0
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 378
#elif NPY_BITSOF_BYTE == 64
    #define TO_SIMD_UTYPE(X) X##_u64
    #if 0
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

#line 392
/*
 * UBYTE `greater` loop, implemented through `less` with the operands
 * exchanged (a > b is b < a): pointers and strides of slots 0 and 1 are
 * swapped, slot 2 is passed through unchanged. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_greater)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];

    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less)(swapped_args, dimensions, swapped_steps);
}

#line 392
/*
 * UBYTE `greater_equal` loop, implemented through `less_equal` with the
 * operands exchanged (a >= b is b <= a): pointers and strides of slots 0 and
 * 1 are swapped, slot 2 is passed through unchanged. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_greater_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];

    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped_args, dimensions, swapped_steps);
}


#line 404
/*
 * UBYTE `less` loop: forwards args, dimensions and steps untouched to the
 * run_binary_simd_less variant selected by TO_SIMD_SFX. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_less)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
}

#line 404
/*
 * UBYTE `less_equal` loop: forwards args, dimensions and steps untouched to
 * the run_binary_simd_less_equal variant selected by TO_SIMD_SFX. `func` is
 * unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_less_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
}


#line 414
/*
 * UBYTE `equal` loop: forwards to the run_binary_simd_equal variant selected
 * by TO_SIMD_UTYPE, i.e. the unsigned (_uN) runner. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
}

#line 414
/*
 * UBYTE `not_equal` loop: forwards to the run_binary_simd_not_equal variant
 * selected by TO_SIMD_UTYPE, i.e. the unsigned (_uN) runner. `func` is
 * unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_not_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
}


#line 372
/*
 * Kernel-suffix selection for the USHORT loops that follow. The first width
 * N in {8, 16, 32, 64} equal to NPY_BITSOF_SHORT is used:
 *   TO_SIMD_UTYPE(X) -> X##_uN
 *   TO_SIMD_SFX(X)   -> X##_uN  (the signed X##_sN alternative is disabled)
 * If no width matches, both macros are left undefined.
 */
#undef TO_SIMD_SFX
#undef TO_SIMD_UTYPE
#if 0
#line 378
#elif NPY_BITSOF_SHORT == 8
    #define TO_SIMD_UTYPE(X) X##_u8
    #if 0
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 378
#elif NPY_BITSOF_SHORT == 16
    #define TO_SIMD_UTYPE(X) X##_u16
    #if 0
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 378
#elif NPY_BITSOF_SHORT == 32
    #define TO_SIMD_UTYPE(X) X##_u32
    #if 0
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 378
#elif NPY_BITSOF_SHORT == 64
    #define TO_SIMD_UTYPE(X) X##_u64
    #if 0
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

#line 392
/*
 * USHORT `greater` loop, implemented through `less` with the operands
 * exchanged (a > b is b < a): pointers and strides of slots 0 and 1 are
 * swapped, slot 2 is passed through unchanged. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_greater)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];

    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less)(swapped_args, dimensions, swapped_steps);
}

#line 392
/*
 * USHORT `greater_equal` loop, implemented through `less_equal` with the
 * operands exchanged (a >= b is b <= a): pointers and strides of slots 0 and
 * 1 are swapped, slot 2 is passed through unchanged. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_greater_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];

    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped_args, dimensions, swapped_steps);
}


#line 404
/*
 * USHORT `less` loop: forwards args, dimensions and steps untouched to the
 * run_binary_simd_less variant selected by TO_SIMD_SFX. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_less)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
}

#line 404
/*
 * USHORT `less_equal` loop: forwards args, dimensions and steps untouched to
 * the run_binary_simd_less_equal variant selected by TO_SIMD_SFX. `func` is
 * unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_less_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
}


#line 414
/*
 * USHORT `equal` loop: forwards to the run_binary_simd_equal variant selected
 * by TO_SIMD_UTYPE, i.e. the unsigned (_uN) runner. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
}

#line 414
/*
 * USHORT `not_equal` loop: forwards to the run_binary_simd_not_equal variant
 * selected by TO_SIMD_UTYPE, i.e. the unsigned (_uN) runner. `func` is
 * unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_not_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
}


#line 372
/*
 * Kernel-suffix selection for the UINT loops that follow. The first width N
 * in {8, 16, 32, 64} equal to NPY_BITSOF_INT is used:
 *   TO_SIMD_UTYPE(X) -> X##_uN
 *   TO_SIMD_SFX(X)   -> X##_uN  (the signed X##_sN alternative is disabled)
 * If no width matches, both macros are left undefined.
 */
#undef TO_SIMD_SFX
#undef TO_SIMD_UTYPE
#if 0
#line 378
#elif NPY_BITSOF_INT == 8
    #define TO_SIMD_UTYPE(X) X##_u8
    #if 0
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 378
#elif NPY_BITSOF_INT == 16
    #define TO_SIMD_UTYPE(X) X##_u16
    #if 0
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 378
#elif NPY_BITSOF_INT == 32
    #define TO_SIMD_UTYPE(X) X##_u32
    #if 0
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 378
#elif NPY_BITSOF_INT == 64
    #define TO_SIMD_UTYPE(X) X##_u64
    #if 0
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

#line 392
/*
 * UINT `greater` loop, implemented through `less` with the operands
 * exchanged (a > b is b < a): pointers and strides of slots 0 and 1 are
 * swapped, slot 2 is passed through unchanged. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_greater)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];

    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less)(swapped_args, dimensions, swapped_steps);
}

#line 392
/*
 * UINT `greater_equal` loop, implemented through `less_equal` with the
 * operands exchanged (a >= b is b <= a): pointers and strides of slots 0 and
 * 1 are swapped, slot 2 is passed through unchanged. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_greater_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];

    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped_args, dimensions, swapped_steps);
}


#line 404
/*
 * UINT `less` loop: forwards args, dimensions and steps untouched to the
 * run_binary_simd_less variant selected by TO_SIMD_SFX. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_less)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
}

#line 404
/*
 * UINT `less_equal` loop: forwards args, dimensions and steps untouched to
 * the run_binary_simd_less_equal variant selected by TO_SIMD_SFX. `func` is
 * unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_less_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
}


#line 414
/*
 * UINT `equal` loop: forwards to the run_binary_simd_equal variant selected
 * by TO_SIMD_UTYPE, i.e. the unsigned (_uN) runner. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
}

#line 414
/*
 * UINT `not_equal` loop: forwards to the run_binary_simd_not_equal variant
 * selected by TO_SIMD_UTYPE, i.e. the unsigned (_uN) runner. `func` is
 * unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_not_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
}


#line 372
/*
 * Kernel-suffix selection for the ULONG loops that follow. The first width N
 * in {8, 16, 32, 64} equal to NPY_BITSOF_LONG is used:
 *   TO_SIMD_UTYPE(X) -> X##_uN
 *   TO_SIMD_SFX(X)   -> X##_uN  (the signed X##_sN alternative is disabled)
 * If no width matches, both macros are left undefined.
 */
#undef TO_SIMD_SFX
#undef TO_SIMD_UTYPE
#if 0
#line 378
#elif NPY_BITSOF_LONG == 8
    #define TO_SIMD_UTYPE(X) X##_u8
    #if 0
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 378
#elif NPY_BITSOF_LONG == 16
    #define TO_SIMD_UTYPE(X) X##_u16
    #if 0
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 378
#elif NPY_BITSOF_LONG == 32
    #define TO_SIMD_UTYPE(X) X##_u32
    #if 0
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 378
#elif NPY_BITSOF_LONG == 64
    #define TO_SIMD_UTYPE(X) X##_u64
    #if 0
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

#line 392
/*
 * ULONG `greater` loop, implemented through `less` with the operands
 * exchanged (a > b is b < a): pointers and strides of slots 0 and 1 are
 * swapped, slot 2 is passed through unchanged. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_greater)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];

    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less)(swapped_args, dimensions, swapped_steps);
}

#line 392
/*
 * ULONG `greater_equal` loop, implemented through `less_equal` with the
 * operands exchanged (a >= b is b <= a): pointers and strides of slots 0 and
 * 1 are swapped, slot 2 is passed through unchanged. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_greater_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];

    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped_args, dimensions, swapped_steps);
}


#line 404
/*
 * ULONG `less` loop: forwards args, dimensions and steps untouched to the
 * run_binary_simd_less variant selected by TO_SIMD_SFX. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_less)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
}

#line 404
/*
 * ULONG `less_equal` loop: forwards args, dimensions and steps untouched to
 * the run_binary_simd_less_equal variant selected by TO_SIMD_SFX. `func` is
 * unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_less_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
}


#line 414
/*
 * ULONG `equal` loop: forwards to the run_binary_simd_equal variant selected
 * by TO_SIMD_UTYPE, i.e. the unsigned (_uN) runner. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
}

#line 414
/*
 * ULONG `not_equal` loop: forwards to the run_binary_simd_not_equal variant
 * selected by TO_SIMD_UTYPE, i.e. the unsigned (_uN) runner. `func` is
 * unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_not_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
}


#line 372
/*
 * Kernel-suffix selection for the ULONGLONG loops that follow. The first
 * width N in {8, 16, 32, 64} equal to NPY_BITSOF_LONGLONG is used:
 *   TO_SIMD_UTYPE(X) -> X##_uN
 *   TO_SIMD_SFX(X)   -> X##_uN  (the signed X##_sN alternative is disabled)
 * If no width matches, both macros are left undefined.
 */
#undef TO_SIMD_SFX
#undef TO_SIMD_UTYPE
#if 0
#line 378
#elif NPY_BITSOF_LONGLONG == 8
    #define TO_SIMD_UTYPE(X) X##_u8
    #if 0
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 378
#elif NPY_BITSOF_LONGLONG == 16
    #define TO_SIMD_UTYPE(X) X##_u16
    #if 0
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 378
#elif NPY_BITSOF_LONGLONG == 32
    #define TO_SIMD_UTYPE(X) X##_u32
    #if 0
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 378
#elif NPY_BITSOF_LONGLONG == 64
    #define TO_SIMD_UTYPE(X) X##_u64
    #if 0
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

#line 392
/*
 * ULONGLONG `greater` loop, implemented through `less` with the operands
 * exchanged (a > b is b < a): pointers and strides of slots 0 and 1 are
 * swapped, slot 2 is passed through unchanged. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_greater)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];

    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less)(swapped_args, dimensions, swapped_steps);
}

#line 392
/*
 * ULONGLONG `greater_equal` loop, implemented through `less_equal` with the
 * operands exchanged (a >= b is b <= a): pointers and strides of slots 0 and
 * 1 are swapped, slot 2 is passed through unchanged. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_greater_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];

    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped_args, dimensions, swapped_steps);
}


#line 404
/*
 * ULONGLONG `less` loop: forwards args, dimensions and steps untouched to the
 * run_binary_simd_less variant selected by TO_SIMD_SFX. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_less)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
}

#line 404
/*
 * ULONGLONG `less_equal` loop: forwards args, dimensions and steps untouched
 * to the run_binary_simd_less_equal variant selected by TO_SIMD_SFX. `func`
 * is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_less_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
}


#line 414
/*
 * ULONGLONG `equal` loop: forwards to the run_binary_simd_equal variant
 * selected by TO_SIMD_UTYPE, i.e. the unsigned (_uN) runner. `func` is
 * unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
}

#line 414
/*
 * ULONGLONG `not_equal` loop: forwards to the run_binary_simd_not_equal
 * variant selected by TO_SIMD_UTYPE, i.e. the unsigned (_uN) runner. `func`
 * is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_not_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
}


#line 372
/*
 * Kernel-suffix selection for the (signed) BYTE loops that follow. The first
 * width N in {8, 16, 32, 64} equal to NPY_BITSOF_BYTE is used:
 *   TO_SIMD_UTYPE(X) -> X##_uN
 *   TO_SIMD_SFX(X)   -> X##_sN  (the unsigned alternative sits in the
 *                                inactive #else branch)
 * If no width matches, both macros are left undefined.
 */
#undef TO_SIMD_SFX
#undef TO_SIMD_UTYPE
#if 0
#line 378
#elif NPY_BITSOF_BYTE == 8
    #define TO_SIMD_UTYPE(X) X##_u8
    #if 1
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 378
#elif NPY_BITSOF_BYTE == 16
    #define TO_SIMD_UTYPE(X) X##_u16
    #if 1
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 378
#elif NPY_BITSOF_BYTE == 32
    #define TO_SIMD_UTYPE(X) X##_u32
    #if 1
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 378
#elif NPY_BITSOF_BYTE == 64
    #define TO_SIMD_UTYPE(X) X##_u64
    #if 1
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

#line 392
/*
 * BYTE `greater` loop, implemented through `less` with the operands
 * exchanged (a > b is b < a): pointers and strides of slots 0 and 1 are
 * swapped, slot 2 is passed through unchanged. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_greater)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];

    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less)(swapped_args, dimensions, swapped_steps);
}

#line 392
/*
 * BYTE `greater_equal` loop, implemented through `less_equal` with the
 * operands exchanged (a >= b is b <= a): pointers and strides of slots 0 and
 * 1 are swapped, slot 2 is passed through unchanged. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_greater_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];

    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped_args, dimensions, swapped_steps);
}


#line 404
/*
 * BYTE `less` loop: forwards args, dimensions and steps untouched to the
 * run_binary_simd_less variant selected by TO_SIMD_SFX. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_less)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
}

#line 404
/*
 * BYTE `less_equal` loop: forwards args, dimensions and steps untouched to
 * the run_binary_simd_less_equal variant selected by TO_SIMD_SFX. `func` is
 * unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_less_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
}


#line 414
/*
 * BYTE `equal` loop: forwards to the run_binary_simd_equal variant selected
 * by TO_SIMD_UTYPE, i.e. the unsigned (_uN) runner even though BYTE is
 * signed; signed and unsigned integers of one width share the equality
 * kernel. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
}

#line 414
/*
 * BYTE `not_equal` loop: forwards to the run_binary_simd_not_equal variant
 * selected by TO_SIMD_UTYPE, i.e. the unsigned (_uN) runner even though BYTE
 * is signed; signed and unsigned integers of one width share the inequality
 * kernel. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_not_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
}


#line 372
/*
 * Kernel-suffix selection for the (signed) SHORT loops that follow. The
 * first width N in {8, 16, 32, 64} equal to NPY_BITSOF_SHORT is used:
 *   TO_SIMD_UTYPE(X) -> X##_uN
 *   TO_SIMD_SFX(X)   -> X##_sN  (the unsigned alternative sits in the
 *                                inactive #else branch)
 * If no width matches, both macros are left undefined.
 */
#undef TO_SIMD_SFX
#undef TO_SIMD_UTYPE
#if 0
#line 378
#elif NPY_BITSOF_SHORT == 8
    #define TO_SIMD_UTYPE(X) X##_u8
    #if 1
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 378
#elif NPY_BITSOF_SHORT == 16
    #define TO_SIMD_UTYPE(X) X##_u16
    #if 1
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 378
#elif NPY_BITSOF_SHORT == 32
    #define TO_SIMD_UTYPE(X) X##_u32
    #if 1
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 378
#elif NPY_BITSOF_SHORT == 64
    #define TO_SIMD_UTYPE(X) X##_u64
    #if 1
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

#line 392
/*
 * SHORT `greater` loop, implemented through `less` with the operands
 * exchanged (a > b is b < a): pointers and strides of slots 0 and 1 are
 * swapped, slot 2 is passed through unchanged. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_greater)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];

    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less)(swapped_args, dimensions, swapped_steps);
}

#line 392
/*
 * SHORT `greater_equal` loop, implemented through `less_equal` with the
 * operands exchanged (a >= b is b <= a): pointers and strides of slots 0 and
 * 1 are swapped, slot 2 is passed through unchanged. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_greater_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];

    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped_args, dimensions, swapped_steps);
}


#line 404
/*
 * SHORT `less` loop: forwards args, dimensions and steps untouched to the
 * run_binary_simd_less variant selected by TO_SIMD_SFX. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_less)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
}

#line 404
/*
 * SHORT `less_equal` loop: forwards args, dimensions and steps untouched to
 * the run_binary_simd_less_equal variant selected by TO_SIMD_SFX. `func` is
 * unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_less_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
}


#line 414
/*
 * SHORT `equal` loop: forwards to the run_binary_simd_equal variant selected
 * by TO_SIMD_UTYPE, i.e. the unsigned (_uN) runner even though SHORT is
 * signed; signed and unsigned integers of one width share the equality
 * kernel. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
}

#line 414
/*
 * SHORT `not_equal` loop: forwards to the run_binary_simd_not_equal variant
 * selected by TO_SIMD_UTYPE, i.e. the unsigned (_uN) runner even though
 * SHORT is signed; signed and unsigned integers of one width share the
 * inequality kernel. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_not_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
}


#line 372
/*
 * Kernel-suffix selection for the (signed) INT loops that follow. The first
 * width N in {8, 16, 32, 64} equal to NPY_BITSOF_INT is used:
 *   TO_SIMD_UTYPE(X) -> X##_uN
 *   TO_SIMD_SFX(X)   -> X##_sN  (the unsigned alternative sits in the
 *                                inactive #else branch)
 * If no width matches, both macros are left undefined.
 */
#undef TO_SIMD_SFX
#undef TO_SIMD_UTYPE
#if 0
#line 378
#elif NPY_BITSOF_INT == 8
    #define TO_SIMD_UTYPE(X) X##_u8
    #if 1
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 378
#elif NPY_BITSOF_INT == 16
    #define TO_SIMD_UTYPE(X) X##_u16
    #if 1
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 378
#elif NPY_BITSOF_INT == 32
    #define TO_SIMD_UTYPE(X) X##_u32
    #if 1
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 378
#elif NPY_BITSOF_INT == 64
    #define TO_SIMD_UTYPE(X) X##_u64
    #if 1
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

#line 392
/*
 * INT `greater` loop, implemented through `less` with the operands
 * exchanged (a > b is b < a): pointers and strides of slots 0 and 1 are
 * swapped, slot 2 is passed through unchanged. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_greater)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];

    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less)(swapped_args, dimensions, swapped_steps);
}

#line 392
/*
 * INT `greater_equal` loop, implemented through `less_equal` with the
 * operands exchanged (a >= b is b <= a): pointers and strides of slots 0 and
 * 1 are swapped, slot 2 is passed through unchanged. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_greater_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];

    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped_args, dimensions, swapped_steps);
}


#line 404
/*
 * INT `less` loop: forwards args, dimensions and steps untouched to the
 * run_binary_simd_less variant selected by TO_SIMD_SFX. `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_less)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
}

#line 404
/*
 * INT `less_equal` loop: forwards args, dimensions and steps untouched to
 * the run_binary_simd_less_equal variant selected by TO_SIMD_SFX. `func` is
 * unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_less_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
}


#line 414
/*
 * INT_equal inner loop: forwards args/dimensions/steps unchanged to the
 * run_binary_simd_equal variant picked by TO_SIMD_UTYPE, i.e. the
 * unsigned suffix of the same width as int.
 * NOTE(review): presumably safe because equality does not depend on
 * signedness -- confirm against the kernel definition.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
}

#line 414
/*
 * INT_not_equal inner loop: forwards args/dimensions/steps unchanged to
 * the run_binary_simd_not_equal variant picked by TO_SIMD_UTYPE, i.e. the
 * unsigned suffix of the same width as int.
 * NOTE(review): presumably safe because inequality does not depend on
 * signedness -- confirm against the kernel definition.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_not_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
}


#line 372
/*
 * Re-bind the suffix helpers used by the LONG_* loops that follow.
 * The branch whose NPY_BITSOF_LONG test matches defines:
 *   TO_SIMD_UTYPE(X) -> X##_u<bits>  (used by the equal/not_equal loops)
 *   TO_SIMD_SFX(X)   -> X##_s<bits>  (used by the ordering loops)
 * The leading `#if 0` only exists so every real case can be an `#elif`.
 * The inner `#if 1` is a template-expanded constant, so the `#else`
 * (unsigned) definition of TO_SIMD_SFX is never selected here.
 * If no width matches, both macros are left undefined.
 */
#undef TO_SIMD_SFX
#undef TO_SIMD_UTYPE
#if 0
#line 378
#elif NPY_BITSOF_LONG == 8
    #define TO_SIMD_UTYPE(X) X##_u8
    #if 1
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 378
#elif NPY_BITSOF_LONG == 16
    #define TO_SIMD_UTYPE(X) X##_u16
    #if 1
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 378
#elif NPY_BITSOF_LONG == 32
    #define TO_SIMD_UTYPE(X) X##_u32
    #if 1
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 378
#elif NPY_BITSOF_LONG == 64
    #define TO_SIMD_UTYPE(X) X##_u64
    #if 1
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

#line 392
/*
 * LONG_greater inner loop, expressed through the `less` kernel: the first
 * two argument pointers and their strides are exchanged before the call
 * (a > b is evaluated as b < a).  The third slot is passed through as is.
 * `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_greater)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    // Operand slots 0 and 1 trade places; slot 2 stays put.
    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];
    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less)(swapped_args, dimensions, swapped_steps);
}

#line 392
/*
 * LONG_greater_equal inner loop, expressed through the `less_equal`
 * kernel: the first two argument pointers and their strides are exchanged
 * before the call (a >= b is evaluated as b <= a).  The third slot is
 * passed through as is.  `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_greater_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    // Operand slots 0 and 1 trade places; slot 2 stays put.
    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];
    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped_args, dimensions, swapped_steps);
}


#line 404
/*
 * LONG_less inner loop: forwards args/dimensions/steps unchanged to the
 * run_binary_simd_less variant picked by TO_SIMD_SFX (the signed suffix
 * matching NPY_BITSOF_LONG).  `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_less)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
}

#line 404
/*
 * LONG_less_equal inner loop: forwards args/dimensions/steps unchanged to
 * the run_binary_simd_less_equal variant picked by TO_SIMD_SFX (the signed
 * suffix matching NPY_BITSOF_LONG).  `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_less_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
}


#line 414
/*
 * LONG_equal inner loop: forwards args/dimensions/steps unchanged to the
 * run_binary_simd_equal variant picked by TO_SIMD_UTYPE, i.e. the
 * unsigned suffix of the same width as long.
 * NOTE(review): presumably safe because equality does not depend on
 * signedness -- confirm against the kernel definition.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
}

#line 414
/*
 * LONG_not_equal inner loop: forwards args/dimensions/steps unchanged to
 * the run_binary_simd_not_equal variant picked by TO_SIMD_UTYPE, i.e. the
 * unsigned suffix of the same width as long.
 * NOTE(review): presumably safe because inequality does not depend on
 * signedness -- confirm against the kernel definition.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_not_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
}


#line 372
/*
 * Re-bind the suffix helpers used by the LONGLONG_* loops that follow.
 * The branch whose NPY_BITSOF_LONGLONG test matches defines:
 *   TO_SIMD_UTYPE(X) -> X##_u<bits>  (used by the equal/not_equal loops)
 *   TO_SIMD_SFX(X)   -> X##_s<bits>  (used by the ordering loops)
 * The leading `#if 0` only exists so every real case can be an `#elif`.
 * The inner `#if 1` is a template-expanded constant, so the `#else`
 * (unsigned) definition of TO_SIMD_SFX is never selected here.
 * If no width matches, both macros are left undefined.
 */
#undef TO_SIMD_SFX
#undef TO_SIMD_UTYPE
#if 0
#line 378
#elif NPY_BITSOF_LONGLONG == 8
    #define TO_SIMD_UTYPE(X) X##_u8
    #if 1
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 378
#elif NPY_BITSOF_LONGLONG == 16
    #define TO_SIMD_UTYPE(X) X##_u16
    #if 1
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 378
#elif NPY_BITSOF_LONGLONG == 32
    #define TO_SIMD_UTYPE(X) X##_u32
    #if 1
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 378
#elif NPY_BITSOF_LONGLONG == 64
    #define TO_SIMD_UTYPE(X) X##_u64
    #if 1
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

#line 392
/*
 * LONGLONG_greater inner loop, expressed through the `less` kernel: the
 * first two argument pointers and their strides are exchanged before the
 * call (a > b is evaluated as b < a).  The third slot is passed through
 * as is.  `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_greater)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    // Operand slots 0 and 1 trade places; slot 2 stays put.
    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];
    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less)(swapped_args, dimensions, swapped_steps);
}

#line 392
/*
 * LONGLONG_greater_equal inner loop, expressed through the `less_equal`
 * kernel: the first two argument pointers and their strides are exchanged
 * before the call (a >= b is evaluated as b <= a).  The third slot is
 * passed through as is.  `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_greater_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    // Operand slots 0 and 1 trade places; slot 2 stays put.
    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];
    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped_args, dimensions, swapped_steps);
}


#line 404
/*
 * LONGLONG_less inner loop: forwards args/dimensions/steps unchanged to
 * the run_binary_simd_less variant picked by TO_SIMD_SFX (the signed
 * suffix matching NPY_BITSOF_LONGLONG).  `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_less)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
}

#line 404
/*
 * LONGLONG_less_equal inner loop: forwards args/dimensions/steps unchanged
 * to the run_binary_simd_less_equal variant picked by TO_SIMD_SFX (the
 * signed suffix matching NPY_BITSOF_LONGLONG).  `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_less_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
}


#line 414
/*
 * LONGLONG_equal inner loop: forwards args/dimensions/steps unchanged to
 * the run_binary_simd_equal variant picked by TO_SIMD_UTYPE, i.e. the
 * unsigned suffix of the same width as long long.
 * NOTE(review): presumably safe because equality does not depend on
 * signedness -- confirm against the kernel definition.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
}

#line 414
/*
 * LONGLONG_not_equal inner loop: forwards args/dimensions/steps unchanged
 * to the run_binary_simd_not_equal variant picked by TO_SIMD_UTYPE, i.e.
 * the unsigned suffix of the same width as long long.
 * NOTE(review): presumably safe because inequality does not depend on
 * signedness -- confirm against the kernel definition.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_not_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
}



#line 428
#line 432
/*
 * BOOL_greater inner loop, expressed through run_binary_simd_less_b8: the
 * first two argument pointers and their strides are exchanged before the
 * call (a > b is evaluated as b < a).  The third slot is passed through
 * as is.  `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_greater)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    // Operand slots 0 and 1 trade places; slot 2 stays put.
    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];
    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    run_binary_simd_less_b8(swapped_args, dimensions, swapped_steps);
    // Template guard expanded to 0 for this type: the call below is compiled out.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

#line 432
/*
 * BOOL_greater_equal inner loop, expressed through
 * run_binary_simd_less_equal_b8: the first two argument pointers and their
 * strides are exchanged before the call (a >= b is evaluated as b <= a).
 * The third slot is passed through as is.  `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_greater_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    // Operand slots 0 and 1 trade places; slot 2 stays put.
    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];
    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    run_binary_simd_less_equal_b8(swapped_args, dimensions, swapped_steps);
    // Template guard expanded to 0 for this type: the call below is compiled out.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


#line 447
/*
 * BOOL_equal inner loop: forwards args/dimensions/steps unchanged to
 * run_binary_simd_equal_b8.  `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    run_binary_simd_equal_b8(args, dimensions, steps);
    // Template guard expanded to 0 for this type: the call below is compiled out.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

#line 447
/*
 * BOOL_not_equal inner loop: forwards args/dimensions/steps unchanged to
 * run_binary_simd_not_equal_b8.  `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_not_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    run_binary_simd_not_equal_b8(args, dimensions, steps);
    // Template guard expanded to 0 for this type: the call below is compiled out.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

#line 447
/*
 * BOOL_less inner loop: forwards args/dimensions/steps unchanged to
 * run_binary_simd_less_b8.  `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_less)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    run_binary_simd_less_b8(args, dimensions, steps);
    // Template guard expanded to 0 for this type: the call below is compiled out.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

#line 447
/*
 * BOOL_less_equal inner loop: forwards args/dimensions/steps unchanged to
 * run_binary_simd_less_equal_b8.  `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_less_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    run_binary_simd_less_equal_b8(args, dimensions, steps);
    // Template guard expanded to 0 for this type: the call below is compiled out.
#if 0
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


#line 428
#line 432
/*
 * FLOAT_greater inner loop, expressed through run_binary_simd_less_f32:
 * the first two argument pointers and their strides are exchanged before
 * the call (a > b is evaluated as b < a).  The third slot is passed
 * through as is.  `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_greater)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    // Operand slots 0 and 1 trade places; slot 2 stays put.
    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];
    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    run_binary_simd_less_f32(swapped_args, dimensions, swapped_steps);
    // Template guard expanded to 1 for this type: the FP status is cleared
    // once the kernel has returned.
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

#line 432
/*
 * FLOAT_greater_equal inner loop, expressed through
 * run_binary_simd_less_equal_f32: the first two argument pointers and
 * their strides are exchanged before the call (a >= b is evaluated as
 * b <= a).  The third slot is passed through as is.  `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_greater_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    // Operand slots 0 and 1 trade places; slot 2 stays put.
    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];
    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    run_binary_simd_less_equal_f32(swapped_args, dimensions, swapped_steps);
    // Template guard expanded to 1 for this type: the FP status is cleared
    // once the kernel has returned.
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


#line 447
/*
 * FLOAT_equal inner loop: forwards args/dimensions/steps unchanged to
 * run_binary_simd_equal_f32, then calls npy_clear_floatstatus_barrier.
 * `func` is unused.
 * NOTE(review): by its name the barrier call discards floating-point
 * status flags left by the comparison -- confirm which flags are expected.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    run_binary_simd_equal_f32(args, dimensions, steps);
    // Template guard expanded to 1 for this type: the call below is always compiled in.
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

#line 447
/*
 * FLOAT_not_equal inner loop: forwards args/dimensions/steps unchanged to
 * run_binary_simd_not_equal_f32, then calls npy_clear_floatstatus_barrier.
 * `func` is unused.
 * NOTE(review): by its name the barrier call discards floating-point
 * status flags left by the comparison -- confirm which flags are expected.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_not_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    run_binary_simd_not_equal_f32(args, dimensions, steps);
    // Template guard expanded to 1 for this type: the call below is always compiled in.
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

#line 447
/*
 * FLOAT_less inner loop: forwards args/dimensions/steps unchanged to
 * run_binary_simd_less_f32, then calls npy_clear_floatstatus_barrier.
 * `func` is unused.
 * NOTE(review): by its name the barrier call discards floating-point
 * status flags left by the comparison -- confirm which flags are expected.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_less)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    run_binary_simd_less_f32(args, dimensions, steps);
    // Template guard expanded to 1 for this type: the call below is always compiled in.
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

#line 447
/*
 * FLOAT_less_equal inner loop: forwards args/dimensions/steps unchanged to
 * run_binary_simd_less_equal_f32, then calls npy_clear_floatstatus_barrier.
 * `func` is unused.
 * NOTE(review): by its name the barrier call discards floating-point
 * status flags left by the comparison -- confirm which flags are expected.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_less_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    run_binary_simd_less_equal_f32(args, dimensions, steps);
    // Template guard expanded to 1 for this type: the call below is always compiled in.
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


#line 428
#line 432
/*
 * DOUBLE_greater inner loop, expressed through run_binary_simd_less_f64:
 * the first two argument pointers and their strides are exchanged before
 * the call (a > b is evaluated as b < a).  The third slot is passed
 * through as is.  `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_greater)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    // Operand slots 0 and 1 trade places; slot 2 stays put.
    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];
    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    run_binary_simd_less_f64(swapped_args, dimensions, swapped_steps);
    // Template guard expanded to 1 for this type: the FP status is cleared
    // once the kernel has returned.
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

#line 432
/*
 * DOUBLE_greater_equal inner loop, expressed through
 * run_binary_simd_less_equal_f64: the first two argument pointers and
 * their strides are exchanged before the call (a >= b is evaluated as
 * b <= a).  The third slot is passed through as is.  `func` is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_greater_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *swapped_args[3];
    npy_intp swapped_steps[3];

    // Operand slots 0 and 1 trade places; slot 2 stays put.
    swapped_args[0] = args[1];
    swapped_args[1] = args[0];
    swapped_args[2] = args[2];
    swapped_steps[0] = steps[1];
    swapped_steps[1] = steps[0];
    swapped_steps[2] = steps[2];

    run_binary_simd_less_equal_f64(swapped_args, dimensions, swapped_steps);
    // Template guard expanded to 1 for this type: the FP status is cleared
    // once the kernel has returned.
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}


#line 447
/*
 * DOUBLE_equal inner loop: forwards args/dimensions/steps unchanged to
 * run_binary_simd_equal_f64, then calls npy_clear_floatstatus_barrier.
 * `func` is unused.
 * NOTE(review): by its name the barrier call discards floating-point
 * status flags left by the comparison -- confirm which flags are expected.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    run_binary_simd_equal_f64(args, dimensions, steps);
    // Template guard expanded to 1 for this type: the call below is always compiled in.
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

#line 447
/*
 * DOUBLE_not_equal inner loop: forwards args/dimensions/steps unchanged to
 * run_binary_simd_not_equal_f64, then calls npy_clear_floatstatus_barrier.
 * `func` is unused.
 * NOTE(review): by its name the barrier call discards floating-point
 * status flags left by the comparison -- confirm which flags are expected.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_not_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    run_binary_simd_not_equal_f64(args, dimensions, steps);
    // Template guard expanded to 1 for this type: the call below is always compiled in.
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

#line 447
/*
 * DOUBLE_less inner loop: forwards args/dimensions/steps unchanged to
 * run_binary_simd_less_f64, then calls npy_clear_floatstatus_barrier.
 * `func` is unused.
 * NOTE(review): by its name the barrier call discards floating-point
 * status flags left by the comparison -- confirm which flags are expected.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_less)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    run_binary_simd_less_f64(args, dimensions, steps);
    // Template guard expanded to 1 for this type: the call below is always compiled in.
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

#line 447
/*
 * DOUBLE_less_equal inner loop: forwards args/dimensions/steps unchanged
 * to run_binary_simd_less_equal_f64, then calls
 * npy_clear_floatstatus_barrier.  `func` is unused.
 * NOTE(review): by its name the barrier call discards floating-point
 * status flags left by the comparison -- confirm which flags are expected.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_less_equal)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    run_binary_simd_less_equal_f64(args, dimensions, steps);
    // Template guard expanded to 1 for this type: the call below is always compiled in.
#if 1
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}



