#line 1 "numpy/core/src/umath/loops_logical.dispatch.c.src"

/*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 **       Changes should be made to the original source (.src) file         **
 *****************************************************************************
 */

#line 1
/*@targets
 ** $maxopt baseline
 ** neon asimd
 ** sse2 avx2 avx512_skx
 ** vsx2
 ** vx
 **/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

/*******************************************************************************
 ** Defining the SIMD kernels
 ******************************************************************************/

#if NPY_SIMD
/*
 * Normalize every lane of `v` to a canonical boolean byte: a lane holding any
 * non-zero value becomes 1, a zero lane stays 0.  This keeps the vectorized
 * paths in agreement with the scalar ones even if an input byte is not a
 * strict 0/1 value.
 */
NPY_FINLINE npyv_u8 byte_to_true(npyv_u8 v)
{
    // lanes equal to zero compare to 0xff, every other lane to 0x00
    const npyv_u8 is_zero = npyv_cvt_u8_b8(npyv_cmpeq_u8(v, npyv_zero_u8()));
    // negate that mask and apply it to a vector of boolean true bytes
    const npyv_u8 all_true = npyv_setall_u8(1 == 1);
    return npyv_andc_u8(all_true, is_zero);
}
/*
 * Turn a lane mask (0xff/0x00) into boolean bytes (1/0).  Same purpose as
 * byte_to_true(), but the input is already a mask, so no negation is needed.
 */
NPY_FINLINE npyv_u8 mask_to_true(npyv_b8 v)
{
    // reinterpret the mask as bytes, then keep only the boolean true bit
    const npyv_u8 mask_bytes = npyv_cvt_u8_b8(v);
    return npyv_and_u8(npyv_setall_u8(1 == 1), mask_bytes);
}
/*
 * Lane-wise logical AND that tolerates non-canonical booleans.  A plain
 * bitwise AND is not enough: with a = 0x01 and b = 0x80 both operands are
 * true, yet a & b is zero.  Each operand is therefore tested against zero
 * on its own.  The result uses the same 0/1 encoding as byte_to_true().
 */
NPY_FINLINE npyv_u8 simd_logical_and_u8(npyv_u8 a, npyv_u8 b)
{
    const npyv_u8 zero = npyv_zero_u8();
    // a result lane is false as soon as either operand lane is zero
    const npyv_b8 either_zero = npyv_or_b8(npyv_cmpeq_u8(a, zero),
                                           npyv_cmpeq_u8(b, zero));
    // negate the "either is zero" mask and reduce it to boolean true bytes
    return npyv_andc_u8(npyv_setall_u8(1 == 1), npyv_cvt_u8_b8(either_zero));
}
/*
 * Lane-wise logical OR.  Not strictly required as a separate helper, but it
 * pairs with simd_logical_and_u8() so the templated kernels below can treat
 * both operations uniformly.
 */
NPY_FINLINE npyv_u8 simd_logical_or_u8(npyv_u8 a, npyv_u8 b)
{
    // any bit set in either operand makes the lane true
    return byte_to_true(npyv_or_u8(a, b));
}


#line 82
/*
 * Element-wise logical AND of two boolean buffers: op[i] = ip1[i] && ip2[i]
 * for i in [0, len).  All three buffers are walked with unit stride.
 *
 * The work is split in three phases: blocks of UNROLL vectors, then single
 * vectors, then a scalar loop for whatever is left.
 */
static void
simd_binary_logical_and_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp len)
{
    #define UNROLL 16

    const int vstep = npyv_nlanes_u8;
    const int wstep = vstep * UNROLL;

    // Load both operands at vector slot N, AND them and store the result.
    #define BOOL_AND_SLOT(N)                                            \
        do {                                                            \
            const npyv_u8 lhs = npyv_load_u8(ip1 + vstep * (N));        \
            const npyv_u8 rhs = npyv_load_u8(ip2 + vstep * (N));        \
            const npyv_u8 res = simd_logical_and_u8(lhs, rhs);          \
            npyv_store_u8(op + vstep * (N), res);                       \
        } while (0)

    // Wide phase: UNROLL (16) vectors per iteration, slots in ascending order
    while (len >= wstep) {
        BOOL_AND_SLOT(0);
        BOOL_AND_SLOT(1);
        BOOL_AND_SLOT(2);
        BOOL_AND_SLOT(3);
        BOOL_AND_SLOT(4);
        BOOL_AND_SLOT(5);
        BOOL_AND_SLOT(6);
        BOOL_AND_SLOT(7);
        BOOL_AND_SLOT(8);
        BOOL_AND_SLOT(9);
        BOOL_AND_SLOT(10);
        BOOL_AND_SLOT(11);
        BOOL_AND_SLOT(12);
        BOOL_AND_SLOT(13);
        BOOL_AND_SLOT(14);
        BOOL_AND_SLOT(15);

        ip1 += wstep;
        ip2 += wstep;
        op += wstep;
        len -= wstep;
    }
    #undef UNROLL

    // Narrow phase: one vector at a time
    while (len >= vstep) {
        BOOL_AND_SLOT(0);

        ip1 += vstep;
        ip2 += vstep;
        op += vstep;
        len -= vstep;
    }
    #undef BOOL_AND_SLOT

    // Scalar phase: fewer than vstep elements remain
    for (npy_intp i = 0; i < len; ++i) {
        op[i] = ip1[i] && ip2[i];
    }
}

/*
 * Logical-AND reduction of `len` boolean bytes at `ip` into the accumulator
 * `*op`.
 *
 * `*op` is both input and output: it is set to false as soon as a zero byte
 * is found, and is left untouched by the vector phases otherwise.
 *
 * If the accumulator is already false on entry nothing in the input can
 * change the result, so the scan is skipped altogether (the libc based
 * fallback of the ufunc entry point guards on `*op` in the same way).
 */
static void
simd_reduce_logical_and_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
{
    #define UNROLL 8

    const int vstep = npyv_nlanes_u8;
    const int wstep = vstep * UNROLL;

    // Result already decided: false && anything is false
    if (*op == 0) {
        return;
    }

    // Unrolled vectors loop
    for (; len >= wstep; len -= wstep, ip += wstep) {
    #if defined(NPY_HAVE_SSE2)
        // request the next block while this one is being reduced
        NPY_PREFETCH(ip + wstep, 0, 3);
    #endif
        npyv_u8 v0 = npyv_load_u8(ip + vstep * 0);
        npyv_u8 v1 = npyv_load_u8(ip + vstep * 1);
        npyv_u8 v2 = npyv_load_u8(ip + vstep * 2);
        npyv_u8 v3 = npyv_load_u8(ip + vstep * 3);
        npyv_u8 v4 = npyv_load_u8(ip + vstep * 4);
        npyv_u8 v5 = npyv_load_u8(ip + vstep * 5);
        npyv_u8 v6 = npyv_load_u8(ip + vstep * 6);
        npyv_u8 v7 = npyv_load_u8(ip + vstep * 7);

        // Pairwise minimum tree: a lane of `mv` is zero if that lane is
        // zero in any of the eight vectors.
        npyv_u8 m01 = npyv_min_u8(v0, v1);
        npyv_u8 m23 = npyv_min_u8(v2, v3);
        npyv_u8 m45 = npyv_min_u8(v4, v5);
        npyv_u8 m67 = npyv_min_u8(v6, v7);

        npyv_u8 m0123 = npyv_min_u8(m01, m23);
        npyv_u8 m4567 = npyv_min_u8(m45, m67);

        npyv_u8 mv = npyv_min_u8(m0123, m4567);

        // not all lanes are set: a false element was found, stop early
        if(npyv_all_u8(mv) == 0){
            *op = !1;
            return;
        }
    }

    // Single vectors loop
    for (; len >= vstep; len -= vstep, ip += vstep) {
        npyv_u8 v0 = npyv_load_u8(ip);
        if(npyv_all_u8(v0) == 0){
            *op = !1;
            return;
        }
    }

    // Scalar loop to finish off
    for (; len > 0; --len, ++ip) {
        *op = *op && *ip;
        if (*op == 0) {
            return;
        }
    }
#undef UNROLL
}

#line 82
/*
 * Element-wise logical OR of two boolean buffers: op[i] = ip1[i] || ip2[i]
 * for i in [0, len).  All three buffers are walked with unit stride.
 *
 * The work is split in three phases: blocks of UNROLL vectors, then single
 * vectors, then a scalar loop for whatever is left.
 */
static void
simd_binary_logical_or_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp len)
{
    #define UNROLL 16

    const int vstep = npyv_nlanes_u8;
    const int wstep = vstep * UNROLL;

    // Load both operands at vector slot N, OR them and store the result.
    #define BOOL_OR_SLOT(N)                                             \
        do {                                                            \
            const npyv_u8 lhs = npyv_load_u8(ip1 + vstep * (N));        \
            const npyv_u8 rhs = npyv_load_u8(ip2 + vstep * (N));        \
            const npyv_u8 res = simd_logical_or_u8(lhs, rhs);           \
            npyv_store_u8(op + vstep * (N), res);                       \
        } while (0)

    // Wide phase: UNROLL (16) vectors per iteration, slots in ascending order
    while (len >= wstep) {
        BOOL_OR_SLOT(0);
        BOOL_OR_SLOT(1);
        BOOL_OR_SLOT(2);
        BOOL_OR_SLOT(3);
        BOOL_OR_SLOT(4);
        BOOL_OR_SLOT(5);
        BOOL_OR_SLOT(6);
        BOOL_OR_SLOT(7);
        BOOL_OR_SLOT(8);
        BOOL_OR_SLOT(9);
        BOOL_OR_SLOT(10);
        BOOL_OR_SLOT(11);
        BOOL_OR_SLOT(12);
        BOOL_OR_SLOT(13);
        BOOL_OR_SLOT(14);
        BOOL_OR_SLOT(15);

        ip1 += wstep;
        ip2 += wstep;
        op += wstep;
        len -= wstep;
    }
    #undef UNROLL

    // Narrow phase: one vector at a time
    while (len >= vstep) {
        BOOL_OR_SLOT(0);

        ip1 += vstep;
        ip2 += vstep;
        op += vstep;
        len -= vstep;
    }
    #undef BOOL_OR_SLOT

    // Scalar phase: fewer than vstep elements remain
    for (npy_intp i = 0; i < len; ++i) {
        op[i] = ip1[i] || ip2[i];
    }
}

/*
 * Logical-OR reduction of `len` boolean bytes at `ip` into the accumulator
 * `*op`.
 *
 * `*op` is both input and output: it is set to true as soon as a non-zero
 * byte is found, and is left untouched by the vector phases otherwise.
 *
 * If the accumulator is already true on entry nothing in the input can
 * change the result, so the scan is skipped altogether (the libc based
 * fallback of the ufunc entry point tests `!*op` in the same way).  The
 * canonical true value is stored in that case, exactly as it is when a
 * true element is found by the loops below.
 */
static void
simd_reduce_logical_or_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
{
    #define UNROLL 8

    const int vstep = npyv_nlanes_u8;
    const int wstep = vstep * UNROLL;

    // Result already decided: true || anything is true
    if (*op != 0) {
        *op = !0;
        return;
    }

    // Unrolled vectors loop
    for (; len >= wstep; len -= wstep, ip += wstep) {
    #if defined(NPY_HAVE_SSE2)
        // request the next block while this one is being reduced
        NPY_PREFETCH(ip + wstep, 0, 3);
    #endif
        npyv_u8 v0 = npyv_load_u8(ip + vstep * 0);
        npyv_u8 v1 = npyv_load_u8(ip + vstep * 1);
        npyv_u8 v2 = npyv_load_u8(ip + vstep * 2);
        npyv_u8 v3 = npyv_load_u8(ip + vstep * 3);
        npyv_u8 v4 = npyv_load_u8(ip + vstep * 4);
        npyv_u8 v5 = npyv_load_u8(ip + vstep * 5);
        npyv_u8 v6 = npyv_load_u8(ip + vstep * 6);
        npyv_u8 v7 = npyv_load_u8(ip + vstep * 7);

        // Pairwise maximum tree: a lane of `mv` is non-zero if that lane is
        // non-zero in any of the eight vectors.
        npyv_u8 m01 = npyv_max_u8(v0, v1);
        npyv_u8 m23 = npyv_max_u8(v2, v3);
        npyv_u8 m45 = npyv_max_u8(v4, v5);
        npyv_u8 m67 = npyv_max_u8(v6, v7);

        npyv_u8 m0123 = npyv_max_u8(m01, m23);
        npyv_u8 m4567 = npyv_max_u8(m45, m67);

        npyv_u8 mv = npyv_max_u8(m0123, m4567);

        // some lane is set: a true element was found, stop early
        if(npyv_any_u8(mv) != 0){
            *op = !0;
            return;
        }
    }

    // Single vectors loop
    for (; len >= vstep; len -= vstep, ip += vstep) {
        npyv_u8 v0 = npyv_load_u8(ip);
        if(npyv_any_u8(v0) != 0){
            *op = !0;
            return;
        }
    }

    // Scalar loop to finish off
    for (; len > 0; --len, ++ip) {
        *op = *op || *ip;
        if (*op != 0) {
            return;
        }
    }
#undef UNROLL
}


#line 182
/*
 * Element-wise logical NOT of a boolean buffer: op[i] = (ip[i] == 0) for
 * i in [0, len).  Both buffers are walked with unit stride.
 *
 * The work is split in three phases: blocks of UNROLL vectors, then single
 * vectors, then a scalar loop for whatever is left.
 */
static void
simd_logical_not_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
{
    #define UNROLL 16

    const int vstep = npyv_nlanes_u8;
    const int wstep = vstep * UNROLL;

    const npyv_u8 zero = npyv_zero_u8();

    // Load vector slot N, flag the lanes equal to zero as true and store.
    #define BOOL_NOT_SLOT(N)                                              \
        do {                                                              \
            const npyv_u8 src = npyv_load_u8(ip + vstep * (N));           \
            const npyv_u8 res = mask_to_true(npyv_cmpeq_u8(src, zero));   \
            npyv_store_u8(op + vstep * (N), res);                         \
        } while (0)

    // Wide phase: UNROLL (16) vectors per iteration, slots in ascending order
    while (len >= wstep) {
        BOOL_NOT_SLOT(0);
        BOOL_NOT_SLOT(1);
        BOOL_NOT_SLOT(2);
        BOOL_NOT_SLOT(3);
        BOOL_NOT_SLOT(4);
        BOOL_NOT_SLOT(5);
        BOOL_NOT_SLOT(6);
        BOOL_NOT_SLOT(7);
        BOOL_NOT_SLOT(8);
        BOOL_NOT_SLOT(9);
        BOOL_NOT_SLOT(10);
        BOOL_NOT_SLOT(11);
        BOOL_NOT_SLOT(12);
        BOOL_NOT_SLOT(13);
        BOOL_NOT_SLOT(14);
        BOOL_NOT_SLOT(15);

        ip += wstep;
        op += wstep;
        len -= wstep;
    }
    #undef UNROLL

    // Narrow phase: one vector at a time
    while (len >= vstep) {
        BOOL_NOT_SLOT(0);

        ip += vstep;
        op += vstep;
        len -= vstep;
    }
    #undef BOOL_NOT_SLOT

    // Scalar phase: fewer than vstep elements remain
    for (npy_intp i = 0; i < len; ++i) {
        op[i] = (ip[i] == 0);
    }
}

#line 182
/*
 * Element-wise absolute value of a boolean buffer, i.e. normalization to a
 * canonical boolean: op[i] = (ip[i] != 0) for i in [0, len).  Both buffers
 * are walked with unit stride.
 *
 * The work is split in three phases: blocks of UNROLL vectors, then single
 * vectors, then a scalar loop for whatever is left.
 */
static void
simd_absolute_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
{
    #define UNROLL 16

    const int vstep = npyv_nlanes_u8;
    const int wstep = vstep * UNROLL;

    // Load vector slot N, map every non-zero lane to true and store.
    #define BOOL_ABS_SLOT(N)                                              \
        do {                                                              \
            const npyv_u8 src = npyv_load_u8(ip + vstep * (N));           \
            const npyv_u8 res = byte_to_true(src);                        \
            npyv_store_u8(op + vstep * (N), res);                         \
        } while (0)

    // Wide phase: UNROLL (16) vectors per iteration, slots in ascending order
    while (len >= wstep) {
        BOOL_ABS_SLOT(0);
        BOOL_ABS_SLOT(1);
        BOOL_ABS_SLOT(2);
        BOOL_ABS_SLOT(3);
        BOOL_ABS_SLOT(4);
        BOOL_ABS_SLOT(5);
        BOOL_ABS_SLOT(6);
        BOOL_ABS_SLOT(7);
        BOOL_ABS_SLOT(8);
        BOOL_ABS_SLOT(9);
        BOOL_ABS_SLOT(10);
        BOOL_ABS_SLOT(11);
        BOOL_ABS_SLOT(12);
        BOOL_ABS_SLOT(13);
        BOOL_ABS_SLOT(14);
        BOOL_ABS_SLOT(15);

        ip += wstep;
        op += wstep;
        len -= wstep;
    }
    #undef UNROLL

    // Narrow phase: one vector at a time
    while (len >= vstep) {
        BOOL_ABS_SLOT(0);

        ip += vstep;
        op += vstep;
        len -= vstep;
    }
    #undef BOOL_ABS_SLOT

    // Scalar phase: fewer than vstep elements remain
    for (npy_intp i = 0; i < len; ++i) {
        op[i] = (ip[i] != 0);
    }
}


#endif // NPY_SIMD

/*******************************************************************************
 ** Defining ufunc inner functions
 ******************************************************************************/

#line 239
/*
 * Try to run the vectorized element-wise logical OR kernel.
 *
 * Returns 1 if the SIMD kernel processed the whole call, 0 if it was not
 * used (no SIMD support in this build, or the IS_BLOCKABLE_BINARY check on
 * the operands failed) and the caller has to fall back to a generic loop.
 */
static NPY_INLINE int
run_binary_simd_logical_or_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    const int usable = sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_BINARY(sizeof(npy_bool), NPY_SIMD_WIDTH);
    if (!usable) {
        return 0;
    }
    // args[0], args[1] are the inputs, args[2] is the output
    npy_bool *lhs = (npy_bool*)args[0];
    npy_bool *rhs = (npy_bool*)args[1];
    npy_bool *dst = (npy_bool*)args[2];
    simd_binary_logical_or_BOOL(dst, lhs, rhs, dimensions[0]);
    return 1;
#else
    return 0;
#endif
}


/*
 * Try to run the vectorized logical OR reduction kernel.
 *
 * Returns 1 if the SIMD kernel processed the whole call, 0 if it was not
 * used (no SIMD support in this build, or the IS_BLOCKABLE_REDUCE check on
 * the operands failed) and the caller has to fall back to another loop.
 */
static NPY_INLINE int
run_reduce_simd_logical_or_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    const int usable = sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_REDUCE(sizeof(npy_bool), NPY_SIMD_WIDTH);
    if (!usable) {
        return 0;
    }
    // args[0] is the accumulator, args[1] the data being reduced
    npy_bool *acc = (npy_bool*)args[0];
    npy_bool *src = (npy_bool*)args[1];
    simd_reduce_logical_or_BOOL(acc, src, dimensions[0]);
    return 1;
#else
    return 0;
#endif
}

#line 239
/*
 * Try to run the vectorized element-wise logical AND kernel.
 *
 * Returns 1 if the SIMD kernel processed the whole call, 0 if it was not
 * used (no SIMD support in this build, or the IS_BLOCKABLE_BINARY check on
 * the operands failed) and the caller has to fall back to a generic loop.
 */
static NPY_INLINE int
run_binary_simd_logical_and_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    const int usable = sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_BINARY(sizeof(npy_bool), NPY_SIMD_WIDTH);
    if (!usable) {
        return 0;
    }
    // args[0], args[1] are the inputs, args[2] is the output
    npy_bool *lhs = (npy_bool*)args[0];
    npy_bool *rhs = (npy_bool*)args[1];
    npy_bool *dst = (npy_bool*)args[2];
    simd_binary_logical_and_BOOL(dst, lhs, rhs, dimensions[0]);
    return 1;
#else
    return 0;
#endif
}


/*
 * Try to run the vectorized logical AND reduction kernel.
 *
 * Returns 1 if the SIMD kernel processed the whole call, 0 if it was not
 * used (no SIMD support in this build, or the IS_BLOCKABLE_REDUCE check on
 * the operands failed) and the caller has to fall back to another loop.
 */
static NPY_INLINE int
run_reduce_simd_logical_and_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    const int usable = sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_REDUCE(sizeof(npy_bool), NPY_SIMD_WIDTH);
    if (!usable) {
        return 0;
    }
    // args[0] is the accumulator, args[1] the data being reduced
    npy_bool *acc = (npy_bool*)args[0];
    npy_bool *src = (npy_bool*)args[1];
    simd_reduce_logical_and_BOOL(acc, src, dimensions[0]);
    return 1;
#else
    return 0;
#endif
}


#line 272
/*
 * Try to run the vectorized logical NOT kernel.
 *
 * Returns 1 if the SIMD kernel processed the whole call, 0 if it was not
 * used (no SIMD support in this build, or the IS_BLOCKABLE_UNARY check on
 * the operands failed) and the caller has to fall back to a generic loop.
 */
static NPY_INLINE int
run_unary_simd_logical_not_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    const int usable = sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_UNARY(sizeof(npy_bool), NPY_SIMD_WIDTH);
    if (!usable) {
        return 0;
    }
    // args[0] is the input, args[1] the output
    npy_bool *src = (npy_bool*)args[0];
    npy_bool *dst = (npy_bool*)args[1];
    simd_logical_not_BOOL(dst, src, dimensions[0]);
    return 1;
#else
    return 0;
#endif
}

#line 272
/*
 * Try to run the vectorized boolean absolute-value kernel.
 *
 * Returns 1 if the SIMD kernel processed the whole call, 0 if it was not
 * used (no SIMD support in this build, or the IS_BLOCKABLE_UNARY check on
 * the operands failed) and the caller has to fall back to a generic loop.
 */
static NPY_INLINE int
run_unary_simd_absolute_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if NPY_SIMD
    const int usable = sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_UNARY(sizeof(npy_bool), NPY_SIMD_WIDTH);
    if (!usable) {
        return 0;
    }
    // args[0] is the input, args[1] the output
    npy_bool *src = (npy_bool*)args[0];
    npy_bool *dst = (npy_bool*)args[1];
    simd_absolute_BOOL(dst, src, dimensions[0]);
    return 1;
#else
    return 0;
#endif
}



#line 293
/*
 * ufunc inner loop for logical_and on booleans.
 *
 * Two modes, selected by IS_BINARY_REDUCE:
 *   - reduction: the accumulator is combined with the elements of the
 *     second operand and the loop stops as soon as the result is false;
 *   - element-wise: out = in1 && in2 for every element.
 *
 * In both modes a specialized path is tried first and a generic strided
 * loop is used when it does not apply.  Note that the reduction branch is
 * an if/else chain whose `if` part is selected by the preprocessor: the
 * trailing `else` below pairs with whichever `if` was compiled in.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_logical_and)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if(IS_BINARY_REDUCE) {
#if NPY_SIMD
        /*
         * stick with our variant for more reliable performance, only known
         * platform which outperforms it by ~20% is an i7 with glibc 2.17
         */
        if (run_reduce_simd_logical_and_BOOL(args, dimensions, steps)) {
            return;
        }
#else
        /*
         * no SIMD support in this build: use libc when the reduced operand
         * has a step of one byte
         */
        if (steps[1] == 1) {
            npy_bool * op = (npy_bool *)args[0];
#if 1
            /*
             * np.all(), search for a zero (false); skipped entirely when
             * the accumulator is already false
             */
            if (*op) {
                *op = memchr(args[1], 0, dimensions[0]) == NULL;
            }
#else
            /*
             * np.any(), search for a non-zero (true) via comparing against
             * zero blocks, memcmp is faster than memchr on SSE4 machines
             * with glibc >= 2.12 and memchr can only check for equal 1
             * (branch disabled for logical_and by the template)
             */
            static const npy_bool zero[4096]; /* zero by C standard */
            npy_uintp i, n = dimensions[0];

            for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
                *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
            }
            if (!*op && n - i > 0) {
                *op = memcmp(&args[1][i], zero, n - i) != 0;
            }
#endif
            return;
        }
#endif
        /* generic reduction, pairs with the `if` selected above */
        else {
            BINARY_REDUCE_LOOP(npy_bool) {
                const npy_bool in2 = *(npy_bool *)ip2;
                io1 = io1 && in2;
                /* result is decided once it turns false */
                if (io1 == 0) {
                    break;
                }
            }
            *((npy_bool *)iop1) = io1;
        }
    }
    else {
        /* element-wise: vectorized kernel first, generic loop otherwise */
        if (run_binary_simd_logical_and_BOOL(args, dimensions, steps)) {
            return;
        }
        else {
            BINARY_LOOP {
                const npy_bool in1 = *(npy_bool *)ip1;
                const npy_bool in2 = *(npy_bool *)ip2;
                *((npy_bool *)op1) = in1 && in2;
            }
        }
    }
}

#line 293
/*
 * ufunc inner loop for logical_or on booleans.
 *
 * Two modes, selected by IS_BINARY_REDUCE:
 *   - reduction: the accumulator is combined with the elements of the
 *     second operand and the loop stops as soon as the result is true;
 *   - element-wise: out = in1 || in2 for every element.
 *
 * In both modes a specialized path is tried first and a generic strided
 * loop is used when it does not apply.  Note that the reduction branch is
 * an if/else chain whose `if` part is selected by the preprocessor: the
 * trailing `else` below pairs with whichever `if` was compiled in.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_logical_or)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if(IS_BINARY_REDUCE) {
#if NPY_SIMD
        /*
         * stick with our variant for more reliable performance, only known
         * platform which outperforms it by ~20% is an i7 with glibc 2.17
         */
        if (run_reduce_simd_logical_or_BOOL(args, dimensions, steps)) {
            return;
        }
#else
        /*
         * no SIMD support in this build: use libc when the reduced operand
         * has a step of one byte
         */
        if (steps[1] == 1) {
            npy_bool * op = (npy_bool *)args[0];
#if 0
            /*
             * np.all(), search for a zero (false)
             * (branch disabled for logical_or by the template)
             */
            if (*op) {
                *op = memchr(args[1], 0, dimensions[0]) == NULL;
            }
#else
            /*
             * np.any(), search for a non-zero (true) via comparing against
             * zero blocks, memcmp is faster than memchr on SSE4 machines
             * with glibc >= 2.12 and memchr can only check for equal 1
             */
            static const npy_bool zero[4096]; /* zero by C standard */
            npy_uintp i, n = dimensions[0];

            /* whole blocks of sizeof(zero) bytes, until a true is found */
            for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
                *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
            }
            /* remaining partial block */
            if (!*op && n - i > 0) {
                *op = memcmp(&args[1][i], zero, n - i) != 0;
            }
#endif
            return;
        }
#endif
        /* generic reduction, pairs with the `if` selected above */
        else {
            BINARY_REDUCE_LOOP(npy_bool) {
                const npy_bool in2 = *(npy_bool *)ip2;
                io1 = io1 || in2;
                /* result is decided once it turns true */
                if (io1 != 0) {
                    break;
                }
            }
            *((npy_bool *)iop1) = io1;
        }
    }
    else {
        /* element-wise: vectorized kernel first, generic loop otherwise */
        if (run_binary_simd_logical_or_BOOL(args, dimensions, steps)) {
            return;
        }
        else {
            BINARY_LOOP {
                const npy_bool in1 = *(npy_bool *)ip1;
                const npy_bool in2 = *(npy_bool *)ip2;
                *((npy_bool *)op1) = in1 || in2;
            }
        }
    }
}


#line 363
/*
 * ufunc inner loop for logical_not on booleans: out = (in == 0).
 * The vectorized kernel is tried first; the generic strided loop runs only
 * when that kernel reports it did not handle the call.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_logical_not)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (!run_unary_simd_logical_not_BOOL(args, dimensions, steps)) {
        UNARY_LOOP {
            const npy_bool value = *(npy_bool *)ip1;
            *((npy_bool *)op1) = !value;
        }
    }
}

#line 363
/*
 * ufunc inner loop for absolute on booleans: out = (in != 0), i.e. every
 * non-zero byte is normalized to true.
 * The vectorized kernel is tried first; the generic strided loop runs only
 * when that kernel reports it did not handle the call.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_absolute)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (!run_unary_simd_absolute_BOOL(args, dimensions, steps)) {
        UNARY_LOOP {
            const npy_bool value = *(npy_bool *)ip1;
            *((npy_bool *)op1) = !!value;
        }
    }
}



