#line 1 "numpy/core/src/umath/loops_arithmetic.dispatch.c.src"

/*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 **       Changes should be made to the original source (.src) file         **
 *****************************************************************************
 */

#line 1
/*@targets
 ** $maxopt baseline
 ** sse2 sse41 avx2 avx512f avx512_skx
 ** vsx2 vsx4
 ** neon
 ** vx
 **/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

//###############################################################################
//## Division
//###############################################################################
/********************************************************************************
 ** Defining the SIMD kernels
 *
 * Floor division of signed integers is based on T. Granlund and
 * P. L. Montgomery, "Division by invariant integers using multiplication"
 * (see [Figure 6.1] http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556).
 * For details on TRUNC division, see simd/intdiv.h.
 ***********************************************************************************
 ** Figure 6.1: Signed division by run-time invariant divisor, rounded towards -INF
 ***********************************************************************************
 * For q = FLOOR(a/d), all sword:
 *     sword -dsign = SRL(d, N - 1);
 *     uword -nsign = (n < -dsign);
 *     uword -qsign = EOR(-nsign, -dsign);
 *     q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign);
 ********************************************************************************/

// SIMD_DISABLE_DIV64_OPT: when defined, the `#if` guards of the 64-bit
// simd_divide_by_scalar_contig_* kernels below evaluate to false, so those
// kernels are not compiled.
#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON)
    // Due to integer 128-bit multiplication emulation, SIMD 64-bit division
    // may not perform well on both neon and up to VSX3 compared to scalar
    // division.
    #define SIMD_DISABLE_DIV64_OPT
#endif

#if NPY_SIMD
#line 52
#if 8 < 64 || (8 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
/*
 * Floor-divides a contiguous run of npyv_lanetype_s8 elements by one scalar.
 *
 * args[0] is read as the dividend array, args[1] as a pointer to the scalar
 * divisor and args[2] as the destination array; len is the element count.
 * The divisor is not tested against zero in this function.
 *
 * A divisor of -1 is treated as a negation so that NPY_MIN_INT8 can be
 * detected: those elements are written as NPY_MIN_INT8 and the overflow
 * status flag is raised once after all elements were processed.
 */
static inline void
simd_divide_by_scalar_contig_s8(char **args, npy_intp len)
{
    npyv_lanetype_s8 *src   = (npyv_lanetype_s8 *) args[0];
    npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[1];
    npyv_lanetype_s8 *dst   = (npyv_lanetype_s8 *) args[2];
    const int vstep         = npyv_nlanes_s8;

    if (scalar == -1) {
        const npyv_s8 vzero = npyv_zero_s8();
        const npyv_s8 vmin  = npyv_setall_s8(NPY_MIN_INT8);
        // Stays all-true only while every loaded lane is above the minimum.
        npyv_b8 noverflow   = npyv_cvt_b8_s8(npyv_setall_s8(-1));
        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
            npyv_s8 a      = npyv_load_s8(src);
            npyv_b8 gt_min = npyv_cmpgt_s8(a, vmin);
            noverflow      = npyv_and_b8(noverflow, gt_min);
            // Vector counterpart of the scalar tail below (npyv_ifsub is
            // expected to yield 0 - a where gt_min is set, vmin elsewhere).
            npyv_s8 neg    = npyv_ifsub_s8(gt_min, vzero, a, vmin);
            npyv_store_s8(dst, neg);
        }

        int raise_err = npyv_tobits_b8(npyv_not_b8(noverflow)) != 0;
        for (; len > 0; --len, ++src, ++dst) {
            npyv_lanetype_s8 a = *src;
            if (a == NPY_MIN_INT8) {
                raise_err = 1;
                *dst  = NPY_MIN_INT8;
            } else {
                *dst = -a;
            }
        }
        if (raise_err) {
            npy_set_floatstatus_overflow();
        }
    } else {
        // Only the general path needs the precomputed divisor parameters.
        const npyv_s8x3 divisor = npyv_divisor_s8(scalar);
        // Loop-invariant broadcasts, built once instead of per iteration.
        const npyv_s8 nsign_d   = npyv_setall_s8(scalar < 0);
        const npyv_s8 vone      = npyv_setall_s8(1);
        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
            npyv_s8 a         = npyv_load_s8(src);
            // 1 in lanes where a < nsign_d, 0 elsewhere
            npyv_s8 nsign_a   = npyv_cvt_s8_b8(npyv_cmplt_s8(a, nsign_d));
            nsign_a           = npyv_and_s8(nsign_a, vone);
            npyv_s8 diff_sign = npyv_sub_s8(nsign_a, nsign_d);
            npyv_s8 to_ninf   = npyv_xor_s8(nsign_a, nsign_d);
            npyv_s8 trunc     = npyv_divc_s8(npyv_add_s8(a, diff_sign), divisor);
            npyv_s8 q_floor   = npyv_sub_s8(trunc, to_ninf);
            npyv_store_s8(dst, q_floor);
        }

        for (; len > 0; --len, ++src, ++dst) {
            const npyv_lanetype_s8 a = *src;
            npyv_lanetype_s8 r = a / scalar;
            // Negative quotients needs to be rounded down
            if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) {
                r--;
            }
            *dst = r;
        }
    }
    npyv_cleanup();
}
#endif

#line 52
#if 16 < 64 || (16 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
/*
 * Floor-divides a contiguous run of npyv_lanetype_s16 elements by one scalar.
 *
 * args[0] is read as the dividend array, args[1] as a pointer to the scalar
 * divisor and args[2] as the destination array; len is the element count.
 * The divisor is not tested against zero in this function.
 *
 * A divisor of -1 is treated as a negation so that NPY_MIN_INT16 can be
 * detected: those elements are written as NPY_MIN_INT16 and the overflow
 * status flag is raised once after all elements were processed.
 */
static inline void
simd_divide_by_scalar_contig_s16(char **args, npy_intp len)
{
    npyv_lanetype_s16 *src   = (npyv_lanetype_s16 *) args[0];
    npyv_lanetype_s16 scalar = *(npyv_lanetype_s16 *) args[1];
    npyv_lanetype_s16 *dst   = (npyv_lanetype_s16 *) args[2];
    const int vstep          = npyv_nlanes_s16;

    if (scalar == -1) {
        const npyv_s16 vzero = npyv_zero_s16();
        const npyv_s16 vmin  = npyv_setall_s16(NPY_MIN_INT16);
        // Stays all-true only while every loaded lane is above the minimum.
        npyv_b16 noverflow   = npyv_cvt_b16_s16(npyv_setall_s16(-1));
        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
            npyv_s16 a      = npyv_load_s16(src);
            npyv_b16 gt_min = npyv_cmpgt_s16(a, vmin);
            noverflow       = npyv_and_b16(noverflow, gt_min);
            // Vector counterpart of the scalar tail below (npyv_ifsub is
            // expected to yield 0 - a where gt_min is set, vmin elsewhere).
            npyv_s16 neg    = npyv_ifsub_s16(gt_min, vzero, a, vmin);
            npyv_store_s16(dst, neg);
        }

        int raise_err = npyv_tobits_b16(npyv_not_b16(noverflow)) != 0;
        for (; len > 0; --len, ++src, ++dst) {
            npyv_lanetype_s16 a = *src;
            if (a == NPY_MIN_INT16) {
                raise_err = 1;
                *dst  = NPY_MIN_INT16;
            } else {
                *dst = -a;
            }
        }
        if (raise_err) {
            npy_set_floatstatus_overflow();
        }
    } else {
        // Only the general path needs the precomputed divisor parameters.
        const npyv_s16x3 divisor = npyv_divisor_s16(scalar);
        // Loop-invariant broadcasts, built once instead of per iteration.
        const npyv_s16 nsign_d   = npyv_setall_s16(scalar < 0);
        const npyv_s16 vone      = npyv_setall_s16(1);
        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
            npyv_s16 a         = npyv_load_s16(src);
            // 1 in lanes where a < nsign_d, 0 elsewhere
            npyv_s16 nsign_a   = npyv_cvt_s16_b16(npyv_cmplt_s16(a, nsign_d));
            nsign_a            = npyv_and_s16(nsign_a, vone);
            npyv_s16 diff_sign = npyv_sub_s16(nsign_a, nsign_d);
            npyv_s16 to_ninf   = npyv_xor_s16(nsign_a, nsign_d);
            npyv_s16 trunc     = npyv_divc_s16(npyv_add_s16(a, diff_sign), divisor);
            npyv_s16 q_floor   = npyv_sub_s16(trunc, to_ninf);
            npyv_store_s16(dst, q_floor);
        }

        for (; len > 0; --len, ++src, ++dst) {
            const npyv_lanetype_s16 a = *src;
            npyv_lanetype_s16 r = a / scalar;
            // Negative quotients needs to be rounded down
            if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) {
                r--;
            }
            *dst = r;
        }
    }
    npyv_cleanup();
}
#endif

#line 52
#if 32 < 64 || (32 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
/*
 * Floor-divides a contiguous run of npyv_lanetype_s32 elements by one scalar.
 *
 * args[0] is read as the dividend array, args[1] as a pointer to the scalar
 * divisor and args[2] as the destination array; len is the element count.
 * The divisor is not tested against zero in this function.
 *
 * A divisor of -1 is treated as a negation so that NPY_MIN_INT32 can be
 * detected: those elements are written as NPY_MIN_INT32 and the overflow
 * status flag is raised once after all elements were processed.
 */
static inline void
simd_divide_by_scalar_contig_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 *src   = (npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 scalar = *(npyv_lanetype_s32 *) args[1];
    npyv_lanetype_s32 *dst   = (npyv_lanetype_s32 *) args[2];
    const int vstep          = npyv_nlanes_s32;

    if (scalar == -1) {
        const npyv_s32 vzero = npyv_zero_s32();
        const npyv_s32 vmin  = npyv_setall_s32(NPY_MIN_INT32);
        // Stays all-true only while every loaded lane is above the minimum.
        npyv_b32 noverflow   = npyv_cvt_b32_s32(npyv_setall_s32(-1));
        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
            npyv_s32 a      = npyv_load_s32(src);
            npyv_b32 gt_min = npyv_cmpgt_s32(a, vmin);
            noverflow       = npyv_and_b32(noverflow, gt_min);
            // Vector counterpart of the scalar tail below (npyv_ifsub is
            // expected to yield 0 - a where gt_min is set, vmin elsewhere).
            npyv_s32 neg    = npyv_ifsub_s32(gt_min, vzero, a, vmin);
            npyv_store_s32(dst, neg);
        }

        int raise_err = npyv_tobits_b32(npyv_not_b32(noverflow)) != 0;
        for (; len > 0; --len, ++src, ++dst) {
            npyv_lanetype_s32 a = *src;
            if (a == NPY_MIN_INT32) {
                raise_err = 1;
                *dst  = NPY_MIN_INT32;
            } else {
                *dst = -a;
            }
        }
        if (raise_err) {
            npy_set_floatstatus_overflow();
        }
    } else {
        // Only the general path needs the precomputed divisor parameters.
        const npyv_s32x3 divisor = npyv_divisor_s32(scalar);
        // Loop-invariant broadcasts, built once instead of per iteration.
        const npyv_s32 nsign_d   = npyv_setall_s32(scalar < 0);
        const npyv_s32 vone      = npyv_setall_s32(1);
        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
            npyv_s32 a         = npyv_load_s32(src);
            // 1 in lanes where a < nsign_d, 0 elsewhere
            npyv_s32 nsign_a   = npyv_cvt_s32_b32(npyv_cmplt_s32(a, nsign_d));
            nsign_a            = npyv_and_s32(nsign_a, vone);
            npyv_s32 diff_sign = npyv_sub_s32(nsign_a, nsign_d);
            npyv_s32 to_ninf   = npyv_xor_s32(nsign_a, nsign_d);
            npyv_s32 trunc     = npyv_divc_s32(npyv_add_s32(a, diff_sign), divisor);
            npyv_s32 q_floor   = npyv_sub_s32(trunc, to_ninf);
            npyv_store_s32(dst, q_floor);
        }

        for (; len > 0; --len, ++src, ++dst) {
            const npyv_lanetype_s32 a = *src;
            npyv_lanetype_s32 r = a / scalar;
            // Negative quotients needs to be rounded down
            if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) {
                r--;
            }
            *dst = r;
        }
    }
    npyv_cleanup();
}
#endif

#line 52
#if 64 < 64 || (64 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
/*
 * Floor-divides a contiguous run of npyv_lanetype_s64 elements by one scalar.
 *
 * args[0] is read as the dividend array, args[1] as a pointer to the scalar
 * divisor and args[2] as the destination array; len is the element count.
 * The divisor is not tested against zero in this function.
 *
 * A divisor of -1 is treated as a negation so that NPY_MIN_INT64 can be
 * detected: those elements are written as NPY_MIN_INT64 and the overflow
 * status flag is raised once after all elements were processed.
 */
static inline void
simd_divide_by_scalar_contig_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 *src   = (npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 scalar = *(npyv_lanetype_s64 *) args[1];
    npyv_lanetype_s64 *dst   = (npyv_lanetype_s64 *) args[2];
    const int vstep          = npyv_nlanes_s64;

    if (scalar == -1) {
        const npyv_s64 vzero = npyv_zero_s64();
        const npyv_s64 vmin  = npyv_setall_s64(NPY_MIN_INT64);
        // Stays all-true only while every loaded lane is above the minimum.
        npyv_b64 noverflow   = npyv_cvt_b64_s64(npyv_setall_s64(-1));
        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
            npyv_s64 a      = npyv_load_s64(src);
            npyv_b64 gt_min = npyv_cmpgt_s64(a, vmin);
            noverflow       = npyv_and_b64(noverflow, gt_min);
            // Vector counterpart of the scalar tail below (npyv_ifsub is
            // expected to yield 0 - a where gt_min is set, vmin elsewhere).
            npyv_s64 neg    = npyv_ifsub_s64(gt_min, vzero, a, vmin);
            npyv_store_s64(dst, neg);
        }

        int raise_err = npyv_tobits_b64(npyv_not_b64(noverflow)) != 0;
        for (; len > 0; --len, ++src, ++dst) {
            npyv_lanetype_s64 a = *src;
            if (a == NPY_MIN_INT64) {
                raise_err = 1;
                *dst  = NPY_MIN_INT64;
            } else {
                *dst = -a;
            }
        }
        if (raise_err) {
            npy_set_floatstatus_overflow();
        }
    } else {
        // Only the general path needs the precomputed divisor parameters.
        const npyv_s64x3 divisor = npyv_divisor_s64(scalar);
        // Loop-invariant broadcasts, built once instead of per iteration.
        const npyv_s64 nsign_d   = npyv_setall_s64(scalar < 0);
        const npyv_s64 vone      = npyv_setall_s64(1);
        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
            npyv_s64 a         = npyv_load_s64(src);
            // 1 in lanes where a < nsign_d, 0 elsewhere
            npyv_s64 nsign_a   = npyv_cvt_s64_b64(npyv_cmplt_s64(a, nsign_d));
            nsign_a            = npyv_and_s64(nsign_a, vone);
            npyv_s64 diff_sign = npyv_sub_s64(nsign_a, nsign_d);
            npyv_s64 to_ninf   = npyv_xor_s64(nsign_a, nsign_d);
            npyv_s64 trunc     = npyv_divc_s64(npyv_add_s64(a, diff_sign), divisor);
            npyv_s64 q_floor   = npyv_sub_s64(trunc, to_ninf);
            npyv_store_s64(dst, q_floor);
        }

        for (; len > 0; --len, ++src, ++dst) {
            const npyv_lanetype_s64 a = *src;
            npyv_lanetype_s64 r = a / scalar;
            // Negative quotients needs to be rounded down
            if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) {
                r--;
            }
            *dst = r;
        }
    }
    npyv_cleanup();
}
#endif


#line 120
#if 8 < 64 || (8 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
/*
 * Divides every element of a contiguous npyv_lanetype_u8 array by one scalar.
 *
 * args[0]: dividends, args[1]: pointer to the divisor, args[2]: destination;
 * len: number of elements. The divisor is not tested against zero here.
 */
static inline void
simd_divide_by_scalar_contig_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *in      = (npyv_lanetype_u8 *) args[0];
    const npyv_lanetype_u8 d  = *(npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *out     = (npyv_lanetype_u8 *) args[2];
    const int nlanes          = npyv_nlanes_u8;
    const npyv_u8x3 d_params  = npyv_divisor_u8(d);

    // Whole vectors: divide through the precomputed divisor parameters.
    while (len >= nlanes) {
        npyv_u8 quot = npyv_divc_u8(npyv_load_u8(in), d_params);
        npyv_store_u8(out, quot);
        len -= nlanes;
        in  += nlanes;
        out += nlanes;
    }

    // Leftover elements, one at a time.
    while (len > 0) {
        const npyv_lanetype_u8 x = *in;
        *out = x / d;
        --len;
        ++in;
        ++out;
    }
    npyv_cleanup();
}
#endif

#line 120
#if 16 < 64 || (16 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
/*
 * Divides every element of a contiguous npyv_lanetype_u16 array by one scalar.
 *
 * args[0]: dividends, args[1]: pointer to the divisor, args[2]: destination;
 * len: number of elements. The divisor is not tested against zero here.
 */
static inline void
simd_divide_by_scalar_contig_u16(char **args, npy_intp len)
{
    npyv_lanetype_u16 *in      = (npyv_lanetype_u16 *) args[0];
    const npyv_lanetype_u16 d  = *(npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u16 *out     = (npyv_lanetype_u16 *) args[2];
    const int nlanes           = npyv_nlanes_u16;
    const npyv_u16x3 d_params  = npyv_divisor_u16(d);

    // Whole vectors: divide through the precomputed divisor parameters.
    while (len >= nlanes) {
        npyv_u16 quot = npyv_divc_u16(npyv_load_u16(in), d_params);
        npyv_store_u16(out, quot);
        len -= nlanes;
        in  += nlanes;
        out += nlanes;
    }

    // Leftover elements, one at a time.
    while (len > 0) {
        const npyv_lanetype_u16 x = *in;
        *out = x / d;
        --len;
        ++in;
        ++out;
    }
    npyv_cleanup();
}
#endif

#line 120
#if 32 < 64 || (32 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
/*
 * Divides every element of a contiguous npyv_lanetype_u32 array by one scalar.
 *
 * args[0]: dividends, args[1]: pointer to the divisor, args[2]: destination;
 * len: number of elements. The divisor is not tested against zero here.
 */
static inline void
simd_divide_by_scalar_contig_u32(char **args, npy_intp len)
{
    npyv_lanetype_u32 *in      = (npyv_lanetype_u32 *) args[0];
    const npyv_lanetype_u32 d  = *(npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u32 *out     = (npyv_lanetype_u32 *) args[2];
    const int nlanes           = npyv_nlanes_u32;
    const npyv_u32x3 d_params  = npyv_divisor_u32(d);

    // Whole vectors: divide through the precomputed divisor parameters.
    while (len >= nlanes) {
        npyv_u32 quot = npyv_divc_u32(npyv_load_u32(in), d_params);
        npyv_store_u32(out, quot);
        len -= nlanes;
        in  += nlanes;
        out += nlanes;
    }

    // Leftover elements, one at a time.
    while (len > 0) {
        const npyv_lanetype_u32 x = *in;
        *out = x / d;
        --len;
        ++in;
        ++out;
    }
    npyv_cleanup();
}
#endif

#line 120
#if 64 < 64 || (64 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
/*
 * Divides every element of a contiguous npyv_lanetype_u64 array by one scalar.
 *
 * args[0]: dividends, args[1]: pointer to the divisor, args[2]: destination;
 * len: number of elements. The divisor is not tested against zero here.
 */
static inline void
simd_divide_by_scalar_contig_u64(char **args, npy_intp len)
{
    npyv_lanetype_u64 *in      = (npyv_lanetype_u64 *) args[0];
    const npyv_lanetype_u64 d  = *(npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u64 *out     = (npyv_lanetype_u64 *) args[2];
    const int nlanes           = npyv_nlanes_u64;
    const npyv_u64x3 d_params  = npyv_divisor_u64(d);

    // Whole vectors: divide through the precomputed divisor parameters.
    while (len >= nlanes) {
        npyv_u64 quot = npyv_divc_u64(npyv_load_u64(in), d_params);
        npyv_store_u64(out, quot);
        len -= nlanes;
        in  += nlanes;
        out += nlanes;
    }

    // Leftover elements, one at a time.
    while (len > 0) {
        const npyv_lanetype_u64 x = *in;
        *out = x / d;
        --len;
        ++in;
        ++out;
    }
    npyv_cleanup();
}
#endif


#if defined(NPY_HAVE_VSX4)

#line 151
/*
 * Computes the lane-wise quotient of two vectors of 8-bit unsigned integers.
 *
 * As Power10 only supports integer vector division for data of 32 bits or
 * greater, we have to convert each npyv_u8 into 4x npyv_u32, execute the
 * integer vector division instruction, and then convert the result back to
 * npyv_u8.
 */
NPY_FINLINE npyv_u8
vsx4_div_u8(npyv_u8 a, npyv_u8 b)
{
    // Widen 8 -> 16 -> 32 bits
    npyv_u16x2 a_expand = npyv_expand_u16_u8(a);
    npyv_u16x2 b_expand = npyv_expand_u16_u8(b);
    npyv_u32x2 ahi = npyv_expand_u32_u16(a_expand.val[0]);
    npyv_u32x2 alo = npyv_expand_u32_u16(a_expand.val[1]);
    npyv_u32x2 bhi = npyv_expand_u32_u16(b_expand.val[0]);
    npyv_u32x2 blo = npyv_expand_u32_u16(b_expand.val[1]);
    // Divide the four 32-bit quarters
    npyv_u32 v1 = vec_div(ahi.val[0], bhi.val[0]);
    npyv_u32 v2 = vec_div(ahi.val[1], bhi.val[1]);
    npyv_u32 v3 = vec_div(alo.val[0], blo.val[0]);
    npyv_u32 v4 = vec_div(alo.val[1], blo.val[1]);
    // Narrow 32 -> 16 -> 8 bits
    npyv_u16 hi = vec_pack(v1, v2);
    npyv_u16 lo = vec_pack(v3, v4);
    return vec_pack(hi, lo);
}

/*
 * Computes the lane-wise quotient of two vectors of 16-bit unsigned integers
 * by widening each operand into 2x npyv_u32, dividing with vec_div and
 * packing the two 32-bit results back into one npyv_u16.
 */
NPY_FINLINE npyv_u16
vsx4_div_u16(npyv_u16 a, npyv_u16 b)
{
    npyv_u32x2 a_expand = npyv_expand_u32_u16(a);
    npyv_u32x2 b_expand = npyv_expand_u32_u16(b);
    npyv_u32 v1 = vec_div(a_expand.val[0], b_expand.val[0]);
    npyv_u32 v2 = vec_div(a_expand.val[1], b_expand.val[1]);
    return vec_pack(v1, v2);
}

// 32- and 64-bit unsigned lanes map directly onto vec_div (no widening step).
#define vsx4_div_u32 vec_div
#define vsx4_div_u64 vec_div

#line 151
/*
 * Computes the lane-wise quotient of two vectors of 8-bit signed integers.
 *
 * As Power10 only supports integer vector division for data of 32 bits or
 * greater, we have to convert each npyv_s8 into 4x npyv_s32, execute the
 * integer vector division instruction, and then convert the result back to
 * npyv_s8.
 */
NPY_FINLINE npyv_s8
vsx4_div_s8(npyv_s8 a, npyv_s8 b)
{
    npyv_s16x2 ta, tb;
    npyv_s32x2 ahi, alo, bhi, blo;
    // Widen 8 -> 16 bits (high and low halves)
    ta.val[0] = vec_unpackh(a);
    ta.val[1] = vec_unpackl(a);
    tb.val[0] = vec_unpackh(b);
    tb.val[1] = vec_unpackl(b);
    // Widen 16 -> 32 bits
    ahi.val[0] = vec_unpackh(ta.val[0]);
    ahi.val[1] = vec_unpackl(ta.val[0]);
    alo.val[0] = vec_unpackh(ta.val[1]);
    alo.val[1] = vec_unpackl(ta.val[1]);
    bhi.val[0] = vec_unpackh(tb.val[0]);
    bhi.val[1] = vec_unpackl(tb.val[0]);
    blo.val[0] = vec_unpackh(tb.val[1]);
    blo.val[1] = vec_unpackl(tb.val[1]);
    // Divide the four 32-bit quarters
    npyv_s32 v1 = vec_div(ahi.val[0], bhi.val[0]);
    npyv_s32 v2 = vec_div(ahi.val[1], bhi.val[1]);
    npyv_s32 v3 = vec_div(alo.val[0], blo.val[0]);
    npyv_s32 v4 = vec_div(alo.val[1], blo.val[1]);
    // Narrow 32 -> 16 -> 8 bits
    npyv_s16 hi = vec_pack(v1, v2);
    npyv_s16 lo = vec_pack(v3, v4);
    return vec_pack(hi, lo);
}

/*
 * Computes the lane-wise quotient of two vectors of 16-bit signed integers
 * by widening each operand into 2x npyv_s32 (vec_unpackh / vec_unpackl),
 * dividing with vec_div and packing the two 32-bit results back into one
 * npyv_s16.
 */
NPY_FINLINE npyv_s16
vsx4_div_s16(npyv_s16 a, npyv_s16 b)
{
    npyv_s32x2 a_expand;
    npyv_s32x2 b_expand;
    a_expand.val[0] = vec_unpackh(a);
    a_expand.val[1] = vec_unpackl(a);
    b_expand.val[0] = vec_unpackh(b);
    b_expand.val[1] = vec_unpackl(b);
    npyv_s32 v1 = vec_div(a_expand.val[0], b_expand.val[0]);
    npyv_s32 v2 = vec_div(a_expand.val[1], b_expand.val[1]);
    return vec_pack(v1, v2);
}

// 32- and 64-bit signed lanes map directly onto vec_div (no widening step).
#define vsx4_div_s32 vec_div
#define vsx4_div_s64 vec_div


#line 221
/*
 * Element-wise division of two contiguous npyv_lanetype_u8 arrays.
 *
 * args[0] and args[1] are read as the dividend and divisor arrays, args[2]
 * as the destination; len is the element count. Wherever the divisor is 0
 * the stored result is 0 and the divide-by-zero status flag is raised.
 */
static inline void
vsx4_simd_divide_contig_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vzero    = npyv_zero_u8();
    const int vstep        = npyv_nlanes_u8;

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
        npyv_u8 a     = npyv_load_u8(src1);
        npyv_u8 b     = npyv_load_u8(src2);
        npyv_b8 bzero = npyv_cmpeq_u8(b, vzero);
        npyv_u8 c     = vsx4_div_u8(a, b);
        // Divide by zero: do not rely on whatever the vector division left
        // in those lanes; store 0 exactly like the scalar tail below.
        c = npyv_select_u8(bzero, vzero, c);
        npyv_store_u8(dst1, c);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
            npy_set_floatstatus_divbyzero();
        }
    }

    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_u8 a = *src1;
        const npyv_lanetype_u8 b = *src2;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
        } else{
            *dst1 = a / b;
        }
    }
    npyv_cleanup();
}

#line 221
/*
 * Element-wise division of two contiguous npyv_lanetype_u16 arrays.
 *
 * args[0] and args[1] are read as the dividend and divisor arrays, args[2]
 * as the destination; len is the element count. Wherever the divisor is 0
 * the stored result is 0 and the divide-by-zero status flag is raised.
 */
static inline void
vsx4_simd_divide_contig_u16(char **args, npy_intp len)
{
    npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0];
    npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2];
    const npyv_u16 vzero    = npyv_zero_u16();
    const int vstep         = npyv_nlanes_u16;

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
        npyv_u16 a     = npyv_load_u16(src1);
        npyv_u16 b     = npyv_load_u16(src2);
        npyv_b16 bzero = npyv_cmpeq_u16(b, vzero);
        npyv_u16 c     = vsx4_div_u16(a, b);
        // Divide by zero: do not rely on whatever the vector division left
        // in those lanes; store 0 exactly like the scalar tail below.
        c = npyv_select_u16(bzero, vzero, c);
        npyv_store_u16(dst1, c);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
            npy_set_floatstatus_divbyzero();
        }
    }

    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_u16 a = *src1;
        const npyv_lanetype_u16 b = *src2;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
        } else{
            *dst1 = a / b;
        }
    }
    npyv_cleanup();
}

#line 221
/*
 * Element-wise division of two contiguous npyv_lanetype_u32 arrays.
 *
 * args[0] and args[1] are read as the dividend and divisor arrays, args[2]
 * as the destination; len is the element count. Wherever the divisor is 0
 * the stored result is 0 and the divide-by-zero status flag is raised.
 */
static inline void
vsx4_simd_divide_contig_u32(char **args, npy_intp len)
{
    npyv_lanetype_u32 *src1 = (npyv_lanetype_u32 *) args[0];
    npyv_lanetype_u32 *src2 = (npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u32 *dst1 = (npyv_lanetype_u32 *) args[2];
    const npyv_u32 vzero    = npyv_zero_u32();
    const int vstep         = npyv_nlanes_u32;

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
        npyv_u32 a     = npyv_load_u32(src1);
        npyv_u32 b     = npyv_load_u32(src2);
        npyv_b32 bzero = npyv_cmpeq_u32(b, vzero);
        npyv_u32 c     = vsx4_div_u32(a, b);
        // Divide by zero: do not rely on whatever the vector division left
        // in those lanes; store 0 exactly like the scalar tail below.
        c = npyv_select_u32(bzero, vzero, c);
        npyv_store_u32(dst1, c);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
            npy_set_floatstatus_divbyzero();
        }
    }

    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_u32 a = *src1;
        const npyv_lanetype_u32 b = *src2;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
        } else{
            *dst1 = a / b;
        }
    }
    npyv_cleanup();
}

#line 221
/*
 * Element-wise division of two contiguous npyv_lanetype_u64 arrays.
 *
 * args[0] and args[1] are read as the dividend and divisor arrays, args[2]
 * as the destination; len is the element count. Wherever the divisor is 0
 * the stored result is 0 and the divide-by-zero status flag is raised.
 */
static inline void
vsx4_simd_divide_contig_u64(char **args, npy_intp len)
{
    npyv_lanetype_u64 *src1 = (npyv_lanetype_u64 *) args[0];
    npyv_lanetype_u64 *src2 = (npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u64 *dst1 = (npyv_lanetype_u64 *) args[2];
    const npyv_u64 vzero    = npyv_zero_u64();
    const int vstep         = npyv_nlanes_u64;

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
        npyv_u64 a     = npyv_load_u64(src1);
        npyv_u64 b     = npyv_load_u64(src2);
        npyv_b64 bzero = npyv_cmpeq_u64(b, vzero);
        npyv_u64 c     = vsx4_div_u64(a, b);
        // Divide by zero: do not rely on whatever the vector division left
        // in those lanes; store 0 exactly like the scalar tail below.
        c = npyv_select_u64(bzero, vzero, c);
        npyv_store_u64(dst1, c);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
            npy_set_floatstatus_divbyzero();
        }
    }

    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_u64 a = *src1;
        const npyv_lanetype_u64 b = *src2;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
        } else{
            *dst1 = a / b;
        }
    }
    npyv_cleanup();
}


#line 260
/*
 * Element-wise floor division of two contiguous npyv_lanetype_s8 arrays.
 *
 * args[0] and args[1] are read as the dividend and divisor arrays, args[2]
 * as the destination; len is the element count.
 *
 * Special cases, handled identically by the vector loop and the scalar tail:
 *   - divisor == 0: result 0, divide-by-zero status flag raised;
 *   - NPY_MIN_INT8 / -1: result NPY_MIN_INT8, overflow status flag raised.
 */
static inline void
vsx4_simd_divide_contig_s8(char **args, npy_intp len)
{
    npyv_lanetype_s8 *src1 = (npyv_lanetype_s8 *) args[0];
    npyv_lanetype_s8 *src2 = (npyv_lanetype_s8 *) args[1];
    npyv_lanetype_s8 *dst1 = (npyv_lanetype_s8 *) args[2];
    const npyv_s8 vneg_one = npyv_setall_s8(-1);
    const npyv_s8 vzero    = npyv_zero_s8();
    const npyv_s8 vmin     = npyv_setall_s8(NPY_MIN_INT8);
    // Masks accumulating, across all vector iterations, the lanes that hit
    // a special case.
    npyv_b8 warn_zero      = npyv_cvt_b8_s8(npyv_zero_s8());
    npyv_b8 warn_overflow  = npyv_cvt_b8_s8(npyv_zero_s8());
    const int vstep        = npyv_nlanes_s8;

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
        npyv_s8 a   = npyv_load_s8(src1);
        npyv_s8 b   = npyv_load_s8(src2);
        npyv_s8 quo = vsx4_div_s8(a, b);
        npyv_s8 rem = npyv_sub_s8(a, vec_mul(b, quo));
        // (b == 0 || (a == NPY_MIN_INT8 && b == -1))
        npyv_b8 bzero    = npyv_cmpeq_s8(b, vzero);
        npyv_b8 amin     = npyv_cmpeq_s8(a, vmin);
        npyv_b8 bneg_one = npyv_cmpeq_s8(b, vneg_one);
        npyv_b8 overflow = npyv_and_b8(bneg_one, amin);
        warn_zero        = npyv_or_b8(bzero, warn_zero);
        warn_overflow    = npyv_or_b8(overflow, warn_overflow);
        // handle mixed case the way Python does
        // ((a > 0) == (b > 0) || rem == 0)
        npyv_b8 a_gt_zero = npyv_cmpgt_s8(a, vzero);
        npyv_b8 b_gt_zero = npyv_cmpgt_s8(b, vzero);
        // Mask lanes are all-ones or all-zeros, so NOT(XOR) is lane equality.
        npyv_b8 same_sign = npyv_not_b8(npyv_xor_b8(a_gt_zero, b_gt_zero));
        npyv_b8 rem_zero  = npyv_cmpeq_s8(rem, vzero);
        npyv_b8 keep_quo  = npyv_or_b8(same_sign, rem_zero);
        // Adding -1 rounds the truncated quotient down where required.
        npyv_s8 to_sub    = npyv_select_s8(keep_quo, vzero, vneg_one);
        quo = npyv_add_s8(quo, to_sub);
        // Divide by zero
        quo = npyv_select_s8(bzero, vzero, quo);
        // Overflow
        quo = npyv_select_s8(overflow, vmin, quo);
        npyv_store_s8(dst1, quo);
    }

    if (!vec_all_eq(warn_zero, vzero)) {
        npy_set_floatstatus_divbyzero();
    }
    if (!vec_all_eq(warn_overflow, vzero)) {
        npy_set_floatstatus_overflow();
    }

    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_s8 a = *src1;
        const npyv_lanetype_s8 b = *src2;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
        } else if (NPY_UNLIKELY((a == NPY_MIN_INT8) && (b == -1))) {
            npy_set_floatstatus_overflow();
            *dst1 = NPY_MIN_INT8;
        } else {
            *dst1 = a / b;
            // Round down when the signs differ and the division is inexact.
            if (((a > 0) != (b > 0)) && ((*dst1 * b) != a)) {
                *dst1 -= 1;
            }
        }
    }
    npyv_cleanup();
}

#line 260
/*
 * Element-wise floor division of two contiguous npyv_lanetype_s16 arrays.
 *
 * args[0] and args[1] are read as the dividend and divisor arrays, args[2]
 * as the destination; len is the element count.
 *
 * Special cases, handled identically by the vector loop and the scalar tail:
 *   - divisor == 0: result 0, divide-by-zero status flag raised;
 *   - NPY_MIN_INT16 / -1: result NPY_MIN_INT16, overflow status flag raised.
 */
static inline void
vsx4_simd_divide_contig_s16(char **args, npy_intp len)
{
    npyv_lanetype_s16 *src1 = (npyv_lanetype_s16 *) args[0];
    npyv_lanetype_s16 *src2 = (npyv_lanetype_s16 *) args[1];
    npyv_lanetype_s16 *dst1 = (npyv_lanetype_s16 *) args[2];
    const npyv_s16 vneg_one = npyv_setall_s16(-1);
    const npyv_s16 vzero    = npyv_zero_s16();
    const npyv_s16 vmin     = npyv_setall_s16(NPY_MIN_INT16);
    // Masks accumulating, across all vector iterations, the lanes that hit
    // a special case.
    npyv_b16 warn_zero      = npyv_cvt_b16_s16(npyv_zero_s16());
    npyv_b16 warn_overflow  = npyv_cvt_b16_s16(npyv_zero_s16());
    const int vstep         = npyv_nlanes_s16;

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
        npyv_s16 a   = npyv_load_s16(src1);
        npyv_s16 b   = npyv_load_s16(src2);
        npyv_s16 quo = vsx4_div_s16(a, b);
        npyv_s16 rem = npyv_sub_s16(a, vec_mul(b, quo));
        // (b == 0 || (a == NPY_MIN_INT16 && b == -1))
        npyv_b16 bzero    = npyv_cmpeq_s16(b, vzero);
        npyv_b16 amin     = npyv_cmpeq_s16(a, vmin);
        npyv_b16 bneg_one = npyv_cmpeq_s16(b, vneg_one);
        npyv_b16 overflow = npyv_and_b16(bneg_one, amin);
        warn_zero         = npyv_or_b16(bzero, warn_zero);
        warn_overflow     = npyv_or_b16(overflow, warn_overflow);
        // handle mixed case the way Python does
        // ((a > 0) == (b > 0) || rem == 0)
        npyv_b16 a_gt_zero = npyv_cmpgt_s16(a, vzero);
        npyv_b16 b_gt_zero = npyv_cmpgt_s16(b, vzero);
        // Mask lanes are all-ones or all-zeros, so NOT(XOR) is lane equality.
        npyv_b16 same_sign = npyv_not_b16(npyv_xor_b16(a_gt_zero, b_gt_zero));
        npyv_b16 rem_zero  = npyv_cmpeq_s16(rem, vzero);
        npyv_b16 keep_quo  = npyv_or_b16(same_sign, rem_zero);
        // Adding -1 rounds the truncated quotient down where required.
        npyv_s16 to_sub    = npyv_select_s16(keep_quo, vzero, vneg_one);
        quo = npyv_add_s16(quo, to_sub);
        // Divide by zero
        quo = npyv_select_s16(bzero, vzero, quo);
        // Overflow
        quo = npyv_select_s16(overflow, vmin, quo);
        npyv_store_s16(dst1, quo);
    }

    if (!vec_all_eq(warn_zero, vzero)) {
        npy_set_floatstatus_divbyzero();
    }
    if (!vec_all_eq(warn_overflow, vzero)) {
        npy_set_floatstatus_overflow();
    }

    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_s16 a = *src1;
        const npyv_lanetype_s16 b = *src2;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
        } else if (NPY_UNLIKELY((a == NPY_MIN_INT16) && (b == -1))) {
            npy_set_floatstatus_overflow();
            *dst1 = NPY_MIN_INT16;
        } else {
            *dst1 = a / b;
            // Round down when the signs differ and the division is inexact.
            if (((a > 0) != (b > 0)) && ((*dst1 * b) != a)) {
                *dst1 -= 1;
            }
        }
    }
    npyv_cleanup();
}

#line 260
/*
 * Element-wise floor division of two contiguous npyv_lanetype_s32 arrays.
 *
 * args[0] and args[1] are read as the dividend and divisor arrays, args[2]
 * as the destination; len is the element count.
 *
 * Special cases, handled identically by the vector loop and the scalar tail:
 *   - divisor == 0: result 0, divide-by-zero status flag raised;
 *   - NPY_MIN_INT32 / -1: result NPY_MIN_INT32, overflow status flag raised.
 */
static inline void
vsx4_simd_divide_contig_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 *src1 = (npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 *src2 = (npyv_lanetype_s32 *) args[1];
    npyv_lanetype_s32 *dst1 = (npyv_lanetype_s32 *) args[2];
    const npyv_s32 vneg_one = npyv_setall_s32(-1);
    const npyv_s32 vzero    = npyv_zero_s32();
    const npyv_s32 vmin     = npyv_setall_s32(NPY_MIN_INT32);
    // Masks accumulating, across all vector iterations, the lanes that hit
    // a special case.
    npyv_b32 warn_zero      = npyv_cvt_b32_s32(npyv_zero_s32());
    npyv_b32 warn_overflow  = npyv_cvt_b32_s32(npyv_zero_s32());
    const int vstep         = npyv_nlanes_s32;

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
        npyv_s32 a   = npyv_load_s32(src1);
        npyv_s32 b   = npyv_load_s32(src2);
        npyv_s32 quo = vsx4_div_s32(a, b);
        npyv_s32 rem = npyv_sub_s32(a, vec_mul(b, quo));
        // (b == 0 || (a == NPY_MIN_INT32 && b == -1))
        npyv_b32 bzero    = npyv_cmpeq_s32(b, vzero);
        npyv_b32 amin     = npyv_cmpeq_s32(a, vmin);
        npyv_b32 bneg_one = npyv_cmpeq_s32(b, vneg_one);
        npyv_b32 overflow = npyv_and_b32(bneg_one, amin);
        warn_zero         = npyv_or_b32(bzero, warn_zero);
        warn_overflow     = npyv_or_b32(overflow, warn_overflow);
        // handle mixed case the way Python does
        // ((a > 0) == (b > 0) || rem == 0)
        npyv_b32 a_gt_zero = npyv_cmpgt_s32(a, vzero);
        npyv_b32 b_gt_zero = npyv_cmpgt_s32(b, vzero);
        // Mask lanes are all-ones or all-zeros, so NOT(XOR) is lane equality.
        npyv_b32 same_sign = npyv_not_b32(npyv_xor_b32(a_gt_zero, b_gt_zero));
        npyv_b32 rem_zero  = npyv_cmpeq_s32(rem, vzero);
        npyv_b32 keep_quo  = npyv_or_b32(same_sign, rem_zero);
        // Adding -1 rounds the truncated quotient down where required.
        npyv_s32 to_sub    = npyv_select_s32(keep_quo, vzero, vneg_one);
        quo = npyv_add_s32(quo, to_sub);
        // Divide by zero
        quo = npyv_select_s32(bzero, vzero, quo);
        // Overflow
        quo = npyv_select_s32(overflow, vmin, quo);
        npyv_store_s32(dst1, quo);
    }

    if (!vec_all_eq(warn_zero, vzero)) {
        npy_set_floatstatus_divbyzero();
    }
    if (!vec_all_eq(warn_overflow, vzero)) {
        npy_set_floatstatus_overflow();
    }

    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_s32 a = *src1;
        const npyv_lanetype_s32 b = *src2;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
        } else if (NPY_UNLIKELY((a == NPY_MIN_INT32) && (b == -1))) {
            npy_set_floatstatus_overflow();
            *dst1 = NPY_MIN_INT32;
        } else {
            *dst1 = a / b;
            // Round down when the signs differ and the division is inexact.
            if (((a > 0) != (b > 0)) && ((*dst1 * b) != a)) {
                *dst1 -= 1;
            }
        }
    }
    npyv_cleanup();
}

#line 260
static inline void
vsx4_simd_divide_contig_s64(char **args, npy_intp len)
{
    /*
     * Element-wise floor division of two contiguous 64-bit signed arrays:
     * args[0] supplies the dividends, args[1] the divisors and args[2]
     * receives the quotients; `len` is the element count.
     *
     * A zero divisor stores 0 and NPY_MIN_INT64 / -1 stores NPY_MIN_INT64;
     * the divide-by-zero / overflow floating-point status is raised when
     * either situation occurred.
     */
    npyv_lanetype_s64 *num_ptr = (npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 *den_ptr = (npyv_lanetype_s64 *) args[1];
    npyv_lanetype_s64 *out_ptr = (npyv_lanetype_s64 *) args[2];
    const int lanes = npyv_nlanes_s64;
    const npyv_s64 v_zero    = npyv_zero_s64();
    const npyv_s64 v_neg_one = npyv_setall_s64(-1);
    const npyv_s64 v_int_min = npyv_setall_s64(NPY_MIN_INT64);
    // Accumulated over all vector iterations; inspected once after the loop.
    npyv_b64 seen_zero_div = npyv_cvt_b64_s64(npyv_zero_s64());
    npyv_b64 seen_overflow = npyv_cvt_b64_s64(npyv_zero_s64());

    // Vector part: consume one full vector of lanes per iteration.
    while (len >= lanes) {
        npyv_s64 num = npyv_load_s64(num_ptr);
        npyv_s64 den = npyv_load_s64(den_ptr);
        npyv_s64 q   = vsx4_div_s64(num, den);
        npyv_s64 r   = npyv_sub_s64(num, vec_mul(den, q));

        // Exceptional lanes: den == 0, or num == NPY_MIN_INT64 with den == -1.
        npyv_b64 den_is_zero    = npyv_cmpeq_s64(den, v_zero);
        npyv_b64 num_is_min     = npyv_cmpeq_s64(num, v_int_min);
        npyv_b64 den_is_neg_one = npyv_cmpeq_s64(den, v_neg_one);
        npyv_b64 lane_overflow  = npyv_and_s64(den_is_neg_one, num_is_min);
        seen_zero_div = npyv_or_s64(den_is_zero, seen_zero_div);
        seen_overflow = npyv_or_s64(lane_overflow, seen_overflow);

        // Round the way Python does: the quotient is kept where
        // ((num > 0) == (den > 0) || r == 0) and gets -1 added elsewhere.
        npyv_b64 num_positive = npyv_cmpgt_s64(num, v_zero);
        npyv_b64 den_positive = npyv_cmpgt_s64(den, v_zero);
        npyv_b64 same_sign    = npyv_cmpeq_s64(num_positive, den_positive);
        npyv_b64 exact        = npyv_cmpeq_s64(r, v_zero);
        npyv_b64 keep_q       = npyv_or_s64(same_sign, exact);
        npyv_s64 adjust       = npyv_select_s64(keep_q, v_zero, v_neg_one);
        q = npyv_add_s64(q, adjust);
        // Divide by zero
        q = npyv_select_s64(den_is_zero, v_zero, q);
        // Overflow
        q = npyv_select_s64(lane_overflow, v_int_min, q);
        npyv_store_s64(out_ptr, q);

        len     -= lanes;
        num_ptr += lanes;
        den_ptr += lanes;
        out_ptr += lanes;
    }

    if (!vec_all_eq(seen_zero_div, v_zero)) {
        npy_set_floatstatus_divbyzero();
    }
    if (!vec_all_eq(seen_overflow, v_zero)) {
        npy_set_floatstatus_overflow();
    }

    // Scalar tail: the remaining (fewer than one vector) elements.
    for (; len > 0; --len, ++num_ptr, ++den_ptr, ++out_ptr) {
        const npyv_lanetype_s64 num = *num_ptr;
        const npyv_lanetype_s64 den = *den_ptr;
        if (NPY_UNLIKELY(den == 0)) {
            npy_set_floatstatus_divbyzero();
            *out_ptr = 0;
            continue;
        }
        if (NPY_UNLIKELY((num == NPY_MIN_INT64) && (den == -1))) {
            npy_set_floatstatus_overflow();
            *out_ptr = NPY_MIN_INT64;
            continue;
        }
        npyv_lanetype_s64 q = num / den;
        if (((num > 0) != (den > 0)) && ((q * den) != num)) {
            q -= 1;
        }
        *out_ptr = q;
    }
    npyv_cleanup();
}

#endif // NPY_HAVE_VSX4
#endif // NPY_SIMD

/********************************************************************************
 ** Defining ufunc inner functions
 ********************************************************************************/

#line 340
/*
 * Select the signed NPYV suffix (_s8/_s16/_s32/_s64) whose lane width equals
 * NPY_BITSOF_BYTE, so that TO_SIMD_SFX(name) expands to the matching SIMD
 * kernel name. `#if 0` merely opens the chain so that each generated width
 * test can be an `#elif`. If no width matches, TO_SIMD_SFX stays undefined.
 */
#undef TO_SIMD_SFX
#if 0
#line 345
#elif NPY_BITSOF_BYTE == 8
    #define TO_SIMD_SFX(X) X##_s8

#line 345
#elif NPY_BITSOF_BYTE == 16
    #define TO_SIMD_SFX(X) X##_s16

#line 345
#elif NPY_BITSOF_BYTE == 32
    #define TO_SIMD_SFX(X) X##_s32

#line 345
#elif NPY_BITSOF_BYTE == 64
    #define TO_SIMD_SFX(X) X##_s64

#endif
/*
 * Drop the 64-bit SIMD suffix again when SIMD_DISABLE_DIV64_OPT is defined
 * (presumably set earlier in this file for targets where 64-bit SIMD division
 * is not worthwhile - confirm at its definition). The divide loop below only
 * compiles its SIMD branches when TO_SIMD_SFX is defined.
 */
#if NPY_BITSOF_BYTE == 64 && defined(SIMD_DISABLE_DIV64_OPT)
    #undef TO_SIMD_SFX
#endif

/*
 * Floor division for npy_byte: returns n / d rounded towards negative
 * infinity.
 *
 * Special cases:
 *   - d == 0 sets the floating-point divide-by-zero status and returns 0;
 *   - n == NPY_MIN_BYTE with d == -1 sets the overflow status and returns
 *     NPY_MIN_BYTE.
 *
 * FIXME: On x86 at least, dividing the smallest representable integer by -1
 * causes a SIGFPE (division overflow). That case is intercepted here (to
 * avoid a SIGFPE crash at the Python level), but a better solution would be
 * to treat integer division problems separately from FPU exceptions
 * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
 */
NPY_FINLINE npy_byte floor_div_BYTE(const npy_byte n, const npy_byte d)
{
    const int zero_divisor = (d == 0);
    if (NPY_UNLIKELY(zero_divisor || (n == NPY_MIN_BYTE && d == -1))) {
        if (!zero_divisor) {
            npy_set_floatstatus_overflow();
            return NPY_MIN_BYTE;
        }
        npy_set_floatstatus_divbyzero();
        return 0;
    }
    // C division truncates towards zero; go one step further down when
    // exactly one operand is positive and the division left a remainder.
    const npy_byte truncated = n / d;
    const int one_positive = (n > 0) != (d > 0);
    if (one_positive && (truncated * d) != n) {
        return (npy_byte)(truncated - 1);
    }
    return truncated;
}

/*
 * `divide` (floor division) inner loop for npy_byte; the exported symbol name
 * is produced by NPY_CPU_DISPATCH_CURFX.
 *
 * The first matching branch handles the whole call:
 *   1. IS_BINARY_REDUCE: reduction into a single accumulator;
 *   2. IS_BLOCKABLE_BINARY (NPY_HAVE_VSX4 builds only): array-by-array SIMD
 *      kernel;
 *   3. IS_BLOCKABLE_BINARY_SCALAR2 with a non-zero divisor: SIMD division by
 *      a scalar;
 *   4. otherwise: element-by-element loop through floor_div_BYTE().
 * Branches 2 and 3 are compiled only when NPY_SIMD is non-zero and
 * TO_SIMD_SFX is defined.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        // io1, ip2 and iop1 are not declared in this function; they come from
        // the reduce-loop macro. Each divisor read through ip2 is folded into
        // the running value io1, which is stored once after the loop.
        BINARY_REDUCE_LOOP(npy_byte) {
            io1 = floor_div_BYTE(io1, *(npy_byte*)ip2);
        }
        *((npy_byte *)iop1) = io1;
    }
#if NPY_SIMD && defined(TO_SIMD_SFX)
#if defined(NPY_HAVE_VSX4)
    // both arguments are arrays of the same size
    else if (IS_BLOCKABLE_BINARY(sizeof(npy_byte), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
    }
#endif
    // for contiguous block of memory, divisor is a scalar and not 0
    // (a zero divisor falls through to the generic loop, where
    // floor_div_BYTE() reports the division by zero)
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_byte), NPY_SIMD_WIDTH) &&
             (*(npy_byte *)args[1]) != 0) {
        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
    }
#endif
    else {
        // Generic fallback; ip1, ip2 and op1 are supplied by BINARY_LOOP.
        BINARY_LOOP {
            *((npy_byte *)op1) = floor_div_BYTE(*(npy_byte*)ip1, *(npy_byte*)ip2);
        }
    }
}

/*
 * Indexed in-place floor division for npy_byte. For each of the
 * dimensions[0] (index, divisor) pairs, the element of the array at args[0]
 * selected by the index is replaced by floor_div_BYTE(element, divisor).
 *
 * args[1]/steps[1] walk the npy_intp indices, args[2]/steps[2] walk the
 * divisors, steps[0] is the byte stride of the indexed array and steps[3] is
 * added to negative indices. Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_divide_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining = dimensions[0];

    for (; remaining > 0;
         --remaining, idx_cursor += idx_stride, val_cursor += val_stride) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        if (pos < 0) {
            // negative index: offset by steps[3]
            pos += wrap;
        }
        npy_byte *const slot = (npy_byte *)(base + elem_stride * pos);
        *slot = floor_div_BYTE(*slot, *(npy_byte *)val_cursor);
    }
    return 0;
}


#line 340
/*
 * Select the signed NPYV suffix (_s8/_s16/_s32/_s64) whose lane width equals
 * NPY_BITSOF_SHORT, so that TO_SIMD_SFX(name) expands to the matching SIMD
 * kernel name. `#if 0` merely opens the chain so that each generated width
 * test can be an `#elif`. If no width matches, TO_SIMD_SFX stays undefined.
 */
#undef TO_SIMD_SFX
#if 0
#line 345
#elif NPY_BITSOF_SHORT == 8
    #define TO_SIMD_SFX(X) X##_s8

#line 345
#elif NPY_BITSOF_SHORT == 16
    #define TO_SIMD_SFX(X) X##_s16

#line 345
#elif NPY_BITSOF_SHORT == 32
    #define TO_SIMD_SFX(X) X##_s32

#line 345
#elif NPY_BITSOF_SHORT == 64
    #define TO_SIMD_SFX(X) X##_s64

#endif
/*
 * Drop the 64-bit SIMD suffix again when SIMD_DISABLE_DIV64_OPT is defined
 * (presumably set earlier in this file for targets where 64-bit SIMD division
 * is not worthwhile - confirm at its definition). The divide loop below only
 * compiles its SIMD branches when TO_SIMD_SFX is defined.
 */
#if NPY_BITSOF_SHORT == 64 && defined(SIMD_DISABLE_DIV64_OPT)
    #undef TO_SIMD_SFX
#endif

/*
 * Floor division for npy_short: returns n / d rounded towards negative
 * infinity.
 *
 * Special cases:
 *   - d == 0 sets the floating-point divide-by-zero status and returns 0;
 *   - n == NPY_MIN_SHORT with d == -1 sets the overflow status and returns
 *     NPY_MIN_SHORT.
 *
 * FIXME: On x86 at least, dividing the smallest representable integer by -1
 * causes a SIGFPE (division overflow). That case is intercepted here (to
 * avoid a SIGFPE crash at the Python level), but a better solution would be
 * to treat integer division problems separately from FPU exceptions
 * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
 */
NPY_FINLINE npy_short floor_div_SHORT(const npy_short n, const npy_short d)
{
    const int zero_divisor = (d == 0);
    if (NPY_UNLIKELY(zero_divisor || (n == NPY_MIN_SHORT && d == -1))) {
        if (!zero_divisor) {
            npy_set_floatstatus_overflow();
            return NPY_MIN_SHORT;
        }
        npy_set_floatstatus_divbyzero();
        return 0;
    }
    // C division truncates towards zero; go one step further down when
    // exactly one operand is positive and the division left a remainder.
    const npy_short truncated = n / d;
    const int one_positive = (n > 0) != (d > 0);
    if (one_positive && (truncated * d) != n) {
        return (npy_short)(truncated - 1);
    }
    return truncated;
}

/*
 * `divide` (floor division) inner loop for npy_short; the exported symbol
 * name is produced by NPY_CPU_DISPATCH_CURFX.
 *
 * The first matching branch handles the whole call:
 *   1. IS_BINARY_REDUCE: reduction into a single accumulator;
 *   2. IS_BLOCKABLE_BINARY (NPY_HAVE_VSX4 builds only): array-by-array SIMD
 *      kernel;
 *   3. IS_BLOCKABLE_BINARY_SCALAR2 with a non-zero divisor: SIMD division by
 *      a scalar;
 *   4. otherwise: element-by-element loop through floor_div_SHORT().
 * Branches 2 and 3 are compiled only when NPY_SIMD is non-zero and
 * TO_SIMD_SFX is defined.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        // io1, ip2 and iop1 are not declared in this function; they come from
        // the reduce-loop macro. Each divisor read through ip2 is folded into
        // the running value io1, which is stored once after the loop.
        BINARY_REDUCE_LOOP(npy_short) {
            io1 = floor_div_SHORT(io1, *(npy_short*)ip2);
        }
        *((npy_short *)iop1) = io1;
    }
#if NPY_SIMD && defined(TO_SIMD_SFX)
#if defined(NPY_HAVE_VSX4)
    // both arguments are arrays of the same size
    else if (IS_BLOCKABLE_BINARY(sizeof(npy_short), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
    }
#endif
    // for contiguous block of memory, divisor is a scalar and not 0
    // (a zero divisor falls through to the generic loop, where
    // floor_div_SHORT() reports the division by zero)
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_short), NPY_SIMD_WIDTH) &&
             (*(npy_short *)args[1]) != 0) {
        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
    }
#endif
    else {
        // Generic fallback; ip1, ip2 and op1 are supplied by BINARY_LOOP.
        BINARY_LOOP {
            *((npy_short *)op1) = floor_div_SHORT(*(npy_short*)ip1, *(npy_short*)ip2);
        }
    }
}

/*
 * Indexed in-place floor division for npy_short. For each of the
 * dimensions[0] (index, divisor) pairs, the element of the array at args[0]
 * selected by the index is replaced by floor_div_SHORT(element, divisor).
 *
 * args[1]/steps[1] walk the npy_intp indices, args[2]/steps[2] walk the
 * divisors, steps[0] is the byte stride of the indexed array and steps[3] is
 * added to negative indices. Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_divide_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining = dimensions[0];

    for (; remaining > 0;
         --remaining, idx_cursor += idx_stride, val_cursor += val_stride) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        if (pos < 0) {
            // negative index: offset by steps[3]
            pos += wrap;
        }
        npy_short *const slot = (npy_short *)(base + elem_stride * pos);
        *slot = floor_div_SHORT(*slot, *(npy_short *)val_cursor);
    }
    return 0;
}


#line 340
/*
 * Select the signed NPYV suffix (_s8/_s16/_s32/_s64) whose lane width equals
 * NPY_BITSOF_INT, so that TO_SIMD_SFX(name) expands to the matching SIMD
 * kernel name. `#if 0` merely opens the chain so that each generated width
 * test can be an `#elif`. If no width matches, TO_SIMD_SFX stays undefined.
 */
#undef TO_SIMD_SFX
#if 0
#line 345
#elif NPY_BITSOF_INT == 8
    #define TO_SIMD_SFX(X) X##_s8

#line 345
#elif NPY_BITSOF_INT == 16
    #define TO_SIMD_SFX(X) X##_s16

#line 345
#elif NPY_BITSOF_INT == 32
    #define TO_SIMD_SFX(X) X##_s32

#line 345
#elif NPY_BITSOF_INT == 64
    #define TO_SIMD_SFX(X) X##_s64

#endif
/*
 * Drop the 64-bit SIMD suffix again when SIMD_DISABLE_DIV64_OPT is defined
 * (presumably set earlier in this file for targets where 64-bit SIMD division
 * is not worthwhile - confirm at its definition). The divide loop below only
 * compiles its SIMD branches when TO_SIMD_SFX is defined.
 */
#if NPY_BITSOF_INT == 64 && defined(SIMD_DISABLE_DIV64_OPT)
    #undef TO_SIMD_SFX
#endif

/*
 * Floor division for npy_int: returns n / d rounded towards negative
 * infinity.
 *
 * Special cases:
 *   - d == 0 sets the floating-point divide-by-zero status and returns 0;
 *   - n == NPY_MIN_INT with d == -1 sets the overflow status and returns
 *     NPY_MIN_INT.
 *
 * FIXME: On x86 at least, dividing the smallest representable integer by -1
 * causes a SIGFPE (division overflow). That case is intercepted here (to
 * avoid a SIGFPE crash at the Python level), but a better solution would be
 * to treat integer division problems separately from FPU exceptions
 * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
 */
NPY_FINLINE npy_int floor_div_INT(const npy_int n, const npy_int d)
{
    const int zero_divisor = (d == 0);
    if (NPY_UNLIKELY(zero_divisor || (n == NPY_MIN_INT && d == -1))) {
        if (!zero_divisor) {
            npy_set_floatstatus_overflow();
            return NPY_MIN_INT;
        }
        npy_set_floatstatus_divbyzero();
        return 0;
    }
    // C division truncates towards zero; go one step further down when
    // exactly one operand is positive and the division left a remainder.
    const npy_int truncated = n / d;
    const int one_positive = (n > 0) != (d > 0);
    if (one_positive && (truncated * d) != n) {
        return (npy_int)(truncated - 1);
    }
    return truncated;
}

/*
 * `divide` (floor division) inner loop for npy_int; the exported symbol name
 * is produced by NPY_CPU_DISPATCH_CURFX.
 *
 * The first matching branch handles the whole call:
 *   1. IS_BINARY_REDUCE: reduction into a single accumulator;
 *   2. IS_BLOCKABLE_BINARY (NPY_HAVE_VSX4 builds only): array-by-array SIMD
 *      kernel;
 *   3. IS_BLOCKABLE_BINARY_SCALAR2 with a non-zero divisor: SIMD division by
 *      a scalar;
 *   4. otherwise: element-by-element loop through floor_div_INT().
 * Branches 2 and 3 are compiled only when NPY_SIMD is non-zero and
 * TO_SIMD_SFX is defined.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        // io1, ip2 and iop1 are not declared in this function; they come from
        // the reduce-loop macro. Each divisor read through ip2 is folded into
        // the running value io1, which is stored once after the loop.
        BINARY_REDUCE_LOOP(npy_int) {
            io1 = floor_div_INT(io1, *(npy_int*)ip2);
        }
        *((npy_int *)iop1) = io1;
    }
#if NPY_SIMD && defined(TO_SIMD_SFX)
#if defined(NPY_HAVE_VSX4)
    // both arguments are arrays of the same size
    else if (IS_BLOCKABLE_BINARY(sizeof(npy_int), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
    }
#endif
    // for contiguous block of memory, divisor is a scalar and not 0
    // (a zero divisor falls through to the generic loop, where
    // floor_div_INT() reports the division by zero)
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_int), NPY_SIMD_WIDTH) &&
             (*(npy_int *)args[1]) != 0) {
        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
    }
#endif
    else {
        // Generic fallback; ip1, ip2 and op1 are supplied by BINARY_LOOP.
        BINARY_LOOP {
            *((npy_int *)op1) = floor_div_INT(*(npy_int*)ip1, *(npy_int*)ip2);
        }
    }
}

/*
 * Indexed in-place floor division for npy_int. For each of the
 * dimensions[0] (index, divisor) pairs, the element of the array at args[0]
 * selected by the index is replaced by floor_div_INT(element, divisor).
 *
 * args[1]/steps[1] walk the npy_intp indices, args[2]/steps[2] walk the
 * divisors, steps[0] is the byte stride of the indexed array and steps[3] is
 * added to negative indices. Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_divide_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining = dimensions[0];

    for (; remaining > 0;
         --remaining, idx_cursor += idx_stride, val_cursor += val_stride) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        if (pos < 0) {
            // negative index: offset by steps[3]
            pos += wrap;
        }
        npy_int *const slot = (npy_int *)(base + elem_stride * pos);
        *slot = floor_div_INT(*slot, *(npy_int *)val_cursor);
    }
    return 0;
}


#line 340
/*
 * Select the signed NPYV suffix (_s8/_s16/_s32/_s64) whose lane width equals
 * NPY_BITSOF_LONG, so that TO_SIMD_SFX(name) expands to the matching SIMD
 * kernel name. `#if 0` merely opens the chain so that each generated width
 * test can be an `#elif`. If no width matches, TO_SIMD_SFX stays undefined.
 */
#undef TO_SIMD_SFX
#if 0
#line 345
#elif NPY_BITSOF_LONG == 8
    #define TO_SIMD_SFX(X) X##_s8

#line 345
#elif NPY_BITSOF_LONG == 16
    #define TO_SIMD_SFX(X) X##_s16

#line 345
#elif NPY_BITSOF_LONG == 32
    #define TO_SIMD_SFX(X) X##_s32

#line 345
#elif NPY_BITSOF_LONG == 64
    #define TO_SIMD_SFX(X) X##_s64

#endif
/*
 * Drop the 64-bit SIMD suffix again when SIMD_DISABLE_DIV64_OPT is defined
 * (presumably set earlier in this file for targets where 64-bit SIMD division
 * is not worthwhile - confirm at its definition). The divide loop below only
 * compiles its SIMD branches when TO_SIMD_SFX is defined.
 */
#if NPY_BITSOF_LONG == 64 && defined(SIMD_DISABLE_DIV64_OPT)
    #undef TO_SIMD_SFX
#endif

/*
 * Floor division for npy_long: returns n / d rounded towards negative
 * infinity.
 *
 * Special cases:
 *   - d == 0 sets the floating-point divide-by-zero status and returns 0;
 *   - n == NPY_MIN_LONG with d == -1 sets the overflow status and returns
 *     NPY_MIN_LONG.
 *
 * FIXME: On x86 at least, dividing the smallest representable integer by -1
 * causes a SIGFPE (division overflow). That case is intercepted here (to
 * avoid a SIGFPE crash at the Python level), but a better solution would be
 * to treat integer division problems separately from FPU exceptions
 * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
 */
NPY_FINLINE npy_long floor_div_LONG(const npy_long n, const npy_long d)
{
    const int zero_divisor = (d == 0);
    if (NPY_UNLIKELY(zero_divisor || (n == NPY_MIN_LONG && d == -1))) {
        if (!zero_divisor) {
            npy_set_floatstatus_overflow();
            return NPY_MIN_LONG;
        }
        npy_set_floatstatus_divbyzero();
        return 0;
    }
    // C division truncates towards zero; go one step further down when
    // exactly one operand is positive and the division left a remainder.
    const npy_long truncated = n / d;
    const int one_positive = (n > 0) != (d > 0);
    if (one_positive && (truncated * d) != n) {
        return (npy_long)(truncated - 1);
    }
    return truncated;
}

/*
 * `divide` (floor division) inner loop for npy_long; the exported symbol name
 * is produced by NPY_CPU_DISPATCH_CURFX.
 *
 * The first matching branch handles the whole call:
 *   1. IS_BINARY_REDUCE: reduction into a single accumulator;
 *   2. IS_BLOCKABLE_BINARY (NPY_HAVE_VSX4 builds only): array-by-array SIMD
 *      kernel;
 *   3. IS_BLOCKABLE_BINARY_SCALAR2 with a non-zero divisor: SIMD division by
 *      a scalar;
 *   4. otherwise: element-by-element loop through floor_div_LONG().
 * Branches 2 and 3 are compiled only when NPY_SIMD is non-zero and
 * TO_SIMD_SFX is defined.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        // io1, ip2 and iop1 are not declared in this function; they come from
        // the reduce-loop macro. Each divisor read through ip2 is folded into
        // the running value io1, which is stored once after the loop.
        BINARY_REDUCE_LOOP(npy_long) {
            io1 = floor_div_LONG(io1, *(npy_long*)ip2);
        }
        *((npy_long *)iop1) = io1;
    }
#if NPY_SIMD && defined(TO_SIMD_SFX)
#if defined(NPY_HAVE_VSX4)
    // both arguments are arrays of the same size
    else if (IS_BLOCKABLE_BINARY(sizeof(npy_long), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
    }
#endif
    // for contiguous block of memory, divisor is a scalar and not 0
    // (a zero divisor falls through to the generic loop, where
    // floor_div_LONG() reports the division by zero)
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_long), NPY_SIMD_WIDTH) &&
             (*(npy_long *)args[1]) != 0) {
        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
    }
#endif
    else {
        // Generic fallback; ip1, ip2 and op1 are supplied by BINARY_LOOP.
        BINARY_LOOP {
            *((npy_long *)op1) = floor_div_LONG(*(npy_long*)ip1, *(npy_long*)ip2);
        }
    }
}

/*
 * Indexed in-place floor division for npy_long. For each of the
 * dimensions[0] (index, divisor) pairs, the element of the array at args[0]
 * selected by the index is replaced by floor_div_LONG(element, divisor).
 *
 * args[1]/steps[1] walk the npy_intp indices, args[2]/steps[2] walk the
 * divisors, steps[0] is the byte stride of the indexed array and steps[3] is
 * added to negative indices. Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_divide_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining = dimensions[0];

    for (; remaining > 0;
         --remaining, idx_cursor += idx_stride, val_cursor += val_stride) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        if (pos < 0) {
            // negative index: offset by steps[3]
            pos += wrap;
        }
        npy_long *const slot = (npy_long *)(base + elem_stride * pos);
        *slot = floor_div_LONG(*slot, *(npy_long *)val_cursor);
    }
    return 0;
}


#line 340
/*
 * Select the signed NPYV suffix (_s8/_s16/_s32/_s64) whose lane width equals
 * NPY_BITSOF_LONGLONG, so that TO_SIMD_SFX(name) expands to the matching SIMD
 * kernel name. `#if 0` merely opens the chain so that each generated width
 * test can be an `#elif`. If no width matches, TO_SIMD_SFX stays undefined.
 */
#undef TO_SIMD_SFX
#if 0
#line 345
#elif NPY_BITSOF_LONGLONG == 8
    #define TO_SIMD_SFX(X) X##_s8

#line 345
#elif NPY_BITSOF_LONGLONG == 16
    #define TO_SIMD_SFX(X) X##_s16

#line 345
#elif NPY_BITSOF_LONGLONG == 32
    #define TO_SIMD_SFX(X) X##_s32

#line 345
#elif NPY_BITSOF_LONGLONG == 64
    #define TO_SIMD_SFX(X) X##_s64

#endif
/*
 * Drop the 64-bit SIMD suffix again when SIMD_DISABLE_DIV64_OPT is defined
 * (presumably set earlier in this file for targets where 64-bit SIMD division
 * is not worthwhile - confirm at its definition). The divide loop below only
 * compiles its SIMD branches when TO_SIMD_SFX is defined.
 */
#if NPY_BITSOF_LONGLONG == 64 && defined(SIMD_DISABLE_DIV64_OPT)
    #undef TO_SIMD_SFX
#endif

/*
 * Floor division for npy_longlong: returns n / d rounded towards negative
 * infinity.
 *
 * Special cases:
 *   - d == 0 sets the floating-point divide-by-zero status and returns 0;
 *   - n == NPY_MIN_LONGLONG with d == -1 sets the overflow status and
 *     returns NPY_MIN_LONGLONG.
 *
 * FIXME: On x86 at least, dividing the smallest representable integer by -1
 * causes a SIGFPE (division overflow). That case is intercepted here (to
 * avoid a SIGFPE crash at the Python level), but a better solution would be
 * to treat integer division problems separately from FPU exceptions
 * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
 */
NPY_FINLINE npy_longlong floor_div_LONGLONG(const npy_longlong n, const npy_longlong d)
{
    const int zero_divisor = (d == 0);
    if (NPY_UNLIKELY(zero_divisor || (n == NPY_MIN_LONGLONG && d == -1))) {
        if (!zero_divisor) {
            npy_set_floatstatus_overflow();
            return NPY_MIN_LONGLONG;
        }
        npy_set_floatstatus_divbyzero();
        return 0;
    }
    // C division truncates towards zero; go one step further down when
    // exactly one operand is positive and the division left a remainder.
    const npy_longlong truncated = n / d;
    const int one_positive = (n > 0) != (d > 0);
    if (one_positive && (truncated * d) != n) {
        return (npy_longlong)(truncated - 1);
    }
    return truncated;
}

/*
 * `divide` (floor division) inner loop for npy_longlong; the exported symbol
 * name is produced by NPY_CPU_DISPATCH_CURFX.
 *
 * The first matching branch handles the whole call:
 *   1. IS_BINARY_REDUCE: reduction into a single accumulator;
 *   2. IS_BLOCKABLE_BINARY (NPY_HAVE_VSX4 builds only): array-by-array SIMD
 *      kernel;
 *   3. IS_BLOCKABLE_BINARY_SCALAR2 with a non-zero divisor: SIMD division by
 *      a scalar;
 *   4. otherwise: element-by-element loop through floor_div_LONGLONG().
 * Branches 2 and 3 are compiled only when NPY_SIMD is non-zero and
 * TO_SIMD_SFX is defined.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        // io1, ip2 and iop1 are not declared in this function; they come from
        // the reduce-loop macro. Each divisor read through ip2 is folded into
        // the running value io1, which is stored once after the loop.
        BINARY_REDUCE_LOOP(npy_longlong) {
            io1 = floor_div_LONGLONG(io1, *(npy_longlong*)ip2);
        }
        *((npy_longlong *)iop1) = io1;
    }
#if NPY_SIMD && defined(TO_SIMD_SFX)
#if defined(NPY_HAVE_VSX4)
    // both arguments are arrays of the same size
    else if (IS_BLOCKABLE_BINARY(sizeof(npy_longlong), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
    }
#endif
    // for contiguous block of memory, divisor is a scalar and not 0
    // (a zero divisor falls through to the generic loop, where
    // floor_div_LONGLONG() reports the division by zero)
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_longlong), NPY_SIMD_WIDTH) &&
             (*(npy_longlong *)args[1]) != 0) {
        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
    }
#endif
    else {
        // Generic fallback; ip1, ip2 and op1 are supplied by BINARY_LOOP.
        BINARY_LOOP {
            *((npy_longlong *)op1) = floor_div_LONGLONG(*(npy_longlong*)ip1, *(npy_longlong*)ip2);
        }
    }
}

/*
 * Indexed in-place floor division for npy_longlong. For each of the
 * dimensions[0] (index, divisor) pairs, the element of the array at args[0]
 * selected by the index is replaced by floor_div_LONGLONG(element, divisor).
 *
 * args[1]/steps[1] walk the npy_intp indices, args[2]/steps[2] walk the
 * divisors, steps[0] is the byte stride of the indexed array and steps[3] is
 * added to negative indices. Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_divide_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining = dimensions[0];

    for (; remaining > 0;
         --remaining, idx_cursor += idx_stride, val_cursor += val_stride) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        if (pos < 0) {
            // negative index: offset by steps[3]
            pos += wrap;
        }
        npy_longlong *const slot = (npy_longlong *)(base + elem_stride * pos);
        *slot = floor_div_LONGLONG(*slot, *(npy_longlong *)val_cursor);
    }
    return 0;
}



#line 439
/*
 * Select the unsigned NPYV suffix (_u8/_u16/_u32/_u64) whose lane width
 * equals NPY_BITSOF_BYTE, so that TO_SIMD_SFX(name) expands to the matching
 * SIMD kernel name. `#if 0` merely opens the chain so that each generated
 * width test can be an `#elif`. If no width matches, TO_SIMD_SFX stays
 * undefined.
 */
#undef TO_SIMD_SFX
#if 0
#line 444
#elif NPY_BITSOF_BYTE == 8
    #define TO_SIMD_SFX(X) X##_u8

#line 444
#elif NPY_BITSOF_BYTE == 16
    #define TO_SIMD_SFX(X) X##_u16

#line 444
#elif NPY_BITSOF_BYTE == 32
    #define TO_SIMD_SFX(X) X##_u32

#line 444
#elif NPY_BITSOF_BYTE == 64
    #define TO_SIMD_SFX(X) X##_u64

#endif
/*
 * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV falls back to scalar division
 * because emulating multiply-high on these architectures is expensive compared
 * to the native scalar dividers.
 * Therefore it is better to disable NPYV in this special case to avoid any unnecessary shuffles.
 * Power10 (VSX4) is an exception here since it has native support for integer vector division.
 */
#if NPY_BITSOF_BYTE == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
    #undef TO_SIMD_SFX
#endif
/*
 * `divide` inner loop for npy_ubyte; the exported symbol name is produced by
 * NPY_CPU_DISPATCH_CURFX. In every scalar path a zero divisor sets the
 * floating-point divide-by-zero status and yields 0.
 *
 * The first matching branch handles the whole call:
 *   1. IS_BINARY_REDUCE: reduction into a single accumulator;
 *   2. IS_BLOCKABLE_BINARY (NPY_HAVE_VSX4 builds only): array-by-array SIMD
 *      kernel;
 *   3. IS_BLOCKABLE_BINARY_SCALAR2 with a non-zero divisor: SIMD division by
 *      a scalar;
 *   4. otherwise: element-by-element loop.
 * Branches 2 and 3 are compiled only when NPY_SIMD is non-zero and
 * TO_SIMD_SFX is defined.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        // io1, ip2 and iop1 are not declared in this function; they come from
        // the reduce-loop macro. The running value io1 is divided by each
        // divisor read through ip2 and stored once after the loop.
        BINARY_REDUCE_LOOP(npy_ubyte) {
            const npy_ubyte d = *(npy_ubyte *)ip2;
            if (NPY_UNLIKELY(d == 0)) {
                // a zero divisor resets the accumulator to 0
                npy_set_floatstatus_divbyzero();
                io1 = 0;
            } else {
                io1 /= d;
            }
        }
        *((npy_ubyte *)iop1) = io1;
    }
#if NPY_SIMD && defined(TO_SIMD_SFX)
#if defined(NPY_HAVE_VSX4)
    // both arguments are arrays of the same size
    else if (IS_BLOCKABLE_BINARY(sizeof(npy_ubyte), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
    }
#endif
    // for contiguous block of memory, divisor is a scalar and not 0
    // (a zero divisor falls through to the generic loop below, which
    // reports the division by zero)
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ubyte), NPY_SIMD_WIDTH) &&
             (*(npy_ubyte *)args[1]) != 0) {
        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
    }
#endif
    else {
        // Generic fallback; ip1, ip2 and op1 are supplied by BINARY_LOOP.
        BINARY_LOOP {
            const npy_ubyte in1 = *(npy_ubyte *)ip1;
            const npy_ubyte in2 = *(npy_ubyte *)ip2;
            if (NPY_UNLIKELY(in2 == 0)) {
                npy_set_floatstatus_divbyzero();
                *((npy_ubyte *)op1) = 0;
            } else{
                *((npy_ubyte *)op1) = in1 / in2;
            }
        }
    }
}

/*
 * Indexed in-place division for npy_ubyte. For each of the dimensions[0]
 * (index, divisor) pairs, the element of the array at args[0] selected by
 * the index is divided by the divisor; a zero divisor sets the
 * floating-point divide-by-zero status and stores 0 instead.
 *
 * args[1]/steps[1] walk the npy_intp indices, args[2]/steps[2] walk the
 * divisors, steps[0] is the byte stride of the indexed array and steps[3] is
 * added to negative indices. Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_divide_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining = dimensions[0];

    for (; remaining > 0;
         --remaining, idx_cursor += idx_stride, val_cursor += val_stride) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        if (pos < 0) {
            // negative index: offset by steps[3]
            pos += wrap;
        }
        npy_ubyte *const slot = (npy_ubyte *)(base + elem_stride * pos);
        const npy_ubyte divisor = *(npy_ubyte *)val_cursor;
        if (NPY_UNLIKELY(divisor == 0)) {
            npy_set_floatstatus_divbyzero();
            *slot = 0;
            continue;
        }
        *slot = *slot / divisor;
    }
    return 0;
}


#line 439
/*
 * Select the unsigned NPYV suffix (_u8/_u16/_u32/_u64) whose lane width
 * equals NPY_BITSOF_SHORT, so that TO_SIMD_SFX(name) expands to the matching
 * SIMD kernel name. `#if 0` merely opens the chain so that each generated
 * width test can be an `#elif`. If no width matches, TO_SIMD_SFX stays
 * undefined.
 */
#undef TO_SIMD_SFX
#if 0
#line 444
#elif NPY_BITSOF_SHORT == 8
    #define TO_SIMD_SFX(X) X##_u8

#line 444
#elif NPY_BITSOF_SHORT == 16
    #define TO_SIMD_SFX(X) X##_u16

#line 444
#elif NPY_BITSOF_SHORT == 32
    #define TO_SIMD_SFX(X) X##_u32

#line 444
#elif NPY_BITSOF_SHORT == 64
    #define TO_SIMD_SFX(X) X##_u64

#endif
/*
 * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV falls back to scalar division
 * because emulating multiply-high on these architectures is expensive compared
 * to the native scalar dividers.
 * Therefore it is better to disable NPYV in this special case to avoid any unnecessary shuffles.
 * Power10 (VSX4) is an exception here since it has native support for integer vector division.
 */
#if NPY_BITSOF_SHORT == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
    #undef TO_SIMD_SFX
#endif
/*
 * `divide` inner loop for npy_ushort; the exported symbol name is produced by
 * NPY_CPU_DISPATCH_CURFX. In every scalar path a zero divisor sets the
 * floating-point divide-by-zero status and yields 0.
 *
 * The first matching branch handles the whole call:
 *   1. IS_BINARY_REDUCE: reduction into a single accumulator;
 *   2. IS_BLOCKABLE_BINARY (NPY_HAVE_VSX4 builds only): array-by-array SIMD
 *      kernel;
 *   3. IS_BLOCKABLE_BINARY_SCALAR2 with a non-zero divisor: SIMD division by
 *      a scalar;
 *   4. otherwise: element-by-element loop.
 * Branches 2 and 3 are compiled only when NPY_SIMD is non-zero and
 * TO_SIMD_SFX is defined.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        // io1, ip2 and iop1 are not declared in this function; they come from
        // the reduce-loop macro. The running value io1 is divided by each
        // divisor read through ip2 and stored once after the loop.
        BINARY_REDUCE_LOOP(npy_ushort) {
            const npy_ushort d = *(npy_ushort *)ip2;
            if (NPY_UNLIKELY(d == 0)) {
                // a zero divisor resets the accumulator to 0
                npy_set_floatstatus_divbyzero();
                io1 = 0;
            } else {
                io1 /= d;
            }
        }
        *((npy_ushort *)iop1) = io1;
    }
#if NPY_SIMD && defined(TO_SIMD_SFX)
#if defined(NPY_HAVE_VSX4)
    // both arguments are arrays of the same size
    else if (IS_BLOCKABLE_BINARY(sizeof(npy_ushort), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
    }
#endif
    // for contiguous block of memory, divisor is a scalar and not 0
    // (a zero divisor falls through to the generic loop below, which
    // reports the division by zero)
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ushort), NPY_SIMD_WIDTH) &&
             (*(npy_ushort *)args[1]) != 0) {
        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
    }
#endif
    else {
        // Generic fallback; ip1, ip2 and op1 are supplied by BINARY_LOOP.
        BINARY_LOOP {
            const npy_ushort in1 = *(npy_ushort *)ip1;
            const npy_ushort in2 = *(npy_ushort *)ip2;
            if (NPY_UNLIKELY(in2 == 0)) {
                npy_set_floatstatus_divbyzero();
                *((npy_ushort *)op1) = 0;
            } else{
                *((npy_ushort *)op1) = in1 / in2;
            }
        }
    }
}

/*
 * Indexed in-place division for npy_ushort. For each of the dimensions[0]
 * (index, divisor) pairs, the element of the array at args[0] selected by
 * the index is divided by the divisor; a zero divisor sets the
 * floating-point divide-by-zero status and stores 0 instead.
 *
 * args[1]/steps[1] walk the npy_intp indices, args[2]/steps[2] walk the
 * divisors, steps[0] is the byte stride of the indexed array and steps[3] is
 * added to negative indices. Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_divide_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining = dimensions[0];

    for (; remaining > 0;
         --remaining, idx_cursor += idx_stride, val_cursor += val_stride) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        if (pos < 0) {
            // negative index: offset by steps[3]
            pos += wrap;
        }
        npy_ushort *const slot = (npy_ushort *)(base + elem_stride * pos);
        const npy_ushort divisor = *(npy_ushort *)val_cursor;
        if (NPY_UNLIKELY(divisor == 0)) {
            npy_set_floatstatus_divbyzero();
            *slot = 0;
            continue;
        }
        *slot = *slot / divisor;
    }
    return 0;
}


#line 439
/*
 * Select the unsigned NPYV suffix (_u8/_u16/_u32/_u64) whose lane width
 * equals NPY_BITSOF_INT, so that TO_SIMD_SFX(name) expands to the matching
 * SIMD kernel name. `#if 0` merely opens the chain so that each generated
 * width test can be an `#elif`. If no width matches, TO_SIMD_SFX stays
 * undefined.
 */
#undef TO_SIMD_SFX
#if 0
#line 444
#elif NPY_BITSOF_INT == 8
    #define TO_SIMD_SFX(X) X##_u8

#line 444
#elif NPY_BITSOF_INT == 16
    #define TO_SIMD_SFX(X) X##_u16

#line 444
#elif NPY_BITSOF_INT == 32
    #define TO_SIMD_SFX(X) X##_u32

#line 444
#elif NPY_BITSOF_INT == 64
    #define TO_SIMD_SFX(X) X##_u64

#endif
/*
 * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV falls back to scalar division
 * because emulating multiply-high on these architectures is expensive compared
 * to the native scalar dividers.
 * Therefore it is better to disable NPYV in this special case to avoid any unnecessary shuffles.
 * Power10 (VSX4) is an exception here since it has native support for integer vector division.
 */
#if NPY_BITSOF_INT == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
    #undef TO_SIMD_SFX
#endif
/*
 * `divide` inner loop for npy_uint; the exported symbol name is produced by
 * NPY_CPU_DISPATCH_CURFX. In every scalar path a zero divisor sets the
 * floating-point divide-by-zero status and yields 0.
 *
 * The first matching branch handles the whole call:
 *   1. IS_BINARY_REDUCE: reduction into a single accumulator;
 *   2. IS_BLOCKABLE_BINARY (NPY_HAVE_VSX4 builds only): array-by-array SIMD
 *      kernel;
 *   3. IS_BLOCKABLE_BINARY_SCALAR2 with a non-zero divisor: SIMD division by
 *      a scalar;
 *   4. otherwise: element-by-element loop.
 * Branches 2 and 3 are compiled only when NPY_SIMD is non-zero and
 * TO_SIMD_SFX is defined.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        // io1, ip2 and iop1 are not declared in this function; they come from
        // the reduce-loop macro. The running value io1 is divided by each
        // divisor read through ip2 and stored once after the loop.
        BINARY_REDUCE_LOOP(npy_uint) {
            const npy_uint d = *(npy_uint *)ip2;
            if (NPY_UNLIKELY(d == 0)) {
                // a zero divisor resets the accumulator to 0
                npy_set_floatstatus_divbyzero();
                io1 = 0;
            } else {
                io1 /= d;
            }
        }
        *((npy_uint *)iop1) = io1;
    }
#if NPY_SIMD && defined(TO_SIMD_SFX)
#if defined(NPY_HAVE_VSX4)
    // both arguments are arrays of the same size
    else if (IS_BLOCKABLE_BINARY(sizeof(npy_uint), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
    }
#endif
    // for contiguous block of memory, divisor is a scalar and not 0
    // (a zero divisor falls through to the generic loop below, which
    // reports the division by zero)
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_uint), NPY_SIMD_WIDTH) &&
             (*(npy_uint *)args[1]) != 0) {
        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
    }
#endif
    else {
        // Generic fallback; ip1, ip2 and op1 are supplied by BINARY_LOOP.
        BINARY_LOOP {
            const npy_uint in1 = *(npy_uint *)ip1;
            const npy_uint in2 = *(npy_uint *)ip2;
            if (NPY_UNLIKELY(in2 == 0)) {
                npy_set_floatstatus_divbyzero();
                *((npy_uint *)op1) = 0;
            } else{
                *((npy_uint *)op1) = in1 / in2;
            }
        }
    }
}

/*
 * Indexed in-place division for npy_uint. For each of the dimensions[0]
 * (index, divisor) pairs, the element of the array at args[0] selected by
 * the index is divided by the divisor; a zero divisor sets the
 * floating-point divide-by-zero status and stores 0 instead.
 *
 * args[1]/steps[1] walk the npy_intp indices, args[2]/steps[2] walk the
 * divisors, steps[0] is the byte stride of the indexed array and steps[3] is
 * added to negative indices. Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_divide_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining = dimensions[0];

    for (; remaining > 0;
         --remaining, idx_cursor += idx_stride, val_cursor += val_stride) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        if (pos < 0) {
            // negative index: offset by steps[3]
            pos += wrap;
        }
        npy_uint *const slot = (npy_uint *)(base + elem_stride * pos);
        const npy_uint divisor = *(npy_uint *)val_cursor;
        if (NPY_UNLIKELY(divisor == 0)) {
            npy_set_floatstatus_divbyzero();
            *slot = 0;
            continue;
        }
        *slot = *slot / divisor;
    }
    return 0;
}


#line 439
/*
 * Select the unsigned NPYV suffix (_u8/_u16/_u32/_u64) whose lane width
 * equals NPY_BITSOF_LONG, so that TO_SIMD_SFX(name) expands to the matching
 * SIMD kernel name. `#if 0` merely opens the chain so that each generated
 * width test can be an `#elif`. If no width matches, TO_SIMD_SFX stays
 * undefined.
 */
#undef TO_SIMD_SFX
#if 0
#line 444
#elif NPY_BITSOF_LONG == 8
    #define TO_SIMD_SFX(X) X##_u8

#line 444
#elif NPY_BITSOF_LONG == 16
    #define TO_SIMD_SFX(X) X##_u16

#line 444
#elif NPY_BITSOF_LONG == 32
    #define TO_SIMD_SFX(X) X##_u32

#line 444
#elif NPY_BITSOF_LONG == 64
    #define TO_SIMD_SFX(X) X##_u64

#endif
/*
 * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV falls back to scalar division
 * because emulating multiply-high on these architectures is expensive compared
 * to the native scalar dividers.
 * Therefore it is better to disable NPYV in this special case to avoid any unnecessary shuffles.
 * Power10 (VSX4) is an exception here since it has native support for integer vector division.
 */
#if NPY_BITSOF_LONG == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
    #undef TO_SIMD_SFX
#endif
/*
 * `divide` inner loop for npy_ulong; the exported symbol name is produced by
 * NPY_CPU_DISPATCH_CURFX. In every scalar path a zero divisor sets the
 * floating-point divide-by-zero status and yields 0.
 *
 * The first matching branch handles the whole call:
 *   1. IS_BINARY_REDUCE: reduction into a single accumulator;
 *   2. IS_BLOCKABLE_BINARY (NPY_HAVE_VSX4 builds only): array-by-array SIMD
 *      kernel;
 *   3. IS_BLOCKABLE_BINARY_SCALAR2 with a non-zero divisor: SIMD division by
 *      a scalar;
 *   4. otherwise: element-by-element loop.
 * Branches 2 and 3 are compiled only when NPY_SIMD is non-zero and
 * TO_SIMD_SFX is defined.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        // io1, ip2 and iop1 are not declared in this function; they come from
        // the reduce-loop macro. The running value io1 is divided by each
        // divisor read through ip2 and stored once after the loop.
        BINARY_REDUCE_LOOP(npy_ulong) {
            const npy_ulong d = *(npy_ulong *)ip2;
            if (NPY_UNLIKELY(d == 0)) {
                // a zero divisor resets the accumulator to 0
                npy_set_floatstatus_divbyzero();
                io1 = 0;
            } else {
                io1 /= d;
            }
        }
        *((npy_ulong *)iop1) = io1;
    }
#if NPY_SIMD && defined(TO_SIMD_SFX)
#if defined(NPY_HAVE_VSX4)
    // both arguments are arrays of the same size
    else if (IS_BLOCKABLE_BINARY(sizeof(npy_ulong), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
    }
#endif
    // for contiguous block of memory, divisor is a scalar and not 0
    // (a zero divisor falls through to the generic loop below, which
    // reports the division by zero)
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulong), NPY_SIMD_WIDTH) &&
             (*(npy_ulong *)args[1]) != 0) {
        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
    }
#endif
    else {
        // Generic fallback; ip1, ip2 and op1 are supplied by BINARY_LOOP.
        BINARY_LOOP {
            const npy_ulong in1 = *(npy_ulong *)ip1;
            const npy_ulong in2 = *(npy_ulong *)ip2;
            if (NPY_UNLIKELY(in2 == 0)) {
                npy_set_floatstatus_divbyzero();
                *((npy_ulong *)op1) = 0;
            } else{
                *((npy_ulong *)op1) = in1 / in2;
            }
        }
    }
}

/*
 * Indexed in-place division for npy_ulong. For each of the dimensions[0]
 * (index, divisor) pairs, the element of the array at args[0] selected by
 * the index is divided by the divisor; a zero divisor sets the
 * floating-point divide-by-zero status and stores 0 instead.
 *
 * args[1]/steps[1] walk the npy_intp indices, args[2]/steps[2] walk the
 * divisors, steps[0] is the byte stride of the indexed array and steps[3] is
 * added to negative indices. Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_divide_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining = dimensions[0];

    for (; remaining > 0;
         --remaining, idx_cursor += idx_stride, val_cursor += val_stride) {
        npy_intp pos = *(npy_intp *)idx_cursor;
        if (pos < 0) {
            // negative index: offset by steps[3]
            pos += wrap;
        }
        npy_ulong *const slot = (npy_ulong *)(base + elem_stride * pos);
        const npy_ulong divisor = *(npy_ulong *)val_cursor;
        if (NPY_UNLIKELY(divisor == 0)) {
            npy_set_floatstatus_divbyzero();
            *slot = 0;
            continue;
        }
        *slot = *slot / divisor;
    }
    return 0;
}


#line 439
/*
 * Select the unsigned NPYV suffix (_u8/_u16/_u32/_u64) whose lane width
 * equals NPY_BITSOF_LONGLONG, so that TO_SIMD_SFX(name) expands to the
 * matching SIMD kernel name. `#if 0` merely opens the chain so that each
 * generated width test can be an `#elif`. If no width matches, TO_SIMD_SFX
 * stays undefined.
 */
#undef TO_SIMD_SFX
#if 0
#line 444
#elif NPY_BITSOF_LONGLONG == 8
    #define TO_SIMD_SFX(X) X##_u8

#line 444
#elif NPY_BITSOF_LONGLONG == 16
    #define TO_SIMD_SFX(X) X##_u16

#line 444
#elif NPY_BITSOF_LONGLONG == 32
    #define TO_SIMD_SFX(X) X##_u32

#line 444
#elif NPY_BITSOF_LONGLONG == 64
    #define TO_SIMD_SFX(X) X##_u64

#endif
/*
 * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV falls back to scalar division
 * because emulating multiply-high on these architectures is expensive compared
 * to the native scalar dividers.
 * Therefore it is better to disable NPYV in this special case to avoid any unnecessary shuffles.
 * Power10 (VSX4) is an exception here since it has native support for integer vector division.
 */
#if NPY_BITSOF_LONGLONG == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
    #undef TO_SIMD_SFX
#endif
/*
 * Inner loop for npy_ulonglong division.
 *
 * Exactly one of the following paths runs, tried in this order:
 *   1. IS_BINARY_REDUCE: fold every divisor into a single accumulator and
 *      write the accumulator back once at the end.
 *   2. (only when NPY_SIMD is enabled and TO_SIMD_SFX is defined)
 *      a. on VSX4, a blockable array/array case is handed to the
 *         vsx4_simd_divide_contig kernel;
 *      b. a blockable case whose divisor is a single non-zero scalar is
 *         handed to the simd_divide_by_scalar_contig kernel.
 *   3. Otherwise a plain element-wise scalar loop.
 *
 * In the scalar paths a zero divisor calls npy_set_floatstatus_divbyzero()
 * and produces 0 instead of dividing.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        BINARY_REDUCE_LOOP(npy_ulonglong) {
            const npy_ulonglong divisor = *(npy_ulonglong *)ip2;
            if (NPY_UNLIKELY(divisor == 0)) {
                npy_set_floatstatus_divbyzero();
                io1 = 0;
            }
            else {
                io1 = io1 / divisor;
            }
        }
        /* store the reduced value back only after the whole pass */
        *((npy_ulonglong *)iop1) = io1;
        return;
    }

#if NPY_SIMD && defined(TO_SIMD_SFX)
#if defined(NPY_HAVE_VSX4)
    /* both operands are arrays of the same size */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_ulonglong), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
        return;
    }
#endif
    /* contiguous block of memory divided by a scalar that is not 0 */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulonglong), NPY_SIMD_WIDTH) &&
            (*(npy_ulonglong *)args[1]) != 0) {
        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif

    /* generic fallback: one scalar division per element */
    BINARY_LOOP {
        const npy_ulonglong dividend = *(npy_ulonglong *)ip1;
        const npy_ulonglong divisor = *(npy_ulonglong *)ip2;
        if (NPY_UNLIKELY(divisor == 0)) {
            npy_set_floatstatus_divbyzero();
            *((npy_ulonglong *)op1) = 0;
        }
        else {
            *((npy_ulonglong *)op1) = dividend / divisor;
        }
    }
}

/*
 * Indexed in-place division for npy_ulonglong.
 *
 * For each of the dimensions[0] iterations this reads an npy_intp index from
 * args[1] and an npy_ulonglong divisor from args[2], then divides the element
 * of args[0] located at byte offset steps[0] * index by that divisor, storing
 * the quotient back into the same element.
 *
 * - steps[1] and steps[2] are the byte strides of the index and divisor
 *   streams respectively.
 * - steps[3] is added to negative indices to wrap them; no other bounds
 *   checking is done here.
 * - A zero divisor calls npy_set_floatstatus_divbyzero() and stores 0.
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_divide_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *index_ptr = args[1];
    const char *divisor_ptr = args[2];
    const npy_intp elem_stride = steps[0];
    const npy_intp index_stride = steps[1];
    const npy_intp divisor_stride = steps[2];
    const npy_intp wrap_len = steps[3];
    const npy_intp count = dimensions[0];

    for (npy_intp k = 0; k < count;
             k++, index_ptr += index_stride, divisor_ptr += divisor_stride) {
        npy_intp idx = *(const npy_intp *)index_ptr;
        if (idx < 0) {
            /* negative index: wrap around using the length in steps[3] */
            idx += wrap_len;
        }
        npy_ulonglong *const target =
                (npy_ulonglong *)(base + elem_stride * idx);
        const npy_ulonglong divisor = *(const npy_ulonglong *)divisor_ptr;

        if (NPY_UNLIKELY(divisor == 0)) {
            /* integer division by zero: flag it and define the result as 0 */
            npy_set_floatstatus_divbyzero();
            *target = 0;
            continue;
        }
        *target /= divisor;
    }
    return 0;
}



