#line 1 "numpy/core/src/umath/loops_arithm_fp.dispatch.c.src"

/*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 **       Changes should be made to the original source (.src) file         **
 *****************************************************************************
 */

#line 1
/*@targets
 ** $maxopt baseline
 ** sse2 (avx2 fma3)
 ** neon asimd
 ** vsx2 vsx3
 ** vx vxe
 **/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

/**
 * TODO:
 *  - Improve the implementation of SIMD complex absolute;
 *    the current one is rather slow and can be optimized by
 *    at least avoiding the division while keeping the sqrt.
 *  - Vectorize reductions.
 *  - Add support for ASIMD/VCMLA through universal intrinsics.
 */

//###############################################################################
//## Real Single/Double precision
//###############################################################################
/********************************************************************************
 ** Defining ufunc inner functions
 ********************************************************************************/
#line 43
#line 52
/**
 * Single-precision element-wise addition inner loop: dst[i] = src0[i] + src1[i].
 *
 * @param args        args[0] and args[1] are the input buffers, args[2] the output.
 * @param dimensions  dimensions[0] is the number of elements to process.
 * @param steps       steps[0..2] are the byte strides applied to args[0..2].
 * @param func        unused.
 *
 * Paths, in order of precedence:
 *  - reduction (first input and output are the same zero-stride location):
 *    the accumulator is incremented by FLOAT_pairwise_sum() of the second input;
 *  - SIMD, compiled only when NPY_SIMD_F32 is non-zero, for three layouts
 *    (everything contiguous, broadcast first operand, broadcast second operand);
 *  - plain strided scalar loop for everything else.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_add)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *src0 = args[0], *src1 = args[1], *dst = args[2];
    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
    // reduce: first operand and output are one and the same zero-stride
    // accumulator, so fold the whole second operand into it.
    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
    // Template switch: for add the pairwise-sum helper is selected; the
    // sequential accumulation in the #else branch is compiled out.
    #if 1
        *((npy_float*)src0) += FLOAT_pairwise_sum(src1, len, ssrc1);
    #else
        npy_float acc = *((npy_float*)src0);
        if (ssrc1 == sizeof(npy_float)) {
            for (; len > 0; --len, src1 += sizeof(npy_float)) {
                acc += *(npy_float *)src1;
            }
        } else {
            for (; len > 0; --len, src1 += ssrc1) {
                acc += *(npy_float *)src1;
            }
        }
        *((npy_float*)src0) = acc;
    #endif
        return;
    }
// The leading constant below is 0 for add, so this first branch is never
// selected and the text inside it (which discusses division) is skipped by the
// preprocessor; selection falls through to the NPY_SIMD_F32 test.
#if 0 && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64
    /**
     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
     * support for single-precision floating-point division. Only scalar division is
     * supported natively, and without hardware for performance and accuracy comparison,
     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
     * native scalar division.
     *
     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
     * precision. However, this approach has limitations:
     *
     * - It can cause unexpected floating-point overflows in special cases, such as when
     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
     *
     * - The precision may vary between the emulated SIMD and scalar division due to
     *   non-uniform branches (non-contiguous) in the code, leading to precision
     *   inconsistencies.
     *
     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
     *   gain may not sufficiently offset these drawbacks.
     */
#elif NPY_SIMD_F32
    // Use SIMD only for more than two vectors' worth of elements and only when
    // is_mem_overlap() reports no overlap between either input and the output.
    if (len > npyv_nlanes_f32*2 &&
        !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
        !is_mem_overlap(src1, ssrc1, dst, sdst, len)
    ) {
        // vstep/wstep are byte offsets (the pointers are char*): one and two
        // vectors respectively. hstep/lstep are the matching element counts.
        const int vstep = npyv_nlanes_u8;
        const int wstep = vstep * 2;
        const int hstep = npyv_nlanes_f32;
        const int lstep = hstep * 2;
        // lots of specializations, to squeeze out max performance
        // Layout 1: both inputs and the output are contiguous floats.
        if (ssrc0 == sizeof(npy_float) && ssrc0 == ssrc1 && ssrc0 == sdst) {
            // main loop: two full vectors per iteration
            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) {
                npyv_f32 a0 = npyv_load_f32((const npy_float*)src0);
                npyv_f32 a1 = npyv_load_f32((const npy_float*)(src0 + vstep));
                npyv_f32 b0 = npyv_load_f32((const npy_float*)src1);
                npyv_f32 b1 = npyv_load_f32((const npy_float*)(src1 + vstep));
                npyv_f32 r0 = npyv_add_f32(a0, b0);
                npyv_f32 r1 = npyv_add_f32(a1, b1);
                npyv_store_f32((npy_float*)dst, r0);
                npyv_store_f32((npy_float*)(dst + vstep), r1);
            }
            // tail: fewer than lstep elements remain; handle them one vector at
            // a time with length-limited load/store. `len` may end up negative,
            // which simply terminates the loop.
            for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
            // Active variant for add is the #else branch (npyv_load_tillz_*,
            // expected to zero-fill the lanes past `len` -- see npyv docs).
            #if 0
                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, 1.0f);
                npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
            #else
                npyv_f32 a = npyv_load_tillz_f32((const npy_float*)src0, len);
                npyv_f32 b = npyv_load_tillz_f32((const npy_float*)src1, len);
            #endif
                npyv_f32 r = npyv_add_f32(a, b);
                npyv_store_till_f32((npy_float*)dst, len, r);
            }
        }
        // Layout 2: first operand is a broadcast scalar, second operand and
        // output are contiguous.
        else if (ssrc0 == 0 && ssrc1 == sizeof(npy_float) && sdst == ssrc1) {
            // splat the scalar once, outside the loops
            npyv_f32 a = npyv_setall_f32(*((npy_float*)src0));
            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
                npyv_f32 b0 = npyv_load_f32((const npy_float*)src1);
                npyv_f32 b1 = npyv_load_f32((const npy_float*)(src1 + vstep));
                npyv_f32 r0 = npyv_add_f32(a, b0);
                npyv_f32 r1 = npyv_add_f32(a, b1);
                npyv_store_f32((npy_float*)dst, r0);
                npyv_store_f32((npy_float*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
            // `0 || 0` is false: the npyv_load_tillz_* variant is compiled.
            #if 0 || 0
                npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
            #else
                npyv_f32 b = npyv_load_tillz_f32((const npy_float*)src1, len);
            #endif
                npyv_f32 r = npyv_add_f32(a, b);
                npyv_store_till_f32((npy_float*)dst, len, r);
            }
        }
        // Layout 3: second operand is a broadcast scalar, first operand and
        // output are contiguous.
        else if (ssrc1 == 0 && ssrc0 == sizeof(npy_float) && sdst == ssrc0) {
            npyv_f32 b = npyv_setall_f32(*((npy_float*)src1));
            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
                npyv_f32 a0 = npyv_load_f32((const npy_float*)src0);
                npyv_f32 a1 = npyv_load_f32((const npy_float*)(src0 + vstep));
                npyv_f32 r0 = npyv_add_f32(a0, b);
                npyv_f32 r1 = npyv_add_f32(a1, b);
                npyv_store_f32((npy_float*)dst, r0);
                npyv_store_f32((npy_float*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
            // Both constant tests are 0: the final #else (npyv_load_tillz_*)
            // is the variant compiled for add.
            #if 0
                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, 1.0f);
            #elif 0
                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, NPY_NANF);
            #else
                npyv_f32 a = npyv_load_tillz_f32((const npy_float*)src0, len);
            #endif
                npyv_f32 r = npyv_add_f32(a, b);
                npyv_store_till_f32((npy_float*)dst, len, r);
            }
        } else {
            // no specialised layout matched: use the generic strided loop
            goto loop_scalar;
        }
        // NOTE(review): npyv_cleanup() is defined outside this file; presumably
        // it resets SIMD state after vector code -- confirm in simd/simd.h.
        npyv_cleanup();
        return;
    }
loop_scalar:
#endif
    // Generic fallback: one element per iteration using the caller's byte strides.
    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
        const npy_float a = *((npy_float*)src0);
        const npy_float b = *((npy_float*)src1);
        *((npy_float*)dst) = a + b;
    }
}

/**
 * Indexed in-place addition: for each of the dimensions[0] entries, read an
 * npy_intp index from args[1] and a float from args[2], then add that float to
 * the element of args[0] selected by the index.
 *
 * steps[0..2] are the byte strides of the target, index and value buffers.
 * steps[3] is added to negative indices to wrap them around.
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_add_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining = dimensions[0];

    while (remaining-- > 0) {
        const npy_intp raw = *(npy_intp *)idx_cursor;
        /* negative indices count from the end */
        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
        npy_float *const slot = (npy_float *)(base + base_stride * pos);

        *slot = *slot + *(npy_float *)val_cursor;

        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}


#line 52
/**
 * Single-precision element-wise subtraction inner loop: dst[i] = src0[i] - src1[i].
 *
 * @param args        args[0] and args[1] are the input buffers, args[2] the output.
 * @param dimensions  dimensions[0] is the number of elements to process.
 * @param steps       steps[0..2] are the byte strides applied to args[0..2].
 * @param func        unused.
 *
 * Paths, in order of precedence:
 *  - reduction (first input and output are the same zero-stride location):
 *    every element of the second input is subtracted sequentially from the
 *    accumulator;
 *  - SIMD, compiled only when NPY_SIMD_F32 is non-zero, for three layouts
 *    (everything contiguous, broadcast first operand, broadcast second operand);
 *  - plain strided scalar loop for everything else.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_subtract)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *src0 = args[0], *src1 = args[1], *dst = args[2];
    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
    // reduce: first operand and output are one and the same zero-stride
    // accumulator, so fold the whole second operand into it.
    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
    // Template switch: the pairwise-sum variant is compiled out for subtract;
    // the sequential accumulation in the #else branch is the live code.
    #if 0
        *((npy_float*)src0) -= FLOAT_pairwise_sum(src1, len, ssrc1);
    #else
        // accumulate in a local and write back once at the end
        npy_float acc = *((npy_float*)src0);
        // same loop twice; the first uses a compile-time constant stride for
        // the contiguous case
        if (ssrc1 == sizeof(npy_float)) {
            for (; len > 0; --len, src1 += sizeof(npy_float)) {
                acc -= *(npy_float *)src1;
            }
        } else {
            for (; len > 0; --len, src1 += ssrc1) {
                acc -= *(npy_float *)src1;
            }
        }
        *((npy_float*)src0) = acc;
    #endif
        return;
    }
// The leading constant below is 0 for subtract, so this first branch is never
// selected and the text inside it (which discusses division) is skipped by the
// preprocessor; selection falls through to the NPY_SIMD_F32 test.
#if 0 && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64
    /**
     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
     * support for single-precision floating-point division. Only scalar division is
     * supported natively, and without hardware for performance and accuracy comparison,
     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
     * native scalar division.
     *
     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
     * precision. However, this approach has limitations:
     *
     * - It can cause unexpected floating-point overflows in special cases, such as when
     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
     *
     * - The precision may vary between the emulated SIMD and scalar division due to
     *   non-uniform branches (non-contiguous) in the code, leading to precision
     *   inconsistencies.
     *
     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
     *   gain may not sufficiently offset these drawbacks.
     */
#elif NPY_SIMD_F32
    // Use SIMD only for more than two vectors' worth of elements and only when
    // is_mem_overlap() reports no overlap between either input and the output.
    if (len > npyv_nlanes_f32*2 &&
        !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
        !is_mem_overlap(src1, ssrc1, dst, sdst, len)
    ) {
        // vstep/wstep are byte offsets (the pointers are char*): one and two
        // vectors respectively. hstep/lstep are the matching element counts.
        const int vstep = npyv_nlanes_u8;
        const int wstep = vstep * 2;
        const int hstep = npyv_nlanes_f32;
        const int lstep = hstep * 2;
        // lots of specializations, to squeeze out max performance
        // Layout 1: both inputs and the output are contiguous floats.
        if (ssrc0 == sizeof(npy_float) && ssrc0 == ssrc1 && ssrc0 == sdst) {
            // main loop: two full vectors per iteration
            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) {
                npyv_f32 a0 = npyv_load_f32((const npy_float*)src0);
                npyv_f32 a1 = npyv_load_f32((const npy_float*)(src0 + vstep));
                npyv_f32 b0 = npyv_load_f32((const npy_float*)src1);
                npyv_f32 b1 = npyv_load_f32((const npy_float*)(src1 + vstep));
                npyv_f32 r0 = npyv_sub_f32(a0, b0);
                npyv_f32 r1 = npyv_sub_f32(a1, b1);
                npyv_store_f32((npy_float*)dst, r0);
                npyv_store_f32((npy_float*)(dst + vstep), r1);
            }
            // tail: fewer than lstep elements remain; handle them one vector at
            // a time with length-limited load/store. `len` may end up negative,
            // which simply terminates the loop.
            for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
            // Active variant for subtract is the #else branch (npyv_load_tillz_*,
            // expected to zero-fill the lanes past `len` -- see npyv docs).
            #if 0
                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, 1.0f);
                npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
            #else
                npyv_f32 a = npyv_load_tillz_f32((const npy_float*)src0, len);
                npyv_f32 b = npyv_load_tillz_f32((const npy_float*)src1, len);
            #endif
                npyv_f32 r = npyv_sub_f32(a, b);
                npyv_store_till_f32((npy_float*)dst, len, r);
            }
        }
        // Layout 2: first operand (the minuend) is a broadcast scalar, second
        // operand and output are contiguous.
        else if (ssrc0 == 0 && ssrc1 == sizeof(npy_float) && sdst == ssrc1) {
            // splat the scalar once, outside the loops
            npyv_f32 a = npyv_setall_f32(*((npy_float*)src0));
            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
                npyv_f32 b0 = npyv_load_f32((const npy_float*)src1);
                npyv_f32 b1 = npyv_load_f32((const npy_float*)(src1 + vstep));
                npyv_f32 r0 = npyv_sub_f32(a, b0);
                npyv_f32 r1 = npyv_sub_f32(a, b1);
                npyv_store_f32((npy_float*)dst, r0);
                npyv_store_f32((npy_float*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
            // `0 || 0` is false: the npyv_load_tillz_* variant is compiled.
            #if 0 || 0
                npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
            #else
                npyv_f32 b = npyv_load_tillz_f32((const npy_float*)src1, len);
            #endif
                npyv_f32 r = npyv_sub_f32(a, b);
                npyv_store_till_f32((npy_float*)dst, len, r);
            }
        }
        // Layout 3: second operand (the subtrahend) is a broadcast scalar, first
        // operand and output are contiguous.
        else if (ssrc1 == 0 && ssrc0 == sizeof(npy_float) && sdst == ssrc0) {
            npyv_f32 b = npyv_setall_f32(*((npy_float*)src1));
            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
                npyv_f32 a0 = npyv_load_f32((const npy_float*)src0);
                npyv_f32 a1 = npyv_load_f32((const npy_float*)(src0 + vstep));
                npyv_f32 r0 = npyv_sub_f32(a0, b);
                npyv_f32 r1 = npyv_sub_f32(a1, b);
                npyv_store_f32((npy_float*)dst, r0);
                npyv_store_f32((npy_float*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
            // Both constant tests are 0: the final #else (npyv_load_tillz_*)
            // is the variant compiled for subtract.
            #if 0
                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, 1.0f);
            #elif 0
                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, NPY_NANF);
            #else
                npyv_f32 a = npyv_load_tillz_f32((const npy_float*)src0, len);
            #endif
                npyv_f32 r = npyv_sub_f32(a, b);
                npyv_store_till_f32((npy_float*)dst, len, r);
            }
        } else {
            // no specialised layout matched: use the generic strided loop
            goto loop_scalar;
        }
        // NOTE(review): npyv_cleanup() is defined outside this file; presumably
        // it resets SIMD state after vector code -- confirm in simd/simd.h.
        npyv_cleanup();
        return;
    }
loop_scalar:
#endif
    // Generic fallback: one element per iteration using the caller's byte strides.
    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
        const npy_float a = *((npy_float*)src0);
        const npy_float b = *((npy_float*)src1);
        *((npy_float*)dst) = a - b;
    }
}

/**
 * Indexed in-place subtraction: for each of the dimensions[0] entries, read an
 * npy_intp index from args[1] and a float from args[2], then subtract that
 * float from the element of args[0] selected by the index.
 *
 * steps[0..2] are the byte strides of the target, index and value buffers.
 * steps[3] is added to negative indices to wrap them around.
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_subtract_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining = dimensions[0];

    while (remaining-- > 0) {
        const npy_intp raw = *(npy_intp *)idx_cursor;
        /* negative indices count from the end */
        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
        npy_float *const slot = (npy_float *)(base + base_stride * pos);

        *slot = *slot - *(npy_float *)val_cursor;

        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}


#line 52
/**
 * Single-precision element-wise multiplication inner loop: dst[i] = src0[i] * src1[i].
 *
 * @param args        args[0] and args[1] are the input buffers, args[2] the output.
 * @param dimensions  dimensions[0] is the number of elements to process.
 * @param steps       steps[0..2] are the byte strides applied to args[0..2].
 * @param func        unused.
 *
 * Paths, in order of precedence:
 *  - reduction (first input and output are the same zero-stride location):
 *    the accumulator is multiplied sequentially by every element of the second
 *    input;
 *  - SIMD, compiled only when NPY_SIMD_F32 is non-zero, for three layouts
 *    (everything contiguous, broadcast first operand, broadcast second operand);
 *  - plain strided scalar loop for everything else.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_multiply)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *src0 = args[0], *src1 = args[1], *dst = args[2];
    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
    // reduce: first operand and output are one and the same zero-stride
    // accumulator, so fold the whole second operand into it.
    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
    // Template switch: the pairwise-sum variant is compiled out for multiply;
    // the sequential accumulation in the #else branch is the live code.
    #if 0
        *((npy_float*)src0) *= FLOAT_pairwise_sum(src1, len, ssrc1);
    #else
        // accumulate in a local and write back once at the end
        npy_float acc = *((npy_float*)src0);
        // same loop twice; the first uses a compile-time constant stride for
        // the contiguous case
        if (ssrc1 == sizeof(npy_float)) {
            for (; len > 0; --len, src1 += sizeof(npy_float)) {
                acc *= *(npy_float *)src1;
            }
        } else {
            for (; len > 0; --len, src1 += ssrc1) {
                acc *= *(npy_float *)src1;
            }
        }
        *((npy_float*)src0) = acc;
    #endif
        return;
    }
// The leading constant below is 0 for multiply, so this first branch is never
// selected and the text inside it (which discusses division) is skipped by the
// preprocessor; selection falls through to the NPY_SIMD_F32 test.
#if 0 && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64
    /**
     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
     * support for single-precision floating-point division. Only scalar division is
     * supported natively, and without hardware for performance and accuracy comparison,
     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
     * native scalar division.
     *
     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
     * precision. However, this approach has limitations:
     *
     * - It can cause unexpected floating-point overflows in special cases, such as when
     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
     *
     * - The precision may vary between the emulated SIMD and scalar division due to
     *   non-uniform branches (non-contiguous) in the code, leading to precision
     *   inconsistencies.
     *
     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
     *   gain may not sufficiently offset these drawbacks.
     */
#elif NPY_SIMD_F32
    // Use SIMD only for more than two vectors' worth of elements and only when
    // is_mem_overlap() reports no overlap between either input and the output.
    if (len > npyv_nlanes_f32*2 &&
        !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
        !is_mem_overlap(src1, ssrc1, dst, sdst, len)
    ) {
        // vstep/wstep are byte offsets (the pointers are char*): one and two
        // vectors respectively. hstep/lstep are the matching element counts.
        const int vstep = npyv_nlanes_u8;
        const int wstep = vstep * 2;
        const int hstep = npyv_nlanes_f32;
        const int lstep = hstep * 2;
        // lots of specializations, to squeeze out max performance
        // Layout 1: both inputs and the output are contiguous floats.
        if (ssrc0 == sizeof(npy_float) && ssrc0 == ssrc1 && ssrc0 == sdst) {
            // main loop: two full vectors per iteration
            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) {
                npyv_f32 a0 = npyv_load_f32((const npy_float*)src0);
                npyv_f32 a1 = npyv_load_f32((const npy_float*)(src0 + vstep));
                npyv_f32 b0 = npyv_load_f32((const npy_float*)src1);
                npyv_f32 b1 = npyv_load_f32((const npy_float*)(src1 + vstep));
                npyv_f32 r0 = npyv_mul_f32(a0, b0);
                npyv_f32 r1 = npyv_mul_f32(a1, b1);
                npyv_store_f32((npy_float*)dst, r0);
                npyv_store_f32((npy_float*)(dst + vstep), r1);
            }
            // tail: fewer than lstep elements remain; handle them one vector at
            // a time with length-limited load/store. `len` may end up negative,
            // which simply terminates the loop.
            for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
            // Active variant here is the #else branch (npyv_load_tillz_*,
            // expected to zero-fill the lanes past `len` -- see npyv docs).
            #if 0
                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, 1.0f);
                npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
            #else
                npyv_f32 a = npyv_load_tillz_f32((const npy_float*)src0, len);
                npyv_f32 b = npyv_load_tillz_f32((const npy_float*)src1, len);
            #endif
                npyv_f32 r = npyv_mul_f32(a, b);
                npyv_store_till_f32((npy_float*)dst, len, r);
            }
        }
        // Layout 2: first operand is a broadcast scalar, second operand and
        // output are contiguous.
        else if (ssrc0 == 0 && ssrc1 == sizeof(npy_float) && sdst == ssrc1) {
            // splat the scalar once, outside the loops
            npyv_f32 a = npyv_setall_f32(*((npy_float*)src0));
            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
                npyv_f32 b0 = npyv_load_f32((const npy_float*)src1);
                npyv_f32 b1 = npyv_load_f32((const npy_float*)(src1 + vstep));
                npyv_f32 r0 = npyv_mul_f32(a, b0);
                npyv_f32 r1 = npyv_mul_f32(a, b1);
                npyv_store_f32((npy_float*)dst, r0);
                npyv_store_f32((npy_float*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
            // `0 || 1` is true: unused lanes of b are filled with 1.0f.
            // NOTE(review): presumably so that a non-finite scalar `a` times
            // the padding cannot raise a spurious FP exception -- confirm.
            #if 0 || 1
                npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
            #else
                npyv_f32 b = npyv_load_tillz_f32((const npy_float*)src1, len);
            #endif
                npyv_f32 r = npyv_mul_f32(a, b);
                npyv_store_till_f32((npy_float*)dst, len, r);
            }
        }
        // Layout 3: second operand is a broadcast scalar, first operand and
        // output are contiguous.
        else if (ssrc1 == 0 && ssrc0 == sizeof(npy_float) && sdst == ssrc0) {
            npyv_f32 b = npyv_setall_f32(*((npy_float*)src1));
            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
                npyv_f32 a0 = npyv_load_f32((const npy_float*)src0);
                npyv_f32 a1 = npyv_load_f32((const npy_float*)(src0 + vstep));
                npyv_f32 r0 = npyv_mul_f32(a0, b);
                npyv_f32 r1 = npyv_mul_f32(a1, b);
                npyv_store_f32((npy_float*)dst, r0);
                npyv_store_f32((npy_float*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
            // First test is 1: unused lanes of a are filled with 1.0f (same
            // presumed rationale as the broadcast-first-operand tail above).
            #if 1
                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, 1.0f);
            #elif 0
                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, NPY_NANF);
            #else
                npyv_f32 a = npyv_load_tillz_f32((const npy_float*)src0, len);
            #endif
                npyv_f32 r = npyv_mul_f32(a, b);
                npyv_store_till_f32((npy_float*)dst, len, r);
            }
        } else {
            // no specialised layout matched: use the generic strided loop
            goto loop_scalar;
        }
        // NOTE(review): npyv_cleanup() is defined outside this file; presumably
        // it resets SIMD state after vector code -- confirm in simd/simd.h.
        npyv_cleanup();
        return;
    }
loop_scalar:
#endif
    // Generic fallback: one element per iteration using the caller's byte strides.
    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
        const npy_float a = *((npy_float*)src0);
        const npy_float b = *((npy_float*)src1);
        *((npy_float*)dst) = a * b;
    }
}

/**
 * Indexed in-place multiplication: for each of the dimensions[0] entries, read
 * an npy_intp index from args[1] and a float from args[2], then multiply the
 * element of args[0] selected by the index by that float.
 *
 * steps[0..2] are the byte strides of the target, index and value buffers.
 * steps[3] is added to negative indices to wrap them around.
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_multiply_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining = dimensions[0];

    while (remaining-- > 0) {
        const npy_intp raw = *(npy_intp *)idx_cursor;
        /* negative indices count from the end */
        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
        npy_float *const slot = (npy_float *)(base + base_stride * pos);

        *slot = *slot * *(npy_float *)val_cursor;

        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}


#line 52
/**
 * Single-precision element-wise division inner loop: dst[i] = src0[i] / src1[i].
 *
 * @param args        args[0] and args[1] are the input buffers, args[2] the output.
 * @param dimensions  dimensions[0] is the number of elements to process.
 * @param steps       steps[0..2] are the byte strides applied to args[0..2].
 * @param func        unused.
 *
 * Paths, in order of precedence:
 *  - reduction (first input and output are the same zero-stride location):
 *    the accumulator is divided sequentially by every element of the second
 *    input;
 *  - SIMD, compiled only when NPY_SIMD_F32 is non-zero and the build is not
 *    "NEON without f64 SIMD", for three layouts (everything contiguous,
 *    broadcast dividend, broadcast divisor);
 *  - plain strided scalar loop for everything else.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *src0 = args[0], *src1 = args[1], *dst = args[2];
    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
    // reduce: first operand and output are one and the same zero-stride
    // accumulator, so fold the whole second operand into it.
    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
    // Template switch: the pairwise-sum variant is compiled out for divide;
    // the sequential accumulation in the #else branch is the live code.
    #if 0
        *((npy_float*)src0) /= FLOAT_pairwise_sum(src1, len, ssrc1);
    #else
        // accumulate in a local and write back once at the end
        npy_float acc = *((npy_float*)src0);
        // same loop twice; the first uses a compile-time constant stride for
        // the contiguous case
        if (ssrc1 == sizeof(npy_float)) {
            for (; len > 0; --len, src1 += sizeof(npy_float)) {
                acc /= *(npy_float *)src1;
            }
        } else {
            for (; len > 0; --len, src1 += ssrc1) {
                acc /= *(npy_float *)src1;
            }
        }
        *((npy_float*)src0) = acc;
    #endif
        return;
    }
// For divide the leading constant is 1, so this guard is live: when NEON is
// available but f64 SIMD is not, the entire SIMD section below (including the
// loop_scalar label) is compiled out and only the scalar loop remains.
#if 1 && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64
    /**
     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
     * support for single-precision floating-point division. Only scalar division is
     * supported natively, and without hardware for performance and accuracy comparison,
     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
     * native scalar division.
     *
     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
     * precision. However, this approach has limitations:
     *
     * - It can cause unexpected floating-point overflows in special cases, such as when
     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
     *
     * - The precision may vary between the emulated SIMD and scalar division due to
     *   non-uniform branches (non-contiguous) in the code, leading to precision
     *   inconsistencies.
     *
     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
     *   gain may not sufficiently offset these drawbacks.
     */
#elif NPY_SIMD_F32
    // Use SIMD only for more than two vectors' worth of elements and only when
    // is_mem_overlap() reports no overlap between either input and the output.
    if (len > npyv_nlanes_f32*2 &&
        !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
        !is_mem_overlap(src1, ssrc1, dst, sdst, len)
    ) {
        // vstep/wstep are byte offsets (the pointers are char*): one and two
        // vectors respectively. hstep/lstep are the matching element counts.
        const int vstep = npyv_nlanes_u8;
        const int wstep = vstep * 2;
        const int hstep = npyv_nlanes_f32;
        const int lstep = hstep * 2;
        // lots of specializations, to squeeze out max performance
        // Layout 1: both inputs and the output are contiguous floats.
        if (ssrc0 == sizeof(npy_float) && ssrc0 == ssrc1 && ssrc0 == sdst) {
            // main loop: two full vectors per iteration
            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) {
                npyv_f32 a0 = npyv_load_f32((const npy_float*)src0);
                npyv_f32 a1 = npyv_load_f32((const npy_float*)(src0 + vstep));
                npyv_f32 b0 = npyv_load_f32((const npy_float*)src1);
                npyv_f32 b1 = npyv_load_f32((const npy_float*)(src1 + vstep));
                npyv_f32 r0 = npyv_div_f32(a0, b0);
                npyv_f32 r1 = npyv_div_f32(a1, b1);
                npyv_store_f32((npy_float*)dst, r0);
                npyv_store_f32((npy_float*)(dst + vstep), r1);
            }
            // tail: fewer than lstep elements remain; handle them one vector at
            // a time with length-limited load/store. `len` may end up negative,
            // which simply terminates the loop.
            for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
            // Active variant for divide: unused lanes of both operands are
            // filled with 1.0f.
            // NOTE(review): presumably so the padding computes 1/1 instead of
            // 0/0 and cannot raise a spurious FP exception -- confirm.
            #if 1
                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, 1.0f);
                npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
            #else
                npyv_f32 a = npyv_load_tillz_f32((const npy_float*)src0, len);
                npyv_f32 b = npyv_load_tillz_f32((const npy_float*)src1, len);
            #endif
                npyv_f32 r = npyv_div_f32(a, b);
                npyv_store_till_f32((npy_float*)dst, len, r);
            }
        }
        // Layout 2: dividend is a broadcast scalar, divisor and output are
        // contiguous.
        else if (ssrc0 == 0 && ssrc1 == sizeof(npy_float) && sdst == ssrc1) {
            // splat the scalar once, outside the loops
            npyv_f32 a = npyv_setall_f32(*((npy_float*)src0));
            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
                npyv_f32 b0 = npyv_load_f32((const npy_float*)src1);
                npyv_f32 b1 = npyv_load_f32((const npy_float*)(src1 + vstep));
                npyv_f32 r0 = npyv_div_f32(a, b0);
                npyv_f32 r1 = npyv_div_f32(a, b1);
                npyv_store_f32((npy_float*)dst, r0);
                npyv_store_f32((npy_float*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
            // `1 || 0` is true: unused divisor lanes are filled with 1.0f
            // (presumably to avoid dividing by a zero padding lane -- confirm).
            #if 1 || 0
                npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
            #else
                npyv_f32 b = npyv_load_tillz_f32((const npy_float*)src1, len);
            #endif
                npyv_f32 r = npyv_div_f32(a, b);
                npyv_store_till_f32((npy_float*)dst, len, r);
            }
        }
        // Layout 3: divisor is a broadcast scalar, dividend and output are
        // contiguous.
        else if (ssrc1 == 0 && ssrc0 == sizeof(npy_float) && sdst == ssrc0) {
            npyv_f32 b = npyv_setall_f32(*((npy_float*)src1));
            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
                npyv_f32 a0 = npyv_load_f32((const npy_float*)src0);
                npyv_f32 a1 = npyv_load_f32((const npy_float*)(src0 + vstep));
                npyv_f32 r0 = npyv_div_f32(a0, b);
                npyv_f32 r1 = npyv_div_f32(a1, b);
                npyv_store_f32((npy_float*)dst, r0);
                npyv_store_f32((npy_float*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
            // The #elif branch is the live one: unused dividend lanes are
            // filled with NPY_NANF.
            // NOTE(review): presumably because the scalar divisor may be zero,
            // and NaN / b stays a quiet NaN for any b -- confirm.
            #if 0
                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, 1.0f);
            #elif 1
                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, NPY_NANF);
            #else
                npyv_f32 a = npyv_load_tillz_f32((const npy_float*)src0, len);
            #endif
                npyv_f32 r = npyv_div_f32(a, b);
                npyv_store_till_f32((npy_float*)dst, len, r);
            }
        } else {
            // no specialised layout matched: use the generic strided loop
            goto loop_scalar;
        }
        // NOTE(review): npyv_cleanup() is defined outside this file; presumably
        // it resets SIMD state after vector code -- confirm in simd/simd.h.
        npyv_cleanup();
        return;
    }
loop_scalar:
#endif
    // Generic fallback: one element per iteration using the caller's byte strides.
    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
        const npy_float a = *((npy_float*)src0);
        const npy_float b = *((npy_float*)src1);
        *((npy_float*)dst) = a / b;
    }
}

/**
 * Indexed in-place division: for each of the dimensions[0] entries, read an
 * npy_intp index from args[1] and a float from args[2], then divide the
 * element of args[0] selected by the index by that float.
 *
 * steps[0..2] are the byte strides of the target, index and value buffers.
 * steps[3] is added to negative indices to wrap them around.
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_divide_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_cursor = args[1];
    char *val_cursor = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining = dimensions[0];

    while (remaining-- > 0) {
        const npy_intp raw = *(npy_intp *)idx_cursor;
        /* negative indices count from the end */
        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
        npy_float *const slot = (npy_float *)(base + base_stride * pos);

        *slot = *slot / *(npy_float *)val_cursor;

        idx_cursor += idx_stride;
        val_cursor += val_stride;
    }
    return 0;
}



#line 43
#line 52
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_add)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *src0 = args[0], *src1 = args[1], *dst = args[2];
    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
    // reduce
    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
    #if 1
        *((npy_double*)src0) += DOUBLE_pairwise_sum(src1, len, ssrc1);
    #else
        npy_double acc = *((npy_double*)src0);
        if (ssrc1 == sizeof(npy_double)) {
            for (; len > 0; --len, src1 += sizeof(npy_double)) {
                acc += *(npy_double *)src1;
            }
        } else {
            for (; len > 0; --len, src1 += ssrc1) {
                acc += *(npy_double *)src1;
            }
        }
        *((npy_double*)src0) = acc;
    #endif
        return;
    }
#if 0 && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64
    /**
     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
     * support for single-precision floating-point division. Only scalar division is
     * supported natively, and without hardware for performance and accuracy comparison,
     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
     * native scalar division.
     *
     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
     * precision. However, this approach has limitations:
     *
     * - It can cause unexpected floating-point overflows in special cases, such as when
     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
     *
     * - The precision may vary between the emulated SIMD and scalar division due to
     *   non-uniform branches (non-contiguous) in the code, leading to precision
     *   inconsistencies.
     *
     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
     *   gain may not sufficiently offset these drawbacks.
     */
#elif NPY_SIMD_F64
    if (len > npyv_nlanes_f64*2 &&
        !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
        !is_mem_overlap(src1, ssrc1, dst, sdst, len)
    ) {
        const int vstep = npyv_nlanes_u8;
        const int wstep = vstep * 2;
        const int hstep = npyv_nlanes_f64;
        const int lstep = hstep * 2;
        // lots of specializations, to squeeze out max performance
        if (ssrc0 == sizeof(npy_double) && ssrc0 == ssrc1 && ssrc0 == sdst) {
            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) {
                npyv_f64 a0 = npyv_load_f64((const npy_double*)src0);
                npyv_f64 a1 = npyv_load_f64((const npy_double*)(src0 + vstep));
                npyv_f64 b0 = npyv_load_f64((const npy_double*)src1);
                npyv_f64 b1 = npyv_load_f64((const npy_double*)(src1 + vstep));
                npyv_f64 r0 = npyv_add_f64(a0, b0);
                npyv_f64 r1 = npyv_add_f64(a1, b1);
                npyv_store_f64((npy_double*)dst, r0);
                npyv_store_f64((npy_double*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
            #if 0
                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, 1.0);
                npyv_f64 b = npyv_load_till_f64((const npy_double*)src1, len, 1.0);
            #else
                npyv_f64 a = npyv_load_tillz_f64((const npy_double*)src0, len);
                npyv_f64 b = npyv_load_tillz_f64((const npy_double*)src1, len);
            #endif
                npyv_f64 r = npyv_add_f64(a, b);
                npyv_store_till_f64((npy_double*)dst, len, r);
            }
        }
        else if (ssrc0 == 0 && ssrc1 == sizeof(npy_double) && sdst == ssrc1) {
            npyv_f64 a = npyv_setall_f64(*((npy_double*)src0));
            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
                npyv_f64 b0 = npyv_load_f64((const npy_double*)src1);
                npyv_f64 b1 = npyv_load_f64((const npy_double*)(src1 + vstep));
                npyv_f64 r0 = npyv_add_f64(a, b0);
                npyv_f64 r1 = npyv_add_f64(a, b1);
                npyv_store_f64((npy_double*)dst, r0);
                npyv_store_f64((npy_double*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
            #if 0 || 0
                npyv_f64 b = npyv_load_till_f64((const npy_double*)src1, len, 1.0);
            #else
                npyv_f64 b = npyv_load_tillz_f64((const npy_double*)src1, len);
            #endif
                npyv_f64 r = npyv_add_f64(a, b);
                npyv_store_till_f64((npy_double*)dst, len, r);
            }
        }
        else if (ssrc1 == 0 && ssrc0 == sizeof(npy_double) && sdst == ssrc0) {
            npyv_f64 b = npyv_setall_f64(*((npy_double*)src1));
            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
                npyv_f64 a0 = npyv_load_f64((const npy_double*)src0);
                npyv_f64 a1 = npyv_load_f64((const npy_double*)(src0 + vstep));
                npyv_f64 r0 = npyv_add_f64(a0, b);
                npyv_f64 r1 = npyv_add_f64(a1, b);
                npyv_store_f64((npy_double*)dst, r0);
                npyv_store_f64((npy_double*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
            #if 0
                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, 1.0);
            #elif 0
                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, NPY_NAN);
            #else
                npyv_f64 a = npyv_load_tillz_f64((const npy_double*)src0, len);
            #endif
                npyv_f64 r = npyv_add_f64(a, b);
                npyv_store_till_f64((npy_double*)dst, len, r);
            }
        } else {
            goto loop_scalar;
        }
        npyv_cleanup();
        return;
    }
loop_scalar:
#endif
    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
        const npy_double a = *((npy_double*)src0);
        const npy_double b = *((npy_double*)src1);
        *((npy_double*)dst) = a + b;
    }
}

/*
 * Indexed in-place addition for doubles.
 *
 * For each of the dimensions[0] iterations, an npy_intp index is read from
 * args[1] and a double from args[2]; the double is added to the element of
 * args[0] located at byte offset steps[0] * index.  Negative indices are
 * shifted by steps[3] before use.  steps[1] and steps[2] are the byte
 * strides of the index and value streams.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_add_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    // Offset applied to negative indices (wrap-around).
    const npy_intp wrap = steps[3];

    for (npy_intp left = dimensions[0]; left > 0; --left) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        if (pos < 0) {
            pos += wrap;
        }
        npy_double *slot = (npy_double *)(base + base_stride * pos);
        *slot += *(const npy_double *)val_ptr;
        idx_ptr += idx_stride;
        val_ptr += val_stride;
    }
    return 0;
}


#line 52
/*
 * Inner loop for element-wise double-precision subtraction: dst = src0 - src1.
 *
 * args       - args[0] and args[1] are the operand pointers, args[2] is the
 *              destination pointer.
 * dimensions - dimensions[0] is the number of elements to process.
 * steps      - byte strides for args[0], args[1] and args[2] respectively.
 * func       - unused.
 *
 * Three paths exist: an in-place reduction (stride-0 first operand that is
 * also the destination), SIMD kernels for three contiguous layouts when
 * NPY_SIMD_F64 is enabled, and a generic strided scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_subtract)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *src0 = args[0], *src1 = args[1], *dst = args[2];
    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
    // reduce
    // First operand and destination are the same non-advancing scalar:
    // fold every element of src1 into it.
    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
    // Disabled branch: the pairwise-sum shortcut is not used for subtraction.
    #if 0
        *((npy_double*)src0) -= DOUBLE_pairwise_sum(src1, len, ssrc1);
    #else
        // Accumulate in a local and write the result back once at the end.
        npy_double acc = *((npy_double*)src0);
        // Both loops perform the same arithmetic; the first uses a
        // compile-time constant stride (presumably to help the optimizer).
        if (ssrc1 == sizeof(npy_double)) {
            for (; len > 0; --len, src1 += sizeof(npy_double)) {
                acc -= *(npy_double *)src1;
            }
        } else {
            for (; len > 0; --len, src1 += ssrc1) {
                acc -= *(npy_double *)src1;
            }
        }
        *((npy_double*)src0) = acc;
    #endif
        return;
    }
// NOTE: the guard below begins with `0 &&`, so this first branch is never
// selected here; the explanatory comment concerns the division variant.
#if 0 && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64
    /**
     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
     * support for single-precision floating-point division. Only scalar division is
     * supported natively, and without hardware for performance and accuracy comparison,
     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
     * native scalar division.
     *
     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
     * precision. However, this approach has limitations:
     *
     * - It can cause unexpected floating-point overflows in special cases, such as when
     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
     *
     * - The precision may vary between the emulated SIMD and scalar division due to
     *   non-uniform branches (non-contiguous) in the code, leading to precision
     *   inconsistencies.
     *
     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
     *   gain may not sufficiently offset these drawbacks.
     */
#elif NPY_SIMD_F64
    // SIMD is attempted only for more than two vectors' worth of elements
    // and when is_mem_overlap() reports no overlap between inputs and output.
    if (len > npyv_nlanes_f64*2 &&
        !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
        !is_mem_overlap(src1, ssrc1, dst, sdst, len)
    ) {
        // vstep/wstep are pointer increments in bytes (one / two vectors);
        // hstep/lstep are the matching element counts subtracted from len.
        const int vstep = npyv_nlanes_u8;
        const int wstep = vstep * 2;
        const int hstep = npyv_nlanes_f64;
        const int lstep = hstep * 2;
        // lots of specializations, to squeeze out max performance
        // Case 1: both inputs and the output are contiguous.
        if (ssrc0 == sizeof(npy_double) && ssrc0 == ssrc1 && ssrc0 == sdst) {
            // Main loop: two vectors per iteration.
            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) {
                npyv_f64 a0 = npyv_load_f64((const npy_double*)src0);
                npyv_f64 a1 = npyv_load_f64((const npy_double*)(src0 + vstep));
                npyv_f64 b0 = npyv_load_f64((const npy_double*)src1);
                npyv_f64 b1 = npyv_load_f64((const npy_double*)(src1 + vstep));
                npyv_f64 r0 = npyv_sub_f64(a0, b0);
                npyv_f64 r1 = npyv_sub_f64(a1, b1);
                npyv_store_f64((npy_double*)dst, r0);
                npyv_store_f64((npy_double*)(dst + vstep), r1);
            }
            // Tail: one vector per iteration using partial load/store limited
            // to the remaining `len` elements (len may go negative on exit).
            for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
            #if 0
                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, 1.0);
                npyv_f64 b = npyv_load_till_f64((const npy_double*)src1, len, 1.0);
            #else
                npyv_f64 a = npyv_load_tillz_f64((const npy_double*)src0, len);
                npyv_f64 b = npyv_load_tillz_f64((const npy_double*)src1, len);
            #endif
                npyv_f64 r = npyv_sub_f64(a, b);
                npyv_store_till_f64((npy_double*)dst, len, r);
            }
        }
        // Case 2: first operand is a broadcast scalar, second is contiguous.
        else if (ssrc0 == 0 && ssrc1 == sizeof(npy_double) && sdst == ssrc1) {
            npyv_f64 a = npyv_setall_f64(*((npy_double*)src0));
            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
                npyv_f64 b0 = npyv_load_f64((const npy_double*)src1);
                npyv_f64 b1 = npyv_load_f64((const npy_double*)(src1 + vstep));
                npyv_f64 r0 = npyv_sub_f64(a, b0);
                npyv_f64 r1 = npyv_sub_f64(a, b1);
                npyv_store_f64((npy_double*)dst, r0);
                npyv_store_f64((npy_double*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
            #if 0 || 0
                npyv_f64 b = npyv_load_till_f64((const npy_double*)src1, len, 1.0);
            #else
                npyv_f64 b = npyv_load_tillz_f64((const npy_double*)src1, len);
            #endif
                npyv_f64 r = npyv_sub_f64(a, b);
                npyv_store_till_f64((npy_double*)dst, len, r);
            }
        }
        // Case 3: second operand is a broadcast scalar, first is contiguous.
        else if (ssrc1 == 0 && ssrc0 == sizeof(npy_double) && sdst == ssrc0) {
            npyv_f64 b = npyv_setall_f64(*((npy_double*)src1));
            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
                npyv_f64 a0 = npyv_load_f64((const npy_double*)src0);
                npyv_f64 a1 = npyv_load_f64((const npy_double*)(src0 + vstep));
                npyv_f64 r0 = npyv_sub_f64(a0, b);
                npyv_f64 r1 = npyv_sub_f64(a1, b);
                npyv_store_f64((npy_double*)dst, r0);
                npyv_store_f64((npy_double*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
            #if 0
                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, 1.0);
            #elif 0
                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, NPY_NAN);
            #else
                npyv_f64 a = npyv_load_tillz_f64((const npy_double*)src0, len);
            #endif
                npyv_f64 r = npyv_sub_f64(a, b);
                npyv_store_till_f64((npy_double*)dst, len, r);
            }
        } else {
            // Any other stride combination is handled by the scalar loop.
            goto loop_scalar;
        }
        // NOTE(review): presumably resets SIMD state after vector code — confirm.
        npyv_cleanup();
        return;
    }
loop_scalar:
#endif
    // Generic fallback: arbitrary byte strides, one element per iteration.
    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
        const npy_double a = *((npy_double*)src0);
        const npy_double b = *((npy_double*)src1);
        *((npy_double*)dst) = a - b;
    }
}

/*
 * Indexed in-place subtraction for doubles.
 *
 * For each of the dimensions[0] iterations, an npy_intp index is read from
 * args[1] and a double from args[2]; the double is subtracted from the
 * element of args[0] located at byte offset steps[0] * index.  Negative
 * indices are shifted by steps[3] before use.  steps[1] and steps[2] are the
 * byte strides of the index and value streams.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_subtract_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    // Offset applied to negative indices (wrap-around).
    const npy_intp wrap = steps[3];

    for (npy_intp left = dimensions[0]; left > 0; --left) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        if (pos < 0) {
            pos += wrap;
        }
        npy_double *slot = (npy_double *)(base + base_stride * pos);
        *slot -= *(const npy_double *)val_ptr;
        idx_ptr += idx_stride;
        val_ptr += val_stride;
    }
    return 0;
}


#line 52
/*
 * Inner loop for element-wise double-precision multiplication:
 * dst = src0 * src1.
 *
 * args       - args[0] and args[1] are the operand pointers, args[2] is the
 *              destination pointer.
 * dimensions - dimensions[0] is the number of elements to process.
 * steps      - byte strides for args[0], args[1] and args[2] respectively.
 * func       - unused.
 *
 * Three paths exist: an in-place reduction (stride-0 first operand that is
 * also the destination), SIMD kernels for three contiguous layouts when
 * NPY_SIMD_F64 is enabled, and a generic strided scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_multiply)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *src0 = args[0], *src1 = args[1], *dst = args[2];
    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
    // reduce
    // First operand and destination are the same non-advancing scalar:
    // fold every element of src1 into it.
    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
    // Disabled branch: the pairwise-sum shortcut is not used for multiplication.
    #if 0
        *((npy_double*)src0) *= DOUBLE_pairwise_sum(src1, len, ssrc1);
    #else
        // Accumulate in a local and write the result back once at the end.
        npy_double acc = *((npy_double*)src0);
        // Both loops perform the same arithmetic; the first uses a
        // compile-time constant stride (presumably to help the optimizer).
        if (ssrc1 == sizeof(npy_double)) {
            for (; len > 0; --len, src1 += sizeof(npy_double)) {
                acc *= *(npy_double *)src1;
            }
        } else {
            for (; len > 0; --len, src1 += ssrc1) {
                acc *= *(npy_double *)src1;
            }
        }
        *((npy_double*)src0) = acc;
    #endif
        return;
    }
// NOTE: the guard below begins with `0 &&`, so this first branch is never
// selected here; the explanatory comment concerns the division variant.
#if 0 && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64
    /**
     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
     * support for single-precision floating-point division. Only scalar division is
     * supported natively, and without hardware for performance and accuracy comparison,
     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
     * native scalar division.
     *
     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
     * precision. However, this approach has limitations:
     *
     * - It can cause unexpected floating-point overflows in special cases, such as when
     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
     *
     * - The precision may vary between the emulated SIMD and scalar division due to
     *   non-uniform branches (non-contiguous) in the code, leading to precision
     *   inconsistencies.
     *
     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
     *   gain may not sufficiently offset these drawbacks.
     */
#elif NPY_SIMD_F64
    // SIMD is attempted only for more than two vectors' worth of elements
    // and when is_mem_overlap() reports no overlap between inputs and output.
    if (len > npyv_nlanes_f64*2 &&
        !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
        !is_mem_overlap(src1, ssrc1, dst, sdst, len)
    ) {
        // vstep/wstep are pointer increments in bytes (one / two vectors);
        // hstep/lstep are the matching element counts subtracted from len.
        const int vstep = npyv_nlanes_u8;
        const int wstep = vstep * 2;
        const int hstep = npyv_nlanes_f64;
        const int lstep = hstep * 2;
        // lots of specializations, to squeeze out max performance
        // Case 1: both inputs and the output are contiguous.
        if (ssrc0 == sizeof(npy_double) && ssrc0 == ssrc1 && ssrc0 == sdst) {
            // Main loop: two vectors per iteration.
            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) {
                npyv_f64 a0 = npyv_load_f64((const npy_double*)src0);
                npyv_f64 a1 = npyv_load_f64((const npy_double*)(src0 + vstep));
                npyv_f64 b0 = npyv_load_f64((const npy_double*)src1);
                npyv_f64 b1 = npyv_load_f64((const npy_double*)(src1 + vstep));
                npyv_f64 r0 = npyv_mul_f64(a0, b0);
                npyv_f64 r1 = npyv_mul_f64(a1, b1);
                npyv_store_f64((npy_double*)dst, r0);
                npyv_store_f64((npy_double*)(dst + vstep), r1);
            }
            // Tail: one vector per iteration using partial load/store limited
            // to the remaining `len` elements (len may go negative on exit).
            for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
            #if 0
                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, 1.0);
                npyv_f64 b = npyv_load_till_f64((const npy_double*)src1, len, 1.0);
            #else
                npyv_f64 a = npyv_load_tillz_f64((const npy_double*)src0, len);
                npyv_f64 b = npyv_load_tillz_f64((const npy_double*)src1, len);
            #endif
                npyv_f64 r = npyv_mul_f64(a, b);
                npyv_store_till_f64((npy_double*)dst, len, r);
            }
        }
        // Case 2: first operand is a broadcast scalar, second is contiguous.
        else if (ssrc0 == 0 && ssrc1 == sizeof(npy_double) && sdst == ssrc1) {
            npyv_f64 a = npyv_setall_f64(*((npy_double*)src0));
            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
                npyv_f64 b0 = npyv_load_f64((const npy_double*)src1);
                npyv_f64 b1 = npyv_load_f64((const npy_double*)(src1 + vstep));
                npyv_f64 r0 = npyv_mul_f64(a, b0);
                npyv_f64 r1 = npyv_mul_f64(a, b1);
                npyv_store_f64((npy_double*)dst, r0);
                npyv_store_f64((npy_double*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
            // Lanes beyond `len` are filled with 1.0 instead of zero
            // (presumably so the arbitrary scalar times the padding cannot
            // raise a floating-point exception — confirm).
            #if 0 || 1
                npyv_f64 b = npyv_load_till_f64((const npy_double*)src1, len, 1.0);
            #else
                npyv_f64 b = npyv_load_tillz_f64((const npy_double*)src1, len);
            #endif
                npyv_f64 r = npyv_mul_f64(a, b);
                npyv_store_till_f64((npy_double*)dst, len, r);
            }
        }
        // Case 3: second operand is a broadcast scalar, first is contiguous.
        else if (ssrc1 == 0 && ssrc0 == sizeof(npy_double) && sdst == ssrc0) {
            npyv_f64 b = npyv_setall_f64(*((npy_double*)src1));
            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
                npyv_f64 a0 = npyv_load_f64((const npy_double*)src0);
                npyv_f64 a1 = npyv_load_f64((const npy_double*)(src0 + vstep));
                npyv_f64 r0 = npyv_mul_f64(a0, b);
                npyv_f64 r1 = npyv_mul_f64(a1, b);
                npyv_store_f64((npy_double*)dst, r0);
                npyv_store_f64((npy_double*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
            // As in case 2, padding lanes are filled with 1.0 here.
            #if 1
                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, 1.0);
            #elif 0
                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, NPY_NAN);
            #else
                npyv_f64 a = npyv_load_tillz_f64((const npy_double*)src0, len);
            #endif
                npyv_f64 r = npyv_mul_f64(a, b);
                npyv_store_till_f64((npy_double*)dst, len, r);
            }
        } else {
            // Any other stride combination is handled by the scalar loop.
            goto loop_scalar;
        }
        // NOTE(review): presumably resets SIMD state after vector code — confirm.
        npyv_cleanup();
        return;
    }
loop_scalar:
#endif
    // Generic fallback: arbitrary byte strides, one element per iteration.
    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
        const npy_double a = *((npy_double*)src0);
        const npy_double b = *((npy_double*)src1);
        *((npy_double*)dst) = a * b;
    }
}

/*
 * Indexed in-place multiplication for doubles.
 *
 * For each of the dimensions[0] iterations, an npy_intp index is read from
 * args[1] and a double from args[2]; the element of args[0] located at byte
 * offset steps[0] * index is multiplied by that double.  Negative indices
 * are shifted by steps[3] before use.  steps[1] and steps[2] are the byte
 * strides of the index and value streams.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_multiply_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    // Offset applied to negative indices (wrap-around).
    const npy_intp wrap = steps[3];

    for (npy_intp left = dimensions[0]; left > 0; --left) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        if (pos < 0) {
            pos += wrap;
        }
        npy_double *slot = (npy_double *)(base + base_stride * pos);
        *slot *= *(const npy_double *)val_ptr;
        idx_ptr += idx_stride;
        val_ptr += val_stride;
    }
    return 0;
}


#line 52
/*
 * Inner loop for element-wise double-precision division: dst = src0 / src1.
 *
 * args       - args[0] and args[1] are the operand pointers, args[2] is the
 *              destination pointer.
 * dimensions - dimensions[0] is the number of elements to process.
 * steps      - byte strides for args[0], args[1] and args[2] respectively.
 * func       - unused.
 *
 * Three paths exist: an in-place reduction (stride-0 first operand that is
 * also the destination), SIMD kernels for three contiguous layouts when
 * NPY_SIMD_F64 is enabled, and a generic strided scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *src0 = args[0], *src1 = args[1], *dst = args[2];
    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
    // reduce
    // First operand and destination are the same non-advancing scalar:
    // fold every element of src1 into it.
    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
    // Disabled branch: the pairwise-sum shortcut is not used for division.
    #if 0
        *((npy_double*)src0) /= DOUBLE_pairwise_sum(src1, len, ssrc1);
    #else
        // Accumulate in a local and write the result back once at the end.
        npy_double acc = *((npy_double*)src0);
        // Both loops perform the same arithmetic; the first uses a
        // compile-time constant stride (presumably to help the optimizer).
        if (ssrc1 == sizeof(npy_double)) {
            for (; len > 0; --len, src1 += sizeof(npy_double)) {
                acc /= *(npy_double *)src1;
            }
        } else {
            for (; len > 0; --len, src1 += ssrc1) {
                acc /= *(npy_double *)src1;
            }
        }
        *((npy_double*)src0) = acc;
    #endif
        return;
    }
// NOTE: this first branch is selected only when NPY_HAVE_NEON is defined and
// NPY_SIMD_F64 is zero; it then compiles nothing, leaving only the scalar
// loop below (the `#elif NPY_SIMD_F64` branch would be false in that
// configuration as well).
#if 1 && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64
    /**
     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
     * support for single-precision floating-point division. Only scalar division is
     * supported natively, and without hardware for performance and accuracy comparison,
     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
     * native scalar division.
     *
     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
     * precision. However, this approach has limitations:
     *
     * - It can cause unexpected floating-point overflows in special cases, such as when
     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
     *
     * - The precision may vary between the emulated SIMD and scalar division due to
     *   non-uniform branches (non-contiguous) in the code, leading to precision
     *   inconsistencies.
     *
     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
     *   gain may not sufficiently offset these drawbacks.
     */
#elif NPY_SIMD_F64
    // SIMD is attempted only for more than two vectors' worth of elements
    // and when is_mem_overlap() reports no overlap between inputs and output.
    if (len > npyv_nlanes_f64*2 &&
        !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
        !is_mem_overlap(src1, ssrc1, dst, sdst, len)
    ) {
        // vstep/wstep are pointer increments in bytes (one / two vectors);
        // hstep/lstep are the matching element counts subtracted from len.
        const int vstep = npyv_nlanes_u8;
        const int wstep = vstep * 2;
        const int hstep = npyv_nlanes_f64;
        const int lstep = hstep * 2;
        // lots of specializations, to squeeze out max performance
        // Case 1: both inputs and the output are contiguous.
        if (ssrc0 == sizeof(npy_double) && ssrc0 == ssrc1 && ssrc0 == sdst) {
            // Main loop: two vectors per iteration.
            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) {
                npyv_f64 a0 = npyv_load_f64((const npy_double*)src0);
                npyv_f64 a1 = npyv_load_f64((const npy_double*)(src0 + vstep));
                npyv_f64 b0 = npyv_load_f64((const npy_double*)src1);
                npyv_f64 b1 = npyv_load_f64((const npy_double*)(src1 + vstep));
                npyv_f64 r0 = npyv_div_f64(a0, b0);
                npyv_f64 r1 = npyv_div_f64(a1, b1);
                npyv_store_f64((npy_double*)dst, r0);
                npyv_store_f64((npy_double*)(dst + vstep), r1);
            }
            // Tail: one vector per iteration using partial load/store limited
            // to the remaining `len` elements (len may go negative on exit).
            for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
            // Lanes beyond `len` are filled with 1.0 in both operands, so the
            // padding computes 1.0 / 1.0 rather than 0 / 0.
            #if 1
                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, 1.0);
                npyv_f64 b = npyv_load_till_f64((const npy_double*)src1, len, 1.0);
            #else
                npyv_f64 a = npyv_load_tillz_f64((const npy_double*)src0, len);
                npyv_f64 b = npyv_load_tillz_f64((const npy_double*)src1, len);
            #endif
                npyv_f64 r = npyv_div_f64(a, b);
                npyv_store_till_f64((npy_double*)dst, len, r);
            }
        }
        // Case 2: dividend is a broadcast scalar, divisor is contiguous.
        else if (ssrc0 == 0 && ssrc1 == sizeof(npy_double) && sdst == ssrc1) {
            npyv_f64 a = npyv_setall_f64(*((npy_double*)src0));
            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
                npyv_f64 b0 = npyv_load_f64((const npy_double*)src1);
                npyv_f64 b1 = npyv_load_f64((const npy_double*)(src1 + vstep));
                npyv_f64 r0 = npyv_div_f64(a, b0);
                npyv_f64 r1 = npyv_div_f64(a, b1);
                npyv_store_f64((npy_double*)dst, r0);
                npyv_store_f64((npy_double*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
            // Divisor padding lanes are 1.0, so unused lanes never divide by zero.
            #if 1 || 0
                npyv_f64 b = npyv_load_till_f64((const npy_double*)src1, len, 1.0);
            #else
                npyv_f64 b = npyv_load_tillz_f64((const npy_double*)src1, len);
            #endif
                npyv_f64 r = npyv_div_f64(a, b);
                npyv_store_till_f64((npy_double*)dst, len, r);
            }
        }
        // Case 3: divisor is a broadcast scalar, dividend is contiguous.
        else if (ssrc1 == 0 && ssrc0 == sizeof(npy_double) && sdst == ssrc0) {
            npyv_f64 b = npyv_setall_f64(*((npy_double*)src1));
            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
                npyv_f64 a0 = npyv_load_f64((const npy_double*)src0);
                npyv_f64 a1 = npyv_load_f64((const npy_double*)(src0 + vstep));
                npyv_f64 r0 = npyv_div_f64(a0, b);
                npyv_f64 r1 = npyv_div_f64(a1, b);
                npyv_store_f64((npy_double*)dst, r0);
                npyv_store_f64((npy_double*)(dst + vstep), r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
            // Dividend padding lanes are NaN here (presumably so that the
            // padding cannot raise a floating-point exception whatever the
            // broadcast divisor is — confirm).
            #if 0
                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, 1.0);
            #elif 1
                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, NPY_NAN);
            #else
                npyv_f64 a = npyv_load_tillz_f64((const npy_double*)src0, len);
            #endif
                npyv_f64 r = npyv_div_f64(a, b);
                npyv_store_till_f64((npy_double*)dst, len, r);
            }
        } else {
            // Any other stride combination is handled by the scalar loop.
            goto loop_scalar;
        }
        // NOTE(review): presumably resets SIMD state after vector code — confirm.
        npyv_cleanup();
        return;
    }
loop_scalar:
#endif
    // Generic fallback: arbitrary byte strides, one element per iteration.
    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
        const npy_double a = *((npy_double*)src0);
        const npy_double b = *((npy_double*)src1);
        *((npy_double*)dst) = a / b;
    }
}

/*
 * Indexed in-place division for doubles.
 *
 * For each of the dimensions[0] iterations, an npy_intp index is read from
 * args[1] and a double from args[2]; the element of args[0] located at byte
 * offset steps[0] * index is divided by that double.  Negative indices are
 * shifted by steps[3] before use.  steps[1] and steps[2] are the byte
 * strides of the index and value streams.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_divide_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    // Offset applied to negative indices (wrap-around).
    const npy_intp wrap = steps[3];

    for (npy_intp left = dimensions[0]; left > 0; --left) {
        npy_intp pos = *(const npy_intp *)idx_ptr;
        if (pos < 0) {
            pos += wrap;
        }
        npy_double *slot = (npy_double *)(base + base_stride * pos);
        *slot /= *(const npy_double *)val_ptr;
        idx_ptr += idx_stride;
        val_ptr += val_stride;
    }
    return 0;
}




//###############################################################################
//## Complex Single/Double precision
//###############################################################################

/********************************************************************************
 ** op intrinsics
 ********************************************************************************/

#if NPY_SIMD_F32
/*
 * Broadcast the pair of consecutive floats a[0], a[1] across two vectors,
 * so that both returned vectors hold a[0], a[1], a[0], a[1], ...
 *
 * Each float is broadcast separately and the two are then interleaved,
 * mirroring simd_set2_f64.  This avoids reading the float pair through an
 * `npy_uint64` lvalue, which violates C's effective-type (strict aliasing)
 * rules and may be a misaligned access when `a` is only float-aligned.
 */
NPY_FINLINE npyv_f32x2 simd_set2_f32(const float *a)
{
    const npyv_f32 first  = npyv_setall_f32(a[0]);
    const npyv_f32 second = npyv_setall_f32(a[1]);
    // Interleaving two uniform vectors yields the same pattern in both halves.
    return npyv_zip_f32(first, second);
}

/*
 * Flip the sign bit of every odd-indexed float of x, leaving even-indexed
 * floats untouched.  With the interleaved layout used by simd_cmul_f32
 * (odd lanes are the `im` parts) this negates the imaginary components.
 */
NPY_FINLINE npyv_f32
simd_cconjugate_f32(npyv_f32 x)
{
    // One 64-bit lane spans a pair of floats; which half of the 64-bit
    // constant lands on the second float depends on byte order.
#if NPY_SIMD_BIGENDIAN
    const npyv_u64 sign_bits = npyv_setall_u64(0x80000000);
#else
    const npyv_u64 sign_bits = npyv_setall_u64(0x8000000000000000ULL);
#endif
    return npyv_xor_f32(x, npyv_reinterpret_f32_u64(sign_bits));
}

/*
 * Multiply interleaved single-precision complex vectors a and b
 * (even lanes real, odd lanes imaginary).
 */
NPY_FINLINE npyv_f32
simd_cmul_f32(npyv_f32 a, npyv_f32 b)
{
    // Duplicate the real / imaginary part of `a` across each pair.
    const npyv_f32 re_dup = npyv_permi128_f32(a, 0, 0, 2, 2);
    const npyv_f32 im_dup = npyv_permi128_f32(a, 1, 1, 3, 3);
    // Swap real and imaginary parts of `b` within each pair.
    const npyv_f32 b_swapped = npyv_permi128_f32(b, 1, 0, 3, 2);
    // Per pair: (a_im * b_im, a_im * b_re)
    const npyv_f32 cross = npyv_mul_f32(im_dup, b_swapped);
    // Combine a_re * b with the cross terms; npyv_muladdsub is expected to
    // subtract in even lanes and add in odd lanes — see NPYV reference.
    return npyv_muladdsub_f32(re_dup, b, cross);
}

/* Complex square: the complex product of x with itself. */
NPY_FINLINE npyv_f32
simd_csquare_f32(npyv_f32 x)
{
    return simd_cmul_f32(x, x);
}
#endif

#if NPY_SIMD_F64

/*
 * Broadcast the pair of consecutive doubles a[0], a[1] across two vectors
 * by broadcasting each value separately and interleaving the results.
 */
NPY_FINLINE npyv_f64x2 simd_set2_f64(const double *a)
{
    const npyv_f64 first  = npyv_setall_f64(a[0]);
    const npyv_f64 second = npyv_setall_f64(a[1]);
    npyv_f64x2 interleaved = npyv_zip_f64(first, second);
    return interleaved;
}

/*
 * Flip the sign bit of every odd-indexed double of x, leaving even-indexed
 * doubles untouched.  With the interleaved layout used by simd_cmul_f64
 * (odd lanes are the `im` parts) this negates the imaginary components.
 */
NPY_FINLINE npyv_f64
simd_cconjugate_f64(npyv_f64 x)
{
    // Alternating (0, sign-bit) pattern; 32 values are supplied so the list
    // covers any vector width, with npyv_set_u64 presumably using only as
    // many as it has lanes.
    const npyv_u64 sign_bits = npyv_set_u64(
        0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
        0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
        0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
        0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL
    );
    return npyv_xor_f64(x, npyv_reinterpret_f64_u64(sign_bits));
}

/*
 * Multiply interleaved double-precision complex vectors a and b
 * (even lanes real, odd lanes imaginary).
 */
NPY_FINLINE npyv_f64
simd_cmul_f64(npyv_f64 a, npyv_f64 b)
{
    // Duplicate the real / imaginary part of `a` across each pair.
    const npyv_f64 re_dup = npyv_permi128_f64(a, 0, 0);
    const npyv_f64 im_dup = npyv_permi128_f64(a, 1, 1);
    // Swap real and imaginary parts of `b` within each pair.
    const npyv_f64 b_swapped = npyv_permi128_f64(b, 1, 0);
    // Per pair: (a_im * b_im, a_im * b_re)
    const npyv_f64 cross = npyv_mul_f64(im_dup, b_swapped);
    // Combine a_re * b with the cross terms; npyv_muladdsub is expected to
    // subtract in even lanes and add in odd lanes — see NPYV reference.
    return npyv_muladdsub_f64(re_dup, b, cross);
}

/* Complex square: the complex product of x with itself. */
NPY_FINLINE npyv_f64
simd_csquare_f64(npyv_f64 x)
{
    return simd_cmul_f64(x, x);
}
#endif

#line 310
#if NPY_SIMD_F32
/*
 * Lane-wise complex magnitude from separate real (`re`) and imaginary
 * (`im`) vectors, computed as larger * sqrt(1 + (smaller / larger)^2) with
 * explicit handling of infinities, NaNs and zero.
 */
NPY_FINLINE npyv_f32
simd_cabsolute_f32(npyv_f32 re, npyv_f32 im)
{
    const npyv_f32 pos_inf = npyv_setall_f32(NPY_INFINITYF);
    const npyv_f32 qnan    = npyv_setall_f32(NPY_NANF);
    const npyv_f32 one     = npyv_setall_f32(1.0f);

    const npyv_f32 abs_re = npyv_abs_f32(re);
    const npyv_f32 abs_im = npyv_abs_f32(im);

    /*
     * An infinite component forces the other one to infinity as well,
     * covering inf + j*nan and nan + j*inf.
     */
    const npyv_b32 re_is_inf = npyv_cmpeq_f32(abs_re, pos_inf);
    const npyv_b32 im_is_inf = npyv_cmpeq_f32(abs_im, pos_inf);
    const npyv_f32 im_infd = npyv_select_f32(re_is_inf, pos_inf, abs_im);
    const npyv_f32 re_infd = npyv_select_f32(im_is_inf, pos_inf, abs_re);

    /*
     * A remaining NaN component forces the other one to NaN as well,
     * covering x + j*nan and nan + j*x.
     */
    const npyv_b32 re_not_nan = npyv_notnan_f32(re_infd);
    const npyv_b32 im_not_nan = npyv_notnan_f32(im_infd);
    const npyv_f32 im_clean = npyv_select_f32(re_not_nan, im_infd, qnan);
    const npyv_f32 re_clean = npyv_select_f32(im_not_nan, re_infd, qnan);

    const npyv_f32 mag_hi = npyv_max_f32(re_clean, im_clean);
    const npyv_f32 mag_lo = npyv_min_f32(im_clean, re_clean);

    /*
     * Skip the division where it would be 0/0 or inf/inf; the masked
     * divide is expected to produce zero in those lanes.
     */
    const npyv_b32 hi_is_zero = npyv_cmpeq_f32(mag_hi, npyv_zero_f32());
    const npyv_b32 lo_is_inf  = npyv_cmpeq_f32(mag_lo, pos_inf);
    const npyv_b32 can_divide = npyv_not_b32(npyv_or_b32(hi_is_zero, lo_is_inf));

    const npyv_f32 quotient = npyv_ifdivz_f32(can_divide, mag_lo, mag_hi);
    const npyv_f32 scale = npyv_sqrt_f32(npyv_muladd_f32(quotient, quotient, one));
    return npyv_mul_f32(scale, mag_hi);
}
#endif // VECTOR

#line 310
#if NPY_SIMD_F64
/*
 * Lane-wise complex magnitude from separate real (`re`) and imaginary
 * (`im`) vectors, computed as larger * sqrt(1 + (smaller / larger)^2) with
 * explicit handling of infinities, NaNs and zero.
 */
NPY_FINLINE npyv_f64
simd_cabsolute_f64(npyv_f64 re, npyv_f64 im)
{
    const npyv_f64 pos_inf = npyv_setall_f64(NPY_INFINITY);
    const npyv_f64 qnan    = npyv_setall_f64(NPY_NAN);
    const npyv_f64 one     = npyv_setall_f64(1.0);

    const npyv_f64 abs_re = npyv_abs_f64(re);
    const npyv_f64 abs_im = npyv_abs_f64(im);

    /*
     * An infinite component forces the other one to infinity as well,
     * covering inf + j*nan and nan + j*inf.
     */
    const npyv_b64 re_is_inf = npyv_cmpeq_f64(abs_re, pos_inf);
    const npyv_b64 im_is_inf = npyv_cmpeq_f64(abs_im, pos_inf);
    const npyv_f64 im_infd = npyv_select_f64(re_is_inf, pos_inf, abs_im);
    const npyv_f64 re_infd = npyv_select_f64(im_is_inf, pos_inf, abs_re);

    /*
     * A remaining NaN component forces the other one to NaN as well,
     * covering x + j*nan and nan + j*x.
     */
    const npyv_b64 re_not_nan = npyv_notnan_f64(re_infd);
    const npyv_b64 im_not_nan = npyv_notnan_f64(im_infd);
    const npyv_f64 im_clean = npyv_select_f64(re_not_nan, im_infd, qnan);
    const npyv_f64 re_clean = npyv_select_f64(im_not_nan, re_infd, qnan);

    const npyv_f64 mag_hi = npyv_max_f64(re_clean, im_clean);
    const npyv_f64 mag_lo = npyv_min_f64(im_clean, re_clean);

    /*
     * Skip the division where it would be 0/0 or inf/inf; the masked
     * divide is expected to produce zero in those lanes.
     */
    const npyv_b64 hi_is_zero = npyv_cmpeq_f64(mag_hi, npyv_zero_f64());
    const npyv_b64 lo_is_inf  = npyv_cmpeq_f64(mag_lo, pos_inf);
    const npyv_b64 can_divide = npyv_not_b64(npyv_or_b64(hi_is_zero, lo_is_inf));

    const npyv_f64 quotient = npyv_ifdivz_f64(can_divide, mag_lo, mag_hi);
    const npyv_f64 scale = npyv_sqrt_f64(npyv_muladd_f64(quotient, quotient, one));
    return npyv_mul_f64(scale, mag_hi);
}
#endif // VECTOR


/********************************************************************************
 ** Defining ufunc inner functions
 ********************************************************************************/
#line 366
#line 374
/*
 * ufunc inner loop: element-wise addition of single-precision complex values,
 * each stored as two consecutive npy_float (real part, imaginary part).
 *
 *   args[0], args[1]   input operands
 *   args[2]            output
 *   dimensions[0]      number of complex elements to process
 *   steps[0..2]        byte strides belonging to args[0..2]
 *
 * Strategies, in the order they are tried:
 *   1. pairwise-sum reduction (first operand aliases the output, zero stride)
 *   2. SIMD specializations, compiled only when NPY_SIMD_F32 is set
 *   3. the scalar loop at the bottom, which is also the fallback of (2)
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CFLOAT_add)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2];
    npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2];
#if 1
    // reduce: first operand and output are the same location with a zero
    // stride, and the second operand's byte stride is a multiple of one
    // complex element.
    if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst &&
        b_ssrc1 % (sizeof(npy_float)*2) == 0
    ) {
        npy_float *rl_im = (npy_float *)b_src0;
        npy_float rr, ri;
        // NOTE(review): count is doubled and stride halved, presumably because
        // the helper walks individual floats - confirm against
        // CFLOAT_pairwise_sum.
        CFLOAT_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2);
        // Fold the summed real/imag parts into the accumulator element.
        rl_im[0] += rr;
        rl_im[1] += ri;
        return;
    }
#endif
#if NPY_SIMD_F32
    // Certain versions of Apple clang (commonly used in CI images) produce
    // non-deterministic output in the mul path with AVX2 enabled on x86_64.
    // Work around by scalarising.
    // (The leading `0` below compiles this workaround out for add.)
    #if 0 \
            && defined(NPY_CPU_AMD64) && defined(__clang__) \
            && defined(__apple_build_version__) \
            && __apple_build_version__ >= 14000000 \
            && __apple_build_version__ < 14030000
        goto loop_scalar;
    #endif  // end affected Apple clang.
    // The SIMD paths require: no overlap between either input and the output,
    // a non-zero output stride, and all byte strides being whole multiples of
    // sizeof(npy_float). Anything else is handled by the scalar loop.
    if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||
        is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||
        b_sdst  % sizeof(npy_float) != 0 || b_sdst == 0 ||
        b_ssrc0 % sizeof(npy_float) != 0 ||
        b_ssrc1 % sizeof(npy_float) != 0
    ) {
        goto loop_scalar;
    }
    const npy_float *src0 = (npy_float*)b_src0;
    const npy_float *src1 = (npy_float*)b_src1;
          npy_float *dst  = (npy_float*)b_dst;

    // Strides re-expressed in npy_float units: 2 means tightly packed complex
    // values, 0 means the operand is a single repeated element.
    // NOTE(review): the divisor is a size_t, so if npy_intp is signed a
    // negative byte stride is converted to unsigned before the division -
    // confirm negative strides behave as intended here.
    const npy_intp ssrc0 = b_ssrc0 / sizeof(npy_float);
    const npy_intp ssrc1 = b_ssrc1 / sizeof(npy_float);
    const npy_intp sdst  = b_sdst / sizeof(npy_float);

    // vstep: floats per vector, and also the number of complex elements one
    //        pass of a main loop consumes (two vectors).
    // wstep: floats covered by two vectors.
    // hstep: complex elements held by a single vector.
    const int vstep = npyv_nlanes_f32;
    const int wstep = vstep * 2;
    const int hstep = vstep / 2;

    const int loadable0 = npyv_loadable_stride_s64(ssrc0);
    const int loadable1 = npyv_loadable_stride_s64(ssrc1);
    const int storable = npyv_storable_stride_s64(sdst);

    // lots**lots of specializations, to squeeze out max performance
    // contig: both inputs and the output are tightly packed
    if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
        // main loop: two full vectors per operand per pass
        for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) {
            npyv_f32 a0 = npyv_load_f32(src0);
            npyv_f32 a1 = npyv_load_f32(src0 + vstep);
            npyv_f32 b0 = npyv_load_f32(src1);
            npyv_f32 b1 = npyv_load_f32(src1 + vstep);
            npyv_f32 r0 = npyv_add_f32(a0, b0);
            npyv_f32 r1 = npyv_add_f32(a1, b1);
            npyv_store_f32(dst, r0);
            npyv_store_f32(dst + vstep, r1);
        }
        // tail: one vector (hstep complex elements) per pass, with the
        // partial load/store bounded by the remaining `len`.
        // NOTE(review): npyv_load2_tillz_f32 presumably zero-fills the unused
        // lanes - confirm against the NPYV headers.
        for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
            npyv_f32 a = npyv_load2_tillz_f32(src0, len);
            npyv_f32 b = npyv_load2_tillz_f32(src1, len);
            npyv_f32 r = npyv_add_f32(a, b);
            npyv_store2_till_f32(dst, len, r);
        }
    }
    // scalar 0: the first operand has stride 0 (one repeated complex value)
    else if (ssrc0 == 0) {
        // NOTE(review): simd_set2_f32 is defined outside this excerpt;
        // presumably it broadcasts the (real, imag) pair at src0 - confirm.
        npyv_f32x2 a = simd_set2_f32(src0);
        // contig
        if (ssrc1 == 2 && sdst == ssrc1) {
            for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) {
                npyv_f32 b0 = npyv_load_f32(src1);
                npyv_f32 b1 = npyv_load_f32(src1 + vstep);
                npyv_f32 r0 = npyv_add_f32(a.val[0], b0);
                npyv_f32 r1 = npyv_add_f32(a.val[1], b1);
                npyv_store_f32(dst, r0);
                npyv_store_f32(dst + vstep, r1);
            }
            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
            // The 1.0f-filled variant is compiled out for add.
            #if 0
                npyv_f32 b = npyv_load2_till_f32(src1, len, 1.0f, 1.0f);
            #else
                npyv_f32 b = npyv_load2_tillz_f32(src1, len);
            #endif
                npyv_f32 r = npyv_add_f32(a.val[0], b);
                npyv_store2_till_f32(dst, len, r);
            }
        }
        // non-contig: strided pair loads/stores, if the strides are usable
        else if (loadable1 && storable) {
            // The second vector of each pass starts hstep complex elements
            // after the first one.
            for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
                npyv_f32 b0 = npyv_loadn2_f32(src1, ssrc1);
                npyv_f32 b1 = npyv_loadn2_f32(src1 + ssrc1*hstep, ssrc1);
                npyv_f32 r0 = npyv_add_f32(a.val[0], b0);
                npyv_f32 r1 = npyv_add_f32(a.val[1], b1);
                npyv_storen2_f32(dst, sdst, r0);
                npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
            }
            for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) {
            #if 0
                npyv_f32 b = npyv_loadn2_till_f32(src1, ssrc1, len, 1.0f, 1.0f);
            #else
                npyv_f32 b = npyv_loadn2_tillz_f32(src1, ssrc1, len);
            #endif
                npyv_f32 r = npyv_add_f32(a.val[0], b);
                npyv_storen2_till_f32(dst, sdst, len, r);
            }
        }
        else {
            goto loop_scalar;
        }
    }
    // scalar 1: the second operand has stride 0 (mirror of the case above)
    else if (ssrc1 == 0) {
        npyv_f32x2 b = simd_set2_f32(src1);
        if (ssrc0 == 2 && sdst == ssrc0) {
            for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) {
                npyv_f32 a0 = npyv_load_f32(src0);
                npyv_f32 a1 = npyv_load_f32(src0 + vstep);
                npyv_f32 r0 = npyv_add_f32(a0, b.val[0]);
                npyv_f32 r1 = npyv_add_f32(a1, b.val[1]);
                npyv_store_f32(dst, r0);
                npyv_store_f32(dst + vstep, r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
            #if 0
                npyv_f32 a = npyv_load2_till_f32(src0, len, 1.0f, 1.0f);
            #else
                npyv_f32 a = npyv_load2_tillz_f32(src0, len);
            #endif
                npyv_f32 r = npyv_add_f32(a, b.val[0]);
                npyv_store2_till_f32(dst, len, r);
            }
        }
        // non-contig
        else if (loadable0 && storable) {
            for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
                npyv_f32 a0 = npyv_loadn2_f32(src0, ssrc0);
                npyv_f32 a1 = npyv_loadn2_f32(src0 + ssrc0*hstep, ssrc0);
                npyv_f32 r0 = npyv_add_f32(a0, b.val[0]);
                npyv_f32 r1 = npyv_add_f32(a1, b.val[1]);
                npyv_storen2_f32(dst, sdst, r0);
                npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
            }
            for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) {
            #if 0
                npyv_f32 a = npyv_loadn2_till_f32(src0, ssrc0, len, 1.0f, 1.0f);
            #else
                npyv_f32 a = npyv_loadn2_tillz_f32(src0, ssrc0, len);
            #endif
                npyv_f32 r = npyv_add_f32(a, b.val[0]);
                npyv_storen2_till_f32(dst, sdst, len, r);
            }
        }
        else {
            goto loop_scalar;
        }
    }
    // The fully strided SIMD specialization below is compiled out for add, so
    // every remaining stride combination goes to the scalar loop.
    #if 0
    // non-contig
    else if (loadable0 && loadable1 && storable) {
        for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
                            src1 += ssrc1*vstep, dst += sdst*vstep
        ) {
            npyv_f32 a0 = npyv_loadn2_f32(src0, ssrc0);
            npyv_f32 a1 = npyv_loadn2_f32(src0 + ssrc0*hstep, ssrc0);
            npyv_f32 b0 = npyv_loadn2_f32(src1, ssrc1);
            npyv_f32 b1 = npyv_loadn2_f32(src1 + ssrc1*hstep, ssrc1);
            npyv_f32 r0 = npyv_add_f32(a0, b0);
            npyv_f32 r1 = npyv_add_f32(a1, b1);
            npyv_storen2_f32(dst, sdst, r0);
            npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
        }
        for (; len > 0; len -= hstep, src0 += ssrc0*hstep,
                       src1 += ssrc1*hstep, dst += sdst*hstep
        ) {
        #if 0
            npyv_f32 a = npyv_loadn2_till_f32(src0, ssrc0, len, 1.0f, 1.0f);
            npyv_f32 b = npyv_loadn2_till_f32(src1, ssrc1, len, 1.0f, 1.0f);
        #else
            npyv_f32 a = npyv_loadn2_tillz_f32(src0, ssrc0, len);
            npyv_f32 b = npyv_loadn2_tillz_f32(src1, ssrc1, len);
        #endif
            npyv_f32 r = npyv_add_f32(a, b);
            npyv_storen2_till_f32(dst, sdst, len, r);
        }
    }
    #endif
    else {
        goto loop_scalar;
    }
    // Reached only after one of the SIMD specializations processed everything.
    npyv_cleanup();
    return;
loop_scalar:
#endif
    // Scalar path: uses the untouched byte pointers and byte strides, one
    // complex element per iteration. Both operands are read before the
    // output is written.
    for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {
        const npy_float a_r = ((npy_float *)b_src0)[0];
        const npy_float a_i = ((npy_float *)b_src0)[1];
        const npy_float b_r = ((npy_float *)b_src1)[0];
        const npy_float b_i = ((npy_float *)b_src1)[1];
    // The complex-product branch is compiled out for add.
    #if 0
        ((npy_float *)b_dst)[0] = a_r*b_r - a_i*b_i;
        ((npy_float *)b_dst)[1] = a_r*b_i + a_i*b_r;
    #else
        ((npy_float *)b_dst)[0] = a_r + b_r;
        ((npy_float *)b_dst)[1] = a_i + b_i;
    #endif
    }
}

/*
 * Indexed in-place complex addition: for every (index, value) pair, add the
 * complex value to the element of the first operand selected by the index.
 *
 *   args[0]        base of the array being updated
 *   args[1]        npy_intp indices
 *   args[2]        complex values (real, imag as npy_float)
 *   dimensions[0]  number of (index, value) pairs
 *   steps[0..2]    byte strides for args[0..2]
 *   steps[3]       length added to negative indices to wrap them
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(CFLOAT_add_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_ptr = args[1];
    char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap_len = steps[3];

    for (npy_intp left = dimensions[0]; left > 0;
         --left, idx_ptr += idx_stride, val_ptr += val_stride) {
        npy_intp pos = *(npy_intp *)idx_ptr;
        // Negative indices count from the end.
        if (pos < 0) {
            pos += wrap_len;
        }
        npy_float *const target = (npy_float *)(base + base_stride * pos);
        // Read both parts of the operand before touching the target.
        const npy_float b_r = ((npy_float *)val_ptr)[0];
        const npy_float b_i = ((npy_float *)val_ptr)[1];
        target[0] += b_r;
        target[1] += b_i;
    }
    return 0;
}

#line 374
/*
 * ufunc inner loop: element-wise subtraction (args[0] - args[1]) of
 * single-precision complex values, each stored as two consecutive npy_float
 * (real part, imaginary part).
 *
 *   args[0], args[1]   input operands (minuend, subtrahend)
 *   args[2]            output
 *   dimensions[0]      number of complex elements to process
 *   steps[0..2]        byte strides belonging to args[0..2]
 *
 * The reduction fast path is compiled out for this operation; the SIMD
 * specializations (only when NPY_SIMD_F32 is set) are tried first and the
 * scalar loop at the bottom is the fallback.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CFLOAT_subtract)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2];
    npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2];
// Pairwise-sum reduction: disabled for subtract.
#if 0
    // reduce
    if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst &&
        b_ssrc1 % (sizeof(npy_float)*2) == 0
    ) {
        npy_float *rl_im = (npy_float *)b_src0;
        npy_float rr, ri;
        CFLOAT_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2);
        rl_im[0] -= rr;
        rl_im[1] -= ri;
        return;
    }
#endif
#if NPY_SIMD_F32
    // Certain versions of Apple clang (commonly used in CI images) produce
    // non-deterministic output in the mul path with AVX2 enabled on x86_64.
    // Work around by scalarising.
    // (The leading `0` below compiles this workaround out for subtract.)
    #if 0 \
            && defined(NPY_CPU_AMD64) && defined(__clang__) \
            && defined(__apple_build_version__) \
            && __apple_build_version__ >= 14000000 \
            && __apple_build_version__ < 14030000
        goto loop_scalar;
    #endif  // end affected Apple clang.
    // The SIMD paths require: no overlap between either input and the output,
    // a non-zero output stride, and all byte strides being whole multiples of
    // sizeof(npy_float). Anything else is handled by the scalar loop.
    if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||
        is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||
        b_sdst  % sizeof(npy_float) != 0 || b_sdst == 0 ||
        b_ssrc0 % sizeof(npy_float) != 0 ||
        b_ssrc1 % sizeof(npy_float) != 0
    ) {
        goto loop_scalar;
    }
    const npy_float *src0 = (npy_float*)b_src0;
    const npy_float *src1 = (npy_float*)b_src1;
          npy_float *dst  = (npy_float*)b_dst;

    // Strides re-expressed in npy_float units: 2 means tightly packed complex
    // values, 0 means the operand is a single repeated element.
    // NOTE(review): the divisor is a size_t, so if npy_intp is signed a
    // negative byte stride is converted to unsigned before the division -
    // confirm negative strides behave as intended here.
    const npy_intp ssrc0 = b_ssrc0 / sizeof(npy_float);
    const npy_intp ssrc1 = b_ssrc1 / sizeof(npy_float);
    const npy_intp sdst  = b_sdst / sizeof(npy_float);

    // vstep: floats per vector, and also the number of complex elements one
    //        pass of a main loop consumes (two vectors).
    // wstep: floats covered by two vectors.
    // hstep: complex elements held by a single vector.
    const int vstep = npyv_nlanes_f32;
    const int wstep = vstep * 2;
    const int hstep = vstep / 2;

    const int loadable0 = npyv_loadable_stride_s64(ssrc0);
    const int loadable1 = npyv_loadable_stride_s64(ssrc1);
    const int storable = npyv_storable_stride_s64(sdst);

    // lots**lots of specializations, to squeeze out max performance
    // contig: both inputs and the output are tightly packed
    if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
        // main loop: two full vectors per operand per pass
        for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) {
            npyv_f32 a0 = npyv_load_f32(src0);
            npyv_f32 a1 = npyv_load_f32(src0 + vstep);
            npyv_f32 b0 = npyv_load_f32(src1);
            npyv_f32 b1 = npyv_load_f32(src1 + vstep);
            npyv_f32 r0 = npyv_sub_f32(a0, b0);
            npyv_f32 r1 = npyv_sub_f32(a1, b1);
            npyv_store_f32(dst, r0);
            npyv_store_f32(dst + vstep, r1);
        }
        // tail: one vector (hstep complex elements) per pass, with the
        // partial load/store bounded by the remaining `len`.
        // NOTE(review): npyv_load2_tillz_f32 presumably zero-fills the unused
        // lanes - confirm against the NPYV headers.
        for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
            npyv_f32 a = npyv_load2_tillz_f32(src0, len);
            npyv_f32 b = npyv_load2_tillz_f32(src1, len);
            npyv_f32 r = npyv_sub_f32(a, b);
            npyv_store2_till_f32(dst, len, r);
        }
    }
    // scalar 0: the minuend has stride 0 (one repeated complex value)
    else if (ssrc0 == 0) {
        // NOTE(review): simd_set2_f32 is defined outside this excerpt;
        // presumably it broadcasts the (real, imag) pair at src0 - confirm.
        npyv_f32x2 a = simd_set2_f32(src0);
        // contig
        if (ssrc1 == 2 && sdst == ssrc1) {
            for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) {
                npyv_f32 b0 = npyv_load_f32(src1);
                npyv_f32 b1 = npyv_load_f32(src1 + vstep);
                npyv_f32 r0 = npyv_sub_f32(a.val[0], b0);
                npyv_f32 r1 = npyv_sub_f32(a.val[1], b1);
                npyv_store_f32(dst, r0);
                npyv_store_f32(dst + vstep, r1);
            }
            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
            // The 1.0f-filled variant is compiled out for subtract.
            #if 0
                npyv_f32 b = npyv_load2_till_f32(src1, len, 1.0f, 1.0f);
            #else
                npyv_f32 b = npyv_load2_tillz_f32(src1, len);
            #endif
                npyv_f32 r = npyv_sub_f32(a.val[0], b);
                npyv_store2_till_f32(dst, len, r);
            }
        }
        // non-contig: strided pair loads/stores, if the strides are usable
        else if (loadable1 && storable) {
            // The second vector of each pass starts hstep complex elements
            // after the first one.
            for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
                npyv_f32 b0 = npyv_loadn2_f32(src1, ssrc1);
                npyv_f32 b1 = npyv_loadn2_f32(src1 + ssrc1*hstep, ssrc1);
                npyv_f32 r0 = npyv_sub_f32(a.val[0], b0);
                npyv_f32 r1 = npyv_sub_f32(a.val[1], b1);
                npyv_storen2_f32(dst, sdst, r0);
                npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
            }
            for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) {
            #if 0
                npyv_f32 b = npyv_loadn2_till_f32(src1, ssrc1, len, 1.0f, 1.0f);
            #else
                npyv_f32 b = npyv_loadn2_tillz_f32(src1, ssrc1, len);
            #endif
                npyv_f32 r = npyv_sub_f32(a.val[0], b);
                npyv_storen2_till_f32(dst, sdst, len, r);
            }
        }
        else {
            goto loop_scalar;
        }
    }
    // scalar 1: the subtrahend has stride 0 (one repeated complex value)
    else if (ssrc1 == 0) {
        npyv_f32x2 b = simd_set2_f32(src1);
        if (ssrc0 == 2 && sdst == ssrc0) {
            for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) {
                npyv_f32 a0 = npyv_load_f32(src0);
                npyv_f32 a1 = npyv_load_f32(src0 + vstep);
                npyv_f32 r0 = npyv_sub_f32(a0, b.val[0]);
                npyv_f32 r1 = npyv_sub_f32(a1, b.val[1]);
                npyv_store_f32(dst, r0);
                npyv_store_f32(dst + vstep, r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
            #if 0
                npyv_f32 a = npyv_load2_till_f32(src0, len, 1.0f, 1.0f);
            #else
                npyv_f32 a = npyv_load2_tillz_f32(src0, len);
            #endif
                npyv_f32 r = npyv_sub_f32(a, b.val[0]);
                npyv_store2_till_f32(dst, len, r);
            }
        }
        // non-contig
        else if (loadable0 && storable) {
            for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
                npyv_f32 a0 = npyv_loadn2_f32(src0, ssrc0);
                npyv_f32 a1 = npyv_loadn2_f32(src0 + ssrc0*hstep, ssrc0);
                npyv_f32 r0 = npyv_sub_f32(a0, b.val[0]);
                npyv_f32 r1 = npyv_sub_f32(a1, b.val[1]);
                npyv_storen2_f32(dst, sdst, r0);
                npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
            }
            for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) {
            #if 0
                npyv_f32 a = npyv_loadn2_till_f32(src0, ssrc0, len, 1.0f, 1.0f);
            #else
                npyv_f32 a = npyv_loadn2_tillz_f32(src0, ssrc0, len);
            #endif
                npyv_f32 r = npyv_sub_f32(a, b.val[0]);
                npyv_storen2_till_f32(dst, sdst, len, r);
            }
        }
        else {
            goto loop_scalar;
        }
    }
    // The fully strided SIMD specialization below is compiled out for
    // subtract, so every remaining stride combination goes to the scalar loop.
    #if 0
    // non-contig
    else if (loadable0 && loadable1 && storable) {
        for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
                            src1 += ssrc1*vstep, dst += sdst*vstep
        ) {
            npyv_f32 a0 = npyv_loadn2_f32(src0, ssrc0);
            npyv_f32 a1 = npyv_loadn2_f32(src0 + ssrc0*hstep, ssrc0);
            npyv_f32 b0 = npyv_loadn2_f32(src1, ssrc1);
            npyv_f32 b1 = npyv_loadn2_f32(src1 + ssrc1*hstep, ssrc1);
            npyv_f32 r0 = npyv_sub_f32(a0, b0);
            npyv_f32 r1 = npyv_sub_f32(a1, b1);
            npyv_storen2_f32(dst, sdst, r0);
            npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
        }
        for (; len > 0; len -= hstep, src0 += ssrc0*hstep,
                       src1 += ssrc1*hstep, dst += sdst*hstep
        ) {
        #if 0
            npyv_f32 a = npyv_loadn2_till_f32(src0, ssrc0, len, 1.0f, 1.0f);
            npyv_f32 b = npyv_loadn2_till_f32(src1, ssrc1, len, 1.0f, 1.0f);
        #else
            npyv_f32 a = npyv_loadn2_tillz_f32(src0, ssrc0, len);
            npyv_f32 b = npyv_loadn2_tillz_f32(src1, ssrc1, len);
        #endif
            npyv_f32 r = npyv_sub_f32(a, b);
            npyv_storen2_till_f32(dst, sdst, len, r);
        }
    }
    #endif
    else {
        goto loop_scalar;
    }
    // Reached only after one of the SIMD specializations processed everything.
    npyv_cleanup();
    return;
loop_scalar:
#endif
    // Scalar path: uses the untouched byte pointers and byte strides, one
    // complex element per iteration. Both operands are read before the
    // output is written.
    for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {
        const npy_float a_r = ((npy_float *)b_src0)[0];
        const npy_float a_i = ((npy_float *)b_src0)[1];
        const npy_float b_r = ((npy_float *)b_src1)[0];
        const npy_float b_i = ((npy_float *)b_src1)[1];
    // The complex-product branch is compiled out for subtract.
    #if 0
        ((npy_float *)b_dst)[0] = a_r*b_r - a_i*b_i;
        ((npy_float *)b_dst)[1] = a_r*b_i + a_i*b_r;
    #else
        ((npy_float *)b_dst)[0] = a_r - b_r;
        ((npy_float *)b_dst)[1] = a_i - b_i;
    #endif
    }
}

/*
 * Indexed in-place complex subtraction: for every (index, value) pair,
 * subtract the complex value from the element of the first operand selected
 * by the index.
 *
 *   args[0]        base of the array being updated
 *   args[1]        npy_intp indices
 *   args[2]        complex values (real, imag as npy_float)
 *   dimensions[0]  number of (index, value) pairs
 *   steps[0..2]    byte strides for args[0..2]
 *   steps[3]       length added to negative indices to wrap them
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(CFLOAT_subtract_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_ptr = args[1];
    char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap_len = steps[3];

    for (npy_intp left = dimensions[0]; left > 0;
         --left, idx_ptr += idx_stride, val_ptr += val_stride) {
        npy_intp pos = *(npy_intp *)idx_ptr;
        // Negative indices count from the end.
        if (pos < 0) {
            pos += wrap_len;
        }
        npy_float *const target = (npy_float *)(base + base_stride * pos);
        // Read both parts of the operand before touching the target.
        const npy_float b_r = ((npy_float *)val_ptr)[0];
        const npy_float b_i = ((npy_float *)val_ptr)[1];
        target[0] -= b_r;
        target[1] -= b_i;
    }
    return 0;
}

#line 374
/*
 * ufunc inner loop: element-wise complex multiplication of single-precision
 * complex values, each stored as two consecutive npy_float (real part,
 * imaginary part).
 *
 *   args[0], args[1]   input operands
 *   args[2]            output
 *   dimensions[0]      number of complex elements to process
 *   steps[0..2]        byte strides belonging to args[0..2]
 *
 * The reduction fast path is compiled out for this operation. The SIMD
 * specializations (only when NPY_SIMD_F32 is set) are tried first; the scalar
 * loop at the bottom is the fallback and computes
 *   (a_r*b_r - a_i*b_i) + j*(a_r*b_i + a_i*b_r).
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CFLOAT_multiply)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2];
    npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2];
// Pairwise-sum reduction: disabled for multiply.
#if 0
    // reduce
    if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst &&
        b_ssrc1 % (sizeof(npy_float)*2) == 0
    ) {
        npy_float *rl_im = (npy_float *)b_src0;
        npy_float rr, ri;
        CFLOAT_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2);
        rl_im[0] *= rr;
        rl_im[1] *= ri;
        return;
    }
#endif
#if NPY_SIMD_F32
    // Certain versions of Apple clang (commonly used in CI images) produce
    // non-deterministic output in the mul path with AVX2 enabled on x86_64.
    // Work around by scalarising.
    // (The leading `1` keeps this workaround active for multiply: on the
    // affected compiler versions the whole SIMD section is skipped.)
    #if 1 \
            && defined(NPY_CPU_AMD64) && defined(__clang__) \
            && defined(__apple_build_version__) \
            && __apple_build_version__ >= 14000000 \
            && __apple_build_version__ < 14030000
        goto loop_scalar;
    #endif  // end affected Apple clang.
    // The SIMD paths require: no overlap between either input and the output,
    // a non-zero output stride, and all byte strides being whole multiples of
    // sizeof(npy_float). Anything else is handled by the scalar loop.
    if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||
        is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||
        b_sdst  % sizeof(npy_float) != 0 || b_sdst == 0 ||
        b_ssrc0 % sizeof(npy_float) != 0 ||
        b_ssrc1 % sizeof(npy_float) != 0
    ) {
        goto loop_scalar;
    }
    const npy_float *src0 = (npy_float*)b_src0;
    const npy_float *src1 = (npy_float*)b_src1;
          npy_float *dst  = (npy_float*)b_dst;

    // Strides re-expressed in npy_float units: 2 means tightly packed complex
    // values, 0 means the operand is a single repeated element.
    // NOTE(review): the divisor is a size_t, so if npy_intp is signed a
    // negative byte stride is converted to unsigned before the division -
    // confirm negative strides behave as intended here.
    const npy_intp ssrc0 = b_ssrc0 / sizeof(npy_float);
    const npy_intp ssrc1 = b_ssrc1 / sizeof(npy_float);
    const npy_intp sdst  = b_sdst / sizeof(npy_float);

    // vstep: floats per vector, and also the number of complex elements one
    //        pass of a main loop consumes (two vectors).
    // wstep: floats covered by two vectors.
    // hstep: complex elements held by a single vector.
    const int vstep = npyv_nlanes_f32;
    const int wstep = vstep * 2;
    const int hstep = vstep / 2;

    const int loadable0 = npyv_loadable_stride_s64(ssrc0);
    const int loadable1 = npyv_loadable_stride_s64(ssrc1);
    const int storable = npyv_storable_stride_s64(sdst);

    // lots**lots of specializations, to squeeze out max performance
    // contig: both inputs and the output are tightly packed
    if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
        // main loop: two full vectors per operand per pass.
        // simd_cmul_f32 is the complex-multiply helper defined outside this
        // excerpt.
        for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) {
            npyv_f32 a0 = npyv_load_f32(src0);
            npyv_f32 a1 = npyv_load_f32(src0 + vstep);
            npyv_f32 b0 = npyv_load_f32(src1);
            npyv_f32 b1 = npyv_load_f32(src1 + vstep);
            npyv_f32 r0 = simd_cmul_f32(a0, b0);
            npyv_f32 r1 = simd_cmul_f32(a1, b1);
            npyv_store_f32(dst, r0);
            npyv_store_f32(dst + vstep, r1);
        }
        // tail: one vector (hstep complex elements) per pass, with the
        // partial load/store bounded by the remaining `len`.
        // NOTE(review): npyv_load2_tillz_f32 presumably zero-fills the unused
        // lanes - confirm against the NPYV headers.
        for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
            npyv_f32 a = npyv_load2_tillz_f32(src0, len);
            npyv_f32 b = npyv_load2_tillz_f32(src1, len);
            npyv_f32 r = simd_cmul_f32(a, b);
            npyv_store2_till_f32(dst, len, r);
        }
    }
    // scalar 0: the first operand has stride 0 (one repeated complex value)
    else if (ssrc0 == 0) {
        // NOTE(review): simd_set2_f32 is defined outside this excerpt;
        // presumably it broadcasts the (real, imag) pair at src0 - confirm.
        npyv_f32x2 a = simd_set2_f32(src0);
        // contig
        if (ssrc1 == 2 && sdst == ssrc1) {
            for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) {
                npyv_f32 b0 = npyv_load_f32(src1);
                npyv_f32 b1 = npyv_load_f32(src1 + vstep);
                npyv_f32 r0 = simd_cmul_f32(a.val[0], b0);
                npyv_f32 r1 = simd_cmul_f32(a.val[1], b1);
                npyv_store_f32(dst, r0);
                npyv_store_f32(dst + vstep, r1);
            }
            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
            // For multiply the partial load is given 1.0f fill values instead
            // of using the zero-filling variant.
            // NOTE(review): presumably so the unused lanes hold benign values
            // when multiplied by the broadcast operand - confirm.
            #if 1
                npyv_f32 b = npyv_load2_till_f32(src1, len, 1.0f, 1.0f);
            #else
                npyv_f32 b = npyv_load2_tillz_f32(src1, len);
            #endif
                npyv_f32 r = simd_cmul_f32(a.val[0], b);
                npyv_store2_till_f32(dst, len, r);
            }
        }
        // non-contig: strided pair loads/stores, if the strides are usable
        else if (loadable1 && storable) {
            // The second vector of each pass starts hstep complex elements
            // after the first one.
            for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
                npyv_f32 b0 = npyv_loadn2_f32(src1, ssrc1);
                npyv_f32 b1 = npyv_loadn2_f32(src1 + ssrc1*hstep, ssrc1);
                npyv_f32 r0 = simd_cmul_f32(a.val[0], b0);
                npyv_f32 r1 = simd_cmul_f32(a.val[1], b1);
                npyv_storen2_f32(dst, sdst, r0);
                npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
            }
            for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) {
            #if 1
                npyv_f32 b = npyv_loadn2_till_f32(src1, ssrc1, len, 1.0f, 1.0f);
            #else
                npyv_f32 b = npyv_loadn2_tillz_f32(src1, ssrc1, len);
            #endif
                npyv_f32 r = simd_cmul_f32(a.val[0], b);
                npyv_storen2_till_f32(dst, sdst, len, r);
            }
        }
        else {
            goto loop_scalar;
        }
    }
    // scalar 1: the second operand has stride 0 (mirror of the case above)
    else if (ssrc1 == 0) {
        npyv_f32x2 b = simd_set2_f32(src1);
        if (ssrc0 == 2 && sdst == ssrc0) {
            for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) {
                npyv_f32 a0 = npyv_load_f32(src0);
                npyv_f32 a1 = npyv_load_f32(src0 + vstep);
                npyv_f32 r0 = simd_cmul_f32(a0, b.val[0]);
                npyv_f32 r1 = simd_cmul_f32(a1, b.val[1]);
                npyv_store_f32(dst, r0);
                npyv_store_f32(dst + vstep, r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
            #if 1
                npyv_f32 a = npyv_load2_till_f32(src0, len, 1.0f, 1.0f);
            #else
                npyv_f32 a = npyv_load2_tillz_f32(src0, len);
            #endif
                npyv_f32 r = simd_cmul_f32(a, b.val[0]);
                npyv_store2_till_f32(dst, len, r);
            }
        }
        // non-contig
        else if (loadable0 && storable) {
            for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
                npyv_f32 a0 = npyv_loadn2_f32(src0, ssrc0);
                npyv_f32 a1 = npyv_loadn2_f32(src0 + ssrc0*hstep, ssrc0);
                npyv_f32 r0 = simd_cmul_f32(a0, b.val[0]);
                npyv_f32 r1 = simd_cmul_f32(a1, b.val[1]);
                npyv_storen2_f32(dst, sdst, r0);
                npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
            }
            for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) {
            #if 1
                npyv_f32 a = npyv_loadn2_till_f32(src0, ssrc0, len, 1.0f, 1.0f);
            #else
                npyv_f32 a = npyv_loadn2_tillz_f32(src0, ssrc0, len);
            #endif
                npyv_f32 r = simd_cmul_f32(a, b.val[0]);
                npyv_storen2_till_f32(dst, sdst, len, r);
            }
        }
        else {
            goto loop_scalar;
        }
    }
    // Unlike add/subtract, multiply keeps the fully strided SIMD
    // specialization enabled: both inputs and the output use strided pair
    // loads/stores.
    #if 1
    // non-contig
    else if (loadable0 && loadable1 && storable) {
        for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
                            src1 += ssrc1*vstep, dst += sdst*vstep
        ) {
            npyv_f32 a0 = npyv_loadn2_f32(src0, ssrc0);
            npyv_f32 a1 = npyv_loadn2_f32(src0 + ssrc0*hstep, ssrc0);
            npyv_f32 b0 = npyv_loadn2_f32(src1, ssrc1);
            npyv_f32 b1 = npyv_loadn2_f32(src1 + ssrc1*hstep, ssrc1);
            npyv_f32 r0 = simd_cmul_f32(a0, b0);
            npyv_f32 r1 = simd_cmul_f32(a1, b1);
            npyv_storen2_f32(dst, sdst, r0);
            npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
        }
        for (; len > 0; len -= hstep, src0 += ssrc0*hstep,
                       src1 += ssrc1*hstep, dst += sdst*hstep
        ) {
        #if 1
            npyv_f32 a = npyv_loadn2_till_f32(src0, ssrc0, len, 1.0f, 1.0f);
            npyv_f32 b = npyv_loadn2_till_f32(src1, ssrc1, len, 1.0f, 1.0f);
        #else
            npyv_f32 a = npyv_loadn2_tillz_f32(src0, ssrc0, len);
            npyv_f32 b = npyv_loadn2_tillz_f32(src1, ssrc1, len);
        #endif
            npyv_f32 r = simd_cmul_f32(a, b);
            npyv_storen2_till_f32(dst, sdst, len, r);
        }
    }
    #endif
    else {
        goto loop_scalar;
    }
    // Reached only after one of the SIMD specializations processed everything.
    npyv_cleanup();
    return;
loop_scalar:
#endif
    // Scalar path: uses the untouched byte pointers and byte strides, one
    // complex element per iteration. All four input floats are read before
    // the output is written.
    for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {
        const npy_float a_r = ((npy_float *)b_src0)[0];
        const npy_float a_i = ((npy_float *)b_src0)[1];
        const npy_float b_r = ((npy_float *)b_src1)[0];
        const npy_float b_i = ((npy_float *)b_src1)[1];
    // Complex product; the component-wise `#else` branch is never compiled.
    #if 1
        ((npy_float *)b_dst)[0] = a_r*b_r - a_i*b_i;
        ((npy_float *)b_dst)[1] = a_r*b_i + a_i*b_r;
    #else
        ((npy_float *)b_dst)[0] = a_r * b_r;
        ((npy_float *)b_dst)[1] = a_i * b_i;
    #endif
    }
}

/*
 * Indexed in-place complex multiplication: for every (index, value) pair,
 * replace the element of the first operand selected by the index with its
 * complex product with the value.
 *
 *   args[0]        base of the array being updated
 *   args[1]        npy_intp indices
 *   args[2]        complex values (real, imag as npy_float)
 *   dimensions[0]  number of (index, value) pairs
 *   steps[0..2]    byte strides for args[0..2]
 *   steps[3]       length added to negative indices to wrap them
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(CFLOAT_multiply_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const base = args[0];
    char *idx_ptr = args[1];
    char *val_ptr = args[2];
    const npy_intp base_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap_len = steps[3];

    for (npy_intp left = dimensions[0]; left > 0;
         --left, idx_ptr += idx_stride, val_ptr += val_stride) {
        npy_intp pos = *(npy_intp *)idx_ptr;
        // Negative indices count from the end.
        if (pos < 0) {
            pos += wrap_len;
        }
        npy_float *const target = (npy_float *)(base + base_stride * pos);
        // Snapshot all four floats first: both outputs depend on both parts
        // of each operand.
        const npy_float b_r = ((npy_float *)val_ptr)[0];
        const npy_float b_i = ((npy_float *)val_ptr)[1];
        const npy_float a_r = target[0];
        const npy_float a_i = target[1];
        target[0] = a_r*b_r - a_i*b_i;
        target[1] = a_r*b_i + a_i*b_r;
    }
    return 0;
}


#line 630
/*
 * ufunc inner loop: complex conjugate of single-precision complex values,
 * each stored as two consecutive npy_float (real part, imaginary part).
 *
 *   args[0]        input
 *   args[1]        output
 *   dimensions[0]  number of complex elements to process
 *   steps[0..1]    byte strides for args[0..1]
 *
 * SIMD specializations (only when NPY_SIMD_F32 is set) cover the cases where
 * at least one side is tightly packed; everything else uses the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CFLOAT_conjugate)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *in_bytes = args[0];
    char *out_bytes = args[1];
    const npy_intp in_bstride = steps[0];
    const npy_intp out_bstride = steps[1];
    npy_intp remaining = dimensions[0];

#if NPY_SIMD_F32
    // SIMD needs non-overlapping buffers and byte strides that are whole
    // multiples of sizeof(npy_float).
    const int simd_eligible =
        !is_mem_overlap(in_bytes, in_bstride, out_bytes, out_bstride, remaining) &&
        out_bstride % sizeof(npy_float) == 0 &&
        in_bstride % sizeof(npy_float) == 0;

    if (simd_eligible) {
        const npy_float *in = (npy_float*)in_bytes;
              npy_float *out = (npy_float*)out_bytes;
        // Strides in npy_float units; 2 means tightly packed complex values.
        // NOTE(review): the divisor is a size_t, so if npy_intp is signed a
        // negative byte stride is converted to unsigned before the division -
        // confirm negative strides behave as intended here.
        const npy_intp in_stride = in_bstride / sizeof(npy_float);
        const npy_intp out_stride = out_bstride / sizeof(npy_float);

        const int lanes = npyv_nlanes_f32;   // floats per vector
        const int two_vecs = lanes * 2;      // floats in two vectors
        const int pairs = lanes / 2;         // complex elements per vector
        int vectorized = 1;

        if (in_stride == 2 && out_stride == 2) {
            // packed -> packed
            while (remaining >= lanes) {
                npyv_f32 lo = npyv_load_f32(in);
                npyv_f32 hi = npyv_load_f32(in + lanes);
                npyv_f32 conj_lo = simd_cconjugate_f32(lo);
                npyv_f32 conj_hi = simd_cconjugate_f32(hi);
                npyv_store_f32(out, conj_lo);
                npyv_store_f32(out + lanes, conj_hi);
                remaining -= lanes;
                in += two_vecs;
                out += two_vecs;
            }
            // tail: one partially filled vector per pass
            while (remaining > 0) {
                npyv_f32 part = npyv_load2_tillz_f32(in, remaining);
                npyv_f32 conj = simd_cconjugate_f32(part);
                npyv_store2_till_f32(out, remaining, conj);
                remaining -= pairs;
                in += lanes;
                out += lanes;
            }
        }
        else if (in_stride == 2 && npyv_storable_stride_s64(out_stride)) {
            // packed -> strided
            while (remaining >= lanes) {
                npyv_f32 lo = npyv_load_f32(in);
                npyv_f32 hi = npyv_load_f32(in + lanes);
                npyv_f32 conj_lo = simd_cconjugate_f32(lo);
                npyv_f32 conj_hi = simd_cconjugate_f32(hi);
                npyv_storen2_f32(out, out_stride, conj_lo);
                npyv_storen2_f32(out + out_stride*pairs, out_stride, conj_hi);
                remaining -= lanes;
                in += two_vecs;
                out += out_stride*lanes;
            }
            while (remaining > 0) {
                npyv_f32 part = npyv_load2_tillz_f32(in, remaining);
                npyv_f32 conj = simd_cconjugate_f32(part);
                npyv_storen2_till_f32(out, out_stride, remaining, conj);
                remaining -= pairs;
                in += lanes;
                out += out_stride*pairs;
            }
        }
        else if (out_stride == 2 && npyv_loadable_stride_s64(in_stride)) {
            // strided -> packed
            while (remaining >= lanes) {
                npyv_f32 lo = npyv_loadn2_f32(in, in_stride);
                npyv_f32 hi = npyv_loadn2_f32(in + in_stride*pairs, in_stride);
                npyv_f32 conj_lo = simd_cconjugate_f32(lo);
                npyv_f32 conj_hi = simd_cconjugate_f32(hi);
                npyv_store_f32(out, conj_lo);
                npyv_store_f32(out + lanes, conj_hi);
                remaining -= lanes;
                in += in_stride*lanes;
                out += two_vecs;
            }
            while (remaining > 0) {
                npyv_f32 part = npyv_loadn2_tillz_f32((npy_float*)in, in_stride, remaining);
                npyv_f32 conj = simd_cconjugate_f32(part);
                npyv_store2_till_f32(out, remaining, conj);
                remaining -= pairs;
                in += in_stride*pairs;
                out += lanes;
            }
        }
        else {
            // No specialization fits; `remaining` is still untouched.
            vectorized = 0;
        }

        if (vectorized) {
            npyv_cleanup();
            return;
        }
    }
#endif
    // Scalar path on the original byte pointers: copy the real part, negate
    // the imaginary part. Both are read before the output is written.
    while (remaining > 0) {
        const npy_float real_part = ((npy_float *)in_bytes)[0];
        const npy_float imag_part = ((npy_float *)in_bytes)[1];
        ((npy_float *)out_bytes)[0] = real_part;
        ((npy_float *)out_bytes)[1] = -imag_part;
        --remaining;
        in_bytes += in_bstride;
        out_bytes += out_bstride;
    }
}

#line 630
/*
 * Inner loop of the complex single-precision `square` ufunc.
 *
 * args[0] / args[1] are the input and output buffers, steps[0] / steps[1]
 * their byte strides and dimensions[0] the number of complex elements.  Each
 * element is an interleaved (real, imag) pair of npy_float and the result is
 * (rl*rl - im*im, rl*im + im*rl).
 *
 * When NPY_SIMD_F32 is enabled three layouts are vectorized: input and
 * output both contiguous, contiguous input with strided output, and strided
 * input with contiguous output.  Overlapping buffers, strides that are not a
 * multiple of sizeof(npy_float) and all remaining layouts are handled by the
 * scalar loop at the bottom.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CFLOAT_square)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *b_src = args[0], *b_dst = args[1];
    npy_intp b_ssrc = steps[0], b_sdst = steps[1];
#if NPY_SIMD_F32
    // The vector paths require non-overlapping buffers and byte strides that
    // are a whole number of floats.
    if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) ||
        b_sdst % sizeof(npy_float) != 0 ||
        b_ssrc % sizeof(npy_float) != 0
    ) {
        goto loop_scalar;
    }
    const npy_float *src  = (npy_float*)b_src;
          npy_float *dst  = (npy_float*)b_dst;
    // Strides in units of npy_float.  The divisor is cast to npy_intp so the
    // division stays signed: `sizeof` yields size_t, and dividing a negative
    // byte stride by it would first convert the stride to unsigned and give
    // a huge positive element stride.  (This file is generated; the .src
    // template needs the same cast.)
    const npy_intp ssrc = b_ssrc / (npy_intp)sizeof(npy_float);
    const npy_intp sdst = b_sdst / (npy_intp)sizeof(npy_float);

    const int vstep = npyv_nlanes_f32;  // floats per vector
    const int wstep = vstep * 2;        // floats consumed by two vectors
    const int hstep = vstep / 2;        // complex elements per vector

    // Input and output both contiguous (stride of one complex element).
    if (ssrc == 2 && ssrc == sdst) {
        // Main loop: two vectors, i.e. vstep complex elements, per iteration.
        for (; len >= vstep; len -= vstep, src += wstep, dst += wstep) {
            npyv_f32 a0 = npyv_load_f32(src);
            npyv_f32 a1 = npyv_load_f32(src + vstep);
            npyv_f32 r0 = simd_csquare_f32(a0);
            npyv_f32 r1 = simd_csquare_f32(a1);
            npyv_store_f32(dst, r0);
            npyv_store_f32(dst + vstep, r1);
        }
        // Tail: one vector at a time, limited to the `len` remaining pairs.
        for (; len > 0; len -= hstep, src += vstep, dst += vstep) {
            npyv_f32 a = npyv_load2_tillz_f32(src, len);
            npyv_f32 r = simd_csquare_f32(a);
            npyv_store2_till_f32(dst, len, r);
        }
    }
    // Contiguous input, strided output.
    else if (ssrc == 2 && npyv_storable_stride_s64(sdst)) {
        for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) {
            npyv_f32 a0 = npyv_load_f32(src);
            npyv_f32 a1 = npyv_load_f32(src + vstep);
            npyv_f32 r0 = simd_csquare_f32(a0);
            npyv_f32 r1 = simd_csquare_f32(a1);
            npyv_storen2_f32(dst, sdst, r0);
            npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
        }
        for (; len > 0; len -= hstep, src += vstep, dst += sdst*hstep) {
            npyv_f32 a = npyv_load2_tillz_f32(src, len);
            npyv_f32 r = simd_csquare_f32(a);
            npyv_storen2_till_f32(dst, sdst, len, r);
        }
    }
    // Strided input, contiguous output.
    else if (sdst == 2 && npyv_loadable_stride_s64(ssrc)) {
        for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) {
            npyv_f32 a0 = npyv_loadn2_f32(src, ssrc);
            npyv_f32 a1 = npyv_loadn2_f32(src + ssrc*hstep, ssrc);
            npyv_f32 r0 = simd_csquare_f32(a0);
            npyv_f32 r1 = simd_csquare_f32(a1);
            npyv_store_f32(dst, r0);
            npyv_store_f32(dst + vstep, r1);
        }
        for (; len > 0; len -= hstep, src += ssrc*hstep, dst += vstep) {
            npyv_f32 a = npyv_loadn2_tillz_f32((npy_float*)src, ssrc, len);
            npyv_f32 r = simd_csquare_f32(a);
            npyv_store2_till_f32(dst, len, r);
        }
    }
    else {
        goto loop_scalar;
    }
    npyv_cleanup();
    return;
loop_scalar:
#endif
    // Scalar fallback; also the only path when NPY_SIMD_F32 is disabled.
    for (; len > 0; --len, b_src += b_ssrc, b_dst += b_sdst) {
        const npy_float rl = ((npy_float *)b_src)[0];
        const npy_float im = ((npy_float *)b_src)[1];
    #if 1
        ((npy_float *)b_dst)[0] = rl*rl - im*im;
        ((npy_float *)b_dst)[1] = rl*im + im*rl;
    #else
        ((npy_float *)b_dst)[0] = rl;
        ((npy_float *)b_dst)[1] = -im;
    #endif
    }
}


#line 366
#line 374
/*
 * Inner loop for complex double-precision addition.
 *
 * args[0] and args[1] are the inputs, args[2] the output; steps[] holds the
 * matching byte strides and dimensions[0] the number of complex elements.
 * Each element is an interleaved (real, imag) pair of npy_double.
 *
 * Dispatch order:
 *   1. Reduction (input 0 and the output are the same zero-stride element):
 *      input 1 is summed through CDOUBLE_pairwise_sum and added in place.
 *   2. With NPY_SIMD_F64: vector paths for the all-contiguous layout and for
 *      either input having stride 0 (a single repeated element).
 *   3. Everything else: the scalar loop at the bottom.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CDOUBLE_add)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2];
    npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2];
#if 1
    // reduce
    if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst &&
        b_ssrc1 % (sizeof(npy_double)*2) == 0
    ) {
        npy_double *rl_im = (npy_double *)b_src0;
        npy_double rr, ri;
        CDOUBLE_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2);
        rl_im[0] += rr;
        rl_im[1] += ri;
        return;
    }
#endif
#if NPY_SIMD_F64
    // Certain versions of Apple clang (commonly used in CI images) produce
    // non-deterministic output in the mul path with AVX2 enabled on x86_64.
    // Work around by scalarising.
    #if 0 \
            && defined(NPY_CPU_AMD64) && defined(__clang__) \
            && defined(__apple_build_version__) \
            && __apple_build_version__ >= 14000000 \
            && __apple_build_version__ < 14030000
        goto loop_scalar;
    #endif  // end affected Apple clang.
    // The vector paths require non-overlapping buffers, a non-zero output
    // stride and byte strides that are a whole number of doubles.
    if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||
        is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||
        b_sdst  % sizeof(npy_double) != 0 || b_sdst == 0 ||
        b_ssrc0 % sizeof(npy_double) != 0 ||
        b_ssrc1 % sizeof(npy_double) != 0
    ) {
        goto loop_scalar;
    }
    const npy_double *src0 = (npy_double*)b_src0;
    const npy_double *src1 = (npy_double*)b_src1;
          npy_double *dst  = (npy_double*)b_dst;

    // Strides in units of npy_double.  The divisor is cast to npy_intp so
    // the division stays signed: `sizeof` yields size_t, and dividing a
    // negative byte stride by it would first convert the stride to unsigned
    // and give a huge positive element stride.  (This file is generated; the
    // .src template needs the same cast.)
    const npy_intp ssrc0 = b_ssrc0 / (npy_intp)sizeof(npy_double);
    const npy_intp ssrc1 = b_ssrc1 / (npy_intp)sizeof(npy_double);
    const npy_intp sdst  = b_sdst / (npy_intp)sizeof(npy_double);

    const int vstep = npyv_nlanes_f64;  // doubles per vector
    const int wstep = vstep * 2;        // doubles consumed by two vectors
    const int hstep = vstep / 2;        // complex elements per vector

    const int loadable0 = npyv_loadable_stride_s64(ssrc0);
    const int loadable1 = npyv_loadable_stride_s64(ssrc1);
    const int storable = npyv_storable_stride_s64(sdst);

    // lots**lots of specializations, to squeeze out max performance
    // contig
    if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
        // Main loop: two vectors, i.e. vstep complex elements, per iteration.
        for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) {
            npyv_f64 a0 = npyv_load_f64(src0);
            npyv_f64 a1 = npyv_load_f64(src0 + vstep);
            npyv_f64 b0 = npyv_load_f64(src1);
            npyv_f64 b1 = npyv_load_f64(src1 + vstep);
            npyv_f64 r0 = npyv_add_f64(a0, b0);
            npyv_f64 r1 = npyv_add_f64(a1, b1);
            npyv_store_f64(dst, r0);
            npyv_store_f64(dst + vstep, r1);
        }
        // Tail: one vector at a time, limited to the `len` remaining pairs.
        for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
            npyv_f64 a = npyv_load2_tillz_f64(src0, len);
            npyv_f64 b = npyv_load2_tillz_f64(src1, len);
            npyv_f64 r = npyv_add_f64(a, b);
            npyv_store2_till_f64(dst, len, r);
        }
    }
    // scalar 0
    else if (ssrc0 == 0) {
        // simd_set2_f64 is defined outside this block; presumably it
        // replicates the single complex value at src0 across both vectors.
        npyv_f64x2 a = simd_set2_f64(src0);
        // contig
        if (ssrc1 == 2 && sdst == ssrc1) {
            for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) {
                npyv_f64 b0 = npyv_load_f64(src1);
                npyv_f64 b1 = npyv_load_f64(src1 + vstep);
                npyv_f64 r0 = npyv_add_f64(a.val[0], b0);
                npyv_f64 r1 = npyv_add_f64(a.val[1], b1);
                npyv_store_f64(dst, r0);
                npyv_store_f64(dst + vstep, r1);
            }
            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
            #if 0
                npyv_f64 b = npyv_load2_till_f64(src1, len, 1.0, 1.0);
            #else
                npyv_f64 b = npyv_load2_tillz_f64(src1, len);
            #endif
                npyv_f64 r = npyv_add_f64(a.val[0], b);
                npyv_store2_till_f64(dst, len, r);
            }
        }
        // non-contig
        else if (loadable1 && storable) {
            for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
                npyv_f64 b0 = npyv_loadn2_f64(src1, ssrc1);
                npyv_f64 b1 = npyv_loadn2_f64(src1 + ssrc1*hstep, ssrc1);
                npyv_f64 r0 = npyv_add_f64(a.val[0], b0);
                npyv_f64 r1 = npyv_add_f64(a.val[1], b1);
                npyv_storen2_f64(dst, sdst, r0);
                npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
            }
            for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) {
            #if 0
                npyv_f64 b = npyv_loadn2_till_f64(src1, ssrc1, len, 1.0, 1.0);
            #else
                npyv_f64 b = npyv_loadn2_tillz_f64(src1, ssrc1, len);
            #endif
                npyv_f64 r = npyv_add_f64(a.val[0], b);
                npyv_storen2_till_f64(dst, sdst, len, r);
            }
        }
        else {
            goto loop_scalar;
        }
    }
    // scalar 1
    else if (ssrc1 == 0) {
        npyv_f64x2 b = simd_set2_f64(src1);
        // contig
        if (ssrc0 == 2 && sdst == ssrc0) {
            for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) {
                npyv_f64 a0 = npyv_load_f64(src0);
                npyv_f64 a1 = npyv_load_f64(src0 + vstep);
                npyv_f64 r0 = npyv_add_f64(a0, b.val[0]);
                npyv_f64 r1 = npyv_add_f64(a1, b.val[1]);
                npyv_store_f64(dst, r0);
                npyv_store_f64(dst + vstep, r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
            #if 0
                npyv_f64 a = npyv_load2_till_f64(src0, len, 1.0, 1.0);
            #else
                npyv_f64 a = npyv_load2_tillz_f64(src0, len);
            #endif
                npyv_f64 r = npyv_add_f64(a, b.val[0]);
                npyv_store2_till_f64(dst, len, r);
            }
        }
        // non-contig
        else if (loadable0 && storable) {
            for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
                npyv_f64 a0 = npyv_loadn2_f64(src0, ssrc0);
                npyv_f64 a1 = npyv_loadn2_f64(src0 + ssrc0*hstep, ssrc0);
                npyv_f64 r0 = npyv_add_f64(a0, b.val[0]);
                npyv_f64 r1 = npyv_add_f64(a1, b.val[1]);
                npyv_storen2_f64(dst, sdst, r0);
                npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
            }
            for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) {
            #if 0
                npyv_f64 a = npyv_loadn2_till_f64(src0, ssrc0, len, 1.0, 1.0);
            #else
                npyv_f64 a = npyv_loadn2_tillz_f64(src0, ssrc0, len);
            #endif
                npyv_f64 r = npyv_add_f64(a, b.val[0]);
                npyv_storen2_till_f64(dst, sdst, len, r);
            }
        }
        else {
            goto loop_scalar;
        }
    }
    // The generic "all operands strided" vector path is compiled out here,
    // so such layouts fall through to the scalar loop.
    #if 0
    // non-contig
    else if (loadable0 && loadable1 && storable) {
        for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
                            src1 += ssrc1*vstep, dst += sdst*vstep
        ) {
            npyv_f64 a0 = npyv_loadn2_f64(src0, ssrc0);
            npyv_f64 a1 = npyv_loadn2_f64(src0 + ssrc0*hstep, ssrc0);
            npyv_f64 b0 = npyv_loadn2_f64(src1, ssrc1);
            npyv_f64 b1 = npyv_loadn2_f64(src1 + ssrc1*hstep, ssrc1);
            npyv_f64 r0 = npyv_add_f64(a0, b0);
            npyv_f64 r1 = npyv_add_f64(a1, b1);
            npyv_storen2_f64(dst, sdst, r0);
            npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
        }
        for (; len > 0; len -= hstep, src0 += ssrc0*hstep,
                       src1 += ssrc1*hstep, dst += sdst*hstep
        ) {
        #if 0
            npyv_f64 a = npyv_loadn2_till_f64(src0, ssrc0, len, 1.0, 1.0);
            npyv_f64 b = npyv_loadn2_till_f64(src1, ssrc1, len, 1.0, 1.0);
        #else
            npyv_f64 a = npyv_loadn2_tillz_f64(src0, ssrc0, len);
            npyv_f64 b = npyv_loadn2_tillz_f64(src1, ssrc1, len);
        #endif
            npyv_f64 r = npyv_add_f64(a, b);
            npyv_storen2_till_f64(dst, sdst, len, r);
        }
    }
    #endif
    else {
        goto loop_scalar;
    }
    npyv_cleanup();
    return;
loop_scalar:
#endif
    // Scalar fallback; also the only path when NPY_SIMD_F64 is disabled.
    for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {
        const npy_double a_r = ((npy_double *)b_src0)[0];
        const npy_double a_i = ((npy_double *)b_src0)[1];
        const npy_double b_r = ((npy_double *)b_src1)[0];
        const npy_double b_i = ((npy_double *)b_src1)[1];
    #if 0
        ((npy_double *)b_dst)[0] = a_r*b_r - a_i*b_i;
        ((npy_double *)b_dst)[1] = a_r*b_i + a_i*b_r;
    #else
        ((npy_double *)b_dst)[0] = a_r + b_r;
        ((npy_double *)b_dst)[1] = a_i + b_i;
    #endif
    }
}

/*
 * Indexed in-place complex double addition.
 *
 * For each of the dimensions[0] entries, reads an npy_intp index from the
 * args[1] stream and a complex value (two npy_double) from the args[2]
 * stream, then adds the value to the complex element of args[0] found at
 * byte offset steps[0] * index.  A negative index is offset by steps[3]
 * before use.  steps[1] and steps[2] are the byte strides of the index and
 * value streams.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(CDOUBLE_add_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const target = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp target_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];

    for (npy_intp left = dimensions[0]; left > 0;
         --left, idx_ptr += idx_stride, val_ptr += val_stride) {
        npy_intp at = *(const npy_intp *)idx_ptr;
        if (at < 0) {
            at += wrap;
        }
        npy_double *cell = (npy_double *)(target + target_stride * at);
        // Read the whole operand before touching the destination.
        const npy_double add_re = ((const npy_double *)val_ptr)[0];
        const npy_double add_im = ((const npy_double *)val_ptr)[1];
        cell[0] += add_re;
        cell[1] += add_im;
    }
    return 0;
}

#line 374
/*
 * Inner loop for complex double-precision subtraction (input 0 - input 1).
 *
 * args[0] and args[1] are the inputs, args[2] the output; steps[] holds the
 * matching byte strides and dimensions[0] the number of complex elements.
 * Each element is an interleaved (real, imag) pair of npy_double.
 *
 * With NPY_SIMD_F64 enabled, vector paths cover the all-contiguous layout
 * and either input having stride 0 (a single repeated element).  Everything
 * else is handled by the scalar loop at the bottom.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CDOUBLE_subtract)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2];
    npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2];
    // The reduction shortcut is compiled out for this operation.
#if 0
    // reduce
    if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst &&
        b_ssrc1 % (sizeof(npy_double)*2) == 0
    ) {
        npy_double *rl_im = (npy_double *)b_src0;
        npy_double rr, ri;
        CDOUBLE_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2);
        rl_im[0] -= rr;
        rl_im[1] -= ri;
        return;
    }
#endif
#if NPY_SIMD_F64
    // Certain versions of Apple clang (commonly used in CI images) produce
    // non-deterministic output in the mul path with AVX2 enabled on x86_64.
    // Work around by scalarising.
    #if 0 \
            && defined(NPY_CPU_AMD64) && defined(__clang__) \
            && defined(__apple_build_version__) \
            && __apple_build_version__ >= 14000000 \
            && __apple_build_version__ < 14030000
        goto loop_scalar;
    #endif  // end affected Apple clang.
    // The vector paths require non-overlapping buffers, a non-zero output
    // stride and byte strides that are a whole number of doubles.
    if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||
        is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||
        b_sdst  % sizeof(npy_double) != 0 || b_sdst == 0 ||
        b_ssrc0 % sizeof(npy_double) != 0 ||
        b_ssrc1 % sizeof(npy_double) != 0
    ) {
        goto loop_scalar;
    }
    const npy_double *src0 = (npy_double*)b_src0;
    const npy_double *src1 = (npy_double*)b_src1;
          npy_double *dst  = (npy_double*)b_dst;

    // Strides in units of npy_double.  The divisor is cast to npy_intp so
    // the division stays signed: `sizeof` yields size_t, and dividing a
    // negative byte stride by it would first convert the stride to unsigned
    // and give a huge positive element stride.  (This file is generated; the
    // .src template needs the same cast.)
    const npy_intp ssrc0 = b_ssrc0 / (npy_intp)sizeof(npy_double);
    const npy_intp ssrc1 = b_ssrc1 / (npy_intp)sizeof(npy_double);
    const npy_intp sdst  = b_sdst / (npy_intp)sizeof(npy_double);

    const int vstep = npyv_nlanes_f64;  // doubles per vector
    const int wstep = vstep * 2;        // doubles consumed by two vectors
    const int hstep = vstep / 2;        // complex elements per vector

    const int loadable0 = npyv_loadable_stride_s64(ssrc0);
    const int loadable1 = npyv_loadable_stride_s64(ssrc1);
    const int storable = npyv_storable_stride_s64(sdst);

    // lots**lots of specializations, to squeeze out max performance
    // contig
    if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
        // Main loop: two vectors, i.e. vstep complex elements, per iteration.
        for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) {
            npyv_f64 a0 = npyv_load_f64(src0);
            npyv_f64 a1 = npyv_load_f64(src0 + vstep);
            npyv_f64 b0 = npyv_load_f64(src1);
            npyv_f64 b1 = npyv_load_f64(src1 + vstep);
            npyv_f64 r0 = npyv_sub_f64(a0, b0);
            npyv_f64 r1 = npyv_sub_f64(a1, b1);
            npyv_store_f64(dst, r0);
            npyv_store_f64(dst + vstep, r1);
        }
        // Tail: one vector at a time, limited to the `len` remaining pairs.
        for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
            npyv_f64 a = npyv_load2_tillz_f64(src0, len);
            npyv_f64 b = npyv_load2_tillz_f64(src1, len);
            npyv_f64 r = npyv_sub_f64(a, b);
            npyv_store2_till_f64(dst, len, r);
        }
    }
    // scalar 0
    else if (ssrc0 == 0) {
        // simd_set2_f64 is defined outside this block; presumably it
        // replicates the single complex value at src0 across both vectors.
        npyv_f64x2 a = simd_set2_f64(src0);
        // contig
        if (ssrc1 == 2 && sdst == ssrc1) {
            for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) {
                npyv_f64 b0 = npyv_load_f64(src1);
                npyv_f64 b1 = npyv_load_f64(src1 + vstep);
                npyv_f64 r0 = npyv_sub_f64(a.val[0], b0);
                npyv_f64 r1 = npyv_sub_f64(a.val[1], b1);
                npyv_store_f64(dst, r0);
                npyv_store_f64(dst + vstep, r1);
            }
            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
            #if 0
                npyv_f64 b = npyv_load2_till_f64(src1, len, 1.0, 1.0);
            #else
                npyv_f64 b = npyv_load2_tillz_f64(src1, len);
            #endif
                npyv_f64 r = npyv_sub_f64(a.val[0], b);
                npyv_store2_till_f64(dst, len, r);
            }
        }
        // non-contig
        else if (loadable1 && storable) {
            for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
                npyv_f64 b0 = npyv_loadn2_f64(src1, ssrc1);
                npyv_f64 b1 = npyv_loadn2_f64(src1 + ssrc1*hstep, ssrc1);
                npyv_f64 r0 = npyv_sub_f64(a.val[0], b0);
                npyv_f64 r1 = npyv_sub_f64(a.val[1], b1);
                npyv_storen2_f64(dst, sdst, r0);
                npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
            }
            for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) {
            #if 0
                npyv_f64 b = npyv_loadn2_till_f64(src1, ssrc1, len, 1.0, 1.0);
            #else
                npyv_f64 b = npyv_loadn2_tillz_f64(src1, ssrc1, len);
            #endif
                npyv_f64 r = npyv_sub_f64(a.val[0], b);
                npyv_storen2_till_f64(dst, sdst, len, r);
            }
        }
        else {
            goto loop_scalar;
        }
    }
    // scalar 1
    else if (ssrc1 == 0) {
        npyv_f64x2 b = simd_set2_f64(src1);
        // contig
        if (ssrc0 == 2 && sdst == ssrc0) {
            for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) {
                npyv_f64 a0 = npyv_load_f64(src0);
                npyv_f64 a1 = npyv_load_f64(src0 + vstep);
                npyv_f64 r0 = npyv_sub_f64(a0, b.val[0]);
                npyv_f64 r1 = npyv_sub_f64(a1, b.val[1]);
                npyv_store_f64(dst, r0);
                npyv_store_f64(dst + vstep, r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
            #if 0
                npyv_f64 a = npyv_load2_till_f64(src0, len, 1.0, 1.0);
            #else
                npyv_f64 a = npyv_load2_tillz_f64(src0, len);
            #endif
                npyv_f64 r = npyv_sub_f64(a, b.val[0]);
                npyv_store2_till_f64(dst, len, r);
            }
        }
        // non-contig
        else if (loadable0 && storable) {
            for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
                npyv_f64 a0 = npyv_loadn2_f64(src0, ssrc0);
                npyv_f64 a1 = npyv_loadn2_f64(src0 + ssrc0*hstep, ssrc0);
                npyv_f64 r0 = npyv_sub_f64(a0, b.val[0]);
                npyv_f64 r1 = npyv_sub_f64(a1, b.val[1]);
                npyv_storen2_f64(dst, sdst, r0);
                npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
            }
            for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) {
            #if 0
                npyv_f64 a = npyv_loadn2_till_f64(src0, ssrc0, len, 1.0, 1.0);
            #else
                npyv_f64 a = npyv_loadn2_tillz_f64(src0, ssrc0, len);
            #endif
                npyv_f64 r = npyv_sub_f64(a, b.val[0]);
                npyv_storen2_till_f64(dst, sdst, len, r);
            }
        }
        else {
            goto loop_scalar;
        }
    }
    // The generic "all operands strided" vector path is compiled out here,
    // so such layouts fall through to the scalar loop.
    #if 0
    // non-contig
    else if (loadable0 && loadable1 && storable) {
        for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
                            src1 += ssrc1*vstep, dst += sdst*vstep
        ) {
            npyv_f64 a0 = npyv_loadn2_f64(src0, ssrc0);
            npyv_f64 a1 = npyv_loadn2_f64(src0 + ssrc0*hstep, ssrc0);
            npyv_f64 b0 = npyv_loadn2_f64(src1, ssrc1);
            npyv_f64 b1 = npyv_loadn2_f64(src1 + ssrc1*hstep, ssrc1);
            npyv_f64 r0 = npyv_sub_f64(a0, b0);
            npyv_f64 r1 = npyv_sub_f64(a1, b1);
            npyv_storen2_f64(dst, sdst, r0);
            npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
        }
        for (; len > 0; len -= hstep, src0 += ssrc0*hstep,
                       src1 += ssrc1*hstep, dst += sdst*hstep
        ) {
        #if 0
            npyv_f64 a = npyv_loadn2_till_f64(src0, ssrc0, len, 1.0, 1.0);
            npyv_f64 b = npyv_loadn2_till_f64(src1, ssrc1, len, 1.0, 1.0);
        #else
            npyv_f64 a = npyv_loadn2_tillz_f64(src0, ssrc0, len);
            npyv_f64 b = npyv_loadn2_tillz_f64(src1, ssrc1, len);
        #endif
            npyv_f64 r = npyv_sub_f64(a, b);
            npyv_storen2_till_f64(dst, sdst, len, r);
        }
    }
    #endif
    else {
        goto loop_scalar;
    }
    npyv_cleanup();
    return;
loop_scalar:
#endif
    // Scalar fallback; also the only path when NPY_SIMD_F64 is disabled.
    for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {
        const npy_double a_r = ((npy_double *)b_src0)[0];
        const npy_double a_i = ((npy_double *)b_src0)[1];
        const npy_double b_r = ((npy_double *)b_src1)[0];
        const npy_double b_i = ((npy_double *)b_src1)[1];
    #if 0
        ((npy_double *)b_dst)[0] = a_r*b_r - a_i*b_i;
        ((npy_double *)b_dst)[1] = a_r*b_i + a_i*b_r;
    #else
        ((npy_double *)b_dst)[0] = a_r - b_r;
        ((npy_double *)b_dst)[1] = a_i - b_i;
    #endif
    }
}

/*
 * Indexed in-place complex double subtraction.
 *
 * For each of the dimensions[0] entries, reads an npy_intp index from the
 * args[1] stream and a complex value (two npy_double) from the args[2]
 * stream, then subtracts the value from the complex element of args[0] found
 * at byte offset steps[0] * index.  A negative index is offset by steps[3]
 * before use.  steps[1] and steps[2] are the byte strides of the index and
 * value streams.  Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(CDOUBLE_subtract_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const target = args[0];
    const char *idx_ptr = args[1];
    const char *val_ptr = args[2];
    const npy_intp target_stride = steps[0];
    const npy_intp idx_stride = steps[1];
    const npy_intp val_stride = steps[2];
    const npy_intp wrap = steps[3];

    for (npy_intp left = dimensions[0]; left > 0;
         --left, idx_ptr += idx_stride, val_ptr += val_stride) {
        npy_intp at = *(const npy_intp *)idx_ptr;
        if (at < 0) {
            at += wrap;
        }
        npy_double *cell = (npy_double *)(target + target_stride * at);
        // Read the whole operand before touching the destination.
        const npy_double sub_re = ((const npy_double *)val_ptr)[0];
        const npy_double sub_im = ((const npy_double *)val_ptr)[1];
        cell[0] -= sub_re;
        cell[1] -= sub_im;
    }
    return 0;
}

#line 374
/*
 * Inner loop for complex double-precision multiplication.
 *
 * args[0] and args[1] are the inputs, args[2] the output; steps[] holds the
 * matching byte strides and dimensions[0] the number of complex elements.
 * Each element is an interleaved (real, imag) pair of npy_double and the
 * product is (a_r*b_r - a_i*b_i, a_r*b_i + a_i*b_r).
 *
 * With NPY_SIMD_F64 enabled, vector paths cover the all-contiguous layout,
 * either input having stride 0 (a single repeated element), and the generic
 * case where all three operands are strided.  Everything else, and every
 * build matched by the Apple clang guard below, uses the scalar loop at the
 * bottom.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CDOUBLE_multiply)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2];
    npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2];
    // The reduction shortcut is compiled out for this operation.
#if 0
    // reduce
    if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst &&
        b_ssrc1 % (sizeof(npy_double)*2) == 0
    ) {
        npy_double *rl_im = (npy_double *)b_src0;
        npy_double rr, ri;
        CDOUBLE_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2);
        rl_im[0] *= rr;
        rl_im[1] *= ri;
        return;
    }
#endif
#if NPY_SIMD_F64
    // Certain versions of Apple clang (commonly used in CI images) produce
    // non-deterministic output in the mul path with AVX2 enabled on x86_64.
    // Work around by scalarising.
    #if 1 \
            && defined(NPY_CPU_AMD64) && defined(__clang__) \
            && defined(__apple_build_version__) \
            && __apple_build_version__ >= 14000000 \
            && __apple_build_version__ < 14030000
        goto loop_scalar;
    #endif  // end affected Apple clang.
    // The vector paths require non-overlapping buffers, a non-zero output
    // stride and byte strides that are a whole number of doubles.
    if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||
        is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||
        b_sdst  % sizeof(npy_double) != 0 || b_sdst == 0 ||
        b_ssrc0 % sizeof(npy_double) != 0 ||
        b_ssrc1 % sizeof(npy_double) != 0
    ) {
        goto loop_scalar;
    }
    const npy_double *src0 = (npy_double*)b_src0;
    const npy_double *src1 = (npy_double*)b_src1;
          npy_double *dst  = (npy_double*)b_dst;

    // Strides in units of npy_double.  The divisor is cast to npy_intp so
    // the division stays signed: `sizeof` yields size_t, and dividing a
    // negative byte stride by it would first convert the stride to unsigned
    // and give a huge positive element stride.  (This file is generated; the
    // .src template needs the same cast.)
    const npy_intp ssrc0 = b_ssrc0 / (npy_intp)sizeof(npy_double);
    const npy_intp ssrc1 = b_ssrc1 / (npy_intp)sizeof(npy_double);
    const npy_intp sdst  = b_sdst / (npy_intp)sizeof(npy_double);

    const int vstep = npyv_nlanes_f64;  // doubles per vector
    const int wstep = vstep * 2;        // doubles consumed by two vectors
    const int hstep = vstep / 2;        // complex elements per vector

    const int loadable0 = npyv_loadable_stride_s64(ssrc0);
    const int loadable1 = npyv_loadable_stride_s64(ssrc1);
    const int storable = npyv_storable_stride_s64(sdst);

    // lots**lots of specializations, to squeeze out max performance
    // contig
    if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
        // Main loop: two vectors, i.e. vstep complex elements, per iteration.
        for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) {
            npyv_f64 a0 = npyv_load_f64(src0);
            npyv_f64 a1 = npyv_load_f64(src0 + vstep);
            npyv_f64 b0 = npyv_load_f64(src1);
            npyv_f64 b1 = npyv_load_f64(src1 + vstep);
            npyv_f64 r0 = simd_cmul_f64(a0, b0);
            npyv_f64 r1 = simd_cmul_f64(a1, b1);
            npyv_store_f64(dst, r0);
            npyv_store_f64(dst + vstep, r1);
        }
        // Tail: one vector at a time, limited to the `len` remaining pairs.
        for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
            npyv_f64 a = npyv_load2_tillz_f64(src0, len);
            npyv_f64 b = npyv_load2_tillz_f64(src1, len);
            npyv_f64 r = simd_cmul_f64(a, b);
            npyv_store2_till_f64(dst, len, r);
        }
    }
    // scalar 0
    else if (ssrc0 == 0) {
        // simd_set2_f64 is defined outside this block; presumably it
        // replicates the single complex value at src0 across both vectors.
        npyv_f64x2 a = simd_set2_f64(src0);
        // contig
        if (ssrc1 == 2 && sdst == ssrc1) {
            for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) {
                npyv_f64 b0 = npyv_load_f64(src1);
                npyv_f64 b1 = npyv_load_f64(src1 + vstep);
                npyv_f64 r0 = simd_cmul_f64(a.val[0], b0);
                npyv_f64 r1 = simd_cmul_f64(a.val[1], b1);
                npyv_store_f64(dst, r0);
                npyv_store_f64(dst + vstep, r1);
            }
            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
            // The partial load is given (1.0, 1.0) as fill arguments,
            // presumably the value used for lanes past `len`.
            #if 1
                npyv_f64 b = npyv_load2_till_f64(src1, len, 1.0, 1.0);
            #else
                npyv_f64 b = npyv_load2_tillz_f64(src1, len);
            #endif
                npyv_f64 r = simd_cmul_f64(a.val[0], b);
                npyv_store2_till_f64(dst, len, r);
            }
        }
        // non-contig
        else if (loadable1 && storable) {
            for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
                npyv_f64 b0 = npyv_loadn2_f64(src1, ssrc1);
                npyv_f64 b1 = npyv_loadn2_f64(src1 + ssrc1*hstep, ssrc1);
                npyv_f64 r0 = simd_cmul_f64(a.val[0], b0);
                npyv_f64 r1 = simd_cmul_f64(a.val[1], b1);
                npyv_storen2_f64(dst, sdst, r0);
                npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
            }
            for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) {
            #if 1
                npyv_f64 b = npyv_loadn2_till_f64(src1, ssrc1, len, 1.0, 1.0);
            #else
                npyv_f64 b = npyv_loadn2_tillz_f64(src1, ssrc1, len);
            #endif
                npyv_f64 r = simd_cmul_f64(a.val[0], b);
                npyv_storen2_till_f64(dst, sdst, len, r);
            }
        }
        else {
            goto loop_scalar;
        }
    }
    // scalar 1
    else if (ssrc1 == 0) {
        npyv_f64x2 b = simd_set2_f64(src1);
        // contig
        if (ssrc0 == 2 && sdst == ssrc0) {
            for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) {
                npyv_f64 a0 = npyv_load_f64(src0);
                npyv_f64 a1 = npyv_load_f64(src0 + vstep);
                npyv_f64 r0 = simd_cmul_f64(a0, b.val[0]);
                npyv_f64 r1 = simd_cmul_f64(a1, b.val[1]);
                npyv_store_f64(dst, r0);
                npyv_store_f64(dst + vstep, r1);
            }
            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
            #if 1
                npyv_f64 a = npyv_load2_till_f64(src0, len, 1.0, 1.0);
            #else
                npyv_f64 a = npyv_load2_tillz_f64(src0, len);
            #endif
                npyv_f64 r = simd_cmul_f64(a, b.val[0]);
                npyv_store2_till_f64(dst, len, r);
            }
        }
        // non-contig
        else if (loadable0 && storable) {
            for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
                npyv_f64 a0 = npyv_loadn2_f64(src0, ssrc0);
                npyv_f64 a1 = npyv_loadn2_f64(src0 + ssrc0*hstep, ssrc0);
                npyv_f64 r0 = simd_cmul_f64(a0, b.val[0]);
                npyv_f64 r1 = simd_cmul_f64(a1, b.val[1]);
                npyv_storen2_f64(dst, sdst, r0);
                npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
            }
            for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) {
            #if 1
                npyv_f64 a = npyv_loadn2_till_f64(src0, ssrc0, len, 1.0, 1.0);
            #else
                npyv_f64 a = npyv_loadn2_tillz_f64(src0, ssrc0, len);
            #endif
                npyv_f64 r = simd_cmul_f64(a, b.val[0]);
                npyv_storen2_till_f64(dst, sdst, len, r);
            }
        }
        else {
            goto loop_scalar;
        }
    }
    // Generic vector path: all three operands strided.
    #if 1
    // non-contig
    else if (loadable0 && loadable1 && storable) {
        for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
                            src1 += ssrc1*vstep, dst += sdst*vstep
        ) {
            npyv_f64 a0 = npyv_loadn2_f64(src0, ssrc0);
            npyv_f64 a1 = npyv_loadn2_f64(src0 + ssrc0*hstep, ssrc0);
            npyv_f64 b0 = npyv_loadn2_f64(src1, ssrc1);
            npyv_f64 b1 = npyv_loadn2_f64(src1 + ssrc1*hstep, ssrc1);
            npyv_f64 r0 = simd_cmul_f64(a0, b0);
            npyv_f64 r1 = simd_cmul_f64(a1, b1);
            npyv_storen2_f64(dst, sdst, r0);
            npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
        }
        for (; len > 0; len -= hstep, src0 += ssrc0*hstep,
                       src1 += ssrc1*hstep, dst += sdst*hstep
        ) {
        #if 1
            npyv_f64 a = npyv_loadn2_till_f64(src0, ssrc0, len, 1.0, 1.0);
            npyv_f64 b = npyv_loadn2_till_f64(src1, ssrc1, len, 1.0, 1.0);
        #else
            npyv_f64 a = npyv_loadn2_tillz_f64(src0, ssrc0, len);
            npyv_f64 b = npyv_loadn2_tillz_f64(src1, ssrc1, len);
        #endif
            npyv_f64 r = simd_cmul_f64(a, b);
            npyv_storen2_till_f64(dst, sdst, len, r);
        }
    }
    #endif
    else {
        goto loop_scalar;
    }
    npyv_cleanup();
    return;
loop_scalar:
#endif
    // Scalar fallback; also the only path when NPY_SIMD_F64 is disabled.
    for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {
        // All four inputs are read before the output is written.
        const npy_double a_r = ((npy_double *)b_src0)[0];
        const npy_double a_i = ((npy_double *)b_src0)[1];
        const npy_double b_r = ((npy_double *)b_src1)[0];
        const npy_double b_i = ((npy_double *)b_src1)[1];
    #if 1
        ((npy_double *)b_dst)[0] = a_r*b_r - a_i*b_i;
        ((npy_double *)b_dst)[1] = a_r*b_i + a_i*b_r;
    #else
        ((npy_double *)b_dst)[0] = a_r * b_r;
        ((npy_double *)b_dst)[1] = a_i * b_i;
    #endif
    }
}

/*
 * Indexed in-place multiply for double-precision complex values.
 *
 * For each of the dimensions[0] (index, value) pairs, the complex element
 * selected by the index inside the array at args[0] is multiplied by the
 * complex value, and the product is stored back into that same element.
 *
 *   args[0]  - base address of the array being updated
 *   args[1]  - npy_intp indices, advanced by steps[1] bytes per item
 *   args[2]  - complex multipliers (real, imag), advanced by steps[2] bytes
 *   steps[0] - byte distance between consecutive elements of args[0]
 *   steps[3] - added to negative indices before they are used (presumably
 *              the length of the indexed axis -- confirm against the caller)
 *
 * Always returns 0.
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(CDOUBLE_multiply_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *const target_base = args[0];
    const char *index_cursor = args[1];
    const char *factor_cursor = args[2];
    const npy_intp target_stride = steps[0];
    const npy_intp index_stride = steps[1];
    const npy_intp factor_stride = steps[2];
    const npy_intp wrap = steps[3];
    npy_intp remaining = dimensions[0];

    while (remaining > 0) {
        npy_intp slot = *(const npy_intp *)index_cursor;
        // Negative indices are shifted by `wrap` before addressing.
        slot = (slot < 0) ? slot + wrap : slot;

        npy_double *const elem = (npy_double *)(target_base + target_stride * slot);
        const npy_double *const factor = (const npy_double *)factor_cursor;

        // Read all four operands before storing anything, so the result is
        // still correct if the multiplier shares memory with the target.
        const npy_double b_re = factor[0];
        const npy_double b_im = factor[1];
        const npy_double a_re = elem[0];
        const npy_double a_im = elem[1];

        // (a_re + i*a_im) * (b_re + i*b_im)
        elem[0] = a_re*b_re - a_im*b_im;
        elem[1] = a_re*b_im + a_im*b_re;

        index_cursor += index_stride;
        factor_cursor += factor_stride;
        --remaining;
    }
    return 0;
}


#line 630
/*
 * Unary inner loop: complex conjugate of double-precision complex elements.
 *
 * Reads dimensions[0] elements starting at args[0] and writes the results
 * starting at args[1]; steps[0] and steps[1] are the source and destination
 * strides in bytes. Each element is a pair of npy_double values, real part
 * first and imaginary part second.
 *
 * When NPY_SIMD_F64 is enabled, one of three vector paths is selected from
 * the strides. Every other case (is_mem_overlap() reports overlap, a stride
 * is not a multiple of sizeof(npy_double), or the npyv stride check fails)
 * is handled by the scalar loop at the bottom.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CDOUBLE_conjugate)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *b_src = args[0], *b_dst = args[1];
    npy_intp b_ssrc = steps[0], b_sdst = steps[1];
#if NPY_SIMD_F64
    // The vector paths are only attempted when is_mem_overlap() reports no
    // overlap and both byte strides are whole multiples of sizeof(npy_double).
    if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) ||
        b_sdst % sizeof(npy_double) != 0 ||
        b_ssrc % sizeof(npy_double) != 0
    ) {
        goto loop_scalar;
    }
    const npy_double *src  = (npy_double*)b_src;
          npy_double *dst  = (npy_double*)b_dst;
    // Strides re-expressed in units of npy_double instead of bytes.
    const npy_intp ssrc = b_ssrc / sizeof(npy_double);
    const npy_intp sdst = b_sdst / sizeof(npy_double);

    // vstep: doubles per vector, and also the number of complex elements
    //        consumed by one iteration of the two-vector main loops.
    // wstep: doubles covered by two vectors.
    // hstep: complex elements held by a single vector (two doubles each).
    const int vstep = npyv_nlanes_f64;
    const int wstep = vstep * 2;
    const int hstep = vstep / 2;

    // Path 1: source and destination are both contiguous
    // (a stride of 2 doubles is exactly one complex element).
    if (ssrc == 2 && ssrc == sdst) {
        // Main loop: two vectors, i.e. vstep complex elements, per iteration.
        for (; len >= vstep; len -= vstep, src += wstep, dst += wstep) {
            npyv_f64 a0 = npyv_load_f64(src);
            npyv_f64 a1 = npyv_load_f64(src + vstep);
            npyv_f64 r0 = simd_cconjugate_f64(a0);
            npyv_f64 r1 = simd_cconjugate_f64(a1);
            npyv_store_f64(dst, r0);
            npyv_store_f64(dst + vstep, r1);
        }
        // Tail: one vector per iteration. The remaining count `len` is passed
        // to the *_tillz load and *_till store; presumably they touch only the
        // first `len` element pairs -- confirm against the npyv documentation.
        for (; len > 0; len -= hstep, src += vstep, dst += vstep) {
            npyv_f64 a = npyv_load2_tillz_f64(src, len);
            npyv_f64 r = simd_cconjugate_f64(a);
            npyv_store2_till_f64(dst, len, r);
        }
    }
    // Path 2: contiguous source, strided destination accepted by
    // npyv_storable_stride_s64().
    else if (ssrc == 2 && npyv_storable_stride_s64(sdst)) {
        for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) {
            npyv_f64 a0 = npyv_load_f64(src);
            npyv_f64 a1 = npyv_load_f64(src + vstep);
            npyv_f64 r0 = simd_cconjugate_f64(a0);
            npyv_f64 r1 = simd_cconjugate_f64(a1);
            npyv_storen2_f64(dst, sdst, r0);
            // The second vector starts hstep complex elements further on.
            npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
        }
        for (; len > 0; len -= hstep, src += vstep, dst += sdst*hstep) {
            npyv_f64 a = npyv_load2_tillz_f64(src, len);
            npyv_f64 r = simd_cconjugate_f64(a);
            npyv_storen2_till_f64(dst, sdst, len, r);
        }
    }
    // Path 3: strided source accepted by npyv_loadable_stride_s64(),
    // contiguous destination.
    else if (sdst == 2 && npyv_loadable_stride_s64(ssrc)) {
        for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) {
            npyv_f64 a0 = npyv_loadn2_f64(src, ssrc);
            npyv_f64 a1 = npyv_loadn2_f64(src + ssrc*hstep, ssrc);
            npyv_f64 r0 = simd_cconjugate_f64(a0);
            npyv_f64 r1 = simd_cconjugate_f64(a1);
            npyv_store_f64(dst, r0);
            npyv_store_f64(dst + vstep, r1);
        }
        for (; len > 0; len -= hstep, src += ssrc*hstep, dst += vstep) {
            // NOTE(review): this cast drops the const qualifier of `src`; the
            // other loads in this function pass `src` without a cast.
            npyv_f64 a = npyv_loadn2_tillz_f64((npy_double*)src, ssrc, len);
            npyv_f64 r = simd_cconjugate_f64(a);
            npyv_store2_till_f64(dst, len, r);
        }
    }
    else {
        // Both sides strided: no vector path here, use the scalar loop.
        goto loop_scalar;
    }
    npyv_cleanup();
    return;
loop_scalar:
#endif
    // Scalar fallback, one element per iteration using the original byte
    // pointers and byte strides. Both parts are read before either is
    // written.
    for (; len > 0; --len, b_src += b_ssrc, b_dst += b_sdst) {
        const npy_double rl = ((npy_double *)b_src)[0];
        const npy_double im = ((npy_double *)b_src)[1];
    // The `#if 0` branch below is the complex-square formula; it is compiled
    // out for this function, which uses the conjugate branch after `#else`.
    #if 0
        ((npy_double *)b_dst)[0] = rl*rl - im*im;
        ((npy_double *)b_dst)[1] = rl*im + im*rl;
    #else
        // conj(rl + i*im) = rl - i*im
        ((npy_double *)b_dst)[0] = rl;
        ((npy_double *)b_dst)[1] = -im;
    #endif
    }
}

#line 630
/*
 * Unary inner loop: square of double-precision complex elements.
 *
 * Reads dimensions[0] elements starting at args[0] and writes the results
 * starting at args[1]; steps[0] and steps[1] are the source and destination
 * strides in bytes. Each element is a pair of npy_double values, real part
 * first and imaginary part second.
 *
 * When NPY_SIMD_F64 is enabled, one of three vector paths is selected from
 * the strides. Every other case (is_mem_overlap() reports overlap, a stride
 * is not a multiple of sizeof(npy_double), or the npyv stride check fails)
 * is handled by the scalar loop at the bottom.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CDOUBLE_square)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    npy_intp len = dimensions[0];
    char *b_src = args[0], *b_dst = args[1];
    npy_intp b_ssrc = steps[0], b_sdst = steps[1];
#if NPY_SIMD_F64
    // The vector paths are only attempted when is_mem_overlap() reports no
    // overlap and both byte strides are whole multiples of sizeof(npy_double).
    if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) ||
        b_sdst % sizeof(npy_double) != 0 ||
        b_ssrc % sizeof(npy_double) != 0
    ) {
        goto loop_scalar;
    }
    const npy_double *src  = (npy_double*)b_src;
          npy_double *dst  = (npy_double*)b_dst;
    // Strides re-expressed in units of npy_double instead of bytes.
    const npy_intp ssrc = b_ssrc / sizeof(npy_double);
    const npy_intp sdst = b_sdst / sizeof(npy_double);

    // vstep: doubles per vector, and also the number of complex elements
    //        consumed by one iteration of the two-vector main loops.
    // wstep: doubles covered by two vectors.
    // hstep: complex elements held by a single vector (two doubles each).
    const int vstep = npyv_nlanes_f64;
    const int wstep = vstep * 2;
    const int hstep = vstep / 2;

    // Path 1: source and destination are both contiguous
    // (a stride of 2 doubles is exactly one complex element).
    if (ssrc == 2 && ssrc == sdst) {
        // Main loop: two vectors, i.e. vstep complex elements, per iteration.
        for (; len >= vstep; len -= vstep, src += wstep, dst += wstep) {
            npyv_f64 a0 = npyv_load_f64(src);
            npyv_f64 a1 = npyv_load_f64(src + vstep);
            npyv_f64 r0 = simd_csquare_f64(a0);
            npyv_f64 r1 = simd_csquare_f64(a1);
            npyv_store_f64(dst, r0);
            npyv_store_f64(dst + vstep, r1);
        }
        // Tail: one vector per iteration. The remaining count `len` is passed
        // to the *_tillz load and *_till store; presumably they touch only the
        // first `len` element pairs -- confirm against the npyv documentation.
        for (; len > 0; len -= hstep, src += vstep, dst += vstep) {
            npyv_f64 a = npyv_load2_tillz_f64(src, len);
            npyv_f64 r = simd_csquare_f64(a);
            npyv_store2_till_f64(dst, len, r);
        }
    }
    // Path 2: contiguous source, strided destination accepted by
    // npyv_storable_stride_s64().
    else if (ssrc == 2 && npyv_storable_stride_s64(sdst)) {
        for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) {
            npyv_f64 a0 = npyv_load_f64(src);
            npyv_f64 a1 = npyv_load_f64(src + vstep);
            npyv_f64 r0 = simd_csquare_f64(a0);
            npyv_f64 r1 = simd_csquare_f64(a1);
            npyv_storen2_f64(dst, sdst, r0);
            // The second vector starts hstep complex elements further on.
            npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
        }
        for (; len > 0; len -= hstep, src += vstep, dst += sdst*hstep) {
            npyv_f64 a = npyv_load2_tillz_f64(src, len);
            npyv_f64 r = simd_csquare_f64(a);
            npyv_storen2_till_f64(dst, sdst, len, r);
        }
    }
    // Path 3: strided source accepted by npyv_loadable_stride_s64(),
    // contiguous destination.
    else if (sdst == 2 && npyv_loadable_stride_s64(ssrc)) {
        for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) {
            npyv_f64 a0 = npyv_loadn2_f64(src, ssrc);
            npyv_f64 a1 = npyv_loadn2_f64(src + ssrc*hstep, ssrc);
            npyv_f64 r0 = simd_csquare_f64(a0);
            npyv_f64 r1 = simd_csquare_f64(a1);
            npyv_store_f64(dst, r0);
            npyv_store_f64(dst + vstep, r1);
        }
        for (; len > 0; len -= hstep, src += ssrc*hstep, dst += vstep) {
            // NOTE(review): this cast drops the const qualifier of `src`; the
            // other loads in this function pass `src` without a cast.
            npyv_f64 a = npyv_loadn2_tillz_f64((npy_double*)src, ssrc, len);
            npyv_f64 r = simd_csquare_f64(a);
            npyv_store2_till_f64(dst, len, r);
        }
    }
    else {
        // Both sides strided: no vector path here, use the scalar loop.
        goto loop_scalar;
    }
    npyv_cleanup();
    return;
loop_scalar:
#endif
    // Scalar fallback, one element per iteration using the original byte
    // pointers and byte strides. Both parts are read before either is
    // written.
    for (; len > 0; --len, b_src += b_ssrc, b_dst += b_sdst) {
        const npy_double rl = ((npy_double *)b_src)[0];
        const npy_double im = ((npy_double *)b_src)[1];
    // The `#if 1` branch below is the one compiled for this function; the
    // `#else` branch is the conjugate formula and is compiled out.
    #if 1
        // (rl + i*im)^2 = (rl*rl - im*im) + i*(rl*im + im*rl)
        ((npy_double *)b_dst)[0] = rl*rl - im*im;
        ((npy_double *)b_dst)[1] = rl*im + im*rl;
    #else
        ((npy_double *)b_dst)[0] = rl;
        ((npy_double *)b_dst)[1] = -im;
    #endif
    }
}



