#line 1 "numpy/core/src/umath/matmul.c.src"

/*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 **       Changes should be made to the original source (.src) file         **
 *****************************************************************************
 */

#line 1
/* -*- c -*- */
#define PY_SSIZE_T_CLEAN
#include <Python.h>

#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "npy_config.h"
#include "numpy/npy_common.h"
#include "numpy/arrayobject.h"
#include "numpy/ufuncobject.h"
#include "numpy/npy_math.h"
#include "numpy/halffloat.h"
#include "lowlevel_strided_loops.h"

#include "npy_pycompat.h"

#include "npy_cblas.h"
#include "arraytypes.h" /* For TYPE_dot functions */

#include <assert.h>

/*
 *****************************************************************************
 **                            BASICS                                       **
 *****************************************************************************
 */

#if defined(HAVE_CBLAS)
/*
 * BLAS_MAXSIZE: upper bound applied to dimensions and leading dimensions
 * before they are handed to BLAS. It is derived from NPY_MAX_INT, or from
 * NPY_MAX_INT64 when HAVE_BLAS_ILP64 is defined.
 *
 * -1 to be conservative, in case blas internally uses a for loop with an
 * inclusive upper bound
 */
#ifndef HAVE_BLAS_ILP64
#define BLAS_MAXSIZE (NPY_MAX_INT - 1)
#else
#define BLAS_MAXSIZE (NPY_MAX_INT64 - 1)
#endif

/*
 * Decide whether a 2d operand, described by its byte strides, can be passed
 * to BLAS as-is. All of the following must hold:
 * 1. The faster (second) axis is contiguous: byte_stride2 == itemsize
 * 2. The slower (first) axis stride is a whole number of items
 * 3. That stride, counted in items, is at least d2 (so rows do not overlap)
 *    and no larger than BLAS_MAXSIZE
 * d1 is accepted for symmetry with the call sites but is not inspected.
 */
static inline npy_bool
is_blasable2d(npy_intp byte_stride1, npy_intp byte_stride2,
              npy_intp d1, npy_intp d2,  npy_intp itemsize)
{
    npy_intp lead_items;

    if (byte_stride2 != itemsize) {
        return NPY_FALSE;
    }
    if (byte_stride1 % itemsize != 0) {
        return NPY_FALSE;
    }
    lead_items = byte_stride1 / itemsize;
    if (lead_items < d2 || lead_items > BLAS_MAXSIZE) {
        return NPY_FALSE;
    }
    return NPY_TRUE;
}

/*
 * Complex constants (1, 0) and (0, 0). Their addresses are passed as the
 * two scalar arguments of the complex BLAS calls below: the F pair to the
 * cblas_c* routines, the D pair to the cblas_z* routines.
 */
static const npy_cdouble oneD = {1.0, 0.0}, zeroD = {0.0, 0.0};
static const npy_cfloat  oneF = {1.0, 0.0}, zeroF = {0.0, 0.0};

#line 77
NPY_NO_EXPORT void
FLOAT_gemv(void *ip1, npy_intp is1_m, npy_intp is1_n,
            void *ip2, npy_intp is2_n, npy_intp NPY_UNUSED(is2_p),
            void *op, npy_intp op_m, npy_intp NPY_UNUSED(op_p),
            npy_intp m, npy_intp n, npy_intp NPY_UNUSED(p))
{
    /*
     * Matrix times vector through Level 2 BLAS (cblas_sgemv).
     *   ip1: m*n matrix, byte strides is1_m / is1_n, blasable in one of
     *        its two orientations
     *   ip2: vector of n items, byte stride is2_n
     *   op:  vector of m items, byte stride op_m
     * The matrix is always handed over as an N x M operand with CblasTrans;
     * only the storage order and leading dimension depend on its layout.
     */
    enum CBLAS_ORDER layout;
    CBLAS_INT rows, cols, ld_a;

    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE);
    assert(is_blasable2d(is2_n, sizeof(npy_float), n, 1, sizeof(npy_float)));
    rows = (CBLAS_INT)m;
    cols = (CBLAS_INT)n;

    if (!is_blasable2d(is1_m, is1_n, m, n, sizeof(npy_float))) {
        /*
         * The caller is expected to guarantee that the swapped orientation
         * is blasable; this is only verified in debug builds.
         */
        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(npy_float)));
        layout = CblasRowMajor;
        ld_a = (CBLAS_INT)(is1_n / sizeof(npy_float));
    }
    else {
        layout = CblasColMajor;
        ld_a = (CBLAS_INT)(is1_m / sizeof(npy_float));
    }

    CBLAS_FUNC(cblas_sgemv)(layout, CblasTrans, cols, rows, 1.F,
                            ip1, ld_a,
                            ip2, is2_n / sizeof(npy_float),
                            0.F,
                            op, op_m / sizeof(npy_float));
}

NPY_NO_EXPORT void
FLOAT_matmul_matrixmatrix(void *ip1, npy_intp is1_m, npy_intp is1_n,
                           void *ip2, npy_intp is2_n, npy_intp is2_p,
                           void *op, npy_intp os_m, npy_intp os_p,
                           npy_intp m, npy_intp n, npy_intp p)
{
    /*
     * Matrix times matrix through Level 3 BLAS.
     *   ip1: m*n operand, byte strides is1_m / is1_n
     *   ip2: n*p operand, byte strides is2_n / is2_p
     *   op:  m*p result,  byte strides os_m / os_p (must be blasable)
     * Each input is passed untransposed when it is blasable as given and
     * transposed otherwise. cblas_ssyrk is used when the second operand is
     * the transpose of the first; cblas_sgemm handles everything else.
     */
    enum CBLAS_ORDER layout = CblasRowMajor;
    enum CBLAS_TRANSPOSE op_a, op_b;
    CBLAS_INT rows, inner, cols, ld_a, ld_b, ld_c;
    npy_float *result = (npy_float *)op;
    npy_intp r, c;
    int is_a_times_at;

    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE && p <= BLAS_MAXSIZE);
    rows = (CBLAS_INT)m;
    inner = (CBLAS_INT)n;
    cols = (CBLAS_INT)p;

    assert(is_blasable2d(os_m, os_p, m, p, sizeof(npy_float)));
    ld_c = (CBLAS_INT)(os_m / sizeof(npy_float));

    /* First operand: the fallback orientation is only checked in debug builds */
    if (!is_blasable2d(is1_m, is1_n, m, n, sizeof(npy_float))) {
        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(npy_float)));
        op_a = CblasTrans;
        ld_a = (CBLAS_INT)(is1_n / sizeof(npy_float));
    }
    else {
        op_a = CblasNoTrans;
        ld_a = (CBLAS_INT)(is1_m / sizeof(npy_float));
    }

    /* Second operand: same scheme */
    if (!is_blasable2d(is2_n, is2_p, n, p, sizeof(npy_float))) {
        assert(is_blasable2d(is2_p, is2_n, p, n, sizeof(npy_float)));
        op_b = CblasTrans;
        ld_b = (CBLAS_INT)(is2_p / sizeof(npy_float));
    }
    else {
        op_b = CblasNoTrans;
        ld_b = (CBLAS_INT)(is2_n / sizeof(npy_float));
    }

    /* Same buffer, square result, mirrored strides: a matrix times its transpose */
    is_a_times_at = (ip1 == ip2) &&
                    (m == p) &&
                    (is1_m == is2_p) &&
                    (is1_n == is2_n) &&
                    (op_a != op_b);

    if (!is_a_times_at) {
        CBLAS_FUNC(cblas_sgemm)(
            layout, op_a, op_b, rows, cols, inner, 1.F, ip1, ld_a,
            ip2, ld_b, 0.F, op, ld_c);
        return;
    }

    /* syrk fills the upper triangle only */
    CBLAS_FUNC(cblas_ssyrk)(
        layout, CblasUpper, op_a, cols, inner, 1.F,
        ip1, (op_a == CblasNoTrans) ? ld_a : ld_b, 0.F, op, ld_c);

    /* Mirror the upper triangle into the lower one */
    for (r = 0; r < cols; r++) {
        for (c = r + 1; c < cols; c++) {
            result[c * ld_c + r] = result[r * ld_c + c];
        }
    }
}


#line 77
NPY_NO_EXPORT void
DOUBLE_gemv(void *ip1, npy_intp is1_m, npy_intp is1_n,
            void *ip2, npy_intp is2_n, npy_intp NPY_UNUSED(is2_p),
            void *op, npy_intp op_m, npy_intp NPY_UNUSED(op_p),
            npy_intp m, npy_intp n, npy_intp NPY_UNUSED(p))
{
    /*
     * Matrix times vector through Level 2 BLAS (cblas_dgemv).
     *   ip1: m*n matrix, byte strides is1_m / is1_n, blasable in one of
     *        its two orientations
     *   ip2: vector of n items, byte stride is2_n
     *   op:  vector of m items, byte stride op_m
     * The matrix is always handed over as an N x M operand with CblasTrans;
     * only the storage order and leading dimension depend on its layout.
     */
    enum CBLAS_ORDER layout;
    CBLAS_INT rows, cols, ld_a;

    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE);
    assert(is_blasable2d(is2_n, sizeof(npy_double), n, 1, sizeof(npy_double)));
    rows = (CBLAS_INT)m;
    cols = (CBLAS_INT)n;

    if (!is_blasable2d(is1_m, is1_n, m, n, sizeof(npy_double))) {
        /*
         * The caller is expected to guarantee that the swapped orientation
         * is blasable; this is only verified in debug builds.
         */
        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(npy_double)));
        layout = CblasRowMajor;
        ld_a = (CBLAS_INT)(is1_n / sizeof(npy_double));
    }
    else {
        layout = CblasColMajor;
        ld_a = (CBLAS_INT)(is1_m / sizeof(npy_double));
    }

    CBLAS_FUNC(cblas_dgemv)(layout, CblasTrans, cols, rows, 1.,
                            ip1, ld_a,
                            ip2, is2_n / sizeof(npy_double),
                            0.,
                            op, op_m / sizeof(npy_double));
}

NPY_NO_EXPORT void
DOUBLE_matmul_matrixmatrix(void *ip1, npy_intp is1_m, npy_intp is1_n,
                           void *ip2, npy_intp is2_n, npy_intp is2_p,
                           void *op, npy_intp os_m, npy_intp os_p,
                           npy_intp m, npy_intp n, npy_intp p)
{
    /*
     * Matrix times matrix through Level 3 BLAS.
     *   ip1: m*n operand, byte strides is1_m / is1_n
     *   ip2: n*p operand, byte strides is2_n / is2_p
     *   op:  m*p result,  byte strides os_m / os_p (must be blasable)
     * Each input is passed untransposed when it is blasable as given and
     * transposed otherwise. cblas_dsyrk is used when the second operand is
     * the transpose of the first; cblas_dgemm handles everything else.
     */
    enum CBLAS_ORDER layout = CblasRowMajor;
    enum CBLAS_TRANSPOSE op_a, op_b;
    CBLAS_INT rows, inner, cols, ld_a, ld_b, ld_c;
    npy_double *result = (npy_double *)op;
    npy_intp r, c;
    int is_a_times_at;

    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE && p <= BLAS_MAXSIZE);
    rows = (CBLAS_INT)m;
    inner = (CBLAS_INT)n;
    cols = (CBLAS_INT)p;

    assert(is_blasable2d(os_m, os_p, m, p, sizeof(npy_double)));
    ld_c = (CBLAS_INT)(os_m / sizeof(npy_double));

    /* First operand: the fallback orientation is only checked in debug builds */
    if (!is_blasable2d(is1_m, is1_n, m, n, sizeof(npy_double))) {
        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(npy_double)));
        op_a = CblasTrans;
        ld_a = (CBLAS_INT)(is1_n / sizeof(npy_double));
    }
    else {
        op_a = CblasNoTrans;
        ld_a = (CBLAS_INT)(is1_m / sizeof(npy_double));
    }

    /* Second operand: same scheme */
    if (!is_blasable2d(is2_n, is2_p, n, p, sizeof(npy_double))) {
        assert(is_blasable2d(is2_p, is2_n, p, n, sizeof(npy_double)));
        op_b = CblasTrans;
        ld_b = (CBLAS_INT)(is2_p / sizeof(npy_double));
    }
    else {
        op_b = CblasNoTrans;
        ld_b = (CBLAS_INT)(is2_n / sizeof(npy_double));
    }

    /* Same buffer, square result, mirrored strides: a matrix times its transpose */
    is_a_times_at = (ip1 == ip2) &&
                    (m == p) &&
                    (is1_m == is2_p) &&
                    (is1_n == is2_n) &&
                    (op_a != op_b);

    if (!is_a_times_at) {
        CBLAS_FUNC(cblas_dgemm)(
            layout, op_a, op_b, rows, cols, inner, 1., ip1, ld_a,
            ip2, ld_b, 0., op, ld_c);
        return;
    }

    /* syrk fills the upper triangle only */
    CBLAS_FUNC(cblas_dsyrk)(
        layout, CblasUpper, op_a, cols, inner, 1.,
        ip1, (op_a == CblasNoTrans) ? ld_a : ld_b, 0., op, ld_c);

    /* Mirror the upper triangle into the lower one */
    for (r = 0; r < cols; r++) {
        for (c = r + 1; c < cols; c++) {
            result[c * ld_c + r] = result[r * ld_c + c];
        }
    }
}


#line 77
NPY_NO_EXPORT void
CFLOAT_gemv(void *ip1, npy_intp is1_m, npy_intp is1_n,
            void *ip2, npy_intp is2_n, npy_intp NPY_UNUSED(is2_p),
            void *op, npy_intp op_m, npy_intp NPY_UNUSED(op_p),
            npy_intp m, npy_intp n, npy_intp NPY_UNUSED(p))
{
    /*
     * Matrix times vector through Level 2 BLAS (cblas_cgemv).
     *   ip1: m*n matrix, byte strides is1_m / is1_n, blasable in one of
     *        its two orientations
     *   ip2: vector of n items, byte stride is2_n
     *   op:  vector of m items, byte stride op_m
     * The matrix is always handed over as an N x M operand with CblasTrans;
     * only the storage order and leading dimension depend on its layout.
     */
    enum CBLAS_ORDER layout;
    CBLAS_INT rows, cols, ld_a;

    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE);
    assert(is_blasable2d(is2_n, sizeof(npy_cfloat), n, 1, sizeof(npy_cfloat)));
    rows = (CBLAS_INT)m;
    cols = (CBLAS_INT)n;

    if (!is_blasable2d(is1_m, is1_n, m, n, sizeof(npy_cfloat))) {
        /*
         * The caller is expected to guarantee that the swapped orientation
         * is blasable; this is only verified in debug builds.
         */
        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(npy_cfloat)));
        layout = CblasRowMajor;
        ld_a = (CBLAS_INT)(is1_n / sizeof(npy_cfloat));
    }
    else {
        layout = CblasColMajor;
        ld_a = (CBLAS_INT)(is1_m / sizeof(npy_cfloat));
    }

    CBLAS_FUNC(cblas_cgemv)(layout, CblasTrans, cols, rows, &oneF,
                            ip1, ld_a,
                            ip2, is2_n / sizeof(npy_cfloat),
                            &zeroF,
                            op, op_m / sizeof(npy_cfloat));
}

NPY_NO_EXPORT void
CFLOAT_matmul_matrixmatrix(void *ip1, npy_intp is1_m, npy_intp is1_n,
                           void *ip2, npy_intp is2_n, npy_intp is2_p,
                           void *op, npy_intp os_m, npy_intp os_p,
                           npy_intp m, npy_intp n, npy_intp p)
{
    /*
     * Matrix times matrix through Level 3 BLAS.
     *   ip1: m*n operand, byte strides is1_m / is1_n
     *   ip2: n*p operand, byte strides is2_n / is2_p
     *   op:  m*p result,  byte strides os_m / os_p (must be blasable)
     * Each input is passed untransposed when it is blasable as given and
     * transposed otherwise. cblas_csyrk is used when the second operand is
     * the transpose of the first; cblas_cgemm handles everything else.
     */
    enum CBLAS_ORDER layout = CblasRowMajor;
    enum CBLAS_TRANSPOSE op_a, op_b;
    CBLAS_INT rows, inner, cols, ld_a, ld_b, ld_c;
    npy_cfloat *result = (npy_cfloat *)op;
    npy_intp r, c;
    int is_a_times_at;

    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE && p <= BLAS_MAXSIZE);
    rows = (CBLAS_INT)m;
    inner = (CBLAS_INT)n;
    cols = (CBLAS_INT)p;

    assert(is_blasable2d(os_m, os_p, m, p, sizeof(npy_cfloat)));
    ld_c = (CBLAS_INT)(os_m / sizeof(npy_cfloat));

    /* First operand: the fallback orientation is only checked in debug builds */
    if (!is_blasable2d(is1_m, is1_n, m, n, sizeof(npy_cfloat))) {
        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(npy_cfloat)));
        op_a = CblasTrans;
        ld_a = (CBLAS_INT)(is1_n / sizeof(npy_cfloat));
    }
    else {
        op_a = CblasNoTrans;
        ld_a = (CBLAS_INT)(is1_m / sizeof(npy_cfloat));
    }

    /* Second operand: same scheme */
    if (!is_blasable2d(is2_n, is2_p, n, p, sizeof(npy_cfloat))) {
        assert(is_blasable2d(is2_p, is2_n, p, n, sizeof(npy_cfloat)));
        op_b = CblasTrans;
        ld_b = (CBLAS_INT)(is2_p / sizeof(npy_cfloat));
    }
    else {
        op_b = CblasNoTrans;
        ld_b = (CBLAS_INT)(is2_n / sizeof(npy_cfloat));
    }

    /* Same buffer, square result, mirrored strides: a matrix times its transpose */
    is_a_times_at = (ip1 == ip2) &&
                    (m == p) &&
                    (is1_m == is2_p) &&
                    (is1_n == is2_n) &&
                    (op_a != op_b);

    if (!is_a_times_at) {
        CBLAS_FUNC(cblas_cgemm)(
            layout, op_a, op_b, rows, cols, inner, &oneF, ip1, ld_a,
            ip2, ld_b, &zeroF, op, ld_c);
        return;
    }

    /* syrk fills the upper triangle only */
    CBLAS_FUNC(cblas_csyrk)(
        layout, CblasUpper, op_a, cols, inner, &oneF,
        ip1, (op_a == CblasNoTrans) ? ld_a : ld_b, &zeroF, op, ld_c);

    /* Mirror the upper triangle into the lower one */
    for (r = 0; r < cols; r++) {
        for (c = r + 1; c < cols; c++) {
            result[c * ld_c + r] = result[r * ld_c + c];
        }
    }
}


#line 77
NPY_NO_EXPORT void
CDOUBLE_gemv(void *ip1, npy_intp is1_m, npy_intp is1_n,
            void *ip2, npy_intp is2_n, npy_intp NPY_UNUSED(is2_p),
            void *op, npy_intp op_m, npy_intp NPY_UNUSED(op_p),
            npy_intp m, npy_intp n, npy_intp NPY_UNUSED(p))
{
    /*
     * Matrix times vector through Level 2 BLAS (cblas_zgemv).
     *   ip1: m*n matrix, byte strides is1_m / is1_n, blasable in one of
     *        its two orientations
     *   ip2: vector of n items, byte stride is2_n
     *   op:  vector of m items, byte stride op_m
     * The matrix is always handed over as an N x M operand with CblasTrans;
     * only the storage order and leading dimension depend on its layout.
     */
    enum CBLAS_ORDER layout;
    CBLAS_INT rows, cols, ld_a;

    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE);
    assert(is_blasable2d(is2_n, sizeof(npy_cdouble), n, 1, sizeof(npy_cdouble)));
    rows = (CBLAS_INT)m;
    cols = (CBLAS_INT)n;

    if (!is_blasable2d(is1_m, is1_n, m, n, sizeof(npy_cdouble))) {
        /*
         * The caller is expected to guarantee that the swapped orientation
         * is blasable; this is only verified in debug builds.
         */
        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(npy_cdouble)));
        layout = CblasRowMajor;
        ld_a = (CBLAS_INT)(is1_n / sizeof(npy_cdouble));
    }
    else {
        layout = CblasColMajor;
        ld_a = (CBLAS_INT)(is1_m / sizeof(npy_cdouble));
    }

    CBLAS_FUNC(cblas_zgemv)(layout, CblasTrans, cols, rows, &oneD,
                            ip1, ld_a,
                            ip2, is2_n / sizeof(npy_cdouble),
                            &zeroD,
                            op, op_m / sizeof(npy_cdouble));
}

NPY_NO_EXPORT void
CDOUBLE_matmul_matrixmatrix(void *ip1, npy_intp is1_m, npy_intp is1_n,
                           void *ip2, npy_intp is2_n, npy_intp is2_p,
                           void *op, npy_intp os_m, npy_intp os_p,
                           npy_intp m, npy_intp n, npy_intp p)
{
    /*
     * Matrix times matrix through Level 3 BLAS.
     *   ip1: m*n operand, byte strides is1_m / is1_n
     *   ip2: n*p operand, byte strides is2_n / is2_p
     *   op:  m*p result,  byte strides os_m / os_p (must be blasable)
     * Each input is passed untransposed when it is blasable as given and
     * transposed otherwise. cblas_zsyrk is used when the second operand is
     * the transpose of the first; cblas_zgemm handles everything else.
     */
    enum CBLAS_ORDER layout = CblasRowMajor;
    enum CBLAS_TRANSPOSE op_a, op_b;
    CBLAS_INT rows, inner, cols, ld_a, ld_b, ld_c;
    npy_cdouble *result = (npy_cdouble *)op;
    npy_intp r, c;
    int is_a_times_at;

    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE && p <= BLAS_MAXSIZE);
    rows = (CBLAS_INT)m;
    inner = (CBLAS_INT)n;
    cols = (CBLAS_INT)p;

    assert(is_blasable2d(os_m, os_p, m, p, sizeof(npy_cdouble)));
    ld_c = (CBLAS_INT)(os_m / sizeof(npy_cdouble));

    /* First operand: the fallback orientation is only checked in debug builds */
    if (!is_blasable2d(is1_m, is1_n, m, n, sizeof(npy_cdouble))) {
        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(npy_cdouble)));
        op_a = CblasTrans;
        ld_a = (CBLAS_INT)(is1_n / sizeof(npy_cdouble));
    }
    else {
        op_a = CblasNoTrans;
        ld_a = (CBLAS_INT)(is1_m / sizeof(npy_cdouble));
    }

    /* Second operand: same scheme */
    if (!is_blasable2d(is2_n, is2_p, n, p, sizeof(npy_cdouble))) {
        assert(is_blasable2d(is2_p, is2_n, p, n, sizeof(npy_cdouble)));
        op_b = CblasTrans;
        ld_b = (CBLAS_INT)(is2_p / sizeof(npy_cdouble));
    }
    else {
        op_b = CblasNoTrans;
        ld_b = (CBLAS_INT)(is2_n / sizeof(npy_cdouble));
    }

    /* Same buffer, square result, mirrored strides: a matrix times its transpose */
    is_a_times_at = (ip1 == ip2) &&
                    (m == p) &&
                    (is1_m == is2_p) &&
                    (is1_n == is2_n) &&
                    (op_a != op_b);

    if (!is_a_times_at) {
        CBLAS_FUNC(cblas_zgemm)(
            layout, op_a, op_b, rows, cols, inner, &oneD, ip1, ld_a,
            ip2, ld_b, &zeroD, op, ld_c);
        return;
    }

    /* syrk fills the upper triangle only */
    CBLAS_FUNC(cblas_zsyrk)(
        layout, CblasUpper, op_a, cols, inner, &oneD,
        ip1, (op_a == CblasNoTrans) ? ld_a : ld_b, &zeroD, op, ld_c);

    /* Mirror the upper triangle into the lower one */
    for (r = 0; r < cols; r++) {
        for (c = r + 1; c < cols; c++) {
            result[c * ld_c + r] = result[r * ld_c + c];
        }
    }
}


#endif

/*
 * matmul loops
 * signature is (m?,n),(n,p?)->(m?,p?)
 */

#line 215

NPY_NO_EXPORT void
LONGDOUBLE_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    /*
     * Triple-loop matrix product for npy_longdouble (no BLAS call).
     *   _ip1: dm*dn operand, byte strides is1_m / is1_n
     *   _ip2: dn*dp operand, byte strides is2_n / is2_p
     *   _op:  dm*dp result,  byte strides os_m / os_p
     * Each output element is zeroed and then accumulated in place.
     */
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;
    npy_intp i, j, k;

    for (i = 0; i < dm; i++) {
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (j = 0; j < dp; j++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;

            *(npy_longdouble *)out = 0;
            for (k = 0; k < dn; k++) {
                npy_longdouble a = *(npy_longdouble *)lhs;
                npy_longdouble b = *(npy_longdouble *)rhs;

                *(npy_longdouble *)out += a * b;
                lhs += is1_n;
                rhs += is2_n;
            }
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}


#line 215

NPY_NO_EXPORT void
FLOAT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    /*
     * Triple-loop matrix product for npy_float (no BLAS call).
     *   _ip1: dm*dn operand, byte strides is1_m / is1_n
     *   _ip2: dn*dp operand, byte strides is2_n / is2_p
     *   _op:  dm*dp result,  byte strides os_m / os_p
     * Each output element is zeroed and then accumulated in place.
     */
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;
    npy_intp i, j, k;

    for (i = 0; i < dm; i++) {
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (j = 0; j < dp; j++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;

            *(npy_float *)out = 0;
            for (k = 0; k < dn; k++) {
                npy_float a = *(npy_float *)lhs;
                npy_float b = *(npy_float *)rhs;

                *(npy_float *)out += a * b;
                lhs += is1_n;
                rhs += is2_n;
            }
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}


#line 215

NPY_NO_EXPORT void
DOUBLE_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    /*
     * Triple-loop matrix product for npy_double (no BLAS call).
     *   _ip1: dm*dn operand, byte strides is1_m / is1_n
     *   _ip2: dn*dp operand, byte strides is2_n / is2_p
     *   _op:  dm*dp result,  byte strides os_m / os_p
     * Each output element is zeroed and then accumulated in place.
     */
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;
    npy_intp i, j, k;

    for (i = 0; i < dm; i++) {
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (j = 0; j < dp; j++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;

            *(npy_double *)out = 0;
            for (k = 0; k < dn; k++) {
                npy_double a = *(npy_double *)lhs;
                npy_double b = *(npy_double *)rhs;

                *(npy_double *)out += a * b;
                lhs += is1_n;
                rhs += is2_n;
            }
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}


#line 215

NPY_NO_EXPORT void
HALF_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    /*
     * Triple-loop matrix product for npy_half (no BLAS call).
     *   _ip1: dm*dn operand, byte strides is1_m / is1_n
     *   _ip2: dn*dp operand, byte strides is2_n / is2_p
     *   _op:  dm*dp result,  byte strides os_m / os_p
     * Products are summed in a float; the output element is written once,
     * after the inner loop, via npy_float_to_half.
     */
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;
    npy_intp i, j, k;

    for (i = 0; i < dm; i++) {
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (j = 0; j < dp; j++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;
            float acc = 0;

            for (k = 0; k < dn; k++) {
                npy_half a = *(npy_half *)lhs;
                npy_half b = *(npy_half *)rhs;

                acc += npy_half_to_float(a) * npy_half_to_float(b);
                lhs += is1_n;
                rhs += is2_n;
            }
            *(npy_half *)out = npy_float_to_half(acc);
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}


#line 215

NPY_NO_EXPORT void
CFLOAT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    /*
     * Triple-loop matrix product for npy_cfloat (no BLAS call).
     *   _ip1: dm*dn operand, byte strides is1_m / is1_n
     *   _ip2: dn*dp operand, byte strides is2_n / is2_p
     *   _op:  dm*dp result,  byte strides os_m / os_p
     * Each output element is zeroed and then accumulated in place, real
     * part first, using copies of the two input elements.
     */
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;
    npy_intp i, j, k;

    for (i = 0; i < dm; i++) {
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (j = 0; j < dp; j++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;
            npy_cfloat *acc = (npy_cfloat *)out;

            acc->real = 0;
            acc->imag = 0;
            for (k = 0; k < dn; k++) {
                npy_cfloat a = *(npy_cfloat *)lhs;
                npy_cfloat b = *(npy_cfloat *)rhs;

                acc->real += (a.real * b.real) - (a.imag * b.imag);
                acc->imag += (a.real * b.imag) + (a.imag * b.real);
                lhs += is1_n;
                rhs += is2_n;
            }
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}


#line 215

NPY_NO_EXPORT void
CDOUBLE_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    /*
     * Triple-loop matrix product for npy_cdouble (no BLAS call).
     *   _ip1: dm*dn operand, byte strides is1_m / is1_n
     *   _ip2: dn*dp operand, byte strides is2_n / is2_p
     *   _op:  dm*dp result,  byte strides os_m / os_p
     * Each output element is zeroed and then accumulated in place, real
     * part first, using copies of the two input elements.
     */
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;
    npy_intp i, j, k;

    for (i = 0; i < dm; i++) {
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (j = 0; j < dp; j++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;
            npy_cdouble *acc = (npy_cdouble *)out;

            acc->real = 0;
            acc->imag = 0;
            for (k = 0; k < dn; k++) {
                npy_cdouble a = *(npy_cdouble *)lhs;
                npy_cdouble b = *(npy_cdouble *)rhs;

                acc->real += (a.real * b.real) - (a.imag * b.imag);
                acc->imag += (a.real * b.imag) + (a.imag * b.real);
                lhs += is1_n;
                rhs += is2_n;
            }
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}


#line 215

NPY_NO_EXPORT void
CLONGDOUBLE_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    /*
     * Triple-loop matrix product for npy_clongdouble (no BLAS call).
     *   _ip1: dm*dn operand, byte strides is1_m / is1_n
     *   _ip2: dn*dp operand, byte strides is2_n / is2_p
     *   _op:  dm*dp result,  byte strides os_m / os_p
     * Each output element is zeroed and then accumulated in place, real
     * part first, using copies of the two input elements.
     */
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;
    npy_intp i, j, k;

    for (i = 0; i < dm; i++) {
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (j = 0; j < dp; j++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;
            npy_clongdouble *acc = (npy_clongdouble *)out;

            acc->real = 0;
            acc->imag = 0;
            for (k = 0; k < dn; k++) {
                npy_clongdouble a = *(npy_clongdouble *)lhs;
                npy_clongdouble b = *(npy_clongdouble *)rhs;

                acc->real += (a.real * b.real) - (a.imag * b.imag);
                acc->imag += (a.real * b.imag) + (a.imag * b.real);
                lhs += is1_n;
                rhs += is2_n;
            }
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}


#line 215

NPY_NO_EXPORT void
UBYTE_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    /*
     * Triple-loop matrix product for npy_ubyte (no BLAS call).
     *   _ip1: dm*dn operand, byte strides is1_m / is1_n
     *   _ip2: dn*dp operand, byte strides is2_n / is2_p
     *   _op:  dm*dp result,  byte strides os_m / os_p
     * Each output element is zeroed and then accumulated in place, so the
     * running sum is stored as npy_ubyte after every step.
     */
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;
    npy_intp i, j, k;

    for (i = 0; i < dm; i++) {
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (j = 0; j < dp; j++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;

            *(npy_ubyte *)out = 0;
            for (k = 0; k < dn; k++) {
                npy_ubyte a = *(npy_ubyte *)lhs;
                npy_ubyte b = *(npy_ubyte *)rhs;

                *(npy_ubyte *)out += a * b;
                lhs += is1_n;
                rhs += is2_n;
            }
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}


#line 215

/*
 * Plain C (non-BLAS) matrix product for npy_ushort elements:
 *
 *     out[m, p] = sum over n of in1[m, n] * in2[n, p]   (modulo 2**bits)
 *
 * _ip1, _ip2 and _op point at the first element of each matrix.  The
 * is1_x / is2_x / os_x arguments are byte strides along the named axis and
 * dm, dn, dp are the axis lengths.  Every output element is zeroed and then
 * accumulated in place, so dn == 0 leaves the output filled with zeros.
 */
NPY_NO_EXPORT void
USHORT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;

    for (npy_intp row = 0; row < dm; row++) {
        /* Each output row starts again at the first column of in2. */
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (npy_intp col = 0; col < dp; col++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;

            *(npy_ushort *)out = 0;
            for (npy_intp k = 0; k < dn; k++) {
                const npy_ushort a = *(npy_ushort *)lhs;
                const npy_ushort b = *(npy_ushort *)rhs;

                /*
                 * Multiply as unsigned int: a plain `a * b` promotes both
                 * operands to signed int, and 65535 * 65535 overflows a
                 * 32-bit int (undefined behaviour).  Unsigned arithmetic
                 * wraps, and the store truncates back to npy_ushort.
                 */
                *(npy_ushort *)out += (npy_uint)a * (npy_uint)b;
                lhs += is1_n;
                rhs += is2_n;
            }
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}


#line 215

/*
 * Plain C (non-BLAS) matrix product for npy_uint elements:
 *
 *     out[m, p] = sum over n of in1[m, n] * in2[n, p]
 *
 * _ip1, _ip2 and _op point at the first element of each matrix.  The
 * is1_x / is2_x / os_x arguments are byte strides along the named axis and
 * dm, dn, dp are the axis lengths.  Every output element is zeroed and then
 * accumulated in place, so dn == 0 leaves the output filled with zeros.
 */
NPY_NO_EXPORT void
UINT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;

    for (npy_intp row = 0; row < dm; row++) {
        /* Each output row starts again at the first column of in2. */
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (npy_intp col = 0; col < dp; col++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;

            *(npy_uint *)out = 0;
            for (npy_intp k = 0; k < dn; k++) {
                const npy_uint a = *(npy_uint *)lhs;
                const npy_uint b = *(npy_uint *)rhs;

                *(npy_uint *)out += a * b;
                lhs += is1_n;
                rhs += is2_n;
            }
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}


#line 215

/*
 * Plain C (non-BLAS) matrix product for npy_ulong elements:
 *
 *     out[m, p] = sum over n of in1[m, n] * in2[n, p]
 *
 * _ip1, _ip2 and _op point at the first element of each matrix.  The
 * is1_x / is2_x / os_x arguments are byte strides along the named axis and
 * dm, dn, dp are the axis lengths.  Every output element is zeroed and then
 * accumulated in place, so dn == 0 leaves the output filled with zeros.
 */
NPY_NO_EXPORT void
ULONG_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;

    for (npy_intp row = 0; row < dm; row++) {
        /* Each output row starts again at the first column of in2. */
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (npy_intp col = 0; col < dp; col++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;

            *(npy_ulong *)out = 0;
            for (npy_intp k = 0; k < dn; k++) {
                const npy_ulong a = *(npy_ulong *)lhs;
                const npy_ulong b = *(npy_ulong *)rhs;

                *(npy_ulong *)out += a * b;
                lhs += is1_n;
                rhs += is2_n;
            }
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}


#line 215

/*
 * Plain C (non-BLAS) matrix product for npy_ulonglong elements:
 *
 *     out[m, p] = sum over n of in1[m, n] * in2[n, p]
 *
 * _ip1, _ip2 and _op point at the first element of each matrix.  The
 * is1_x / is2_x / os_x arguments are byte strides along the named axis and
 * dm, dn, dp are the axis lengths.  Every output element is zeroed and then
 * accumulated in place, so dn == 0 leaves the output filled with zeros.
 */
NPY_NO_EXPORT void
ULONGLONG_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;

    for (npy_intp row = 0; row < dm; row++) {
        /* Each output row starts again at the first column of in2. */
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (npy_intp col = 0; col < dp; col++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;

            *(npy_ulonglong *)out = 0;
            for (npy_intp k = 0; k < dn; k++) {
                const npy_ulonglong a = *(npy_ulonglong *)lhs;
                const npy_ulonglong b = *(npy_ulonglong *)rhs;

                *(npy_ulonglong *)out += a * b;
                lhs += is1_n;
                rhs += is2_n;
            }
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}


#line 215

/*
 * Plain C (non-BLAS) matrix product for npy_byte elements:
 *
 *     out[m, p] = sum over n of in1[m, n] * in2[n, p]
 *
 * _ip1, _ip2 and _op point at the first element of each matrix.  The
 * is1_x / is2_x / os_x arguments are byte strides along the named axis and
 * dm, dn, dp are the axis lengths.  Every output element is zeroed and then
 * accumulated in place, so dn == 0 leaves the output filled with zeros.
 */
NPY_NO_EXPORT void
BYTE_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;

    for (npy_intp row = 0; row < dm; row++) {
        /* Each output row starts again at the first column of in2. */
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (npy_intp col = 0; col < dp; col++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;

            *(npy_byte *)out = 0;
            for (npy_intp k = 0; k < dn; k++) {
                const npy_byte a = *(npy_byte *)lhs;
                const npy_byte b = *(npy_byte *)rhs;

                *(npy_byte *)out += a * b;
                lhs += is1_n;
                rhs += is2_n;
            }
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}


#line 215

/*
 * Plain C (non-BLAS) matrix product for npy_short elements:
 *
 *     out[m, p] = sum over n of in1[m, n] * in2[n, p]
 *
 * _ip1, _ip2 and _op point at the first element of each matrix.  The
 * is1_x / is2_x / os_x arguments are byte strides along the named axis and
 * dm, dn, dp are the axis lengths.  Every output element is zeroed and then
 * accumulated in place, so dn == 0 leaves the output filled with zeros.
 */
NPY_NO_EXPORT void
SHORT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;

    for (npy_intp row = 0; row < dm; row++) {
        /* Each output row starts again at the first column of in2. */
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (npy_intp col = 0; col < dp; col++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;

            *(npy_short *)out = 0;
            for (npy_intp k = 0; k < dn; k++) {
                const npy_short a = *(npy_short *)lhs;
                const npy_short b = *(npy_short *)rhs;

                *(npy_short *)out += a * b;
                lhs += is1_n;
                rhs += is2_n;
            }
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}


#line 215

/*
 * Plain C (non-BLAS) matrix product for npy_int elements:
 *
 *     out[m, p] = sum over n of in1[m, n] * in2[n, p]
 *
 * _ip1, _ip2 and _op point at the first element of each matrix.  The
 * is1_x / is2_x / os_x arguments are byte strides along the named axis and
 * dm, dn, dp are the axis lengths.  Every output element is zeroed and then
 * accumulated in place, so dn == 0 leaves the output filled with zeros.
 *
 * Products and sums that do not fit wrap around instead of triggering
 * signed-overflow undefined behaviour.
 */
NPY_NO_EXPORT void
INT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;

    for (npy_intp row = 0; row < dm; row++) {
        /* Each output row starts again at the first column of in2. */
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (npy_intp col = 0; col < dp; col++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;

            *(npy_int *)out = 0;
            for (npy_intp k = 0; k < dn; k++) {
                const npy_int a = *(npy_int *)lhs;
                const npy_int b = *(npy_int *)rhs;
                const npy_uint acc = (npy_uint)*(npy_int *)out;

                /*
                 * Do the multiply-accumulate in the unsigned counterpart:
                 * unsigned arithmetic is defined to wrap, whereas signed
                 * overflow in `out += a * b` is undefined behaviour.
                 */
                *(npy_int *)out = (npy_int)(acc + (npy_uint)a * (npy_uint)b);
                lhs += is1_n;
                rhs += is2_n;
            }
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}


#line 215

/*
 * Plain C (non-BLAS) matrix product for npy_long elements:
 *
 *     out[m, p] = sum over n of in1[m, n] * in2[n, p]
 *
 * _ip1, _ip2 and _op point at the first element of each matrix.  The
 * is1_x / is2_x / os_x arguments are byte strides along the named axis and
 * dm, dn, dp are the axis lengths.  Every output element is zeroed and then
 * accumulated in place, so dn == 0 leaves the output filled with zeros.
 *
 * Products and sums that do not fit wrap around instead of triggering
 * signed-overflow undefined behaviour.
 */
NPY_NO_EXPORT void
LONG_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;

    for (npy_intp row = 0; row < dm; row++) {
        /* Each output row starts again at the first column of in2. */
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (npy_intp col = 0; col < dp; col++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;

            *(npy_long *)out = 0;
            for (npy_intp k = 0; k < dn; k++) {
                const npy_long a = *(npy_long *)lhs;
                const npy_long b = *(npy_long *)rhs;
                const npy_ulong acc = (npy_ulong)*(npy_long *)out;

                /*
                 * Do the multiply-accumulate in the unsigned counterpart:
                 * unsigned arithmetic is defined to wrap, whereas signed
                 * overflow in `out += a * b` is undefined behaviour.
                 */
                *(npy_long *)out = (npy_long)(acc + (npy_ulong)a * (npy_ulong)b);
                lhs += is1_n;
                rhs += is2_n;
            }
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}


#line 215

/*
 * Plain C (non-BLAS) matrix product for npy_longlong elements:
 *
 *     out[m, p] = sum over n of in1[m, n] * in2[n, p]
 *
 * _ip1, _ip2 and _op point at the first element of each matrix.  The
 * is1_x / is2_x / os_x arguments are byte strides along the named axis and
 * dm, dn, dp are the axis lengths.  Every output element is zeroed and then
 * accumulated in place, so dn == 0 leaves the output filled with zeros.
 *
 * Products and sums that do not fit wrap around instead of triggering
 * signed-overflow undefined behaviour.
 */
NPY_NO_EXPORT void
LONGLONG_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;

    for (npy_intp row = 0; row < dm; row++) {
        /* Each output row starts again at the first column of in2. */
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (npy_intp col = 0; col < dp; col++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;

            *(npy_longlong *)out = 0;
            for (npy_intp k = 0; k < dn; k++) {
                const npy_longlong a = *(npy_longlong *)lhs;
                const npy_longlong b = *(npy_longlong *)rhs;
                const npy_ulonglong acc = (npy_ulonglong)*(npy_longlong *)out;

                /*
                 * Do the multiply-accumulate in the unsigned counterpart:
                 * unsigned arithmetic is defined to wrap, whereas signed
                 * overflow in `out += a * b` is undefined behaviour.
                 */
                *(npy_longlong *)out =
                    (npy_longlong)(acc + (npy_ulonglong)a * (npy_ulonglong)b);
                lhs += is1_n;
                rhs += is2_n;
            }
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}


/*
 * Boolean matrix product without BLAS.
 *
 * out[m, p] is set to NPY_TRUE when there is some n for which both
 * in1[m, n] and in2[n, p] are nonzero, and to NPY_FALSE otherwise.  The
 * scan along n stops at the first such pair.
 *
 * _ip1, _ip2 and _op point at the first element of each matrix.  The
 * is1_x / is2_x / os_x arguments are byte strides along the named axis and
 * dm, dn, dp are the axis lengths.
 */
NPY_NO_EXPORT void
BOOL_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    char *lhs_row = (char *)_ip1;
    char *out_row = (char *)_op;

    for (npy_intp row = 0; row < dm; row++) {
        /* Each output row starts again at the first column of in2. */
        char *rhs_col = (char *)_ip2;
        char *out = out_row;

        for (npy_intp col = 0; col < dp; col++) {
            char *lhs = lhs_row;
            char *rhs = rhs_col;
            npy_intp k = 0;

            *(npy_bool *)out = NPY_FALSE;
            while (k < dn) {
                const npy_bool a = *(npy_bool *)lhs;
                const npy_bool b = *(npy_bool *)rhs;

                if (a != 0 && b != 0) {
                    /* One true pair decides the result. */
                    *(npy_bool *)out = NPY_TRUE;
                    break;
                }
                lhs += is1_n;
                rhs += is2_n;
                k++;
            }
            rhs_col += is2_p;
            out += os_p;
        }
        lhs_row += is1_m;
        out_row += os_m;
    }
}

/*
 * Matrix product for object arrays, built on the Python number protocol:
 *
 *     out[m, p] = in1[m, 0] * in2[0, p] + in1[m, 1] * in2[1, p] + ...
 *
 * using PyNumber_Multiply / PyNumber_Add.  NULL input slots are treated as
 * Py_None.  With dn == 0 every output slot receives a new Python int 0.
 *
 * _ip1, _ip2 and _op point at the first PyObject* slot of each matrix.  The
 * is1_x / is2_x / os_x arguments are byte strides along the named axis and
 * dm, dn, dp are the axis lengths.
 *
 * If a Python C-API call fails the function returns immediately, leaving
 * the remaining output slots untouched; the partial sum for the element
 * being computed is released.
 *
 * NOTE(review): the previous content of an output slot is overwritten
 * without being released -- confirm callers never pass slots that still
 * own a reference.
 */
NPY_NO_EXPORT void
OBJECT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
                           void *_op, npy_intp os_m, npy_intp os_p,
                           npy_intp dm, npy_intp dn, npy_intp dp)
{
    char *ip1 = (char *)_ip1, *ip2 = (char *)_ip2, *op = (char *)_op;

    /* Total byte distance walked along each axis, used to rewind pointers. */
    npy_intp ib1_n = is1_n * dn;
    npy_intp ib2_n = is2_n * dn;
    npy_intp ib2_p = is2_p * dp;
    npy_intp ob_p  = os_p * dp;

    for (npy_intp m = 0; m < dm; m++) {
        for (npy_intp p = 0; p < dp; p++) {
            /*
             * Owned reference to the running total of this output element.
             * It is scoped per element so that, once the reference has been
             * handed to the output slot, a failure while computing a later
             * element can never release it a second time.
             */
            PyObject *sum_of_products = NULL;

            if (0 == dn) {
                /* Empty sum: the result is the integer 0. */
                sum_of_products = PyLong_FromLong(0);
                if (sum_of_products == NULL) {
                    return;
                }
            }

            for (npy_intp n = 0; n < dn; n++) {
                PyObject *obj1 = *(PyObject**)ip1, *obj2 = *(PyObject**)ip2;
                if (obj1 == NULL) {
                    obj1 = Py_None;
                }
                if (obj2 == NULL) {
                    obj2 = Py_None;
                }

                PyObject *product = PyNumber_Multiply(obj1, obj2);
                if (product == NULL) {
                    /* Drop the partial sum (NULL when n == 0). */
                    Py_XDECREF(sum_of_products);
                    return;
                }

                if (n == 0) {
                    /* The first product seeds the sum; ownership moves over. */
                    sum_of_products = product;
                }
                else {
                    /* Py_SETREF releases the old sum even if the add fails. */
                    Py_SETREF(sum_of_products, PyNumber_Add(sum_of_products, product));
                    Py_DECREF(product);
                    if (sum_of_products == NULL) {
                        return;
                    }
                }

                ip2 += is2_n;
                ip1 += is1_n;
            }

            /* Ownership of the result moves to the output slot. */
            *((PyObject **)op) = sum_of_products;
            ip1 -= ib1_n;
            ip2 -= ib2_n;
            op  +=  os_p;
            ip2 += is2_p;
        }
        op -= ob_p;
        ip2 -= ib2_p;
        ip1 += is1_m;
        op  +=  os_m;
    }
}


#line 395


/*
 * matmul loop for npy_float operands.
 *
 * dimensions[0] is the number of outer iterations, followed by the core
 * sizes m, n and p.  steps[0..2] advance the two inputs and the output
 * between outer iterations; steps[3..8] are the core byte strides
 * is1_m, is1_n, is2_n, is2_p, os_m, os_p.  args[0..2] are advanced in
 * place by the outer steps after every iteration.
 *
 * When built with CBLAS, the kernel (dot, gemv, matrix-matrix or the plain
 * C loop) is picked from the core sizes and strides; otherwise the plain C
 * loop is always used.
 */
NPY_NO_EXPORT void
FLOAT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    const npy_intp n_outer = dimensions[0];
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];
    const npy_intp s0 = steps[0], s1 = steps[1], s2 = steps[2];
    const npy_intp is1_m = steps[3], is1_n = steps[4];
    const npy_intp is2_n = steps[5], is2_p = steps[6];
    const npy_intp os_m = steps[7], os_p = steps[8];

#if defined(HAVE_CBLAS)
    /*
     * The choice of kernel depends only on the core sizes and strides, which
     * are the same for every outer iteration, so it is made once up front.
     */
    enum {
        KERN_NOBLAS,
        KERN_DOT,
        KERN_GEMV_VEC_MAT,
        KERN_GEMV_MAT_VEC,
        KERN_GEMM,
        KERN_GEMM_TRANSPOSED
    } kernel = KERN_NOBLAS;
    const npy_intp sz = sizeof(npy_float);
    /* An operand is usable when either its C or its Fortran layout is. */
    const npy_bool i1blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz) ||
                                is_blasable2d(is1_n, is1_m, dn, dm, sz);
    const npy_bool i2blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz) ||
                                is_blasable2d(is2_p, is2_n, dp, dn, sz);

    if (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE || dp > BLAS_MAXSIZE ||
            dm == 0 || dn == 0 || dp == 0) {
        /* Too big for BLAS, or an empty core dimension. */
        kernel = KERN_NOBLAS;
    }
    else if (dm == 1 || dn == 1 || dp == 1) {
        /* Special case variants that have a 1 in the core dimensions */
        if (dm == 1 && dp == 1) {
            /* row @ column, 1,1 output */
            kernel = KERN_DOT;
        }
        else if (dn == 1 && (dp == 1 || dm == 1)) {
            /*
             * 1,1d @ vector or vector @ 1,1d
             * could use cblas_Xaxy, but that requires 0ing output
             * and would not be faster (XXX prove it)
             */
            kernel = KERN_NOBLAS;
        }
        else if (dm == 1 && i2blasable &&
                 is_blasable2d(is1_n, sz, dn, 1, sz)) {
            /* vector @ matrix */
            kernel = KERN_GEMV_VEC_MAT;
        }
        else if (dp == 1 && i1blasable &&
                 is_blasable2d(is2_n, sz, dn, 1, sz)) {
            /* matrix @ vector */
            kernel = KERN_GEMV_MAT_VEC;
        }
        else {
            /* column @ row, 2d output, no blas needed or non-blas-able input */
            kernel = KERN_NOBLAS;
        }
    }
    else if (i1blasable && i2blasable) {
        /* matrix @ matrix */
        if (is_blasable2d(os_m, os_p, dm, dp, sz)) {
            kernel = KERN_GEMM;
        }
        else if (is_blasable2d(os_p, os_m, dp, dm, sz)) {
            kernel = KERN_GEMM_TRANSPOSED;
        }
        /*
         * Otherwise stay on the plain loop.  If parameters are castable to
         * int and we copy the non-blasable (or non-ccontiguous output)
         * we could still use BLAS, see gh-12365.
         */
    }
#endif

    for (npy_intp iOuter = 0; iOuter < n_outer; iOuter++) {
        void *ip1 = args[0], *ip2 = args[1], *op = args[2];

#if defined(HAVE_CBLAS)
        /*
         * TODO: refactor this out to a inner_loop_selector, in
         * PyUFunc_MatmulLoopSelector. But that call does not have access to
         * n, m, p and strides.
         */
        switch (kernel) {
            case KERN_DOT:
                FLOAT_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
                break;
            case KERN_GEMV_VEC_MAT:
                /* switch ip1, ip2, p and m */
                FLOAT_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
                            op, os_p, os_m, dp, dn, dm);
                break;
            case KERN_GEMV_MAT_VEC:
                FLOAT_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
                            op, os_m, os_p, dm, dn, dp);
                break;
            case KERN_GEMM:
                FLOAT_matmul_matrixmatrix(ip1, is1_m, is1_n,
                                           ip2, is2_n, is2_p,
                                           op, os_m, os_p,
                                           dm, dn, dp);
                break;
            case KERN_GEMM_TRANSPOSED:
                /*
                 * Use transpose equivalence:
                 * matmul(a, b, o) == matmul(b.T, a.T, o.T)
                 */
                FLOAT_matmul_matrixmatrix(ip2, is2_p, is2_n,
                                           ip1, is1_n, is1_m,
                                           op, os_p, os_m,
                                           dp, dn, dm);
                break;
            default:
                FLOAT_matmul_inner_noblas(ip1, is1_m, is1_n,
                                           ip2, is2_n, is2_p,
                                           op, os_m, os_p, dm, dn, dp);
                break;
        }
#else
        FLOAT_matmul_inner_noblas(ip1, is1_m, is1_n,
                                   ip2, is2_n, is2_p,
                                   op, os_m, os_p, dm, dn, dp);
#endif

        args[0] += s0;
        args[1] += s1;
        args[2] += s2;
    }
}


#line 395


/*
 * matmul loop for npy_double operands.
 *
 * dimensions[0] is the number of outer iterations, followed by the core
 * sizes m, n and p.  steps[0..2] advance the two inputs and the output
 * between outer iterations; steps[3..8] are the core byte strides
 * is1_m, is1_n, is2_n, is2_p, os_m, os_p.  args[0..2] are advanced in
 * place by the outer steps after every iteration.
 *
 * When built with CBLAS, the kernel (dot, gemv, matrix-matrix or the plain
 * C loop) is picked from the core sizes and strides; otherwise the plain C
 * loop is always used.
 */
NPY_NO_EXPORT void
DOUBLE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    const npy_intp n_outer = dimensions[0];
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];
    const npy_intp s0 = steps[0], s1 = steps[1], s2 = steps[2];
    const npy_intp is1_m = steps[3], is1_n = steps[4];
    const npy_intp is2_n = steps[5], is2_p = steps[6];
    const npy_intp os_m = steps[7], os_p = steps[8];

#if defined(HAVE_CBLAS)
    /*
     * The choice of kernel depends only on the core sizes and strides, which
     * are the same for every outer iteration, so it is made once up front.
     */
    enum {
        KERN_NOBLAS,
        KERN_DOT,
        KERN_GEMV_VEC_MAT,
        KERN_GEMV_MAT_VEC,
        KERN_GEMM,
        KERN_GEMM_TRANSPOSED
    } kernel = KERN_NOBLAS;
    const npy_intp sz = sizeof(npy_double);
    /* An operand is usable when either its C or its Fortran layout is. */
    const npy_bool i1blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz) ||
                                is_blasable2d(is1_n, is1_m, dn, dm, sz);
    const npy_bool i2blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz) ||
                                is_blasable2d(is2_p, is2_n, dp, dn, sz);

    if (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE || dp > BLAS_MAXSIZE ||
            dm == 0 || dn == 0 || dp == 0) {
        /* Too big for BLAS, or an empty core dimension. */
        kernel = KERN_NOBLAS;
    }
    else if (dm == 1 || dn == 1 || dp == 1) {
        /* Special case variants that have a 1 in the core dimensions */
        if (dm == 1 && dp == 1) {
            /* row @ column, 1,1 output */
            kernel = KERN_DOT;
        }
        else if (dn == 1 && (dp == 1 || dm == 1)) {
            /*
             * 1,1d @ vector or vector @ 1,1d
             * could use cblas_Xaxy, but that requires 0ing output
             * and would not be faster (XXX prove it)
             */
            kernel = KERN_NOBLAS;
        }
        else if (dm == 1 && i2blasable &&
                 is_blasable2d(is1_n, sz, dn, 1, sz)) {
            /* vector @ matrix */
            kernel = KERN_GEMV_VEC_MAT;
        }
        else if (dp == 1 && i1blasable &&
                 is_blasable2d(is2_n, sz, dn, 1, sz)) {
            /* matrix @ vector */
            kernel = KERN_GEMV_MAT_VEC;
        }
        else {
            /* column @ row, 2d output, no blas needed or non-blas-able input */
            kernel = KERN_NOBLAS;
        }
    }
    else if (i1blasable && i2blasable) {
        /* matrix @ matrix */
        if (is_blasable2d(os_m, os_p, dm, dp, sz)) {
            kernel = KERN_GEMM;
        }
        else if (is_blasable2d(os_p, os_m, dp, dm, sz)) {
            kernel = KERN_GEMM_TRANSPOSED;
        }
        /*
         * Otherwise stay on the plain loop.  If parameters are castable to
         * int and we copy the non-blasable (or non-ccontiguous output)
         * we could still use BLAS, see gh-12365.
         */
    }
#endif

    for (npy_intp iOuter = 0; iOuter < n_outer; iOuter++) {
        void *ip1 = args[0], *ip2 = args[1], *op = args[2];

#if defined(HAVE_CBLAS)
        /*
         * TODO: refactor this out to a inner_loop_selector, in
         * PyUFunc_MatmulLoopSelector. But that call does not have access to
         * n, m, p and strides.
         */
        switch (kernel) {
            case KERN_DOT:
                DOUBLE_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
                break;
            case KERN_GEMV_VEC_MAT:
                /* switch ip1, ip2, p and m */
                DOUBLE_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
                            op, os_p, os_m, dp, dn, dm);
                break;
            case KERN_GEMV_MAT_VEC:
                DOUBLE_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
                            op, os_m, os_p, dm, dn, dp);
                break;
            case KERN_GEMM:
                DOUBLE_matmul_matrixmatrix(ip1, is1_m, is1_n,
                                           ip2, is2_n, is2_p,
                                           op, os_m, os_p,
                                           dm, dn, dp);
                break;
            case KERN_GEMM_TRANSPOSED:
                /*
                 * Use transpose equivalence:
                 * matmul(a, b, o) == matmul(b.T, a.T, o.T)
                 */
                DOUBLE_matmul_matrixmatrix(ip2, is2_p, is2_n,
                                           ip1, is1_n, is1_m,
                                           op, os_p, os_m,
                                           dp, dn, dm);
                break;
            default:
                DOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n,
                                           ip2, is2_n, is2_p,
                                           op, os_m, os_p, dm, dn, dp);
                break;
        }
#else
        DOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n,
                                   ip2, is2_n, is2_p,
                                   op, os_m, os_p, dm, dn, dp);
#endif

        args[0] += s0;
        args[1] += s1;
        args[2] += s2;
    }
}


#line 395


/*
 * matmul inner loop for npy_longdouble.
 *
 * args       - {first operand, second operand, output} base pointers.
 *              Each pointer is advanced IN PLACE by its outer stride after
 *              every outer iteration.
 * dimensions - {outer iteration count, m, n, p}; the core operation is
 *              (m, n) @ (n, p) -> (m, p).
 * steps      - byte strides: steps[0..2] are the outer strides of the three
 *              operands, steps[3..8] are is1_m, is1_n, is2_n, is2_p, os_m,
 *              os_p (core strides of operand 1, operand 2 and the output).
 *
 * This type has no BLAS fast path in this function: every outer iteration
 * is handed to the strided LONGDOUBLE_matmul_inner_noblas kernel.
 */
NPY_NO_EXPORT void
LONGDOUBLE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* dimensions = {outer, m, n, p} */
    const npy_intp n_outer = dimensions[0];
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];
    /* steps[0..2]: per-iteration byte advance of the three operand pointers */
    const npy_intp adv_ip1 = steps[0];
    const npy_intp adv_ip2 = steps[1];
    const npy_intp adv_op = steps[2];
    /* steps[3..8]: strides along the core (matrix) dimensions */
    const npy_intp is1_m = steps[3], is1_n = steps[4];
    const npy_intp is2_n = steps[5], is2_p = steps[6];
    const npy_intp os_m = steps[7], os_p = steps[8];
    npy_intp remaining;

    for (remaining = n_outer; remaining > 0; remaining--) {
        void *ip1 = args[0];
        void *ip2 = args[1];
        void *op = args[2];

        LONGDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n,
                                       ip2, is2_n, is2_p,
                                       op, os_m, os_p, dm, dn, dp);

        /* step all three operands to the next stacked matrix */
        args[0] += adv_ip1;
        args[1] += adv_ip2;
        args[2] += adv_op;
    }
}


#line 395


/*
 * matmul inner loop for npy_half.
 *
 * args       - {first operand, second operand, output} base pointers.
 *              Each pointer is advanced IN PLACE by its outer stride after
 *              every outer iteration.
 * dimensions - {outer iteration count, m, n, p}; the core operation is
 *              (m, n) @ (n, p) -> (m, p).
 * steps      - byte strides: steps[0..2] are the outer strides of the three
 *              operands, steps[3..8] are is1_m, is1_n, is2_n, is2_p, os_m,
 *              os_p (core strides of operand 1, operand 2 and the output).
 *
 * This type has no BLAS fast path in this function: every outer iteration
 * is handed to the strided HALF_matmul_inner_noblas kernel.
 */
NPY_NO_EXPORT void
HALF_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* dimensions = {outer, m, n, p} */
    const npy_intp n_outer = dimensions[0];
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];
    /* steps[0..2]: per-iteration byte advance of the three operand pointers */
    const npy_intp adv_ip1 = steps[0];
    const npy_intp adv_ip2 = steps[1];
    const npy_intp adv_op = steps[2];
    /* steps[3..8]: strides along the core (matrix) dimensions */
    const npy_intp is1_m = steps[3], is1_n = steps[4];
    const npy_intp is2_n = steps[5], is2_p = steps[6];
    const npy_intp os_m = steps[7], os_p = steps[8];
    npy_intp remaining;

    for (remaining = n_outer; remaining > 0; remaining--) {
        void *ip1 = args[0];
        void *ip2 = args[1];
        void *op = args[2];

        HALF_matmul_inner_noblas(ip1, is1_m, is1_n,
                                 ip2, is2_n, is2_p,
                                 op, os_m, os_p, dm, dn, dp);

        /* step all three operands to the next stacked matrix */
        args[0] += adv_ip1;
        args[1] += adv_ip2;
        args[2] += adv_op;
    }
}


#line 395


/*
 * matmul inner loop for npy_cfloat.
 *
 * args       - {first operand, second operand, output} base pointers.
 *              Each pointer is advanced IN PLACE by its outer stride after
 *              every outer iteration.
 * dimensions - {outer iteration count, m, n, p}; the core operation is
 *              (m, n) @ (n, p) -> (m, p).
 * steps      - byte strides: steps[0..2] are the outer strides of the three
 *              operands, steps[3..8] are is1_m, is1_n, is2_n, is2_p, os_m,
 *              os_p (core strides of operand 1, operand 2 and the output).
 *
 * When built with HAVE_CBLAS the kernel is chosen once, before the loop,
 * because the choice depends only on the core shape and strides, which are
 * the same for every outer iteration.  Without HAVE_CBLAS the strided
 * CFLOAT_matmul_inner_noblas kernel is always used.
 */
NPY_NO_EXPORT void
CFLOAT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* dimensions = {outer, m, n, p} */
    const npy_intp n_outer = dimensions[0];
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];
    /* steps[0..2]: per-iteration byte advance of the three operand pointers */
    const npy_intp adv_ip1 = steps[0];
    const npy_intp adv_ip2 = steps[1];
    const npy_intp adv_op = steps[2];
    /* steps[3..8]: strides along the core (matrix) dimensions */
    const npy_intp is1_m = steps[3], is1_n = steps[4];
    const npy_intp is2_n = steps[5], is2_p = steps[6];
    const npy_intp os_m = steps[7], os_p = steps[8];
    npy_intp remaining;
#if 1 && defined(HAVE_CBLAS)
    /*
     * TODO: refactor this selection out to an inner_loop_selector, in
     * PyUFunc_MatmulLoopSelector. But that call does not have access to
     * n, m, p and strides.
     */
    enum {
        VIA_NOBLAS,
        VIA_DOT,
        VIA_GEMV_VEC_MAT,
        VIA_GEMV_MAT_VEC,
        VIA_MATRIXMATRIX,
        VIA_MATRIXMATRIX_T
    } kernel = VIA_NOBLAS;
    const npy_intp sz = sizeof(npy_cfloat);
    const npy_bool has_unit_dim = (dm == 1 || dn == 1 || dp == 1);
    const npy_bool has_empty_dim = (dm == 0 || dn == 0 || dp == 0);
    const npy_bool exceeds_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
                                   dp > BLAS_MAXSIZE);
    /* each input may be usable in either C or Fortran order */
    const npy_bool ip1_ok = (is_blasable2d(is1_m, is1_n, dm, dn, sz) ||
                             is_blasable2d(is1_n, is1_m, dn, dm, sz));
    const npy_bool ip2_ok = (is_blasable2d(is2_n, is2_p, dn, dp, sz) ||
                             is_blasable2d(is2_p, is2_n, dp, dn, sz));
    const npy_bool out_c_ok = is_blasable2d(os_m, os_p, dm, dp, sz);
    const npy_bool out_f_ok = is_blasable2d(os_p, os_m, dp, dm, sz);
    const npy_bool vec_mat_ok = ((dm == 1) && ip2_ok &&
                                 is_blasable2d(is1_n, sz, dn, 1, sz));
    const npy_bool mat_vec_ok = ((dp == 1) && ip1_ok &&
                                 is_blasable2d(is2_n, sz, dn, 1, sz));

    /* Oversized or empty problems keep the VIA_NOBLAS default. */
    if (!exceeds_blas && !has_empty_dim) {
        if (has_unit_dim) {
            /* Special case variants that have a 1 in the core dimensions */
            if (dm == 1 && dp == 1) {
                /* row @ column, 1,1 output */
                kernel = VIA_DOT;
            }
            else if (dn == 1 && (dp == 1 || dm == 1)) {
                /*
                 * 1,1d @ vector or vector @ 1,1d
                 * could use cblas_Xaxy, but that requires 0ing output
                 * and would not be faster (XXX prove it)
                 */
                kernel = VIA_NOBLAS;
            }
            else if (vec_mat_ok) {
                kernel = VIA_GEMV_VEC_MAT;
            }
            else if (mat_vec_ok) {
                kernel = VIA_GEMV_MAT_VEC;
            }
            /*
             * otherwise: column @ row, 2d output, no blas needed or
             * non-blas-able input -> VIA_NOBLAS
             */
        }
        else if (ip1_ok && ip2_ok) {
            /* matrix @ matrix */
            if (out_c_ok) {
                kernel = VIA_MATRIXMATRIX;
            }
            else if (out_f_ok) {
                kernel = VIA_MATRIXMATRIX_T;
            }
            /*
             * otherwise VIA_NOBLAS.  If parameters are castable to int and
             * we copy the non-blasable (or non-ccontiguous output) we could
             * still use BLAS, see gh-12365.
             */
        }
    }
#endif

    for (remaining = n_outer; remaining > 0; remaining--) {
        void *ip1 = args[0];
        void *ip2 = args[1];
        void *op = args[2];

#if 1 && defined(HAVE_CBLAS)
        switch (kernel) {
            case VIA_DOT:
                CFLOAT_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
                break;
            case VIA_GEMV_VEC_MAT:
                /* vector @ matrix, switch ip1, ip2, p and m */
                CFLOAT_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
                            op, os_p, os_m, dp, dn, dm);
                break;
            case VIA_GEMV_MAT_VEC:
                /* matrix @ vector */
                CFLOAT_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
                            op, os_m, os_p, dm, dn, dp);
                break;
            case VIA_MATRIXMATRIX:
                CFLOAT_matmul_matrixmatrix(ip1, is1_m, is1_n,
                                           ip2, is2_n, is2_p,
                                           op, os_m, os_p,
                                           dm, dn, dp);
                break;
            case VIA_MATRIXMATRIX_T:
                /*
                 * Use transpose equivalence:
                 * matmul(a, b, o) == matmul(b.T, a.T, o.T)
                 */
                CFLOAT_matmul_matrixmatrix(ip2, is2_p, is2_n,
                                           ip1, is1_n, is1_m,
                                           op, os_p, os_m,
                                           dp, dn, dm);
                break;
            case VIA_NOBLAS:
            default:
                CFLOAT_matmul_inner_noblas(ip1, is1_m, is1_n,
                                           ip2, is2_n, is2_p,
                                           op, os_m, os_p, dm, dn, dp);
                break;
        }
#else
        CFLOAT_matmul_inner_noblas(ip1, is1_m, is1_n,
                                   ip2, is2_n, is2_p,
                                   op, os_m, os_p, dm, dn, dp);
#endif

        /* step all three operands to the next stacked matrix */
        args[0] += adv_ip1;
        args[1] += adv_ip2;
        args[2] += adv_op;
    }
}


#line 395


/*
 * matmul inner loop for npy_cdouble.
 *
 * args       - {first operand, second operand, output} base pointers.
 *              Each pointer is advanced IN PLACE by its outer stride after
 *              every outer iteration.
 * dimensions - {outer iteration count, m, n, p}; the core operation is
 *              (m, n) @ (n, p) -> (m, p).
 * steps      - byte strides: steps[0..2] are the outer strides of the three
 *              operands, steps[3..8] are is1_m, is1_n, is2_n, is2_p, os_m,
 *              os_p (core strides of operand 1, operand 2 and the output).
 *
 * When built with HAVE_CBLAS the kernel is chosen once, before the loop,
 * because the choice depends only on the core shape and strides, which are
 * the same for every outer iteration.  Without HAVE_CBLAS the strided
 * CDOUBLE_matmul_inner_noblas kernel is always used.
 */
NPY_NO_EXPORT void
CDOUBLE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* dimensions = {outer, m, n, p} */
    const npy_intp n_outer = dimensions[0];
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];
    /* steps[0..2]: per-iteration byte advance of the three operand pointers */
    const npy_intp adv_ip1 = steps[0];
    const npy_intp adv_ip2 = steps[1];
    const npy_intp adv_op = steps[2];
    /* steps[3..8]: strides along the core (matrix) dimensions */
    const npy_intp is1_m = steps[3], is1_n = steps[4];
    const npy_intp is2_n = steps[5], is2_p = steps[6];
    const npy_intp os_m = steps[7], os_p = steps[8];
    npy_intp remaining;
#if 1 && defined(HAVE_CBLAS)
    /*
     * TODO: refactor this selection out to an inner_loop_selector, in
     * PyUFunc_MatmulLoopSelector. But that call does not have access to
     * n, m, p and strides.
     */
    enum {
        VIA_NOBLAS,
        VIA_DOT,
        VIA_GEMV_VEC_MAT,
        VIA_GEMV_MAT_VEC,
        VIA_MATRIXMATRIX,
        VIA_MATRIXMATRIX_T
    } kernel = VIA_NOBLAS;
    const npy_intp sz = sizeof(npy_cdouble);
    const npy_bool has_unit_dim = (dm == 1 || dn == 1 || dp == 1);
    const npy_bool has_empty_dim = (dm == 0 || dn == 0 || dp == 0);
    const npy_bool exceeds_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
                                   dp > BLAS_MAXSIZE);
    /* each input may be usable in either C or Fortran order */
    const npy_bool ip1_ok = (is_blasable2d(is1_m, is1_n, dm, dn, sz) ||
                             is_blasable2d(is1_n, is1_m, dn, dm, sz));
    const npy_bool ip2_ok = (is_blasable2d(is2_n, is2_p, dn, dp, sz) ||
                             is_blasable2d(is2_p, is2_n, dp, dn, sz));
    const npy_bool out_c_ok = is_blasable2d(os_m, os_p, dm, dp, sz);
    const npy_bool out_f_ok = is_blasable2d(os_p, os_m, dp, dm, sz);
    const npy_bool vec_mat_ok = ((dm == 1) && ip2_ok &&
                                 is_blasable2d(is1_n, sz, dn, 1, sz));
    const npy_bool mat_vec_ok = ((dp == 1) && ip1_ok &&
                                 is_blasable2d(is2_n, sz, dn, 1, sz));

    /* Oversized or empty problems keep the VIA_NOBLAS default. */
    if (!exceeds_blas && !has_empty_dim) {
        if (has_unit_dim) {
            /* Special case variants that have a 1 in the core dimensions */
            if (dm == 1 && dp == 1) {
                /* row @ column, 1,1 output */
                kernel = VIA_DOT;
            }
            else if (dn == 1 && (dp == 1 || dm == 1)) {
                /*
                 * 1,1d @ vector or vector @ 1,1d
                 * could use cblas_Xaxy, but that requires 0ing output
                 * and would not be faster (XXX prove it)
                 */
                kernel = VIA_NOBLAS;
            }
            else if (vec_mat_ok) {
                kernel = VIA_GEMV_VEC_MAT;
            }
            else if (mat_vec_ok) {
                kernel = VIA_GEMV_MAT_VEC;
            }
            /*
             * otherwise: column @ row, 2d output, no blas needed or
             * non-blas-able input -> VIA_NOBLAS
             */
        }
        else if (ip1_ok && ip2_ok) {
            /* matrix @ matrix */
            if (out_c_ok) {
                kernel = VIA_MATRIXMATRIX;
            }
            else if (out_f_ok) {
                kernel = VIA_MATRIXMATRIX_T;
            }
            /*
             * otherwise VIA_NOBLAS.  If parameters are castable to int and
             * we copy the non-blasable (or non-ccontiguous output) we could
             * still use BLAS, see gh-12365.
             */
        }
    }
#endif

    for (remaining = n_outer; remaining > 0; remaining--) {
        void *ip1 = args[0];
        void *ip2 = args[1];
        void *op = args[2];

#if 1 && defined(HAVE_CBLAS)
        switch (kernel) {
            case VIA_DOT:
                CDOUBLE_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
                break;
            case VIA_GEMV_VEC_MAT:
                /* vector @ matrix, switch ip1, ip2, p and m */
                CDOUBLE_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
                             op, os_p, os_m, dp, dn, dm);
                break;
            case VIA_GEMV_MAT_VEC:
                /* matrix @ vector */
                CDOUBLE_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
                             op, os_m, os_p, dm, dn, dp);
                break;
            case VIA_MATRIXMATRIX:
                CDOUBLE_matmul_matrixmatrix(ip1, is1_m, is1_n,
                                            ip2, is2_n, is2_p,
                                            op, os_m, os_p,
                                            dm, dn, dp);
                break;
            case VIA_MATRIXMATRIX_T:
                /*
                 * Use transpose equivalence:
                 * matmul(a, b, o) == matmul(b.T, a.T, o.T)
                 */
                CDOUBLE_matmul_matrixmatrix(ip2, is2_p, is2_n,
                                            ip1, is1_n, is1_m,
                                            op, os_p, os_m,
                                            dp, dn, dm);
                break;
            case VIA_NOBLAS:
            default:
                CDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n,
                                            ip2, is2_n, is2_p,
                                            op, os_m, os_p, dm, dn, dp);
                break;
        }
#else
        CDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n,
                                    ip2, is2_n, is2_p,
                                    op, os_m, os_p, dm, dn, dp);
#endif

        /* step all three operands to the next stacked matrix */
        args[0] += adv_ip1;
        args[1] += adv_ip2;
        args[2] += adv_op;
    }
}


#line 395


/*
 * matmul inner loop for npy_clongdouble.
 *
 * args       - {first operand, second operand, output} base pointers.
 *              Each pointer is advanced IN PLACE by its outer stride after
 *              every outer iteration.
 * dimensions - {outer iteration count, m, n, p}; the core operation is
 *              (m, n) @ (n, p) -> (m, p).
 * steps      - byte strides: steps[0..2] are the outer strides of the three
 *              operands, steps[3..8] are is1_m, is1_n, is2_n, is2_p, os_m,
 *              os_p (core strides of operand 1, operand 2 and the output).
 *
 * This type has no BLAS fast path in this function: every outer iteration
 * is handed to the strided CLONGDOUBLE_matmul_inner_noblas kernel.
 */
NPY_NO_EXPORT void
CLONGDOUBLE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* dimensions = {outer, m, n, p} */
    const npy_intp n_outer = dimensions[0];
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];
    /* steps[0..2]: per-iteration byte advance of the three operand pointers */
    const npy_intp adv_ip1 = steps[0];
    const npy_intp adv_ip2 = steps[1];
    const npy_intp adv_op = steps[2];
    /* steps[3..8]: strides along the core (matrix) dimensions */
    const npy_intp is1_m = steps[3], is1_n = steps[4];
    const npy_intp is2_n = steps[5], is2_p = steps[6];
    const npy_intp os_m = steps[7], os_p = steps[8];
    npy_intp remaining;

    for (remaining = n_outer; remaining > 0; remaining--) {
        void *ip1 = args[0];
        void *ip2 = args[1];
        void *op = args[2];

        CLONGDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n,
                                        ip2, is2_n, is2_p,
                                        op, os_m, os_p, dm, dn, dp);

        /* step all three operands to the next stacked matrix */
        args[0] += adv_ip1;
        args[1] += adv_ip2;
        args[2] += adv_op;
    }
}


#line 395


/*
 * matmul inner loop for npy_ubyte.
 *
 * args       - {first operand, second operand, output} base pointers.
 *              Each pointer is advanced IN PLACE by its outer stride after
 *              every outer iteration.
 * dimensions - {outer iteration count, m, n, p}; the core operation is
 *              (m, n) @ (n, p) -> (m, p).
 * steps      - byte strides: steps[0..2] are the outer strides of the three
 *              operands, steps[3..8] are is1_m, is1_n, is2_n, is2_p, os_m,
 *              os_p (core strides of operand 1, operand 2 and the output).
 *
 * This type has no BLAS fast path in this function: every outer iteration
 * is handed to the strided UBYTE_matmul_inner_noblas kernel.
 */
NPY_NO_EXPORT void
UBYTE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* dimensions = {outer, m, n, p} */
    const npy_intp n_outer = dimensions[0];
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];
    /* steps[0..2]: per-iteration byte advance of the three operand pointers */
    const npy_intp adv_ip1 = steps[0];
    const npy_intp adv_ip2 = steps[1];
    const npy_intp adv_op = steps[2];
    /* steps[3..8]: strides along the core (matrix) dimensions */
    const npy_intp is1_m = steps[3], is1_n = steps[4];
    const npy_intp is2_n = steps[5], is2_p = steps[6];
    const npy_intp os_m = steps[7], os_p = steps[8];
    npy_intp remaining;

    for (remaining = n_outer; remaining > 0; remaining--) {
        void *ip1 = args[0];
        void *ip2 = args[1];
        void *op = args[2];

        UBYTE_matmul_inner_noblas(ip1, is1_m, is1_n,
                                  ip2, is2_n, is2_p,
                                  op, os_m, os_p, dm, dn, dp);

        /* step all three operands to the next stacked matrix */
        args[0] += adv_ip1;
        args[1] += adv_ip2;
        args[2] += adv_op;
    }
}


#line 395


/*
 * Outer loop of matmul for the USHORT type.
 *
 * args[0] and args[1] point at the two input operands and args[2] at the
 * output.  dimensions[0] is the number of outer iterations and is followed
 * by the core sizes (m, n, p).  steps[0..2] are the byte offsets added to
 * the three operand pointers after each outer iteration; steps[3..8] are
 * the core strides, which are handed to the kernel unchanged.
 *
 * No BLAS path is compiled for this type: every product is delegated to
 * USHORT_matmul_inner_noblas.
 */
NPY_NO_EXPORT void
USHORT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* Outer iteration count and per-iteration operand offsets. */
    const npy_intp n_outer = dimensions[0];
    const npy_intp step_in1 = steps[0];
    const npy_intp step_in2 = steps[1];
    const npy_intp step_out = steps[2];

    /* Core dimensions follow the outer count. */
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];

    /* Core strides follow the three outer steps. */
    const npy_intp is1_m = steps[3];
    const npy_intp is1_n = steps[4];
    const npy_intp is2_n = steps[5];
    const npy_intp is2_p = steps[6];
    const npy_intp os_m = steps[7];
    const npy_intp os_p = steps[8];

    npy_intp remaining;

    for (remaining = n_outer; remaining > 0; remaining--) {
        void *in1 = args[0];
        void *in2 = args[1];
        void *out = args[2];

        USHORT_matmul_inner_noblas(in1, is1_m, is1_n,
                                   in2, is2_n, is2_p,
                                   out, os_m, os_p, dm, dn, dp);

        /* The operand pointers are advanced in place inside args[]. */
        args[0] += step_in1;
        args[1] += step_in2;
        args[2] += step_out;
    }
}


#line 395


/*
 * Outer loop of matmul for the UINT type.
 *
 * args[0] and args[1] point at the two input operands and args[2] at the
 * output.  dimensions[0] is the number of outer iterations and is followed
 * by the core sizes (m, n, p).  steps[0..2] are the byte offsets added to
 * the three operand pointers after each outer iteration; steps[3..8] are
 * the core strides, which are handed to the kernel unchanged.
 *
 * No BLAS path is compiled for this type: every product is delegated to
 * UINT_matmul_inner_noblas.
 */
NPY_NO_EXPORT void
UINT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* Outer iteration count and per-iteration operand offsets. */
    const npy_intp n_outer = dimensions[0];
    const npy_intp step_in1 = steps[0];
    const npy_intp step_in2 = steps[1];
    const npy_intp step_out = steps[2];

    /* Core dimensions follow the outer count. */
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];

    /* Core strides follow the three outer steps. */
    const npy_intp is1_m = steps[3];
    const npy_intp is1_n = steps[4];
    const npy_intp is2_n = steps[5];
    const npy_intp is2_p = steps[6];
    const npy_intp os_m = steps[7];
    const npy_intp os_p = steps[8];

    npy_intp remaining;

    for (remaining = n_outer; remaining > 0; remaining--) {
        void *in1 = args[0];
        void *in2 = args[1];
        void *out = args[2];

        UINT_matmul_inner_noblas(in1, is1_m, is1_n,
                                 in2, is2_n, is2_p,
                                 out, os_m, os_p, dm, dn, dp);

        /* The operand pointers are advanced in place inside args[]. */
        args[0] += step_in1;
        args[1] += step_in2;
        args[2] += step_out;
    }
}


#line 395


/*
 * Outer loop of matmul for the ULONG type.
 *
 * args[0] and args[1] point at the two input operands and args[2] at the
 * output.  dimensions[0] is the number of outer iterations and is followed
 * by the core sizes (m, n, p).  steps[0..2] are the byte offsets added to
 * the three operand pointers after each outer iteration; steps[3..8] are
 * the core strides, which are handed to the kernel unchanged.
 *
 * No BLAS path is compiled for this type: every product is delegated to
 * ULONG_matmul_inner_noblas.
 */
NPY_NO_EXPORT void
ULONG_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* Outer iteration count and per-iteration operand offsets. */
    const npy_intp n_outer = dimensions[0];
    const npy_intp step_in1 = steps[0];
    const npy_intp step_in2 = steps[1];
    const npy_intp step_out = steps[2];

    /* Core dimensions follow the outer count. */
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];

    /* Core strides follow the three outer steps. */
    const npy_intp is1_m = steps[3];
    const npy_intp is1_n = steps[4];
    const npy_intp is2_n = steps[5];
    const npy_intp is2_p = steps[6];
    const npy_intp os_m = steps[7];
    const npy_intp os_p = steps[8];

    npy_intp remaining;

    for (remaining = n_outer; remaining > 0; remaining--) {
        void *in1 = args[0];
        void *in2 = args[1];
        void *out = args[2];

        ULONG_matmul_inner_noblas(in1, is1_m, is1_n,
                                  in2, is2_n, is2_p,
                                  out, os_m, os_p, dm, dn, dp);

        /* The operand pointers are advanced in place inside args[]. */
        args[0] += step_in1;
        args[1] += step_in2;
        args[2] += step_out;
    }
}


#line 395


/*
 * Outer loop of matmul for the ULONGLONG type.
 *
 * args[0] and args[1] point at the two input operands and args[2] at the
 * output.  dimensions[0] is the number of outer iterations and is followed
 * by the core sizes (m, n, p).  steps[0..2] are the byte offsets added to
 * the three operand pointers after each outer iteration; steps[3..8] are
 * the core strides, which are handed to the kernel unchanged.
 *
 * No BLAS path is compiled for this type: every product is delegated to
 * ULONGLONG_matmul_inner_noblas.
 */
NPY_NO_EXPORT void
ULONGLONG_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* Outer iteration count and per-iteration operand offsets. */
    const npy_intp n_outer = dimensions[0];
    const npy_intp step_in1 = steps[0];
    const npy_intp step_in2 = steps[1];
    const npy_intp step_out = steps[2];

    /* Core dimensions follow the outer count. */
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];

    /* Core strides follow the three outer steps. */
    const npy_intp is1_m = steps[3];
    const npy_intp is1_n = steps[4];
    const npy_intp is2_n = steps[5];
    const npy_intp is2_p = steps[6];
    const npy_intp os_m = steps[7];
    const npy_intp os_p = steps[8];

    npy_intp remaining;

    for (remaining = n_outer; remaining > 0; remaining--) {
        void *in1 = args[0];
        void *in2 = args[1];
        void *out = args[2];

        ULONGLONG_matmul_inner_noblas(in1, is1_m, is1_n,
                                      in2, is2_n, is2_p,
                                      out, os_m, os_p, dm, dn, dp);

        /* The operand pointers are advanced in place inside args[]. */
        args[0] += step_in1;
        args[1] += step_in2;
        args[2] += step_out;
    }
}


#line 395


/*
 * Outer loop of matmul for the BYTE type.
 *
 * args[0] and args[1] point at the two input operands and args[2] at the
 * output.  dimensions[0] is the number of outer iterations and is followed
 * by the core sizes (m, n, p).  steps[0..2] are the byte offsets added to
 * the three operand pointers after each outer iteration; steps[3..8] are
 * the core strides, which are handed to the kernel unchanged.
 *
 * No BLAS path is compiled for this type: every product is delegated to
 * BYTE_matmul_inner_noblas.
 */
NPY_NO_EXPORT void
BYTE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* Outer iteration count and per-iteration operand offsets. */
    const npy_intp n_outer = dimensions[0];
    const npy_intp step_in1 = steps[0];
    const npy_intp step_in2 = steps[1];
    const npy_intp step_out = steps[2];

    /* Core dimensions follow the outer count. */
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];

    /* Core strides follow the three outer steps. */
    const npy_intp is1_m = steps[3];
    const npy_intp is1_n = steps[4];
    const npy_intp is2_n = steps[5];
    const npy_intp is2_p = steps[6];
    const npy_intp os_m = steps[7];
    const npy_intp os_p = steps[8];

    npy_intp remaining;

    for (remaining = n_outer; remaining > 0; remaining--) {
        void *in1 = args[0];
        void *in2 = args[1];
        void *out = args[2];

        BYTE_matmul_inner_noblas(in1, is1_m, is1_n,
                                 in2, is2_n, is2_p,
                                 out, os_m, os_p, dm, dn, dp);

        /* The operand pointers are advanced in place inside args[]. */
        args[0] += step_in1;
        args[1] += step_in2;
        args[2] += step_out;
    }
}


#line 395


/*
 * Outer loop of matmul for the SHORT type.
 *
 * args[0] and args[1] point at the two input operands and args[2] at the
 * output.  dimensions[0] is the number of outer iterations and is followed
 * by the core sizes (m, n, p).  steps[0..2] are the byte offsets added to
 * the three operand pointers after each outer iteration; steps[3..8] are
 * the core strides, which are handed to the kernel unchanged.
 *
 * No BLAS path is compiled for this type: every product is delegated to
 * SHORT_matmul_inner_noblas.
 */
NPY_NO_EXPORT void
SHORT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* Outer iteration count and per-iteration operand offsets. */
    const npy_intp n_outer = dimensions[0];
    const npy_intp step_in1 = steps[0];
    const npy_intp step_in2 = steps[1];
    const npy_intp step_out = steps[2];

    /* Core dimensions follow the outer count. */
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];

    /* Core strides follow the three outer steps. */
    const npy_intp is1_m = steps[3];
    const npy_intp is1_n = steps[4];
    const npy_intp is2_n = steps[5];
    const npy_intp is2_p = steps[6];
    const npy_intp os_m = steps[7];
    const npy_intp os_p = steps[8];

    npy_intp remaining;

    for (remaining = n_outer; remaining > 0; remaining--) {
        void *in1 = args[0];
        void *in2 = args[1];
        void *out = args[2];

        SHORT_matmul_inner_noblas(in1, is1_m, is1_n,
                                  in2, is2_n, is2_p,
                                  out, os_m, os_p, dm, dn, dp);

        /* The operand pointers are advanced in place inside args[]. */
        args[0] += step_in1;
        args[1] += step_in2;
        args[2] += step_out;
    }
}


#line 395


/*
 * matmul inner loop for the INT type (npy_int elements).
 *
 * args:       args[0] and args[1] are the two inputs, args[2] is the output.
 *             The three pointers are advanced in place, once per outer
 *             iteration, by steps[0], steps[1] and steps[2] bytes.
 * dimensions: dimensions[0] is the number of outer iterations;
 *             dimensions[1..3] are the core sizes dm, dn and dp.
 * steps:      steps[0..2] are the outer strides of in1, in2 and out;
 *             steps[3..8] are the core strides, in the order
 *             is1_m, is1_n, is2_n, is2_p, os_m, os_p.
 * func:       unused.
 *
 * No BLAS fast path is taken for this type: every core product is handed,
 * with its strides untouched, to INT_matmul_inner_noblas.
 */
NPY_NO_EXPORT void
INT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* Outer (broadcast/stacked) loop: trip count and per-operand strides. */
    const npy_intp n_outer = dimensions[0];
    const npy_intp outer_s1 = steps[0];
    const npy_intp outer_s2 = steps[1];
    const npy_intp outer_so = steps[2];
    /* Core sizes, named after the strides they pair with below. */
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];
    /* Core strides: first input, second input, output. */
    const npy_intp is1_m = steps[3];
    const npy_intp is1_n = steps[4];
    const npy_intp is2_n = steps[5];
    const npy_intp is2_p = steps[6];
    const npy_intp os_m = steps[7];
    const npy_intp os_p = steps[8];
    npy_intp i;

    for (i = 0; i < n_outer; i++) {
        void *ip1 = args[0];
        void *ip2 = args[1];
        void *op = args[2];

        INT_matmul_inner_noblas(ip1, is1_m, is1_n,
                                ip2, is2_n, is2_p,
                                op, os_m, os_p, dm, dn, dp);

        /* Step all three operands to the next core matrix (in place). */
        args[0] += outer_s1;
        args[1] += outer_s2;
        args[2] += outer_so;
    }
}


#line 395


/*
 * matmul inner loop for the LONG type (npy_long elements).
 *
 * args:       args[0] and args[1] are the two inputs, args[2] is the output.
 *             The three pointers are advanced in place, once per outer
 *             iteration, by steps[0], steps[1] and steps[2] bytes.
 * dimensions: dimensions[0] is the number of outer iterations;
 *             dimensions[1..3] are the core sizes dm, dn and dp.
 * steps:      steps[0..2] are the outer strides of in1, in2 and out;
 *             steps[3..8] are the core strides, in the order
 *             is1_m, is1_n, is2_n, is2_p, os_m, os_p.
 * func:       unused.
 *
 * No BLAS fast path is taken for this type: every core product is handed,
 * with its strides untouched, to LONG_matmul_inner_noblas.
 */
NPY_NO_EXPORT void
LONG_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* Outer (broadcast/stacked) loop: trip count and per-operand strides. */
    const npy_intp n_outer = dimensions[0];
    const npy_intp outer_s1 = steps[0];
    const npy_intp outer_s2 = steps[1];
    const npy_intp outer_so = steps[2];
    /* Core sizes, named after the strides they pair with below. */
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];
    /* Core strides: first input, second input, output. */
    const npy_intp is1_m = steps[3];
    const npy_intp is1_n = steps[4];
    const npy_intp is2_n = steps[5];
    const npy_intp is2_p = steps[6];
    const npy_intp os_m = steps[7];
    const npy_intp os_p = steps[8];
    npy_intp i;

    for (i = 0; i < n_outer; i++) {
        void *ip1 = args[0];
        void *ip2 = args[1];
        void *op = args[2];

        LONG_matmul_inner_noblas(ip1, is1_m, is1_n,
                                 ip2, is2_n, is2_p,
                                 op, os_m, os_p, dm, dn, dp);

        /* Step all three operands to the next core matrix (in place). */
        args[0] += outer_s1;
        args[1] += outer_s2;
        args[2] += outer_so;
    }
}


#line 395


/*
 * matmul inner loop for the LONGLONG type (npy_longlong elements).
 *
 * args:       args[0] and args[1] are the two inputs, args[2] is the output.
 *             The three pointers are advanced in place, once per outer
 *             iteration, by steps[0], steps[1] and steps[2] bytes.
 * dimensions: dimensions[0] is the number of outer iterations;
 *             dimensions[1..3] are the core sizes dm, dn and dp.
 * steps:      steps[0..2] are the outer strides of in1, in2 and out;
 *             steps[3..8] are the core strides, in the order
 *             is1_m, is1_n, is2_n, is2_p, os_m, os_p.
 * func:       unused.
 *
 * No BLAS fast path is taken for this type: every core product is handed,
 * with its strides untouched, to LONGLONG_matmul_inner_noblas.
 */
NPY_NO_EXPORT void
LONGLONG_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* Outer (broadcast/stacked) loop: trip count and per-operand strides. */
    const npy_intp n_outer = dimensions[0];
    const npy_intp outer_s1 = steps[0];
    const npy_intp outer_s2 = steps[1];
    const npy_intp outer_so = steps[2];
    /* Core sizes, named after the strides they pair with below. */
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];
    /* Core strides: first input, second input, output. */
    const npy_intp is1_m = steps[3];
    const npy_intp is1_n = steps[4];
    const npy_intp is2_n = steps[5];
    const npy_intp is2_p = steps[6];
    const npy_intp os_m = steps[7];
    const npy_intp os_p = steps[8];
    npy_intp i;

    for (i = 0; i < n_outer; i++) {
        void *ip1 = args[0];
        void *ip2 = args[1];
        void *op = args[2];

        LONGLONG_matmul_inner_noblas(ip1, is1_m, is1_n,
                                     ip2, is2_n, is2_p,
                                     op, os_m, os_p, dm, dn, dp);

        /* Step all three operands to the next core matrix (in place). */
        args[0] += outer_s1;
        args[1] += outer_s2;
        args[2] += outer_so;
    }
}


#line 395


/*
 * matmul inner loop for the BOOL type (npy_bool elements).
 *
 * args:       args[0] and args[1] are the two inputs, args[2] is the output.
 *             The three pointers are advanced in place, once per outer
 *             iteration, by steps[0], steps[1] and steps[2] bytes.
 * dimensions: dimensions[0] is the number of outer iterations;
 *             dimensions[1..3] are the core sizes dm, dn and dp.
 * steps:      steps[0..2] are the outer strides of in1, in2 and out;
 *             steps[3..8] are the core strides, in the order
 *             is1_m, is1_n, is2_n, is2_p, os_m, os_p.
 * func:       unused.
 *
 * No BLAS fast path is taken for this type: every core product is handed,
 * with its strides untouched, to BOOL_matmul_inner_noblas.
 */
NPY_NO_EXPORT void
BOOL_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* Outer (broadcast/stacked) loop: trip count and per-operand strides. */
    const npy_intp n_outer = dimensions[0];
    const npy_intp outer_s1 = steps[0];
    const npy_intp outer_s2 = steps[1];
    const npy_intp outer_so = steps[2];
    /* Core sizes, named after the strides they pair with below. */
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];
    /* Core strides: first input, second input, output. */
    const npy_intp is1_m = steps[3];
    const npy_intp is1_n = steps[4];
    const npy_intp is2_n = steps[5];
    const npy_intp is2_p = steps[6];
    const npy_intp os_m = steps[7];
    const npy_intp os_p = steps[8];
    npy_intp i;

    for (i = 0; i < n_outer; i++) {
        void *ip1 = args[0];
        void *ip2 = args[1];
        void *op = args[2];

        BOOL_matmul_inner_noblas(ip1, is1_m, is1_n,
                                 ip2, is2_n, is2_p,
                                 op, os_m, os_p, dm, dn, dp);

        /* Step all three operands to the next core matrix (in place). */
        args[0] += outer_s1;
        args[1] += outer_s2;
        args[2] += outer_so;
    }
}


#line 395


/*
 * matmul inner loop for the OBJECT type.
 *
 * args:       args[0] and args[1] are the two inputs, args[2] is the output.
 *             The three pointers are advanced in place, once per outer
 *             iteration, by steps[0], steps[1] and steps[2] bytes.
 * dimensions: dimensions[0] is the number of outer iterations;
 *             dimensions[1..3] are the core sizes dm, dn and dp.
 * steps:      steps[0..2] are the outer strides of in1, in2 and out;
 *             steps[3..8] are the core strides, in the order
 *             is1_m, is1_n, is2_n, is2_p, os_m, os_p.
 * func:       unused.
 *
 * No BLAS fast path is taken for this type: every core product is handed,
 * with its strides untouched, to OBJECT_matmul_inner_noblas.
 */
NPY_NO_EXPORT void
OBJECT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    /* Outer (broadcast/stacked) loop: trip count and per-operand strides. */
    const npy_intp n_outer = dimensions[0];
    const npy_intp outer_s1 = steps[0];
    const npy_intp outer_s2 = steps[1];
    const npy_intp outer_so = steps[2];
    /* Core sizes, named after the strides they pair with below. */
    const npy_intp dm = dimensions[1];
    const npy_intp dn = dimensions[2];
    const npy_intp dp = dimensions[3];
    /* Core strides: first input, second input, output. */
    const npy_intp is1_m = steps[3];
    const npy_intp is1_n = steps[4];
    const npy_intp is2_n = steps[5];
    const npy_intp is2_p = steps[6];
    const npy_intp os_m = steps[7];
    const npy_intp os_p = steps[8];
    npy_intp i;

    for (i = 0; i < n_outer; i++) {
        void *ip1 = args[0];
        void *ip2 = args[1];
        void *op = args[2];

        OBJECT_matmul_inner_noblas(ip1, is1_m, is1_n,
                                   ip2, is2_n, is2_p,
                                   op, os_m, os_p, dm, dn, dp);

        /* Step all three operands to the next core matrix (in place). */
        args[0] += outer_s1;
        args[1] += outer_s2;
        args[2] += outer_so;
    }
}



