// HeteQuadrature/algoim/bernstein.hpp

#ifndef ALGOIM_BERNSTEIN_HPP
#define ALGOIM_BERNSTEIN_HPP

// algoim::bernstein implements several routines for working with multivariate Bernstein
// polynomials. Many of these methods, especially the orthant, root finding, Sylvester
// and Bezout methods, are based on those described in the paper
//    R. I. Saye, High-order quadrature on multi-component domains implicitly defined
//    by multivariate polynomials, Journal of Computational Physics, 448, 110720 (2022),
//    https://doi.org/10.1016/j.jcp.2021.110720

#include <algorithm>
#include <cassert>
#include <cmath>
#include <iostream>
#include <limits>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>
#include "real.hpp"
#include "uvector.hpp"
#include "xarray.hpp"
#include "sparkstack.hpp"
#include "binomial.hpp"
#include "utility.hpp"

// Some methods rely on a LAPACK implementation to solve
// generalised eigenvalue problems and SVD factorisation
#if __has_include(<lapacke.h>)
#include <lapacke.h>
#elif __has_include(<mkl_lapacke.h>)
#include <mkl_lapacke.h>
#else
#error \
    "Algoim requires a LAPACKE implementation to compute eigenvalues and SVD factorisations, but a suitable lapacke.h include file was not found; did you forget to specify its include path?"
#endif

namespace algoim::bernstein
{
// Evaluate, at the point x, the P Bernstein basis functions of degree P-1
//   x:   evaluation point
//   P:   number of basis functions (P >= 1)
//   out: array of length P; on return out[k] = binom[k] * x^k * (1 - x)^(P-1-k),
//        where binom = Binomial::row(P - 1)
template <typename T>
void evalBernsteinBasis(const T& x, int P, T* out)
{
    assert(P >= 1);
    const real* binom = Binomial::row(P - 1);
    // Forward sweep: out[k] = binom[k] * x^k
    T power = 1.0;
    for (int k = 0; k < P; ++k) {
        out[k]  = power * binom[k];
        power  *= x;
    }
    // Backward sweep: scale out[k] by (1 - x)^(P-1-k)
    power = 1.0;
    for (int k = P; k-- > 0;) {
        out[k] *= power;
        power  *= 1.0 - x;
    }
}

// Evaluate, at the point x, the N-dimensional Bernstein polynomial with coefficients beta;
// the result is the sum over all multi-indices of the coefficient times the product of
// the corresponding 1-D basis function values
template <int N>
real evalBernsteinPoly(const xarray<real, N>& beta, const uvector<real, N>& x)
{
    // One scratch array of 1-D basis values per dimension
    uvector<real*, N> basis;
    algoim_spark_alloc_vec(real, basis, beta.ext());
    for (int dim = 0; dim < N; ++dim) evalBernsteinBasis(x(dim), beta.ext(dim), basis(dim));
    real sum = 0.0;
    for (auto idx = beta.loop(); ~idx; ++idx) {
        real term = beta.l(idx);
        for (int dim = 0; dim < N; ++dim) term *= basis(dim)[idx(dim)];
        sum += term;
    }
    return sum;
}

// Fast evaluation of a 1-D Bernstein polynomial and its derivative
//   alpha: array of P Bernstein coefficients
//   P:     number of coefficients (P > 1)
//   binom: assumed to equal Binomial::row(P - 1); left to the caller to evaluate and
//          cache, for speed
//   x:     evaluation point
//   value: on return, the polynomial evaluated at x
//   deriv: on return, the derivative of the polynomial evaluated at x
// Declared inline because it is defined in a header: without it, every translation unit
// including this file would emit its own external definition.
inline void bernsteinValueAndDerivative(const real* alpha, int P, const real* binom, real x, real& value, real& deriv)
{
    assert(P > 1);
    // a[i] = x^i and b[i] = (1 - x)^i for i = 0, ..., P - 1
    real *a, *b;
    algoim_spark_alloc(real, &a, P, &b, P);
    a[0] = 1;
    for (int i = 1; i < P; ++i) a[i] = a[i - 1] * x;
    b[0] = 1;
    for (int i = 1; i < P; ++i) b[i] = b[i - 1] * (1 - x);
    // The two end terms are handled separately from the interior terms
    value = alpha[0] * b[P - 1] + alpha[P - 1] * a[P - 1];
    for (int i = 1; i < P - 1; ++i) value += alpha[i] * binom[i] * a[i] * b[P - 1 - i];
    // Derivative: product rule applied to each term x^i (1 - x)^(P-1-i)
    deriv = (alpha[P - 1] * a[P - 2] - alpha[0] * b[P - 2]) * (P - 1);
    for (int i = 1; i < P - 1; ++i)
        deriv += alpha[i] * binom[i] * (a[i - 1] * b[P - 1 - i] * i - a[i] * b[P - 2 - i] * (P - 1 - i));
}

// Evaluate, at the point x, the gradient of the N-dimensional Bernstein polynomial with
// coefficients beta; component j of the result is the partial derivative along dimension j
template <int N>
uvector<real, N> evalBernsteinPolyGradient(const xarray<real, N>& beta, const uvector<real, N>& x)
{
    // value(d)[k]: k-th 1-D basis function of dimension d at x(d); slope(d)[k]: its derivative
    uvector<real*, N> value, slope;
    algoim_spark_alloc_vec(real, value, beta.ext());
    algoim_spark_alloc_vec(real, slope, beta.ext());
    for (int d = 0; d < N; ++d) {
        const int P = beta.ext(d);
        assert(P >= 1);
        evalBernsteinBasis(x(d), P, value(d));
        if (P <= 1) {
            // Single basis function along this dimension: its derivative is zero
            slope(d)[0] = 0.0;
            continue;
        }
        // Derivatives are (P - 1) times differences of the basis functions with P - 1 terms
        real* lower;
        algoim_spark_alloc(real, &lower, P - 1);
        evalBernsteinBasis(x(d), P - 1, lower);
        slope(d)[0]     = (P - 1) * (-lower[0]);
        slope(d)[P - 1] = (P - 1) * (lower[P - 2]);
        for (int k = 1; k < P - 1; ++k) slope(d)[k] = (P - 1) * (lower[k - 1] - lower[k]);
    }
    uvector<real, N> grad = real(0.0);
    for (auto i = beta.loop(); ~i; ++i)
        for (int j = 0; j < N; ++j) {
            // For component j, use the derivative factor along dimension j and the plain
            // basis value along every other dimension
            real term = beta.l(i);
            for (int d = 0; d < N; ++d) term *= (d == j ? slope(d) : value(d))[i(d)];
            grad(j) += term;
        }
    return grad;
}

// Assuming p is represented in scaled Bernstein coefficients, reverse that scaling: each
// coefficient is divided by the product, over all dimensions, of the entry of
// Binomial::row(p.ext(dim) - 1) selected by its index along that dimension
template <int N>
void reverseScaledCoeff(xarray<real, N>& p)
{
    uvector<const real*, N> binom;
    for (int i = 0; i < N; ++i) binom(i) = Binomial::row(p.ext(i) - 1);
    // The loop-validity test is written with operator~, as in every other multi-index loop
    // of this file
    for (auto i = p.loop(); ~i; ++i) {
        real alpha = 1;
        for (int dim = 0; dim < N; ++dim) alpha *= binom(dim)[i(dim)];
        p.l(i) /= alpha;
    }
}

// Squared L2 norm of a Bernstein polynomial; the result may be negative, but only if
// the polynomial is essentially machine zero
template <int N>
real squaredL2norm(const xarray<real, N>& p)
{
    // Per dimension: Binomial::row(P - 1) and Binomial::row(2P - 2), where P = p.ext(d)
    uvector<const real*, N> rowP, row2P;
    for (int d = 0; d < N; ++d) {
        const int P = p.ext(d);
        rowP(d)     = Binomial::row(P - 1);
        row2P(d)    = Binomial::row(2 * P - 2);
    }
    // Double sum over all pairs of coefficients, each pair weighted by a product of
    // binomial ratios
    real sum = 0;
    for (auto i = p.loop(); ~i; ++i)
        for (auto j = p.loop(); ~j; ++j) {
            real weight = 1;
            for (int d = 0; d < N; ++d) {
                const real ratio  = rowP(d)[i(d)] / row2P(d)[i(d) + j(d)];
                weight           *= ratio * rowP(d)[j(d)];
            }
            sum += p.l(i) * p.l(j) * weight;
        }
    // Final normalisation by (2P - 1) in each dimension
    for (int d = 0; d < N; ++d) sum /= 2 * p.ext(d) - 1;
    return sum;
}

// Collapse a multivariate Bernstein polynomial along a given axis-aligned line, i.e., the
// set of points x where x(dim) is free and x(i) = x0(i) for all i != dim
//   out: array of length beta.ext(dim)
template <int N>
void collapseAlongAxis(const xarray<real, N>& beta, const uvector<real, N - 1>& x0, int dim, real* out)
{
    if constexpr (N == 1) {
        assert(dim == 0);
        for (int i = 0; i < beta.ext(0); ++i) out[i] = beta[i];
    } else {
        assert(0 <= dim && dim < N);
        uvector<real*, N - 1> basis;
        algoim_spark_alloc_vec(real, basis, remove_component(beta.ext(), dim));
        for (int i = 0; i < N - 1; ++i) {
            int P = beta.ext(i < dim ? i : i + 1);
            evalBernsteinBasis(x0(i), P, basis(i));
        }
        int P = beta.ext(dim);
        for (int i = 0; i < P; ++i) out[i] = 0.0;
        for (auto i = beta.loop(); ~i; ++i) {
            real s = beta.l(i);
            for (int j = 0; j < N; ++j)
                if (j < dim)
                    s *= basis(j)[i(j)];
                else if (j > dim)
                    s *= basis(j - 1)[i(j)];
            out[i(dim)] += s;
        }
    }
}

// Collapse a multivariate Bernstein polynomial along a given axis-orthogonal hyperplane,
// i.e., the set of points x where x(dim) is a fixed given value
//   beta: coefficients of the N-dimensional polynomial
//   dim:  the dimension being fixed
//   x:    the fixed value of x(dim)
//   out:  (N-1)-dimensional coefficient array; its extent must equal beta's extent with
//         component 'dim' removed
template <int N>
void collapseAlongHyperplane(const xarray<real, N>& beta, int dim, real x, xarray<real, N - 1>& out)
{
    static_assert(N > 1, "N > 1 required");
    assert(all(out.ext() == remove_component(beta.ext(), dim)));
    assert(0 <= dim && dim < N);
    int   P = beta.ext(dim);
    real* basis;
    algoim_spark_alloc(real, &basis, P);
    evalBernsteinBasis(x, P, basis);
    out = 0.0;
    // The loop-validity test is written with operator~, as in every other multi-index loop
    // of this file
    for (auto i = beta.loop(); ~i; ++i) out.m(remove_component(i(), dim)) += beta.l(i) * basis[i(dim)];
}

// Normalise polynomial by its largest (in absolute value) coefficient, as reported by
// alpha.maxNorm(); the polynomial is left untouched unless that value is positive
template <int N>
void normalise(xarray<real, N>& alpha)
{
    const real largest = alpha.maxNorm();
    if (!(largest > 0)) return;
    alpha *= real(1) / largest;
}

// Applying a simple examination of coefficient signs, returns +1 if the
// polynomial is uniformly positive, -1 if the polynomial is uniformly
// negative, or 0 if no guarantees can be made; concretely, the result is the
// value of util::sign shared by all coefficients, or 0 as soon as two differ
template <int N>
int uniformSign(const xarray<real, N>& beta)
{
    const int reference = util::sign(beta[0]);
    for (int k = 1; k < beta.size(); ++k) {
        if (util::sign(beta[k]) == reference) continue;
        return 0;
    }
    return reference;
}

// Compute coefficients of derivative in lower degree basis
//   alpha: array of length P (P >= 2)
//   out:   array of length P - 1; out[k] = (alpha[k + 1] - alpha[k]) * (P - 1)
// Only assignment and the compound operators -= and *= are applied to T
template <typename T>
void bernsteinDerivative(const T* alpha, int P, T* out)
{
    assert(P >= 2);
    const int degree = P - 1;
    for (int k = 0; k < degree; ++k) {
        T& d  = out[k];
        d     = alpha[k + 1];
        d    -= alpha[k];
        d    *= degree;
    }
}

// Compute the derivative, along dimension dim, of a Bernstein polynomial
//   a:   input coefficients; a.ext(dim) must be at least 2
//   dim: dimension along which to differentiate
//   out: output coefficients; its extent must equal a's extent with component 'dim'
//        decreased by one
template <int N>
void bernsteinDerivative(const xarray<real, N>& a, int dim, xarray<real, N>& out)
{
    assert(all(out.ext() == inc_component(a.ext(), dim, -1)));
    const int degree = a.ext(dim) - 1;
    assert(degree >= 1);
    // Forward difference of neighbouring coefficients along 'dim', then scale by the degree
    for (auto i = out.loop(); ~i; ++i) out.l(i) = a.m(i.shifted(dim, 1)) - a.m(i());
    out *= degree;
}

// Compute the derivative of a Bernstein polynomial, in the original basis;
// equivalent to computing normal derivative, and then elevating once
//   a:   input coefficients
//   dim: dimension along which to differentiate
//   out: output coefficients, same extent as a
template <int N>
void elevatedDerivative(const xarray<real, N>& a, int dim, xarray<real, N>& out)
{
    assert(all(out.ext() == a.ext()) && 0 <= dim && dim < N);
    const int P = a.ext(dim);
    // With a single coefficient along 'dim' the polynomial does not vary in that direction,
    // so its derivative is zero; returning here also avoids the i(dim) == 0 branch below,
    // which would read a at index 1 along 'dim', outside the array
    if (P <= 1) {
        out = 0.0;
        return;
    }
    for (auto i = a.loop(); ~i; ++i) {
        if (i(dim) == 0)
            // First coefficient along 'dim': one-sided difference with the next one
            out.l(i) = (a.m(i.shifted(dim, 1)) - a.l(i)) * (P - 1);
        else if (i(dim) == P - 1)
            // Last coefficient along 'dim': one-sided difference with the previous one
            out.l(i) = (a.l(i) - a.m(i.shifted(dim, -1))) * (P - 1);
        else
            // Interior: k (a_k - a_{k-1}) + (P-1-k) (a_{k+1} - a_k), with k = i(dim)
            out.l(i) =
                a.m(i.shifted(dim, -1)) * (-i(dim)) + a.l(i) * (2 * i(dim) - P + 1) + a.m(i.shifted(dim, 1)) * (P - 1 - i(dim));
    }
}

// Apply de Casteljau algorithm to compute the Bernstein coefficients of alpha relative to the
// interval [0,tau]; operates in place along the first dimension of alpha
template <int N>
void deCasteljauLeft(xarray<real, N>& alpha, real tau)
{
    const int  P    = alpha.ext(0);
    const real rest = 1.0 - tau;
    // Each pass blends entry j with entry j-1, sweeping downwards so that entry j-1 still
    // holds its value from the previous pass when it is read
    for (int pass = 1; pass < P; ++pass)
        for (int j = P - 1; j >= pass; --j) {
            alpha.a(j) *= tau;
            alpha.a(j) += alpha.a(j - 1) * rest;
        }
}

// Apply de Casteljau algorithm to compute the Bernstein coefficients of alpha relative to the
// interval [tau,1]; operates in place along the first dimension of alpha
template <int N>
void deCasteljauRight(xarray<real, N>& alpha, real tau)
{
    const int  P    = alpha.ext(0);
    const real rest = 1.0 - tau;
    // Each pass blends entry j with entry j+1, sweeping upwards so that entry j+1 still
    // holds its value from the previous pass when it is read
    for (int pass = 1; pass < P; ++pass)
        for (int j = 0; j < P - pass; ++j) {
            alpha.a(j) *= rest;
            alpha.a(j) += alpha.a(j + 1) * tau;
        }
}

// Apply de Casteljau algorithm to compute the Bernstein coefficients of alpha relative
// to the hyperrectangle [a,b]. It is assumed that the given arrays a & b each have
// length at least N. If, for a particular dimension, a[dim] > b[dim], both the interval
// and coefficients are reversed.
//   B: when true, only the first dimension is processed, using a[0] and b[0]
template <int N, bool B = false>
void deCasteljau(xarray<real, N>& alpha, const real* a, const real* b)
{
    using std::abs;
    using std::swap;
    if constexpr (N == 1 || B) {
        if (*b < *a) {
            // Reversed interval: solve on [b,a], then flip the coefficient order
            deCasteljau<N, B>(alpha, b, a);
            const int P = alpha.ext(0);
            for (int i = 0; i < P / 2; ++i) swap(alpha.a(i), alpha.a(P - 1 - i));
            return;
        }
        // Choose the order of the two restrictions by comparing |b| with |a - 1|
        const bool leftFirst = abs(*b) >= abs(*a - 1);
        if (leftFirst) {
            deCasteljauLeft(alpha, *b);
            deCasteljauRight(alpha, *a / *b);
        } else {
            deCasteljauRight(alpha, *a);
            deCasteljauLeft(alpha, (*b - *a) / (real(1) - *a));
        }
    } else {
        // Handle the first dimension on the flattened array, then recurse on each slice
        // for the remaining dimensions
        deCasteljau<2, true>(alpha.flatten().ref(), a, b);
        const real* aRest = a + 1;
        const real* bRest = b + 1;
        for (int i = 0; i < alpha.ext(0); ++i) deCasteljau(alpha.slice(i).ref(), aRest, bRest);
    }
}

// Apply de Casteljau algorithm to compute the Bernstein coefficients of alpha relative
// to the hyperrectangle [a,b]. If, for a particular dimension, a[dim] > b[dim], both the
// interval and coefficients are reversed.
template <int N>
void deCasteljau(const xarray<real, N>& alpha, const uvector<real, N>& a, const uvector<real, N>& b, xarray<real, N>& out)
{
    assert(all(out.ext() == alpha.ext()));
    out = alpha; // 这里是一个深拷贝，alpha不会被修改
    deCasteljau(out, a.data(), b.data());
}

// Elevate the degree of a Bernstein polynomial
template <int N, bool B = false>
void bernsteinElevate(const xarray<real, N>& alpha, xarray<real, N>& beta)
{
    assert(all(beta.ext() >= alpha.ext()));
    if constexpr (N == 1 || B) {
        int P = alpha.ext(0), Q = beta.ext(0);
        if (P == Q) {
            for (int k = 0; k < P; ++k) beta.a(k) = alpha.a(k);
        } else {
            int n = P - 1;
            int r = Q - 1 - n;
            if (r == 1) {
                beta.a(0)     = alpha.a(0);
                beta.a(n + 1) = alpha.a(n);
                for (int k = 1; k <= n; ++k) {
                    beta.a(k)  = alpha.a(k - 1) * (real(k) / real(n + 1));
                    beta.a(k) += alpha.a(k) * (real(1) - real(k) / real(n + 1));
                }
                return;
            }
            const real* bn  = Binomial::row(n);
            const real* br  = Binomial::row(r);
            const real* bnr = Binomial::row(n + r);
            for (int k = 0; k <= n + r; ++k) {
                beta.a(k) = 0.0;
                for (int j = std::max(0, k - r); j <= std::min(n, k); ++j)
                    beta.a(k) += alpha.a(j) * ((br[k - j] * bn[j]) / bnr[k]);
            }
        }
    } else {
        xarray<real, N> gamma(nullptr, set_component(alpha.ext(), 0, beta.ext(0)));
        algoim_spark_alloc(real, gamma);
        bernsteinElevate<2, true>(alpha.flatten(), gamma.flatten().ref());
        for (int i = 0; i < beta.ext(0); ++i) bernsteinElevate(gamma.slice(i), beta.slice(i).ref());
    }
}

namespace detail
{
// Compute least squares solution of Ax=b, where A is a (P+1) x P lower bidiagonal matrix, with
// diagonal given by 'alpha', lower diagonal by 'beta', each of length P, and the rhs is given
// by 'b', of length P+1; the algorithm applies QR with Givens, which shall create an upper
// bidiagonal R; no pivoting is performed, and the complexity is O(P) for QR factorisation and
// O(P*O) for back-solve on rhs of size PxO
//   in: alpha, length P, shall be overwritten
//   in: beta, length P, shall be overwritten
//   in: b, overwritten with first P rows yielding least squares solution
// Declared inline because it is defined in a header: without it, every translation unit
// including this file would emit its own external definition.
inline void lsqr_bidiagonal(real* alpha, real* beta, int P, xarray<real, 2>& b)
{
    assert(b.ext(0) == P + 1 && b.ext(1) > 0);
    // gamma collects the superdiagonal created by the rotations; allocated with the
    // default value 0
    real* gamma;
    algoim_spark_alloc_def(real, 0, &gamma, P);
    for (int i = 0; i < P; ++i) {
        // Rotation built from (alpha[i], beta[i]), applied to that pair, to the next
        // column (gamma[i+1], alpha[i+1]), and to rows i and i+1 of the right-hand side
        real c, s;
        util::givens_get(alpha[i], beta[i], c, s);
        util::givens_rotate(alpha[i], beta[i], c, s);
        if (i < P - 1) util::givens_rotate(gamma[i + 1], alpha[i + 1], c, s);
        for (int k = 0; k < b.ext(1); ++k) util::givens_rotate(b(i, k), b(i + 1, k), c, s);
    }
    // Back substitution on the upper bidiagonal system, last row first
    b.a(P - 1) /= alpha[P - 1];
    for (int i = P - 2; i >= 0; --i) {
        b.a(i) -= b.a(i + 1) * gamma[i + 1];
        b.a(i) /= alpha[i];
    }
}
} // namespace detail

// Reduce by one the effective degree of a Bernstein polynomial; this routine
// mainly makes sense when the actual polynomial degree is less than the one
// used in its (starting) Bernstein polynomial representation
//   alpha: coefficients, modified in place; alpha.ext(dim) must be at least 2
//   dim:   dimension along which to reduce
//   B:     true for the internal recursive calls on slices, which skip the final
//          repacking step
template <int N, bool B = false>
void bernsteinReduction(xarray<real, N>& alpha, int dim)
{
    assert(all(alpha.ext() >= 1) && 0 <= dim && dim < N && alpha.ext(dim) >= 2);
    if (dim != 0) {
        // Not the leading dimension: reduce every slice along dimension dim - 1
        if constexpr (N > 1) {
            for (int s = 0; s < alpha.ext(0); ++s) bernsteinReduction<N - 1, true>(alpha.slice(s).ref(), dim - 1);
        }
    } else {
        // Leading dimension: least squares solve with the bidiagonal matrix whose
        // diagonal is diag[k] = 1 - k/P and lower diagonal is sub[k] = (k + 1)/P
        const int P = alpha.ext(0) - 1;
        real *    diag, *sub;
        algoim_spark_alloc(real, &diag, P, &sub, P);
        diag[0]    = 1;
        sub[P - 1] = 1;
        for (int k = 1; k < P; ++k) {
            diag[k]    = real(1) - real(k) / real(P);
            sub[k - 1] = real(k) / real(P);
        }
        // View alpha's storage as a 2-D array with P + 1 rows
        xarray<real, 2> rows(alpha.data(), uvector<int, 2>{P + 1, prod(alpha.ext(), 0)});
        detail::lsqr_bidiagonal(diag, sub, P, rows);
    }

    if (!B) {
        // Repack: copy the old array, shrink the extent along 'dim' by one, and copy back
        // the entries addressed by the new, smaller index range
        xarray<real, N> saved(nullptr, alpha.ext());
        algoim_spark_alloc(real, saved);
        saved = alpha;
        alpha.alterExtent(inc_component(alpha.ext(), dim, -1));
        for (auto i = alpha.loop(); ~i; ++i) alpha.l(i) = saved.m(i());
    }
}

// Automatically reduce the degree of alpha; returns true iff degree reduction occurred
//   alpha: coefficients; replaced by the reduced polynomial whenever a reduction is accepted
//   tol:   relative tolerance; a reduction is accepted when the L2 norm of the difference
//          between alpha and the reduced-then-elevated polynomial is below tol times the
//          L2 norm of alpha
//   dim:   first dimension to examine; the dimensions from dim up to N - 1 are tried in turn
template <int N>
bool autoReduction(xarray<real, N>& alpha, real tol = 1.0e3 * std::numeric_limits<real>::epsilon(), int dim = 0)
{
    using std::abs;
    using std::sqrt;
    if (tol <= 0) return false;
    bool reduced = false;
    while (0 <= dim && dim < N) {
        bool accepted = false;
        if (alpha.ext(dim) >= 2) {
            xarray<real, N> lower(nullptr, alpha.ext()), error(nullptr, alpha.ext());
            algoim_spark_alloc(real, lower, error);
            // Reduce a copy, elevate it back and measure the difference from the original
            lower = alpha;
            bernsteinReduction(lower, dim);
            bernsteinElevate(lower, error);
            error            -= alpha;
            const real delta  = sqrt(abs(squaredL2norm(error)));
            const real norm   = sqrt(abs(squaredL2norm(alpha)));
            if (delta < tol * norm) {
                alpha.alterExtent(lower.ext());
                alpha    = lower;
                accepted = true;
            }
        }
        // After a successful reduction, try the same dimension again; otherwise move on
        if (accepted)
            reduced = true;
        else
            ++dim;
    }
    return reduced;
}

// Determine if there is a scalar alpha such that sign x(i) + alpha y(i) > 0 for every component i;
// if sign = 0, then returns true if it holds for sign = 1 and/or sign = -1
template <int N>
bool orthantTestBase(const xarray<real, N>& x, const xarray<real, N>& y, int sign = 0)
{
    assert(sign == 0 || sign == -1 || sign == 1);
    assert(all(x.ext() == y.ext()));
    using std::abs;
    using std::isinf;
    using std::max;
    using std::min;
    if (sign == 0) return orthantTestBase(x, y, -1) || orthantTestBase(x, y, 1);
    real alphaMax = std::numeric_limits<real>::infinity();
    real alphaMin = -std::numeric_limits<real>::infinity();
    for (int i = 0; i < x.size(); ++i) {
        if (y[i] == 0.0 && x[i] * sign <= 0.0) return false;
        if (y[i] > 0.0)
            alphaMin = max(alphaMin, -x[i] / y[i] * sign);
        else if (y[i] < 0.0)
            alphaMax = min(alphaMax, -x[i] / y[i] * sign);
    }
    if (isinf(alphaMin) || isinf(alphaMax)) return true;
    if (alphaMax - alphaMin > 1.0e5 * std::numeric_limits<real>::epsilon() * max(abs(alphaMin), abs(alphaMax))) return true;
    return false;
}

// Determine if there are scalars alpha and beta such that {alpha f + beta g > 0} holds for every
// Bernstein coefficient of f and g: if one of the polynomials has a smaller degree than the other,
// it is degree elevated so that the two polynomials have the same degree
template <int N>
bool orthantTest(const xarray<real, N>& f, const xarray<real, N>& g)
{
    if (all(f.ext() == g.ext()))
        return orthantTestBase(f, g);
    else {
        uvector<int, N> ext = max(f.ext(), g.ext());
        xarray<real, N> fe(nullptr, ext), ge(nullptr, ext);
        algoim_spark_alloc(real, fe, ge);
        bernsteinElevate(f, fe);
        bernsteinElevate(g, ge);
        return orthantTestBase(fe, ge);
    }
}

// Modified Chebyshev nodes (which include endpoints) for interpolating degree P-1 polynomials
//   i: node index, 0 <= i < P
//   P: number of nodes
// Returns 0.5 - 0.5 cos(pi i / (P - 1)), which lies in [0,1]. For P == 1 that formula
// would evaluate 0/0 and yield NaN, so the single node is placed at the midpoint 0.5.
inline real modifiedChebyshevNode(int i, int P)
{
    assert(0 <= i && i < P);
    using std::cos;
    if (P == 1) return 0.5;
    return 0.5 - 0.5 * cos(util::pi * i / (P - 1));
}

// Methods to compute, and cache, the SVD for Bernstein interpolation based on modified Chebysev nodes
struct BernsteinVandermondeSVD {
    struct SVD // 将矩阵A分解为U * diag(sigma) * Vt，其中U和Vt是正交矩阵，sigma是对角矩阵
    {
        real* U;
        real* Vt;
        real* sigma;
    };

    static SVD get(int P)
    {
        assert(P >= 1);
        static thread_local std::unordered_map<int, std::vector<real>> cache;
        if (cache.count(P) == 1) {
            real* base = cache.at(P).data();
            return SVD{base, base + P * P, base + 2 * P * P};
        }

        real *A, *superb, *basis;
        algoim_spark_alloc(real, &A, P * P, &superb, P, &basis, P);
        for (int i = 0; i < P; ++i) {
            evalBernsteinBasis(modifiedChebyshevNode(i, P), P, basis);
            for (int j = 0; j < P; ++j) A[i * P + j] = basis[j];
        }

        cache[P].resize(P * P + P * P + P);
        real* base = cache[P].data();
        SVD   result{base, base + P * P, base + 2 * P * P};

        static_assert(
            std::is_same_v<real, double>,
            "Algoim's default LAPACK code assumes real == double; a custom SVD solver is required when real != double");
        int info = LAPACKE_dgesvd(LAPACK_ROW_MAJOR, 'A', 'A', P, P, A, P, result.sigma, result.U, P, result.Vt, P, superb);
        if (info != 0) {
            std::cerr << "LAPACKE_dgesvd call failed (algoim::bernstein::BernsteinVandermondeSVD::get), info = " << info
                      << std::endl;
        }
        assert(info == 0 && "LAPACKE_dgesvd call failed (algoim::bernstein::BernsteinVandermondeSVD::get)");
        return result;
    }
};

// Interpolate tensor-product data f, assumed to be nodal values at the same nodes returned by modifiedChebyshevNode()
template <int N, bool B = false>
void bernsteinInterpolate(const xarray<real, N>& f, real tol, xarray<real, N>& out)
{
    assert(all(out.ext() == f.ext()));
    if constexpr (N == 1 || B) {
        int P = f.ext(0);
        int O = prod(f.ext(), 0);
        assert(P >= 1 && O >= 1);

        real* tmp;
        algoim_spark_alloc(real, &tmp, P * O);

        auto svd = BernsteinVandermondeSVD::get(P);

        for (int i = 0; i < P * O; ++i) tmp[i] = 0.0;
        for (int i = 0; i < P; ++i)
            for (int j = 0; j < P; ++j)
                for (int k = 0; k < O; ++k) tmp[i * O + k] += svd.U[j * P + i] * f[j * O + k];

        real minsigma = tol * svd.sigma[0];
        for (int i = 0; i < P; ++i) {
            real alpha = (svd.sigma[i] >= minsigma) ? (real(1) / svd.sigma[i]) : 0.0;
            for (int k = 0; k < O; ++k) tmp[i * O + k] *= alpha;
        }

        out = 0;
        for (int i = 0; i < P; ++i)
            for (int j = 0; j < P; ++j)
                for (int k = 0; k < O; ++k) out[i * O + k] += svd.Vt[j * P + i] * tmp[j * O + k];
    } else {
        xarray<real, N> gamma(nullptr, f.ext());
        algoim_spark_alloc(real, gamma);
        bernsteinInterpolate<2, true>(f.flatten(), tol, gamma.flatten().ref());
        for (int i = 0; i < f.ext(0); ++i) bernsteinInterpolate(gamma.slice(i), tol, out.slice(i).ref());
    }
}

// Interpolate a functional through its nodal evaluation at the modifiedChebyshevNode() points
template <int N, typename F>
void bernsteinInterpolate(F&& f, xarray<real, N>& out)
{
    xarray<real, N> ff(nullptr, out.ext());
    algoim_spark_alloc(real, ff);
    for (auto i = ff.loop(); ~i; ++i) {
        uvector<real, N> x;
        for (int dim = 0; dim < N; ++dim) x(dim) = modifiedChebyshevNode(i(dim), out.ext(dim));
        ff.l(i) = f(x);
    }
    bernsteinInterpolate(ff, std::pow(100.0 * std::numeric_limits<real>::epsilon(), 1.0 / N), out);
}

namespace detail
{
// Compute the generalised eigenvalues for matrix pair A, B
//   in: N by N square matrices; A, B will be overwritten
//   out: array of length N x 2; row i holds (real part, imaginary part) of eigenvalue i,
//        or (infinity, infinity) when LAPACK reports beta[i] == 0
// Declared inline because it is defined in a header: without it, every translation unit
// including this file would emit its own external definition.
inline void generalisedEigenvalues(xarray<real, 2>& A, xarray<real, 2>& B, xarray<real, 2>& out)
{
    int N = A.ext(0);
    assert(all(A.ext() == N) && all(B.ext() == N) && out.ext(0) == N && out.ext(1) == 2);
    real *alphar, *alphai, *beta, *lscale, *rscale;
    algoim_spark_alloc(real, &alphar, N, &alphai, N, &beta, N, &lscale, N, &rscale, N);
    real abnrm, bbnrm;
    int  ilo, ihi;
    static_assert(std::is_same_v<real, double>,
                  "Algoim's default LAPACK code assumes real == double; a custom generalised eigenvalue solver is required "
                  "when real != double");
    // No eigenvectors and no condition numbers are requested, hence the null output buffers
    int info = LAPACKE_dggevx(LAPACK_ROW_MAJOR, 'B', 'N', 'N', 'N', N, A.data(), N, B.data(), N, alphar, alphai, beta, nullptr,
                              N, nullptr, N, &ilo, &ihi, lscale, rscale, &abnrm, &bbnrm, nullptr, nullptr);
    assert(info == 0 && "LAPACKE_dggevx call failed (algoim::bernstein::detail::generalisedEigenvalues)");
    (void)info; // only inspected by the assert above
    for (int i = 0; i < N; ++i) {
        // Eigenvalue i is (alphar[i] + i alphai[i]) / beta[i]
        if (beta[i] != 0.0) {
            out(i, 0) = alphar[i] / beta[i];
            out(i, 1) = alphai[i] / beta[i];
        } else
            out(i, 0) = out(i, 1) = std::numeric_limits<real>::infinity();
    }
}
} // namespace detail

// Compute all complex roots of a Bernstein polynomial
//   alpha: array of length P (P >= 2)
//   out: array of length (P-1) x 2; row i holds (real part, imaginary part) of root i, as
//        returned by detail::generalisedEigenvalues
// Declared inline because it is defined in a header: without it, every translation unit
// including this file would emit its own external definition.
inline void rootsBernsteinPoly(const real* alpha, int P, xarray<real, 2>& out)
{
    assert(P >= 2 && out.ext(0) == P - 1 && out.ext(1) == 2);
    using std::abs;
    using std::max;

    // Work on a copy in which coefficients below eps^2 times the largest magnitude are
    // set to exactly zero
    real* beta;
    algoim_spark_alloc(real, &beta, P);
    real tol = 0.0;
    for (int i = 0; i < P; ++i) tol = max(tol, abs(alpha[i]));
    tol *= util::sqr(std::numeric_limits<real>::epsilon());
    for (int i = 0; i < P; ++i) beta[i] = (abs(alpha[i]) > tol) ? alpha[i] : 0;

    // Assemble the (P-1) x (P-1) matrix pair (A, B) whose generalised eigenvalues are
    // returned as the roots
    int             N = P - 1;
    xarray<real, 2> A(nullptr, uvector<int, 2>{N, N});
    xarray<real, 2> B(nullptr, uvector<int, 2>{N, N});
    algoim_spark_alloc(real, A, B);
    A = 0;
    B = 0;
    for (int i = 0; i < N - 1; ++i) A(i, i + 1) = B(i, i + 1) = 1.0;
    for (int i = 0; i < N; ++i) A(N - 1, i) = B(N - 1, i) = -beta[i];
    B(N - 1, N - 1) += beta[N] / N;
    for (int i = 0; i < N - 1; ++i) B(i, i) = real(N - i) / real(i + 1);

    detail::generalisedEigenvalues(A, B, out);
}

namespace detail
{
// Newton's method safeguarded by a standard bisection method; in Bernstein application it
// is only be applied to a Bernstein polynomial guaranteed to have just one real root
template <typename F>
bool newtonBisectionSearch(const F& f, real x0, real x1, real tol, int maxsteps, real& root)
{
    using std::abs;
    real f0, f1, dummy;
    f(x0, f0, dummy);
    f(x1, f1, dummy);
    if ((f0 > 0.0 && f1 > 0.0) || (f0 < 0.0 && f1 < 0.0)) return false;
    if (f0 == real(0.0)) {
        root = x0;
        return true;
    }
    if (f1 == real(0.0)) {
        root = x1;
        return true;
    }

    // x0 and x1 define the bracket; x0 always corresponds to negative value of f; x1 positive value of f
    if (f1 < 0.0) std::swap(x0, x1);

    // Initial guess is midpoint
    real x = (x0 + x1) * 0.5;
    real fx, fpx;
    f(x, fx, fpx);
    real dx = x1 - x0;
    for (int step = 0; step < maxsteps; ++step) {
        if ((fpx * (x - x0) - fx) * (fpx * (x - x1) - fx) < 0.0 && abs(fx) < abs(dx * fpx) * 0.5) {
            // Step in Newton's method falls within bracket and is less than half the previous step size
            dx         = -fx / fpx;
            real xold  = x;
            x         += dx;
            if (xold == x) {
                root = x;
                return true;
            }
        } else {
            // Revert to bisection
            dx = (x1 - x0) * 0.5;
            x  = x0 + dx;
            if (x == x0) {
                root = x;
                return true;
            }
        }
        if (abs(dx) < tol) {
            root = x;
            return true;
        }
        f(x, fx, fpx);
        if (fx == real(0.0)) // Got very lucky
        {
            root = x;
            return true;
        }
        if (fx < 0.0)
            x0 = x;
        else
            x1 = x;
    }
    return false;
}
} // namespace detail

// Compute, if possible, a simple real root in [0,1] of a Bernstein polynomial using
// Descartes' rule of signs:
//   - if it can be guaranteed that there are exactly 0 roots, 0 is returned
//   - if it can be guaranteed that there is exactly 1 root, and that root has
//     been calculated to full precision using Newton's method, 1 is returned
//     and the root is stored in 'root'
//   - if some coefficients are close to zero (thereby preventing a reliable use of
//     Descartes' rule), -1 is returned
//   - if no other guarantees can be made, -1 is returned
//   alpha: array of P Bernstein coefficients (P >= 2)
//   tol:   coefficients with magnitude below tol count as "close to zero"
// Declared inline because it is defined in a header: without it, every translation unit
// including this file would emit its own external definition.
inline int bernsteinSimpleRoot(const real* alpha, int P, real tol, real& root)
{
    assert(P >= 2);
    using std::abs;
    for (int i = 0; i < P; ++i)
        if (abs(alpha[i]) < tol) return -1;
    // Count sign changes between consecutive coefficients
    int count = 0;
    for (int i = 1; i < P; ++i)
        if ((alpha[i - 1] < 0 && alpha[i] >= 0) || (alpha[i - 1] >= 0 && alpha[i] < 0)) ++count;
    if (count == 0) return 0;
    if (count > 1) return -1;
    // Exactly one sign change: locate the root on [0,1] with at most 12 safeguarded Newton steps
    real        newton_tol = 10.0 * std::numeric_limits<real>::epsilon();
    const real* binom      = Binomial::row(P - 1);
    bool        b          = detail::newtonBisectionSearch(
        [=](real x, real& value, real& prime) { bernsteinValueAndDerivative(alpha, P, binom, x, value, prime); }, 0, 1,
        newton_tol, 12, root);
    return b ? 1 : -1;
}

// Compute real roots of a Bernstein polynomial using a bisection + Newton's method approach.
// Returns the number of real roots computed (and recorded in out, a buffer of size at least
// P - 1), or -1 if failed
//   alpha: Bernstein coefficients of the polynomial on the current sub-interval
//   a, b:  end points of the current sub-interval; roots found on [0,1] are mapped to [a,b]
//   depth: current bisection depth; the method gives up rather than bisect at depth 4 or more
//   tol:   near-zero coefficient tolerance passed to bernsteinSimpleRoot
// Declared inline because it is defined in a header: without it, every translation unit
// including this file would emit its own external definition.
inline int rootsBernsteinPolyFast(const xarray<real, 1>& alpha, real a, real b, int depth, real tol, real* out)
{
    // Try simple root method
    real root;
    int  res = bernsteinSimpleRoot(alpha.data(), alpha.ext(0), tol, root);
    // If it worked with a guarantee of no roots, return
    if (res == 0) return 0;
    // If it worked with a guarantee of just one root computed accurately,
    // transform that root to the [a,b] interval, record it, and return
    if (res == 1) {
        *out = a + (b - a) * root;
        return 1;
    }
    // Otherwise, the simple root method failed. Apply bisection, provided not already too deep
    if (depth >= 4) return -1;
    xarray<real, 1> beta(nullptr, alpha.ext());
    algoim_spark_alloc(real, beta);
    // Apply to left half
    beta = alpha;
    deCasteljauLeft(beta, 0.5);
    int r1 = rootsBernsteinPolyFast(beta, a, a + (b - a) * 0.5, depth + 1, tol, out);
    if (r1 < 0) return -1;
    // Apply to right half, shifting buffer by r1
    beta = alpha;
    deCasteljauRight(beta, 0.5);
    int r2 = rootsBernsteinPolyFast(beta, a + (b - a) * 0.5, b, depth + 1, tol, out + r1);
    if (r2 < 0) return -1;
    return r1 + r2;
}

// Apply generalised eigenvalue method to compute the real roots of alpha in the interval [0,1],
// returning the number of roots recorded in 'out', a buffer of size at least P - 1
//   alpha: array of P Bernstein coefficients
// A complex root is kept when its real part lies in [0,1] and the magnitude of its imaginary
// part is below 1e4 times machine epsilon.
// Declared inline because it is defined in a header: without it, every translation unit
// including this file would emit its own external definition.
inline int bernsteinUnitIntervalRealRoots_eigenvalue(const real* alpha, int P, real* out)
{
    using std::abs;
    xarray<real, 2> roots(nullptr, uvector<int, 2>{P - 1, 2});
    algoim_spark_alloc(real, roots);
    rootsBernsteinPoly(alpha, P, roots);
    real tol   = 1.0e4 * std::numeric_limits<real>::epsilon(); // nearly-real-root tolerance
    int  count = 0;
    for (int j = 0; j < P - 1; ++j) {
        if (0 <= roots(j, 0) && roots(j, 0) <= 1 && abs(roots(j, 1)) < tol) {
            *(out + count) = roots(j, 0);
            ++count;
        }
    }
    return count;
}

// Apply a Newton's method-based approach to compute the real roots of alpha in the interval [0,1];
// if succeeded, returns the number of roots recorded in 'out' (a buffer of size at least P - 1);
// if failed, returns -1
//   alpha: array of P Bernstein coefficients; wrapped in a non-owning xarray view and not
//          modified by this function
// Declared inline because it is defined in a header: without it, every translation unit
// including this file would emit its own external definition.
inline int bernsteinUnitIntervalRealRoots_fast(const real* alpha, int P, real* out)
{
    using std::abs;
    using std::max;
    // Compute a tolerance by which to declare a nearly-zero coefficient as being
    // too close to zero (for Descartes' rule of signs and to avoid problems where
    // a root lies close to a subinterval endpoint which can confuse bisection)
    real tol = 0;
    for (int i = 0; i < P; ++i) tol = max(tol, abs(alpha[i]));
    tol *= 1.0e4 * std::numeric_limits<real>::epsilon(); // nearly zero coeff tolerance, can be loose
    // The const_cast is only needed to build the view; the callee takes it by const reference
    return rootsBernsteinPolyFast(xarray<real, 1>(const_cast<real*>(alpha), P), 0, 1, 0, tol, out);
}

// Driver method to compute the real roots of a Bernstein polynomial in the interval [0,1];
// the method first tries a fast approach, which succeeds in the vast majority of cases and is
// anywhere between 10x and 100x faster than the backup approach; if the fast approach fails,
// the backup method is applied. Returns the number of computed roots, recorded in the buffer
// 'out' of size at least P - 1
//   alpha: array of P Bernstein coefficients
// Linear (P == 2) and quadratic (P == 3) polynomials are solved directly.
// Declared inline because it is defined in a header: without it, every translation unit
// including this file would emit its own external definition.
inline int bernsteinUnitIntervalRealRoots(const real* alpha, int P, real* out)
{
    using std::sqrt;
    if (P == 1) return 0;

    // Direct method for linear polynomials
    if (P == 2) {
        if (alpha[0] == alpha[1]) return 0;
        real x = alpha[0] / (alpha[0] - alpha[1]);
        // Written as a positive range test so that a NaN quotient (possible with
        // non-finite coefficients) is rejected rather than recorded as a root
        if (!(0 <= x && x <= 1)) return 0;
        *out = x;
        return 1;
    }

    // Direct method for quadratic polynomials, using numerically-stable quadratic formula
    if (P == 3) {
        // Power-basis form a x^2 + b x + c of the quadratic
        real a     = alpha[0] - alpha[1] * 2 + alpha[2];
        real b     = (alpha[1] - alpha[0]) * 2;
        real c     = alpha[0];
        real delta = b * b - a * c * 4;
        if (delta < 0) return 0;
        real q     = -0.5 * (b + (b >= 0 ? sqrt(delta) : -sqrt(delta)));
        real r1    = q / a;
        real r2    = c / q;
        int  count = 0;
        // Infinite or NaN quotients fail these range tests and are therefore skipped
        if (0 <= r1 && r1 <= 1) {
            *out = r1;
            ++count;
        }
        if (0 <= r2 && r2 <= 1) {
            *(out + count) = r2;
            ++count;
        }
        return count;
    }

    // Apply fast method, if possible, and resort to eigenvalue method if it fails
    int count = bernsteinUnitIntervalRealRoots_fast(alpha, P, out);
    if (count >= 0) return count;
    return bernsteinUnitIntervalRealRoots_eigenvalue(alpha, P, out);
}

// Build Sylvester matrix for Bernstein polynomials of degrees P-1 and Q-1
//   a:   array of P coefficients
//   b:   array of Q coefficients
//   out: square matrix of dimensions P + Q - 2; the first Q - 1 rows hold shifted, scaled
//        copies of a, the remaining P - 1 rows hold shifted, scaled copies of b
// Declared inline because it is defined in a header: without it, every translation unit
// including this file would emit its own external definition.
inline void sylvesterMatrix(const real* a, int P, const real* b, int Q, xarray<real, 2>& out)
{
    assert(P >= 1 && Q >= 1 && P + Q >= 3 && out.ext(0) == P + Q - 2 && out.ext(1) == P + Q - 2);
    const real* bP  = Binomial::row(P - 1);
    const real* bQ  = Binomial::row(Q - 1);
    const real* bPQ = Binomial::row(P + Q - 3);
    out             = 0;
    for (int i = 0; i < Q - 1; ++i)
        for (int j = 0; j < P; ++j) out(i, j + i) = a[j] * (bP[j] / bPQ[j + i]);
    for (int i = 0; i < P - 1; ++i)
        for (int j = 0; j < Q; ++j) out(i + Q - 1, j + i) = b[j] * (bQ[j] / bPQ[j + i]);
}

// Build Bezout matrix for Bernstein polynomials of equal degree P-1
//   a, b: arrays of P coefficients each (P >= 2)
//   out:  square matrix of dimensions P - 1; filled symmetrically
// Declared inline because it is defined in a header: without it, every translation unit
// including this file would emit its own external definition.
inline void bezoutMatrix(const real* a, const real* b, int P, xarray<real, 2>& out)
{
    assert(P >= 2 && out.ext(0) == P - 1 && out.ext(1) == P - 1);
    const int n = P - 1;
    out         = 0;
    // First column and last row are given in closed form
    for (int i = 1; i <= n; ++i) out(i - 1, 0) = (a[i] * b[0] - a[0] * b[i]) * real(n) / real(i);
    for (int j = 1; j <= n - 1; ++j) out(n - 1, j) = (a[n] * b[j] - a[j] * b[n]) * real(n) / real(n - j);
    // Remaining lower-triangular entries follow by recurrence from the entry one row
    // below and one column to the left
    for (int i = n - 1; i >= 1; --i)
        for (int j = 1; j <= i - 1; ++j)
            out(i - 1, j) = (a[i] * b[j] - a[j] * b[i]) * real(n * n) / real(i * (n - j))
                            + out(i, j - 1) * real(j * (n - i)) / real(i * (n - j));
    // Mirror the lower triangle into the upper triangle
    for (int i = 0; i < n; ++i)
        for (int j = i + 1; j < n; ++j) out(i, j) = out(j, i);
}
} // namespace algoim::bernstein

#endif