//
// Created by 14727 on 2022/12/9.
//

#include "device/Nurbs/nurbs_surface.cuh"
#include "device/Nurbs/nurbs_common.cuh"
#include "utils.h"
#include "device/Nurbs/bvh.cuh"
#include "device/device_utils.cuh"
#include "tinynurbs/tinynurbs.h"

__global__ void
NurbsSurface::g_evaluate(float *res, const float *d_nTexture_u, const float *d_nTexture_v, const float *d_points,
                         int d_pointsCnt_u,
                         int d_pointsCnt_v, int d_POINT_SIZE, float d_lastKnot_u, float d_lastKnot_v, int d_sampleCnt_u,
                         int d_sampleCnt_v) {
    // 二维grid和二维的block
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;

    float u = ix * d_lastKnot_u / (d_sampleCnt_u - 1);
    float v = iy * d_lastKnot_v / (d_sampleCnt_v - 1);

    if (u > 1.0 * d_lastKnot_u || v > 1.0 * d_lastKnot_v) {
        return;
    }

    float x = 0., y = 0., z = 0., sumW = 0.;
    for (int i = 0; i < d_pointsCnt_u; i++) {
        float N_U = d_nTexture_u[ix * d_pointsCnt_u + i];
        for (int j = 0; j < d_pointsCnt_v; j++) {
            float N_V = d_nTexture_v[iy * d_pointsCnt_v + j];
            int idx = (i * d_pointsCnt_v + j) * d_POINT_SIZE;
            float w = d_points[idx + 3];
            x += N_U * N_V * w * d_points[idx];
            y += N_U * N_V * w * d_points[idx + 1];
            z += N_U * N_V * w * d_points[idx + 2];
            sumW += N_U * N_V * w;
        }
    }
    x = x / sumW;
    y = y / sumW;
    z = z / sumW;

    int baseIdx = (ix * d_sampleCnt_v + iy) * 3;
    res[baseIdx] = x;
    res[baseIdx + 1] = y;
    res[baseIdx + 2] = z;

//    printf("(%d, %d)-->(%g, %g, %g)\n", ix, iy, x, y, z); // %g输出，舍弃无意义的0
}


__global__ void
NurbsSurface::g_derivative(float *derivatives, float *normals, const float *derTexture_u, const float *derTexture_v,
                           const float *nTexture_u, const float *nTexture_v, const float *d_points, int d_pointsCnt_u,
                           int d_pointsCnt_v, int d_POINT_SIZE, float d_lastKnot_u, float d_lastKnot_v,
                           int d_sampleCnt_u, int d_sampleCnt_v) {
    // 二维grid和二维的block
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;

    if (ix >= d_sampleCnt_u || iy >= d_sampleCnt_v) {
        return;
    }
    float u = ix * d_lastKnot_u / (d_sampleCnt_u - 1);
    float v = iy * d_lastKnot_v / (d_sampleCnt_v - 1);

    float nubsPdx_u = 0., nubsPdy_u = 0, nubsPdz_u = 0., nubsPdw_u = 0.;
    float nubsPdx_v = 0., nubsPdy_v = 0, nubsPdz_v = 0., nubsPdw_v = 0.;

    for (int i = 0; i < d_pointsCnt_u; i++) {
        for (int j = 0; j < d_pointsCnt_u; j++) {
            int baseIdx = (i * d_pointsCnt_v + j) * d_POINT_SIZE;
            float factor_u = derTexture_u[ix * d_pointsCnt_u + i] * nTexture_v[iy * d_pointsCnt_v + j];
            float factor_v = derTexture_v[iy * d_pointsCnt_v + j] * nTexture_u[ix * d_pointsCnt_u + i];
            float wij = d_points[baseIdx + 3];
            nubsPdx_u += factor_u * wij * d_points[baseIdx];
            nubsPdy_u += factor_u * wij * d_points[baseIdx + 1];
            nubsPdz_u += factor_u * wij * d_points[baseIdx + 2];
            nubsPdw_u += factor_u * wij;

            nubsPdx_v += factor_v * wij * d_points[baseIdx];
            nubsPdy_v += factor_v * wij * d_points[baseIdx + 1];
            nubsPdz_v += factor_v * wij * d_points[baseIdx + 2];
            nubsPdw_v += factor_v * wij;
        }
    }

    float x = 0., y = 0., z = 0., w = 0.;
    for (int i = 0; i < d_pointsCnt_u; i++) {
        float N_U = nTexture_u[ix * d_pointsCnt_u + i];
        for (int j = 0; j < d_pointsCnt_v; j++) {
            float N_V = nTexture_v[iy * d_pointsCnt_v + j];
            int idx = (i * d_pointsCnt_v + j) * d_POINT_SIZE;
            float wij = d_points[idx + 3];
            x += N_U * N_V * wij * d_points[idx];
            y += N_U * N_V * wij * d_points[idx + 1];
            z += N_U * N_V * wij * d_points[idx + 2];
            w += N_U * N_V * wij;
        }
    }

    float w2 = w * w;
    float pdx_u = (nubsPdx_u * w - x * nubsPdw_u) / w2;
    float pdy_u = (nubsPdy_u * w - y * nubsPdw_u) / w2;
    float pdz_u = (nubsPdz_u * w - z * nubsPdw_u) / w2;

    float pdx_v = (nubsPdx_v * w - x * nubsPdw_v) / w2;
    float pdy_v = (nubsPdy_v * w - y * nubsPdw_v) / w2;
    float pdz_v = (nubsPdz_v * w - z * nubsPdw_v) / w2;
//    float pdz_u = (nubsPdz_u * w - z )

    int baseIdx = (ix * d_sampleCnt_v + iy) * 6;
    derivatives[baseIdx] = pdx_u;
    derivatives[baseIdx + 1] = pdy_u;
    derivatives[baseIdx + 2] = pdz_u;
    derivatives[baseIdx + 3] = pdx_v;
    derivatives[baseIdx + 4] = pdy_v;
    derivatives[baseIdx + 5] = pdz_v;

    // 叉乘得到法向量
    baseIdx = (ix * d_sampleCnt_v + iy) * 3;
    normals[baseIdx] = pdy_u * pdz_v - pdy_v * pdz_u;
    normals[baseIdx + 1] = pdx_v * pdz_u - pdx_u * pdz_v;
    normals[baseIdx + 2] = pdx_u * pdy_v - pdx_v * pdy_u;
    normalization(normals[baseIdx], normals[baseIdx + 1], normals[baseIdx + 2]);
//    if ((ix == 8 && iy == 9) || (ix == 7 && iy == 9) || (ix == 9 && iy == 9) || (ix == 8 && iy == 8) ||
//        (ix == 8 && iy == 10))
//        printf("(%g,%g)-->u:(%g, %g, %g), v:(%g,%g,%g), normal:(%g,%g,%g)\n", u, v, pdx_u, pdy_u, pdz_u, pdx_v, pdy_v,
//               pdz_v, normals[baseIdx], normals[baseIdx + 1], normals[baseIdx + 2]);
}

__global__ void
NurbsSurface::g_curvature(const float *derivatives, const int sampleCnt_u, const int sampleCnt_v, float lastKnot_u,
                          float lastKnot_v, float *ms, float *k) {
    // 二维grid和二维的block
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;

    if (ix >= sampleCnt_u || iy >= sampleCnt_v) {
        return;
    }


    float step_u = lastKnot_u / (sampleCnt_u - 1), step_v = lastKnot_v / (sampleCnt_v - 1);
    float u = ix * step_u, v = iy * step_v;

    int baseIdx = (ix * sampleCnt_v + iy) * 6;
    int lastBaseIdx_u = ((ix - 1) * sampleCnt_v + iy) * 6, nextBaseIdx_u = ((ix + 1) * sampleCnt_v + iy) * 6;
    int lastBaseIdx_v = (ix * sampleCnt_v + iy - 1) * 6, nextBaseIdx_v = (ix * sampleCnt_v + iy + 1) * 6;

//    printf("(%g,%g)-->u:(%g, %g, %g), v:(%g,%g,%g)\n", u, v, derivatives[baseIdx], derivatives[baseIdx + 1],
//           derivatives[baseIdx + 2], derivatives[baseIdx + 3], derivatives[baseIdx + 4], derivatives[baseIdx + 5]);

    float sndPdx_uu, sndPdy_uu, sndPdz_uu, sndPdx_vv, sndPdy_vv, sndPdz_vv;  // 二阶导
    float sndPdx_uv, sndPdy_uv, sndPdz_uv, sndPdx_vu, sndPdy_vu, sndPdz_vu;

    if (ix == 0) {
        sndPdx_uu = (derivatives[nextBaseIdx_u] - derivatives[baseIdx]) / step_u;
        sndPdy_uu = (derivatives[nextBaseIdx_u + 1] - derivatives[baseIdx + 1]) / step_u;
        sndPdz_uu = (derivatives[nextBaseIdx_u + 2] - derivatives[baseIdx + 2]) / step_u;

        sndPdx_vu = (derivatives[nextBaseIdx_u + 3] - derivatives[baseIdx + 3]) / step_u;
        sndPdy_vu = (derivatives[nextBaseIdx_u + 4] - derivatives[baseIdx + 4]) / step_u;
        sndPdz_vu = (derivatives[nextBaseIdx_u + 5] - derivatives[baseIdx + 5]) / step_u;
    } else if (ix == sampleCnt_u - 1) {
        sndPdx_uu = (derivatives[baseIdx] - derivatives[lastBaseIdx_u]) / step_u;
        sndPdy_uu = (derivatives[baseIdx + 1] - derivatives[lastBaseIdx_u + 1]) / step_u;
        sndPdz_uu = (derivatives[baseIdx + 2] - derivatives[lastBaseIdx_u + 2]) / step_u;

        sndPdx_vu = (derivatives[baseIdx + 3] - derivatives[lastBaseIdx_u + 3]) / step_u;
        sndPdy_vu = (derivatives[baseIdx + 4] - derivatives[lastBaseIdx_u + 4]) / step_u;
        sndPdz_vu = (derivatives[baseIdx + 5] - derivatives[lastBaseIdx_u + 5]) / step_u;
    } else {
        sndPdx_uu = (derivatives[nextBaseIdx_u] - derivatives[lastBaseIdx_u]) / (2 * step_u);
        sndPdy_uu = (derivatives[nextBaseIdx_u + 1] - derivatives[lastBaseIdx_u + 1]) / (2 * step_u);
        sndPdz_uu = (derivatives[nextBaseIdx_u + 2] - derivatives[lastBaseIdx_u + 2]) / (2 * step_u);

        sndPdx_vu = (derivatives[nextBaseIdx_u + 3] - derivatives[lastBaseIdx_u + 3]) / (2 * step_u);
        sndPdy_vu = (derivatives[nextBaseIdx_u + 4] - derivatives[lastBaseIdx_u + 4]) / (2 * step_u);
        sndPdz_vu = (derivatives[nextBaseIdx_u + 5] - derivatives[lastBaseIdx_u + 5]) / (2 * step_u);
    }

    if (iy == 0) {
        sndPdx_vv = (derivatives[nextBaseIdx_v + 3] - derivatives[baseIdx + 3]) / step_v;
        sndPdy_vv = (derivatives[nextBaseIdx_v + 4] - derivatives[baseIdx + 4]) / step_v;
        sndPdz_vv = (derivatives[nextBaseIdx_v + 5] - derivatives[baseIdx + 5]) / step_v;

        sndPdx_uv = (derivatives[nextBaseIdx_v] - derivatives[baseIdx]) / step_v;
        sndPdy_uv = (derivatives[nextBaseIdx_v + 1] - derivatives[baseIdx + 1]) / step_v;
        sndPdz_uv = (derivatives[nextBaseIdx_v + 2] - derivatives[baseIdx + 2]) / step_v;
    } else if (iy == sampleCnt_v - 1) {
        sndPdx_vv = (derivatives[baseIdx + 3] - derivatives[lastBaseIdx_v + 3]) / step_v;
        sndPdy_vv = (derivatives[baseIdx + 4] - derivatives[lastBaseIdx_v + 4]) / step_v;
        sndPdz_vv = (derivatives[baseIdx + 5] - derivatives[lastBaseIdx_v + 5]) / step_v;

        sndPdx_uv = (derivatives[baseIdx] - derivatives[lastBaseIdx_v]) / step_v;
        sndPdy_uv = (derivatives[baseIdx + 1] - derivatives[lastBaseIdx_v + 1]) / step_v;
        sndPdz_uv = (derivatives[baseIdx + 2] - derivatives[lastBaseIdx_v + 2]) / step_v;
    } else {
        sndPdx_vv = (derivatives[nextBaseIdx_v + 3] - derivatives[lastBaseIdx_v + 3]) / (2 * step_v);
        sndPdy_vv = (derivatives[nextBaseIdx_v + 4] - derivatives[lastBaseIdx_v + 4]) / (2 * step_v);
        sndPdz_vv = (derivatives[nextBaseIdx_v + 5] - derivatives[lastBaseIdx_v + 5]) / (2 * step_v);

        sndPdx_uv = (derivatives[nextBaseIdx_v] - derivatives[lastBaseIdx_v]) / (2 * step_v);
        sndPdy_uv = (derivatives[nextBaseIdx_v + 1] - derivatives[lastBaseIdx_v + 1]) / (2 * step_v);
        sndPdz_uv = (derivatives[nextBaseIdx_v + 2] - derivatives[lastBaseIdx_v + 2]) / (2 * step_v);
    }

    float uvx = (sndPdx_uv + sndPdx_vu) / 2, uvy = (sndPdy_uv + sndPdy_vu) / 2, uvz = (sndPdz_uv + sndPdz_vu) / 2;
//    normalization(sndPdx_uv, sndPdy_uv, sndPdz_uv);
//    normalization(sndPdx_vu, sndPdy_vu, sndPdz_vu);
//    normalization(sndPdx_uu, sndPdy_uu, sndPdz_uu);
//    normalization(uvx, uvy, uvz);
//    normalization(sndPdx_vv, sndPdy_vv, sndPdz_vv);

//    if (ix == 8 && iy == 9) {
//        printf("(%g, %g) --> uu: (%g, %g, %g), uv: (%g, %g, %g), vv: (%g, %g, %g)\n", u, v, sndPdx_uu, sndPdy_uu,
//               sndPdz_uu,
//               uvx, uvy, uvz, sndPdx_vv, sndPdy_vv, sndPdz_vv);
//        printf("uv: (%g, %g, %g), vu: (%g, %g, %g)\n", sndPdx_uv, sndPdy_uv, sndPdz_uv, sndPdx_vu, sndPdy_vu,
//               sndPdz_vu);
//    }

    float m1 = max(max(sndPdx_uu, sndPdy_uu), sndPdz_uu);
    float m2 = max(max(uvx, uvy), uvz);
    float m3 = max(max(sndPdx_vv, sndPdy_vv), sndPdz_vv);

//    __shared__ float ms[363];
    ms[(ix * sampleCnt_v + iy) * 3] = m1;
    ms[(ix * sampleCnt_v + iy) * 3 + 1] = m2;
    ms[(ix * sampleCnt_v + iy) * 3 + 2] = m3;
    __syncthreads();
//    if(ix == 1 && iy == 1) {
//        for(int i = 0; i < sampleCnt_u; i++) {
//            for(int j = 0; j < sampleCnt_v; j++) {
//                printf("%g ", ms[(i * sampleCnt_v + j) * 3]);
//            }
//            printf("\n");
//        }
//    }

    // 规约求最大值
    for (int step = (sampleCnt_u + 1) / 2; step > 1; step = (step + 1) / 2) {
        // step 表示现在参与计算最大值的数据的长度的一半
        if (ix < step && ix + step < sampleCnt_u) {
            ms[(ix * sampleCnt_v + iy) * 3] = max(m1, ms[((ix + step) * sampleCnt_v + iy) * 3]);
            ms[(ix * sampleCnt_v + iy) * 3 + 1] = max(m1, ms[((ix + step) * sampleCnt_v + iy) * 3 + 1]);
            ms[(ix * sampleCnt_v + iy) * 3 + 2] = max(m1, ms[((ix + step) * sampleCnt_v + iy) * 3 + 2]);
        }
    }

    for (int step = (sampleCnt_v + 1) / 2; step > 1; step = (step + 1) / 2) {
        // step 表示现在参与计算最大值的数据的长度的一半
        if (iy < step && iy + step < sampleCnt_v) {
            ms[iy * 3] = max(ms[iy * 3], ms[(iy + step) * 3]);
            ms[iy * 3 + 1] = max(ms[iy * 3 + 1], ms[(iy + step) * 3 + 1]);
            ms[iy * 3 + 2] = max(ms[iy * 3 + 2], ms[(iy + step) * 3 + 2]);
        }
    }
    __syncthreads();
    int n = sampleCnt_u - 1;
    int m = sampleCnt_v - 1;
    *k = (ms[0] / (n * n) + 2 * ms[1] / (n * m) + ms[2] / (m * m)) / 8;
//    if(ix == 1 && iy == 1)printf("%g gggg\n", ms[0]);
}


__host__ NurbsSurface::Surface::Surface(MeshPoints4 controlPoints, std::vector<float> knots_u,
                                        std::vector<float> knots_v) {
    this->knots_u = std::move(knots_u);
    this->knots_v = std::move(knots_v);
    this->controlPoints = std::move(controlPoints);
    recordTime = false;
    d_nTexture_u = nullptr;
    d_nTexture_v = nullptr;
    d_nTexture1_u = nullptr;
    d_nTexture1_v = nullptr;
    d_knots_u = nullptr;
    d_knots_v = nullptr;
    d_points = nullptr;
    d_evaluationRes = nullptr;
    d_derivatives = nullptr;
    d_k = nullptr;
    d_normals = nullptr;
    bvh.nodes = nullptr;
    h_evaluations = nullptr;
    h_derivatives = nullptr;
    h_normals = nullptr;
}

__host__ void NurbsSurface::Surface::evaluate(int sampleCnt_u, int sampleCnt_v) {
    // 构造指向device的controlPoints
    const int pointsCnt_u = controlPoints.size(), pointsCnt_v = controlPoints[0].size();
    const int pointsBytes = pointsCnt_u * pointsCnt_v * sizeof(glm::vec4);
    auto *h_points = (float *) malloc(pointsBytes);
    for (int i = 0; i < pointsCnt_u; i++) {
        for (int j = 0; j < pointsCnt_v; j++) {
            for (int k = 0; k < POINT_SIZE; k++) {
                h_points[(i * pointsCnt_v + j) * POINT_SIZE + k] = controlPoints[i][j][k];
            }
//            printf("%f/ %f/ %f/ %f     ", controlPoints[i][j][0], controlPoints[i][j][1], controlPoints[i][j][2], controlPoints[i][j][3]);
        }
    }
    cudaMalloc((void **) &d_points, pointsBytes);
    cudaMemcpy(d_points, h_points, pointsBytes, cudaMemcpyHostToDevice);

    // 构造指向device的knots
    const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size();
    const int knotsBytes_u = knotsCnt_u * sizeof(float), knotsBytes_v = knotsCnt_v * sizeof(float);
    auto *h_knots_u = (float *) malloc(knotsBytes_u), *h_knots_v = (float *) malloc(knotsBytes_v);
    for (int i = 0; i < knotsCnt_u; i++) h_knots_u[i] = knots_u[i];
    for (int i = 0; i < knotsCnt_v; i++) h_knots_v[i] = knots_v[i];

    safeCudaFree(d_knots_u);
    safeCudaFree(d_knots_v);
    cudaMalloc((void **) &d_knots_u, knotsBytes_u);
    cudaMalloc((void **) &d_knots_v, knotsBytes_v);
    cudaMemcpy(d_knots_u, h_knots_u, knotsBytes_u, cudaMemcpyHostToDevice);
    cudaMemcpy(d_knots_v, h_knots_v, knotsBytes_v, cudaMemcpyHostToDevice);

    // 构造nTexture
    cudaMalloc((void **) &d_nTexture_u,
               sampleCnt_u * pointsCnt_u * sizeof(float)); // 注意nTexture的大小，在算梯度时用得到i=pointsCnt + 1的基函数值
    cudaMalloc((void **) &d_nTexture_v, sampleCnt_v * pointsCnt_v * sizeof(float));

    // 构造nTexture1
    cudaMalloc((void **) &d_nTexture1_u, sampleCnt_u * (pointsCnt_u + 1) * sizeof(float));
    cudaMalloc((void **) &d_nTexture1_v, sampleCnt_v * (pointsCnt_v + 1) * sizeof(float));

    // 结果数组
    size_t resBytes = sampleCnt_u * sampleCnt_v * 3 * sizeof(float);
    safeCudaFree(d_evaluationRes);
    cudaMalloc((void **) &d_evaluationRes, resBytes);
    safeFree(h_evaluations);
    h_evaluations = (float *) malloc(resBytes);

    // 构造g_basisTexture线程层级
    dim3 blockBasis(512);
    dim3 gridBasis_u((sampleCnt_u + blockBasis.x - 1) / blockBasis.x);
    dim3 gridBasis_v((sampleCnt_v + blockBasis.x - 1) / blockBasis.x);

    // 构造线程层级，调用核函数
    dim3 block(32, 32);
    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);
    // 记录用时
    double time_cost_device;
    if (recordTime) time_cost_device = get_time();
    g_basisTexture<<<gridBasis_u, blockBasis>>>(d_nTexture_u, d_nTexture1_u, d_knots_u, pointsCnt_u, knotsCnt_u,
                                                sampleCnt_u);
    cudaDeviceSynchronize();
    g_basisTexture<<<gridBasis_v, blockBasis>>>(d_nTexture_v, d_nTexture1_v, d_knots_v, pointsCnt_v, knotsCnt_v,
                                                sampleCnt_v);
    cudaDeviceSynchronize();

    g_evaluate <<<grid, block>>>(d_evaluationRes, d_nTexture_u, d_nTexture_v, d_points, pointsCnt_u, pointsCnt_v,
                                 POINT_SIZE, knots_u[knotsCnt_u - 1], knots_v[knotsCnt_v - 1], sampleCnt_u,
                                 sampleCnt_v);
    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
    if (recordTime) {
        time_cost_device = get_time() - time_cost_device;
        printf("GPU time cost of surface evaluation for %d samples: %lf\n", sampleCnt_u * sampleCnt_v,
               time_cost_device);
    }

    cudaMemcpy(h_evaluations, d_evaluationRes, resBytes, cudaMemcpyDeviceToHost);


    // 释放内存
    safeFree(h_points);
    safeFree(h_knots_u);
    safeFree(h_knots_v);
}


__host__ std::vector<std::vector<glm::vec3>>
NurbsSurface::Surface::getEvaluateVec(int sampleCnt_u, int sampleCnt_v) const {
    std::vector<std::vector<glm::vec3>> res(sampleCnt_u, std::vector<glm::vec3>(sampleCnt_v, glm::vec3()));
    for (int i = 0; i < sampleCnt_u; i++) {
        int baseIdx = i * sampleCnt_v * 3;
        for (int j = 0; j < sampleCnt_v; j++) {
            baseIdx += j * 3;
            res[i][j].x = h_evaluations[baseIdx];
            res[i][j].y = h_evaluations[baseIdx + 1];
            res[i][j].z = h_evaluations[baseIdx + 2];
//            printf("%d, %d: %f, %f, %f\n", i, j, res[i][j][0], res[i][j][1], res[i][j][2]);
        }
    }
    return res;
}

__host__ std::vector<MeshPoints3> NurbsSurface::Surface::getDerivativeVec(int sampleCnt_u, int sampleCnt_v) const {
    MeshPoints3 der_u(sampleCnt_u, LinePoints3(sampleCnt_v));
    MeshPoints3 der_v(sampleCnt_u, LinePoints3(sampleCnt_v));
    MeshPoints3 normal(sampleCnt_u, LinePoints3(sampleCnt_v));
    for (int i = 0; i < sampleCnt_u; i++) {
        int baseIdx = i * sampleCnt_v * 6;
        for (int j = 0; j < sampleCnt_v; j++) {
            baseIdx += j * 6;
            der_u[i][j].x = h_derivatives[baseIdx];
            der_u[i][j].y = h_derivatives[baseIdx + 1];
            der_u[i][j].z = h_derivatives[baseIdx + 2];
            der_v[i][j].x = h_derivatives[baseIdx + 3];
            der_v[i][j].y = h_derivatives[baseIdx + 4];
            der_v[i][j].z = h_derivatives[baseIdx + 5];
            auto baseIdxNorm = baseIdx / 2;
            normal[i][j].x = h_normals[baseIdxNorm];
            normal[i][j].y = h_normals[baseIdxNorm + 1];
            normal[i][j].z = h_normals[baseIdxNorm + 2];
            // TODO normalize 归一化在gpu中实现！
            normal[i][j] = glm::normalize(normal[i][j]);
        }
    }
    return {der_u, der_v, normal};
}

__host__ void NurbsSurface::Surface::derivative(int sampleCnt_u, int sampleCnt_v) {
    if (POINT_SIZE != controlPoints[0][0].size()) {
        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
        return;
    }

    float *d_derTexture_u = nullptr;
    float *d_derTexture_v = nullptr;
    const int pointsCnt_u = controlPoints.size(), pointsCnt_v = controlPoints[0].size();
    const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size();
    cudaMalloc((void **) &d_derTexture_u, sampleCnt_u * pointsCnt_u * sizeof(float));
    cudaMalloc((void **) &d_derTexture_v, sampleCnt_v * pointsCnt_v * sizeof(float));

    // 构造切向量计算结果
    safeCudaFree(d_derivatives);
    size_t derBytes = sampleCnt_u * sampleCnt_v * 6 * sizeof(float);
    cudaMalloc((void **) &d_derivatives, derBytes); // 每个采样所求的切向量是一个六元向量，前三位是对u的偏导、后三位是对v的偏导

    // 构造法向量计算结果
    safeCudaFree(d_normals);
    size_t normalBytes = sampleCnt_u * sampleCnt_v * 3 * sizeof(float);
    cudaMalloc((void **) &d_normals, normalBytes);

    // 构造线程层级
    dim3 block(32, 32);
    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);
    // 构造g_basisTexture线程层级
    dim3 blockTex(512);
    dim3 gridTex_u((sampleCnt_u + blockTex.x - 1) / blockTex.x);
    dim3 gridTex_v((sampleCnt_v + blockTex.x - 1) / blockTex.x);
    // 记录用时
    double time_cost_device;
    if (recordTime) time_cost_device = get_time();
    g_derTexture<<<gridTex_u, blockTex>>>(d_derTexture_u, d_nTexture1_u, d_knots_u, pointsCnt_u, knotsCnt_u,
                                          sampleCnt_u);
    g_derTexture<<<gridTex_v, blockTex>>>(d_derTexture_v, d_nTexture1_v, d_knots_v, pointsCnt_v, knotsCnt_v,
                                          sampleCnt_v);
    cudaDeviceSynchronize();
    g_derivative<<<grid, block>>>(d_derivatives, d_normals, d_derTexture_u, d_derTexture_v, d_nTexture_u, d_nTexture_v,
                                  d_points, pointsCnt_u, pointsCnt_v, POINT_SIZE, knots_u[knotsCnt_u - 1],
                                  knots_v[knotsCnt_v - 1], sampleCnt_u, sampleCnt_v);
    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
    if (recordTime) {
        time_cost_device = get_time() - time_cost_device;
        printf("GPU time cost of surface first derivative calculating for %d samples: %lf\n", sampleCnt_u * sampleCnt_v,
               time_cost_device);
    }

    // 结果数组
    safeFree(h_normals);
    h_normals = (float *) malloc(normalBytes);
    cudaMemcpy(h_normals, d_normals, normalBytes, cudaMemcpyDeviceToHost);

    safeFree(h_derivatives);
    h_derivatives = (float *) malloc(derBytes);
    cudaMemcpy(h_derivatives, d_derivatives, derBytes, cudaMemcpyDeviceToHost);

    cudaFree(d_derTexture_u);
    cudaFree(d_derTexture_v);
}

__host__ void NurbsSurface::Surface::curvature(int sampleCnt_u, int sampleCnt_v) {

    if (POINT_SIZE != controlPoints[0][0].size()) {
        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
        return;
    }

    // 构造记录每个采样点中的最大M1、M2、M3的数组
    // 这里用共享内存会更好，但使用共享内存动态分配长度总是出错（好像是因为长度过长），后续需要思考解决这个问题
    float *ms = nullptr;
    cudaMalloc((void **) &ms, sampleCnt_u * sampleCnt_v * 3 * sizeof(float));

    cudaMalloc((void **) &d_k, sizeof(float));

    // 构造线程层级
    dim3 block(32, 32);
    dim3 grid((sampleCnt_u - 1 + block.x - 1) / block.x, (sampleCnt_v - 1 + block.y - 1) / block.y);

    // 记录用时
    double time_cost_device;
    if (recordTime) time_cost_device = get_time();
    g_curvature<<<grid, block>>>(d_derivatives, sampleCnt_u, sampleCnt_v, knots_u[knots_u.size() - 1],
                                 knots_v[knots_v.size() - 1], ms, d_k);
    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
    if (recordTime) {
        time_cost_device = get_time() - time_cost_device;
        printf("GPU time cost of surface curvature calculating for %d samples: %lf\n",
               sampleCnt_u * sampleCnt_v, time_cost_device);
    }
    safeCudaFree(ms);
}

void NurbsSurface::Surface::setRecordTime(bool r) {
    recordTime = r;
}

NurbsSurface::Surface::~Surface() {
    safeCudaFree(d_nTexture_u);
    safeCudaFree(d_nTexture_v);
    safeCudaFree(d_nTexture1_u);
    safeCudaFree(d_nTexture1_v);
    safeCudaFree(d_points);
    safeCudaFree(d_knots_u);
    safeCudaFree(d_knots_v);
    safeCudaFree(d_k);
    safeCudaFree(d_evaluationRes);
    safeCudaFree(d_normals);
    safeCudaFree(d_derivatives);
    safeFree(bvh.nodes);
    safeFree(h_evaluations);
    safeFree(h_normals);
    safeFree(h_derivatives);
    cudaDeviceReset();
}

__host__ void NurbsSurface::Surface::buildBVH(int layerCnt, bool useK) {
    // TODO 构造BVH的函数不应该出现在NURBS Surface中，应该是BVH类的事情！
    // TODO NURBS Surface只需要一个函数去调用BVH对象的方法即可
    int sampleCnt_u = pow(2, layerCnt - 1) + 1, sampleCnt_v = sampleCnt_u;
    if (!useK) {
        // 必须safeFree一下，这样global函数中才能通过d_k = nullptr知道不需要再free
        safeCudaFree(d_k);
    }
    if (POINT_SIZE != controlPoints[0][0].size()) {
        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
        return;
    }
    // 构造线程层级
    dim3 block(32, 32);
    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);

//    bvh.maxLevel = max(int(ceil(log2f(sampleCnt_u - 1))) + 1, int(ceil(log2f(sampleCnt_v - 1))) + 1);
    bvh.maxLevel = layerCnt;
    bvh.size = (pow(4, bvh.maxLevel) - 1) / 3;  // 等比数列求和公示求出数总的节点数
    size_t bvhBytes = sizeof(BVHNode) * bvh.size;
    BVHNode *d_bvh = nullptr;
    cudaMalloc((void **) &d_bvh, bvhBytes);
    // 记录用时
    double time_cost_device;
    if (recordTime) time_cost_device = get_time();
    g_buildBvh<<<grid, block>>>(d_k, bvh.maxLevel, d_evaluationRes, knots_u[knots_u.size() - 1],
                                knots_v[knots_v.size() - 1], sampleCnt_u, sampleCnt_v, d_bvh);
    cudaDeviceSynchronize();
    if (recordTime) {
        time_cost_device = get_time() - time_cost_device;
        printf("GPU time cost of a %d-layer BVH building: %lf\n",
               bvh.maxLevel, time_cost_device);
    }
    // 将bvh拷贝到cpu中
    safeFree(bvh.nodes);
    bvh.nodes = (BVHNode *) malloc(bvhBytes);
    cudaMemcpy(bvh.nodes, d_bvh, bvhBytes, cudaMemcpyDeviceToHost);
    safeCudaFree(d_bvh);
//    bvh.printQuadTree();
}

__host__ void NurbsSurface::Surface::buildGaussMap(int layerCnt) {
    // TODO，构造GAUSS Map的函数不应该出现在NURBS Surface中，应该是GAUSS Map类的事情！
    int sampleCnt_u = pow(2, layerCnt - 1) + 1, sampleCnt_v = sampleCnt_u;

    if (POINT_SIZE != controlPoints[0][0].size()) {
        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
        return;
    }
    // 构造线程层级
    dim3 block(32, 32);
    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);

    gauss_map.maxLevel = layerCnt;
    gauss_map.size = (pow(4, layerCnt) - 1) / 3;  // 等比数列求和公示求出数总的节点数
    size_t gaussMapBytes = sizeof(BVHNode) * gauss_map.size;
    BVHNode *d_gaussMapTree = nullptr;
    cudaMalloc((void **) &d_gaussMapTree, gaussMapBytes);
    // 记录用时
    double time_cost_device;
    if (recordTime) time_cost_device = get_time();
    g_buildBvh<<<grid, block>>>(nullptr, layerCnt, d_normals, knots_u[knots_u.size() - 1],
                                knots_v[knots_v.size() - 1], sampleCnt_u, sampleCnt_v, d_gaussMapTree);
    cudaDeviceSynchronize();
    if (recordTime) {
        time_cost_device = get_time() - time_cost_device;
        printf("GPU time cost of a %d-layer Gauss Map building: %lf\n",
               layerCnt, time_cost_device);
    }
    safeFree(gauss_map.nodes);
    gauss_map.nodes = (BVHNode *) malloc(gaussMapBytes);
    cudaMemcpy(gauss_map.nodes, d_gaussMapTree, gaussMapBytes, cudaMemcpyDeviceToHost);
    safeCudaFree(d_gaussMapTree);
}

__host__ void NurbsSurface::recursiveGetOverlapLeafNodes(const BVH &bvh1, const BVH &bvh2, int idx1, int idx2,
                                                         std::vector<std::pair<int, int>> &pairs) {
    auto A = bvh1.nodes[idx1];
    auto B = bvh2.nodes[idx2];
    auto AABBSize = [](const AABB &aabb) {
        return (aabb.pMax.z - aabb.pMin.z) *
               (aabb.pMax.y - aabb.pMin.y) *
               (aabb.pMax.x - aabb.pMin.x);
    };
    // 两个包围盒不相交，返回
    if (!A.bounds.IsOverlap(B.bounds)) return;
    // 相交
    if (A.firstChild == -1 && B.firstChild == -1) {
        // 两者都是叶子节点
        pairs.emplace_back(idx1, idx2);
    } else if (A.firstChild != -1 && B.firstChild == -1) {
        // A是中间结点，B是叶子结点
        for (int i = 0; i < 4; i++) recursiveGetOverlapLeafNodes(bvh1, bvh2, A.firstChild + i, idx2, pairs);
    } else if (A.firstChild == -1 && B.firstChild != -1) {
        // A是叶子结点，B是中间结点
        for (int i = 0; i < 4; i++) recursiveGetOverlapLeafNodes(bvh1, bvh2, idx1, B.firstChild + i, pairs);
    } else {
        // 都是中间结点
        if (AABBSize(A.bounds) > AABBSize(B.bounds)) {
            // A的包围盒更大
            for (int i = 0; i < 4; i++) recursiveGetOverlapLeafNodes(bvh1, bvh2, A.firstChild + i, idx2, pairs);
        } else {
            // B的包围盒更大
            for (int i = 0; i < 4; i++) recursiveGetOverlapLeafNodes(bvh1, bvh2, idx1, B.firstChild + i, pairs);
        }
    }
}


__host__ std::vector<boxPair>
NurbsSurface::getOverlappedLeafNodes(const BVH &bvh1, const BVH &bvh2) {
    std::vector<idx2> resPairs;
    // 记录用时
    double time_cost_device = get_time();

    recursiveGetOverlapLeafNodes(bvh1, bvh2, 0, 0, resPairs);
    std::vector<boxPair> boxPairsIdx2(resPairs.size());
    for (int i = 0; i < resPairs.size(); i++) {
        boxPairsIdx2[i] = {{bvh1.nodes[resPairs[i].first].idx_u,  bvh1.nodes[resPairs[i].first].idx_v},
                           {bvh2.nodes[resPairs[i].second].idx_u, bvh2.nodes[resPairs[i].second].idx_v}};
    }

    time_cost_device = get_time() - time_cost_device;
    printf("CPU time cost for recursively calculating the overlapped leaf nodes: %lf\n", time_cost_device);

    return boxPairsIdx2;
}

__host__ bool
NurbsSurface::isGaussMapsOverlapped(const BVH &gm1, const BVH &gm2, std::pair<int, int> idxRange_u1,
                                    std::pair<int, int> idxRange_v1, std::pair<int, int> idxRange_u2,
                                    std::pair<int, int> idxRange_v2) {
    if (gm1.maxLevel != gm2.maxLevel || gm1.maxLevel <= 0) {
        printf("BVH Layer error!\n");
        return false;
    }
    int commonMaxLayer = gm1.maxLevel;
    int edgeCellCnt = pow(2, commonMaxLayer - 1);
    if (idxRange_u1.first < 0 || idxRange_u2.first < 0 || idxRange_v1.first < 0 || idxRange_v2.first < 0 ||
        idxRange_u1.second >= edgeCellCnt || idxRange_u2.second >= edgeCellCnt ||
        idxRange_v1.second >= edgeCellCnt || idxRange_v2.second >= edgeCellCnt) {
        printf("Error when detecting overlapping: idx range invalid!\n");
        return false;
    }

    auto getRangedBox = [&commonMaxLayer](const BVH &bvh, const std::pair<int, int> &idxRange_u,
                                          const std::pair<int, int> idxRange_v) {
        // 获取某个范围的gauss map的aabb
        AABB bounding;
        for (int i = idxRange_u.first; i <= idxRange_u.second; ++i) {
            for (int j = idxRange_v.first; j <= idxRange_v.second; ++j) {
                bounding = bounding.Union(
                        bvh.nodes[getStartIdxOfLayerN(commonMaxLayer) + h_getChildNodeIdx(i, j)].bounds);
            }
        }
        return bounding;
    };
    return getRangedBox(gm1, idxRange_u1, idxRange_v1)
            .IsOverlap(getRangedBox(gm2, idxRange_u2, idxRange_v2));
}

__host__ bool NurbsSurface::isGaussMapsOverlapped(const BVH &gm1, const BVH &gm2, std::pair<float, float> range_u1,
                                                  std::pair<float, float> range_v1, std::pair<float, float> range_u2,
                                                  std::pair<float, float> range_v2,
                                                  std::pair<float, float> paramRange_u1,
                                                  std::pair<float, float> paramRange_v1,
                                                  std::pair<float, float> paramRange_u2,
                                                  std::pair<float, float> paramRange_v2) {
    if (gm1.maxLevel != gm2.maxLevel || gm1.maxLevel <= 0) {
        printf("BVH Layer error!\n");
        return false;
    }
    int edgeCellCnt = pow(2, gm1.maxLevel - 1);
    // 根据所给参数的范围和参数的定义域范围，获得对应的采样网格中的范围
    auto getIdxRange = [](std::pair<float, float> range, std::pair<float, float> paramRange, int edgeCellCnt) {
        float paramStep = (paramRange.second - paramRange.first) / edgeCellCnt;
        return std::pair<int, int>({int((range.first - paramRange.first) / paramStep),
                                    int((range.second - paramRange.first) / paramStep)});
    };
    auto idxRange_u1 = getIdxRange(range_u1, paramRange_u1, edgeCellCnt);
    auto idxRange_v1 = getIdxRange(range_v1, paramRange_v1, edgeCellCnt);
    auto idxRange_u2 = getIdxRange(range_u2, paramRange_u2, edgeCellCnt);
    auto idxRange_v2 = getIdxRange(range_v2, paramRange_v2, edgeCellCnt);
    return isGaussMapsOverlapped(gm1, gm2, idxRange_u1, idxRange_v1, idxRange_u2, idxRange_v2);
}

__host__ void
NurbsSurface::Surface::recursiveGetRayBVHIntersection(const glm::vec3 dir, const glm::vec3 startPoint, const int idx,
                                                      std::vector<BVHNode> &intersectionLeafNodes) {
    auto bvhNode = bvh.nodes[idx];
    // 射线与AABB判交
    auto isRayBoxIntersect = [&]() {
        const auto &box = bvhNode.bounds;
        float t; // 射线的参数，当t<=0，表示线面交点不在视平面前方，视为没有交点
        if (dir.x != 0.) {
            // 当x分量不为0，则射线会与AABB中垂直于x轴的平面可能有交
            // 注意x的正负。x为正会先遇到较小点所在平面
            if (dir.x > 0) t = (box.pMin.x - startPoint.x) / dir.x;
            else t = (box.pMax.x - startPoint.x) / dir.x;
            if (t > 0.) {
                // 射线与平面在前方有交点。但交点不一定在盒子上
                auto tmpPt = startPoint + t * dir; // 交点
                if (box.pMin.y <= tmpPt.y && box.pMin.z <= tmpPt.z && box.pMax.y >= tmpPt.y && box.pMax.z >= tmpPt.z)
                    return true;
            }
        }
        if (dir.y != 0.) {
            // 同上测试y方向
            if (dir.y > 0) t = (box.pMin.y - startPoint.y) / dir.y;
            else t = (box.pMax.y - startPoint.y) / dir.y;
            if (t > 0.) {
                auto tmpPt = startPoint + t * dir;
                if (box.pMin.x <= tmpPt.x && box.pMin.z <= tmpPt.z && box.pMax.x >= tmpPt.x && box.pMax.z >= tmpPt.z)
                    return true;
            }
        }
        if (dir.z != 0.) {
            // 同上测试z方向
            if (dir.z > 0) t = (box.pMin.z - startPoint.z) / dir.z;
            else t = (box.pMax.z - startPoint.z) / dir.z;
            if (t > 0.) {
                auto tmpPt = startPoint + t * dir;
                if (box.pMin.x <= tmpPt.x && box.pMin.y <= tmpPt.y && box.pMax.x >= tmpPt.x && box.pMax.y >= tmpPt.y)
                    return true;
            }
        }
        return false;
    };

    // 不相交
    if (!isRayBoxIntersect()) return;
    // 相交
    if (bvhNode.firstChild == -1) {
        // 与叶节点相交
        intersectionLeafNodes.emplace_back(bvhNode);
    } else {
        // 与父节点相交
        for (int i = 0; i < 4; i++)
            recursiveGetRayBVHIntersection(dir, startPoint, bvhNode.firstChild + i, intersectionLeafNodes);
    }
}

__host__ std::vector<BVHNode> NurbsSurface::Surface::rayBVHIntersection(glm::vec3 dir, glm::vec3 startPoint) {
    std::vector<BVHNode> res;
    recursiveGetRayBVHIntersection(dir, startPoint, 0, res);
    //TODO sort res by t

//    res.emplace_back(BVHNode());
//    printf("res size: %lld\n", res.size());
    return res;
}