NurbsPerformer/src/device/Nurbs/nurbs_surface.cu


								//

								// Created by 14727 on 2022/12/9.

								//


								#include "device/Nurbs/nurbs_surface.cuh"

								#include "device/Nurbs/nurbs_common.cuh"

								#include "utils.h"

								#include "device/Nurbs/bvh.cuh"

								#include "device/device_utils.cuh"

								#include "tinynurbs/tinynurbs.h"


								/**
								 * Evaluates the NURBS surface at every point of a regular sample grid.
								 *
								 * Launch layout: 2D grid of 2D blocks. Thread (ix, iy) produces sample (ix, iy);
								 * threads outside the d_sampleCnt_u x d_sampleCnt_v grid return immediately.
								 *
								 * @param res            output, 3 floats (x, y, z) per sample at (ix * d_sampleCnt_v + iy) * 3
								 * @param d_nTexture_u   basis values in u, read at [ix * d_pointsCnt_u + i]
								 * @param d_nTexture_v   basis values in v, read at [iy * d_pointsCnt_v + j]
								 * @param d_points       control points, d_POINT_SIZE floats each, point (i, j) stored at
								 *                       (i * d_pointsCnt_v + j) * d_POINT_SIZE; components 0..2 are used as
								 *                       coordinates and component 3 as the weight
								 * @param d_pointsCnt_u  number of control points along u
								 * @param d_pointsCnt_v  number of control points along v
								 * @param d_POINT_SIZE   number of floats per control point
								 * @param d_lastKnot_u   not read any more; kept so the kernel signature stays unchanged
								 * @param d_lastKnot_v   not read any more; kept so the kernel signature stays unchanged
								 * @param d_sampleCnt_u  number of samples along u
								 * @param d_sampleCnt_v  number of samples along v
								 */
								__global__ void
								NurbsSurface::g_evaluate(float *res, const float *d_nTexture_u, const float *d_nTexture_v, const float *d_points,
								                         int d_pointsCnt_u,
								                         int d_pointsCnt_v, int d_POINT_SIZE, float d_lastKnot_u, float d_lastKnot_v, int d_sampleCnt_u,
								                         int d_sampleCnt_v) {
								    // 2D grid of 2D blocks
								    const int ix = blockIdx.x * blockDim.x + threadIdx.x;
								    const int iy = blockIdx.y * blockDim.y + threadIdx.y;

								    // Guard on the sample indices themselves. Comparing the derived parameter value
								    // against the last knot breaks down when the sample count is 1 (division by zero
								    // yields NaN, which passes a '>' test) or when the last knot is not positive, and
								    // float rounding could also reject the last valid row.
								    if (ix >= d_sampleCnt_u || iy >= d_sampleCnt_v) {
								        return;
								    }

								    // Accumulate the weighted control points and the sum of weights.
								    float x = 0.f, y = 0.f, z = 0.f, sumW = 0.f;
								    for (int i = 0; i < d_pointsCnt_u; i++) {
								        const float N_U = d_nTexture_u[ix * d_pointsCnt_u + i];
								        for (int j = 0; j < d_pointsCnt_v; j++) {
								            const float N_V = d_nTexture_v[iy * d_pointsCnt_v + j];
								            const int idx = (i * d_pointsCnt_v + j) * d_POINT_SIZE;
								            const float coef = N_U * N_V * d_points[idx + 3];
								            x += coef * d_points[idx];
								            y += coef * d_points[idx + 1];
								            z += coef * d_points[idx + 2];
								            sumW += coef;
								        }
								    }

								    // Rational division by the accumulated weight.
								    const int baseIdx = (ix * d_sampleCnt_v + iy) * 3;
								    res[baseIdx] = x / sumW;
								    res[baseIdx + 1] = y / sumW;
								    res[baseIdx + 2] = z / sumW;
								}


								/**
								 * Computes the first partial derivatives and the unit normal of the surface at every
								 * sample of a regular grid.
								 *
								 * Launch layout: 2D grid of 2D blocks. Thread (ix, iy) handles sample (ix, iy);
								 * threads outside the d_sampleCnt_u x d_sampleCnt_v grid return immediately.
								 *
								 * @param derivatives   output, 6 floats per sample at (ix * d_sampleCnt_v + iy) * 6:
								 *                      the u-derivative (x, y, z) followed by the v-derivative (x, y, z)
								 * @param normals       output, 3 floats per sample at (ix * d_sampleCnt_v + iy) * 3:
								 *                      cross product of the two derivatives, passed through normalization()
								 * @param derTexture_u  basis derivative values in u, read at [ix * d_pointsCnt_u + i]
								 * @param derTexture_v  basis derivative values in v, read at [iy * d_pointsCnt_v + j]
								 * @param nTexture_u    basis values in u, read at [ix * d_pointsCnt_u + i]
								 * @param nTexture_v    basis values in v, read at [iy * d_pointsCnt_v + j]
								 * @param d_points      control points, d_POINT_SIZE floats each; components 0..2 are used
								 *                      as coordinates and component 3 as the weight
								 * @param d_pointsCnt_u number of control points along u
								 * @param d_pointsCnt_v number of control points along v
								 * @param d_POINT_SIZE  number of floats per control point
								 * @param d_lastKnot_u  not read by this kernel
								 * @param d_lastKnot_v  not read by this kernel
								 * @param d_sampleCnt_u number of samples along u
								 * @param d_sampleCnt_v number of samples along v
								 */
								__global__ void
								NurbsSurface::g_derivative(float *derivatives, float *normals, const float *derTexture_u, const float *derTexture_v,
								                           const float *nTexture_u, const float *nTexture_v, const float *d_points, int d_pointsCnt_u,
								                           int d_pointsCnt_v, int d_POINT_SIZE, float d_lastKnot_u, float d_lastKnot_v,
								                           int d_sampleCnt_u, int d_sampleCnt_v) {
								    // 2D grid of 2D blocks
								    const int ix = blockIdx.x * blockDim.x + threadIdx.x;
								    const int iy = blockIdx.y * blockDim.y + threadIdx.y;

								    if (ix >= d_sampleCnt_u || iy >= d_sampleCnt_v) {
								        return;
								    }

								    // Weighted sums of the control points using the basis derivatives (numerator
								    // derivatives) and their weight-only counterparts (denominator derivatives).
								    float nubsPdx_u = 0.f, nubsPdy_u = 0.f, nubsPdz_u = 0.f, nubsPdw_u = 0.f;
								    float nubsPdx_v = 0.f, nubsPdy_v = 0.f, nubsPdz_v = 0.f, nubsPdw_v = 0.f;
								    // Weighted sums using the plain basis values (numerator and denominator).
								    float x = 0.f, y = 0.f, z = 0.f, w = 0.f;

								    for (int i = 0; i < d_pointsCnt_u; i++) {
								        const float N_U = nTexture_u[ix * d_pointsCnt_u + i];
								        const float dN_U = derTexture_u[ix * d_pointsCnt_u + i];
								        // The inner loop walks the v direction, so it must be bounded by the v count.
								        for (int j = 0; j < d_pointsCnt_v; j++) {
								            const float N_V = nTexture_v[iy * d_pointsCnt_v + j];
								            const float dN_V = derTexture_v[iy * d_pointsCnt_v + j];
								            const int idx = (i * d_pointsCnt_v + j) * d_POINT_SIZE;
								            const float px = d_points[idx];
								            const float py = d_points[idx + 1];
								            const float pz = d_points[idx + 2];
								            const float wij = d_points[idx + 3];

								            const float factor_u = dN_U * N_V;
								            nubsPdx_u += factor_u * wij * px;
								            nubsPdy_u += factor_u * wij * py;
								            nubsPdz_u += factor_u * wij * pz;
								            nubsPdw_u += factor_u * wij;

								            const float factor_v = dN_V * N_U;
								            nubsPdx_v += factor_v * wij * px;
								            nubsPdy_v += factor_v * wij * py;
								            nubsPdz_v += factor_v * wij * pz;
								            nubsPdw_v += factor_v * wij;

								            x += N_U * N_V * wij * px;
								            y += N_U * N_V * wij * py;
								            z += N_U * N_V * wij * pz;
								            w += N_U * N_V * wij;
								        }
								    }

								    // Quotient rule: (A / w)' = (A' * w - A * w') / w^2
								    const float w2 = w * w;
								    const float pdx_u = (nubsPdx_u * w - x * nubsPdw_u) / w2;
								    const float pdy_u = (nubsPdy_u * w - y * nubsPdw_u) / w2;
								    const float pdz_u = (nubsPdz_u * w - z * nubsPdw_u) / w2;

								    const float pdx_v = (nubsPdx_v * w - x * nubsPdw_v) / w2;
								    const float pdy_v = (nubsPdy_v * w - y * nubsPdw_v) / w2;
								    const float pdz_v = (nubsPdz_v * w - z * nubsPdw_v) / w2;

								    int baseIdx = (ix * d_sampleCnt_v + iy) * 6;
								    derivatives[baseIdx] = pdx_u;
								    derivatives[baseIdx + 1] = pdy_u;
								    derivatives[baseIdx + 2] = pdz_u;
								    derivatives[baseIdx + 3] = pdx_v;
								    derivatives[baseIdx + 4] = pdy_v;
								    derivatives[baseIdx + 5] = pdz_v;

								    // The normal is the cross product of the u- and v-derivatives.
								    baseIdx = (ix * d_sampleCnt_v + iy) * 3;
								    normals[baseIdx] = pdy_u * pdz_v - pdy_v * pdz_u;
								    normals[baseIdx + 1] = pdx_v * pdz_u - pdx_u * pdz_v;
								    normals[baseIdx + 2] = pdx_u * pdy_v - pdx_v * pdy_u;
								    normalization(normals[baseIdx], normals[baseIdx + 1], normals[baseIdx + 2]);
								}


								/**
								 * Estimates second derivatives by finite differences of the first derivatives and
								 * reduces them to a single scalar written to *k.
								 *
								 * Launch layout: 2D grid of 2D blocks. Thread (ix, iy) handles sample (ix, iy);
								 * threads outside the sampleCnt_u x sampleCnt_v grid return immediately.
								 *
								 * NOTE(review): the max-reduction below is synchronised with __syncthreads() only,
								 * which is a block-level barrier. The result is therefore only reliable when the
								 * whole sample grid is covered by a single thread block; larger grids need a
								 * separate reduction pass (or atomics) - confirm how callers size the launch.
								 *
								 * NOTE(review): m1/m2/m3 are the maxima of the signed components, exactly as in the
								 * original code; if a magnitude bound is intended, absolute values or vector norms
								 * would be required - confirm against the curvature bound being implemented.
								 *
								 * @param derivatives  first derivatives, 6 floats per sample at (ix * sampleCnt_v + iy) * 6
								 *                     (u-derivative x, y, z then v-derivative x, y, z)
								 * @param sampleCnt_u  number of samples along u
								 * @param sampleCnt_v  number of samples along v
								 * @param lastKnot_u   last knot in u; the u sample spacing is lastKnot_u / (sampleCnt_u - 1)
								 * @param lastKnot_v   last knot in v; the v sample spacing is lastKnot_v / (sampleCnt_v - 1)
								 * @param ms           scratch buffer, 3 floats per sample; after the reduction ms[0..2]
								 *                     hold the maxima of m1, m2 and m3
								 * @param k            output scalar
								 */
								__global__ void
								NurbsSurface::g_curvature(const float *derivatives, const int sampleCnt_u, const int sampleCnt_v, float lastKnot_u,
								                          float lastKnot_v, float *ms, float *k) {
								    // 2D grid of 2D blocks
								    const int ix = blockIdx.x * blockDim.x + threadIdx.x;
								    const int iy = blockIdx.y * blockDim.y + threadIdx.y;

								    if (ix >= sampleCnt_u || iy >= sampleCnt_v) {
								        return;
								    }

								    const float step_u = lastKnot_u / (sampleCnt_u - 1), step_v = lastKnot_v / (sampleCnt_v - 1);

								    // Neighbour selection: one-sided differences on the border, central differences
								    // inside. The divisor is the parameter distance between the two neighbours.
								    const int loU = (ix == 0) ? ix : ix - 1;
								    const int hiU = (ix == sampleCnt_u - 1) ? ix : ix + 1;
								    const int loV = (iy == 0) ? iy : iy - 1;
								    const int hiV = (iy == sampleCnt_v - 1) ? iy : iy + 1;
								    const float span_u = (hiU - loU) * step_u;
								    const float span_v = (hiV - loV) * step_v;

								    const int loIdx_u = (loU * sampleCnt_v + iy) * 6, hiIdx_u = (hiU * sampleCnt_v + iy) * 6;
								    const int loIdx_v = (ix * sampleCnt_v + loV) * 6, hiIdx_v = (ix * sampleCnt_v + hiV) * 6;

								    // alongU[0..2] = d/du of the u-derivative (uu), alongU[3..5] = d/du of the v-derivative (vu)
								    // alongV[0..2] = d/dv of the u-derivative (uv), alongV[3..5] = d/dv of the v-derivative (vv)
								    float alongU[6], alongV[6];
								    for (int c = 0; c < 6; c++) {
								        alongU[c] = (derivatives[hiIdx_u + c] - derivatives[loIdx_u + c]) / span_u;
								        alongV[c] = (derivatives[hiIdx_v + c] - derivatives[loIdx_v + c]) / span_v;
								    }

								    // The mixed derivative is the average of the two finite-difference estimates.
								    const float uvx = (alongV[0] + alongU[3]) / 2;
								    const float uvy = (alongV[1] + alongU[4]) / 2;
								    const float uvz = (alongV[2] + alongU[5]) / 2;

								    const float m1 = max(max(alongU[0], alongU[1]), alongU[2]);
								    const float m2 = max(max(uvx, uvy), uvz);
								    const float m3 = max(max(alongV[3], alongV[4]), alongV[5]);

								    const int self = (ix * sampleCnt_v + iy) * 3;
								    ms[self] = m1;
								    ms[self + 1] = m2;
								    ms[self + 2] = m3;
								    __syncthreads();

								    // Max-reduction along u: after this loop row ix == 0 holds the column maxima.
								    // 'len' is the number of rows still holding live values; each step folds the upper
								    // half onto the lower half. The loop trip count depends only on sampleCnt_u, so
								    // every remaining thread reaches each barrier.
								    for (int len = sampleCnt_u; len > 1;) {
								        const int half = (len + 1) / 2;
								        if (ix < half && ix + half < len) {
								            const int other = ((ix + half) * sampleCnt_v + iy) * 3;
								            ms[self] = max(ms[self], ms[other]);
								            ms[self + 1] = max(ms[self + 1], ms[other + 1]);
								            ms[self + 2] = max(ms[self + 2], ms[other + 2]);
								        }
								        __syncthreads();
								        len = half;
								    }

								    // Max-reduction along v, restricted to row ix == 0 so that only one thread per
								    // column touches ms[iy * 3 ...].
								    for (int len = sampleCnt_v; len > 1;) {
								        const int half = (len + 1) / 2;
								        if (ix == 0 && iy < half && iy + half < len) {
								            ms[iy * 3] = max(ms[iy * 3], ms[(iy + half) * 3]);
								            ms[iy * 3 + 1] = max(ms[iy * 3 + 1], ms[(iy + half) * 3 + 1]);
								            ms[iy * 3 + 2] = max(ms[iy * 3 + 2], ms[(iy + half) * 3 + 2]);
								        }
								        __syncthreads();
								        len = half;
								    }

								    // A single thread publishes the result.
								    if (ix == 0 && iy == 0) {
								        const int n = sampleCnt_u - 1;
								        const int m = sampleCnt_v - 1;
								        *k = (ms[0] / (n * n) + 2 * ms[1] / (n * m) + ms[2] / (m * m)) / 8;
								    }
								}


								/**
								 * Builds a surface from its control net and the two knot vectors.
								 *
								 * All device and host result buffers start out as nullptr; they are allocated by
								 * evaluate(), derivative(), curvature(), buildBVH() and buildGaussMap().
								 *
								 * @param controlPoints  control net; moved into the object
								 * @param knots_u        knot vector in u; moved into the object
								 * @param knots_v        knot vector in v; moved into the object
								 */
								__host__ NurbsSurface::Surface::Surface(MeshPoints4 controlPoints, std::vector<float> knots_u,
								                                        std::vector<float> knots_v) {
								    this->knots_u = std::move(knots_u);
								    this->knots_v = std::move(knots_v);
								    this->controlPoints = std::move(controlPoints);
								    recordTime = false;

								    // Device buffers
								    d_nTexture_u = nullptr;
								    d_nTexture_v = nullptr;
								    d_nTexture1_u = nullptr;
								    d_nTexture1_v = nullptr;
								    d_knots_u = nullptr;
								    d_knots_v = nullptr;
								    d_points = nullptr;
								    d_evaluationRes = nullptr;
								    d_derivatives = nullptr;
								    d_k = nullptr;
								    d_normals = nullptr;

								    // Host-side trees. buildGaussMap() calls safeFree(gauss_map.nodes) before its
								    // first allocation, so that pointer has to start out as nullptr as well.
								    bvh.nodes = nullptr;
								    gauss_map.nodes = nullptr;

								    // Host result buffers
								    h_evaluations = nullptr;
								    h_derivatives = nullptr;
								    h_normals = nullptr;
								}


								/**
								 * Uploads the control points and knot vectors, builds the basis-function tables on
								 * the device and evaluates the surface on a sampleCnt_u x sampleCnt_v grid.
								 *
								 * The result is left in d_evaluationRes on the device and copied to h_evaluations
								 * (3 floats per sample). The device buffers created here (d_points, d_knots_*,
								 * d_nTexture_*, d_nTexture1_*) are kept for derivative() and released by the
								 * destructor. Buffers left over from an earlier call are released first, so the
								 * method can be called repeatedly without leaking device memory.
								 *
								 * @param sampleCnt_u  number of samples along u
								 * @param sampleCnt_v  number of samples along v
								 */
								__host__ void NurbsSurface::Surface::evaluate(int sampleCnt_u, int sampleCnt_v) {
								    if (controlPoints.empty() || controlPoints[0].empty() || knots_u.empty() || knots_v.empty()) {
								        printf("Error! evaluate() needs a non-empty control net and non-empty knot vectors\n");
								        return;
								    }

								    // Flatten the control points into one contiguous array and upload it.
								    const int pointsCnt_u = controlPoints.size(), pointsCnt_v = controlPoints[0].size();
								    // Sized with POINT_SIZE floats per point so the buffer matches the indexing used
								    // below and in the kernels.
								    std::vector<float> h_points(static_cast<size_t>(pointsCnt_u) * pointsCnt_v * POINT_SIZE);
								    for (int i = 0; i < pointsCnt_u; i++) {
								        for (int j = 0; j < pointsCnt_v; j++) {
								            for (int k = 0; k < POINT_SIZE; k++) {
								                h_points[(i * pointsCnt_v + j) * POINT_SIZE + k] = controlPoints[i][j][k];
								            }
								        }
								    }
								    const size_t pointsBytes = h_points.size() * sizeof(float);
								    safeCudaFree(d_points);
								    cudaMalloc((void **) &d_points, pointsBytes);
								    cudaMemcpy(d_points, h_points.data(), pointsBytes, cudaMemcpyHostToDevice);

								    // Upload the knot vectors straight from the host vectors.
								    const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size();
								    const size_t knotsBytes_u = knotsCnt_u * sizeof(float), knotsBytes_v = knotsCnt_v * sizeof(float);
								    safeCudaFree(d_knots_u);
								    safeCudaFree(d_knots_v);
								    cudaMalloc((void **) &d_knots_u, knotsBytes_u);
								    cudaMalloc((void **) &d_knots_v, knotsBytes_v);
								    cudaMemcpy(d_knots_u, knots_u.data(), knotsBytes_u, cudaMemcpyHostToDevice);
								    cudaMemcpy(d_knots_v, knots_v.data(), knotsBytes_v, cudaMemcpyHostToDevice);

								    // Basis-function tables: one row per sample, one column per control point.
								    safeCudaFree(d_nTexture_u);
								    safeCudaFree(d_nTexture_v);
								    cudaMalloc((void **) &d_nTexture_u, sampleCnt_u * pointsCnt_u * sizeof(float));
								    cudaMalloc((void **) &d_nTexture_v, sampleCnt_v * pointsCnt_v * sizeof(float));

								    // Second table with one extra column per sample; the original note says the
								    // derivative computation needs the basis value at index pointsCnt + 1.
								    safeCudaFree(d_nTexture1_u);
								    safeCudaFree(d_nTexture1_v);
								    cudaMalloc((void **) &d_nTexture1_u, sampleCnt_u * (pointsCnt_u + 1) * sizeof(float));
								    cudaMalloc((void **) &d_nTexture1_v, sampleCnt_v * (pointsCnt_v + 1) * sizeof(float));

								    // Result buffers on device and host.
								    const size_t resBytes = static_cast<size_t>(sampleCnt_u) * sampleCnt_v * 3 * sizeof(float);
								    safeCudaFree(d_evaluationRes);
								    cudaMalloc((void **) &d_evaluationRes, resBytes);
								    safeFree(h_evaluations);
								    h_evaluations = (float *) malloc(resBytes);

								    // Launch configuration for g_basisTexture: one thread per sample, 1D.
								    dim3 blockBasis(512);
								    dim3 gridBasis_u((sampleCnt_u + blockBasis.x - 1) / blockBasis.x);
								    dim3 gridBasis_v((sampleCnt_v + blockBasis.x - 1) / blockBasis.x);

								    // Launch configuration for g_evaluate: one thread per sample, 2D.
								    dim3 block(32, 32);
								    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);

								    // Timing
								    double time_cost_device = 0.0;
								    if (recordTime) time_cost_device = get_time();

								    // All three launches go to the same stream, so they execute in order and a single
								    // synchronisation at the end is sufficient.
								    g_basisTexture<<<gridBasis_u, blockBasis>>>(d_nTexture_u, d_nTexture1_u, d_knots_u, pointsCnt_u, knotsCnt_u,
								                                                sampleCnt_u);
								    g_basisTexture<<<gridBasis_v, blockBasis>>>(d_nTexture_v, d_nTexture1_v, d_knots_v, pointsCnt_v, knotsCnt_v,
								                                                sampleCnt_v);
								    g_evaluate<<<grid, block>>>(d_evaluationRes, d_nTexture_u, d_nTexture_v, d_points, pointsCnt_u, pointsCnt_v,
								                                POINT_SIZE, knots_u.back(), knots_v.back(), sampleCnt_u,
								                                sampleCnt_v);
								    cudaDeviceSynchronize(); // wait for all threads before reading the end time
								    if (recordTime) {
								        time_cost_device = get_time() - time_cost_device;
								        printf("GPU time cost of surface evaluation for %d samples: %lf\n", sampleCnt_u * sampleCnt_v,
								               time_cost_device);
								    }

								    cudaMemcpy(h_evaluations, d_evaluationRes, resBytes, cudaMemcpyDeviceToHost);
								}


								/**
								 * Repackages the flat host evaluation buffer as a sampleCnt_u x sampleCnt_v grid of
								 * points.
								 *
								 * The sample counts must be the ones passed to the evaluate() call that filled
								 * h_evaluations. If evaluate() has not been run yet an error is printed and the
								 * default-constructed grid is returned.
								 *
								 * @param sampleCnt_u  number of samples along u
								 * @param sampleCnt_v  number of samples along v
								 * @return res[i][j] = evaluated point of sample (i, j)
								 */
								__host__ std::vector<std::vector<glm::vec3>>
								NurbsSurface::Surface::getEvaluateVec(int sampleCnt_u, int sampleCnt_v) const {
								    std::vector<std::vector<glm::vec3>> res(sampleCnt_u, std::vector<glm::vec3>(sampleCnt_v, glm::vec3()));
								    if (h_evaluations == nullptr) {
								        printf("Error! getEvaluateVec() must be called after evaluate()\n");
								        return res;
								    }
								    for (int i = 0; i < sampleCnt_u; i++) {
								        for (int j = 0; j < sampleCnt_v; j++) {
								            // Sample (i, j) starts at (i * sampleCnt_v + j) * 3. The offset is computed
								            // from scratch each time: accumulating j * 3 would drift to 0, 3, 9, 18, ...
								            const int baseIdx = (i * sampleCnt_v + j) * 3;
								            res[i][j].x = h_evaluations[baseIdx];
								            res[i][j].y = h_evaluations[baseIdx + 1];
								            res[i][j].z = h_evaluations[baseIdx + 2];
								        }
								    }
								    return res;
								}


								/**
								 * Repackages the flat host derivative and normal buffers as three
								 * sampleCnt_u x sampleCnt_v grids.
								 *
								 * The sample counts must be the ones passed to the derivative() call that filled
								 * h_derivatives and h_normals. If derivative() has not been run yet an error is
								 * printed and the default-constructed grids are returned.
								 *
								 * @param sampleCnt_u  number of samples along u
								 * @param sampleCnt_v  number of samples along v
								 * @return {u-derivatives, v-derivatives, normalised normals}
								 */
								__host__ std::vector<MeshPoints3> NurbsSurface::Surface::getDerivativeVec(int sampleCnt_u, int sampleCnt_v) const {
								    MeshPoints3 der_u(sampleCnt_u, LinePoints3(sampleCnt_v));
								    MeshPoints3 der_v(sampleCnt_u, LinePoints3(sampleCnt_v));
								    MeshPoints3 normal(sampleCnt_u, LinePoints3(sampleCnt_v));
								    if (h_derivatives == nullptr || h_normals == nullptr) {
								        printf("Error! getDerivativeVec() must be called after derivative()\n");
								        return {der_u, der_v, normal};
								    }
								    for (int i = 0; i < sampleCnt_u; i++) {
								        for (int j = 0; j < sampleCnt_v; j++) {
								            // Offsets are computed from scratch for every sample; accumulating j * 6
								            // across the inner loop would drift to 0, 6, 18, 36, ...
								            const int sample = i * sampleCnt_v + j;
								            const int baseIdx = sample * 6;      // 6 floats per sample: d/du then d/dv
								            const int baseIdxNorm = sample * 3;  // 3 floats per sample
								            der_u[i][j].x = h_derivatives[baseIdx];
								            der_u[i][j].y = h_derivatives[baseIdx + 1];
								            der_u[i][j].z = h_derivatives[baseIdx + 2];
								            der_v[i][j].x = h_derivatives[baseIdx + 3];
								            der_v[i][j].y = h_derivatives[baseIdx + 4];
								            der_v[i][j].z = h_derivatives[baseIdx + 5];
								            normal[i][j].x = h_normals[baseIdxNorm];
								            normal[i][j].y = h_normals[baseIdxNorm + 1];
								            normal[i][j].z = h_normals[baseIdxNorm + 2];
								            // TODO: normalisation should be done on the GPU.
								            normal[i][j] = glm::normalize(normal[i][j]);
								        }
								    }
								    return {der_u, der_v, normal};
								}


								/**
								 * Computes first partial derivatives and normals on a sampleCnt_u x sampleCnt_v
								 * grid and copies them to h_derivatives (6 floats per sample) and h_normals
								 * (3 floats per sample). The device copies stay in d_derivatives and d_normals.
								 *
								 * Relies on the device buffers created by evaluate() (control points, knots and
								 * basis tables), so evaluate() has to be called first.
								 * NOTE(review): the basis tables are indexed per sample, so the sample counts
								 * presumably have to match the ones given to evaluate() - confirm with callers.
								 *
								 * @param sampleCnt_u  number of samples along u
								 * @param sampleCnt_v  number of samples along v
								 */
								__host__ void NurbsSurface::Surface::derivative(int sampleCnt_u, int sampleCnt_v) {
								    if (controlPoints.empty() || controlPoints[0].empty() || knots_u.empty() || knots_v.empty()) {
								        printf("Error! derivative() needs a non-empty control net and non-empty knot vectors\n");
								        return;
								    }
								    if (POINT_SIZE != controlPoints[0][0].size()) {
								        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
								        return;
								    }
								    // Every one of these buffers is handed to a kernel below; launching with a null
								    // pointer would fault on the device.
								    if (d_points == nullptr || d_knots_u == nullptr || d_knots_v == nullptr ||
								        d_nTexture_u == nullptr || d_nTexture_v == nullptr ||
								        d_nTexture1_u == nullptr || d_nTexture1_v == nullptr) {
								        printf("Error! derivative() must be called after evaluate()\n");
								        return;
								    }

								    const int pointsCnt_u = controlPoints.size(), pointsCnt_v = controlPoints[0].size();
								    const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size();

								    // Temporary basis-derivative tables, released at the end of this call.
								    float *d_derTexture_u = nullptr;
								    float *d_derTexture_v = nullptr;
								    cudaMalloc((void **) &d_derTexture_u, sampleCnt_u * pointsCnt_u * sizeof(float));
								    cudaMalloc((void **) &d_derTexture_v, sampleCnt_v * pointsCnt_v * sizeof(float));

								    // Tangent output: six floats per sample, d/du (x, y, z) followed by d/dv (x, y, z).
								    const size_t derBytes = static_cast<size_t>(sampleCnt_u) * sampleCnt_v * 6 * sizeof(float);
								    safeCudaFree(d_derivatives);
								    cudaMalloc((void **) &d_derivatives, derBytes);

								    // Normal output: three floats per sample.
								    const size_t normalBytes = static_cast<size_t>(sampleCnt_u) * sampleCnt_v * 3 * sizeof(float);
								    safeCudaFree(d_normals);
								    cudaMalloc((void **) &d_normals, normalBytes);

								    // Launch configuration for g_derivative: one thread per sample, 2D.
								    dim3 block(32, 32);
								    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);
								    // Launch configuration for g_derTexture: one thread per sample, 1D.
								    dim3 blockTex(512);
								    dim3 gridTex_u((sampleCnt_u + blockTex.x - 1) / blockTex.x);
								    dim3 gridTex_v((sampleCnt_v + blockTex.x - 1) / blockTex.x);

								    // Timing
								    double time_cost_device = 0.0;
								    if (recordTime) time_cost_device = get_time();
								    g_derTexture<<<gridTex_u, blockTex>>>(d_derTexture_u, d_nTexture1_u, d_knots_u, pointsCnt_u, knotsCnt_u,
								                                          sampleCnt_u);
								    g_derTexture<<<gridTex_v, blockTex>>>(d_derTexture_v, d_nTexture1_v, d_knots_v, pointsCnt_v, knotsCnt_v,
								                                          sampleCnt_v);
								    cudaDeviceSynchronize();
								    g_derivative<<<grid, block>>>(d_derivatives, d_normals, d_derTexture_u, d_derTexture_v, d_nTexture_u, d_nTexture_v,
								                                  d_points, pointsCnt_u, pointsCnt_v, POINT_SIZE, knots_u[knotsCnt_u - 1],
								                                  knots_v[knotsCnt_v - 1], sampleCnt_u, sampleCnt_v);
								    cudaDeviceSynchronize(); // wait for all threads before reading the end time
								    if (recordTime) {
								        time_cost_device = get_time() - time_cost_device;
								        printf("GPU time cost of surface first derivative calculating for %d samples: %lf\n", sampleCnt_u * sampleCnt_v,
								               time_cost_device);
								    }

								    // Copy the results back to freshly allocated host buffers.
								    safeFree(h_normals);
								    h_normals = (float *) malloc(normalBytes);
								    cudaMemcpy(h_normals, d_normals, normalBytes, cudaMemcpyDeviceToHost);

								    safeFree(h_derivatives);
								    h_derivatives = (float *) malloc(derBytes);
								    cudaMemcpy(h_derivatives, d_derivatives, derBytes, cudaMemcpyDeviceToHost);

								    cudaFree(d_derTexture_u);
								    cudaFree(d_derTexture_v);
								}


								/**
								 * Runs g_curvature over the sample grid and leaves the resulting scalar in the
								 * device buffer d_k.
								 *
								 * Reads d_derivatives, so derivative() has to be called first with the same sample
								 * counts.
								 * NOTE(review): g_curvature reduces with block-level barriers only, so its result is
								 * presumably only reliable while the sample grid fits a single 32 x 32 block -
								 * confirm before using larger sample counts.
								 *
								 * @param sampleCnt_u  number of samples along u (at least 2)
								 * @param sampleCnt_v  number of samples along v (at least 2)
								 */
								__host__ void NurbsSurface::Surface::curvature(int sampleCnt_u, int sampleCnt_v) {
								    if (controlPoints.empty() || controlPoints[0].empty() || knots_u.empty() || knots_v.empty()) {
								        printf("Error! curvature() needs a non-empty control net and non-empty knot vectors\n");
								        return;
								    }
								    if (POINT_SIZE != controlPoints[0][0].size()) {
								        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
								        return;
								    }
								    if (d_derivatives == nullptr) {
								        printf("Error! curvature() must be called after derivative()\n");
								        return;
								    }
								    // The kernel divides by (sampleCnt - 1) in both directions.
								    if (sampleCnt_u < 2 || sampleCnt_v < 2) {
								        printf("Error! curvature() needs at least 2 samples in each direction\n");
								        return;
								    }

								    // Scratch array holding m1, m2, m3 for every sample.
								    // Shared memory would be a better fit, but the original author reports that the
								    // dynamic shared allocation kept failing (apparently because it was too large);
								    // this still needs to be revisited.
								    float *ms = nullptr;
								    cudaMalloc((void **) &ms, static_cast<size_t>(sampleCnt_u) * sampleCnt_v * 3 * sizeof(float));

								    // Release the value of a previous run before allocating a new one.
								    safeCudaFree(d_k);
								    cudaMalloc((void **) &d_k, sizeof(float));

								    // One thread per sample. The grid has to cover sampleCnt threads per direction;
								    // sizing it from sampleCnt - 1 leaves the last row/column without a thread
								    // whenever sampleCnt - 1 is a multiple of the block edge (e.g. 33 samples).
								    dim3 block(32, 32);
								    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);

								    // Timing
								    double time_cost_device = 0.0;
								    if (recordTime) time_cost_device = get_time();
								    g_curvature<<<grid, block>>>(d_derivatives, sampleCnt_u, sampleCnt_v, knots_u[knots_u.size() - 1],
								                                 knots_v[knots_v.size() - 1], ms, d_k);
								    cudaDeviceSynchronize(); // wait for all threads before reading the end time
								    if (recordTime) {
								        time_cost_device = get_time() - time_cost_device;
								        printf("GPU time cost of surface curvature calculating for %d samples: %lf\n",
								               sampleCnt_u * sampleCnt_v, time_cost_device);
								    }
								    safeCudaFree(ms);
								}


								/**
								 * Switches the timing output of the GPU routines on or off.
								 *
								 * @param r  true to measure and print the time spent in each GPU stage
								 */
								void NurbsSurface::Surface::setRecordTime(bool r) {
								    this->recordTime = r;
								}


								/**
								 * Releases every device and host buffer owned by this surface.
								 *
								 * The device itself is deliberately not reset here: cudaDeviceReset() destroys all
								 * allocations of the current process on the device, which would invalidate the
								 * buffers of every other Surface that is still alive.
								 *
								 * NOTE(review): gauss_map.nodes is allocated in buildGaussMap() but not released
								 * here; freeing it is only safe once it is guaranteed to be initialised to nullptr
								 * - confirm against the BVH type and the constructor.
								 */
								NurbsSurface::Surface::~Surface() {
								    // Device buffers
								    safeCudaFree(d_nTexture_u);
								    safeCudaFree(d_nTexture_v);
								    safeCudaFree(d_nTexture1_u);
								    safeCudaFree(d_nTexture1_v);
								    safeCudaFree(d_points);
								    safeCudaFree(d_knots_u);
								    safeCudaFree(d_knots_v);
								    safeCudaFree(d_k);
								    safeCudaFree(d_evaluationRes);
								    safeCudaFree(d_normals);
								    safeCudaFree(d_derivatives);

								    // Host buffers
								    safeFree(bvh.nodes);
								    safeFree(h_evaluations);
								    safeFree(h_normals);
								    safeFree(h_derivatives);
								}


								/**
								 * Builds a quad-tree BVH with layerCnt levels over the evaluated surface samples on
								 * the GPU and copies it to bvh.nodes on the host.
								 *
								 * The sample grid has 2^(layerCnt - 1) + 1 samples per direction and is read from
								 * d_evaluationRes, so evaluate() has to be called first.
								 * NOTE(review): evaluate() presumably has to be run with that same sample count -
								 * confirm with callers.
								 *
								 * @param layerCnt  number of tree levels, 1..16 (keeps the sample count and the node
								 *                  count within 32-bit range)
								 * @param useK      when false, d_k is released so the kernel receives nullptr
								 */
								__host__ void NurbsSurface::Surface::buildBVH(int layerCnt, bool useK) {
								    // TODO: building the BVH does not belong in the NURBS surface; it is the BVH
								    //       class's responsibility.
								    // TODO: the surface should only need a function that calls the BVH object's method.
								    if (layerCnt < 1 || layerCnt > 16) {
								        printf("Error! buildBVH() expects a layer count between 1 and 16\n");
								        return;
								    }
								    // Exact integer arithmetic instead of floating-point pow().
								    const int sampleCnt_u = (1 << (layerCnt - 1)) + 1, sampleCnt_v = sampleCnt_u;
								    if (!useK) {
								        // d_k must be released here so that the kernel can tell from d_k == nullptr
								        // that the curvature value is not to be used.
								        safeCudaFree(d_k);
								    }
								    if (POINT_SIZE != controlPoints[0][0].size()) {
								        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
								        return;
								    }
								    if (d_evaluationRes == nullptr) {
								        printf("Error! buildBVH() must be called after evaluate()\n");
								        return;
								    }
								    // Launch configuration: one thread per sample, 2D.
								    dim3 block(32, 32);
								    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);

								    bvh.maxLevel = layerCnt;
								    // Total node count of a complete quad tree: 1 + 4 + ... + 4^(L-1) = (4^L - 1) / 3.
								    bvh.size = ((static_cast<size_t>(1) << (2 * layerCnt)) - 1) / 3;
								    const size_t bvhBytes = sizeof(BVHNode) * bvh.size;
								    BVHNode *d_bvh = nullptr;
								    if (cudaMalloc((void **) &d_bvh, bvhBytes) != cudaSuccess) {
								        printf("Error! buildBVH() could not allocate device memory for the tree\n");
								        return;
								    }
								    // Timing
								    double time_cost_device = 0.0;
								    if (recordTime) time_cost_device = get_time();
								    g_buildBvh<<<grid, block>>>(d_k, bvh.maxLevel, d_evaluationRes, knots_u[knots_u.size() - 1],
								                                knots_v[knots_v.size() - 1], sampleCnt_u, sampleCnt_v, d_bvh);
								    cudaDeviceSynchronize();
								    if (recordTime) {
								        time_cost_device = get_time() - time_cost_device;
								        printf("GPU time cost of a %d-layer BVH building: %lf\n",
								               bvh.maxLevel, time_cost_device);
								    }
								    // Copy the tree back to the host.
								    safeFree(bvh.nodes);
								    bvh.nodes = (BVHNode *) malloc(bvhBytes);
								    cudaMemcpy(bvh.nodes, d_bvh, bvhBytes, cudaMemcpyDeviceToHost);
								    safeCudaFree(d_bvh);
								}


								/**
								 * Builds a quad tree with layerCnt levels over the surface normals on the GPU (the
								 * Gauss map) and copies it to gauss_map.nodes on the host.
								 *
								 * The sample grid has 2^(layerCnt - 1) + 1 samples per direction and is read from
								 * d_normals, so derivative() has to be called first.
								 * NOTE(review): derivative() presumably has to be run with that same sample count -
								 * confirm with callers.
								 *
								 * @param layerCnt  number of tree levels, 1..16 (keeps the sample count and the node
								 *                  count within 32-bit range)
								 */
								__host__ void NurbsSurface::Surface::buildGaussMap(int layerCnt) {
								    // TODO: building the Gauss map does not belong in the NURBS surface; it is the
								    //       Gauss map class's responsibility.
								    if (layerCnt < 1 || layerCnt > 16) {
								        printf("Error! buildGaussMap() expects a layer count between 1 and 16\n");
								        return;
								    }
								    // Exact integer arithmetic instead of floating-point pow().
								    const int sampleCnt_u = (1 << (layerCnt - 1)) + 1, sampleCnt_v = sampleCnt_u;

								    if (POINT_SIZE != controlPoints[0][0].size()) {
								        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
								        return;
								    }
								    if (d_normals == nullptr) {
								        printf("Error! buildGaussMap() must be called after derivative()\n");
								        return;
								    }
								    // Launch configuration: one thread per sample, 2D.
								    dim3 block(32, 32);
								    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);

								    gauss_map.maxLevel = layerCnt;
								    // Total node count of a complete quad tree: 1 + 4 + ... + 4^(L-1) = (4^L - 1) / 3.
								    gauss_map.size = ((static_cast<size_t>(1) << (2 * layerCnt)) - 1) / 3;
								    const size_t gaussMapBytes = sizeof(BVHNode) * gauss_map.size;
								    BVHNode *d_gaussMapTree = nullptr;
								    if (cudaMalloc((void **) &d_gaussMapTree, gaussMapBytes) != cudaSuccess) {
								        printf("Error! buildGaussMap() could not allocate device memory for the tree\n");
								        return;
								    }
								    // Timing
								    double time_cost_device = 0.0;
								    if (recordTime) time_cost_device = get_time();
								    // No curvature value is passed for the Gauss map (first argument is nullptr).
								    g_buildBvh<<<grid, block>>>(nullptr, layerCnt, d_normals, knots_u[knots_u.size() - 1],
								                                knots_v[knots_v.size() - 1], sampleCnt_u, sampleCnt_v, d_gaussMapTree);
								    cudaDeviceSynchronize();
								    if (recordTime) {
								        time_cost_device = get_time() - time_cost_device;
								        printf("GPU time cost of a %d-layer Gauss Map building: %lf\n",
								               layerCnt, time_cost_device);
								    }
								    // Copy the tree back to the host.
								    safeFree(gauss_map.nodes);
								    gauss_map.nodes = (BVHNode *) malloc(gaussMapBytes);
								    cudaMemcpy(gauss_map.nodes, d_gaussMapTree, gaussMapBytes, cudaMemcpyDeviceToHost);
								    safeCudaFree(d_gaussMapTree);
								}


								/**
								 * Walks two quad-tree BVHs simultaneously and collects every pair of leaf nodes
								 * whose bounding boxes overlap.
								 *
								 * A node is a leaf when its firstChild is -1; otherwise its four children are at
								 * firstChild .. firstChild + 3.
								 *
								 * @param bvh1   first tree
								 * @param bvh2   second tree
								 * @param idx1   index of the current node in bvh1
								 * @param idx2   index of the current node in bvh2
								 * @param pairs  output; (idx1, idx2) is appended for every overlapping leaf pair
								 */
								__host__ void NurbsSurface::recursiveGetOverlapLeafNodes(const BVH &bvh1, const BVH &bvh2, int idx1, int idx2,
								                                                         std::vector<std::pair<int, int>> &pairs) {
								    auto nodeA = bvh1.nodes[idx1];
								    auto nodeB = bvh2.nodes[idx2];

								    // Disjoint boxes: nothing below this pair can overlap.
								    if (!nodeA.bounds.IsOverlap(nodeB.bounds)) {
								        return;
								    }

								    const bool leafA = (nodeA.firstChild == -1);
								    const bool leafB = (nodeB.firstChild == -1);

								    // Two overlapping leaves form a result pair.
								    if (leafA && leafB) {
								        pairs.emplace_back(idx1, idx2);
								        return;
								    }

								    // Volume of an axis-aligned box, used to pick the side to subdivide.
								    auto boxVolume = [](const AABB &box) {
								        return (box.pMax.z - box.pMin.z) *
								               (box.pMax.y - box.pMin.y) *
								               (box.pMax.x - box.pMin.x);
								    };

								    // Descend into A when it is the only inner node, or when both are inner nodes and
								    // A's box is strictly larger; in every other case descend into B.
								    const bool splitA = !leafA && (leafB || boxVolume(nodeA.bounds) > boxVolume(nodeB.bounds));

								    for (int child = 0; child < 4; child++) {
								        if (splitA) {
								            recursiveGetOverlapLeafNodes(bvh1, bvh2, nodeA.firstChild + child, idx2, pairs);
								        } else {
								            recursiveGetOverlapLeafNodes(bvh1, bvh2, idx1, nodeB.firstChild + child, pairs);
								        }
								    }
								}


								// Finds all overlapping leaf pairs of two hierarchies and reports them as grid-cell index pairs.
								//
								// Both trees are traversed from their roots (node 0). Each resulting leaf pair is converted into
								// {{idx_u, idx_v} of the bvh1 leaf, {idx_u, idx_v} of the bvh2 leaf}. The elapsed host time of the
								// whole operation is printed to stdout.
								__host__ std::vector<boxPair>
								NurbsSurface::getOverlappedLeafNodes(const BVH &bvh1, const BVH &bvh2) {
								    std::vector<idx2> leafPairs;
								    // Start of the timed section.
								    double elapsed = get_time();

								    recursiveGetOverlapLeafNodes(bvh1, bvh2, 0, 0, leafPairs);

								    // Translate node indices into the (u, v) cell indices stored in the leaves.
								    std::vector<boxPair> cellPairs(leafPairs.size());
								    for (size_t k = 0; k < leafPairs.size(); ++k) {
								        const auto &leaf1 = bvh1.nodes[leafPairs[k].first];
								        const auto &leaf2 = bvh2.nodes[leafPairs[k].second];
								        cellPairs[k] = {{leaf1.idx_u, leaf1.idx_v},
								                        {leaf2.idx_u, leaf2.idx_v}};
								    }

								    elapsed = get_time() - elapsed;
								    printf("CPU time cost for recursively calculating the overlapped leaf nodes: %lf\n", elapsed);

								    return cellPairs;
								}


								// Tests whether two rectangular regions of two Gauss maps have overlapping bounding boxes.
								//
								// gm1, gm2                  Gauss-map hierarchies; they must share the same positive maxLevel
								// idxRange_u1, idxRange_v1  inclusive cell-index range (first..second) of the region in gm1
								// idxRange_u2, idxRange_v2  inclusive cell-index range (first..second) of the region in gm2
								//
								// Returns false (after printing a message) when the levels differ / are not positive or when an
								// index range leaves [0, 2^(maxLevel - 1) - 1]. Otherwise returns whether the union boxes of the
								// two regions overlap.
								__host__ bool
								NurbsSurface::isGaussMapsOverlapped(const BVH &gm1, const BVH &gm2, std::pair<int, int> idxRange_u1,
								                                    std::pair<int, int> idxRange_v1, std::pair<int, int> idxRange_u2,
								                                    std::pair<int, int> idxRange_v2) {
								    if (gm1.maxLevel != gm2.maxLevel || gm1.maxLevel <= 0) {
								        printf("BVH Layer error!\n");
								        return false;
								    }

								    int sharedMaxLevel = gm1.maxLevel;
								    // Number of cells along one edge of the deepest layer.
								    int cellsPerEdge = pow(2, sharedMaxLevel - 1);

								    // A range is acceptable when it starts at or after 0 and ends before cellsPerEdge.
								    auto isRangeValid = [cellsPerEdge](const std::pair<int, int> &range) {
								        return range.first >= 0 && range.second < cellsPerEdge;
								    };
								    const bool allRangesValid = isRangeValid(idxRange_u1) && isRangeValid(idxRange_u2) &&
								                                isRangeValid(idxRange_v1) && isRangeValid(idxRange_v2);
								    if (!allRangesValid) {
								        printf("Error when detecting overlapping: idx range invalid!\n");
								        return false;
								    }

								    // Union of the boxes of all deepest-layer nodes inside the given index rectangle.
								    auto unionOfCells = [&sharedMaxLevel](const BVH &tree, const std::pair<int, int> &rangeU,
								                                          const std::pair<int, int> &rangeV) {
								        AABB merged;
								        for (int cu = rangeU.first; cu <= rangeU.second; ++cu) {
								            for (int cv = rangeV.first; cv <= rangeV.second; ++cv) {
								                const int nodeIdx = getStartIdxOfLayerN(sharedMaxLevel) + h_getChildNodeIdx(cu, cv);
								                merged = merged.Union(tree.nodes[nodeIdx].bounds);
								            }
								        }
								        return merged;
								    };

								    auto regionBox1 = unionOfCells(gm1, idxRange_u1, idxRange_v1);
								    auto regionBox2 = unionOfCells(gm2, idxRange_u2, idxRange_v2);
								    return regionBox1.IsOverlap(regionBox2);
								}


								// Tests whether two regions of two Gauss maps overlap, with the regions given in parameter space.
								//
								// gm1, gm2                      Gauss-map hierarchies; they must share the same positive maxLevel
								// range_u1, range_v1            (first, second) parameter interval of the region on surface 1
								// range_u2, range_v2            (first, second) parameter interval of the region on surface 2
								// paramRange_u1, paramRange_v1  (first, second) parameter domain of surface 1
								// paramRange_u2, paramRange_v2  (first, second) parameter domain of surface 2
								//
								// Each parameter interval is converted into an inclusive cell-index range of the deepest layer
								// (2^(maxLevel - 1) cells per edge) and the index-based overload is called with the result.
								// Returns false (after printing a message) when the levels are inconsistent, when a parameter
								// domain does not have positive width, or when the index-based overload rejects the ranges.
								__host__ bool NurbsSurface::isGaussMapsOverlapped(const BVH &gm1, const BVH &gm2, std::pair<float, float> range_u1,
								                                                  std::pair<float, float> range_v1, std::pair<float, float> range_u2,
								                                                  std::pair<float, float> range_v2,
								                                                  std::pair<float, float> paramRange_u1,
								                                                  std::pair<float, float> paramRange_v1,
								                                                  std::pair<float, float> paramRange_u2,
								                                                  std::pair<float, float> paramRange_v2) {
								    if (gm1.maxLevel != gm2.maxLevel || gm1.maxLevel <= 0) {
								        printf("BVH Layer error!\n");
								        return false;
								    }
								    int edgeCellCnt = pow(2, gm1.maxLevel - 1);

								    // A domain without positive width would make the cell step zero (or negative / NaN); dividing by
								    // it and converting the result to int is not meaningful, so reject such input up front.
								    auto hasPositiveWidth = [](const std::pair<float, float> &paramRange) {
								        return paramRange.second > paramRange.first;
								    };
								    if (!hasPositiveWidth(paramRange_u1) || !hasPositiveWidth(paramRange_v1) ||
								        !hasPositiveWidth(paramRange_u2) || !hasPositiveWidth(paramRange_v2)) {
								        printf("Error when detecting overlapping: param range invalid!\n");
								        return false;
								    }

								    // Maps a parameter interval to the inclusive range of sampling-grid cells that contains it.
								    auto getIdxRange = [](std::pair<float, float> range, std::pair<float, float> paramRange, int edgeCellCnt) {
								        float paramStep = (paramRange.second - paramRange.first) / edgeCellCnt;
								        auto toCellIdx = [&](float value) {
								            int cellIdx = int((value - paramRange.first) / paramStep);
								            // A value on the upper end of the domain truncates to edgeCellCnt, one past the last
								            // cell. It belongs to the last cell, so pull it back instead of letting the index
								            // overload reject the whole query. Values beyond the domain are left untouched so
								            // they are still reported as invalid.
								            if (cellIdx == edgeCellCnt && value <= paramRange.second) cellIdx = edgeCellCnt - 1;
								            return cellIdx;
								        };
								        return std::pair<int, int>(toCellIdx(range.first), toCellIdx(range.second));
								    };

								    auto idxRange_u1 = getIdxRange(range_u1, paramRange_u1, edgeCellCnt);
								    auto idxRange_v1 = getIdxRange(range_v1, paramRange_v1, edgeCellCnt);
								    auto idxRange_u2 = getIdxRange(range_u2, paramRange_u2, edgeCellCnt);
								    auto idxRange_v2 = getIdxRange(range_v2, paramRange_v2, edgeCellCnt);

								    return isGaussMapsOverlapped(gm1, gm2, idxRange_u1, idxRange_v1, idxRange_u2, idxRange_v2);
								}


								// Recursively gathers the leaf nodes of this surface's BVH whose boxes are hit by a ray.
								//
								// dir                    ray direction (not required to be normalised)
								// startPoint             ray origin; points on the ray are startPoint + t * dir with t > 0
								// idx                    index into bvh.nodes of the node currently visited
								// intersectionLeafNodes  output: a copy of every hit leaf node is appended
								//
								// A node whose firstChild equals -1 is treated as a leaf; any other node is expected to have its
								// four children stored at firstChild .. firstChild + 3.
								__host__ void
								NurbsSurface::Surface::recursiveGetRayBVHIntersection(const glm::vec3 dir, const glm::vec3 startPoint, const int idx,
								                                                      std::vector<BVHNode> &intersectionLeafNodes) {
								    const auto &bvhNode = bvh.nodes[idx];

								    // Ray / axis-aligned box test.
								    auto isRayBoxIntersect = [&]() {
								        const auto &box = bvhNode.bounds;

								        // An origin inside (or on) the box always counts as a hit. Without this check every entry
								        // face lies behind the origin (t <= 0), the tests below all fail, and the subtree of a box
								        // that contains the origin would be pruned by mistake.
								        if (box.pMin.x <= startPoint.x && startPoint.x <= box.pMax.x &&
								            box.pMin.y <= startPoint.y && startPoint.y <= box.pMax.y &&
								            box.pMin.z <= startPoint.z && startPoint.z <= box.pMax.z)
								            return true;

								        // Origin is outside: the ray can only enter through a face that faces the origin. For each
								        // axis with a non-zero direction component, intersect the ray with the plane of that entry
								        // face (pMin side when moving in the positive direction, pMax side otherwise) and check
								        // whether the hit point lies within the face. t <= 0 means the plane is not in front of
								        // the origin and is treated as no intersection.
								        float t;
								        if (dir.x != 0.0f) {
								            t = ((dir.x > 0 ? box.pMin.x : box.pMax.x) - startPoint.x) / dir.x;
								            if (t > 0.0f) {
								                auto hitPt = startPoint + t * dir;
								                if (box.pMin.y <= hitPt.y && box.pMin.z <= hitPt.z && box.pMax.y >= hitPt.y && box.pMax.z >= hitPt.z)
								                    return true;
								            }
								        }
								        if (dir.y != 0.0f) {
								            t = ((dir.y > 0 ? box.pMin.y : box.pMax.y) - startPoint.y) / dir.y;
								            if (t > 0.0f) {
								                auto hitPt = startPoint + t * dir;
								                if (box.pMin.x <= hitPt.x && box.pMin.z <= hitPt.z && box.pMax.x >= hitPt.x && box.pMax.z >= hitPt.z)
								                    return true;
								            }
								        }
								        if (dir.z != 0.0f) {
								            t = ((dir.z > 0 ? box.pMin.z : box.pMax.z) - startPoint.z) / dir.z;
								            if (t > 0.0f) {
								                auto hitPt = startPoint + t * dir;
								                if (box.pMin.x <= hitPt.x && box.pMin.y <= hitPt.y && box.pMax.x >= hitPt.x && box.pMax.y >= hitPt.y)
								                    return true;
								            }
								        }
								        return false;
								    };

								    // Miss: nothing below this node can be hit.
								    if (!isRayBoxIntersect()) return;

								    if (bvhNode.firstChild == -1) {
								        // Hit leaf: record it.
								        intersectionLeafNodes.emplace_back(bvhNode);
								    } else {
								        // Hit inner node: test its four children.
								        for (int i = 0; i < 4; i++)
								            recursiveGetRayBVHIntersection(dir, startPoint, bvhNode.firstChild + i, intersectionLeafNodes);
								    }
								}


								// Returns copies of all leaf nodes of this surface's BVH that the ray startPoint + t * dir is reported
								// to hit. The traversal starts at the root (node 0); the result is in traversal order, not sorted
								// by distance along the ray.
								__host__ std::vector<BVHNode> NurbsSurface::Surface::rayBVHIntersection(glm::vec3 dir, glm::vec3 startPoint) {
								    std::vector<BVHNode> hitLeaves;
								    recursiveGetRayBVHIntersection(dir, startPoint, 0, hitLeaves);
								    // TODO: sort the hits by the ray parameter t
								    return hitLeaves;
								}