NurbsPerformer/src/device/Nurbs/nurbs_surface.cu


								//

								// Created by 14727 on 2022/12/9.

								//

								#pragma once

								#include "chrono"

								#include "device/Nurbs/bvh.cuh"

								#include "device/Nurbs/nurbs_common.cuh"

								#include "device/Nurbs/nurbs_surface.cuh"

								#include "device/device_utils.cuh"

								#include "tinynurbs/tinynurbs.h"

								#include "utils.h"


								__global__ void NurbsSurface::g_evaluate(glm::vec3 *res,

								                                         const float *d_nTexture_u,

								                                         const float *d_nTexture_v,

								                                         const glm::vec4 *d_points,

								                                         int d_pointsCnt_u, int d_pointsCnt_v,

								                                         float d_lastKnot_u, float d_lastKnot_v,

								                                         int d_sampleCnt_u, int d_sampleCnt_v) {

								  // 二维grid和二维的block

								  int ix = blockIdx.x * blockDim.x + threadIdx.x;

								  int iy = blockIdx.y * blockDim.y + threadIdx.y;


								  if (ix >= d_sampleCnt_u || iy >= d_sampleCnt_v)

								    return;

								  //    if(ix == 128 || iy == 128)    printf("!!!!!!!!!@@@@%d, %d, %d, %d\n",

								  //    ix, iy, d_sampleCnt_u, d_sampleCnt_v); float u = ix * d_lastKnot_u /

								  //    (d_sampleCnt_u - 1); float v = iy * d_lastKnot_v / (d_sampleCnt_v - 1);


								  float x = 0., y = 0., z = 0., sumW = 0.;

								  for (int i = 0; i < d_pointsCnt_u; i++) {

								    float N_U = d_nTexture_u[ix * d_pointsCnt_u + i];

								    for (int j = 0; j < d_pointsCnt_v; j++) {

								      float N_V = d_nTexture_v[iy * d_pointsCnt_v + j];

								      int idx = i * d_pointsCnt_v + j;

								      float w = d_points[idx].w;

								      x += N_U * N_V * w * d_points[idx].x;

								      y += N_U * N_V * w * d_points[idx].y;

								      z += N_U * N_V * w * d_points[idx].z;

								      sumW += N_U * N_V * w;

								    }

								  }

								  float sumWInv = 1.f / sumW;

								  x *= sumWInv;

								  y *= sumWInv;

								  z *= sumWInv;


								  int baseIdx = ix * d_sampleCnt_v + iy;

								  res[baseIdx] = {x, y, z};


								  //    printf("(%d, %d)-->(%g, %g, %g)\n", ix, iy, res[baseIdx].x,

								  //    res[baseIdx].y, res[baseIdx].z); // %g输出，舍弃无意义的0

								}


								__global__ void

								NurbsSurface::g_derivative(glm::vec3 *derivatives_u, glm::vec3 *derivatives_v,

								                           glm::vec3 *normals, const float *derTexture_u,

								                           const float *derTexture_v, const float *nTexture_u,

								                           const float *nTexture_v, const glm::vec4 *d_points,

								                           int d_pointsCnt_u, int d_pointsCnt_v,

								                           int d_sampleCnt_u, int d_sampleCnt_v) {

								  // 二维grid和二维的block

								  int ix = blockIdx.x * blockDim.x + threadIdx.x;

								  int iy = blockIdx.y * blockDim.y + threadIdx.y;


								  if (ix >= d_sampleCnt_u || iy >= d_sampleCnt_v) {

								    return;

								  }


								  float nubsPdx_u = 0., nubsPdy_u = 0, nubsPdz_u = 0., nubsPdw_u = 0.;

								  float nubsPdx_v = 0., nubsPdy_v = 0, nubsPdz_v = 0., nubsPdw_v = 0.;


								  // TODO  合并两个循环？

								  for (int i = 0; i < d_pointsCnt_u; i++) {

								    for (int j = 0; j < d_pointsCnt_v; j++) {

								      int baseIdx = i * d_pointsCnt_v + j;

								      float factor_u = derTexture_u[ix * d_pointsCnt_u + i] *

								                       nTexture_v[iy * d_pointsCnt_v + j];

								      float factor_v = derTexture_v[iy * d_pointsCnt_v + j] *

								                       nTexture_u[ix * d_pointsCnt_u + i];

								      float wij = d_points[baseIdx].w;

								      nubsPdx_u += factor_u * wij * d_points[baseIdx].x;

								      nubsPdy_u += factor_u * wij * d_points[baseIdx].y;

								      nubsPdz_u += factor_u * wij * d_points[baseIdx].z;

								      nubsPdw_u += factor_u * wij;


								      nubsPdx_v += factor_v * wij * d_points[baseIdx].x;

								      nubsPdy_v += factor_v * wij * d_points[baseIdx].y;

								      nubsPdz_v += factor_v * wij * d_points[baseIdx].z;

								      nubsPdw_v += factor_v * wij;

								    }

								  }


								  float x = 0., y = 0., z = 0., w = 0.;

								  for (int i = 0; i < d_pointsCnt_u; i++) {

								    float N_U = nTexture_u[ix * d_pointsCnt_u + i];

								    for (int j = 0; j < d_pointsCnt_v; j++) {

								      float N_V = nTexture_v[iy * d_pointsCnt_v + j];

								      int idx = i * d_pointsCnt_v + j;

								      float wij = d_points[idx].w;

								      x += N_U * N_V * wij * d_points[idx].x;

								      y += N_U * N_V * wij * d_points[idx].y;

								      z += N_U * N_V * wij * d_points[idx].z;

								      w += N_U * N_V * wij;

								    }

								  }


								  float w2 = w * w;

								  float pdx_u = (nubsPdx_u * w - x * nubsPdw_u) / w2;

								  float pdy_u = (nubsPdy_u * w - y * nubsPdw_u) / w2;

								  float pdz_u = (nubsPdz_u * w - z * nubsPdw_u) / w2;


								  float pdx_v = (nubsPdx_v * w - x * nubsPdw_v) / w2;

								  float pdy_v = (nubsPdy_v * w - y * nubsPdw_v) / w2;

								  float pdz_v = (nubsPdz_v * w - z * nubsPdw_v) / w2;

								  //    float pdz_u = (nubsPdz_u * w - z )


								  int baseIdx = ix * d_sampleCnt_v + iy;

								  derivatives_u[baseIdx] = {pdx_u, pdy_u, pdz_u};

								  derivatives_v[baseIdx] = {pdx_v, pdy_v, pdz_v};


								  // 叉乘得到法向量

								  normals[baseIdx] = glm::cross(derivatives_u[baseIdx], derivatives_v[baseIdx]);

								  normals[baseIdx] = glm::normalize(normals[baseIdx]);

								  //    if ((ix == 8 && iy == 9) || (ix == 7 && iy == 9) || (ix == 9 && iy == 9)

								  //    || (ix == 8 && iy == 8) ||

								  //        (ix == 8 && iy == 10))

								  //        printf("(%g,%g)-->u:(%g, %g, %g), v:(%g,%g,%g),

								  //        normal:(%g,%g,%g)\n", u, v, pdx_u, pdy_u, pdz_u, pdx_v, pdy_v,

								  //               pdz_v, normals[baseIdx], normals[baseIdx + 1],

								  //               normals[baseIdx + 2]);

								}


								__global__ void NurbsSurface::g_curvature(const glm::vec3 *derivatives_u,

								                                          const glm::vec3 *derivatives_v,

								                                          const int sampleCnt_u,

								                                          const int sampleCnt_v,

								                                          float lastKnot_u, float lastKnot_v,

								                                          float *ms, float *k) {

								  // 二维grid和二维的block

								  int ix = blockIdx.x * blockDim.x + threadIdx.x;

								  int iy = blockIdx.y * blockDim.y + threadIdx.y;


								  if (ix >= sampleCnt_u || iy >= sampleCnt_v) {

								    return;

								  }


								  float step_u = lastKnot_u / (sampleCnt_u - 1),

								        step_v = lastKnot_v / (sampleCnt_v - 1);

								  float u = ix * step_u, v = iy * step_v;


								  int baseIdx = ix * sampleCnt_v + iy;

								  int lastBaseIdx_u = (ix - 1) * sampleCnt_v + iy,

								      nextBaseIdx_u = (ix + 1) * sampleCnt_v + iy;

								  int lastBaseIdx_v = ix * sampleCnt_v + iy - 1,

								      nextBaseIdx_v = ix * sampleCnt_v + iy + 1;


								  //    printf("(%g,%g)-->u:(%g, %g, %g), v:(%g,%g,%g)\n", u, v,

								  //    derivatives[baseIdx], derivatives[baseIdx + 1],

								  //           derivatives[baseIdx + 2], derivatives[baseIdx + 3],

								  //           derivatives[baseIdx + 4], derivatives[baseIdx + 5]);


								  //    float sndPdx_uu, sndPdy_uu, sndPdz_uu, sndPdx_vv, sndPdy_vv, sndPdz_vv;

								  //    // 二阶导 float sndPdx_uv, sndPdy_uv, sndPdz_uv, sndPdx_vu, sndPdy_vu,

								  //    sndPdz_vu;


								  glm::vec3 sndP_uu, sndP_uv, sndP_vu, sndP_vv;


								  if (ix == 0) {

								    sndP_uu = (derivatives_u[nextBaseIdx_u] - derivatives_u[baseIdx]) / step_u;

								    sndP_vu = (derivatives_v[nextBaseIdx_u] - derivatives_v[baseIdx]) / step_u;

								    //        sndPdy_uu = (derivatives[nextBaseIdx_u + 1] - derivatives[baseIdx

								    //        + 1]) / step_u; sndPdz_uu = (derivatives[nextBaseIdx_u + 2] -

								    //        derivatives[baseIdx + 2]) / step_u;


								    //        sndPdx_vu = (derivatives[nextBaseIdx_u + 3] - derivatives[baseIdx

								    //        + 3]) / step_u; sndPdy_vu = (derivatives[nextBaseIdx_u + 4] -

								    //        derivatives[baseIdx + 4]) / step_u; sndPdz_vu =

								    //        (derivatives[nextBaseIdx_u + 5] - derivatives[baseIdx + 5]) /

								    //        step_u;

								  } else if (ix == sampleCnt_u - 1) {

								    sndP_uu = (derivatives_u[baseIdx] - derivatives_u[lastBaseIdx_u]) / step_u;

								    sndP_vu = (derivatives_v[baseIdx] - derivatives_v[lastBaseIdx_u]) / step_u;

								  } else {

								    sndP_uu =

								        (derivatives_u[nextBaseIdx_u] - derivatives_u[lastBaseIdx_u]) / step_u;

								    sndP_vu =

								        (derivatives_v[nextBaseIdx_u] - derivatives_v[lastBaseIdx_u]) / step_u;

								  }


								  if (iy == 0) {

								    sndP_vv = (derivatives_v[nextBaseIdx_u] - derivatives_v[baseIdx]) / step_v;

								    sndP_uv = (derivatives_u[nextBaseIdx_u] - derivatives_u[baseIdx]) / step_v;

								  } else if (iy == sampleCnt_v - 1) {

								    sndP_vv = (derivatives_v[nextBaseIdx_u] - derivatives_v[baseIdx]) / step_v;

								    sndP_uv = (derivatives_u[nextBaseIdx_u] - derivatives_u[baseIdx]) / step_v;

								  } else {

								    sndP_vv = (derivatives_v[nextBaseIdx_u] - derivatives_v[baseIdx]) / step_v;

								    sndP_uv = (derivatives_u[nextBaseIdx_u] - derivatives_u[baseIdx]) / step_v;

								  }


								  sndP_uv = (sndP_uv + sndP_vu) / 2.f;

								  //    normalization(sndPdx_uv, sndPdy_uv, sndPdz_uv);

								  //    normalization(sndPdx_vu, sndPdy_vu, sndPdz_vu);

								  //    normalization(sndPdx_uu, sndPdy_uu, sndPdz_uu);

								  //    normalization(uvx, uvy, uvz);

								  //    normalization(sndPdx_vv, sndPdy_vv, sndPdz_vv);


								  //    if (ix == 8 && iy == 9) {

								  //        printf("(%g, %g) --> uu: (%g, %g, %g), uv: (%g, %g, %g), vv: (%g,

								  //        %g, %g)\n", u, v, sndPdx_uu, sndPdy_uu,

								  //               sndPdz_uu,

								  //               uvx, uvy, uvz, sndPdx_vv, sndPdy_vv, sndPdz_vv);

								  //        printf("uv: (%g, %g, %g), vu: (%g, %g, %g)\n", sndPdx_uv, sndPdy_uv,

								  //        sndPdz_uv, sndPdx_vu, sndPdy_vu,

								  //               sndPdz_vu);

								  //    }


								  float m1 = max(max(sndP_uu.x, sndP_uu.y), sndP_uu.z);

								  float m2 = max(max(sndP_uv.x, sndP_uv.y), sndP_uv.z);

								  float m3 = max(max(sndP_vv.x, sndP_vv.y), sndP_vv.z);


								  //    __shared__ float ms[363];

								  ms[(ix * sampleCnt_v + iy) * 3] = m1;

								  ms[(ix * sampleCnt_v + iy) * 3 + 1] = m2;

								  ms[(ix * sampleCnt_v + iy) * 3 + 2] = m3;

								  __syncthreads();

								  //    if(ix == 1 && iy == 1) {

								  //        for(int i = 0; i < sampleCnt_u; i++) {

								  //            for(int j = 0; j < sampleCnt_v; j++) {

								  //                printf("%g ", ms[(i * sampleCnt_v + j) * 3]);

								  //            }

								  //            printf("\n");

								  //        }

								  //    }


								  // 规约求最大值

								  for (int step = (sampleCnt_u + 1) / 2; step > 1; step = (step + 1) / 2) {

								    // step 表示现在参与计算最大值的数据的长度的一半

								    if (ix < step && ix + step < sampleCnt_u) {

								      ms[(ix * sampleCnt_v + iy) * 3] =

								          max(m1, ms[((ix + step) * sampleCnt_v + iy) * 3]);

								      ms[(ix * sampleCnt_v + iy) * 3 + 1] =

								          max(m1, ms[((ix + step) * sampleCnt_v + iy) * 3 + 1]);

								      ms[(ix * sampleCnt_v + iy) * 3 + 2] =

								          max(m1, ms[((ix + step) * sampleCnt_v + iy) * 3 + 2]);

								    }

								  }


								  for (int step = (sampleCnt_v + 1) / 2; step > 1; step = (step + 1) / 2) {

								    // step 表示现在参与计算最大值的数据的长度的一半

								    if (iy < step && iy + step < sampleCnt_v) {

								      ms[iy * 3] = max(ms[iy * 3], ms[(iy + step) * 3]);

								      ms[iy * 3 + 1] = max(ms[iy * 3 + 1], ms[(iy + step) * 3 + 1]);

								      ms[iy * 3 + 2] = max(ms[iy * 3 + 2], ms[(iy + step) * 3 + 2]);

								    }

								  }

								  __syncthreads();

								  int n = sampleCnt_u - 1;

								  int m = sampleCnt_v - 1;

								  *k = (ms[0] / (n * n) + 2 * ms[1] / (n * m) + ms[2] / (m * m)) / 8;

								  //    if(ix == 1 && iy == 1)printf("%g gggg\n", ms[0]);

								}


								__host__ NurbsSurface::Surface::Surface(MeshPoints4 controlPoints,

								                                        std::vector<float> knots_u,

								                                        std::vector<float> knots_v,

								                                        int _layerCnt) {

								  this->knots_u = std::move(knots_u);

								  this->knots_v = std::move(knots_v);

								  this->controlPoints = std::move(controlPoints);


								  layerCnt = _layerCnt;

								  edgeSampleCnt_u = edgeSampleCnt_v = pow(2, layerCnt - 1) + 1;


								  recordTime = false;

								  d_nTexture_u = nullptr;

								  d_nTexture_v = nullptr;

								  d_nTexture1_u = nullptr;

								  d_nTexture1_v = nullptr;

								  d_knots_u = nullptr;

								  d_knots_v = nullptr;

								  d_points = nullptr;

								  d_evaluations = nullptr;

								  d_derivatives_u = nullptr;

								  d_derivatives_v = nullptr;

								  d_k = nullptr;

								  d_normals = nullptr;

								  bvh.nodes = nullptr;

								  h_evaluations = nullptr;

								  h_derivatives_u = nullptr;

								  h_derivatives_v = nullptr;

								  h_normals = nullptr;

								}


								__host__ void NurbsSurface::Surface::evaluate() {

								  // 构造指向device的controlPoints

								  const int pointsCnt_u = controlPoints.size(),

								            pointsCnt_v = controlPoints[0].size();

								  const int pointsBytes = pointsCnt_u * pointsCnt_v * sizeof(glm::vec4);

								  auto *h_points = (glm::vec4 *)malloc(pointsBytes);

								  for (int i = 0; i < pointsCnt_u; i++) {

								    for (int j = 0; j < pointsCnt_v; j++) {

								      h_points[i * pointsCnt_v + j] = controlPoints[i][j];

								    }

								  }

								  cudaMalloc((void **)&d_points, pointsBytes);

								  cudaMemcpy(d_points, h_points, pointsBytes, cudaMemcpyHostToDevice);


								  // 构造指向device的knots

								  const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size();

								  const int knotsBytes_u = knotsCnt_u * sizeof(float),

								            knotsBytes_v = knotsCnt_v * sizeof(float);

								  auto *h_knots_u = (float *)malloc(knotsBytes_u),

								       *h_knots_v = (float *)malloc(knotsBytes_v);

								  for (int i = 0; i < knotsCnt_u; i++)

								    h_knots_u[i] = knots_u[i];

								  for (int i = 0; i < knotsCnt_v; i++)

								    h_knots_v[i] = knots_v[i];


								  safeCudaFree(d_knots_u);

								  safeCudaFree(d_knots_v);

								  cudaMalloc((void **)&d_knots_u, knotsBytes_u);

								  cudaMalloc((void **)&d_knots_v, knotsBytes_v);

								  cudaMemcpy(d_knots_u, h_knots_u, knotsBytes_u, cudaMemcpyHostToDevice);

								  cudaMemcpy(d_knots_v, h_knots_v, knotsBytes_v, cudaMemcpyHostToDevice);


								  // 构造nTexture

								  cudaMalloc(

								      (void **)&d_nTexture_u,

								      edgeSampleCnt_u * pointsCnt_u *

								          sizeof(float)); // 注意nTexture的大小，在算梯度时用得到i=pointsCnt

								                          // + 1的基函数值

								  cudaMalloc((void **)&d_nTexture_v,

								             edgeSampleCnt_v * pointsCnt_v * sizeof(float));


								  // 构造nTexture1

								  cudaMalloc((void **)&d_nTexture1_u,

								             edgeSampleCnt_u * (pointsCnt_u + 1) * sizeof(float));

								  cudaMalloc((void **)&d_nTexture1_v,

								             edgeSampleCnt_v * (pointsCnt_v + 1) * sizeof(float));


								  // 结果数组

								  size_t resBytes = edgeSampleCnt_u * edgeSampleCnt_v * sizeof(glm::vec3);

								  safeCudaFree(d_evaluations);

								  cudaMalloc((void **)&d_evaluations, resBytes);

								  safeFree(h_evaluations);

								  h_evaluations = (glm::vec3 *)malloc(resBytes);


								  // 构造g_basisTexture线程层级

								  dim3 blockBasis(512);

								  dim3 gridBasis_u((edgeSampleCnt_u + blockBasis.x - 1) / blockBasis.x);

								  dim3 gridBasis_v((edgeSampleCnt_v + blockBasis.x - 1) / blockBasis.x);


								  // 构造线程层级，调用核函数

								  dim3 block(32, 32);

								  dim3 grid((edgeSampleCnt_u + block.x - 1) / block.x,

								            (edgeSampleCnt_v + block.y - 1) / block.y);

								  // 记录用时

								  auto startMom = std::chrono::steady_clock::now();

								  g_basisTexture<<<gridBasis_u, blockBasis>>>(d_nTexture_u, d_nTexture1_u,

								                                              d_knots_u, pointsCnt_u,

								                                              knotsCnt_u, edgeSampleCnt_u);

								  // cudaDeviceSynchronize();

								  auto duration = std::chrono::duration<double, std::milli>(

								                      std::chrono::steady_clock::now() - startMom)

								                      .count();

								  printf("GPU time cost of u_basis evaluation for %d*%d samples: %lf ms\n",

								         edgeSampleCnt_u, edgeSampleCnt_v, duration);


								  startMom = std::chrono::steady_clock::now();

								  g_basisTexture<<<gridBasis_v, blockBasis>>>(d_nTexture_v, d_nTexture1_v,

								                                              d_knots_v, pointsCnt_v,

								                                              knotsCnt_v, edgeSampleCnt_u);

								  duration = std::chrono::duration<double, std::milli>(

								                 std::chrono::steady_clock::now() - startMom)

								                 .count();

								  // cudaDeviceSynchronize();

								  printf("GPU time cost of v_basis evaluation for %d*%d samples: %lf ms\n",

								         edgeSampleCnt_u, edgeSampleCnt_v, duration);


								  // 记录用时

								  startMom = std::chrono::steady_clock::now();

								  g_evaluate<<<grid, block>>>(d_evaluations, d_nTexture_u, d_nTexture_v,

								                              d_points, pointsCnt_u, pointsCnt_v,

								                              knots_u[knotsCnt_u - 1], knots_v[knotsCnt_v - 1],

								                              edgeSampleCnt_u, edgeSampleCnt_v);

								  duration = std::chrono::duration<double, std::milli>(

								                 std::chrono::steady_clock::now() - startMom)

								                 .count();

								  // cudaDeviceSynchronize(); //

								  // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用

								  if (recordTime) {

								    auto duration = std::chrono::duration<double, std::milli>(

								                        std::chrono::steady_clock::now() - startMom)

								                        .count();

								    printf("GPU time cost of surface evaluation for %d*%d samples: %lf ms\n",

								           edgeSampleCnt_u, edgeSampleCnt_v, duration);

								  }


								  // TODO 拷贝到h_evaluations放在该函数内合适吗？

								  cudaMemcpy(h_evaluations, d_evaluations, resBytes, cudaMemcpyDeviceToHost);


								  // 释放内存

								  safeFree(h_points);

								  safeFree(h_knots_u);

								  safeFree(h_knots_v);

								}


								__host__ std::vector<std::vector<glm::vec3>>

								NurbsSurface::Surface::getEvaluateVec() const {

								  std::vector<std::vector<glm::vec3>> res(

								      edgeSampleCnt_u, std::vector<glm::vec3>(edgeSampleCnt_v, glm::vec3()));

								  for (int i = 0; i < edgeSampleCnt_u; i++) {

								    int baseIdx = i * edgeSampleCnt_v;

								    for (int j = 0; j < edgeSampleCnt_v; j++) {

								      baseIdx += j;

								      res[i][j].x = h_evaluations[baseIdx].x;

								      res[i][j].y = h_evaluations[baseIdx].y;

								      res[i][j].z = h_evaluations[baseIdx].z;

								      //            printf("%d, %d: %f, %f, %f\n", i, j, res[i][j][0],

								      //            res[i][j][1], res[i][j][2]);

								    }

								  }

								  return res;

								}


								__host__ std::vector<MeshPoints3>

								NurbsSurface::Surface::getDerivativeVec() const {

								  MeshPoints3 der_u(edgeSampleCnt_u, LinePoints3(edgeSampleCnt_v));

								  MeshPoints3 der_v(edgeSampleCnt_u, LinePoints3(edgeSampleCnt_v));

								  MeshPoints3 normal(edgeSampleCnt_u, LinePoints3(edgeSampleCnt_v));

								  for (int i = 0; i < edgeSampleCnt_u; i++) {

								    int baseIdx = i * edgeSampleCnt_v;

								    for (int j = 0; j < edgeSampleCnt_v; j++) {

								      baseIdx += j;

								      der_u[i][j] = h_derivatives_u[baseIdx];

								      der_v[i][j] = h_derivatives_v[baseIdx];

								      normal[i][j] = h_normals[baseIdx];

								    }

								  }

								  return {der_u, der_v, normal};

								}


								__host__ void NurbsSurface::Surface::derivative() {

								  float *d_derTexture_u = nullptr;

								  float *d_derTexture_v = nullptr;

								  const int pointsCnt_u = controlPoints.size(),

								            pointsCnt_v = controlPoints[0].size();

								  const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size();

								  cudaMalloc((void **)&d_derTexture_u,

								             edgeSampleCnt_u * pointsCnt_u * sizeof(float));

								  cudaMalloc((void **)&d_derTexture_v,

								             edgeSampleCnt_u * pointsCnt_v * sizeof(float));


								  // 构造切向量计算结果

								  size_t vec3MeshBytes = edgeSampleCnt_u * edgeSampleCnt_v * sizeof(glm::vec3);

								  safeCudaFree(d_derivatives_u);

								  cudaMalloc((void **)&d_derivatives_u, vec3MeshBytes);

								  safeCudaFree(d_derivatives_v);

								  cudaMalloc((void **)&d_derivatives_v, vec3MeshBytes);


								  // 构造法向量计算结果

								  safeCudaFree(d_normals);

								  cudaMalloc((void **)&d_normals, vec3MeshBytes);


								  // 构造线程层级

								  dim3 block(32, 32);

								  dim3 grid((edgeSampleCnt_u + block.x - 1) / block.x,

								            (edgeSampleCnt_v + block.y - 1) / block.y);

								  // 构造g_basisTexture线程层级

								  dim3 blockTex(512);

								  dim3 gridTex_u((edgeSampleCnt_u + blockTex.x - 1) / blockTex.x);

								  dim3 gridTex_v((edgeSampleCnt_v + blockTex.x - 1) / blockTex.x);

								  // 记录用时

								  auto startMom = std::chrono::steady_clock::now();

								  g_derTexture<<<gridTex_u, blockTex>>>(d_derTexture_u, d_nTexture1_u,

								                                        d_knots_u, pointsCnt_u, knotsCnt_u,

								                                        edgeSampleCnt_u);

								  g_derTexture<<<gridTex_v, blockTex>>>(d_derTexture_v, d_nTexture1_v,

								                                        d_knots_v, pointsCnt_v, knotsCnt_v,

								                                        edgeSampleCnt_v);

								  cudaDeviceSynchronize();

								  g_derivative<<<grid, block>>>(d_derivatives_u, d_derivatives_v, d_normals,

								                                d_derTexture_u, d_derTexture_v, d_nTexture_u,

								                                d_nTexture_v, d_points, pointsCnt_u,

								                                pointsCnt_v, edgeSampleCnt_u, edgeSampleCnt_v);

								  cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用

								  if (recordTime) {

								    auto duration = std::chrono::duration<double, std::milli>(

								                        std::chrono::steady_clock::now() - startMom)

								                        .count();

								    printf("GPU time cost of surface first derivative calculating for %d * %d "

								           "samples: %lf ms\n",

								           edgeSampleCnt_u, edgeSampleCnt_v, duration);

								  }


								  // 结果数组

								  safeFree(h_normals);

								  h_normals = (glm::vec3 *)malloc(vec3MeshBytes);

								  cudaMemcpy(h_normals, d_normals, vec3MeshBytes, cudaMemcpyDeviceToHost);


								  safeFree(h_derivatives_u);

								  safeFree(h_derivatives_v);

								  h_derivatives_u = (glm::vec3 *)malloc(vec3MeshBytes);

								  h_derivatives_v = (glm::vec3 *)malloc(vec3MeshBytes);

								  cudaMemcpy(h_derivatives_u, d_derivatives_u, vec3MeshBytes,

								             cudaMemcpyDeviceToHost);

								  cudaMemcpy(h_derivatives_v, d_derivatives_v, vec3MeshBytes,

								             cudaMemcpyDeviceToHost);


								  cudaFree(d_derTexture_u);

								  cudaFree(d_derTexture_v);

								}


								__host__ void NurbsSurface::Surface::curvature() {


								  // 构造记录每个采样点中的最大M1、M2、M3的数组

								  // 这里用共享内存会更好，但使用共享内存动态分配长度总是出错（好像是因为长度过长），后续需要思考解决这个问题

								  float *ms = nullptr;

								  cudaMalloc((void **)&ms,

								             edgeSampleCnt_u * edgeSampleCnt_v * 3 * sizeof(float));


								  cudaMalloc((void **)&d_k, sizeof(float));


								  // 构造线程层级

								  dim3 block(32, 32);

								  dim3 grid((edgeSampleCnt_u - 1 + block.x - 1) / block.x,

								            (edgeSampleCnt_v - 1 + block.y - 1) / block.y);


								  // 记录用时

								  auto startMom = std::chrono::steady_clock::now();

								  g_curvature<<<grid, block>>>(

								      d_derivatives_u, d_derivatives_v, edgeSampleCnt_u, edgeSampleCnt_v,

								      knots_u[knots_u.size() - 1], knots_v[knots_v.size() - 1], ms, d_k);

								  cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用

								  if (recordTime) {

								    auto duration = std::chrono::duration<double, std::milli>(

								                        std::chrono::steady_clock::now() - startMom)

								                        .count();

								    printf("GPU time cost of surface curvature calculating for %d*%d samples: "

								           "%lf\n",

								           edgeSampleCnt_u, edgeSampleCnt_v, duration);

								  }

								  safeCudaFree(ms);

								}


								void NurbsSurface::Surface::setRecordTime(bool r) { recordTime = r; }


								NurbsSurface::Surface::~Surface() {

								  safeCudaFree(d_nTexture_u);

								  safeCudaFree(d_nTexture_v);

								  safeCudaFree(d_nTexture1_u);

								  safeCudaFree(d_nTexture1_v);

								  safeCudaFree(d_points);

								  safeCudaFree(d_knots_u);

								  safeCudaFree(d_knots_v);

								  safeCudaFree(d_k);

								  safeCudaFree(d_evaluations);

								  safeCudaFree(d_normals);

								  safeCudaFree(d_derivatives_u);

								  safeCudaFree(d_derivatives_v);

								  safeFree(bvh.nodes);

								  safeFree(h_evaluations);

								  safeFree(h_normals);

								  safeFree(h_derivatives_u);

								  safeFree(h_derivatives_v);

								  cudaDeviceReset();

								}


								__host__ void NurbsSurface::Surface::buildBVH(bool useK) {

								  // TODO 构造BVH的函数不应该出现在NURBS Surface中，应该是BVH类的事情！

								  // TODO NURBS Surface只需要一个函数去调用BVH对象的方法即可

								  if (!useK) {

								    // 必须safeFree一下，这样global函数中才能通过d_k = nullptr知道不需要再free

								    safeCudaFree(d_k);

								  }

								  // 构造线程层级

								  dim3 block(32, 32);

								  dim3 grid((edgeSampleCnt_u + block.x - 1) / block.x,

								            (edgeSampleCnt_v + block.y - 1) / block.y);


								  bvh.maxLevel = layerCnt;

								  bvh.size = (pow(4, bvh.maxLevel) - 1) / 3; // 等比数列求和公示求出数总的节点数

								  size_t bvhBytes = sizeof(BVHNode) * bvh.size;

								  BVHNode *d_bvh = nullptr;

								  cudaMalloc((void **)&d_bvh, bvhBytes);

								  // 记录用时

								  auto startMom = std::chrono::steady_clock::now();

								  g_buildBvh<<<grid, block>>>(

								      d_k, bvh.maxLevel, d_evaluations, knots_u[knots_u.size() - 1],

								      knots_v[knots_v.size() - 1], edgeSampleCnt_u, edgeSampleCnt_v, d_bvh);

								  cudaDeviceSynchronize();

								  if (recordTime) {

								    auto duration = std::chrono::duration<double, std::milli>(

								                        std::chrono::steady_clock::now() - startMom)

								                        .count();

								    printf("GPU time cost of a %d-layer BVH building: %lf ms\n", bvh.maxLevel,

								           duration);

								  }

								  // 将bvh拷贝到cpu中

								  safeFree(bvh.nodes);

								  bvh.nodes = (BVHNode *)malloc(bvhBytes);

								  cudaMemcpy(bvh.nodes, d_bvh, bvhBytes, cudaMemcpyDeviceToHost);

								  safeCudaFree(d_bvh);

								}


								__host__ void NurbsSurface::Surface::buildGaussMap() {

								  // TODO，构造GAUSS Map的函数不应该出现在NURBS Surface中，应该是GAUSS

								  // Map类的事情！


								  // 构造线程层级

								  dim3 block(32, 32);

								  dim3 grid((edgeSampleCnt_u + block.x - 1) / block.x,

								            (edgeSampleCnt_v + block.y - 1) / block.y);


								  gauss_map.maxLevel = layerCnt;

								  gauss_map.size =

								      (pow(4, layerCnt) - 1) / 3; // 等比数列求和公示求出数总的节点数

								  size_t gaussMapBytes = sizeof(BVHNode) * gauss_map.size;

								  BVHNode *d_gaussMapTree = nullptr;

								  cudaMalloc((void **)&d_gaussMapTree, gaussMapBytes);

								  // 记录用时

								  double time_cost_device;

								  if (recordTime)

								    time_cost_device = get_time();

								  g_buildBvh<<<grid, block>>>(nullptr, layerCnt, d_normals,

								                              knots_u[knots_u.size() - 1],

								                              knots_v[knots_v.size() - 1], edgeSampleCnt_u,

								                              edgeSampleCnt_v, d_gaussMapTree);

								  cudaDeviceSynchronize();

								  if (recordTime) {

								    time_cost_device = get_time() - time_cost_device;

								    printf("GPU time cost of a %d-layer Gauss Map building: %lf\n", layerCnt,

								           time_cost_device);

								  }

								  safeFree(gauss_map.nodes);

								  gauss_map.nodes = (BVHNode *)malloc(gaussMapBytes);

								  cudaMemcpy(gauss_map.nodes, d_gaussMapTree, gaussMapBytes,

								             cudaMemcpyDeviceToHost);

								  safeCudaFree(d_gaussMapTree);

								}


								__host__ void NurbsSurface::recursiveGetOverlapLeafNodes(

								    const BVH &bvh1, const BVH &bvh2, int idx1, int idx2,

								    std::vector<std::pair<int, int>> &pairs) {

								  auto A = bvh1.nodes[idx1];

								  auto B = bvh2.nodes[idx2];

								  auto AABBSize = [](const AABB &aabb) {

								    return (aabb.pMax.z - aabb.pMin.z) * (aabb.pMax.y - aabb.pMin.y) *

								           (aabb.pMax.x - aabb.pMin.x);

								  };

								  // 两个包围盒不相交，返回

								  if (!A.bounds.IsOverlap(B.bounds))

								    return;

								  // 相交

								  if (A.firstChild == -1 && B.firstChild == -1) {

								    // 两者都是叶子节点

								    pairs.emplace_back(idx1, idx2);

								  } else if (A.firstChild != -1 && B.firstChild == -1) {

								    // A是中间结点，B是叶子结点

								    for (int i = 0; i < 4; i++)

								      recursiveGetOverlapLeafNodes(bvh1, bvh2, A.firstChild + i, idx2, pairs);

								  } else if (A.firstChild == -1 && B.firstChild != -1) {

								    // A是叶子结点，B是中间结点

								    for (int i = 0; i < 4; i++)

								      recursiveGetOverlapLeafNodes(bvh1, bvh2, idx1, B.firstChild + i, pairs);

								  } else {

								    // 都是中间结点

								    if (AABBSize(A.bounds) > AABBSize(B.bounds)) {

								      // A的包围盒更大

								      for (int i = 0; i < 4; i++)

								        recursiveGetOverlapLeafNodes(bvh1, bvh2, A.firstChild + i, idx2, pairs);

								    } else {

								      // B的包围盒更大

								      for (int i = 0; i < 4; i++)

								        recursiveGetOverlapLeafNodes(bvh1, bvh2, idx1, B.firstChild + i, pairs);

								    }

								  }

								}


								__host__ std::vector<boxPair>

								NurbsSurface::getOverlappedLeafNodes(const BVH &bvh1, const BVH &bvh2) {

								  std::vector<idx2> resPairs;

								  // 记录用时

								  double time_cost_device = get_time();


								  recursiveGetOverlapLeafNodes(bvh1, bvh2, 0, 0, resPairs);

								  std::vector<boxPair> boxPairsIdx2(resPairs.size());

								  for (int i = 0; i < resPairs.size(); i++) {

								    boxPairsIdx2[i] = {{bvh1.nodes[resPairs[i].first].idx_u,

								                        bvh1.nodes[resPairs[i].first].idx_v},

								                       {bvh2.nodes[resPairs[i].second].idx_u,

								                        bvh2.nodes[resPairs[i].second].idx_v}};

								  }


								  time_cost_device = get_time() - time_cost_device;

								  printf("CPU time cost for recursively calculating the overlapped leaf nodes: "

								         "%lf\n",

								         time_cost_device);


								  return boxPairsIdx2;

								}


								__host__ bool NurbsSurface::isGaussMapsOverlapped(

								    const BVH &gm1, const BVH &gm2, std::pair<int, int> idxRange_u1,

								    std::pair<int, int> idxRange_v1, std::pair<int, int> idxRange_u2,

								    std::pair<int, int> idxRange_v2) {

								  if (gm1.maxLevel != gm2.maxLevel || gm1.maxLevel <= 0) {

								    printf("BVH Layer error!\n");

								    return false;

								  }

								  int commonMaxLayer = gm1.maxLevel;

								  int edgeCellCnt = pow(2, commonMaxLayer - 1);

								  if (idxRange_u1.first < 0 || idxRange_u2.first < 0 || idxRange_v1.first < 0 ||

								      idxRange_v2.first < 0 || idxRange_u1.second >= edgeCellCnt ||

								      idxRange_u2.second >= edgeCellCnt || idxRange_v1.second >= edgeCellCnt ||

								      idxRange_v2.second >= edgeCellCnt) {

								    printf("Error when detecting overlapping: idx range invalid!\n");

								    return false;

								  }


								  auto getRangedBox = [&commonMaxLayer](const BVH &bvh,

								                                        const std::pair<int, int> &idxRange_u,

								                                        const std::pair<int, int> idxRange_v) {

								    // 获取某个范围的gauss map的aabb

								    AABB bounding;

								    for (int i = idxRange_u.first; i <= idxRange_u.second; ++i) {

								      for (int j = idxRange_v.first; j <= idxRange_v.second; ++j) {

								        bounding =

								            bounding.Union(bvh.nodes[getStartIdxOfLayerN(commonMaxLayer) +

								                                     h_getChildNodeIdx(i, j)]

								                               .bounds);

								      }

								    }

								    return bounding;

								  };

								  return getRangedBox(gm1, idxRange_u1, idxRange_v1)

								      .IsOverlap(getRangedBox(gm2, idxRange_u2, idxRange_v2));

								}


								__host__ bool NurbsSurface::isGaussMapsOverlapped(

								    const BVH &gm1, const BVH &gm2, std::pair<float, float> range_u1,

								    std::pair<float, float> range_v1, std::pair<float, float> range_u2,

								    std::pair<float, float> range_v2, std::pair<float, float> paramRange_u1,

								    std::pair<float, float> paramRange_v1,

								    std::pair<float, float> paramRange_u2,

								    std::pair<float, float> paramRange_v2) {

								  if (gm1.maxLevel != gm2.maxLevel || gm1.maxLevel <= 0) {

								    printf("BVH Layer error!\n");

								    return false;

								  }

								  int edgeCellCnt = pow(2, gm1.maxLevel - 1);

								  // 根据所给参数的范围和参数的定义域范围，获得对应的采样网格中的范围

								  auto getIdxRange = [](std::pair<float, float> range,

								                        std::pair<float, float> paramRange, int edgeCellCnt) {

								    float paramStep = (paramRange.second - paramRange.first) / edgeCellCnt;

								    return std::pair<int, int>(

								        {int((range.first - paramRange.first) / paramStep),

								         int((range.second - paramRange.first) / paramStep)});

								  };

								  auto idxRange_u1 = getIdxRange(range_u1, paramRange_u1, edgeCellCnt);

								  auto idxRange_v1 = getIdxRange(range_v1, paramRange_v1, edgeCellCnt);

								  auto idxRange_u2 = getIdxRange(range_u2, paramRange_u2, edgeCellCnt);

								  auto idxRange_v2 = getIdxRange(range_v2, paramRange_v2, edgeCellCnt);

								  return isGaussMapsOverlapped(gm1, gm2, idxRange_u1, idxRange_v1, idxRange_u2,

								                               idxRange_v2);

								}


								__host__ void NurbsSurface::Surface::recursiveGetRayBVHIntersection(

								    const glm::vec3 dir, const glm::vec3 startPoint, const int idx,

								    std::vector<BVHNode> &intersectionLeafNodes) {

								  auto bvhNode = bvh.nodes[idx];

								  // 射线与AABB判交

								  auto isRayBoxIntersect = [&]() {

								    const auto &box = bvhNode.bounds;

								    float t; // 射线的参数，当t<=0，表示线面交点不在视平面前方，视为没有交点

								    if (dir.x != 0.) {

								      // 当x分量不为0，则射线会与AABB中垂直于x轴的平面可能有交

								      // 注意x的正负。x为正会先遇到较小点所在平面

								      if (dir.x > 0)

								        t = (box.pMin.x - startPoint.x) / dir.x;

								      else

								        t = (box.pMax.x - startPoint.x) / dir.x;

								      if (t > 0.) {

								        // 射线与平面在前方有交点。但交点不一定在盒子上

								        auto tmpPt = startPoint + t * dir; // 交点

								        if (box.pMin.y <= tmpPt.y && box.pMin.z <= tmpPt.z &&

								            box.pMax.y >= tmpPt.y && box.pMax.z >= tmpPt.z)

								          return true;

								      }

								    }

								    if (dir.y != 0.) {

								      // 同上测试y方向

								      if (dir.y > 0)

								        t = (box.pMin.y - startPoint.y) / dir.y;

								      else

								        t = (box.pMax.y - startPoint.y) / dir.y;

								      if (t > 0.) {

								        auto tmpPt = startPoint + t * dir;

								        if (box.pMin.x <= tmpPt.x && box.pMin.z <= tmpPt.z &&

								            box.pMax.x >= tmpPt.x && box.pMax.z >= tmpPt.z)

								          return true;

								      }

								    }

								    if (dir.z != 0.) {

								      // 同上测试z方向

								      if (dir.z > 0)

								        t = (box.pMin.z - startPoint.z) / dir.z;

								      else

								        t = (box.pMax.z - startPoint.z) / dir.z;

								      if (t > 0.) {

								        auto tmpPt = startPoint + t * dir;

								        if (box.pMin.x <= tmpPt.x && box.pMin.y <= tmpPt.y &&

								            box.pMax.x >= tmpPt.x && box.pMax.y >= tmpPt.y)

								          return true;

								      }

								    }

								    return false;

								  };


								  // 不相交

								  if (!isRayBoxIntersect())

								    return;

								  // 相交

								  if (bvhNode.firstChild == -1) {

								    // 与叶节点相交

								    intersectionLeafNodes.emplace_back(bvhNode);

								  } else {

								    // 与父节点相交

								    for (int i = 0; i < 4; i++)

								      recursiveGetRayBVHIntersection(dir, startPoint, bvhNode.firstChild + i,

								                                     intersectionLeafNodes);

								  }

								}


								__host__ std::vector<BVHNode>

								NurbsSurface::Surface::rayBVHIntersection(glm::vec3 dir, glm::vec3 startPoint) {

								  std::vector<BVHNode> res;

								  recursiveGetRayBVHIntersection(dir, startPoint, 0, res);

								  // TODO sort res by t


								  //    res.emplace_back(BVHNode());

								  //    printf("res size: %lld\n", res.size());

								  return res;

								}


								__host__ SrfMesh *NurbsSurface::Surface::getDMesh() {

								  if (edgeSampleCnt_u != edgeSampleCnt_v || edgeSampleCnt_u <= 2) {

								    printf("getMesh Error: the surface is not sampled as a Quadtree!\n");

								    return nullptr;

								  }

								  auto *res = new SrfMesh(true, edgeSampleCnt_u, d_evaluations, d_derivatives_u,

								                          d_derivatives_u, d_normals);

								  return res;

								}