加入了GPU版本的LoopDetection，逐步用glm::vec代替std::vec<float>来表示一个点

3 years ago · e736268cc1
20 changed files with 643 additions and 158 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -13,7 +13,7 @@ set(PROJECT_SOURCES
        src/device/Nurbs/nurbs_surface.cu   include/device/Nurbs/nurbs_surface.cuh
        src/device/Nurbs/nurbs_curve.cu     include/device/Nurbs/nurbs_curve.cuh
        src/device/Nurbs/bvh.cu             include/device/Nurbs/bvh.cuh
-        )
+        src/device/Nurbs/loop_detection.cu include/device/Nurbs/loop_detection.cuh src/device/srf_mesh.cu include/device/srf_mesh.cuh)
 add_executable(NurbsPerformer ${PROJECT_SOURCES})
@ -30,8 +30,9 @@ add_executable(NurbsPerformer ${PROJECT_SOURCES})
 #include_directories("/usr/local/device-11.8/targets/x86_64-linux/include")
 # windows
 include_directories("$ENV{CUDA_PATH}/include")
-
+include_directories("E:/CLib/glm")
 include_directories(include)
 include_directories("E:/CLib/tinynurbs/include")
 #MESSAGE("CUDA PATH::: $ENV{CUDA_PATH}")
 #MESSAGE("CUDA PATH::: $ENV{LD_LIBRARY_PATH}")
--- a/include/device/Nurbs/bvh.cuh
+++ b/include/device/Nurbs/bvh.cuh
@ -7,6 +7,14 @@
 #include "device/aabb.cuh"
 struct BVHNode {
    AABB bounds;  // AABB包围盒
    int firstChild = -1, level = 0;  // 第一个孩子节点的下标，该结点所在层次
    // 曲面片u, v的范围
    float u0 = 0., u1 = 0., v0 = 0., v1 = 0.;
    int idx_u = -1, idx_v = -1;
 };
 /**
 * 当前层的第一个节点的坐标
 * @param layer 层号。根节点为第一层
@ -29,7 +37,7 @@ public:
 };
 __global__ void
-buildBvh(const float *k, int level, const float *evaluatedPoints, float lastKnot_u, float lastKnot_v,
+g_buildBvh(const float *k, int level, const float *evaluatedPoints, float lastKnot_u, float lastKnot_v,
         int sampleCnt_u, int sampleCnt_v, BVHNode *BVH);
 #endif //NURBSEVALUATOR_BVH_CUH
--- a/include/device/Nurbs/loop_detection.cuh
+++ b/include/device/Nurbs/loop_detection.cuh
@ -0,0 +1,42 @@
 #ifndef NURBSPERFORMER_LOOP_DETECTION_CUH
 #define NURBSPERFORMER_LOOP_DETECTION_CUH
 #include "device/srf_mesh.cuh"
 #include "map"
 #include "set"
 class LoopDetection {
 public:
    SrfMesh *d_s_srfMesh;
    SrfMesh *d_f_srfMesh;
    float s_paramRegionSize_u;
    float s_paramRegionSize_v;
    int edgeSampleCnt;
    glm::vec<2, int> *d_selectedPointsIdx;
    glm::vec2 *d_vectorFields;
    int *d_rotationNumbers;
    LoopDetection(glm::vec3 *s_eval, glm::vec3 *f_eval, glm::vec3 *s_tan_u, glm::vec3 *f_tan_u, glm::vec3 *s_tan_v,
                  glm::vec3 *f_tan_v, glm::vec3 *s_norm, glm::vec3 *f_norm, int edgeSampleCnt,
                  float s_paramRegionSize_u, float s_paramRegionSize_v);
    __host__ void h_initOrientedDisAndVecFields(
            const std::map<std::pair<int, int>, std::set<std::pair<int, int>>> &intersectBoxPairs);
    __host__ void h_getRotationNumber();
    ~LoopDetection();
 };
 __global__ void
 g_initOrientedDisAndVecFields(SrfMesh *s_srfMesh, SrfMesh *f_srfMesh, int s_cellCnt, glm::vec<2, int> *s_cells,
                              glm::vec<2, int> **f_cells, const int *f_cellCnt, glm::vec<2, int> *selectedPointsIdx,
                              glm::vec2 *vectorFields);
 __global__ void
 g_getRotationNumber(SrfMesh *s_srfMesh, float paramRegionSize_u, float paramRegionSize_v, glm::vec2 *vectorFields,
                    int *rotationNumbers);
 #endif //NURBSPERFORMER_LOOP_DETECTION_CUH
--- a/include/device/Nurbs/nurbs_curve.cuh
+++ b/include/device/Nurbs/nurbs_curve.cuh
@ -1,13 +1,9 @@
 //
 // Created by 14727 on 2022/12/9.
 //
 #ifndef NURBSEVALUATOR_NURBS_CURVE_CUH
 #define NURBSEVALUATOR_NURBS_CURVE_CUH
 #include <cuda_runtime.h>
 #include <vector>
-//#include <map>
+#include <glm/glm.hpp>
 namespace NurbsCurve {
    const int POINT_SIZE = 4;
@ -51,7 +47,7 @@ namespace NurbsCurve {
         * @param sampleCnt_ 在参数域内均匀采样的采样数，它会更新成员变量中的sampleCnt
         * @return 由 map 组成的vector{<u, {x, y, z}>}
         */
-        std::vector<std::vector<float>> evaluate(int sampleCnt_);
+        std::vector<glm::vec3> evaluate(int sampleCnt_);
        /**
         * 供外部CPU程序使用的、负责调用gpu并行计算切向量的方法
--- a/include/device/Nurbs/nurbs_surface.cuh
+++ b/include/device/Nurbs/nurbs_surface.cuh
@ -8,9 +8,16 @@
 #include <vector>
 #include "cuda_runtime.h"
 #include "bvh.cuh"
 #include "glm/glm.hpp"
 typedef std::vector<glm::vec3> LinePoints3;
 typedef std::vector<LinePoints3> MeshPoints3;
 typedef std::vector<std::vector<glm::vec4>> MeshPoints4;
 typedef std::pair<int, int> idx2;
 typedef std::pair<idx2, idx2> boxPair;
 namespace NurbsSurface {
-    const int POINT_SIZE = 4;
+//    const int POINT_SIZE = 4;
    /**
     * 曲线计算的核函数
     * @param d_pointSize 点的大小(3: [x, y, z] | 4：[x, y, z, w])
@ -32,7 +39,7 @@ namespace NurbsSurface {
    class Surface {
    private:
-        std::vector<std::vector<std::vector<float>>> controlPoints;
+        MeshPoints4 controlPoints;
        float *d_points;
        std::vector<float> knots_u;
        std::vector<float> knots_v;
@ -50,7 +57,13 @@ namespace NurbsSurface {
        float *d_normals;
        float *d_k;  // 最大曲率
        __host__ void recursiveGetRayBVHIntersection(glm::vec3 dir, glm::vec3 startPoint, int idx,
                                                     std::vector<BVHNode> &intersectionLeafNodes);
    public:
        float *h_evaluations;
        float *h_derivatives;
        float *h_normals;
        BVH bvh;
        BVH gauss_map;
        /**
@ -59,17 +72,26 @@ namespace NurbsSurface {
         * @param knots_u u方向knots
         * @param knots_v v方向knots
         */
-        __host__ explicit Surface(std::vector<std::vector<std::vector<float>>> controlPoints,
+        __host__ explicit Surface(MeshPoints4 controlPoints, std::vector<float> knots_u, std::vector<float> knots_v);
                                  std::vector<float> knots_u, std::vector<float> knots_v);
        /**
-         * 供外部CPU程序使用的、负责调用gpu并行计算的方法
+         * 供外部CPU程序使用的、负责调用gpu并行计算采样点xyz值的方法
         * @param sampleCnt_u u方向采样数目
         * @param sampleCnt_v v方向采样数目
         * @return 由 map 组成的vector{<<u, v>, {x, y, z}>}
         */
-        __host__ std::vector<std::vector<std::vector<float>>>
+        __host__ std::vector<std::vector<glm::vec3>> getEvaluateVec(int sampleCnt_u, int sampleCnt_v) const;
-        evaluate(int sampleCnt_u_, int sampleCnt_v_);
+
        __host__ std::vector<MeshPoints3> getDerivativeVec(int sampleCnt_u, int sampleCnt_v) const;
        /**
         * 供外部CPU程序使用的、负责调用gpu并行计算采样点xyz值的方法
         * @param sampleCnt_u u方向采样数目
         * @param sampleCnt_v v方向采样数目
         * @return 表示为vertices形式的采样点值数组
         */
        __host__ void evaluate(int sampleCnt_u, int sampleCnt_v);
        /**
         * 供外部CPU程序使用的、负责调用gpu并行计算切向量的方法
@ -84,13 +106,18 @@ namespace NurbsSurface {
        /**
         * 供外部CPU程序使用的、负责调用gpu并行计算BVH的方法
         */
-        __host__ void buildBHV(int layerCnt, bool useK = false);
+        __host__ void buildBVH(int layerCnt, bool useK = false);
        /**
         * 供外部CPU程序使用的、负责调用gpu并行计算Gauss Map Tree的方法
         */
        __host__ void buildGaussMap(int layerCnt);
        /**
         * 供外部CPU程序使用的、负责调用递归函数求解BVH与ray交点的方法
         */
        __host__ std::vector<BVHNode> rayBVHIntersection(glm::vec3 dir, glm::vec3 startPoint);
        void setRecordTime(bool r);
@ -99,9 +126,9 @@ namespace NurbsSurface {
    };
    __host__ void recursiveGetOverlapLeafNodes(const BVH &bvh1, const BVH &bvh2, int idx1, int idx2,
-                                                      std::vector<std::pair<int, int>> &pairs);
+                                               std::vector<std::pair<int, int>> &pairs);
-    __host__ std::vector<std::pair<int, int>> getOverlappedLeafNodes(const BVH &bvh1, const BVH &bvh2);
+    __host__ std::vector<boxPair> getOverlappedLeafNodes(const BVH &bvh1, const BVH &bvh2);
    /**
     * 判断两个曲面的Gauss Map在指定的、各自的参数范围内有没有交
@ -112,8 +139,8 @@ namespace NurbsSurface {
     * @return true:gauss map的包围盒有交集，说明gauss map<可能>有重合；false: gauss map一定没有重合
     */
    __host__ bool isGaussMapsOverlapped(const BVH &gm1, const BVH &gm2, std::pair<int, int> idxRange_u1,
-                                               std::pair<int, int> idxRange_v1, std::pair<int, int> idxRange_u2,
+                                        std::pair<int, int> idxRange_v1, std::pair<int, int> idxRange_u2,
-                                               std::pair<int, int> idxRange_v2);
+                                        std::pair<int, int> idxRange_v2);
    /**
     * 判断两个曲面的Gauss Map在指定的、各自的参数范围内有没有交
     * @param range_u1 第一个gauss map的参数u范围
@ -127,11 +154,11 @@ namespace NurbsSurface {
     * @return true:gauss map的包围盒有交集，说明gauss map<可能>有重合；false: gauss map一定没有重合
     */
    __host__ bool isGaussMapsOverlapped(const BVH &gm1, const BVH &gm2, std::pair<float, float> range_u1,
-                                               std::pair<float, float> range_v1, std::pair<float, float> range_u2,
+                                        std::pair<float, float> range_v1, std::pair<float, float> range_u2,
-                                               std::pair<float, float> range_v2, std::pair<float, float> paramRange_u1,
+                                        std::pair<float, float> range_v2, std::pair<float, float> paramRange_u1,
-                                               std::pair<float, float> paramRange_v1,
+                                        std::pair<float, float> paramRange_v1,
-                                               std::pair<float, float> paramRange_u2,
+                                        std::pair<float, float> paramRange_u2,
-                                               std::pair<float, float> paramRange_v2);
+                                        std::pair<float, float> paramRange_v2);
 }
--- a/include/device/aabb.cuh
+++ b/include/device/aabb.cuh
@ -6,31 +6,25 @@
 #define NURBSEVALUATOR_AABB_CUH
 #include "vec.cuh"
 #include "glm/glm.hpp"
 #include "cstdio"
 class AABB {
 public:
    // 边界
-    FVec3 pMin, pMax;
+    glm::vec3 pMin, pMax;
    __device__ __host__ AABB();
-    __device__ __host__ explicit AABB(const FVec3& p);
+    __device__ __host__ explicit AABB(const glm::vec3& p);
-    __device__ __host__ AABB(const FVec3& p1, const FVec3& p2);
+    __device__ __host__ AABB(const glm::vec3& p1, const glm::vec3& p2);
    // aabb包围盒合并操作，包围盒和点
-    __device__ __host__ AABB Union(const FVec3& p);
+    __device__ __host__ AABB Union(const glm::vec3& p) const;
    // aabb包围盒合并操作，两个包围盒
-    __device__ __host__ AABB Union(const AABB& b2);
+    __device__ __host__ AABB Union(const AABB& b2) const;
    // aabb包围盒沿各坐标轴政府方向分别扩张
    __device__ __host__ void expand(float k);
    // 判断两个aabb包围盒是否重叠
-    __device__ __host__ bool IsOverlap(const AABB& b2);
+    __device__ __host__ bool IsOverlap(const AABB& b2) const;
 };
 struct BVHNode {
    AABB bounds;  // AABB包围盒
    int firstChild = -1, level = 0;  // 第一个孩子节点的下标，该结点所在层次，孩子个数
    // 曲面片u, v的范围
    float u0 = 0., u1 = 0., v0 = 0., v1 = 0.;
 };
 #endif //NURBSEVALUATOR_AABB_CUH
--- a/include/device/device_utils.cuh
+++ b/include/device/device_utils.cuh
@ -5,6 +5,8 @@
 #ifndef NURBSEVALUATOR_DEVICE_UTILS_CUH
 #define NURBSEVALUATOR_DEVICE_UTILS_CUH
 #define D_DBL_MAX 1.7976931348623158e+308
 #define D_FLT_MAX 3.402823466e+38F
 __device__ void d_safeFree(float * &p);
--- a/include/device/srf_mesh.cuh
+++ b/include/device/srf_mesh.cuh
@ -0,0 +1,17 @@
 #ifndef NURBSPERFORMER_SRF_MESH_CUH
 #define NURBSPERFORMER_SRF_MESH_CUH
 #include "glm/glm.hpp"
 class SrfMesh {
 public:
    int edgeSampleCnt;
    glm::vec3 *evaluationRes;
    glm::vec3 *tangent_u;
    glm::vec3 *tangent_v;
    glm::vec3 *normal;
    SrfMesh(glm::vec3 *evalRes, glm::vec3 *tan_u, glm::vec3 *tan_v, glm::vec3 *nor, int edgeSampleCnt);
 };
 #endif //NURBSPERFORMER_SRF_MESH_CUH
--- a/include/utils.h
+++ b/include/utils.h
@ -8,10 +8,13 @@
 #include <ctime>
 double get_time();
 #else
 #include <windows.h>
 #include "device/Nurbs/bvh.cuh"
 #include "device/srf_mesh.cuh"
 double get_time();
 #endif
 /**
@ -19,9 +22,27 @@ double get_time();
 *  注意指针是引用传参，因为要把指针本身置空
 */
 void safeCudaFree(float *&p);
 void safeCudaFree(BVHNode *&p);
-//template<typename T>
+
 template<typename T>
 void safeCudaFree(T *&p) {
    if (p != nullptr) {
        cudaFree(p);
        p = nullptr;
    }
 }
 template<typename T>
 void safeFree(T *&p) {
    if (p != nullptr) {
        free(p);
        p = nullptr;
    }
 }
 void safeFree(float *&p);
 void safeFree(BVHNode *&p);
--- a/src/device/Nurbs/bvh.cu
+++ b/src/device/Nurbs/bvh.cu
@ -2,11 +2,8 @@
 // Created by 14727 on 2022/12/11.
 //
-#include "../../../include/device/Nurbs/bvh.cuh"
+#include "device/Nurbs/bvh.cuh"
-/**
+
 * 当前层的第一个节点的坐标
 * @param layer 层号。根节点为第一层
 */
 __device__ __host__ int getStartIdxOfLayerN(int layer) {
    if (layer == 1) return 0;
    int num = 1;
@ -48,7 +45,7 @@ __host__ int h_getChildNodeIdx(int ix, int iy){
 }
 __global__ void
-buildBvh(const float *k, int level, const float *evaluatedPoints, float lastKnot_u, float lastKnot_v,
+g_buildBvh(const float *k, int level, const float *evaluatedPoints, float lastKnot_u, float lastKnot_v,
                int sampleCnt_u, int sampleCnt_v, BVHNode *BVH) {
    // 假设还是二维grid和二维的block
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
@ -74,22 +71,24 @@ buildBvh(const float *k, int level, const float *evaluatedPoints, float lastKnot
 //        if(idx == 75) printf("75: %g, %g, %g\n", evaluatedPoints[tmpIdx], evaluatedPoints[tmpIdx + 1], evaluatedPoints[tmpIdx + 2]);
        if (step == 1) {
-            AABB bound(FVec3(evaluatedPoints[tmpIdx], evaluatedPoints[tmpIdx + 1],
+            AABB bound(glm::vec3(evaluatedPoints[tmpIdx], evaluatedPoints[tmpIdx + 1],
                             evaluatedPoints[tmpIdx + 2]));  // 最初只包含u，v点
            // 叶子节点
            BVH[idx].u0 = u;
            BVH[idx].u1 = u + step_u * singleStepVal_u;
            BVH[idx].v0 = v;
            BVH[idx].v1 = v + step_v * singleStepVal_v;
            BVH[idx].idx_u = ix;
            BVH[idx].idx_v = iy;
            BVH[idx].level = level;
            int tmpIdx1 = ((ix + 1) * sampleCnt_v + iy) * 3;
            int tmpIdx2 = (ix * sampleCnt_v + iy + 1) * 3;
            int tmpIdx3 = ((ix + 1) * sampleCnt_v + iy + 1) * 3;
-            bound = bound.Union(FVec3(evaluatedPoints[tmpIdx1], evaluatedPoints[tmpIdx1 + 1],
+            bound = bound.Union(glm::vec3(evaluatedPoints[tmpIdx1], evaluatedPoints[tmpIdx1 + 1],
                                      evaluatedPoints[tmpIdx1 + 2]));
-            bound = bound.Union(FVec3(evaluatedPoints[tmpIdx2], evaluatedPoints[tmpIdx2 + 1],
+            bound = bound.Union(glm::vec3(evaluatedPoints[tmpIdx2], evaluatedPoints[tmpIdx2 + 1],
                                      evaluatedPoints[tmpIdx2 + 2]));
-            bound = bound.Union(FVec3(evaluatedPoints[tmpIdx3], evaluatedPoints[tmpIdx3 + 1],
+            bound = bound.Union(glm::vec3(evaluatedPoints[tmpIdx3], evaluatedPoints[tmpIdx3 + 1],
                                      evaluatedPoints[tmpIdx3 + 2]));
            if(k != nullptr) bound.expand(*k);
            BVH[idx].bounds = bound;
--- a/src/device/Nurbs/loop_detection.cu
+++ b/src/device/Nurbs/loop_detection.cu
@ -0,0 +1,229 @@
 #include <vector>
 #include "device/Nurbs/loop_detection.cuh"
 #include "utils.h"
 #include "cuda_runtime.h"
 #include "device/device_utils.cuh"
 LoopDetection::LoopDetection(glm::vec3 *const s_eval, glm::vec3 *const f_eval, glm::vec3 *const s_tan_u,
                             glm::vec3 *const f_tan_u, glm::vec3 *const s_tan_v, glm::vec3 *const f_tan_v,
                             glm::vec3 *const s_norm, glm::vec3 *const f_norm, int edgeSampleCnt,
                             float s_paramRegionSize_u, float s_paramRegionSize_v) :
        edgeSampleCnt(edgeSampleCnt),
        s_paramRegionSize_u(s_paramRegionSize_u),
        s_paramRegionSize_v(s_paramRegionSize_v) {
    auto initSrfMesh = [](SrfMesh *&d_srfMesh, glm::vec3 *eval, glm::vec3 *tan_u, glm::vec3 *tan_v, glm::vec3 *norm,
                          int edgeSampleCnt) {
        auto h_srfMesh = new SrfMesh(eval, tan_u, tan_v, norm, edgeSampleCnt);
        safeCudaFree(d_srfMesh);
        auto srfMeshSize = sizeof(SrfMesh);
        cudaMalloc((void **) &d_srfMesh, srfMeshSize);
        cudaMemcpy(d_srfMesh, h_srfMesh, srfMeshSize, cudaMemcpyHostToDevice);
        safeFree(h_srfMesh);
    };
    initSrfMesh(d_s_srfMesh, s_eval, s_tan_u, s_tan_v, s_norm, edgeSampleCnt);
    initSrfMesh(d_f_srfMesh, f_eval, f_tan_u, f_tan_v, f_norm, edgeSampleCnt);
 }
 LoopDetection::~LoopDetection() {
    safeCudaFree(d_selectedPointsIdx);
    safeCudaFree(d_vectorFields);
    safeCudaFree(d_s_srfMesh);
    safeCudaFree(d_f_srfMesh);
    safeCudaFree(d_rotationNumbers);
 }
 __host__ void LoopDetection::h_initOrientedDisAndVecFields(
        const std::map<std::pair<int, int>, std::set<std::pair<int, int>>> &intersectBoxPairs) {
    int sampleCnt = edgeSampleCnt * edgeSampleCnt;
    safeCudaFree(d_selectedPointsIdx);
    auto selectedPointsIdxVec = std::vector<glm::vec2>(sampleCnt, glm::vec<2, int>(-1, -1));
    auto h_selectedPointsIdx = new glm::vec<2, int>[sampleCnt];
    std::copy(selectedPointsIdxVec.begin(), selectedPointsIdxVec.end(), h_selectedPointsIdx);
    int selectedPtsIdxBytes = sizeof(glm::vec<2, int>) * sampleCnt;
    cudaMalloc((void**)&d_selectedPointsIdx, selectedPtsIdxBytes);
    cudaMemcpy(d_selectedPointsIdx, h_selectedPointsIdx, selectedPtsIdxBytes, cudaMemcpyHostToDevice);
    safeCudaFree(d_vectorFields);
    cudaMalloc((void**)&d_vectorFields, sizeof(glm::vec2) * sampleCnt);
    int mapCnt = intersectBoxPairs.size();
    auto *h_s_cells = new glm::vec<2, int>[mapCnt];
    glm::vec<2, int> *d_s_cells;
    auto **h_f_cells = new glm::vec<2, int> *[mapCnt];
    glm::vec<2, int> **d_f_cells;
    auto *h_f_cellsCnt = new int[mapCnt];
    int *d_f_cellsCnt;
    int i = 0;
    for (const auto &boxPair: intersectBoxPairs) {
        h_s_cells[i] = glm::vec<2, int>(boxPair.first.first, boxPair.first.second);
        h_f_cellsCnt[i] = boxPair.second.size();
        auto h_f_cellsTmp = new glm::vec<2, int>[h_f_cellsCnt[i]];
        int j = 0;
        for (auto box: boxPair.second) {
            h_f_cellsTmp[j] = glm::vec<2, int>(box.first, box.second);
            j++;
        }
        auto f_relatedCellsBytes = sizeof(glm::vec<2, int>) * h_f_cellsCnt[i];
        cudaMalloc((void **) &h_f_cells[i], f_relatedCellsBytes);
        cudaMemcpy(h_f_cells[i], h_f_cellsTmp, f_relatedCellsBytes, cudaMemcpyHostToDevice);
        free(h_f_cellsTmp);
        i++;
    }
    int s_cellsBytes = sizeof(glm::vec<2, int>) * mapCnt;
    cudaMalloc((void **) &d_s_cells, s_cellsBytes);
    cudaMemcpy(d_s_cells, h_s_cells, s_cellsBytes, cudaMemcpyHostToDevice);
    int f_cellsPtrBytes = sizeof(glm::vec<2, int>) * mapCnt;
    cudaMalloc((void **) &d_f_cells, f_cellsPtrBytes);
    cudaMemcpy(d_f_cells, h_f_cells, f_cellsPtrBytes, cudaMemcpyHostToDevice);
    int f_cellsCntBytes = sizeof(int) * mapCnt;
    cudaMalloc((void **) &d_f_cellsCnt, f_cellsCntBytes);
    cudaMemcpy(d_f_cellsCnt, h_f_cellsCnt, f_cellsCntBytes, cudaMemcpyHostToDevice);
    dim3 block(64, 64);
    dim3 grid((mapCnt + block.x * block.y - 1) / (block.x * block.y));
    g_initOrientedDisAndVecFields <<< grid, block >>>(d_s_srfMesh, d_f_srfMesh, mapCnt, d_s_cells, d_f_cells,
                                                      d_f_cellsCnt, d_selectedPointsIdx, d_vectorFields);
    cudaFree(d_s_cells);
    cudaFree(d_f_cells);
    cudaFree(d_f_cellsCnt);
    for (int k = 0; k < mapCnt; k++) {
        cudaFree(h_f_cells[k]);
    }
    free(h_s_cells);
    free(h_f_cells);
    free(h_f_cellsCnt);
    free(h_selectedPointsIdx);
 }
 __host__ void LoopDetection::h_getRotationNumber() {
    safeCudaFree(d_rotationNumbers);
    cudaMalloc((void**)&d_rotationNumbers, sizeof(int) * edgeSampleCnt * edgeSampleCnt);
    dim3 block(32, 32);
    dim3 grid((edgeSampleCnt + block.x - 1) / block.x, (edgeSampleCnt + block.y - 1) / block.y);
    g_getRotationNumber<<<grid, block>>>(d_s_srfMesh, s_paramRegionSize_u, s_paramRegionSize_v, d_vectorFields,
                                  d_rotationNumbers);
 }
 __device__ const int neighborIdxes[4][2] = {{1,  0},
                                            {0,  1},
                                            {-1, 0},
                                            {0,  -1}};
 __device__ const int dirs[2][2][2] = {{{-1, 1},  {1, 1}},
                                      {{-1, -1}, {1, -1}}}; // 必须是左上，右上，左下，右下的顺序
 __global__ void
 g_initOrientedDisAndVecFields(SrfMesh *s_srfMesh, SrfMesh *f_srfMesh, int s_cellCnt, glm::vec<2, int> *s_cells,
                              glm::vec<2, int> **f_cells, const int *f_cellCnt, glm::vec<2, int> *selectedPointsIdx,
                              glm::vec2 *vectorFields) {
 //    // 二维的grid和一维的block
 //    int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x * blockDim.y + threadIdx.x;
    // 一维的grid和二维的block
    int idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
    if (idx >= s_cellCnt) return;
 //    printf("idx: %d\n", idx);
    if (idx == 0) printf("<%d, %d>, %f\n", neighborIdxes[0][0], neighborIdxes[0][1], -D_FLT_MAX);
    int s_edgeSampleCnt = s_srfMesh->edgeSampleCnt;
    int f_edgeSampleCnt = f_srfMesh->edgeSampleCnt;
    for (auto neighborIdx: neighborIdxes) {
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 2; j++) {
                float minDis = -D_FLT_MAX;
                int minDisFIdx_u = -1, minDisFIdx_v = -1;
                int idxI = s_cells[idx].x + i + neighborIdx[0];
                int idxJ = s_cells[idx].y + j + neighborIdx[1];
                if (idxI < 0 || idxI >= s_edgeSampleCnt || idxJ < 0 || idxJ >= s_edgeSampleCnt ||
                    selectedPointsIdx[idxI * s_edgeSampleCnt + idxJ] != glm::vec<2, int>(2, 2)) {
                    // 尽量不重复计算
                    // 但在多线程情况下，这样还是可以能重复计算
                    continue;
                }
                for (int fi = 0; fi < f_cellCnt[idx]; fi++) {
                    for (int k = 0; k < 2; k++) {
                        for (int l = 0; l < 2; l++) {
                            for (int step_u = 0; step_u <= 2; step_u++) {
                                for (int step_v = 0; step_v + step_u <= 2; step_v++) {
                                    int fIdx_u = f_cells[idx][fi].x + k + step_u * dirs[k][l][0];
                                    int fIdx_v = f_cells[idx][fi].y + l + step_v * dirs[k][l][1];
                                    if (fIdx_u < 0 || fIdx_u >= f_edgeSampleCnt || fIdx_v < 0 ||
                                        fIdx_v >= f_edgeSampleCnt)
                                        continue;
                                    auto dis = glm::distance(s_srfMesh->evaluationRes[idxI][idxJ],
                                                             f_srfMesh->evaluationRes[fIdx_u][fIdx_v]);
                                    if (dis < minDis) {
                                        minDis = dis;
                                        minDisFIdx_u = fIdx_u;
                                        minDisFIdx_v = fIdx_v;
                                    }
                                }
                            }
                        }
                    }
                }
                selectedPointsIdx[idxI * s_edgeSampleCnt + idxJ] = glm::vec<2, int>(minDisFIdx_u, minDisFIdx_v);
                auto n2 = f_srfMesh->normal[minDisFIdx_u * f_edgeSampleCnt + minDisFIdx_v];
                auto ps_pu = s_srfMesh->tangent_u[idxI * s_edgeSampleCnt + idxJ];
                auto ps_pv = s_srfMesh->tangent_v[idxI * s_edgeSampleCnt + idxJ];
                vectorFields[idxI * s_edgeSampleCnt + idxJ] = glm::vec2(glm::dot(n2, ps_pu), glm::dot(n2, ps_pv));
            }
        }
    }
 }
 __global__ void
 g_getRotationNumber(SrfMesh *s_srfMesh, float paramRegionSize_u, float paramRegionSize_v, glm::vec2 *vectorFields,
                    int *rotationNumbers) {
    // 二维grid和二维的block
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int s_edgeSampleCnt = s_srfMesh->edgeSampleCnt;
    float F[2][2], G[2][2];
    for (int biasI = 0; biasI < 2; biasI++) {
        for (int biasJ = 0; biasJ < 2; biasJ++) {
            auto idxI = ix + biasI;
            auto idxJ = iy + biasJ;
            auto v = vectorFields[idxI * s_edgeSampleCnt + idxJ];
            auto vSquareSum = v.x * v.x + v.y * v.y;
            auto pTheta_pChi = -v.y / vSquareSum;
            auto pTheta_pPhi = v.x / vSquareSum;
 //                    auto pChi_pu =
 //                            (vectorFields[idxI + 1][idxJ].x - vectorFields[idxI - 1][idxJ].x) / (2 * uParamCellSize);
 //                    auto pChi_pv =
 //                            (vectorFields[idxI][idxJ + 1].x - vectorFields[idxI][idxJ - 1].x) / (2 * vParamCellSize);
            glm::vec2 pV_pu;
            glm::vec2 pV_pv;
            if (idxI == 0)
                pV_pu = (vectorFields[(idxI + 1) * s_edgeSampleCnt + idxJ] -
                         vectorFields[idxI * s_edgeSampleCnt + idxJ]) / paramRegionSize_u;
            else if (idxI == s_edgeSampleCnt - 1)
                pV_pu = (vectorFields[idxI * s_edgeSampleCnt + idxJ] -
                         vectorFields[(idxI - 1) * s_edgeSampleCnt + idxJ]) / paramRegionSize_u;
            else
                pV_pu = (vectorFields[(idxI + 1) * s_edgeSampleCnt + idxJ] -
                         vectorFields[(idxI - 1) * s_edgeSampleCnt + idxJ]) / (2 * paramRegionSize_u);
            if (idxJ == 0)
                pV_pv = (vectorFields[idxI * s_edgeSampleCnt + idxJ + 1] -
                         vectorFields[idxI * s_edgeSampleCnt + idxJ]) / paramRegionSize_v;
            else if (idxJ == s_edgeSampleCnt - 1)
                pV_pv = (vectorFields[idxI * s_edgeSampleCnt + idxJ] -
                         vectorFields[idxI * s_edgeSampleCnt + idxJ - 1]) / paramRegionSize_v;
            else
                pV_pv = (vectorFields[idxI * s_edgeSampleCnt + idxJ + 1] -
                         vectorFields[idxI * s_edgeSampleCnt + idxJ - 1]) / (2 * paramRegionSize_v);
            F[biasI][biasJ] = pTheta_pChi * pV_pu.x + pTheta_pPhi * pV_pu.y;
            G[biasI][biasJ] = pTheta_pChi * pV_pv.x + pTheta_pPhi * pV_pv.y;
        }
    }
    rotationNumbers[ix * s_edgeSampleCnt + iy] = int(
            round(((F[0][1] + F[1][1] - F[0][0] - F[1][0]) * paramRegionSize_u +
                   (G[0][0] + G[0][1] - G[1][0] - G[1][1]) * paramRegionSize_v)) / 2.);
 }
--- a/src/device/Nurbs/nurbs_common.cu
+++ b/src/device/Nurbs/nurbs_common.cu
@ -2,8 +2,8 @@
 // Created by 14727 on 2022/11/19.
 //
-#include "../../../include/device/Nurbs/nurbs_common.cuh"
+#include "device/Nurbs/nurbs_common.cuh"
-#include "../../../include/device/device_utils.cuh"
+#include "device/device_utils.cuh"
 #include <cstdio>
 __device__ void d_basisFunction(float *N_Texture, const float *knots, float u, int degree, int d_knotsCnt) {
--- a/src/device/Nurbs/nurbs_curve.cu
+++ b/src/device/Nurbs/nurbs_curve.cu
@ -1,9 +1,10 @@
 //
 // Created by 14727 on 2022/12/9.
 //
-#include "../../../include/device/Nurbs/nurbs_common.cuh"
+#include "device/Nurbs/nurbs_common.cuh"
-#include "../../../include/device/Nurbs/nurbs_curve.cuh"
+#include "device/Nurbs/nurbs_curve.cuh"
-#include "../../../include/utils.h"
+#include "utils.h"
 #include "device/Nurbs/loop_detection.cuh"
 __global__ void
 NurbsCurve::g_evaluate(float *res, const float *NTexture, const float *d_points, const int d_pointsCnt,
@ -107,7 +108,7 @@ __global__ void NurbsCurve::g_curvature(const float *derivatives, int sampleCnt,
 }
 NurbsCurve::Curve::Curve(std::vector<std::vector<float>> controlPoints,
-                                          std::vector<float> knots) {
+                         std::vector<float> knots) {
    this->knots = std::move(knots);
    this->controlPoints = std::move(controlPoints);
    recordTime = false;
@ -130,7 +131,7 @@ NurbsCurve::Curve::~Curve() {
    cudaDeviceReset();
 }
-std::vector<std::vector<float>>
+std::vector<glm::vec3>
 NurbsCurve::Curve::evaluate(int sampleCnt) {
    if (POINT_SIZE != controlPoints[0].size()) {
@ -166,13 +167,14 @@ NurbsCurve::Curve::evaluate(int sampleCnt) {
    // 分配nTexture1的内存。只需要GPU内存
    safeCudaFree(d_nTexture1);  // 注意内存管理
-    cudaMalloc((void **) &d_nTexture1, sampleCnt * (pointsCnt + 1) * sizeof(float)); // 注意nTexture的大小，在算梯度时用得到i=pointsCnt + 1的基函数值
+    cudaMalloc((void **) &d_nTexture1,
               sampleCnt * (pointsCnt + 1) * sizeof(float)); // 注意nTexture的大小，在算梯度时用得到i=pointsCnt + 1的基函数值
    // 结果数组
    size_t resBytes = sampleCnt * 3 * sizeof(float);
    float *d_res;
-    cudaMalloc((void **)&d_res, resBytes);
+    cudaMalloc((void **) &d_res, resBytes);
-    auto *h_res = (float *)malloc(resBytes);
+    auto *h_res = (float *) malloc(resBytes);
    // 构造g_basisTexture线程层级
@ -198,19 +200,27 @@ NurbsCurve::Curve::evaluate(int sampleCnt) {
        printf("GPU time cost of curve evaluation for %d samples: %lf\n", sampleCnt, time_cost_device);
    }
    // TODO 应当有一个evaluate函数单独地只用于计算d_res
    // TODO 因为不是所有调用evaluate的操作都需要一个CPU上的结果，因此这一部分数据拷贝可能是多余的
    cudaMemcpy(h_res, d_res, resBytes, cudaMemcpyDeviceToHost);
-    std::vector<std::vector<float>> res(sampleCnt, std::vector<float>(3, 0.));
+    std::vector<glm::vec3> res(sampleCnt, glm::vec3());
-    for(int i = 0; i < sampleCnt; i++) {
+    for (int i = 0; i < sampleCnt; i++) {
        int baseIdx = i * 3;
-        res[i][0] = h_res[baseIdx];
+        res[i].x = h_res[baseIdx];
-        res[i][1] = h_res[baseIdx + 1];
+        res[i].y = h_res[baseIdx + 1];
-        res[i][2] = h_res[baseIdx + 2];
+        res[i].z = h_res[baseIdx + 2];
    }
    safeFree(h_points);
    safeFree(h_knots);
    safeCudaFree(d_res);
    safeFree(h_res);
    int cnt = 68;
    dim3 block1(8, 8);
    dim3 grid1((cnt + block1.x * block1.y - 1) / (block1.x * block1.y));
 //    g_initOrientedDisAndVecFields <<< grid1, block1 >>>(nullptr, nullptr, cnt, nullptr, nullptr, nullptr, nullptr, nullptr);
    return res;
 }
--- a/src/device/Nurbs/nurbs_surface.cu
+++ b/src/device/Nurbs/nurbs_surface.cu
@ -7,6 +7,7 @@
 #include "utils.h"
 #include "device/Nurbs/bvh.cuh"
 #include "device/device_utils.cuh"
 #include "tinynurbs/tinynurbs.h"
 __global__ void
 NurbsSurface::g_evaluate(float *res, const float *d_nTexture_u, const float *d_nTexture_v, const float *d_points,
@ -266,8 +267,8 @@ NurbsSurface::g_curvature(const float *derivatives, const int sampleCnt_u, const
 }
-__host__ NurbsSurface::Surface::Surface(std::vector<std::vector<std::vector<float>>> controlPoints,
+__host__ NurbsSurface::Surface::Surface(MeshPoints4 controlPoints, std::vector<float> knots_u,
-                                        std::vector<float> knots_u, std::vector<float> knots_v) {
+                                        std::vector<float> knots_v) {
    this->knots_u = std::move(knots_u);
    this->knots_v = std::move(knots_v);
    this->controlPoints = std::move(controlPoints);
@ -284,24 +285,22 @@ __host__ NurbsSurface::Surface::Surface(std::vector<std::vector<std::vector<floa
    d_k = nullptr;
    d_normals = nullptr;
    bvh.nodes = nullptr;
    h_evaluations = nullptr;
    h_derivatives = nullptr;
    h_normals = nullptr;
 }
-__host__ std::vector<std::vector<std::vector<float>>>
+__host__ void NurbsSurface::Surface::evaluate(int sampleCnt_u, int sampleCnt_v) {
 NurbsSurface::Surface::evaluate(int sampleCnt_u, int sampleCnt_v) {
    if (POINT_SIZE != controlPoints[0][0].size()) {
        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
        return {};
    }
    // 构造指向device的controlPoints
    const int pointsCnt_u = controlPoints.size(), pointsCnt_v = controlPoints[0].size();
-    const int pointsBytes = pointsCnt_u * pointsCnt_v * POINT_SIZE * sizeof(float);
+    const int pointsBytes = pointsCnt_u * pointsCnt_v * sizeof(glm::vec4);
    auto *h_points = (float *) malloc(pointsBytes);
    for (int i = 0; i < pointsCnt_u; i++) {
        for (int j = 0; j < pointsCnt_v; j++) {
            for (int k = 0; k < POINT_SIZE; k++) {
                h_points[(i * pointsCnt_v + j) * POINT_SIZE + k] = controlPoints[i][j][k];
            }
 //            printf("%f/ %f/ %f/ %f     ", controlPoints[i][j][0], controlPoints[i][j][1], controlPoints[i][j][2], controlPoints[i][j][3]);
        }
    }
    cudaMalloc((void **) &d_points, pointsBytes);
@ -314,6 +313,8 @@ NurbsSurface::Surface::evaluate(int sampleCnt_u, int sampleCnt_v) {
    for (int i = 0; i < knotsCnt_u; i++) h_knots_u[i] = knots_u[i];
    for (int i = 0; i < knotsCnt_v; i++) h_knots_v[i] = knots_v[i];
    safeCudaFree(d_knots_u);
    safeCudaFree(d_knots_v);
    cudaMalloc((void **) &d_knots_u, knotsBytes_u);
    cudaMalloc((void **) &d_knots_v, knotsBytes_v);
    cudaMemcpy(d_knots_u, h_knots_u, knotsBytes_u, cudaMemcpyHostToDevice);
@ -330,10 +331,10 @@ NurbsSurface::Surface::evaluate(int sampleCnt_u, int sampleCnt_v) {
    // 结果数组
    size_t resBytes = sampleCnt_u * sampleCnt_v * 3 * sizeof(float);
-
+    safeCudaFree(d_evaluationRes);
    cudaMalloc((void **) &d_evaluationRes, resBytes);
-    auto *h_res = (float *) malloc(resBytes);
+    safeFree(h_evaluations);
-
+    h_evaluations = (float *) malloc(resBytes);
    // 构造g_basisTexture线程层级
    dim3 blockBasis(512);
@ -354,8 +355,8 @@ NurbsSurface::Surface::evaluate(int sampleCnt_u, int sampleCnt_v) {
    cudaDeviceSynchronize();
    g_evaluate <<<grid, block>>>(d_evaluationRes, d_nTexture_u, d_nTexture_v, d_points, pointsCnt_u, pointsCnt_v,
-                                 POINT_SIZE,
+                                 POINT_SIZE, knots_u[knotsCnt_u - 1], knots_v[knotsCnt_v - 1], sampleCnt_u,
-                                 knots_u[knotsCnt_u - 1], knots_v[knotsCnt_v - 1], sampleCnt_u, sampleCnt_v);
+                                 sampleCnt_v);
    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
    if (recordTime) {
        time_cost_device = get_time() - time_cost_device;
@ -363,34 +364,58 @@ NurbsSurface::Surface::evaluate(int sampleCnt_u, int sampleCnt_v) {
               time_cost_device);
    }
-    cudaMemcpy(h_res, d_evaluationRes, resBytes, cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_evaluations, d_evaluationRes, resBytes, cudaMemcpyDeviceToHost);
-    std::vector<std::vector<std::vector<float>>> res(sampleCnt_u, std::vector<std::vector<float>>(sampleCnt_v,
+
-                                                                                                  std::vector<float>(3,
+
-                                                                                                                     0.)));
+    // 释放内存
    safeFree(h_points);
    safeFree(h_knots_u);
    safeFree(h_knots_v);
 }
 __host__ std::vector<std::vector<glm::vec3>>
 NurbsSurface::Surface::getEvaluateVec(int sampleCnt_u, int sampleCnt_v) const {
    std::vector<std::vector<glm::vec3>> res(sampleCnt_u, std::vector<glm::vec3>(sampleCnt_v, glm::vec3()));
    for (int i = 0; i < sampleCnt_u; i++) {
        int baseIdx = i * sampleCnt_v * 3;
        for (int j = 0; j < sampleCnt_v; j++) {
            baseIdx += j * 3;
-            res[i][j][0] = h_res[baseIdx];
+            res[i][j].x = h_evaluations[baseIdx];
-            res[i][j][1] = h_res[baseIdx + 1];
+            res[i][j].y = h_evaluations[baseIdx + 1];
-            res[i][j][2] = h_res[baseIdx + 2];
+            res[i][j].z = h_evaluations[baseIdx + 2];
 //            printf("%d, %d: %f, %f, %f\n", i, j, res[i][j][0], res[i][j][1], res[i][j][2]);
        }
    }
    // 释放内存
    safeFree(h_points);
    safeFree(h_knots_u);
    safeFree(h_knots_v);
    safeFree(h_res);
    return res;
 }
-__host__ void NurbsSurface::Surface::derivative(int sampleCnt_u, int sampleCnt_v) {
+__host__ std::vector<MeshPoints3> NurbsSurface::Surface::getDerivativeVec(int sampleCnt_u, int sampleCnt_v) const {
-    // 先完成evaluation
+    MeshPoints3 der_u(sampleCnt_u, LinePoints3(sampleCnt_v));
-    evaluate(sampleCnt_u, sampleCnt_v);
+    MeshPoints3 der_v(sampleCnt_u, LinePoints3(sampleCnt_v));
    MeshPoints3 normal(sampleCnt_u, LinePoints3(sampleCnt_v));
    for (int i = 0; i < sampleCnt_u; i++) {
        int baseIdx = i * sampleCnt_v * 6;
        for (int j = 0; j < sampleCnt_v; j++) {
            baseIdx += j * 6;
            der_u[i][j].x = h_derivatives[baseIdx];
            der_u[i][j].y = h_derivatives[baseIdx + 1];
            der_u[i][j].z = h_derivatives[baseIdx + 2];
            der_v[i][j].x = h_derivatives[baseIdx + 3];
            der_v[i][j].y = h_derivatives[baseIdx + 4];
            der_v[i][j].z = h_derivatives[baseIdx + 5];
            auto baseIdxNorm = baseIdx / 2;
            normal[i][j].x = h_normals[baseIdxNorm];
            normal[i][j].y = h_normals[baseIdxNorm + 1];
            normal[i][j].z = h_normals[baseIdxNorm + 2];
            // TODO normalize 归一化在gpu中实现！
            normal[i][j] = glm::normalize(normal[i][j]);
        }
    }
    return {der_u, der_v, normal};
 }
 __host__ void NurbsSurface::Surface::derivative(int sampleCnt_u, int sampleCnt_v) {
    if (POINT_SIZE != controlPoints[0][0].size()) {
        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
        return;
@ -405,13 +430,13 @@ __host__ void NurbsSurface::Surface::derivative(int sampleCnt_u, int sampleCnt_v
    // 构造切向量计算结果
    safeCudaFree(d_derivatives);
-    cudaMalloc((void **) &d_derivatives,
+    size_t derBytes = sampleCnt_u * sampleCnt_v * 6 * sizeof(float);
-               sampleCnt_v * sampleCnt_u * 6 * sizeof(float)); // 每个采样所求的切向量是一个六元向量，前三位是对u的偏导、后三位是对v的偏导
+    cudaMalloc((void **) &d_derivatives, derBytes); // 每个采样所求的切向量是一个六元向量，前三位是对u的偏导、后三位是对v的偏导
    // 构造法向量计算结果
    safeCudaFree(d_normals);
-    cudaMalloc((void **) &d_normals,
+    size_t normalBytes = sampleCnt_u * sampleCnt_v * 3 * sizeof(float);
-               sampleCnt_v * sampleCnt_u * 3 * sizeof(float));
+    cudaMalloc((void **) &d_normals, normalBytes);
    // 构造线程层级
    dim3 block(32, 32);
@ -437,13 +462,22 @@ __host__ void NurbsSurface::Surface::derivative(int sampleCnt_u, int sampleCnt_v
        printf("GPU time cost of surface first derivative calculating for %d samples: %lf\n", sampleCnt_u * sampleCnt_v,
               time_cost_device);
    }
    // 结果数组
    safeFree(h_normals);
    h_normals = (float *) malloc(normalBytes);
    cudaMemcpy(h_normals, d_normals, normalBytes, cudaMemcpyDeviceToHost);
    safeFree(h_derivatives);
    h_derivatives = (float *) malloc(derBytes);
    cudaMemcpy(h_derivatives, d_derivatives, derBytes, cudaMemcpyDeviceToHost);
    cudaFree(d_derTexture_u);
    cudaFree(d_derTexture_v);
 }
 __host__ void NurbsSurface::Surface::curvature(int sampleCnt_u, int sampleCnt_v) {
-    // 先计算切向量
+
    derivative(sampleCnt_u, sampleCnt_v);
    if (POINT_SIZE != controlPoints[0][0].size()) {
        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
        return;
@ -491,17 +525,18 @@ NurbsSurface::Surface::~Surface() {
    safeCudaFree(d_normals);
    safeCudaFree(d_derivatives);
    safeFree(bvh.nodes);
    safeFree(h_evaluations);
    safeFree(h_normals);
    safeFree(h_derivatives);
    cudaDeviceReset();
 }
-__host__ void NurbsSurface::Surface::buildBHV(int layerCnt, bool useK) {
+__host__ void NurbsSurface::Surface::buildBVH(int layerCnt, bool useK) {
    // TODO 构造BVH的函数不应该出现在NURBS Surface中，应该是BVH类的事情！
    // TODO NURBS Surface只需要一个函数去调用BVH对象的方法即可
    int sampleCnt_u = pow(2, layerCnt - 1) + 1, sampleCnt_v = sampleCnt_u;
-    if (useK) {
+    if (!useK) {
-        // 先计算曲率，过程中也会计算曲面值
+        // 必须safeFree一下，这样global函数中才能通过d_k = nullptr知道不需要再free
        curvature(sampleCnt_u, sampleCnt_v);
    } else {
        // 只做evaluation
        evaluate(sampleCnt_u, sampleCnt_v);
        safeCudaFree(d_k);
    }
    if (POINT_SIZE != controlPoints[0][0].size()) {
@ -521,14 +556,15 @@ __host__ void NurbsSurface::Surface::buildBHV(int layerCnt, bool useK) {
    // 记录用时
    double time_cost_device;
    if (recordTime) time_cost_device = get_time();
-    buildBvh<<<grid, block>>>(d_k, bvh.maxLevel, d_evaluationRes, knots_u[knots_u.size() - 1],
+    g_buildBvh<<<grid, block>>>(d_k, bvh.maxLevel, d_evaluationRes, knots_u[knots_u.size() - 1],
-                              knots_v[knots_v.size() - 1], sampleCnt_u, sampleCnt_v, d_bvh);
+                                knots_v[knots_v.size() - 1], sampleCnt_u, sampleCnt_v, d_bvh);
    cudaDeviceSynchronize();
    if (recordTime) {
        time_cost_device = get_time() - time_cost_device;
        printf("GPU time cost of a %d-layer BVH building: %lf\n",
               bvh.maxLevel, time_cost_device);
    }
    // 将bvh拷贝到cpu中
    safeFree(bvh.nodes);
    bvh.nodes = (BVHNode *) malloc(bvhBytes);
    cudaMemcpy(bvh.nodes, d_bvh, bvhBytes, cudaMemcpyDeviceToHost);
@ -537,9 +573,9 @@ __host__ void NurbsSurface::Surface::buildBHV(int layerCnt, bool useK) {
 }
 __host__ void NurbsSurface::Surface::buildGaussMap(int layerCnt) {
    // TODO，构造GAUSS Map的函数不应该出现在NURBS Surface中，应该是GAUSS Map类的事情！
    int sampleCnt_u = pow(2, layerCnt - 1) + 1, sampleCnt_v = sampleCnt_u;
-    // 先计算法向量
+
    derivative(sampleCnt_u, sampleCnt_v);
    if (POINT_SIZE != controlPoints[0][0].size()) {
        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
        return;
@ -556,8 +592,8 @@ __host__ void NurbsSurface::Surface::buildGaussMap(int layerCnt) {
    // 记录用时
    double time_cost_device;
    if (recordTime) time_cost_device = get_time();
-    buildBvh<<<grid, block>>>(nullptr, layerCnt, d_normals, knots_u[knots_u.size() - 1],
+    g_buildBvh<<<grid, block>>>(nullptr, layerCnt, d_normals, knots_u[knots_u.size() - 1],
-                              knots_v[knots_v.size() - 1], sampleCnt_u, sampleCnt_v, d_gaussMapTree);
+                                knots_v[knots_v.size() - 1], sampleCnt_u, sampleCnt_v, d_gaussMapTree);
    cudaDeviceSynchronize();
    if (recordTime) {
        time_cost_device = get_time() - time_cost_device;
@ -603,16 +639,24 @@ __host__ void NurbsSurface::recursiveGetOverlapLeafNodes(const BVH &bvh1, const
    }
 }
-__host__ std::vector<std::pair<int, int>>
+
 __host__ std::vector<boxPair>
 NurbsSurface::getOverlappedLeafNodes(const BVH &bvh1, const BVH &bvh2) {
-    std::vector<std::pair<int, int>> resPairs;
+    std::vector<idx2> resPairs;
    // 记录用时
    double time_cost_device = get_time();
    recursiveGetOverlapLeafNodes(bvh1, bvh2, 0, 0, resPairs);
    std::vector<boxPair> boxPairsIdx2(resPairs.size());
    for (int i = 0; i < resPairs.size(); i++) {
        boxPairsIdx2[i] = {{bvh1.nodes[resPairs[i].first].idx_u,  bvh1.nodes[resPairs[i].first].idx_v},
                           {bvh2.nodes[resPairs[i].second].idx_u, bvh2.nodes[resPairs[i].second].idx_v}};
    }
    time_cost_device = get_time() - time_cost_device;
    printf("CPU time cost for recursively calculating the overlapped leaf nodes: %lf\n", time_cost_device);
-    return resPairs;
+    return boxPairsIdx2;
 }
 __host__ bool
@ -631,8 +675,10 @@ NurbsSurface::isGaussMapsOverlapped(const BVH &gm1, const BVH &gm2, std::pair<in
        printf("Error when detecting overlapping: idx range invalid!\n");
        return false;
    }
    auto getRangedBox = [&commonMaxLayer](const BVH &bvh, const std::pair<int, int> &idxRange_u,
                                          const std::pair<int, int> idxRange_v) {
        // 获取某个范围的gauss map的aabb
        AABB bounding;
        for (int i = idxRange_u.first; i <= idxRange_u.second; ++i) {
            for (int j = idxRange_v.first; j <= idxRange_v.second; ++j) {
@ -670,3 +716,69 @@ __host__ bool NurbsSurface::isGaussMapsOverlapped(const BVH &gm1, const BVH &gm2
    auto idxRange_v2 = getIdxRange(range_v2, paramRange_v2, edgeCellCnt);
    return isGaussMapsOverlapped(gm1, gm2, idxRange_u1, idxRange_v1, idxRange_u2, idxRange_v2);
 }
 __host__ void
 NurbsSurface::Surface::recursiveGetRayBVHIntersection(const glm::vec3 dir, const glm::vec3 startPoint, const int idx,
                                                      std::vector<BVHNode> &intersectionLeafNodes) {
    auto bvhNode = bvh.nodes[idx];
    // 射线与AABB判交
    auto isRayBoxIntersect = [&]() {
        const auto &box = bvhNode.bounds;
        float t; // 射线的参数，当t<=0，表示线面交点不在视平面前方，视为没有交点
        if (dir.x != 0.) {
            // 当x分量不为0，则射线会与AABB中垂直于x轴的平面可能有交
            // 注意x的正负。x为正会先遇到较小点所在平面
            if (dir.x > 0) t = (box.pMin.x - startPoint.x) / dir.x;
            else t = (box.pMax.x - startPoint.x) / dir.x;
            if (t > 0.) {
                // 射线与平面在前方有交点。但交点不一定在盒子上
                auto tmpPt = startPoint + t * dir; // 交点
                if (box.pMin.y <= tmpPt.y && box.pMin.z <= tmpPt.z && box.pMax.y >= tmpPt.y && box.pMax.z >= tmpPt.z)
                    return true;
            }
        }
        if (dir.y != 0.) {
            // 同上测试y方向
            if (dir.y > 0) t = (box.pMin.y - startPoint.y) / dir.y;
            else t = (box.pMax.y - startPoint.y) / dir.y;
            if (t > 0.) {
                auto tmpPt = startPoint + t * dir;
                if (box.pMin.x <= tmpPt.x && box.pMin.z <= tmpPt.z && box.pMax.x >= tmpPt.x && box.pMax.z >= tmpPt.z)
                    return true;
            }
        }
        if (dir.z != 0.) {
            // 同上测试z方向
            if (dir.z > 0) t = (box.pMin.z - startPoint.z) / dir.z;
            else t = (box.pMax.z - startPoint.z) / dir.z;
            if (t > 0.) {
                auto tmpPt = startPoint + t * dir;
                if (box.pMin.x <= tmpPt.x && box.pMin.y <= tmpPt.y && box.pMax.x >= tmpPt.x && box.pMax.y >= tmpPt.y)
                    return true;
            }
        }
        return false;
    };
    // 不相交
    if (!isRayBoxIntersect()) return;
    // 相交
    if (bvhNode.firstChild == -1) {
        // 与叶节点相交
        intersectionLeafNodes.emplace_back(bvhNode);
    } else {
        // 与父节点相交
        for (int i = 0; i < 4; i++)
            recursiveGetRayBVHIntersection(dir, startPoint, bvhNode.firstChild + i, intersectionLeafNodes);
    }
 }
 __host__ std::vector<BVHNode> NurbsSurface::Surface::rayBVHIntersection(glm::vec3 dir, glm::vec3 startPoint) {
    std::vector<BVHNode> res;
    recursiveGetRayBVHIntersection(dir, startPoint, 0, res);
    //TODO sort res by t
 //    res.emplace_back(BVHNode());
 //    printf("res size: %lld\n", res.size());
    return res;
 }
--- a/src/device/aabb.cu
+++ b/src/device/aabb.cu
@ -2,36 +2,34 @@
 // Created by 14727 on 2022/12/11.
 //
-#include "../../include/device/aabb.cuh"
+#include "device/aabb.cuh"
-
+#include "device/device_utils.cuh"
 #define D_DBL_MAX 1.7976931348623158e+308
 #define D_FLT_MAX 3.402823466e+38F
 __device__ __host__ AABB::AABB() {
    float minNum = -D_FLT_MAX;
    float maxNum = D_FLT_MAX;
-    pMax = FVec3(minNum, minNum, minNum);
+    pMax = glm::vec3(minNum, minNum, minNum);
-    pMin = FVec3(maxNum, maxNum, maxNum);
+    pMin = glm::vec3(maxNum, maxNum, maxNum);
 }
-__device__ __host__ AABB::AABB(const FVec3 &p) : pMin(p), pMax(p) {}
+__device__ __host__ AABB::AABB(const glm::vec3 &p) : pMin(p), pMax(p) {}
-__device__ __host__ AABB::AABB(const FVec3 &p1, const FVec3 &p2) {
+__device__ __host__ AABB::AABB(const glm::vec3 &p1, const glm::vec3 &p2) {
-    pMin = FVec3(fmin(p1.x, p2.x), fmin(p1.y, p2.y), fmin(p1.z, p2.z));
+    pMin = glm::vec3(fmin(p1.x, p2.x), fmin(p1.y, p2.y), fmin(p1.z, p2.z));
-    pMax = FVec3(fmax(p1.x, p2.x), fmax(p1.y, p2.y), fmax(p1.z, p2.z));
+    pMax = glm::vec3(fmax(p1.x, p2.x), fmax(p1.y, p2.y), fmax(p1.z, p2.z));
 }
-__device__ __host__ AABB AABB::Union(const FVec3 &p) {
+__device__ __host__ AABB AABB::Union(const glm::vec3 &p) const {
-    return {FVec3(fmin(pMin.x, p.x), fmin(pMin.y, p.y), fmin(pMin.z, p.z)),
+    return {glm::vec3(fmin(pMin.x, p.x), fmin(pMin.y, p.y), fmin(pMin.z, p.z)),
-            FVec3(fmax(pMax.x, p.x), fmax(pMax.y, p.y), fmax(pMax.z, p.z))};
+            glm::vec3(fmax(pMax.x, p.x), fmax(pMax.y, p.y), fmax(pMax.z, p.z))};
 }
-__device__ __host__ AABB AABB::Union(const AABB &b2) {
+__device__ __host__ AABB AABB::Union(const AABB &b2) const {
-    return {FVec3(fmin(pMin.x, b2.pMin.x), fmin(pMin.y, b2.pMin.y), fmin(pMin.z, b2.pMin.z)),
+    return {glm::vec3(fmin(pMin.x, b2.pMin.x), fmin(pMin.y, b2.pMin.y), fmin(pMin.z, b2.pMin.z)),
-            FVec3(fmax(pMax.x, b2.pMax.x), fmax(pMax.y, b2.pMax.y), fmax(pMax.z, b2.pMax.z))};
+            glm::vec3(fmax(pMax.x, b2.pMax.x), fmax(pMax.y, b2.pMax.y), fmax(pMax.z, b2.pMax.z))};
 }
-__device__ __host__ bool AABB::IsOverlap(const AABB& b2) {
+__device__ __host__ bool AABB::IsOverlap(const AABB &b2) const {
    if (pMin.x > b2.pMax.x) return false;
    if (b2.pMin.x > pMax.x) return false;
    if (pMin.y > b2.pMax.y) return false;
--- a/src/device/device_utils.cu
+++ b/src/device/device_utils.cu
@ -2,7 +2,7 @@
 // Created by 14727 on 2022/12/9.
 //
-#include "../../include/device/device_utils.cuh"
+#include "device/device_utils.cuh"
 __device__ void d_safeFree(float *&p) {
    if (p != nullptr) {
--- a/src/device/srf_mesh.cu
+++ b/src/device/srf_mesh.cu
@ -0,0 +1,4 @@
 #include "device/srf_mesh.cuh"
 SrfMesh::SrfMesh(glm::vec3 *evalRes, glm::vec3 *tan_u, glm::vec3 *tan_v, glm::vec3 *nor, int edgeSampleCnt) :
        evaluationRes(evalRes), tangent_u(tan_u), tangent_v(tan_v), normal(nor), edgeSampleCnt(edgeSampleCnt) {}
--- a/src/device/vec.cu
+++ b/src/device/vec.cu
@ -2,7 +2,7 @@
 // Created by 14727 on 2022/12/11.
 //
-#include "../../include/device/vec.cuh"
+#include "device/vec.cuh"
 __device__ __host__ FVec3::FVec3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
--- a/src/utils.cpp
+++ b/src/utils.cpp
@ -1,4 +1,4 @@
-#include "../include/utils.h"
+#include "utils.h"
 #include "cuda_runtime.h"
 #if IN_UNIX
--- a/tests/main.cpp
+++ b/tests/main.cpp
@ -1,6 +1,7 @@
 #include <cstdio>
-#include "../include/device/Nurbs/nurbs_curve.cuh"
+#include "device/Nurbs/nurbs_curve.cuh"
-#include "../include/device/Nurbs/nurbs_surface.cuh"
+#include "device/Nurbs/nurbs_surface.cuh"
 #include "device/Nurbs/loop_detection.cuh"
 int main() {
 //    NurbsSurface::Surface nurbsSurfaceEvaluator({
@ -28,28 +29,50 @@ int main() {
 //                                                        {{-0.61, 0.22,  0.09, 1},  {-0.21, 0.20,  0.09, 1},  {0.13, 0.20,  0.12, 1},  {0.49, 0.19,  -0.03, 1}},
 //                                                        {{-0.52, 0.51,  0.13, 1},  {-0.20, 0.32,  0.36, 1},  {0.18, 0.29,  0.28, 1},  {0.51, 0.53,  0.12, 1}}
 //                                                }, {0, 0, 0, 0, 1, 1, 1, 1}, {0, 0, 0, 0, 1, 1, 1, 1});
 //    NurbsSurface::Surface nurbsSurfaceEvaluator({
 //                                                        {{2, -2,  -2, 1}, {-2.5, -2.2, -1.5, 3}, {-2, -2,  -0.5, 4}, {-2, -2,  1.5, 2}},
 //                                                        {{2, -1,  -2, 2}, {-2.5, -1.2, -1.5, 1}, {-2, -1,  -0.5, 1}, {-2, -1,  1.5, 1}},
 //                                                        {{3, 1.2, -2, 9}, {-2.5, 1.2,  -1.5, 1}, {-2, 1.5, -0.5, 7}, {-2, 1.5, 1.5, 4}},
 //                                                        {{2, 2,   -2, 1}, {-2.5, 2,    -1.5, 1}, {-2, 2.5, -0.5, 7}, {-2, 2,   1.5, 1}}
 //                                                }, {0, 0, 0, 0, 1, 1, 1, 1}, {0, 0, 0, 0, 1, 1, 1, 1});
 //    NurbsSurface::Surface nurbsSurfaceEvaluator({
 //                                                               {{-0.5, -0.5, -0.2, 1}, {-0.2, -0.5, -0.1, 1}, {0.2, -0.5, -0.1, 1}, {0.5, -0.5, -0.2, 1}},
 //                                                               {{-0.5, -0.2, -0.1, 1}, {-0.2, -0.2, 0., 1},   {0.2, -0.2, 0., 1},   {0.5, -0.2, -0.1, 1}},
 //                                                               {{-0.5, 0.2, -0.1, 1},  {-0.2, 0.2, 0., 1},    {0.2, 0.2, 0., 1},    {0.5, 0.2, -0.1, 1}},
 //                                                               {{-0.5, 0.5, -0.2, 1},  {-0.2, 0.5, -0.1, 1},  {0.2, 0.5, -0.1, 1},  {0.5, 0.5, -0.2, 1}}
 //                                                       }, {0, 0, 0, 0, 1, 1, 1, 1}, {0, 0, 0, 0, 1, 1, 1, 1});
    NurbsSurface::Surface nurbsSurfaceEvaluator({
-                                                        {{2, -2,  -2, 1}, {-2.5, -2.2, -1.5, 3}, {-2, -2,  -0.5, 4}, {-2, -2,  1.5, 2}},
+                                                        {{0,   0, 0.2,  1.1}, {0,   0.2, 0.3,  0.9}, {0,   0.4, 0.25, 1.1}, {0,   0.6, 0.12,  0.9}, {0,   0.8, 0.22,  1},   {0,   1, 0.12, 1}},
-                                                        {{2, -1,  -2, 2}, {-2.5, -1.2, -1.5, 1}, {-2, -1,  -0.5, 1}, {-2, -1,  1.5, 1}},
+                                                        {{0.2, 0, 0.22, 1},   {0.2, 0.2, -0.3, 1},   {0.2, 0.4, 0.4,  1},   {0.2, 0.6, -0.1,  1.1}, {0.2, 0.8, -0.1,  1},   {0.2, 1, 0.2,  0.9}},
-                                                        {{3, 1.2, -2, 9}, {-2.5, 1.2,  -1.5, 1}, {-2, 1.5, -0.5, 7}, {-2, 1.5, 1.5, 4}},
+                                                        {{0.4, 0, 0.18, 0.9}, {0.4, 0.2, -0.3, 1},   {0.4, 0.4, 0.4,  1.1}, {0.4, 0.6, -0.1,  1},   {0.4, 0.8, -0.1,  0.9}, {0.4, 1, 0.23, 1}},
-                                                        {{2, 2,   -2, 1}, {-2.5, 2,    -1.5, 1}, {-2, 2.5, -0.5, 7}, {-2, 2,   1.5, 1}}
+                                                        {{0.6, 0, 0.2,  0.9}, {0.6, 0.2, 0.4,  0.9}, {0.6, 0.4, 0.4,  1},   {0.6, 0.6, 0.4,   0.9}, {0.6, 0.8, 0.4,   1},   {0.6, 1, 0.2,  1}},
-                                                }, {0, 0, 0, 0, 1, 1, 1, 1}, {0, 0, 0, 0, 1, 1, 1, 1});
+                                                        {{0.8, 0, 0.19, 1.1}, {0.8, 0.2, -0.4, 1},   {0.8, 0.4, 0.4,  1.1}, {0.8, 0.6, -0.27, 1},   {0.8, 0.8, -0.27, 1},   {0.8, 1, 0.2,  0.9}},
                                                        {{1,   0, 0.2,  1.1}, {1,   0.2, 0.12, 1},   {1,   0.4, 0.22, 1},   {1,   0.6, 0.32,  1},   {1,   0.8, 0.22,  1.2}, {1,   1, 0.12, 1.1}}
                                                }, {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}, {0, 0, 0, 0, 0, 1, 1, 1, 1, 1});
    nurbsSurfaceEvaluator.setRecordTime(true);
-    nurbsSurfaceEvaluator.buildGaussMap(9); // 构建曲面的Gauss map
+//    nurbsSurfaceEvaluator.buildGaussMap(6); // 构建曲面的Gauss map
 //    nurbsSurfaceEvaluator.gauss_map.printQuadTree(); // 打印整棵Gauss map树
-    auto overlappedLeafNodes = NurbsSurface::getOverlappedLeafNodes(nurbsSurfaceEvaluator.gauss_map,
+//    auto overlappedLeafNodes = NurbsSurface::getOverlappedLeafNodes(nurbsSurfaceEvaluator.gauss_map,
-                                                                    nurbsSurfaceEvaluator.gauss_map);
+//                                                                    nurbsSurfaceEvaluator.gauss_map);
-    printf("overlapped leafNodes cnt: %llu\n", overlappedLeafNodes.size());
+//    printf("overlapped leafNodes cnt: %llu\n", overlappedLeafNodes.size());
    //判定两个Surface构造的GaussMap在指定u、v范围内是否有重合
-    auto isRegionallyOverlapped = NurbsSurface::isGaussMapsOverlapped(nurbsSurfaceEvaluator.gauss_map,
+//    auto isRegionallyOverlapped = NurbsSurface::isGaussMapsOverlapped(nurbsSurfaceEvaluator.gauss_map,
-                                                                      nurbsSurfaceEvaluator.gauss_map, {0.2, 0.3},
+//                                                                      nurbsSurfaceEvaluator.gauss_map, {0.2, 0.3},
-                                                                      {0.2, 0.3}, {0.28, 0.38}, {0.28, 0.38}, {0, 1},
+//                                                                      {0.2, 0.3}, {0.28, 0.38}, {0.28, 0.38}, {0, 1},
-                                                                      {0, 1}, {0, 1}, {0, 1});
+//                                                                      {0, 1}, {0, 1}, {0, 1});
-    printf("is guass map overlapped: %d\n", isRegionallyOverlapped);
+//    printf("is guass map overlapped: %d\n", isRegionallyOverlapped);
    int level = 8;
    int edgeSampleCnt = pow(2, level - 1) + 1;
    nurbsSurfaceEvaluator.evaluate(edgeSampleCnt, edgeSampleCnt);
    nurbsSurfaceEvaluator.derivative(edgeSampleCnt, edgeSampleCnt);
    // 如果buildBVH需要曲率K，这里需要再先: nurbsSurfaceEvaluator.curvature(edgeSampleCnt, edgeSampleCnt);
    // 同时buildBVH传入第二个参数
    nurbsSurfaceEvaluator.buildBVH(level); // 构建曲面的BVH
    auto derInfo = nurbsSurfaceEvaluator.getDerivativeVec(edgeSampleCnt, edgeSampleCnt); // 求值
 //    nurbsSurfaceEvaluator.buildBHV(9); // 构建曲面的BVH
 //    nurbsSurfaceEvaluator.evaluate(101, 101); // 求值
    printf("==============================\n");
    NurbsCurve::Curve nurbsCurveEvaluator(
@ -62,6 +85,8 @@ int main() {
            {0, 0, 0, 0.1, 0.5, 0.8, 1, 1, 1});
    nurbsCurveEvaluator.setRecordTime(true);
    auto res = nurbsCurveEvaluator.evaluate(11);
    for (auto &re: res) {
        printf("%f, %f, %f\n", re[0], re[1], re[2]);