correct BVH

4 years ago · 0b882462e4
18 changed files with 480 additions and 68 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -3,7 +3,16 @@ project(NurbsEvaluator LANGUAGES CXX CUDA)

 set(CMAKE_CUDA_STANDARD 14)

-add_executable(NurbsEvaluator src/main.cpp src/utils.cpp include/utils.h src/device/NurbsCommon.cu include/device/NurbsCommon.cuh src/device/DeviceUtils.cu include/device/DeviceUtils.cuh src/device/NurbsSurface.cu include/device/NurbsSurface.cuh src/device/NurbsCurve.cu include/device/NurbsCurve.cuh)
+add_executable(NurbsEvaluator src/main.cpp
+        src/utils.cpp include/utils.h
+        src/device/Nurbs/nurbs_common.cu include/device/Nurbs/nurbs_common.cuh
+        src/device/device_utils.cu include/device/device_utils.cuh
+        src/device/Nurbs/nurbs_surface.cu include/device/Nurbs/nurbs_surface.cuh
+        src/device/Nurbs/nurbs_curve.cu include/device/Nurbs/nurbs_curve.cuh
+        src/device/aabb.cu include/device/aabb.cuh
+        src/device/vec.cu include/device/vec.cuh
+        src/device/Nurbs/bvh.cu include/device/Nurbs/bvh.cuh
+        )

 #add_compile_options("$<$<C_COMPILER_ID:MSVC>:/utf-8>")
 #add_compile_options("$<$<CXX_COMPILER_ID:MSVC>:/utf-8>")
--- a/include/device/Nurbs/bvh.cuh
+++ b/include/device/Nurbs/bvh.cuh
@ -0,0 +1,29 @@
+//
+// Created by 14727 on 2022/12/11.
+//
+
+#ifndef NURBSEVALUATOR_BVH_CUH
+#define NURBSEVALUATOR_BVH_CUH
+
+#include "../aabb.cuh"
+
+struct BVHNode {
+    AABB bounds;  // AABB包围盒
+    int firstChild, level, childNum = 4;  // 第一个孩子节点的下标，该结点所在层次，孩子个数
+    // 曲面片u, v的范围
+    float u0, u1, v0, v1;
+};
+
+class BVH {
+public:
+    int maxLevel, size;
+    BVHNode* nodes = nullptr;
+
+    __host__ void printQuadTree();
+};
+
+__global__ void
+buildSurfaceBvh(const float *k, int level, const float *evaluatedPoints, float d_lastKnot_u, float d_lastKnot_v,
+                int d_sampleCnt_u, int d_sampleCnt_v, BVHNode *BVH);
+
+#endif //NURBSEVALUATOR_BVH_CUH
--- a/include/device/Nurbs/nurbs_common.cuh
+++ b/include/device/Nurbs/nurbs_common.cuh
@ -2,8 +2,8 @@
 // Created by 14727 on 2022/11/19.
 //

-#ifndef NURBSEVALUATOR_NURBSCOMMON_CUH
-#define NURBSEVALUATOR_NURBSCOMMON_CUH
+#ifndef NURBSEVALUATOR_NURBS_COMMON_CUH
+#define NURBSEVALUATOR_NURBS_COMMON_CUH

 /**
 * 当u值已知时，根据基函数N的递推表达式，采用动态规划的方式求解N值
@ -29,4 +29,4 @@ __global__ void
 g_derTexture(float *derTexture, const float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt,
             int d_sampleCnt);

-#endif //NURBSEVALUATOR_NURBSCOMMON_CUH
+#endif //NURBSEVALUATOR_NURBS_COMMON_CUH
--- a/include/device/Nurbs/nurbs_curve.cuh
+++ b/include/device/Nurbs/nurbs_curve.cuh
@ -2,8 +2,8 @@
 // Created by 14727 on 2022/12/9.
 //

-#ifndef NURBSEVALUATOR_NURBSCURVE_CUH
-#define NURBSEVALUATOR_NURBSCURVE_CUH
+#ifndef NURBSEVALUATOR_NURBS_CURVE_CUH
+#define NURBSEVALUATOR_NURBS_CURVE_CUH

 #include <cuda_runtime.h>
 #include <vector>
@ -70,4 +70,4 @@ namespace NurbsCurve {
 }


-#endif //NURBSEVALUATOR_NURBSCURVE_CUH
+#endif //NURBSEVALUATOR_NURBS_CURVE_CUH
--- a/include/device/Nurbs/nurbs_surface.cuh
+++ b/include/device/Nurbs/nurbs_surface.cuh
@ -2,10 +2,11 @@
 // Created by 14727 on 2022/12/9.
 //

-#ifndef NURBSEVALUATOR_NURBSSURFACE_CUH
-#define NURBSEVALUATOR_NURBSSURFACE_CUH
+#ifndef NURBSEVALUATOR_NURBS_SURFACE_CUH
+#define NURBSEVALUATOR_NURBS_SURFACE_CUH
 #include <vector>
 #include "cuda_runtime.h"
+#include "bvh.cuh"

 namespace NurbsSurface {
    const int POINT_SIZE = 4;
@ -25,7 +26,7 @@ namespace NurbsSurface {
                 float d_lastKnot_v, int d_sampleCnt_u, int d_sampleCnt_v);

    __global__ static void
-    g_curvature(const float *derivatives, int sampleCnt_u, int sampleCnt_v, float lastKnot_u, float lastKnot_v);
+    g_curvature(const float *derivatives, int sampleCnt_u, int sampleCnt_v, float lastKnot_u, float lastKnot_v, float *ms, float*k);

    class Surface {
    private:
@ -42,7 +43,11 @@ namespace NurbsSurface {
        float *d_nTexture1_u;   // u方向指向度为p-1时的device中的nurbs基函数矩阵
        float *d_nTexture1_v;   // v方向指向度为p-1时的device中的nurbs基函数矩阵

+        float *d_evaluationRes;  // evaluation结果
        float *d_derivatives;  // 一阶导计算结果
+        float *d_k;  // 最大曲率
+
+        BVH bvh;

    public:
        /**
@ -73,6 +78,11 @@ namespace NurbsSurface {
         */
        __host__ void curvature(int sampleCnt_u, int sampleCnt_v);

+        /**
+         * 供外部CPU程序使用的、负责调用gpu并行计算BVH的方法
+         */
+        __host__ void buildBHV(int sampleCnt_u, int sampleCnt_v);
+
        void setRecordTime(bool r);

        ~Surface();
@ -82,4 +92,4 @@ namespace NurbsSurface {



-#endif //NURBSEVALUATOR_NURBSSURFACE_CUH
+#endif //NURBSEVALUATOR_NURBS_SURFACE_CUH
--- a/include/device/aabb.cuh
+++ b/include/device/aabb.cuh
@ -0,0 +1,28 @@
+//
+// Created by 14727 on 2022/12/11.
+//
+
+#ifndef NURBSEVALUATOR_AABB_CUH
+#define NURBSEVALUATOR_AABB_CUH
+
+#include "vec.cuh"
+#include "cstdio"
+
+class AABB {
+public:
+    // 边界
+    FVec3 pMin, pMax;
+    __device__ __host__ AABB();
+    __device__ __host__ explicit AABB(const FVec3& p);
+    __device__ __host__ AABB(const FVec3& p1, const FVec3& p2);
+
+    // aabb包围盒合并操作，包围盒和点
+    __device__ __host__ AABB Union(const FVec3& p);
+    // aabb包围盒合并操作，两个包围盒
+    __device__ __host__ AABB Union(const AABB& b2);
+    // 判断两个aabb包围盒是否重叠
+    __device__ __host__ bool IsOverlap(const AABB& b2);
+};
+
+
+#endif //NURBSEVALUATOR_AABB_CUH
--- a/include/device/device_utils.cuh
+++ b/include/device/device_utils.cuh
@ -2,8 +2,8 @@
 // Created by 14727 on 2022/12/9.
 //

-#ifndef NURBSEVALUATOR_DEVICEUTILS_CUH
-#define NURBSEVALUATOR_DEVICEUTILS_CUH
+#ifndef NURBSEVALUATOR_DEVICE_UTILS_CUH
+#define NURBSEVALUATOR_DEVICE_UTILS_CUH


 __device__ void d_safeFree(float * &p);
@ -20,4 +20,4 @@ __device__ bool d_floatEqual(float a, float b);
 __device__ void normalization(float &a, float &b, float &c);


-#endif //NURBSEVALUATOR_DEVICEUTILS_CUH
+#endif //NURBSEVALUATOR_DEVICE_UTILS_CUH
--- a/include/device/vec.cuh
+++ b/include/device/vec.cuh
@ -0,0 +1,19 @@
+//
+// Created by 14727 on 2022/12/11.
+//
+
+#ifndef NURBSEVALUATOR_VEC_CUH
+#define NURBSEVALUATOR_VEC_CUH
+#include "cuda_runtime.h"
+
+class FVec3 {
+public:
+    float x, y, z;
+    __device__ __host__ FVec3(float x_, float y_, float z_);
+    __device__ __host__ FVec3(FVec3 const &fVec3);
+
+    __device__ __host__ FVec3();
+};
+
+
+#endif //NURBSEVALUATOR_VEC_CUH
--- a/include/utils.h
+++ b/include/utils.h
@ -9,6 +9,8 @@
 double get_time();
 #else
 #include <windows.h>
+#include "device/Nurbs/bvh.cuh"
+
 double get_time();
 #endif

@ -17,6 +19,10 @@ double get_time();
 *  注意指针是引用传参，因为要把指针本身置空
 */
 void safeCudaFree(float *&p);
+void safeCudaFree(BVHNode *&p);
+//template<typename T>
 void safeFree(float *&p);
+void safeFree(BVHNode *&p);
+

 #endif //UNTITLED1_UTILS_H
--- a/src/device/Nurbs/bvh.cu
+++ b/src/device/Nurbs/bvh.cu
@ -0,0 +1,137 @@
+//
+// Created by 14727 on 2022/12/11.
+//
+
+#include "../../../include/device/Nurbs/bvh.cuh"
+/**
+ * 当前层的第一个节点的坐标
+ * @param layer 层号。根节点为第一层
+ */
+__device__ int getStartIdxOfLayerN(int layer) {
+    if (layer == 1) return 0;
+    int num = 1;
+    // 找规律得出
+    for (int i = 2; i < layer; i++) {
+        num = num * 4 + 1;
+    }
+    return num;
+}
+
+/**
+ * 根据u、v参数域中的小矩形的位置，判断它在BVH叶节点层（也就是最后一层）中的位置
+ * @param ix
+ * @param iy
+ * @return
+ */
+__device__ int getChildNodeIdx(int ix, int iy) {
+    int idx = 0;
+    while(ix > 0) {
+        int log2XCeil = int(__log2f(ix));
+        idx += int(pow(4, log2XCeil));
+        ix -= int(pow(2, log2XCeil));
+    }
+    while(iy > 0) {
+        int log2XCeil = int(__log2f(iy));
+        idx += 2 * int(pow(4, log2XCeil));
+        iy -= int(pow(2, log2XCeil));
+    }
+    return idx;
+}
+
+
+__global__ void
+buildSurfaceBvh(const float *k, int level, const float *evaluatedPoints, float lastKnot_u, float lastKnot_v,
+                int sampleCnt_u, int sampleCnt_v, BVHNode *BVH) {
+    // 假设还是二维grid和二维的block
+    int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    if (ix >= sampleCnt_u - 1 || iy >= sampleCnt_v - 1) {
+        return;
+    }
+    float singleStepVal_u = lastKnot_u / (sampleCnt_u - 1);
+    float singleStepVal_v = lastKnot_v / (sampleCnt_v - 1);
+
+    float u = singleStepVal_u * ix, v = singleStepVal_v * iy;
+
+    int bias = getChildNodeIdx(ix, iy);  // 每层的节点在总数组中的编号相对该层第一个节点的偏置
+    for (int step = 1; step <= sampleCnt_u - 1 || step <= sampleCnt_v - 1; step = step * 2, --level) {
+
+        float step_u = min(step, sampleCnt_u - 1);
+        float step_v = min(step, sampleCnt_v - 1);
+
+        int baseIdx = getStartIdxOfLayerN(level);  // 当前层的第一个节点的坐标
+        int idx = baseIdx + bias;
+
+        int tmpIdx = (ix * sampleCnt_v + iy) * 3;
+//        AABB bound(FVec3(evaluatedPoints[tmpIdx] + *k, evaluatedPoints[tmpIdx + 1] + *k,
+//                         evaluatedPoints[tmpIdx + 2] + *k));  // 最初只包含u，v点
+        AABB bound(FVec3(evaluatedPoints[tmpIdx], evaluatedPoints[tmpIdx + 1],
+                         evaluatedPoints[tmpIdx + 2]));  // 最初只包含u，v点
+        if(idx == 75) printf("75: %g, %g, %g\n", evaluatedPoints[tmpIdx], evaluatedPoints[tmpIdx + 1], evaluatedPoints[tmpIdx + 2]);
+        if (step == 1) {
+            // 叶子节点
+            BVH[idx].u0 = u;
+            BVH[idx].u1 = u + step_u * singleStepVal_u;
+            BVH[idx].v0 = v;
+            BVH[idx].v1 = v + step_v * singleStepVal_v;
+            BVH[idx].level = level;
+//            int tmpIdx1 = ((ix + 1) * sampleCnt_v + iy) * 3;
+//            bound = bound.Union(FVec3(evaluatedPoints[tmpIdx1] + *k, evaluatedPoints[tmpIdx1 + 1] + *k,
+//                                      evaluatedPoints[tmpIdx1 + 2] + *k));
+//            int tmpIdx2 = (ix * sampleCnt_v + iy + 1) * 3;
+//            bound = bound.Union(FVec3(evaluatedPoints[tmpIdx2] + *k, evaluatedPoints[tmpIdx2 + 1] + *k,
+//                                      evaluatedPoints[tmpIdx2 + 2] + *k));
+//            int tmpIdx3 = ((ix + 1) * sampleCnt_v + iy + 1) * 3;
+//            bound = bound.Union(FVec3(evaluatedPoints[tmpIdx3] + *k, evaluatedPoints[tmpIdx3 + 1] + *k,
+//                                      evaluatedPoints[tmpIdx3 + 2] + *k));
+            int tmpIdx1 = ((ix + 1) * sampleCnt_v + iy) * 3;
+            bound = bound.Union(FVec3(evaluatedPoints[tmpIdx1], evaluatedPoints[tmpIdx1 + 1],
+                                      evaluatedPoints[tmpIdx1 + 2]));
+            int tmpIdx2 = (ix * sampleCnt_v + iy + 1) * 3;
+            bound = bound.Union(FVec3(evaluatedPoints[tmpIdx2], evaluatedPoints[tmpIdx2 + 1],
+                                      evaluatedPoints[tmpIdx2 + 2]));
+            int tmpIdx3 = ((ix + 1) * sampleCnt_v + iy + 1) * 3;
+            bound = bound.Union(FVec3(evaluatedPoints[tmpIdx3], evaluatedPoints[tmpIdx3 + 1],
+                                      evaluatedPoints[tmpIdx3 + 2]));
+
+            BVH[idx].bounds = bound;
+            BVH[idx].childNum = -1;
+
+
+        } else {
+            if (ix % step == 0 || iy % step == 0) {
+                // 非叶子节点
+                BVH[idx].u0 = u;
+                BVH[idx].u1 = u + step_u * singleStepVal_u;
+                BVH[idx].v0 = v;
+                BVH[idx].v1 = v + step_v * singleStepVal_v;
+                BVH[idx].level = level;
+
+                int firstChild = 4 * idx + 1;
+                bound = bound.Union(BVH[firstChild].bounds);
+                bound = bound.Union(BVH[firstChild + 1].bounds);
+                bound = bound.Union(BVH[firstChild + 2].bounds);
+                bound = bound.Union(BVH[firstChild + 3].bounds);
+                BVH[idx].firstChild = firstChild;
+                BVH[idx].bounds = bound;
+            }
+        }
+
+        bias /= 4;
+
+        __syncthreads();
+    }
+}
+
+__host__ void BVH::printQuadTree() {
+
+    for (int l = 1, levelNodesCnt = 1, baseIdx = 0; l <= maxLevel; l++, levelNodesCnt *= 4, baseIdx = baseIdx * 4 + 1) {
+        for (int biasIdx = 0; biasIdx < levelNodesCnt; biasIdx++) {
+            int idx = baseIdx + biasIdx;
+            auto pMax = nodes[idx].bounds.pMax;
+            auto pMin = nodes[idx].bounds.pMin;
+            printf("<<%g, %g, %g>,<%g, %g, %g>>  ", pMin.x, pMin.y, pMin.z, pMax.x, pMax.y, pMax.z);
+        }
+        printf("\n =============$$$$$$$============= \n");
+    }
+}
--- a/src/device/Nurbs/nurbs_common.cu
+++ b/src/device/Nurbs/nurbs_common.cu
@ -2,8 +2,9 @@
 // Created by 14727 on 2022/11/19.
 //

-#include "../../include/device/NurbsCommon.cuh"
-#include "../../include/device/DeviceUtils.cuh"
+#include "../../../include/device/Nurbs/nurbs_common.cuh"
+#include "../../../include/device/device_utils.cuh"
+#include <cstdio>

 __device__ void d_basisFunction(float *N_Texture, const float *knots, float u, int degree, int d_knotsCnt) {
    int m = d_knotsCnt - 1;
@ -45,7 +46,7 @@ __global__ void g_basisTexture(float *nTexture, float *nTexture1, const float *d
    for (int i = 0; i < d_pointsCnt; i++) {
        nTexture[idx * d_pointsCnt + i] = N_dp[d_degree * (d_knotsCnt - 1) + i];
        nTexture1[idx * (d_pointsCnt + 1) + i] = N_dp[(d_degree - 1) * (d_knotsCnt - 1) + i];
-//        printf("nTexture1: %g ", nTexture1[idx * (d_pointsCnt + 1) + i]);
+//        if(idx == 3)printf("nTexture1: %g ", nTexture1[idx * (d_pointsCnt + 1) + i]);
    }
    nTexture1[idx * (d_pointsCnt + 1) + d_pointsCnt] = N_dp[(d_degree - 1) * (d_knotsCnt - 1) +
                                                            d_pointsCnt]; // nTexture1多记录一列数据
--- a/src/device/Nurbs/nurbs_curve.cu
+++ b/src/device/Nurbs/nurbs_curve.cu
@ -1,9 +1,9 @@
 //
 // Created by 14727 on 2022/12/9.
 //
-#include "../../include/device/NurbsCommon.cuh"
-#include "../../include/device/NurbsCurve.cuh"
-#include "../../include/utils.h"
+#include "../../../include/device/Nurbs/nurbs_common.cuh"
+#include "../../../include/device/Nurbs/nurbs_curve.cuh"
+#include "../../../include/utils.h"

 __global__ void
 NurbsCurve::g_evaluate(float *res, const float *NTexture, const float *d_points, const int d_pointsCnt,
--- a/src/device/Nurbs/nurbs_surface.cu
+++ b/src/device/Nurbs/nurbs_surface.cu
@ -2,13 +2,14 @@
 // Created by 14727 on 2022/12/9.
 //

-#include "../../include/device/NurbsSurface.cuh"
-#include "../../include/device/NurbsCommon.cuh"
-#include "../../include/device/DeviceUtils.cuh"
-#include "../../include/utils.h"
+#include "../../../include/device/Nurbs/nurbs_surface.cuh"
+#include "../../../include/device/Nurbs/nurbs_common.cuh"
+#include "../../../include/utils.h"
+#include "../../../include/device/Nurbs/bvh.cuh"

 __global__ void
-NurbsSurface::g_evaluate(float* res, const float *d_nTexture_u, const float *d_nTexture_v, const float *d_points, int d_pointsCnt_u,
+NurbsSurface::g_evaluate(float *res, const float *d_nTexture_u, const float *d_nTexture_v, const float *d_points,
+                         int d_pointsCnt_u,
                         int d_pointsCnt_v, int d_POINT_SIZE, float d_lastKnot_u, float d_lastKnot_v, int d_sampleCnt_u,
                         int d_sampleCnt_v) {

@ -40,14 +41,15 @@ NurbsSurface::g_evaluate(float* res, const float *d_nTexture_u, const float *d_n
    y = y / sumW;
    z = z / sumW;

-//    int baseIdx = (ix * d_sampleCnt_v + iy) * 3;
-//    res[baseIdx] = x;
-//    res[baseIdx + 1] = y;
-//    res[baseIdx + 2] = z;
+    int baseIdx = (ix * d_sampleCnt_v + iy) * 3;
+    res[baseIdx] = x;
+    res[baseIdx + 1] = y;
+    res[baseIdx + 2] = z;

 //    printf("(%d, %d)-->(%g, %g, %g)\n", ix, iy, x, y, z); // %g输出，舍弃无意义的0
 }

+
 __global__ void
 NurbsSurface::g_derivative(float *derivatives, const float *derTexture_u, const float *derTexture_v,
                           const float *nTexture_u, const float *nTexture_v, const float *d_points, int d_pointsCnt_u,
@ -125,8 +127,8 @@ NurbsSurface::g_derivative(float *derivatives, const float *derTexture_u, const
 }

 __global__ void
-NurbsSurface::g_curvature(const float *derivatives, int sampleCnt_u, int sampleCnt_v, float lastKnot_u,
-                          float lastKnot_v) {
+NurbsSurface::g_curvature(const float *derivatives, const int sampleCnt_u, const int sampleCnt_v, float lastKnot_u,
+                          float lastKnot_v, float *ms, float *k) {
    // 二维grid和二维的block
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
@ -202,20 +204,68 @@ NurbsSurface::g_curvature(const float *derivatives, int sampleCnt_u, int sampleC
    }

    float uvx = (sndPdx_uv + sndPdx_vu) / 2, uvy = (sndPdy_uv + sndPdy_vu) / 2, uvz = (sndPdz_uv + sndPdz_vu) / 2;
-    normalization(sndPdx_uu, sndPdy_uu, sndPdz_uu);
-    normalization(uvx, uvy, uvz);
-    normalization(sndPdx_vv, sndPdy_vv, sndPdz_vv);
+//    normalization(sndPdx_uv, sndPdy_uv, sndPdz_uv);
+//    normalization(sndPdx_vu, sndPdy_vu, sndPdz_vu);
+//    normalization(sndPdx_uu, sndPdy_uu, sndPdz_uu);
+//    normalization(uvx, uvy, uvz);
+//    normalization(sndPdx_vv, sndPdy_vv, sndPdz_vv);


-    if (ix == 8 && iy == 9)
+    if (ix == 8 && iy == 9) {
        printf("(%g, %g) --> uu: (%g, %g, %g), uv: (%g, %g, %g), vv: (%g, %g, %g)\n", u, v, sndPdx_uu, sndPdy_uu,
               sndPdz_uu,
               uvx, uvy, uvz, sndPdx_vv, sndPdy_vv, sndPdz_vv);
+        printf("uv: (%g, %g, %g), vu: (%g, %g, %g)\n", sndPdx_uv, sndPdy_uv, sndPdz_uv, sndPdx_vu, sndPdy_vu,
+               sndPdz_vu);
+    }
+
+    float m1 = max(max(sndPdx_uu, sndPdy_uu), sndPdz_uu);
+    float m2 = max(max(uvx, uvy), uvz);
+    float m3 = max(max(sndPdx_vv, sndPdy_vv), sndPdz_vv);
+
+
+//    __shared__ float ms[363];
+    ms[(ix * sampleCnt_v + iy) * 3] = m1;
+    ms[(ix * sampleCnt_v + iy) * 3 + 1] = m2;
+    ms[(ix * sampleCnt_v + iy) * 3 + 2] = m3;
+    __syncthreads();
+//    if(ix == 1 && iy == 1) {
+//        for(int i = 0; i < sampleCnt_u; i++) {
+//            for(int j = 0; j < sampleCnt_v; j++) {
+//                printf("%g ", ms[(i * sampleCnt_v + j) * 3]);
+//            }
+//            printf("\n");
+//        }
+//    }
+
+    // 规约求最大值
+    for (int step = (sampleCnt_u + 1) / 2; step > 1; step = (step + 1) / 2) {
+        // step 表示现在参与计算最大值的数据的长度的一半
+        if (ix < step && ix + step < sampleCnt_u) {
+            ms[(ix * sampleCnt_v + iy) * 3] = max(m1, ms[((ix + step) * sampleCnt_v + iy) * 3]);
+            ms[(ix * sampleCnt_v + iy) * 3 + 1] = max(m1, ms[((ix + step) * sampleCnt_v + iy) * 3 + 1]);
+            ms[(ix * sampleCnt_v + iy) * 3 + 2] = max(m1, ms[((ix + step) * sampleCnt_v + iy) * 3 + 2]);
+        }
+    }
+
+    for (int step = (sampleCnt_v + 1) / 2; step > 1; step = (step + 1) / 2) {
+        // step 表示现在参与计算最大值的数据的长度的一半
+        if (iy < step && iy + step < sampleCnt_v) {
+            ms[iy * 3] = max(ms[iy * 3], ms[(iy + step) * 3]);
+            ms[iy * 3 + 1] = max(ms[iy * 3 + 1], ms[(iy + step) * 3 + 1]);
+            ms[iy * 3 + 2] = max(ms[iy * 3 + 2], ms[(iy + step) * 3 + 2]);
+        }
+    }
+    __syncthreads();
+    int n = sampleCnt_u - 1;
+    int m = sampleCnt_v - 1;
+    *k = (ms[0] / (n * n) + 2 * ms[1] / (n * m) + ms[2] / (m * m)) / 8;
+//    if(ix == 1 && iy == 1)printf("%g gggg\n", ms[0]);
 }


 __host__ NurbsSurface::Surface::Surface(std::vector<std::vector<std::vector<float>>> controlPoints,
-                                            std::vector<float> knots_u, std::vector<float> knots_v) {
+                                        std::vector<float> knots_u, std::vector<float> knots_v) {
    this->knots_u = std::move(knots_u);
    this->knots_v = std::move(knots_v);
    this->controlPoints = std::move(controlPoints);
@ -227,7 +277,10 @@ __host__ NurbsSurface::Surface::Surface(std::vector<std::vector<std::vector<floa
    d_knots_u = nullptr;
    d_knots_v = nullptr;
    d_points = nullptr;
+    d_evaluationRes = nullptr;
    d_derivatives = nullptr;
+    d_k = nullptr;
+    bvh.nodes = nullptr;
 }

 __host__ std::vector<std::vector<std::vector<float>>>
@ -274,9 +327,9 @@ NurbsSurface::Surface::evaluate(int sampleCnt_u, int sampleCnt_v) {

    // 结果数组
    size_t resBytes = sampleCnt_u * sampleCnt_v * 3 * sizeof(float);
-    float *d_res;
-    cudaMalloc((void **)&d_res, resBytes);
-    auto *h_res = (float *)malloc(resBytes);
+
+    cudaMalloc((void **) &d_evaluationRes, resBytes);
+    auto *h_res = (float *) malloc(resBytes);


    // 构造g_basisTexture线程层级
@ -297,7 +350,8 @@ NurbsSurface::Surface::evaluate(int sampleCnt_u, int sampleCnt_v) {
    cudaDeviceSynchronize();

    if (recordTime) time_cost_device = get_time();
-    g_evaluate <<<grid, block>>>(d_res, d_nTexture_u, d_nTexture_v, d_points, pointsCnt_u, pointsCnt_v, POINT_SIZE,
+    g_evaluate <<<grid, block>>>(d_evaluationRes, d_nTexture_u, d_nTexture_v, d_points, pointsCnt_u, pointsCnt_v,
+                                 POINT_SIZE,
                                 knots_u[knotsCnt_u - 1], knots_v[knotsCnt_v - 1], sampleCnt_u, sampleCnt_v);
    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
    if (recordTime) {
@ -306,11 +360,13 @@ NurbsSurface::Surface::evaluate(int sampleCnt_u, int sampleCnt_v) {
               time_cost_device);
    }

-    cudaMemcpy(h_res, d_res, resBytes, cudaMemcpyDeviceToHost);
-    std::vector<std::vector<std::vector<float>>> res(sampleCnt_u, std::vector<std::vector<float>>(sampleCnt_v, std::vector<float>(3, 0.)));
-    for(int i = 0; i < sampleCnt_u; i++) {
+    cudaMemcpy(h_res, d_evaluationRes, resBytes, cudaMemcpyDeviceToHost);
+    std::vector<std::vector<std::vector<float>>> res(sampleCnt_u, std::vector<std::vector<float>>(sampleCnt_v,
+                                                                                                  std::vector<float>(3,
+                                                                                                                     0.)));
+    for (int i = 0; i < sampleCnt_u; i++) {
        int baseIdx = i * sampleCnt_v * 3;
-        for(int j = 0; j < sampleCnt_v; j++) {
+        for (int j = 0; j < sampleCnt_v; j++) {
            baseIdx += j * 3;
            res[i][j][0] = h_res[baseIdx];
            res[i][j][1] = h_res[baseIdx + 1];
@ -323,7 +379,6 @@ NurbsSurface::Surface::evaluate(int sampleCnt_u, int sampleCnt_v) {
    safeFree(h_points);
    safeFree(h_knots_u);
    safeFree(h_knots_v);
-    safeCudaFree(d_res);
    safeFree(h_res);

    return res;
@ -380,8 +435,6 @@ __host__ void NurbsSurface::Surface::derivative(int sampleCnt_u, int sampleCnt_v
    cudaFree(d_derTexture_v);
 }

-
-
 __host__ void NurbsSurface::Surface::curvature(int sampleCnt_u, int sampleCnt_v) {
    // 先计算切向量
    derivative(sampleCnt_u, sampleCnt_v);
@ -390,15 +443,22 @@ __host__ void NurbsSurface::Surface::curvature(int sampleCnt_u, int sampleCnt_v)
        return;
    }

+    // 构造记录每个采样点中的最大M1、M2、M3的数组
+    // 这里用共享内存会更好，但使用共享内存动态分配长度总是出错（好像是因为长度过长），后续需要思考解决这个问题
+    float *ms = nullptr;
+    cudaMalloc((void **) &ms, sampleCnt_u * sampleCnt_v * 3 * sizeof(float));
+
+    cudaMalloc((void **) &d_k, sizeof(float));
+
    // 构造线程层级
    dim3 block(32, 32);
-    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);
+    dim3 grid((sampleCnt_u - 1 + block.x - 1) / block.x, (sampleCnt_v - 1 + block.y - 1) / block.y);

    // 记录用时
    double time_cost_device;
    if (recordTime) time_cost_device = get_time();
    g_curvature<<<grid, block>>>(d_derivatives, sampleCnt_u, sampleCnt_v, knots_u[knots_u.size() - 1],
-                                 knots_v[knots_v.size() - 1]);
+                                 knots_v[knots_v.size() - 1], ms, d_k);
    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
    if (recordTime) {
        time_cost_device = get_time() - time_cost_device;
@ -406,6 +466,7 @@ __host__ void NurbsSurface::Surface::curvature(int sampleCnt_u, int sampleCnt_v)
               sampleCnt_u * sampleCnt_v,
               time_cost_device);
    }
+    safeCudaFree(ms);
 }

 void NurbsSurface::Surface::setRecordTime(bool r) {
@ -420,5 +481,42 @@ NurbsSurface::Surface::~Surface() {
    safeCudaFree(d_points);
    safeCudaFree(d_knots_u);
    safeCudaFree(d_knots_v);
+    safeCudaFree(d_k);
+    safeCudaFree(d_evaluationRes);
+    safeFree(bvh.nodes);
    cudaDeviceReset();
-}
+}
+
+__host__ void NurbsSurface::Surface::buildBHV(int sampleCnt_u, int sampleCnt_v) {
+    // 先计算曲率
+    curvature(sampleCnt_u, sampleCnt_v);
+    if (POINT_SIZE != controlPoints[0][0].size()) {
+        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
+        return;
+    }
+    // 构造线程层级
+    dim3 block(32, 32);
+    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);
+
+    bvh.maxLevel = max(int(ceil(log2f(sampleCnt_u - 1))) + 1, int(ceil(log2f(sampleCnt_v - 1))) + 1);
+    bvh.size = (pow(4, bvh.maxLevel) - 1) / 3;  // 等比数列求和公示求出数总的节点数
+    size_t bvhBytes = sizeof(BVHNode) * bvh.size;
+    BVHNode *d_bvh = nullptr;
+    cudaMalloc((void **) &d_bvh, bvhBytes);
+    // 记录用时
+    double time_cost_device = get_time();
+    buildSurfaceBvh<<<grid, block>>>(d_k, bvh.maxLevel, d_evaluationRes, knots_u[knots_u.size() - 1],
+                                     knots_v[knots_v.size() - 1], sampleCnt_u, sampleCnt_v, d_bvh);
+    cudaDeviceSynchronize();
+    if (recordTime) {
+        time_cost_device = get_time() - time_cost_device;
+        printf("GPU time cost of surface second derivative calculating for %d samples: %lf\n",
+               sampleCnt_u * sampleCnt_v,
+               time_cost_device);
+    }
+    if (bvh.nodes == nullptr) bvh.nodes = (BVHNode *) malloc(bvhBytes);
+    cudaMemcpy(bvh.nodes, d_bvh, bvhBytes, cudaMemcpyDeviceToHost);
+    safeCudaFree(d_bvh);
+
+//    bvh.printQuadTree();
+}
--- a/src/device/aabb.cu
+++ b/src/device/aabb.cu
@ -0,0 +1,42 @@
+//
+// Created by 14727 on 2022/12/11.
+//
+
+#include "../../include/device/aabb.cuh"
+
+#define D_DBL_MAX 1.7976931348623158e+308
+#define D_FLT_MAX 3.402823466e+38F
+
+__device__ __host__ AABB::AABB() {
+    float minNum = -D_FLT_MAX;
+    float maxNum = D_FLT_MAX;
+    pMin = FVec3(minNum, minNum, minNum);
+    pMax = FVec3(maxNum, maxNum, maxNum);
+}
+
+__device__ __host__ AABB::AABB(const FVec3 &p) : pMin(p), pMax(p) {}
+
+__device__ __host__ AABB::AABB(const FVec3 &p1, const FVec3 &p2) {
+    pMin = FVec3(fmin(p1.x, p2.x), fmin(p1.y, p2.y), fmin(p1.z, p2.z));
+    pMax = FVec3(fmax(p1.x, p2.x), fmax(p1.y, p2.y), fmax(p1.z, p2.z));
+}
+
+__device__ __host__ AABB AABB::Union(const FVec3 &p) {
+    return {FVec3(fmin(pMin.x, p.x), fmin(pMin.y, p.y), fmin(pMin.z, p.z)),
+            FVec3(fmax(pMax.x, p.x), fmax(pMax.y, p.y), fmax(pMax.z, p.z))};
+}
+
+__device__ __host__ AABB AABB::Union(const AABB &b2) {
+    return {FVec3(fmin(pMin.x, b2.pMin.x), fmin(pMin.y, b2.pMin.y), fmin(pMin.z, b2.pMin.z)),
+            FVec3(fmax(pMax.x, b2.pMax.x), fmax(pMax.y, b2.pMax.y), fmax(pMax.z, b2.pMax.z))};
+}
+
+__device__ __host__ bool AABB::IsOverlap(const AABB& b2) {
+    if (pMin.x > b2.pMax.x) return false;
+    if (b2.pMin.x > pMax.x) return false;
+    if (pMin.y > b2.pMax.y) return false;
+    if (b2.pMin.y > pMax.y) return false;
+    if (pMin.z > b2.pMax.z) return false;
+    if (b2.pMin.z > pMax.z) return false;
+    return true;
+}
--- a/src/device/device_utils.cu
+++ b/src/device/device_utils.cu
@ -2,7 +2,7 @@
 // Created by 14727 on 2022/12/9.
 //

-#include "../../include/device/DeviceUtils.cuh"
+#include "../../include/device/device_utils.cuh"

 __device__ void d_safeFree(float *&p) {
    if (p != nullptr) {
--- a/src/device/vec.cu
+++ b/src/device/vec.cu
@ -0,0 +1,13 @@
+//
+// Created by 14727 on 2022/12/11.
+//
+
+#include "../../include/device/vec.cuh"
+
+__device__ __host__ FVec3::FVec3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
+
+__device__ __host__ FVec3::FVec3(const FVec3 &fVec3) : x(static_cast<float>(fVec3.x)),
+                                              y(static_cast<float>(fVec3.y)),
+                                              z(static_cast<float>(fVec3.z)) {}
+
+__device__ __host__ FVec3::FVec3():x(0.), y(0.), z(0.) {}
--- a/src/main.cpp
+++ b/src/main.cpp
@ -1,17 +1,17 @@
 #include <cstdio>
-#include "../include/device/NurbsCurve.cuh"
-#include "../include/device/NurbsSurface.cuh"
+#include "../include/device/Nurbs/nurbs_curve.cuh"
+#include "../include/device/Nurbs/nurbs_surface.cuh"

 int main() {
-    NurbsSurface::Surface nurbsSurfaceEvaluator({
-                                                          {{-1,   0,   0, 0.3},   {0,   1, 6, 0.8},  {1,  0,   4, 0.5},   {2,  0.5,  3, 0.8},    {3,    3,   1, 0.6},   {4,  -5,    0, 0.7}},
-                                                          {{-2,   1,   1.2, 0.2}, {1,   2, 3, 0.3},  {2,  2,   3, 0.6},   {-1, -0.3, 2, 0.4},    {-1,   2,   0, 0.9},   {7,  -8,    2, 0.3}},
-                                                          {{-3.4, 2,   3, 0.8},   {2,   3, 0, 0.6},  {4,  3,   7, 0.3},   {-2, 0,    -0.2, 0.4}, {1,    1.7, 5, 0.6},   {9,  -10.3, 6, 0.7}},
-                                                          {{-1.5, 3.2, 1, 0.5},   {2.6, 7, -2, 0.7}, {5,  0.8, 4.2, 0.8}, {-4, 1,    4, 0.7},    {2.1,  4,   -2, 0.3},  {11, -6,    4, 0.6}},
-                                                          {{-0.2, 2,   0, 0.7},   {5,   3, 2, 0.4},  {5,  1.5, 1.4, 0.6}, {-3, 2,    5, 0.8},    {0.8,  1.3, 0, 0.5},   {15, -2,    0.9, 0.6}},
-                                                          {{3,    1.4, -1, 0.4},  {6,   2, 4, 0.6},  {-1, 0,   -2, 0.4},  {0,  2.8,  2, 0.6},    {-0.5, 2,   1.2, 0.9}, {7,  -3,    -2, 0.3}},},
-                                                  {0, 0, 0, 0.1, 0.5, 0.8, 1, 1, 1},
-                                                  {0, 0, 0, 0.2, 0.7, 0.8, 1, 1, 1});
+//    NurbsSurface::Surface nurbsSurfaceEvaluator({
+//                                                          {{-1,   0,   0, 0.3},   {0,   1, 6, 0.8},  {1,  0,   4, 0.5},   {2,  0.5,  3, 0.8},    {3,    3,   1, 0.6},   {4,  -5,    0, 0.7}},
+//                                                          {{-2,   1,   1.2, 0.2}, {1,   2, 3, 0.3},  {2,  2,   3, 0.6},   {-1, -0.3, 2, 0.4},    {-1,   2,   0, 0.9},   {7,  -8,    2, 0.3}},
+//                                                          {{-3.4, 2,   3, 0.8},   {2,   3, 0, 0.6},  {4,  3,   7, 0.3},   {-2, 0,    -0.2, 0.4}, {1,    1.7, 5, 0.6},   {9,  -10.3, 6, 0.7}},
+//                                                          {{-1.5, 3.2, 1, 0.5},   {2.6, 7, -2, 0.7}, {5,  0.8, 4.2, 0.8}, {-4, 1,    4, 0.7},    {2.1,  4,   -2, 0.3},  {11, -6,    4, 0.6}},
+//                                                          {{-0.2, 2,   0, 0.7},   {5,   3, 2, 0.4},  {5,  1.5, 1.4, 0.6}, {-3, 2,    5, 0.8},    {0.8,  1.3, 0, 0.5},   {15, -2,    0.9, 0.6}},
+//                                                          {{3,    1.4, -1, 0.4},  {6,   2, 4, 0.6},  {-1, 0,   -2, 0.4},  {0,  2.8,  2, 0.6},    {-0.5, 2,   1.2, 0.9}, {7,  -3,    -2, 0.3}},},
+//                                                  {0, 0, 0, 0.1, 0.5, 0.8, 1, 1, 1},
+//                                                  {0, 0, 0, 0.2, 0.7, 0.8, 1, 1, 1});
 //    NurbsSurface::Evaluator nurbsSurfaceEvaluator({
 //                                                          {{-1,   0,   0, 1},   {0,   1, 6, 1},  {1,  0,   4, 1},   {2,  0.5,  3, 1},    {3,    3,   1, 1},   {4,  -5,    0, 1}},
 //                                                          {{-2,   1,   1.2, 1}, {1,   2, 3, 1},  {2,  2,   3, 1},   {-1, -0.3, 2, 1},    {-1,   2,   0, 1},   {7,  -8,    2, 1}},
@ -21,9 +21,16 @@ int main() {
 //                                                          {{3,    1.4, -1, 1},  {6,   2, 4, 1},  {-1, 0,   -2, 1},  {0,  2.8,  2, 1},    {-0.5, 2,   1.2, 1}, {7,  -3,    -2, 1}},},
 //                                                  {0, 0, 0, 0.1, 0.5, 0.8, 1, 1, 1},
 //                                                  {0, 0, 0, 0.2, 0.7, 0.8, 1, 1, 1});
+
+    NurbsSurface::Surface nurbsSurfaceEvaluator({
+                                                        {{-0.57, -0.48, -0.31, 1}, {-0.17, -0.49, -0.43, 1}, {0.3,  -0.53, -0.27, 1}, {0.66, -0.42, -0.36, 1}},
+                                                        {{-0.53, -0.15, -0.03, 1}, {-0.21, -0.12, 0.09, 1},  {0.10, -0.18, 0.03, 1},  {0.49, -0.15, -0.03, 1}},
+                                                        {{-0.61, 0.22,  0.09, 1},  {-0.21, 0.20,  0.09, 1},  {0.13, 0.20,  0.12, 1},  {0.49, 0.19,  -0.03, 1}},
+                                                        {{-0.52, 0.51,  0.13, 1},  {-0.20, 0.32,  0.36, 1},  {0.18, 0.29,  0.28, 1},  {0.51, 0.53,  0.12, 1}}
+                                                }, {0, 0, 0, 0, 1, 1, 1, 1}, {0, 0, 0, 0, 1, 1, 1, 1});
    nurbsSurfaceEvaluator.setRecordTime(true);

-    nurbsSurfaceEvaluator.curvature(101, 101);
+    nurbsSurfaceEvaluator.buildBHV(17, 17);
 //    nurbsSurfaceEvaluator.evaluate(101, 101);
    printf("==============================\n");

@ -34,15 +41,14 @@ int main() {
             {2,  0.5, 3, 0.4},
             {3,  3,   1, 0.5},
             {4,  -5,  0, 0.7}},
-            {0, 0, 0, 0.1, 0.5, 0.8, 1, 1, 1});
+            {0, 0, 0.1, 0.2, 0.5, 0.8, 1, 1, 1});
    nurbsCurveEvaluator.setRecordTime(true);
-//    nurbsCurveEvaluator.curvature(11);
+
    auto res = nurbsCurveEvaluator.evaluate(11);
-    for(auto & re : res) {
+    for (auto &re: res) {
        printf("%f, %f, %f\n", re[0], re[1], re[2]);
    }
    printf("\n");
-//    nurbsCurveEvaluator.derivative();

    return 0;
 }
--- a/src/utils.cpp
+++ b/src/utils.cpp
@ -38,9 +38,23 @@ void safeCudaFree(float *&p) {
    }
 }

+void safeCudaFree(BVHNode *&p){
+    if (p != nullptr) {
+        cudaFree(p);
+        p = nullptr;
+    }
+}
+//template<typename T>
 void safeFree(float *&p) {
    if (p != nullptr) {
        free(p);
        p = nullptr;
    }
-}
+}
+
+void safeFree(BVHNode *&p) {
+    if (p != nullptr) {
+        free(p);
+        p = nullptr;
+    }
+}