重新调整项目结构

3 years ago · 7896d545c2
20 changed files with 1120 additions and 1090 deletions
--- a/.gitignore
+++ b/.gitignore
@ -41,5 +41,4 @@
 *.app

 cmake-build-debug/
-.idea/
-
+.idea/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -3,7 +3,7 @@ project(NurbsEvaluator LANGUAGES CXX CUDA)

 set(CMAKE_CUDA_STANDARD 14)

-add_executable(NurbsEvaluator main.cpp NurbsEvaluator.cu NurbsEvaluator.cuh utils.cpp utils.h NurbsBasis.cu NurbsBasis.cuh)
+add_executable(NurbsEvaluator src/main.cpp src/utils.cpp include/utils.h src/device/NurbsCommon.cu include/device/NurbsCommon.cuh src/device/DeviceUtils.cu include/device/DeviceUtils.cuh src/device/NurbsSurface.cu include/device/NurbsSurface.cuh src/device/NurbsCurve.cu include/device/NurbsCurve.cuh)

 #add_compile_options("$<$<C_COMPILER_ID:MSVC>:/utf-8>")
 #add_compile_options("$<$<CXX_COMPILER_ID:MSVC>:/utf-8>")
@ -14,7 +14,19 @@ add_executable(NurbsEvaluator main.cpp NurbsEvaluator.cu NurbsEvaluator.cuh util
 #add_library(NurbsEvaluator NurbsEvaluator.cu NurbsEvaluator.cuh utils.cpp utils.h)

 # 引用系统环境变量CUDA_PATH
+# linux
+#include_directories("/usr/local/device-11.8/targets/x86_64-linux/include")
+# windows
 include_directories("$ENV{CUDA_PATH}/include")

+#MESSAGE("CUDA PATH::: $ENV{CUDA_PATH}")
+#MESSAGE("CUDA PATH::: $ENV{LD_LIBRARY_PATH}")
+#MESSAGE("CUDA PATH::: $ENV{CPATH}")
+
 set_target_properties(NurbsEvaluator PROPERTIES
        CUDA_SEPARABLE_COMPILATION ON)
+
+
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    set(CMAKE_CUDA_ARCHITECTURES 70 75 80)
+endif(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
--- a/NurbsBasis.cu
+++ b/NurbsBasis.cu
@ -1,38 +0,0 @@
-//
-// Created by 14727 on 2022/11/19.
-//
-
-#include "NurbsBasis.cuh"
-
-__device__ void d_basisFunction1(float *N_Texture, const float *knots, float u, int degree, int d_knotsCnt) {
-    int m = d_knotsCnt - 1;
-    for (int p = 0; p <= degree; p++) {
-        for (int i = 0; i + p <= m - 1; i++) {
-            if (p == 0) {
-                if ((u > knots[i] || d_floatEqual1(u, knots[i])) && (u < knots[i + 1])
-                    ||
-                    d_floatEqual1(u, knots[i + 1]) && d_floatEqual1(u, knots[m])) {
-                    N_Texture[p * m + i] = 1.0;
-                } else {
-                    N_Texture[p * m + i] = 0.0;
-                }
-            } else {
-                float Nip_1 = N_Texture[(p - 1) * m + i];
-                float Ni1p_1 = N_Texture[(p - 1) * m + i + 1];
-                float left = d_floatEqual1(knots[i + p], knots[i]) ? 0 : (u - knots[i]) * Nip_1 /
-                                                                        (knots[i + p] - knots[i]);
-                float right = d_floatEqual1(knots[i + p + 1], knots[i + 1]) ? 0 : (knots[i + p + 1] - u) * Ni1p_1 /
-                                                                                 (knots[i + p + 1] - knots[i + 1]);
-                N_Texture[p * m + i] = left + right;
-            }
-        }
-    }
-}
-
-__device__ bool d_floatEqual1(float a, float b) {
-    return abs(a - b) < 0.00001;
-}
-
-void NurbsBasis::myPrint11(int a, int b) {
-        printf("In NurbsBasis %d!!!\n", a * b);
-}
--- a/NurbsBasis.cuh
+++ b/NurbsBasis.cuh
@ -1,28 +0,0 @@
-//
-// Created by 14727 on 2022/11/19.
-//
-
-#ifndef NURBSEVALUATOR_NURBSBASIS_CUH
-#define NURBSEVALUATOR_NURBSBASIS_CUH
-
-#include <cuda_runtime.h>
-#include "cstdio"
-
-/**
- * 当u值已知时，根据基函数N的递推表达式，采用动态规划的方式求解N值
- * @param N_Texture 结果返回在N_Texture中
- */
-extern __device__ void d_basisFunction1(float *N_Texture, const float *knots, float u, int degree, int d_knotsCnt);
-
-/**
- * device中判断两个浮点数是否相等。与CPU中一样，GPU中的浮点数也存在很小的误差，直接使用==判断往往容易将相等误判为不等
- * @return true：相等
- */
-extern __device__ bool d_floatEqual1(float a, float b);
-
-namespace NurbsBasis {
-    void myPrint11(int a, int b);
-};
-
-
-#endif //NURBSEVALUATOR_NURBSBASIS_CUH
--- a/NurbsEvaluator.cu
+++ b/NurbsEvaluator.cu
@ -1,791 +0,0 @@
-//
-// Created by 14727 on 2022/11/7.
-//
-
-#include <utility>
-
-#include "NurbsEvaluator.cuh"
-#include "cstdio"
-#include "utils.h"
-//#include "NurbsBasis.cuh"
-
-__device__ void normalization(float &a, float &b, float &c) {
-    float sumA = a * a;
-    float sumB = b * b;
-    float sumC = c * c;
-    float sum = sumA + sumB + sumC;
-    a = sqrt(sumA / sum);
-    b = sqrt(sumB / sum);
-    c = sqrt(sumC / sum);
-}
-
-//extern __device__ void NurbsBasis::d_basisFunction(float *N_Texture, const float *knots, float u, int degree, int d_knotsCnt) {};
-
-__host__ NurbsSurface::Evaluator::Evaluator(std::vector<std::vector<std::vector<float>>> controlPoints,
-                                            std::vector<float> knots_u, std::vector<float> knots_v) {
-    this->knots_u = std::move(knots_u);
-    this->knots_v = std::move(knots_v);
-    this->controlPoints = std::move(controlPoints);
-    recordTime = false;
-    d_nTexture_u = nullptr;
-    d_nTexture_v = nullptr;
-    d_nTexture1_u = nullptr;
-    d_nTexture1_v = nullptr;
-    d_knots_u = nullptr;
-    d_knots_v = nullptr;
-    d_points = nullptr;
-    d_derivatives = nullptr;
-}
-
-__host__ std::vector<std::map<std::pair<float, float>, std::vector<float>>>
-NurbsSurface::Evaluator::evaluate(int sampleCnt_u, int sampleCnt_v) {
-    if (POINT_SIZE != controlPoints[0][0].size()) {
-        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
-        return {};
-    }
-
-    // 构造指向device的controlPoints
-    const int pointsCnt_u = controlPoints.size(), pointsCnt_v = controlPoints[0].size();
-    const int pointsBytes = pointsCnt_u * pointsCnt_v * POINT_SIZE * sizeof(float);
-    auto *h_points = (float *) malloc(pointsBytes);
-    for (int i = 0; i < pointsCnt_u; i++) {
-        for (int j = 0; j < pointsCnt_v; j++) {
-            for (int k = 0; k < POINT_SIZE; k++) {
-                h_points[(i * pointsCnt_v + j) * POINT_SIZE + k] = controlPoints[i][j][k];
-            }
-        }
-    }
-    cudaMalloc((void **) &d_points, pointsBytes);
-    cudaMemcpy(d_points, h_points, pointsBytes, cudaMemcpyHostToDevice);
-
-    // 构造指向device的knots
-    const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size();
-    const int knotsBytes_u = knotsCnt_u * sizeof(float), knotsBytes_v = knotsCnt_v * sizeof(float);
-    auto *h_knots_u = (float *) malloc(knotsBytes_u), *h_knots_v = (float *) malloc(knotsBytes_v);
-    for (int i = 0; i < knotsCnt_u; i++) h_knots_u[i] = knots_u[i];
-    for (int i = 0; i < knotsCnt_v; i++) h_knots_v[i] = knots_v[i];
-
-    cudaMalloc((void **) &d_knots_u, knotsBytes_u);
-    cudaMalloc((void **) &d_knots_v, knotsBytes_v);
-    cudaMemcpy(d_knots_u, h_knots_u, knotsBytes_u, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_knots_v, h_knots_v, knotsBytes_v, cudaMemcpyHostToDevice);
-
-    // 构造nTexture
-    cudaMalloc((void **) &d_nTexture_u,
-               sampleCnt_u * pointsCnt_u * sizeof(float)); // 注意nTexture的大小，在算梯度时用得到i=pointsCnt + 1的基函数值
-    cudaMalloc((void **) &d_nTexture_v, sampleCnt_v * pointsCnt_v * sizeof(float));
-
-    // 构造nTexture1
-    cudaMalloc((void **) &d_nTexture1_u, sampleCnt_u * (pointsCnt_u + 1) * sizeof(float));
-    cudaMalloc((void **) &d_nTexture1_v, sampleCnt_v * (pointsCnt_v + 1) * sizeof(float));
-
-    // 构造g_basisTexture线程层级
-    dim3 blockBasis(512);
-    dim3 gridBasis_u((sampleCnt_u + blockBasis.x - 1) / blockBasis.x);
-    dim3 gridBasis_v((sampleCnt_v + blockBasis.x - 1) / blockBasis.x);
-
-    // 构造线程层级，调用核函数
-    dim3 block(32, 32);
-    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);
-    // 记录用时
-    double time_cost_device;
-    g_basisTexture<<<gridBasis_u, blockBasis>>>(d_nTexture_u, d_nTexture1_u, d_knots_u, pointsCnt_u, knotsCnt_u,
-                                                sampleCnt_u);
-    cudaDeviceSynchronize();
-    g_basisTexture<<<gridBasis_v, blockBasis>>>(d_nTexture_v, d_nTexture1_v, d_knots_v, pointsCnt_v, knotsCnt_v,
-                                                sampleCnt_v);
-    cudaDeviceSynchronize();
-
-    if (recordTime) time_cost_device = utils::get_time_windows();
-    g_evaluate <<<grid, block>>>(d_nTexture_u, d_nTexture_v, d_points, pointsCnt_u, pointsCnt_v, POINT_SIZE,
-                                 knots_u[knotsCnt_u - 1], knots_v[knotsCnt_v - 1], sampleCnt_u, sampleCnt_v);
-    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
-    if (recordTime) {
-        time_cost_device = utils::get_time_windows() - time_cost_device;
-        printf("GPU time cost of surface evaluation for %d samples: %lf\n", sampleCnt_u * sampleCnt_v,
-               time_cost_device);
-    }
-
-    // 释放内存
-    free(h_points);
-    free(h_knots_u);
-    free(h_knots_v);
-    printf("First derivatives and normal vectors calculated by GPU:\n");
-    return {};
-}
-
-
-__host__ std::vector<std::map<float, std::vector<float>>>
-NurbsCurve::Evaluator::evaluate(int sampleCnt) {
-
-    if (POINT_SIZE != controlPoints[0].size()) {
-        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
-        return {};
-    }
-    // 构造指向device的controlPoints
-    const int pointsCnt = controlPoints.size();
-    const int pointsBytes = pointsCnt * POINT_SIZE * sizeof(float);
-    auto *h_points = (float *) malloc(pointsBytes);
-    for (int i = 0; i < pointsCnt; i++) {
-        for (int j = 0; j < POINT_SIZE; j++) {
-            h_points[i * POINT_SIZE + j] = controlPoints[i][j];
-        }
-    }
-    myCudaFree(d_points);  // 注意内存管理
-    cudaMalloc((void **) &d_points, pointsBytes);
-    cudaMemcpy(d_points, h_points, pointsBytes, cudaMemcpyHostToDevice);
-
-    // 构造指向device的knots
-    const int knotsCnt = knots.size();
-    const int knotsBytes = knotsCnt * sizeof(float);
-    auto *h_knots = (float *) malloc(knotsBytes);
-    for (int i = 0; i < knotsCnt; i++) h_knots[i] = knots[i];
-    myCudaFree(d_knots);  // 注意内存管理
-    cudaMalloc((void **) &d_knots, knotsBytes);
-    cudaMemcpy(d_knots, h_knots, knotsBytes, cudaMemcpyHostToDevice);
-
-    // 分配nTexture的内存。只需要GPU内存
-//    float *d_nTexture = nullptr;
-    myCudaFree(d_nTexture);  // 注意内存管理
-    cudaMalloc((void **) &d_nTexture,
-               sampleCnt * pointsCnt * sizeof(float)); // 注意nTexture的大小，在算梯度时用得到i=pointsCnt + 1的基函数值
-
-    // 分配nTexture1的内存。只需要GPU内存
-//    float *d_nTexture1 = nullptr;
-    myCudaFree(d_nTexture1);  // 注意内存管理
-    cudaMalloc((void **) &d_nTexture1, sampleCnt * (pointsCnt + 1) * sizeof(float));
-
-    // 构造g_basisTexture线程层级
-    dim3 blockBasis(512);
-    dim3 gridBasis((sampleCnt + blockBasis.x - 1) / blockBasis.x);
-
-    // 构造线程层级
-    dim3 block(32, 32);
-    dim3 grid((sampleCnt + block.x * block.y - 1) / (block.x * block.y));
-    // 记录用时
-    double time_cost_device;
-    if (recordTime) time_cost_device = utils::get_time_windows();
-    printf("there..\n");
-    g_basisTexture <<<gridBasis, blockBasis>>>(d_nTexture, d_nTexture1, d_knots, pointsCnt, knotsCnt, sampleCnt);
-//    cudaMemcpy(d_nTextureCpy, d_nTexture, nTextureBytes, cudaMemcpyDeviceToDevice); // 有同步功能
-    cudaDeviceSynchronize();
-    printf("here..\n");
-    g_evaluate <<<grid, block>>>(d_nTexture, d_points, pointsCnt, POINT_SIZE, knots[knotsCnt - 1], sampleCnt);
-//    g_test<<<1,6>>>(d_nTextureCpy);
-    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
-    if (recordTime) {
-        time_cost_device = utils::get_time_windows() - time_cost_device;
-        printf("GPU time cost of curve evaluation for %d samples: %lf\n", sampleCnt, time_cost_device);
-    }
-    free(h_points);
-    free(h_knots);
-    return {};
-}
-
-__host__ void NurbsSurface::Evaluator::derivative(int sampleCnt_u, int sampleCnt_v) {
-    // 先完成evaluation
-    evaluate(sampleCnt_u, sampleCnt_v);
-
-    if (POINT_SIZE != controlPoints[0][0].size()) {
-        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
-        return;
-    }
-
-    float *d_derTexture_u = nullptr;
-    float *d_derTexture_v = nullptr;
-    const int pointsCnt_u = controlPoints.size(), pointsCnt_v = controlPoints[0].size();
-    const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size();
-    cudaMalloc((void **) &d_derTexture_u, sampleCnt_u * pointsCnt_u * sizeof(float));
-    cudaMalloc((void **) &d_derTexture_v, sampleCnt_v * pointsCnt_v * sizeof(float));
-
-    // 构造切向量计算结果
-    myCudaFree(d_derivatives);
-    cudaMalloc((void **) &d_derivatives,
-               sampleCnt_v * sampleCnt_u * 6 * sizeof(float)); // 每个采用所求的切向量是一个六元向量，前三位是对u的偏导、后三位是对v的偏导
-
-    // 构造线程层级
-    dim3 block(32, 32);
-    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);
-    // 构造g_basisTexture线程层级
-    dim3 blockTex(512);
-    dim3 gridTex_u((sampleCnt_u + blockTex.x - 1) / blockTex.x);
-    dim3 gridTex_v((sampleCnt_v + blockTex.x - 1) / blockTex.x);
-    // 记录用时
-    double time_cost_device;
-    if (recordTime) time_cost_device = utils::get_time_windows();
-    g_derTexture<<<gridTex_u, blockTex>>>(d_derTexture_u, d_nTexture1_u, d_knots_u, pointsCnt_u, knotsCnt_u,
-                                          sampleCnt_u);
-    g_derTexture<<<gridTex_v, blockTex>>>(d_derTexture_v, d_nTexture1_v, d_knots_v, pointsCnt_v, knotsCnt_v,
-                                          sampleCnt_v);
-    cudaDeviceSynchronize();
-    g_derivative<<<grid, block>>>(d_derivatives, d_derTexture_u, d_derTexture_v, d_nTexture_u, d_nTexture_v, d_points,
-                                  pointsCnt_u,
-                                  pointsCnt_v, POINT_SIZE, knots_u[knotsCnt_u - 1], knots_v[knotsCnt_v - 1],
-                                  sampleCnt_u,
-                                  sampleCnt_v);
-    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
-    if (recordTime) {
-        time_cost_device = utils::get_time_windows() - time_cost_device;
-        printf("GPU time cost of surface first derivative calculating for %d samples: %lf\n", sampleCnt_u * sampleCnt_v,
-               time_cost_device);
-    }
-    cudaFree(d_derTexture_u);
-    cudaFree(d_derTexture_v);
-}
-
-__host__ void NurbsCurve::Evaluator::derivative(int sampleCnt) {
-    // 先完成evaluation
-    evaluate(sampleCnt);
-
-    if (POINT_SIZE != controlPoints[0].size()) {
-        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
-        return;
-    }
-
-    float *d_derTexture = nullptr;
-    const int pointsCnt = controlPoints.size();
-    const int knotsCnt = knots.size();
-    cudaMalloc((void **) &d_derTexture, sampleCnt * pointsCnt * sizeof(float));
-
-    // 构造线程层级
-    dim3 block(32, 32);
-    dim3 grid((sampleCnt + block.x * block.y - 1) / (block.x * block.y));
-    // 构造g_basisTexture线程层级
-    dim3 blockTex(512);
-    dim3 gridTex((sampleCnt + blockTex.x - 1) / blockTex.x);
-
-
-    // 构造切向量计算结果
-    myCudaFree(d_derivatives);
-    cudaMalloc((void **) &d_derivatives, sampleCnt * 3 * sizeof(float)); // 每个采用所求的切向量是一个三维向量
-
-    // 记录用时
-    double time_cost_device;
-    if (recordTime) time_cost_device = utils::get_time_windows();
-    g_derTexture<<<gridTex, blockTex>>>(d_derTexture, d_nTexture1, d_knots, pointsCnt, knotsCnt, sampleCnt);
-    cudaDeviceSynchronize();
-    g_derivative<<<grid, block>>>(d_derivatives, d_derTexture, d_nTexture, d_points, pointsCnt, POINT_SIZE,
-                                  knots[knotsCnt - 1], sampleCnt);
-    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
-    if (recordTime) {
-        time_cost_device = utils::get_time_windows() - time_cost_device;
-        printf("GPU time cost of curve first derivative calculating for %d samples: %lf\n", sampleCnt,
-               time_cost_device);
-    }
-    cudaFree(d_derTexture);
-}
-
-__host__ void NurbsSurface::Evaluator::curvature(int sampleCnt_u, int sampleCnt_v) {
-    // 先计算切向量
-    derivative(sampleCnt_u, sampleCnt_v);
-    if (POINT_SIZE != controlPoints[0][0].size()) {
-        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
-        return;
-    }
-
-    // 构造线程层级
-    dim3 block(32, 32);
-    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);
-
-    // 记录用时
-    double time_cost_device;
-    if (recordTime) time_cost_device = utils::get_time_windows();
-    g_curvature<<<grid, block>>>(d_derivatives, sampleCnt_u, sampleCnt_v, knots_u[knots_u.size() - 1],
-                                 knots_v[knots_v.size() - 1]);
-    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
-    if (recordTime) {
-        time_cost_device = utils::get_time_windows() - time_cost_device;
-        printf("GPU time cost of surface second derivative calculating for %d samples: %lf\n",
-               sampleCnt_u * sampleCnt_v,
-               time_cost_device);
-    }
-}
-
-__host__ void NurbsCurve::Evaluator::curvature(int sampleCnt) {
-    // 先计算切向量
-    derivative(sampleCnt);
-
-    if (POINT_SIZE != controlPoints[0].size()) {
-        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
-        return;
-    }
-
-    // 构造线程层级
-    dim3 block(32, 32);
-    dim3 grid((sampleCnt + block.x * block.y - 1) / (block.x * block.y));
-
-    // 记录用时
-    double time_cost_device;
-    if (recordTime) time_cost_device = utils::get_time_windows();
-    g_curvature<<<grid, block>>>(d_derivatives, sampleCnt, knots[knots.size() - 1]);
-    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
-    if (recordTime) {
-        time_cost_device = utils::get_time_windows() - time_cost_device;
-        printf("GPU time cost of curve second derivative calculating for %d samples: %lf\n", sampleCnt,
-               time_cost_device);
-    }
-}
-
-
-//__global__ void
-//NurbsSurface::g_evaluate(const float *d_points, const float *d_knots_u, const float *d_knots_v,
-//                         int d_pointsCnt_u,
-//                         int d_pointsCnt_v, int d_POINT_SIZE, int d_knotsCnt_u, int d_knotsCnt_v,
-//                         int d_sampleCnt_u, int d_sampleCnt_v) {
-////    printf(" surface calculating... \n");
-//    // 二维grid和二维的block
-//    int ix = int(blockIdx.x * blockDim.x + threadIdx.x);
-//    int iy = int(blockIdx.y * blockDim.y + threadIdx.y);
-//
-//    float d_paramCeil_u = d_knots_u[d_knotsCnt_u - 1];
-//    float d_paramCeil_v = d_knots_v[d_knotsCnt_v - 1];
-//
-//    float u = ix * d_paramCeil_u / (d_sampleCnt_u - 1);
-//    float v = iy * d_paramCeil_v / (d_sampleCnt_v - 1);
-//
-//    if (u > 1.0 * d_paramCeil_u || v > 1.0 * d_paramCeil_v) {
-//        return;
-//    }
-//
-//    int d_degree_u = d_knotsCnt_u - 1 - d_pointsCnt_u;
-//    int d_degree_v = d_knotsCnt_v - 1 - d_pointsCnt_v;
-//    // 注意，在device中，全局内存还是以malloc和free的方式分配和回收的，而不是使用cudaMalloc和cudaFree
-//    auto *N_Texture_U = (float *) malloc((d_degree_u + 1) * (d_knotsCnt_u - 1) * sizeof(float));
-//    auto *N_Texture_V = (float *) malloc((d_degree_v + 1) * (d_knotsCnt_v - 1) * sizeof(float));
-//    d_basisFunction(N_Texture_U, d_knots_u, u, d_degree_u, d_knotsCnt_u);
-//    d_basisFunction(N_Texture_V, d_knots_v, v, d_degree_v, d_knotsCnt_v);
-//    float x = 0., y = 0., z = 0.;
-//    for (int i = 0; i < d_pointsCnt_u; i++) {
-//        for (int j = 0; j < d_pointsCnt_v; j++) {
-//            float N_U = N_Texture_U[d_degree_u * (d_knotsCnt_u - 1) + i];
-//            float N_V = N_Texture_V[d_degree_v * (d_knotsCnt_v - 1) + j];
-//            int idx = (i * d_pointsCnt_v + j) * d_POINT_SIZE;
-//            x += N_U * N_V * d_points[idx];
-//            y += N_U * N_V * d_points[idx + 1];
-//            z += N_U * N_V * d_points[idx + 2];
-//        }
-//    }
-//    printf("(%g, %g)-->(%g, %g, %g)\n", u, v, x, y, z); // %g输出，舍弃无意义的0
-//    free(N_Texture_U);
-//    free(N_Texture_V);
-//}
-
-__global__ void
-NurbsSurface::g_evaluate(const float *d_nTexture_u, const float *d_nTexture_v, const float *d_points, int d_pointsCnt_u,
-                         int d_pointsCnt_v, int d_POINT_SIZE, float d_lastKnot_u, float d_lastKnot_v, int d_sampleCnt_u,
-                         int d_sampleCnt_v) {
-
-    // 二维grid和二维的block
-    int ix = blockIdx.x * blockDim.x + threadIdx.x;
-    int iy = blockIdx.y * blockDim.y + threadIdx.y;
-
-    float u = ix * d_lastKnot_u / (d_sampleCnt_u - 1);
-    float v = iy * d_lastKnot_v / (d_sampleCnt_v - 1);
-
-    if (u > 1.0 * d_lastKnot_u || v > 1.0 * d_lastKnot_v) {
-        return;
-    }
-
-    float x = 0., y = 0., z = 0., sumW = 0.;
-    for (int i = 0; i < d_pointsCnt_u; i++) {
-        float N_U = d_nTexture_u[ix * d_pointsCnt_u + i];
-        for (int j = 0; j < d_pointsCnt_v; j++) {
-            float N_V = d_nTexture_v[iy * d_pointsCnt_v + j];
-            int idx = (i * d_pointsCnt_v + j) * d_POINT_SIZE;
-            float w = d_points[idx + 3];
-            x += N_U * N_V * w * d_points[idx];
-            y += N_U * N_V * w * d_points[idx + 1];
-            z += N_U * N_V * w * d_points[idx + 2];
-            sumW += N_U * N_V * w;
-        }
-    }
-    x = x / sumW;
-    y = y / sumW;
-    z = z / sumW;
-//    printf("(%g, %g)-->(%g, %g, %g)\n", u, v, x, y, z); // %g输出，舍弃无意义的0
-}
-
-__global__ void
-NurbsSurface::g_derivative(float *derivatives, const float *derTexture_u, const float *derTexture_v,
-                           const float *nTexture_u, const float *nTexture_v, const float *d_points, int d_pointsCnt_u,
-                           int d_pointsCnt_v, int d_POINT_SIZE, float d_lastKnot_u, float d_lastKnot_v,
-                           int d_sampleCnt_u, int d_sampleCnt_v) {
-    // 二维grid和二维的block
-    int ix = blockIdx.x * blockDim.x + threadIdx.x;
-    int iy = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (ix >= d_sampleCnt_u || iy >= d_sampleCnt_v) {
-        return;
-    }
-    float u = ix * d_lastKnot_u / (d_sampleCnt_u - 1);
-    float v = iy * d_lastKnot_v / (d_sampleCnt_v - 1);
-
-    float nubsPdx_u = 0., nubsPdy_u = 0, nubsPdz_u = 0., nubsPdw_u = 0.;
-    float nubsPdx_v = 0., nubsPdy_v = 0, nubsPdz_v = 0., nubsPdw_v = 0.;
-
-    for (int i = 0; i < d_pointsCnt_u; i++) {
-        for (int j = 0; j < d_pointsCnt_u; j++) {
-            int baseIdx = (i * d_pointsCnt_v + j) * d_POINT_SIZE;
-            float factor_u = derTexture_u[ix * d_pointsCnt_u + i] * nTexture_v[iy * d_pointsCnt_v + j];
-            float factor_v = derTexture_v[iy * d_pointsCnt_v + j] * nTexture_u[ix * d_pointsCnt_u + i];
-            float wij = d_points[baseIdx + 3];
-            nubsPdx_u += factor_u * wij * d_points[baseIdx];
-            nubsPdy_u += factor_u * wij * d_points[baseIdx + 1];
-            nubsPdz_u += factor_u * wij * d_points[baseIdx + 2];
-            nubsPdw_u += factor_u * wij;
-
-            nubsPdx_v += factor_v * wij * d_points[baseIdx];
-            nubsPdy_v += factor_v * wij * d_points[baseIdx + 1];
-            nubsPdz_v += factor_v * wij * d_points[baseIdx + 2];
-            nubsPdw_v += factor_v * wij;
-        }
-    }
-
-    float x = 0., y = 0., z = 0., w = 0.;
-    for (int i = 0; i < d_pointsCnt_u; i++) {
-        float N_U = nTexture_u[ix * d_pointsCnt_u + i];
-        for (int j = 0; j < d_pointsCnt_v; j++) {
-            float N_V = nTexture_v[iy * d_pointsCnt_v + j];
-            int idx = (i * d_pointsCnt_v + j) * d_POINT_SIZE;
-            float wij = d_points[idx + 3];
-            x += N_U * N_V * wij * d_points[idx];
-            y += N_U * N_V * wij * d_points[idx + 1];
-            z += N_U * N_V * wij * d_points[idx + 2];
-            w += N_U * N_V * wij;
-        }
-    }
-
-    float w2 = w * w;
-    float pdx_u = (nubsPdx_u * w - x * nubsPdw_u) / w2;
-    float pdy_u = (nubsPdy_u * w - y * nubsPdw_u) / w2;
-    float pdz_u = (nubsPdz_u * w - z * nubsPdw_u) / w2;
-
-    float pdx_v = (nubsPdx_v * w - x * nubsPdw_v) / w2;
-    float pdy_v = (nubsPdy_v * w - y * nubsPdw_v) / w2;
-    float pdz_v = (nubsPdz_v * w - z * nubsPdw_v) / w2;
-//    float pdz_u = (nubsPdz_u * w - z )
-
-    int baseIdx = (ix * d_sampleCnt_v + iy) * 6;
-    derivatives[baseIdx] = pdx_u;
-    derivatives[baseIdx + 1] = pdy_u;
-    derivatives[baseIdx + 2] = pdz_u;
-    derivatives[baseIdx + 3] = pdx_v;
-    derivatives[baseIdx + 4] = pdy_v;
-    derivatives[baseIdx + 5] = pdz_v;
-
-    float x_n = pdy_u * pdz_v - pdy_v * pdz_u, y_n = pdx_v * pdz_u - pdx_u * pdz_v, z_n =
-            pdx_u * pdy_v - pdx_v * pdy_u; // 叉乘得到法向量
-    if((ix == 8 && iy == 9) || (ix == 7 && iy == 9) || (ix == 9 && iy == 9) || (ix == 8 && iy == 8) || (ix == 8 && iy == 10))
-    printf("(%g,%g)-->u:(%g, %g, %g), v:(%g,%g,%g), normal:(%g,%g,%g)\n", u, v, pdx_u, pdy_u, pdz_u, pdx_v, pdy_v,
-           pdz_v, x_n, y_n, z_n);
-}
-
-__global__ void
-NurbsSurface::g_curvature(const float *derivatives, int sampleCnt_u, int sampleCnt_v, float lastKnot_u,
-                          float lastKnot_v) {
-    // 二维grid和二维的block
-    int ix = blockIdx.x * blockDim.x + threadIdx.x;
-    int iy = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (ix >= sampleCnt_u || iy >= sampleCnt_v) {
-        return;
-    }
-
-
-    float step_u = lastKnot_u / (sampleCnt_u - 1), step_v = lastKnot_v / (sampleCnt_v - 1);
-    float u = ix * step_u, v = iy * step_v;
-
-    int baseIdx = (ix * sampleCnt_v + iy) * 6;
-    int lastBaseIdx_u = ((ix - 1) * sampleCnt_v + iy) * 6, nextBaseIdx_u = ((ix + 1) * sampleCnt_v + iy) * 6;
-    int lastBaseIdx_v = (ix * sampleCnt_v + iy - 1) * 6, nextBaseIdx_v = (ix * sampleCnt_v + iy + 1) * 6;
-
-//    printf("(%g,%g)-->u:(%g, %g, %g), v:(%g,%g,%g)\n", u, v, derivatives[baseIdx], derivatives[baseIdx + 1],
-//           derivatives[baseIdx + 2], derivatives[baseIdx + 3], derivatives[baseIdx + 4], derivatives[baseIdx + 5]);
-
-    float sndPdx_uu, sndPdy_uu, sndPdz_uu, sndPdx_vv, sndPdy_vv, sndPdz_vv;  // 二阶导
-    float sndPdx_uv, sndPdy_uv, sndPdz_uv, sndPdx_vu, sndPdy_vu, sndPdz_vu;
-
-    if (ix == 0) {
-        sndPdx_uu = (derivatives[nextBaseIdx_u] - derivatives[baseIdx]) / step_u;
-        sndPdy_uu = (derivatives[nextBaseIdx_u + 1] - derivatives[baseIdx + 1]) / step_u;
-        sndPdz_uu = (derivatives[nextBaseIdx_u + 2] - derivatives[baseIdx + 2]) / step_u;
-
-        sndPdx_vu = (derivatives[nextBaseIdx_u + 3] - derivatives[baseIdx + 3]) / step_u;
-        sndPdy_vu = (derivatives[nextBaseIdx_u + 4] - derivatives[baseIdx + 4]) / step_u;
-        sndPdz_vu = (derivatives[nextBaseIdx_u + 5] - derivatives[baseIdx + 5]) / step_u;
-    } else if (ix == sampleCnt_u - 1) {
-        sndPdx_uu = (derivatives[baseIdx] - derivatives[lastBaseIdx_u]) / step_u;
-        sndPdy_uu = (derivatives[baseIdx + 1] - derivatives[lastBaseIdx_u + 1]) / step_u;
-        sndPdz_uu = (derivatives[baseIdx + 2] - derivatives[lastBaseIdx_u + 2]) / step_u;
-
-        sndPdx_vu = (derivatives[baseIdx + 3] - derivatives[lastBaseIdx_u + 3]) / step_u;
-        sndPdy_vu = (derivatives[baseIdx + 4] - derivatives[lastBaseIdx_u + 4]) / step_u;
-        sndPdz_vu = (derivatives[baseIdx + 5] - derivatives[lastBaseIdx_u + 5]) / step_u;
-    } else {
-        sndPdx_uu = (derivatives[nextBaseIdx_u] - derivatives[lastBaseIdx_u]) / (2 * step_u);
-        sndPdy_uu = (derivatives[nextBaseIdx_u + 1] - derivatives[lastBaseIdx_u + 1]) / (2 * step_u);
-        sndPdz_uu = (derivatives[nextBaseIdx_u + 2] - derivatives[lastBaseIdx_u + 2]) / (2 * step_u);
-
-        sndPdx_vu = (derivatives[nextBaseIdx_u + 3] - derivatives[lastBaseIdx_u + 3]) / (2 * step_u);
-        sndPdy_vu = (derivatives[nextBaseIdx_u + 4] - derivatives[lastBaseIdx_u + 4]) / (2 * step_u);
-        sndPdz_vu = (derivatives[nextBaseIdx_u + 5] - derivatives[lastBaseIdx_u + 5]) / (2 * step_u);
-    }
-
-    if (iy == 0) {
-        sndPdx_vv = (derivatives[nextBaseIdx_v + 3] - derivatives[baseIdx + 3]) / step_v;
-        sndPdy_vv = (derivatives[nextBaseIdx_v + 4] - derivatives[baseIdx + 4]) / step_v;
-        sndPdz_vv = (derivatives[nextBaseIdx_v + 5] - derivatives[baseIdx + 5]) / step_v;
-
-        sndPdx_uv = (derivatives[nextBaseIdx_v] - derivatives[baseIdx]) / step_v;
-        sndPdy_uv = (derivatives[nextBaseIdx_v + 1] - derivatives[baseIdx + 1]) / step_v;
-        sndPdz_uv = (derivatives[nextBaseIdx_v + 2] - derivatives[baseIdx + 2]) / step_v;
-    } else if (iy == sampleCnt_v - 1) {
-        sndPdx_vv = (derivatives[baseIdx + 3] - derivatives[lastBaseIdx_v + 3]) / step_v;
-        sndPdy_vv = (derivatives[baseIdx + 4] - derivatives[lastBaseIdx_v + 4]) / step_v;
-        sndPdz_vv = (derivatives[baseIdx + 5] - derivatives[lastBaseIdx_v + 5]) / step_v;
-
-        sndPdx_uv = (derivatives[baseIdx] - derivatives[lastBaseIdx_v]) / step_v;
-        sndPdy_uv = (derivatives[baseIdx + 1] - derivatives[lastBaseIdx_v + 1]) / step_v;
-        sndPdz_uv = (derivatives[baseIdx + 2] - derivatives[lastBaseIdx_v + 2]) / step_v;
-    } else {
-        sndPdx_vv = (derivatives[nextBaseIdx_v + 3] - derivatives[lastBaseIdx_v + 3]) / (2 * step_v);
-        sndPdy_vv = (derivatives[nextBaseIdx_v + 4] - derivatives[lastBaseIdx_v + 4]) / (2 * step_v);
-        sndPdz_vv = (derivatives[nextBaseIdx_v + 5] - derivatives[lastBaseIdx_v + 5]) / (2 * step_v);
-
-        sndPdx_uv = (derivatives[nextBaseIdx_v] - derivatives[lastBaseIdx_v]) / (2 * step_v);
-        sndPdy_uv = (derivatives[nextBaseIdx_v + 1] - derivatives[lastBaseIdx_v + 1]) / (2 * step_v);
-        sndPdz_uv = (derivatives[nextBaseIdx_v + 2] - derivatives[lastBaseIdx_v + 2]) / (2 * step_v);
-    }
-
-    float uvx = (sndPdx_uv + sndPdx_vu) / 2, uvy = (sndPdy_uv + sndPdy_vu) / 2, uvz = (sndPdz_uv + sndPdz_vu) / 2;
-    normalization(sndPdx_uu, sndPdy_uu, sndPdz_uu);
-    normalization(uvx, uvy, uvz);
-    normalization(sndPdx_vv, sndPdy_vv, sndPdz_vv);
-
-
-    if(ix == 8 && iy == 9)
-    printf("(%g, %g) --> uu: (%g, %g, %g), uv: (%g, %g, %g), vv: (%g, %g, %g)\n", u, v, sndPdx_uu, sndPdy_uu, sndPdz_uu,
-           uvx, uvy, uvz, sndPdx_vv, sndPdy_vv, sndPdz_vv);
-}
-
-__global__ void
-NurbsCurve::g_evaluate(const float *NTexture, const float *d_points, const int d_pointsCnt,
-                       const int d_POINT_SIZE, const float d_lastKnot, const int d_sampleCnt) {
-//    printf(" curve calculating... \n");
-    // 二维grid和一维的block
-//    int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
-    // 二维block和一维grid
-    int idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
-    float u = idx * d_lastKnot / (d_sampleCnt - 1);
-    if (u > 1.0 * d_lastKnot) {
-        return;
-    }
-//
-//    int d_degree = d_knotsCnt - 1 - d_pointsCnt;
-//    // 注意，在device中，全局内存还是以malloc和free的方式分配和回收的，而不是使用cudaMalloc和cudaFree
-//    auto *N_dp = (float *) malloc((d_degree + 1) * (d_knotsCnt - 1) * sizeof(float));
-//    d_basisFunction(N_dp, d_knots, u, d_degree, d_knotsCnt);
-    float x = 0., y = 0., z = 0., sumW = 0.;
-    for (int i = 0; i < d_pointsCnt; i++) {
-        float N = NTexture[idx * d_pointsCnt + i];
-        int baseIdx = i * d_POINT_SIZE;
-        float w = d_points[baseIdx + 3];
-        x += N * w * d_points[baseIdx];
-        y += N * w * d_points[baseIdx + 1];
-        z += N * w * d_points[baseIdx + 2];
-        sumW += N * w;
-    }
-    x = x / sumW;
-    y = y / sumW;
-    z = z / sumW;
-    printf("(%g)-->(%g, %g, %g)\n", u, x, y, z); // %g输出，舍弃无意义的0
-}
-
-__global__ void
-NurbsCurve::g_derivative(float *derivatives, const float *derTexture, const float *nTexture, const float *d_points,
-                         int d_pointsCnt, int d_POINT_SIZE,
-                         float d_lastKnot, int d_sampleCnt) {
-    // 二维block和一维grid
-    int idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
-    if (idx >= d_sampleCnt) return;
-    float u = idx * d_lastKnot / (d_sampleCnt - 1);
-    float nubs_dx = 0., nubs_dy = 0., nubs_dz = 0., nubs_dw = 0.;
-//    printf("POINT_SIZE: %d\n", d_POINT_SIZE);
-
-    for (int i = 0; i < d_pointsCnt; i++) {
-        int baseIdx = i * d_POINT_SIZE;
-        float nFactor = derTexture[idx * d_pointsCnt + i];
-        float wi = d_points[baseIdx + 3];
-        nubs_dx += nFactor * wi * d_points[baseIdx];
-        nubs_dy += nFactor * wi * d_points[baseIdx + 1];
-        nubs_dz += nFactor * wi * d_points[baseIdx + 2];
-        nubs_dw += nFactor * wi;
-//        printf("(x, y, z): (%g, %g, %g)\n", d_points[baseIdx], d_points[baseIdx + 1], d_points[baseIdx + 2]);
-    }
-
-    float x = 0., y = 0., z = 0., w = 0.;
-    for (int i = 0; i < d_pointsCnt; i++) {
-        float N = nTexture[idx * d_pointsCnt + i];
-        int baseIdx = i * d_POINT_SIZE;
-        float wi = d_points[baseIdx + 3];
-        x += N * wi * d_points[baseIdx];
-        y += N * wi * d_points[baseIdx + 1];
-        z += N * wi * d_points[baseIdx + 2];
-        w += N * wi;
-    }
-    float dx = (nubs_dx * w - x * nubs_dw) / (w * w);
-    float dy = (nubs_dy * w - y * nubs_dw) / (w * w);
-    float dz = (nubs_dz * w - z * nubs_dw) / (w * w);
-    int baseIdx = idx * 3;
-    derivatives[baseIdx] = dx;
-    derivatives[baseIdx + 1] = dy;
-    derivatives[baseIdx + 2] = dz;
-    printf("(%g)-->(%g, %g, %g)\n", u, dx, dy, dz);
-}
-
-__global__ void NurbsCurve::g_curvature(const float *derivatives, int sampleCnt, float lastKnot) {
-    // 二维block和一维grid
-    int idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
-    if (idx >= sampleCnt) return;
-    float step = lastKnot / (sampleCnt - 1);
-    float u = idx * step;
-    float sndPdx, sndPdy, sndPdz;  // 二阶导
-    int baseIdx = idx * 3, lastBaseIdx = (idx - 1) * 3, nextBaseIdx = (idx + 1) * 3;
-    if (idx == 0) {
-        sndPdx = (derivatives[nextBaseIdx] - derivatives[baseIdx]) / step;
-        sndPdy = (derivatives[nextBaseIdx + 1] - derivatives[baseIdx + 1]) / step;
-        sndPdz = (derivatives[nextBaseIdx + 2] - derivatives[baseIdx + 2]) / step;
-    } else if (idx == sampleCnt - 1) {
-        sndPdx = (derivatives[baseIdx] - derivatives[lastBaseIdx]) / step;
-        sndPdy = (derivatives[baseIdx + 1] - derivatives[lastBaseIdx + 1]) / step;
-        sndPdz = (derivatives[baseIdx + 2] - derivatives[lastBaseIdx + 2]) / step;
-    } else {
-        sndPdx = (derivatives[nextBaseIdx] - derivatives[lastBaseIdx]) / (2 * step);
-        sndPdy = (derivatives[nextBaseIdx + 1] - derivatives[lastBaseIdx + 1]) / (2 * step);
-        sndPdz = (derivatives[nextBaseIdx + 2] - derivatives[lastBaseIdx + 2]) / (2 * step);
-    }
-    printf("%g --> (%g, %g, %g)\n", u, sndPdx, sndPdy, sndPdz);
-}
-
-__global__ void g_basisTexture(float *nTexture, float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt,
-                               int d_sampleCnt) {
-    // 一维grid和一维block
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;  // 采样点编号
-    float d_paramCeil = d_knots[d_knotsCnt - 1];
-    float u = idx * d_paramCeil / (d_sampleCnt - 1);
-    if (u > 1.0 * d_paramCeil) {
-        return;
-    }
-    int d_degree = d_knotsCnt - 1 - d_pointsCnt;
-    auto *N_dp = (float *) malloc((d_degree + 1) * (d_knotsCnt - 1) * sizeof(float));
-    d_basisFunction(N_dp, d_knots, u, d_degree, d_knotsCnt);
-    for (int i = 0; i < d_pointsCnt; i++) {
-        nTexture[idx * d_pointsCnt + i] = N_dp[d_degree * (d_knotsCnt - 1) + i];
-        nTexture1[idx * (d_pointsCnt + 1) + i] = N_dp[(d_degree - 1) * (d_knotsCnt - 1) + i];
-//        printf("nTexture1: %g ", nTexture1[idx * (d_pointsCnt + 1) + i]);
-    }
-    nTexture1[idx * (d_pointsCnt + 1) + d_pointsCnt] = N_dp[(d_degree - 1) * (d_knotsCnt - 1) +
-                                                            d_pointsCnt]; // nTexture1多记录一列数据
-    free(N_dp);
-}
-
-
-__global__ void
-g_derTexture(float *derTexture, const float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt,
-             int d_sampleCnt) {
-    // 一维grid和一维block
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;  // 采样点编号
-    if (idx >= d_sampleCnt) {
-        return;
-    }
-    int degree = d_knotsCnt - 1 - d_pointsCnt;
-
-//    printf("degree: %d\n", degree);
-    for (int i = 0; i < d_pointsCnt; i++) {
-        float left = d_floatEqual(d_knots[i + degree], d_knots[i]) ? 0 :
-                     nTexture1[idx * (d_pointsCnt + 1) + i] * (degree - 1) / (d_knots[i + degree] - d_knots[i]);
-        float right = d_floatEqual(d_knots[i + degree + 1], d_knots[i + 1]) ? 0 :
-                      nTexture1[idx * (d_pointsCnt + 1) + i + 1] * (degree - 1) /
-                      (d_knots[i + degree + 1] - d_knots[i + 1]);
-        derTexture[idx * d_pointsCnt + i] = left - right;
-//        printf("<%d, %d> -- %g \n", idx, i, left - right);
-//        printf("nTex1: %g \n", nTexture1[idx * (d_pointsCnt + 1) + i]);
-    }
-}
-
-__host__ NurbsCurve::Evaluator::Evaluator(std::vector<std::vector<float>> controlPoints,
-                                          std::vector<float> knots) {
-    this->knots = std::move(knots);
-    this->controlPoints = std::move(controlPoints);
-    recordTime = false;
-    d_nTexture = nullptr;
-    d_nTexture1 = nullptr;
-    d_points = nullptr;
-    d_knots = nullptr;
-    d_derivatives = nullptr;
-}
-
-
-__device__ void d_basisFunction(float *N_Texture, const float *knots, float u, int degree, int d_knotsCnt) {
-    int m = d_knotsCnt - 1;
-    for (int p = 0; p <= degree; p++) {
-        for (int i = 0; i + p <= m - 1; i++) {
-            if (p == 0) {
-                if ((u > knots[i] || d_floatEqual(u, knots[i])) && (u < knots[i + 1])
-                    ||
-                    d_floatEqual(u, knots[i + 1]) && d_floatEqual(u, knots[m])) {
-                    N_Texture[p * m + i] = 1.0;
-                } else {
-                    N_Texture[p * m + i] = 0.0;
-                }
-            } else {
-                float Nip_1 = N_Texture[(p - 1) * m + i];
-                float Ni1p_1 = N_Texture[(p - 1) * m + i + 1];
-                float left = d_floatEqual(knots[i + p], knots[i]) ? 0 : (u - knots[i]) * Nip_1 /
-                                                                        (knots[i + p] - knots[i]);
-                float right = d_floatEqual(knots[i + p + 1], knots[i + 1]) ? 0 : (knots[i + p + 1] - u) * Ni1p_1 /
-                                                                                 (knots[i + p + 1] - knots[i + 1]);
-                N_Texture[p * m + i] = left + right;
-            }
-        }
-    }
-}
-
-__device__ bool d_floatEqual(float a, float b) {
-    return abs(a - b) < 0.00001;
-}
-
-__host__ void myCudaFree(float *&p) {
-    if (p != nullptr) {
-        cudaFree(p);
-        p = nullptr;
-    }
-}
-
-
-void NurbsCurve::Evaluator::setRecordTime(bool r) {
-    recordTime = r;
-}
-
-void NurbsSurface::Evaluator::setRecordTime(bool r) {
-    recordTime = r;
-}
-
-NurbsSurface::Evaluator::~Evaluator() {
-    myCudaFree(d_nTexture_u);
-    myCudaFree(d_nTexture_v);
-    myCudaFree(d_nTexture1_u);
-    myCudaFree(d_nTexture1_v);
-    myCudaFree(d_points);
-    myCudaFree(d_knots_u);
-    myCudaFree(d_knots_v);
-    cudaDeviceReset();
-}
-
-NurbsCurve::Evaluator::~Evaluator() {
-    myCudaFree(d_nTexture);
-    myCudaFree(d_nTexture1);
-    myCudaFree(d_points);
-    myCudaFree(d_knots);
-    cudaDeviceReset();
-}
--- a/NurbsEvaluator.cuh
+++ b/NurbsEvaluator.cuh
@ -1,191 +0,0 @@
-#ifndef UNTITLED1_NURBSEVALUATOR_CUH
-#define UNTITLED1_NURBSEVALUATOR_CUH
-
-#include <cuda_runtime.h>
-#include <thrust/device_vector.h>
-#include <vector>
-#include <map>
-
-const int POINT_SIZE = 4;
-
-/**
- *  保证释放后的指针指向空。这样一来保证指针不乱指，free的时候不会出错、二来可以判断指针是否已经free
- *  注意指针是引用传参，因为要把指针本身置空
- */
-__host__ void myCudaFree(float *&p);
-
-namespace NurbsSurface {
-    /**
-     * 曲线计算的核函数
-     * @param d_pointSize 点的大小(3: [x, y, z] | 4：[x, y, z, w])
-     */
-    __global__ static void
-    g_evaluate(const float *d_nTexture_u, const float *d_nTexture_v, const float *d_points, int d_pointsCnt_u,
-               int d_pointsCnt_v, int d_pointSize, float d_lastKnot_u, float d_lastKnot_v, int d_sampleCnt_u,
-               int d_sampleCnt_v);
-
-    __global__ static void
-    g_derivative(float *derivatives, const float *derTexture_u, const float *derTexture_v, const float *nTexture_u,
-                 const float *nTexture_v,
-                 const float *d_points, int d_pointsCnt_u, int d_pointsCnt_v, int d_pointSize, float d_lastKnot_u,
-                 float d_lastKnot_v, int d_sampleCnt_u, int d_sampleCnt_v);
-
-    __global__ static void
-    g_curvature(const float *derivatives, int sampleCnt_u, int sampleCnt_v, float lastKnot_u, float lastKnot_v);
-
-    class Evaluator {
-    private:
-        std::vector<std::vector<std::vector<float>>> controlPoints;
-        float *d_points;
-        std::vector<float> knots_u;
-        std::vector<float> knots_v;
-        float *d_knots_u;
-        float *d_knots_v;
-        bool recordTime;
-
-        float *d_nTexture_u;   // u方向指向度为p时的device中的nurbs基函数矩阵
-        float *d_nTexture_v;   // v方向指向度为p时的device中的nurbs基函数矩阵
-        float *d_nTexture1_u;   // u方向指向度为p-1时的device中的nurbs基函数矩阵
-        float *d_nTexture1_v;   // v方向指向度为p-1时的device中的nurbs基函数矩阵
-
-        float *d_derivatives;  // 一阶导计算结果
-
-//        int sampleCnt_u;
-//        int sampleCnt_v;
-
-    public:
-        /**
-         * 构造函数
-         * @param controlPoints 控制点矩阵[pointsCnt_u][pointsCnt_v][3]
-         * @param knots_u u方向knots
-         * @param knots_v v方向knots
-         */
-        __host__ explicit Evaluator(std::vector<std::vector<std::vector<float>>> controlPoints,
-                                    std::vector<float> knots_u, std::vector<float> knots_v);
-
-        /**
-         * 供外部CPU程序使用的、负责调用gpu并行计算的方法
-         * @param sampleCnt_u u方向采样数目
-         * @param sampleCnt_v v方向采样数目
-         * @return 由 map 组成的vector{<<u, v>, {x, y, z}>}
-         */
-        __host__ std::vector<std::map<std::pair<float, float>, std::vector<float>>>
-        evaluate(int sampleCnt_u_, int sampleCnt_v_);
-
-        /**
-         * 供外部CPU程序使用的、负责调用gpu并行计算切向量的方法
-         */
-        __host__ void derivative(int sampleCnt_u, int sampleCnt_v);
-
-        /**
-         * 供外部CPU程序使用的、负责调用gpu并行计算二阶导的方法
-         */
-        __host__ void curvature(int sampleCnt_u, int sampleCnt_v);
-
-        void setRecordTime(bool r);
-
-        ~Evaluator();
-
-    };
-}
-
-/**
- * 曲线部分
- */
-namespace NurbsCurve {
-
-    __global__ void g_test(float *nTexture);
-
-    /**
-     * 曲线计算的核函数
-     * @param d_pointSize 点的大小(3: [x, y, z] | 4：[x, y, z, w])
-     */
-    __global__ static void
-    g_evaluate(const float *NTexture, const float *d_points, int d_pointsCnt, int d_pointSize,
-               float d_lastKnot, int d_sampleCnt);
-
-    __global__ static void
-    g_derivative(float *derivatives, const float *derTexture, const float *nTexture, const float *d_points,
-                 int d_pointsCnt, int d_pointSize, float d_lastKnot,
-                 int d_sampleCnt);
-
-    __global__ static void g_curvature(const float *derivatives, int sampleCnt, float lastKnot);
-
-    class Evaluator {
-    private:
-        std::vector<std::vector<float>> controlPoints;
-        std::vector<float> knots;
-        float *d_knots;
-        float *d_points;
-        bool recordTime;
-
-        float *d_nTexture;   // 指向度为p时的device中的nurbs基函数矩阵
-        float *d_nTexture1;   // 指向度为p-1时的device中的nurbs基函数矩阵
-
-        float *d_derivatives{};  // 一阶导计算结果
-
-    public:
-        /**
-         * 构造函数
-         * @param controlPoints 控制点矩阵[pointsCnt][3]
-         */
-        __host__ explicit Evaluator(std::vector<std::vector<float>> controlPoints, std::vector<float> knots);
-
-        /**
-         * 供外部CPU程序使用的、负责调用gpu并行进行evaluation的方法
-         * @param sampleCnt_ 在参数域内均匀采样的采样数，它会更新成员变量中的sampleCnt
-         * @return 由 map 组成的vector{<u, {x, y, z}>}
-         */
-        __host__ std::vector<std::map<float, std::vector<float>>> evaluate(int sampleCnt_);
-
-        /**
-         * 供外部CPU程序使用的、负责调用gpu并行计算切向量的方法
-         */
-        __host__ void derivative(int sampleCnt);
-
-        /**
-         * 供外部CPU程序使用的、负责调用gpu并行计算二阶导的方法
-         */
-        __host__ void curvature(int sampleCnt);
-
-        __host__ ~Evaluator();
-
-        void setRecordTime(bool r);
-    };
-}
-
-
-
-
-
-/**
- * 计算并保存基函数值
- * @param nTexture 记录度数为p的基函数值，规模为【sampleCnt，pointsCnt】
- * @param nTexture1 记录度数为p-1的基函数值，规模为【sampleCnt+1，pointsCnt】
- */
-__global__ static void
-g_basisTexture(float *nTexture, float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt,
-               int d_sampleCnt);
-
-/**
- * 计算并保存基函数对采样点切向量的分量值
- * @param derTexture 记录度数为p的Nurbs基函数对采样点切向量的分量值，大小为【sampleCnt，pointsCnt】
- * @param nTexture1 度数为p-1的基函数值，规模为【sampleCnt+1，pointsCnt】
- */
-__global__ static void
-g_derTexture(float *derTexture, const float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt,
-             int d_sampleCnt);
-
-/**
- * 当u值已知时，根据基函数N的递推表达式，采用动态规划的方式求解N值
- * @param N_Texture 结果返回在N_Texture中
- */
-__device__ void d_basisFunction(float *nTexture, const float *knots, float u, int degree, int d_knotsCnt);
-
-/**
- * device中判断两个浮点数是否相等。与CPU中一样，GPU中的浮点数也存在很小的误差，直接使用==判断往往容易将相等误判为不等
- * @return true：相等
- */
-__device__ bool d_floatEqual(float a, float b);
-
-#endif
--- a/README.md
+++ b/README.md
@ -15,13 +15,15 @@ A tool for evaluating multiple NURBS curve/surface points using the GPU.
 根据CMake构建项目，运行main文件即可生成可执行文件
 ### 作为依赖使用
 1. CMakeLists.txt中注释以下代码，不再生成可执行文件
+
 ```cmake
-add_executable(NurbsEvaluator main.cpp NurbsEvaluator.cu NurbsEvaluator.cuh utils.cpp utils.h)
+add_executable(NurbsEvaluator src/main.cpp src/utils.cpp include/utils.h)
 ```
 2. CMakeLists.txt中取消注释以下代码，表示需要生成静态库。构建项目。
+
 ```cmake
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
-add_library(NurbsEvaluator NurbsEvaluator.cu NurbsEvaluator.cuh utils.cpp utils.h)
+add_library(NurbsEvaluator src/utils.cpp include/utils.h)
 ```
 3. 原项目连接到生成的静态库目录。如果是由CMake构建，可以用 target_link_libraries 指向依赖(.lib)，用 target_include_directories 指向头文件目录。
 4. 使用.cu文件调用NurbsEvaluator
--- a/include/device/DeviceUtils.cuh
+++ b/include/device/DeviceUtils.cuh
@ -0,0 +1,23 @@
+//
+// Created by 14727 on 2022/12/9.
+//
+
+#ifndef NURBSEVALUATOR_DEVICEUTILS_CUH
+#define NURBSEVALUATOR_DEVICEUTILS_CUH
+
+
+__device__ void d_safeFree(float * &p);
+
+/**
+ * device中判断两个浮点数是否相等。与CPU中一样，GPU中的浮点数也存在很小的误差，直接使用==判断往往容易将相等误判为不等
+ * @return true：相等
+ */
+__device__ bool d_floatEqual(float a, float b);
+
+/**
+ * 正则归一化，一般用于方向向量的归一化
+ */
+__device__ void normalization(float &a, float &b, float &c);
+
+
+#endif //NURBSEVALUATOR_DEVICEUTILS_CUH
--- a/include/device/NurbsCommon.cuh
+++ b/include/device/NurbsCommon.cuh
@ -0,0 +1,32 @@
+//
+// Created by 14727 on 2022/11/19.
+//
+
+#ifndef NURBSEVALUATOR_NURBSCOMMON_CUH
+#define NURBSEVALUATOR_NURBSCOMMON_CUH
+
+/**
+ * 当u值已知时，根据基函数N的递推表达式，采用动态规划的方式求解N值
+ * @param N_Texture 结果返回在N_Texture中
+ */
+__device__ void d_basisFunction(float *nTexture, const float *knots, float u, int degree, int d_knotsCnt);
+
+/**
+ * 计算并保存基函数值
+ * @param nTexture 记录度数为p的基函数值，规模为【sampleCnt，pointsCnt】
+ * @param nTexture1 记录度数为p-1的基函数值，规模为【sampleCnt+1，pointsCnt】
+ */
+__global__ void
+g_basisTexture(float *nTexture, float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt,
+               int d_sampleCnt);
+
+/**
+ * 计算并保存基函数对采样点切向量的分量值
+ * @param derTexture 记录度数为p的Nurbs基函数对采样点切向量的分量值，大小为【sampleCnt，pointsCnt】
+ * @param nTexture1 度数为p-1的基函数值，规模为【sampleCnt+1，pointsCnt】
+ */
+__global__ void
+g_derTexture(float *derTexture, const float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt,
+             int d_sampleCnt);
+
+#endif //NURBSEVALUATOR_NURBSCOMMON_CUH
--- a/include/device/NurbsCurve.cuh
+++ b/include/device/NurbsCurve.cuh
@ -0,0 +1,73 @@
+//
+// Created by 14727 on 2022/12/9.
+//
+
+#ifndef NURBSEVALUATOR_NURBSCURVE_CUH
+#define NURBSEVALUATOR_NURBSCURVE_CUH
+
+#include <cuda_runtime.h>
+#include <vector>
+//#include <map>
+
+namespace NurbsCurve {
+    const int POINT_SIZE = 4;
+    /**
+     * 曲线计算的核函数
+     * @param d_pointSize 点的大小(3: [x, y, z] | 4：[x, y, z, w])
+     */
+    __global__ static void
+    g_evaluate(float *res, const float *NTexture, const float *d_points, int d_pointsCnt, int d_pointSize,
+               float d_lastKnot, int d_sampleCnt);
+
+    __global__ static void
+    g_derivative(float *derivatives, const float *derTexture, const float *nTexture, const float *d_points,
+                 int d_pointsCnt, int d_pointSize, float d_lastKnot,
+                 int d_sampleCnt);
+
+    __global__ static void g_curvature(const float *derivatives, int sampleCnt, float lastKnot);
+
+    class Curve {
+    private:
+        std::vector<std::vector<float>> controlPoints;
+        std::vector<float> knots;
+        float *d_knots;
+        float *d_points;
+        bool recordTime;
+
+        float *d_nTexture;   // 指向度为p时的device中的nurbs基函数矩阵
+        float *d_nTexture1;   // 指向度为p-1时的device中的nurbs基函数矩阵
+
+        float *d_derivatives{};  // 一阶导计算结果
+
+    public:
+        /**
+         * 构造函数
+         * @param controlPoints 控制点矩阵[pointsCnt][3]
+         */
+        explicit Curve(std::vector<std::vector<float>> controlPoints, std::vector<float> knots);
+
+        /**
+         * 供外部CPU程序使用的、负责调用gpu并行进行evaluation的方法
+         * @param sampleCnt_ 在参数域内均匀采样的采样数，它会更新成员变量中的sampleCnt
+         * @return 由 map 组成的vector{<u, {x, y, z}>}
+         */
+        std::vector<std::vector<float>> evaluate(int sampleCnt_);
+
+        /**
+         * 供外部CPU程序使用的、负责调用gpu并行计算切向量的方法
+         */
+        void derivative(int sampleCnt);
+
+        /**
+         * 供外部CPU程序使用的、负责调用gpu并行计算二阶导的方法
+         */
+        void curvature(int sampleCnt);
+
+        ~Curve();
+
+        void setRecordTime(bool r);
+    };
+}
+
+
+#endif //NURBSEVALUATOR_NURBSCURVE_CUH
--- a/include/device/NurbsSurface.cuh
+++ b/include/device/NurbsSurface.cuh
@ -0,0 +1,85 @@
+//
+// Created by 14727 on 2022/12/9.
+//
+
+#ifndef NURBSEVALUATOR_NURBSSURFACE_CUH
+#define NURBSEVALUATOR_NURBSSURFACE_CUH
+#include <vector>
+#include "cuda_runtime.h"
+
+namespace NurbsSurface {
+    const int POINT_SIZE = 4;
+    /**
+     * 曲线计算的核函数
+     * @param d_pointSize 点的大小(3: [x, y, z] | 4：[x, y, z, w])
+     */
+    __global__ static void
+    g_evaluate(float* res, const float *d_nTexture_u, const float *d_nTexture_v, const float *d_points, int d_pointsCnt_u,
+               int d_pointsCnt_v, int d_pointSize, float d_lastKnot_u, float d_lastKnot_v, int d_sampleCnt_u,
+               int d_sampleCnt_v);
+
+    __global__ static void
+    g_derivative(float *derivatives, const float *derTexture_u, const float *derTexture_v, const float *nTexture_u,
+                 const float *nTexture_v,
+                 const float *d_points, int d_pointsCnt_u, int d_pointsCnt_v, int d_pointSize, float d_lastKnot_u,
+                 float d_lastKnot_v, int d_sampleCnt_u, int d_sampleCnt_v);
+
+    __global__ static void
+    g_curvature(const float *derivatives, int sampleCnt_u, int sampleCnt_v, float lastKnot_u, float lastKnot_v);
+
+    class Surface {
+    private:
+        std::vector<std::vector<std::vector<float>>> controlPoints;
+        float *d_points;
+        std::vector<float> knots_u;
+        std::vector<float> knots_v;
+        float *d_knots_u;
+        float *d_knots_v;
+        bool recordTime;
+
+        float *d_nTexture_u;   // u方向指向度为p时的device中的nurbs基函数矩阵
+        float *d_nTexture_v;   // v方向指向度为p时的device中的nurbs基函数矩阵
+        float *d_nTexture1_u;   // u方向指向度为p-1时的device中的nurbs基函数矩阵
+        float *d_nTexture1_v;   // v方向指向度为p-1时的device中的nurbs基函数矩阵
+
+        float *d_derivatives;  // 一阶导计算结果
+
+    public:
+        /**
+         * 构造函数
+         * @param controlPoints 控制点矩阵[pointsCnt_u][pointsCnt_v][3]
+         * @param knots_u u方向knots
+         * @param knots_v v方向knots
+         */
+        __host__ explicit Surface(std::vector<std::vector<std::vector<float>>> controlPoints,
+                                    std::vector<float> knots_u, std::vector<float> knots_v);
+
+        /**
+         * 供外部CPU程序使用的、负责调用gpu并行计算的方法
+         * @param sampleCnt_u u方向采样数目
+         * @param sampleCnt_v v方向采样数目
+         * @return 由 map 组成的vector{<<u, v>, {x, y, z}>}
+         */
+        __host__ std::vector<std::vector<std::vector<float>>>
+        evaluate(int sampleCnt_u_, int sampleCnt_v_);
+
+        /**
+         * 供外部CPU程序使用的、负责调用gpu并行计算切向量的方法
+         */
+        __host__ void derivative(int sampleCnt_u, int sampleCnt_v);
+
+        /**
+         * 供外部CPU程序使用的、负责调用gpu并行计算二阶导的方法
+         */
+        __host__ void curvature(int sampleCnt_u, int sampleCnt_v);
+
+        void setRecordTime(bool r);
+
+        ~Surface();
+
+    };
+}
+
+
+
+#endif //NURBSEVALUATOR_NURBSSURFACE_CUH
--- a/include/utils.h
+++ b/include/utils.h
@ -0,0 +1,22 @@
+#ifndef UNTITLED1_UTILS_H
+#define UNTITLED1_UTILS_H
+
+#define IN_UNIX 0  // 确定当前运行的操作系统（需要通过系统调用获得时间）
+
+#if IN_UNIX
+#include <sys/time.h>
+#include <ctime>
+double get_time();
+#else
+#include <windows.h>
+double get_time();
+#endif
+
+/**
+ *  保证释放后的指针指向空。这样一来保证指针不乱指，free的时候不会出错、二来可以判断指针是否已经free
+ *  注意指针是引用传参，因为要把指针本身置空
+ */
+void safeCudaFree(float *&p);
+void safeFree(float *&p);
+
+#endif //UNTITLED1_UTILS_H
--- a/src/device/DeviceUtils.cu
+++ b/src/device/DeviceUtils.cu
@ -0,0 +1,26 @@
+//
+// Created by 14727 on 2022/12/9.
+//
+
+#include "../../include/device/DeviceUtils.cuh"
+
+__device__ void d_safeFree(float *&p) {
+    if (p != nullptr) {
+        free(p);
+        p = nullptr;
+    }
+}
+
+__device__ bool d_floatEqual(float a, float b) {
+    return abs(a - b) < 0.00001;
+}
+
+__device__ void normalization(float &a, float &b, float &c) {
+    float sumA = a * a;
+    float sumB = b * b;
+    float sumC = c * c;
+    float sum = sumA + sumB + sumC;
+    a = sqrt(sumA / sum);
+    b = sqrt(sumB / sum);
+    c = sqrt(sumC / sum);
+}
--- a/src/device/NurbsCommon.cu
+++ b/src/device/NurbsCommon.cu
@ -0,0 +1,76 @@
+//
+// Created by 14727 on 2022/11/19.
+//
+
+#include "../../include/device/NurbsCommon.cuh"
+#include "../../include/device/DeviceUtils.cuh"
+
+__device__ void d_basisFunction(float *N_Texture, const float *knots, float u, int degree, int d_knotsCnt) {
+    int m = d_knotsCnt - 1;
+    for (int p = 0; p <= degree; p++) {
+        for (int i = 0; i + p <= m - 1; i++) {
+            if (p == 0) {
+                if ((u > knots[i] || d_floatEqual(u, knots[i])) && (u < knots[i + 1])
+                    ||
+                    d_floatEqual(u, knots[i + 1]) && d_floatEqual(u, knots[m])) {
+                    N_Texture[p * m + i] = 1.0;
+                } else {
+                    N_Texture[p * m + i] = 0.0;
+                }
+            } else {
+                float Nip_1 = N_Texture[(p - 1) * m + i];
+                float Ni1p_1 = N_Texture[(p - 1) * m + i + 1];
+                float left = d_floatEqual(knots[i + p], knots[i]) ? 0 : (u - knots[i]) * Nip_1 /
+                                                                        (knots[i + p] - knots[i]);
+                float right = d_floatEqual(knots[i + p + 1], knots[i + 1]) ? 0 : (knots[i + p + 1] - u) * Ni1p_1 /
+                                                                                 (knots[i + p + 1] - knots[i + 1]);
+                N_Texture[p * m + i] = left + right;
+            }
+        }
+    }
+}
+
+__global__ void g_basisTexture(float *nTexture, float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt,
+                               int d_sampleCnt) {
+    // 一维grid和一维block
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;  // 采样点编号
+    float d_paramCeil = d_knots[d_knotsCnt - 1];
+    float u = idx * d_paramCeil / (d_sampleCnt - 1);
+    if (u > 1.0 * d_paramCeil) {
+        return;
+    }
+    int d_degree = d_knotsCnt - 1 - d_pointsCnt;
+    auto *N_dp = (float *) malloc((d_degree + 1) * (d_knotsCnt - 1) * sizeof(float));
+    d_basisFunction(N_dp, d_knots, u, d_degree, d_knotsCnt);
+    for (int i = 0; i < d_pointsCnt; i++) {
+        nTexture[idx * d_pointsCnt + i] = N_dp[d_degree * (d_knotsCnt - 1) + i];
+        nTexture1[idx * (d_pointsCnt + 1) + i] = N_dp[(d_degree - 1) * (d_knotsCnt - 1) + i];
+//        printf("nTexture1: %g ", nTexture1[idx * (d_pointsCnt + 1) + i]);
+    }
+    nTexture1[idx * (d_pointsCnt + 1) + d_pointsCnt] = N_dp[(d_degree - 1) * (d_knotsCnt - 1) +
+                                                            d_pointsCnt]; // nTexture1多记录一列数据
+    d_safeFree(N_dp);
+}
+
+__global__ void
+g_derTexture(float *derTexture, const float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt,
+             int d_sampleCnt) {
+    // 一维grid和一维block
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;  // 采样点编号
+    if (idx >= d_sampleCnt) {
+        return;
+    }
+    int degree = d_knotsCnt - 1 - d_pointsCnt;
+
+//    printf("degree: %d\n", degree);
+    for (int i = 0; i < d_pointsCnt; i++) {
+        float left = d_floatEqual(d_knots[i + degree], d_knots[i]) ? 0 :
+                     nTexture1[idx * (d_pointsCnt + 1) + i] * (degree - 1) / (d_knots[i + degree] - d_knots[i]);
+        float right = d_floatEqual(d_knots[i + degree + 1], d_knots[i + 1]) ? 0 :
+                      nTexture1[idx * (d_pointsCnt + 1) + i + 1] * (degree - 1) /
+                      (d_knots[i + degree + 1] - d_knots[i + 1]);
+        derTexture[idx * d_pointsCnt + i] = left - right;
+//        printf("<%d, %d> -- %g \n", idx, i, left - right);
+//        printf("nTex1: %g \n", nTexture1[idx * (d_pointsCnt + 1) + i]);
+    }
+}
--- a/src/device/NurbsCurve.cu
+++ b/src/device/NurbsCurve.cu
@ -0,0 +1,283 @@
+//
+// Created by 14727 on 2022/12/9.
+//
+#include "../../include/device/NurbsCommon.cuh"
+#include "../../include/device/NurbsCurve.cuh"
+#include "../../include/utils.h"
+
+__global__ void
+NurbsCurve::g_evaluate(float *res, const float *NTexture, const float *d_points, const int d_pointsCnt,
+                       const int d_POINT_SIZE, const float d_lastKnot, const int d_sampleCnt) {
+//    printf(" curve calculating... \n");
+    // 二维grid和一维的block
+//    int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
+    // 二维block和一维grid
+    int idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
+    float u = idx * d_lastKnot / (d_sampleCnt - 1);
+    if (u > 1.0 * d_lastKnot) {
+        return;
+    }
+//
+//    int d_degree = d_knotsCnt - 1 - d_pointsCnt;
+//    // 注意，在device中，全局内存还是以malloc和free的方式分配和回收的，而不是使用cudaMalloc和cudaFree
+//    auto *N_dp = (float *) malloc((d_degree + 1) * (d_knotsCnt - 1) * sizeof(float));
+//    d_basisFunction(N_dp, d_knots, u, d_degree, d_knotsCnt);
+    float x = 0., y = 0., z = 0., sumW = 0.;
+    for (int i = 0; i < d_pointsCnt; i++) {
+        float N = NTexture[idx * d_pointsCnt + i];
+        int baseIdx = i * d_POINT_SIZE;
+        float w = d_points[baseIdx + 3];
+        x += N * w * d_points[baseIdx];
+        y += N * w * d_points[baseIdx + 1];
+        z += N * w * d_points[baseIdx + 2];
+        sumW += N * w;
+    }
+    x = x / sumW;
+    y = y / sumW;
+    z = z / sumW;
+    int baseIdx = idx * 3;  // 这里的结果点大小应该固定为3
+    res[baseIdx] = x;
+    res[baseIdx + 1] = y;
+    res[baseIdx + 2] = z;
+    printf("(%g)-->(%g, %g, %g)\n", u, x, y, z); // %g输出，舍弃无意义的0
+}
+
+__global__ void
+NurbsCurve::g_derivative(float *derivatives, const float *derTexture, const float *nTexture, const float *d_points,
+                         int d_pointsCnt, int d_POINT_SIZE,
+                         float d_lastKnot, int d_sampleCnt) {
+    // 二维block和一维grid
+    int idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
+    if (idx >= d_sampleCnt) return;
+    float u = idx * d_lastKnot / (d_sampleCnt - 1);
+    float nubs_dx = 0., nubs_dy = 0., nubs_dz = 0., nubs_dw = 0.;
+
+    for (int i = 0; i < d_pointsCnt; i++) {
+        int baseIdx = i * d_POINT_SIZE;
+        float nFactor = derTexture[idx * d_pointsCnt + i];
+        float wi = d_points[baseIdx + 3];
+        nubs_dx += nFactor * wi * d_points[baseIdx];
+        nubs_dy += nFactor * wi * d_points[baseIdx + 1];
+        nubs_dz += nFactor * wi * d_points[baseIdx + 2];
+        nubs_dw += nFactor * wi;
+    }
+
+    float x = 0., y = 0., z = 0., w = 0.;
+    for (int i = 0; i < d_pointsCnt; i++) {
+        float N = nTexture[idx * d_pointsCnt + i];
+        int baseIdx = i * d_POINT_SIZE;
+        float wi = d_points[baseIdx + 3];
+        x += N * wi * d_points[baseIdx];
+        y += N * wi * d_points[baseIdx + 1];
+        z += N * wi * d_points[baseIdx + 2];
+        w += N * wi;
+    }
+    float dx = (nubs_dx * w - x * nubs_dw) / (w * w);
+    float dy = (nubs_dy * w - y * nubs_dw) / (w * w);
+    float dz = (nubs_dz * w - z * nubs_dw) / (w * w);
+    int baseIdx = idx * 3;
+    derivatives[baseIdx] = dx;
+    derivatives[baseIdx + 1] = dy;
+    derivatives[baseIdx + 2] = dz;
+    printf("(%g)-->(%g, %g, %g)\n", u, dx, dy, dz);
+}
+
+__global__ void NurbsCurve::g_curvature(const float *derivatives, int sampleCnt, float lastKnot) {
+    // 二维block和一维grid
+    int idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
+    if (idx >= sampleCnt) return;
+    float step = lastKnot / (sampleCnt - 1);
+    float u = idx * step;
+    float sndPdx, sndPdy, sndPdz;  // 二阶导
+    int baseIdx = idx * 3, lastBaseIdx = (idx - 1) * 3, nextBaseIdx = (idx + 1) * 3;
+    if (idx == 0) {
+        sndPdx = (derivatives[nextBaseIdx] - derivatives[baseIdx]) / step;
+        sndPdy = (derivatives[nextBaseIdx + 1] - derivatives[baseIdx + 1]) / step;
+        sndPdz = (derivatives[nextBaseIdx + 2] - derivatives[baseIdx + 2]) / step;
+    } else if (idx == sampleCnt - 1) {
+        sndPdx = (derivatives[baseIdx] - derivatives[lastBaseIdx]) / step;
+        sndPdy = (derivatives[baseIdx + 1] - derivatives[lastBaseIdx + 1]) / step;
+        sndPdz = (derivatives[baseIdx + 2] - derivatives[lastBaseIdx + 2]) / step;
+    } else {
+        sndPdx = (derivatives[nextBaseIdx] - derivatives[lastBaseIdx]) / (2 * step);
+        sndPdy = (derivatives[nextBaseIdx + 1] - derivatives[lastBaseIdx + 1]) / (2 * step);
+        sndPdz = (derivatives[nextBaseIdx + 2] - derivatives[lastBaseIdx + 2]) / (2 * step);
+    }
+    printf("%g --> (%g, %g, %g)\n", u, sndPdx, sndPdy, sndPdz);
+}
+
+NurbsCurve::Curve::Curve(std::vector<std::vector<float>> controlPoints,
+                                          std::vector<float> knots) {
+    this->knots = std::move(knots);
+    this->controlPoints = std::move(controlPoints);
+    recordTime = false;
+    d_nTexture = nullptr;
+    d_nTexture1 = nullptr;
+    d_points = nullptr;
+    d_knots = nullptr;
+    d_derivatives = nullptr;
+}
+
+void NurbsCurve::Curve::setRecordTime(bool r) {
+    recordTime = r;
+}
+
+NurbsCurve::Curve::~Curve() {
+    safeCudaFree(d_nTexture);
+    safeCudaFree(d_nTexture1);
+    safeCudaFree(d_points);
+    safeCudaFree(d_knots);
+    cudaDeviceReset();
+}
+
+std::vector<std::vector<float>>
+NurbsCurve::Curve::evaluate(int sampleCnt) {
+
+    if (POINT_SIZE != controlPoints[0].size()) {
+        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
+        return {};
+    }
+    // 构造指向device的controlPoints
+    const int pointsCnt = controlPoints.size();
+    const int pointsBytes = pointsCnt * POINT_SIZE * sizeof(float);
+    auto *h_points = (float *) malloc(pointsBytes);
+    for (int i = 0; i < pointsCnt; i++) {
+        for (int j = 0; j < POINT_SIZE; j++) {
+            h_points[i * POINT_SIZE + j] = controlPoints[i][j];
+        }
+    }
+    safeCudaFree(d_points);  // 注意内存管理
+    cudaMalloc((void **) &d_points, pointsBytes);
+    cudaMemcpy(d_points, h_points, pointsBytes, cudaMemcpyHostToDevice);
+
+    // 构造指向device的knots
+    const int knotsCnt = knots.size();
+    const int knotsBytes = knotsCnt * sizeof(float);
+    auto *h_knots = (float *) malloc(knotsBytes);
+    for (int i = 0; i < knotsCnt; i++) h_knots[i] = knots[i];
+    safeCudaFree(d_knots);  // 注意内存管理
+    cudaMalloc((void **) &d_knots, knotsBytes);
+    cudaMemcpy(d_knots, h_knots, knotsBytes, cudaMemcpyHostToDevice);
+
+    // 分配nTexture的内存。只需要GPU内存
+    safeCudaFree(d_nTexture);  // 注意内存管理
+    cudaMalloc((void **) &d_nTexture,
+               sampleCnt * pointsCnt * sizeof(float));
+
+    // 分配nTexture1的内存。只需要GPU内存
+    safeCudaFree(d_nTexture1);  // 注意内存管理
+    cudaMalloc((void **) &d_nTexture1, sampleCnt * (pointsCnt + 1) * sizeof(float)); // 注意nTexture的大小，在算梯度时用得到i=pointsCnt + 1的基函数值
+
+    // 结果数组
+    size_t resBytes = sampleCnt * 3 * sizeof(float);
+    float *d_res;
+    cudaMalloc((void **)&d_res, resBytes);
+    auto *h_res = (float *)malloc(resBytes);
+
+
+    // 构造g_basisTexture线程层级
+    dim3 blockBasis(512);
+    dim3 gridBasis((sampleCnt + blockBasis.x - 1) / blockBasis.x);
+
+    // 构造线程层级
+    dim3 block(32, 32);
+    dim3 grid((sampleCnt + block.x * block.y - 1) / (block.x * block.y));
+    // 记录用时
+    double time_cost_device;
+    if (recordTime) time_cost_device = get_time();
+    printf("there..\n");
+    g_basisTexture <<<gridBasis, blockBasis>>>(d_nTexture, d_nTexture1, d_knots, pointsCnt, knotsCnt, sampleCnt);
+//    cudaMemcpy(d_nTextureCpy, d_nTexture, nTextureBytes, cudaMemcpyDeviceToDevice); // 有同步功能
+    cudaDeviceSynchronize();
+    printf("here..\n");
+    g_evaluate <<<grid, block>>>(d_res, d_nTexture, d_points, pointsCnt, POINT_SIZE, knots[knotsCnt - 1], sampleCnt);
+//    g_test<<<1,6>>>(d_nTextureCpy);
+    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
+    if (recordTime) {
+        time_cost_device = get_time() - time_cost_device;
+        printf("GPU time cost of curve evaluation for %d samples: %lf\n", sampleCnt, time_cost_device);
+    }
+
+    cudaMemcpy(h_res, d_res, resBytes, cudaMemcpyDeviceToHost);
+    std::vector<std::vector<float>> res(sampleCnt, std::vector<float>(3, 0.));
+    for(int i = 0; i < sampleCnt; i++) {
+        int baseIdx = i * 3;
+        res[i][0] = h_res[baseIdx];
+        res[i][1] = h_res[baseIdx + 1];
+        res[i][2] = h_res[baseIdx + 2];
+    }
+
+    safeFree(h_points);
+    safeFree(h_knots);
+    safeCudaFree(d_res);
+    safeFree(h_res);
+    return res;
+}
+
+void NurbsCurve::Curve::derivative(int sampleCnt) {
+    // 先完成evaluation
+    evaluate(sampleCnt);
+
+    if (POINT_SIZE != controlPoints[0].size()) {
+        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
+        return;
+    }
+
+    float *d_derTexture = nullptr;
+    const int pointsCnt = controlPoints.size();
+    const int knotsCnt = knots.size();
+    cudaMalloc((void **) &d_derTexture, sampleCnt * pointsCnt * sizeof(float));
+
+    // 构造线程层级
+    dim3 block(32, 32);
+    dim3 grid((sampleCnt + block.x * block.y - 1) / (block.x * block.y));
+    // 构造g_basisTexture线程层级
+    dim3 blockTex(512);
+    dim3 gridTex((sampleCnt + blockTex.x - 1) / blockTex.x);
+
+
+    // 构造切向量计算结果
+    safeCudaFree(d_derivatives);
+    cudaMalloc((void **) &d_derivatives, sampleCnt * 3 * sizeof(float)); // 每个采用所求的切向量是一个三维向量
+
+    // 记录用时
+    double time_cost_device;
+    if (recordTime) time_cost_device = get_time();
+    g_derTexture<<<gridTex, blockTex>>>(d_derTexture, d_nTexture1, d_knots, pointsCnt, knotsCnt, sampleCnt);
+    cudaDeviceSynchronize();
+    g_derivative<<<grid, block>>>(d_derivatives, d_derTexture, d_nTexture, d_points, pointsCnt, POINT_SIZE,
+                                  knots[knotsCnt - 1], sampleCnt);
+    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
+    if (recordTime) {
+        time_cost_device = get_time() - time_cost_device;
+        printf("GPU time cost of curve first derivative calculating for %d samples: %lf\n", sampleCnt,
+               time_cost_device);
+    }
+    cudaFree(d_derTexture);
+}
+
+void NurbsCurve::Curve::curvature(int sampleCnt) {
+    // 先计算切向量
+    derivative(sampleCnt);
+
+    if (POINT_SIZE != controlPoints[0].size()) {
+        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
+        return;
+    }
+
+    // 构造线程层级
+    dim3 block(32, 32);
+    dim3 grid((sampleCnt + block.x * block.y - 1) / (block.x * block.y));
+
+    // 记录用时
+    double time_cost_device;
+    if (recordTime) time_cost_device = get_time();
+    g_curvature<<<grid, block>>>(d_derivatives, sampleCnt, knots[knots.size() - 1]);
+    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
+    if (recordTime) {
+        time_cost_device = get_time() - time_cost_device;
+        printf("GPU time cost of curve second derivative calculating for %d samples: %lf\n", sampleCnt,
+               time_cost_device);
+    }
+}
+
--- a/src/device/NurbsSurface.cu
+++ b/src/device/NurbsSurface.cu
@ -0,0 +1,424 @@
+//
+// Created by 14727 on 2022/12/9.
+//
+
+#include "../../include/device/NurbsSurface.cuh"
+#include "../../include/device/NurbsCommon.cuh"
+#include "../../include/device/DeviceUtils.cuh"
+#include "../../include/utils.h"
+
+__global__ void
+NurbsSurface::g_evaluate(float* res, const float *d_nTexture_u, const float *d_nTexture_v, const float *d_points, int d_pointsCnt_u,
+                         int d_pointsCnt_v, int d_POINT_SIZE, float d_lastKnot_u, float d_lastKnot_v, int d_sampleCnt_u,
+                         int d_sampleCnt_v) {
+
+    // 二维grid和二维的block
+    int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    int iy = blockIdx.y * blockDim.y + threadIdx.y;
+
+    float u = ix * d_lastKnot_u / (d_sampleCnt_u - 1);
+    float v = iy * d_lastKnot_v / (d_sampleCnt_v - 1);
+
+    if (u > 1.0 * d_lastKnot_u || v > 1.0 * d_lastKnot_v) {
+        return;
+    }
+
+    float x = 0., y = 0., z = 0., sumW = 0.;
+    for (int i = 0; i < d_pointsCnt_u; i++) {
+        float N_U = d_nTexture_u[ix * d_pointsCnt_u + i];
+        for (int j = 0; j < d_pointsCnt_v; j++) {
+            float N_V = d_nTexture_v[iy * d_pointsCnt_v + j];
+            int idx = (i * d_pointsCnt_v + j) * d_POINT_SIZE;
+            float w = d_points[idx + 3];
+            x += N_U * N_V * w * d_points[idx];
+            y += N_U * N_V * w * d_points[idx + 1];
+            z += N_U * N_V * w * d_points[idx + 2];
+            sumW += N_U * N_V * w;
+        }
+    }
+    x = x / sumW;
+    y = y / sumW;
+    z = z / sumW;
+
+//    int baseIdx = (ix * d_sampleCnt_v + iy) * 3;
+//    res[baseIdx] = x;
+//    res[baseIdx + 1] = y;
+//    res[baseIdx + 2] = z;
+
+//    printf("(%d, %d)-->(%g, %g, %g)\n", ix, iy, x, y, z); // %g输出，舍弃无意义的0
+}
+
+__global__ void
+NurbsSurface::g_derivative(float *derivatives, const float *derTexture_u, const float *derTexture_v,
+                           const float *nTexture_u, const float *nTexture_v, const float *d_points, int d_pointsCnt_u,
+                           int d_pointsCnt_v, int d_POINT_SIZE, float d_lastKnot_u, float d_lastKnot_v,
+                           int d_sampleCnt_u, int d_sampleCnt_v) {
+    // 二维grid和二维的block
+    int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    int iy = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (ix >= d_sampleCnt_u || iy >= d_sampleCnt_v) {
+        return;
+    }
+    float u = ix * d_lastKnot_u / (d_sampleCnt_u - 1);
+    float v = iy * d_lastKnot_v / (d_sampleCnt_v - 1);
+
+    float nubsPdx_u = 0., nubsPdy_u = 0, nubsPdz_u = 0., nubsPdw_u = 0.;
+    float nubsPdx_v = 0., nubsPdy_v = 0, nubsPdz_v = 0., nubsPdw_v = 0.;
+
+    for (int i = 0; i < d_pointsCnt_u; i++) {
+        for (int j = 0; j < d_pointsCnt_u; j++) {
+            int baseIdx = (i * d_pointsCnt_v + j) * d_POINT_SIZE;
+            float factor_u = derTexture_u[ix * d_pointsCnt_u + i] * nTexture_v[iy * d_pointsCnt_v + j];
+            float factor_v = derTexture_v[iy * d_pointsCnt_v + j] * nTexture_u[ix * d_pointsCnt_u + i];
+            float wij = d_points[baseIdx + 3];
+            nubsPdx_u += factor_u * wij * d_points[baseIdx];
+            nubsPdy_u += factor_u * wij * d_points[baseIdx + 1];
+            nubsPdz_u += factor_u * wij * d_points[baseIdx + 2];
+            nubsPdw_u += factor_u * wij;
+
+            nubsPdx_v += factor_v * wij * d_points[baseIdx];
+            nubsPdy_v += factor_v * wij * d_points[baseIdx + 1];
+            nubsPdz_v += factor_v * wij * d_points[baseIdx + 2];
+            nubsPdw_v += factor_v * wij;
+        }
+    }
+
+    float x = 0., y = 0., z = 0., w = 0.;
+    for (int i = 0; i < d_pointsCnt_u; i++) {
+        float N_U = nTexture_u[ix * d_pointsCnt_u + i];
+        for (int j = 0; j < d_pointsCnt_v; j++) {
+            float N_V = nTexture_v[iy * d_pointsCnt_v + j];
+            int idx = (i * d_pointsCnt_v + j) * d_POINT_SIZE;
+            float wij = d_points[idx + 3];
+            x += N_U * N_V * wij * d_points[idx];
+            y += N_U * N_V * wij * d_points[idx + 1];
+            z += N_U * N_V * wij * d_points[idx + 2];
+            w += N_U * N_V * wij;
+        }
+    }
+
+    float w2 = w * w;
+    float pdx_u = (nubsPdx_u * w - x * nubsPdw_u) / w2;
+    float pdy_u = (nubsPdy_u * w - y * nubsPdw_u) / w2;
+    float pdz_u = (nubsPdz_u * w - z * nubsPdw_u) / w2;
+
+    float pdx_v = (nubsPdx_v * w - x * nubsPdw_v) / w2;
+    float pdy_v = (nubsPdy_v * w - y * nubsPdw_v) / w2;
+    float pdz_v = (nubsPdz_v * w - z * nubsPdw_v) / w2;
+//    float pdz_u = (nubsPdz_u * w - z )
+
+    int baseIdx = (ix * d_sampleCnt_v + iy) * 6;
+    derivatives[baseIdx] = pdx_u;
+    derivatives[baseIdx + 1] = pdy_u;
+    derivatives[baseIdx + 2] = pdz_u;
+    derivatives[baseIdx + 3] = pdx_v;
+    derivatives[baseIdx + 4] = pdy_v;
+    derivatives[baseIdx + 5] = pdz_v;
+
+    float x_n = pdy_u * pdz_v - pdy_v * pdz_u, y_n = pdx_v * pdz_u - pdx_u * pdz_v, z_n =
+            pdx_u * pdy_v - pdx_v * pdy_u; // 叉乘得到法向量
+    if ((ix == 8 && iy == 9) || (ix == 7 && iy == 9) || (ix == 9 && iy == 9) || (ix == 8 && iy == 8) ||
+        (ix == 8 && iy == 10))
+        printf("(%g,%g)-->u:(%g, %g, %g), v:(%g,%g,%g), normal:(%g,%g,%g)\n", u, v, pdx_u, pdy_u, pdz_u, pdx_v, pdy_v,
+               pdz_v, x_n, y_n, z_n);
+}
+
+__global__ void
+NurbsSurface::g_curvature(const float *derivatives, int sampleCnt_u, int sampleCnt_v, float lastKnot_u,
+                          float lastKnot_v) {
+    // 二维grid和二维的block
+    int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    int iy = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (ix >= sampleCnt_u || iy >= sampleCnt_v) {
+        return;
+    }
+
+
+    float step_u = lastKnot_u / (sampleCnt_u - 1), step_v = lastKnot_v / (sampleCnt_v - 1);
+    float u = ix * step_u, v = iy * step_v;
+
+    int baseIdx = (ix * sampleCnt_v + iy) * 6;
+    int lastBaseIdx_u = ((ix - 1) * sampleCnt_v + iy) * 6, nextBaseIdx_u = ((ix + 1) * sampleCnt_v + iy) * 6;
+    int lastBaseIdx_v = (ix * sampleCnt_v + iy - 1) * 6, nextBaseIdx_v = (ix * sampleCnt_v + iy + 1) * 6;
+
+//    printf("(%g,%g)-->u:(%g, %g, %g), v:(%g,%g,%g)\n", u, v, derivatives[baseIdx], derivatives[baseIdx + 1],
+//           derivatives[baseIdx + 2], derivatives[baseIdx + 3], derivatives[baseIdx + 4], derivatives[baseIdx + 5]);
+
+    float sndPdx_uu, sndPdy_uu, sndPdz_uu, sndPdx_vv, sndPdy_vv, sndPdz_vv;  // 二阶导
+    float sndPdx_uv, sndPdy_uv, sndPdz_uv, sndPdx_vu, sndPdy_vu, sndPdz_vu;
+
+    if (ix == 0) {
+        sndPdx_uu = (derivatives[nextBaseIdx_u] - derivatives[baseIdx]) / step_u;
+        sndPdy_uu = (derivatives[nextBaseIdx_u + 1] - derivatives[baseIdx + 1]) / step_u;
+        sndPdz_uu = (derivatives[nextBaseIdx_u + 2] - derivatives[baseIdx + 2]) / step_u;
+
+        sndPdx_vu = (derivatives[nextBaseIdx_u + 3] - derivatives[baseIdx + 3]) / step_u;
+        sndPdy_vu = (derivatives[nextBaseIdx_u + 4] - derivatives[baseIdx + 4]) / step_u;
+        sndPdz_vu = (derivatives[nextBaseIdx_u + 5] - derivatives[baseIdx + 5]) / step_u;
+    } else if (ix == sampleCnt_u - 1) {
+        sndPdx_uu = (derivatives[baseIdx] - derivatives[lastBaseIdx_u]) / step_u;
+        sndPdy_uu = (derivatives[baseIdx + 1] - derivatives[lastBaseIdx_u + 1]) / step_u;
+        sndPdz_uu = (derivatives[baseIdx + 2] - derivatives[lastBaseIdx_u + 2]) / step_u;
+
+        sndPdx_vu = (derivatives[baseIdx + 3] - derivatives[lastBaseIdx_u + 3]) / step_u;
+        sndPdy_vu = (derivatives[baseIdx + 4] - derivatives[lastBaseIdx_u + 4]) / step_u;
+        sndPdz_vu = (derivatives[baseIdx + 5] - derivatives[lastBaseIdx_u + 5]) / step_u;
+    } else {
+        sndPdx_uu = (derivatives[nextBaseIdx_u] - derivatives[lastBaseIdx_u]) / (2 * step_u);
+        sndPdy_uu = (derivatives[nextBaseIdx_u + 1] - derivatives[lastBaseIdx_u + 1]) / (2 * step_u);
+        sndPdz_uu = (derivatives[nextBaseIdx_u + 2] - derivatives[lastBaseIdx_u + 2]) / (2 * step_u);
+
+        sndPdx_vu = (derivatives[nextBaseIdx_u + 3] - derivatives[lastBaseIdx_u + 3]) / (2 * step_u);
+        sndPdy_vu = (derivatives[nextBaseIdx_u + 4] - derivatives[lastBaseIdx_u + 4]) / (2 * step_u);
+        sndPdz_vu = (derivatives[nextBaseIdx_u + 5] - derivatives[lastBaseIdx_u + 5]) / (2 * step_u);
+    }
+
+    if (iy == 0) {
+        sndPdx_vv = (derivatives[nextBaseIdx_v + 3] - derivatives[baseIdx + 3]) / step_v;
+        sndPdy_vv = (derivatives[nextBaseIdx_v + 4] - derivatives[baseIdx + 4]) / step_v;
+        sndPdz_vv = (derivatives[nextBaseIdx_v + 5] - derivatives[baseIdx + 5]) / step_v;
+
+        sndPdx_uv = (derivatives[nextBaseIdx_v] - derivatives[baseIdx]) / step_v;
+        sndPdy_uv = (derivatives[nextBaseIdx_v + 1] - derivatives[baseIdx + 1]) / step_v;
+        sndPdz_uv = (derivatives[nextBaseIdx_v + 2] - derivatives[baseIdx + 2]) / step_v;
+    } else if (iy == sampleCnt_v - 1) {
+        sndPdx_vv = (derivatives[baseIdx + 3] - derivatives[lastBaseIdx_v + 3]) / step_v;
+        sndPdy_vv = (derivatives[baseIdx + 4] - derivatives[lastBaseIdx_v + 4]) / step_v;
+        sndPdz_vv = (derivatives[baseIdx + 5] - derivatives[lastBaseIdx_v + 5]) / step_v;
+
+        sndPdx_uv = (derivatives[baseIdx] - derivatives[lastBaseIdx_v]) / step_v;
+        sndPdy_uv = (derivatives[baseIdx + 1] - derivatives[lastBaseIdx_v + 1]) / step_v;
+        sndPdz_uv = (derivatives[baseIdx + 2] - derivatives[lastBaseIdx_v + 2]) / step_v;
+    } else {
+        sndPdx_vv = (derivatives[nextBaseIdx_v + 3] - derivatives[lastBaseIdx_v + 3]) / (2 * step_v);
+        sndPdy_vv = (derivatives[nextBaseIdx_v + 4] - derivatives[lastBaseIdx_v + 4]) / (2 * step_v);
+        sndPdz_vv = (derivatives[nextBaseIdx_v + 5] - derivatives[lastBaseIdx_v + 5]) / (2 * step_v);
+
+        sndPdx_uv = (derivatives[nextBaseIdx_v] - derivatives[lastBaseIdx_v]) / (2 * step_v);
+        sndPdy_uv = (derivatives[nextBaseIdx_v + 1] - derivatives[lastBaseIdx_v + 1]) / (2 * step_v);
+        sndPdz_uv = (derivatives[nextBaseIdx_v + 2] - derivatives[lastBaseIdx_v + 2]) / (2 * step_v);
+    }
+
+    float uvx = (sndPdx_uv + sndPdx_vu) / 2, uvy = (sndPdy_uv + sndPdy_vu) / 2, uvz = (sndPdz_uv + sndPdz_vu) / 2;
+    normalization(sndPdx_uu, sndPdy_uu, sndPdz_uu);
+    normalization(uvx, uvy, uvz);
+    normalization(sndPdx_vv, sndPdy_vv, sndPdz_vv);
+
+
+    if (ix == 8 && iy == 9)
+        printf("(%g, %g) --> uu: (%g, %g, %g), uv: (%g, %g, %g), vv: (%g, %g, %g)\n", u, v, sndPdx_uu, sndPdy_uu,
+               sndPdz_uu,
+               uvx, uvy, uvz, sndPdx_vv, sndPdy_vv, sndPdz_vv);
+}
+
+
+__host__ NurbsSurface::Surface::Surface(std::vector<std::vector<std::vector<float>>> controlPoints,
+                                            std::vector<float> knots_u, std::vector<float> knots_v) {
+    this->knots_u = std::move(knots_u);
+    this->knots_v = std::move(knots_v);
+    this->controlPoints = std::move(controlPoints);
+    recordTime = false;
+    d_nTexture_u = nullptr;
+    d_nTexture_v = nullptr;
+    d_nTexture1_u = nullptr;
+    d_nTexture1_v = nullptr;
+    d_knots_u = nullptr;
+    d_knots_v = nullptr;
+    d_points = nullptr;
+    d_derivatives = nullptr;
+}
+
+__host__ std::vector<std::vector<std::vector<float>>>
+NurbsSurface::Surface::evaluate(int sampleCnt_u, int sampleCnt_v) {
+    if (POINT_SIZE != controlPoints[0][0].size()) {
+        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
+        return {};
+    }
+
+    // 构造指向device的controlPoints
+    const int pointsCnt_u = controlPoints.size(), pointsCnt_v = controlPoints[0].size();
+    const int pointsBytes = pointsCnt_u * pointsCnt_v * POINT_SIZE * sizeof(float);
+    auto *h_points = (float *) malloc(pointsBytes);
+    for (int i = 0; i < pointsCnt_u; i++) {
+        for (int j = 0; j < pointsCnt_v; j++) {
+            for (int k = 0; k < POINT_SIZE; k++) {
+                h_points[(i * pointsCnt_v + j) * POINT_SIZE + k] = controlPoints[i][j][k];
+            }
+        }
+    }
+    cudaMalloc((void **) &d_points, pointsBytes);
+    cudaMemcpy(d_points, h_points, pointsBytes, cudaMemcpyHostToDevice);
+
+    // 构造指向device的knots
+    const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size();
+    const int knotsBytes_u = knotsCnt_u * sizeof(float), knotsBytes_v = knotsCnt_v * sizeof(float);
+    auto *h_knots_u = (float *) malloc(knotsBytes_u), *h_knots_v = (float *) malloc(knotsBytes_v);
+    for (int i = 0; i < knotsCnt_u; i++) h_knots_u[i] = knots_u[i];
+    for (int i = 0; i < knotsCnt_v; i++) h_knots_v[i] = knots_v[i];
+
+    cudaMalloc((void **) &d_knots_u, knotsBytes_u);
+    cudaMalloc((void **) &d_knots_v, knotsBytes_v);
+    cudaMemcpy(d_knots_u, h_knots_u, knotsBytes_u, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_knots_v, h_knots_v, knotsBytes_v, cudaMemcpyHostToDevice);
+
+    // 构造nTexture
+    cudaMalloc((void **) &d_nTexture_u,
+               sampleCnt_u * pointsCnt_u * sizeof(float)); // 注意nTexture的大小，在算梯度时用得到i=pointsCnt + 1的基函数值
+    cudaMalloc((void **) &d_nTexture_v, sampleCnt_v * pointsCnt_v * sizeof(float));
+
+    // 构造nTexture1
+    cudaMalloc((void **) &d_nTexture1_u, sampleCnt_u * (pointsCnt_u + 1) * sizeof(float));
+    cudaMalloc((void **) &d_nTexture1_v, sampleCnt_v * (pointsCnt_v + 1) * sizeof(float));
+
+    // 结果数组
+    size_t resBytes = sampleCnt_u * sampleCnt_v * 3 * sizeof(float);
+    float *d_res;
+    cudaMalloc((void **)&d_res, resBytes);
+    auto *h_res = (float *)malloc(resBytes);
+
+
+    // 构造g_basisTexture线程层级
+    dim3 blockBasis(512);
+    dim3 gridBasis_u((sampleCnt_u + blockBasis.x - 1) / blockBasis.x);
+    dim3 gridBasis_v((sampleCnt_v + blockBasis.x - 1) / blockBasis.x);
+
+    // 构造线程层级，调用核函数
+    dim3 block(32, 32);
+    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);
+    // 记录用时
+    double time_cost_device;
+    g_basisTexture<<<gridBasis_u, blockBasis>>>(d_nTexture_u, d_nTexture1_u, d_knots_u, pointsCnt_u, knotsCnt_u,
+                                                sampleCnt_u);
+    cudaDeviceSynchronize();
+    g_basisTexture<<<gridBasis_v, blockBasis>>>(d_nTexture_v, d_nTexture1_v, d_knots_v, pointsCnt_v, knotsCnt_v,
+                                                sampleCnt_v);
+    cudaDeviceSynchronize();
+
+    if (recordTime) time_cost_device = get_time();
+    g_evaluate <<<grid, block>>>(d_res, d_nTexture_u, d_nTexture_v, d_points, pointsCnt_u, pointsCnt_v, POINT_SIZE,
+                                 knots_u[knotsCnt_u - 1], knots_v[knotsCnt_v - 1], sampleCnt_u, sampleCnt_v);
+    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
+    if (recordTime) {
+        time_cost_device = get_time() - time_cost_device;
+        printf("GPU time cost of surface evaluation for %d samples: %lf\n", sampleCnt_u * sampleCnt_v,
+               time_cost_device);
+    }
+
+    cudaMemcpy(h_res, d_res, resBytes, cudaMemcpyDeviceToHost);
+    std::vector<std::vector<std::vector<float>>> res(sampleCnt_u, std::vector<std::vector<float>>(sampleCnt_v, std::vector<float>(3, 0.)));
+    for(int i = 0; i < sampleCnt_u; i++) {
+        int baseIdx = i * sampleCnt_v * 3;
+        for(int j = 0; j < sampleCnt_v; j++) {
+            baseIdx += j * 3;
+            res[i][j][0] = h_res[baseIdx];
+            res[i][j][1] = h_res[baseIdx + 1];
+            res[i][j][2] = h_res[baseIdx + 2];
+//            printf("%d, %d: %f, %f, %f\n", i, j, res[i][j][0], res[i][j][1], res[i][j][2]);
+        }
+    }
+
+    // 释放内存
+    safeFree(h_points);
+    safeFree(h_knots_u);
+    safeFree(h_knots_v);
+    safeCudaFree(d_res);
+    safeFree(h_res);
+
+    return res;
+}
+
+__host__ void NurbsSurface::Surface::derivative(int sampleCnt_u, int sampleCnt_v) {
+    // 先完成evaluation
+    evaluate(sampleCnt_u, sampleCnt_v);
+
+    if (POINT_SIZE != controlPoints[0][0].size()) {
+        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
+        return;
+    }
+
+    float *d_derTexture_u = nullptr;
+    float *d_derTexture_v = nullptr;
+    const int pointsCnt_u = controlPoints.size(), pointsCnt_v = controlPoints[0].size();
+    const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size();
+    cudaMalloc((void **) &d_derTexture_u, sampleCnt_u * pointsCnt_u * sizeof(float));
+    cudaMalloc((void **) &d_derTexture_v, sampleCnt_v * pointsCnt_v * sizeof(float));
+
+    // 构造切向量计算结果
+    safeCudaFree(d_derivatives);
+    cudaMalloc((void **) &d_derivatives,
+               sampleCnt_v * sampleCnt_u * 6 * sizeof(float)); // 每个采用所求的切向量是一个六元向量，前三位是对u的偏导、后三位是对v的偏导
+
+    // 构造线程层级
+    dim3 block(32, 32);
+    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);
+    // 构造g_basisTexture线程层级
+    dim3 blockTex(512);
+    dim3 gridTex_u((sampleCnt_u + blockTex.x - 1) / blockTex.x);
+    dim3 gridTex_v((sampleCnt_v + blockTex.x - 1) / blockTex.x);
+    // 记录用时
+    double time_cost_device;
+    if (recordTime) time_cost_device = get_time();
+    g_derTexture<<<gridTex_u, blockTex>>>(d_derTexture_u, d_nTexture1_u, d_knots_u, pointsCnt_u, knotsCnt_u,
+                                          sampleCnt_u);
+    g_derTexture<<<gridTex_v, blockTex>>>(d_derTexture_v, d_nTexture1_v, d_knots_v, pointsCnt_v, knotsCnt_v,
+                                          sampleCnt_v);
+    cudaDeviceSynchronize();
+    g_derivative<<<grid, block>>>(d_derivatives, d_derTexture_u, d_derTexture_v, d_nTexture_u, d_nTexture_v, d_points,
+                                  pointsCnt_u,
+                                  pointsCnt_v, POINT_SIZE, knots_u[knotsCnt_u - 1], knots_v[knotsCnt_v - 1],
+                                  sampleCnt_u,
+                                  sampleCnt_v);
+    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
+    if (recordTime) {
+        time_cost_device = get_time() - time_cost_device;
+        printf("GPU time cost of surface first derivative calculating for %d samples: %lf\n", sampleCnt_u * sampleCnt_v,
+               time_cost_device);
+    }
+    cudaFree(d_derTexture_u);
+    cudaFree(d_derTexture_v);
+}
+
+
+
+__host__ void NurbsSurface::Surface::curvature(int sampleCnt_u, int sampleCnt_v) {
+    // 先计算切向量
+    derivative(sampleCnt_u, sampleCnt_v);
+    if (POINT_SIZE != controlPoints[0][0].size()) {
+        printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n");
+        return;
+    }
+
+    // 构造线程层级
+    dim3 block(32, 32);
+    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);
+
+    // 记录用时
+    double time_cost_device;
+    if (recordTime) time_cost_device = get_time();
+    g_curvature<<<grid, block>>>(d_derivatives, sampleCnt_u, sampleCnt_v, knots_u[knots_u.size() - 1],
+                                 knots_v[knots_v.size() - 1]);
+    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
+    if (recordTime) {
+        time_cost_device = get_time() - time_cost_device;
+        printf("GPU time cost of surface second derivative calculating for %d samples: %lf\n",
+               sampleCnt_u * sampleCnt_v,
+               time_cost_device);
+    }
+}
+
+void NurbsSurface::Surface::setRecordTime(bool r) {
+    recordTime = r;
+}
+
+NurbsSurface::Surface::~Surface() {
+    safeCudaFree(d_nTexture_u);
+    safeCudaFree(d_nTexture_v);
+    safeCudaFree(d_nTexture1_u);
+    safeCudaFree(d_nTexture1_v);
+    safeCudaFree(d_points);
+    safeCudaFree(d_knots_u);
+    safeCudaFree(d_knots_v);
+    cudaDeviceReset();
+}
--- a/src/main.cpp
+++ b/src/main.cpp
@ -1,8 +1,9 @@
 #include <cstdio>
-#include "NurbsEvaluator.cuh"
+#include "../include/device/NurbsCurve.cuh"
+#include "../include/device/NurbsSurface.cuh"

 int main() {
-    NurbsSurface::Evaluator nurbsSurfaceEvaluator({
+    NurbsSurface::Surface nurbsSurfaceEvaluator({
                                                          {{-1,   0,   0, 0.3},   {0,   1, 6, 0.8},  {1,  0,   4, 0.5},   {2,  0.5,  3, 0.8},    {3,    3,   1, 0.6},   {4,  -5,    0, 0.7}},
                                                          {{-2,   1,   1.2, 0.2}, {1,   2, 3, 0.3},  {2,  2,   3, 0.6},   {-1, -0.3, 2, 0.4},    {-1,   2,   0, 0.9},   {7,  -8,    2, 0.3}},
                                                          {{-3.4, 2,   3, 0.8},   {2,   3, 0, 0.6},  {4,  3,   7, 0.3},   {-2, 0,    -0.2, 0.4}, {1,    1.7, 5, 0.6},   {9,  -10.3, 6, 0.7}},
@ -21,11 +22,12 @@ int main() {
 //                                                  {0, 0, 0, 0.1, 0.5, 0.8, 1, 1, 1},
 //                                                  {0, 0, 0, 0.2, 0.7, 0.8, 1, 1, 1});
    nurbsSurfaceEvaluator.setRecordTime(true);
-    nurbsSurfaceEvaluator.curvature(10001, 10001);

+    nurbsSurfaceEvaluator.curvature(101, 101);
+//    nurbsSurfaceEvaluator.evaluate(101, 101);
    printf("==============================\n");

-    NurbsCurve::Evaluator nurbsCurveEvaluator(
+    NurbsCurve::Curve nurbsCurveEvaluator(
            {{-1, 0,   0, 0.3},
             {0,  1,   6, 0.4},
             {1,  0,   4, 0.5},
@ -34,10 +36,13 @@ int main() {
             {4,  -5,  0, 0.7}},
            {0, 0, 0, 0.1, 0.5, 0.8, 1, 1, 1});
    nurbsCurveEvaluator.setRecordTime(true);
-    nurbsCurveEvaluator.curvature(11);
+//    nurbsCurveEvaluator.curvature(11);
+    auto res = nurbsCurveEvaluator.evaluate(11);
+    for(auto & re : res) {
+        printf("%f, %f, %f\n", re[0], re[1], re[2]);
+    }
    printf("\n");
 //    nurbsCurveEvaluator.derivative();

    return 0;
-}
-
+}
--- a/src/utils.cpp
+++ b/src/utils.cpp
@ -0,0 +1,46 @@
+#include "../include/utils.h"
+#include "cuda_runtime.h"
+
+#if IN_UNIX
+double get_time() {
+    struct timeval tv{};
+    double t;
+
+    gettimeofday(&tv, (struct timezone *) nullptr);
+    t = tv.tv_sec + (double) tv.tv_usec * 1e-6;
+
+    return t;
+}
+#else
+double get_time() {
+    LARGE_INTEGER timer;
+    static LARGE_INTEGER fre;
+    static int init = 0;
+    double t;
+
+    if (init != 1) {
+        QueryPerformanceFrequency(&fre);
+        init = 1;
+    }
+
+    QueryPerformanceCounter(&timer);
+
+    t = timer.QuadPart * 1. / fre.QuadPart;
+
+    return t;
+}
+#endif
+
+void safeCudaFree(float *&p) {
+    if (p != nullptr) {
+        cudaFree(p);
+        p = nullptr;
+    }
+}
+
+void safeFree(float *&p) {
+    if (p != nullptr) {
+        free(p);
+        p = nullptr;
+    }
+}
--- a/utils.cpp
+++ b/utils.cpp
@ -1,19 +0,0 @@
-#include "utils.h"
-
-double utils::get_time_windows() {
-    LARGE_INTEGER timer;
-    static LARGE_INTEGER fre;
-    static int init = 0;
-    double t;
-
-    if (init != 1) {
-        QueryPerformanceFrequency(&fre);
-        init = 1;
-    }
-
-    QueryPerformanceCounter(&timer);
-
-    t = timer.QuadPart * 1. / fre.QuadPart;
-
-    return t;
-}
--- a/utils.h
+++ b/utils.h
@ -1,11 +0,0 @@
-#ifndef UNTITLED1_UTILS_H
-#define UNTITLED1_UTILS_H
-#include <windows.h>
-
-
-namespace utils {
-    double get_time_windows();
-}
-
-
-#endif //UNTITLED1_UTILS_H