B-spline的切、法向量求解

4 years ago · de1c34cd84
6 changed files with 537 additions and 127 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -3,7 +3,7 @@ project(NurbsEvaluator LANGUAGES CXX CUDA)

 set(CMAKE_CUDA_STANDARD 14)

-add_executable(NurbsEvaluator main.cpp NurbsEvaluator.cu NurbsEvaluator.cuh utils.cpp utils.h)
+add_executable(NurbsEvaluator main.cpp NurbsEvaluator.cu NurbsEvaluator.cuh utils.cpp utils.h NurbsBasis.cu NurbsBasis.cuh)

 #add_compile_options("$<$<C_COMPILER_ID:MSVC>:/utf-8>")
 #add_compile_options("$<$<CXX_COMPILER_ID:MSVC>:/utf-8>")
--- a/NurbsBasis.cu
+++ b/NurbsBasis.cu
@ -0,0 +1,38 @@
+//
+// Created by 14727 on 2022/11/19.
+//
+
+#include "NurbsBasis.cuh"
+
+__device__ void d_basisFunction1(float *N_Texture, const float *knots, float u, int degree, int d_knotsCnt) {
+    int m = d_knotsCnt - 1;
+    for (int p = 0; p <= degree; p++) {
+        for (int i = 0; i + p <= m - 1; i++) {
+            if (p == 0) {
+                if ((u > knots[i] || d_floatEqual1(u, knots[i])) && (u < knots[i + 1])
+                    ||
+                    d_floatEqual1(u, knots[i + 1]) && d_floatEqual1(u, knots[m])) {
+                    N_Texture[p * m + i] = 1.0;
+                } else {
+                    N_Texture[p * m + i] = 0.0;
+                }
+            } else {
+                float Nip_1 = N_Texture[(p - 1) * m + i];
+                float Ni1p_1 = N_Texture[(p - 1) * m + i + 1];
+                float left = d_floatEqual1(knots[i + p], knots[i]) ? 0 : (u - knots[i]) * Nip_1 /
+                                                                        (knots[i + p] - knots[i]);
+                float right = d_floatEqual1(knots[i + p + 1], knots[i + 1]) ? 0 : (knots[i + p + 1] - u) * Ni1p_1 /
+                                                                                 (knots[i + p + 1] - knots[i + 1]);
+                N_Texture[p * m + i] = left + right;
+            }
+        }
+    }
+}
+
+__device__ bool d_floatEqual1(float a, float b) {
+    return abs(a - b) < 0.00001;
+}
+
+void NurbsBasis::myPrint11(int a, int b) {
+        printf("In NurbsBasis %d!!!\n", a * b);
+}
--- a/NurbsBasis.cuh
+++ b/NurbsBasis.cuh
@ -0,0 +1,28 @@
+//
+// Created by 14727 on 2022/11/19.
+//
+
+#ifndef NURBSEVALUATOR_NURBSBASIS_CUH
+#define NURBSEVALUATOR_NURBSBASIS_CUH
+
+#include <cuda_runtime.h>
+#include "cstdio"
+
+/**
+ * 当u值已知时，根据基函数N的递推表达式，采用动态规划的方式求解N值
+ * @param N_Texture 结果返回在N_Texture中
+ */
+extern __device__ void d_basisFunction1(float *N_Texture, const float *knots, float u, int degree, int d_knotsCnt);
+
+/**
+ * device中判断两个浮点数是否相等。与CPU中一样，GPU中的浮点数也存在很小的误差，直接使用==判断往往容易将相等误判为不等
+ * @return true：相等
+ */
+extern __device__ bool d_floatEqual1(float a, float b);
+
+namespace NurbsBasis {
+    void myPrint11(int a, int b);
+};
+
+
+#endif //NURBSEVALUATOR_NURBSBASIS_CUH
--- a/NurbsEvaluator.cu
+++ b/NurbsEvaluator.cu
@ -7,6 +7,9 @@
 #include "NurbsEvaluator.cuh"
 #include "cstdio"
 #include "utils.h"
+//#include "NurbsBasis.cuh"
+
+//extern __device__ void NurbsBasis::d_basisFunction(float *N_Texture, const float *knots, float u, int degree, int d_knotsCnt) {};

 __host__ NurbsSurface::Evaluator::Evaluator(std::vector<std::vector<std::vector<float>>> controlPoints,
                                            std::vector<float> knots_u, std::vector<float> knots_v) {
@ -14,97 +17,288 @@ __host__ NurbsSurface::Evaluator::Evaluator(std::vector<std::vector<std::vector<
    this->knots_v = std::move(knots_v);
    this->controlPoints = std::move(controlPoints);
    recordTime = false;
+    d_nTexture_u = nullptr;
+    d_nTexture_v = nullptr;
+    d_nTexture1_u = nullptr;
+    d_nTexture1_v = nullptr;
+    d_knots_u = nullptr;
+    d_knots_v = nullptr;
+    d_points = nullptr;
 }

 __host__ std::vector<std::map<std::pair<float, float>, std::vector<float>>>
-NurbsSurface::Evaluator::calculate(int sampleCnt_u, int sampleCnt_v) {
+NurbsSurface::Evaluator::evaluate(int sampleCnt_u_, int sampleCnt_v_) {
+    sampleCnt_u = sampleCnt_u_;
+    sampleCnt_v = sampleCnt_v_;
+    printf("outside  printf..\n");
+//    NurbsBasis::myPrint11(1, 3);
+
    // 构造指向device的controlPoints
-    const int pointsCntU = controlPoints.size(), pointsCntV = controlPoints[0].size(), pointSize = controlPoints[0][0].size();
-    const int pointsBytes = pointsCntU * pointsCntV * pointSize * sizeof(float);
+    const int pointsCnt_u = controlPoints.size(), pointsCnt_v = controlPoints[0].size(), pointSize = controlPoints[0][0].size();
+    const int pointsBytes = pointsCnt_u * pointsCnt_v * pointSize * sizeof(float);
    auto *h_points = (float *) malloc(pointsBytes);
-    for (int i = 0; i < pointsCntU; i++) {
-        for (int j = 0; j < pointsCntV; j++) {
+    for (int i = 0; i < pointsCnt_u; i++) {
+        for (int j = 0; j < pointsCnt_v; j++) {
            for (int k = 0; k < pointSize; k++) {
-                h_points[(i * pointsCntV + j) * pointSize + k] = controlPoints[i][j][k];
+                h_points[(i * pointsCnt_v + j) * pointSize + k] = controlPoints[i][j][k];
            }
        }
    }
-    float *d_points;
    cudaMalloc((void **) &d_points, pointsBytes);
    cudaMemcpy(d_points, h_points, pointsBytes, cudaMemcpyHostToDevice);

    // 构造指向device的knots
    const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size();
-    const int knotsBytesU = knotsCnt_u * sizeof(float), knotsBytesV = knotsCnt_v * sizeof(float);
-    auto *h_knotsU = (float *) malloc(knotsBytesU), *h_knotsV = (float *) malloc(knotsBytesV);
-    for (int i = 0; i < knotsCnt_u; i++) h_knotsU[i] = knots_u[i];
-    for (int i = 0; i < knotsCnt_v; i++) h_knotsV[i] = knots_v[i];
-
-    float *d_knots_u;
-    float *d_knots_v;
-    cudaMalloc((void **) &d_knots_u, knotsBytesU);
-    cudaMalloc((void **) &d_knots_v, knotsBytesV);
-    cudaMemcpy(d_knots_u, h_knotsU, knotsBytesU, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_knots_v, h_knotsV, knotsBytesV, cudaMemcpyHostToDevice);
+    const int knotsBytes_u = knotsCnt_u * sizeof(float), knotsBytes_v = knotsCnt_v * sizeof(float);
+    auto *h_knots_u = (float *) malloc(knotsBytes_u), *h_knots_v = (float *) malloc(knotsBytes_v);
+    for (int i = 0; i < knotsCnt_u; i++) h_knots_u[i] = knots_u[i];
+    for (int i = 0; i < knotsCnt_v; i++) h_knots_v[i] = knots_v[i];
+
+    cudaMalloc((void **) &d_knots_u, knotsBytes_u);
+    cudaMalloc((void **) &d_knots_v, knotsBytes_v);
+    cudaMemcpy(d_knots_u, h_knots_u, knotsBytes_u, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_knots_v, h_knots_v, knotsBytes_v, cudaMemcpyHostToDevice);
+
+    // 构造nTexture
+    cudaMalloc((void **) &d_nTexture_u,
+               sampleCnt_u * pointsCnt_u * sizeof(float)); // 注意nTexture的大小，在算梯度时用得到i=pointsCnt + 1的基函数值
+    cudaMalloc((void **) &d_nTexture_v, sampleCnt_v * pointsCnt_v * sizeof(float));
+
+    // 构造nTexture1
+    cudaMalloc((void **) &d_nTexture1_u, sampleCnt_u * (pointsCnt_u + 1) * sizeof(float));
+    cudaMalloc((void **) &d_nTexture1_v, sampleCnt_v * (pointsCnt_v + 1) * sizeof(float));
+
+    // 构造g_basisTexture线程层级
+    dim3 blockBasis(512);
+    dim3 gridBasis_u((sampleCnt_u + blockBasis.x - 1) / blockBasis.x);
+    dim3 gridBasis_v((sampleCnt_v + blockBasis.x - 1) / blockBasis.x);

    // 构造线程层级，调用核函数
    dim3 block(32, 32);
    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);
    // 记录用时
    double time_cost_device;
-    if(recordTime) time_cost_device = utils::get_time_windows();
-    calculate_kernel <<<grid, block>>>(d_points, d_knots_u, d_knots_v, pointsCntU, pointsCntV, pointSize,
-                                       knots_u.size(), knots_v.size(), sampleCnt_u, sampleCnt_v);
-    if(recordTime) {
-        cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
+    g_basisTexture<<<gridBasis_u, blockBasis>>>(d_nTexture_u, d_nTexture1_u, d_knots_u, pointsCnt_u, knotsCnt_u,
+                                                sampleCnt_u);
+    cudaDeviceSynchronize();
+    g_basisTexture<<<gridBasis_v, blockBasis>>>(d_nTexture_v, d_nTexture1_v, d_knots_v, pointsCnt_v, knotsCnt_v,
+                                                sampleCnt_v);
+    cudaDeviceSynchronize();
+
+    if (recordTime) time_cost_device = utils::get_time_windows();
+    g_evaluate <<<grid, block>>>(d_nTexture_u, d_nTexture_v, d_points, pointsCnt_u, pointsCnt_v, pointSize,
+                                 knots_u[knotsCnt_u - 1], knots_v[knotsCnt_v - 1], sampleCnt_u, sampleCnt_v);
+    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
+    if (recordTime) {
        time_cost_device = utils::get_time_windows() - time_cost_device;
-        printf("GPU time cost of surface evaluation for %d samples: %lf\n", sampleCnt_u * sampleCnt_v, time_cost_device);
+        printf("GPU time cost of surface evaluation for %d samples: %lf\n", sampleCnt_u * sampleCnt_v,
+               time_cost_device);
    }

    // 释放内存
    free(h_points);
-    free(h_knotsU);
-    free(h_knotsV);
-    cudaFree(d_points);
-    cudaFree(d_knots_u);
-    cudaFree(d_knots_v);
+    free(h_knots_u);
+    free(h_knots_v);
+    printf("First derivatives and normal vectors calculated by GPU:\n");
+    derivative();
+
+    cudaDeviceReset();
+    return {};
+}
+
+
+__host__ std::vector<std::map<float, std::vector<float>>>
+NurbsCurve::Evaluator::evaluate(int sampleCnt_) {
+    this->sampleCnt = sampleCnt_;
+    // 构造指向device的controlPoints
+    const int pointsCnt = controlPoints.size(), pointSize = controlPoints[0].size();
+    const int pointsBytes = pointsCnt * pointSize * sizeof(float);
+    auto *h_points = (float *) malloc(pointsBytes);
+    for (int i = 0; i < pointsCnt; i++) {
+        for (int j = 0; j < pointSize; j++) {
+            h_points[i * pointSize + j] = controlPoints[i][j];
+        }
+    }
+    cudaMalloc((void **) &d_points, pointsBytes);
+    cudaMemcpy(d_points, h_points, pointsBytes, cudaMemcpyHostToDevice);
+
+    // 构造指向device的knots
+    const int knotsCnt = knots.size();
+    const int knotsBytes = knotsCnt * sizeof(float);
+    auto *h_knots = (float *) malloc(knotsBytes);
+    for (int i = 0; i < knotsCnt; i++) h_knots[i] = knots[i];
+    cudaMalloc((void **) &d_knots, knotsBytes);
+    cudaMemcpy(d_knots, h_knots, knotsBytes, cudaMemcpyHostToDevice);
+
+    // 分配nTexture的内存。只需要GPU内存
+//    float *d_nTexture = nullptr;
+    cudaMalloc((void **) &d_nTexture,
+               sampleCnt * pointsCnt * sizeof(float)); // 注意nTexture的大小，在算梯度时用得到i=pointsCnt + 1的基函数值
+
+    // 分配nTexture1的内存。只需要GPU内存
+//    float *d_nTexture1 = nullptr;
+    cudaMalloc((void **) &d_nTexture1, sampleCnt * (pointsCnt + 1) * sizeof(float));
+
+    // 构造g_basisTexture线程层级
+    dim3 blockBasis(512);
+    dim3 gridBasis((sampleCnt + blockBasis.x - 1) / blockBasis.x);
+
+    // 构造线程层级
+    dim3 block(32, 32);
+    dim3 grid((sampleCnt + block.x * block.y - 1) / (block.x * block.y));
+    // 记录用时
+    double time_cost_device;
+    if (recordTime) time_cost_device = utils::get_time_windows();
+    printf("there..\n");
+    g_basisTexture <<<gridBasis, blockBasis>>>(d_nTexture, d_nTexture1, d_knots, pointsCnt, knotsCnt, sampleCnt);
+//    cudaMemcpy(d_nTextureCpy, d_nTexture, nTextureBytes, cudaMemcpyDeviceToDevice); // 有同步功能
+    cudaDeviceSynchronize();
+    printf("here..\n");
+    g_evaluate <<<grid, block>>>(d_nTexture, d_points, pointsCnt, pointSize, knots[knotsCnt - 1], sampleCnt);
+//    g_test<<<1,6>>>(d_nTextureCpy);
+    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
+    if (recordTime) {
+        time_cost_device = utils::get_time_windows() - time_cost_device;
+        printf("GPU time cost of curve evaluation for %d samples: %lf\n", sampleCnt, time_cost_device);
+    }
+    free(h_points);
+    free(h_knots);
+
+    printf("First derivatives calculated by GPU:\n");
+    derivative();

    cudaDeviceReset();
    return {};
 }

+__host__ void NurbsSurface::Evaluator::derivative() {
+    float *d_derTexture_u = nullptr;
+    float *d_derTexture_v = nullptr;
+    const int pointsCnt_u = controlPoints.size(), pointsCnt_v = controlPoints[0].size(), pointSize = controlPoints[0][0].size();
+    const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size();
+    cudaMalloc((void **) &d_derTexture_u, sampleCnt_u * pointsCnt_u * sizeof(float));
+    cudaMalloc((void **) &d_derTexture_v, sampleCnt_v * pointsCnt_v * sizeof(float));

+    // 构造线程层级
+    dim3 block(32, 32);
+    dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y);
+    // 构造g_basisTexture线程层级
+    dim3 blockTex(512);
+    dim3 gridTex_u((sampleCnt_u + blockTex.x - 1) / blockTex.x);
+    dim3 gridTex_v((sampleCnt_v + blockTex.x - 1) / blockTex.x);
+    // 记录用时
+    double time_cost_device;
+    if (recordTime) time_cost_device = utils::get_time_windows();
+    g_derTexture<<<gridTex_u, blockTex>>>(d_derTexture_u, d_nTexture1_u, d_knots_u, pointsCnt_u, knotsCnt_u,
+                                          sampleCnt_u);
+    g_derTexture<<<gridTex_v, blockTex>>>(d_derTexture_v, d_nTexture1_v, d_knots_v, pointsCnt_v, knotsCnt_v,
+                                          sampleCnt_v);
+    cudaDeviceSynchronize();
+    g_derivative<<<grid, block>>>(d_derTexture_u, d_derTexture_v, d_nTexture_u, d_nTexture_v, d_points, pointsCnt_u,
+                                  pointsCnt_v, pointSize, knots_u[knotsCnt_u - 1], knots_v[knotsCnt_v - 1], sampleCnt_u,
+                                  sampleCnt_v);
+    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
+    if (recordTime) {
+        time_cost_device = utils::get_time_windows() - time_cost_device;
+        printf("GPU time cost of surface first derivative calculating for %d samples: %lf\n", sampleCnt_u * sampleCnt_v,
+               time_cost_device);
+    }
+    cudaFree(d_derTexture_u);
+    cudaFree(d_derTexture_v);
+}
+
+__host__ void NurbsCurve::Evaluator::derivative() {
+    float *d_derTexture = nullptr;
+    const int pointsCnt = controlPoints.size(), pointSize = controlPoints[0].size();
+    const int knotsCnt = knots.size();
+    cudaMalloc((void **) &d_derTexture, sampleCnt * pointsCnt * sizeof(float));
+
+    // 构造线程层级
+    dim3 block(32, 32);
+    dim3 grid((sampleCnt + block.x * block.y - 1) / (block.x * block.y));
+    // 构造g_basisTexture线程层级
+    dim3 blockTex(512);
+    dim3 gridTex((sampleCnt + blockTex.x - 1) / blockTex.x);
+    // 记录用时
+    double time_cost_device;
+    if (recordTime) time_cost_device = utils::get_time_windows();
+    g_derTexture<<<gridTex, blockTex>>>(d_derTexture, d_nTexture1, d_knots, pointsCnt, knotsCnt, sampleCnt);
+    cudaDeviceSynchronize();
+    g_derivative<<<grid, block>>>(d_derTexture, d_points, pointsCnt, pointSize, knots[knotsCnt - 1], sampleCnt);
+    cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
+    if (recordTime) {
+        time_cost_device = utils::get_time_windows() - time_cost_device;
+        printf("GPU time cost of curve first derivative calculating for %d samples: %lf\n", sampleCnt,
+               time_cost_device);
+    }
+    cudaFree(d_derTexture);
+}
+
+
+//__global__ void
+//NurbsSurface::g_evaluate(const float *d_points, const float *d_knots_u, const float *d_knots_v,
+//                         int d_pointsCnt_u,
+//                         int d_pointsCnt_v, int d_pointSize, int d_knotsCnt_u, int d_knotsCnt_v,
+//                         int d_sampleCnt_u, int d_sampleCnt_v) {
+////    printf(" surface calculating... \n");
+//    // 二维grid和二维的block
+//    int ix = int(blockIdx.x * blockDim.x + threadIdx.x);
+//    int iy = int(blockIdx.y * blockDim.y + threadIdx.y);
+//
+//    float d_paramCeil_u = d_knots_u[d_knotsCnt_u - 1];
+//    float d_paramCeil_v = d_knots_v[d_knotsCnt_v - 1];
+//
+//    float u = ix * d_paramCeil_u / (d_sampleCnt_u - 1);
+//    float v = iy * d_paramCeil_v / (d_sampleCnt_v - 1);
+//
+//    if (u > 1.0 * d_paramCeil_u || v > 1.0 * d_paramCeil_v) {
+//        return;
+//    }
+//
+//    int d_degree_u = d_knotsCnt_u - 1 - d_pointsCnt_u;
+//    int d_degree_v = d_knotsCnt_v - 1 - d_pointsCnt_v;
+//    // 注意，在device中，全局内存还是以malloc和free的方式分配和回收的，而不是使用cudaMalloc和cudaFree
+//    auto *N_Texture_U = (float *) malloc((d_degree_u + 1) * (d_knotsCnt_u - 1) * sizeof(float));
+//    auto *N_Texture_V = (float *) malloc((d_degree_v + 1) * (d_knotsCnt_v - 1) * sizeof(float));
+//    d_basisFunction(N_Texture_U, d_knots_u, u, d_degree_u, d_knotsCnt_u);
+//    d_basisFunction(N_Texture_V, d_knots_v, v, d_degree_v, d_knotsCnt_v);
+//    float x = 0., y = 0., z = 0.;
+//    for (int i = 0; i < d_pointsCnt_u; i++) {
+//        for (int j = 0; j < d_pointsCnt_v; j++) {
+//            float N_U = N_Texture_U[d_degree_u * (d_knotsCnt_u - 1) + i];
+//            float N_V = N_Texture_V[d_degree_v * (d_knotsCnt_v - 1) + j];
+//            int idx = (i * d_pointsCnt_v + j) * d_pointSize;
+//            x += N_U * N_V * d_points[idx];
+//            y += N_U * N_V * d_points[idx + 1];
+//            z += N_U * N_V * d_points[idx + 2];
+//        }
+//    }
+//    printf("(%g, %g)-->(%g, %g, %g)\n", u, v, x, y, z); // %g输出，舍弃无意义的0
+//    free(N_Texture_U);
+//    free(N_Texture_V);
+//}

 __global__ void
-NurbsSurface::calculate_kernel(const float *d_points, const float *d_knots_u, const float *d_knots_v, int d_pointsCnt_u,
-                               int d_pointsCnt_v, int d_pointSize, int d_knotsCnt_u, int d_knotsCnt_v,
-                               int d_sampleCnt_u, int d_sampleCnt_v) {
-    // 二维grid和二维的block
-    int ix = int(blockIdx.x * blockDim.x + threadIdx.x);
-    int iy = int(blockIdx.y * blockDim.y + threadIdx.y);
+NurbsSurface::g_evaluate(const float *d_nTexture_u, const float *d_nTexture_v, const float *d_points, int d_pointsCnt_u,
+                         int d_pointsCnt_v, int d_pointSize, float d_lastKnot_u, float d_lastKnot_v, int d_sampleCnt_u,
+                         int d_sampleCnt_v) {

-    float d_paramCeil_u = d_knots_u[d_knotsCnt_u - 1];
-    float d_paramCeil_v = d_knots_v[d_knotsCnt_v - 1];
+    // 二维grid和二维的block
+    int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    int iy = blockIdx.y * blockDim.y + threadIdx.y;

-    float u = ix * d_paramCeil_u / (d_sampleCnt_u - 1);
-    float v = iy * d_paramCeil_v / (d_sampleCnt_v - 1);
+    float u = ix * d_lastKnot_u / (d_sampleCnt_u - 1);
+    float v = iy * d_lastKnot_v / (d_sampleCnt_v - 1);

-    if (u > 1.0 * d_paramCeil_u || v > 1.0 * d_paramCeil_v) {
+    if (u > 1.0 * d_lastKnot_u || v > 1.0 * d_lastKnot_v) {
        return;
    }

-    int d_degree_u = d_knotsCnt_u - 1 - d_pointsCnt_u;
-    int d_degree_v = d_knotsCnt_v - 1 - d_pointsCnt_v;
-    // 注意，在device中，全局内存还是以malloc和free的方式分配和回收的，而不是使用cudaMalloc和cudaFree
-    auto *N_Texture_U = (float *) malloc((d_degree_u + 1) * (d_knotsCnt_u - 1) * sizeof(float));
-    auto *N_Texture_V = (float *) malloc((d_degree_v + 1) * (d_knotsCnt_v - 1) * sizeof(float));
-    d_basisFunction(N_Texture_U, d_knots_u, u, d_degree_u, d_knotsCnt_u);
-    d_basisFunction(N_Texture_V, d_knots_v, v, d_degree_v, d_knotsCnt_v);
    float x = 0., y = 0., z = 0.;
    for (int i = 0; i < d_pointsCnt_u; i++) {
+        float N_U = d_nTexture_u[ix * d_pointsCnt_u + i];
        for (int j = 0; j < d_pointsCnt_v; j++) {
-            float N_U = N_Texture_U[d_degree_u * (d_knotsCnt_u - 1) + i];
-            float N_V = N_Texture_V[d_degree_v * (d_knotsCnt_v - 1) + j];
+            float N_V = d_nTexture_v[iy * d_pointsCnt_v + j];
            int idx = (i * d_pointsCnt_v + j) * d_pointSize;
            x += N_U * N_V * d_points[idx];
            y += N_U * N_V * d_points[idx + 1];
@ -112,88 +306,150 @@ NurbsSurface::calculate_kernel(const float *d_points, const float *d_knots_u, co
        }
    }
    printf("(%g, %g)-->(%g, %g, %g)\n", u, v, x, y, z); // %g输出，舍弃无意义的0
-    free(N_Texture_U);
-    free(N_Texture_V);
 }

 __global__ void
-NurbsCurve::calculate_kernel(const float *d_points, const float *d_knots, int d_pointsCnt, int d_pointSize,
-                             int d_knotsCnt, int d_sampleCnt) {
-    // 二维grid和一维的block
-    int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
-    float d_paramCeil = d_knots[d_knotsCnt - 1];
-    float u = idx * d_paramCeil / (d_sampleCnt - 1);
-    if (u > 1.0 * d_paramCeil) {
+NurbsSurface::g_derivative(const float *derTexture_u, const float *derTexture_v, const float *nTexture_u,
+                           const float *nTexture_v, const float *d_points, int d_pointsCnt_u, int d_pointsCnt_v,
+                           int d_pointSize, float d_lastKnot_u, float d_lastKnot_v, int d_sampleCnt_u,
+                           int d_sampleCnt_v) {
+    // 二维grid和二维的block
+    int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    int iy = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (ix >= d_sampleCnt_u || iy >= d_sampleCnt_v) {
        return;
    }
+    float u = ix * d_lastKnot_u / (d_sampleCnt_u - 1);
+    float v = iy * d_lastKnot_v / (d_sampleCnt_v - 1);

-    int d_degree = d_knotsCnt - 1 - d_pointsCnt;
-    // 注意，在device中，全局内存还是以malloc和free的方式分配和回收的，而不是使用cudaMalloc和cudaFree
-    auto *N_Texture = (float *) malloc((d_degree + 1) * (d_knotsCnt - 1) * sizeof(float));
-    d_basisFunction(N_Texture, d_knots, u, d_degree, d_knotsCnt);
+    float x_u = 0., y_u = 0, z_u = 0.;
+    float x_v = 0., y_v = 0, z_v = 0.;
+
+    for (int i = 0; i < d_pointsCnt_u; i++) {
+        for (int j = 0; j < d_pointsCnt_u; j++) {
+            int baseIdx = (i * d_pointsCnt_v + j) * d_pointSize;
+            float factor_u = derTexture_u[ix * d_pointsCnt_u + i] * nTexture_v[iy * d_pointsCnt_v + j];
+            float factor_v = derTexture_v[iy * d_pointsCnt_v + j] * nTexture_u[ix * d_pointsCnt_u + i];
+            x_u += factor_u * d_points[baseIdx];
+            y_u += factor_u * d_points[baseIdx + 1];
+            z_u += factor_u * d_points[baseIdx + 2];
+
+            x_v += factor_v * d_points[baseIdx];
+            y_v += factor_v * d_points[baseIdx + 1];
+            z_v += factor_v * d_points[baseIdx + 2];
+        }
+    }
+    float x_n = y_u * z_v - y_v * z_u, y_n = x_v * z_u - x_u * z_v, z_n = x_u * y_v - x_v * y_u; // 叉乘得到法向量
+    printf("(%g,%g)-->u:(%g, %g, %g), v:(%g,%g,%g), normal:(%g,%g,%g)\n", u, v, x_u, y_u, z_u, x_v, y_v, z_v,
+           x_n, y_n, z_n);
+}
+
+__global__ void
+NurbsCurve::g_evaluate(const float *NTexture, const float *d_points, const int d_pointsCnt,
+                       const int d_pointSize, const float d_lastKnot, const int d_sampleCnt) {
+//    printf(" curve calculating... \n");
+    // 二维grid和一维的block
+//    int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
+    // 二维block和一维grid
+    int idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
+    float u = idx * d_lastKnot / (d_sampleCnt - 1);
+    if (u > 1.0 * d_lastKnot) {
+        return;
+    }
+//
+//    int d_degree = d_knotsCnt - 1 - d_pointsCnt;
+//    // 注意，在device中，全局内存还是以malloc和free的方式分配和回收的，而不是使用cudaMalloc和cudaFree
+//    auto *N_dp = (float *) malloc((d_degree + 1) * (d_knotsCnt - 1) * sizeof(float));
+//    d_basisFunction(N_dp, d_knots, u, d_degree, d_knotsCnt);
    float x = 0., y = 0., z = 0.;
    for (int i = 0; i < d_pointsCnt; i++) {
-        float N = N_Texture[d_degree * (d_knotsCnt - 1) + i];
+        float N = NTexture[idx * d_pointsCnt + i];
        int baseIdx = i * d_pointSize;
        x += N * d_points[baseIdx];
        y += N * d_points[baseIdx + 1];
        z += N * d_points[baseIdx + 2];
    }
    printf("(%g)-->(%g, %g, %g)\n", u, x, y, z); // %g输出，舍弃无意义的0
-    free(N_Texture);
 }

-__host__ NurbsCurve::Evaluator::Evaluator(std::vector<std::vector<float>> controlPoints,
-                                          std::vector<float> knots) {
-    this->knots = std::move(knots);
-    this->controlPoints = std::move(controlPoints);
-    recordTime = false;
+__global__ void
+NurbsCurve::g_derivative(const float *derTexture, const float *d_points, int d_pointsCnt, int d_pointSize,
+                         float d_lastKnot, int d_sampleCnt) {
+    // 二维block和一维grid
+    int idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
+    if (idx >= d_sampleCnt) return;
+    float u = idx * d_lastKnot / (d_sampleCnt - 1);
+    float x = 0., y = 0, z = 0.;
+//    printf("pointSize: %d\n", d_pointSize);
+    for (int i = 0; i < d_pointsCnt; i++) {
+        int baseIdx = i * d_pointSize;
+        float nFactor = derTexture[idx * d_pointsCnt + i];
+        x += nFactor * d_points[baseIdx];
+        y += nFactor * d_points[baseIdx + 1];
+        z += nFactor * d_points[baseIdx + 2];
+//        printf("(x, y, z): (%g, %g, %g)\n", d_points[baseIdx], d_points[baseIdx + 1], d_points[baseIdx + 2]);
+    }
+    printf("(%g)-->(%g, %g, %g)\n", u, x, y, z);
 }

-
-__host__ std::vector<std::map<float, std::vector<float>>>
-NurbsCurve::Evaluator::calculate(int sampleCnt) {
-    // 构造指向device的controlPoints
-    const int pointsCnt = controlPoints.size(), pointSize = controlPoints[0].size();
-    const int pointsBytes = pointsCnt * pointSize * sizeof(float);
-    auto *h_points = (float *) malloc(pointsBytes);
-    for (int i = 0; i < pointsCnt; i++) {
-        for (int j = 0; j < pointSize; j++) {
-            h_points[i * pointSize + j] = controlPoints[i][j];
-        }
+__global__ void g_basisTexture(float *nTexture, float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt,
+                               int d_sampleCnt) {
+    // 一维grid和一维block
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;  // 采样点编号
+    float d_paramCeil = d_knots[d_knotsCnt - 1];
+    float u = idx * d_paramCeil / (d_sampleCnt - 1);
+    if (u > 1.0 * d_paramCeil) {
+        return;
    }
-    float *d_points;
-    cudaMalloc((void **) &d_points, pointsBytes);
-    cudaMemcpy(d_points, h_points, pointsBytes, cudaMemcpyHostToDevice);
+    int d_degree = d_knotsCnt - 1 - d_pointsCnt;
+    auto *N_dp = (float *) malloc((d_degree + 1) * (d_knotsCnt - 1) * sizeof(float));
+    d_basisFunction(N_dp, d_knots, u, d_degree, d_knotsCnt);
+    for (int i = 0; i < d_pointsCnt; i++) {
+        nTexture[idx * d_pointsCnt + i] = N_dp[d_degree * (d_knotsCnt - 1) + i];
+        nTexture1[idx * (d_pointsCnt + 1) + i] = N_dp[(d_degree - 1) * (d_knotsCnt - 1) + i];
+//        printf("nTexture1: %g ", nTexture1[idx * (d_pointsCnt + 1) + i]);
+    }
+    nTexture1[idx * (d_pointsCnt + 1) + d_pointsCnt] = N_dp[(d_degree - 1) * (d_knotsCnt - 1) +
+                                                            d_pointsCnt]; // nTexture1多记录一列数据
+    free(N_dp);
+}

-    // 构造指向device的knots
-    const int knotsCnt = knots.size();
-    const int knotsBytes = knotsCnt * sizeof(float);
-    auto *h_knots = (float *) malloc(knotsBytes);
-    for (int i = 0; i < knotsCnt; i++) h_knots[i] = knots[i];
-    float *d_knots;
-    cudaMalloc((void **) &d_knots, knotsBytes);
-    cudaMemcpy(d_knots, h_knots, knotsBytes, cudaMemcpyHostToDevice);

-    // 构造线程层级，调用核函数
-    dim3 block(32, 32);
-    dim3 grid((sampleCnt + block.x * block.y - 1) / (block.x * block.y));
-    // 记录用时
-    double time_cost_device;
-    if(recordTime)  time_cost_device = utils::get_time_windows();
-    calculate_kernel <<<grid, block>>>(d_points, d_knots, pointsCnt, pointSize, knotsCnt, sampleCnt);
-    if(recordTime) {
-        cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用
-        time_cost_device = utils::get_time_windows() - time_cost_device;
-        printf("GPU time cost of surface evaluation for %d samples: %lf\n", sampleCnt * sampleCnt, time_cost_device);
+__global__ void
+g_derTexture(float *derTexture, const float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt,
+             int d_sampleCnt) {
+    // 一维grid和一维block
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;  // 采样点编号
+    if (idx >= d_sampleCnt) {
+        return;
    }
-    free(h_points);
-    free(h_knots);
-    cudaFree(d_points);
-    cudaFree(d_knots);
+    int degree = d_knotsCnt - 1 - d_pointsCnt;

-    cudaDeviceReset();
-    return {};
+//    printf("degree: %d\n", degree);
+    for (int i = 0; i < d_pointsCnt; i++) {
+        float left = d_floatEqual(d_knots[i + degree], d_knots[i]) ? 0 :
+                     nTexture1[idx * (d_pointsCnt + 1) + i] * (degree - 1) / (d_knots[i + degree] - d_knots[i]);
+        float right = d_floatEqual(d_knots[i + degree + 1], d_knots[i + 1]) ? 0 :
+                      nTexture1[idx * (d_pointsCnt + 1) + i + 1] * (degree - 1) /
+                      (d_knots[i + degree + 1] - d_knots[i + 1]);
+        derTexture[idx * d_pointsCnt + i] = left - right;
+//        printf("<%d, %d> -- %g \n", idx, i, left - right);
+//        printf("nTex1: %g \n", nTexture1[idx * (d_pointsCnt + 1) + i]);
+    }
+}
+
+__host__ NurbsCurve::Evaluator::Evaluator(std::vector<std::vector<float>> controlPoints,
+                                          std::vector<float> knots) {
+    this->knots = std::move(knots);
+    this->controlPoints = std::move(controlPoints);
+    recordTime = false;
+    d_nTexture = nullptr;
+    d_nTexture1 = nullptr;
+
+    sampleCnt = 0;
+    d_points = nullptr;
+    d_knots = nullptr;
 }


@ -203,8 +459,8 @@ __device__ void d_basisFunction(float *N_Texture, const float *knots, float u, i
        for (int i = 0; i + p <= m - 1; i++) {
            if (p == 0) {
                if ((u > knots[i] || d_floatEqual(u, knots[i])) && (u < knots[i + 1])
-                ||
-                d_floatEqual(u, knots[i + 1]) && d_floatEqual(u, knots[m])) {
+                    ||
+                    d_floatEqual(u, knots[i + 1]) && d_floatEqual(u, knots[m])) {
                    N_Texture[p * m + i] = 1.0;
                } else {
                    N_Texture[p * m + i] = 0.0;
@ -226,11 +482,28 @@ __device__ bool d_floatEqual(float a, float b) {
    return abs(a - b) < 0.00001;
 }

-void NurbsCurve::Evaluator::setRecordTime(bool r){
+
+void NurbsCurve::Evaluator::setRecordTime(bool r) {
    recordTime = r;
 }

-void NurbsSurface::Evaluator::setRecordTime(bool r){
+void NurbsSurface::Evaluator::setRecordTime(bool r) {
    recordTime = r;
 }

+NurbsSurface::Evaluator::~Evaluator() {
+    cudaFree(d_nTexture_u);
+    cudaFree(d_nTexture_v);
+    cudaFree(d_nTexture1_u);
+    cudaFree(d_nTexture1_v);
+    cudaFree(d_points);
+    cudaFree(d_knots_u);
+    cudaFree(d_knots_v);
+}
+
+NurbsCurve::Evaluator::~Evaluator() {
+    cudaFree(d_nTexture);
+    cudaFree(d_nTexture1);
+    cudaFree(d_points);
+    cudaFree(d_knots);
+}
--- a/NurbsEvaluator.cuh
+++ b/NurbsEvaluator.cuh
@ -8,21 +8,37 @@

 namespace NurbsSurface {
    /**
-     * 曲面计算的核函数，负责计算曲面中的一个点的值
+     * 曲线计算的核函数
     * @param d_pointSize 点的大小(3: [x, y, z] | 4：[x, y, z, w])
     */
-    __global__ void
-    calculate_kernel(const float *d_points, const float *d_knots_u, const float *d_knots_v, int d_pointsCnt_u,
-                     int d_pointsCnt_v, int d_pointSize, int d_knotsCnt_u, int d_knotsCnt_v, int d_sampleCnt_u,
-                     int d_sampleCnt_v);
+    __global__ static void
+    g_evaluate(const float *d_nTexture_u, const float *d_nTexture_v, const float *d_points, int d_pointsCnt_u,
+               int d_pointsCnt_v, int d_pointSize, float d_lastKnot_u, float d_lastKnot_v, int d_sampleCnt_u,
+               int d_sampleCnt_v);
+
+    __global__ static void
+    g_derivative(const float *derTexture_u, const float *derTexture_v, const float *nTexture_u, const float *nTexture_v,
+                 const float *d_points, int d_pointsCnt_u, int d_pointsCnt_v, int d_pointSize, float d_lastKnot_u,
+                 float d_lastKnot_v, int d_sampleCnt_u, int d_sampleCnt_v);

    class Evaluator {
    private:
        std::vector<std::vector<std::vector<float>>> controlPoints;
+        float *d_points;
        std::vector<float> knots_u;
        std::vector<float> knots_v;
+        float *d_knots_u;
+        float *d_knots_v;
        bool recordTime;

+        float *d_nTexture_u;   // u方向指向度为p时的device中的nurbs基函数矩阵
+        float *d_nTexture_v;   // v方向指向度为p时的device中的nurbs基函数矩阵
+        float *d_nTexture1_u;   // u方向指向度为p-1时的device中的nurbs基函数矩阵
+        float *d_nTexture1_v;   // v方向指向度为p-1时的device中的nurbs基函数矩阵
+
+        int sampleCnt_u;
+        int sampleCnt_v;
+
    public:
        /**
         * 构造函数
@ -40,10 +56,17 @@ namespace NurbsSurface {
         * @return 由 map 组成的vector{<<u, v>, {x, y, z}>}
         */
        __host__ std::vector<std::map<std::pair<float, float>, std::vector<float>>>
-        calculate(int sampleCnt_u, int sampleCnt_v);
+        evaluate(int sampleCnt_u_, int sampleCnt_v_);
+
+        /**
+         * 供外部CPU程序使用的、负责调用gpu并行计算切向量的方法
+         */
+        __host__ void derivative();

        void setRecordTime(bool r);

+        ~Evaluator();
+
    };
 }

@ -51,40 +74,86 @@ namespace NurbsSurface {
 * 曲线部分
 */
 namespace NurbsCurve {
+
+    __global__ void g_test(float *nTexture);
+
    /**
     * 曲线计算的核函数
     * @param d_pointSize 点的大小(3: [x, y, z] | 4：[x, y, z, w])
     */
    __global__ static void
-    calculate_kernel(const float *d_points, const float *d_knots, int d_pointsCnt, int d_pointSize, int d_knotsCnt,
-                     int d_sampleCnt);
+    g_evaluate(const float *NTexture, const float *d_points, int d_pointsCnt, int d_pointSize,
+               float d_lastKnot, int d_sampleCnt);
+
+    __global__ static void
+    g_derivative(const float *derTexture, const float *d_points, int d_pointsCnt, int d_pointSize, float d_lastKnot,
+                 int d_sampleCnt);
+

    class Evaluator {
    private:
        std::vector<std::vector<float>> controlPoints;
        std::vector<float> knots;
+        float *d_knots;
+        float *d_points;
        bool recordTime;
+
+        float *d_nTexture;   // 指向度为p时的device中的nurbs基函数矩阵
+        float *d_nTexture1;   // 指向度为p-1时的device中的nurbs基函数矩阵
+
+        int sampleCnt;
    public:
        /**
         * 构造函数
         * @param controlPoints 控制点矩阵[pointsCnt][3]
         */
        __host__ explicit Evaluator(std::vector<std::vector<float>> controlPoints, std::vector<float> knots);
+
        /**
-         * 供外部CPU程序使用的、负责调用gpu并行计算的方法
-         * @param sampleCnt
+         * 供外部CPU程序使用的、负责调用gpu并行进行evaluation的方法
+         * @param sampleCnt_ 在参数域内均匀采样的采样数，它会更新成员变量中的sampleCnt
         * @return 由 map 组成的vector{<u, {x, y, z}>}
         */
-        __host__ std::vector<std::map<float, std::vector<float>>> calculate(int sampleCnt);
+        __host__ std::vector<std::map<float, std::vector<float>>> evaluate(int sampleCnt_);
+
+        /**
+         * 供外部CPU程序使用的、负责调用gpu并行计算切向量的方法
+         */
+        __host__ void derivative();
+
+        __host__ ~Evaluator();
+
        void setRecordTime(bool r);
    };
 }

+
+
+
+
+/**
+ * 计算并保存基函数值
+ * @param nTexture 记录度数为p的基函数值，规模为【sampleCnt，pointsCnt】
+ * @param nTexture1 记录度数为p-1的基函数值，规模为【sampleCnt+1，pointsCnt】
+ */
+__global__ static void
+g_basisTexture(float *nTexture, float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt,
+               int d_sampleCnt);
+
+/**
+ * 计算并保存基函数对采样点切向量的分量值
+ * @param derTexture 记录度数为p的Nurbs基函数对采样点切向量的分量值，大小为【sampleCnt，pointsCnt】
+ * @param nTexture1 度数为p-1的基函数值，规模为【sampleCnt+1，pointsCnt】
+ */
+__global__ static void
+g_derTexture(float *derTexture, const float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt,
+             int d_sampleCnt);
+
 /**
 * 当u值已知时，根据基函数N的递推表达式，采用动态规划的方式求解N值
 * @param N_Texture 结果返回在N_Texture中
 */
-__device__ void d_basisFunction(float *N_Texture, const float *knots, float u, int degree, int d_knotsCnt);
+__device__ void d_basisFunction(float *nTexture, const float *knots, float u, int degree, int d_knotsCnt);

 /**
 * device中判断两个浮点数是否相等。与CPU中一样，GPU中的浮点数也存在很小的误差，直接使用==判断往往容易将相等误判为不等
--- a/main.cpp
+++ b/main.cpp
@ -12,7 +12,7 @@ int main() {
                                                  {0, 0, 0, 0.1, 0.5, 0.8, 1, 1, 1},
                                                  {0, 0, 0, 0.2, 0.7, 0.8, 1, 1, 1});
    nurbsSurfaceEvaluator.setRecordTime(true);
-    nurbsSurfaceEvaluator.calculate(3, 4);
+    nurbsSurfaceEvaluator.evaluate(3, 4);

    printf("==============================\n");

@ -25,7 +25,9 @@ int main() {
             {4,  -5,  0}},
            {0, 0, 0, 0.1, 0.5, 0.8, 1, 1, 1});
    nurbsCurveEvaluator.setRecordTime(true);
-    nurbsCurveEvaluator.calculate(5);
+    nurbsCurveEvaluator.evaluate(11);
+    printf("\n");
+//    nurbsCurveEvaluator.derivative();

    return 0;
 }