diff --git a/CMakeLists.txt b/CMakeLists.txt index 9aaee53..f6c9e4e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ project(NurbsEvaluator LANGUAGES CXX CUDA) set(CMAKE_CUDA_STANDARD 14) -add_executable(NurbsEvaluator main.cpp NurbsEvaluator.cu NurbsEvaluator.cuh utils.cpp utils.h) +add_executable(NurbsEvaluator main.cpp NurbsEvaluator.cu NurbsEvaluator.cuh utils.cpp utils.h NurbsBasis.cu NurbsBasis.cuh) #add_compile_options("$<$:/utf-8>") #add_compile_options("$<$:/utf-8>") diff --git a/NurbsBasis.cu b/NurbsBasis.cu new file mode 100644 index 0000000..109224e --- /dev/null +++ b/NurbsBasis.cu @@ -0,0 +1,38 @@ +// +// Created by 14727 on 2022/11/19. +// + +#include "NurbsBasis.cuh" + +__device__ void d_basisFunction1(float *N_Texture, const float *knots, float u, int degree, int d_knotsCnt) { + int m = d_knotsCnt - 1; + for (int p = 0; p <= degree; p++) { + for (int i = 0; i + p <= m - 1; i++) { + if (p == 0) { + if ((u > knots[i] || d_floatEqual1(u, knots[i])) && (u < knots[i + 1]) + || + d_floatEqual1(u, knots[i + 1]) && d_floatEqual1(u, knots[m])) { + N_Texture[p * m + i] = 1.0; + } else { + N_Texture[p * m + i] = 0.0; + } + } else { + float Nip_1 = N_Texture[(p - 1) * m + i]; + float Ni1p_1 = N_Texture[(p - 1) * m + i + 1]; + float left = d_floatEqual1(knots[i + p], knots[i]) ? 0 : (u - knots[i]) * Nip_1 / + (knots[i + p] - knots[i]); + float right = d_floatEqual1(knots[i + p + 1], knots[i + 1]) ? 0 : (knots[i + p + 1] - u) * Ni1p_1 / + (knots[i + p + 1] - knots[i + 1]); + N_Texture[p * m + i] = left + right; + } + } + } +} + +__device__ bool d_floatEqual1(float a, float b) { + return abs(a - b) < 0.00001; +} + +void NurbsBasis::myPrint11(int a, int b) { + printf("In NurbsBasis %d!!!\n", a * b); +} diff --git a/NurbsBasis.cuh b/NurbsBasis.cuh new file mode 100644 index 0000000..3051841 --- /dev/null +++ b/NurbsBasis.cuh @@ -0,0 +1,28 @@ +// +// Created by 14727 on 2022/11/19. +// + +#ifndef NURBSEVALUATOR_NURBSBASIS_CUH +#define NURBSEVALUATOR_NURBSBASIS_CUH + +#include +#include "cstdio" + +/** + * 当u值已知时,根据基函数N的递推表达式,采用动态规划的方式求解N值 + * @param N_Texture 结果返回在N_Texture中 + */ +extern __device__ void d_basisFunction1(float *N_Texture, const float *knots, float u, int degree, int d_knotsCnt); + +/** + * device中判断两个浮点数是否相等。与CPU中一样,GPU中的浮点数也存在很小的误差,直接使用==判断往往容易将相等误判为不等 + * @return true:相等 + */ +extern __device__ bool d_floatEqual1(float a, float b); + +namespace NurbsBasis { + void myPrint11(int a, int b); +}; + + +#endif //NURBSEVALUATOR_NURBSBASIS_CUH diff --git a/NurbsEvaluator.cu b/NurbsEvaluator.cu index 19409fc..ce2cadf 100644 --- a/NurbsEvaluator.cu +++ b/NurbsEvaluator.cu @@ -7,6 +7,9 @@ #include "NurbsEvaluator.cuh" #include "cstdio" #include "utils.h" +//#include "NurbsBasis.cuh" + +//extern __device__ void NurbsBasis::d_basisFunction(float *N_Texture, const float *knots, float u, int degree, int d_knotsCnt) {}; __host__ NurbsSurface::Evaluator::Evaluator(std::vector>> controlPoints, std::vector knots_u, std::vector knots_v) { @@ -14,97 +17,288 @@ __host__ NurbsSurface::Evaluator::Evaluator(std::vectorknots_v = std::move(knots_v); this->controlPoints = std::move(controlPoints); recordTime = false; + d_nTexture_u = nullptr; + d_nTexture_v = nullptr; + d_nTexture1_u = nullptr; + d_nTexture1_v = nullptr; + d_knots_u = nullptr; + d_knots_v = nullptr; + d_points = nullptr; } __host__ std::vector, std::vector>> -NurbsSurface::Evaluator::calculate(int sampleCnt_u, int sampleCnt_v) { +NurbsSurface::Evaluator::evaluate(int sampleCnt_u_, int sampleCnt_v_) { + sampleCnt_u = sampleCnt_u_; + sampleCnt_v = sampleCnt_v_; + printf("outside printf..\n"); +// NurbsBasis::myPrint11(1, 3); + // 构造指向device的controlPoints - const int pointsCntU = controlPoints.size(), pointsCntV = controlPoints[0].size(), pointSize = controlPoints[0][0].size(); - const int pointsBytes = pointsCntU * pointsCntV * pointSize * sizeof(float); + const int pointsCnt_u = controlPoints.size(), pointsCnt_v = controlPoints[0].size(), pointSize = controlPoints[0][0].size(); + const int pointsBytes = pointsCnt_u * pointsCnt_v * pointSize * sizeof(float); auto *h_points = (float *) malloc(pointsBytes); - for (int i = 0; i < pointsCntU; i++) { - for (int j = 0; j < pointsCntV; j++) { + for (int i = 0; i < pointsCnt_u; i++) { + for (int j = 0; j < pointsCnt_v; j++) { for (int k = 0; k < pointSize; k++) { - h_points[(i * pointsCntV + j) * pointSize + k] = controlPoints[i][j][k]; + h_points[(i * pointsCnt_v + j) * pointSize + k] = controlPoints[i][j][k]; } } } - float *d_points; cudaMalloc((void **) &d_points, pointsBytes); cudaMemcpy(d_points, h_points, pointsBytes, cudaMemcpyHostToDevice); // 构造指向device的knots const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size(); - const int knotsBytesU = knotsCnt_u * sizeof(float), knotsBytesV = knotsCnt_v * sizeof(float); - auto *h_knotsU = (float *) malloc(knotsBytesU), *h_knotsV = (float *) malloc(knotsBytesV); - for (int i = 0; i < knotsCnt_u; i++) h_knotsU[i] = knots_u[i]; - for (int i = 0; i < knotsCnt_v; i++) h_knotsV[i] = knots_v[i]; - - float *d_knots_u; - float *d_knots_v; - cudaMalloc((void **) &d_knots_u, knotsBytesU); - cudaMalloc((void **) &d_knots_v, knotsBytesV); - cudaMemcpy(d_knots_u, h_knotsU, knotsBytesU, cudaMemcpyHostToDevice); - cudaMemcpy(d_knots_v, h_knotsV, knotsBytesV, cudaMemcpyHostToDevice); + const int knotsBytes_u = knotsCnt_u * sizeof(float), knotsBytes_v = knotsCnt_v * sizeof(float); + auto *h_knots_u = (float *) malloc(knotsBytes_u), *h_knots_v = (float *) malloc(knotsBytes_v); + for (int i = 0; i < knotsCnt_u; i++) h_knots_u[i] = knots_u[i]; + for (int i = 0; i < knotsCnt_v; i++) h_knots_v[i] = knots_v[i]; + + cudaMalloc((void **) &d_knots_u, knotsBytes_u); + cudaMalloc((void **) &d_knots_v, knotsBytes_v); + cudaMemcpy(d_knots_u, h_knots_u, knotsBytes_u, cudaMemcpyHostToDevice); + cudaMemcpy(d_knots_v, h_knots_v, knotsBytes_v, cudaMemcpyHostToDevice); + + // 构造nTexture + cudaMalloc((void **) &d_nTexture_u, + sampleCnt_u * pointsCnt_u * sizeof(float)); // 注意nTexture的大小,在算梯度时用得到i=pointsCnt + 1的基函数值 + cudaMalloc((void **) &d_nTexture_v, sampleCnt_v * pointsCnt_v * sizeof(float)); + + // 构造nTexture1 + cudaMalloc((void **) &d_nTexture1_u, sampleCnt_u * (pointsCnt_u + 1) * sizeof(float)); + cudaMalloc((void **) &d_nTexture1_v, sampleCnt_v * (pointsCnt_v + 1) * sizeof(float)); + + // 构造g_basisTexture线程层级 + dim3 blockBasis(512); + dim3 gridBasis_u((sampleCnt_u + blockBasis.x - 1) / blockBasis.x); + dim3 gridBasis_v((sampleCnt_v + blockBasis.x - 1) / blockBasis.x); // 构造线程层级,调用核函数 dim3 block(32, 32); dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y); // 记录用时 double time_cost_device; - if(recordTime) time_cost_device = utils::get_time_windows(); - calculate_kernel <<>>(d_points, d_knots_u, d_knots_v, pointsCntU, pointsCntV, pointSize, - knots_u.size(), knots_v.size(), sampleCnt_u, sampleCnt_v); - if(recordTime) { - cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用 + g_basisTexture<<>>(d_nTexture_u, d_nTexture1_u, d_knots_u, pointsCnt_u, knotsCnt_u, + sampleCnt_u); + cudaDeviceSynchronize(); + g_basisTexture<<>>(d_nTexture_v, d_nTexture1_v, d_knots_v, pointsCnt_v, knotsCnt_v, + sampleCnt_v); + cudaDeviceSynchronize(); + + if (recordTime) time_cost_device = utils::get_time_windows(); + g_evaluate <<>>(d_nTexture_u, d_nTexture_v, d_points, pointsCnt_u, pointsCnt_v, pointSize, + knots_u[knotsCnt_u - 1], knots_v[knotsCnt_v - 1], sampleCnt_u, sampleCnt_v); + cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用 + if (recordTime) { time_cost_device = utils::get_time_windows() - time_cost_device; - printf("GPU time cost of surface evaluation for %d samples: %lf\n", sampleCnt_u * sampleCnt_v, time_cost_device); + printf("GPU time cost of surface evaluation for %d samples: %lf\n", sampleCnt_u * sampleCnt_v, + time_cost_device); } // 释放内存 free(h_points); - free(h_knotsU); - free(h_knotsV); - cudaFree(d_points); - cudaFree(d_knots_u); - cudaFree(d_knots_v); + free(h_knots_u); + free(h_knots_v); + printf("First derivatives and normal vectors calculated by GPU:\n"); + derivative(); + + cudaDeviceReset(); + return {}; +} + + +__host__ std::vector>> +NurbsCurve::Evaluator::evaluate(int sampleCnt_) { + this->sampleCnt = sampleCnt_; + // 构造指向device的controlPoints + const int pointsCnt = controlPoints.size(), pointSize = controlPoints[0].size(); + const int pointsBytes = pointsCnt * pointSize * sizeof(float); + auto *h_points = (float *) malloc(pointsBytes); + for (int i = 0; i < pointsCnt; i++) { + for (int j = 0; j < pointSize; j++) { + h_points[i * pointSize + j] = controlPoints[i][j]; + } + } + cudaMalloc((void **) &d_points, pointsBytes); + cudaMemcpy(d_points, h_points, pointsBytes, cudaMemcpyHostToDevice); + + // 构造指向device的knots + const int knotsCnt = knots.size(); + const int knotsBytes = knotsCnt * sizeof(float); + auto *h_knots = (float *) malloc(knotsBytes); + for (int i = 0; i < knotsCnt; i++) h_knots[i] = knots[i]; + cudaMalloc((void **) &d_knots, knotsBytes); + cudaMemcpy(d_knots, h_knots, knotsBytes, cudaMemcpyHostToDevice); + + // 分配nTexture的内存。只需要GPU内存 +// float *d_nTexture = nullptr; + cudaMalloc((void **) &d_nTexture, + sampleCnt * pointsCnt * sizeof(float)); // 注意nTexture的大小,在算梯度时用得到i=pointsCnt + 1的基函数值 + + // 分配nTexture1的内存。只需要GPU内存 +// float *d_nTexture1 = nullptr; + cudaMalloc((void **) &d_nTexture1, sampleCnt * (pointsCnt + 1) * sizeof(float)); + + // 构造g_basisTexture线程层级 + dim3 blockBasis(512); + dim3 gridBasis((sampleCnt + blockBasis.x - 1) / blockBasis.x); + + // 构造线程层级 + dim3 block(32, 32); + dim3 grid((sampleCnt + block.x * block.y - 1) / (block.x * block.y)); + // 记录用时 + double time_cost_device; + if (recordTime) time_cost_device = utils::get_time_windows(); + printf("there..\n"); + g_basisTexture <<>>(d_nTexture, d_nTexture1, d_knots, pointsCnt, knotsCnt, sampleCnt); +// cudaMemcpy(d_nTextureCpy, d_nTexture, nTextureBytes, cudaMemcpyDeviceToDevice); // 有同步功能 + cudaDeviceSynchronize(); + printf("here..\n"); + g_evaluate <<>>(d_nTexture, d_points, pointsCnt, pointSize, knots[knotsCnt - 1], sampleCnt); +// g_test<<<1,6>>>(d_nTextureCpy); + cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用 + if (recordTime) { + time_cost_device = utils::get_time_windows() - time_cost_device; + printf("GPU time cost of curve evaluation for %d samples: %lf\n", sampleCnt, time_cost_device); + } + free(h_points); + free(h_knots); + + printf("First derivatives calculated by GPU:\n"); + derivative(); cudaDeviceReset(); return {}; } +__host__ void NurbsSurface::Evaluator::derivative() { + float *d_derTexture_u = nullptr; + float *d_derTexture_v = nullptr; + const int pointsCnt_u = controlPoints.size(), pointsCnt_v = controlPoints[0].size(), pointSize = controlPoints[0][0].size(); + const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size(); + cudaMalloc((void **) &d_derTexture_u, sampleCnt_u * pointsCnt_u * sizeof(float)); + cudaMalloc((void **) &d_derTexture_v, sampleCnt_v * pointsCnt_v * sizeof(float)); + // 构造线程层级 + dim3 block(32, 32); + dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y); + // 构造g_basisTexture线程层级 + dim3 blockTex(512); + dim3 gridTex_u((sampleCnt_u + blockTex.x - 1) / blockTex.x); + dim3 gridTex_v((sampleCnt_v + blockTex.x - 1) / blockTex.x); + // 记录用时 + double time_cost_device; + if (recordTime) time_cost_device = utils::get_time_windows(); + g_derTexture<<>>(d_derTexture_u, d_nTexture1_u, d_knots_u, pointsCnt_u, knotsCnt_u, + sampleCnt_u); + g_derTexture<<>>(d_derTexture_v, d_nTexture1_v, d_knots_v, pointsCnt_v, knotsCnt_v, + sampleCnt_v); + cudaDeviceSynchronize(); + g_derivative<<>>(d_derTexture_u, d_derTexture_v, d_nTexture_u, d_nTexture_v, d_points, pointsCnt_u, + pointsCnt_v, pointSize, knots_u[knotsCnt_u - 1], knots_v[knotsCnt_v - 1], sampleCnt_u, + sampleCnt_v); + cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用 + if (recordTime) { + time_cost_device = utils::get_time_windows() - time_cost_device; + printf("GPU time cost of surface first derivative calculating for %d samples: %lf\n", sampleCnt_u * sampleCnt_v, + time_cost_device); + } + cudaFree(d_derTexture_u); + cudaFree(d_derTexture_v); +} + +__host__ void NurbsCurve::Evaluator::derivative() { + float *d_derTexture = nullptr; + const int pointsCnt = controlPoints.size(), pointSize = controlPoints[0].size(); + const int knotsCnt = knots.size(); + cudaMalloc((void **) &d_derTexture, sampleCnt * pointsCnt * sizeof(float)); + + // 构造线程层级 + dim3 block(32, 32); + dim3 grid((sampleCnt + block.x * block.y - 1) / (block.x * block.y)); + // 构造g_basisTexture线程层级 + dim3 blockTex(512); + dim3 gridTex((sampleCnt + blockTex.x - 1) / blockTex.x); + // 记录用时 + double time_cost_device; + if (recordTime) time_cost_device = utils::get_time_windows(); + g_derTexture<<>>(d_derTexture, d_nTexture1, d_knots, pointsCnt, knotsCnt, sampleCnt); + cudaDeviceSynchronize(); + g_derivative<<>>(d_derTexture, d_points, pointsCnt, pointSize, knots[knotsCnt - 1], sampleCnt); + cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用 + if (recordTime) { + time_cost_device = utils::get_time_windows() - time_cost_device; + printf("GPU time cost of curve first derivative calculating for %d samples: %lf\n", sampleCnt, + time_cost_device); + } + cudaFree(d_derTexture); +} + + +//__global__ void +//NurbsSurface::g_evaluate(const float *d_points, const float *d_knots_u, const float *d_knots_v, +// int d_pointsCnt_u, +// int d_pointsCnt_v, int d_pointSize, int d_knotsCnt_u, int d_knotsCnt_v, +// int d_sampleCnt_u, int d_sampleCnt_v) { +//// printf(" surface calculating... \n"); +// // 二维grid和二维的block +// int ix = int(blockIdx.x * blockDim.x + threadIdx.x); +// int iy = int(blockIdx.y * blockDim.y + threadIdx.y); +// +// float d_paramCeil_u = d_knots_u[d_knotsCnt_u - 1]; +// float d_paramCeil_v = d_knots_v[d_knotsCnt_v - 1]; +// +// float u = ix * d_paramCeil_u / (d_sampleCnt_u - 1); +// float v = iy * d_paramCeil_v / (d_sampleCnt_v - 1); +// +// if (u > 1.0 * d_paramCeil_u || v > 1.0 * d_paramCeil_v) { +// return; +// } +// +// int d_degree_u = d_knotsCnt_u - 1 - d_pointsCnt_u; +// int d_degree_v = d_knotsCnt_v - 1 - d_pointsCnt_v; +// // 注意,在device中,全局内存还是以malloc和free的方式分配和回收的,而不是使用cudaMalloc和cudaFree +// auto *N_Texture_U = (float *) malloc((d_degree_u + 1) * (d_knotsCnt_u - 1) * sizeof(float)); +// auto *N_Texture_V = (float *) malloc((d_degree_v + 1) * (d_knotsCnt_v - 1) * sizeof(float)); +// d_basisFunction(N_Texture_U, d_knots_u, u, d_degree_u, d_knotsCnt_u); +// d_basisFunction(N_Texture_V, d_knots_v, v, d_degree_v, d_knotsCnt_v); +// float x = 0., y = 0., z = 0.; +// for (int i = 0; i < d_pointsCnt_u; i++) { +// for (int j = 0; j < d_pointsCnt_v; j++) { +// float N_U = N_Texture_U[d_degree_u * (d_knotsCnt_u - 1) + i]; +// float N_V = N_Texture_V[d_degree_v * (d_knotsCnt_v - 1) + j]; +// int idx = (i * d_pointsCnt_v + j) * d_pointSize; +// x += N_U * N_V * d_points[idx]; +// y += N_U * N_V * d_points[idx + 1]; +// z += N_U * N_V * d_points[idx + 2]; +// } +// } +// printf("(%g, %g)-->(%g, %g, %g)\n", u, v, x, y, z); // %g输出,舍弃无意义的0 +// free(N_Texture_U); +// free(N_Texture_V); +//} __global__ void -NurbsSurface::calculate_kernel(const float *d_points, const float *d_knots_u, const float *d_knots_v, int d_pointsCnt_u, - int d_pointsCnt_v, int d_pointSize, int d_knotsCnt_u, int d_knotsCnt_v, - int d_sampleCnt_u, int d_sampleCnt_v) { - // 二维grid和二维的block - int ix = int(blockIdx.x * blockDim.x + threadIdx.x); - int iy = int(blockIdx.y * blockDim.y + threadIdx.y); +NurbsSurface::g_evaluate(const float *d_nTexture_u, const float *d_nTexture_v, const float *d_points, int d_pointsCnt_u, + int d_pointsCnt_v, int d_pointSize, float d_lastKnot_u, float d_lastKnot_v, int d_sampleCnt_u, + int d_sampleCnt_v) { - float d_paramCeil_u = d_knots_u[d_knotsCnt_u - 1]; - float d_paramCeil_v = d_knots_v[d_knotsCnt_v - 1]; + // 二维grid和二维的block + int ix = blockIdx.x * blockDim.x + threadIdx.x; + int iy = blockIdx.y * blockDim.y + threadIdx.y; - float u = ix * d_paramCeil_u / (d_sampleCnt_u - 1); - float v = iy * d_paramCeil_v / (d_sampleCnt_v - 1); + float u = ix * d_lastKnot_u / (d_sampleCnt_u - 1); + float v = iy * d_lastKnot_v / (d_sampleCnt_v - 1); - if (u > 1.0 * d_paramCeil_u || v > 1.0 * d_paramCeil_v) { + if (u > 1.0 * d_lastKnot_u || v > 1.0 * d_lastKnot_v) { return; } - int d_degree_u = d_knotsCnt_u - 1 - d_pointsCnt_u; - int d_degree_v = d_knotsCnt_v - 1 - d_pointsCnt_v; - // 注意,在device中,全局内存还是以malloc和free的方式分配和回收的,而不是使用cudaMalloc和cudaFree - auto *N_Texture_U = (float *) malloc((d_degree_u + 1) * (d_knotsCnt_u - 1) * sizeof(float)); - auto *N_Texture_V = (float *) malloc((d_degree_v + 1) * (d_knotsCnt_v - 1) * sizeof(float)); - d_basisFunction(N_Texture_U, d_knots_u, u, d_degree_u, d_knotsCnt_u); - d_basisFunction(N_Texture_V, d_knots_v, v, d_degree_v, d_knotsCnt_v); float x = 0., y = 0., z = 0.; for (int i = 0; i < d_pointsCnt_u; i++) { + float N_U = d_nTexture_u[ix * d_pointsCnt_u + i]; for (int j = 0; j < d_pointsCnt_v; j++) { - float N_U = N_Texture_U[d_degree_u * (d_knotsCnt_u - 1) + i]; - float N_V = N_Texture_V[d_degree_v * (d_knotsCnt_v - 1) + j]; + float N_V = d_nTexture_v[iy * d_pointsCnt_v + j]; int idx = (i * d_pointsCnt_v + j) * d_pointSize; x += N_U * N_V * d_points[idx]; y += N_U * N_V * d_points[idx + 1]; @@ -112,88 +306,150 @@ NurbsSurface::calculate_kernel(const float *d_points, const float *d_knots_u, co } } printf("(%g, %g)-->(%g, %g, %g)\n", u, v, x, y, z); // %g输出,舍弃无意义的0 - free(N_Texture_U); - free(N_Texture_V); } __global__ void -NurbsCurve::calculate_kernel(const float *d_points, const float *d_knots, int d_pointsCnt, int d_pointSize, - int d_knotsCnt, int d_sampleCnt) { - // 二维grid和一维的block - int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - float d_paramCeil = d_knots[d_knotsCnt - 1]; - float u = idx * d_paramCeil / (d_sampleCnt - 1); - if (u > 1.0 * d_paramCeil) { +NurbsSurface::g_derivative(const float *derTexture_u, const float *derTexture_v, const float *nTexture_u, + const float *nTexture_v, const float *d_points, int d_pointsCnt_u, int d_pointsCnt_v, + int d_pointSize, float d_lastKnot_u, float d_lastKnot_v, int d_sampleCnt_u, + int d_sampleCnt_v) { + // 二维grid和二维的block + int ix = blockIdx.x * blockDim.x + threadIdx.x; + int iy = blockIdx.y * blockDim.y + threadIdx.y; + + if (ix >= d_sampleCnt_u || iy >= d_sampleCnt_v) { return; } + float u = ix * d_lastKnot_u / (d_sampleCnt_u - 1); + float v = iy * d_lastKnot_v / (d_sampleCnt_v - 1); - int d_degree = d_knotsCnt - 1 - d_pointsCnt; - // 注意,在device中,全局内存还是以malloc和free的方式分配和回收的,而不是使用cudaMalloc和cudaFree - auto *N_Texture = (float *) malloc((d_degree + 1) * (d_knotsCnt - 1) * sizeof(float)); - d_basisFunction(N_Texture, d_knots, u, d_degree, d_knotsCnt); + float x_u = 0., y_u = 0, z_u = 0.; + float x_v = 0., y_v = 0, z_v = 0.; + + for (int i = 0; i < d_pointsCnt_u; i++) { + for (int j = 0; j < d_pointsCnt_u; j++) { + int baseIdx = (i * d_pointsCnt_v + j) * d_pointSize; + float factor_u = derTexture_u[ix * d_pointsCnt_u + i] * nTexture_v[iy * d_pointsCnt_v + j]; + float factor_v = derTexture_v[iy * d_pointsCnt_v + j] * nTexture_u[ix * d_pointsCnt_u + i]; + x_u += factor_u * d_points[baseIdx]; + y_u += factor_u * d_points[baseIdx + 1]; + z_u += factor_u * d_points[baseIdx + 2]; + + x_v += factor_v * d_points[baseIdx]; + y_v += factor_v * d_points[baseIdx + 1]; + z_v += factor_v * d_points[baseIdx + 2]; + } + } + float x_n = y_u * z_v - y_v * z_u, y_n = x_v * z_u - x_u * z_v, z_n = x_u * y_v - x_v * y_u; // 叉乘得到法向量 + printf("(%g,%g)-->u:(%g, %g, %g), v:(%g,%g,%g), normal:(%g,%g,%g)\n", u, v, x_u, y_u, z_u, x_v, y_v, z_v, + x_n, y_n, z_n); +} + +__global__ void +NurbsCurve::g_evaluate(const float *NTexture, const float *d_points, const int d_pointsCnt, + const int d_pointSize, const float d_lastKnot, const int d_sampleCnt) { +// printf(" curve calculating... \n"); + // 二维grid和一维的block +// int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; + // 二维block和一维grid + int idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; + float u = idx * d_lastKnot / (d_sampleCnt - 1); + if (u > 1.0 * d_lastKnot) { + return; + } +// +// int d_degree = d_knotsCnt - 1 - d_pointsCnt; +// // 注意,在device中,全局内存还是以malloc和free的方式分配和回收的,而不是使用cudaMalloc和cudaFree +// auto *N_dp = (float *) malloc((d_degree + 1) * (d_knotsCnt - 1) * sizeof(float)); +// d_basisFunction(N_dp, d_knots, u, d_degree, d_knotsCnt); float x = 0., y = 0., z = 0.; for (int i = 0; i < d_pointsCnt; i++) { - float N = N_Texture[d_degree * (d_knotsCnt - 1) + i]; + float N = NTexture[idx * d_pointsCnt + i]; int baseIdx = i * d_pointSize; x += N * d_points[baseIdx]; y += N * d_points[baseIdx + 1]; z += N * d_points[baseIdx + 2]; } printf("(%g)-->(%g, %g, %g)\n", u, x, y, z); // %g输出,舍弃无意义的0 - free(N_Texture); } -__host__ NurbsCurve::Evaluator::Evaluator(std::vector> controlPoints, - std::vector knots) { - this->knots = std::move(knots); - this->controlPoints = std::move(controlPoints); - recordTime = false; +__global__ void +NurbsCurve::g_derivative(const float *derTexture, const float *d_points, int d_pointsCnt, int d_pointSize, + float d_lastKnot, int d_sampleCnt) { + // 二维block和一维grid + int idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; + if (idx >= d_sampleCnt) return; + float u = idx * d_lastKnot / (d_sampleCnt - 1); + float x = 0., y = 0, z = 0.; +// printf("pointSize: %d\n", d_pointSize); + for (int i = 0; i < d_pointsCnt; i++) { + int baseIdx = i * d_pointSize; + float nFactor = derTexture[idx * d_pointsCnt + i]; + x += nFactor * d_points[baseIdx]; + y += nFactor * d_points[baseIdx + 1]; + z += nFactor * d_points[baseIdx + 2]; +// printf("(x, y, z): (%g, %g, %g)\n", d_points[baseIdx], d_points[baseIdx + 1], d_points[baseIdx + 2]); + } + printf("(%g)-->(%g, %g, %g)\n", u, x, y, z); } - -__host__ std::vector>> -NurbsCurve::Evaluator::calculate(int sampleCnt) { - // 构造指向device的controlPoints - const int pointsCnt = controlPoints.size(), pointSize = controlPoints[0].size(); - const int pointsBytes = pointsCnt * pointSize * sizeof(float); - auto *h_points = (float *) malloc(pointsBytes); - for (int i = 0; i < pointsCnt; i++) { - for (int j = 0; j < pointSize; j++) { - h_points[i * pointSize + j] = controlPoints[i][j]; - } +__global__ void g_basisTexture(float *nTexture, float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt, + int d_sampleCnt) { + // 一维grid和一维block + int idx = blockIdx.x * blockDim.x + threadIdx.x; // 采样点编号 + float d_paramCeil = d_knots[d_knotsCnt - 1]; + float u = idx * d_paramCeil / (d_sampleCnt - 1); + if (u > 1.0 * d_paramCeil) { + return; } - float *d_points; - cudaMalloc((void **) &d_points, pointsBytes); - cudaMemcpy(d_points, h_points, pointsBytes, cudaMemcpyHostToDevice); + int d_degree = d_knotsCnt - 1 - d_pointsCnt; + auto *N_dp = (float *) malloc((d_degree + 1) * (d_knotsCnt - 1) * sizeof(float)); + d_basisFunction(N_dp, d_knots, u, d_degree, d_knotsCnt); + for (int i = 0; i < d_pointsCnt; i++) { + nTexture[idx * d_pointsCnt + i] = N_dp[d_degree * (d_knotsCnt - 1) + i]; + nTexture1[idx * (d_pointsCnt + 1) + i] = N_dp[(d_degree - 1) * (d_knotsCnt - 1) + i]; +// printf("nTexture1: %g ", nTexture1[idx * (d_pointsCnt + 1) + i]); + } + nTexture1[idx * (d_pointsCnt + 1) + d_pointsCnt] = N_dp[(d_degree - 1) * (d_knotsCnt - 1) + + d_pointsCnt]; // nTexture1多记录一列数据 + free(N_dp); +} - // 构造指向device的knots - const int knotsCnt = knots.size(); - const int knotsBytes = knotsCnt * sizeof(float); - auto *h_knots = (float *) malloc(knotsBytes); - for (int i = 0; i < knotsCnt; i++) h_knots[i] = knots[i]; - float *d_knots; - cudaMalloc((void **) &d_knots, knotsBytes); - cudaMemcpy(d_knots, h_knots, knotsBytes, cudaMemcpyHostToDevice); - // 构造线程层级,调用核函数 - dim3 block(32, 32); - dim3 grid((sampleCnt + block.x * block.y - 1) / (block.x * block.y)); - // 记录用时 - double time_cost_device; - if(recordTime) time_cost_device = utils::get_time_windows(); - calculate_kernel <<>>(d_points, d_knots, pointsCnt, pointSize, knotsCnt, sampleCnt); - if(recordTime) { - cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用 - time_cost_device = utils::get_time_windows() - time_cost_device; - printf("GPU time cost of surface evaluation for %d samples: %lf\n", sampleCnt * sampleCnt, time_cost_device); +__global__ void +g_derTexture(float *derTexture, const float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt, + int d_sampleCnt) { + // 一维grid和一维block + int idx = blockIdx.x * blockDim.x + threadIdx.x; // 采样点编号 + if (idx >= d_sampleCnt) { + return; } - free(h_points); - free(h_knots); - cudaFree(d_points); - cudaFree(d_knots); + int degree = d_knotsCnt - 1 - d_pointsCnt; - cudaDeviceReset(); - return {}; +// printf("degree: %d\n", degree); + for (int i = 0; i < d_pointsCnt; i++) { + float left = d_floatEqual(d_knots[i + degree], d_knots[i]) ? 0 : + nTexture1[idx * (d_pointsCnt + 1) + i] * (degree - 1) / (d_knots[i + degree] - d_knots[i]); + float right = d_floatEqual(d_knots[i + degree + 1], d_knots[i + 1]) ? 0 : + nTexture1[idx * (d_pointsCnt + 1) + i + 1] * (degree - 1) / + (d_knots[i + degree + 1] - d_knots[i + 1]); + derTexture[idx * d_pointsCnt + i] = left - right; +// printf("<%d, %d> -- %g \n", idx, i, left - right); +// printf("nTex1: %g \n", nTexture1[idx * (d_pointsCnt + 1) + i]); + } +} + +__host__ NurbsCurve::Evaluator::Evaluator(std::vector> controlPoints, + std::vector knots) { + this->knots = std::move(knots); + this->controlPoints = std::move(controlPoints); + recordTime = false; + d_nTexture = nullptr; + d_nTexture1 = nullptr; + + sampleCnt = 0; + d_points = nullptr; + d_knots = nullptr; } @@ -203,8 +459,8 @@ __device__ void d_basisFunction(float *N_Texture, const float *knots, float u, i for (int i = 0; i + p <= m - 1; i++) { if (p == 0) { if ((u > knots[i] || d_floatEqual(u, knots[i])) && (u < knots[i + 1]) - || - d_floatEqual(u, knots[i + 1]) && d_floatEqual(u, knots[m])) { + || + d_floatEqual(u, knots[i + 1]) && d_floatEqual(u, knots[m])) { N_Texture[p * m + i] = 1.0; } else { N_Texture[p * m + i] = 0.0; @@ -226,11 +482,28 @@ __device__ bool d_floatEqual(float a, float b) { return abs(a - b) < 0.00001; } -void NurbsCurve::Evaluator::setRecordTime(bool r){ + +void NurbsCurve::Evaluator::setRecordTime(bool r) { recordTime = r; } -void NurbsSurface::Evaluator::setRecordTime(bool r){ +void NurbsSurface::Evaluator::setRecordTime(bool r) { recordTime = r; } +NurbsSurface::Evaluator::~Evaluator() { + cudaFree(d_nTexture_u); + cudaFree(d_nTexture_v); + cudaFree(d_nTexture1_u); + cudaFree(d_nTexture1_v); + cudaFree(d_points); + cudaFree(d_knots_u); + cudaFree(d_knots_v); +} + +NurbsCurve::Evaluator::~Evaluator() { + cudaFree(d_nTexture); + cudaFree(d_nTexture1); + cudaFree(d_points); + cudaFree(d_knots); +} diff --git a/NurbsEvaluator.cuh b/NurbsEvaluator.cuh index 62ad9f0..9aac536 100644 --- a/NurbsEvaluator.cuh +++ b/NurbsEvaluator.cuh @@ -8,21 +8,37 @@ namespace NurbsSurface { /** - * 曲面计算的核函数,负责计算曲面中的一个点的值 + * 曲线计算的核函数 * @param d_pointSize 点的大小(3: [x, y, z] | 4:[x, y, z, w]) */ - __global__ void - calculate_kernel(const float *d_points, const float *d_knots_u, const float *d_knots_v, int d_pointsCnt_u, - int d_pointsCnt_v, int d_pointSize, int d_knotsCnt_u, int d_knotsCnt_v, int d_sampleCnt_u, - int d_sampleCnt_v); + __global__ static void + g_evaluate(const float *d_nTexture_u, const float *d_nTexture_v, const float *d_points, int d_pointsCnt_u, + int d_pointsCnt_v, int d_pointSize, float d_lastKnot_u, float d_lastKnot_v, int d_sampleCnt_u, + int d_sampleCnt_v); + + __global__ static void + g_derivative(const float *derTexture_u, const float *derTexture_v, const float *nTexture_u, const float *nTexture_v, + const float *d_points, int d_pointsCnt_u, int d_pointsCnt_v, int d_pointSize, float d_lastKnot_u, + float d_lastKnot_v, int d_sampleCnt_u, int d_sampleCnt_v); class Evaluator { private: std::vector>> controlPoints; + float *d_points; std::vector knots_u; std::vector knots_v; + float *d_knots_u; + float *d_knots_v; bool recordTime; + float *d_nTexture_u; // u方向指向度为p时的device中的nurbs基函数矩阵 + float *d_nTexture_v; // v方向指向度为p时的device中的nurbs基函数矩阵 + float *d_nTexture1_u; // u方向指向度为p-1时的device中的nurbs基函数矩阵 + float *d_nTexture1_v; // v方向指向度为p-1时的device中的nurbs基函数矩阵 + + int sampleCnt_u; + int sampleCnt_v; + public: /** * 构造函数 @@ -40,10 +56,17 @@ namespace NurbsSurface { * @return 由 map 组成的vector{<, {x, y, z}>} */ __host__ std::vector, std::vector>> - calculate(int sampleCnt_u, int sampleCnt_v); + evaluate(int sampleCnt_u_, int sampleCnt_v_); + + /** + * 供外部CPU程序使用的、负责调用gpu并行计算切向量的方法 + */ + __host__ void derivative(); void setRecordTime(bool r); + ~Evaluator(); + }; } @@ -51,40 +74,86 @@ namespace NurbsSurface { * 曲线部分 */ namespace NurbsCurve { + + __global__ void g_test(float *nTexture); + /** * 曲线计算的核函数 * @param d_pointSize 点的大小(3: [x, y, z] | 4:[x, y, z, w]) */ __global__ static void - calculate_kernel(const float *d_points, const float *d_knots, int d_pointsCnt, int d_pointSize, int d_knotsCnt, - int d_sampleCnt); + g_evaluate(const float *NTexture, const float *d_points, int d_pointsCnt, int d_pointSize, + float d_lastKnot, int d_sampleCnt); + + __global__ static void + g_derivative(const float *derTexture, const float *d_points, int d_pointsCnt, int d_pointSize, float d_lastKnot, + int d_sampleCnt); + class Evaluator { private: std::vector> controlPoints; std::vector knots; + float *d_knots; + float *d_points; bool recordTime; + + float *d_nTexture; // 指向度为p时的device中的nurbs基函数矩阵 + float *d_nTexture1; // 指向度为p-1时的device中的nurbs基函数矩阵 + + int sampleCnt; public: /** * 构造函数 * @param controlPoints 控制点矩阵[pointsCnt][3] */ __host__ explicit Evaluator(std::vector> controlPoints, std::vector knots); + /** - * 供外部CPU程序使用的、负责调用gpu并行计算的方法 - * @param sampleCnt + * 供外部CPU程序使用的、负责调用gpu并行进行evaluation的方法 + * @param sampleCnt_ 在参数域内均匀采样的采样数,它会更新成员变量中的sampleCnt * @return 由 map 组成的vector{} */ - __host__ std::vector>> calculate(int sampleCnt); + __host__ std::vector>> evaluate(int sampleCnt_); + + /** + * 供外部CPU程序使用的、负责调用gpu并行计算切向量的方法 + */ + __host__ void derivative(); + + __host__ ~Evaluator(); + void setRecordTime(bool r); }; } + + + + +/** + * 计算并保存基函数值 + * @param nTexture 记录度数为p的基函数值,规模为【sampleCnt,pointsCnt】 + * @param nTexture1 记录度数为p-1的基函数值,规模为【sampleCnt+1,pointsCnt】 + */ +__global__ static void +g_basisTexture(float *nTexture, float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt, + int d_sampleCnt); + +/** + * 计算并保存基函数对采样点切向量的分量值 + * @param derTexture 记录度数为p的Nurbs基函数对采样点切向量的分量值,大小为【sampleCnt,pointsCnt】 + * @param nTexture1 度数为p-1的基函数值,规模为【sampleCnt+1,pointsCnt】 + */ +__global__ static void +g_derTexture(float *derTexture, const float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt, + int d_sampleCnt); + /** * 当u值已知时,根据基函数N的递推表达式,采用动态规划的方式求解N值 * @param N_Texture 结果返回在N_Texture中 */ -__device__ void d_basisFunction(float *N_Texture, const float *knots, float u, int degree, int d_knotsCnt); +__device__ void d_basisFunction(float *nTexture, const float *knots, float u, int degree, int d_knotsCnt); /** * device中判断两个浮点数是否相等。与CPU中一样,GPU中的浮点数也存在很小的误差,直接使用==判断往往容易将相等误判为不等 diff --git a/main.cpp b/main.cpp index 81234c7..efece4a 100644 --- a/main.cpp +++ b/main.cpp @@ -12,7 +12,7 @@ int main() { {0, 0, 0, 0.1, 0.5, 0.8, 1, 1, 1}, {0, 0, 0, 0.2, 0.7, 0.8, 1, 1, 1}); nurbsSurfaceEvaluator.setRecordTime(true); - nurbsSurfaceEvaluator.calculate(3, 4); + nurbsSurfaceEvaluator.evaluate(3, 4); printf("==============================\n"); @@ -25,7 +25,9 @@ int main() { {4, -5, 0}}, {0, 0, 0, 0.1, 0.5, 0.8, 1, 1, 1}); nurbsCurveEvaluator.setRecordTime(true); - nurbsCurveEvaluator.calculate(5); + nurbsCurveEvaluator.evaluate(11); + printf("\n"); +// nurbsCurveEvaluator.derivative(); return 0; } \ No newline at end of file