|
@ -7,6 +7,9 @@ |
|
|
#include "NurbsEvaluator.cuh" |
|
|
#include "NurbsEvaluator.cuh" |
|
|
#include "cstdio" |
|
|
#include "cstdio" |
|
|
#include "utils.h" |
|
|
#include "utils.h" |
|
|
|
|
|
//#include "NurbsBasis.cuh" |
|
|
|
|
|
|
|
|
|
|
|
//extern __device__ void NurbsBasis::d_basisFunction(float *N_Texture, const float *knots, float u, int degree, int d_knotsCnt) {}; |
|
|
|
|
|
|
|
|
__host__ NurbsSurface::Evaluator::Evaluator(std::vector<std::vector<std::vector<float>>> controlPoints, |
|
|
__host__ NurbsSurface::Evaluator::Evaluator(std::vector<std::vector<std::vector<float>>> controlPoints, |
|
|
std::vector<float> knots_u, std::vector<float> knots_v) { |
|
|
std::vector<float> knots_u, std::vector<float> knots_v) { |
|
@ -14,97 +17,288 @@ __host__ NurbsSurface::Evaluator::Evaluator(std::vector<std::vector<std::vector< |
|
|
this->knots_v = std::move(knots_v); |
|
|
this->knots_v = std::move(knots_v); |
|
|
this->controlPoints = std::move(controlPoints); |
|
|
this->controlPoints = std::move(controlPoints); |
|
|
recordTime = false; |
|
|
recordTime = false; |
|
|
|
|
|
d_nTexture_u = nullptr; |
|
|
|
|
|
d_nTexture_v = nullptr; |
|
|
|
|
|
d_nTexture1_u = nullptr; |
|
|
|
|
|
d_nTexture1_v = nullptr; |
|
|
|
|
|
d_knots_u = nullptr; |
|
|
|
|
|
d_knots_v = nullptr; |
|
|
|
|
|
d_points = nullptr; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
__host__ std::vector<std::map<std::pair<float, float>, std::vector<float>>> |
|
|
__host__ std::vector<std::map<std::pair<float, float>, std::vector<float>>> |
|
|
NurbsSurface::Evaluator::calculate(int sampleCnt_u, int sampleCnt_v) { |
|
|
NurbsSurface::Evaluator::evaluate(int sampleCnt_u_, int sampleCnt_v_) { |
|
|
|
|
|
sampleCnt_u = sampleCnt_u_; |
|
|
|
|
|
sampleCnt_v = sampleCnt_v_; |
|
|
|
|
|
printf("outside printf..\n"); |
|
|
|
|
|
// NurbsBasis::myPrint11(1, 3); |
|
|
|
|
|
|
|
|
// 构造指向device的controlPoints |
|
|
// 构造指向device的controlPoints |
|
|
const int pointsCntU = controlPoints.size(), pointsCntV = controlPoints[0].size(), pointSize = controlPoints[0][0].size(); |
|
|
const int pointsCnt_u = controlPoints.size(), pointsCnt_v = controlPoints[0].size(), pointSize = controlPoints[0][0].size(); |
|
|
const int pointsBytes = pointsCntU * pointsCntV * pointSize * sizeof(float); |
|
|
const int pointsBytes = pointsCnt_u * pointsCnt_v * pointSize * sizeof(float); |
|
|
auto *h_points = (float *) malloc(pointsBytes); |
|
|
auto *h_points = (float *) malloc(pointsBytes); |
|
|
for (int i = 0; i < pointsCntU; i++) { |
|
|
for (int i = 0; i < pointsCnt_u; i++) { |
|
|
for (int j = 0; j < pointsCntV; j++) { |
|
|
for (int j = 0; j < pointsCnt_v; j++) { |
|
|
for (int k = 0; k < pointSize; k++) { |
|
|
for (int k = 0; k < pointSize; k++) { |
|
|
h_points[(i * pointsCntV + j) * pointSize + k] = controlPoints[i][j][k]; |
|
|
h_points[(i * pointsCnt_v + j) * pointSize + k] = controlPoints[i][j][k]; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
float *d_points; |
|
|
|
|
|
cudaMalloc((void **) &d_points, pointsBytes); |
|
|
cudaMalloc((void **) &d_points, pointsBytes); |
|
|
cudaMemcpy(d_points, h_points, pointsBytes, cudaMemcpyHostToDevice); |
|
|
cudaMemcpy(d_points, h_points, pointsBytes, cudaMemcpyHostToDevice); |
|
|
|
|
|
|
|
|
// 构造指向device的knots |
|
|
// 构造指向device的knots |
|
|
const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size(); |
|
|
const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size(); |
|
|
const int knotsBytesU = knotsCnt_u * sizeof(float), knotsBytesV = knotsCnt_v * sizeof(float); |
|
|
const int knotsBytes_u = knotsCnt_u * sizeof(float), knotsBytes_v = knotsCnt_v * sizeof(float); |
|
|
auto *h_knotsU = (float *) malloc(knotsBytesU), *h_knotsV = (float *) malloc(knotsBytesV); |
|
|
auto *h_knots_u = (float *) malloc(knotsBytes_u), *h_knots_v = (float *) malloc(knotsBytes_v); |
|
|
for (int i = 0; i < knotsCnt_u; i++) h_knotsU[i] = knots_u[i]; |
|
|
for (int i = 0; i < knotsCnt_u; i++) h_knots_u[i] = knots_u[i]; |
|
|
for (int i = 0; i < knotsCnt_v; i++) h_knotsV[i] = knots_v[i]; |
|
|
for (int i = 0; i < knotsCnt_v; i++) h_knots_v[i] = knots_v[i]; |
|
|
|
|
|
|
|
|
float *d_knots_u; |
|
|
cudaMalloc((void **) &d_knots_u, knotsBytes_u); |
|
|
float *d_knots_v; |
|
|
cudaMalloc((void **) &d_knots_v, knotsBytes_v); |
|
|
cudaMalloc((void **) &d_knots_u, knotsBytesU); |
|
|
cudaMemcpy(d_knots_u, h_knots_u, knotsBytes_u, cudaMemcpyHostToDevice); |
|
|
cudaMalloc((void **) &d_knots_v, knotsBytesV); |
|
|
cudaMemcpy(d_knots_v, h_knots_v, knotsBytes_v, cudaMemcpyHostToDevice); |
|
|
cudaMemcpy(d_knots_u, h_knotsU, knotsBytesU, cudaMemcpyHostToDevice); |
|
|
|
|
|
cudaMemcpy(d_knots_v, h_knotsV, knotsBytesV, cudaMemcpyHostToDevice); |
|
|
// 构造nTexture |
|
|
|
|
|
cudaMalloc((void **) &d_nTexture_u, |
|
|
|
|
|
sampleCnt_u * pointsCnt_u * sizeof(float)); // 注意nTexture的大小,在算梯度时用得到i=pointsCnt + 1的基函数值 |
|
|
|
|
|
cudaMalloc((void **) &d_nTexture_v, sampleCnt_v * pointsCnt_v * sizeof(float)); |
|
|
|
|
|
|
|
|
|
|
|
// 构造nTexture1 |
|
|
|
|
|
cudaMalloc((void **) &d_nTexture1_u, sampleCnt_u * (pointsCnt_u + 1) * sizeof(float)); |
|
|
|
|
|
cudaMalloc((void **) &d_nTexture1_v, sampleCnt_v * (pointsCnt_v + 1) * sizeof(float)); |
|
|
|
|
|
|
|
|
|
|
|
// 构造g_basisTexture线程层级 |
|
|
|
|
|
dim3 blockBasis(512); |
|
|
|
|
|
dim3 gridBasis_u((sampleCnt_u + blockBasis.x - 1) / blockBasis.x); |
|
|
|
|
|
dim3 gridBasis_v((sampleCnt_v + blockBasis.x - 1) / blockBasis.x); |
|
|
|
|
|
|
|
|
// 构造线程层级,调用核函数 |
|
|
// 构造线程层级,调用核函数 |
|
|
dim3 block(32, 32); |
|
|
dim3 block(32, 32); |
|
|
dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y); |
|
|
dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y); |
|
|
// 记录用时 |
|
|
// 记录用时 |
|
|
double time_cost_device; |
|
|
double time_cost_device; |
|
|
if(recordTime) time_cost_device = utils::get_time_windows(); |
|
|
g_basisTexture<<<gridBasis_u, blockBasis>>>(d_nTexture_u, d_nTexture1_u, d_knots_u, pointsCnt_u, knotsCnt_u, |
|
|
calculate_kernel <<<grid, block>>>(d_points, d_knots_u, d_knots_v, pointsCntU, pointsCntV, pointSize, |
|
|
sampleCnt_u); |
|
|
knots_u.size(), knots_v.size(), sampleCnt_u, sampleCnt_v); |
|
|
cudaDeviceSynchronize(); |
|
|
if(recordTime) { |
|
|
g_basisTexture<<<gridBasis_v, blockBasis>>>(d_nTexture_v, d_nTexture1_v, d_knots_v, pointsCnt_v, knotsCnt_v, |
|
|
cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用 |
|
|
sampleCnt_v); |
|
|
|
|
|
cudaDeviceSynchronize(); |
|
|
|
|
|
|
|
|
|
|
|
if (recordTime) time_cost_device = utils::get_time_windows(); |
|
|
|
|
|
g_evaluate <<<grid, block>>>(d_nTexture_u, d_nTexture_v, d_points, pointsCnt_u, pointsCnt_v, pointSize, |
|
|
|
|
|
knots_u[knotsCnt_u - 1], knots_v[knotsCnt_v - 1], sampleCnt_u, sampleCnt_v); |
|
|
|
|
|
cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用 |
|
|
|
|
|
if (recordTime) { |
|
|
time_cost_device = utils::get_time_windows() - time_cost_device; |
|
|
time_cost_device = utils::get_time_windows() - time_cost_device; |
|
|
printf("GPU time cost of surface evaluation for %d samples: %lf\n", sampleCnt_u * sampleCnt_v, time_cost_device); |
|
|
printf("GPU time cost of surface evaluation for %d samples: %lf\n", sampleCnt_u * sampleCnt_v, |
|
|
|
|
|
time_cost_device); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
// 释放内存 |
|
|
// 释放内存 |
|
|
free(h_points); |
|
|
free(h_points); |
|
|
free(h_knotsU); |
|
|
free(h_knots_u); |
|
|
free(h_knotsV); |
|
|
free(h_knots_v); |
|
|
cudaFree(d_points); |
|
|
printf("First derivatives and normal vectors calculated by GPU:\n"); |
|
|
cudaFree(d_knots_u); |
|
|
derivative(); |
|
|
cudaFree(d_knots_v); |
|
|
|
|
|
|
|
|
cudaDeviceReset(); |
|
|
|
|
|
return {}; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__host__ std::vector<std::map<float, std::vector<float>>> |
|
|
|
|
|
NurbsCurve::Evaluator::evaluate(int sampleCnt_) { |
|
|
|
|
|
this->sampleCnt = sampleCnt_; |
|
|
|
|
|
// 构造指向device的controlPoints |
|
|
|
|
|
const int pointsCnt = controlPoints.size(), pointSize = controlPoints[0].size(); |
|
|
|
|
|
const int pointsBytes = pointsCnt * pointSize * sizeof(float); |
|
|
|
|
|
auto *h_points = (float *) malloc(pointsBytes); |
|
|
|
|
|
for (int i = 0; i < pointsCnt; i++) { |
|
|
|
|
|
for (int j = 0; j < pointSize; j++) { |
|
|
|
|
|
h_points[i * pointSize + j] = controlPoints[i][j]; |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
cudaMalloc((void **) &d_points, pointsBytes); |
|
|
|
|
|
cudaMemcpy(d_points, h_points, pointsBytes, cudaMemcpyHostToDevice); |
|
|
|
|
|
|
|
|
|
|
|
// 构造指向device的knots |
|
|
|
|
|
const int knotsCnt = knots.size(); |
|
|
|
|
|
const int knotsBytes = knotsCnt * sizeof(float); |
|
|
|
|
|
auto *h_knots = (float *) malloc(knotsBytes); |
|
|
|
|
|
for (int i = 0; i < knotsCnt; i++) h_knots[i] = knots[i]; |
|
|
|
|
|
cudaMalloc((void **) &d_knots, knotsBytes); |
|
|
|
|
|
cudaMemcpy(d_knots, h_knots, knotsBytes, cudaMemcpyHostToDevice); |
|
|
|
|
|
|
|
|
|
|
|
// 分配nTexture的内存。只需要GPU内存 |
|
|
|
|
|
// float *d_nTexture = nullptr; |
|
|
|
|
|
cudaMalloc((void **) &d_nTexture, |
|
|
|
|
|
sampleCnt * pointsCnt * sizeof(float)); // 注意nTexture的大小,在算梯度时用得到i=pointsCnt + 1的基函数值 |
|
|
|
|
|
|
|
|
|
|
|
// 分配nTexture1的内存。只需要GPU内存 |
|
|
|
|
|
// float *d_nTexture1 = nullptr; |
|
|
|
|
|
cudaMalloc((void **) &d_nTexture1, sampleCnt * (pointsCnt + 1) * sizeof(float)); |
|
|
|
|
|
|
|
|
|
|
|
// 构造g_basisTexture线程层级 |
|
|
|
|
|
dim3 blockBasis(512); |
|
|
|
|
|
dim3 gridBasis((sampleCnt + blockBasis.x - 1) / blockBasis.x); |
|
|
|
|
|
|
|
|
|
|
|
// 构造线程层级 |
|
|
|
|
|
dim3 block(32, 32); |
|
|
|
|
|
dim3 grid((sampleCnt + block.x * block.y - 1) / (block.x * block.y)); |
|
|
|
|
|
// 记录用时 |
|
|
|
|
|
double time_cost_device; |
|
|
|
|
|
if (recordTime) time_cost_device = utils::get_time_windows(); |
|
|
|
|
|
printf("there..\n"); |
|
|
|
|
|
g_basisTexture <<<gridBasis, blockBasis>>>(d_nTexture, d_nTexture1, d_knots, pointsCnt, knotsCnt, sampleCnt); |
|
|
|
|
|
// cudaMemcpy(d_nTextureCpy, d_nTexture, nTextureBytes, cudaMemcpyDeviceToDevice); // 有同步功能 |
|
|
|
|
|
cudaDeviceSynchronize(); |
|
|
|
|
|
printf("here..\n"); |
|
|
|
|
|
g_evaluate <<<grid, block>>>(d_nTexture, d_points, pointsCnt, pointSize, knots[knotsCnt - 1], sampleCnt); |
|
|
|
|
|
// g_test<<<1,6>>>(d_nTextureCpy); |
|
|
|
|
|
cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用 |
|
|
|
|
|
if (recordTime) { |
|
|
|
|
|
time_cost_device = utils::get_time_windows() - time_cost_device; |
|
|
|
|
|
printf("GPU time cost of curve evaluation for %d samples: %lf\n", sampleCnt, time_cost_device); |
|
|
|
|
|
} |
|
|
|
|
|
free(h_points); |
|
|
|
|
|
free(h_knots); |
|
|
|
|
|
|
|
|
|
|
|
printf("First derivatives calculated by GPU:\n"); |
|
|
|
|
|
derivative(); |
|
|
|
|
|
|
|
|
cudaDeviceReset(); |
|
|
cudaDeviceReset(); |
|
|
return {}; |
|
|
return {}; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__host__ void NurbsSurface::Evaluator::derivative() { |
|
|
|
|
|
float *d_derTexture_u = nullptr; |
|
|
|
|
|
float *d_derTexture_v = nullptr; |
|
|
|
|
|
const int pointsCnt_u = controlPoints.size(), pointsCnt_v = controlPoints[0].size(), pointSize = controlPoints[0][0].size(); |
|
|
|
|
|
const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size(); |
|
|
|
|
|
cudaMalloc((void **) &d_derTexture_u, sampleCnt_u * pointsCnt_u * sizeof(float)); |
|
|
|
|
|
cudaMalloc((void **) &d_derTexture_v, sampleCnt_v * pointsCnt_v * sizeof(float)); |
|
|
|
|
|
|
|
|
|
|
|
// 构造线程层级 |
|
|
|
|
|
dim3 block(32, 32); |
|
|
|
|
|
dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y); |
|
|
|
|
|
// 构造g_basisTexture线程层级 |
|
|
|
|
|
dim3 blockTex(512); |
|
|
|
|
|
dim3 gridTex_u((sampleCnt_u + blockTex.x - 1) / blockTex.x); |
|
|
|
|
|
dim3 gridTex_v((sampleCnt_v + blockTex.x - 1) / blockTex.x); |
|
|
|
|
|
// 记录用时 |
|
|
|
|
|
double time_cost_device; |
|
|
|
|
|
if (recordTime) time_cost_device = utils::get_time_windows(); |
|
|
|
|
|
g_derTexture<<<gridTex_u, blockTex>>>(d_derTexture_u, d_nTexture1_u, d_knots_u, pointsCnt_u, knotsCnt_u, |
|
|
|
|
|
sampleCnt_u); |
|
|
|
|
|
g_derTexture<<<gridTex_v, blockTex>>>(d_derTexture_v, d_nTexture1_v, d_knots_v, pointsCnt_v, knotsCnt_v, |
|
|
|
|
|
sampleCnt_v); |
|
|
|
|
|
cudaDeviceSynchronize(); |
|
|
|
|
|
g_derivative<<<grid, block>>>(d_derTexture_u, d_derTexture_v, d_nTexture_u, d_nTexture_v, d_points, pointsCnt_u, |
|
|
|
|
|
pointsCnt_v, pointSize, knots_u[knotsCnt_u - 1], knots_v[knotsCnt_v - 1], sampleCnt_u, |
|
|
|
|
|
sampleCnt_v); |
|
|
|
|
|
cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用 |
|
|
|
|
|
if (recordTime) { |
|
|
|
|
|
time_cost_device = utils::get_time_windows() - time_cost_device; |
|
|
|
|
|
printf("GPU time cost of surface first derivative calculating for %d samples: %lf\n", sampleCnt_u * sampleCnt_v, |
|
|
|
|
|
time_cost_device); |
|
|
|
|
|
} |
|
|
|
|
|
cudaFree(d_derTexture_u); |
|
|
|
|
|
cudaFree(d_derTexture_v); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__host__ void NurbsCurve::Evaluator::derivative() { |
|
|
|
|
|
float *d_derTexture = nullptr; |
|
|
|
|
|
const int pointsCnt = controlPoints.size(), pointSize = controlPoints[0].size(); |
|
|
|
|
|
const int knotsCnt = knots.size(); |
|
|
|
|
|
cudaMalloc((void **) &d_derTexture, sampleCnt * pointsCnt * sizeof(float)); |
|
|
|
|
|
|
|
|
|
|
|
// 构造线程层级 |
|
|
|
|
|
dim3 block(32, 32); |
|
|
|
|
|
dim3 grid((sampleCnt + block.x * block.y - 1) / (block.x * block.y)); |
|
|
|
|
|
// 构造g_basisTexture线程层级 |
|
|
|
|
|
dim3 blockTex(512); |
|
|
|
|
|
dim3 gridTex((sampleCnt + blockTex.x - 1) / blockTex.x); |
|
|
|
|
|
// 记录用时 |
|
|
|
|
|
double time_cost_device; |
|
|
|
|
|
if (recordTime) time_cost_device = utils::get_time_windows(); |
|
|
|
|
|
g_derTexture<<<gridTex, blockTex>>>(d_derTexture, d_nTexture1, d_knots, pointsCnt, knotsCnt, sampleCnt); |
|
|
|
|
|
cudaDeviceSynchronize(); |
|
|
|
|
|
g_derivative<<<grid, block>>>(d_derTexture, d_points, pointsCnt, pointSize, knots[knotsCnt - 1], sampleCnt); |
|
|
|
|
|
cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用 |
|
|
|
|
|
if (recordTime) { |
|
|
|
|
|
time_cost_device = utils::get_time_windows() - time_cost_device; |
|
|
|
|
|
printf("GPU time cost of curve first derivative calculating for %d samples: %lf\n", sampleCnt, |
|
|
|
|
|
time_cost_device); |
|
|
|
|
|
} |
|
|
|
|
|
cudaFree(d_derTexture); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//__global__ void |
|
|
|
|
|
//NurbsSurface::g_evaluate(const float *d_points, const float *d_knots_u, const float *d_knots_v, |
|
|
|
|
|
// int d_pointsCnt_u, |
|
|
|
|
|
// int d_pointsCnt_v, int d_pointSize, int d_knotsCnt_u, int d_knotsCnt_v, |
|
|
|
|
|
// int d_sampleCnt_u, int d_sampleCnt_v) { |
|
|
|
|
|
//// printf(" surface calculating... \n"); |
|
|
|
|
|
// // 二维grid和二维的block |
|
|
|
|
|
// int ix = int(blockIdx.x * blockDim.x + threadIdx.x); |
|
|
|
|
|
// int iy = int(blockIdx.y * blockDim.y + threadIdx.y); |
|
|
|
|
|
// |
|
|
|
|
|
// float d_paramCeil_u = d_knots_u[d_knotsCnt_u - 1]; |
|
|
|
|
|
// float d_paramCeil_v = d_knots_v[d_knotsCnt_v - 1]; |
|
|
|
|
|
// |
|
|
|
|
|
// float u = ix * d_paramCeil_u / (d_sampleCnt_u - 1); |
|
|
|
|
|
// float v = iy * d_paramCeil_v / (d_sampleCnt_v - 1); |
|
|
|
|
|
// |
|
|
|
|
|
// if (u > 1.0 * d_paramCeil_u || v > 1.0 * d_paramCeil_v) { |
|
|
|
|
|
// return; |
|
|
|
|
|
// } |
|
|
|
|
|
// |
|
|
|
|
|
// int d_degree_u = d_knotsCnt_u - 1 - d_pointsCnt_u; |
|
|
|
|
|
// int d_degree_v = d_knotsCnt_v - 1 - d_pointsCnt_v; |
|
|
|
|
|
// // 注意,在device中,全局内存还是以malloc和free的方式分配和回收的,而不是使用cudaMalloc和cudaFree |
|
|
|
|
|
// auto *N_Texture_U = (float *) malloc((d_degree_u + 1) * (d_knotsCnt_u - 1) * sizeof(float)); |
|
|
|
|
|
// auto *N_Texture_V = (float *) malloc((d_degree_v + 1) * (d_knotsCnt_v - 1) * sizeof(float)); |
|
|
|
|
|
// d_basisFunction(N_Texture_U, d_knots_u, u, d_degree_u, d_knotsCnt_u); |
|
|
|
|
|
// d_basisFunction(N_Texture_V, d_knots_v, v, d_degree_v, d_knotsCnt_v); |
|
|
|
|
|
// float x = 0., y = 0., z = 0.; |
|
|
|
|
|
// for (int i = 0; i < d_pointsCnt_u; i++) { |
|
|
|
|
|
// for (int j = 0; j < d_pointsCnt_v; j++) { |
|
|
|
|
|
// float N_U = N_Texture_U[d_degree_u * (d_knotsCnt_u - 1) + i]; |
|
|
|
|
|
// float N_V = N_Texture_V[d_degree_v * (d_knotsCnt_v - 1) + j]; |
|
|
|
|
|
// int idx = (i * d_pointsCnt_v + j) * d_pointSize; |
|
|
|
|
|
// x += N_U * N_V * d_points[idx]; |
|
|
|
|
|
// y += N_U * N_V * d_points[idx + 1]; |
|
|
|
|
|
// z += N_U * N_V * d_points[idx + 2]; |
|
|
|
|
|
// } |
|
|
|
|
|
// } |
|
|
|
|
|
// printf("(%g, %g)-->(%g, %g, %g)\n", u, v, x, y, z); // %g输出,舍弃无意义的0 |
|
|
|
|
|
// free(N_Texture_U); |
|
|
|
|
|
// free(N_Texture_V); |
|
|
|
|
|
//} |
|
|
|
|
|
|
|
|
__global__ void |
|
|
__global__ void |
|
|
NurbsSurface::calculate_kernel(const float *d_points, const float *d_knots_u, const float *d_knots_v, int d_pointsCnt_u, |
|
|
NurbsSurface::g_evaluate(const float *d_nTexture_u, const float *d_nTexture_v, const float *d_points, int d_pointsCnt_u, |
|
|
int d_pointsCnt_v, int d_pointSize, int d_knotsCnt_u, int d_knotsCnt_v, |
|
|
int d_pointsCnt_v, int d_pointSize, float d_lastKnot_u, float d_lastKnot_v, int d_sampleCnt_u, |
|
|
int d_sampleCnt_u, int d_sampleCnt_v) { |
|
|
int d_sampleCnt_v) { |
|
|
// 二维grid和二维的block |
|
|
|
|
|
int ix = int(blockIdx.x * blockDim.x + threadIdx.x); |
|
|
|
|
|
int iy = int(blockIdx.y * blockDim.y + threadIdx.y); |
|
|
|
|
|
|
|
|
|
|
|
float d_paramCeil_u = d_knots_u[d_knotsCnt_u - 1]; |
|
|
// 二维grid和二维的block |
|
|
float d_paramCeil_v = d_knots_v[d_knotsCnt_v - 1]; |
|
|
int ix = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
|
|
|
int iy = blockIdx.y * blockDim.y + threadIdx.y; |
|
|
|
|
|
|
|
|
float u = ix * d_paramCeil_u / (d_sampleCnt_u - 1); |
|
|
float u = ix * d_lastKnot_u / (d_sampleCnt_u - 1); |
|
|
float v = iy * d_paramCeil_v / (d_sampleCnt_v - 1); |
|
|
float v = iy * d_lastKnot_v / (d_sampleCnt_v - 1); |
|
|
|
|
|
|
|
|
if (u > 1.0 * d_paramCeil_u || v > 1.0 * d_paramCeil_v) { |
|
|
if (u > 1.0 * d_lastKnot_u || v > 1.0 * d_lastKnot_v) { |
|
|
return; |
|
|
return; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
int d_degree_u = d_knotsCnt_u - 1 - d_pointsCnt_u; |
|
|
|
|
|
int d_degree_v = d_knotsCnt_v - 1 - d_pointsCnt_v; |
|
|
|
|
|
// 注意,在device中,全局内存还是以malloc和free的方式分配和回收的,而不是使用cudaMalloc和cudaFree |
|
|
|
|
|
auto *N_Texture_U = (float *) malloc((d_degree_u + 1) * (d_knotsCnt_u - 1) * sizeof(float)); |
|
|
|
|
|
auto *N_Texture_V = (float *) malloc((d_degree_v + 1) * (d_knotsCnt_v - 1) * sizeof(float)); |
|
|
|
|
|
d_basisFunction(N_Texture_U, d_knots_u, u, d_degree_u, d_knotsCnt_u); |
|
|
|
|
|
d_basisFunction(N_Texture_V, d_knots_v, v, d_degree_v, d_knotsCnt_v); |
|
|
|
|
|
float x = 0., y = 0., z = 0.; |
|
|
float x = 0., y = 0., z = 0.; |
|
|
for (int i = 0; i < d_pointsCnt_u; i++) { |
|
|
for (int i = 0; i < d_pointsCnt_u; i++) { |
|
|
|
|
|
float N_U = d_nTexture_u[ix * d_pointsCnt_u + i]; |
|
|
for (int j = 0; j < d_pointsCnt_v; j++) { |
|
|
for (int j = 0; j < d_pointsCnt_v; j++) { |
|
|
float N_U = N_Texture_U[d_degree_u * (d_knotsCnt_u - 1) + i]; |
|
|
float N_V = d_nTexture_v[iy * d_pointsCnt_v + j]; |
|
|
float N_V = N_Texture_V[d_degree_v * (d_knotsCnt_v - 1) + j]; |
|
|
|
|
|
int idx = (i * d_pointsCnt_v + j) * d_pointSize; |
|
|
int idx = (i * d_pointsCnt_v + j) * d_pointSize; |
|
|
x += N_U * N_V * d_points[idx]; |
|
|
x += N_U * N_V * d_points[idx]; |
|
|
y += N_U * N_V * d_points[idx + 1]; |
|
|
y += N_U * N_V * d_points[idx + 1]; |
|
@ -112,88 +306,150 @@ NurbsSurface::calculate_kernel(const float *d_points, const float *d_knots_u, co |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
printf("(%g, %g)-->(%g, %g, %g)\n", u, v, x, y, z); // %g输出,舍弃无意义的0 |
|
|
printf("(%g, %g)-->(%g, %g, %g)\n", u, v, x, y, z); // %g输出,舍弃无意义的0 |
|
|
free(N_Texture_U); |
|
|
|
|
|
free(N_Texture_V); |
|
|
|
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
__global__ void |
|
|
__global__ void |
|
|
NurbsCurve::calculate_kernel(const float *d_points, const float *d_knots, int d_pointsCnt, int d_pointSize, |
|
|
NurbsSurface::g_derivative(const float *derTexture_u, const float *derTexture_v, const float *nTexture_u, |
|
|
int d_knotsCnt, int d_sampleCnt) { |
|
|
const float *nTexture_v, const float *d_points, int d_pointsCnt_u, int d_pointsCnt_v, |
|
|
// 二维grid和一维的block |
|
|
int d_pointSize, float d_lastKnot_u, float d_lastKnot_v, int d_sampleCnt_u, |
|
|
int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; |
|
|
int d_sampleCnt_v) { |
|
|
float d_paramCeil = d_knots[d_knotsCnt - 1]; |
|
|
// 二维grid和二维的block |
|
|
float u = idx * d_paramCeil / (d_sampleCnt - 1); |
|
|
int ix = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (u > 1.0 * d_paramCeil) { |
|
|
int iy = blockIdx.y * blockDim.y + threadIdx.y; |
|
|
|
|
|
|
|
|
|
|
|
if (ix >= d_sampleCnt_u || iy >= d_sampleCnt_v) { |
|
|
return; |
|
|
return; |
|
|
} |
|
|
} |
|
|
|
|
|
float u = ix * d_lastKnot_u / (d_sampleCnt_u - 1); |
|
|
|
|
|
float v = iy * d_lastKnot_v / (d_sampleCnt_v - 1); |
|
|
|
|
|
|
|
|
int d_degree = d_knotsCnt - 1 - d_pointsCnt; |
|
|
float x_u = 0., y_u = 0, z_u = 0.; |
|
|
// 注意,在device中,全局内存还是以malloc和free的方式分配和回收的,而不是使用cudaMalloc和cudaFree |
|
|
float x_v = 0., y_v = 0, z_v = 0.; |
|
|
auto *N_Texture = (float *) malloc((d_degree + 1) * (d_knotsCnt - 1) * sizeof(float)); |
|
|
|
|
|
d_basisFunction(N_Texture, d_knots, u, d_degree, d_knotsCnt); |
|
|
for (int i = 0; i < d_pointsCnt_u; i++) { |
|
|
|
|
|
for (int j = 0; j < d_pointsCnt_u; j++) { |
|
|
|
|
|
int baseIdx = (i * d_pointsCnt_v + j) * d_pointSize; |
|
|
|
|
|
float factor_u = derTexture_u[ix * d_pointsCnt_u + i] * nTexture_v[iy * d_pointsCnt_v + j]; |
|
|
|
|
|
float factor_v = derTexture_v[iy * d_pointsCnt_v + j] * nTexture_u[ix * d_pointsCnt_u + i]; |
|
|
|
|
|
x_u += factor_u * d_points[baseIdx]; |
|
|
|
|
|
y_u += factor_u * d_points[baseIdx + 1]; |
|
|
|
|
|
z_u += factor_u * d_points[baseIdx + 2]; |
|
|
|
|
|
|
|
|
|
|
|
x_v += factor_v * d_points[baseIdx]; |
|
|
|
|
|
y_v += factor_v * d_points[baseIdx + 1]; |
|
|
|
|
|
z_v += factor_v * d_points[baseIdx + 2]; |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
float x_n = y_u * z_v - y_v * z_u, y_n = x_v * z_u - x_u * z_v, z_n = x_u * y_v - x_v * y_u; // 叉乘得到法向量 |
|
|
|
|
|
printf("(%g,%g)-->u:(%g, %g, %g), v:(%g,%g,%g), normal:(%g,%g,%g)\n", u, v, x_u, y_u, z_u, x_v, y_v, z_v, |
|
|
|
|
|
x_n, y_n, z_n); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__global__ void |
|
|
|
|
|
NurbsCurve::g_evaluate(const float *NTexture, const float *d_points, const int d_pointsCnt, |
|
|
|
|
|
const int d_pointSize, const float d_lastKnot, const int d_sampleCnt) { |
|
|
|
|
|
// printf(" curve calculating... \n"); |
|
|
|
|
|
// 二维grid和一维的block |
|
|
|
|
|
// int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; |
|
|
|
|
|
// 二维block和一维grid |
|
|
|
|
|
int idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; |
|
|
|
|
|
float u = idx * d_lastKnot / (d_sampleCnt - 1); |
|
|
|
|
|
if (u > 1.0 * d_lastKnot) { |
|
|
|
|
|
return; |
|
|
|
|
|
} |
|
|
|
|
|
// |
|
|
|
|
|
// int d_degree = d_knotsCnt - 1 - d_pointsCnt; |
|
|
|
|
|
// // 注意,在device中,全局内存还是以malloc和free的方式分配和回收的,而不是使用cudaMalloc和cudaFree |
|
|
|
|
|
// auto *N_dp = (float *) malloc((d_degree + 1) * (d_knotsCnt - 1) * sizeof(float)); |
|
|
|
|
|
// d_basisFunction(N_dp, d_knots, u, d_degree, d_knotsCnt); |
|
|
float x = 0., y = 0., z = 0.; |
|
|
float x = 0., y = 0., z = 0.; |
|
|
for (int i = 0; i < d_pointsCnt; i++) { |
|
|
for (int i = 0; i < d_pointsCnt; i++) { |
|
|
float N = N_Texture[d_degree * (d_knotsCnt - 1) + i]; |
|
|
float N = NTexture[idx * d_pointsCnt + i]; |
|
|
int baseIdx = i * d_pointSize; |
|
|
int baseIdx = i * d_pointSize; |
|
|
x += N * d_points[baseIdx]; |
|
|
x += N * d_points[baseIdx]; |
|
|
y += N * d_points[baseIdx + 1]; |
|
|
y += N * d_points[baseIdx + 1]; |
|
|
z += N * d_points[baseIdx + 2]; |
|
|
z += N * d_points[baseIdx + 2]; |
|
|
} |
|
|
} |
|
|
printf("(%g)-->(%g, %g, %g)\n", u, x, y, z); // %g输出,舍弃无意义的0 |
|
|
printf("(%g)-->(%g, %g, %g)\n", u, x, y, z); // %g输出,舍弃无意义的0 |
|
|
free(N_Texture); |
|
|
|
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
__host__ NurbsCurve::Evaluator::Evaluator(std::vector<std::vector<float>> controlPoints, |
|
|
__global__ void |
|
|
std::vector<float> knots) { |
|
|
NurbsCurve::g_derivative(const float *derTexture, const float *d_points, int d_pointsCnt, int d_pointSize, |
|
|
this->knots = std::move(knots); |
|
|
float d_lastKnot, int d_sampleCnt) { |
|
|
this->controlPoints = std::move(controlPoints); |
|
|
// 二维block和一维grid |
|
|
recordTime = false; |
|
|
int idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; |
|
|
|
|
|
if (idx >= d_sampleCnt) return; |
|
|
|
|
|
float u = idx * d_lastKnot / (d_sampleCnt - 1); |
|
|
|
|
|
float x = 0., y = 0, z = 0.; |
|
|
|
|
|
// printf("pointSize: %d\n", d_pointSize); |
|
|
|
|
|
for (int i = 0; i < d_pointsCnt; i++) { |
|
|
|
|
|
int baseIdx = i * d_pointSize; |
|
|
|
|
|
float nFactor = derTexture[idx * d_pointsCnt + i]; |
|
|
|
|
|
x += nFactor * d_points[baseIdx]; |
|
|
|
|
|
y += nFactor * d_points[baseIdx + 1]; |
|
|
|
|
|
z += nFactor * d_points[baseIdx + 2]; |
|
|
|
|
|
// printf("(x, y, z): (%g, %g, %g)\n", d_points[baseIdx], d_points[baseIdx + 1], d_points[baseIdx + 2]); |
|
|
|
|
|
} |
|
|
|
|
|
printf("(%g)-->(%g, %g, %g)\n", u, x, y, z); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__global__ void g_basisTexture(float *nTexture, float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt, |
|
|
__host__ std::vector<std::map<float, std::vector<float>>> |
|
|
int d_sampleCnt) { |
|
|
NurbsCurve::Evaluator::calculate(int sampleCnt) { |
|
|
// 一维grid和一维block |
|
|
// 构造指向device的controlPoints |
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x; // 采样点编号 |
|
|
const int pointsCnt = controlPoints.size(), pointSize = controlPoints[0].size(); |
|
|
float d_paramCeil = d_knots[d_knotsCnt - 1]; |
|
|
const int pointsBytes = pointsCnt * pointSize * sizeof(float); |
|
|
float u = idx * d_paramCeil / (d_sampleCnt - 1); |
|
|
auto *h_points = (float *) malloc(pointsBytes); |
|
|
if (u > 1.0 * d_paramCeil) { |
|
|
for (int i = 0; i < pointsCnt; i++) { |
|
|
return; |
|
|
for (int j = 0; j < pointSize; j++) { |
|
|
|
|
|
h_points[i * pointSize + j] = controlPoints[i][j]; |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
} |
|
|
float *d_points; |
|
|
int d_degree = d_knotsCnt - 1 - d_pointsCnt; |
|
|
cudaMalloc((void **) &d_points, pointsBytes); |
|
|
auto *N_dp = (float *) malloc((d_degree + 1) * (d_knotsCnt - 1) * sizeof(float)); |
|
|
cudaMemcpy(d_points, h_points, pointsBytes, cudaMemcpyHostToDevice); |
|
|
d_basisFunction(N_dp, d_knots, u, d_degree, d_knotsCnt); |
|
|
|
|
|
for (int i = 0; i < d_pointsCnt; i++) { |
|
|
|
|
|
nTexture[idx * d_pointsCnt + i] = N_dp[d_degree * (d_knotsCnt - 1) + i]; |
|
|
|
|
|
nTexture1[idx * (d_pointsCnt + 1) + i] = N_dp[(d_degree - 1) * (d_knotsCnt - 1) + i]; |
|
|
|
|
|
// printf("nTexture1: %g ", nTexture1[idx * (d_pointsCnt + 1) + i]); |
|
|
|
|
|
} |
|
|
|
|
|
nTexture1[idx * (d_pointsCnt + 1) + d_pointsCnt] = N_dp[(d_degree - 1) * (d_knotsCnt - 1) + |
|
|
|
|
|
d_pointsCnt]; // nTexture1多记录一列数据 |
|
|
|
|
|
free(N_dp); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
// 构造指向device的knots |
|
|
|
|
|
const int knotsCnt = knots.size(); |
|
|
|
|
|
const int knotsBytes = knotsCnt * sizeof(float); |
|
|
|
|
|
auto *h_knots = (float *) malloc(knotsBytes); |
|
|
|
|
|
for (int i = 0; i < knotsCnt; i++) h_knots[i] = knots[i]; |
|
|
|
|
|
float *d_knots; |
|
|
|
|
|
cudaMalloc((void **) &d_knots, knotsBytes); |
|
|
|
|
|
cudaMemcpy(d_knots, h_knots, knotsBytes, cudaMemcpyHostToDevice); |
|
|
|
|
|
|
|
|
|
|
|
// 构造线程层级,调用核函数 |
|
|
__global__ void |
|
|
dim3 block(32, 32); |
|
|
g_derTexture(float *derTexture, const float *nTexture1, const float *d_knots, int d_pointsCnt, int d_knotsCnt, |
|
|
dim3 grid((sampleCnt + block.x * block.y - 1) / (block.x * block.y)); |
|
|
int d_sampleCnt) { |
|
|
// 记录用时 |
|
|
// 一维grid和一维block |
|
|
double time_cost_device; |
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x; // 采样点编号 |
|
|
if(recordTime) time_cost_device = utils::get_time_windows(); |
|
|
if (idx >= d_sampleCnt) { |
|
|
calculate_kernel <<<grid, block>>>(d_points, d_knots, pointsCnt, pointSize, knotsCnt, sampleCnt); |
|
|
return; |
|
|
if(recordTime) { |
|
|
|
|
|
cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用 |
|
|
|
|
|
time_cost_device = utils::get_time_windows() - time_cost_device; |
|
|
|
|
|
printf("GPU time cost of surface evaluation for %d samples: %lf\n", sampleCnt * sampleCnt, time_cost_device); |
|
|
|
|
|
} |
|
|
} |
|
|
free(h_points); |
|
|
int degree = d_knotsCnt - 1 - d_pointsCnt; |
|
|
free(h_knots); |
|
|
|
|
|
cudaFree(d_points); |
|
|
|
|
|
cudaFree(d_knots); |
|
|
|
|
|
|
|
|
|
|
|
cudaDeviceReset(); |
|
|
// printf("degree: %d\n", degree); |
|
|
return {}; |
|
|
for (int i = 0; i < d_pointsCnt; i++) { |
|
|
|
|
|
float left = d_floatEqual(d_knots[i + degree], d_knots[i]) ? 0 : |
|
|
|
|
|
nTexture1[idx * (d_pointsCnt + 1) + i] * (degree - 1) / (d_knots[i + degree] - d_knots[i]); |
|
|
|
|
|
float right = d_floatEqual(d_knots[i + degree + 1], d_knots[i + 1]) ? 0 : |
|
|
|
|
|
nTexture1[idx * (d_pointsCnt + 1) + i + 1] * (degree - 1) / |
|
|
|
|
|
(d_knots[i + degree + 1] - d_knots[i + 1]); |
|
|
|
|
|
derTexture[idx * d_pointsCnt + i] = left - right; |
|
|
|
|
|
// printf("<%d, %d> -- %g \n", idx, i, left - right); |
|
|
|
|
|
// printf("nTex1: %g \n", nTexture1[idx * (d_pointsCnt + 1) + i]); |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__host__ NurbsCurve::Evaluator::Evaluator(std::vector<std::vector<float>> controlPoints, |
|
|
|
|
|
std::vector<float> knots) { |
|
|
|
|
|
this->knots = std::move(knots); |
|
|
|
|
|
this->controlPoints = std::move(controlPoints); |
|
|
|
|
|
recordTime = false; |
|
|
|
|
|
d_nTexture = nullptr; |
|
|
|
|
|
d_nTexture1 = nullptr; |
|
|
|
|
|
|
|
|
|
|
|
sampleCnt = 0; |
|
|
|
|
|
d_points = nullptr; |
|
|
|
|
|
d_knots = nullptr; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -203,8 +459,8 @@ __device__ void d_basisFunction(float *N_Texture, const float *knots, float u, i |
|
|
for (int i = 0; i + p <= m - 1; i++) { |
|
|
for (int i = 0; i + p <= m - 1; i++) { |
|
|
if (p == 0) { |
|
|
if (p == 0) { |
|
|
if ((u > knots[i] || d_floatEqual(u, knots[i])) && (u < knots[i + 1]) |
|
|
if ((u > knots[i] || d_floatEqual(u, knots[i])) && (u < knots[i + 1]) |
|
|
|| |
|
|
|| |
|
|
d_floatEqual(u, knots[i + 1]) && d_floatEqual(u, knots[m])) { |
|
|
d_floatEqual(u, knots[i + 1]) && d_floatEqual(u, knots[m])) { |
|
|
N_Texture[p * m + i] = 1.0; |
|
|
N_Texture[p * m + i] = 1.0; |
|
|
} else { |
|
|
} else { |
|
|
N_Texture[p * m + i] = 0.0; |
|
|
N_Texture[p * m + i] = 0.0; |
|
@ -226,11 +482,28 @@ __device__ bool d_floatEqual(float a, float b) { |
|
|
return abs(a - b) < 0.00001; |
|
|
return abs(a - b) < 0.00001; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void NurbsCurve::Evaluator::setRecordTime(bool r){ |
|
|
|
|
|
|
|
|
void NurbsCurve::Evaluator::setRecordTime(bool r) { |
|
|
recordTime = r; |
|
|
recordTime = r; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void NurbsSurface::Evaluator::setRecordTime(bool r){ |
|
|
void NurbsSurface::Evaluator::setRecordTime(bool r) { |
|
|
recordTime = r; |
|
|
recordTime = r; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
NurbsSurface::Evaluator::~Evaluator() { |
|
|
|
|
|
cudaFree(d_nTexture_u); |
|
|
|
|
|
cudaFree(d_nTexture_v); |
|
|
|
|
|
cudaFree(d_nTexture1_u); |
|
|
|
|
|
cudaFree(d_nTexture1_v); |
|
|
|
|
|
cudaFree(d_points); |
|
|
|
|
|
cudaFree(d_knots_u); |
|
|
|
|
|
cudaFree(d_knots_v); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
NurbsCurve::Evaluator::~Evaluator() { |
|
|
|
|
|
cudaFree(d_nTexture); |
|
|
|
|
|
cudaFree(d_nTexture1); |
|
|
|
|
|
cudaFree(d_points); |
|
|
|
|
|
cudaFree(d_knots); |
|
|
|
|
|
} |
|
|