// // Created by 14727 on 2022/12/9. // #include "device/Nurbs/nurbs_surface.cuh" #include "device/Nurbs/nurbs_common.cuh" #include "utils.h" #include "device/Nurbs/bvh.cuh" #include "device/device_utils.cuh" #include "tinynurbs/tinynurbs.h" __global__ void NurbsSurface::g_evaluate(float *res, const float *d_nTexture_u, const float *d_nTexture_v, const float *d_points, int d_pointsCnt_u, int d_pointsCnt_v, int d_POINT_SIZE, float d_lastKnot_u, float d_lastKnot_v, int d_sampleCnt_u, int d_sampleCnt_v) { // 二维grid和二维的block int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; float u = ix * d_lastKnot_u / (d_sampleCnt_u - 1); float v = iy * d_lastKnot_v / (d_sampleCnt_v - 1); if (u > 1.0 * d_lastKnot_u || v > 1.0 * d_lastKnot_v) { return; } float x = 0., y = 0., z = 0., sumW = 0.; for (int i = 0; i < d_pointsCnt_u; i++) { float N_U = d_nTexture_u[ix * d_pointsCnt_u + i]; for (int j = 0; j < d_pointsCnt_v; j++) { float N_V = d_nTexture_v[iy * d_pointsCnt_v + j]; int idx = (i * d_pointsCnt_v + j) * d_POINT_SIZE; float w = d_points[idx + 3]; x += N_U * N_V * w * d_points[idx]; y += N_U * N_V * w * d_points[idx + 1]; z += N_U * N_V * w * d_points[idx + 2]; sumW += N_U * N_V * w; } } x = x / sumW; y = y / sumW; z = z / sumW; int baseIdx = (ix * d_sampleCnt_v + iy) * 3; res[baseIdx] = x; res[baseIdx + 1] = y; res[baseIdx + 2] = z; // printf("(%d, %d)-->(%g, %g, %g)\n", ix, iy, x, y, z); // %g输出,舍弃无意义的0 } __global__ void NurbsSurface::g_derivative(float *derivatives, float *normals, const float *derTexture_u, const float *derTexture_v, const float *nTexture_u, const float *nTexture_v, const float *d_points, int d_pointsCnt_u, int d_pointsCnt_v, int d_POINT_SIZE, float d_lastKnot_u, float d_lastKnot_v, int d_sampleCnt_u, int d_sampleCnt_v) { // 二维grid和二维的block int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; if (ix >= d_sampleCnt_u || iy >= d_sampleCnt_v) { return; } float u = ix * d_lastKnot_u / (d_sampleCnt_u - 1); float v = iy * d_lastKnot_v / (d_sampleCnt_v - 1); float nubsPdx_u = 0., nubsPdy_u = 0, nubsPdz_u = 0., nubsPdw_u = 0.; float nubsPdx_v = 0., nubsPdy_v = 0, nubsPdz_v = 0., nubsPdw_v = 0.; for (int i = 0; i < d_pointsCnt_u; i++) { for (int j = 0; j < d_pointsCnt_u; j++) { int baseIdx = (i * d_pointsCnt_v + j) * d_POINT_SIZE; float factor_u = derTexture_u[ix * d_pointsCnt_u + i] * nTexture_v[iy * d_pointsCnt_v + j]; float factor_v = derTexture_v[iy * d_pointsCnt_v + j] * nTexture_u[ix * d_pointsCnt_u + i]; float wij = d_points[baseIdx + 3]; nubsPdx_u += factor_u * wij * d_points[baseIdx]; nubsPdy_u += factor_u * wij * d_points[baseIdx + 1]; nubsPdz_u += factor_u * wij * d_points[baseIdx + 2]; nubsPdw_u += factor_u * wij; nubsPdx_v += factor_v * wij * d_points[baseIdx]; nubsPdy_v += factor_v * wij * d_points[baseIdx + 1]; nubsPdz_v += factor_v * wij * d_points[baseIdx + 2]; nubsPdw_v += factor_v * wij; } } float x = 0., y = 0., z = 0., w = 0.; for (int i = 0; i < d_pointsCnt_u; i++) { float N_U = nTexture_u[ix * d_pointsCnt_u + i]; for (int j = 0; j < d_pointsCnt_v; j++) { float N_V = nTexture_v[iy * d_pointsCnt_v + j]; int idx = (i * d_pointsCnt_v + j) * d_POINT_SIZE; float wij = d_points[idx + 3]; x += N_U * N_V * wij * d_points[idx]; y += N_U * N_V * wij * d_points[idx + 1]; z += N_U * N_V * wij * d_points[idx + 2]; w += N_U * N_V * wij; } } float w2 = w * w; float pdx_u = (nubsPdx_u * w - x * nubsPdw_u) / w2; float pdy_u = (nubsPdy_u * w - y * nubsPdw_u) / w2; float pdz_u = (nubsPdz_u * w - z * nubsPdw_u) / w2; float pdx_v = (nubsPdx_v * w - x * nubsPdw_v) / w2; float pdy_v = (nubsPdy_v * w - y * nubsPdw_v) / w2; float pdz_v = (nubsPdz_v * w - z * nubsPdw_v) / w2; // float pdz_u = (nubsPdz_u * w - z ) int baseIdx = (ix * d_sampleCnt_v + iy) * 6; derivatives[baseIdx] = pdx_u; derivatives[baseIdx + 1] = pdy_u; derivatives[baseIdx + 2] = pdz_u; derivatives[baseIdx + 3] = pdx_v; derivatives[baseIdx + 4] = pdy_v; derivatives[baseIdx + 5] = pdz_v; // 叉乘得到法向量 baseIdx = (ix * d_sampleCnt_v + iy) * 3; normals[baseIdx] = pdy_u * pdz_v - pdy_v * pdz_u; normals[baseIdx + 1] = pdx_v * pdz_u - pdx_u * pdz_v; normals[baseIdx + 2] = pdx_u * pdy_v - pdx_v * pdy_u; normalization(normals[baseIdx], normals[baseIdx + 1], normals[baseIdx + 2]); // if ((ix == 8 && iy == 9) || (ix == 7 && iy == 9) || (ix == 9 && iy == 9) || (ix == 8 && iy == 8) || // (ix == 8 && iy == 10)) // printf("(%g,%g)-->u:(%g, %g, %g), v:(%g,%g,%g), normal:(%g,%g,%g)\n", u, v, pdx_u, pdy_u, pdz_u, pdx_v, pdy_v, // pdz_v, normals[baseIdx], normals[baseIdx + 1], normals[baseIdx + 2]); } __global__ void NurbsSurface::g_curvature(const float *derivatives, const int sampleCnt_u, const int sampleCnt_v, float lastKnot_u, float lastKnot_v, float *ms, float *k) { // 二维grid和二维的block int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; if (ix >= sampleCnt_u || iy >= sampleCnt_v) { return; } float step_u = lastKnot_u / (sampleCnt_u - 1), step_v = lastKnot_v / (sampleCnt_v - 1); float u = ix * step_u, v = iy * step_v; int baseIdx = (ix * sampleCnt_v + iy) * 6; int lastBaseIdx_u = ((ix - 1) * sampleCnt_v + iy) * 6, nextBaseIdx_u = ((ix + 1) * sampleCnt_v + iy) * 6; int lastBaseIdx_v = (ix * sampleCnt_v + iy - 1) * 6, nextBaseIdx_v = (ix * sampleCnt_v + iy + 1) * 6; // printf("(%g,%g)-->u:(%g, %g, %g), v:(%g,%g,%g)\n", u, v, derivatives[baseIdx], derivatives[baseIdx + 1], // derivatives[baseIdx + 2], derivatives[baseIdx + 3], derivatives[baseIdx + 4], derivatives[baseIdx + 5]); float sndPdx_uu, sndPdy_uu, sndPdz_uu, sndPdx_vv, sndPdy_vv, sndPdz_vv; // 二阶导 float sndPdx_uv, sndPdy_uv, sndPdz_uv, sndPdx_vu, sndPdy_vu, sndPdz_vu; if (ix == 0) { sndPdx_uu = (derivatives[nextBaseIdx_u] - derivatives[baseIdx]) / step_u; sndPdy_uu = (derivatives[nextBaseIdx_u + 1] - derivatives[baseIdx + 1]) / step_u; sndPdz_uu = (derivatives[nextBaseIdx_u + 2] - derivatives[baseIdx + 2]) / step_u; sndPdx_vu = (derivatives[nextBaseIdx_u + 3] - derivatives[baseIdx + 3]) / step_u; sndPdy_vu = (derivatives[nextBaseIdx_u + 4] - derivatives[baseIdx + 4]) / step_u; sndPdz_vu = (derivatives[nextBaseIdx_u + 5] - derivatives[baseIdx + 5]) / step_u; } else if (ix == sampleCnt_u - 1) { sndPdx_uu = (derivatives[baseIdx] - derivatives[lastBaseIdx_u]) / step_u; sndPdy_uu = (derivatives[baseIdx + 1] - derivatives[lastBaseIdx_u + 1]) / step_u; sndPdz_uu = (derivatives[baseIdx + 2] - derivatives[lastBaseIdx_u + 2]) / step_u; sndPdx_vu = (derivatives[baseIdx + 3] - derivatives[lastBaseIdx_u + 3]) / step_u; sndPdy_vu = (derivatives[baseIdx + 4] - derivatives[lastBaseIdx_u + 4]) / step_u; sndPdz_vu = (derivatives[baseIdx + 5] - derivatives[lastBaseIdx_u + 5]) / step_u; } else { sndPdx_uu = (derivatives[nextBaseIdx_u] - derivatives[lastBaseIdx_u]) / (2 * step_u); sndPdy_uu = (derivatives[nextBaseIdx_u + 1] - derivatives[lastBaseIdx_u + 1]) / (2 * step_u); sndPdz_uu = (derivatives[nextBaseIdx_u + 2] - derivatives[lastBaseIdx_u + 2]) / (2 * step_u); sndPdx_vu = (derivatives[nextBaseIdx_u + 3] - derivatives[lastBaseIdx_u + 3]) / (2 * step_u); sndPdy_vu = (derivatives[nextBaseIdx_u + 4] - derivatives[lastBaseIdx_u + 4]) / (2 * step_u); sndPdz_vu = (derivatives[nextBaseIdx_u + 5] - derivatives[lastBaseIdx_u + 5]) / (2 * step_u); } if (iy == 0) { sndPdx_vv = (derivatives[nextBaseIdx_v + 3] - derivatives[baseIdx + 3]) / step_v; sndPdy_vv = (derivatives[nextBaseIdx_v + 4] - derivatives[baseIdx + 4]) / step_v; sndPdz_vv = (derivatives[nextBaseIdx_v + 5] - derivatives[baseIdx + 5]) / step_v; sndPdx_uv = (derivatives[nextBaseIdx_v] - derivatives[baseIdx]) / step_v; sndPdy_uv = (derivatives[nextBaseIdx_v + 1] - derivatives[baseIdx + 1]) / step_v; sndPdz_uv = (derivatives[nextBaseIdx_v + 2] - derivatives[baseIdx + 2]) / step_v; } else if (iy == sampleCnt_v - 1) { sndPdx_vv = (derivatives[baseIdx + 3] - derivatives[lastBaseIdx_v + 3]) / step_v; sndPdy_vv = (derivatives[baseIdx + 4] - derivatives[lastBaseIdx_v + 4]) / step_v; sndPdz_vv = (derivatives[baseIdx + 5] - derivatives[lastBaseIdx_v + 5]) / step_v; sndPdx_uv = (derivatives[baseIdx] - derivatives[lastBaseIdx_v]) / step_v; sndPdy_uv = (derivatives[baseIdx + 1] - derivatives[lastBaseIdx_v + 1]) / step_v; sndPdz_uv = (derivatives[baseIdx + 2] - derivatives[lastBaseIdx_v + 2]) / step_v; } else { sndPdx_vv = (derivatives[nextBaseIdx_v + 3] - derivatives[lastBaseIdx_v + 3]) / (2 * step_v); sndPdy_vv = (derivatives[nextBaseIdx_v + 4] - derivatives[lastBaseIdx_v + 4]) / (2 * step_v); sndPdz_vv = (derivatives[nextBaseIdx_v + 5] - derivatives[lastBaseIdx_v + 5]) / (2 * step_v); sndPdx_uv = (derivatives[nextBaseIdx_v] - derivatives[lastBaseIdx_v]) / (2 * step_v); sndPdy_uv = (derivatives[nextBaseIdx_v + 1] - derivatives[lastBaseIdx_v + 1]) / (2 * step_v); sndPdz_uv = (derivatives[nextBaseIdx_v + 2] - derivatives[lastBaseIdx_v + 2]) / (2 * step_v); } float uvx = (sndPdx_uv + sndPdx_vu) / 2, uvy = (sndPdy_uv + sndPdy_vu) / 2, uvz = (sndPdz_uv + sndPdz_vu) / 2; // normalization(sndPdx_uv, sndPdy_uv, sndPdz_uv); // normalization(sndPdx_vu, sndPdy_vu, sndPdz_vu); // normalization(sndPdx_uu, sndPdy_uu, sndPdz_uu); // normalization(uvx, uvy, uvz); // normalization(sndPdx_vv, sndPdy_vv, sndPdz_vv); // if (ix == 8 && iy == 9) { // printf("(%g, %g) --> uu: (%g, %g, %g), uv: (%g, %g, %g), vv: (%g, %g, %g)\n", u, v, sndPdx_uu, sndPdy_uu, // sndPdz_uu, // uvx, uvy, uvz, sndPdx_vv, sndPdy_vv, sndPdz_vv); // printf("uv: (%g, %g, %g), vu: (%g, %g, %g)\n", sndPdx_uv, sndPdy_uv, sndPdz_uv, sndPdx_vu, sndPdy_vu, // sndPdz_vu); // } float m1 = max(max(sndPdx_uu, sndPdy_uu), sndPdz_uu); float m2 = max(max(uvx, uvy), uvz); float m3 = max(max(sndPdx_vv, sndPdy_vv), sndPdz_vv); // __shared__ float ms[363]; ms[(ix * sampleCnt_v + iy) * 3] = m1; ms[(ix * sampleCnt_v + iy) * 3 + 1] = m2; ms[(ix * sampleCnt_v + iy) * 3 + 2] = m3; __syncthreads(); // if(ix == 1 && iy == 1) { // for(int i = 0; i < sampleCnt_u; i++) { // for(int j = 0; j < sampleCnt_v; j++) { // printf("%g ", ms[(i * sampleCnt_v + j) * 3]); // } // printf("\n"); // } // } // 规约求最大值 for (int step = (sampleCnt_u + 1) / 2; step > 1; step = (step + 1) / 2) { // step 表示现在参与计算最大值的数据的长度的一半 if (ix < step && ix + step < sampleCnt_u) { ms[(ix * sampleCnt_v + iy) * 3] = max(m1, ms[((ix + step) * sampleCnt_v + iy) * 3]); ms[(ix * sampleCnt_v + iy) * 3 + 1] = max(m1, ms[((ix + step) * sampleCnt_v + iy) * 3 + 1]); ms[(ix * sampleCnt_v + iy) * 3 + 2] = max(m1, ms[((ix + step) * sampleCnt_v + iy) * 3 + 2]); } } for (int step = (sampleCnt_v + 1) / 2; step > 1; step = (step + 1) / 2) { // step 表示现在参与计算最大值的数据的长度的一半 if (iy < step && iy + step < sampleCnt_v) { ms[iy * 3] = max(ms[iy * 3], ms[(iy + step) * 3]); ms[iy * 3 + 1] = max(ms[iy * 3 + 1], ms[(iy + step) * 3 + 1]); ms[iy * 3 + 2] = max(ms[iy * 3 + 2], ms[(iy + step) * 3 + 2]); } } __syncthreads(); int n = sampleCnt_u - 1; int m = sampleCnt_v - 1; *k = (ms[0] / (n * n) + 2 * ms[1] / (n * m) + ms[2] / (m * m)) / 8; // if(ix == 1 && iy == 1)printf("%g gggg\n", ms[0]); } __host__ NurbsSurface::Surface::Surface(MeshPoints4 controlPoints, std::vector knots_u, std::vector knots_v) { this->knots_u = std::move(knots_u); this->knots_v = std::move(knots_v); this->controlPoints = std::move(controlPoints); recordTime = false; d_nTexture_u = nullptr; d_nTexture_v = nullptr; d_nTexture1_u = nullptr; d_nTexture1_v = nullptr; d_knots_u = nullptr; d_knots_v = nullptr; d_points = nullptr; d_evaluationRes = nullptr; d_derivatives = nullptr; d_k = nullptr; d_normals = nullptr; bvh.nodes = nullptr; h_evaluations = nullptr; h_derivatives = nullptr; h_normals = nullptr; } __host__ void NurbsSurface::Surface::evaluate(int sampleCnt_u, int sampleCnt_v) { // 构造指向device的controlPoints const int pointsCnt_u = controlPoints.size(), pointsCnt_v = controlPoints[0].size(); const int pointsBytes = pointsCnt_u * pointsCnt_v * sizeof(glm::vec4); auto *h_points = (float *) malloc(pointsBytes); for (int i = 0; i < pointsCnt_u; i++) { for (int j = 0; j < pointsCnt_v; j++) { for (int k = 0; k < POINT_SIZE; k++) { h_points[(i * pointsCnt_v + j) * POINT_SIZE + k] = controlPoints[i][j][k]; } // printf("%f/ %f/ %f/ %f ", controlPoints[i][j][0], controlPoints[i][j][1], controlPoints[i][j][2], controlPoints[i][j][3]); } } cudaMalloc((void **) &d_points, pointsBytes); cudaMemcpy(d_points, h_points, pointsBytes, cudaMemcpyHostToDevice); // 构造指向device的knots const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size(); const int knotsBytes_u = knotsCnt_u * sizeof(float), knotsBytes_v = knotsCnt_v * sizeof(float); auto *h_knots_u = (float *) malloc(knotsBytes_u), *h_knots_v = (float *) malloc(knotsBytes_v); for (int i = 0; i < knotsCnt_u; i++) h_knots_u[i] = knots_u[i]; for (int i = 0; i < knotsCnt_v; i++) h_knots_v[i] = knots_v[i]; safeCudaFree(d_knots_u); safeCudaFree(d_knots_v); cudaMalloc((void **) &d_knots_u, knotsBytes_u); cudaMalloc((void **) &d_knots_v, knotsBytes_v); cudaMemcpy(d_knots_u, h_knots_u, knotsBytes_u, cudaMemcpyHostToDevice); cudaMemcpy(d_knots_v, h_knots_v, knotsBytes_v, cudaMemcpyHostToDevice); // 构造nTexture cudaMalloc((void **) &d_nTexture_u, sampleCnt_u * pointsCnt_u * sizeof(float)); // 注意nTexture的大小,在算梯度时用得到i=pointsCnt + 1的基函数值 cudaMalloc((void **) &d_nTexture_v, sampleCnt_v * pointsCnt_v * sizeof(float)); // 构造nTexture1 cudaMalloc((void **) &d_nTexture1_u, sampleCnt_u * (pointsCnt_u + 1) * sizeof(float)); cudaMalloc((void **) &d_nTexture1_v, sampleCnt_v * (pointsCnt_v + 1) * sizeof(float)); // 结果数组 size_t resBytes = sampleCnt_u * sampleCnt_v * 3 * sizeof(float); safeCudaFree(d_evaluationRes); cudaMalloc((void **) &d_evaluationRes, resBytes); safeFree(h_evaluations); h_evaluations = (float *) malloc(resBytes); // 构造g_basisTexture线程层级 dim3 blockBasis(512); dim3 gridBasis_u((sampleCnt_u + blockBasis.x - 1) / blockBasis.x); dim3 gridBasis_v((sampleCnt_v + blockBasis.x - 1) / blockBasis.x); // 构造线程层级,调用核函数 dim3 block(32, 32); dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y); // 记录用时 double time_cost_device; if (recordTime) time_cost_device = get_time(); g_basisTexture<<>>(d_nTexture_u, d_nTexture1_u, d_knots_u, pointsCnt_u, knotsCnt_u, sampleCnt_u); cudaDeviceSynchronize(); g_basisTexture<<>>(d_nTexture_v, d_nTexture1_v, d_knots_v, pointsCnt_v, knotsCnt_v, sampleCnt_v); cudaDeviceSynchronize(); g_evaluate <<>>(d_evaluationRes, d_nTexture_u, d_nTexture_v, d_points, pointsCnt_u, pointsCnt_v, POINT_SIZE, knots_u[knotsCnt_u - 1], knots_v[knotsCnt_v - 1], sampleCnt_u, sampleCnt_v); cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用 if (recordTime) { time_cost_device = get_time() - time_cost_device; printf("GPU time cost of surface evaluation for %d samples: %lf\n", sampleCnt_u * sampleCnt_v, time_cost_device); } cudaMemcpy(h_evaluations, d_evaluationRes, resBytes, cudaMemcpyDeviceToHost); // 释放内存 safeFree(h_points); safeFree(h_knots_u); safeFree(h_knots_v); } __host__ std::vector> NurbsSurface::Surface::getEvaluateVec(int sampleCnt_u, int sampleCnt_v) const { std::vector> res(sampleCnt_u, std::vector(sampleCnt_v, glm::vec3())); for (int i = 0; i < sampleCnt_u; i++) { int baseIdx = i * sampleCnt_v * 3; for (int j = 0; j < sampleCnt_v; j++) { baseIdx += j * 3; res[i][j].x = h_evaluations[baseIdx]; res[i][j].y = h_evaluations[baseIdx + 1]; res[i][j].z = h_evaluations[baseIdx + 2]; // printf("%d, %d: %f, %f, %f\n", i, j, res[i][j][0], res[i][j][1], res[i][j][2]); } } return res; } __host__ std::vector NurbsSurface::Surface::getDerivativeVec(int sampleCnt_u, int sampleCnt_v) const { MeshPoints3 der_u(sampleCnt_u, LinePoints3(sampleCnt_v)); MeshPoints3 der_v(sampleCnt_u, LinePoints3(sampleCnt_v)); MeshPoints3 normal(sampleCnt_u, LinePoints3(sampleCnt_v)); for (int i = 0; i < sampleCnt_u; i++) { int baseIdx = i * sampleCnt_v * 6; for (int j = 0; j < sampleCnt_v; j++) { baseIdx += j * 6; der_u[i][j].x = h_derivatives[baseIdx]; der_u[i][j].y = h_derivatives[baseIdx + 1]; der_u[i][j].z = h_derivatives[baseIdx + 2]; der_v[i][j].x = h_derivatives[baseIdx + 3]; der_v[i][j].y = h_derivatives[baseIdx + 4]; der_v[i][j].z = h_derivatives[baseIdx + 5]; auto baseIdxNorm = baseIdx / 2; normal[i][j].x = h_normals[baseIdxNorm]; normal[i][j].y = h_normals[baseIdxNorm + 1]; normal[i][j].z = h_normals[baseIdxNorm + 2]; // TODO normalize 归一化在gpu中实现! normal[i][j] = glm::normalize(normal[i][j]); } } return {der_u, der_v, normal}; } __host__ void NurbsSurface::Surface::derivative(int sampleCnt_u, int sampleCnt_v) { if (POINT_SIZE != controlPoints[0][0].size()) { printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n"); return; } float *d_derTexture_u = nullptr; float *d_derTexture_v = nullptr; const int pointsCnt_u = controlPoints.size(), pointsCnt_v = controlPoints[0].size(); const int knotsCnt_u = knots_u.size(), knotsCnt_v = knots_v.size(); cudaMalloc((void **) &d_derTexture_u, sampleCnt_u * pointsCnt_u * sizeof(float)); cudaMalloc((void **) &d_derTexture_v, sampleCnt_v * pointsCnt_v * sizeof(float)); // 构造切向量计算结果 safeCudaFree(d_derivatives); size_t derBytes = sampleCnt_u * sampleCnt_v * 6 * sizeof(float); cudaMalloc((void **) &d_derivatives, derBytes); // 每个采样所求的切向量是一个六元向量,前三位是对u的偏导、后三位是对v的偏导 // 构造法向量计算结果 safeCudaFree(d_normals); size_t normalBytes = sampleCnt_u * sampleCnt_v * 3 * sizeof(float); cudaMalloc((void **) &d_normals, normalBytes); // 构造线程层级 dim3 block(32, 32); dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y); // 构造g_basisTexture线程层级 dim3 blockTex(512); dim3 gridTex_u((sampleCnt_u + blockTex.x - 1) / blockTex.x); dim3 gridTex_v((sampleCnt_v + blockTex.x - 1) / blockTex.x); // 记录用时 double time_cost_device; if (recordTime) time_cost_device = get_time(); g_derTexture<<>>(d_derTexture_u, d_nTexture1_u, d_knots_u, pointsCnt_u, knotsCnt_u, sampleCnt_u); g_derTexture<<>>(d_derTexture_v, d_nTexture1_v, d_knots_v, pointsCnt_v, knotsCnt_v, sampleCnt_v); cudaDeviceSynchronize(); g_derivative<<>>(d_derivatives, d_normals, d_derTexture_u, d_derTexture_v, d_nTexture_u, d_nTexture_v, d_points, pointsCnt_u, pointsCnt_v, POINT_SIZE, knots_u[knotsCnt_u - 1], knots_v[knotsCnt_v - 1], sampleCnt_u, sampleCnt_v); cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用 if (recordTime) { time_cost_device = get_time() - time_cost_device; printf("GPU time cost of surface first derivative calculating for %d samples: %lf\n", sampleCnt_u * sampleCnt_v, time_cost_device); } // 结果数组 safeFree(h_normals); h_normals = (float *) malloc(normalBytes); cudaMemcpy(h_normals, d_normals, normalBytes, cudaMemcpyDeviceToHost); safeFree(h_derivatives); h_derivatives = (float *) malloc(derBytes); cudaMemcpy(h_derivatives, d_derivatives, derBytes, cudaMemcpyDeviceToHost); cudaFree(d_derTexture_u); cudaFree(d_derTexture_v); } __host__ void NurbsSurface::Surface::curvature(int sampleCnt_u, int sampleCnt_v) { if (POINT_SIZE != controlPoints[0][0].size()) { printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n"); return; } // 构造记录每个采样点中的最大M1、M2、M3的数组 // 这里用共享内存会更好,但使用共享内存动态分配长度总是出错(好像是因为长度过长),后续需要思考解决这个问题 float *ms = nullptr; cudaMalloc((void **) &ms, sampleCnt_u * sampleCnt_v * 3 * sizeof(float)); cudaMalloc((void **) &d_k, sizeof(float)); // 构造线程层级 dim3 block(32, 32); dim3 grid((sampleCnt_u - 1 + block.x - 1) / block.x, (sampleCnt_v - 1 + block.y - 1) / block.y); // 记录用时 double time_cost_device; if (recordTime) time_cost_device = get_time(); g_curvature<<>>(d_derivatives, sampleCnt_u, sampleCnt_v, knots_u[knots_u.size() - 1], knots_v[knots_v.size() - 1], ms, d_k); cudaDeviceSynchronize(); // 所用线程结束后再获取结束时间。cudaThreadSynchronize()在CUDA1.0后被弃用 if (recordTime) { time_cost_device = get_time() - time_cost_device; printf("GPU time cost of surface curvature calculating for %d samples: %lf\n", sampleCnt_u * sampleCnt_v, time_cost_device); } safeCudaFree(ms); } void NurbsSurface::Surface::setRecordTime(bool r) { recordTime = r; } NurbsSurface::Surface::~Surface() { safeCudaFree(d_nTexture_u); safeCudaFree(d_nTexture_v); safeCudaFree(d_nTexture1_u); safeCudaFree(d_nTexture1_v); safeCudaFree(d_points); safeCudaFree(d_knots_u); safeCudaFree(d_knots_v); safeCudaFree(d_k); safeCudaFree(d_evaluationRes); safeCudaFree(d_normals); safeCudaFree(d_derivatives); safeFree(bvh.nodes); safeFree(h_evaluations); safeFree(h_normals); safeFree(h_derivatives); cudaDeviceReset(); } __host__ void NurbsSurface::Surface::buildBVH(int layerCnt, bool useK) { // TODO 构造BVH的函数不应该出现在NURBS Surface中,应该是BVH类的事情! // TODO NURBS Surface只需要一个函数去调用BVH对象的方法即可 int sampleCnt_u = pow(2, layerCnt - 1) + 1, sampleCnt_v = sampleCnt_u; if (!useK) { // 必须safeFree一下,这样global函数中才能通过d_k = nullptr知道不需要再free safeCudaFree(d_k); } if (POINT_SIZE != controlPoints[0][0].size()) { printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n"); return; } // 构造线程层级 dim3 block(32, 32); dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y); // bvh.maxLevel = max(int(ceil(log2f(sampleCnt_u - 1))) + 1, int(ceil(log2f(sampleCnt_v - 1))) + 1); bvh.maxLevel = layerCnt; bvh.size = (pow(4, bvh.maxLevel) - 1) / 3; // 等比数列求和公示求出数总的节点数 size_t bvhBytes = sizeof(BVHNode) * bvh.size; BVHNode *d_bvh = nullptr; cudaMalloc((void **) &d_bvh, bvhBytes); // 记录用时 double time_cost_device; if (recordTime) time_cost_device = get_time(); g_buildBvh<<>>(d_k, bvh.maxLevel, d_evaluationRes, knots_u[knots_u.size() - 1], knots_v[knots_v.size() - 1], sampleCnt_u, sampleCnt_v, d_bvh); cudaDeviceSynchronize(); if (recordTime) { time_cost_device = get_time() - time_cost_device; printf("GPU time cost of a %d-layer BVH building: %lf\n", bvh.maxLevel, time_cost_device); } // 将bvh拷贝到cpu中 safeFree(bvh.nodes); bvh.nodes = (BVHNode *) malloc(bvhBytes); cudaMemcpy(bvh.nodes, d_bvh, bvhBytes, cudaMemcpyDeviceToHost); safeCudaFree(d_bvh); // bvh.printQuadTree(); } __host__ void NurbsSurface::Surface::buildGaussMap(int layerCnt) { // TODO,构造GAUSS Map的函数不应该出现在NURBS Surface中,应该是GAUSS Map类的事情! int sampleCnt_u = pow(2, layerCnt - 1) + 1, sampleCnt_v = sampleCnt_u; if (POINT_SIZE != controlPoints[0][0].size()) { printf("Error! Nurbs控制点应表示为长度为4的齐次坐标\n"); return; } // 构造线程层级 dim3 block(32, 32); dim3 grid((sampleCnt_u + block.x - 1) / block.x, (sampleCnt_v + block.y - 1) / block.y); gauss_map.maxLevel = layerCnt; gauss_map.size = (pow(4, layerCnt) - 1) / 3; // 等比数列求和公示求出数总的节点数 size_t gaussMapBytes = sizeof(BVHNode) * gauss_map.size; BVHNode *d_gaussMapTree = nullptr; cudaMalloc((void **) &d_gaussMapTree, gaussMapBytes); // 记录用时 double time_cost_device; if (recordTime) time_cost_device = get_time(); g_buildBvh<<>>(nullptr, layerCnt, d_normals, knots_u[knots_u.size() - 1], knots_v[knots_v.size() - 1], sampleCnt_u, sampleCnt_v, d_gaussMapTree); cudaDeviceSynchronize(); if (recordTime) { time_cost_device = get_time() - time_cost_device; printf("GPU time cost of a %d-layer Gauss Map building: %lf\n", layerCnt, time_cost_device); } safeFree(gauss_map.nodes); gauss_map.nodes = (BVHNode *) malloc(gaussMapBytes); cudaMemcpy(gauss_map.nodes, d_gaussMapTree, gaussMapBytes, cudaMemcpyDeviceToHost); safeCudaFree(d_gaussMapTree); } __host__ void NurbsSurface::recursiveGetOverlapLeafNodes(const BVH &bvh1, const BVH &bvh2, int idx1, int idx2, std::vector> &pairs) { auto A = bvh1.nodes[idx1]; auto B = bvh2.nodes[idx2]; auto AABBSize = [](const AABB &aabb) { return (aabb.pMax.z - aabb.pMin.z) * (aabb.pMax.y - aabb.pMin.y) * (aabb.pMax.x - aabb.pMin.x); }; // 两个包围盒不相交,返回 if (!A.bounds.IsOverlap(B.bounds)) return; // 相交 if (A.firstChild == -1 && B.firstChild == -1) { // 两者都是叶子节点 pairs.emplace_back(idx1, idx2); } else if (A.firstChild != -1 && B.firstChild == -1) { // A是中间结点,B是叶子结点 for (int i = 0; i < 4; i++) recursiveGetOverlapLeafNodes(bvh1, bvh2, A.firstChild + i, idx2, pairs); } else if (A.firstChild == -1 && B.firstChild != -1) { // A是叶子结点,B是中间结点 for (int i = 0; i < 4; i++) recursiveGetOverlapLeafNodes(bvh1, bvh2, idx1, B.firstChild + i, pairs); } else { // 都是中间结点 if (AABBSize(A.bounds) > AABBSize(B.bounds)) { // A的包围盒更大 for (int i = 0; i < 4; i++) recursiveGetOverlapLeafNodes(bvh1, bvh2, A.firstChild + i, idx2, pairs); } else { // B的包围盒更大 for (int i = 0; i < 4; i++) recursiveGetOverlapLeafNodes(bvh1, bvh2, idx1, B.firstChild + i, pairs); } } } __host__ std::vector NurbsSurface::getOverlappedLeafNodes(const BVH &bvh1, const BVH &bvh2) { std::vector resPairs; // 记录用时 double time_cost_device = get_time(); recursiveGetOverlapLeafNodes(bvh1, bvh2, 0, 0, resPairs); std::vector boxPairsIdx2(resPairs.size()); for (int i = 0; i < resPairs.size(); i++) { boxPairsIdx2[i] = {{bvh1.nodes[resPairs[i].first].idx_u, bvh1.nodes[resPairs[i].first].idx_v}, {bvh2.nodes[resPairs[i].second].idx_u, bvh2.nodes[resPairs[i].second].idx_v}}; } time_cost_device = get_time() - time_cost_device; printf("CPU time cost for recursively calculating the overlapped leaf nodes: %lf\n", time_cost_device); return boxPairsIdx2; } __host__ bool NurbsSurface::isGaussMapsOverlapped(const BVH &gm1, const BVH &gm2, std::pair idxRange_u1, std::pair idxRange_v1, std::pair idxRange_u2, std::pair idxRange_v2) { if (gm1.maxLevel != gm2.maxLevel || gm1.maxLevel <= 0) { printf("BVH Layer error!\n"); return false; } int commonMaxLayer = gm1.maxLevel; int edgeCellCnt = pow(2, commonMaxLayer - 1); if (idxRange_u1.first < 0 || idxRange_u2.first < 0 || idxRange_v1.first < 0 || idxRange_v2.first < 0 || idxRange_u1.second >= edgeCellCnt || idxRange_u2.second >= edgeCellCnt || idxRange_v1.second >= edgeCellCnt || idxRange_v2.second >= edgeCellCnt) { printf("Error when detecting overlapping: idx range invalid!\n"); return false; } auto getRangedBox = [&commonMaxLayer](const BVH &bvh, const std::pair &idxRange_u, const std::pair idxRange_v) { // 获取某个范围的gauss map的aabb AABB bounding; for (int i = idxRange_u.first; i <= idxRange_u.second; ++i) { for (int j = idxRange_v.first; j <= idxRange_v.second; ++j) { bounding = bounding.Union( bvh.nodes[getStartIdxOfLayerN(commonMaxLayer) + h_getChildNodeIdx(i, j)].bounds); } } return bounding; }; return getRangedBox(gm1, idxRange_u1, idxRange_v1) .IsOverlap(getRangedBox(gm2, idxRange_u2, idxRange_v2)); } __host__ bool NurbsSurface::isGaussMapsOverlapped(const BVH &gm1, const BVH &gm2, std::pair range_u1, std::pair range_v1, std::pair range_u2, std::pair range_v2, std::pair paramRange_u1, std::pair paramRange_v1, std::pair paramRange_u2, std::pair paramRange_v2) { if (gm1.maxLevel != gm2.maxLevel || gm1.maxLevel <= 0) { printf("BVH Layer error!\n"); return false; } int edgeCellCnt = pow(2, gm1.maxLevel - 1); // 根据所给参数的范围和参数的定义域范围,获得对应的采样网格中的范围 auto getIdxRange = [](std::pair range, std::pair paramRange, int edgeCellCnt) { float paramStep = (paramRange.second - paramRange.first) / edgeCellCnt; return std::pair({int((range.first - paramRange.first) / paramStep), int((range.second - paramRange.first) / paramStep)}); }; auto idxRange_u1 = getIdxRange(range_u1, paramRange_u1, edgeCellCnt); auto idxRange_v1 = getIdxRange(range_v1, paramRange_v1, edgeCellCnt); auto idxRange_u2 = getIdxRange(range_u2, paramRange_u2, edgeCellCnt); auto idxRange_v2 = getIdxRange(range_v2, paramRange_v2, edgeCellCnt); return isGaussMapsOverlapped(gm1, gm2, idxRange_u1, idxRange_v1, idxRange_u2, idxRange_v2); } __host__ void NurbsSurface::Surface::recursiveGetRayBVHIntersection(const glm::vec3 dir, const glm::vec3 startPoint, const int idx, std::vector &intersectionLeafNodes) { auto bvhNode = bvh.nodes[idx]; // 射线与AABB判交 auto isRayBoxIntersect = [&]() { const auto &box = bvhNode.bounds; float t; // 射线的参数,当t<=0,表示线面交点不在视平面前方,视为没有交点 if (dir.x != 0.) { // 当x分量不为0,则射线会与AABB中垂直于x轴的平面可能有交 // 注意x的正负。x为正会先遇到较小点所在平面 if (dir.x > 0) t = (box.pMin.x - startPoint.x) / dir.x; else t = (box.pMax.x - startPoint.x) / dir.x; if (t > 0.) { // 射线与平面在前方有交点。但交点不一定在盒子上 auto tmpPt = startPoint + t * dir; // 交点 if (box.pMin.y <= tmpPt.y && box.pMin.z <= tmpPt.z && box.pMax.y >= tmpPt.y && box.pMax.z >= tmpPt.z) return true; } } if (dir.y != 0.) { // 同上测试y方向 if (dir.y > 0) t = (box.pMin.y - startPoint.y) / dir.y; else t = (box.pMax.y - startPoint.y) / dir.y; if (t > 0.) { auto tmpPt = startPoint + t * dir; if (box.pMin.x <= tmpPt.x && box.pMin.z <= tmpPt.z && box.pMax.x >= tmpPt.x && box.pMax.z >= tmpPt.z) return true; } } if (dir.z != 0.) { // 同上测试z方向 if (dir.z > 0) t = (box.pMin.z - startPoint.z) / dir.z; else t = (box.pMax.z - startPoint.z) / dir.z; if (t > 0.) { auto tmpPt = startPoint + t * dir; if (box.pMin.x <= tmpPt.x && box.pMin.y <= tmpPt.y && box.pMax.x >= tmpPt.x && box.pMax.y >= tmpPt.y) return true; } } return false; }; // 不相交 if (!isRayBoxIntersect()) return; // 相交 if (bvhNode.firstChild == -1) { // 与叶节点相交 intersectionLeafNodes.emplace_back(bvhNode); } else { // 与父节点相交 for (int i = 0; i < 4; i++) recursiveGetRayBVHIntersection(dir, startPoint, bvhNode.firstChild + i, intersectionLeafNodes); } } __host__ std::vector NurbsSurface::Surface::rayBVHIntersection(glm::vec3 dir, glm::vec3 startPoint) { std::vector res; recursiveGetRayBVHIntersection(dir, startPoint, 0, res); //TODO sort res by t // res.emplace_back(BVHNode()); // printf("res size: %lld\n", res.size()); return res; }