// // Created by dtouch on 23-5-23. // #include "rod_generate.cuh" #include "device_functions.h" #include "float.h" __global__ void g_rod_generate(int *beamData, int beamCnt, float *pointData, int pointCnt, cudaPitchedPtr sdf, const cudaExtent *extent, size_t floatSize, const Eigen::Vector3f *sampleMin, const Eigen::Vector3f *sampleMax, int radius) { // 3-dim grid and 3-dim block Eigen::Map> rod_beams(beamData, beamCnt, 2); Eigen::Map> rod_points(pointData, pointCnt, 3); auto ix = blockIdx.x * blockDim.x + threadIdx.x; auto iy = blockIdx.y * blockDim.y + threadIdx.y; auto iz = blockIdx.z * blockDim.z + threadIdx.z; // if (ix == 0 && iy == 0 && iz == 0) { // for (int i = 0; i < beamCnt; ++i) { // printf("%d, %d\n", rod_beams(i, 0), rod_beams(i, 1)); // } // for(int i = 0; i < pointCnt; ++i) { // printf("%f, %f, %f\n", rod_points(i, 0), rod_points(i, 1), rod_points(i, 2)); // } // } if (ix >= extent->width / floatSize || iy >= extent->height || iz >= extent->depth) { return; } auto x = sampleMin->x() + static_cast(ix) * (sampleMax->x() - sampleMin->x()) / static_cast(extent->width / floatSize); auto y = sampleMin->y() + static_cast(iy) * (sampleMax->y() - sampleMin->y()) / static_cast(extent->height); auto z = sampleMin->z() + static_cast(iz) * (sampleMax->z() - sampleMin->z()) / static_cast(extent->depth); // printf("%d, %d, %d\n", ix, iy, iz); // 获取sdf中下标为(ix,iy)的元素的行首指针 // auto sdfPtr = reinterpret_cast((char *) sdf.ptr + iy * sdf.pitch + iz * sdf.pitch * extent->height); char *sdfPtr = (char *) sdf.ptr; size_t pitch = sdf.pitch; size_t slicePitch = pitch * extent->height; auto p = Eigen::Vector3f(x, y, z); char *slice = sdfPtr + iz * slicePitch; auto *row = (float *) (slice + iy * pitch); // row[ix] is initialized as the max float in GPU row[ix] = FLT_MAX; // auto aTmp = Eigen::Vector3f(rod_points.row(rod_beams(2, 1))); // printf("aTmp: (%f, %f, %f)\n", aTmp.x(), aTmp.y(), aTmp.z()); for (int i = 0; i < rod_beams.rows(); ++i) { auto a = Eigen::Vector3f(rod_points.row(rod_beams(i, 0))); auto b = Eigen::Vector3f(rod_points.row(rod_beams(i, 1))); auto ab = b - a; auto ap = p - a; auto bp = p - b; if (ab.x() * bp.x() + ab.y() * bp.y() + ab.z() + bp.z() < 0 && ab.x() * ap.x() + ab.y() * ap.y() + ab.z() * ap.z() > 0) { row[ix] = min(row[ix], (ap.cross(bp)).norm() / ab.norm()); } else { row[ix] = min(row[ix], min(ap.norm(), bp.norm())); } } row[ix] -= radius; } __host__ float* h_rod_generate(const RodCrystal &rodCrystal, const Eigen::Vector3i &sampleCnt, const Eigen::Vector3f &sampleMin, const Eigen::Vector3f &sampleMax, float radius) { int *d_beamData; size_t beamBytes = rodCrystal.rod_beams.rows() * rodCrystal.rod_beams.cols() * sizeof(int); cudaMalloc(&d_beamData, beamBytes); cudaMemcpy(d_beamData, rodCrystal.rod_beams.data(), beamBytes, cudaMemcpyHostToDevice); float *d_pointData; size_t pointBytes = rodCrystal.rod_points.rows() * rodCrystal.rod_points.cols() * sizeof(float); cudaMalloc(&d_pointData, pointBytes); cudaMemcpy(d_pointData, rodCrystal.rod_points.data(), pointBytes, cudaMemcpyHostToDevice); // RodCrystal *d_rodCrystal; // cudaMalloc(&d_rodCrystal, sizeof(rodCrystal)); // cudaMemcpy(d_rodCrystal, &rodCrystal, sizeof(rodCrystal), cudaMemcpyHostToDevice); // printf("size of rodCrystal: %lu; size of class RodCrystal: %lu\n", sizeof(rodCrystal), sizeof(RodCrystal)); // printf("size of rodCrystal.rod_points: %lu\n", sizeof(rodCrystal.rod_points)); // printf("size of rodCrystal.rod_beams: %lu\n", sizeof(rodCrystal.rod_beams)); // printf("size of rodCrystal.rod_points.row(0): %lu\n", sizeof(rodCrystal.rod_points.row(0))); int sampleCntAll = sampleCnt.x() * sampleCnt.y() * sampleCnt.z(); float *h_sdf; h_sdf = (float *) malloc(sampleCnt.x() * sampleCnt.y() * sampleCnt.z() * sizeof(float)); for (int i = 0; i < sampleCnt.x() * sampleCnt.y() * sampleCnt.z(); ++i) { h_sdf[i] = i; } cudaPitchedPtr d_sdf{}; cudaExtent extent = make_cudaExtent(sampleCnt.x() * sizeof(float), sampleCnt.y(), sampleCnt.z()); cudaMalloc3D(&d_sdf, extent); cudaMemcpy3DParms copyParams = {nullptr}; // 为什么srcPtr是一个pitchedPtr? copyParams.srcPtr = make_cudaPitchedPtr((void *) h_sdf, sampleCnt.x() * sizeof(float), sampleCnt.x(), sampleCnt.y()); copyParams.dstPtr = d_sdf; copyParams.extent = extent; copyParams.kind = cudaMemcpyHostToDevice; cudaMemcpy3D(©Params); Eigen::Vector3f *d_sampleMin; cudaMalloc(&d_sampleMin, sizeof(sampleMin)); cudaMemcpy(d_sampleMin, &sampleMin, sizeof(sampleMin), cudaMemcpyHostToDevice); Eigen::Vector3f *d_sampleMax; cudaMalloc(&d_sampleMax, sizeof(sampleMax)); cudaMemcpy(d_sampleMax, &sampleMax, sizeof(sampleMax), cudaMemcpyHostToDevice); cudaExtent *d_extent; cudaMalloc(&d_extent, sizeof(extent)); cudaMemcpy(d_extent, &extent, sizeof(extent), cudaMemcpyHostToDevice); dim3 grid(16, 16, 16); dim3 block((sampleCnt.x() + grid.x - 1) / grid.x, (sampleCnt.y() + grid.y - 1) / grid.y, (sampleCnt.z() + grid.z - 1) / grid.z); g_rod_generate<<>>(d_beamData, rodCrystal.rod_beams.rows(), d_pointData, rodCrystal.rod_points.rows(), d_sdf, d_extent, sizeof(float), d_sampleMin, d_sampleMax, radius); cudaDeviceSynchronize(); // for (int i = 0; i < sampleCnt.x() * sampleCnt.y() * sampleCnt.z(); ++i) { // h_sdf[i] = -i; // } auto tmpBeam = rodCrystal.rod_beams(0, 1); printf("tmpBeam: %d\n", tmpBeam); printf("copy back to host\n"); copyParams = {nullptr}; copyParams.srcPtr = d_sdf; copyParams.dstPtr = make_cudaPitchedPtr((void *) h_sdf, sampleCnt.x() * sizeof(float), sampleCnt.x(), sampleCnt.y()); copyParams.extent = extent; copyParams.kind = cudaMemcpyDeviceToHost; cudaMemcpy3D(©Params); // cudaFree(d_rodCrystal); cudaFree(d_sdf.ptr); cudaFree(d_sampleMin); cudaFree(d_sampleMax); cudaFree(d_extent); cudaFree(d_beamData); cudaFree(d_pointData); printf("["); for (int i = 0; i < sampleCnt.x(); i++) { printf("["); for (int j = 0; j < sampleCnt.y(); j++) { printf("["); for (int k = 0; k < sampleCnt.z(); k++) { printf("%f", h_sdf[i * sampleCnt.y() * sampleCnt.z() + j * sampleCnt.z() + k]); if (k != sampleCnt.z() - 1) printf(","); } printf("]"); if (j != sampleCnt.y() - 1) printf(","); } printf("]"); } printf("]"); free(h_sdf); }