Fix bugs in CUDA code

3 years ago · 5b2ab8fc5c
14 changed files with 201 additions and 79 deletions
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@ -0,0 +1,8 @@
 # Default ignored files
 /shelf/
 /workspace.xml
 # Editor-based HTTP Client requests
 /httpRequests/
 # Datasource local storage ignored files
 /dataSources/
 /dataSources.local.xml
--- a/.idea/.name
+++ b/.idea/.name
@ -0,0 +1 @@
 renderSDF
--- a/.idea/ISDF.iml
+++ b/.idea/ISDF.iml
@ -0,0 +1,2 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module classpath="CMake" type="CPP_MODULE" version="4" />
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -0,0 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$/sdf_generate">
    <contentRoot DIR="$PROJECT_DIR$" />
  </component>
 </project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@ -0,0 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/ISDF.iml" filepath="$PROJECT_DIR$/.idea/ISDF.iml" />
    </modules>
  </component>
 </project>
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@ -0,0 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
 </project>
--- a/sdf_generate/CMakeLists.txt
+++ b/sdf_generate/CMakeLists.txt
@ -1,4 +1,9 @@
 cmake_minimum_required(VERSION 3.16)
 if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
    set(CMAKE_CUDA_ARCHITECTURES 86)
 endif()
 project(renderSDF CXX CUDA)
 set(CMAKE_CUDA_STANDARD 14)
@ -12,7 +17,8 @@ include_directories(include)
 AUX_SOURCE_DIRECTORY(src DIR_SRCS)
 FILE(GLOB_RECURSE DIR_INCLUDE include/*.h include/*.hpp include/*.cuh)
-add_executable(sdfGenerate ${DIR_SRCS} ${DIR_INCLUDE} main.cpp)
+ADD_LIBRARY(sdfGenerate SHARED ${DIR_SRCS} ${DIR_INCLUDE} main.cpp)
 #add_executable(sdfGenerate ${DIR_SRCS} ${DIR_INCLUDE} main.cpp)
 target_link_libraries(sdfGenerate Eigen3::Eigen)
--- a/sdf_generate/include/cudaEigenTest.cuh
+++ b/sdf_generate/include/cudaEigenTest.cuh
@ -8,11 +8,11 @@
 #include "Eigen/Eigen"
 #include "Eigen/Dense"
 #include "cuda_runtime.h"
-#include "glm/glm.hpp"
+//#include "glm/glm.hpp"
 __global__ void eigenKernel(Eigen::Matrix<double, 4, 3> factor, Eigen::Vector3d testVector);
-__global__ void glmKernel(glm::vec3 testVector);
+//__global__ void glmKernel(glm::vec3 testVector);
 __host__ void testHost();
--- a/sdf_generate/include/rod_generate.cuh
+++ b/sdf_generate/include/rod_generate.cuh
@ -10,13 +10,19 @@
 #include "Eigen/Eigen"
 #include "unsupported/Eigen/CXX11/Tensor"
 #ifdef _MSC_VER
 #define DLL_EXPORT __declspec( dllexport )
 #else
 #define DLL_EXPORT
 #endif
 __global__
 void g_rod_generate(const RodCrystal *rodCrystal, cudaPitchedPtr sdf, const cudaExtent* extent, const Eigen::Vector3f *sampleMin,
                    const Eigen::Vector3f *sampleMax, int radius);
-__host__ void
+extern "C" DLL_EXPORT __host__ float*
 h_rod_generate(const RodCrystal &rodCrystal, const Eigen::Vector3i &sampleCnt, const Eigen::Vector3f &sampleMin,
-               const Eigen::Vector3f &sampleMax, int radius);
+               const Eigen::Vector3f &sampleMax, float radius);
 class rod_generate {
--- a/sdf_generate/main.cpp
+++ b/sdf_generate/main.cpp
@ -5,11 +5,12 @@
 #include "iostream"
 #include "cudaEigenTest.cuh"
 #include "bvh.h"
 #include "rod_generate.cuh"
-int main() {
+float *test() {
    // a case with 3*3*3 points (2*2*2 resolution)
-    Eigen::Matrix<float, Eigen::Dynamic, 3>rod_points;
+    Eigen::Matrix<float, Eigen::Dynamic, 3> rod_points;
    rod_points.resize(27, 3);
    rod_points << 0, 0, 0,
            1, 0, 0,
            2, 0, 0,
@ -93,9 +94,13 @@ int main() {
            8, 17,
            17, 26;
    RodCrystal rod(rod_points, rod_beams);
 //    RodBVH bvh(rod);
 //    bvh.build();
    return h_rod_generate(rod, Eigen::Vector3i(10, 10, 10), Eigen::Vector3f(-100, -100, -100),
                          Eigen::Vector3f(100, 100, 100), 0.2f);
 }
-
+// Program entry point: runs the sample SDF generation once and exits.
+int main() {
-    RodBVH bvh(rod);
+    test();
-    bvh.build();
    return 0;
 }
--- a/sdf_generate/src/bvh.cpp
+++ b/sdf_generate/src/bvh.cpp
@ -68,7 +68,7 @@ void RodBVH::quickSelect(std::vector<int> &rods, int axis) {
        }
        std::swap(rods[right], rods[storeIndex]);
        if (storeIndex == mid) {
-            return storeIndex;
+            return;
        } else if (storeIndex > mid) {
            right = storeIndex - 1;
        } else {
--- a/sdf_generate/src/cudaEigenTest.cu
+++ b/sdf_generate/src/cudaEigenTest.cu
@ -45,23 +45,23 @@ __global__ void eigenKernel(Eigen::Matrix<double, 2, 2> factor, Eigen::Matrix<do
 //    }
 }
-__global__ void glmKernel(glm::vec3 testVector) {
+//__global__ void glmKernel(glm::vec3 testVector) {
-    if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
+//    if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
-        printf("testVector: %f, %f, %f\n", testVector.x, testVector.y, testVector.z);
+//        printf("testVector: %f, %f, %f\n", testVector.x, testVector.y, testVector.z);
-        glm::mat<2, 2, float> a = {1, 2, 3, 4};
+//        glm::mat<2, 2, float> a = {1, 2, 3, 4};
-        glm::mat<2, 2, float> b = {2, 3, 4, 5};
+//        glm::mat<2, 2, float> b = {2, 3, 4, 5};
-        glm::mat<2, 2, float> c = a * b;
+//        glm::mat<2, 2, float> c = a * b;
-        glm::vec2 d = {3, 4};
+//        glm::vec2 d = {3, 4};
-        glm::vec2 e = c * d;
+//        glm::vec2 e = c * d;
-        glm::vec3 factor = {1, 2, 3};
+//        glm::vec3 factor = {1, 2, 3};
-        glm::vec3 result = factor * testVector;
+//        glm::vec3 result = factor * testVector;
-        printf("c: %f, %f, %f, %f\n", c[0][0], c[0][1], c[1][0], c[1][1]);
+//        printf("c: %f, %f, %f, %f\n", c[0][0], c[0][1], c[1][0], c[1][1]);
-        printf("e: %f, %f\n", e[0], e[1]);
+//        printf("e: %f, %f\n", e[0], e[1]);
-        printf("result: %f, %f, %f\n", result[0], result[1], result[2]);
+//        printf("result: %f, %f, %f\n", result[0], result[1], result[2]);
-    }
+//    }
-
+//
-
+//
-}
+//}
 __global__ void testKernel(float x) {
    printf("testVector: %f, %f, %f\n", x, x, x);
@ -76,8 +76,8 @@ __host__ void testHost() {
    eigenKernel <<< 1, 32>>>(factor, testVector);
    cudaDeviceSynchronize();
-    glm::vec3 testVectorGlm(4, 5, 6);
+//    glm::vec3 testVectorGlm(4, 5, 6);
-    glmKernel<<< 1, 32 >>>(testVectorGlm);
+//    glmKernel<<< 1, 32 >>>(testVectorGlm);
    cudaDeviceSynchronize();
 //
 //    testKernel<<< 1, 32 >>>(3);
@ -92,19 +92,19 @@ __host__ void testHost() {
 //    printf("result in CPU: %f, %f\n", result(2, 0), result(2, 1));
 //    printf("result in CPU: %f, %f\n", result(3, 0), result(3, 1));
-    glm::mat<4, 3, float> factorGlm{};
+//    glm::mat<4, 3, float> factorGlm{};
-    factorGlm[0][0] = 1;
+//    factorGlm[0][0] = 1;
-    factorGlm[0][1] = 2;
+//    factorGlm[0][1] = 2;
-    factorGlm[0][2] = 3;
+//    factorGlm[0][2] = 3;
-    factorGlm[1][0] = 4;
+//    factorGlm[1][0] = 4;
-    factorGlm[1][1] = 5;
+//    factorGlm[1][1] = 5;
-    factorGlm[1][2] = 6;
+//    factorGlm[1][2] = 6;
-    factorGlm[2][0] = 7;
+//    factorGlm[2][0] = 7;
-    factorGlm[2][1] = 8;
+//    factorGlm[2][1] = 8;
-    factorGlm[2][2] = 9;
+//    factorGlm[2][2] = 9;
-    factorGlm[3][0] = 10;
+//    factorGlm[3][0] = 10;
-    factorGlm[3][1] = 11;
+//    factorGlm[3][1] = 11;
-    factorGlm[3][2] = 12;
+//    factorGlm[3][2] = 12;
-    glm::vec4 resultGlm = testVectorGlm * factorGlm;
+//    glm::vec4 resultGlm = testVectorGlm * factorGlm;
 }
--- a/sdf_generate/src/rod.cpp
+++ b/sdf_generate/src/rod.cpp
@ -9,7 +9,7 @@
 // Builds a RodCrystal by taking ownership (std::move) of the point and beam matrices,
 // then sizes rod_mid to one row per beam with 3 columns.
 // NOTE(review): the loop that filled rod_mid with the midpoint of each beam's two
 // endpoints is commented out below, so rod_mid is resized but never written here --
 // confirm that nothing reads it before restoring or removing the loop.
 RodCrystal::RodCrystal(Eigen::Matrix<float, Eigen::Dynamic, 3> _rod_points, Eigen::Matrix<int, Eigen::Dynamic, 2> _rod_beams) : rod_points(std::move(_rod_points)),
                                                                                   rod_beams(std::move(_rod_beams)) {
    rod_mid.resize(rod_beams.rows(), 3);
-    for (int i = 0; i < rod_beams.rows(); ++i) {
+//    for (int i = 0; i < rod_beams.rows(); ++i) {
-        rod_mid.row(i) = (rod_points.row(rod_beams(i, 0)) + rod_points.row(rod_beams(i, 1))) / 2;
+//        rod_mid.row(i) = (rod_points.row(rod_beams(i, 0)) + rod_points.row(rod_beams(i, 1))) / 2;
-    }
+//    }
 }
--- a/sdf_generate/src/rod_generate.cu
+++ b/sdf_generate/src/rod_generate.cu
@ -2,46 +2,91 @@
 // Created by dtouch on 23-5-23.
 //
-#include "../include/rod_generate.cuh"
+#include "rod_generate.cuh"
 #include "device_functions.h"
 #include "float.h"
 // Fills a pitched 3-D volume with, for every sample point, the distance to the nearest
 // beam centre line of a rod lattice minus the beam radius.
 //
 // Launch layout: 3-D grid of 3-D blocks, one thread per sample; threads whose index
 // falls outside the volume return immediately.
 //
 // beamData  - beamCnt x 2 endpoint indices, mapped with Eigen's default storage order
 // pointData - pointCnt x 3 coordinates, mapped the same way
 // sdf       - pitched allocation written by this kernel (one float per sample)
 // extent    - extent of sdf; extent->width is divided by floatSize to obtain the
 //             number of samples along x
 // floatSize - size in bytes of one SDF element
 // sampleMin / sampleMax - corners of the sampled box
 // radius    - beam radius subtracted from the centre-line distance
 //
 // All pointer arguments are dereferenced on the device.
 __global__
-void g_rod_generate(const RodCrystal *rodCrystal, cudaPitchedPtr sdf, const cudaExtent *extent,
+void
-                    const Eigen::Vector3f *sampleMin, const Eigen::Vector3f *sampleMax, int radius) {
+g_rod_generate(int *beamData, int beamCnt, float *pointData, int pointCnt, cudaPitchedPtr sdf, const cudaExtent *extent,
               size_t floatSize, const Eigen::Vector3f *sampleMin, const Eigen::Vector3f *sampleMax, float radius) {
    // 3-dim grid and 3-dim block
    Eigen::Map<Eigen::Matrix<int, Eigen::Dynamic, 2>> rod_beams(beamData, beamCnt, 2);
    Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, 3>> rod_points(pointData, pointCnt, 3);
    auto ix = blockIdx.x * blockDim.x + threadIdx.x;
    auto iy = blockIdx.y * blockDim.y + threadIdx.y;
    auto iz = blockIdx.z * blockDim.z + threadIdx.z;
 //    if (ix == 0 && iy == 0 && iz == 0) {
 //        for (int i = 0; i < beamCnt; ++i) {
 //            printf("%d, %d\n", rod_beams(i, 0), rod_beams(i, 1));
 //        }
 //        for(int i = 0; i < pointCnt; ++i) {
 //            printf("%f, %f, %f\n", rod_points(i, 0), rod_points(i, 1), rod_points(i, 2));
 //        }
 //    }
    // Guard the grid tail: the launch may cover more threads than samples.
    if (ix >= extent->width / floatSize || iy >= extent->height || iz >= extent->depth) {
        return;
    }
    auto x = sampleMin->x() +
-             static_cast<float>(ix) * (sampleMax->x() - sampleMin->x()) / static_cast<float>(extent->width);
+             static_cast<float>(ix) * (sampleMax->x() - sampleMin->x()) / static_cast<float>(extent->width / floatSize);
    auto y = sampleMin->y() +
             static_cast<float>(iy) * (sampleMax->y() - sampleMin->y()) / static_cast<float>(extent->height);
    auto z = sampleMin->z() +
             static_cast<float>(iz) * (sampleMax->z() - sampleMin->z()) / static_cast<float>(extent->depth);
 //    printf("%d, %d, %d\n", ix, iy, iz);
    // Get the pointer to the start of the row holding sdf element (ix, iy)
-    auto sdfPtr = reinterpret_cast<float *>((char *) sdf.ptr + iy * sdf.pitch + iz * sdf.pitch * extent->height);
+//    auto sdfPtr = reinterpret_cast<float *>((char *) sdf.ptr + iy * sdf.pitch + iz * sdf.pitch * extent->height);
    char *sdfPtr = (char *) sdf.ptr;
    size_t pitch = sdf.pitch;
    size_t slicePitch = pitch * extent->height;
    auto p = Eigen::Vector3f(x, y, z);
-    for (int i = 0; i < rodCrystal->rod_beams.rows(); ++i) {
+    char *slice = sdfPtr + iz * slicePitch;
-        auto a = Eigen::Matrix<float, 3, 1>(rodCrystal->rod_points.row(rodCrystal->rod_beams(i, 0)));
+    auto *row = (float *) (slice + iy * pitch);
-        auto b = Eigen::Matrix<float, 3, 1>(rodCrystal->rod_points.row(rodCrystal->rod_beams(i, 1)));
+    // row[ix] is initialized as the max float in GPU
    row[ix] = FLT_MAX;
 //    auto aTmp = Eigen::Vector3f(rod_points.row(rod_beams(2, 1)));
 //    printf("aTmp: (%f, %f, %f)\n", aTmp.x(), aTmp.y(), aTmp.z());
    for (int i = 0; i < rod_beams.rows(); ++i) {
        auto a = Eigen::Vector3f(rod_points.row(rod_beams(i, 0)));
        auto b = Eigen::Vector3f(rod_points.row(rod_beams(i, 1)));
        auto ab = b - a;
        auto ap = p - a;
        auto bp = p - b;
-        if (ab.dot(-bp) > 0 && ab.dot(ap) > 0) {
+        // p projects onto the interior of segment ab iff dot(ab, bp) < 0 and dot(ab, ap) > 0;
+        // then the distance is the perpendicular one, otherwise the nearer endpoint wins.
+        if (ab.x() * bp.x() + ab.y() * bp.y() + ab.z() * bp.z() < 0 &&
-            sdfPtr[ix] = (ap.cross(bp)).norm() / ab.norm();
+            ab.x() * ap.x() + ab.y() * ap.y() + ab.z() * ap.z() > 0) {
            row[ix] = min(row[ix], (ap.cross(bp)).norm() / ab.norm());
        } else {
-            sdfPtr[iz] = std::min(ap.norm(), bp.norm());
+            row[ix] = min(row[ix], min(ap.norm(), bp.norm()));
        }
    }
    // Turn the centre-line distance into a distance to the beam surface.
    row[ix] -= radius;
 }
-__host__ void
+// Samples the signed distance field of a rod lattice on a regular grid using the GPU.
+//
+// rodCrystal - lattice whose rod_points / rod_beams matrices are uploaded to the device
+// sampleCnt  - number of samples along x, y and z
+// sampleMin / sampleMax - corners of the sampled box
+// radius     - beam radius forwarded to g_rod_generate
+//
+// Prints the sampled values to stdout and returns the host buffer holding
+// sampleCnt.x() * sampleCnt.y() * sampleCnt.z() floats. The buffer is allocated with
+// malloc(); the caller owns it and must release it with free().
+__host__ float*
 h_rod_generate(const RodCrystal &rodCrystal, const Eigen::Vector3i &sampleCnt, const Eigen::Vector3f &sampleMin,
-               const Eigen::Vector3f &sampleMax, int radius) {
+               const Eigen::Vector3f &sampleMax, float radius) {
-    RodCrystal *d_rodCrystal;
+    int *d_beamData;
-    cudaMalloc(&d_rodCrystal, sizeof(RodCrystal));
+    size_t beamBytes = rodCrystal.rod_beams.rows() * rodCrystal.rod_beams.cols() * sizeof(int);
-    cudaMemcpy(d_rodCrystal, &rodCrystal, sizeof(RodCrystal), cudaMemcpyHostToDevice);
+    cudaMalloc(&d_beamData, beamBytes);
    cudaMemcpy(d_beamData, rodCrystal.rod_beams.data(), beamBytes, cudaMemcpyHostToDevice);
    float *d_pointData;
    size_t pointBytes = rodCrystal.rod_points.rows() * rodCrystal.rod_points.cols() * sizeof(float);
    cudaMalloc(&d_pointData, pointBytes);
    cudaMemcpy(d_pointData, rodCrystal.rod_points.data(), pointBytes, cudaMemcpyHostToDevice);
 //    RodCrystal *d_rodCrystal;
 //    cudaMalloc(&d_rodCrystal, sizeof(rodCrystal));
 //    cudaMemcpy(d_rodCrystal, &rodCrystal, sizeof(rodCrystal), cudaMemcpyHostToDevice);
 //    printf("size of rodCrystal: %lu; size of class RodCrystal: %lu\n", sizeof(rodCrystal), sizeof(RodCrystal));
 //    printf("size of rodCrystal.rod_points: %lu\n", sizeof(rodCrystal.rod_points));
 //    printf("size of rodCrystal.rod_beams: %lu\n", sizeof(rodCrystal.rod_beams));
 //    printf("size of rodCrystal.rod_points.row(0): %lu\n", sizeof(rodCrystal.rod_points.row(0)));
    int sampleCntAll = sampleCnt.x() * sampleCnt.y() * sampleCnt.z();
    float *h_sdf;
    h_sdf = (float *) malloc(sampleCnt.x() * sampleCnt.y() * sampleCnt.z() * sizeof(float));
    for (int i = 0; i < sampleCnt.x() * sampleCnt.y() * sampleCnt.z(); ++i) {
        h_sdf[i] = i;
    }
@ -58,24 +103,33 @@ h_rod_generate(const RodCrystal &rodCrystal, const Eigen::Vector3i &sampleCnt, c
    cudaMemcpy3D(&copyParams);
    Eigen::Vector3f *d_sampleMin;
-    cudaMalloc(&d_sampleMin, sizeof(Eigen::Vector3f));
+    cudaMalloc(&d_sampleMin, sizeof(sampleMin));
-    cudaMemcpy(d_sampleMin, &sampleMin, sizeof(Eigen::Vector3f), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_sampleMin, &sampleMin, sizeof(sampleMin), cudaMemcpyHostToDevice);
    Eigen::Vector3f *d_sampleMax;
-    cudaMalloc(&d_sampleMax, sizeof(Eigen::Vector3f));
+    cudaMalloc(&d_sampleMax, sizeof(sampleMax));
-    cudaMemcpy(d_sampleMax, &sampleMax, sizeof(Eigen::Vector3f), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_sampleMax, &sampleMax, sizeof(sampleMax), cudaMemcpyHostToDevice);
-
+
-    cudaExtent  *d_extent;
+    cudaExtent *d_extent;
-    cudaMalloc(&d_extent, sizeof(Eigen::Vector3i));
+    cudaMalloc(&d_extent, sizeof(extent));
-    cudaMemcpy(d_extent, &sampleCnt, sizeof(Eigen::Vector3i), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_extent, &extent, sizeof(extent), cudaMemcpyHostToDevice);
-
+
-    dim3 block(64, 64, 64);
+    // Fixed 8x8x8 = 512-thread blocks stay within the per-block thread limit for any
+    // sample count; the grid is sized by ceil-division so every sample gets a thread
+    // (the kernel bounds-checks the tail).
+    dim3 block(8, 8, 8);
    dim3 grid((sampleCnt.x() + block.x - 1) / block.x,
              (sampleCnt.y() + block.y - 1) / block.y,
              (sampleCnt.z() + block.z - 1) / block.z);
-    g_rod_generate<<<grid, block>>>(d_rodCrystal, d_sdf, d_extent, d_sampleMin, d_sampleMax, radius);
+    g_rod_generate<<<grid, block>>>(d_beamData, rodCrystal.rod_beams.rows(), d_pointData, rodCrystal.rod_points.rows(),
                                     d_sdf, d_extent, sizeof(float), d_sampleMin, d_sampleMax, radius);
+    // A bad launch configuration is only reported by cudaGetLastError(); faults inside the
+    // kernel surface at the next synchronizing call.
+    cudaError_t kernelErr = cudaGetLastError();
+    if (kernelErr == cudaSuccess) {
+        kernelErr = cudaDeviceSynchronize();
+    }
+    if (kernelErr != cudaSuccess) {
+        printf("g_rod_generate failed: %s\n", cudaGetErrorString(kernelErr));
+    }
 //    for (int i = 0; i < sampleCnt.x() * sampleCnt.y() * sampleCnt.z(); ++i) {
 //        h_sdf[i] = -i;
 //    }
    auto tmpBeam = rodCrystal.rod_beams(0, 1);
    printf("tmpBeam: %d\n", tmpBeam);
    printf("copy back to host\n");
    copyParams = {nullptr};
    copyParams.srcPtr = d_sdf;
@ -85,9 +139,29 @@ h_rod_generate(const RodCrystal &rodCrystal, const Eigen::Vector3i &sampleCnt, c
    copyParams.kind = cudaMemcpyDeviceToHost;
    cudaMemcpy3D(&copyParams);
-    cudaFree(d_rodCrystal);
+//    cudaFree(d_rodCrystal);
    cudaFree(d_sdf.ptr);
    cudaFree(d_sampleMin);
    cudaFree(d_sampleMax);
    cudaFree(d_extent);
    cudaFree(d_beamData);
    cudaFree(d_pointData);
    printf("[");
    for (int i = 0; i < sampleCnt.x(); i++) {
        printf("[");
        for (int j = 0; j < sampleCnt.y(); j++) {
            printf("[");
            for (int k = 0; k < sampleCnt.z(); k++) {
                printf("%f", h_sdf[i * sampleCnt.y() * sampleCnt.z() + j * sampleCnt.z() + k]);
                if (k != sampleCnt.z() - 1)
                    printf(",");
            }
            printf("]");
            if (j != sampleCnt.y() - 1)
                printf(",");
        }
        printf("]");
    }
    printf("]");
+    // The function is declared to return float*: hand the host buffer to the caller
+    // instead of freeing it and falling off the end (undefined behaviour).
+    return h_sdf;
 }
	`@ -0,0 +1,2 @@`
					`<?xml version="1.0" encoding="UTF-8"?>`
					`<module classpath="CMake" type="CPP_MODULE" version="4" />`