Proper creation of Optix 7.1 TLAS Instance Acceleration Structure

698 views Asked by At

I am trying to figure out how to build a TLAS properly. Using the samples that came with OptiX 7.1 and Ingo Wald's Optix 7 samples, started with a triangle (just the BLAS that holds the geometry) and it works fine (moved the triangle sample of the SDK to Wald's example framework). Next I introduced a TLAS with one instance (that is the BLAS from before) and used that TLAS in the shader but I am not getting a single hit. What am I not doing correctly?

OptixTraversableHandle SampleRenderer::buildAccelerator() {

    OptixTraversableHandle geometryAcceleratorHandle{ 0 };
    CUdeviceptr dAcceleratorBuffer;
    OptixAccelBuildOptions acceleratorOptions{};
    acceleratorOptions.buildFlags = OPTIX_BUILD_FLAG_NONE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
    acceleratorOptions.operation = OPTIX_BUILD_OPERATION_BUILD;

    //Triangle build input: simple list of three vertices
    const std::array<float3, 3> vertices{ { { 0.33f, 0.33f, 0.0f },{  0.33f, -0.33f, 0.0f },{  0.66f,  0.33f, 0.0f }} };
    const size_t verticesSize = sizeof(float3) * vertices.size();
    CUdeviceptr dVertices{ 0ull };
    CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dVertices), verticesSize));
    CUDA_CHECK(Memcpy(reinterpret_cast<void*>(dVertices), vertices.data(), verticesSize, cudaMemcpyHostToDevice));

    //Build input is a simple list of non-indexed triangle vertices
    const uint32_t triangleInputFlags{ OPTIX_GEOMETRY_FLAG_NONE };
    OptixBuildInput triangleInput{};
    triangleInput.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
    triangleInput.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
    triangleInput.triangleArray.numVertices = static_cast<uint32_t>(vertices.size());
    triangleInput.triangleArray.vertexBuffers = &dVertices;
    triangleInput.triangleArray.flags = &triangleInputFlags;
    triangleInput.triangleArray.numSbtRecords = 1u;

    OptixAccelBufferSizes blasBufferSizes;
    OPTIX_CHECK(optixAccelComputeMemoryUsage(optixContext, &acceleratorOptions, &triangleInput, 1, &blasBufferSizes));
    CUdeviceptr dTempBuffer;
    CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dTempBuffer), blasBufferSizes.tempSizeInBytes));

    CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dAcceleratorBuffer), blasBufferSizes.outputSizeInBytes));

    OPTIX_CHECK(
        optixAccelBuild(
            optixContext,
            0,
            &acceleratorOptions,
            &triangleInput,
            1,
            dTempBuffer,
            blasBufferSizes.tempSizeInBytes,
            dAcceleratorBuffer,
            blasBufferSizes.outputSizeInBytes,
            &geometryAcceleratorHandle,
            nullptr,
            0)
        );

    CUDA_CHECK(Free((void*)dTempBuffer));
    CUDA_CHECK(Free((void*)dVertices));

    return geometryAcceleratorHandle;
}

Instead of using the return value of the function above, I feed it to the TLAS creation function below and use its output handle in the shaders:

OptixTraversableHandle SampleRenderer::buildInstanceAccelerator(const OptixTraversableHandle& geoHandle){
    OptixInstance optixInstance = { { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f } };
    optixInstance.flags = OPTIX_INSTANCE_FLAG_NONE;
    optixInstance.instanceId = 0u;
    optixInstance.sbtOffset = 0u;
    optixInstance.visibilityMask = 1u;
    optixInstance.traversableHandle = geoHandle;
    CUdeviceptr dOptixInstance;
    CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dOptixInstance), sizeof(OptixInstance)));
    CUDA_CHECK(Memcpy(reinterpret_cast<void*>(dOptixInstance), &optixInstance, sizeof(OptixInstance), cudaMemcpyHostToDevice));

    OptixAabb optixAabb[2]{
        { -1.5f, -1.0f, -0.5f,
          -0.5f,  0.5f,  0.5f  },
        {  0.5f,  0.0f, -0.01f,
           1.5f,  1.5f,  0.01f } };
    CUdeviceptr  dAabb;
    CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dAabb), 2 * sizeof(OptixAabb)));
    CUDA_CHECK(Memcpy(reinterpret_cast<void*>(dAabb), optixAabb, 2 * sizeof(OptixAabb), cudaMemcpyHostToDevice));
    OptixBuildInput instanceBuildInput{};
    instanceBuildInput.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
    instanceBuildInput.instanceArray.instances = dOptixInstance;
    instanceBuildInput.instanceArray.numInstances = 1u;
    instanceBuildInput.instanceArray.aabbs = dAabb;
    instanceBuildInput.instanceArray.numAabbs =1u;

    OptixAccelBuildOptions acceleratorBuildOptions{};
    acceleratorBuildOptions.buildFlags = OPTIX_BUILD_FLAG_NONE;
    acceleratorBuildOptions.operation = OPTIX_BUILD_OPERATION_BUILD;

    OptixAccelBufferSizes acceleratorBufferSizes;
    OPTIX_CHECK(optixAccelComputeMemoryUsage(
        optixContext,
        &acceleratorBuildOptions,
        &instanceBuildInput,
        1u,
        &acceleratorBufferSizes));
    CUdeviceptr dTempBuffer;
    CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dTempBuffer), acceleratorBufferSizes.tempSizeInBytes));
    CUdeviceptr dInstanceAcceleratorBuffer;
    CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dInstanceAcceleratorBuffer), acceleratorBufferSizes.outputSizeInBytes));
    
    OptixTraversableHandle instanceAcceleratorHandle{ 0 };
    OPTIX_CHECK(optixAccelBuild(
        optixContext,
        0,
        &acceleratorBuildOptions,
        &instanceBuildInput,
        1,
        dTempBuffer,
        acceleratorBufferSizes.tempSizeInBytes,
        dInstanceAcceleratorBuffer,
        acceleratorBufferSizes.outputSizeInBytes,
        &instanceAcceleratorHandle,
        nullptr,
        0));
    return instanceAcceleratorHandle;
}

For reference the shader code (it works just fine when the OptixTraversableHandle comes from the first function above:

namespace osc {

extern "C" __constant__ LaunchParams optixLaunchParams;
//Single ray type
enum { SURFACE_RAY_TYPE = 0, RAY_TYPE_COUNT };

static __forceinline__ __device__ void* unpackPointer(uint32_t i0, uint32_t i1) {
    const uint64_t uptr = static_cast<uint64_t>(i0) << 32 | i1;
    void* ptr = reinterpret_cast<void*>(uptr);
    return ptr;
}

static __forceinline__ __device__ void  packPointer(void* ptr, uint32_t& i0, uint32_t& i1) {
    const uint64_t uptr = reinterpret_cast<uint64_t>(ptr);
    i0 = uptr >> 32;
    i1 = uptr & 0x00000000ffffffff;
}

template<typename T>    static __forceinline__ __device__ T* getPRD() {
    const uint32_t u0 = optixGetPayload_0();
    const uint32_t u1 = optixGetPayload_1();
    return reinterpret_cast<T*>(unpackPointer(u0, u1));
}

static __forceinline__ __device__ void trace(
    OptixTraversableHandle handle,
    vec3f                 ray_origin,
    vec3f                 ray_direction,
    float                  tmin,
    float                  tmax,
    float3* prd) {
    unsigned int p0, p1, p2;
    p0 = float_as_int(prd->x);
    p1 = float_as_int(prd->y);
    p2 = float_as_int(prd->z);
    optixTrace(
        handle,
        ray_origin,
        ray_direction,
        tmin,
        tmax,
        0.0f,                // rayTime
        OptixVisibilityMask(1),
        OPTIX_RAY_FLAG_DISABLE_ANYHIT,//OPTIX_RAY_FLAG_NONE,
        0,                   // SBT offset
        0,                   // SBT stride
        0,                   // missSBTIndex
        p0, 
        p1, 
        p2);
    prd->x = int_as_float(p0);
    prd->y = int_as_float(p1);
    prd->z = int_as_float(p2);
}

static __forceinline__ __device__ void setPayload(float3 p) {
    optixSetPayload_0(float_as_int(p.x));
    optixSetPayload_1(float_as_int(p.y));
    optixSetPayload_2(float_as_int(p.z));
}

static __forceinline__ __device__ float3 getPayload() {
    return make_float3(int_as_float(optixGetPayload_0()), int_as_float(optixGetPayload_1()), int_as_float(optixGetPayload_2()));
}

extern "C" __global__ void __closesthit__radiance() {
    //When built-in triangle intersection is used, a number of fundamental
    //attributes are provided by the OptiX API, including barycentric coordinates
    const float2 barycentricCoordinates = optixGetTriangleBarycentrics();
    setPayload(make_float3(barycentricCoordinates.x, barycentricCoordinates.y, 1.f - barycentricCoordinates.x - barycentricCoordinates.y));
}

extern "C" __global__ void __anyhit__radiance() {  }

extern "C" __global__ void __intersection__radiance() { }

extern "C" __global__ void __miss__radiance() {
    MissData* missData = reinterpret_cast<MissData*>(optixGetSbtDataPointer());
    float3 payload = getPayload();//Why???
    setPayload(missData->backgroundColor);
}

extern "C" __global__ void __raygen__renderFrame() {
    // compute a test pattern based on pixel ID
    const int ix = optixGetLaunchIndex().x;
    const int iy = optixGetLaunchIndex().y;

    const auto& camera = optixLaunchParams.camera;

    // our per-ray data for this example. what we initialize it to
    // won't matter, since this value will be overwritten by either
    // the miss or hit program, anyway
    float3 pixelColorPRD = { 0.5f, 0.5f, 0.5f };
    // normalized screen plane position, in [0,1]^2
    const vec2f screen(vec2f(ix + .5f, iy + .5f) / vec2f(optixLaunchParams.frame.size));

    // generate ray direction
    vec3f rayDir = normalize(camera.direction + (screen.x - 0.5f) * camera.horizontal + (screen.y - 0.5f) * camera.vertical);

    trace(optixLaunchParams.traversable,
        camera.position,
        rayDir,
        0.f,    // tmin
        1e16f,  // tmax
        &pixelColorPRD);

    const int r = int(255.99f * pixelColorPRD.x);
    const int g = int(255.99f * pixelColorPRD.y);
    const int b = int(255.99f * pixelColorPRD.z);

    const uint32_t rgba = 0xff000000 | (r << 0) | (g << 8) | (b << 16);

    const uint32_t fbIndex = ix + iy * optixLaunchParams.frame.size.x;
    optixLaunchParams.frame.colorBuffer[fbIndex] = rgba;
}

}

2

There are 2 answers

0
Vectorizer On BEST ANSWER
  1. OptixPipelineCompileOptions traversableGraphFlags must contain the flag OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING
  2. maxTraversableGraphDepth = 2 in optixPipelineSetStackSize()

resolved the problem.

0
Ingo Wald On

The code above looks reasonable, so first guess would be to check the flags with which the context and pipeline have been created; in particular the instance level flag of the pipeline creation flags: The sample you originally copy-and-pasted from did not use instancing, so almost certainly has its pipeline set to 'no instancing' (because if it had instancing turned on it would expect to see an instance!).

Yet if the actual instancing level of the TLASes/BLASes you're creating doesn't match what you used to create the pipeline you won't find any hits.

I'd also suggest to look at some of the code in my more recent OWL project, also on github - that also does instancing, multi-level instances, etcpp.