Including C standard headers in CUDA NVRTC code

1.9k views Asked by At

I'm writing a CUDA kernel that is compiled at runtime using NVRTC (CUDA version 9.2 with NVRTC version 7.5), which needs the stdint.h header, in order to have the int32_t etc. types.

If I write the kernel source code without the include, it works correctly. For example the kernel

extern "C" __global__ void f() { ... }

Compiles to PTX code where f is defined as .visible .entry f.

But if the kernel source code is

#include <stdint.h>
extern "C" __global__ void f() { ... }

it reports A function without execution space annotations (__host__/__device__/__global__) is considered a host function, and host functions are not allowed in JIT mode. (also without extern "C").

Passing -default-device makes the PTX code .visible .func f, so the function cannot be called from the host.

Is there a way to include headers in the source code, and still have a __global__ entry function? Or alternately, a way to know which integer size convention is used on the by the NVRTC compiler, so that the int32_t etc. types can be manually defined?

Edit: Example program that shows the problem:

#include <cstdlib>
#include <string>
#include <vector>
#include <memory>
#include <cassert>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>

[[noreturn]] void fail(const std::string& msg, int code) {
    std::cerr << "error: " << msg << " (" << code << ')' << std::endl;
    std::exit(EXIT_FAILURE);
}


std::unique_ptr<char[]> compile_to_ptx(const char* program_source) {
    nvrtcResult rv;

    // create nvrtc program
    nvrtcProgram prog;
    rv = nvrtcCreateProgram(
        &prog,
        program_source,
        "program.cu",
        0,
        nullptr,
        nullptr
    );
    if(rv != NVRTC_SUCCESS) fail("nvrtcCreateProgram", rv);

    // compile nvrtc program
    std::vector<const char*> options = {
        "--gpu-architecture=compute_30"
    };
    //options.push_back("-default-device");
    rv = nvrtcCompileProgram(prog, options.size(), options.data());
    if(rv != NVRTC_SUCCESS) {
        std::size_t log_size;
        rv = nvrtcGetProgramLogSize(prog, &log_size);
        if(rv != NVRTC_SUCCESS) fail("nvrtcGetProgramLogSize", rv);

        auto log = std::make_unique<char[]>(log_size);
        rv = nvrtcGetProgramLog(prog, log.get());
        if(rv != NVRTC_SUCCESS) fail("nvrtcGetProgramLog", rv);
        assert(log[log_size - 1] == '\0');

        std::cerr << "Compile error; log:\n" << log.get() << std::endl;

        fail("nvrtcCompileProgram", rv);
    }

    // get ptx code
    std::size_t ptx_size;
    rv = nvrtcGetPTXSize(prog, &ptx_size);
    if(rv != NVRTC_SUCCESS) fail("nvrtcGetPTXSize", rv);

    auto ptx = std::make_unique<char[]>(ptx_size);
    rv = nvrtcGetPTX(prog, ptx.get());
    if(rv != NVRTC_SUCCESS) fail("nvrtcGetPTX", rv);
    assert(ptx[ptx_size - 1] == '\0');

    nvrtcDestroyProgram(&prog);

    return ptx;
}

const char program_source[] = R"%%%(
//#include <stdint.h>
extern "C" __global__ void f(int* in, int* out) {
    out[threadIdx.x] = in[threadIdx.x];
}
)%%%";

int main() {
    CUresult rv;

    // initialize CUDA
    rv = cuInit(0);
    if(rv != CUDA_SUCCESS) fail("cuInit", rv);

    // compile program to ptx
    auto ptx = compile_to_ptx(program_source);
    std::cout << "PTX code:\n" << ptx.get() << std::endl;
}

When //#include <stdint.h> in the kernel source is uncommented it no longer compiles. When //options.push_back("-default-device"); is uncommented it compiles but does not mark the function f as .entry.

CMakeLists.txt to compile it (needs CUDA driver API + NVRTC)

cmake_minimum_required(VERSION 3.4)
project(cudabug CXX)

find_package(CUDA REQUIRED)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED 14)

add_executable(cudabug cudabug.cc)
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
link_directories(${CUDA_LIBRARY_DIRS})
target_link_libraries(cudabug PUBLIC ${CUDA_LIBRARIES} nvrtc cuda)
2

There are 2 answers

0
talonmies On BEST ANSWER

[Preface: this is a very hacky answer, and is specific to the GNU toolchain (although I suspect the problem in the question is also specific to the GNU toolchain)].

It would appear that the problem here is with the GNU standard header features.h, which gets pulled into stdint.h and then winds up defining a lot of stub functions which have the default __host__ compilation space. This causes nvrtc to blow up. It also seems that the -default-device option will result in a resolved glibC compiler feature set which makes the whole nvrtc compiler fail.

You can defeat this (in a very hacky way) by predefining a feature set for the standard library which excludes all the host functions. Changing your JIT kernel code to

const char program_source[] = R"%%%(
#define __ASSEMBLER__
#define __extension__
#include <stdint.h>
extern "C" __global__ void f(int32_t* in, int32_t* out) {
    out[threadIdx.x] = in[threadIdx.x];
}
)%%%";

got me this:

$ nvcc -std=c++14 -ccbin=g++-7 jit_header.cu -o jitheader -lnvrtc -lcuda
$ ./jitheader 
PTX code:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-24330188
// Cuda compilation tools, release 9.2, V9.2.148
// Based on LLVM 3.4svn
//

.version 6.2
.target sm_30
.address_size 64

    // .globl   f

.visible .entry f(
    .param .u64 f_param_0,
    .param .u64 f_param_1
)
{
    .reg .b32   %r<3>;
    .reg .b64   %rd<8>;


    ld.param.u64    %rd1, [f_param_0];
    ld.param.u64    %rd2, [f_param_1];
    cvta.to.global.u64  %rd3, %rd2;
    cvta.to.global.u64  %rd4, %rd1;
    mov.u32     %r1, %tid.x;
    mul.wide.u32    %rd5, %r1, 4;
    add.s64     %rd6, %rd4, %rd5;
    ld.global.u32   %r2, [%rd6];
    add.s64     %rd7, %rd3, %rd5;
    st.global.u32   [%rd7], %r2;
    ret;
}

Big caveat: This worked on the glibC system I tried it on. It probably won't work with other toolchains or libC implementations (if, indeed, they have this problem).

0
einpoklum On

Another alternative is creating stand-ins, for some of the standard library headers. NVRTC's API supports your specifying header file contents as strings, associated with header names - before it will go looking through the filesystem for you. This approach is adopted in NVIDIA JITify, and I've adopted it myself working on something else which may or may not be released.

The easy way to do this You can just take the JITify header stubs for stdint.h, limits.h , from here, which I'm also attaching since it's not very long. Alternatively, you can generate this stub yourself to make sure you're not missing out on anything that's relevant from the standard. Here's how that works:

  1. Start with your stdint.h file (or cstdint file as the case may be);

  2. For each include directive in the file (and recursively, for each include in an include etc):

    2.1 Figure out whether you can skip including the file altogether (possibly by making a few defines which are known to hold on the GPU).

    2.2 If you're not sure you can skip the file - include it entirely and recurse to (2.), or keep it as its own separate header (and apply the whole process in (1.) to it).

  3. You now have a header file which only includes device-safe header files (or none at all)

  4. Partially-preprocess the file, dropping everything that won't be used on a GPU Remove the lines which might be problematic on a GPU (e.g. #pragma's), and add __device__ __host__ or just __host__ as appropriate to each function declaration.

Important note: Doing this requires paying attention to licenses and copyrights. You would be creating a "derivative work" of glibc and/or JITify and/or StackOverflow contributions etc.


Now, the stdint.h and limits.h from NVIDIA JITify I promised. I've adapted them to not have namespaces:

stdint.h:

#pragma once
#include <limits.h>
typedef signed char      int8_t;
typedef signed short     int16_t;
typedef signed int       int32_t;
typedef signed long long int64_t;
typedef signed char      int_fast8_t;
typedef signed short     int_fast16_t;
typedef signed int       int_fast32_t;
typedef signed long long int_fast64_t;
typedef signed char      int_least8_t;
typedef signed short     int_least16_t;
typedef signed int       int_least32_t;
typedef signed long long int_least64_t;
typedef signed long long intmax_t;
typedef signed long      intptr_t; //optional
typedef unsigned char      uint8_t;
typedef unsigned short     uint16_t;
typedef unsigned int       uint32_t;
typedef unsigned long long uint64_t;
typedef unsigned char      uint_fast8_t;
typedef unsigned short     uint_fast16_t;
typedef unsigned int       uint_fast32_t;
typedef unsigned long long uint_fast64_t;
typedef unsigned char      uint_least8_t;
typedef unsigned short     uint_least16_t;
typedef unsigned int       uint_least32_t;
typedef unsigned long long uint_least64_t;
typedef unsigned long long uintmax_t;
#define INT8_MIN    SCHAR_MIN
#define INT16_MIN   SHRT_MIN
#if defined _WIN32 || defined _WIN64
#define WCHAR_MIN   SHRT_MIN
#define WCHAR_MAX   SHRT_MAX
typedef unsigned long long uintptr_t; //optional
#else
#define WCHAR_MIN   INT_MIN
#define WCHAR_MAX   INT_MAX
typedef unsigned long      uintptr_t; //optional
#endif
#define INT32_MIN   INT_MIN
#define INT64_MIN   LLONG_MIN
#define INT8_MAX    SCHAR_MAX
#define INT16_MAX   SHRT_MAX
#define INT32_MAX   INT_MAX
#define INT64_MAX   LLONG_MAX
#define UINT8_MAX   UCHAR_MAX
#define UINT16_MAX  USHRT_MAX
#define UINT32_MAX  UINT_MAX
#define UINT64_MAX  ULLONG_MAX
#define INTPTR_MIN  LONG_MIN
#define INTMAX_MIN  LLONG_MIN
#define INTPTR_MAX  LONG_MAX
#define INTMAX_MAX  LLONG_MAX
#define UINTPTR_MAX ULONG_MAX
#define UINTMAX_MAX ULLONG_MAX
#define PTRDIFF_MIN INTPTR_MIN
#define PTRDIFF_MAX INTPTR_MAX
#define SIZE_MAX    UINT64_MAX

limits.h:

#pragma once
#if defined _WIN32 || defined _WIN64
 #define __WORDSIZE 32
#else
 #if defined __x86_64__ && !defined __ILP32__
  #define __WORDSIZE 64
 #else
  #define __WORDSIZE 32
 #endif
#endif
#define MB_LEN_MAX  16
#define CHAR_BIT    8
#define SCHAR_MIN   (-128)
#define SCHAR_MAX   127
#define UCHAR_MAX   255
enum {
  _JITIFY_CHAR_IS_UNSIGNED = (char)-1 >= 0,
  CHAR_MIN = _JITIFY_CHAR_IS_UNSIGNED ? 0 : SCHAR_MIN,
  CHAR_MAX = _JITIFY_CHAR_IS_UNSIGNED ? UCHAR_MAX : SCHAR_MAX,
};
#define SHRT_MIN    (-32768)
#define SHRT_MAX    32767
#define USHRT_MAX   65535
#define INT_MIN     (-INT_MAX - 1)
#define INT_MAX     2147483647
#define UINT_MAX    4294967295U
#if __WORDSIZE == 64
 # define LONG_MAX  9223372036854775807L
#else
 # define LONG_MAX  2147483647L
#endif
#define LONG_MIN    (-LONG_MAX - 1L)
#if __WORDSIZE == 64
 #define ULONG_MAX  18446744073709551615UL
#else
 #define ULONG_MAX  4294967295UL
#endif
#define LLONG_MAX  9223372036854775807LL
#define LLONG_MIN  (-LLONG_MAX - 1LL)
#define ULLONG_MAX 18446744073709551615ULL