cudaNvSci

Software Communication Interfaces (NvSci)

Covers mechanisms for device-memory communication and interop across APIs and processes; I'm not yet clear on the details.

cudaOpenMP

This sample demonstrates using OpenMP to put multiple GPUs to work. OpenMP provides multithreaded concurrency through simple pragma directives; each CPU thread launches kernels on a different GPU, giving concurrent multi-GPU operation.

Core logic:

    //omp_set_num_threads(num_gpus);  // create as many CPU threads as there are CUDA device
    omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there are CUDA devices
    #pragma omp parallel
    {
        unsigned int cpu_thread_id = omp_get_thread_num();
        unsigned int num_cpu_threads = omp_get_num_threads();

        // set and check the CUDA device for this CPU thread
        int gpu_id = -1;
        checkCudaErrors(cudaSetDevice(cpu_thread_id % num_gpus));   // "% num_gpus" allows more CPU threads than GPU devices
        checkCudaErrors(cudaGetDevice(&gpu_id));
        printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);

        int *d_a = 0;   // pointer to memory on the device associated with this CPU thread
        int *sub_a = a + cpu_thread_id * n / num_cpu_threads;   // pointer to this CPU thread's portion of data
        unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
        dim3 gpu_threads(128);  // 128 threads per block
        dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));

        checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel));
        checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
        checkCudaErrors(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
        kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);

        checkCudaErrors(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
        checkCudaErrors(cudaFree(d_a));

    }

The sample also illustrates the core OpenMP usage pattern and API:

    #include <omp.h>

    int num_cpus = omp_get_num_procs();  // number of CPU cores available

    omp_set_num_threads(num_gpus);       // request one OpenMP thread per GPU
    #pragma omp parallel
    {
        unsigned int cpu_thread_id = omp_get_thread_num();     // this thread's index in the team
        unsigned int num_cpu_threads = omp_get_num_threads();  // total threads in the team
    }
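One build detail worth noting: nvcc does not enable OpenMP by itself, so the flag has to be forwarded to the host compiler, e.g. (assuming gcc as the host compiler):

    nvcc -Xcompiler -fopenmp cudaOpenMP.cu -o cudaOpenMP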

cudaTensorCoreGemm

Requires CUDA 9.0 or later and a GPU of compute capability sm_70 (Volta) or higher; the sample performs GEMM on Tensor Cores through the nvcuda::wmma API.
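For reference, a minimal sketch of the warp-level WMMA pattern that a Tensor Core GEMM is built around: a single warp computing one 16x16x16 tile. The kernel name here is made up for illustration; the real sample adds tiling, shared-memory staging, and bounds handling.

    #include <mma.h>
    using namespace nvcuda;

    // One warp computes a single 16x16 tile of C = A * B.
    // A and B are half precision; accumulation is in float.
    __global__ void wmma_tile_gemm(const half *a, const half *b, float *c)
    {
        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> b_frag;
        wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;

        wmma::fill_fragment(c_frag, 0.0f);
        wmma::load_matrix_sync(a_frag, a, 16);           // leading dimension 16
        wmma::load_matrix_sync(b_frag, b, 16);
        wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);  // runs on Tensor Cores
        wmma::store_matrix_sync(c, c_frag, 16, wmma::mem_row_major);
    }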

fp16ScalarProduct

An example of half-precision arithmetic; the fp16 data type in CUDA is half (defined in cuda_fp16.h).

CUDA also has the type half2, a vector type packing two half elements. CUDA defines many similar built-in vector types: int2, int3, float2, float3, and so on.
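A small sketch of these types in device code (the intrinsic names are from cuda_fp16.h; half arithmetic needs sm_53 or later):

    #include <cuda_fp16.h>

    __global__ void half2_demo(half2 *out)
    {
        half2 h = __floats2half2_rn(1.5f, 2.5f);  // pack two floats into one half2
        float lo = __low2float(h);                // 1.5f, the low element
        float hi = __high2float(h);               // 2.5f, the high element
        out[0] = __hadd2(h, h);                   // elementwise add: (3.0, 5.0)
        float2 f = make_float2(lo, hi);           // analogous built-in float vector
        (void)f;
    }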

The sample shows an uncommon use of sizeof: size*sizeof*vec[i], which means the same as size*sizeof(*vec[i]). This works because sizeof is an operator, not a function: applied to an expression it needs no parentheses; only a type name, as in sizeof(int), requires them. The first * is multiplication and the second dereferences vec[i], so the tokens can legally be run together.
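A quick illustration; the array vec here just mirrors the sample's array of half2 pointers:

    half2 *vec[2];
    size_t size = 1024;

    size_t n1 = size * sizeof *vec[0];   // operand is an expression: no parentheses needed
    size_t n2 = size * sizeof(*vec[0]);  // identical meaning, parenthesized
    size_t n3 = size * sizeof(half2);    // a type name always needs parentheses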

The kernels:

    __global__ void scalarProductKernel_intrinsics(
            half2 const * const a,
            half2 const * const b,
            float * const results,
            size_t const size
            )
    {
        const int stride = gridDim.x*blockDim.x;
        __shared__ half2 shArray[NUM_OF_THREADS];

        shArray[threadIdx.x] = __float2half2_rn(0.f);
        half2 value = __float2half2_rn(0.f);

        // grid-stride loop over all half2 elements
        for (int i = threadIdx.x + blockDim.x*blockIdx.x; i < size; i+=stride)
        {
            value = __hfma2(a[i], b[i], value);
        }

        shArray[threadIdx.x] = value;
        __syncthreads();
        reduceInShared_intrinsics(shArray);

        if (threadIdx.x == 0)
        {
            half2 result = shArray[0];
            float f_result = __low2float(result) + __high2float(result);
            results[blockIdx.x] = f_result;
        }
    }

    __global__ void scalarProductKernel_native(
            half2 const * const a,
            half2 const * const b,
            float * const results,
            size_t const size
            )
    {
        const int stride = gridDim.x*blockDim.x;
        __shared__ half2 shArray[NUM_OF_THREADS];

        half2 value(0.f, 0.f);
        shArray[threadIdx.x] = value;

        for (int i = threadIdx.x + blockDim.x*blockIdx.x; i < size; i+=stride)
        {
            value = a[i] * b[i] + value;
        }

        shArray[threadIdx.x] = value;
        __syncthreads();
        reduceInShared_native(shArray);

        if (threadIdx.x == 0)
        {
            half2 result = shArray[0];
            float f_result = (float)result.y + (float)result.x;
            results[blockIdx.x] = f_result;
        }
    }

The inputs are two arrays of half2 elements. Each half2 can be viewed as a point in the 2D plane, so each array is a sequence of such points. The computation takes the dot product of each corresponding pair of points and sums them all, which is just the scalar product of the two flattened fp16 vectors.

Two computation paths are shown: a native one using ordinary arithmetic operators, and one using the half2 intrinsics. The native path works because cuda_fp16.h overloads the arithmetic operators for half and half2 in device code, so expressions like a[i] * b[i] + value can be written directly.
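A minimal side-by-side sketch of the two styles (assuming sm_53+ so the operator overloads from cuda_fp16.h are available; the function names are made up for illustration):

    #include <cuda_fp16.h>

    // Operator form: uses the *, + overloads cuda_fp16.h defines for half2.
    // Whether the multiply and add get fused is left to the compiler.
    __device__ half2 fma_native(half2 a, half2 b, half2 c)
    {
        return a * b + c;
    }

    // Intrinsic form: an explicit fused multiply-add on both lanes at once.
    __device__ half2 fma_intrinsics(half2 a, half2 b, half2 c)
    {
        return __hfma2(a, b, c);
    }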