Lancelot / src / ops / project.cu
project.cu
Raw
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <iostream>
#include <stdio.h>
#include <curand.h>
#include <cmath>

#include <cuda.h>
#include <cub/util_allocator.cuh>
#include "cub/test/test_util.h"

#include "crystal/crystal.cuh"

#include "utils/gpu_utils.h"

using namespace std;


//---------------------------------------------------------------------
// Implements Projection Operator
// There are two variants: dot-product and sigmoid
//---------------------------------------------------------------------

bool                    g_verbose = false;  // Whether to display input/output to console
cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void project(float* in1, float* in2, float* out, int num_items)
{
  float items[ITEMS_PER_THREAD];
  float items2[ITEMS_PER_THREAD];
  float res[ITEMS_PER_THREAD];

  int tile_offset = blockIdx.x * TILE_SIZE;
  int num_tiles = (num_items + TILE_SIZE - 1) / TILE_SIZE;
  int num_tile_items = TILE_SIZE;

  if (blockIdx.x == num_tiles - 1) {
    num_tile_items = num_items - tile_offset;
  }

  BlockLoad<float, BLOCK_THREADS, ITEMS_PER_THREAD>(in1 + tile_offset, items, num_tile_items);
  BlockLoad<float, BLOCK_THREADS, ITEMS_PER_THREAD>(in2 + tile_offset, items2, num_tile_items);

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      res[ITEM] = 2*items[ITEM] + 3*items2[ITEM];
    }
  }

  BlockStore<float, BLOCK_THREADS, ITEMS_PER_THREAD>(out + tile_offset, res, num_tile_items);
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void projectSigmoid(float* in1, float* in2, float* out, int num_items)
{
  float items[ITEMS_PER_THREAD];
  float items2[ITEMS_PER_THREAD];
  float res[ITEMS_PER_THREAD];

  int tile_offset = blockIdx.x * TILE_SIZE;
  int num_tiles = (num_items + TILE_SIZE - 1) / TILE_SIZE;
  int num_tile_items = TILE_SIZE;

  if (blockIdx.x == num_tiles - 1) {
    num_tile_items = num_items - tile_offset;
  }

  BlockLoad<float, BLOCK_THREADS, ITEMS_PER_THREAD>(in1 + tile_offset, items, num_tile_items);
  BlockLoad<float, BLOCK_THREADS, ITEMS_PER_THREAD>(in2 + tile_offset, items2, num_tile_items);

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      res[ITEM] = 1.0f / (1.0f + expf(-2*items[ITEM] -3*items2[ITEM]));
    }
  }

  BlockStore<float, BLOCK_THREADS, ITEMS_PER_THREAD>(out + tile_offset, res, num_tile_items);
}


float projectGPU(float* in1, float* in2, float* out, int num_items) {
  SETUP_TIMING();

  float time_proj;
  int tile_items = 128*4;
  int num_blocks = (num_items + tile_items - 1)/tile_items;
  TIME_FUNC((project<128,4><<<num_blocks, 128>>>(in1, in2, out, num_items)), time_proj);

  return time_proj;
}

float projectSigmoidGPU(float* in1, float* in2, float* out, int num_items) {
  SETUP_TIMING();

  float time_proj;
  int tile_items = 128*4;
  int num_blocks = (num_items + tile_items - 1)/tile_items;
  TIME_FUNC((projectSigmoid<128,4><<<num_blocks, 128>>>(in1, in2, out, num_items)), time_proj);

  return time_proj;
}

/**
 * Main
 */
int main(int argc, char** argv)
{
  int num_items           = 1<<28;
  int num_trials          = 1;

  // Initialize command line
  CommandLineArgs args(argc, argv);
  args.GetCmdLineArgument("n", num_items);
  args.GetCmdLineArgument("t", num_trials);

  // Print usage
  if (args.CheckCmdLineFlag("help"))
  {
      printf("%s "
          "[--n=<input items>] "
          "[--t=<num trials>] "
          "[--device=<device-id>] "
          "[--v] "
          "\n", argv[0]);
      exit(0);
  }

  // Initialize device
  CubDebugExit(args.DeviceInit());

  // Allocate problem device arrays
  float *d_in1 = NULL;
  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in1, sizeof(float) * num_items));

  float *d_in2 = NULL;
  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in2, sizeof(float) * num_items));

  float  *d_out = NULL;
  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(float) * num_items));

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  curandGenerator_t generator;
  int seed = 0;
  curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT);
  curandSetPseudoRandomGeneratorSeed(generator,seed);
  curandGenerateUniform(generator, d_in1, num_items);
  curandGenerateUniform(generator, d_in2, num_items);

  float time_proj_gpu;
  float time_proj_sigmoid_gpu;  

  for (int t = 0; t < num_trials; t++) {
    time_proj_gpu = projectGPU(d_in1, d_in2, d_out, num_items);
    time_proj_sigmoid_gpu = projectSigmoidGPU(d_in1, d_in2, d_out, num_items);

    cout<< "{"
        << "\"time_proj_gpu\":" << time_proj_gpu
        << ",\"time_proj_sigmoid_gpu\":" << time_proj_sigmoid_gpu
        << "}" << endl;
  }

  // Cleanup
  if (d_in1) CubDebugExit(g_allocator.DeviceFree(d_in1));
  if (d_in2) CubDebugExit(g_allocator.DeviceFree(d_in2));
  if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));

  return 0;
}