/*************************************************************************
 * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/


#ifndef COMMON_KERNEL_H_
#define COMMON_KERNEL_H_

#include <cstdio>
#include <cstdint>

#include <cuda_runtime.h>

// BAR macro and helpers
#define WARP_SIZE 32
#define ROUNDUP(x, y) \
    (((((x) + (y) - 1) / (y))) * (y))
#define DIVUP(x, y) \
    (((x)+(y)-1)/(y))
#define BAR_EXEC(type, barid, nthreads) \
    asm("bar." #type " " #barid ", " #nthreads ";\n\t")
#define BAR_EXPAND(type, barid, nthreads) \
    BAR_EXEC(type, barid, (nthreads))

// Named barrier macro.
// Expands to asm("bar.type barid, nthreads") where
// nthreads has been rounded up to WARP_SIZE.
#define BAR(type, barid, nthreads) \
    BAR_EXPAND(type, barid, ROUNDUP(nthreads, WARP_SIZE))
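
// A hypothetical use inside a kernel:
//   BAR(sync, 1, 160);
// waits on named barrier 1 for 160 threads; counts that are not a multiple of
// WARP_SIZE are rounded up before being emitted into the bar instruction.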

template<typename T> inline __device__
T vFetch(const volatile T* ptr) {
  return *ptr;
}

template<typename T> inline __device__
void vStore(volatile T* ptr, const T val) {
  *ptr = val;
}
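
// vFetch/vStore go through a volatile pointer so that every call performs an
// actual memory access, which matters for flags and counters updated by a peer
// GPU or by the host. For a hypothetical volatile int* flagPtr:
//
//   int ready = vFetch(flagPtr);   // always re-reads memory
//   vStore(flagPtr, ready + 1);    // always writes memory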

#ifdef CUDA_HAS_HALF
#if CUDART_VERSION < 9000
template<> inline __device__
half vFetch<half>(const volatile half* ptr) {
  half r;
  r.x = ptr->x;
  return r;
}
template<> inline __device__
void vStore<half>(volatile half* ptr, const half val) {
  ptr->x = val.x;
}
#else
template<> inline __device__
half vFetch<half>(const volatile half* ptr) {
  return *((half*)ptr);
}
template<> inline __device__
void vStore<half>(volatile half* ptr, const half val) {
  *((half*)ptr) = val;
}
#endif
#endif

__device__ unsigned int spinct;

// Spin wait until func evaluates to true
template<typename FUNC>
__device__ inline void Wait(const FUNC& func) {
  while (!func()) {
    // waste time
    atomicInc(&spinct, 10);
  }
}
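
// Illustrative sketch: Wait is given a small predicate that re-reads state on
// every iteration, e.g. with a hypothetical volatile int* flagPtr and target
// step value:
//
//   Wait([=] { return vFetch(flagPtr) >= step; });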

typedef uint64_t PackType;

// unpack x and y to elements of type T and apply FUNC to each element
template<class FUNC, typename T>
struct MULTI {
  __device__ PackType operator()(const PackType x, const PackType y) const;
};

template<class FUNC>
struct MULTI<FUNC, char> {
  static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
      "PackType must be twice the size of uint32_t.");
  union converter {
    PackType storage;
    struct {
      uint32_t a, b;
    };
  };

  __device__ PackType operator()(const PackType x, const PackType y) const {
    converter cx, cy, cr;
    cx.storage = x;
    cy.storage = y;

    // for char, we do these as vector ops
    cr.a = FUNC()(cx.a, cy.a);
    cr.b = FUNC()(cx.b, cy.b);

    return cr.storage;
  }
};

template<class FUNC>
struct MULTI<FUNC, int> {
  static_assert(sizeof(PackType) == 2 * sizeof(int),
      "PackType must be twice the size of int.");
  union converter {
    PackType storage;
    struct {
      int a, b;
    };
  };

  __device__ PackType operator()(const PackType x, const PackType y) const {
    converter cx, cy, cr;
    cx.storage = x;
    cy.storage = y;

    cr.a = FUNC()(cx.a, cy.a);
    cr.b = FUNC()(cx.b, cy.b);

    return cr.storage;
  }
};

#ifdef CUDA_HAS_HALF
template<class FUNC>
struct MULTI<FUNC, half> {
  static_assert(sizeof(PackType) == 4 * sizeof(half),
      "PackType must be four times the size of half.");

  struct PackHalf2 {
    half2 a, b;
  };

  __device__ PackType operator()(const PackType x, const PackType y) const {
    struct PackHalf2 cx, cy, cr;
    cx = *(reinterpret_cast<const struct PackHalf2*>(&x));
    cy = *(reinterpret_cast<const struct PackHalf2*>(&y));

    cr.a = FUNC()(cx.a, cy.a);
    cr.b = FUNC()(cx.b, cy.b);

    return *(reinterpret_cast<PackType*>(&cr));
  }
};
#endif

template<class FUNC>
struct MULTI<FUNC, float> {
  static_assert(sizeof(PackType) == 2 * sizeof(float),
      "PackType must be twice the size of float.");
  union converter {
    PackType storage;
    struct {
      float a, b;
    };
  };

  __device__ PackType operator()(const PackType x, const PackType y) const {
    converter cx, cy, cr;
    cx.storage = x;
    cy.storage = y;

    cr.a = FUNC()(cx.a, cy.a);
    cr.b = FUNC()(cx.b, cy.b);

    return cr.storage;
  }
};

template<class FUNC>
struct MULTI<FUNC, double> {
  static_assert(sizeof(PackType) == sizeof(double),
      "PackType must be the same size as double.");
  __device__ PackType operator()(const PackType x, const PackType y) const {
    double rv = FUNC()(__longlong_as_double(x), __longlong_as_double(y));
    return __double_as_longlong(rv);
  }
};

template<class FUNC>
struct MULTI<FUNC, unsigned long long> {
  static_assert(sizeof(PackType) == sizeof(unsigned long long),
      "PackType must be the same size as unsigned long long.");
  __device__ PackType operator()(const PackType x, const PackType y) const {
    unsigned long long rv = FUNC()(x, y);
    return rv;
  }
};

template<class FUNC>
struct MULTI<FUNC, long long> {
  static_assert(sizeof(PackType) == sizeof(long long),
      "PackType must be the same size as long long.");
  __device__ PackType operator()(const PackType x, const PackType y) const {
    long long rv = FUNC()((long long)x, (long long)y);
    return rv;
  }
};
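
// Illustrative sketch: MULTI applies an element-wise functor across the values
// packed into one 64-bit PackType. With a hypothetical functor such as
//
//   template<typename T>
//   struct FuncSum { __device__ T operator()(const T x, const T y) const { return x + y; } };
//
// two packed floats are reduced in a single call:
//
//   PackType r = MULTI<FuncSum<float>, float>()(x, y);  // sums both float lanes
//
// The reduction functors actually used by the collectives are defined outside
// this header.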

// Reduce (or just copy) one element at index idx: read src0[idx], combine it
// with src1[idx] via FUNC if TWO_INPUTS, then write the result to dest0[idx]
// (and to dest1[idx] if TWO_OUTPUTS).
template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
__device__ inline void ReduceCopy(
    const volatile T * __restrict__ const src0,
    const volatile T * __restrict__ const src1,
    volatile T * __restrict__ const dest0,
    volatile T * __restrict__ const dest1, const int idx) {
  T val = vFetch(src0+idx);
  if (TWO_INPUTS) {
    val = FUNC()(val, vFetch(src1+idx));
  }
  vStore(dest0+idx, val);
  if (TWO_OUTPUTS) {
    vStore(dest1+idx, val);
  }
}

// Same as ReduceCopy, but on 64-bit PackType words: each thread handles UNROLL
// words at indices offset + u*THREADS, using MULTI<FUNC, T> to apply FUNC
// element-wise within each packed word.
template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL, int THREADS>
__device__ inline void ReduceCopy64b(
    const volatile T * __restrict__ const src0,
    const volatile T * __restrict__ const src1,
    volatile T * __restrict__ const dest0,
    volatile T * __restrict__ const dest1, const int offset) {
  PackType t0[UNROLL];
  PackType t1[UNROLL];
  #pragma unroll
  for (int u = 0; u < UNROLL; ++u) {
    int idx = offset + u*THREADS;
    t0[u] = (reinterpret_cast<const volatile PackType *>(src0))[idx];
    if (TWO_INPUTS) {
      t1[u] = (reinterpret_cast<const volatile PackType *>(src1))[idx];
    }
  }
  #pragma unroll
  for (int u = 0; u < UNROLL; ++u) {
    int idx = offset + u*THREADS;
    PackType val = TWO_INPUTS ? MULTI<FUNC, T>()(t0[u], t1[u]) : t0[u];
    (reinterpret_cast<volatile PackType *>(dest0))[idx] = val;
    if (TWO_OUTPUTS) {
      (reinterpret_cast<volatile PackType *>(dest1))[idx] = val;
    }
  }
}

#define ALIGNUP(x, a)   ((((x)-1) & ~((a)-1)) + (a))

template<typename T>
__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
  size_t ptrval = reinterpret_cast<size_t>(ptr);
  return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
}
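
// ALIGNUP rounds up to the next multiple of a power-of-two alignment and
// leaves already-aligned values unchanged; for example ALIGNUP(13, 8) == 16
// and ALIGNUP(16, 8) == 16. AlignUp does the same for a pointer, e.g.
// AlignUp(ptr, alignof(PackType)) returns the first PackType-aligned address
// at or after ptr.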

// Assumptions:
// - there is exactly 1 block
// - THREADS is the number of producer threads
// - this function is called by all producer threads
template<int UNROLL, int THREADS, class FUNC, typename T, bool HAS_DEST1,
    bool HAS_SRC1>
__device__ inline void ReduceOrCopy(const int tid,
    volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
    const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
    int N) {
  if (N<=0) {
    return;
  }

  int Npreamble = (N<alignof(PackType)) ? N : AlignUp(dest0, alignof(PackType)) - dest0;

  // stage 0: check if we'll be able to use the fast, 64-bit aligned path.
  // If not, we'll just use the slow preamble path for the whole operation
  bool alignable = (((AlignUp(src0, alignof(PackType)) == src0 + Npreamble)) &&
      (!HAS_DEST1 || (AlignUp(dest1, alignof(PackType)) == dest1 + Npreamble)) &&
      (!HAS_SRC1 || (AlignUp(src1, alignof(PackType)) == src1 + Npreamble)));

  if (!alignable) {
    Npreamble = N;
  }

  // stage 1: preamble: handle any elements up to the point of everything coming
  // into alignment
  for (int idx = tid; idx < Npreamble; idx += THREADS) {
    // there should be no way this is ever more than one iteration, except when
    // alignable is false
    ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(src0, src1, dest0, dest1, idx);
  }

  // stage 2: fast path: use 64b loads/stores to do the bulk of the work,
  // assuming the pointers we have are all 64-bit alignable.
  if (alignable) {
    const int PackFactor = sizeof(PackType) / sizeof(T);
    int Nrem = N - Npreamble;
    dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; }
    src0 += Npreamble; if (HAS_SRC1) { src1 += Npreamble; }

    // stage 2a: main loop
    int Nalign2a = (Nrem / (PackFactor * UNROLL * THREADS))
        * (UNROLL * THREADS); // round down

    #pragma unroll 1 // don't unroll this loop
    for (int idx = tid; idx < Nalign2a; idx += UNROLL * THREADS) {
      ReduceCopy64b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL, THREADS>(src0, src1, dest0, dest1, idx);
    }

    int Ndone2a = Nalign2a * PackFactor;
    Nrem -= Ndone2a;

    // stage 2b: slightly less optimized section for when we don't have full
    // UNROLLs

    int Nalign2b = Nrem / PackFactor;

    #pragma unroll 4
    for (int idx = Nalign2a + tid; idx < Nalign2a + Nalign2b; idx += THREADS) {
      ReduceCopy64b<FUNC, T, HAS_SRC1, HAS_DEST1, 1, 0>(src0, src1, dest0, dest1, idx);
    }

    int Ndone2b = Nalign2b * PackFactor;
    Nrem -= Ndone2b;
    int Ndone2 = Ndone2a + Ndone2b;
    dest0 += Ndone2; if (HAS_DEST1) { dest1 += Ndone2; }
    src0 += Ndone2; if (HAS_SRC1) { src1 += Ndone2; }

    // stage 2c: tail

    for (int idx = tid; idx < Nrem; idx += THREADS) {
      // we should never make more than one pass through this loop; only a
      // few threads should even participate
      ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(src0, src1, dest0, dest1, idx);
    }
  } // done fast path
}
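
// Illustrative sketch: a minimal single-block kernel that sums two float
// buffers into one output. FuncSum is the hypothetical functor from the MULTI
// example above; UNROLL=8 and THREADS=256 are arbitrary choices, and the
// kernel must be launched as <<<1, 256>>> to satisfy the assumptions listed
// above.
//
//   __global__ void SumKernel(float* dest, const float* src0, const float* src1, int N) {
//     ReduceOrCopy<8, 256, FuncSum<float>, float, false, true>(
//         threadIdx.x, dest, nullptr, src0, src1, N);
//   }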

template <typename T>
__device__ inline void incrementOpCounter(const KernelArgs<T> *args) {
  // increment comm's operation counts
  __threadfence_system(); // Technically need to ensure that cleared flags
                          // are visible before incrementing op counter.
  *args->opCounter = args->opIndex+1;
}

// Load a DevRing descriptor into dst (typically shared memory): the first
// NUM_WORDS threads each copy one 8-byte word. Callers must synchronize
// (e.g. with __syncthreads()) before reading *dst.
template <int THREADS, typename T> __device__ __forceinline__
void LoadRing(const DevRing<char>* src, DevRing<T>* dst) {
  enum { NUM_WORDS = sizeof(DevRing<char>) / sizeof(long long) };
  static_assert(sizeof(DevRing<char>) % sizeof(long long) == 0, "Bad alignment");
  static_assert(THREADS >= NUM_WORDS, "Not enough threads to load DevRing");
  static_assert(sizeof(DevRing<char>) == sizeof(DevRing<T>), "DevRing size mismatch");
  long long* lldst = reinterpret_cast<long long*>(dst);
  const long long* llsrc = reinterpret_cast<const long long*>(src);
  if (threadIdx.x < NUM_WORDS) {
    lldst[threadIdx.x] = llsrc[threadIdx.x];
  }
}
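
// Illustrative sketch: a collective kernel would stage the ring descriptor in
// shared memory once per block before using it, along the lines of
//
//   __shared__ DevRing<T> ring;
//   LoadRing<THREADS>(ringFromArgs, &ring);
//   __syncthreads();
//
// where ringFromArgs stands for the DevRing<char>* passed to the kernel.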


#endif // COMMON_KERNEL_H_