addressing the case when output region for repeat operation is too big #386

Status: Open. Wants to merge 2 commits into base branch branch-24.03.
68 changes: 57 additions & 11 deletions cunumeric/module.py
@@ -2046,7 +2046,6 @@ def repeat(a, repeats, axis=None):
--------
Multiple GPUs, Multiple CPUs
"""

# when array is a scalar
if np.ndim(a) == 0:
if np.ndim(repeats) == 0:
@@ -2075,7 +2074,7 @@ def repeat(a, repeats, axis=None):
axis = np.int32(axis)

if axis >= array.ndim:
- return ValueError("axis exceeds dimension of the input array")
+ raise ValueError("axis exceeds dimension of the input array")

# If repeats is on a zero sized axis, then return the array.
if array.shape[axis] == 0:
@@ -2100,11 +2099,36 @@
category=UserWarning,
)
repeats = np.int64(repeats)
- result = array._thunk.repeat(
- repeats=repeats,
- axis=axis,
- scalar_repeats=True,
- )
+ if repeats < 0:
+ raise ValueError(
+ "'repeats' should not be negative: {}".format(repeats)
+ )

Contributor comment: Please use f-strings in new code

Contributor comment: and use raise to raise an exception instead of return
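Folding both review comments into the new check gives the following shape (a sketch of the suggested form, with an illustrative function name, not the PR's current code):

```python
def validate_repeats(repeats):
    # Raise (not return) the exception, and format with an f-string.
    if repeats < 0:
        raise ValueError(f"'repeats' should not be negative: {repeats}")
```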

# check the output shape (whether it will fit on the GPU or not)
out_shape = list(array.shape)
out_shape[axis] *= repeats
out_shape = tuple(out_shape)
size = np.prod(out_shape) * array.itemsize
# check if the size of the output array is less than 8GB. In this case
# we can use output regions, otherwise we will use a statically
# allocated array
if size < 8589934592 / 2:
Contributor comment:
A bunch of comments about this, going from lower- to higher-level:

This seems to be testing for 4GB, not 8GB?

This should be a named constant, and ideally written as 8 * (2 ** 30), so it's obvious we're talking about 8GB.

This limit is not considering the available memory. 8GB may be too large or too little depending on the memory. This number should probably be a percentage of the relevant available memory.

This is considering the full size of the array, not the size of each chunk. E.g. 16GB may be totally fine if split across 8 GPUs.

It seems to me that the only real decision we're making here is whether to perform the operation using an eager output or a deferred output. Therefore, we want to also be querying the (relative) sizes of the eager and deferred pools. Ideally we would also consider the current/projected load on each pool, which is not possible right now, but might be possible in the future, if legate.core takes over more instance management responsibilities.

Finally, AFAIK the unification of eager and deferred pools is on the Legion roadmap. If that happens, then we could safely always use the more efficient eager implementation. @lightsighter how far in the future do you think this is? If nobody has complained about this issue, we may want to wait until unification lands.
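The reviewer's first two points can be sketched as follows (the constant and function names are illustrative, not part of the PR):

```python
import numpy as np

# Named constant written so the magnitude is obvious (8 GiB), as the
# reviewer suggests; dividing it by 2 at the comparison site would
# silently turn this into a 4 GiB test.
OUTPUT_REGION_LIMIT = 8 * (2 ** 30)

def fits_in_output_region(shape, itemsize):
    # The element count is the product of the dimensions.
    size_bytes = int(np.prod(shape)) * itemsize
    return size_bytes < OUTPUT_REGION_LIMIT
```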


result = array._thunk.repeat(
repeats=repeats, axis=axis, scalar_repeats=True
)
else:
# this implementation is taken from CuPy
result = ndarray(shape=out_shape, dtype=array.dtype)
a_index = [slice(None)] * len(out_shape)
res_index = list(a_index)
offset = 0
for i in range(array.shape[axis]):
a_index[axis] = slice(i, i + 1)
res_index[axis] = slice(offset, offset + repeats)
result[res_index] = array[a_index]
offset += repeats
Contributor comment on lines +2126 to +2130:
I don't love this. We are emitting a separate operation for each slice. It would be more efficient if we could manually partition the result array on axis using a custom coloring, following a partitioning of a on the same axis:

p_a = axis_equal_partition(a, axis, colors)
for c in colors:
  result_coloring[c] = Rect(
    lo=p_a[c].domain.lo * repeats,
    hi=p_a[c].domain.hi * repeats
  )
p_result = partition_by_coloring(result, result_coloring)
for c in colors:
  repeat(p_result[c], p_a[c])  # index launch

However, this would require support for manual-coloring partitioning from the core. @magnatelee is this something that's reasonable?
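In 1-D terms, the coloring sketched above reduces to scaling each input chunk's bounds by `repeats`. A minimal sketch of that rectangle arithmetic, assuming Legion-style inclusive bounds (where the upper end needs `(hi + 1) * repeats - 1` rather than `hi * repeats` to keep the colored rectangles contiguous):

```python
def result_coloring_bounds(chunk_bounds, repeats):
    # chunk_bounds: inclusive (lo, hi) index ranges, one per color.
    # Each input element expands into `repeats` consecutive outputs, so
    # the chunk [lo, hi] maps to [lo * repeats, (hi + 1) * repeats - 1].
    return [(lo * repeats, (hi + 1) * repeats - 1) for lo, hi in chunk_bounds]
```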

Contributor comment:

I don't think anything that does coloring would be any more scalable than the original code, and it is in some sense worse, as it takes away the core's ability to reason about the partitioning. For example, if the core saw multiple tasks operating on disjoint parts of the same array, it could potentially partition and map them in a way that those tasks are distributed in a balanced manner.

return result
# repeats is an array
else:
# repeats should be integer type
@@ -2115,10 +2139,32 @@
)
repeats = repeats.astype(np.int64)
if repeats.shape[0] != array.shape[axis]:
- return ValueError("incorrect shape of repeats array")
- result = array._thunk.repeat(
- repeats=repeats._thunk, axis=axis, scalar_repeats=False
- )
+ raise ValueError("incorrect shape of repeats array")

# check the output shape (whether it will fit on the GPU or not)
out_shape = list(array.shape)
n_repeats = sum(repeats)
out_shape[axis] = n_repeats
out_shape = tuple(out_shape)
size = np.prod(out_shape) * array.itemsize
# check if the size of the output array is less than 8GB. In this case
# we can use output regions, otherwise we will use a statically
# allocated array
if size < 8589934592 / 2:
result = array._thunk.repeat(
repeats=repeats._thunk, axis=axis, scalar_repeats=False
)
else: # this implementation is taken from CuPy
result = ndarray(shape=out_shape, dtype=array.dtype)
a_index = [slice(None)] * len(out_shape)
res_index = list(a_index)
offset = 0
for i in range(array.shape[axis]):
a_index[axis] = slice(i, i + 1)
res_index[axis] = slice(offset, offset + repeats[i])
result[res_index] = array[a_index]
offset += repeats[i]
return result
return ndarray(shape=result.shape, thunk=result)
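The CuPy-style fallback used in both branches above can be sketched in plain NumPy; the helper name is illustrative (cunumeric's real fast path goes through `_thunk`), and `repeats` may be either a scalar or a per-slice array of counts:

```python
import numpy as np

def repeat_by_slices(arr, repeats, axis):
    # Normalize `repeats` to one count per input slice along `axis`.
    counts = np.broadcast_to(
        np.asarray(repeats, dtype=np.int64), (arr.shape[axis],)
    )
    out_shape = list(arr.shape)
    out_shape[axis] = int(counts.sum())
    result = np.empty(tuple(out_shape), dtype=arr.dtype)
    a_index = [slice(None)] * arr.ndim
    res_index = list(a_index)
    offset = 0
    for i in range(arr.shape[axis]):
        a_index[axis] = slice(i, i + 1)
        res_index[axis] = slice(offset, offset + counts[i])
        # Each input slice broadcasts across its block of output slices.
        result[tuple(res_index)] = arr[tuple(a_index)]
        offset += counts[i]
    return result
```

This emits one slice assignment per input slice, which is exactly the per-slice operation cost the reviewer objects to above.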


4 changes: 4 additions & 0 deletions src/cunumeric/index/repeat.cc
@@ -69,6 +69,8 @@ struct RepeatImplBody<VariantKind::CPU, CODE, DIM> {
int64_t out_idx = 0;
for (size_t in_idx = 0; in_idx < volume; ++in_idx) {
auto p = in_pitches.unflatten(in_idx, in_rect.lo);
// TODO replace assert with Legate exception handling interface when available
assert(repeats[p] >= 0);
for (size_t r = 0; r < repeats[p]; r++) out[out_idx++] = in[p];
}
}
@@ -88,6 +90,8 @@
for (int64_t idx = in_rect.lo[axis]; idx <= in_rect.hi[axis]; ++idx) {
p[axis] = idx;
offsets[off_idx++] = sum;
// TODO replace assert with Legate exception handling interface when available
assert(repeats[p] >= 0);
sum += repeats[p];
}

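The offsets computed in the hunk above form an exclusive prefix sum of the repeat counts; the same logic expressed in Python terms:

```python
import numpy as np

def repeat_offsets(repeats):
    # Exclusive prefix sum: offsets[i] is where the copies of slice i
    # start in the output; the running total ends up as the output extent.
    offsets = np.empty(len(repeats), dtype=np.int64)
    total = 0
    for i, r in enumerate(repeats):
        # Mirrors the assert guarding against negative counts in the C++.
        assert r >= 0
        offsets[i] = total
        total += r
    return offsets, total
```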
2 changes: 2 additions & 0 deletions src/cunumeric/index/repeat.cu
@@ -41,6 +41,8 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
if (offset < extent) {
auto p = origin;
p[axis] += offset;
// TODO replace assert with Legate exception handling interface when available
assert(repeats[p] >= 0);
auto val = repeats[p];
offsets[offset] = val;
SumReduction<int64_t>::fold<true>(value, val);
4 changes: 3 additions & 1 deletion src/cunumeric/index/repeat_omp.cc
@@ -77,7 +77,9 @@ struct RepeatImplBody<VariantKind::OMP, CODE, DIM> {
int64_t axis_lo = p[axis];
#pragma omp for schedule(static) private(p)
for (int64_t idx = 0; idx < axis_extent; ++idx) {
p[axis] = axis_lo + idx;
// TODO replace assert with Legate exception handling interface when available
assert(repeats[p] >= 0);
auto val = repeats[p];
offsets[idx] = val;
local_sums[tid] += val;