[CUDA][Kernel] More CUDA kernels; Standardize the behavior for sorted COO/CSR (#1704)

* add cub; array cumsum * CSRSliceRows * fix warning * operator << for ndarray; CSRSliceRows * add CSRIsSorted * add csr_sort * inplace coosort and outplace csrsort * WIP: coo is sorted * mv cuda_utils * add AllTrue utility * csr sort * coo sort * coo2csr for sorted coo arrays * CSRToCOO from sorted * pass tests for the new kernel changes * cannot use inplace sort * lint * try fix msvc error * Fix g.copy_to and g.asnumbits; ToBlock no longer uses CSC * stash * revert some hack * revert some changes * address comments * fix * fix to_block unittest * add todo note
2026-06-04 19:44:23 +08:00 · 2020-06-28 18:37:28 +08:00
parent da8632cafe
commit 870da747ea
59 changed files with 1367 additions and 429 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,6 +13,10 @@
 [submodule "third_party/METIS"]
 	path = third_party/METIS
 	url = https://github.com/KarypisLab/METIS.git
+[submodule "third_party/cub"]
+	path = third_party/cub
+	url = https://github.com/NVlabs/cub.git
+	branch = 1.8.0
 [submodule "third_party/phmap"]
 	path = third_party/phmap
 	url = https://github.com/greg7mdp/parallel-hashmap.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,6 +44,8 @@ include_directories("third_party/METIS/include/")
 include_directories("third_party/dmlc-core/include")
 include_directories("third_party/minigun/minigun")
 include_directories("third_party/minigun/third_party/moderngpu/src")
+include_directories("third_party/cub/")
+include_directories("third_party/phmap/")

 # initial variables
 set(DGL_LINKER_LIBS "")
--- a/include/dgl/aten/array_ops.h
+++ b/include/dgl/aten/array_ops.h
@@ -13,6 +13,7 @@
 #include <utility>
 #include <vector>
 #include <tuple>
+#include <string>
 #include "./types.h"

 namespace dgl {
@@ -131,9 +132,18 @@ IdArray HStack(IdArray arr1, IdArray arr2);
 * \tparam ValueType The type of return value.
 */
 template<typename ValueType>
-ValueType IndexSelect(NDArray array, uint64_t index);
+ValueType IndexSelect(NDArray array, int64_t index);
+
+/*!
+ * \brief Return the data under the index. In numpy notation, A[I]
+ */
 NDArray IndexSelect(NDArray array, IdArray index);

+/*!
+ * \brief Return the data from `start` (inclusive) to `end` (exclusive).
+ */
+NDArray IndexSelect(NDArray array, int64_t start, int64_t end);
+
 /*!
 * \brief Permute the elements of an array according to given indices.
 *
@@ -238,6 +248,27 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, ValueType pad_value);
 */
 std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths);

+/*!
+ * \brief Return the cumulative summation (or inclusive sum) of the input array.
+ *
+ * The first element out[0] is equal to the first element of the input array
+ * array[0]. The rest elements are defined recursively, out[i] = out[i-1] + array[i].
+ * Hence, the result array length is the same as the input array length.
+ *
+ * If prepend_zero is true, then the first element is zero and the result array
+ * length is the input array length plus one. This is useful for creating
+ * an indptr array over a count array.
+ *
+ * \param array The 1D input array.
+ * \return Array after cumsum.
+ */
+IdArray CumSum(IdArray array, bool prepend_zero = false);
+
+/*!
+ * \brief Return a string that prints out some debug information.
+ */
+std::string ToDebugString(NDArray array);
+
 // inline implementations
 template <typename T>
 IdArray VecToIdArray(const std::vector<T>& vec,
--- a/include/dgl/aten/coo.h
+++ b/include/dgl/aten/coo.h
@@ -116,6 +116,16 @@ struct COOMatrix {
    CHECK_NO_OVERFLOW(row->dtype, num_rows);
    CHECK_NO_OVERFLOW(row->dtype, num_cols);
  }
+
+  /*! \brief Return a copy of this matrix on the give device context. */
+  inline COOMatrix CopyTo(const DLContext& ctx) const {
+    if (ctx == row->ctx)
+      return *this;
+    return COOMatrix(num_rows, num_cols,
+                     row.CopyTo(ctx), col.CopyTo(ctx),
+                     aten::IsNullArray(data)? data : data.CopyTo(ctx),
+                     row_sorted, col_sorted);
+  }
 };

 ///////////////////////// COO routines //////////////////////////
@@ -141,6 +151,17 @@ inline bool COOHasData(COOMatrix csr) {
  return !IsNullArray(csr.data);
 }

+/*!
+ * \brief Check whether the COO is sorted.
+ *
+ * It returns two flags: one for whether the row is sorted;
+ * the other for whether the columns of each row is sorted
+ * if the first flag is true.
+ *
+ * Complexity: O(NNZ)
+ */
+std::pair<bool, bool> COOIsSorted(COOMatrix coo);
+
 /*! \brief Get data. The return type is an ndarray due to possible duplicate entries. */
 runtime::NDArray COOGetData(COOMatrix , int64_t row, int64_t col);

@@ -161,6 +182,20 @@ COOMatrix COOTranspose(COOMatrix coo);
 * the result CSR matrix stores a shuffle index for how the entries
 * will be reordered in CSR. The i^th entry in the result CSR corresponds
 * to the CSR.data[i] th entry in the input COO.
+ *
+ * Conversion complexity: O(nnz)
+ *
+ * - The function first check whether the input COO matrix is sorted
+ *   using a linear scan.
+ * - If the COO matrix is row sorted, the conversion can be done very
+ *   efficiently in a sequential scan. The result indices and data arrays 
+ *   are directly equal to the column and data arrays from the input.
+ * - If the COO matrix is further column sorted, the result CSR is
+ *   also column sorted.
+ * - Otherwise, the conversion is more costly but still is O(nnz).
+ *
+ * \param coo Input COO matrix.
+ * \return CSR matrix.
 */
 CSRMatrix COOToCSR(COOMatrix coo);

@@ -195,6 +230,21 @@ bool COOHasDuplicate(COOMatrix coo);
 */
 std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);

+/*!
+ * \brief Sort the indices of a COO matrix in-place.
+ *
+ * The function sorts row indices in ascending order. If sort_column is true,
+ * col indices are sorted in ascending order too. The data array of the returned COOMatrix
+ * stores the shuffled index which could be used to fetch edge data.
+ *
+ * Complexity: O(N*log(N)) time and O(1) space, where N is the number of nonzeros.
+ * TODO(minjie): The time complexity could be improved to O(N) by using a O(N) space.
+ *
+ * \param mat The coo matrix to sort.
+ * \param sort_column True if column index should be sorted too.
+ */
+void COOSort_(COOMatrix* mat, bool sort_column = false);
+
 /*!
 * \brief Sort the indices of a COO matrix.
 *
@@ -202,11 +252,23 @@ std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
 * col indices are sorted in ascending order too. The data array of the returned COOMatrix
 * stores the shuffled index which could be used to fetch edge data.
 *
+ * Complexity: O(N*log(N)) time and O(1) space, where N is the number of nonzeros.
+ * TODO(minjie): The time complexity could be improved to O(N) by using a O(N) space.
+ *
 * \param mat The input coo matrix
 * \param sort_column True if column index should be sorted too.
 * \return COO matrix with index sorted.
 */
-COOMatrix COOSort(COOMatrix mat, bool sort_column = false);
+inline COOMatrix COOSort(COOMatrix mat, bool sort_column = false) {
+  if ((mat.row_sorted && !sort_column) || mat.col_sorted)
+    return mat;
+  COOMatrix ret(mat.num_rows, mat.num_cols,
+                mat.row.Clone(), mat.col.Clone(),
+                COOHasData(mat)? mat.data.Clone() : mat.data,
+                mat.row_sorted, mat.col_sorted);
+  COOSort_(&ret, sort_column);
+  return ret;
+}

 /*!
 * \brief Remove entries from COO matrix by entry indices (data indices)
--- a/include/dgl/aten/csr.h
+++ b/include/dgl/aten/csr.h
@@ -106,6 +106,17 @@ struct CSRMatrix {
    }
    CHECK_NO_OVERFLOW(indptr->dtype, num_rows);
    CHECK_NO_OVERFLOW(indptr->dtype, num_cols);
+    CHECK_EQ(indptr->shape[0], num_rows + 1);
+  }
+
+  /*! \brief Return a copy of this matrix on the give device context. */
+  inline CSRMatrix CopyTo(const DLContext& ctx) const {
+    if (ctx == indptr->ctx)
+      return *this;
+    return CSRMatrix(num_rows, num_cols,
+                     indptr.CopyTo(ctx), indices.CopyTo(ctx),
+                     aten::IsNullArray(data)? data : data.CopyTo(ctx),
+                     sorted);
  }
 };

@@ -134,6 +145,9 @@ inline bool CSRHasData(CSRMatrix csr) {
  return !IsNullArray(csr.data);
 }

+/*! \brief Whether the column indices of each row is sorted. */
+bool CSRIsSorted(CSRMatrix csr);
+
 /* \brief Get data. The return type is an ndarray due to possible duplicate entries. */
 runtime::NDArray CSRGetData(CSRMatrix , int64_t row, int64_t col);
 /*!
@@ -155,6 +169,15 @@ CSRMatrix CSRTranspose(CSRMatrix csr);

 /*!
 * \brief Convert CSR matrix to COO matrix.
+ *
+ * Complexity: O(nnz)
+ * 
+ * - If data_as_order is false, the column and data arrays of the
+ *   result COO are equal to the indices and data arrays of the
+ *   input CSR. The result COO is also row sorted.
+ * - If the input CSR is further sorted, the result COO is also
+ *   column sorted.
+ *
 * \param csr Input csr matrix
 * \param data_as_order If true, the data array in the input csr matrix contains the order
 *                      by which the resulting COO tuples are stored. In this case, the
@@ -166,9 +189,8 @@ COOMatrix CSRToCOO(CSRMatrix csr, bool data_as_order);

 /*!
 * \brief Slice rows of the given matrix and return.
- * \param csr CSR matrix
- * \param start Start row id (inclusive)
- * \param end End row id (exclusive)
+ *
+ * The sliced row IDs are relabeled to starting from zero.
 *
 * Examples:
 * num_rows = 4
@@ -182,6 +204,11 @@ COOMatrix CSRToCOO(CSRMatrix csr, bool data_as_order);
 * num_cols = 4
 * indptr = [0, 1, 1]
 * indices = [2]
+ *
+ * \param csr CSR matrix
+ * \param start Start row id (inclusive)
+ * \param end End row id (exclusive)
+ * \return sliced rows stored in a CSR matrix
 */
 CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end);
 CSRMatrix CSRSliceRows(CSRMatrix csr, runtime::NDArray rows);
@@ -192,6 +219,8 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, runtime::NDArray rows);
 * In numpy notation, given matrix M, row index array I, col index array J
 * This function returns the submatrix M[I, J].
 *
+ * The sliced row and column IDs are relabeled to starting from zero.
+ *
 * \param csr The input csr matrix
 * \param rows The row index to select
 * \param cols The col index to select
@@ -203,7 +232,10 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
 bool CSRHasDuplicate(CSRMatrix csr);

 /*!
- * \brief Sort the column index at each row in the ascending order.
+ * \brief Sort the column index at each row in ascending order in-place.
+ *
+ * Only the indices and data arrays (if available) will be mutated. The indptr array
+ * stays the same.
 *
 * Examples:
 * num_rows = 4
@@ -218,6 +250,22 @@ bool CSRHasDuplicate(CSRMatrix csr);
 */
 void CSRSort_(CSRMatrix* csr);

+/*!
+ * \brief Sort the column index at each row in ascending order.
+ *
+ * Return a new CSR matrix with sorted column indices and data arrays.
+ */
+inline CSRMatrix CSRSort(CSRMatrix csr) {
+  if (csr.sorted)
+    return csr;
+  CSRMatrix ret(csr.num_rows, csr.num_cols,
+                csr.indptr, csr.indices.Clone(),
+                CSRHasData(csr)? csr.data.Clone() : csr.data,
+                csr.sorted);
+  CSRSort_(&ret);
+  return ret;
+}
+
 /*!
 * \brief Reorder the rows and colmns according to the new row and column order.
 * \param csr The input csr matrix.
--- a/include/dgl/aten/macro.h
+++ b/include/dgl/aten/macro.h
@@ -252,4 +252,8 @@
      CHECK_LE((val), 0x7FFFFFFFL) << "int32 overflow for argument " << (#val) << "."; \
  } while (0);

+#define CHECK_IS_ID_ARRAY(VAR)                                              \
+  CHECK((VAR)->ndim == 1 && (IS_INT32(VAR) || IS_INT64(VAR)))               \
+    << "Expected argument " << (#VAR) << " to be an 1D integer array.";
+
 #endif  // DGL_ATEN_MACRO_H_
--- a/include/dgl/graph_interface.h
+++ b/include/dgl/graph_interface.h
@@ -10,6 +10,7 @@
 #include <vector>
 #include <utility>
 #include <algorithm>
+#include <memory>

 #include "./runtime/object.h"
 #include "array.h"
--- a/include/dgl/immutable_graph.h
+++ b/include/dgl/immutable_graph.h
@@ -12,6 +12,7 @@
 #include <utility>
 #include <tuple>
 #include <algorithm>
+#include <memory>
 #include "runtime/ndarray.h"
 #include "graph_interface.h"
 #include "lazy.h"
--- a/include/dgl/nodeflow.h
+++ b/include/dgl/nodeflow.h
@@ -8,6 +8,7 @@

 #include <vector>
 #include <string>
+#include <memory>

 #include "./runtime/object.h"
 #include "graph_interface.h"
--- a/include/dgl/runtime/ndarray.h
+++ b/include/dgl/runtime/ndarray.h
@@ -11,6 +11,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include <memory>

 #include "c_runtime_api.h"
 #include "dlpack/dlpack.h"
@@ -157,6 +158,10 @@ class NDArray {
   * \return The array under another context.
   */
  inline NDArray CopyTo(const DLContext& ctx) const;
+  /*!
+   * \brief Return a new array with a copy of the content.
+   */
+  inline NDArray Clone() const;
  /*!
   * \brief Load NDArray from stream
   * \param stream The input data stream
@@ -410,6 +415,12 @@ inline NDArray NDArray::CopyTo(const DLContext& ctx) const {
  return ret;
 }

+inline NDArray NDArray::Clone() const {
+  CHECK(data_ != nullptr);
+  const DLTensor* dptr = operator->();
+  return this->CopyTo(dptr->ctx);
+}
+
 inline int NDArray::use_count() const {
  if (data_ == nullptr) return 0;
  return data_->ref_counter_.load(std::memory_order_relaxed);
@@ -627,6 +638,8 @@ dgl::runtime::NDArray operator <= (int64_t lhs, const dgl::runtime::NDArray& a2)
 dgl::runtime::NDArray operator == (int64_t lhs, const dgl::runtime::NDArray& a2);
 dgl::runtime::NDArray operator != (int64_t lhs, const dgl::runtime::NDArray& a2);

+std::ostream& operator << (std::ostream& os, dgl::runtime::NDArray array);
+
 ///////////////// Operator overloading for DLDataType /////////////////

 /*! \brief Check whether two data types are the same.*/
--- a/include/dgl/runtime/packed_func.h
+++ b/include/dgl/runtime/packed_func.h
@@ -13,6 +13,7 @@
 #include <string>
 #include <limits>
 #include <memory>
+#include <utility>
 #include <type_traits>
 #include "c_runtime_api.h"
 #include "module.h"
--- a/include/dgl/runtime/smart_ptr_serializer.h
+++ b/include/dgl/runtime/smart_ptr_serializer.h
@@ -10,6 +10,7 @@
 #include <dgl/graph_serializer.h>
 #include <dmlc/io.h>
 #include <dmlc/serializer.h>
+#include <memory>

 namespace dmlc {
 namespace serializer {
--- a/include/dgl/zerocopy_serializer.h
+++ b/include/dgl/zerocopy_serializer.h
@@ -17,31 +17,36 @@
 #include <tuple>
 #include <utility>
 #include <vector>
+#include <memory>

 #include "dmlc/logging.h"

 namespace dgl {

-/* StreamWithBuffer is backed up by dmlc::MemoryFixedSizeStream or
-dmlc::MemoryStringStream. This class supports serializing and deserializing
-NDArrays stored in shared memory. If the stream is created for
-sending/recving data through network, the data pointer of the NDArray will be
-transmitted directly without and copy. Otherwise, the stream is for
-sending/recving data to another process on the same machine, so if an NDArray
-is stored in shared memory, it will just record the shared memory name
-instead of the actual data buffer.
-For example:
-std::string blob;
-// Send to local
-StreamWithBuffer strm(&blob, false);
-// Send to remote
-StreamWithBuffer strm(&blob, true);
-// Receive from local
-StreamWithBuffer strm(&blob, false);
-// Receive from remote
-std::vector<void*> ptr_list
-StreamWithBuffer strm(&blob, ptr_list);
-*/
+/*!
+ *
+ * StreamWithBuffer is backed up by dmlc::MemoryFixedSizeStream or
+ * dmlc::MemoryStringStream. This class supports serializing and deserializing
+ * NDArrays stored in shared memory. If the stream is created for
+ * sending/recving data through network, the data pointer of the NDArray will be
+ * transmitted directly without and copy. Otherwise, the stream is for
+ * sending/recving data to another process on the same machine, so if an NDArray
+ * is stored in shared memory, it will just record the shared memory name
+ * instead of the actual data buffer.
+ *
+ * For example:
+ *
+ * std::string blob;
+ * // Send to local
+ * StreamWithBuffer strm(&blob, false);
+ * // Send to remote
+ * StreamWithBuffer strm(&blob, true);
+ * // Receive from local
+ * StreamWithBuffer strm(&blob, false);
+ * // Receive from remote
+ * std::vector<void*> ptr_list
+ * StreamWithBuffer strm(&blob, ptr_list);
+ */
 class StreamWithBuffer : public dmlc::SeekStream {
 public:
  // Buffer type. Storing NDArray to maintain the reference counting to ensure
--- a/src/array/array.cc
+++ b/src/array/array.cc
@@ -8,6 +8,8 @@
 #include <dgl/packed_func_ext.h>
 #include <dgl/runtime/container.h>
 #include <dgl/runtime/shared_mem.h>
+#include <dgl/runtime/device_api.h>
+#include <sstream>
 #include "../c_api_common.h"
 #include "./array_op.h"
 #include "./arith.h"
@@ -100,8 +102,10 @@ NDArray IndexSelect(NDArray array, IdArray index) {
 }

 template<typename ValueType>
-ValueType IndexSelect(NDArray array, uint64_t index) {
+ValueType IndexSelect(NDArray array, int64_t index) {
  CHECK_EQ(array->ndim, 1) << "Only support select values from 1D array.";
+  CHECK(index >= 0 && index < array.NumElements())
+    << "Index " << index << " is out of bound.";
  ValueType ret = 0;
  ATEN_XPU_SWITCH_CUDA(array->ctx.device_type, XPU, "IndexSelect", {
    ATEN_DTYPE_SWITCH(array->dtype, DType, "values", {
@@ -110,12 +114,30 @@ ValueType IndexSelect(NDArray array, uint64_t index) {
  });
  return ret;
 }
-template int32_t IndexSelect<int32_t>(NDArray array, uint64_t index);
-template int64_t IndexSelect<int64_t>(NDArray array, uint64_t index);
-template uint32_t IndexSelect<uint32_t>(NDArray array, uint64_t index);
-template uint64_t IndexSelect<uint64_t>(NDArray array, uint64_t index);
-template float IndexSelect<float>(NDArray array, uint64_t index);
-template double IndexSelect<double>(NDArray array, uint64_t index);
+template int32_t IndexSelect<int32_t>(NDArray array, int64_t index);
+template int64_t IndexSelect<int64_t>(NDArray array, int64_t index);
+template uint32_t IndexSelect<uint32_t>(NDArray array, int64_t index);
+template uint64_t IndexSelect<uint64_t>(NDArray array, int64_t index);
+template float IndexSelect<float>(NDArray array, int64_t index);
+template double IndexSelect<double>(NDArray array, int64_t index);
+
+NDArray IndexSelect(NDArray array, int64_t start, int64_t end) {
+  CHECK_EQ(array->ndim, 1) << "Only support select values from 1D array.";
+  CHECK(start >= 0 && start < array.NumElements())
+    << "Index " << start << " is out of bound.";
+  CHECK(end >= 0 && end <= array.NumElements())
+    << "Index " << end << " is out of bound.";
+  CHECK_LE(start, end);
+  auto device = runtime::DeviceAPI::Get(array->ctx);
+  const int64_t len = end - start;
+  NDArray ret = NDArray::Empty({len}, array->dtype, array->ctx);
+  ATEN_DTYPE_SWITCH(array->dtype, DType, "values", {
+    device->CopyDataFromTo(array->data, start * sizeof(DType),
+                           ret->data, 0, len * sizeof(DType),
+                           array->ctx, ret->ctx, array->dtype, nullptr);
+  });
+  return ret;
+}

 NDArray Scatter(NDArray array, IdArray indices) {
  NDArray ret;
@@ -181,6 +203,31 @@ std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths) {
  return ret;
 }

+IdArray CumSum(IdArray array, bool prepend_zero) {
+  IdArray ret;
+  ATEN_XPU_SWITCH_CUDA(array->ctx.device_type, XPU, "CumSum", {
+    ATEN_ID_TYPE_SWITCH(array->dtype, IdType, {
+      ret = impl::CumSum<XPU, IdType>(array, prepend_zero);
+    });
+  });
+  return ret;
+}
+
+std::string ToDebugString(NDArray array) {
+  std::ostringstream oss;
+  NDArray a = array.CopyTo(DLContext{kDLCPU, 0});
+  oss << "array([";
+  ATEN_DTYPE_SWITCH(a->dtype, DType, "array", {
+    for (int64_t i = 0; i < std::min<int64_t>(a.NumElements(), 10L); ++i) {
+      oss << a.Ptr<DType>()[i] << ", ";
+    }
+  });
+  if (a.NumElements() > 10)
+    oss << "...";
+  oss << "], dtype=" << array->dtype << ", ctx=" << array->ctx << ")";
+  return oss.str();
+}
+
 ///////////////////////// CSR routines //////////////////////////

 bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
@@ -250,6 +297,16 @@ NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
  return ret;
 }

+bool CSRIsSorted(CSRMatrix csr) {
+  if (csr.indices->shape[0] <= 1)
+    return true;
+  bool ret = false;
+  ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRIsSorted", {
+    ret = impl::CSRIsSorted<XPU, IdType>(csr);
+  });
+  return ret;
+}
+
 NDArray CSRGetData(CSRMatrix csr, int64_t row, int64_t col) {
  CHECK(row >= 0 && row < csr.num_rows) << "Invalid row index: " << row;
  CHECK(col >= 0 && col < csr.num_cols) << "Invalid col index: " << col;
@@ -318,7 +375,7 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
  CHECK(end >= 0 && end <= csr.num_rows) << "Invalid end index: " << end;
  CHECK_GE(end, start);
  CSRMatrix ret;
-  ATEN_CSR_SWITCH(csr, XPU, IdType, "CSRSliceRows", {
+  ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRSliceRows", {
    ret = impl::CSRSliceRows<XPU, IdType>(csr, start, end);
  });
  return ret;
@@ -328,7 +385,7 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
  CHECK_SAME_DTYPE(csr.indices, rows);
  CHECK_SAME_CONTEXT(csr.indices, rows);
  CSRMatrix ret;
-  ATEN_CSR_SWITCH(csr, XPU, IdType, "CSRSliceRows", {
+  ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRSliceRows", {
    ret = impl::CSRSliceRows<XPU, IdType>(csr, rows);
  });
  return ret;
@@ -347,7 +404,9 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, NDArray rows, NDArray cols) {
 }

 void CSRSort_(CSRMatrix* csr) {
-  ATEN_CSR_SWITCH(*csr, XPU, IdType, "CSRSort_", {
+  if (csr->sorted)
+    return;
+  ATEN_CSR_SWITCH_CUDA(*csr, XPU, IdType, "CSRSort_", {
    impl::CSRSort_<XPU, IdType>(csr);
  });
 }
@@ -509,13 +568,23 @@ COOMatrix COOSliceMatrix(COOMatrix coo, NDArray rows, NDArray cols) {
  return ret;
 }

-COOMatrix COOSort(COOMatrix mat, bool sort_column) {
-  COOMatrix ret;
-  ATEN_XPU_SWITCH_CUDA(mat.row->ctx.device_type, XPU, "COOSort", {
-    ATEN_ID_TYPE_SWITCH(mat.row->dtype, IdType, {
-      ret = impl::COOSort<XPU, IdType>(mat, sort_column);
+void COOSort_(COOMatrix* mat, bool sort_column) {
+  if ((mat->row_sorted && !sort_column) || mat->col_sorted)
+    return;
+  ATEN_XPU_SWITCH_CUDA(mat->row->ctx.device_type, XPU, "COOSort_", {
+    ATEN_ID_TYPE_SWITCH(mat->row->dtype, IdType, {
+      impl::COOSort_<XPU, IdType>(mat, sort_column);
    });
  });
+}
+
+std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
+  if (coo.row->shape[0] <= 1)
+    return {true, true};
+  std::pair<bool, bool> ret;
+  ATEN_COO_SWITCH_CUDA(coo, XPU, IdType, "COOIsSorted", {
+    ret = impl::COOIsSorted<XPU, IdType>(coo);
+  });
  return ret;
 }

@@ -709,3 +778,7 @@ DGL_REGISTER_GLOBAL("ndarray._CAPI_DGLExistSharedMemArray")

 }  // namespace aten
 }  // namespace dgl
+
+std::ostream& operator << (std::ostream& os, dgl::runtime::NDArray array) {
+  return os << dgl::aten::ToDebugString(array);
+}
--- a/src/array/array_arith.cc
+++ b/src/array/array_arith.cc
@@ -3,8 +3,8 @@
 * \file array/array_aritch.cc
 * \brief DGL array arithmetic operations
 */
-#include <dgl/array.h>
 #include <dgl/packed_func_ext.h>
+#include <dgl/runtime/ndarray.h>
 #include <dgl/runtime/container.h>
 #include "../c_api_common.h"
 #include "./array_op.h"
--- a/src/array/array_op.h
+++ b/src/array/array_op.h
@@ -44,7 +44,7 @@ template <DLDeviceType XPU, typename DType, typename IdType>
 NDArray IndexSelect(NDArray array, IdArray index);

 template <DLDeviceType XPU, typename DType>
-DType IndexSelect(NDArray array, uint64_t index);
+DType IndexSelect(NDArray array, int64_t index);

 template <DLDeviceType XPU, typename DType, typename IdType>
 NDArray Scatter(NDArray array, IdArray indices);
@@ -61,6 +61,9 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, DType pad_value);
 template <DLDeviceType XPU, typename DType, typename IdType>
 std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths);

+template <DLDeviceType XPU, typename IdType>
+IdArray CumSum(IdArray array, bool prepend_zero);
+
 // sparse arrays

 template <DLDeviceType XPU, typename IdType>
@@ -84,6 +87,9 @@ runtime::NDArray CSRGetRowColumnIndices(CSRMatrix csr, int64_t row);
 template <DLDeviceType XPU, typename IdType>
 runtime::NDArray CSRGetRowData(CSRMatrix csr, int64_t row);

+template <DLDeviceType XPU, typename IdType>
+bool CSRIsSorted(CSRMatrix csr);
+
 template <DLDeviceType XPU, typename IdType>
 runtime::NDArray CSRGetData(CSRMatrix csr, int64_t row, int64_t col);

@@ -187,7 +193,10 @@ template <DLDeviceType XPU, typename IdType>
 std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);

 template <DLDeviceType XPU, typename IdType>
-COOMatrix COOSort(COOMatrix mat, bool sort_column);
+void COOSort_(COOMatrix* mat, bool sort_column);
+
+template <DLDeviceType XPU, typename IdType>
+std::pair<bool, bool> COOIsSorted(COOMatrix coo);

 template <DLDeviceType XPU, typename IdType>
 COOMatrix COORemove(COOMatrix coo, IdArray entries);
--- a/src/array/cpu/array_cumsum.cc
+++ b/src/array/cpu/array_cumsum.cc
@@ -0,0 +1,42 @@
+/*!
+ *  Copyright (c) 2020 by Contributors
+ * \file array/cpu/array_cumsum.cc
+ * \brief Array cumsum CPU implementation
+ */
+#include <dgl/array.h>
+
+namespace dgl {
+using runtime::NDArray;
+namespace aten {
+namespace impl {
+
+template <DLDeviceType XPU, typename IdType>
+IdArray CumSum(IdArray array, bool prepend_zero) {
+  const int64_t len = array.NumElements();
+  if (len == 0)
+    return array;
+  if (prepend_zero) {
+    IdArray ret = aten::NewIdArray(len + 1, array->ctx, array->dtype.bits);
+    const IdType* in_d = array.Ptr<IdType>();
+    IdType* out_d = ret.Ptr<IdType>();
+    out_d[0] = 0;
+    for (int64_t i = 0; i < len; ++i)
+      out_d[i + 1] = out_d[i] + in_d[i];
+    return ret;
+  } else {
+    IdArray ret = aten::NewIdArray(len, array->ctx, array->dtype.bits);
+    const IdType* in_d = array.Ptr<IdType>();
+    IdType* out_d = ret.Ptr<IdType>();
+    out_d[0] = in_d[0];
+    for (int64_t i = 1; i < len; ++i)
+      out_d[i] = out_d[i - 1] + in_d[i];
+    return ret;
+  }
+}
+
+template IdArray CumSum<kDLCPU, int32_t>(IdArray, bool);
+template IdArray CumSum<kDLCPU, int64_t>(IdArray, bool);
+
+}  // namespace impl
+}  // namespace aten
+}  // namespace dgl
--- a/src/array/cpu/array_index_select.cc
+++ b/src/array/cpu/array_index_select.cc
@@ -35,20 +35,16 @@ template NDArray IndexSelect<kDLCPU, double, int32_t>(NDArray, IdArray);
 template NDArray IndexSelect<kDLCPU, double, int64_t>(NDArray, IdArray);

 template <DLDeviceType XPU, typename DType>
-DType IndexSelect(NDArray array, uint64_t index) {
+DType IndexSelect(NDArray array, int64_t index) {
  const DType* data = static_cast<DType*>(array->data);
  return data[index];
 }

-template int32_t IndexSelect<kDLCPU, int32_t>(NDArray array, uint64_t index);
-template int64_t IndexSelect<kDLCPU, int64_t>(NDArray array, uint64_t index);
-template uint32_t IndexSelect<kDLCPU, uint32_t>(NDArray array, uint64_t index);
-template uint64_t IndexSelect<kDLCPU, uint64_t>(NDArray array, uint64_t index);
-template float IndexSelect<kDLCPU, float>(NDArray array, uint64_t index);
-template double IndexSelect<kDLCPU, double>(NDArray array, uint64_t index);
+template int32_t IndexSelect<kDLCPU, int32_t>(NDArray array, int64_t index);
+template int64_t IndexSelect<kDLCPU, int64_t>(NDArray array, int64_t index);
+template float IndexSelect<kDLCPU, float>(NDArray array, int64_t index);
+template double IndexSelect<kDLCPU, double>(NDArray array, int64_t index);

-};  // namespace impl
-
-};  // namespace aten
-
-};  // namespace dgl
+}  // namespace impl
+}  // namespace aten
+}  // namespace dgl
--- a/src/array/cpu/array_pack.cc
+++ b/src/array/cpu/array_pack.cc
@@ -76,8 +76,6 @@ template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, int64_t>(NDArray, in
 template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, float>(NDArray, float);
 template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, double>(NDArray, double);

-};  // namespace impl
-
-};  // namespace aten
-
-};  // namespace dgl
+}  // namespace impl
+}  // namespace aten
+}  // namespace dgl
--- a/src/array/cpu/array_utils.h
+++ b/src/array/cpu/array_utils.h
@@ -6,12 +6,12 @@
 #ifndef DGL_ARRAY_CPU_ARRAY_UTILS_H_
 #define DGL_ARRAY_CPU_ARRAY_UTILS_H_

-#include <dgl/array.h>
+#include <dgl/aten/types.h>
+#include <parallel_hashmap/phmap.h>
 #include <vector>
 #include <unordered_map>
 #include <utility>
 #include "../../c_api_common.h"
-#include "../third_party/phmap/parallel_hashmap/phmap.h"

 namespace dgl {
 namespace aten {
--- a/src/array/cpu/coo_sort.cc
+++ b/src/array/cpu/coo_sort.cc
@@ -10,37 +10,181 @@
 #include <numeric>
 #include <algorithm>
 #include <vector>
+#include <iterator>
+#include <tuple>
+
+namespace {
+
+template <typename IdType>
+struct TupleRef {
+  TupleRef() = delete;
+  TupleRef(const TupleRef& other) = default;
+  TupleRef(TupleRef&& other) = default;
+  TupleRef(IdType *const r, IdType *const c, IdType *const d)
+    : row(r), col(c), data(d) {}
+
+  TupleRef& operator=(const TupleRef& other) {
+    *row = *other.row;
+    *col = *other.col;
+    *data = *other.data;
+    return *this;
+  }
+  TupleRef& operator=(const std::tuple<IdType, IdType, IdType>& val) {
+    *row = std::get<0>(val);
+    *col = std::get<1>(val);
+    *data = std::get<2>(val);
+    return *this;
+  }
+
+  operator std::tuple<IdType, IdType, IdType>() const {
+    return std::make_tuple(*row, *col, *data);
+  }
+
+  void Swap(const TupleRef& other) const {
+    std::swap(*row, *other.row);
+    std::swap(*col, *other.col);
+    std::swap(*data, *other.data);
+  }
+
+  IdType *row, *col, *data;
+};
+
+using std::swap;
+template <typename IdType>
+void swap(const TupleRef<IdType>& r1, const TupleRef<IdType>& r2) {
+  r1.Swap(r2);
+}
+
+template <typename IdType>
+struct CooIterator : public std::iterator<std::random_access_iterator_tag,
+                                          std::tuple<IdType, IdType, IdType>,
+                                          std::ptrdiff_t,
+                                          std::tuple<IdType*, IdType*, IdType*>,
+                                          TupleRef<IdType>> {
+  CooIterator() = default;
+  CooIterator(const CooIterator& other) = default;
+  CooIterator(CooIterator&& other) = default;
+  CooIterator(IdType *r, IdType *c, IdType *d): row(r), col(c), data(d) {}
+
+  CooIterator& operator=(const CooIterator& other) = default;
+  CooIterator& operator=(CooIterator&& other) = default;
+  ~CooIterator() = default;
+
+  bool operator==(const CooIterator& other) const {
+    return row == other.row;
+  }
+
+  bool operator!=(const CooIterator& other) const {
+    return row != other.row;
+  }
+
+  bool operator<(const CooIterator& other) const {
+    return row < other.row;
+  }
+
+  bool operator>(const CooIterator& other) const {
+    return row > other.row;
+  }
+
+  bool operator<=(const CooIterator& other) const {
+    return row <= other.row;
+  }
+
+  bool operator>=(const CooIterator& other) const {
+    return row >= other.row;
+  }
+
+  CooIterator& operator+=(const std::ptrdiff_t& movement) {
+    row += movement;
+    col += movement;
+    data += movement;
+    return *this;
+  }
+
+  CooIterator& operator-=(const std::ptrdiff_t& movement) {
+    row -= movement;
+    col -= movement;
+    data -= movement;
+    return *this;
+  }
+
+  CooIterator& operator++() {
+    return operator+=(1);
+  }
+
+  CooIterator& operator--() {
+    return operator-=(1);
+  }
+
+  CooIterator operator++(int) {
+    CooIterator ret(*this);
+    operator++();
+    return ret;
+  }
+
+  CooIterator operator--(int) {
+    CooIterator ret(*this);
+    operator--();
+    return ret;
+  }
+
+  CooIterator operator+(const std::ptrdiff_t& movement) const {
+    CooIterator ret(*this);
+    ret += movement;
+    return ret;
+  }
+
+  CooIterator operator-(const std::ptrdiff_t& movement) const {
+    CooIterator ret(*this);
+    ret -= movement;
+    return ret;
+  }
+
+  std::ptrdiff_t operator-(const CooIterator& other) const {
+    return row - other.row;
+  }
+
+  TupleRef<IdType> operator*() const {
+    return TupleRef<IdType>(row, col, data);
+  }
+  TupleRef<IdType> operator*() {
+    return TupleRef<IdType>(row, col, data);
+  }
+
+  IdType *row, *col, *data;
+};
+
+}  // namespace

 namespace dgl {
 namespace aten {
 namespace impl {

-template <DLDeviceType XPU, typename IdType>
-COOMatrix COOSort(COOMatrix coo, bool sort_column) {
-  const int64_t nnz = coo.row->shape[0];
-  const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
-  const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
+///////////////////////////// COOSort_ /////////////////////////////

-  // Argsort
-  IdArray new_row = IdArray::Empty({nnz}, coo.row->dtype, coo.row->ctx);
-  IdArray new_col = IdArray::Empty({nnz}, coo.col->dtype, coo.col->ctx);
-  IdArray new_idx = IdArray::Empty({nnz}, coo.row->dtype, coo.row->ctx);
-  IdType* new_row_data = static_cast<IdType*>(new_row->data);
-  IdType* new_col_data = static_cast<IdType*>(new_col->data);
-  IdType* new_idx_data = static_cast<IdType*>(new_idx->data);
-  std::iota(new_idx_data, new_idx_data + nnz, 0);
+template <DLDeviceType XPU, typename IdType>
+void COOSort_(COOMatrix* coo, bool sort_column) {
+  const int64_t nnz = coo->row->shape[0];
+  IdType* coo_row = coo->row.Ptr<IdType>();
+  IdType* coo_col = coo->col.Ptr<IdType>();
+  if (!COOHasData(*coo))
+    coo->data = aten::Range(0, nnz, coo->row->dtype.bits, coo->row->ctx);
+  IdType* coo_data = coo->data.Ptr<IdType>();
+
+  typedef std::tuple<IdType, IdType, IdType> Tuple;
+
+  // Arg sort
  if (sort_column) {
 #ifdef PARALLEL_ALGORITHMS
    __gnu_parallel::sort(
 #else
    std::sort(
 #endif
-        new_idx_data,
-        new_idx_data + nnz,
-        [coo_row_data, coo_col_data](const IdType a, const IdType b) {
-          return (coo_row_data[a] != coo_row_data[b]) ?
-            (coo_row_data[a] < coo_row_data[b]) :
-            (coo_col_data[a] < coo_col_data[b]);
+        CooIterator<IdType>(coo_row, coo_col, coo_data),
+        CooIterator<IdType>(coo_row, coo_col, coo_data) + nnz,
+        [](const Tuple& a, const Tuple& b) {
+          return (std::get<0>(a) != std::get<0>(b)) ?
+              (std::get<0>(a) < std::get<0>(b)) : (std::get<1>(a) < std::get<1>(b));
        });
  } else {
 #ifdef PARALLEL_ALGORITHMS
@@ -48,39 +192,41 @@ COOMatrix COOSort(COOMatrix coo, bool sort_column) {
 #else
    std::sort(
 #endif
-        new_idx_data,
-        new_idx_data + nnz,
-        [coo_row_data](const IdType a, const IdType b) {
-          return coo_row_data[a] < coo_row_data[b];
+        CooIterator<IdType>(coo_row, coo_col, coo_data),
+        CooIterator<IdType>(coo_row, coo_col, coo_data) + nnz,
+        [](const Tuple& a, const Tuple& b) {
+          return std::get<0>(a) < std::get<0>(b);
        });
  }

-  // Reorder according to shuffle
-#pragma omp parallel for
-  for (IdType i = 0; i < nnz; ++i) {
-    new_row_data[i] = coo_row_data[new_idx_data[i]];
-    new_col_data[i] = coo_col_data[new_idx_data[i]];
-  }
-
-  if (COOHasData(coo)) {
-    const IdType* coo_data_data = static_cast<IdType*>(coo.data->data);
-    IdArray new_data = IdArray::Empty({nnz}, coo.row->dtype, coo.row->ctx);
-    IdType* new_data_data = static_cast<IdType*>(new_data->data);
-#pragma omp parallel for
-    for (IdType i = 0; i < nnz; ++i) {
-      new_data_data[i] = coo_data_data[new_idx_data[i]];
-    }
-
-    new_idx = new_data;
-  }
-
-  return COOMatrix{
-      coo.num_rows, coo.num_cols, std::move(new_row), std::move(new_col),
-      std::move(new_idx), true, sort_column};
+  coo->row_sorted = true;
+  coo->col_sorted = sort_column;
 }

-template COOMatrix COOSort<kDLCPU, int32_t>(COOMatrix, bool);
-template COOMatrix COOSort<kDLCPU, int64_t>(COOMatrix, bool);
+template void COOSort_<kDLCPU, int32_t>(COOMatrix*, bool);
+template void COOSort_<kDLCPU, int64_t>(COOMatrix*, bool);
+
+
+///////////////////////////// COOIsSorted /////////////////////////////
+
+template <DLDeviceType XPU, typename IdType>
+std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
+  const int64_t nnz = coo.row->shape[0];
+  IdType* row = coo.row.Ptr<IdType>();
+  IdType* col = coo.col.Ptr<IdType>();
+  bool row_sorted = true;
+  bool col_sorted = true;
+  for (int64_t i = 1; row_sorted && i < nnz; ++i) {
+    row_sorted = (row[i - 1] <= row[i]);
+    col_sorted = col_sorted && (row[i - 1] < row[i] || col[i - 1] <= col[i]);
+  }
+  if (!row_sorted)
+    col_sorted = false;
+  return {row_sorted, col_sorted};
+}
+
+template std::pair<bool, bool> COOIsSorted<kDLCPU, int32_t>(COOMatrix coo);
+template std::pair<bool, bool> COOIsSorted<kDLCPU, int64_t>(COOMatrix coo);

 }  // namespace impl
 }  // namespace aten
--- a/src/array/cpu/csr_sort.cc
+++ b/src/array/cpu/csr_sort.cc
@@ -0,0 +1,83 @@
+/*!
+ *  Copyright (c) 2020 by Contributors
+ * \file array/cpu/csr_sort.cc
+ * \brief CSR sorting
+ */
+#include <dgl/array.h>
+#include <numeric>
+#include <algorithm>
+#include <vector>
+
+namespace dgl {
+namespace aten {
+namespace impl {
+
+///////////////////////////// CSRIsSorted /////////////////////////////
+template <DLDeviceType XPU, typename IdType>
+bool CSRIsSorted(CSRMatrix csr) {
+  const IdType* indptr = csr.indptr.Ptr<IdType>();
+  const IdType* indices = csr.indices.Ptr<IdType>();
+  bool ret = true;
+#pragma omp parallel for shared(ret)
+  for (int64_t row = 0; row < csr.num_rows; ++row) {
+    if (!ret)
+      continue;
+    for (IdType i = indptr[row] + 1; i < indptr[row + 1]; ++i) {
+      if (indices[i - 1] > indices[i]) {
+        ret = false;
+        break;
+      }
+    }
+  }
+  return ret;
+}
+
+template bool CSRIsSorted<kDLCPU, int64_t>(CSRMatrix csr);
+template bool CSRIsSorted<kDLCPU, int32_t>(CSRMatrix csr);
+
+///////////////////////////// CSRSort /////////////////////////////
+
+template <DLDeviceType XPU, typename IdType>
+void CSRSort_(CSRMatrix* csr) {
+  typedef std::pair<IdType, IdType> ShufflePair;
+  const int64_t num_rows = csr->num_rows;
+  const int64_t nnz = csr->indices->shape[0];
+  const IdType* indptr_data = static_cast<IdType*>(csr->indptr->data);
+  IdType* indices_data = static_cast<IdType*>(csr->indices->data);
+  if (!CSRHasData(*csr)) {
+    csr->data = aten::Range(0, nnz, csr->indptr->dtype.bits, csr->indptr->ctx);
+  }
+  IdType* eid_data = static_cast<IdType*>(csr->data->data);
+#pragma omp parallel
+  {
+    std::vector<ShufflePair> reorder_vec;
+#pragma omp for
+    for (int64_t row = 0; row < num_rows; row++) {
+      const int64_t num_cols = indptr_data[row + 1] - indptr_data[row];
+      IdType *col = indices_data + indptr_data[row];
+      IdType *eid = eid_data + indptr_data[row];
+
+      reorder_vec.resize(num_cols);
+      for (int64_t i = 0; i < num_cols; i++) {
+        reorder_vec[i].first = col[i];
+        reorder_vec[i].second = eid[i];
+      }
+      std::sort(reorder_vec.begin(), reorder_vec.end(),
+                [](const ShufflePair &e1, const ShufflePair &e2) {
+                  return e1.first < e2.first;
+                });
+      for (int64_t i = 0; i < num_cols; i++) {
+        col[i] = reorder_vec[i].first;
+        eid[i] = reorder_vec[i].second;
+      }
+    }
+  }
+  csr->sorted = true;
+}
+
+template void CSRSort_<kDLCPU, int64_t>(CSRMatrix* csr);
+template void CSRSort_<kDLCPU, int32_t>(CSRMatrix* csr);
+
+}  // namespace impl
+}  // namespace aten
+}  // namespace dgl
--- a/src/array/cpu/spmat_op_impl.cc
+++ b/src/array/cpu/spmat_op_impl.cc
@@ -377,7 +377,9 @@ COOMatrix CSRToCOO(CSRMatrix csr) {
              ret_row_data + indptr_data[i + 1],
              i);
  }
-  return COOMatrix{csr.num_rows, csr.num_cols, ret_row, csr.indices, csr.data};
+  return COOMatrix(csr.num_rows, csr.num_cols,
+                   ret_row, csr.indices, csr.data,
+                   true, csr.sorted);
 }

 template COOMatrix CSRToCOO<kDLCPU, int32_t>(CSRMatrix csr);
@@ -543,49 +545,6 @@ template CSRMatrix CSRSliceMatrix<kDLCPU, int32_t>(
 template CSRMatrix CSRSliceMatrix<kDLCPU, int64_t>(
    CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols);

-///////////////////////////// CSRSort /////////////////////////////
-
-template <DLDeviceType XPU, typename IdType>
-void CSRSort_(CSRMatrix* csr) {
-  typedef std::pair<IdType, IdType> ShufflePair;
-  const int64_t num_rows = csr->num_rows;
-  const int64_t nnz = csr->indices->shape[0];
-  const IdType* indptr_data = static_cast<IdType*>(csr->indptr->data);
-  IdType* indices_data = static_cast<IdType*>(csr->indices->data);
-  if (!CSRHasData(*csr)) {
-    csr->data = aten::Range(0, nnz, csr->indptr->dtype.bits, csr->indptr->ctx);
-  }
-  IdType* eid_data = static_cast<IdType*>(csr->data->data);
-#pragma omp parallel
-  {
-    std::vector<ShufflePair> reorder_vec;
-#pragma omp for
-    for (int64_t row = 0; row < num_rows; row++) {
-      const int64_t num_cols = indptr_data[row + 1] - indptr_data[row];
-      IdType *col = indices_data + indptr_data[row];
-      IdType *eid = eid_data + indptr_data[row];
-
-      reorder_vec.resize(num_cols);
-      for (int64_t i = 0; i < num_cols; i++) {
-        reorder_vec[i].first = col[i];
-        reorder_vec[i].second = eid[i];
-      }
-      std::sort(reorder_vec.begin(), reorder_vec.end(),
-                [](const ShufflePair &e1, const ShufflePair &e2) {
-                  return e1.first < e2.first;
-                });
-      for (int64_t i = 0; i < num_cols; i++) {
-        col[i] = reorder_vec[i].first;
-        eid[i] = reorder_vec[i].second;
-      }
-    }
-  }
-  csr->sorted = true;
-}
-
-template void CSRSort_<kDLCPU, int64_t>(CSRMatrix* csr);
-template void CSRSort_<kDLCPU, int32_t>(CSRMatrix* csr);
-
 ///////////////////////////// CSRReorder /////////////////////////////

 template <DLDeviceType XPU, typename IdType>
--- a/src/array/cpu/spmat_op_impl_coo.cc
+++ b/src/array/cpu/spmat_op_impl_coo.cc
@@ -3,10 +3,10 @@
 * \file array/cpu/spmat_op_impl.cc
 * \brief CPU implementation of COO sparse matrix operators
 */
-#include <dgl/array.h>
 #include <vector>
 #include <unordered_set>
 #include <unordered_map>
+#include <tuple>
 #include "array_utils.h"

 namespace dgl {
@@ -266,29 +266,57 @@ CSRMatrix COOToCSR(COOMatrix coo) {
  const IdType* row_data = static_cast<IdType*>(coo.row->data);
  const IdType* col_data = static_cast<IdType*>(coo.col->data);
  const IdType* data = COOHasData(coo)? static_cast<IdType*>(coo.data->data) : nullptr;
+
  NDArray ret_indptr = NDArray::Empty({N + 1}, coo.row->dtype, coo.row->ctx);
  NDArray ret_indices;
  NDArray ret_data;

-  IdType* Bp = static_cast<IdType*>(ret_indptr->data);
-
-  std::fill(Bp, Bp + N, 0);
-  for (int64_t i = 0; i < NNZ; ++i) {
-    Bp[row_data[i]]++;
+  bool row_sorted = coo.row_sorted;
+  bool col_sorted = coo.col_sorted;
+  if (!row_sorted) {
+    // It is possible that the flag is simply not set (default value is false),
+    // so we still perform a linear scan to check the flag.
+    std::tie(row_sorted, col_sorted) = COOIsSorted(coo);
  }

-  // cumsum
-  for (int64_t i = 0, cumsum = 0; i < N; ++i) {
-    const IdType temp = Bp[i];
-    Bp[i] = cumsum;
-    cumsum += temp;
-  }
-  Bp[N] = NNZ;
+  if (row_sorted) {
+    // compute indptr
+    IdType* Bp = static_cast<IdType*>(ret_indptr->data);
+    Bp[0] = 0;
+    int64_t j = 0;
+    for (int64_t i = 0; i < N; ++i) {
+      const int64_t k = j;
+      for (; j < NNZ && row_data[j] == i; ++j) {}
+      Bp[i + 1] = Bp[i] + j - k;
+    }

-  if (coo.row_sorted == true) {
+    // TODO(minjie): Many of our current implementation assumes that CSR must have
+    //   a data array. This is a temporary workaround. Remove this after:
+    //   - The old immutable graph implementation is deprecated.
+    //   - The old binary reduce kernel is deprecated.
+    if (!COOHasData(coo))
+      coo.data = aten::Range(0, NNZ, coo.row->dtype.bits, coo.row->ctx);
+
+    // compute indices and data
    ret_indices = coo.col;
    ret_data = coo.data;
  } else {
+    // compute indptr
+    IdType* Bp = static_cast<IdType*>(ret_indptr->data);
+    std::fill(Bp, Bp + N, 0);
+    for (int64_t i = 0; i < NNZ; ++i) {
+      Bp[row_data[i]]++;
+    }
+
+    // cumsum
+    for (int64_t i = 0, cumsum = 0; i < N; ++i) {
+      const IdType temp = Bp[i];
+      Bp[i] = cumsum;
+      cumsum += temp;
+    }
+    Bp[N] = NNZ;
+
+    // compute indices and data
    ret_indices = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
    ret_data = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
    IdType* Bi = static_cast<IdType*>(ret_indices->data);
@@ -311,7 +339,7 @@ CSRMatrix COOToCSR(COOMatrix coo) {

  return CSRMatrix(coo.num_rows, coo.num_cols,
                   ret_indptr, ret_indices, ret_data,
-                   coo.col_sorted);
+                   col_sorted);
 }

 template CSRMatrix COOToCSR<kDLCPU, int32_t>(COOMatrix coo);
@@ -439,7 +467,6 @@ COOMatrix COOReorder(COOMatrix coo, runtime::NDArray new_row_id_arr,
  // Input COO
  const IdType* in_rows = static_cast<IdType*>(coo.row->data);
  const IdType* in_cols = static_cast<IdType*>(coo.col->data);
-  const IdType* in_data = COOHasData(coo) ? static_cast<IdType*>(coo.data->data) : nullptr;
  int64_t num_rows = coo.num_rows;
  int64_t num_cols = coo.num_cols;
  int64_t nnz = coo.row->shape[0];
--- a/src/array/cuda/array_cumsum.cu
+++ b/src/array/cuda/array_cumsum.cu
@@ -0,0 +1,51 @@
+/*!
+ *  Copyright (c) 2020 by Contributors
+ * \file array/cpu/array_cumsum.cu
+ * \brief Array cumsum GPU implementation
+ */
+#include <dgl/array.h>
+#include <cub/cub.cuh>
+#include "../../runtime/cuda/cuda_common.h"
+#include "./utils.h"
+
+namespace dgl {
+using runtime::NDArray;
+namespace aten {
+namespace impl {
+
+template <DLDeviceType XPU, typename IdType>
+IdArray CumSum(IdArray array, bool prepend_zero) {
+  const int64_t len = array.NumElements();
+  if (len == 0)
+    return array;
+  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
+  auto device = runtime::DeviceAPI::Get(array->ctx);
+  const IdType* in_d = array.Ptr<IdType>();
+  IdArray ret;
+  IdType* out_d = nullptr;
+  if (prepend_zero) {
+    ret = aten::Full(0, len + 1, array->dtype.bits, array->ctx);
+    out_d = ret.Ptr<IdType>() + 1;
+  } else {
+    ret = aten::NewIdArray(len, array->ctx, array->dtype.bits);
+    out_d = ret.Ptr<IdType>();
+  }
+  // Allocate workspace
+  size_t workspace_size = 0;
+  cub::DeviceScan::InclusiveSum(nullptr, workspace_size, in_d, out_d, len, thr_entry->stream);
+  void* workspace = device->AllocWorkspace(array->ctx, workspace_size);
+
+  // Compute cumsum
+  cub::DeviceScan::InclusiveSum(workspace, workspace_size, in_d, out_d, len, thr_entry->stream);
+
+  device->FreeWorkspace(array->ctx, workspace);
+
+  return ret;
+}
+
+template IdArray CumSum<kDLGPU, int32_t>(IdArray, bool);
+template IdArray CumSum<kDLGPU, int64_t>(IdArray, bool);
+
+}  // namespace impl
+}  // namespace aten
+}  // namespace dgl
--- a/src/array/cuda/array_index_select.cu
+++ b/src/array/cuda/array_index_select.cu
@@ -5,7 +5,7 @@
 */
 #include <dgl/array.h>
 #include "../../runtime/cuda/cuda_common.h"
-#include "../../cuda_utils.h"
+#include "./utils.h"

 namespace dgl {
 using runtime::NDArray;
@@ -50,7 +50,7 @@ template NDArray IndexSelect<kDLGPU, double, int32_t>(NDArray, IdArray);
 template NDArray IndexSelect<kDLGPU, double, int64_t>(NDArray, IdArray);

 template <DLDeviceType XPU, typename DType>
-DType IndexSelect(NDArray array, uint64_t index) {
+DType IndexSelect(NDArray array, int64_t index) {
  auto device = runtime::DeviceAPI::Get(array->ctx);
  DType ret = 0;
  device->CopyDataFromTo(
@@ -60,12 +60,12 @@ DType IndexSelect(NDArray array, uint64_t index) {
  return ret;
 }

-template int32_t IndexSelect<kDLGPU, int32_t>(NDArray array, uint64_t index);
-template int64_t IndexSelect<kDLGPU, int64_t>(NDArray array, uint64_t index);
-template uint32_t IndexSelect<kDLGPU, uint32_t>(NDArray array, uint64_t index);
-template uint64_t IndexSelect<kDLGPU, uint64_t>(NDArray array, uint64_t index);
-template float IndexSelect<kDLGPU, float>(NDArray array, uint64_t index);
-template double IndexSelect<kDLGPU, double>(NDArray array, uint64_t index);
+template int32_t IndexSelect<kDLGPU, int32_t>(NDArray array, int64_t index);
+template int64_t IndexSelect<kDLGPU, int64_t>(NDArray array, int64_t index);
+template uint32_t IndexSelect<kDLGPU, uint32_t>(NDArray array, int64_t index);
+template uint64_t IndexSelect<kDLGPU, uint64_t>(NDArray array, int64_t index);
+template float IndexSelect<kDLGPU, float>(NDArray array, int64_t index);
+template double IndexSelect<kDLGPU, double>(NDArray array, int64_t index);

 }  // namespace impl
 }  // namespace aten
--- a/src/array/cuda/array_op_impl.cu
+++ b/src/array/cuda/array_op_impl.cu
@@ -5,7 +5,7 @@
 */
 #include <dgl/array.h>
 #include "../../runtime/cuda/cuda_common.h"
-#include "../../cuda_utils.h"
+#include "./utils.h"
 #include "../arith.h"

 namespace dgl {
--- a/src/array/cuda/coo2csr.cc
+++ b/src/array/cuda/coo2csr.cc
@@ -17,63 +17,43 @@ template <DLDeviceType XPU, typename IdType>
 CSRMatrix COOToCSR(COOMatrix coo) {
  CHECK(sizeof(IdType) == 4) << "CUDA COOToCSR does not support int64.";
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  auto device = runtime::DeviceAPI::Get(coo.row->ctx);
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
  }
  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));

-
-  NDArray row = coo.row, col = coo.col, data = coo.data;
-  int32_t* row_ptr = static_cast<int32_t*>(row->data);
-  int32_t* col_ptr = static_cast<int32_t*>(col->data);
-  int32_t* data_ptr = aten::IsNullArray(data) ? nullptr : static_cast<int32_t*>(data->data);
-
-  if (!coo.row_sorted) {
-    // make a copy of row and col because sort is done in-place
-    row = row.CopyTo(row->ctx);
-    col = col.CopyTo(col->ctx);
-    row_ptr = static_cast<int32_t*>(row->data);
-    col_ptr = static_cast<int32_t*>(col->data);
-    if (aten::IsNullArray(data)) {
-      // create the index array
-      data = aten::Range(0, row->shape[0], row->dtype.bits, row->ctx);
-      data_ptr = static_cast<int32_t*>(data->data);
-    }
-    // sort row
-    size_t workspace_size = 0;
-    CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt(
-        thr_entry->cusparse_handle,
-        coo.num_rows, coo.num_cols,
-        row->shape[0],
-        row_ptr,
-        col_ptr,
-        &workspace_size));
-    void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
-    CUSPARSE_CALL(cusparseXcoosortByRow(
-        thr_entry->cusparse_handle,
-        coo.num_rows, coo.num_cols,
-        row->shape[0],
-        row_ptr,
-        col_ptr,
-        data_ptr,
-        workspace));
-    device->FreeWorkspace(row->ctx, workspace);
+  bool row_sorted = coo.row_sorted;
+  bool col_sorted = coo.col_sorted;
+  if (!row_sorted) {
+    // It is possible that the flag is simply not set (default value is false),
+    // so we still perform a linear scan to check the flag.
+    std::tie(row_sorted, col_sorted) = COOIsSorted(coo);
+  }
+  if (!row_sorted) {
+    coo = COOSort(coo);
  }

-  NDArray indptr = aten::NewIdArray(coo.num_rows + 1, row->ctx, row->dtype.bits);
+  const int64_t nnz = coo.row->shape[0];
+  // TODO(minjie): Many of our current implementation assumes that CSR must have
+  //   a data array. This is a temporary workaround. Remove this after:
+  //   - The old immutable graph implementation is deprecated.
+  //   - The old binary reduce kernel is deprecated.
+  if (!COOHasData(coo))
+    coo.data = aten::Range(0, nnz, coo.row->dtype.bits, coo.row->ctx);
+
+  NDArray indptr = aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits);
  int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
  CUSPARSE_CALL(cusparseXcoo2csr(
        thr_entry->cusparse_handle,
-        row_ptr,
-        row->shape[0],
+        coo.row.Ptr<int32_t>(),
+        nnz,
        coo.num_rows,
        indptr_ptr,
        CUSPARSE_INDEX_BASE_ZERO));

  return CSRMatrix(coo.num_rows, coo.num_cols,
-                   indptr, col, data, false);
+                   indptr, coo.col, coo.data, col_sorted);
 }

 template CSRMatrix COOToCSR<kDLGPU, int32_t>(COOMatrix coo);
--- a/src/array/cuda/coo_sort.cc
+++ b/src/array/cuda/coo_sort.cc
@@ -1,108 +0,0 @@
-/*!
- *  Copyright (c) 2020 by Contributors
- * \file array/cuda/coo_sort.cc
- * \brief Sort COO index
- */
-#include <dgl/array.h>
-#include "../../runtime/cuda/cuda_common.h"
-
-namespace dgl {
-
-using runtime::NDArray;
-
-namespace aten {
-namespace impl {
-
-template <DLDeviceType XPU, typename IdType>
-COOMatrix COOSort(COOMatrix coo, bool sort_column) {
-  CHECK(sizeof(IdType) == 4) << "CUDA COOSort does not support int64.";
-  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  auto device = runtime::DeviceAPI::Get(coo.row->ctx);
-  // allocate cusparse handle if needed
-  if (!thr_entry->cusparse_handle) {
-    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
-  }
-  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));
-
-
-  NDArray row = coo.row.CopyTo(coo.row->ctx);
-  NDArray col = coo.col.CopyTo(coo.col->ctx);
-  NDArray data;
-  if (aten::IsNullArray(coo.data)) {
-    // create the index array
-    data = aten::Range(0, row->shape[0], row->dtype.bits, row->ctx);
-  } else {
-    data = coo.data.CopyTo(coo.data->ctx);
-  }
-  int32_t* row_ptr = static_cast<int32_t*>(row->data);
-  int32_t* col_ptr = static_cast<int32_t*>(col->data);
-  int32_t* data_ptr = static_cast<int32_t*>(data->data);
-
-  // sort row
-  size_t workspace_size = 0;
-  CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt(
-      thr_entry->cusparse_handle,
-      coo.num_rows, coo.num_cols,
-      row->shape[0],
-      row_ptr,
-      col_ptr,
-      &workspace_size));
-  void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
-  CUSPARSE_CALL(cusparseXcoosortByRow(
-      thr_entry->cusparse_handle,
-      coo.num_rows, coo.num_cols,
-      row->shape[0],
-      row_ptr,
-      col_ptr,
-      data_ptr,
-      workspace));
-  device->FreeWorkspace(row->ctx, workspace);
-
-  if (sort_column) {
-    // First create a row indptr array and then call csrsort
-    int32_t* indptr = static_cast<int32_t*>(
-        device->AllocWorkspace(row->ctx, (coo.num_rows + 1) * sizeof(IdType)));
-    CUSPARSE_CALL(cusparseXcoo2csr(
-          thr_entry->cusparse_handle,
-          row_ptr,
-          row->shape[0],
-          coo.num_rows,
-          indptr,
-          CUSPARSE_INDEX_BASE_ZERO));
-    CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
-          thr_entry->cusparse_handle,
-          coo.num_rows,
-          coo.num_cols,
-          row->shape[0],
-          indptr,
-          col_ptr,
-          &workspace_size));
-    void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
-    cusparseMatDescr_t descr;
-    CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
-    CUSPARSE_CALL(cusparseXcsrsort(
-          thr_entry->cusparse_handle,
-          coo.num_rows,
-          coo.num_cols,
-          row->shape[0],
-          descr,
-          indptr,
-          col_ptr,
-          data_ptr,
-          workspace));
-    CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
-    device->FreeWorkspace(row->ctx, workspace);
-    device->FreeWorkspace(row->ctx, indptr);
-  }
-
-  return COOMatrix(coo.num_rows, coo.num_cols,
-                   row, col, data, true, sort_column);
-}
-
-template COOMatrix COOSort<kDLGPU, int32_t>(COOMatrix coo, bool sort_column);
-template COOMatrix COOSort<kDLGPU, int64_t>(COOMatrix coo, bool sort_column);
-
-
-}  // namespace impl
-}  // namespace aten
-}  // namespace dgl
--- a/src/array/cuda/coo_sort.cu
+++ b/src/array/cuda/coo_sort.cu
@@ -0,0 +1,158 @@
+/*!
+ *  Copyright (c) 2020 by Contributors
+ * \file array/cuda/coo_sort.cc
+ * \brief Sort COO index
+ */
+#include <dgl/array.h>
+#include "../../runtime/cuda/cuda_common.h"
+#include "./utils.h"
+
+namespace dgl {
+
+using runtime::NDArray;
+
+namespace aten {
+namespace impl {
+
+///////////////////////////// COOSort_ /////////////////////////////
+
+template <DLDeviceType XPU, typename IdType>
+void COOSort_(COOMatrix* coo, bool sort_column) {
+  // TODO(minjie): Current implementation is based on cusparse which only supports
+  //   int32_t. To support int64_t, we could use the Radix sort algorithm provided
+  //   by CUB.
+  CHECK(sizeof(IdType) == 4) << "CUDA COOSort does not support int64.";
+  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
+  auto device = runtime::DeviceAPI::Get(coo->row->ctx);
+  // allocate cusparse handle if needed
+  if (!thr_entry->cusparse_handle) {
+    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+  }
+  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));
+
+
+  NDArray row = coo->row;
+  NDArray col = coo->col;
+  if (!aten::COOHasData(*coo))
+    coo->data = aten::Range(0, row->shape[0], row->dtype.bits, row->ctx);
+  NDArray data = coo->data;
+  int32_t* row_ptr = static_cast<int32_t*>(row->data);
+  int32_t* col_ptr = static_cast<int32_t*>(col->data);
+  int32_t* data_ptr = static_cast<int32_t*>(data->data);
+
+  // sort row
+  size_t workspace_size = 0;
+  CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt(
+      thr_entry->cusparse_handle,
+      coo->num_rows, coo->num_cols,
+      row->shape[0],
+      row_ptr,
+      col_ptr,
+      &workspace_size));
+  void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
+  CUSPARSE_CALL(cusparseXcoosortByRow(
+      thr_entry->cusparse_handle,
+      coo->num_rows, coo->num_cols,
+      row->shape[0],
+      row_ptr,
+      col_ptr,
+      data_ptr,
+      workspace));
+  device->FreeWorkspace(row->ctx, workspace);
+
+  if (sort_column) {
+    // First create a row indptr array and then call csrsort
+    int32_t* indptr = static_cast<int32_t*>(
+        device->AllocWorkspace(row->ctx, (coo->num_rows + 1) * sizeof(IdType)));
+    CUSPARSE_CALL(cusparseXcoo2csr(
+          thr_entry->cusparse_handle,
+          row_ptr,
+          row->shape[0],
+          coo->num_rows,
+          indptr,
+          CUSPARSE_INDEX_BASE_ZERO));
+    CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
+          thr_entry->cusparse_handle,
+          coo->num_rows,
+          coo->num_cols,
+          row->shape[0],
+          indptr,
+          col_ptr,
+          &workspace_size));
+    void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
+    cusparseMatDescr_t descr;
+    CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
+    CUSPARSE_CALL(cusparseXcsrsort(
+          thr_entry->cusparse_handle,
+          coo->num_rows,
+          coo->num_cols,
+          row->shape[0],
+          descr,
+          indptr,
+          col_ptr,
+          data_ptr,
+          workspace));
+    CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
+    device->FreeWorkspace(row->ctx, workspace);
+    device->FreeWorkspace(row->ctx, indptr);
+  }
+
+  coo->row_sorted = true;
+  coo->col_sorted = sort_column;
+}
+
+template void COOSort_<kDLGPU, int32_t>(COOMatrix* coo, bool sort_column);
+template void COOSort_<kDLGPU, int64_t>(COOMatrix* coo, bool sort_column);
+
+///////////////////////////// COOIsSorted /////////////////////////////
+
+template <typename IdType>
+__global__ void _COOIsSortedKernel(
+    const IdType* row, const IdType* col,
+    int64_t nnz, int8_t* row_sorted, int8_t* col_sorted) {
+  int tx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int stride_x = gridDim.x * blockDim.x;
+  while (tx < nnz) {
+    if (tx == 0) {
+      row_sorted[0] = 1;
+      col_sorted[0] = 1;
+    } else {
+      row_sorted[tx] = static_cast<int8_t>(row[tx - 1] <= row[tx]);
+      col_sorted[tx] = static_cast<int8_t>(
+          row[tx - 1] < row[tx] || col[tx - 1] <= col[tx]);
+    }
+    tx += stride_x;
+  }
+}
+
+template <DLDeviceType XPU, typename IdType>
+std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
+  const int64_t nnz = coo.row->shape[0];
+  const auto& ctx = coo.row->ctx;
+  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
+  auto device = runtime::DeviceAPI::Get(ctx);
+  // We allocate a workspace of 2*nnz bytes. It wastes a little bit memory but should
+  // be fine.
+  int8_t* row_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
+  int8_t* col_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
+  const int nt = cuda::FindNumThreads(nnz);
+  const int nb = (nnz + nt - 1) / nt;
+  _COOIsSortedKernel<<<nb, nt, 0, thr_entry->stream>>>(
+      coo.row.Ptr<IdType>(), coo.col.Ptr<IdType>(),
+      nnz, row_flags, col_flags);
+
+  const bool row_sorted = cuda::AllTrue(row_flags, nnz, ctx);
+  const bool col_sorted = row_sorted? cuda::AllTrue(col_flags, nnz, ctx) : false;
+
+  device->FreeWorkspace(ctx, row_flags);
+  device->FreeWorkspace(ctx, col_flags);
+
+  return {row_sorted, col_sorted};
+}
+
+template std::pair<bool, bool> COOIsSorted<kDLGPU, int32_t>(COOMatrix coo);
+template std::pair<bool, bool> COOIsSorted<kDLGPU, int64_t>(COOMatrix coo);
+
+}  // namespace impl
+}  // namespace aten
+}  // namespace dgl
--- a/src/array/cuda/csr_sort.cu
+++ b/src/array/cuda/csr_sort.cu
@@ -0,0 +1,108 @@
+/*!
+ *  Copyright (c) 2020 by Contributors
+ * \file array/cuda/csr_sort.cc
+ * \brief Sort COO index
+ */
+#include <dgl/array.h>
+#include "../../runtime/cuda/cuda_common.h"
+#include "./utils.h"
+
+namespace dgl {
+
+using runtime::NDArray;
+
+namespace aten {
+namespace impl {
+
+/*!
+ * \brief Check whether each row is sorted.
+ */
+template <typename IdType>
+__global__ void _SegmentIsSorted(
+    const IdType* indptr, const IdType* indices,
+    int64_t num_rows, int8_t* flags) {
+  int tx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int stride_x = gridDim.x * blockDim.x;
+  while (tx < num_rows) {
+    bool f = true;
+    for (IdType i = indptr[tx] + 1; f && i < indptr[tx + 1]; ++i) {
+      f = (indices[i - 1] <= indices[i]);
+    }
+    flags[tx] = static_cast<int8_t>(f);
+    tx += stride_x;
+  }
+}
+
+template <DLDeviceType XPU, typename IdType>
+bool CSRIsSorted(CSRMatrix csr) {
+  const auto& ctx = csr.indptr->ctx;
+  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
+  auto device = runtime::DeviceAPI::Get(ctx);
+  // We allocate a workspace of num_rows bytes. It wastes a little bit memory but should
+  // be fine.
+  int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
+  const int nt = cuda::FindNumThreads(csr.num_rows);
+  const int nb = (csr.num_rows + nt - 1) / nt;
+  _SegmentIsSorted<<<nb, nt, 0, thr_entry->stream>>>(
+      csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
+      csr.num_rows, flags);
+  bool ret = cuda::AllTrue(flags, csr.num_rows, ctx);
+  device->FreeWorkspace(ctx, flags);
+  return ret;
+}
+
+template bool CSRIsSorted<kDLGPU, int32_t>(CSRMatrix csr);
+template bool CSRIsSorted<kDLGPU, int64_t>(CSRMatrix csr);
+
+template <DLDeviceType XPU, typename IdType>
+void CSRSort_(CSRMatrix* csr) {
+  CHECK(sizeof(IdType) == 4) << "CUDA CSRSort_ does not support int64.";
+  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
+  auto device = runtime::DeviceAPI::Get(csr->indptr->ctx);
+  // allocate cusparse handle if needed
+  if (!thr_entry->cusparse_handle) {
+    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
+  }
+  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));
+
+  NDArray indptr = csr->indptr;
+  NDArray indices = csr->indices;
+  const auto& ctx = indptr->ctx;
+  const int64_t nnz = indices->shape[0];
+  if (!aten::CSRHasData(*csr))
+    csr->data = aten::Range(0, nnz, indices->dtype.bits, ctx);
+  NDArray data = csr->data;
+
+  size_t workspace_size = 0;
+  CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
+      thr_entry->cusparse_handle,
+      csr->num_rows, csr->num_cols, nnz,
+      indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(),
+      &workspace_size));
+  void* workspace = device->AllocWorkspace(ctx, workspace_size);
+
+  cusparseMatDescr_t descr;
+  CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
+  CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
+  CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
+  CUSPARSE_CALL(cusparseXcsrsort(
+      thr_entry->cusparse_handle,
+      csr->num_rows, csr->num_cols, nnz,
+      descr,
+      indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(),
+      data.Ptr<int32_t>(),
+      workspace));
+
+  csr->sorted = true;
+
+  // free resources
+  CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
+  device->FreeWorkspace(ctx, workspace);
+}
+
+template void CSRSort_<kDLGPU, int32_t>(CSRMatrix* csr);
+template void CSRSort_<kDLGPU, int64_t>(CSRMatrix* csr);
+
+}  // namespace impl
+}  // namespace aten
+}  // namespace dgl
--- a/src/array/cuda/sddmm.cuh
+++ b/src/array/cuda/sddmm.cuh
@@ -10,7 +10,7 @@
 #include "macro.cuh"
 #include "atomic.cuh"
 #include "functor.cuh"
-#include "../../cuda_utils.h"
+#include "./utils.h"
 #include "../../runtime/cuda/cuda_common.h"

 namespace dgl {
--- a/src/array/cuda/spmat_op_impl.cu
+++ b/src/array/cuda/spmat_op_impl.cu
@@ -8,7 +8,7 @@
 #include <unordered_set>
 #include <numeric>
 #include "../../runtime/cuda/cuda_common.h"
-#include "../../cuda_utils.h"
+#include "./utils.h"

 namespace dgl {

@@ -17,8 +17,6 @@ using runtime::NDArray;
 namespace aten {
 namespace impl {

-///////////////////////////// CSRIsNonZero /////////////////////////////
-
 /*!
 * \brief Search adjacency list linearly for each (row, col) pair and
 * write the matched position in the indices array to the output.
@@ -33,7 +31,7 @@ __global__ void _LinearSearchKernel(
    int64_t row_stride, int64_t col_stride,
    int64_t length, IdType* out) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride_x = gridDim.x * blockDim.x;
+  const int stride_x = gridDim.x * blockDim.x;
  int rpos = tx, cpos = tx;
  while (tx < length) {
    out[tx] = -1;
@@ -50,6 +48,8 @@ __global__ void _LinearSearchKernel(
  }
 }

+///////////////////////////// CSRIsNonZero /////////////////////////////
+
 template <DLDeviceType XPU, typename IdType>
 bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
@@ -169,6 +169,88 @@ NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
 template NDArray CSRGetRowData<kDLGPU, int32_t>(CSRMatrix, int64_t);
 template NDArray CSRGetRowData<kDLGPU, int64_t>(CSRMatrix, int64_t);

+///////////////////////////// CSRSliceRows /////////////////////////////
+
+template <DLDeviceType XPU, typename IdType>
+CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
+  const int64_t num_rows = end - start;
+  const IdType st_pos = aten::IndexSelect<IdType>(csr.indptr, start);
+  const IdType ed_pos = aten::IndexSelect<IdType>(csr.indptr, end);
+  const IdType nnz = ed_pos - st_pos;
+  IdArray ret_indptr = aten::IndexSelect(csr.indptr, start, end + 1) - st_pos;
+  // indices and data can be view arrays
+  IdArray ret_indices = csr.indices.CreateView(
+      {nnz}, csr.indices->dtype, st_pos * sizeof(IdType));
+  IdArray ret_data;
+  if (CSRHasData(csr))
+    ret_data = csr.data.CreateView({nnz}, csr.data->dtype, st_pos * sizeof(IdType));
+  else
+    ret_data = aten::Range(st_pos, ed_pos,
+                           csr.indptr->dtype.bits, csr.indptr->ctx);
+  return CSRMatrix(num_rows, csr.num_cols,
+                   ret_indptr, ret_indices, ret_data,
+                   csr.sorted);
+}
+
+template CSRMatrix CSRSliceRows<kDLGPU, int32_t>(CSRMatrix, int64_t, int64_t);
+template CSRMatrix CSRSliceRows<kDLGPU, int64_t>(CSRMatrix, int64_t, int64_t);
+
+/*!
+ * \brief Copy data segment to output buffers
+ * 
+ * For the i^th row r = row[i], copy the data from indptr[r] ~ indptr[r+1]
+ * to the out_data from out_indptr[i] ~ out_indptr[i+1]
+ *
+ * If the provided `data` array is nullptr, write the read index to the out_data.
+ *
+ */
+template <typename IdType, typename DType>
+__global__ void _SegmentCopyKernel(
+    const IdType* indptr, const DType* data,
+    const IdType* row, int64_t row_stride, int64_t length,
+    const IdType* out_indptr, DType* out_data) {
+  int tx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int stride_x = gridDim.x * blockDim.x;
+  int rpos = tx;
+  while (tx < length) {
+    const IdType r = row[rpos];
+    DType* out_buf = out_data + out_indptr[tx];
+    for (IdType i = indptr[r]; i < indptr[r + 1]; ++i) {
+      *(out_buf++) = data? data[i] : i;
+    }
+    rpos += row_stride;
+    tx += stride_x;
+  }
+}
+
+template <DLDeviceType XPU, typename IdType>
+CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
+  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
+  const int64_t len = rows->shape[0];
+  IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true);
+  const int64_t nnz = aten::IndexSelect<IdType>(ret_indptr, len);
+
+  const int nt = cuda::FindNumThreads(len);
+  const int nb = (len + nt - 1) / nt;
+  // Copy indices.
+  IdArray ret_indices = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
+  _SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
+      csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
+      rows.Ptr<IdType>(), 1, len,
+      ret_indptr.Ptr<IdType>(), ret_indices.Ptr<IdType>());
+  // Copy data.
+  IdArray ret_data = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
+  _SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
+      csr.indptr.Ptr<IdType>(), CSRHasData(csr)? csr.data.Ptr<IdType>() : nullptr,
+      rows.Ptr<IdType>(), 1, len,
+      ret_indptr.Ptr<IdType>(), ret_data.Ptr<IdType>());
+  return CSRMatrix(len, csr.num_cols,
+                   ret_indptr, ret_indices, ret_data,
+                   csr.sorted);
+}
+
+template CSRMatrix CSRSliceRows<kDLGPU, int32_t>(CSRMatrix , NDArray);
+template CSRMatrix CSRSliceRows<kDLGPU, int64_t>(CSRMatrix , NDArray);

 }  // namespace impl
 }  // namespace aten
--- a/src/array/cuda/spmm.cu
+++ b/src/array/cuda/spmm.cu
@@ -140,6 +140,7 @@ void CusparseCsrmm2(
      static_cast<int32_t*>(csr.indptr->data),
      static_cast<int32_t*>(csr.indices->data),
      B_data, n, &beta, trans_out, m));
+  CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
  if (valptr)
    device->FreeWorkspace(ctx, valptr);
  // transpose the output matrix
--- a/src/array/cuda/spmm.cuh
+++ b/src/array/cuda/spmm.cuh
@@ -9,8 +9,8 @@
 #include <dgl/bcast.h>
 #include "macro.cuh"
 #include "atomic.cuh"
-#include "../../cuda_utils.h"
 #include "../../runtime/cuda/cuda_common.h"
+#include "./utils.h"

 namespace dgl {

--- a/src/array/cuda/utils.cu
+++ b/src/array/cuda/utils.cu
@@ -0,0 +1,30 @@
+/*!
+ *  Copyright (c) 2020 by Contributors
+ * \file array/cuda/utils.cu
+ * \brief Utilities for CUDA kernels.
+ */
+
+#include "./utils.h"
+#include <cub/cub.cuh>
+#include "../../runtime/cuda/cuda_common.h"
+
+namespace dgl {
+namespace cuda {
+
+bool AllTrue(int8_t* flags, int64_t length, const DLContext& ctx) {
+  auto device = runtime::DeviceAPI::Get(ctx);
+  int8_t* rst = static_cast<int8_t*>(device->AllocWorkspace(ctx, 1));
+  // Call CUB's reduction
+  size_t workspace_size = 0;
+  CUDA_CALL(cub::DeviceReduce::Min(nullptr, workspace_size, flags, rst, length));
+  void* workspace = device->AllocWorkspace(ctx, workspace_size);
+  CUDA_CALL(cub::DeviceReduce::Min(workspace, workspace_size, flags, rst, length));
+  int8_t cpu_rst = 0;
+  CUDA_CALL(cudaMemcpy(&cpu_rst, rst, 1, cudaMemcpyDeviceToHost));
+  device->FreeWorkspace(ctx, workspace);
+  device->FreeWorkspace(ctx, rst);
+  return cpu_rst == 1;
+}
+
+}  // namespace cuda
+}  // namespace dgl
--- a/src/array/cuda/utils.h
+++ b/src/array/cuda/utils.h
@@ -1,12 +1,13 @@
 /*!
 *  Copyright (c) 2020 by Contributors
- * \file cuda_utils.h
+ * \file array/cuda/utils.h
 * \brief Utilities for CUDA kernels.
 */
-#ifndef DGL_CUDA_UTILS_H_
-#define DGL_CUDA_UTILS_H_
+#ifndef DGL_ARRAY_CUDA_UTILS_H_
+#define DGL_ARRAY_CUDA_UTILS_H_

 #include <dmlc/logging.h>
+#include <dlpack/dlpack.h>

 namespace dgl {
 namespace cuda {
@@ -68,7 +69,18 @@ __device__ __forceinline__ T _ldg(T* addr) {
 #endif
 }

+/*!
+ * \brief Return true if the given bool flag array is all true.
+ * The input bool array is in int8_t type so it is aligned with byte address.
+ *
+ * \param flags The bool array.
+ * \param length The length.
+ * \param ctx Device context.
+ * \return True if all the flags are true.
+ */
+bool AllTrue(int8_t* flags, int64_t length, const DLContext& ctx);
+
 }  // namespace cuda
 }  // namespace dgl

-#endif  // DGL_CUDA_UTILS_H_
+#endif  // DGL_ARRAY_CUDA_UTILS_H_
--- a/src/array/kernel.cc
+++ b/src/array/kernel.cc
@@ -3,7 +3,6 @@
 * \file array/kernel.cc
 * \brief New kernels
 */
-#include <dgl/array.h>
 #include <dgl/packed_func_ext.h>
 #include <dgl/base_heterograph.h>

--- a/src/array/kernel_decl.h
+++ b/src/array/kernel_decl.h
@@ -6,9 +6,9 @@
 #ifndef DGL_ARRAY_KERNEL_DECL_H_
 #define DGL_ARRAY_KERNEL_DECL_H_

-#include <dgl/array.h>
 #include <dgl/bcast.h>
 #include <dgl/base_heterograph.h>
+#include <dgl/runtime/ndarray.h>

 #include <string>
 #include <vector>
--- a/src/c_api_common.h
+++ b/src/c_api_common.h
@@ -14,6 +14,7 @@
 #include <algorithm>
 #include <vector>
 #include <string>
+#include <utility>

 namespace dgl {

--- a/src/graph/network.cc
+++ b/src/graph/network.cc
@@ -804,7 +804,7 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
      }
    }
    int msg_count = 0;
-    for (int i = 0; i < remote_ids.size(); ++i) {
+    for (size_t i = 0; i < remote_ids.size(); ++i) {
      if (remote_ids[i].size() != 0) {
        KVStoreMsg kv_msg;
        kv_msg.msg_type = MessageType::kPullMsg;
@@ -827,9 +827,10 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
      }
    }
    char *return_data = new char[ID_size*row_size];
+    const int64_t local_ids_size = local_ids.size();
    // Copy local data
 #pragma omp parallel for
-    for (int64_t i = 0; i < local_ids.size(); ++i) {
+    for (int64_t i = 0; i < local_ids_size; ++i) {
      CHECK_GE(ID_size*row_size, local_ids_orginal[i] * row_size + row_size);
      CHECK_GE(data_size, local_ids[i] * row_size + row_size);
      CHECK_GE(local_ids[i], 0);
@@ -843,7 +844,7 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
      int64_t id_size = kv_msg->id.GetSize() / sizeof(int64_t);
      int part_id = kv_msg->rank / group_count;
      char* data_char = static_cast<char*>(kv_msg->data->data);
-      for (size_t n = 0; n < id_size; ++n) {
+      for (int64_t n = 0; n < id_size; ++n) {
        memcpy(return_data + remote_ids_original[part_id][n] * row_size,
               data_char + n * row_size,
               row_size);
--- a/src/graph/serialize/graph_serialize.h
+++ b/src/graph/serialize/graph_serialize.h
@@ -20,6 +20,7 @@
 #include <vector>
 #include <algorithm>
 #include <utility>
+#include <memory>
 #include "../../c_api_common.h"

 using dgl::runtime::NDArray;
--- a/src/graph/transform/to_bipartite.cc
+++ b/src/graph/transform/to_bipartite.cc
@@ -51,9 +51,11 @@ ToBlock(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes, bool includ
    const auto src_dst_types = graph->GetEndpointTypes(etype);
    const dgl_type_t srctype = src_dst_types.first;
    const dgl_type_t dsttype = src_dst_types.second;
-    const EdgeArray edges = graph->InEdges(etype, rhs_nodes[dsttype]);
-    lhs_node_mappings[srctype].Update(edges.src);
-    edge_arrays[etype] = edges;
+    if (!aten::IsNullArray(rhs_nodes[dsttype])) {
+      const EdgeArray& edges = graph->Edges(etype);
+      lhs_node_mappings[srctype].Update(edges.src);
+      edge_arrays[etype] = edges;
+    }
  }

  const auto meta_graph = graph->meta_graph();
@@ -75,11 +77,26 @@ ToBlock(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes, bool includ
    const dgl_type_t dsttype = src_dst_types.second;
    const IdHashMap<IdType> &lhs_map = lhs_node_mappings[srctype];
    const IdHashMap<IdType> &rhs_map = rhs_node_mappings[dsttype];
-    rel_graphs.push_back(CreateFromCOO(
-        2, lhs_map.Size(), rhs_map.Size(),
-        lhs_map.Map(edge_arrays[etype].src, -1),
-        rhs_map.Map(edge_arrays[etype].dst, -1)));
-    induced_edges.push_back(edge_arrays[etype].id);
+    if (rhs_map.Size() == 0) {
+      // No rhs nodes are given for this edge type. Create an empty graph.
+      rel_graphs.push_back(CreateFromCOO(
+          2, lhs_map.Size(), rhs_map.Size(),
+          aten::NullArray(), aten::NullArray()));
+      induced_edges.push_back(aten::NullArray());
+    } else {
+      IdArray new_src = lhs_map.Map(edge_arrays[etype].src, -1);
+      IdArray new_dst = rhs_map.Map(edge_arrays[etype].dst, -1);
+      // Check whether there are unmapped IDs and raise error.
+      for (int64_t i = 0; i < new_dst->shape[0]; ++i)
+        CHECK_NE(new_dst.Ptr<IdType>()[i], -1)
+          << "Node " << edge_arrays[etype].dst.Ptr<IdType>()[i] << " does not exist"
+          << " in `rhs_nodes`. Argument `rhs_nodes` must contain all the edge"
+          << " destination nodes.";
+      rel_graphs.push_back(CreateFromCOO(
+          2, lhs_map.Size(), rhs_map.Size(),
+          new_src, new_dst));
+      induced_edges.push_back(edge_arrays[etype].id);
+    }
  }

  const HeteroGraphPtr new_graph = CreateHeteroGraph(
--- a/src/graph/unit_graph.cc
+++ b/src/graph/unit_graph.cc
@@ -138,13 +138,7 @@ class UnitGraph::COO : public BaseHeteroGraph {
  COO CopyTo(const DLContext& ctx) const {
    if (Context() == ctx)
      return *this;
-
-    COO ret(
-        meta_graph_,
-        adj_.num_rows, adj_.num_cols,
-        adj_.row.CopyTo(ctx),
-        adj_.col.CopyTo(ctx));
-    return ret;
+    return COO(meta_graph_, adj_.CopyTo(ctx));
  }

  bool IsMultigraph() const override {
@@ -516,13 +510,7 @@ class UnitGraph::CSR : public BaseHeteroGraph {
    if (Context() == ctx) {
      return *this;
    } else {
-      CSR ret(
-          meta_graph_,
-          adj_.num_rows, adj_.num_cols,
-          adj_.indptr.CopyTo(ctx),
-          adj_.indices.CopyTo(ctx),
-          adj_.data.CopyTo(ctx));
-      return ret;
+      return CSR(meta_graph_, adj_.CopyTo(ctx));
    }
  }

@@ -1181,35 +1169,28 @@ HeteroGraphPtr UnitGraph::AsNumBits(HeteroGraphPtr g, uint8_t bits) {
  if (g->NumBits() == bits) {
    return g;
  } else {
-    // TODO(minjie): since we don't have int32 operations,
-    //   we make sure that this graph (on CPU) has materialized CSR,
-    //   and then copy them to other context (usually GPU). This should
-    //   be fixed later.
    auto bg = std::dynamic_pointer_cast<UnitGraph>(g);
    CHECK_NOTNULL(bg);
-
-    CSRPtr new_incsr = CSRPtr(new CSR(bg->GetInCSR()->AsNumBits(bits)));
-    CSRPtr new_outcsr = CSRPtr(new CSR(bg->GetOutCSR()->AsNumBits(bits)));
+    CSRPtr new_incsr = (bg->in_csr_)? CSRPtr(new CSR(bg->in_csr_->AsNumBits(bits))) : nullptr;
+    CSRPtr new_outcsr = (bg->out_csr_)? CSRPtr(new CSR(bg->out_csr_->AsNumBits(bits))) : nullptr;
+    COOPtr new_coo = (bg->coo_)? COOPtr(new COO(bg->coo_->AsNumBits(bits))) : nullptr;
    return HeteroGraphPtr(
-        new UnitGraph(g->meta_graph(), new_incsr, new_outcsr, nullptr, bg->restrict_format_));
+        new UnitGraph(g->meta_graph(), new_incsr, new_outcsr, new_coo, bg->restrict_format_));
  }
 }

 HeteroGraphPtr UnitGraph::CopyTo(HeteroGraphPtr g, const DLContext& ctx) {
  if (ctx == g->Context()) {
    return g;
+  } else {
+    auto bg = std::dynamic_pointer_cast<UnitGraph>(g);
+    CHECK_NOTNULL(bg);
+    CSRPtr new_incsr = (bg->in_csr_)? CSRPtr(new CSR(bg->in_csr_->CopyTo(ctx))) : nullptr;
+    CSRPtr new_outcsr = (bg->out_csr_)? CSRPtr(new CSR(bg->out_csr_->CopyTo(ctx))) : nullptr;
+    COOPtr new_coo = (bg->coo_)? COOPtr(new COO(bg->coo_->CopyTo(ctx))) : nullptr;
+    return HeteroGraphPtr(
+        new UnitGraph(g->meta_graph(), new_incsr, new_outcsr, new_coo, bg->restrict_format_));
  }
-  // TODO(minjie): since we don't have GPU implementation of COO<->CSR,
-  //   we make sure that this graph (on CPU) has materialized CSR,
-  //   and then copy them to other context (usually GPU). This should
-  //   be fixed later.
-  auto bg = std::dynamic_pointer_cast<UnitGraph>(g);
-  CHECK_NOTNULL(bg);
-
-  CSRPtr new_incsr = CSRPtr(new CSR(bg->GetInCSR()->CopyTo(ctx)));
-  CSRPtr new_outcsr = CSRPtr(new CSR(bg->GetOutCSR()->CopyTo(ctx)));
-  return HeteroGraphPtr(
-      new UnitGraph(g->meta_graph(), new_incsr, new_outcsr, nullptr, bg->restrict_format_));
 }

 UnitGraph::UnitGraph(GraphPtr metagraph, CSRPtr in_csr, CSRPtr out_csr, COOPtr coo,
@@ -1278,9 +1259,8 @@ UnitGraph::CSRPtr UnitGraph::GetInCSR(bool inplace) const {
        const_cast<UnitGraph*>(this)->in_csr_ = ret;
    } else {
      CHECK(coo_) << "None of CSR, COO exist";
-      const auto& adj = coo_->adj();
-      const auto& newadj = aten::COOToCSR(
-          aten::COOMatrix{adj.num_cols, adj.num_rows, adj.col, adj.row});
+      const auto& newadj = aten::CSRSort(aten::COOToCSR(
+            aten::COOTranspose(coo_->adj())));
      ret = std::make_shared<CSR>(meta_graph(), newadj);
      if (inplace)
        const_cast<UnitGraph*>(this)->in_csr_ = ret;
@@ -1299,13 +1279,13 @@ UnitGraph::CSRPtr UnitGraph::GetOutCSR(bool inplace) const {
  CSRPtr ret = out_csr_;
  if (!out_csr_) {
    if (in_csr_) {
-      const auto& newadj = aten::CSRTranspose(in_csr_->adj());
+      const auto& newadj = aten::CSRSort(aten::CSRTranspose(in_csr_->adj()));
      ret = std::make_shared<CSR>(meta_graph(), newadj);
      if (inplace)
        const_cast<UnitGraph*>(this)->out_csr_ = ret;
    } else {
      CHECK(coo_) << "None of CSR, COO exist";
-      const auto& newadj = aten::COOToCSR(coo_->adj());
+      const auto& newadj = aten::CSRSort(aten::COOToCSR(coo_->adj()));
      ret = std::make_shared<CSR>(meta_graph(), newadj);
      if (inplace)
        const_cast<UnitGraph*>(this)->out_csr_ = ret;
--- a/src/rpc/network/socket_communicator.cc
+++ b/src/rpc/network/socket_communicator.cc
@@ -8,6 +8,7 @@
 #include <string.h>
 #include <stdlib.h>
 #include <time.h>
+#include <memory>

 #include "socket_communicator.h"
 #include "../../c_api_common.h"
--- a/src/rpc/network/socket_communicator.h
+++ b/src/rpc/network/socket_communicator.h
@@ -10,6 +10,7 @@
 #include <vector>
 #include <string>
 #include <unordered_map>
+#include <memory>

 #include "communicator.h"
 #include "msg_queue.h"
@@ -19,9 +20,9 @@
 namespace dgl {
 namespace network {

-static int kMaxTryCount = 1024;    // maximal connection: 1024
-static int kTimeOut = 10;          // 10 minutes for socket timeout
-static int kMaxConnection = 1024;  // maximal connection: 1024
+static constexpr int kMaxTryCount = 1024;    // maximal connection: 1024
+static constexpr int kTimeOut = 10;          // 10 minutes for socket timeout
+static constexpr int kMaxConnection = 1024;  // maximal connection: 1024

 /*!
 * \breif Networking address
--- a/src/runtime/file_util.cc
+++ b/src/runtime/file_util.cc
@@ -7,6 +7,7 @@
 #include <dgl/runtime/serializer.h>
 #include <fstream>
 #include <vector>
+#include <unordered_map>

 #include "file_util.h"

--- a/src/runtime/file_util.h
+++ b/src/runtime/file_util.h
@@ -7,6 +7,7 @@
 #define DGL_RUNTIME_FILE_UTIL_H_

 #include <string>
+#include <unordered_map>
 #include "meta_data.h"

 namespace dgl {
--- a/src/runtime/module_util.cc
+++ b/src/runtime/module_util.cc
@@ -9,6 +9,7 @@
 #include <dgl/runtime/module.h>
 #include <dgl/runtime/registry.h>
 #include <string>
+#include <memory>
 #include "module_util.h"

 namespace dgl {
--- a/src/runtime/module_util.h
+++ b/src/runtime/module_util.h
@@ -10,6 +10,7 @@
 #include <dgl/runtime/c_runtime_api.h>
 #include <dgl/runtime/c_backend_api.h>
 #include <vector>
+#include <memory>

 extern "C" {
 // Function signature for generated packed function in shared library
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -124,6 +124,8 @@ size_t NDArray::GetSize() const {
 }

 int64_t NDArray::NumElements() const {
+  if (data_->dl_tensor.ndim == 0)
+    return 0;
  int64_t size = 1;
  for (int i = 0; i < data_->dl_tensor.ndim; ++i) {
    size *= data_->dl_tensor.shape[i];
--- a/src/runtime/workspace_pool.cc
+++ b/src/runtime/workspace_pool.cc
@@ -4,6 +4,7 @@
 * \brief Workspace pool utility.
 */
 #include "workspace_pool.h"
+#include <memory>

 namespace dgl {
 namespace runtime {
--- a/src/runtime/workspace_pool.h
+++ b/src/runtime/workspace_pool.h
@@ -8,6 +8,7 @@

 #include <dgl/runtime/device_api.h>
 #include <vector>
+#include <memory>

 namespace dgl {
 namespace runtime {
--- a/tests/compute/test_heterograph.py
+++ b/tests/compute/test_heterograph.py
@@ -1883,4 +1883,4 @@ if __name__ == '__main__':
    # test_isolated_ntype()
    # test_bipartite()
    # test_dtype_cast()
-    test_format()
+    pass
--- a/tests/compute/test_transform.py
+++ b/tests/compute/test_transform.py
@@ -603,10 +603,6 @@ def test_to_block(index_dtype):
    assert bg.number_of_src_nodes() == 4
    assert bg.number_of_dst_nodes() == 4

-    dst_nodes = F.tensor([3, 4], dtype=getattr(F, index_dtype))
-    bg = dgl.to_block(g_a, dst_nodes)
-    check(g_a, bg, 'A', 'AA', dst_nodes)
-
    dst_nodes = F.tensor([4, 3, 2, 1], dtype=getattr(F, index_dtype))
    bg = dgl.to_block(g_a, dst_nodes)
    check(g_a, bg, 'A', 'AA', dst_nodes)
@@ -620,17 +616,13 @@ def test_to_block(index_dtype):
    assert bg.number_of_nodes('DST/A') == 0
    checkall(g_ab, bg, None)

-    dst_nodes = {'B': F.tensor([5, 6], dtype=getattr(F, index_dtype))}
+    dst_nodes = {'B': F.tensor([5, 6, 3, 1], dtype=getattr(F, index_dtype))}
    bg = dgl.to_block(g, dst_nodes)
-    assert bg.number_of_nodes('SRC/B') == 2
+    assert bg.number_of_nodes('SRC/B') == 4
    assert F.array_equal(bg.srcnodes['B'].data[dgl.NID], bg.dstnodes['B'].data[dgl.NID])
    assert bg.number_of_nodes('DST/A') == 0
    checkall(g, bg, dst_nodes)

-    dst_nodes = {'A': F.tensor([3, 4], dtype=getattr(F, index_dtype)), 'B': F.tensor([5, 6], dtype=getattr(F, index_dtype))}
-    bg = dgl.to_block(g, dst_nodes)
-    checkall(g, bg, dst_nodes)
-
    dst_nodes = {'A': F.tensor([4, 3, 2, 1], dtype=getattr(F, index_dtype)), 'B': F.tensor([3, 5, 6, 1], dtype=getattr(F, index_dtype))}
    bg = dgl.to_block(g, dst_nodes=dst_nodes)
    checkall(g, bg, dst_nodes)
--- a/tests/cpp/common.h
+++ b/tests/cpp/common.h
@@ -29,6 +29,10 @@ inline int64_t Len(dgl::runtime::NDArray nd) {
 template <typename T>
 inline bool ArrayEQ(dgl::runtime::NDArray a1, dgl::runtime::NDArray a2) {
  if (a1->ndim != a2->ndim) return false;
+  if (a1->dtype != a2->dtype) return false;
+  if (a1->ctx != a2->ctx) return false;
+  if (a1.NumElements() != a2.NumElements()) return false;
+  if (a1.NumElements() == 0) return true;
  int64_t num = 1;
  for (int i = 0; i < a1->ndim; ++i) {
    if (a1->shape[i] != a2->shape[i])
--- a/tests/cpp/test_aten.cc
+++ b/tests/cpp/test_aten.cc
@@ -208,6 +208,8 @@ template <typename IDX>
 void _TestIndexSelect(DLContext ctx) {
  IdArray a = aten::Range(0, 100, sizeof(IDX)*8, ctx);
  ASSERT_EQ(aten::IndexSelect<int>(a, 50), 50);
+  ASSERT_TRUE(ArrayEQ<IDX>(aten::IndexSelect(a, 10, 20),
+        aten::Range(10, 20, sizeof(IDX)*8, ctx)));
  IdArray b = aten::VecToIdArray(std::vector<IDX>({0, 20, 10}), sizeof(IDX)*8, ctx);
  IdArray c = aten::IndexSelect(a, b);
  ASSERT_TRUE(ArrayEQ<IDX>(b, c));
@@ -239,3 +241,41 @@ TEST(ArrayTest, TestRelabel_) {
  _TestRelabel_<int32_t>();
  _TestRelabel_<int64_t>();
 }
+
+template <typename IDX>
+void _TestCumSum(DLContext ctx) {
+  IdArray a = aten::VecToIdArray(std::vector<IDX>({8, 6, 7, 5, 3, 0, 9}),
+      sizeof(IDX)*8, ctx);
+  {
+    IdArray tb = aten::VecToIdArray(std::vector<IDX>({8, 14, 21, 26, 29, 29, 38}),
+        sizeof(IDX)*8, ctx);
+    IdArray b = aten::CumSum(a);
+    ASSERT_TRUE(ArrayEQ<IDX>(b, tb));
+  }
+  {
+    IdArray tb = aten::VecToIdArray(std::vector<IDX>({0, 8, 14, 21, 26, 29, 29, 38}),
+        sizeof(IDX)*8, ctx);
+    IdArray b = aten::CumSum(a, true);
+    ASSERT_TRUE(ArrayEQ<IDX>(b, tb));
+  }
+  a = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, ctx);
+  {
+    IdArray tb = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, ctx);
+    IdArray b = aten::CumSum(a);
+    ASSERT_TRUE(ArrayEQ<IDX>(b, tb));
+  }
+  {
+    IdArray tb = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, ctx);
+    IdArray b = aten::CumSum(a);
+    ASSERT_TRUE(ArrayEQ<IDX>(b, tb));
+  }
+}
+
+TEST(ArrayTest, CumSum) {
+  _TestCumSum<int32_t>(CPU);
+  _TestCumSum<int64_t>(CPU);
+#ifdef DGL_USE_CUDA
+  _TestCumSum<int32_t>(GPU);
+  _TestCumSum<int64_t>(GPU);
+#endif
+}
--- a/tests/cpp/test_spmat.cc
+++ b/tests/cpp/test_spmat.cc
@@ -17,8 +17,8 @@ aten::CSRMatrix CSR1(DLContext ctx = CTX) {
  return aten::CSRMatrix(
      4, 5,
      aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 5, 5}), sizeof(IDX)*8, ctx),
-      aten::VecToIdArray(std::vector<IDX>({1, 2, 0, 2, 3}), sizeof(IDX)*8, ctx),
-      aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 1, 4}), sizeof(IDX)*8, ctx),
+      aten::VecToIdArray(std::vector<IDX>({1, 2, 0, 3, 2}), sizeof(IDX)*8, ctx),
+      aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 4, 1}), sizeof(IDX)*8, ctx),
      false);
 }

@@ -277,12 +277,23 @@ void _TestCSRToCOO(DLContext ctx) {
  auto coo = CSRToCOO(csr, false);
  ASSERT_EQ(coo.num_rows, 4);
  ASSERT_EQ(coo.num_cols, 5);
+  ASSERT_TRUE(coo.row_sorted);
  auto tr = aten::VecToIdArray(std::vector<IDX>({0, 0, 0, 1, 2, 2}), sizeof(IDX)*8, ctx);
-  auto tc = aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0, 2, 3}), sizeof(IDX)*8, ctx);
-  auto td = aten::VecToIdArray(std::vector<IDX>({0, 2, 5, 3, 1, 4}), sizeof(IDX)*8, ctx);
  ASSERT_TRUE(ArrayEQ<IDX>(coo.row, tr));
-  ASSERT_TRUE(ArrayEQ<IDX>(coo.col, tc));
-  ASSERT_TRUE(ArrayEQ<IDX>(coo.data, td));
+  ASSERT_TRUE(ArrayEQ<IDX>(coo.col, csr.indices));
+  ASSERT_TRUE(ArrayEQ<IDX>(coo.data, csr.data));
+
+  // convert from sorted csr
+  auto s_csr = CSRSort(csr);
+  coo = CSRToCOO(s_csr, false);
+  ASSERT_EQ(coo.num_rows, 4);
+  ASSERT_EQ(coo.num_cols, 5);
+  ASSERT_TRUE(coo.row_sorted);
+  ASSERT_TRUE(coo.col_sorted);
+  tr = aten::VecToIdArray(std::vector<IDX>({0, 0, 0, 1, 2, 2}), sizeof(IDX)*8, ctx);
+  ASSERT_TRUE(ArrayEQ<IDX>(coo.row, tr));
+  ASSERT_TRUE(ArrayEQ<IDX>(coo.col, s_csr.indices));
+  ASSERT_TRUE(ArrayEQ<IDX>(coo.data, s_csr.data));
  }
  {
  auto coo = CSRToCOO(csr, true);
@@ -294,7 +305,7 @@ void _TestCSRToCOO(DLContext ctx) {
  }
 }

-TEST(SpmatTest, TestCSRToCOO) {
+TEST(SpmatTest, CSRToCOO) {
  _TestCSRToCOO<int32_t>(CPU);
  _TestCSRToCOO<int64_t>(CPU);
 #if DGL_USE_CUDA
@@ -303,8 +314,8 @@ TEST(SpmatTest, TestCSRToCOO) {
 }

 template <typename IDX>
-void _TestCSRSliceRows() {
-  auto csr = CSR2<IDX>();
+void _TestCSRSliceRows(DLContext ctx) {
+  auto csr = CSR2<IDX>(ctx);
  auto x = aten::CSRSliceRows(csr, 1, 4);
  //  [1, 0, 0, 0, 0],
  //  [0, 0, 1, 1, 0],
@@ -312,30 +323,34 @@ void _TestCSRSliceRows() {
  // data: [3, 1, 4]
  ASSERT_EQ(x.num_rows, 3);
  ASSERT_EQ(x.num_cols, 5);
-  auto tp = aten::VecToIdArray(std::vector<IDX>({0, 1, 3, 3}), sizeof(IDX)*8, CTX);
-  auto ti = aten::VecToIdArray(std::vector<IDX>({0, 2, 3}), sizeof(IDX)*8, CTX);
-  auto td = aten::VecToIdArray(std::vector<IDX>({3, 1, 4}), sizeof(IDX)*8, CTX);
+  auto tp = aten::VecToIdArray(std::vector<IDX>({0, 1, 3, 3}), sizeof(IDX)*8, ctx);
+  auto ti = aten::VecToIdArray(std::vector<IDX>({0, 2, 3}), sizeof(IDX)*8, ctx);
+  auto td = aten::VecToIdArray(std::vector<IDX>({3, 1, 4}), sizeof(IDX)*8, ctx);
  ASSERT_TRUE(ArrayEQ<IDX>(x.indptr, tp));
  ASSERT_TRUE(ArrayEQ<IDX>(x.indices, ti));
  ASSERT_TRUE(ArrayEQ<IDX>(x.data, td));

-  auto r = aten::VecToIdArray(std::vector<IDX>({0, 1, 3}), sizeof(IDX)*8, CTX);
+  auto r = aten::VecToIdArray(std::vector<IDX>({0, 1, 3}), sizeof(IDX)*8, ctx);
  x = aten::CSRSliceRows(csr, r);
  // [[0, 1, 2, 0, 0],
  //  [1, 0, 0, 0, 0],
  //  [0, 0, 0, 0, 0]]
  // data: [0, 2, 5, 3]
-  tp = aten::VecToIdArray(std::vector<IDX>({0, 3, 4, 4}), sizeof(IDX)*8, CTX);
-  ti = aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0}), sizeof(IDX)*8, CTX);
-  td = aten::VecToIdArray(std::vector<IDX>({0, 2, 5, 3}), sizeof(IDX)*8, CTX);
+  tp = aten::VecToIdArray(std::vector<IDX>({0, 3, 4, 4}), sizeof(IDX)*8, ctx);
+  ti = aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0}), sizeof(IDX)*8, ctx);
+  td = aten::VecToIdArray(std::vector<IDX>({0, 2, 5, 3}), sizeof(IDX)*8, ctx);
  ASSERT_TRUE(ArrayEQ<IDX>(x.indptr, tp));
  ASSERT_TRUE(ArrayEQ<IDX>(x.indices, ti));
  ASSERT_TRUE(ArrayEQ<IDX>(x.data, td));
 }

 TEST(SpmatTest, TestCSRSliceRows) {
-  _TestCSRSliceRows<int32_t>();
-  _TestCSRSliceRows<int64_t>();
+  _TestCSRSliceRows<int32_t>(CPU);
+  _TestCSRSliceRows<int64_t>(CPU);
+#ifdef DGL_USE_CUDA
+  _TestCSRSliceRows<int32_t>(GPU);
+  _TestCSRSliceRows<int64_t>(GPU);
+#endif
 }

 template <typename IDX>
@@ -376,6 +391,29 @@ TEST(SpmatTest, TestCSRHasDuplicate) {
  _TestCSRHasDuplicate<int64_t>();
 }

+template <typename IDX>
+void _TestCSRSort(DLContext ctx) {
+  auto csr = CSR1<IDX>(ctx);
+  ASSERT_FALSE(aten::CSRIsSorted(csr));
+  auto csr1 = aten::CSRSort(csr);
+  ASSERT_FALSE(aten::CSRIsSorted(csr));
+  ASSERT_TRUE(aten::CSRIsSorted(csr1));
+  ASSERT_TRUE(csr1.sorted);
+  aten::CSRSort_(&csr);
+  ASSERT_TRUE(aten::CSRIsSorted(csr));
+  ASSERT_TRUE(csr.sorted);
+  csr = CSR2<IDX>(ctx);
+  ASSERT_TRUE(aten::CSRIsSorted(csr));
+}
+
+TEST(SpmatTest, CSRSort) {
+  _TestCSRSort<int32_t>(CPU);
+  _TestCSRSort<int64_t>(CPU);
+#ifdef DGL_USE_CUDA
+  _TestCSRSort<int32_t>(GPU);
+#endif
+}
+
 template <typename IDX>
 void _TestCOOToCSR(DLContext ctx) {
  auto coo = COO1<IDX>(ctx);
@@ -392,6 +430,7 @@ void _TestCOOToCSR(DLContext ctx) {
  ASSERT_EQ(coo.num_cols, csr.num_cols);
  ASSERT_TRUE(ArrayEQ<IDX>(csr.indptr, tcsr.indptr));

+  // Convert from row sorted coo
  coo = COO1<IDX>(ctx);
  auto rs_coo = aten::COOSort(coo, false);
  auto rs_csr = CSR1<IDX>(ctx);
@@ -399,6 +438,8 @@ void _TestCOOToCSR(DLContext ctx) {
  ASSERT_EQ(coo.num_rows, rs_tcsr.num_rows);
  ASSERT_EQ(coo.num_cols, rs_tcsr.num_cols);
  ASSERT_TRUE(ArrayEQ<IDX>(rs_csr.indptr, rs_tcsr.indptr));
+  ASSERT_TRUE(ArrayEQ<IDX>(rs_tcsr.indices, rs_coo.col));
+  ASSERT_TRUE(ArrayEQ<IDX>(rs_tcsr.data, rs_coo.data));

  coo = COO3<IDX>(ctx);
  rs_coo = aten::COOSort(coo, false);
@@ -407,16 +448,20 @@ void _TestCOOToCSR(DLContext ctx) {
  ASSERT_EQ(coo.num_rows, rs_tcsr.num_rows);
  ASSERT_EQ(coo.num_cols, rs_tcsr.num_cols);
  ASSERT_TRUE(ArrayEQ<IDX>(rs_csr.indptr, rs_tcsr.indptr));
+  ASSERT_TRUE(ArrayEQ<IDX>(rs_tcsr.indices, rs_coo.col));
+  ASSERT_TRUE(ArrayEQ<IDX>(rs_tcsr.data, rs_coo.data));

+  // Convert from col sorted coo
  coo = COO1<IDX>(ctx);
  auto src_coo = aten::COOSort(coo, true);
  auto src_csr = CSR1<IDX>(ctx);
  auto src_tcsr = aten::COOToCSR(src_coo);
  ASSERT_EQ(coo.num_rows, src_tcsr.num_rows);
  ASSERT_EQ(coo.num_cols, src_tcsr.num_cols);
-  ASSERT_TRUE(ArrayEQ<IDX>(src_csr.indptr, src_tcsr.indptr));
-  ASSERT_TRUE(ArrayEQ<IDX>(src_csr.indices, src_tcsr.indices));
-  ASSERT_TRUE(ArrayEQ<IDX>(src_csr.data, src_tcsr.data));
+  ASSERT_TRUE(src_tcsr.sorted);
+  ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.indptr, src_csr.indptr));
+  ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.indices, src_coo.col));
+  ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.data, src_coo.data));

  coo = COO3<IDX>(ctx);
  src_coo = aten::COOSort(coo, true);
@@ -424,12 +469,13 @@ void _TestCOOToCSR(DLContext ctx) {
  src_tcsr = aten::COOToCSR(src_coo);
  ASSERT_EQ(coo.num_rows, src_tcsr.num_rows);
  ASSERT_EQ(coo.num_cols, src_tcsr.num_cols);
-  ASSERT_TRUE(ArrayEQ<IDX>(src_csr.indptr, src_tcsr.indptr));
-  ASSERT_TRUE(ArrayEQ<IDX>(src_csr.indices, src_tcsr.indices));
-  ASSERT_TRUE(ArrayEQ<IDX>(src_csr.data, src_tcsr.data));
+  ASSERT_TRUE(src_tcsr.sorted);
+  ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.indptr, src_csr.indptr));
+  ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.indices, src_coo.col));
+  ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.data, src_coo.data));
 }

-TEST(SpmatTest, TestCOOToCSR) {
+TEST(SpmatTest, COOToCSR) {
  _TestCOOToCSR<int32_t>(CPU);
  _TestCOOToCSR<int64_t>(CPU);
 #ifdef DGL_USE_CUDA
@@ -453,12 +499,37 @@ TEST(SpmatTest, TestCOOHasDuplicate) {
 template <typename IDX>
 void _TestCOOSort(DLContext ctx) {
  auto coo = COO3<IDX>(ctx);
+  
  auto sr_coo = COOSort(coo, false);
  ASSERT_EQ(coo.num_rows, sr_coo.num_rows);
  ASSERT_EQ(coo.num_cols, sr_coo.num_cols);
+  ASSERT_TRUE(sr_coo.row_sorted);
+  auto flags = COOIsSorted(sr_coo);
+  ASSERT_TRUE(flags.first);
+  flags = COOIsSorted(coo);  // original coo should stay the same
+  ASSERT_FALSE(flags.first);
+  ASSERT_FALSE(flags.second);
+
  auto src_coo = COOSort(coo, true);
  ASSERT_EQ(coo.num_rows, src_coo.num_rows);
  ASSERT_EQ(coo.num_cols, src_coo.num_cols);
+  ASSERT_TRUE(src_coo.row_sorted);
+  ASSERT_TRUE(src_coo.col_sorted);
+  flags = COOIsSorted(src_coo);
+  ASSERT_TRUE(flags.first);
+  ASSERT_TRUE(flags.second);
+
+  // sort inplace
+  COOSort_(&coo);
+  ASSERT_TRUE(coo.row_sorted);
+  flags = COOIsSorted(coo);
+  ASSERT_TRUE(flags.first);
+  COOSort_(&coo, true);
+  ASSERT_TRUE(coo.row_sorted);
+  ASSERT_TRUE(coo.col_sorted);
+  flags = COOIsSorted(coo);
+  ASSERT_TRUE(flags.first);
+  ASSERT_TRUE(flags.second);

  // COO3
  // [[0, 1, 2, 0, 0],
@@ -489,7 +560,7 @@ void _TestCOOSort(DLContext ctx) {
  ASSERT_TRUE(ArrayEQ<IDX>(src_coo.data, sort_col_data));
 }

-TEST(SpmatTest, TestCOOSort) {
+TEST(SpmatTest, COOSort) {
  _TestCOOSort<int32_t>(CPU);
  _TestCOOSort<int64_t>(CPU);
 #ifdef DGL_USE_CUDA
--- a/third_party/cub
+++ b/third_party/cub