[CUDA][Kernel] More CUDA kernels; Standardize the behavior for sorted COO/CSR (#1704)

* add cub; array cumsum

* CSRSliceRows

* fix warning

* operator << for ndarray; CSRSliceRows

* add CSRIsSorted

* add csr_sort

* inplace coosort and outplace csrsort

* WIP: coo is sorted

* mv cuda_utils

* add AllTrue utility

* csr sort

* coo sort

* coo2csr for sorted coo arrays

* CSRToCOO from sorted

* pass tests for the new kernel changes

* cannot use inplace sort

* lint

* try fix msvc error

* Fix g.copy_to and g.asnumbits; ToBlock no longer uses CSC

* stash

* revert some hack

* revert some changes

* address comments

* fix

* fix to_block unittest

* add todo note
This commit is contained in:
Minjie Wang
2020-06-28 18:37:28 +08:00
committed by GitHub
parent da8632cafe
commit 870da747ea
59 changed files with 1367 additions and 429 deletions

4
.gitmodules vendored
View File

@@ -13,6 +13,10 @@
[submodule "third_party/METIS"]
path = third_party/METIS
url = https://github.com/KarypisLab/METIS.git
[submodule "third_party/cub"]
path = third_party/cub
url = https://github.com/NVlabs/cub.git
branch = 1.8.0
[submodule "third_party/phmap"]
path = third_party/phmap
url = https://github.com/greg7mdp/parallel-hashmap.git

View File

@@ -44,6 +44,8 @@ include_directories("third_party/METIS/include/")
include_directories("third_party/dmlc-core/include")
include_directories("third_party/minigun/minigun")
include_directories("third_party/minigun/third_party/moderngpu/src")
include_directories("third_party/cub/")
include_directories("third_party/phmap/")
# initial variables
set(DGL_LINKER_LIBS "")

View File

@@ -13,6 +13,7 @@
#include <utility>
#include <vector>
#include <tuple>
#include <string>
#include "./types.h"
namespace dgl {
@@ -131,9 +132,18 @@ IdArray HStack(IdArray arr1, IdArray arr2);
* \tparam ValueType The type of return value.
*/
template<typename ValueType>
ValueType IndexSelect(NDArray array, uint64_t index);
ValueType IndexSelect(NDArray array, int64_t index);
/*!
* \brief Return the data under the index. In numpy notation, A[I]
*/
NDArray IndexSelect(NDArray array, IdArray index);
/*!
* \brief Return the data from `start` (inclusive) to `end` (exclusive).
*/
NDArray IndexSelect(NDArray array, int64_t start, int64_t end);
/*!
* \brief Permute the elements of an array according to given indices.
*
@@ -238,6 +248,27 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, ValueType pad_value);
*/
std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths);
/*!
* \brief Return the cumulative summation (or inclusive sum) of the input array.
*
* The first element out[0] is equal to the first element of the input array
* array[0]. The remaining elements are defined recursively, out[i] = out[i-1] + array[i].
* Hence, the result array length is the same as the input array length.
*
* If prepend_zero is true, then the first element is zero and the result array
* length is the input array length plus one. This is useful for creating
* an indptr array over a count array.
*
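* \param array The 1D input array.
* \param prepend_zero If true, prepend a zero so that the result has one more
*        element than the input.
* \return Array after cumsum.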
*/
IdArray CumSum(IdArray array, bool prepend_zero = false);
/*!
* \brief Return a string that prints out some debug information.
*/
std::string ToDebugString(NDArray array);
// inline implementations
template <typename T>
IdArray VecToIdArray(const std::vector<T>& vec,

View File

@@ -116,6 +116,16 @@ struct COOMatrix {
CHECK_NO_OVERFLOW(row->dtype, num_rows);
CHECK_NO_OVERFLOW(row->dtype, num_cols);
}
/*!
 * \brief Return a copy of this matrix on the given device context.
 *
 * If the row array already lives on \a ctx, this matrix is returned as is.
 * Otherwise the row and col arrays are copied to \a ctx; the data array is
 * copied too unless it is a null array, in which case it is passed through.
 * The row_sorted/col_sorted flags are carried over unchanged.
 *
 * \param ctx The target device context.
 * \return The matrix on the target context.
 */
inline COOMatrix CopyTo(const DLContext& ctx) const {
  // Already on the requested device: no transfer needed.
  if (ctx == row->ctx) {
    return *this;
  }
  // A null data array is forwarded without copying.
  auto data_on_ctx = aten::IsNullArray(data) ? data : data.CopyTo(ctx);
  return COOMatrix(num_rows, num_cols,
                   row.CopyTo(ctx), col.CopyTo(ctx),
                   data_on_ctx,
                   row_sorted, col_sorted);
}
};
///////////////////////// COO routines //////////////////////////
@@ -141,6 +151,17 @@ inline bool COOHasData(COOMatrix csr) {
return !IsNullArray(csr.data);
}
/*!
* \brief Check whether the COO is sorted.
*
* It returns two flags: the first tells whether the row indices are sorted;
* the second tells whether the column indices within each row are sorted.
* The second flag is only meaningful (and can only be true) if the first is true.
*
* Complexity: O(NNZ)
*/
std::pair<bool, bool> COOIsSorted(COOMatrix coo);
/*! \brief Get data. The return type is an ndarray due to possible duplicate entries. */
runtime::NDArray COOGetData(COOMatrix , int64_t row, int64_t col);
@@ -161,6 +182,20 @@ COOMatrix COOTranspose(COOMatrix coo);
* the result CSR matrix stores a shuffle index for how the entries
* will be reordered in CSR. The i^th entry in the result CSR corresponds
* to the CSR.data[i] th entry in the input COO.
*
* Conversion complexity: O(nnz)
*
* - The function first checks whether the input COO matrix is sorted
* using a linear scan.
* - If the COO matrix is row sorted, the conversion can be done very
* efficiently in a sequential scan. The result indices and data arrays
* are directly equal to the column and data arrays from the input.
* - If the COO matrix is further column sorted, the result CSR is
* also column sorted.
* - Otherwise, the conversion is more costly but still is O(nnz).
*
* \param coo Input COO matrix.
* \return CSR matrix.
*/
CSRMatrix COOToCSR(COOMatrix coo);
@@ -195,6 +230,21 @@ bool COOHasDuplicate(COOMatrix coo);
*/
std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
/*!
* \brief Sort the indices of a COO matrix in-place.
*
* The function sorts row indices in ascending order. If sort_column is true,
* col indices are sorted in ascending order too. The data array of the returned COOMatrix
* stores the shuffled index which could be used to fetch edge data.
*
* Complexity: O(N*log(N)) time and O(1) space, where N is the number of nonzeros.
* TODO(minjie): The time complexity could be improved to O(N) by using a O(N) space.
*
* \param mat The coo matrix to sort.
* \param sort_column True if column index should be sorted too.
*/
void COOSort_(COOMatrix* mat, bool sort_column = false);
/*!
* \brief Sort the indices of a COO matrix.
*
@@ -202,11 +252,23 @@ std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
* col indices are sorted in ascending order too. The data array of the returned COOMatrix
* stores the shuffled index which could be used to fetch edge data.
*
* Complexity: O(N*log(N)) time and O(1) space, where N is the number of nonzeros.
* TODO(minjie): The time complexity could be improved to O(N) by using a O(N) space.
*
* \param mat The input coo matrix
* \param sort_column True if column index should be sorted too.
* \return COO matrix with index sorted.
*/
COOMatrix COOSort(COOMatrix mat, bool sort_column = false);
// Out-of-place sort: returns the input untouched when it is already flagged
// as sorted to the requested degree; otherwise clones the index arrays (and
// the data array if present) and sorts the clone with COOSort_.
inline COOMatrix COOSort(COOMatrix mat, bool sort_column = false) {
  // col_sorted alone is sufficient; row_sorted is sufficient only when the
  // caller did not ask for column order.
  const bool already_sorted =
    mat.col_sorted || (!sort_column && mat.row_sorted);
  if (already_sorted) {
    return mat;
  }
  // A missing data array is forwarded as is; COOSort_ receives the clone.
  auto data_copy = COOHasData(mat) ? mat.data.Clone() : mat.data;
  COOMatrix sorted_mat(mat.num_rows, mat.num_cols,
                       mat.row.Clone(), mat.col.Clone(),
                       data_copy,
                       mat.row_sorted, mat.col_sorted);
  COOSort_(&sorted_mat, sort_column);
  return sorted_mat;
}
/*!
* \brief Remove entries from COO matrix by entry indices (data indices)

View File

@@ -106,6 +106,17 @@ struct CSRMatrix {
}
CHECK_NO_OVERFLOW(indptr->dtype, num_rows);
CHECK_NO_OVERFLOW(indptr->dtype, num_cols);
CHECK_EQ(indptr->shape[0], num_rows + 1);
}
/*!
 * \brief Return a copy of this matrix on the given device context.
 *
 * If the indptr array already lives on \a ctx, this matrix is returned as is.
 * Otherwise the indptr and indices arrays are copied to \a ctx; the data array
 * is copied too unless it is a null array, in which case it is passed through.
 * The sorted flag is carried over unchanged.
 *
 * \param ctx The target device context.
 * \return The matrix on the target context.
 */
inline CSRMatrix CopyTo(const DLContext& ctx) const {
  // Already on the requested device: no transfer needed.
  if (ctx == indptr->ctx) {
    return *this;
  }
  // A null data array is forwarded without copying.
  auto data_on_ctx = aten::IsNullArray(data) ? data : data.CopyTo(ctx);
  return CSRMatrix(num_rows, num_cols,
                   indptr.CopyTo(ctx), indices.CopyTo(ctx),
                   data_on_ctx,
                   sorted);
}
};
@@ -134,6 +145,9 @@ inline bool CSRHasData(CSRMatrix csr) {
return !IsNullArray(csr.data);
}
/*! \brief Whether the column indices of each row are sorted. */
bool CSRIsSorted(CSRMatrix csr);
/* \brief Get data. The return type is an ndarray due to possible duplicate entries. */
runtime::NDArray CSRGetData(CSRMatrix , int64_t row, int64_t col);
/*!
@@ -155,6 +169,15 @@ CSRMatrix CSRTranspose(CSRMatrix csr);
/*!
* \brief Convert CSR matrix to COO matrix.
*
* Complexity: O(nnz)
*
* - If data_as_order is false, the column and data arrays of the
* result COO are equal to the indices and data arrays of the
* input CSR. The result COO is also row sorted.
* - If the input CSR is further sorted, the result COO is also
* column sorted.
*
* \param csr Input csr matrix
* \param data_as_order If true, the data array in the input csr matrix contains the order
* by which the resulting COO tuples are stored. In this case, the
@@ -166,9 +189,8 @@ COOMatrix CSRToCOO(CSRMatrix csr, bool data_as_order);
/*!
* \brief Slice rows of the given matrix and return.
* \param csr CSR matrix
* \param start Start row id (inclusive)
* \param end End row id (exclusive)
*
* The sliced row IDs are relabeled to starting from zero.
*
* Examples:
* num_rows = 4
@@ -182,6 +204,11 @@ COOMatrix CSRToCOO(CSRMatrix csr, bool data_as_order);
* num_cols = 4
* indptr = [0, 1, 1]
* indices = [2]
*
* \param csr CSR matrix
* \param start Start row id (inclusive)
* \param end End row id (exclusive)
* \return sliced rows stored in a CSR matrix
*/
CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end);
CSRMatrix CSRSliceRows(CSRMatrix csr, runtime::NDArray rows);
@@ -192,6 +219,8 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, runtime::NDArray rows);
* In numpy notation, given matrix M, row index array I, col index array J
* This function returns the submatrix M[I, J].
*
* The sliced row and column IDs are relabeled to starting from zero.
*
* \param csr The input csr matrix
* \param rows The row index to select
* \param cols The col index to select
@@ -203,7 +232,10 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
bool CSRHasDuplicate(CSRMatrix csr);
/*!
* \brief Sort the column index at each row in the ascending order.
* \brief Sort the column index at each row in ascending order in-place.
*
* Only the indices and data arrays (if available) will be mutated. The indptr array
* stays the same.
*
* Examples:
* num_rows = 4
@@ -218,6 +250,22 @@ bool CSRHasDuplicate(CSRMatrix csr);
*/
void CSRSort_(CSRMatrix* csr);
/*!
 * \brief Sort the column index at each row in ascending order.
 *
 * Out-of-place variant of CSRSort_. If the input is already flagged as sorted
 * it is returned as is. Otherwise the indices array (and the data array, if
 * present) are cloned, the clone is sorted with CSRSort_, and the result is
 * returned. The indptr array is reused by the result without cloning.
 *
 * \param csr The input csr matrix.
 * \return A CSR matrix with sorted column indices and data arrays.
 */
inline CSRMatrix CSRSort(CSRMatrix csr) {
  if (!csr.sorted) {
    // A missing data array is forwarded as is; CSRSort_ receives the clone.
    auto data_copy = CSRHasData(csr) ? csr.data.Clone() : csr.data;
    CSRMatrix sorted_csr(csr.num_rows, csr.num_cols,
                         csr.indptr, csr.indices.Clone(),
                         data_copy,
                         csr.sorted);
    CSRSort_(&sorted_csr);
    return sorted_csr;
  }
  return csr;
}
/*!
* \brief Reorder the rows and columns according to the new row and column order.
* \param csr The input csr matrix.

View File

@@ -252,4 +252,8 @@
CHECK_LE((val), 0x7FFFFFFFL) << "int32 overflow for argument " << (#val) << "."; \
} while (0);
// Fail the CHECK unless VAR is one-dimensional and satisfies IS_INT32 or
// IS_INT64. The failure message names the offending argument via #VAR.
// NOTE: the expansion already ends with a semicolon.
#define CHECK_IS_ID_ARRAY(VAR) \
CHECK((VAR)->ndim == 1 && (IS_INT32(VAR) || IS_INT64(VAR))) \
<< "Expected argument " << (#VAR) << " to be an 1D integer array.";
#endif // DGL_ATEN_MACRO_H_

View File

@@ -10,6 +10,7 @@
#include <vector>
#include <utility>
#include <algorithm>
#include <memory>
#include "./runtime/object.h"
#include "array.h"

View File

@@ -12,6 +12,7 @@
#include <utility>
#include <tuple>
#include <algorithm>
#include <memory>
#include "runtime/ndarray.h"
#include "graph_interface.h"
#include "lazy.h"

View File

@@ -8,6 +8,7 @@
#include <vector>
#include <string>
#include <memory>
#include "./runtime/object.h"
#include "graph_interface.h"

View File

@@ -11,6 +11,7 @@
#include <string>
#include <utility>
#include <vector>
#include <memory>
#include "c_runtime_api.h"
#include "dlpack/dlpack.h"
@@ -157,6 +158,10 @@ class NDArray {
* \return The array under another context.
*/
inline NDArray CopyTo(const DLContext& ctx) const;
/*!
* \brief Return a new array with a copy of the content.
*/
inline NDArray Clone() const;
/*!
* \brief Load NDArray from stream
* \param stream The input data stream
@@ -410,6 +415,12 @@ inline NDArray NDArray::CopyTo(const DLContext& ctx) const {
return ret;
}
// Return a new array with a copy of the content, obtained by copying this
// array to its own device context. The array must not be null.
inline NDArray NDArray::Clone() const {
  CHECK(data_ != nullptr);
  const DLContext& own_ctx = operator->()->ctx;
  return this->CopyTo(own_ctx);
}
inline int NDArray::use_count() const {
if (data_ == nullptr) return 0;
return data_->ref_counter_.load(std::memory_order_relaxed);
@@ -627,6 +638,8 @@ dgl::runtime::NDArray operator <= (int64_t lhs, const dgl::runtime::NDArray& a2)
dgl::runtime::NDArray operator == (int64_t lhs, const dgl::runtime::NDArray& a2);
dgl::runtime::NDArray operator != (int64_t lhs, const dgl::runtime::NDArray& a2);
std::ostream& operator << (std::ostream& os, dgl::runtime::NDArray array);
///////////////// Operator overloading for DLDataType /////////////////
/*! \brief Check whether two data types are the same.*/

View File

@@ -13,6 +13,7 @@
#include <string>
#include <limits>
#include <memory>
#include <utility>
#include <type_traits>
#include "c_runtime_api.h"
#include "module.h"

View File

@@ -10,6 +10,7 @@
#include <dgl/graph_serializer.h>
#include <dmlc/io.h>
#include <dmlc/serializer.h>
#include <memory>
namespace dmlc {
namespace serializer {

View File

@@ -17,31 +17,36 @@
#include <tuple>
#include <utility>
#include <vector>
#include <memory>
#include "dmlc/logging.h"
namespace dgl {
/* StreamWithBuffer is backed up by dmlc::MemoryFixedSizeStream or
dmlc::MemoryStringStream. This class supports serializing and deserializing
NDArrays stored in shared memory. If the stream is created for
sending/recving data through network, the data pointer of the NDArray will be
transmitted directly without and copy. Otherwise, the stream is for
sending/recving data to another process on the same machine, so if an NDArray
is stored in shared memory, it will just record the shared memory name
instead of the actual data buffer.
For example:
std::string blob;
// Send to local
StreamWithBuffer strm(&blob, false);
// Send to remote
StreamWithBuffer strm(&blob, true);
// Receive from local
StreamWithBuffer strm(&blob, false);
// Receive from remote
std::vector<void*> ptr_list
StreamWithBuffer strm(&blob, ptr_list);
*/
/*!
*
* StreamWithBuffer is backed up by dmlc::MemoryFixedSizeStream or
* dmlc::MemoryStringStream. This class supports serializing and deserializing
* NDArrays stored in shared memory. If the stream is created for
* sending/recving data through network, the data pointer of the NDArray will be
* transmitted directly without any copy. Otherwise, the stream is for
* sending/recving data to another process on the same machine, so if an NDArray
* is stored in shared memory, it will just record the shared memory name
* instead of the actual data buffer.
*
* For example:
*
* std::string blob;
* // Send to local
* StreamWithBuffer strm(&blob, false);
* // Send to remote
* StreamWithBuffer strm(&blob, true);
* // Receive from local
* StreamWithBuffer strm(&blob, false);
* // Receive from remote
* std::vector<void*> ptr_list
* StreamWithBuffer strm(&blob, ptr_list);
*/
class StreamWithBuffer : public dmlc::SeekStream {
public:
// Buffer type. Storing NDArray to maintain the reference counting to ensure

View File

@@ -8,6 +8,8 @@
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/container.h>
#include <dgl/runtime/shared_mem.h>
#include <dgl/runtime/device_api.h>
#include <sstream>
#include "../c_api_common.h"
#include "./array_op.h"
#include "./arith.h"
@@ -100,8 +102,10 @@ NDArray IndexSelect(NDArray array, IdArray index) {
}
template<typename ValueType>
ValueType IndexSelect(NDArray array, uint64_t index) {
ValueType IndexSelect(NDArray array, int64_t index) {
CHECK_EQ(array->ndim, 1) << "Only support select values from 1D array.";
CHECK(index >= 0 && index < array.NumElements())
<< "Index " << index << " is out of bound.";
ValueType ret = 0;
ATEN_XPU_SWITCH_CUDA(array->ctx.device_type, XPU, "IndexSelect", {
ATEN_DTYPE_SWITCH(array->dtype, DType, "values", {
@@ -110,12 +114,30 @@ ValueType IndexSelect(NDArray array, uint64_t index) {
});
return ret;
}
template int32_t IndexSelect<int32_t>(NDArray array, uint64_t index);
template int64_t IndexSelect<int64_t>(NDArray array, uint64_t index);
template uint32_t IndexSelect<uint32_t>(NDArray array, uint64_t index);
template uint64_t IndexSelect<uint64_t>(NDArray array, uint64_t index);
template float IndexSelect<float>(NDArray array, uint64_t index);
template double IndexSelect<double>(NDArray array, uint64_t index);
template int32_t IndexSelect<int32_t>(NDArray array, int64_t index);
template int64_t IndexSelect<int64_t>(NDArray array, int64_t index);
template uint32_t IndexSelect<uint32_t>(NDArray array, int64_t index);
template uint64_t IndexSelect<uint64_t>(NDArray array, int64_t index);
template float IndexSelect<float>(NDArray array, int64_t index);
template double IndexSelect<double>(NDArray array, int64_t index);
// Return the elements in [start, end) of a 1D array as a newly allocated array
// on the same context and with the same dtype.
//
// Requires 0 <= start <= end <= array.NumElements(). An empty range
// (start == end) is valid anywhere, including at the very end of the array and
// on an empty array, and yields an empty array.
NDArray IndexSelect(NDArray array, int64_t start, int64_t end) {
  CHECK_EQ(array->ndim, 1) << "Only support select values from 1D array.";
  // `start` may be equal to the array length so that an empty range at the
  // end of the array is not rejected.
  CHECK(start >= 0 && start <= array.NumElements())
    << "Index " << start << " is out of bound.";
  CHECK(end >= 0 && end <= array.NumElements())
    << "Index " << end << " is out of bound.";
  CHECK_LE(start, end);
  const int64_t len = end - start;
  NDArray ret = NDArray::Empty({len}, array->dtype, array->ctx);
  // Nothing to copy for an empty range.
  if (len == 0)
    return ret;
  auto device = runtime::DeviceAPI::Get(array->ctx);
  // The dtype switch is only needed to turn element offsets into byte offsets.
  ATEN_DTYPE_SWITCH(array->dtype, DType, "values", {
    device->CopyDataFromTo(array->data, start * sizeof(DType),
                           ret->data, 0, len * sizeof(DType),
                           array->ctx, ret->ctx, array->dtype, nullptr);
  });
  return ret;
}
NDArray Scatter(NDArray array, IdArray indices) {
NDArray ret;
@@ -181,6 +203,31 @@ std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths) {
return ret;
}
// Public entry point for cumulative sum over an id array.
// Forwards to impl::CumSum<XPU, IdType>, where XPU and IdType are chosen by the
// switch macros from array->ctx.device_type and array->dtype respectively.
// NOTE(review): presumably the macros abort for unsupported device/dtype
// combinations -- confirm against their definitions.
IdArray CumSum(IdArray array, bool prepend_zero) {
IdArray ret;
ATEN_XPU_SWITCH_CUDA(array->ctx.device_type, XPU, "CumSum", {
ATEN_ID_TYPE_SWITCH(array->dtype, IdType, {
ret = impl::CumSum<XPU, IdType>(array, prepend_zero);
});
});
return ret;
}
// Build a short human-readable description of an array: up to the first ten
// elements (each followed by ", "), an ellipsis if there are more, and the
// dtype and context of the original array.
std::string ToDebugString(NDArray array) {
  // Elements are read from a CPU copy of the array.
  NDArray host = array.CopyTo(DLContext{kDLCPU, 0});
  const int64_t total = host.NumElements();
  const int64_t shown = std::min<int64_t>(total, 10L);
  std::ostringstream out;
  out << "array([";
  ATEN_DTYPE_SWITCH(host->dtype, DType, "array", {
    for (int64_t k = 0; k < shown; ++k) {
      out << host.Ptr<DType>()[k] << ", ";
    }
  });
  if (total > 10) {
    out << "...";
  }
  // dtype/ctx are reported for the input array, not for the CPU copy.
  out << "], dtype=" << array->dtype << ", ctx=" << array->ctx << ")";
  return out.str();
}
///////////////////////// CSR routines //////////////////////////
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
@@ -250,6 +297,16 @@ NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
return ret;
}
// Whether the column indices of each row are sorted.
// A matrix with at most one stored index is reported as sorted without
// dispatching; otherwise the answer comes from impl::CSRIsSorted<XPU, IdType>,
// with XPU and IdType selected by ATEN_CSR_SWITCH_CUDA from `csr`.
bool CSRIsSorted(CSRMatrix csr) {
if (csr.indices->shape[0] <= 1)
return true;
bool ret = false;
ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRIsSorted", {
ret = impl::CSRIsSorted<XPU, IdType>(csr);
});
return ret;
}
NDArray CSRGetData(CSRMatrix csr, int64_t row, int64_t col) {
CHECK(row >= 0 && row < csr.num_rows) << "Invalid row index: " << row;
CHECK(col >= 0 && col < csr.num_cols) << "Invalid col index: " << col;
@@ -318,7 +375,7 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
CHECK(end >= 0 && end <= csr.num_rows) << "Invalid end index: " << end;
CHECK_GE(end, start);
CSRMatrix ret;
ATEN_CSR_SWITCH(csr, XPU, IdType, "CSRSliceRows", {
ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRSliceRows", {
ret = impl::CSRSliceRows<XPU, IdType>(csr, start, end);
});
return ret;
@@ -328,7 +385,7 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
CHECK_SAME_DTYPE(csr.indices, rows);
CHECK_SAME_CONTEXT(csr.indices, rows);
CSRMatrix ret;
ATEN_CSR_SWITCH(csr, XPU, IdType, "CSRSliceRows", {
ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRSliceRows", {
ret = impl::CSRSliceRows<XPU, IdType>(csr, rows);
});
return ret;
@@ -347,7 +404,9 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, NDArray rows, NDArray cols) {
}
void CSRSort_(CSRMatrix* csr) {
ATEN_CSR_SWITCH(*csr, XPU, IdType, "CSRSort_", {
if (csr->sorted)
return;
ATEN_CSR_SWITCH_CUDA(*csr, XPU, IdType, "CSRSort_", {
impl::CSRSort_<XPU, IdType>(csr);
});
}
@@ -509,13 +568,23 @@ COOMatrix COOSliceMatrix(COOMatrix coo, NDArray rows, NDArray cols) {
return ret;
}
COOMatrix COOSort(COOMatrix mat, bool sort_column) {
COOMatrix ret;
ATEN_XPU_SWITCH_CUDA(mat.row->ctx.device_type, XPU, "COOSort", {
ATEN_ID_TYPE_SWITCH(mat.row->dtype, IdType, {
ret = impl::COOSort<XPU, IdType>(mat, sort_column);
void COOSort_(COOMatrix* mat, bool sort_column) {
if ((mat->row_sorted && !sort_column) || mat->col_sorted)
return;
ATEN_XPU_SWITCH_CUDA(mat->row->ctx.device_type, XPU, "COOSort_", {
ATEN_ID_TYPE_SWITCH(mat->row->dtype, IdType, {
impl::COOSort_<XPU, IdType>(mat, sort_column);
});
});
}
// Check whether a COO matrix is sorted; returns {row_sorted, col_sorted}.
// A matrix with at most one entry is reported as {true, true} without
// dispatching; otherwise the answer comes from impl::COOIsSorted<XPU, IdType>,
// with XPU and IdType selected by ATEN_COO_SWITCH_CUDA from `coo`.
std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
if (coo.row->shape[0] <= 1)
return {true, true};
std::pair<bool, bool> ret;
ATEN_COO_SWITCH_CUDA(coo, XPU, IdType, "COOIsSorted", {
ret = impl::COOIsSorted<XPU, IdType>(coo);
});
return ret;
}
@@ -709,3 +778,7 @@ DGL_REGISTER_GLOBAL("ndarray._CAPI_DGLExistSharedMemArray")
} // namespace aten
} // namespace dgl
// Stream the debug representation produced by dgl::aten::ToDebugString.
std::ostream& operator << (std::ostream& os, dgl::runtime::NDArray array) {
  os << dgl::aten::ToDebugString(array);
  return os;
}

View File

@@ -3,8 +3,8 @@
* \file array/array_arith.cc
* \brief DGL array arithmetic operations
*/
#include <dgl/array.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/ndarray.h>
#include <dgl/runtime/container.h>
#include "../c_api_common.h"
#include "./array_op.h"

View File

@@ -44,7 +44,7 @@ template <DLDeviceType XPU, typename DType, typename IdType>
NDArray IndexSelect(NDArray array, IdArray index);
template <DLDeviceType XPU, typename DType>
DType IndexSelect(NDArray array, uint64_t index);
DType IndexSelect(NDArray array, int64_t index);
template <DLDeviceType XPU, typename DType, typename IdType>
NDArray Scatter(NDArray array, IdArray indices);
@@ -61,6 +61,9 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, DType pad_value);
template <DLDeviceType XPU, typename DType, typename IdType>
std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths);
template <DLDeviceType XPU, typename IdType>
IdArray CumSum(IdArray array, bool prepend_zero);
// sparse arrays
template <DLDeviceType XPU, typename IdType>
@@ -84,6 +87,9 @@ runtime::NDArray CSRGetRowColumnIndices(CSRMatrix csr, int64_t row);
template <DLDeviceType XPU, typename IdType>
runtime::NDArray CSRGetRowData(CSRMatrix csr, int64_t row);
template <DLDeviceType XPU, typename IdType>
bool CSRIsSorted(CSRMatrix csr);
template <DLDeviceType XPU, typename IdType>
runtime::NDArray CSRGetData(CSRMatrix csr, int64_t row, int64_t col);
@@ -187,7 +193,10 @@ template <DLDeviceType XPU, typename IdType>
std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
template <DLDeviceType XPU, typename IdType>
COOMatrix COOSort(COOMatrix mat, bool sort_column);
void COOSort_(COOMatrix* mat, bool sort_column);
template <DLDeviceType XPU, typename IdType>
std::pair<bool, bool> COOIsSorted(COOMatrix coo);
template <DLDeviceType XPU, typename IdType>
COOMatrix COORemove(COOMatrix coo, IdArray entries);

View File

@@ -0,0 +1,42 @@
/*!
* Copyright (c) 2020 by Contributors
* \file array/cpu/array_cumsum.cc
* \brief Array cumsum CPU implementation
*/
#include <dgl/array.h>
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
/*!
 * \brief CPU implementation of the cumulative (inclusive) sum of an id array.
 *
 * Without prepend_zero: out[0] = array[0], out[i] = out[i-1] + array[i]; the
 * result has the same length as the input (an empty input is returned as is).
 *
 * With prepend_zero: out[0] = 0, out[i+1] = out[i] + array[i]; the result has
 * one more element than the input. In particular an empty input yields the
 * one-element array [0], so the result is always usable as an indptr array.
 *
 * \param array The 1D input array.
 * \param prepend_zero Whether to prepend a zero to the result.
 * \return Array after cumsum.
 */
template <DLDeviceType XPU, typename IdType>
IdArray CumSum(IdArray array, bool prepend_zero) {
  const int64_t len = array.NumElements();
  // Only the non-prepending variant can return an empty result.
  if (len == 0 && !prepend_zero)
    return array;
  const int64_t out_len = prepend_zero ? len + 1 : len;
  IdArray ret = aten::NewIdArray(out_len, array->ctx, array->dtype.bits);
  const IdType* in_d = array.Ptr<IdType>();
  IdType* out_d = ret.Ptr<IdType>();
  if (prepend_zero) {
    out_d[0] = 0;
    // For len == 0 the loop does not run and in_d is never dereferenced.
    for (int64_t i = 0; i < len; ++i)
      out_d[i + 1] = out_d[i] + in_d[i];
  } else {
    out_d[0] = in_d[0];
    for (int64_t i = 1; i < len; ++i)
      out_d[i] = out_d[i - 1] + in_d[i];
  }
  return ret;
}
template IdArray CumSum<kDLCPU, int32_t>(IdArray, bool);
template IdArray CumSum<kDLCPU, int64_t>(IdArray, bool);
} // namespace impl
} // namespace aten
} // namespace dgl

View File

@@ -35,20 +35,16 @@ template NDArray IndexSelect<kDLCPU, double, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLCPU, double, int64_t>(NDArray, IdArray);
template <DLDeviceType XPU, typename DType>
DType IndexSelect(NDArray array, uint64_t index) {
DType IndexSelect(NDArray array, int64_t index) {
const DType* data = static_cast<DType*>(array->data);
return data[index];
}
template int32_t IndexSelect<kDLCPU, int32_t>(NDArray array, uint64_t index);
template int64_t IndexSelect<kDLCPU, int64_t>(NDArray array, uint64_t index);
template uint32_t IndexSelect<kDLCPU, uint32_t>(NDArray array, uint64_t index);
template uint64_t IndexSelect<kDLCPU, uint64_t>(NDArray array, uint64_t index);
template float IndexSelect<kDLCPU, float>(NDArray array, uint64_t index);
template double IndexSelect<kDLCPU, double>(NDArray array, uint64_t index);
template int32_t IndexSelect<kDLCPU, int32_t>(NDArray array, int64_t index);
template int64_t IndexSelect<kDLCPU, int64_t>(NDArray array, int64_t index);
template float IndexSelect<kDLCPU, float>(NDArray array, int64_t index);
template double IndexSelect<kDLCPU, double>(NDArray array, int64_t index);
}; // namespace impl
}; // namespace aten
}; // namespace dgl
} // namespace impl
} // namespace aten
} // namespace dgl

View File

@@ -76,8 +76,6 @@ template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, int64_t>(NDArray, in
template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, float>(NDArray, float);
template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, double>(NDArray, double);
}; // namespace impl
}; // namespace aten
}; // namespace dgl
} // namespace impl
} // namespace aten
} // namespace dgl

View File

@@ -6,12 +6,12 @@
#ifndef DGL_ARRAY_CPU_ARRAY_UTILS_H_
#define DGL_ARRAY_CPU_ARRAY_UTILS_H_
#include <dgl/array.h>
#include <dgl/aten/types.h>
#include <parallel_hashmap/phmap.h>
#include <vector>
#include <unordered_map>
#include <utility>
#include "../../c_api_common.h"
#include "../third_party/phmap/parallel_hashmap/phmap.h"
namespace dgl {
namespace aten {

View File

@@ -10,37 +10,181 @@
#include <numeric>
#include <algorithm>
#include <vector>
#include <iterator>
#include <tuple>
namespace {
// Proxy reference to one (row, col, data) entry of three parallel arrays.
// It stores a pointer to each of the three elements; assigning to it writes
// through those pointers, and converting it to a std::tuple reads the three
// current values. It is the type returned by dereferencing CooIterator.
template <typename IdType>
struct TupleRef {
TupleRef() = delete;
TupleRef(const TupleRef& other) = default;
TupleRef(TupleRef&& other) = default;
// Refer to the elements pointed to by r, c and d.
TupleRef(IdType *const r, IdType *const c, IdType *const d)
: row(r), col(c), data(d) {}
// Copy the referenced values of `other` into the referenced elements
// (the pointers themselves are not rebound).
TupleRef& operator=(const TupleRef& other) {
*row = *other.row;
*col = *other.col;
*data = *other.data;
return *this;
}
// Store a value tuple into the referenced elements.
TupleRef& operator=(const std::tuple<IdType, IdType, IdType>& val) {
*row = std::get<0>(val);
*col = std::get<1>(val);
*data = std::get<2>(val);
return *this;
}
// Read the referenced elements as a value tuple.
operator std::tuple<IdType, IdType, IdType>() const {
return std::make_tuple(*row, *col, *data);
}
// Exchange the referenced values with those referenced by `other`.
// Declared const because only the pointees are modified, not the pointers.
void Swap(const TupleRef& other) const {
std::swap(*row, *other.row);
std::swap(*col, *other.col);
std::swap(*data, *other.data);
}
// Pointers to the referenced row, column and data elements.
IdType *row, *col, *data;
};
using std::swap;
// swap overload for TupleRef proxies: exchanges the referenced values.
// Taking the arguments by const reference lets it accept the temporaries
// returned by CooIterator::operator*.
template <typename IdType>
void swap(const TupleRef<IdType>& r1, const TupleRef<IdType>& r2) {
r1.Swap(r2);
}
// Random-access iterator over three parallel arrays (row, col, data).
// The three pointers are always advanced together, so position i refers to
// (row[i], col[i], data[i]). The value type is a std::tuple of the three
// values and the reference type is the TupleRef proxy. Comparisons and
// distances are computed from the row pointer only.
template <typename IdType>
struct CooIterator : public std::iterator<std::random_access_iterator_tag,
std::tuple<IdType, IdType, IdType>,
std::ptrdiff_t,
std::tuple<IdType*, IdType*, IdType*>,
TupleRef<IdType>> {
// NOTE: the defaulted default constructor leaves the pointers uninitialized.
CooIterator() = default;
CooIterator(const CooIterator& other) = default;
CooIterator(CooIterator&& other) = default;
// Start iterating at the elements pointed to by r, c and d.
CooIterator(IdType *r, IdType *c, IdType *d): row(r), col(c), data(d) {}
CooIterator& operator=(const CooIterator& other) = default;
CooIterator& operator=(CooIterator&& other) = default;
~CooIterator() = default;
// Position comparisons: only the row pointer is compared.
bool operator==(const CooIterator& other) const {
return row == other.row;
}
bool operator!=(const CooIterator& other) const {
return row != other.row;
}
bool operator<(const CooIterator& other) const {
return row < other.row;
}
bool operator>(const CooIterator& other) const {
return row > other.row;
}
bool operator<=(const CooIterator& other) const {
return row <= other.row;
}
bool operator>=(const CooIterator& other) const {
return row >= other.row;
}
// Advance all three pointers by `movement` elements.
CooIterator& operator+=(const std::ptrdiff_t& movement) {
row += movement;
col += movement;
data += movement;
return *this;
}
// Move all three pointers back by `movement` elements.
CooIterator& operator-=(const std::ptrdiff_t& movement) {
row -= movement;
col -= movement;
data -= movement;
return *this;
}
// Pre-increment / pre-decrement.
CooIterator& operator++() {
return operator+=(1);
}
CooIterator& operator--() {
return operator-=(1);
}
// Post-increment / post-decrement: return the position before the step.
CooIterator operator++(int) {
CooIterator ret(*this);
operator++();
return ret;
}
CooIterator operator--(int) {
CooIterator ret(*this);
operator--();
return ret;
}
// Iterator offset by `movement` elements forward / backward.
CooIterator operator+(const std::ptrdiff_t& movement) const {
CooIterator ret(*this);
ret += movement;
return ret;
}
CooIterator operator-(const std::ptrdiff_t& movement) const {
CooIterator ret(*this);
ret -= movement;
return ret;
}
// Distance between two iterators, measured on the row pointer.
std::ptrdiff_t operator-(const CooIterator& other) const {
return row - other.row;
}
// Dereference to a proxy referring to the current (row, col, data) entry.
TupleRef<IdType> operator*() const {
return TupleRef<IdType>(row, col, data);
}
TupleRef<IdType> operator*() {
return TupleRef<IdType>(row, col, data);
}
// Current position in each of the three arrays.
IdType *row, *col, *data;
};
} // namespace
namespace dgl {
namespace aten {
namespace impl {
template <DLDeviceType XPU, typename IdType>
COOMatrix COOSort(COOMatrix coo, bool sort_column) {
const int64_t nnz = coo.row->shape[0];
const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
///////////////////////////// COOSort_ /////////////////////////////
// Argsort
IdArray new_row = IdArray::Empty({nnz}, coo.row->dtype, coo.row->ctx);
IdArray new_col = IdArray::Empty({nnz}, coo.col->dtype, coo.col->ctx);
IdArray new_idx = IdArray::Empty({nnz}, coo.row->dtype, coo.row->ctx);
IdType* new_row_data = static_cast<IdType*>(new_row->data);
IdType* new_col_data = static_cast<IdType*>(new_col->data);
IdType* new_idx_data = static_cast<IdType*>(new_idx->data);
std::iota(new_idx_data, new_idx_data + nnz, 0);
template <DLDeviceType XPU, typename IdType>
void COOSort_(COOMatrix* coo, bool sort_column) {
const int64_t nnz = coo->row->shape[0];
IdType* coo_row = coo->row.Ptr<IdType>();
IdType* coo_col = coo->col.Ptr<IdType>();
if (!COOHasData(*coo))
coo->data = aten::Range(0, nnz, coo->row->dtype.bits, coo->row->ctx);
IdType* coo_data = coo->data.Ptr<IdType>();
typedef std::tuple<IdType, IdType, IdType> Tuple;
// Arg sort
if (sort_column) {
#ifdef PARALLEL_ALGORITHMS
__gnu_parallel::sort(
#else
std::sort(
#endif
new_idx_data,
new_idx_data + nnz,
[coo_row_data, coo_col_data](const IdType a, const IdType b) {
return (coo_row_data[a] != coo_row_data[b]) ?
(coo_row_data[a] < coo_row_data[b]) :
(coo_col_data[a] < coo_col_data[b]);
CooIterator<IdType>(coo_row, coo_col, coo_data),
CooIterator<IdType>(coo_row, coo_col, coo_data) + nnz,
[](const Tuple& a, const Tuple& b) {
return (std::get<0>(a) != std::get<0>(b)) ?
(std::get<0>(a) < std::get<0>(b)) : (std::get<1>(a) < std::get<1>(b));
});
} else {
#ifdef PARALLEL_ALGORITHMS
@@ -48,39 +192,41 @@ COOMatrix COOSort(COOMatrix coo, bool sort_column) {
#else
std::sort(
#endif
new_idx_data,
new_idx_data + nnz,
[coo_row_data](const IdType a, const IdType b) {
return coo_row_data[a] < coo_row_data[b];
CooIterator<IdType>(coo_row, coo_col, coo_data),
CooIterator<IdType>(coo_row, coo_col, coo_data) + nnz,
[](const Tuple& a, const Tuple& b) {
return std::get<0>(a) < std::get<0>(b);
});
}
// Reorder according to shuffle
#pragma omp parallel for
for (IdType i = 0; i < nnz; ++i) {
new_row_data[i] = coo_row_data[new_idx_data[i]];
new_col_data[i] = coo_col_data[new_idx_data[i]];
}
if (COOHasData(coo)) {
const IdType* coo_data_data = static_cast<IdType*>(coo.data->data);
IdArray new_data = IdArray::Empty({nnz}, coo.row->dtype, coo.row->ctx);
IdType* new_data_data = static_cast<IdType*>(new_data->data);
#pragma omp parallel for
for (IdType i = 0; i < nnz; ++i) {
new_data_data[i] = coo_data_data[new_idx_data[i]];
}
new_idx = new_data;
}
return COOMatrix{
coo.num_rows, coo.num_cols, std::move(new_row), std::move(new_col),
std::move(new_idx), true, sort_column};
coo->row_sorted = true;
coo->col_sorted = sort_column;
}
template COOMatrix COOSort<kDLCPU, int32_t>(COOMatrix, bool);
template COOMatrix COOSort<kDLCPU, int64_t>(COOMatrix, bool);
template void COOSort_<kDLCPU, int32_t>(COOMatrix*, bool);
template void COOSort_<kDLCPU, int64_t>(COOMatrix*, bool);
///////////////////////////// COOIsSorted /////////////////////////////
/*!
 * \brief Scan a COO matrix once and report whether it is sorted.
 *
 * \param coo The COO matrix to inspect.
 * \return A pair (row_sorted, col_sorted). row_sorted is true when the row
 *         array is non-decreasing. col_sorted is true when, in addition,
 *         the column indices are non-decreasing among consecutive entries
 *         that share the same row. col_sorted is always false when
 *         row_sorted is false.
 */
template <DLDeviceType XPU, typename IdType>
std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
  const int64_t num_entries = coo.row->shape[0];
  const IdType* rows = coo.row.Ptr<IdType>();
  const IdType* cols = coo.col.Ptr<IdType>();
  bool cols_ordered = true;
  for (int64_t pos = 1; pos < num_entries; ++pos) {
    const IdType prev_row = rows[pos - 1];
    const IdType cur_row = rows[pos];
    // A descending row pair settles both answers; no need to scan further.
    if (prev_row > cur_row)
      return {false, false};
    // Columns only need to be ordered inside a run of equal rows.
    if (prev_row == cur_row && cols[pos - 1] > cols[pos])
      cols_ordered = false;
  }
  return {true, cols_ordered};
}
template std::pair<bool, bool> COOIsSorted<kDLCPU, int32_t>(COOMatrix coo);
template std::pair<bool, bool> COOIsSorted<kDLCPU, int64_t>(COOMatrix coo);
} // namespace impl
} // namespace aten

83
src/array/cpu/csr_sort.cc Normal file
View File

@@ -0,0 +1,83 @@
/*!
* Copyright (c) 2020 by Contributors
* \file array/cpu/csr_sort.cc
* \brief CSR sorting
*/
#include <dgl/array.h>
#include <numeric>
#include <algorithm>
#include <vector>
namespace dgl {
namespace aten {
namespace impl {
///////////////////////////// CSRIsSorted /////////////////////////////
/*!
 * \brief Check whether the column indices of every CSR row are sorted.
 *
 * \param csr The CSR matrix to inspect.
 * \return True if, within each row, the column indices are non-decreasing.
 *
 * \note The per-row results are combined with an OpenMP `&&` reduction, so
 *       every thread works on its own private flag and no thread reads a
 *       variable that another thread may be writing.
 */
template <DLDeviceType XPU, typename IdType>
bool CSRIsSorted(CSRMatrix csr) {
  const IdType* indptr = csr.indptr.Ptr<IdType>();
  const IdType* indices = csr.indices.Ptr<IdType>();
  const int64_t num_rows = csr.num_rows;
  bool ret = true;
#pragma omp parallel for reduction(&&: ret)
  for (int64_t row = 0; row < num_rows; ++row) {
    // `ret` is thread-private here: once this thread has found an unsorted
    // row, it skips the remaining rows assigned to it.
    if (!ret)
      continue;
    for (IdType i = indptr[row] + 1; i < indptr[row + 1]; ++i) {
      if (indices[i - 1] > indices[i]) {
        ret = false;
        break;
      }
    }
  }
  return ret;
}
template bool CSRIsSorted<kDLCPU, int64_t>(CSRMatrix csr);
template bool CSRIsSorted<kDLCPU, int32_t>(CSRMatrix csr);
///////////////////////////// CSRSort /////////////////////////////
/*!
 * \brief Sort the column indices of every CSR row in place (ascending).
 *
 * The data array is permuted together with the indices. If the matrix has
 * no data array, one is created first as Range(0, nnz). On return
 * csr->sorted is set to true.
 *
 * \param csr The CSR matrix to sort; its indices and data buffers are
 *            overwritten.
 */
template <DLDeviceType XPU, typename IdType>
void CSRSort_(CSRMatrix* csr) {
  using ColDataPair = std::pair<IdType, IdType>;
  const int64_t n_rows = csr->num_rows;
  const int64_t n_entries = csr->indices->shape[0];
  const IdType* row_offsets = static_cast<IdType*>(csr->indptr->data);
  IdType* all_cols = static_cast<IdType*>(csr->indices->data);
  if (!CSRHasData(*csr)) {
    csr->data = aten::Range(0, n_entries, csr->indptr->dtype.bits, csr->indptr->ctx);
  }
  IdType* all_data = static_cast<IdType*>(csr->data->data);
#pragma omp parallel
  {
    // One scratch buffer per thread, reused for every row it processes.
    std::vector<ColDataPair> scratch;
#pragma omp for
    for (int64_t r = 0; r < n_rows; r++) {
      const int64_t row_len = row_offsets[r + 1] - row_offsets[r];
      IdType* row_cols = all_cols + row_offsets[r];
      IdType* row_data = all_data + row_offsets[r];
      // Gather (column, data) pairs of this row.
      scratch.resize(row_len);
      for (int64_t k = 0; k < row_len; k++) {
        scratch[k] = ColDataPair(row_cols[k], row_data[k]);
      }
      // Order by column index only.
      std::sort(scratch.begin(), scratch.end(),
                [](const ColDataPair& lhs, const ColDataPair& rhs) {
                  return lhs.first < rhs.first;
                });
      // Scatter the sorted pairs back into the row.
      for (int64_t k = 0; k < row_len; k++) {
        row_cols[k] = scratch[k].first;
        row_data[k] = scratch[k].second;
      }
    }
  }
  csr->sorted = true;
}
template void CSRSort_<kDLCPU, int64_t>(CSRMatrix* csr);
template void CSRSort_<kDLCPU, int32_t>(CSRMatrix* csr);
} // namespace impl
} // namespace aten
} // namespace dgl

View File

@@ -377,7 +377,9 @@ COOMatrix CSRToCOO(CSRMatrix csr) {
ret_row_data + indptr_data[i + 1],
i);
}
return COOMatrix{csr.num_rows, csr.num_cols, ret_row, csr.indices, csr.data};
return COOMatrix(csr.num_rows, csr.num_cols,
ret_row, csr.indices, csr.data,
true, csr.sorted);
}
template COOMatrix CSRToCOO<kDLCPU, int32_t>(CSRMatrix csr);
@@ -543,49 +545,6 @@ template CSRMatrix CSRSliceMatrix<kDLCPU, int32_t>(
template CSRMatrix CSRSliceMatrix<kDLCPU, int64_t>(
CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols);
///////////////////////////// CSRSort /////////////////////////////
/*!
 * \brief Sort the column indices of every CSR row in place (ascending).
 *
 * The data array is reordered together with the indices. If the matrix has
 * no data array, one is created as Range(0, nnz) before sorting.
 * Sets csr->sorted to true on return.
 *
 * \param csr The CSR matrix to sort; its indices and data buffers are overwritten.
 */
template <DLDeviceType XPU, typename IdType>
void CSRSort_(CSRMatrix* csr) {
// (column index, data value) pair used as the sort element.
typedef std::pair<IdType, IdType> ShufflePair;
const int64_t num_rows = csr->num_rows;
const int64_t nnz = csr->indices->shape[0];
const IdType* indptr_data = static_cast<IdType*>(csr->indptr->data);
IdType* indices_data = static_cast<IdType*>(csr->indices->data);
// Materialize the data array so it can be permuted alongside the indices.
if (!CSRHasData(*csr)) {
csr->data = aten::Range(0, nnz, csr->indptr->dtype.bits, csr->indptr->ctx);
}
// NOTE(review): "eid" presumably stands for edge ID -- confirm against callers.
IdType* eid_data = static_cast<IdType*>(csr->data->data);
#pragma omp parallel
{
// Declared inside the parallel region so each thread has its own buffer,
// which is reused across the rows that thread handles.
std::vector<ShufflePair> reorder_vec;
#pragma omp for
for (int64_t row = 0; row < num_rows; row++) {
// Number of stored entries in this row.
const int64_t num_cols = indptr_data[row + 1] - indptr_data[row];
IdType *col = indices_data + indptr_data[row];
IdType *eid = eid_data + indptr_data[row];
reorder_vec.resize(num_cols);
// Gather the row's (column, data) pairs.
for (int64_t i = 0; i < num_cols; i++) {
reorder_vec[i].first = col[i];
reorder_vec[i].second = eid[i];
}
// Compare by column index only; std::sort gives no ordering guarantee
// among entries with equal column index.
std::sort(reorder_vec.begin(), reorder_vec.end(),
[](const ShufflePair &e1, const ShufflePair &e2) {
return e1.first < e2.first;
});
// Write the sorted pairs back over the original row.
for (int64_t i = 0; i < num_cols; i++) {
col[i] = reorder_vec[i].first;
eid[i] = reorder_vec[i].second;
}
}
}
csr->sorted = true;
}
template void CSRSort_<kDLCPU, int64_t>(CSRMatrix* csr);
template void CSRSort_<kDLCPU, int32_t>(CSRMatrix* csr);
///////////////////////////// CSRReorder /////////////////////////////
template <DLDeviceType XPU, typename IdType>

View File

@@ -3,10 +3,10 @@
* \file array/cpu/spmat_op_impl.cc
* \brief CPU implementation of COO sparse matrix operators
*/
#include <dgl/array.h>
#include <vector>
#include <unordered_set>
#include <unordered_map>
#include <tuple>
#include "array_utils.h"
namespace dgl {
@@ -266,29 +266,57 @@ CSRMatrix COOToCSR(COOMatrix coo) {
const IdType* row_data = static_cast<IdType*>(coo.row->data);
const IdType* col_data = static_cast<IdType*>(coo.col->data);
const IdType* data = COOHasData(coo)? static_cast<IdType*>(coo.data->data) : nullptr;
NDArray ret_indptr = NDArray::Empty({N + 1}, coo.row->dtype, coo.row->ctx);
NDArray ret_indices;
NDArray ret_data;
IdType* Bp = static_cast<IdType*>(ret_indptr->data);
std::fill(Bp, Bp + N, 0);
for (int64_t i = 0; i < NNZ; ++i) {
Bp[row_data[i]]++;
bool row_sorted = coo.row_sorted;
bool col_sorted = coo.col_sorted;
if (!row_sorted) {
// It is possible that the flag is simply not set (default value is false),
// so we still perform a linear scan to check the flag.
std::tie(row_sorted, col_sorted) = COOIsSorted(coo);
}
// cumsum
for (int64_t i = 0, cumsum = 0; i < N; ++i) {
const IdType temp = Bp[i];
Bp[i] = cumsum;
cumsum += temp;
}
Bp[N] = NNZ;
if (row_sorted) {
// compute indptr
IdType* Bp = static_cast<IdType*>(ret_indptr->data);
Bp[0] = 0;
int64_t j = 0;
for (int64_t i = 0; i < N; ++i) {
const int64_t k = j;
for (; j < NNZ && row_data[j] == i; ++j) {}
Bp[i + 1] = Bp[i] + j - k;
}
if (coo.row_sorted == true) {
// TODO(minjie): Many of our current implementation assumes that CSR must have
// a data array. This is a temporary workaround. Remove this after:
// - The old immutable graph implementation is deprecated.
// - The old binary reduce kernel is deprecated.
if (!COOHasData(coo))
coo.data = aten::Range(0, NNZ, coo.row->dtype.bits, coo.row->ctx);
// compute indices and data
ret_indices = coo.col;
ret_data = coo.data;
} else {
// compute indptr
IdType* Bp = static_cast<IdType*>(ret_indptr->data);
std::fill(Bp, Bp + N, 0);
for (int64_t i = 0; i < NNZ; ++i) {
Bp[row_data[i]]++;
}
// cumsum
for (int64_t i = 0, cumsum = 0; i < N; ++i) {
const IdType temp = Bp[i];
Bp[i] = cumsum;
cumsum += temp;
}
Bp[N] = NNZ;
// compute indices and data
ret_indices = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
ret_data = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
IdType* Bi = static_cast<IdType*>(ret_indices->data);
@@ -311,7 +339,7 @@ CSRMatrix COOToCSR(COOMatrix coo) {
return CSRMatrix(coo.num_rows, coo.num_cols,
ret_indptr, ret_indices, ret_data,
coo.col_sorted);
col_sorted);
}
template CSRMatrix COOToCSR<kDLCPU, int32_t>(COOMatrix coo);
@@ -439,7 +467,6 @@ COOMatrix COOReorder(COOMatrix coo, runtime::NDArray new_row_id_arr,
// Input COO
const IdType* in_rows = static_cast<IdType*>(coo.row->data);
const IdType* in_cols = static_cast<IdType*>(coo.col->data);
const IdType* in_data = COOHasData(coo) ? static_cast<IdType*>(coo.data->data) : nullptr;
int64_t num_rows = coo.num_rows;
int64_t num_cols = coo.num_cols;
int64_t nnz = coo.row->shape[0];

View File

@@ -0,0 +1,51 @@
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/array_cumsum.cu
* \brief Array cumsum GPU implementation
*/
#include <dgl/array.h>
#include <cub/cub.cuh>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
/*!
 * \brief Inclusive prefix sum of an id array on the GPU.
 *
 * \param array The input array.
 * \param prepend_zero If true, the result has one extra leading element
 *        equal to zero (length len + 1); otherwise it has length len.
 * \return The cumulative sum. For an empty input this is the input itself
 *         when prepend_zero is false, and a one-element array [0] when
 *         prepend_zero is true, so the "len + 1 elements" contract also
 *         holds for empty inputs.
 */
template <DLDeviceType XPU, typename IdType>
IdArray CumSum(IdArray array, bool prepend_zero) {
  const int64_t len = array.NumElements();
  if (len == 0) {
    // Nothing to scan; still honour the prepended zero if requested.
    return prepend_zero ? aten::Full(0, 1, array->dtype.bits, array->ctx) : array;
  }

  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  auto device = runtime::DeviceAPI::Get(array->ctx);
  const IdType* in_d = array.Ptr<IdType>();
  IdArray ret;
  IdType* out_d = nullptr;
  if (prepend_zero) {
    // Element 0 stays zero; the scan output is written starting at element 1.
    ret = aten::Full(0, len + 1, array->dtype.bits, array->ctx);
    out_d = ret.Ptr<IdType>() + 1;
  } else {
    ret = aten::NewIdArray(len, array->ctx, array->dtype.bits);
    out_d = ret.Ptr<IdType>();
  }
  // First call with a null workspace only queries the required size.
  size_t workspace_size = 0;
  CUDA_CALL(cub::DeviceScan::InclusiveSum(
      nullptr, workspace_size, in_d, out_d, len, thr_entry->stream));
  void* workspace = device->AllocWorkspace(array->ctx, workspace_size);

  // Compute cumsum
  CUDA_CALL(cub::DeviceScan::InclusiveSum(
      workspace, workspace_size, in_d, out_d, len, thr_entry->stream));

  device->FreeWorkspace(array->ctx, workspace);

  return ret;
}
template IdArray CumSum<kDLGPU, int32_t>(IdArray, bool);
template IdArray CumSum<kDLGPU, int64_t>(IdArray, bool);
} // namespace impl
} // namespace aten
} // namespace dgl

View File

@@ -5,7 +5,7 @@
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "../../cuda_utils.h"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
@@ -50,7 +50,7 @@ template NDArray IndexSelect<kDLGPU, double, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, double, int64_t>(NDArray, IdArray);
template <DLDeviceType XPU, typename DType>
DType IndexSelect(NDArray array, uint64_t index) {
DType IndexSelect(NDArray array, int64_t index) {
auto device = runtime::DeviceAPI::Get(array->ctx);
DType ret = 0;
device->CopyDataFromTo(
@@ -60,12 +60,12 @@ DType IndexSelect(NDArray array, uint64_t index) {
return ret;
}
template int32_t IndexSelect<kDLGPU, int32_t>(NDArray array, uint64_t index);
template int64_t IndexSelect<kDLGPU, int64_t>(NDArray array, uint64_t index);
template uint32_t IndexSelect<kDLGPU, uint32_t>(NDArray array, uint64_t index);
template uint64_t IndexSelect<kDLGPU, uint64_t>(NDArray array, uint64_t index);
template float IndexSelect<kDLGPU, float>(NDArray array, uint64_t index);
template double IndexSelect<kDLGPU, double>(NDArray array, uint64_t index);
template int32_t IndexSelect<kDLGPU, int32_t>(NDArray array, int64_t index);
template int64_t IndexSelect<kDLGPU, int64_t>(NDArray array, int64_t index);
template uint32_t IndexSelect<kDLGPU, uint32_t>(NDArray array, int64_t index);
template uint64_t IndexSelect<kDLGPU, uint64_t>(NDArray array, int64_t index);
template float IndexSelect<kDLGPU, float>(NDArray array, int64_t index);
template double IndexSelect<kDLGPU, double>(NDArray array, int64_t index);
} // namespace impl
} // namespace aten

View File

@@ -5,7 +5,7 @@
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "../../cuda_utils.h"
#include "./utils.h"
#include "../arith.h"
namespace dgl {

View File

@@ -17,63 +17,43 @@ template <DLDeviceType XPU, typename IdType>
CSRMatrix COOToCSR(COOMatrix coo) {
CHECK(sizeof(IdType) == 4) << "CUDA COOToCSR does not support int64.";
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(coo.row->ctx);
// allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
}
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));
NDArray row = coo.row, col = coo.col, data = coo.data;
int32_t* row_ptr = static_cast<int32_t*>(row->data);
int32_t* col_ptr = static_cast<int32_t*>(col->data);
int32_t* data_ptr = aten::IsNullArray(data) ? nullptr : static_cast<int32_t*>(data->data);
if (!coo.row_sorted) {
// make a copy of row and col because sort is done in-place
row = row.CopyTo(row->ctx);
col = col.CopyTo(col->ctx);
row_ptr = static_cast<int32_t*>(row->data);
col_ptr = static_cast<int32_t*>(col->data);
if (aten::IsNullArray(data)) {
// create the index array
data = aten::Range(0, row->shape[0], row->dtype.bits, row->ctx);
data_ptr = static_cast<int32_t*>(data->data);
}
// sort row
size_t workspace_size = 0;
CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt(
thr_entry->cusparse_handle,
coo.num_rows, coo.num_cols,
row->shape[0],
row_ptr,
col_ptr,
&workspace_size));
void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
CUSPARSE_CALL(cusparseXcoosortByRow(
thr_entry->cusparse_handle,
coo.num_rows, coo.num_cols,
row->shape[0],
row_ptr,
col_ptr,
data_ptr,
workspace));
device->FreeWorkspace(row->ctx, workspace);
bool row_sorted = coo.row_sorted;
bool col_sorted = coo.col_sorted;
if (!row_sorted) {
// It is possible that the flag is simply not set (default value is false),
// so we still perform a linear scan to check the flag.
std::tie(row_sorted, col_sorted) = COOIsSorted(coo);
}
if (!row_sorted) {
coo = COOSort(coo);
}
NDArray indptr = aten::NewIdArray(coo.num_rows + 1, row->ctx, row->dtype.bits);
const int64_t nnz = coo.row->shape[0];
// TODO(minjie): Many of our current implementation assumes that CSR must have
// a data array. This is a temporary workaround. Remove this after:
// - The old immutable graph implementation is deprecated.
// - The old binary reduce kernel is deprecated.
if (!COOHasData(coo))
coo.data = aten::Range(0, nnz, coo.row->dtype.bits, coo.row->ctx);
NDArray indptr = aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits);
int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
CUSPARSE_CALL(cusparseXcoo2csr(
thr_entry->cusparse_handle,
row_ptr,
row->shape[0],
coo.row.Ptr<int32_t>(),
nnz,
coo.num_rows,
indptr_ptr,
CUSPARSE_INDEX_BASE_ZERO));
return CSRMatrix(coo.num_rows, coo.num_cols,
indptr, col, data, false);
indptr, coo.col, coo.data, col_sorted);
}
template CSRMatrix COOToCSR<kDLGPU, int32_t>(COOMatrix coo);

View File

@@ -1,108 +0,0 @@
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/coo_sort.cc
* \brief Sort COO index
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
/*!
 * \brief Return a sorted copy of a COO matrix using cusparse.
 *
 * The row and column arrays (and the data array, if present) are copied, then
 * the copies are sorted in place by cusparse. If the input has no data array,
 * Range(0, nnz) is created and reordered instead.
 *
 * \param coo The input COO matrix; its arrays are copied before sorting.
 * \param sort_column If true, a second pass (coo2csr + csrsort) is run on the
 *        column array after the row sort.
 * \return A COOMatrix built from the sorted copies, with the row-sorted flag
 *         set to true and the column-sorted flag set to sort_column.
 *
 * \note Only 4-byte IdType is accepted; the CHECK below fails otherwise.
 */
template <DLDeviceType XPU, typename IdType>
COOMatrix COOSort(COOMatrix coo, bool sort_column) {
CHECK(sizeof(IdType) == 4) << "CUDA COOSort does not support int64.";
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(coo.row->ctx);
// allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
}
// Bind the handle to the stream stored in the thread-local entry.
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));
// Copy the inputs because the cusparse sort below works in place.
NDArray row = coo.row.CopyTo(coo.row->ctx);
NDArray col = coo.col.CopyTo(coo.col->ctx);
NDArray data;
if (aten::IsNullArray(coo.data)) {
// create the index array
data = aten::Range(0, row->shape[0], row->dtype.bits, row->ctx);
} else {
data = coo.data.CopyTo(coo.data->ctx);
}
int32_t* row_ptr = static_cast<int32_t*>(row->data);
int32_t* col_ptr = static_cast<int32_t*>(col->data);
int32_t* data_ptr = static_cast<int32_t*>(data->data);
// sort row
// Query the scratch size, allocate it, run the sort, then release it.
size_t workspace_size = 0;
CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt(
thr_entry->cusparse_handle,
coo.num_rows, coo.num_cols,
row->shape[0],
row_ptr,
col_ptr,
&workspace_size));
void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
// data_ptr is passed in the permutation argument, so it is reordered
// together with row_ptr/col_ptr.
CUSPARSE_CALL(cusparseXcoosortByRow(
thr_entry->cusparse_handle,
coo.num_rows, coo.num_cols,
row->shape[0],
row_ptr,
col_ptr,
data_ptr,
workspace));
device->FreeWorkspace(row->ctx, workspace);
if (sort_column) {
// First create a row indptr array and then call csrsort
// The indptr buffer is temporary and released at the end of this branch.
int32_t* indptr = static_cast<int32_t*>(
device->AllocWorkspace(row->ctx, (coo.num_rows + 1) * sizeof(IdType)));
CUSPARSE_CALL(cusparseXcoo2csr(
thr_entry->cusparse_handle,
row_ptr,
row->shape[0],
coo.num_rows,
indptr,
CUSPARSE_INDEX_BASE_ZERO));
// workspace_size is reused for the csrsort scratch-size query.
CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
thr_entry->cusparse_handle,
coo.num_rows,
coo.num_cols,
row->shape[0],
indptr,
col_ptr,
&workspace_size));
// This declaration shadows the outer `workspace`, which was already freed.
void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
// The descriptor is used as created; no type or index base is set on it here.
cusparseMatDescr_t descr;
CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
CUSPARSE_CALL(cusparseXcsrsort(
thr_entry->cusparse_handle,
coo.num_rows,
coo.num_cols,
row->shape[0],
descr,
indptr,
col_ptr,
data_ptr,
workspace));
CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
device->FreeWorkspace(row->ctx, workspace);
device->FreeWorkspace(row->ctx, indptr);
}
return COOMatrix(coo.num_rows, coo.num_cols,
row, col, data, true, sort_column);
}
template COOMatrix COOSort<kDLGPU, int32_t>(COOMatrix coo, bool sort_column);
template COOMatrix COOSort<kDLGPU, int64_t>(COOMatrix coo, bool sort_column);
} // namespace impl
} // namespace aten
} // namespace dgl

158
src/array/cuda/coo_sort.cu Normal file
View File

@@ -0,0 +1,158 @@
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/coo_sort.cu
* \brief Sort COO index
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
///////////////////////////// COOSort_ /////////////////////////////
/*!
 * \brief Sort a COO matrix in place on the GPU using cusparse.
 *
 * The row, column and data buffers of *coo are overwritten. If the matrix
 * has no data array, Range(0, nnz) is created first so that it can be
 * reordered along with the indices. On return coo->row_sorted is true and
 * coo->col_sorted equals sort_column.
 *
 * \param coo The COO matrix to sort.
 * \param sort_column Whether to additionally run the csrsort pass on the
 *        column array after the row sort.
 */
template <DLDeviceType XPU, typename IdType>
void COOSort_(COOMatrix* coo, bool sort_column) {
  // TODO(minjie): Current implementation is based on cusparse which only supports
  //   int32_t. To support int64_t, we could use the Radix sort algorithm provided
  //   by CUB.
  CHECK(sizeof(IdType) == 4) << "CUDA COOSort does not support int64.";
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  auto device = runtime::DeviceAPI::Get(coo->row->ctx);
  // Lazily create the cusparse handle and bind it to the thread's stream.
  if (!thr_entry->cusparse_handle) {
    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
  }
  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));
  const auto handle = thr_entry->cusparse_handle;

  NDArray row_arr = coo->row;
  NDArray col_arr = coo->col;
  const auto& ctx = row_arr->ctx;
  const int64_t nnz = row_arr->shape[0];
  if (!aten::COOHasData(*coo))
    coo->data = aten::Range(0, nnz, row_arr->dtype.bits, ctx);
  NDArray data_arr = coo->data;
  int32_t* rows = static_cast<int32_t*>(row_arr->data);
  int32_t* cols = static_cast<int32_t*>(col_arr->data);
  int32_t* vals = static_cast<int32_t*>(data_arr->data);

  // Pass 1: sort by row. Query scratch size, allocate, sort, release.
  size_t scratch_bytes = 0;
  CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt(
      handle, coo->num_rows, coo->num_cols, nnz, rows, cols, &scratch_bytes));
  void* row_scratch = device->AllocWorkspace(ctx, scratch_bytes);
  CUSPARSE_CALL(cusparseXcoosortByRow(
      handle, coo->num_rows, coo->num_cols, nnz, rows, cols, vals, row_scratch));
  device->FreeWorkspace(ctx, row_scratch);

  if (sort_column) {
    // Pass 2: build a temporary row indptr array and then call csrsort.
    int32_t* row_offsets = static_cast<int32_t*>(
        device->AllocWorkspace(ctx, (coo->num_rows + 1) * sizeof(IdType)));
    CUSPARSE_CALL(cusparseXcoo2csr(
        handle, rows, nnz, coo->num_rows, row_offsets, CUSPARSE_INDEX_BASE_ZERO));
    CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
        handle, coo->num_rows, coo->num_cols, nnz, row_offsets, cols, &scratch_bytes));
    void* col_scratch = device->AllocWorkspace(ctx, scratch_bytes);
    cusparseMatDescr_t descr;
    CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
    CUSPARSE_CALL(cusparseXcsrsort(
        handle, coo->num_rows, coo->num_cols, nnz,
        descr, row_offsets, cols, vals, col_scratch));
    CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
    device->FreeWorkspace(ctx, col_scratch);
    device->FreeWorkspace(ctx, row_offsets);
  }

  coo->row_sorted = true;
  coo->col_sorted = sort_column;
}
template void COOSort_<kDLGPU, int32_t>(COOMatrix* coo, bool sort_column);
template void COOSort_<kDLGPU, int64_t>(COOMatrix* coo, bool sort_column);
///////////////////////////// COOIsSorted /////////////////////////////
/*!
 * \brief For every COO entry, record whether it is in order relative to its
 *        predecessor.
 *
 * \param row Row indices, nnz elements.
 * \param col Column indices, nnz elements.
 * \param nnz Number of entries.
 * \param row_sorted Output flags, nnz elements: 1 if row[i-1] <= row[i]
 *        (always 1 for i == 0), else 0.
 * \param col_sorted Output flags, nnz elements: 1 if row[i-1] < row[i] or
 *        col[i-1] <= col[i] (always 1 for i == 0), else 0.
 *
 * \note The thread index and stride are 64-bit so that they cannot overflow
 *       while being compared against and advanced towards the 64-bit nnz.
 */
template <typename IdType>
__global__ void _COOIsSortedKernel(
    const IdType* row, const IdType* col,
    int64_t nnz, int8_t* row_sorted, int8_t* col_sorted) {
  int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const int64_t stride_x = static_cast<int64_t>(gridDim.x) * blockDim.x;
  while (tx < nnz) {
    if (tx == 0) {
      // The first entry has no predecessor.
      row_sorted[0] = 1;
      col_sorted[0] = 1;
    } else {
      row_sorted[tx] = static_cast<int8_t>(row[tx - 1] <= row[tx]);
      col_sorted[tx] = static_cast<int8_t>(
          row[tx - 1] < row[tx] || col[tx - 1] <= col[tx]);
    }
    tx += stride_x;
  }
}
/*!
 * \brief Check on the GPU whether a COO matrix is sorted.
 *
 * \param coo The COO matrix to inspect.
 * \return A pair (row_sorted, col_sorted). col_sorted is only evaluated
 *         when row_sorted is true and is false otherwise. An empty matrix
 *         is reported as (true, true).
 */
template <DLDeviceType XPU, typename IdType>
std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
  const int64_t nnz = coo.row->shape[0];
  // An empty matrix is trivially sorted. Returning here also avoids
  // zero-sized workspaces and an empty kernel launch.
  if (nnz == 0)
    return {true, true};
  const auto& ctx = coo.row->ctx;
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  auto device = runtime::DeviceAPI::Get(ctx);
  // We allocate a workspace of 2*nnz bytes. It wastes a little bit memory but should
  // be fine.
  int8_t* row_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
  int8_t* col_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
  const int nt = cuda::FindNumThreads(nnz);
  const int nb = (nnz + nt - 1) / nt;
  // One flag per entry; the flags are reduced with AllTrue below.
  _COOIsSortedKernel<<<nb, nt, 0, thr_entry->stream>>>(
      coo.row.Ptr<IdType>(), coo.col.Ptr<IdType>(),
      nnz, row_flags, col_flags);

  const bool row_sorted = cuda::AllTrue(row_flags, nnz, ctx);
  // Column order is only meaningful when the rows are sorted.
  const bool col_sorted = row_sorted ? cuda::AllTrue(col_flags, nnz, ctx) : false;

  device->FreeWorkspace(ctx, row_flags);
  device->FreeWorkspace(ctx, col_flags);

  return {row_sorted, col_sorted};
}
template std::pair<bool, bool> COOIsSorted<kDLGPU, int32_t>(COOMatrix coo);
template std::pair<bool, bool> COOIsSorted<kDLGPU, int64_t>(COOMatrix coo);
} // namespace impl
} // namespace aten
} // namespace dgl

108
src/array/cuda/csr_sort.cu Normal file
View File

@@ -0,0 +1,108 @@
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/csr_sort.cu
* \brief Sort CSR index
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
/*!
 * \brief Check whether each row is sorted.
 *
 * \param indptr CSR row offsets, num_rows + 1 elements.
 * \param indices CSR column indices.
 * \param num_rows Number of rows.
 * \param flags Output, num_rows elements: flags[r] is 1 if the column
 *        indices of row r are non-decreasing, else 0.
 *
 * \note The row index and stride are 64-bit so that they cannot overflow
 *       while being compared against and advanced towards the 64-bit
 *       num_rows.
 */
template <typename IdType>
__global__ void _SegmentIsSorted(
    const IdType* indptr, const IdType* indices,
    int64_t num_rows, int8_t* flags) {
  int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const int64_t stride_x = static_cast<int64_t>(gridDim.x) * blockDim.x;
  while (tx < num_rows) {
    bool f = true;
    // Stop scanning the row at the first descending pair.
    for (IdType i = indptr[tx] + 1; f && i < indptr[tx + 1]; ++i) {
      f = (indices[i - 1] <= indices[i]);
    }
    flags[tx] = static_cast<int8_t>(f);
    tx += stride_x;
  }
}
/*!
 * \brief Check on the GPU whether the column indices of every CSR row are
 *        sorted.
 *
 * \param csr The CSR matrix to inspect.
 * \return True if every row has non-decreasing column indices. A matrix
 *         with zero rows is reported as sorted.
 */
template <DLDeviceType XPU, typename IdType>
bool CSRIsSorted(CSRMatrix csr) {
  // No rows means nothing can be out of order. Returning here also avoids
  // a zero-sized workspace and an empty kernel launch.
  if (csr.num_rows == 0)
    return true;
  const auto& ctx = csr.indptr->ctx;
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  auto device = runtime::DeviceAPI::Get(ctx);
  // We allocate a workspace of num_rows bytes. It wastes a little bit memory but should
  // be fine.
  int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
  const int nt = cuda::FindNumThreads(csr.num_rows);
  const int nb = (csr.num_rows + nt - 1) / nt;
  // One flag per row; the flags are reduced with AllTrue below.
  _SegmentIsSorted<<<nb, nt, 0, thr_entry->stream>>>(
      csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
      csr.num_rows, flags);
  bool ret = cuda::AllTrue(flags, csr.num_rows, ctx);
  device->FreeWorkspace(ctx, flags);
  return ret;
}
template bool CSRIsSorted<kDLGPU, int32_t>(CSRMatrix csr);
template bool CSRIsSorted<kDLGPU, int64_t>(CSRMatrix csr);
/*!
 * \brief Sort the column indices of every CSR row in place on the GPU
 *        using cusparse csrsort.
 *
 * The data array is passed to csrsort together with the indices; if the
 * matrix has no data array, Range(0, nnz) is created first. On return
 * csr->sorted is true.
 *
 * \param csr The CSR matrix to sort; its indices and data buffers are
 *            overwritten.
 */
template <DLDeviceType XPU, typename IdType>
void CSRSort_(CSRMatrix* csr) {
  CHECK(sizeof(IdType) == 4) << "CUDA CSRSort_ does not support int64.";
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  auto device = runtime::DeviceAPI::Get(csr->indptr->ctx);
  // Lazily create the cusparse handle and bind it to the thread's stream.
  if (!thr_entry->cusparse_handle) {
    CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
  }
  CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));
  const auto handle = thr_entry->cusparse_handle;

  NDArray offsets_arr = csr->indptr;
  NDArray cols_arr = csr->indices;
  const auto& ctx = offsets_arr->ctx;
  const int64_t nnz = cols_arr->shape[0];
  if (!aten::CSRHasData(*csr))
    csr->data = aten::Range(0, nnz, cols_arr->dtype.bits, ctx);
  NDArray vals_arr = csr->data;

  // Query the scratch size required by csrsort, then allocate it.
  size_t scratch_bytes = 0;
  CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
      handle,
      csr->num_rows, csr->num_cols, nnz,
      offsets_arr.Ptr<int32_t>(), cols_arr.Ptr<int32_t>(),
      &scratch_bytes));
  void* scratch = device->AllocWorkspace(ctx, scratch_bytes);

  // General matrix with zero-based indexing.
  cusparseMatDescr_t descr;
  CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
  CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
  CUSPARSE_CALL(cusparseXcsrsort(
      handle,
      csr->num_rows, csr->num_cols, nnz,
      descr,
      offsets_arr.Ptr<int32_t>(), cols_arr.Ptr<int32_t>(),
      vals_arr.Ptr<int32_t>(),
      scratch));

  // Release cusparse and device resources, then mark the matrix sorted.
  CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
  device->FreeWorkspace(ctx, scratch);
  csr->sorted = true;
}
template void CSRSort_<kDLGPU, int32_t>(CSRMatrix* csr);
template void CSRSort_<kDLGPU, int64_t>(CSRMatrix* csr);
} // namespace impl
} // namespace aten
} // namespace dgl

View File

@@ -10,7 +10,7 @@
#include "macro.cuh"
#include "atomic.cuh"
#include "functor.cuh"
#include "../../cuda_utils.h"
#include "./utils.h"
#include "../../runtime/cuda/cuda_common.h"
namespace dgl {

View File

@@ -8,7 +8,7 @@
#include <unordered_set>
#include <numeric>
#include "../../runtime/cuda/cuda_common.h"
#include "../../cuda_utils.h"
#include "./utils.h"
namespace dgl {
@@ -17,8 +17,6 @@ using runtime::NDArray;
namespace aten {
namespace impl {
///////////////////////////// CSRIsNonZero /////////////////////////////
/*!
* \brief Search adjacency list linearly for each (row, col) pair and
* write the matched position in the indices array to the output.
@@ -33,7 +31,7 @@ __global__ void _LinearSearchKernel(
int64_t row_stride, int64_t col_stride,
int64_t length, IdType* out) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
const int stride_x = gridDim.x * blockDim.x;
int rpos = tx, cpos = tx;
while (tx < length) {
out[tx] = -1;
@@ -50,6 +48,8 @@ __global__ void _LinearSearchKernel(
}
}
///////////////////////////// CSRIsNonZero /////////////////////////////
template <DLDeviceType XPU, typename IdType>
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
@@ -169,6 +169,88 @@ NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
template NDArray CSRGetRowData<kDLGPU, int32_t>(CSRMatrix, int64_t);
template NDArray CSRGetRowData<kDLGPU, int64_t>(CSRMatrix, int64_t);
///////////////////////////// CSRSliceRows /////////////////////////////
/*!
 * \brief Slice the contiguous row range [start, end) out of a CSR matrix.
 *
 * The returned indptr is a fresh array rebased to start at zero. The
 * returned indices (and data, when the input has a data array) are views
 * into the input buffers. When the input has no data array, the returned
 * data is Range(st_pos, ed_pos), i.e. the positions of the sliced entries
 * in the original matrix.
 *
 * \param csr The input CSR matrix.
 * \param start First row of the slice (inclusive).
 * \param end Last row of the slice (exclusive).
 * \return The sliced CSR matrix; the sorted flag is inherited from csr.
 */
template <DLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
  // Positions in indices/data where the slice begins and ends.
  const IdType first_pos = aten::IndexSelect<IdType>(csr.indptr, start);
  const IdType last_pos = aten::IndexSelect<IdType>(csr.indptr, end);
  const IdType slice_nnz = last_pos - first_pos;
  IdArray sliced_indptr = aten::IndexSelect(csr.indptr, start, end + 1) - first_pos;
  // indices and data can be view arrays
  const auto byte_offset = first_pos * sizeof(IdType);
  IdArray sliced_indices = csr.indices.CreateView(
      {slice_nnz}, csr.indices->dtype, byte_offset);
  IdArray sliced_data;
  if (!CSRHasData(csr)) {
    sliced_data = aten::Range(first_pos, last_pos,
                              csr.indptr->dtype.bits, csr.indptr->ctx);
  } else {
    sliced_data = csr.data.CreateView({slice_nnz}, csr.data->dtype, byte_offset);
  }
  return CSRMatrix(end - start, csr.num_cols,
                   sliced_indptr, sliced_indices, sliced_data,
                   csr.sorted);
}
template CSRMatrix CSRSliceRows<kDLGPU, int32_t>(CSRMatrix, int64_t, int64_t);
template CSRMatrix CSRSliceRows<kDLGPU, int64_t>(CSRMatrix, int64_t, int64_t);
/*!
 * \brief Copy data segment to output buffers
 *
 * For the i^th row r = row[i * row_stride], copy the data from
 * indptr[r] ~ indptr[r+1] to the out_data from out_indptr[i] ~ out_indptr[i+1]
 *
 * If the provided `data` array is nullptr, write the read index to the out_data.
 *
 * \param indptr CSR row offsets of the source matrix.
 * \param data Source values, or nullptr to emit the source positions.
 * \param row Rows to copy.
 * \param row_stride Element stride used to index `row`.
 * \param length Number of output rows.
 * \param out_indptr Row offsets of the output.
 * \param out_data Output buffer.
 *
 * \note The position in `row` is derived from the output row index on every
 *       iteration, so it stays in step with `tx` if a thread handles more
 *       than one output row. Indices are 64-bit to match `length`.
 */
template <typename IdType, typename DType>
__global__ void _SegmentCopyKernel(
    const IdType* indptr, const DType* data,
    const IdType* row, int64_t row_stride, int64_t length,
    const IdType* out_indptr, DType* out_data) {
  int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const int64_t stride_x = static_cast<int64_t>(gridDim.x) * blockDim.x;
  while (tx < length) {
    const IdType r = row[tx * row_stride];
    DType* out_buf = out_data + out_indptr[tx];
    for (IdType i = indptr[r]; i < indptr[r + 1]; ++i) {
      *(out_buf++) = data ? data[i] : i;
    }
    tx += stride_x;
  }
}
/*!
 * \brief Gather the given rows of a CSR matrix into a new CSR matrix.
 *
 * \param csr The input CSR matrix.
 * \param rows The rows to select; output row i is input row rows[i].
 * \return A CSR matrix with rows->shape[0] rows and freshly allocated
 *         indptr, indices and data arrays. When the input has no data
 *         array, the output data holds the positions of the copied entries
 *         in the input. The sorted flag is inherited from csr.
 */
template <DLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  const int64_t len = rows->shape[0];
  if (len == 0) {
    // Selecting no rows yields a 0-row matrix: indptr is [0] and
    // indices/data are empty. Handling it here avoids reading element 0 of
    // an empty indptr and launching kernels over zero rows.
    const auto bits = csr.indptr->dtype.bits;
    const auto& ctx = csr.indptr->ctx;
    return CSRMatrix(0, csr.num_cols,
                     aten::Full(0, 1, bits, ctx),
                     aten::NewIdArray(0, ctx, bits),
                     aten::NewIdArray(0, ctx, bits),
                     csr.sorted);
  }
  // Output row offsets: prefix sum of the selected rows' lengths with a
  // leading zero, so the last element is the output nnz.
  IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true);
  const int64_t nnz = aten::IndexSelect<IdType>(ret_indptr, len);

  const int nt = cuda::FindNumThreads(len);
  const int nb = (len + nt - 1) / nt;
  // Copy indices.
  IdArray ret_indices = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
  _SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
      csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
      rows.Ptr<IdType>(), 1, len,
      ret_indptr.Ptr<IdType>(), ret_indices.Ptr<IdType>());
  // Copy data. A null source makes the kernel write source positions.
  IdArray ret_data = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
  _SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
      csr.indptr.Ptr<IdType>(), CSRHasData(csr)? csr.data.Ptr<IdType>() : nullptr,
      rows.Ptr<IdType>(), 1, len,
      ret_indptr.Ptr<IdType>(), ret_data.Ptr<IdType>());
  return CSRMatrix(len, csr.num_cols,
                   ret_indptr, ret_indices, ret_data,
                   csr.sorted);
}
template CSRMatrix CSRSliceRows<kDLGPU, int32_t>(CSRMatrix , NDArray);
template CSRMatrix CSRSliceRows<kDLGPU, int64_t>(CSRMatrix , NDArray);
} // namespace impl
} // namespace aten

View File

@@ -140,6 +140,7 @@ void CusparseCsrmm2(
static_cast<int32_t*>(csr.indptr->data),
static_cast<int32_t*>(csr.indices->data),
B_data, n, &beta, trans_out, m));
CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
if (valptr)
device->FreeWorkspace(ctx, valptr);
// transpose the output matrix

View File

@@ -9,8 +9,8 @@
#include <dgl/bcast.h>
#include "macro.cuh"
#include "atomic.cuh"
#include "../../cuda_utils.h"
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {

30
src/array/cuda/utils.cu Normal file
View File

@@ -0,0 +1,30 @@
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/utils.cu
* \brief Utilities for CUDA kernels.
*/
#include "./utils.h"
#include <cub/cub.cuh>
#include "../../runtime/cuda/cuda_common.h"
namespace dgl {
namespace cuda {
bool AllTrue(int8_t* flags, int64_t length, const DLContext& ctx) {
auto device = runtime::DeviceAPI::Get(ctx);
int8_t* rst = static_cast<int8_t*>(device->AllocWorkspace(ctx, 1));
// Call CUB's reduction
size_t workspace_size = 0;
CUDA_CALL(cub::DeviceReduce::Min(nullptr, workspace_size, flags, rst, length));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUDA_CALL(cub::DeviceReduce::Min(workspace, workspace_size, flags, rst, length));
int8_t cpu_rst = 0;
CUDA_CALL(cudaMemcpy(&cpu_rst, rst, 1, cudaMemcpyDeviceToHost));
device->FreeWorkspace(ctx, workspace);
device->FreeWorkspace(ctx, rst);
return cpu_rst == 1;
}
} // namespace cuda
} // namespace dgl

View File

@@ -1,12 +1,13 @@
/*!
* Copyright (c) 2020 by Contributors
* \file cuda_utils.h
* \file array/cuda/utils.h
* \brief Utilities for CUDA kernels.
*/
#ifndef DGL_CUDA_UTILS_H_
#define DGL_CUDA_UTILS_H_
#ifndef DGL_ARRAY_CUDA_UTILS_H_
#define DGL_ARRAY_CUDA_UTILS_H_
#include <dmlc/logging.h>
#include <dlpack/dlpack.h>
namespace dgl {
namespace cuda {
@@ -68,7 +69,18 @@ __device__ __forceinline__ T _ldg(T* addr) {
#endif
}
/*!
* \brief Return true if the given bool flag array is all true.
* The input bool array is in int8_t type so it is aligned with byte address.
*
* \param flags The bool array.
* \param length The length.
* \param ctx Device context.
* \return True if all the flags are true.
*/
bool AllTrue(int8_t* flags, int64_t length, const DLContext& ctx);
} // namespace cuda
} // namespace dgl
#endif // DGL_CUDA_UTILS_H_
#endif // DGL_ARRAY_CUDA_UTILS_H_

View File

@@ -3,7 +3,6 @@
* \file array/kernel.cc
* \brief New kernels
*/
#include <dgl/array.h>
#include <dgl/packed_func_ext.h>
#include <dgl/base_heterograph.h>

View File

@@ -6,9 +6,9 @@
#ifndef DGL_ARRAY_KERNEL_DECL_H_
#define DGL_ARRAY_KERNEL_DECL_H_
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <dgl/base_heterograph.h>
#include <dgl/runtime/ndarray.h>
#include <string>
#include <vector>

View File

@@ -14,6 +14,7 @@
#include <algorithm>
#include <vector>
#include <string>
#include <utility>
namespace dgl {

View File

@@ -804,7 +804,7 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
}
}
int msg_count = 0;
for (int i = 0; i < remote_ids.size(); ++i) {
for (size_t i = 0; i < remote_ids.size(); ++i) {
if (remote_ids[i].size() != 0) {
KVStoreMsg kv_msg;
kv_msg.msg_type = MessageType::kPullMsg;
@@ -827,9 +827,10 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
}
}
char *return_data = new char[ID_size*row_size];
const int64_t local_ids_size = local_ids.size();
// Copy local data
#pragma omp parallel for
for (int64_t i = 0; i < local_ids.size(); ++i) {
for (int64_t i = 0; i < local_ids_size; ++i) {
CHECK_GE(ID_size*row_size, local_ids_orginal[i] * row_size + row_size);
CHECK_GE(data_size, local_ids[i] * row_size + row_size);
CHECK_GE(local_ids[i], 0);
@@ -843,7 +844,7 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
int64_t id_size = kv_msg->id.GetSize() / sizeof(int64_t);
int part_id = kv_msg->rank / group_count;
char* data_char = static_cast<char*>(kv_msg->data->data);
for (size_t n = 0; n < id_size; ++n) {
for (int64_t n = 0; n < id_size; ++n) {
memcpy(return_data + remote_ids_original[part_id][n] * row_size,
data_char + n * row_size,
row_size);

View File

@@ -20,6 +20,7 @@
#include <vector>
#include <algorithm>
#include <utility>
#include <memory>
#include "../../c_api_common.h"
using dgl::runtime::NDArray;

View File

@@ -51,9 +51,11 @@ ToBlock(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes, bool includ
const auto src_dst_types = graph->GetEndpointTypes(etype);
const dgl_type_t srctype = src_dst_types.first;
const dgl_type_t dsttype = src_dst_types.second;
const EdgeArray edges = graph->InEdges(etype, rhs_nodes[dsttype]);
lhs_node_mappings[srctype].Update(edges.src);
edge_arrays[etype] = edges;
if (!aten::IsNullArray(rhs_nodes[dsttype])) {
const EdgeArray& edges = graph->Edges(etype);
lhs_node_mappings[srctype].Update(edges.src);
edge_arrays[etype] = edges;
}
}
const auto meta_graph = graph->meta_graph();
@@ -75,11 +77,26 @@ ToBlock(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes, bool includ
const dgl_type_t dsttype = src_dst_types.second;
const IdHashMap<IdType> &lhs_map = lhs_node_mappings[srctype];
const IdHashMap<IdType> &rhs_map = rhs_node_mappings[dsttype];
rel_graphs.push_back(CreateFromCOO(
2, lhs_map.Size(), rhs_map.Size(),
lhs_map.Map(edge_arrays[etype].src, -1),
rhs_map.Map(edge_arrays[etype].dst, -1)));
induced_edges.push_back(edge_arrays[etype].id);
if (rhs_map.Size() == 0) {
// No rhs nodes are given for this edge type. Create an empty graph.
rel_graphs.push_back(CreateFromCOO(
2, lhs_map.Size(), rhs_map.Size(),
aten::NullArray(), aten::NullArray()));
induced_edges.push_back(aten::NullArray());
} else {
IdArray new_src = lhs_map.Map(edge_arrays[etype].src, -1);
IdArray new_dst = rhs_map.Map(edge_arrays[etype].dst, -1);
// Check whether there are unmapped IDs and raise error.
for (int64_t i = 0; i < new_dst->shape[0]; ++i)
CHECK_NE(new_dst.Ptr<IdType>()[i], -1)
<< "Node " << edge_arrays[etype].dst.Ptr<IdType>()[i] << " does not exist"
<< " in `rhs_nodes`. Argument `rhs_nodes` must contain all the edge"
<< " destination nodes.";
rel_graphs.push_back(CreateFromCOO(
2, lhs_map.Size(), rhs_map.Size(),
new_src, new_dst));
induced_edges.push_back(edge_arrays[etype].id);
}
}
const HeteroGraphPtr new_graph = CreateHeteroGraph(

View File

@@ -138,13 +138,7 @@ class UnitGraph::COO : public BaseHeteroGraph {
COO CopyTo(const DLContext& ctx) const {
if (Context() == ctx)
return *this;
COO ret(
meta_graph_,
adj_.num_rows, adj_.num_cols,
adj_.row.CopyTo(ctx),
adj_.col.CopyTo(ctx));
return ret;
return COO(meta_graph_, adj_.CopyTo(ctx));
}
bool IsMultigraph() const override {
@@ -516,13 +510,7 @@ class UnitGraph::CSR : public BaseHeteroGraph {
if (Context() == ctx) {
return *this;
} else {
CSR ret(
meta_graph_,
adj_.num_rows, adj_.num_cols,
adj_.indptr.CopyTo(ctx),
adj_.indices.CopyTo(ctx),
adj_.data.CopyTo(ctx));
return ret;
return CSR(meta_graph_, adj_.CopyTo(ctx));
}
}
@@ -1181,35 +1169,28 @@ HeteroGraphPtr UnitGraph::AsNumBits(HeteroGraphPtr g, uint8_t bits) {
if (g->NumBits() == bits) {
return g;
} else {
// TODO(minjie): since we don't have int32 operations,
// we make sure that this graph (on CPU) has materialized CSR,
// and then copy them to other context (usually GPU). This should
// be fixed later.
auto bg = std::dynamic_pointer_cast<UnitGraph>(g);
CHECK_NOTNULL(bg);
CSRPtr new_incsr = CSRPtr(new CSR(bg->GetInCSR()->AsNumBits(bits)));
CSRPtr new_outcsr = CSRPtr(new CSR(bg->GetOutCSR()->AsNumBits(bits)));
CSRPtr new_incsr = (bg->in_csr_)? CSRPtr(new CSR(bg->in_csr_->AsNumBits(bits))) : nullptr;
CSRPtr new_outcsr = (bg->out_csr_)? CSRPtr(new CSR(bg->out_csr_->AsNumBits(bits))) : nullptr;
COOPtr new_coo = (bg->coo_)? COOPtr(new COO(bg->coo_->AsNumBits(bits))) : nullptr;
return HeteroGraphPtr(
new UnitGraph(g->meta_graph(), new_incsr, new_outcsr, nullptr, bg->restrict_format_));
new UnitGraph(g->meta_graph(), new_incsr, new_outcsr, new_coo, bg->restrict_format_));
}
}
HeteroGraphPtr UnitGraph::CopyTo(HeteroGraphPtr g, const DLContext& ctx) {
if (ctx == g->Context()) {
return g;
} else {
auto bg = std::dynamic_pointer_cast<UnitGraph>(g);
CHECK_NOTNULL(bg);
CSRPtr new_incsr = (bg->in_csr_)? CSRPtr(new CSR(bg->in_csr_->CopyTo(ctx))) : nullptr;
CSRPtr new_outcsr = (bg->out_csr_)? CSRPtr(new CSR(bg->out_csr_->CopyTo(ctx))) : nullptr;
COOPtr new_coo = (bg->coo_)? COOPtr(new COO(bg->coo_->CopyTo(ctx))) : nullptr;
return HeteroGraphPtr(
new UnitGraph(g->meta_graph(), new_incsr, new_outcsr, new_coo, bg->restrict_format_));
}
// TODO(minjie): since we don't have GPU implementation of COO<->CSR,
// we make sure that this graph (on CPU) has materialized CSR,
// and then copy them to other context (usually GPU). This should
// be fixed later.
auto bg = std::dynamic_pointer_cast<UnitGraph>(g);
CHECK_NOTNULL(bg);
CSRPtr new_incsr = CSRPtr(new CSR(bg->GetInCSR()->CopyTo(ctx)));
CSRPtr new_outcsr = CSRPtr(new CSR(bg->GetOutCSR()->CopyTo(ctx)));
return HeteroGraphPtr(
new UnitGraph(g->meta_graph(), new_incsr, new_outcsr, nullptr, bg->restrict_format_));
}
UnitGraph::UnitGraph(GraphPtr metagraph, CSRPtr in_csr, CSRPtr out_csr, COOPtr coo,
@@ -1278,9 +1259,8 @@ UnitGraph::CSRPtr UnitGraph::GetInCSR(bool inplace) const {
const_cast<UnitGraph*>(this)->in_csr_ = ret;
} else {
CHECK(coo_) << "None of CSR, COO exist";
const auto& adj = coo_->adj();
const auto& newadj = aten::COOToCSR(
aten::COOMatrix{adj.num_cols, adj.num_rows, adj.col, adj.row});
const auto& newadj = aten::CSRSort(aten::COOToCSR(
aten::COOTranspose(coo_->adj())));
ret = std::make_shared<CSR>(meta_graph(), newadj);
if (inplace)
const_cast<UnitGraph*>(this)->in_csr_ = ret;
@@ -1299,13 +1279,13 @@ UnitGraph::CSRPtr UnitGraph::GetOutCSR(bool inplace) const {
CSRPtr ret = out_csr_;
if (!out_csr_) {
if (in_csr_) {
const auto& newadj = aten::CSRTranspose(in_csr_->adj());
const auto& newadj = aten::CSRSort(aten::CSRTranspose(in_csr_->adj()));
ret = std::make_shared<CSR>(meta_graph(), newadj);
if (inplace)
const_cast<UnitGraph*>(this)->out_csr_ = ret;
} else {
CHECK(coo_) << "None of CSR, COO exist";
const auto& newadj = aten::COOToCSR(coo_->adj());
const auto& newadj = aten::CSRSort(aten::COOToCSR(coo_->adj()));
ret = std::make_shared<CSR>(meta_graph(), newadj);
if (inplace)
const_cast<UnitGraph*>(this)->out_csr_ = ret;

View File

@@ -8,6 +8,7 @@
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <memory>
#include "socket_communicator.h"
#include "../../c_api_common.h"

View File

@@ -10,6 +10,7 @@
#include <vector>
#include <string>
#include <unordered_map>
#include <memory>
#include "communicator.h"
#include "msg_queue.h"
@@ -19,9 +20,9 @@
namespace dgl {
namespace network {
static int kMaxTryCount = 1024; // maximal connection: 1024
static int kTimeOut = 10; // 10 minutes for socket timeout
static int kMaxConnection = 1024; // maximal connection: 1024
static constexpr int kMaxTryCount = 1024; // maximal number of tries: 1024
static constexpr int kTimeOut = 10; // 10 minutes for socket timeout
static constexpr int kMaxConnection = 1024; // maximal connection: 1024
/*!
* \brief Networking address

View File

@@ -7,6 +7,7 @@
#include <dgl/runtime/serializer.h>
#include <fstream>
#include <vector>
#include <unordered_map>
#include "file_util.h"

View File

@@ -7,6 +7,7 @@
#define DGL_RUNTIME_FILE_UTIL_H_
#include <string>
#include <unordered_map>
#include "meta_data.h"
namespace dgl {

View File

@@ -9,6 +9,7 @@
#include <dgl/runtime/module.h>
#include <dgl/runtime/registry.h>
#include <string>
#include <memory>
#include "module_util.h"
namespace dgl {

View File

@@ -10,6 +10,7 @@
#include <dgl/runtime/c_runtime_api.h>
#include <dgl/runtime/c_backend_api.h>
#include <vector>
#include <memory>
extern "C" {
// Function signature for generated packed function in shared library

View File

@@ -124,6 +124,8 @@ size_t NDArray::GetSize() const {
}
int64_t NDArray::NumElements() const {
if (data_->dl_tensor.ndim == 0)
return 0;
int64_t size = 1;
for (int i = 0; i < data_->dl_tensor.ndim; ++i) {
size *= data_->dl_tensor.shape[i];

View File

@@ -4,6 +4,7 @@
* \brief Workspace pool utility.
*/
#include "workspace_pool.h"
#include <memory>
namespace dgl {
namespace runtime {

View File

@@ -8,6 +8,7 @@
#include <dgl/runtime/device_api.h>
#include <vector>
#include <memory>
namespace dgl {
namespace runtime {

View File

@@ -1883,4 +1883,4 @@ if __name__ == '__main__':
# test_isolated_ntype()
# test_bipartite()
# test_dtype_cast()
test_format()
pass

View File

@@ -603,10 +603,6 @@ def test_to_block(index_dtype):
assert bg.number_of_src_nodes() == 4
assert bg.number_of_dst_nodes() == 4
dst_nodes = F.tensor([3, 4], dtype=getattr(F, index_dtype))
bg = dgl.to_block(g_a, dst_nodes)
check(g_a, bg, 'A', 'AA', dst_nodes)
dst_nodes = F.tensor([4, 3, 2, 1], dtype=getattr(F, index_dtype))
bg = dgl.to_block(g_a, dst_nodes)
check(g_a, bg, 'A', 'AA', dst_nodes)
@@ -620,17 +616,13 @@ def test_to_block(index_dtype):
assert bg.number_of_nodes('DST/A') == 0
checkall(g_ab, bg, None)
dst_nodes = {'B': F.tensor([5, 6], dtype=getattr(F, index_dtype))}
dst_nodes = {'B': F.tensor([5, 6, 3, 1], dtype=getattr(F, index_dtype))}
bg = dgl.to_block(g, dst_nodes)
assert bg.number_of_nodes('SRC/B') == 2
assert bg.number_of_nodes('SRC/B') == 4
assert F.array_equal(bg.srcnodes['B'].data[dgl.NID], bg.dstnodes['B'].data[dgl.NID])
assert bg.number_of_nodes('DST/A') == 0
checkall(g, bg, dst_nodes)
dst_nodes = {'A': F.tensor([3, 4], dtype=getattr(F, index_dtype)), 'B': F.tensor([5, 6], dtype=getattr(F, index_dtype))}
bg = dgl.to_block(g, dst_nodes)
checkall(g, bg, dst_nodes)
dst_nodes = {'A': F.tensor([4, 3, 2, 1], dtype=getattr(F, index_dtype)), 'B': F.tensor([3, 5, 6, 1], dtype=getattr(F, index_dtype))}
bg = dgl.to_block(g, dst_nodes=dst_nodes)
checkall(g, bg, dst_nodes)

View File

@@ -29,6 +29,10 @@ inline int64_t Len(dgl::runtime::NDArray nd) {
template <typename T>
inline bool ArrayEQ(dgl::runtime::NDArray a1, dgl::runtime::NDArray a2) {
if (a1->ndim != a2->ndim) return false;
if (a1->dtype != a2->dtype) return false;
if (a1->ctx != a2->ctx) return false;
if (a1.NumElements() != a2.NumElements()) return false;
if (a1.NumElements() == 0) return true;
int64_t num = 1;
for (int i = 0; i < a1->ndim; ++i) {
if (a1->shape[i] != a2->shape[i])

View File

@@ -208,6 +208,8 @@ template <typename IDX>
void _TestIndexSelect(DLContext ctx) {
IdArray a = aten::Range(0, 100, sizeof(IDX)*8, ctx);
ASSERT_EQ(aten::IndexSelect<int>(a, 50), 50);
ASSERT_TRUE(ArrayEQ<IDX>(aten::IndexSelect(a, 10, 20),
aten::Range(10, 20, sizeof(IDX)*8, ctx)));
IdArray b = aten::VecToIdArray(std::vector<IDX>({0, 20, 10}), sizeof(IDX)*8, ctx);
IdArray c = aten::IndexSelect(a, b);
ASSERT_TRUE(ArrayEQ<IDX>(b, c));
@@ -239,3 +241,41 @@ TEST(ArrayTest, TestRelabel_) {
_TestRelabel_<int32_t>();
_TestRelabel_<int64_t>();
}
/*!
 * \brief Check aten::CumSum against hand-computed results on the given device.
 * \tparam IDX Integer type of the array elements.
 * \param ctx Device context on which all arrays are created.
 */
template <typename IDX>
void _TestCumSum(DLContext ctx) {
  // Non-empty input.
  IdArray input = aten::VecToIdArray(
      std::vector<IDX>({8, 6, 7, 5, 3, 0, 9}), sizeof(IDX)*8, ctx);
  {
    // Default call: running sum with the same length as the input.
    IdArray expected = aten::VecToIdArray(
        std::vector<IDX>({8, 14, 21, 26, 29, 29, 38}), sizeof(IDX)*8, ctx);
    IdArray actual = aten::CumSum(input);
    ASSERT_TRUE(ArrayEQ<IDX>(actual, expected));
  }
  {
    // Second argument true: the result carries a leading 0 and is one longer.
    IdArray expected = aten::VecToIdArray(
        std::vector<IDX>({0, 8, 14, 21, 26, 29, 29, 38}), sizeof(IDX)*8, ctx);
    IdArray actual = aten::CumSum(input, true);
    ASSERT_TRUE(ArrayEQ<IDX>(actual, expected));
  }

  // Empty input: the result is expected to be empty as well.
  input = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, ctx);
  {
    IdArray expected = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, ctx);
    IdArray actual = aten::CumSum(input);
    ASSERT_TRUE(ArrayEQ<IDX>(actual, expected));
  }
  {
    // NOTE(review): this repeats the previous check verbatim; it may have been
    // meant to cover CumSum(input, true) on an empty array -- confirm.
    IdArray expected = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, ctx);
    IdArray actual = aten::CumSum(input);
    ASSERT_TRUE(ArrayEQ<IDX>(actual, expected));
  }
}
// Runs the CumSum checks for 32-bit and 64-bit element types on CPU, and
// additionally on GPU when DGL_USE_CUDA is defined.
TEST(ArrayTest, CumSum) {
_TestCumSum<int32_t>(CPU);
_TestCumSum<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCumSum<int32_t>(GPU);
_TestCumSum<int64_t>(GPU);
#endif
}

View File

@@ -17,8 +17,8 @@ aten::CSRMatrix CSR1(DLContext ctx = CTX) {
return aten::CSRMatrix(
4, 5,
aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 5, 5}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 2, 0, 2, 3}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 1, 4}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 2, 0, 3, 2}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 4, 1}), sizeof(IDX)*8, ctx),
false);
}
@@ -277,12 +277,23 @@ void _TestCSRToCOO(DLContext ctx) {
auto coo = CSRToCOO(csr, false);
ASSERT_EQ(coo.num_rows, 4);
ASSERT_EQ(coo.num_cols, 5);
ASSERT_TRUE(coo.row_sorted);
auto tr = aten::VecToIdArray(std::vector<IDX>({0, 0, 0, 1, 2, 2}), sizeof(IDX)*8, ctx);
auto tc = aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0, 2, 3}), sizeof(IDX)*8, ctx);
auto td = aten::VecToIdArray(std::vector<IDX>({0, 2, 5, 3, 1, 4}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(coo.row, tr));
ASSERT_TRUE(ArrayEQ<IDX>(coo.col, tc));
ASSERT_TRUE(ArrayEQ<IDX>(coo.data, td));
ASSERT_TRUE(ArrayEQ<IDX>(coo.col, csr.indices));
ASSERT_TRUE(ArrayEQ<IDX>(coo.data, csr.data));
// convert from sorted csr
auto s_csr = CSRSort(csr);
coo = CSRToCOO(s_csr, false);
ASSERT_EQ(coo.num_rows, 4);
ASSERT_EQ(coo.num_cols, 5);
ASSERT_TRUE(coo.row_sorted);
ASSERT_TRUE(coo.col_sorted);
tr = aten::VecToIdArray(std::vector<IDX>({0, 0, 0, 1, 2, 2}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(coo.row, tr));
ASSERT_TRUE(ArrayEQ<IDX>(coo.col, s_csr.indices));
ASSERT_TRUE(ArrayEQ<IDX>(coo.data, s_csr.data));
}
{
auto coo = CSRToCOO(csr, true);
@@ -294,7 +305,7 @@ void _TestCSRToCOO(DLContext ctx) {
}
}
TEST(SpmatTest, TestCSRToCOO) {
TEST(SpmatTest, CSRToCOO) {
_TestCSRToCOO<int32_t>(CPU);
_TestCSRToCOO<int64_t>(CPU);
#if DGL_USE_CUDA
@@ -303,8 +314,8 @@ TEST(SpmatTest, TestCSRToCOO) {
}
template <typename IDX>
void _TestCSRSliceRows() {
auto csr = CSR2<IDX>();
void _TestCSRSliceRows(DLContext ctx) {
auto csr = CSR2<IDX>(ctx);
auto x = aten::CSRSliceRows(csr, 1, 4);
// [1, 0, 0, 0, 0],
// [0, 0, 1, 1, 0],
@@ -312,30 +323,34 @@ void _TestCSRSliceRows() {
// data: [3, 1, 4]
ASSERT_EQ(x.num_rows, 3);
ASSERT_EQ(x.num_cols, 5);
auto tp = aten::VecToIdArray(std::vector<IDX>({0, 1, 3, 3}), sizeof(IDX)*8, CTX);
auto ti = aten::VecToIdArray(std::vector<IDX>({0, 2, 3}), sizeof(IDX)*8, CTX);
auto td = aten::VecToIdArray(std::vector<IDX>({3, 1, 4}), sizeof(IDX)*8, CTX);
auto tp = aten::VecToIdArray(std::vector<IDX>({0, 1, 3, 3}), sizeof(IDX)*8, ctx);
auto ti = aten::VecToIdArray(std::vector<IDX>({0, 2, 3}), sizeof(IDX)*8, ctx);
auto td = aten::VecToIdArray(std::vector<IDX>({3, 1, 4}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x.indptr, tp));
ASSERT_TRUE(ArrayEQ<IDX>(x.indices, ti));
ASSERT_TRUE(ArrayEQ<IDX>(x.data, td));
auto r = aten::VecToIdArray(std::vector<IDX>({0, 1, 3}), sizeof(IDX)*8, CTX);
auto r = aten::VecToIdArray(std::vector<IDX>({0, 1, 3}), sizeof(IDX)*8, ctx);
x = aten::CSRSliceRows(csr, r);
// [[0, 1, 2, 0, 0],
// [1, 0, 0, 0, 0],
// [0, 0, 0, 0, 0]]
// data: [0, 2, 5, 3]
tp = aten::VecToIdArray(std::vector<IDX>({0, 3, 4, 4}), sizeof(IDX)*8, CTX);
ti = aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0}), sizeof(IDX)*8, CTX);
td = aten::VecToIdArray(std::vector<IDX>({0, 2, 5, 3}), sizeof(IDX)*8, CTX);
tp = aten::VecToIdArray(std::vector<IDX>({0, 3, 4, 4}), sizeof(IDX)*8, ctx);
ti = aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0}), sizeof(IDX)*8, ctx);
td = aten::VecToIdArray(std::vector<IDX>({0, 2, 5, 3}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x.indptr, tp));
ASSERT_TRUE(ArrayEQ<IDX>(x.indices, ti));
ASSERT_TRUE(ArrayEQ<IDX>(x.data, td));
}
TEST(SpmatTest, TestCSRSliceRows) {
_TestCSRSliceRows<int32_t>();
_TestCSRSliceRows<int64_t>();
_TestCSRSliceRows<int32_t>(CPU);
_TestCSRSliceRows<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRSliceRows<int32_t>(GPU);
_TestCSRSliceRows<int64_t>(GPU);
#endif
}
template <typename IDX>
@@ -376,6 +391,29 @@ TEST(SpmatTest, TestCSRHasDuplicate) {
_TestCSRHasDuplicate<int64_t>();
}
/*!
 * \brief Check CSRIsSorted, the out-of-place CSRSort and the in-place CSRSort_.
 * \tparam IDX Integer type of the CSR index arrays.
 * \param ctx Device context on which the test matrices are created.
 */
template <typename IDX>
void _TestCSRSort(DLContext ctx) {
  // The CSR1 fixture is expected to start out unsorted.
  auto unsorted = CSR1<IDX>(ctx);
  ASSERT_FALSE(aten::CSRIsSorted(unsorted));

  // Out-of-place sort: the input must stay unsorted, while the returned
  // matrix is sorted and has its `sorted` flag set.
  auto sorted_copy = aten::CSRSort(unsorted);
  ASSERT_FALSE(aten::CSRIsSorted(unsorted));
  ASSERT_TRUE(aten::CSRIsSorted(sorted_copy));
  ASSERT_TRUE(sorted_copy.sorted);

  // In-place sort: the same matrix object is now sorted and flagged.
  aten::CSRSort_(&unsorted);
  ASSERT_TRUE(aten::CSRIsSorted(unsorted));
  ASSERT_TRUE(unsorted.sorted);

  // The CSR2 fixture is expected to be sorted as constructed.
  auto presorted = CSR2<IDX>(ctx);
  ASSERT_TRUE(aten::CSRIsSorted(presorted));
}
// Runs the CSR sorting checks for 32-bit and 64-bit index types on CPU, and
// for the 32-bit index type on GPU when DGL_USE_CUDA is defined.
// NOTE(review): no int64_t GPU run is listed here -- confirm whether the
// 64-bit GPU path is intentionally left untested.
TEST(SpmatTest, CSRSort) {
_TestCSRSort<int32_t>(CPU);
_TestCSRSort<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRSort<int32_t>(GPU);
#endif
}
template <typename IDX>
void _TestCOOToCSR(DLContext ctx) {
auto coo = COO1<IDX>(ctx);
@@ -392,6 +430,7 @@ void _TestCOOToCSR(DLContext ctx) {
ASSERT_EQ(coo.num_cols, csr.num_cols);
ASSERT_TRUE(ArrayEQ<IDX>(csr.indptr, tcsr.indptr));
// Convert from row sorted coo
coo = COO1<IDX>(ctx);
auto rs_coo = aten::COOSort(coo, false);
auto rs_csr = CSR1<IDX>(ctx);
@@ -399,6 +438,8 @@ void _TestCOOToCSR(DLContext ctx) {
ASSERT_EQ(coo.num_rows, rs_tcsr.num_rows);
ASSERT_EQ(coo.num_cols, rs_tcsr.num_cols);
ASSERT_TRUE(ArrayEQ<IDX>(rs_csr.indptr, rs_tcsr.indptr));
ASSERT_TRUE(ArrayEQ<IDX>(rs_tcsr.indices, rs_coo.col));
ASSERT_TRUE(ArrayEQ<IDX>(rs_tcsr.data, rs_coo.data));
coo = COO3<IDX>(ctx);
rs_coo = aten::COOSort(coo, false);
@@ -407,16 +448,20 @@ void _TestCOOToCSR(DLContext ctx) {
ASSERT_EQ(coo.num_rows, rs_tcsr.num_rows);
ASSERT_EQ(coo.num_cols, rs_tcsr.num_cols);
ASSERT_TRUE(ArrayEQ<IDX>(rs_csr.indptr, rs_tcsr.indptr));
ASSERT_TRUE(ArrayEQ<IDX>(rs_tcsr.indices, rs_coo.col));
ASSERT_TRUE(ArrayEQ<IDX>(rs_tcsr.data, rs_coo.data));
// Convert from col sorted coo
coo = COO1<IDX>(ctx);
auto src_coo = aten::COOSort(coo, true);
auto src_csr = CSR1<IDX>(ctx);
auto src_tcsr = aten::COOToCSR(src_coo);
ASSERT_EQ(coo.num_rows, src_tcsr.num_rows);
ASSERT_EQ(coo.num_cols, src_tcsr.num_cols);
ASSERT_TRUE(ArrayEQ<IDX>(src_csr.indptr, src_tcsr.indptr));
ASSERT_TRUE(ArrayEQ<IDX>(src_csr.indices, src_tcsr.indices));
ASSERT_TRUE(ArrayEQ<IDX>(src_csr.data, src_tcsr.data));
ASSERT_TRUE(src_tcsr.sorted);
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.indptr, src_csr.indptr));
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.indices, src_coo.col));
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.data, src_coo.data));
coo = COO3<IDX>(ctx);
src_coo = aten::COOSort(coo, true);
@@ -424,12 +469,13 @@ void _TestCOOToCSR(DLContext ctx) {
src_tcsr = aten::COOToCSR(src_coo);
ASSERT_EQ(coo.num_rows, src_tcsr.num_rows);
ASSERT_EQ(coo.num_cols, src_tcsr.num_cols);
ASSERT_TRUE(ArrayEQ<IDX>(src_csr.indptr, src_tcsr.indptr));
ASSERT_TRUE(ArrayEQ<IDX>(src_csr.indices, src_tcsr.indices));
ASSERT_TRUE(ArrayEQ<IDX>(src_csr.data, src_tcsr.data));
ASSERT_TRUE(src_tcsr.sorted);
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.indptr, src_csr.indptr));
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.indices, src_coo.col));
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.data, src_coo.data));
}
TEST(SpmatTest, TestCOOToCSR) {
TEST(SpmatTest, COOToCSR) {
_TestCOOToCSR<int32_t>(CPU);
_TestCOOToCSR<int64_t>(CPU);
#ifdef DGL_USE_CUDA
@@ -453,12 +499,37 @@ TEST(SpmatTest, TestCOOHasDuplicate) {
template <typename IDX>
void _TestCOOSort(DLContext ctx) {
auto coo = COO3<IDX>(ctx);
auto sr_coo = COOSort(coo, false);
ASSERT_EQ(coo.num_rows, sr_coo.num_rows);
ASSERT_EQ(coo.num_cols, sr_coo.num_cols);
ASSERT_TRUE(sr_coo.row_sorted);
auto flags = COOIsSorted(sr_coo);
ASSERT_TRUE(flags.first);
flags = COOIsSorted(coo); // original coo should stay the same
ASSERT_FALSE(flags.first);
ASSERT_FALSE(flags.second);
auto src_coo = COOSort(coo, true);
ASSERT_EQ(coo.num_rows, src_coo.num_rows);
ASSERT_EQ(coo.num_cols, src_coo.num_cols);
ASSERT_TRUE(src_coo.row_sorted);
ASSERT_TRUE(src_coo.col_sorted);
flags = COOIsSorted(src_coo);
ASSERT_TRUE(flags.first);
ASSERT_TRUE(flags.second);
// sort inplace
COOSort_(&coo);
ASSERT_TRUE(coo.row_sorted);
flags = COOIsSorted(coo);
ASSERT_TRUE(flags.first);
COOSort_(&coo, true);
ASSERT_TRUE(coo.row_sorted);
ASSERT_TRUE(coo.col_sorted);
flags = COOIsSorted(coo);
ASSERT_TRUE(flags.first);
ASSERT_TRUE(flags.second);
// COO3
// [[0, 1, 2, 0, 0],
@@ -489,7 +560,7 @@ void _TestCOOSort(DLContext ctx) {
ASSERT_TRUE(ArrayEQ<IDX>(src_coo.data, sort_col_data));
}
TEST(SpmatTest, TestCOOSort) {
TEST(SpmatTest, COOSort) {
_TestCOOSort<int32_t>(CPU);
_TestCOOSort<int64_t>(CPU);
#ifdef DGL_USE_CUDA

1
third_party/cub vendored Submodule

Submodule third_party/cub added at c3cceac115