mirror of
https://github.com/dmlc/dgl.git
synced 2026-06-04 19:44:23 +08:00
[CUDA][Kernel] More CUDA kernels; Standardize the behavior for sorted COO/CSR (#1704)
* add cub; array cumsum * CSRSliceRows * fix warning * operator << for ndarray; CSRSliceRows * add CSRIsSorted * add csr_sort * inplace coosort and outplace csrsort * WIP: coo is sorted * mv cuda_utils * add AllTrue utility * csr sort * coo sort * coo2csr for sorted coo arrays * CSRToCOO from sorted * pass tests for the new kernel changes * cannot use inplace sort * lint * try fix msvc error * Fix g.copy_to and g.asnumbits; ToBlock no longer uses CSC * stash * revert some hack * revert some changes * address comments * fix * fix to_block unittest * add todo note
This commit is contained in:
4
.gitmodules
vendored
4
.gitmodules
vendored
@@ -13,6 +13,10 @@
|
||||
[submodule "third_party/METIS"]
|
||||
path = third_party/METIS
|
||||
url = https://github.com/KarypisLab/METIS.git
|
||||
[submodule "third_party/cub"]
|
||||
path = third_party/cub
|
||||
url = https://github.com/NVlabs/cub.git
|
||||
branch = 1.8.0
|
||||
[submodule "third_party/phmap"]
|
||||
path = third_party/phmap
|
||||
url = https://github.com/greg7mdp/parallel-hashmap.git
|
||||
|
||||
@@ -44,6 +44,8 @@ include_directories("third_party/METIS/include/")
|
||||
include_directories("third_party/dmlc-core/include")
|
||||
include_directories("third_party/minigun/minigun")
|
||||
include_directories("third_party/minigun/third_party/moderngpu/src")
|
||||
include_directories("third_party/cub/")
|
||||
include_directories("third_party/phmap/")
|
||||
|
||||
# initial variables
|
||||
set(DGL_LINKER_LIBS "")
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <tuple>
|
||||
#include <string>
|
||||
#include "./types.h"
|
||||
|
||||
namespace dgl {
|
||||
@@ -131,9 +132,18 @@ IdArray HStack(IdArray arr1, IdArray arr2);
|
||||
* \tparam ValueType The type of return value.
|
||||
*/
|
||||
template<typename ValueType>
|
||||
ValueType IndexSelect(NDArray array, uint64_t index);
|
||||
ValueType IndexSelect(NDArray array, int64_t index);
|
||||
|
||||
/*!
|
||||
* \brief Return the data under the index. In numpy notation, A[I]
|
||||
*/
|
||||
NDArray IndexSelect(NDArray array, IdArray index);
|
||||
|
||||
/*!
|
||||
* \brief Return the data from `start` (inclusive) to `end` (exclusive).
|
||||
*/
|
||||
NDArray IndexSelect(NDArray array, int64_t start, int64_t end);
|
||||
|
||||
/*!
|
||||
* \brief Permute the elements of an array according to given indices.
|
||||
*
|
||||
@@ -238,6 +248,27 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, ValueType pad_value);
|
||||
*/
|
||||
std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths);
|
||||
|
||||
/*!
|
||||
* \brief Return the cumulative summation (or inclusive sum) of the input array.
|
||||
*
|
||||
* The first element out[0] is equal to the first element of the input array
|
||||
* array[0]. The rest elements are defined recursively, out[i] = out[i-1] + array[i].
|
||||
* Hence, the result array length is the same as the input array length.
|
||||
*
|
||||
* If prepend_zero is true, then the first element is zero and the result array
|
||||
* length is the input array length plus one. This is useful for creating
|
||||
* an indptr array over a count array.
|
||||
*
|
||||
* \param array The 1D input array.
|
||||
* \return Array after cumsum.
|
||||
*/
|
||||
IdArray CumSum(IdArray array, bool prepend_zero = false);
|
||||
|
||||
/*!
|
||||
* \brief Return a string that prints out some debug information.
|
||||
*/
|
||||
std::string ToDebugString(NDArray array);
|
||||
|
||||
// inline implementations
|
||||
template <typename T>
|
||||
IdArray VecToIdArray(const std::vector<T>& vec,
|
||||
|
||||
@@ -116,6 +116,16 @@ struct COOMatrix {
|
||||
CHECK_NO_OVERFLOW(row->dtype, num_rows);
|
||||
CHECK_NO_OVERFLOW(row->dtype, num_cols);
|
||||
}
|
||||
|
||||
/*! \brief Return a copy of this matrix on the given device context. */
|
||||
inline COOMatrix CopyTo(const DLContext& ctx) const {
  // The context of the row array is used as the context of the whole matrix.
  // Only when it differs from the requested one are the arrays transferred.
  if (!(ctx == row->ctx)) {
    // A null data array is forwarded untouched; otherwise it is copied
    // together with the row and col arrays.
    auto data_on_ctx = aten::IsNullArray(data) ? data : data.CopyTo(ctx);
    return COOMatrix(num_rows, num_cols,
                     row.CopyTo(ctx), col.CopyTo(ctx),
                     data_on_ctx,
                     row_sorted, col_sorted);
  }
  return *this;
}
|
||||
};
|
||||
|
||||
///////////////////////// COO routines //////////////////////////
|
||||
@@ -141,6 +151,17 @@ inline bool COOHasData(COOMatrix csr) {
|
||||
return !IsNullArray(csr.data);
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief Check whether the COO is sorted.
|
||||
*
|
||||
* It returns two flags: one for whether the row is sorted;
|
||||
* the other for whether the columns of each row are sorted
|
||||
* if the first flag is true.
|
||||
*
|
||||
* Complexity: O(NNZ)
|
||||
*/
|
||||
std::pair<bool, bool> COOIsSorted(COOMatrix coo);
|
||||
|
||||
/*! \brief Get data. The return type is an ndarray due to possible duplicate entries. */
|
||||
runtime::NDArray COOGetData(COOMatrix , int64_t row, int64_t col);
|
||||
|
||||
@@ -161,6 +182,20 @@ COOMatrix COOTranspose(COOMatrix coo);
|
||||
* the result CSR matrix stores a shuffle index for how the entries
|
||||
* will be reordered in CSR. The i^th entry in the result CSR corresponds
|
||||
* to the CSR.data[i] th entry in the input COO.
|
||||
*
|
||||
* Conversion complexity: O(nnz)
|
||||
*
|
||||
* - The function first checks whether the input COO matrix is sorted
|
||||
* using a linear scan.
|
||||
* - If the COO matrix is row sorted, the conversion can be done very
|
||||
* efficiently in a sequential scan. The result indices and data arrays
|
||||
* are directly equal to the column and data arrays from the input.
|
||||
* - If the COO matrix is further column sorted, the result CSR is
|
||||
* also column sorted.
|
||||
* - Otherwise, the conversion is more costly but still is O(nnz).
|
||||
*
|
||||
* \param coo Input COO matrix.
|
||||
* \return CSR matrix.
|
||||
*/
|
||||
CSRMatrix COOToCSR(COOMatrix coo);
|
||||
|
||||
@@ -195,6 +230,21 @@ bool COOHasDuplicate(COOMatrix coo);
|
||||
*/
|
||||
std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
|
||||
|
||||
/*!
|
||||
* \brief Sort the indices of a COO matrix in-place.
|
||||
*
|
||||
* The function sorts row indices in ascending order. If sort_column is true,
|
||||
* col indices are sorted in ascending order too. The data array of the returned COOMatrix
|
||||
* stores the shuffled index which could be used to fetch edge data.
|
||||
*
|
||||
* Complexity: O(N*log(N)) time and O(1) space, where N is the number of nonzeros.
|
||||
* TODO(minjie): The time complexity could be improved to O(N) by using a O(N) space.
|
||||
*
|
||||
* \param mat The coo matrix to sort.
|
||||
* \param sort_column True if column index should be sorted too.
|
||||
*/
|
||||
void COOSort_(COOMatrix* mat, bool sort_column = false);
|
||||
|
||||
/*!
|
||||
* \brief Sort the indices of a COO matrix.
|
||||
*
|
||||
@@ -202,11 +252,23 @@ std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
|
||||
* col indices are sorted in ascending order too. The data array of the returned COOMatrix
|
||||
* stores the shuffled index which could be used to fetch edge data.
|
||||
*
|
||||
* Complexity: O(N*log(N)) time and O(1) space, where N is the number of nonzeros.
|
||||
* TODO(minjie): The time complexity could be improved to O(N) by using a O(N) space.
|
||||
*
|
||||
* \param mat The input coo matrix
|
||||
* \param sort_column True if column index should be sorted too.
|
||||
* \return COO matrix with index sorted.
|
||||
*/
|
||||
COOMatrix COOSort(COOMatrix mat, bool sort_column = false);
|
||||
inline COOMatrix COOSort(COOMatrix mat, bool sort_column = false) {
  // Nothing to do when the column-sorted flag is set, or when only a row
  // ordering is requested and the row-sorted flag is already set.
  const bool already_sorted =
      mat.col_sorted || (mat.row_sorted && !sort_column);
  if (already_sorted)
    return mat;

  // Sort a clone so the arrays of the input matrix are left untouched.
  // A missing data array is passed through as-is.
  auto data_copy = COOHasData(mat) ? mat.data.Clone() : mat.data;
  COOMatrix sorted_mat(mat.num_rows, mat.num_cols,
                       mat.row.Clone(), mat.col.Clone(),
                       data_copy,
                       mat.row_sorted, mat.col_sorted);
  COOSort_(&sorted_mat, sort_column);
  return sorted_mat;
}
|
||||
|
||||
/*!
|
||||
* \brief Remove entries from COO matrix by entry indices (data indices)
|
||||
|
||||
@@ -106,6 +106,17 @@ struct CSRMatrix {
|
||||
}
|
||||
CHECK_NO_OVERFLOW(indptr->dtype, num_rows);
|
||||
CHECK_NO_OVERFLOW(indptr->dtype, num_cols);
|
||||
CHECK_EQ(indptr->shape[0], num_rows + 1);
|
||||
}
|
||||
|
||||
/*! \brief Return a copy of this matrix on the given device context. */
|
||||
inline CSRMatrix CopyTo(const DLContext& ctx) const {
  // The context of indptr is used as the context of the whole matrix.
  // Only when it differs from the requested one are the arrays transferred.
  if (!(ctx == indptr->ctx)) {
    // A null data array is forwarded untouched; otherwise it is copied
    // together with the indptr and indices arrays.
    auto data_on_ctx = aten::IsNullArray(data) ? data : data.CopyTo(ctx);
    return CSRMatrix(num_rows, num_cols,
                     indptr.CopyTo(ctx), indices.CopyTo(ctx),
                     data_on_ctx,
                     sorted);
  }
  return *this;
}
|
||||
};
|
||||
|
||||
@@ -134,6 +145,9 @@ inline bool CSRHasData(CSRMatrix csr) {
|
||||
return !IsNullArray(csr.data);
|
||||
}
|
||||
|
||||
/*! \brief Whether the column indices of each row are sorted. */
|
||||
bool CSRIsSorted(CSRMatrix csr);
|
||||
|
||||
/* \brief Get data. The return type is an ndarray due to possible duplicate entries. */
|
||||
runtime::NDArray CSRGetData(CSRMatrix , int64_t row, int64_t col);
|
||||
/*!
|
||||
@@ -155,6 +169,15 @@ CSRMatrix CSRTranspose(CSRMatrix csr);
|
||||
|
||||
/*!
|
||||
* \brief Convert CSR matrix to COO matrix.
|
||||
*
|
||||
* Complexity: O(nnz)
|
||||
*
|
||||
* - If data_as_order is false, the column and data arrays of the
|
||||
* result COO are equal to the indices and data arrays of the
|
||||
* input CSR. The result COO is also row sorted.
|
||||
* - If the input CSR is further sorted, the result COO is also
|
||||
* column sorted.
|
||||
*
|
||||
* \param csr Input csr matrix
|
||||
* \param data_as_order If true, the data array in the input csr matrix contains the order
|
||||
* by which the resulting COO tuples are stored. In this case, the
|
||||
@@ -166,9 +189,8 @@ COOMatrix CSRToCOO(CSRMatrix csr, bool data_as_order);
|
||||
|
||||
/*!
|
||||
* \brief Slice rows of the given matrix and return.
|
||||
* \param csr CSR matrix
|
||||
* \param start Start row id (inclusive)
|
||||
* \param end End row id (exclusive)
|
||||
*
|
||||
* The sliced row IDs are relabeled to starting from zero.
|
||||
*
|
||||
* Examples:
|
||||
* num_rows = 4
|
||||
@@ -182,6 +204,11 @@ COOMatrix CSRToCOO(CSRMatrix csr, bool data_as_order);
|
||||
* num_cols = 4
|
||||
* indptr = [0, 1, 1]
|
||||
* indices = [2]
|
||||
*
|
||||
* \param csr CSR matrix
|
||||
* \param start Start row id (inclusive)
|
||||
* \param end End row id (exclusive)
|
||||
* \return sliced rows stored in a CSR matrix
|
||||
*/
|
||||
CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end);
|
||||
CSRMatrix CSRSliceRows(CSRMatrix csr, runtime::NDArray rows);
|
||||
@@ -192,6 +219,8 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, runtime::NDArray rows);
|
||||
* In numpy notation, given matrix M, row index array I, col index array J
|
||||
* This function returns the submatrix M[I, J].
|
||||
*
|
||||
* The sliced row and column IDs are relabeled to starting from zero.
|
||||
*
|
||||
* \param csr The input csr matrix
|
||||
* \param rows The row index to select
|
||||
* \param cols The col index to select
|
||||
@@ -203,7 +232,10 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
|
||||
bool CSRHasDuplicate(CSRMatrix csr);
|
||||
|
||||
/*!
|
||||
* \brief Sort the column index at each row in the ascending order.
|
||||
* \brief Sort the column index at each row in ascending order in-place.
|
||||
*
|
||||
* Only the indices and data arrays (if available) will be mutated. The indptr array
|
||||
* stays the same.
|
||||
*
|
||||
* Examples:
|
||||
* num_rows = 4
|
||||
@@ -218,6 +250,22 @@ bool CSRHasDuplicate(CSRMatrix csr);
|
||||
*/
|
||||
void CSRSort_(CSRMatrix* csr);
|
||||
|
||||
/*!
|
||||
* \brief Sort the column index at each row in ascending order.
|
||||
*
|
||||
* Return a new CSR matrix with sorted column indices and data arrays.
|
||||
*/
|
||||
inline CSRMatrix CSRSort(CSRMatrix csr) {
  if (!csr.sorted) {
    // indptr is shared with the input (the in-place sort is documented to
    // leave it unchanged); indices and data are cloned before sorting.
    auto data_copy = CSRHasData(csr) ? csr.data.Clone() : csr.data;
    CSRMatrix sorted_csr(csr.num_rows, csr.num_cols,
                         csr.indptr, csr.indices.Clone(),
                         data_copy,
                         csr.sorted);
    CSRSort_(&sorted_csr);
    return sorted_csr;
  }
  // Already flagged as sorted: return the input as-is.
  return csr;
}
|
||||
|
||||
/*!
|
||||
* \brief Reorder the rows and columns according to the new row and column order.
|
||||
* \param csr The input csr matrix.
|
||||
|
||||
@@ -252,4 +252,8 @@
|
||||
CHECK_LE((val), 0x7FFFFFFFL) << "int32 overflow for argument " << (#val) << "."; \
|
||||
} while (0);
|
||||
|
||||
#define CHECK_IS_ID_ARRAY(VAR) \
|
||||
CHECK((VAR)->ndim == 1 && (IS_INT32(VAR) || IS_INT64(VAR))) \
|
||||
<< "Expected argument " << (#VAR) << " to be an 1D integer array.";
|
||||
|
||||
#endif // DGL_ATEN_MACRO_H_
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
|
||||
#include "./runtime/object.h"
|
||||
#include "array.h"
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include <utility>
|
||||
#include <tuple>
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include "runtime/ndarray.h"
|
||||
#include "graph_interface.h"
|
||||
#include "lazy.h"
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
|
||||
#include "./runtime/object.h"
|
||||
#include "graph_interface.h"
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
|
||||
#include "c_runtime_api.h"
|
||||
#include "dlpack/dlpack.h"
|
||||
@@ -157,6 +158,10 @@ class NDArray {
|
||||
* \return The array under another context.
|
||||
*/
|
||||
inline NDArray CopyTo(const DLContext& ctx) const;
|
||||
/*!
|
||||
* \brief Return a new array with a copy of the content.
|
||||
*/
|
||||
inline NDArray Clone() const;
|
||||
/*!
|
||||
* \brief Load NDArray from stream
|
||||
* \param stream The input data stream
|
||||
@@ -410,6 +415,12 @@ inline NDArray NDArray::CopyTo(const DLContext& ctx) const {
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline NDArray NDArray::Clone() const {
  CHECK(data_ != nullptr);
  // Cloning is a CopyTo() whose target is the array's own context.
  const DLContext& own_ctx = operator->()->ctx;
  return CopyTo(own_ctx);
}
|
||||
|
||||
inline int NDArray::use_count() const {
|
||||
if (data_ == nullptr) return 0;
|
||||
return data_->ref_counter_.load(std::memory_order_relaxed);
|
||||
@@ -627,6 +638,8 @@ dgl::runtime::NDArray operator <= (int64_t lhs, const dgl::runtime::NDArray& a2)
|
||||
dgl::runtime::NDArray operator == (int64_t lhs, const dgl::runtime::NDArray& a2);
|
||||
dgl::runtime::NDArray operator != (int64_t lhs, const dgl::runtime::NDArray& a2);
|
||||
|
||||
std::ostream& operator << (std::ostream& os, dgl::runtime::NDArray array);
|
||||
|
||||
///////////////// Operator overloading for DLDataType /////////////////
|
||||
|
||||
/*! \brief Check whether two data types are the same.*/
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include <string>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <type_traits>
|
||||
#include "c_runtime_api.h"
|
||||
#include "module.h"
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <dgl/graph_serializer.h>
|
||||
#include <dmlc/io.h>
|
||||
#include <dmlc/serializer.h>
|
||||
#include <memory>
|
||||
|
||||
namespace dmlc {
|
||||
namespace serializer {
|
||||
|
||||
@@ -17,31 +17,36 @@
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
|
||||
#include "dmlc/logging.h"
|
||||
|
||||
namespace dgl {
|
||||
|
||||
/* StreamWithBuffer is backed up by dmlc::MemoryFixedSizeStream or
|
||||
dmlc::MemoryStringStream. This class supports serializing and deserializing
|
||||
NDArrays stored in shared memory. If the stream is created for
|
||||
sending/recving data through network, the data pointer of the NDArray will be
|
||||
transmitted directly without and copy. Otherwise, the stream is for
|
||||
sending/recving data to another process on the same machine, so if an NDArray
|
||||
is stored in shared memory, it will just record the shared memory name
|
||||
instead of the actual data buffer.
|
||||
For example:
|
||||
std::string blob;
|
||||
// Send to local
|
||||
StreamWithBuffer strm(&blob, false);
|
||||
// Send to remote
|
||||
StreamWithBuffer strm(&blob, true);
|
||||
// Receive from local
|
||||
StreamWithBuffer strm(&blob, false);
|
||||
// Receive from remote
|
||||
std::vector<void*> ptr_list
|
||||
StreamWithBuffer strm(&blob, ptr_list);
|
||||
*/
|
||||
/*!
|
||||
*
|
||||
* StreamWithBuffer is backed up by dmlc::MemoryFixedSizeStream or
|
||||
* dmlc::MemoryStringStream. This class supports serializing and deserializing
|
||||
* NDArrays stored in shared memory. If the stream is created for
|
||||
* sending/recving data through network, the data pointer of the NDArray will be
|
||||
* transmitted directly without any copy. Otherwise, the stream is for
|
||||
* sending/recving data to another process on the same machine, so if an NDArray
|
||||
* is stored in shared memory, it will just record the shared memory name
|
||||
* instead of the actual data buffer.
|
||||
*
|
||||
* For example:
|
||||
*
|
||||
* std::string blob;
|
||||
* // Send to local
|
||||
* StreamWithBuffer strm(&blob, false);
|
||||
* // Send to remote
|
||||
* StreamWithBuffer strm(&blob, true);
|
||||
* // Receive from local
|
||||
* StreamWithBuffer strm(&blob, false);
|
||||
* // Receive from remote
|
||||
* std::vector<void*> ptr_list
|
||||
* StreamWithBuffer strm(&blob, ptr_list);
|
||||
*/
|
||||
class StreamWithBuffer : public dmlc::SeekStream {
|
||||
public:
|
||||
// Buffer type. Storing NDArray to maintain the reference counting to ensure
|
||||
|
||||
@@ -8,6 +8,8 @@
|
||||
#include <dgl/packed_func_ext.h>
|
||||
#include <dgl/runtime/container.h>
|
||||
#include <dgl/runtime/shared_mem.h>
|
||||
#include <dgl/runtime/device_api.h>
|
||||
#include <sstream>
|
||||
#include "../c_api_common.h"
|
||||
#include "./array_op.h"
|
||||
#include "./arith.h"
|
||||
@@ -100,8 +102,10 @@ NDArray IndexSelect(NDArray array, IdArray index) {
|
||||
}
|
||||
|
||||
template<typename ValueType>
|
||||
ValueType IndexSelect(NDArray array, uint64_t index) {
|
||||
ValueType IndexSelect(NDArray array, int64_t index) {
|
||||
CHECK_EQ(array->ndim, 1) << "Only support select values from 1D array.";
|
||||
CHECK(index >= 0 && index < array.NumElements())
|
||||
<< "Index " << index << " is out of bound.";
|
||||
ValueType ret = 0;
|
||||
ATEN_XPU_SWITCH_CUDA(array->ctx.device_type, XPU, "IndexSelect", {
|
||||
ATEN_DTYPE_SWITCH(array->dtype, DType, "values", {
|
||||
@@ -110,12 +114,30 @@ ValueType IndexSelect(NDArray array, uint64_t index) {
|
||||
});
|
||||
return ret;
|
||||
}
|
||||
template int32_t IndexSelect<int32_t>(NDArray array, uint64_t index);
|
||||
template int64_t IndexSelect<int64_t>(NDArray array, uint64_t index);
|
||||
template uint32_t IndexSelect<uint32_t>(NDArray array, uint64_t index);
|
||||
template uint64_t IndexSelect<uint64_t>(NDArray array, uint64_t index);
|
||||
template float IndexSelect<float>(NDArray array, uint64_t index);
|
||||
template double IndexSelect<double>(NDArray array, uint64_t index);
|
||||
template int32_t IndexSelect<int32_t>(NDArray array, int64_t index);
|
||||
template int64_t IndexSelect<int64_t>(NDArray array, int64_t index);
|
||||
template uint32_t IndexSelect<uint32_t>(NDArray array, int64_t index);
|
||||
template uint64_t IndexSelect<uint64_t>(NDArray array, int64_t index);
|
||||
template float IndexSelect<float>(NDArray array, int64_t index);
|
||||
template double IndexSelect<double>(NDArray array, int64_t index);
|
||||
|
||||
NDArray IndexSelect(NDArray array, int64_t start, int64_t end) {
  // Validate the half-open range [start, end) against the 1D input.
  CHECK_EQ(array->ndim, 1) << "Only support select values from 1D array.";
  CHECK(start >= 0 && start < array.NumElements())
    << "Index " << start << " is out of bound.";
  CHECK(end >= 0 && end <= array.NumElements())
    << "Index " << end << " is out of bound.";
  CHECK_LE(start, end);

  auto dev_api = runtime::DeviceAPI::Get(array->ctx);
  const int64_t count = end - start;
  NDArray slice = NDArray::Empty({count}, array->dtype, array->ctx);
  ATEN_DTYPE_SWITCH(array->dtype, DType, "values", {
    // Offsets and sizes are expressed in bytes of the element type.
    const auto src_offset = start * sizeof(DType);
    const auto num_bytes = count * sizeof(DType);
    dev_api->CopyDataFromTo(array->data, src_offset,
                            slice->data, 0, num_bytes,
                            array->ctx, slice->ctx, array->dtype, nullptr);
  });
  return slice;
}
|
||||
|
||||
NDArray Scatter(NDArray array, IdArray indices) {
|
||||
NDArray ret;
|
||||
@@ -181,6 +203,31 @@ std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
IdArray CumSum(IdArray array, bool prepend_zero) {
  // Dispatch on device type and id type to the matching implementation.
  IdArray result;
  const auto dev_type = array->ctx.device_type;
  ATEN_XPU_SWITCH_CUDA(dev_type, XPU, "CumSum", {
    ATEN_ID_TYPE_SWITCH(array->dtype, IdType, {
      result = impl::CumSum<XPU, IdType>(array, prepend_zero);
    });
  });
  return result;
}
|
||||
|
||||
std::string ToDebugString(NDArray array) {
  // Values are read through a CPU copy of the array.
  NDArray host = array.CopyTo(DLContext{kDLCPU, 0});
  const int64_t total = host.NumElements();
  // At most the first ten elements are printed.
  const int64_t shown = std::min<int64_t>(total, 10L);

  std::ostringstream out;
  out << "array([";
  ATEN_DTYPE_SWITCH(host->dtype, DType, "array", {
    const DType* values = host.Ptr<DType>();
    for (int64_t k = 0; k < shown; ++k)
      out << values[k] << ", ";
  });
  if (total > 10)
    out << "...";
  // dtype and ctx are reported from the original array, not the CPU copy.
  out << "], dtype=" << array->dtype << ", ctx=" << array->ctx << ")";
  return out.str();
}
|
||||
|
||||
///////////////////////// CSR routines //////////////////////////
|
||||
|
||||
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
|
||||
@@ -250,6 +297,16 @@ NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool CSRIsSorted(CSRMatrix csr) {
  // With zero or one stored entry the answer is true without dispatching.
  const auto num_entries = csr.indices->shape[0];
  if (num_entries > 1) {
    bool is_sorted = false;
    ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRIsSorted", {
      is_sorted = impl::CSRIsSorted<XPU, IdType>(csr);
    });
    return is_sorted;
  }
  return true;
}
|
||||
|
||||
NDArray CSRGetData(CSRMatrix csr, int64_t row, int64_t col) {
|
||||
CHECK(row >= 0 && row < csr.num_rows) << "Invalid row index: " << row;
|
||||
CHECK(col >= 0 && col < csr.num_cols) << "Invalid col index: " << col;
|
||||
@@ -318,7 +375,7 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
|
||||
CHECK(end >= 0 && end <= csr.num_rows) << "Invalid end index: " << end;
|
||||
CHECK_GE(end, start);
|
||||
CSRMatrix ret;
|
||||
ATEN_CSR_SWITCH(csr, XPU, IdType, "CSRSliceRows", {
|
||||
ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRSliceRows", {
|
||||
ret = impl::CSRSliceRows<XPU, IdType>(csr, start, end);
|
||||
});
|
||||
return ret;
|
||||
@@ -328,7 +385,7 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
|
||||
CHECK_SAME_DTYPE(csr.indices, rows);
|
||||
CHECK_SAME_CONTEXT(csr.indices, rows);
|
||||
CSRMatrix ret;
|
||||
ATEN_CSR_SWITCH(csr, XPU, IdType, "CSRSliceRows", {
|
||||
ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRSliceRows", {
|
||||
ret = impl::CSRSliceRows<XPU, IdType>(csr, rows);
|
||||
});
|
||||
return ret;
|
||||
@@ -347,7 +404,9 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, NDArray rows, NDArray cols) {
|
||||
}
|
||||
|
||||
void CSRSort_(CSRMatrix* csr) {
|
||||
ATEN_CSR_SWITCH(*csr, XPU, IdType, "CSRSort_", {
|
||||
if (csr->sorted)
|
||||
return;
|
||||
ATEN_CSR_SWITCH_CUDA(*csr, XPU, IdType, "CSRSort_", {
|
||||
impl::CSRSort_<XPU, IdType>(csr);
|
||||
});
|
||||
}
|
||||
@@ -509,13 +568,23 @@ COOMatrix COOSliceMatrix(COOMatrix coo, NDArray rows, NDArray cols) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
COOMatrix COOSort(COOMatrix mat, bool sort_column) {
|
||||
COOMatrix ret;
|
||||
ATEN_XPU_SWITCH_CUDA(mat.row->ctx.device_type, XPU, "COOSort", {
|
||||
ATEN_ID_TYPE_SWITCH(mat.row->dtype, IdType, {
|
||||
ret = impl::COOSort<XPU, IdType>(mat, sort_column);
|
||||
void COOSort_(COOMatrix* mat, bool sort_column) {
  // Sorting is required unless the column-sorted flag is set, or the
  // row-sorted flag is set and no column ordering was requested.
  const bool needs_sort =
      !mat->col_sorted && (!mat->row_sorted || sort_column);
  if (!needs_sort)
    return;
  const auto dev_type = mat->row->ctx.device_type;
  ATEN_XPU_SWITCH_CUDA(dev_type, XPU, "COOSort_", {
    ATEN_ID_TYPE_SWITCH(mat->row->dtype, IdType, {
      impl::COOSort_<XPU, IdType>(mat, sort_column);
    });
  });
}
|
||||
|
||||
std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
  // With zero or one entry both flags are true without dispatching.
  const auto num_entries = coo.row->shape[0];
  if (num_entries > 1) {
    std::pair<bool, bool> flags;
    ATEN_COO_SWITCH_CUDA(coo, XPU, IdType, "COOIsSorted", {
      flags = impl::COOIsSorted<XPU, IdType>(coo);
    });
    return flags;
  }
  return std::pair<bool, bool>(true, true);
}
|
||||
|
||||
@@ -709,3 +778,7 @@ DGL_REGISTER_GLOBAL("ndarray._CAPI_DGLExistSharedMemArray")
|
||||
|
||||
} // namespace aten
|
||||
} // namespace dgl
|
||||
|
||||
std::ostream& operator << (std::ostream& os, dgl::runtime::NDArray array) {
|
||||
return os << dgl::aten::ToDebugString(array);
|
||||
}
|
||||
|
||||
@@ -3,8 +3,8 @@
|
||||
* \file array/array_aritch.cc
|
||||
* \brief DGL array arithmetic operations
|
||||
*/
|
||||
#include <dgl/array.h>
|
||||
#include <dgl/packed_func_ext.h>
|
||||
#include <dgl/runtime/ndarray.h>
|
||||
#include <dgl/runtime/container.h>
|
||||
#include "../c_api_common.h"
|
||||
#include "./array_op.h"
|
||||
|
||||
@@ -44,7 +44,7 @@ template <DLDeviceType XPU, typename DType, typename IdType>
|
||||
NDArray IndexSelect(NDArray array, IdArray index);
|
||||
|
||||
template <DLDeviceType XPU, typename DType>
|
||||
DType IndexSelect(NDArray array, uint64_t index);
|
||||
DType IndexSelect(NDArray array, int64_t index);
|
||||
|
||||
template <DLDeviceType XPU, typename DType, typename IdType>
|
||||
NDArray Scatter(NDArray array, IdArray indices);
|
||||
@@ -61,6 +61,9 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, DType pad_value);
|
||||
template <DLDeviceType XPU, typename DType, typename IdType>
|
||||
std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths);
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
IdArray CumSum(IdArray array, bool prepend_zero);
|
||||
|
||||
// sparse arrays
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
@@ -84,6 +87,9 @@ runtime::NDArray CSRGetRowColumnIndices(CSRMatrix csr, int64_t row);
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
runtime::NDArray CSRGetRowData(CSRMatrix csr, int64_t row);
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
bool CSRIsSorted(CSRMatrix csr);
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
runtime::NDArray CSRGetData(CSRMatrix csr, int64_t row, int64_t col);
|
||||
|
||||
@@ -187,7 +193,10 @@ template <DLDeviceType XPU, typename IdType>
|
||||
std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
COOMatrix COOSort(COOMatrix mat, bool sort_column);
|
||||
void COOSort_(COOMatrix* mat, bool sort_column);
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
std::pair<bool, bool> COOIsSorted(COOMatrix coo);
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
COOMatrix COORemove(COOMatrix coo, IdArray entries);
|
||||
|
||||
42
src/array/cpu/array_cumsum.cc
Normal file
42
src/array/cpu/array_cumsum.cc
Normal file
@@ -0,0 +1,42 @@
|
||||
/*!
|
||||
* Copyright (c) 2020 by Contributors
|
||||
* \file array/cpu/array_cumsum.cc
|
||||
* \brief Array cumsum CPU implementation
|
||||
*/
|
||||
#include <dgl/array.h>
|
||||
|
||||
namespace dgl {
|
||||
using runtime::NDArray;
|
||||
namespace aten {
|
||||
namespace impl {
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
IdArray CumSum(IdArray array, bool prepend_zero) {
|
||||
const int64_t len = array.NumElements();
|
||||
if (len == 0)
|
||||
return array;
|
||||
if (prepend_zero) {
|
||||
IdArray ret = aten::NewIdArray(len + 1, array->ctx, array->dtype.bits);
|
||||
const IdType* in_d = array.Ptr<IdType>();
|
||||
IdType* out_d = ret.Ptr<IdType>();
|
||||
out_d[0] = 0;
|
||||
for (int64_t i = 0; i < len; ++i)
|
||||
out_d[i + 1] = out_d[i] + in_d[i];
|
||||
return ret;
|
||||
} else {
|
||||
IdArray ret = aten::NewIdArray(len, array->ctx, array->dtype.bits);
|
||||
const IdType* in_d = array.Ptr<IdType>();
|
||||
IdType* out_d = ret.Ptr<IdType>();
|
||||
out_d[0] = in_d[0];
|
||||
for (int64_t i = 1; i < len; ++i)
|
||||
out_d[i] = out_d[i - 1] + in_d[i];
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
template IdArray CumSum<kDLCPU, int32_t>(IdArray, bool);
|
||||
template IdArray CumSum<kDLCPU, int64_t>(IdArray, bool);
|
||||
|
||||
} // namespace impl
|
||||
} // namespace aten
|
||||
} // namespace dgl
|
||||
@@ -35,20 +35,16 @@ template NDArray IndexSelect<kDLCPU, double, int32_t>(NDArray, IdArray);
|
||||
template NDArray IndexSelect<kDLCPU, double, int64_t>(NDArray, IdArray);
|
||||
|
||||
template <DLDeviceType XPU, typename DType>
|
||||
DType IndexSelect(NDArray array, uint64_t index) {
|
||||
DType IndexSelect(NDArray array, int64_t index) {
|
||||
const DType* data = static_cast<DType*>(array->data);
|
||||
return data[index];
|
||||
}
|
||||
|
||||
template int32_t IndexSelect<kDLCPU, int32_t>(NDArray array, uint64_t index);
|
||||
template int64_t IndexSelect<kDLCPU, int64_t>(NDArray array, uint64_t index);
|
||||
template uint32_t IndexSelect<kDLCPU, uint32_t>(NDArray array, uint64_t index);
|
||||
template uint64_t IndexSelect<kDLCPU, uint64_t>(NDArray array, uint64_t index);
|
||||
template float IndexSelect<kDLCPU, float>(NDArray array, uint64_t index);
|
||||
template double IndexSelect<kDLCPU, double>(NDArray array, uint64_t index);
|
||||
template int32_t IndexSelect<kDLCPU, int32_t>(NDArray array, int64_t index);
|
||||
template int64_t IndexSelect<kDLCPU, int64_t>(NDArray array, int64_t index);
|
||||
template float IndexSelect<kDLCPU, float>(NDArray array, int64_t index);
|
||||
template double IndexSelect<kDLCPU, double>(NDArray array, int64_t index);
|
||||
|
||||
}; // namespace impl
|
||||
|
||||
}; // namespace aten
|
||||
|
||||
}; // namespace dgl
|
||||
} // namespace impl
|
||||
} // namespace aten
|
||||
} // namespace dgl
|
||||
|
||||
@@ -76,8 +76,6 @@ template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, int64_t>(NDArray, in
|
||||
template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, float>(NDArray, float);
|
||||
template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, double>(NDArray, double);
|
||||
|
||||
}; // namespace impl
|
||||
|
||||
}; // namespace aten
|
||||
|
||||
}; // namespace dgl
|
||||
} // namespace impl
|
||||
} // namespace aten
|
||||
} // namespace dgl
|
||||
|
||||
@@ -6,12 +6,12 @@
|
||||
#ifndef DGL_ARRAY_CPU_ARRAY_UTILS_H_
|
||||
#define DGL_ARRAY_CPU_ARRAY_UTILS_H_
|
||||
|
||||
#include <dgl/array.h>
|
||||
#include <dgl/aten/types.h>
|
||||
#include <parallel_hashmap/phmap.h>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include "../../c_api_common.h"
|
||||
#include "../third_party/phmap/parallel_hashmap/phmap.h"
|
||||
|
||||
namespace dgl {
|
||||
namespace aten {
|
||||
|
||||
@@ -10,37 +10,181 @@
|
||||
#include <numeric>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <iterator>
|
||||
#include <tuple>
|
||||
|
||||
namespace {
|
||||
|
||||
/*!
 * \brief Proxy that refers to one (row, col, data) entry stored across three
 * separate arrays.
 *
 * Assigning to a TupleRef writes through the three pointers; converting it to
 * a std::tuple reads the three pointed-to values.
 */
template <typename IdType>
struct TupleRef {
  TupleRef() = delete;
  TupleRef(const TupleRef& other) = default;
  TupleRef(TupleRef&& other) = default;
  TupleRef(IdType *const r, IdType *const c, IdType *const d)
    : row(r), col(c), data(d) {}

  /*! \brief Copy the values referenced by \a other into the referenced slots. */
  TupleRef& operator=(const TupleRef& other) {
    std::tie(*row, *col, *data) =
        std::tie(*other.row, *other.col, *other.data);
    return *this;
  }

  /*! \brief Store the three values of \a val into the referenced slots. */
  TupleRef& operator=(const std::tuple<IdType, IdType, IdType>& val) {
    std::tie(*row, *col, *data) = val;
    return *this;
  }

  /*! \brief Read the referenced entry as a value tuple. */
  operator std::tuple<IdType, IdType, IdType>() const {
    return std::tuple<IdType, IdType, IdType>(*row, *col, *data);
  }

  /*! \brief Exchange the referenced values with those referenced by \a other. */
  void Swap(const TupleRef& other) const {
    std::iter_swap(row, other.row);
    std::iter_swap(col, other.col);
    std::iter_swap(data, other.data);
  }

  IdType *row, *col, *data;
};
|
||||
|
||||
using std::swap;
|
||||
template <typename IdType>
|
||||
void swap(const TupleRef<IdType>& r1, const TupleRef<IdType>& r2) {
|
||||
r1.Swap(r2);
|
||||
}
|
||||
|
||||
template <typename IdType>
|
||||
struct CooIterator : public std::iterator<std::random_access_iterator_tag,
|
||||
std::tuple<IdType, IdType, IdType>,
|
||||
std::ptrdiff_t,
|
||||
std::tuple<IdType*, IdType*, IdType*>,
|
||||
TupleRef<IdType>> {
|
||||
CooIterator() = default;
|
||||
CooIterator(const CooIterator& other) = default;
|
||||
CooIterator(CooIterator&& other) = default;
|
||||
CooIterator(IdType *r, IdType *c, IdType *d): row(r), col(c), data(d) {}
|
||||
|
||||
CooIterator& operator=(const CooIterator& other) = default;
|
||||
CooIterator& operator=(CooIterator&& other) = default;
|
||||
~CooIterator() = default;
|
||||
|
||||
bool operator==(const CooIterator& other) const {
|
||||
return row == other.row;
|
||||
}
|
||||
|
||||
bool operator!=(const CooIterator& other) const {
|
||||
return row != other.row;
|
||||
}
|
||||
|
||||
bool operator<(const CooIterator& other) const {
|
||||
return row < other.row;
|
||||
}
|
||||
|
||||
bool operator>(const CooIterator& other) const {
|
||||
return row > other.row;
|
||||
}
|
||||
|
||||
bool operator<=(const CooIterator& other) const {
|
||||
return row <= other.row;
|
||||
}
|
||||
|
||||
bool operator>=(const CooIterator& other) const {
|
||||
return row >= other.row;
|
||||
}
|
||||
|
||||
CooIterator& operator+=(const std::ptrdiff_t& movement) {
|
||||
row += movement;
|
||||
col += movement;
|
||||
data += movement;
|
||||
return *this;
|
||||
}
|
||||
|
||||
CooIterator& operator-=(const std::ptrdiff_t& movement) {
|
||||
row -= movement;
|
||||
col -= movement;
|
||||
data -= movement;
|
||||
return *this;
|
||||
}
|
||||
|
||||
CooIterator& operator++() {
|
||||
return operator+=(1);
|
||||
}
|
||||
|
||||
CooIterator& operator--() {
|
||||
return operator-=(1);
|
||||
}
|
||||
|
||||
CooIterator operator++(int) {
|
||||
CooIterator ret(*this);
|
||||
operator++();
|
||||
return ret;
|
||||
}
|
||||
|
||||
CooIterator operator--(int) {
|
||||
CooIterator ret(*this);
|
||||
operator--();
|
||||
return ret;
|
||||
}
|
||||
|
||||
CooIterator operator+(const std::ptrdiff_t& movement) const {
|
||||
CooIterator ret(*this);
|
||||
ret += movement;
|
||||
return ret;
|
||||
}
|
||||
|
||||
CooIterator operator-(const std::ptrdiff_t& movement) const {
|
||||
CooIterator ret(*this);
|
||||
ret -= movement;
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::ptrdiff_t operator-(const CooIterator& other) const {
|
||||
return row - other.row;
|
||||
}
|
||||
|
||||
TupleRef<IdType> operator*() const {
|
||||
return TupleRef<IdType>(row, col, data);
|
||||
}
|
||||
TupleRef<IdType> operator*() {
|
||||
return TupleRef<IdType>(row, col, data);
|
||||
}
|
||||
|
||||
IdType *row, *col, *data;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace dgl {
|
||||
namespace aten {
|
||||
namespace impl {
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
COOMatrix COOSort(COOMatrix coo, bool sort_column) {
|
||||
const int64_t nnz = coo.row->shape[0];
|
||||
const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
|
||||
const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
|
||||
///////////////////////////// COOSort_ /////////////////////////////
|
||||
|
||||
// Argsort
|
||||
IdArray new_row = IdArray::Empty({nnz}, coo.row->dtype, coo.row->ctx);
|
||||
IdArray new_col = IdArray::Empty({nnz}, coo.col->dtype, coo.col->ctx);
|
||||
IdArray new_idx = IdArray::Empty({nnz}, coo.row->dtype, coo.row->ctx);
|
||||
IdType* new_row_data = static_cast<IdType*>(new_row->data);
|
||||
IdType* new_col_data = static_cast<IdType*>(new_col->data);
|
||||
IdType* new_idx_data = static_cast<IdType*>(new_idx->data);
|
||||
std::iota(new_idx_data, new_idx_data + nnz, 0);
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
void COOSort_(COOMatrix* coo, bool sort_column) {
|
||||
const int64_t nnz = coo->row->shape[0];
|
||||
IdType* coo_row = coo->row.Ptr<IdType>();
|
||||
IdType* coo_col = coo->col.Ptr<IdType>();
|
||||
if (!COOHasData(*coo))
|
||||
coo->data = aten::Range(0, nnz, coo->row->dtype.bits, coo->row->ctx);
|
||||
IdType* coo_data = coo->data.Ptr<IdType>();
|
||||
|
||||
typedef std::tuple<IdType, IdType, IdType> Tuple;
|
||||
|
||||
// Arg sort
|
||||
if (sort_column) {
|
||||
#ifdef PARALLEL_ALGORITHMS
|
||||
__gnu_parallel::sort(
|
||||
#else
|
||||
std::sort(
|
||||
#endif
|
||||
new_idx_data,
|
||||
new_idx_data + nnz,
|
||||
[coo_row_data, coo_col_data](const IdType a, const IdType b) {
|
||||
return (coo_row_data[a] != coo_row_data[b]) ?
|
||||
(coo_row_data[a] < coo_row_data[b]) :
|
||||
(coo_col_data[a] < coo_col_data[b]);
|
||||
CooIterator<IdType>(coo_row, coo_col, coo_data),
|
||||
CooIterator<IdType>(coo_row, coo_col, coo_data) + nnz,
|
||||
[](const Tuple& a, const Tuple& b) {
|
||||
return (std::get<0>(a) != std::get<0>(b)) ?
|
||||
(std::get<0>(a) < std::get<0>(b)) : (std::get<1>(a) < std::get<1>(b));
|
||||
});
|
||||
} else {
|
||||
#ifdef PARALLEL_ALGORITHMS
|
||||
@@ -48,39 +192,41 @@ COOMatrix COOSort(COOMatrix coo, bool sort_column) {
|
||||
#else
|
||||
std::sort(
|
||||
#endif
|
||||
new_idx_data,
|
||||
new_idx_data + nnz,
|
||||
[coo_row_data](const IdType a, const IdType b) {
|
||||
return coo_row_data[a] < coo_row_data[b];
|
||||
CooIterator<IdType>(coo_row, coo_col, coo_data),
|
||||
CooIterator<IdType>(coo_row, coo_col, coo_data) + nnz,
|
||||
[](const Tuple& a, const Tuple& b) {
|
||||
return std::get<0>(a) < std::get<0>(b);
|
||||
});
|
||||
}
|
||||
|
||||
// Reorder according to shuffle
|
||||
#pragma omp parallel for
|
||||
for (IdType i = 0; i < nnz; ++i) {
|
||||
new_row_data[i] = coo_row_data[new_idx_data[i]];
|
||||
new_col_data[i] = coo_col_data[new_idx_data[i]];
|
||||
}
|
||||
|
||||
if (COOHasData(coo)) {
|
||||
const IdType* coo_data_data = static_cast<IdType*>(coo.data->data);
|
||||
IdArray new_data = IdArray::Empty({nnz}, coo.row->dtype, coo.row->ctx);
|
||||
IdType* new_data_data = static_cast<IdType*>(new_data->data);
|
||||
#pragma omp parallel for
|
||||
for (IdType i = 0; i < nnz; ++i) {
|
||||
new_data_data[i] = coo_data_data[new_idx_data[i]];
|
||||
}
|
||||
|
||||
new_idx = new_data;
|
||||
}
|
||||
|
||||
return COOMatrix{
|
||||
coo.num_rows, coo.num_cols, std::move(new_row), std::move(new_col),
|
||||
std::move(new_idx), true, sort_column};
|
||||
coo->row_sorted = true;
|
||||
coo->col_sorted = sort_column;
|
||||
}
|
||||
|
||||
template COOMatrix COOSort<kDLCPU, int32_t>(COOMatrix, bool);
|
||||
template COOMatrix COOSort<kDLCPU, int64_t>(COOMatrix, bool);
|
||||
template void COOSort_<kDLCPU, int32_t>(COOMatrix*, bool);
|
||||
template void COOSort_<kDLCPU, int64_t>(COOMatrix*, bool);
|
||||
|
||||
|
||||
///////////////////////////// COOIsSorted /////////////////////////////
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
|
||||
const int64_t nnz = coo.row->shape[0];
|
||||
IdType* row = coo.row.Ptr<IdType>();
|
||||
IdType* col = coo.col.Ptr<IdType>();
|
||||
bool row_sorted = true;
|
||||
bool col_sorted = true;
|
||||
for (int64_t i = 1; row_sorted && i < nnz; ++i) {
|
||||
row_sorted = (row[i - 1] <= row[i]);
|
||||
col_sorted = col_sorted && (row[i - 1] < row[i] || col[i - 1] <= col[i]);
|
||||
}
|
||||
if (!row_sorted)
|
||||
col_sorted = false;
|
||||
return {row_sorted, col_sorted};
|
||||
}
|
||||
|
||||
template std::pair<bool, bool> COOIsSorted<kDLCPU, int32_t>(COOMatrix coo);
|
||||
template std::pair<bool, bool> COOIsSorted<kDLCPU, int64_t>(COOMatrix coo);
|
||||
|
||||
} // namespace impl
|
||||
} // namespace aten
|
||||
|
||||
83
src/array/cpu/csr_sort.cc
Normal file
83
src/array/cpu/csr_sort.cc
Normal file
@@ -0,0 +1,83 @@
|
||||
/*!
|
||||
* Copyright (c) 2020 by Contributors
|
||||
* \file array/cpu/csr_sort.cc
|
||||
* \brief CSR sorting
|
||||
*/
|
||||
#include <dgl/array.h>
|
||||
#include <numeric>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
namespace dgl {
|
||||
namespace aten {
|
||||
namespace impl {
|
||||
|
||||
///////////////////////////// CSRIsSorted /////////////////////////////
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
bool CSRIsSorted(CSRMatrix csr) {
|
||||
const IdType* indptr = csr.indptr.Ptr<IdType>();
|
||||
const IdType* indices = csr.indices.Ptr<IdType>();
|
||||
bool ret = true;
|
||||
#pragma omp parallel for shared(ret)
|
||||
for (int64_t row = 0; row < csr.num_rows; ++row) {
|
||||
if (!ret)
|
||||
continue;
|
||||
for (IdType i = indptr[row] + 1; i < indptr[row + 1]; ++i) {
|
||||
if (indices[i - 1] > indices[i]) {
|
||||
ret = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
template bool CSRIsSorted<kDLCPU, int64_t>(CSRMatrix csr);
|
||||
template bool CSRIsSorted<kDLCPU, int32_t>(CSRMatrix csr);
|
||||
|
||||
///////////////////////////// CSRSort /////////////////////////////
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
void CSRSort_(CSRMatrix* csr) {
|
||||
typedef std::pair<IdType, IdType> ShufflePair;
|
||||
const int64_t num_rows = csr->num_rows;
|
||||
const int64_t nnz = csr->indices->shape[0];
|
||||
const IdType* indptr_data = static_cast<IdType*>(csr->indptr->data);
|
||||
IdType* indices_data = static_cast<IdType*>(csr->indices->data);
|
||||
if (!CSRHasData(*csr)) {
|
||||
csr->data = aten::Range(0, nnz, csr->indptr->dtype.bits, csr->indptr->ctx);
|
||||
}
|
||||
IdType* eid_data = static_cast<IdType*>(csr->data->data);
|
||||
#pragma omp parallel
|
||||
{
|
||||
std::vector<ShufflePair> reorder_vec;
|
||||
#pragma omp for
|
||||
for (int64_t row = 0; row < num_rows; row++) {
|
||||
const int64_t num_cols = indptr_data[row + 1] - indptr_data[row];
|
||||
IdType *col = indices_data + indptr_data[row];
|
||||
IdType *eid = eid_data + indptr_data[row];
|
||||
|
||||
reorder_vec.resize(num_cols);
|
||||
for (int64_t i = 0; i < num_cols; i++) {
|
||||
reorder_vec[i].first = col[i];
|
||||
reorder_vec[i].second = eid[i];
|
||||
}
|
||||
std::sort(reorder_vec.begin(), reorder_vec.end(),
|
||||
[](const ShufflePair &e1, const ShufflePair &e2) {
|
||||
return e1.first < e2.first;
|
||||
});
|
||||
for (int64_t i = 0; i < num_cols; i++) {
|
||||
col[i] = reorder_vec[i].first;
|
||||
eid[i] = reorder_vec[i].second;
|
||||
}
|
||||
}
|
||||
}
|
||||
csr->sorted = true;
|
||||
}
|
||||
|
||||
template void CSRSort_<kDLCPU, int64_t>(CSRMatrix* csr);
|
||||
template void CSRSort_<kDLCPU, int32_t>(CSRMatrix* csr);
|
||||
|
||||
} // namespace impl
|
||||
} // namespace aten
|
||||
} // namespace dgl
|
||||
@@ -377,7 +377,9 @@ COOMatrix CSRToCOO(CSRMatrix csr) {
|
||||
ret_row_data + indptr_data[i + 1],
|
||||
i);
|
||||
}
|
||||
return COOMatrix{csr.num_rows, csr.num_cols, ret_row, csr.indices, csr.data};
|
||||
return COOMatrix(csr.num_rows, csr.num_cols,
|
||||
ret_row, csr.indices, csr.data,
|
||||
true, csr.sorted);
|
||||
}
|
||||
|
||||
template COOMatrix CSRToCOO<kDLCPU, int32_t>(CSRMatrix csr);
|
||||
@@ -543,49 +545,6 @@ template CSRMatrix CSRSliceMatrix<kDLCPU, int32_t>(
|
||||
template CSRMatrix CSRSliceMatrix<kDLCPU, int64_t>(
|
||||
CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols);
|
||||
|
||||
///////////////////////////// CSRSort /////////////////////////////
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
void CSRSort_(CSRMatrix* csr) {
|
||||
typedef std::pair<IdType, IdType> ShufflePair;
|
||||
const int64_t num_rows = csr->num_rows;
|
||||
const int64_t nnz = csr->indices->shape[0];
|
||||
const IdType* indptr_data = static_cast<IdType*>(csr->indptr->data);
|
||||
IdType* indices_data = static_cast<IdType*>(csr->indices->data);
|
||||
if (!CSRHasData(*csr)) {
|
||||
csr->data = aten::Range(0, nnz, csr->indptr->dtype.bits, csr->indptr->ctx);
|
||||
}
|
||||
IdType* eid_data = static_cast<IdType*>(csr->data->data);
|
||||
#pragma omp parallel
|
||||
{
|
||||
std::vector<ShufflePair> reorder_vec;
|
||||
#pragma omp for
|
||||
for (int64_t row = 0; row < num_rows; row++) {
|
||||
const int64_t num_cols = indptr_data[row + 1] - indptr_data[row];
|
||||
IdType *col = indices_data + indptr_data[row];
|
||||
IdType *eid = eid_data + indptr_data[row];
|
||||
|
||||
reorder_vec.resize(num_cols);
|
||||
for (int64_t i = 0; i < num_cols; i++) {
|
||||
reorder_vec[i].first = col[i];
|
||||
reorder_vec[i].second = eid[i];
|
||||
}
|
||||
std::sort(reorder_vec.begin(), reorder_vec.end(),
|
||||
[](const ShufflePair &e1, const ShufflePair &e2) {
|
||||
return e1.first < e2.first;
|
||||
});
|
||||
for (int64_t i = 0; i < num_cols; i++) {
|
||||
col[i] = reorder_vec[i].first;
|
||||
eid[i] = reorder_vec[i].second;
|
||||
}
|
||||
}
|
||||
}
|
||||
csr->sorted = true;
|
||||
}
|
||||
|
||||
template void CSRSort_<kDLCPU, int64_t>(CSRMatrix* csr);
|
||||
template void CSRSort_<kDLCPU, int32_t>(CSRMatrix* csr);
|
||||
|
||||
///////////////////////////// CSRReorder /////////////////////////////
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
|
||||
@@ -3,10 +3,10 @@
|
||||
* \file array/cpu/spmat_op_impl.cc
|
||||
* \brief CPU implementation of COO sparse matrix operators
|
||||
*/
|
||||
#include <dgl/array.h>
|
||||
#include <vector>
|
||||
#include <unordered_set>
|
||||
#include <unordered_map>
|
||||
#include <tuple>
|
||||
#include "array_utils.h"
|
||||
|
||||
namespace dgl {
|
||||
@@ -266,29 +266,57 @@ CSRMatrix COOToCSR(COOMatrix coo) {
|
||||
const IdType* row_data = static_cast<IdType*>(coo.row->data);
|
||||
const IdType* col_data = static_cast<IdType*>(coo.col->data);
|
||||
const IdType* data = COOHasData(coo)? static_cast<IdType*>(coo.data->data) : nullptr;
|
||||
|
||||
NDArray ret_indptr = NDArray::Empty({N + 1}, coo.row->dtype, coo.row->ctx);
|
||||
NDArray ret_indices;
|
||||
NDArray ret_data;
|
||||
|
||||
IdType* Bp = static_cast<IdType*>(ret_indptr->data);
|
||||
|
||||
std::fill(Bp, Bp + N, 0);
|
||||
for (int64_t i = 0; i < NNZ; ++i) {
|
||||
Bp[row_data[i]]++;
|
||||
bool row_sorted = coo.row_sorted;
|
||||
bool col_sorted = coo.col_sorted;
|
||||
if (!row_sorted) {
|
||||
// It is possible that the flag is simply not set (default value is false),
|
||||
// so we still perform a linear scan to check the flag.
|
||||
std::tie(row_sorted, col_sorted) = COOIsSorted(coo);
|
||||
}
|
||||
|
||||
// cumsum
|
||||
for (int64_t i = 0, cumsum = 0; i < N; ++i) {
|
||||
const IdType temp = Bp[i];
|
||||
Bp[i] = cumsum;
|
||||
cumsum += temp;
|
||||
}
|
||||
Bp[N] = NNZ;
|
||||
if (row_sorted) {
|
||||
// compute indptr
|
||||
IdType* Bp = static_cast<IdType*>(ret_indptr->data);
|
||||
Bp[0] = 0;
|
||||
int64_t j = 0;
|
||||
for (int64_t i = 0; i < N; ++i) {
|
||||
const int64_t k = j;
|
||||
for (; j < NNZ && row_data[j] == i; ++j) {}
|
||||
Bp[i + 1] = Bp[i] + j - k;
|
||||
}
|
||||
|
||||
if (coo.row_sorted == true) {
|
||||
// TODO(minjie): Many of our current implementation assumes that CSR must have
|
||||
// a data array. This is a temporary workaround. Remove this after:
|
||||
// - The old immutable graph implementation is deprecated.
|
||||
// - The old binary reduce kernel is deprecated.
|
||||
if (!COOHasData(coo))
|
||||
coo.data = aten::Range(0, NNZ, coo.row->dtype.bits, coo.row->ctx);
|
||||
|
||||
// compute indices and data
|
||||
ret_indices = coo.col;
|
||||
ret_data = coo.data;
|
||||
} else {
|
||||
// compute indptr
|
||||
IdType* Bp = static_cast<IdType*>(ret_indptr->data);
|
||||
std::fill(Bp, Bp + N, 0);
|
||||
for (int64_t i = 0; i < NNZ; ++i) {
|
||||
Bp[row_data[i]]++;
|
||||
}
|
||||
|
||||
// cumsum
|
||||
for (int64_t i = 0, cumsum = 0; i < N; ++i) {
|
||||
const IdType temp = Bp[i];
|
||||
Bp[i] = cumsum;
|
||||
cumsum += temp;
|
||||
}
|
||||
Bp[N] = NNZ;
|
||||
|
||||
// compute indices and data
|
||||
ret_indices = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
|
||||
ret_data = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
|
||||
IdType* Bi = static_cast<IdType*>(ret_indices->data);
|
||||
@@ -311,7 +339,7 @@ CSRMatrix COOToCSR(COOMatrix coo) {
|
||||
|
||||
return CSRMatrix(coo.num_rows, coo.num_cols,
|
||||
ret_indptr, ret_indices, ret_data,
|
||||
coo.col_sorted);
|
||||
col_sorted);
|
||||
}
|
||||
|
||||
template CSRMatrix COOToCSR<kDLCPU, int32_t>(COOMatrix coo);
|
||||
@@ -439,7 +467,6 @@ COOMatrix COOReorder(COOMatrix coo, runtime::NDArray new_row_id_arr,
|
||||
// Input COO
|
||||
const IdType* in_rows = static_cast<IdType*>(coo.row->data);
|
||||
const IdType* in_cols = static_cast<IdType*>(coo.col->data);
|
||||
const IdType* in_data = COOHasData(coo) ? static_cast<IdType*>(coo.data->data) : nullptr;
|
||||
int64_t num_rows = coo.num_rows;
|
||||
int64_t num_cols = coo.num_cols;
|
||||
int64_t nnz = coo.row->shape[0];
|
||||
|
||||
51
src/array/cuda/array_cumsum.cu
Normal file
51
src/array/cuda/array_cumsum.cu
Normal file
@@ -0,0 +1,51 @@
|
||||
/*!
|
||||
* Copyright (c) 2020 by Contributors
|
||||
* \file array/cuda/array_cumsum.cu
|
||||
* \brief Array cumsum GPU implementation
|
||||
*/
|
||||
#include <dgl/array.h>
|
||||
#include <cub/cub.cuh>
|
||||
#include "../../runtime/cuda/cuda_common.h"
|
||||
#include "./utils.h"
|
||||
|
||||
namespace dgl {
|
||||
using runtime::NDArray;
|
||||
namespace aten {
|
||||
namespace impl {
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
IdArray CumSum(IdArray array, bool prepend_zero) {
|
||||
const int64_t len = array.NumElements();
|
||||
if (len == 0)
|
||||
return array;
|
||||
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
|
||||
auto device = runtime::DeviceAPI::Get(array->ctx);
|
||||
const IdType* in_d = array.Ptr<IdType>();
|
||||
IdArray ret;
|
||||
IdType* out_d = nullptr;
|
||||
if (prepend_zero) {
|
||||
ret = aten::Full(0, len + 1, array->dtype.bits, array->ctx);
|
||||
out_d = ret.Ptr<IdType>() + 1;
|
||||
} else {
|
||||
ret = aten::NewIdArray(len, array->ctx, array->dtype.bits);
|
||||
out_d = ret.Ptr<IdType>();
|
||||
}
|
||||
// Allocate workspace
|
||||
size_t workspace_size = 0;
|
||||
cub::DeviceScan::InclusiveSum(nullptr, workspace_size, in_d, out_d, len, thr_entry->stream);
|
||||
void* workspace = device->AllocWorkspace(array->ctx, workspace_size);
|
||||
|
||||
// Compute cumsum
|
||||
cub::DeviceScan::InclusiveSum(workspace, workspace_size, in_d, out_d, len, thr_entry->stream);
|
||||
|
||||
device->FreeWorkspace(array->ctx, workspace);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
template IdArray CumSum<kDLGPU, int32_t>(IdArray, bool);
|
||||
template IdArray CumSum<kDLGPU, int64_t>(IdArray, bool);
|
||||
|
||||
} // namespace impl
|
||||
} // namespace aten
|
||||
} // namespace dgl
|
||||
@@ -5,7 +5,7 @@
|
||||
*/
|
||||
#include <dgl/array.h>
|
||||
#include "../../runtime/cuda/cuda_common.h"
|
||||
#include "../../cuda_utils.h"
|
||||
#include "./utils.h"
|
||||
|
||||
namespace dgl {
|
||||
using runtime::NDArray;
|
||||
@@ -50,7 +50,7 @@ template NDArray IndexSelect<kDLGPU, double, int32_t>(NDArray, IdArray);
|
||||
template NDArray IndexSelect<kDLGPU, double, int64_t>(NDArray, IdArray);
|
||||
|
||||
template <DLDeviceType XPU, typename DType>
|
||||
DType IndexSelect(NDArray array, uint64_t index) {
|
||||
DType IndexSelect(NDArray array, int64_t index) {
|
||||
auto device = runtime::DeviceAPI::Get(array->ctx);
|
||||
DType ret = 0;
|
||||
device->CopyDataFromTo(
|
||||
@@ -60,12 +60,12 @@ DType IndexSelect(NDArray array, uint64_t index) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
template int32_t IndexSelect<kDLGPU, int32_t>(NDArray array, uint64_t index);
|
||||
template int64_t IndexSelect<kDLGPU, int64_t>(NDArray array, uint64_t index);
|
||||
template uint32_t IndexSelect<kDLGPU, uint32_t>(NDArray array, uint64_t index);
|
||||
template uint64_t IndexSelect<kDLGPU, uint64_t>(NDArray array, uint64_t index);
|
||||
template float IndexSelect<kDLGPU, float>(NDArray array, uint64_t index);
|
||||
template double IndexSelect<kDLGPU, double>(NDArray array, uint64_t index);
|
||||
template int32_t IndexSelect<kDLGPU, int32_t>(NDArray array, int64_t index);
|
||||
template int64_t IndexSelect<kDLGPU, int64_t>(NDArray array, int64_t index);
|
||||
template uint32_t IndexSelect<kDLGPU, uint32_t>(NDArray array, int64_t index);
|
||||
template uint64_t IndexSelect<kDLGPU, uint64_t>(NDArray array, int64_t index);
|
||||
template float IndexSelect<kDLGPU, float>(NDArray array, int64_t index);
|
||||
template double IndexSelect<kDLGPU, double>(NDArray array, int64_t index);
|
||||
|
||||
} // namespace impl
|
||||
} // namespace aten
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
*/
|
||||
#include <dgl/array.h>
|
||||
#include "../../runtime/cuda/cuda_common.h"
|
||||
#include "../../cuda_utils.h"
|
||||
#include "./utils.h"
|
||||
#include "../arith.h"
|
||||
|
||||
namespace dgl {
|
||||
|
||||
@@ -17,63 +17,43 @@ template <DLDeviceType XPU, typename IdType>
|
||||
CSRMatrix COOToCSR(COOMatrix coo) {
|
||||
CHECK(sizeof(IdType) == 4) << "CUDA COOToCSR does not support int64.";
|
||||
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
|
||||
auto device = runtime::DeviceAPI::Get(coo.row->ctx);
|
||||
// allocate cusparse handle if needed
|
||||
if (!thr_entry->cusparse_handle) {
|
||||
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
|
||||
}
|
||||
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));
|
||||
|
||||
|
||||
NDArray row = coo.row, col = coo.col, data = coo.data;
|
||||
int32_t* row_ptr = static_cast<int32_t*>(row->data);
|
||||
int32_t* col_ptr = static_cast<int32_t*>(col->data);
|
||||
int32_t* data_ptr = aten::IsNullArray(data) ? nullptr : static_cast<int32_t*>(data->data);
|
||||
|
||||
if (!coo.row_sorted) {
|
||||
// make a copy of row and col because sort is done in-place
|
||||
row = row.CopyTo(row->ctx);
|
||||
col = col.CopyTo(col->ctx);
|
||||
row_ptr = static_cast<int32_t*>(row->data);
|
||||
col_ptr = static_cast<int32_t*>(col->data);
|
||||
if (aten::IsNullArray(data)) {
|
||||
// create the index array
|
||||
data = aten::Range(0, row->shape[0], row->dtype.bits, row->ctx);
|
||||
data_ptr = static_cast<int32_t*>(data->data);
|
||||
}
|
||||
// sort row
|
||||
size_t workspace_size = 0;
|
||||
CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt(
|
||||
thr_entry->cusparse_handle,
|
||||
coo.num_rows, coo.num_cols,
|
||||
row->shape[0],
|
||||
row_ptr,
|
||||
col_ptr,
|
||||
&workspace_size));
|
||||
void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
|
||||
CUSPARSE_CALL(cusparseXcoosortByRow(
|
||||
thr_entry->cusparse_handle,
|
||||
coo.num_rows, coo.num_cols,
|
||||
row->shape[0],
|
||||
row_ptr,
|
||||
col_ptr,
|
||||
data_ptr,
|
||||
workspace));
|
||||
device->FreeWorkspace(row->ctx, workspace);
|
||||
bool row_sorted = coo.row_sorted;
|
||||
bool col_sorted = coo.col_sorted;
|
||||
if (!row_sorted) {
|
||||
// It is possible that the flag is simply not set (default value is false),
|
||||
// so we still perform a linear scan to check the flag.
|
||||
std::tie(row_sorted, col_sorted) = COOIsSorted(coo);
|
||||
}
|
||||
if (!row_sorted) {
|
||||
coo = COOSort(coo);
|
||||
}
|
||||
|
||||
NDArray indptr = aten::NewIdArray(coo.num_rows + 1, row->ctx, row->dtype.bits);
|
||||
const int64_t nnz = coo.row->shape[0];
|
||||
// TODO(minjie): Many of our current implementation assumes that CSR must have
|
||||
// a data array. This is a temporary workaround. Remove this after:
|
||||
// - The old immutable graph implementation is deprecated.
|
||||
// - The old binary reduce kernel is deprecated.
|
||||
if (!COOHasData(coo))
|
||||
coo.data = aten::Range(0, nnz, coo.row->dtype.bits, coo.row->ctx);
|
||||
|
||||
NDArray indptr = aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits);
|
||||
int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
|
||||
CUSPARSE_CALL(cusparseXcoo2csr(
|
||||
thr_entry->cusparse_handle,
|
||||
row_ptr,
|
||||
row->shape[0],
|
||||
coo.row.Ptr<int32_t>(),
|
||||
nnz,
|
||||
coo.num_rows,
|
||||
indptr_ptr,
|
||||
CUSPARSE_INDEX_BASE_ZERO));
|
||||
|
||||
return CSRMatrix(coo.num_rows, coo.num_cols,
|
||||
indptr, col, data, false);
|
||||
indptr, coo.col, coo.data, col_sorted);
|
||||
}
|
||||
|
||||
template CSRMatrix COOToCSR<kDLGPU, int32_t>(COOMatrix coo);
|
||||
|
||||
@@ -1,108 +0,0 @@
|
||||
/*!
|
||||
* Copyright (c) 2020 by Contributors
|
||||
* \file array/cuda/coo_sort.cc
|
||||
* \brief Sort COO index
|
||||
*/
|
||||
#include <dgl/array.h>
|
||||
#include "../../runtime/cuda/cuda_common.h"
|
||||
|
||||
namespace dgl {
|
||||
|
||||
using runtime::NDArray;
|
||||
|
||||
namespace aten {
|
||||
namespace impl {
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
COOMatrix COOSort(COOMatrix coo, bool sort_column) {
|
||||
CHECK(sizeof(IdType) == 4) << "CUDA COOSort does not support int64.";
|
||||
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
|
||||
auto device = runtime::DeviceAPI::Get(coo.row->ctx);
|
||||
// allocate cusparse handle if needed
|
||||
if (!thr_entry->cusparse_handle) {
|
||||
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
|
||||
}
|
||||
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));
|
||||
|
||||
|
||||
NDArray row = coo.row.CopyTo(coo.row->ctx);
|
||||
NDArray col = coo.col.CopyTo(coo.col->ctx);
|
||||
NDArray data;
|
||||
if (aten::IsNullArray(coo.data)) {
|
||||
// create the index array
|
||||
data = aten::Range(0, row->shape[0], row->dtype.bits, row->ctx);
|
||||
} else {
|
||||
data = coo.data.CopyTo(coo.data->ctx);
|
||||
}
|
||||
int32_t* row_ptr = static_cast<int32_t*>(row->data);
|
||||
int32_t* col_ptr = static_cast<int32_t*>(col->data);
|
||||
int32_t* data_ptr = static_cast<int32_t*>(data->data);
|
||||
|
||||
// sort row
|
||||
size_t workspace_size = 0;
|
||||
CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt(
|
||||
thr_entry->cusparse_handle,
|
||||
coo.num_rows, coo.num_cols,
|
||||
row->shape[0],
|
||||
row_ptr,
|
||||
col_ptr,
|
||||
&workspace_size));
|
||||
void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
|
||||
CUSPARSE_CALL(cusparseXcoosortByRow(
|
||||
thr_entry->cusparse_handle,
|
||||
coo.num_rows, coo.num_cols,
|
||||
row->shape[0],
|
||||
row_ptr,
|
||||
col_ptr,
|
||||
data_ptr,
|
||||
workspace));
|
||||
device->FreeWorkspace(row->ctx, workspace);
|
||||
|
||||
if (sort_column) {
|
||||
// First create a row indptr array and then call csrsort
|
||||
int32_t* indptr = static_cast<int32_t*>(
|
||||
device->AllocWorkspace(row->ctx, (coo.num_rows + 1) * sizeof(IdType)));
|
||||
CUSPARSE_CALL(cusparseXcoo2csr(
|
||||
thr_entry->cusparse_handle,
|
||||
row_ptr,
|
||||
row->shape[0],
|
||||
coo.num_rows,
|
||||
indptr,
|
||||
CUSPARSE_INDEX_BASE_ZERO));
|
||||
CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
|
||||
thr_entry->cusparse_handle,
|
||||
coo.num_rows,
|
||||
coo.num_cols,
|
||||
row->shape[0],
|
||||
indptr,
|
||||
col_ptr,
|
||||
&workspace_size));
|
||||
void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
|
||||
cusparseMatDescr_t descr;
|
||||
CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
|
||||
CUSPARSE_CALL(cusparseXcsrsort(
|
||||
thr_entry->cusparse_handle,
|
||||
coo.num_rows,
|
||||
coo.num_cols,
|
||||
row->shape[0],
|
||||
descr,
|
||||
indptr,
|
||||
col_ptr,
|
||||
data_ptr,
|
||||
workspace));
|
||||
CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
|
||||
device->FreeWorkspace(row->ctx, workspace);
|
||||
device->FreeWorkspace(row->ctx, indptr);
|
||||
}
|
||||
|
||||
return COOMatrix(coo.num_rows, coo.num_cols,
|
||||
row, col, data, true, sort_column);
|
||||
}
|
||||
|
||||
template COOMatrix COOSort<kDLGPU, int32_t>(COOMatrix coo, bool sort_column);
|
||||
template COOMatrix COOSort<kDLGPU, int64_t>(COOMatrix coo, bool sort_column);
|
||||
|
||||
|
||||
} // namespace impl
|
||||
} // namespace aten
|
||||
} // namespace dgl
|
||||
158
src/array/cuda/coo_sort.cu
Normal file
158
src/array/cuda/coo_sort.cu
Normal file
@@ -0,0 +1,158 @@
|
||||
/*!
|
||||
* Copyright (c) 2020 by Contributors
|
||||
* \file array/cuda/coo_sort.cu
|
||||
* \brief Sort COO index
|
||||
*/
|
||||
#include <dgl/array.h>
|
||||
#include "../../runtime/cuda/cuda_common.h"
|
||||
#include "./utils.h"
|
||||
|
||||
namespace dgl {
|
||||
|
||||
using runtime::NDArray;
|
||||
|
||||
namespace aten {
|
||||
namespace impl {
|
||||
|
||||
///////////////////////////// COOSort_ /////////////////////////////
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
void COOSort_(COOMatrix* coo, bool sort_column) {
|
||||
// TODO(minjie): Current implementation is based on cusparse which only supports
|
||||
// int32_t. To support int64_t, we could use the Radix sort algorithm provided
|
||||
// by CUB.
|
||||
CHECK(sizeof(IdType) == 4) << "CUDA COOSort does not support int64.";
|
||||
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
|
||||
auto device = runtime::DeviceAPI::Get(coo->row->ctx);
|
||||
// allocate cusparse handle if needed
|
||||
if (!thr_entry->cusparse_handle) {
|
||||
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
|
||||
}
|
||||
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));
|
||||
|
||||
|
||||
NDArray row = coo->row;
|
||||
NDArray col = coo->col;
|
||||
if (!aten::COOHasData(*coo))
|
||||
coo->data = aten::Range(0, row->shape[0], row->dtype.bits, row->ctx);
|
||||
NDArray data = coo->data;
|
||||
int32_t* row_ptr = static_cast<int32_t*>(row->data);
|
||||
int32_t* col_ptr = static_cast<int32_t*>(col->data);
|
||||
int32_t* data_ptr = static_cast<int32_t*>(data->data);
|
||||
|
||||
// sort row
|
||||
size_t workspace_size = 0;
|
||||
CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt(
|
||||
thr_entry->cusparse_handle,
|
||||
coo->num_rows, coo->num_cols,
|
||||
row->shape[0],
|
||||
row_ptr,
|
||||
col_ptr,
|
||||
&workspace_size));
|
||||
void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
|
||||
CUSPARSE_CALL(cusparseXcoosortByRow(
|
||||
thr_entry->cusparse_handle,
|
||||
coo->num_rows, coo->num_cols,
|
||||
row->shape[0],
|
||||
row_ptr,
|
||||
col_ptr,
|
||||
data_ptr,
|
||||
workspace));
|
||||
device->FreeWorkspace(row->ctx, workspace);
|
||||
|
||||
if (sort_column) {
|
||||
// First create a row indptr array and then call csrsort
|
||||
int32_t* indptr = static_cast<int32_t*>(
|
||||
device->AllocWorkspace(row->ctx, (coo->num_rows + 1) * sizeof(IdType)));
|
||||
CUSPARSE_CALL(cusparseXcoo2csr(
|
||||
thr_entry->cusparse_handle,
|
||||
row_ptr,
|
||||
row->shape[0],
|
||||
coo->num_rows,
|
||||
indptr,
|
||||
CUSPARSE_INDEX_BASE_ZERO));
|
||||
CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
|
||||
thr_entry->cusparse_handle,
|
||||
coo->num_rows,
|
||||
coo->num_cols,
|
||||
row->shape[0],
|
||||
indptr,
|
||||
col_ptr,
|
||||
&workspace_size));
|
||||
void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
|
||||
cusparseMatDescr_t descr;
|
||||
CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
|
||||
CUSPARSE_CALL(cusparseXcsrsort(
|
||||
thr_entry->cusparse_handle,
|
||||
coo->num_rows,
|
||||
coo->num_cols,
|
||||
row->shape[0],
|
||||
descr,
|
||||
indptr,
|
||||
col_ptr,
|
||||
data_ptr,
|
||||
workspace));
|
||||
CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
|
||||
device->FreeWorkspace(row->ctx, workspace);
|
||||
device->FreeWorkspace(row->ctx, indptr);
|
||||
}
|
||||
|
||||
coo->row_sorted = true;
|
||||
coo->col_sorted = sort_column;
|
||||
}
|
||||
|
||||
template void COOSort_<kDLGPU, int32_t>(COOMatrix* coo, bool sort_column);
|
||||
template void COOSort_<kDLGPU, int64_t>(COOMatrix* coo, bool sort_column);
|
||||
|
||||
///////////////////////////// COOIsSorted /////////////////////////////
|
||||
|
||||
template <typename IdType>
|
||||
__global__ void _COOIsSortedKernel(
|
||||
const IdType* row, const IdType* col,
|
||||
int64_t nnz, int8_t* row_sorted, int8_t* col_sorted) {
|
||||
int tx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int stride_x = gridDim.x * blockDim.x;
|
||||
while (tx < nnz) {
|
||||
if (tx == 0) {
|
||||
row_sorted[0] = 1;
|
||||
col_sorted[0] = 1;
|
||||
} else {
|
||||
row_sorted[tx] = static_cast<int8_t>(row[tx - 1] <= row[tx]);
|
||||
col_sorted[tx] = static_cast<int8_t>(
|
||||
row[tx - 1] < row[tx] || col[tx - 1] <= col[tx]);
|
||||
}
|
||||
tx += stride_x;
|
||||
}
|
||||
}
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
|
||||
const int64_t nnz = coo.row->shape[0];
|
||||
const auto& ctx = coo.row->ctx;
|
||||
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
|
||||
auto device = runtime::DeviceAPI::Get(ctx);
|
||||
// We allocate a workspace of 2*nnz bytes. It wastes a little bit memory but should
|
||||
// be fine.
|
||||
int8_t* row_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
|
||||
int8_t* col_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
|
||||
const int nt = cuda::FindNumThreads(nnz);
|
||||
const int nb = (nnz + nt - 1) / nt;
|
||||
_COOIsSortedKernel<<<nb, nt, 0, thr_entry->stream>>>(
|
||||
coo.row.Ptr<IdType>(), coo.col.Ptr<IdType>(),
|
||||
nnz, row_flags, col_flags);
|
||||
|
||||
const bool row_sorted = cuda::AllTrue(row_flags, nnz, ctx);
|
||||
const bool col_sorted = row_sorted? cuda::AllTrue(col_flags, nnz, ctx) : false;
|
||||
|
||||
device->FreeWorkspace(ctx, row_flags);
|
||||
device->FreeWorkspace(ctx, col_flags);
|
||||
|
||||
return {row_sorted, col_sorted};
|
||||
}
|
||||
|
||||
template std::pair<bool, bool> COOIsSorted<kDLGPU, int32_t>(COOMatrix coo);
|
||||
template std::pair<bool, bool> COOIsSorted<kDLGPU, int64_t>(COOMatrix coo);
|
||||
|
||||
} // namespace impl
|
||||
} // namespace aten
|
||||
} // namespace dgl
|
||||
108
src/array/cuda/csr_sort.cu
Normal file
108
src/array/cuda/csr_sort.cu
Normal file
@@ -0,0 +1,108 @@
|
||||
/*!
|
||||
* Copyright (c) 2020 by Contributors
|
||||
* \file array/cuda/csr_sort.cu
|
||||
* \brief Sort CSR index
|
||||
*/
|
||||
#include <dgl/array.h>
|
||||
#include "../../runtime/cuda/cuda_common.h"
|
||||
#include "./utils.h"
|
||||
|
||||
namespace dgl {
|
||||
|
||||
using runtime::NDArray;
|
||||
|
||||
namespace aten {
|
||||
namespace impl {
|
||||
|
||||
/*!
|
||||
* \brief Check whether each row is sorted.
|
||||
*/
|
||||
template <typename IdType>
|
||||
__global__ void _SegmentIsSorted(
|
||||
const IdType* indptr, const IdType* indices,
|
||||
int64_t num_rows, int8_t* flags) {
|
||||
int tx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int stride_x = gridDim.x * blockDim.x;
|
||||
while (tx < num_rows) {
|
||||
bool f = true;
|
||||
for (IdType i = indptr[tx] + 1; f && i < indptr[tx + 1]; ++i) {
|
||||
f = (indices[i - 1] <= indices[i]);
|
||||
}
|
||||
flags[tx] = static_cast<int8_t>(f);
|
||||
tx += stride_x;
|
||||
}
|
||||
}
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
bool CSRIsSorted(CSRMatrix csr) {
|
||||
const auto& ctx = csr.indptr->ctx;
|
||||
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
|
||||
auto device = runtime::DeviceAPI::Get(ctx);
|
||||
// We allocate a workspace of num_rows bytes. It wastes a little bit memory but should
|
||||
// be fine.
|
||||
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
|
||||
const int nt = cuda::FindNumThreads(csr.num_rows);
|
||||
const int nb = (csr.num_rows + nt - 1) / nt;
|
||||
_SegmentIsSorted<<<nb, nt, 0, thr_entry->stream>>>(
|
||||
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
|
||||
csr.num_rows, flags);
|
||||
bool ret = cuda::AllTrue(flags, csr.num_rows, ctx);
|
||||
device->FreeWorkspace(ctx, flags);
|
||||
return ret;
|
||||
}
|
||||
|
||||
template bool CSRIsSorted<kDLGPU, int32_t>(CSRMatrix csr);
|
||||
template bool CSRIsSorted<kDLGPU, int64_t>(CSRMatrix csr);
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
void CSRSort_(CSRMatrix* csr) {
|
||||
CHECK(sizeof(IdType) == 4) << "CUDA CSRSort_ does not support int64.";
|
||||
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
|
||||
auto device = runtime::DeviceAPI::Get(csr->indptr->ctx);
|
||||
// allocate cusparse handle if needed
|
||||
if (!thr_entry->cusparse_handle) {
|
||||
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
|
||||
}
|
||||
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));
|
||||
|
||||
NDArray indptr = csr->indptr;
|
||||
NDArray indices = csr->indices;
|
||||
const auto& ctx = indptr->ctx;
|
||||
const int64_t nnz = indices->shape[0];
|
||||
if (!aten::CSRHasData(*csr))
|
||||
csr->data = aten::Range(0, nnz, indices->dtype.bits, ctx);
|
||||
NDArray data = csr->data;
|
||||
|
||||
size_t workspace_size = 0;
|
||||
CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
|
||||
thr_entry->cusparse_handle,
|
||||
csr->num_rows, csr->num_cols, nnz,
|
||||
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(),
|
||||
&workspace_size));
|
||||
void* workspace = device->AllocWorkspace(ctx, workspace_size);
|
||||
|
||||
cusparseMatDescr_t descr;
|
||||
CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
|
||||
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
|
||||
CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
|
||||
CUSPARSE_CALL(cusparseXcsrsort(
|
||||
thr_entry->cusparse_handle,
|
||||
csr->num_rows, csr->num_cols, nnz,
|
||||
descr,
|
||||
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(),
|
||||
data.Ptr<int32_t>(),
|
||||
workspace));
|
||||
|
||||
csr->sorted = true;
|
||||
|
||||
// free resources
|
||||
CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
|
||||
device->FreeWorkspace(ctx, workspace);
|
||||
}
|
||||
|
||||
template void CSRSort_<kDLGPU, int32_t>(CSRMatrix* csr);
|
||||
template void CSRSort_<kDLGPU, int64_t>(CSRMatrix* csr);
|
||||
|
||||
} // namespace impl
|
||||
} // namespace aten
|
||||
} // namespace dgl
|
||||
@@ -10,7 +10,7 @@
|
||||
#include "macro.cuh"
|
||||
#include "atomic.cuh"
|
||||
#include "functor.cuh"
|
||||
#include "../../cuda_utils.h"
|
||||
#include "./utils.h"
|
||||
#include "../../runtime/cuda/cuda_common.h"
|
||||
|
||||
namespace dgl {
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#include <unordered_set>
|
||||
#include <numeric>
|
||||
#include "../../runtime/cuda/cuda_common.h"
|
||||
#include "../../cuda_utils.h"
|
||||
#include "./utils.h"
|
||||
|
||||
namespace dgl {
|
||||
|
||||
@@ -17,8 +17,6 @@ using runtime::NDArray;
|
||||
namespace aten {
|
||||
namespace impl {
|
||||
|
||||
///////////////////////////// CSRIsNonZero /////////////////////////////
|
||||
|
||||
/*!
|
||||
* \brief Search adjacency list linearly for each (row, col) pair and
|
||||
* write the matched position in the indices array to the output.
|
||||
@@ -33,7 +31,7 @@ __global__ void _LinearSearchKernel(
|
||||
int64_t row_stride, int64_t col_stride,
|
||||
int64_t length, IdType* out) {
|
||||
int tx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int stride_x = gridDim.x * blockDim.x;
|
||||
const int stride_x = gridDim.x * blockDim.x;
|
||||
int rpos = tx, cpos = tx;
|
||||
while (tx < length) {
|
||||
out[tx] = -1;
|
||||
@@ -50,6 +48,8 @@ __global__ void _LinearSearchKernel(
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////// CSRIsNonZero /////////////////////////////
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
|
||||
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
|
||||
@@ -169,6 +169,88 @@ NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
|
||||
template NDArray CSRGetRowData<kDLGPU, int32_t>(CSRMatrix, int64_t);
|
||||
template NDArray CSRGetRowData<kDLGPU, int64_t>(CSRMatrix, int64_t);
|
||||
|
||||
///////////////////////////// CSRSliceRows /////////////////////////////
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
|
||||
const int64_t num_rows = end - start;
|
||||
const IdType st_pos = aten::IndexSelect<IdType>(csr.indptr, start);
|
||||
const IdType ed_pos = aten::IndexSelect<IdType>(csr.indptr, end);
|
||||
const IdType nnz = ed_pos - st_pos;
|
||||
IdArray ret_indptr = aten::IndexSelect(csr.indptr, start, end + 1) - st_pos;
|
||||
// indices and data can be view arrays
|
||||
IdArray ret_indices = csr.indices.CreateView(
|
||||
{nnz}, csr.indices->dtype, st_pos * sizeof(IdType));
|
||||
IdArray ret_data;
|
||||
if (CSRHasData(csr))
|
||||
ret_data = csr.data.CreateView({nnz}, csr.data->dtype, st_pos * sizeof(IdType));
|
||||
else
|
||||
ret_data = aten::Range(st_pos, ed_pos,
|
||||
csr.indptr->dtype.bits, csr.indptr->ctx);
|
||||
return CSRMatrix(num_rows, csr.num_cols,
|
||||
ret_indptr, ret_indices, ret_data,
|
||||
csr.sorted);
|
||||
}
|
||||
|
||||
template CSRMatrix CSRSliceRows<kDLGPU, int32_t>(CSRMatrix, int64_t, int64_t);
|
||||
template CSRMatrix CSRSliceRows<kDLGPU, int64_t>(CSRMatrix, int64_t, int64_t);
|
||||
|
||||
/*!
|
||||
* \brief Copy data segment to output buffers
|
||||
*
|
||||
* For the i^th row r = row[i], copy the data from indptr[r] ~ indptr[r+1]
|
||||
* to the out_data from out_indptr[i] ~ out_indptr[i+1]
|
||||
*
|
||||
* If the provided `data` array is nullptr, write the read index to the out_data.
|
||||
*
|
||||
*/
|
||||
template <typename IdType, typename DType>
|
||||
__global__ void _SegmentCopyKernel(
|
||||
const IdType* indptr, const DType* data,
|
||||
const IdType* row, int64_t row_stride, int64_t length,
|
||||
const IdType* out_indptr, DType* out_data) {
|
||||
int tx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int stride_x = gridDim.x * blockDim.x;
|
||||
int rpos = tx;
|
||||
while (tx < length) {
|
||||
const IdType r = row[rpos];
|
||||
DType* out_buf = out_data + out_indptr[tx];
|
||||
for (IdType i = indptr[r]; i < indptr[r + 1]; ++i) {
|
||||
*(out_buf++) = data? data[i] : i;
|
||||
}
|
||||
rpos += row_stride;
|
||||
tx += stride_x;
|
||||
}
|
||||
}
|
||||
|
||||
template <DLDeviceType XPU, typename IdType>
|
||||
CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
|
||||
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
|
||||
const int64_t len = rows->shape[0];
|
||||
IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true);
|
||||
const int64_t nnz = aten::IndexSelect<IdType>(ret_indptr, len);
|
||||
|
||||
const int nt = cuda::FindNumThreads(len);
|
||||
const int nb = (len + nt - 1) / nt;
|
||||
// Copy indices.
|
||||
IdArray ret_indices = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
|
||||
_SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
|
||||
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
|
||||
rows.Ptr<IdType>(), 1, len,
|
||||
ret_indptr.Ptr<IdType>(), ret_indices.Ptr<IdType>());
|
||||
// Copy data.
|
||||
IdArray ret_data = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
|
||||
_SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
|
||||
csr.indptr.Ptr<IdType>(), CSRHasData(csr)? csr.data.Ptr<IdType>() : nullptr,
|
||||
rows.Ptr<IdType>(), 1, len,
|
||||
ret_indptr.Ptr<IdType>(), ret_data.Ptr<IdType>());
|
||||
return CSRMatrix(len, csr.num_cols,
|
||||
ret_indptr, ret_indices, ret_data,
|
||||
csr.sorted);
|
||||
}
|
||||
|
||||
template CSRMatrix CSRSliceRows<kDLGPU, int32_t>(CSRMatrix , NDArray);
|
||||
template CSRMatrix CSRSliceRows<kDLGPU, int64_t>(CSRMatrix , NDArray);
|
||||
|
||||
} // namespace impl
|
||||
} // namespace aten
|
||||
|
||||
@@ -140,6 +140,7 @@ void CusparseCsrmm2(
|
||||
static_cast<int32_t*>(csr.indptr->data),
|
||||
static_cast<int32_t*>(csr.indices->data),
|
||||
B_data, n, &beta, trans_out, m));
|
||||
CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
|
||||
if (valptr)
|
||||
device->FreeWorkspace(ctx, valptr);
|
||||
// transpose the output matrix
|
||||
|
||||
@@ -9,8 +9,8 @@
|
||||
#include <dgl/bcast.h>
|
||||
#include "macro.cuh"
|
||||
#include "atomic.cuh"
|
||||
#include "../../cuda_utils.h"
|
||||
#include "../../runtime/cuda/cuda_common.h"
|
||||
#include "./utils.h"
|
||||
|
||||
namespace dgl {
|
||||
|
||||
|
||||
30
src/array/cuda/utils.cu
Normal file
30
src/array/cuda/utils.cu
Normal file
@@ -0,0 +1,30 @@
|
||||
/*!
|
||||
* Copyright (c) 2020 by Contributors
|
||||
* \file array/cuda/utils.cu
|
||||
* \brief Utilities for CUDA kernels.
|
||||
*/
|
||||
|
||||
#include "./utils.h"
|
||||
#include <cub/cub.cuh>
|
||||
#include "../../runtime/cuda/cuda_common.h"
|
||||
|
||||
namespace dgl {
|
||||
namespace cuda {
|
||||
|
||||
bool AllTrue(int8_t* flags, int64_t length, const DLContext& ctx) {
|
||||
auto device = runtime::DeviceAPI::Get(ctx);
|
||||
int8_t* rst = static_cast<int8_t*>(device->AllocWorkspace(ctx, 1));
|
||||
// Call CUB's reduction
|
||||
size_t workspace_size = 0;
|
||||
CUDA_CALL(cub::DeviceReduce::Min(nullptr, workspace_size, flags, rst, length));
|
||||
void* workspace = device->AllocWorkspace(ctx, workspace_size);
|
||||
CUDA_CALL(cub::DeviceReduce::Min(workspace, workspace_size, flags, rst, length));
|
||||
int8_t cpu_rst = 0;
|
||||
CUDA_CALL(cudaMemcpy(&cpu_rst, rst, 1, cudaMemcpyDeviceToHost));
|
||||
device->FreeWorkspace(ctx, workspace);
|
||||
device->FreeWorkspace(ctx, rst);
|
||||
return cpu_rst == 1;
|
||||
}
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace dgl
|
||||
@@ -1,12 +1,13 @@
|
||||
/*!
|
||||
* Copyright (c) 2020 by Contributors
|
||||
* \file cuda_utils.h
|
||||
* \file array/cuda/utils.h
|
||||
* \brief Utilities for CUDA kernels.
|
||||
*/
|
||||
#ifndef DGL_CUDA_UTILS_H_
|
||||
#define DGL_CUDA_UTILS_H_
|
||||
#ifndef DGL_ARRAY_CUDA_UTILS_H_
|
||||
#define DGL_ARRAY_CUDA_UTILS_H_
|
||||
|
||||
#include <dmlc/logging.h>
|
||||
#include <dlpack/dlpack.h>
|
||||
|
||||
namespace dgl {
|
||||
namespace cuda {
|
||||
@@ -68,7 +69,18 @@ __device__ __forceinline__ T _ldg(T* addr) {
|
||||
#endif
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief Return true if the given bool flag array is all true.
|
||||
* The input bool array is in int8_t type so it is aligned with byte address.
|
||||
*
|
||||
* \param flags The bool array.
|
||||
* \param length The length.
|
||||
* \param ctx Device context.
|
||||
* \return True if all the flags are true.
|
||||
*/
|
||||
bool AllTrue(int8_t* flags, int64_t length, const DLContext& ctx);
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace dgl
|
||||
|
||||
#endif // DGL_CUDA_UTILS_H_
|
||||
#endif // DGL_ARRAY_CUDA_UTILS_H_
|
||||
@@ -3,7 +3,6 @@
|
||||
* \file array/kernel.cc
|
||||
* \brief New kernels
|
||||
*/
|
||||
#include <dgl/array.h>
|
||||
#include <dgl/packed_func_ext.h>
|
||||
#include <dgl/base_heterograph.h>
|
||||
|
||||
|
||||
@@ -6,9 +6,9 @@
|
||||
#ifndef DGL_ARRAY_KERNEL_DECL_H_
|
||||
#define DGL_ARRAY_KERNEL_DECL_H_
|
||||
|
||||
#include <dgl/array.h>
|
||||
#include <dgl/bcast.h>
|
||||
#include <dgl/base_heterograph.h>
|
||||
#include <dgl/runtime/ndarray.h>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
namespace dgl {
|
||||
|
||||
|
||||
@@ -804,7 +804,7 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
|
||||
}
|
||||
}
|
||||
int msg_count = 0;
|
||||
for (int i = 0; i < remote_ids.size(); ++i) {
|
||||
for (size_t i = 0; i < remote_ids.size(); ++i) {
|
||||
if (remote_ids[i].size() != 0) {
|
||||
KVStoreMsg kv_msg;
|
||||
kv_msg.msg_type = MessageType::kPullMsg;
|
||||
@@ -827,9 +827,10 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
|
||||
}
|
||||
}
|
||||
char *return_data = new char[ID_size*row_size];
|
||||
const int64_t local_ids_size = local_ids.size();
|
||||
// Copy local data
|
||||
#pragma omp parallel for
|
||||
for (int64_t i = 0; i < local_ids.size(); ++i) {
|
||||
for (int64_t i = 0; i < local_ids_size; ++i) {
|
||||
CHECK_GE(ID_size*row_size, local_ids_orginal[i] * row_size + row_size);
|
||||
CHECK_GE(data_size, local_ids[i] * row_size + row_size);
|
||||
CHECK_GE(local_ids[i], 0);
|
||||
@@ -843,7 +844,7 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
|
||||
int64_t id_size = kv_msg->id.GetSize() / sizeof(int64_t);
|
||||
int part_id = kv_msg->rank / group_count;
|
||||
char* data_char = static_cast<char*>(kv_msg->data->data);
|
||||
for (size_t n = 0; n < id_size; ++n) {
|
||||
for (int64_t n = 0; n < id_size; ++n) {
|
||||
memcpy(return_data + remote_ids_original[part_id][n] * row_size,
|
||||
data_char + n * row_size,
|
||||
row_size);
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
#include <memory>
|
||||
#include "../../c_api_common.h"
|
||||
|
||||
using dgl::runtime::NDArray;
|
||||
|
||||
@@ -51,9 +51,11 @@ ToBlock(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes, bool includ
|
||||
const auto src_dst_types = graph->GetEndpointTypes(etype);
|
||||
const dgl_type_t srctype = src_dst_types.first;
|
||||
const dgl_type_t dsttype = src_dst_types.second;
|
||||
const EdgeArray edges = graph->InEdges(etype, rhs_nodes[dsttype]);
|
||||
lhs_node_mappings[srctype].Update(edges.src);
|
||||
edge_arrays[etype] = edges;
|
||||
if (!aten::IsNullArray(rhs_nodes[dsttype])) {
|
||||
const EdgeArray& edges = graph->Edges(etype);
|
||||
lhs_node_mappings[srctype].Update(edges.src);
|
||||
edge_arrays[etype] = edges;
|
||||
}
|
||||
}
|
||||
|
||||
const auto meta_graph = graph->meta_graph();
|
||||
@@ -75,11 +77,26 @@ ToBlock(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes, bool includ
|
||||
const dgl_type_t dsttype = src_dst_types.second;
|
||||
const IdHashMap<IdType> &lhs_map = lhs_node_mappings[srctype];
|
||||
const IdHashMap<IdType> &rhs_map = rhs_node_mappings[dsttype];
|
||||
rel_graphs.push_back(CreateFromCOO(
|
||||
2, lhs_map.Size(), rhs_map.Size(),
|
||||
lhs_map.Map(edge_arrays[etype].src, -1),
|
||||
rhs_map.Map(edge_arrays[etype].dst, -1)));
|
||||
induced_edges.push_back(edge_arrays[etype].id);
|
||||
if (rhs_map.Size() == 0) {
|
||||
// No rhs nodes are given for this edge type. Create an empty graph.
|
||||
rel_graphs.push_back(CreateFromCOO(
|
||||
2, lhs_map.Size(), rhs_map.Size(),
|
||||
aten::NullArray(), aten::NullArray()));
|
||||
induced_edges.push_back(aten::NullArray());
|
||||
} else {
|
||||
IdArray new_src = lhs_map.Map(edge_arrays[etype].src, -1);
|
||||
IdArray new_dst = rhs_map.Map(edge_arrays[etype].dst, -1);
|
||||
// Check whether there are unmapped IDs and raise error.
|
||||
for (int64_t i = 0; i < new_dst->shape[0]; ++i)
|
||||
CHECK_NE(new_dst.Ptr<IdType>()[i], -1)
|
||||
<< "Node " << edge_arrays[etype].dst.Ptr<IdType>()[i] << " does not exist"
|
||||
<< " in `rhs_nodes`. Argument `rhs_nodes` must contain all the edge"
|
||||
<< " destination nodes.";
|
||||
rel_graphs.push_back(CreateFromCOO(
|
||||
2, lhs_map.Size(), rhs_map.Size(),
|
||||
new_src, new_dst));
|
||||
induced_edges.push_back(edge_arrays[etype].id);
|
||||
}
|
||||
}
|
||||
|
||||
const HeteroGraphPtr new_graph = CreateHeteroGraph(
|
||||
|
||||
@@ -138,13 +138,7 @@ class UnitGraph::COO : public BaseHeteroGraph {
|
||||
COO CopyTo(const DLContext& ctx) const {
|
||||
if (Context() == ctx)
|
||||
return *this;
|
||||
|
||||
COO ret(
|
||||
meta_graph_,
|
||||
adj_.num_rows, adj_.num_cols,
|
||||
adj_.row.CopyTo(ctx),
|
||||
adj_.col.CopyTo(ctx));
|
||||
return ret;
|
||||
return COO(meta_graph_, adj_.CopyTo(ctx));
|
||||
}
|
||||
|
||||
bool IsMultigraph() const override {
|
||||
@@ -516,13 +510,7 @@ class UnitGraph::CSR : public BaseHeteroGraph {
|
||||
if (Context() == ctx) {
|
||||
return *this;
|
||||
} else {
|
||||
CSR ret(
|
||||
meta_graph_,
|
||||
adj_.num_rows, adj_.num_cols,
|
||||
adj_.indptr.CopyTo(ctx),
|
||||
adj_.indices.CopyTo(ctx),
|
||||
adj_.data.CopyTo(ctx));
|
||||
return ret;
|
||||
return CSR(meta_graph_, adj_.CopyTo(ctx));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1181,35 +1169,28 @@ HeteroGraphPtr UnitGraph::AsNumBits(HeteroGraphPtr g, uint8_t bits) {
|
||||
if (g->NumBits() == bits) {
|
||||
return g;
|
||||
} else {
|
||||
// TODO(minjie): since we don't have int32 operations,
|
||||
// we make sure that this graph (on CPU) has materialized CSR,
|
||||
// and then copy them to other context (usually GPU). This should
|
||||
// be fixed later.
|
||||
auto bg = std::dynamic_pointer_cast<UnitGraph>(g);
|
||||
CHECK_NOTNULL(bg);
|
||||
|
||||
CSRPtr new_incsr = CSRPtr(new CSR(bg->GetInCSR()->AsNumBits(bits)));
|
||||
CSRPtr new_outcsr = CSRPtr(new CSR(bg->GetOutCSR()->AsNumBits(bits)));
|
||||
CSRPtr new_incsr = (bg->in_csr_)? CSRPtr(new CSR(bg->in_csr_->AsNumBits(bits))) : nullptr;
|
||||
CSRPtr new_outcsr = (bg->out_csr_)? CSRPtr(new CSR(bg->out_csr_->AsNumBits(bits))) : nullptr;
|
||||
COOPtr new_coo = (bg->coo_)? COOPtr(new COO(bg->coo_->AsNumBits(bits))) : nullptr;
|
||||
return HeteroGraphPtr(
|
||||
new UnitGraph(g->meta_graph(), new_incsr, new_outcsr, nullptr, bg->restrict_format_));
|
||||
new UnitGraph(g->meta_graph(), new_incsr, new_outcsr, new_coo, bg->restrict_format_));
|
||||
}
|
||||
}
|
||||
|
||||
HeteroGraphPtr UnitGraph::CopyTo(HeteroGraphPtr g, const DLContext& ctx) {
|
||||
if (ctx == g->Context()) {
|
||||
return g;
|
||||
} else {
|
||||
auto bg = std::dynamic_pointer_cast<UnitGraph>(g);
|
||||
CHECK_NOTNULL(bg);
|
||||
CSRPtr new_incsr = (bg->in_csr_)? CSRPtr(new CSR(bg->in_csr_->CopyTo(ctx))) : nullptr;
|
||||
CSRPtr new_outcsr = (bg->out_csr_)? CSRPtr(new CSR(bg->out_csr_->CopyTo(ctx))) : nullptr;
|
||||
COOPtr new_coo = (bg->coo_)? COOPtr(new COO(bg->coo_->CopyTo(ctx))) : nullptr;
|
||||
return HeteroGraphPtr(
|
||||
new UnitGraph(g->meta_graph(), new_incsr, new_outcsr, new_coo, bg->restrict_format_));
|
||||
}
|
||||
// TODO(minjie): since we don't have GPU implementation of COO<->CSR,
|
||||
// we make sure that this graph (on CPU) has materialized CSR,
|
||||
// and then copy them to other context (usually GPU). This should
|
||||
// be fixed later.
|
||||
auto bg = std::dynamic_pointer_cast<UnitGraph>(g);
|
||||
CHECK_NOTNULL(bg);
|
||||
|
||||
CSRPtr new_incsr = CSRPtr(new CSR(bg->GetInCSR()->CopyTo(ctx)));
|
||||
CSRPtr new_outcsr = CSRPtr(new CSR(bg->GetOutCSR()->CopyTo(ctx)));
|
||||
return HeteroGraphPtr(
|
||||
new UnitGraph(g->meta_graph(), new_incsr, new_outcsr, nullptr, bg->restrict_format_));
|
||||
}
|
||||
|
||||
UnitGraph::UnitGraph(GraphPtr metagraph, CSRPtr in_csr, CSRPtr out_csr, COOPtr coo,
|
||||
@@ -1278,9 +1259,8 @@ UnitGraph::CSRPtr UnitGraph::GetInCSR(bool inplace) const {
|
||||
const_cast<UnitGraph*>(this)->in_csr_ = ret;
|
||||
} else {
|
||||
CHECK(coo_) << "None of CSR, COO exist";
|
||||
const auto& adj = coo_->adj();
|
||||
const auto& newadj = aten::COOToCSR(
|
||||
aten::COOMatrix{adj.num_cols, adj.num_rows, adj.col, adj.row});
|
||||
const auto& newadj = aten::CSRSort(aten::COOToCSR(
|
||||
aten::COOTranspose(coo_->adj())));
|
||||
ret = std::make_shared<CSR>(meta_graph(), newadj);
|
||||
if (inplace)
|
||||
const_cast<UnitGraph*>(this)->in_csr_ = ret;
|
||||
@@ -1299,13 +1279,13 @@ UnitGraph::CSRPtr UnitGraph::GetOutCSR(bool inplace) const {
|
||||
CSRPtr ret = out_csr_;
|
||||
if (!out_csr_) {
|
||||
if (in_csr_) {
|
||||
const auto& newadj = aten::CSRTranspose(in_csr_->adj());
|
||||
const auto& newadj = aten::CSRSort(aten::CSRTranspose(in_csr_->adj()));
|
||||
ret = std::make_shared<CSR>(meta_graph(), newadj);
|
||||
if (inplace)
|
||||
const_cast<UnitGraph*>(this)->out_csr_ = ret;
|
||||
} else {
|
||||
CHECK(coo_) << "None of CSR, COO exist";
|
||||
const auto& newadj = aten::COOToCSR(coo_->adj());
|
||||
const auto& newadj = aten::CSRSort(aten::COOToCSR(coo_->adj()));
|
||||
ret = std::make_shared<CSR>(meta_graph(), newadj);
|
||||
if (inplace)
|
||||
const_cast<UnitGraph*>(this)->out_csr_ = ret;
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
#include <memory>
|
||||
|
||||
#include "socket_communicator.h"
|
||||
#include "../../c_api_common.h"
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <memory>
|
||||
|
||||
#include "communicator.h"
|
||||
#include "msg_queue.h"
|
||||
@@ -19,9 +20,9 @@
|
||||
namespace dgl {
|
||||
namespace network {
|
||||
|
||||
static int kMaxTryCount = 1024; // maximal connection: 1024
|
||||
static int kTimeOut = 10; // 10 minutes for socket timeout
|
||||
static int kMaxConnection = 1024; // maximal connection: 1024
|
||||
static constexpr int kMaxTryCount = 1024; // maximal connection: 1024
|
||||
static constexpr int kTimeOut = 10; // 10 minutes for socket timeout
|
||||
static constexpr int kMaxConnection = 1024; // maximal connection: 1024
|
||||
|
||||
/*!
|
||||
* \brief Networking address
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <dgl/runtime/serializer.h>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "file_util.h"
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#define DGL_RUNTIME_FILE_UTIL_H_
|
||||
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include "meta_data.h"
|
||||
|
||||
namespace dgl {
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include <dgl/runtime/module.h>
|
||||
#include <dgl/runtime/registry.h>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include "module_util.h"
|
||||
|
||||
namespace dgl {
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <dgl/runtime/c_runtime_api.h>
|
||||
#include <dgl/runtime/c_backend_api.h>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
|
||||
extern "C" {
|
||||
// Function signature for generated packed function in shared library
|
||||
|
||||
@@ -124,6 +124,8 @@ size_t NDArray::GetSize() const {
|
||||
}
|
||||
|
||||
int64_t NDArray::NumElements() const {
|
||||
if (data_->dl_tensor.ndim == 0)
|
||||
return 0;
|
||||
int64_t size = 1;
|
||||
for (int i = 0; i < data_->dl_tensor.ndim; ++i) {
|
||||
size *= data_->dl_tensor.shape[i];
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
* \brief Workspace pool utility.
|
||||
*/
|
||||
#include "workspace_pool.h"
|
||||
#include <memory>
|
||||
|
||||
namespace dgl {
|
||||
namespace runtime {
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
|
||||
#include <dgl/runtime/device_api.h>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
|
||||
namespace dgl {
|
||||
namespace runtime {
|
||||
|
||||
@@ -1883,4 +1883,4 @@ if __name__ == '__main__':
|
||||
# test_isolated_ntype()
|
||||
# test_bipartite()
|
||||
# test_dtype_cast()
|
||||
test_format()
|
||||
pass
|
||||
|
||||
@@ -603,10 +603,6 @@ def test_to_block(index_dtype):
|
||||
assert bg.number_of_src_nodes() == 4
|
||||
assert bg.number_of_dst_nodes() == 4
|
||||
|
||||
dst_nodes = F.tensor([3, 4], dtype=getattr(F, index_dtype))
|
||||
bg = dgl.to_block(g_a, dst_nodes)
|
||||
check(g_a, bg, 'A', 'AA', dst_nodes)
|
||||
|
||||
dst_nodes = F.tensor([4, 3, 2, 1], dtype=getattr(F, index_dtype))
|
||||
bg = dgl.to_block(g_a, dst_nodes)
|
||||
check(g_a, bg, 'A', 'AA', dst_nodes)
|
||||
@@ -620,17 +616,13 @@ def test_to_block(index_dtype):
|
||||
assert bg.number_of_nodes('DST/A') == 0
|
||||
checkall(g_ab, bg, None)
|
||||
|
||||
dst_nodes = {'B': F.tensor([5, 6], dtype=getattr(F, index_dtype))}
|
||||
dst_nodes = {'B': F.tensor([5, 6, 3, 1], dtype=getattr(F, index_dtype))}
|
||||
bg = dgl.to_block(g, dst_nodes)
|
||||
assert bg.number_of_nodes('SRC/B') == 2
|
||||
assert bg.number_of_nodes('SRC/B') == 4
|
||||
assert F.array_equal(bg.srcnodes['B'].data[dgl.NID], bg.dstnodes['B'].data[dgl.NID])
|
||||
assert bg.number_of_nodes('DST/A') == 0
|
||||
checkall(g, bg, dst_nodes)
|
||||
|
||||
dst_nodes = {'A': F.tensor([3, 4], dtype=getattr(F, index_dtype)), 'B': F.tensor([5, 6], dtype=getattr(F, index_dtype))}
|
||||
bg = dgl.to_block(g, dst_nodes)
|
||||
checkall(g, bg, dst_nodes)
|
||||
|
||||
dst_nodes = {'A': F.tensor([4, 3, 2, 1], dtype=getattr(F, index_dtype)), 'B': F.tensor([3, 5, 6, 1], dtype=getattr(F, index_dtype))}
|
||||
bg = dgl.to_block(g, dst_nodes=dst_nodes)
|
||||
checkall(g, bg, dst_nodes)
|
||||
|
||||
@@ -29,6 +29,10 @@ inline int64_t Len(dgl::runtime::NDArray nd) {
|
||||
template <typename T>
|
||||
inline bool ArrayEQ(dgl::runtime::NDArray a1, dgl::runtime::NDArray a2) {
|
||||
if (a1->ndim != a2->ndim) return false;
|
||||
if (a1->dtype != a2->dtype) return false;
|
||||
if (a1->ctx != a2->ctx) return false;
|
||||
if (a1.NumElements() != a2.NumElements()) return false;
|
||||
if (a1.NumElements() == 0) return true;
|
||||
int64_t num = 1;
|
||||
for (int i = 0; i < a1->ndim; ++i) {
|
||||
if (a1->shape[i] != a2->shape[i])
|
||||
|
||||
@@ -208,6 +208,8 @@ template <typename IDX>
|
||||
void _TestIndexSelect(DLContext ctx) {
|
||||
IdArray a = aten::Range(0, 100, sizeof(IDX)*8, ctx);
|
||||
ASSERT_EQ(aten::IndexSelect<int>(a, 50), 50);
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(aten::IndexSelect(a, 10, 20),
|
||||
aten::Range(10, 20, sizeof(IDX)*8, ctx)));
|
||||
IdArray b = aten::VecToIdArray(std::vector<IDX>({0, 20, 10}), sizeof(IDX)*8, ctx);
|
||||
IdArray c = aten::IndexSelect(a, b);
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(b, c));
|
||||
@@ -239,3 +241,41 @@ TEST(ArrayTest, TestRelabel_) {
|
||||
_TestRelabel_<int32_t>();
|
||||
_TestRelabel_<int64_t>();
|
||||
}
|
||||
|
||||
template <typename IDX>
|
||||
void _TestCumSum(DLContext ctx) {
|
||||
IdArray a = aten::VecToIdArray(std::vector<IDX>({8, 6, 7, 5, 3, 0, 9}),
|
||||
sizeof(IDX)*8, ctx);
|
||||
{
|
||||
IdArray tb = aten::VecToIdArray(std::vector<IDX>({8, 14, 21, 26, 29, 29, 38}),
|
||||
sizeof(IDX)*8, ctx);
|
||||
IdArray b = aten::CumSum(a);
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(b, tb));
|
||||
}
|
||||
{
|
||||
IdArray tb = aten::VecToIdArray(std::vector<IDX>({0, 8, 14, 21, 26, 29, 29, 38}),
|
||||
sizeof(IDX)*8, ctx);
|
||||
IdArray b = aten::CumSum(a, true);
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(b, tb));
|
||||
}
|
||||
a = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, ctx);
|
||||
{
|
||||
IdArray tb = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, ctx);
|
||||
IdArray b = aten::CumSum(a);
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(b, tb));
|
||||
}
|
||||
{
|
||||
IdArray tb = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, ctx);
|
||||
IdArray b = aten::CumSum(a);
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(b, tb));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(ArrayTest, CumSum) {
|
||||
_TestCumSum<int32_t>(CPU);
|
||||
_TestCumSum<int64_t>(CPU);
|
||||
#ifdef DGL_USE_CUDA
|
||||
_TestCumSum<int32_t>(GPU);
|
||||
_TestCumSum<int64_t>(GPU);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -17,8 +17,8 @@ aten::CSRMatrix CSR1(DLContext ctx = CTX) {
|
||||
return aten::CSRMatrix(
|
||||
4, 5,
|
||||
aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 5, 5}), sizeof(IDX)*8, ctx),
|
||||
aten::VecToIdArray(std::vector<IDX>({1, 2, 0, 2, 3}), sizeof(IDX)*8, ctx),
|
||||
aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 1, 4}), sizeof(IDX)*8, ctx),
|
||||
aten::VecToIdArray(std::vector<IDX>({1, 2, 0, 3, 2}), sizeof(IDX)*8, ctx),
|
||||
aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 4, 1}), sizeof(IDX)*8, ctx),
|
||||
false);
|
||||
}
|
||||
|
||||
@@ -277,12 +277,23 @@ void _TestCSRToCOO(DLContext ctx) {
|
||||
auto coo = CSRToCOO(csr, false);
|
||||
ASSERT_EQ(coo.num_rows, 4);
|
||||
ASSERT_EQ(coo.num_cols, 5);
|
||||
ASSERT_TRUE(coo.row_sorted);
|
||||
auto tr = aten::VecToIdArray(std::vector<IDX>({0, 0, 0, 1, 2, 2}), sizeof(IDX)*8, ctx);
|
||||
auto tc = aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0, 2, 3}), sizeof(IDX)*8, ctx);
|
||||
auto td = aten::VecToIdArray(std::vector<IDX>({0, 2, 5, 3, 1, 4}), sizeof(IDX)*8, ctx);
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(coo.row, tr));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(coo.col, tc));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(coo.data, td));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(coo.col, csr.indices));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(coo.data, csr.data));
|
||||
|
||||
// convert from sorted csr
|
||||
auto s_csr = CSRSort(csr);
|
||||
coo = CSRToCOO(s_csr, false);
|
||||
ASSERT_EQ(coo.num_rows, 4);
|
||||
ASSERT_EQ(coo.num_cols, 5);
|
||||
ASSERT_TRUE(coo.row_sorted);
|
||||
ASSERT_TRUE(coo.col_sorted);
|
||||
tr = aten::VecToIdArray(std::vector<IDX>({0, 0, 0, 1, 2, 2}), sizeof(IDX)*8, ctx);
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(coo.row, tr));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(coo.col, s_csr.indices));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(coo.data, s_csr.data));
|
||||
}
|
||||
{
|
||||
auto coo = CSRToCOO(csr, true);
|
||||
@@ -294,7 +305,7 @@ void _TestCSRToCOO(DLContext ctx) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(SpmatTest, TestCSRToCOO) {
|
||||
TEST(SpmatTest, CSRToCOO) {
|
||||
_TestCSRToCOO<int32_t>(CPU);
|
||||
_TestCSRToCOO<int64_t>(CPU);
|
||||
#if DGL_USE_CUDA
|
||||
@@ -303,8 +314,8 @@ TEST(SpmatTest, TestCSRToCOO) {
|
||||
}
|
||||
|
||||
template <typename IDX>
|
||||
void _TestCSRSliceRows() {
|
||||
auto csr = CSR2<IDX>();
|
||||
void _TestCSRSliceRows(DLContext ctx) {
|
||||
auto csr = CSR2<IDX>(ctx);
|
||||
auto x = aten::CSRSliceRows(csr, 1, 4);
|
||||
// [1, 0, 0, 0, 0],
|
||||
// [0, 0, 1, 1, 0],
|
||||
@@ -312,30 +323,34 @@ void _TestCSRSliceRows() {
|
||||
// data: [3, 1, 4]
|
||||
ASSERT_EQ(x.num_rows, 3);
|
||||
ASSERT_EQ(x.num_cols, 5);
|
||||
auto tp = aten::VecToIdArray(std::vector<IDX>({0, 1, 3, 3}), sizeof(IDX)*8, CTX);
|
||||
auto ti = aten::VecToIdArray(std::vector<IDX>({0, 2, 3}), sizeof(IDX)*8, CTX);
|
||||
auto td = aten::VecToIdArray(std::vector<IDX>({3, 1, 4}), sizeof(IDX)*8, CTX);
|
||||
auto tp = aten::VecToIdArray(std::vector<IDX>({0, 1, 3, 3}), sizeof(IDX)*8, ctx);
|
||||
auto ti = aten::VecToIdArray(std::vector<IDX>({0, 2, 3}), sizeof(IDX)*8, ctx);
|
||||
auto td = aten::VecToIdArray(std::vector<IDX>({3, 1, 4}), sizeof(IDX)*8, ctx);
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(x.indptr, tp));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(x.indices, ti));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(x.data, td));
|
||||
|
||||
auto r = aten::VecToIdArray(std::vector<IDX>({0, 1, 3}), sizeof(IDX)*8, CTX);
|
||||
auto r = aten::VecToIdArray(std::vector<IDX>({0, 1, 3}), sizeof(IDX)*8, ctx);
|
||||
x = aten::CSRSliceRows(csr, r);
|
||||
// [[0, 1, 2, 0, 0],
|
||||
// [1, 0, 0, 0, 0],
|
||||
// [0, 0, 0, 0, 0]]
|
||||
// data: [0, 2, 5, 3]
|
||||
tp = aten::VecToIdArray(std::vector<IDX>({0, 3, 4, 4}), sizeof(IDX)*8, CTX);
|
||||
ti = aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0}), sizeof(IDX)*8, CTX);
|
||||
td = aten::VecToIdArray(std::vector<IDX>({0, 2, 5, 3}), sizeof(IDX)*8, CTX);
|
||||
tp = aten::VecToIdArray(std::vector<IDX>({0, 3, 4, 4}), sizeof(IDX)*8, ctx);
|
||||
ti = aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0}), sizeof(IDX)*8, ctx);
|
||||
td = aten::VecToIdArray(std::vector<IDX>({0, 2, 5, 3}), sizeof(IDX)*8, ctx);
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(x.indptr, tp));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(x.indices, ti));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(x.data, td));
|
||||
}
|
||||
|
||||
TEST(SpmatTest, TestCSRSliceRows) {
|
||||
_TestCSRSliceRows<int32_t>();
|
||||
_TestCSRSliceRows<int64_t>();
|
||||
_TestCSRSliceRows<int32_t>(CPU);
|
||||
_TestCSRSliceRows<int64_t>(CPU);
|
||||
#ifdef DGL_USE_CUDA
|
||||
_TestCSRSliceRows<int32_t>(GPU);
|
||||
_TestCSRSliceRows<int64_t>(GPU);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename IDX>
|
||||
@@ -376,6 +391,29 @@ TEST(SpmatTest, TestCSRHasDuplicate) {
|
||||
_TestCSRHasDuplicate<int64_t>();
|
||||
}
|
||||
|
||||
template <typename IDX>
|
||||
void _TestCSRSort(DLContext ctx) {
|
||||
auto csr = CSR1<IDX>(ctx);
|
||||
ASSERT_FALSE(aten::CSRIsSorted(csr));
|
||||
auto csr1 = aten::CSRSort(csr);
|
||||
ASSERT_FALSE(aten::CSRIsSorted(csr));
|
||||
ASSERT_TRUE(aten::CSRIsSorted(csr1));
|
||||
ASSERT_TRUE(csr1.sorted);
|
||||
aten::CSRSort_(&csr);
|
||||
ASSERT_TRUE(aten::CSRIsSorted(csr));
|
||||
ASSERT_TRUE(csr.sorted);
|
||||
csr = CSR2<IDX>(ctx);
|
||||
ASSERT_TRUE(aten::CSRIsSorted(csr));
|
||||
}
|
||||
|
||||
// Drives _TestCSRSort for 32-bit and 64-bit index types on the CPU context,
// and for the 32-bit index type on the GPU context when DGL_USE_CUDA is
// defined.
// NOTE(review): the 64-bit GPU variant is not run here — presumably the GPU
// sort path does not support 64-bit ids; confirm before adding it.
TEST(SpmatTest, CSRSort) {
  _TestCSRSort<int32_t>(CPU);
  _TestCSRSort<int64_t>(CPU);
#if defined(DGL_USE_CUDA)
  _TestCSRSort<int32_t>(GPU);
#endif  // DGL_USE_CUDA
}
|
||||
|
||||
template <typename IDX>
|
||||
void _TestCOOToCSR(DLContext ctx) {
|
||||
auto coo = COO1<IDX>(ctx);
|
||||
@@ -392,6 +430,7 @@ void _TestCOOToCSR(DLContext ctx) {
|
||||
ASSERT_EQ(coo.num_cols, csr.num_cols);
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(csr.indptr, tcsr.indptr));
|
||||
|
||||
// Convert from row sorted coo
|
||||
coo = COO1<IDX>(ctx);
|
||||
auto rs_coo = aten::COOSort(coo, false);
|
||||
auto rs_csr = CSR1<IDX>(ctx);
|
||||
@@ -399,6 +438,8 @@ void _TestCOOToCSR(DLContext ctx) {
|
||||
ASSERT_EQ(coo.num_rows, rs_tcsr.num_rows);
|
||||
ASSERT_EQ(coo.num_cols, rs_tcsr.num_cols);
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(rs_csr.indptr, rs_tcsr.indptr));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(rs_tcsr.indices, rs_coo.col));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(rs_tcsr.data, rs_coo.data));
|
||||
|
||||
coo = COO3<IDX>(ctx);
|
||||
rs_coo = aten::COOSort(coo, false);
|
||||
@@ -407,16 +448,20 @@ void _TestCOOToCSR(DLContext ctx) {
|
||||
ASSERT_EQ(coo.num_rows, rs_tcsr.num_rows);
|
||||
ASSERT_EQ(coo.num_cols, rs_tcsr.num_cols);
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(rs_csr.indptr, rs_tcsr.indptr));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(rs_tcsr.indices, rs_coo.col));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(rs_tcsr.data, rs_coo.data));
|
||||
|
||||
// Convert from col sorted coo
|
||||
coo = COO1<IDX>(ctx);
|
||||
auto src_coo = aten::COOSort(coo, true);
|
||||
auto src_csr = CSR1<IDX>(ctx);
|
||||
auto src_tcsr = aten::COOToCSR(src_coo);
|
||||
ASSERT_EQ(coo.num_rows, src_tcsr.num_rows);
|
||||
ASSERT_EQ(coo.num_cols, src_tcsr.num_cols);
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(src_csr.indptr, src_tcsr.indptr));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(src_csr.indices, src_tcsr.indices));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(src_csr.data, src_tcsr.data));
|
||||
ASSERT_TRUE(src_tcsr.sorted);
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.indptr, src_csr.indptr));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.indices, src_coo.col));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.data, src_coo.data));
|
||||
|
||||
coo = COO3<IDX>(ctx);
|
||||
src_coo = aten::COOSort(coo, true);
|
||||
@@ -424,12 +469,13 @@ void _TestCOOToCSR(DLContext ctx) {
|
||||
src_tcsr = aten::COOToCSR(src_coo);
|
||||
ASSERT_EQ(coo.num_rows, src_tcsr.num_rows);
|
||||
ASSERT_EQ(coo.num_cols, src_tcsr.num_cols);
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(src_csr.indptr, src_tcsr.indptr));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(src_csr.indices, src_tcsr.indices));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(src_csr.data, src_tcsr.data));
|
||||
ASSERT_TRUE(src_tcsr.sorted);
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.indptr, src_csr.indptr));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.indices, src_coo.col));
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.data, src_coo.data));
|
||||
}
|
||||
|
||||
TEST(SpmatTest, TestCOOToCSR) {
|
||||
TEST(SpmatTest, COOToCSR) {
|
||||
_TestCOOToCSR<int32_t>(CPU);
|
||||
_TestCOOToCSR<int64_t>(CPU);
|
||||
#ifdef DGL_USE_CUDA
|
||||
@@ -453,12 +499,37 @@ TEST(SpmatTest, TestCOOHasDuplicate) {
|
||||
template <typename IDX>
|
||||
void _TestCOOSort(DLContext ctx) {
|
||||
auto coo = COO3<IDX>(ctx);
|
||||
|
||||
auto sr_coo = COOSort(coo, false);
|
||||
ASSERT_EQ(coo.num_rows, sr_coo.num_rows);
|
||||
ASSERT_EQ(coo.num_cols, sr_coo.num_cols);
|
||||
ASSERT_TRUE(sr_coo.row_sorted);
|
||||
auto flags = COOIsSorted(sr_coo);
|
||||
ASSERT_TRUE(flags.first);
|
||||
flags = COOIsSorted(coo); // original coo should stay the same
|
||||
ASSERT_FALSE(flags.first);
|
||||
ASSERT_FALSE(flags.second);
|
||||
|
||||
auto src_coo = COOSort(coo, true);
|
||||
ASSERT_EQ(coo.num_rows, src_coo.num_rows);
|
||||
ASSERT_EQ(coo.num_cols, src_coo.num_cols);
|
||||
ASSERT_TRUE(src_coo.row_sorted);
|
||||
ASSERT_TRUE(src_coo.col_sorted);
|
||||
flags = COOIsSorted(src_coo);
|
||||
ASSERT_TRUE(flags.first);
|
||||
ASSERT_TRUE(flags.second);
|
||||
|
||||
// sort inplace
|
||||
COOSort_(&coo);
|
||||
ASSERT_TRUE(coo.row_sorted);
|
||||
flags = COOIsSorted(coo);
|
||||
ASSERT_TRUE(flags.first);
|
||||
COOSort_(&coo, true);
|
||||
ASSERT_TRUE(coo.row_sorted);
|
||||
ASSERT_TRUE(coo.col_sorted);
|
||||
flags = COOIsSorted(coo);
|
||||
ASSERT_TRUE(flags.first);
|
||||
ASSERT_TRUE(flags.second);
|
||||
|
||||
// COO3
|
||||
// [[0, 1, 2, 0, 0],
|
||||
@@ -489,7 +560,7 @@ void _TestCOOSort(DLContext ctx) {
|
||||
ASSERT_TRUE(ArrayEQ<IDX>(src_coo.data, sort_col_data));
|
||||
}
|
||||
|
||||
TEST(SpmatTest, TestCOOSort) {
|
||||
TEST(SpmatTest, COOSort) {
|
||||
_TestCOOSort<int32_t>(CPU);
|
||||
_TestCOOSort<int64_t>(CPU);
|
||||
#ifdef DGL_USE_CUDA
|
||||
|
||||
1
third_party/cub
vendored
Submodule
1
third_party/cub
vendored
Submodule
Submodule third_party/cub added at c3cceac115
Reference in New Issue
Block a user