[GraphBolt][CUDA] Optimize CopyTo performance. (#7634)

2026-06-04 19:44:23 +08:00 · 2024-08-01 13:10:43 -04:00
parent 5c902cd59e
commit dfd491568d
2 changed files with 4 additions and 8 deletions
--- a/python/dgl/graphbolt/base.py
+++ b/python/dgl/graphbolt/base.py
@@ -369,9 +369,6 @@ class CopyTo(IterDataPipe):

    def __iter__(self):
        for data in self.datapipe:
-            if self.non_blocking:
-                # The copy is non blocking only if contents of data are pinned.
-                assert data.is_pinned(), f"{data} should be pinned."
            yield recursive_apply(
                data, apply_to, self.device, self.non_blocking
            )
--- a/python/dgl/graphbolt/dataloader.py
+++ b/python/dgl/graphbolt/dataloader.py
@@ -231,11 +231,10 @@ class DataLoader(torch_data.DataLoader):
                    datapipe_graph = dp_utils.replace_dp(
                        datapipe_graph,
                        copier,
-                        copier.datapipe.transform(
-                            lambda x: x.pin_memory()
-                        ).prefetch(2)
-                        # After the data gets pinned, we can copy non_blocking.
-                        .copy_to(copier.device, non_blocking=True),
+                        # Add prefetch so that CPU and GPU can run concurrently.
+                        copier.datapipe.prefetch(2).copy_to(
+                            copier.device, non_blocking=True
+                        ),
                    )

        # The stages after feature fetching is still done in the main process.