mirror of
https://github.com/dmlc/dgl.git
synced 2026-06-04 19:44:23 +08:00
Integrate Regression Test with Jenkins (#2448)
* add bench jenkins * instance type * fix * fix * fix * 111 * test * 111 * 111 * fix * test * run * fix * fix * fix * fix * fix * publish results * 111 * regression * launch ec2 script * fix * add * run on master * change * rrr * run gpu * fix * fix * try fix * fix * ff * fix * fix * fix * refactor * fix * fix * update * fix * fix * fix * fix * remove import torchtext * add shm size * update * fix * fix * fix * fix * fix this!!!! * 111 * fix * remove verbose * fix * fix * fix * fix * fix * fix * fix * fix * update readme * fix * fix * fix * change asv default to head * commit sage and rgcn * fix * update
This commit is contained in:
@@ -17,17 +17,19 @@
|
||||
// uninstalling the project. See asv.conf.json documentation.
|
||||
//
|
||||
"build_command": [
|
||||
"/bin/bash {conf_dir}/build_dgl_asv.sh"
|
||||
"/bin/bash {conf_dir}/scripts/build_dgl_asv.sh"
|
||||
],
|
||||
"install_command": [
|
||||
"/bin/bash {conf_dir}/install_dgl_asv.sh"
|
||||
"/bin/bash {conf_dir}/scripts/install_dgl_asv.sh"
|
||||
],
|
||||
"uninstall_command": [
|
||||
"return-code=any python -m pip uninstall -y dgl"
|
||||
],
|
||||
// List of branches to benchmark. If not provided, defaults to "master"
|
||||
// (for git) or "default" (for mercurial).
|
||||
"branches": ["HEAD", "master"], // for git
|
||||
"branches": [
|
||||
"HEAD"
|
||||
], // for git
|
||||
// The DVCS being used. If not set, it will be automatically
|
||||
// determined from "repo" by looking at the protocol in the URL
|
||||
// (if remote), or by looking for special directories, such as
|
||||
|
||||
341
benchmarks/benchmarks/model_acc/bench_rgcn_ns.py
Normal file
341
benchmarks/benchmarks/model_acc/bench_rgcn_ns.py
Normal file
@@ -0,0 +1,341 @@
|
||||
import dgl
|
||||
import itertools
|
||||
import torch as th
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
import torch.multiprocessing as mp
|
||||
from torch.utils.data import DataLoader
|
||||
import dgl.nn.pytorch as dglnn
|
||||
from dgl.nn import RelGraphConv
|
||||
import time
|
||||
|
||||
from .. import utils
|
||||
|
||||
class EntityClassify(nn.Module):
    """Entity classification model for RGCN built from stacked RelGraphConv layers.

    The stack is: one input layer (h_dim -> h_dim), ``num_hidden_layers``
    hidden layers (h_dim -> h_dim) and one output layer (h_dim -> out_dim),
    all using the "basis" regularizer.

    Parameters
    ----------
    device : int or str
        Device every block is moved to in ``forward``.
    num_nodes : int
        Number of nodes.
    h_dim : int
        Hidden dim size.
    out_dim : int
        Output dim size.
    num_rels : int
        Number of relation types.
    num_bases : int, optional
        Number of bases. ``None`` or a negative value is stored as ``None``
        and passed to ``RelGraphConv`` as such.
    num_hidden_layers : int
        Number of hidden RelGraphConv layers.
    dropout : float
        Dropout passed to the input and hidden layers (the output layer is
        built without it).
    use_self_loop : bool
        Use self loop if True, default False.
    low_mem : bool
        True to use the low memory implementation of the relation message
        passing function (trades speed for memory consumption).
    layer_norm : bool
        Forwarded to every ``RelGraphConv`` as ``layer_norm``.
    """

    def __init__(self,
                 device,
                 num_nodes,
                 h_dim,
                 out_dim,
                 num_rels,
                 num_bases=None,
                 num_hidden_layers=1,
                 dropout=0,
                 use_self_loop=False,
                 low_mem=False,
                 layer_norm=False):
        super(EntityClassify, self).__init__()
        self.device = device
        self.num_nodes = num_nodes
        self.h_dim = h_dim
        self.out_dim = out_dim
        self.num_rels = num_rels
        # The default is None, and `None < 0` raises TypeError on Python 3,
        # so None has to be checked before the sign test.
        if num_bases is None or num_bases < 0:
            self.num_bases = None
        else:
            self.num_bases = num_bases
        self.num_hidden_layers = num_hidden_layers
        self.dropout = dropout
        self.use_self_loop = use_self_loop
        self.low_mem = low_mem
        self.layer_norm = layer_norm

        self.layers = nn.ModuleList()
        # i2h
        self.layers.append(RelGraphConv(
            self.h_dim, self.h_dim, self.num_rels, "basis",
            self.num_bases, activation=F.relu, self_loop=self.use_self_loop,
            low_mem=self.low_mem, dropout=self.dropout, layer_norm=layer_norm))
        # h2h
        for _ in range(self.num_hidden_layers):
            self.layers.append(RelGraphConv(
                self.h_dim, self.h_dim, self.num_rels, "basis",
                self.num_bases, activation=F.relu, self_loop=self.use_self_loop,
                low_mem=self.low_mem, dropout=self.dropout, layer_norm=layer_norm))
        # h2o
        self.layers.append(RelGraphConv(
            self.h_dim, self.out_dim, self.num_rels, "basis",
            self.num_bases, activation=None,
            self_loop=self.use_self_loop,
            low_mem=self.low_mem, layer_norm=layer_norm))

    def forward(self, blocks, feats, norm=None):
        """Run the layer stack over ``blocks``.

        Parameters
        ----------
        blocks : list or None
            One graph/block per layer. Each must carry ``edata['etype']`` and
            ``edata['norm']``. If None, ``self.g`` is reused for every layer.
        feats : tensor
            Input features for the first layer.
        norm : optional
            Accepted for interface compatibility; not read here (the edge
            norm is taken from ``block.edata['norm']``).

        Returns
        -------
        tensor
            Output of the last layer.
        """
        if blocks is None:
            # full graph training
            # NOTE(review): `self.g` is never assigned in __init__; the caller
            # must set it on the model before using this path.
            blocks = [self.g] * len(self.layers)
        h = feats
        for layer, block in zip(self.layers, blocks):
            block = block.to(self.device)
            h = layer(block, h, block.edata['etype'], block.edata['norm'])
        return h
|
||||
|
||||
class RelGraphEmbedLayer(nn.Module):
    r"""Embedding layer for featureless heterograph.

    Node types that come with input features get a learnable projection
    matrix; all nodes additionally share one ``nn.Embedding`` table that is
    used for the node types without features.

    Parameters
    ----------
    device : int or str
        Device the output embeddings are created on.
    num_nodes : int
        Number of nodes.
    node_tids : tensor
        Storing the node type id for each node starting from 0. Its first
        dimension sets the number of rows of the embedding table.
    num_of_ntype : int
        Number of node types
    input_size : list
        One entry per node type: either None (no input feature, the node
        embedding table is used) or an object whose ``shape[1]`` is the input
        feature size of that type (the benchmark passes the feature tensors).
    embed_size : int
        Output embed size
    sparse_emb : bool, optional
        Passed as ``sparse`` to ``nn.Embedding``.
    embed_name : str, optional
        Embed name
    """

    def __init__(self,
                 device,
                 num_nodes,
                 node_tids,
                 num_of_ntype,
                 input_size,
                 embed_size,
                 sparse_emb=False,
                 embed_name='embed'):
        super(RelGraphEmbedLayer, self).__init__()
        self.device = device
        self.embed_size = embed_size
        self.embed_name = embed_name
        self.num_nodes = num_nodes
        self.sparse_emb = sparse_emb

        # create weight embeddings for each node for each relation
        self.embeds = nn.ParameterDict()
        self.num_of_ntype = num_of_ntype
        # Kept for compatibility; not read anywhere in this class.
        self.idmap = th.empty(num_nodes).long()

        for ntype in range(num_of_ntype):
            if input_size[ntype] is None:
                continue
            # Projection from the type's raw feature size to embed_size,
            # stored under the stringified type id.
            input_emb_size = input_size[ntype].shape[1]
            embed = nn.Parameter(th.Tensor(input_emb_size, self.embed_size))
            nn.init.xavier_uniform_(embed)
            self.embeds[str(ntype)] = embed

        self.node_embeds = th.nn.Embedding(
            node_tids.shape[0], self.embed_size, sparse=self.sparse_emb)
        nn.init.uniform_(self.node_embeds.weight, -1.0, 1.0)

    def forward(self, node_ids, node_tids, type_ids, features):
        """Forward computation

        Parameters
        ----------
        node_ids : tensor
            node ids to generate embedding for; used as row indices into the
            shared node embedding table.
        node_tids : tensor
            node type ids
        type_ids : tensor
            for each node, the row index into ``features[ntype]`` of its
            node type.
        features : list of features
            list of initial features for nodes belong to different node type.
            If None, the corresponding features is an one-hot encoding feature,
            else use the features directly as input feature and matmul a
            projection matrix.

        Returns
        -------
        tensor
            embeddings as the input of the next layer
        """
        tsd_ids = node_ids.to(self.node_embeds.weight.device)
        # Rows are filled type by type below; a node whose type id is outside
        # range(num_of_ntype) keeps the uninitialised value from th.empty.
        embeds = th.empty(node_ids.shape[0], self.embed_size, device=self.device)
        for ntype in range(self.num_of_ntype):
            loc = node_tids == ntype
            if features[ntype] is not None:
                projection = self.embeds[str(ntype)].to(self.device)
                embeds[loc] = features[ntype][type_ids[loc]].to(self.device) @ projection
            else:
                embeds[loc] = self.node_embeds(tsd_ids[loc]).to(self.device)

        return embeds
|
||||
|
||||
def evaluate(model, embed_layer, eval_loader, node_feats):
    """Run ``model`` over every batch of ``eval_loader`` without gradients.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(blocks, feats)``.
    embed_layer : nn.Module
        Called with the source-node ids, node types and per-type ids of the
        first block plus ``node_feats`` to build the input features.
    eval_loader : iterable
        Yields either ``(seeds, blocks)`` or a longer tuple whose last item
        is the list of blocks.
    node_feats : list
        Per-node-type input features forwarded to ``embed_layer``.

    Returns
    -------
    (tensor, tensor)
        Concatenated logits and the concatenated seed ids, both on CPU.
    """
    model.eval()
    embed_layer.eval()
    eval_logits = []
    eval_seeds = []

    with th.no_grad():
        # Iterate the loader directly: this file never imports `tqdm`, so the
        # progress-bar wrapper raised NameError.
        for sample_data in eval_loader:
            th.cuda.empty_cache()
            if len(sample_data) == 2:
                seeds, blocks = sample_data
            else:
                # Longer tuples (the training loop in this file unpacks
                # four items from the same collator) carry the blocks last.
                blocks = sample_data[-1]
                # NOTE(review): assumes 'type_id' on the output nodes is the
                # id callers use to index their labels -- confirm.
                seeds = blocks[-1].dstdata['type_id']
            feats = embed_layer(blocks[0].srcdata[dgl.NID],
                                blocks[0].srcdata[dgl.NTYPE],
                                blocks[0].srcdata['type_id'],
                                node_feats)
            logits = model(blocks, feats)
            eval_logits.append(logits.cpu().detach())
            eval_seeds.append(seeds.cpu().detach())
    eval_logits = th.cat(eval_logits)
    eval_seeds = th.cat(eval_seeds)

    return eval_logits, eval_seeds
|
||||
|
||||
|
||||
@utils.benchmark('acc', 3600)
@utils.parametrize('data', ['am', 'ogbn-mag'])
def track_acc(data):
    """Train RGCN with neighbor sampling and return the test accuracy.

    The heterograph is converted to a homogeneous graph, trained for a fixed
    number of epochs with mini-batches of sampled blocks, then evaluated on
    the test nodes of the prediction category.

    Parameters
    ----------
    data : str
        Dataset name, ``'am'`` or ``'ogbn-mag'``.

    Returns
    -------
    float
        Fraction of test seeds whose arg-max logit equals the label.

    Raises
    ------
    ValueError
        If ``data`` is not one of the supported names.
    """
    dataset = utils.process_data(data)
    device = utils.get_bench_device()

    if data == 'am':
        n_bases = 40
        l2norm = 5e-4
    elif data == 'ogbn-mag':
        n_bases = 2
        l2norm = 0
    else:
        raise ValueError()

    fanouts = [25, 15]
    n_layers = 2
    batch_size = 1024
    n_hidden = 64
    dropout = 0.5
    use_self_loop = True
    lr = 0.01
    n_epochs = 20
    low_mem = True
    num_workers = 4

    hg = dataset[0]
    category = dataset.predict_category
    num_classes = dataset.num_classes
    train_mask = hg.nodes[category].data.pop('train_mask')
    train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
    test_mask = hg.nodes[category].data.pop('test_mask')
    test_idx = th.nonzero(test_mask, as_tuple=False).squeeze()
    labels = hg.nodes[category].data.pop('labels').to(device)
    num_of_ntype = len(hg.ntypes)
    num_rels = len(hg.canonical_etypes)

    node_feats = []
    for ntype in hg.ntypes:
        if len(hg.nodes[ntype].data) == 0 or 'feat' not in hg.nodes[ntype].data:
            node_feats.append(None)
        else:
            feat = hg.nodes[ntype].data.pop('feat')
            node_feats.append(feat.share_memory_())

    # get target category id
    category_id = len(hg.ntypes)
    for i, ntype in enumerate(hg.ntypes):
        if ntype == category:
            category_id = i
    g = dgl.to_homogeneous(hg)
    u, v, eid = g.all_edges(form='all')

    # global norm: every edge is weighted by 1 / (number of edges that share
    # its destination node)
    _, inverse_index, count = th.unique(v, return_inverse=True, return_counts=True)
    degrees = count[inverse_index]
    norm = th.ones(eid.shape[0]) / degrees
    norm = norm.unsqueeze(1)
    g.edata['norm'] = norm
    g.edata['etype'] = g.edata[dgl.ETYPE]
    g.ndata['type_id'] = g.ndata[dgl.NID]
    g.ndata['ntype'] = g.ndata[dgl.NTYPE]

    node_ids = th.arange(g.number_of_nodes())
    # find out the target node ids
    node_tids = g.ndata[dgl.NTYPE]
    loc = (node_tids == category_id)
    target_nids = node_ids[loc]
    train_nids = target_nids[train_idx]
    # The collator samples on the homogeneous graph, so the test seeds have
    # to be mapped to homogeneous node ids the same way as the train seeds.
    test_nids = target_nids[test_idx]

    # Create csr/coo/csc formats before launching training processes with multi-gpu.
    # This avoids creating certain formats in each sub-process, which saves memory and CPU.
    g.create_formats_()
    sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
    collator = dgl.dataloading.NodeCollator(g, train_nids, sampler, return_indices=True)
    loader = dgl.dataloading.DataLoader(
        collator.dataset, collate_fn=collator.collate,
        batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_loader = DataLoader(dataset=test_nids.numpy(),
                             batch_size=batch_size,
                             collate_fn=collator.collate,
                             shuffle=False,
                             num_workers=num_workers)

    # node features
    # None for one-hot feature, if not none, it should be the feature tensor.
    embed_layer = RelGraphEmbedLayer(device,
                                     g.number_of_nodes(),
                                     node_tids,
                                     num_of_ntype,
                                     node_feats,
                                     n_hidden,
                                     sparse_emb=True)

    # create model
    # all model params are in device.
    model = EntityClassify(device,
                           g.number_of_nodes(),
                           n_hidden,
                           num_classes,
                           num_rels,
                           num_bases=n_bases,
                           num_hidden_layers=n_layers - 2,
                           dropout=dropout,
                           use_self_loop=use_self_loop,
                           low_mem=low_mem,
                           layer_norm=False)

    embed_layer = embed_layer.to(device)
    model = model.to(device)

    # Dense parameters go to Adam; the sparse node embedding table needs
    # SparseAdam.
    all_params = itertools.chain(model.parameters(), embed_layer.embeds.parameters())
    optimizer = th.optim.Adam(all_params, lr=lr, weight_decay=l2norm)
    emb_optimizer = th.optim.SparseAdam(list(embed_layer.node_embeds.parameters()), lr=lr)

    # Loop-invariant: labels of the training nodes, indexed by seed position.
    train_labels = labels[train_idx]

    print("start training...")
    for epoch in range(n_epochs):
        model.train()
        embed_layer.train()

        for i, sample_data in enumerate(loader):
            input_nodes, output_nodes, seed_idx, blocks = sample_data
            feats = embed_layer(input_nodes,
                                blocks[0].srcdata['ntype'],
                                blocks[0].srcdata['type_id'],
                                node_feats)
            logits = model(blocks, feats)
            loss = F.cross_entropy(logits, train_labels[seed_idx])
            optimizer.zero_grad()
            emb_optimizer.zero_grad()

            loss.backward()
            optimizer.step()
            emb_optimizer.step()

    test_logits, test_seeds = evaluate(model, embed_layer, test_loader, node_feats)
    test_labels = labels[test_seeds].cpu()
    test_acc = th.sum(test_logits.argmax(dim=1) == test_labels).item() / len(test_seeds)
    return test_acc
|
||||
205
benchmarks/benchmarks/model_acc/bench_sage_ns.py
Normal file
205
benchmarks/benchmarks/model_acc/bench_sage_ns.py
Normal file
@@ -0,0 +1,205 @@
|
||||
import dgl
|
||||
import torch as th
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
import torch.multiprocessing as mp
|
||||
from torch.utils.data import DataLoader
|
||||
import dgl.nn.pytorch as dglnn
|
||||
import time
|
||||
|
||||
from .. import utils
|
||||
|
||||
|
||||
class SAGE(nn.Module):
    """GraphSAGE model made of mean-aggregator ``SAGEConv`` layers."""

    def __init__(self,
                 in_feats,
                 n_hidden,
                 n_classes,
                 n_layers,
                 activation,
                 dropout):
        """Build the stack: input layer, ``n_layers - 2`` hidden layers, output layer.

        ``activation`` is applied after every layer but the last, followed by
        dropout with probability ``dropout``.
        """
        super().__init__()
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.n_classes = n_classes
        self.layers = nn.ModuleList()
        self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, 'mean'))
        self.layers.extend(
            dglnn.SAGEConv(n_hidden, n_hidden, 'mean')
            for _ in range(n_layers - 2))
        self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, 'mean'))
        self.dropout = nn.Dropout(dropout)
        self.activation = activation

    def forward(self, blocks, x):
        """Apply layer ``i`` to ``blocks[i]``, starting from the features ``x``."""
        last = len(self.layers) - 1
        h = x
        for idx, (conv, blk) in enumerate(zip(self.layers, blocks)):
            h = conv(blk, h)
            if idx == last:
                continue
            h = self.dropout(self.activation(h))
        return h

    def inference(self, g, x, batch_size, device):
        """
        Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
        g : the entire graph.
        x : the input of entire node set.

        The representation of all nodes is computed one layer at a time, in
        batches of ``batch_size`` nodes; each layer's output (kept on CPU) is
        the next layer's input. The output of the last layer is returned.
        """
        # During inference with sampling, multi-layer blocks are very inefficient because
        # lots of computations in the first few layers are repeated.
        # Therefore, we compute the representation of all nodes layer by layer. The nodes
        # on each layer are of course split in batches.
        # TODO: can we standardize this?
        n_nodes_fn = g.number_of_nodes
        last = len(self.layers) - 1
        for idx, conv in enumerate(self.layers):
            is_last = idx == last
            width = self.n_classes if is_last else self.n_hidden
            y = th.zeros(n_nodes_fn(), width)

            sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
            dataloader = dgl.dataloading.NodeDataLoader(
                g,
                th.arange(n_nodes_fn()),
                sampler,
                batch_size=batch_size,
                shuffle=True,
                drop_last=False,
                num_workers=4)

            for input_nodes, output_nodes, blocks in dataloader:
                blk = blocks[0].int().to(device)
                h = conv(blk, x[input_nodes].to(device))
                if not is_last:
                    h = self.dropout(self.activation(h))

                # Results are gathered on CPU, addressed by output node id.
                y[output_nodes] = h.cpu()

            x = y
        return y
|
||||
|
||||
|
||||
def compute_acc(pred, labels):
    """
    Return the fraction of rows of ``pred`` whose arg-max (over dim 1)
    equals the corresponding entry of ``labels`` (cast to long).
    """
    predicted = th.argmax(pred, dim=1)
    hits = predicted == labels.long()
    n_correct = hits.float().sum()
    return n_correct / len(pred)
|
||||
|
||||
|
||||
def evaluate(model, g, inputs, labels, val_nid, batch_size, device):
    """
    Compute the accuracy of ``model`` on the nodes listed in ``val_nid``.

    g : The entire graph.
    inputs : The features of all the nodes.
    labels : The labels of all the nodes.
    val_nid : the node Ids for validation.
    batch_size : Number of nodes to compute at the same time.
    device : The device the inference batches are moved to.

    The model is put in eval mode for the inference pass and switched back
    to train mode before the accuracy is computed.
    """
    model.eval()
    with th.no_grad():
        all_pred = model.inference(g, inputs, batch_size, device)
    model.train()
    val_pred = all_pred[val_nid]
    val_labels = labels[val_nid]
    return compute_acc(val_pred, val_labels)
|
||||
|
||||
|
||||
def load_subtensor(g, seeds, input_nodes, device):
    """
    Copy the features of ``input_nodes`` and the labels of ``seeds`` to ``device``.

    Reads ``g.ndata['features']`` and ``g.ndata['labels']`` and returns the
    pair ``(batch_inputs, batch_labels)``.
    """
    node_data = g.ndata
    batch_inputs = node_data['features'][input_nodes].to(device)
    batch_labels = node_data['labels'][seeds].to(device)
    return batch_inputs, batch_labels
|
||||
|
||||
|
||||
def _train_one_epoch(model, dataloader, loss_fcn, optimizer, device):
    """Run one pass over ``dataloader``, stepping ``optimizer`` after every mini-batch."""
    for input_nodes, seeds, blocks in dataloader:
        # Features and labels travel with the blocks, so moving the blocks
        # to the device also moves the batch inputs and labels.
        blocks = [block.int().to(device) for block in blocks]
        batch_inputs = blocks[0].srcdata['features']
        batch_labels = blocks[-1].dstdata['labels']

        # Compute loss and prediction
        batch_pred = model(blocks, batch_inputs)
        loss = loss_fcn(batch_pred, batch_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


@utils.benchmark('acc', 3600)
@utils.parametrize('data', ['ogbn-products', "reddit"])
def track_acc(data):
    """Train GraphSAGE with neighbor sampling and return the test accuracy.

    Parameters
    ----------
    data : str
        Dataset name, ``'ogbn-products'`` or ``'reddit'``.

    Returns
    -------
    float
        Accuracy (as computed by ``compute_acc``) on the nodes that are in
        neither the train mask nor the validation mask.
    """
    data = utils.process_data(data)
    device = utils.get_bench_device()
    g = data[0]
    g.ndata['features'] = g.ndata['feat']
    g.ndata['labels'] = g.ndata['label']
    in_feats = g.ndata['features'].shape[1]
    n_classes = data.num_labels

    # Create csr/coo/csc formats before launching training processes with multi-gpu.
    # This avoids creating certain formats in each sub-process, which saves memory and CPU.
    g.create_formats_()

    num_epochs = 20
    num_hidden = 16
    num_layers = 2
    fan_out = '5,10'
    batch_size = 1024
    lr = 0.003
    dropout = 0.5
    num_workers = 4

    train_nid = th.nonzero(g.ndata['train_mask'], as_tuple=True)[0]

    # Create PyTorch DataLoader for constructing blocks
    sampler = dgl.dataloading.MultiLayerNeighborSampler(
        [int(fanout) for fanout in fan_out.split(',')])
    dataloader = dgl.dataloading.NodeDataLoader(
        g,
        train_nid,
        sampler,
        batch_size=batch_size,
        shuffle=True,
        drop_last=False,
        num_workers=num_workers)

    # Define model and optimizer
    model = SAGE(in_feats, num_hidden, n_classes, num_layers, F.relu, dropout)
    model = model.to(device)
    loss_fcn = nn.CrossEntropyLoss()
    loss_fcn = loss_fcn.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # dry run one epoch (it does update the model, exactly like a training epoch)
    _train_one_epoch(model, dataloader, loss_fcn, optimizer, device)

    # Training loop
    for epoch in range(num_epochs):
        _train_one_epoch(model, dataloader, loss_fcn, optimizer, device)

    test_g = g
    test_nid = th.nonzero(
        ~(test_g.ndata['train_mask'] | test_g.ndata['val_mask']), as_tuple=True)[0]
    test_acc = evaluate(
        model, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, batch_size, device)

    return test_acc.item()
|
||||
@@ -5,7 +5,6 @@ import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.utils.data import IterableDataset, DataLoader
|
||||
import torchtext
|
||||
import dgl
|
||||
import dgl.function as fn
|
||||
|
||||
|
||||
@@ -1,12 +1,15 @@
|
||||
import os, pickle
|
||||
import shutil, zipfile
|
||||
import json
|
||||
import os
|
||||
import pickle
|
||||
import shutil
|
||||
import zipfile
|
||||
import requests
|
||||
import inspect
|
||||
import numpy as np
|
||||
import pandas
|
||||
import dgl
|
||||
import torch
|
||||
import torchtext
|
||||
|
||||
|
||||
def _download(url, path, filename):
|
||||
fn = os.path.join(path, filename)
|
||||
@@ -22,15 +25,30 @@ def _download(url, path, filename):
|
||||
writer.write(chunk)
|
||||
print('Download finished.')
|
||||
|
||||
|
||||
def get_livejournal():
    """Download the LiveJournal edge list and return it as a DGL graph.

    The gzip'ed, tab-separated file is fetched into ``/tmp/dataset`` through
    ``_download``; its first four rows are skipped and the remaining rows are
    read as ``src``/``dst`` columns that become the graph's edges.
    """
    # Same as https://snap.stanford.edu/data/soc-LiveJournal1.txt.gz
    _download('https://dgl-asv-data.s3-us-west-2.amazonaws.com/dataset/livejournal/soc-LiveJournal1.txt.gz',
              '/tmp/dataset', 'soc-LiveJournal1.txt.gz')
    df = pandas.read_csv('/tmp/dataset/soc-LiveJournal1.txt.gz', sep='\t', skiprows=4, header=None,
                         names=['src', 'dst'], compression='gzip')
    src = df['src'].values
    dst = df['dst'].values
    print('construct the graph')
    return dgl.graph((src, dst))
|
||||
|
||||
|
||||
def get_filmbaster():
    """Download the Friendster edge list and return it as a DGL graph.

    The archive is fetched into ``/tmp/dataset`` through ``_download``, read
    as a tab-separated ``src``/``dst`` table (first four rows skipped) and
    turned into a graph with one edge per row.
    """
    # Same as https://snap.stanford.edu/data/bigdata/communities/com-friendster.ungraph.txt.gz
    _download(
        'https://dgl-asv-data.s3-us-west-2.amazonaws.com/dataset/friendster/com-friendster.ungraph.txt.gz',
        '/tmp/dataset',
        'com-friendster.ungraph.txt.gz',
    )
    edges = pandas.read_csv(
        '/tmp/dataset/com-friendster.ungraph.txt.gz',
        sep='\t',
        skiprows=4,
        header=None,
        names=['src', 'dst'],
        compression='gzip',
    )
    endpoints = (edges['src'].values, edges['dst'].values)
    print('construct the graph')
    return dgl.graph(endpoints)
|
||||
|
||||
|
||||
def get_graph(name):
|
||||
if name == 'livejournal':
|
||||
@@ -39,6 +57,7 @@ def get_graph(name):
|
||||
print(name + " doesn't exist")
|
||||
return None
|
||||
|
||||
|
||||
class OGBDataset(object):
|
||||
def __init__(self, g, num_labels, predict_category=None):
|
||||
self._g = g
|
||||
@@ -75,7 +94,8 @@ def load_ogb_product():
|
||||
|
||||
graph.ndata['label'] = labels
|
||||
in_feats = graph.ndata['feat'].shape[1]
|
||||
num_labels = len(torch.unique(labels[torch.logical_not(torch.isnan(labels))]))
|
||||
num_labels = len(torch.unique(
|
||||
labels[torch.logical_not(torch.isnan(labels))]))
|
||||
|
||||
# Find the node IDs in the training, validation, and test set.
|
||||
train_nid, val_nid, test_nid = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
|
||||
@@ -148,12 +168,15 @@ class PinsageDataset:
|
||||
def __getitem__(self, idx):
|
||||
return self._g
|
||||
|
||||
|
||||
def load_nowplaying_rs():
|
||||
name = 'nowplaying_rs.pkl' # follow examples/pytorch/pinsage/README to create nowplaying_rs.pkl
|
||||
import torchtext
|
||||
# follow examples/pytorch/pinsage/README to create nowplaying_rs.pkl
|
||||
name = 'nowplaying_rs.pkl'
|
||||
dataset_dir = os.path.join(os.getcwd(), 'dataset')
|
||||
os.symlink('/tmp/dataset/', dataset_dir)
|
||||
|
||||
dataset_path = os.path.join(dataset_dir, name)
|
||||
dataset_path = os.path.join(dataset_dir, "nowplaying_rs", name)
|
||||
# Load dataset
|
||||
with open(dataset_path, 'rb') as f:
|
||||
dataset = pickle.load(f)
|
||||
@@ -169,14 +192,17 @@ def load_nowplaying_rs():
|
||||
|
||||
# Assign user and movie IDs and use them as features (to learn an individual trainable
|
||||
# embedding for each entity)
|
||||
g.nodes[user_ntype].data['id'] = torch.arange(g.number_of_nodes(user_ntype))
|
||||
g.nodes[item_ntype].data['id'] = torch.arange(g.number_of_nodes(item_ntype))
|
||||
g.nodes[user_ntype].data['id'] = torch.arange(
|
||||
g.number_of_nodes(user_ntype))
|
||||
g.nodes[item_ntype].data['id'] = torch.arange(
|
||||
g.number_of_nodes(item_ntype))
|
||||
|
||||
# Prepare torchtext dataset and vocabulary
|
||||
fields = {}
|
||||
examples = []
|
||||
for key, texts in item_texts.items():
|
||||
fields[key] = torchtext.data.Field(include_lengths=True, lower=True, batch_first=True)
|
||||
fields[key] = torchtext.data.Field(
|
||||
include_lengths=True, lower=True, batch_first=True)
|
||||
for i in range(g.number_of_nodes(item_ntype)):
|
||||
example = torchtext.data.Example.fromlist(
|
||||
[item_texts[key][i] for key in item_texts.keys()],
|
||||
@@ -188,6 +214,7 @@ def load_nowplaying_rs():
|
||||
|
||||
return PinsageDataset(g, user_ntype, item_ntype, textset)
|
||||
|
||||
|
||||
def process_data(name):
|
||||
if name == 'cora':
|
||||
return dgl.data.CoraGraphDataset()
|
||||
@@ -212,29 +239,38 @@ def process_data(name):
|
||||
else:
|
||||
raise ValueError('Invalid dataset name:', name)
|
||||
|
||||
|
||||
def get_bench_device():
    """Return the device string benchmarks should run on.

    Reads the ``DGL_BENCH_DEVICE`` environment variable (default ``'cpu'``).
    The value ``"gpu"`` (any letter case) is translated to ``"cuda:0"``;
    every other value is returned unchanged.
    """
    device = os.environ.get('DGL_BENCH_DEVICE', 'cpu')
    if device.lower() == "gpu":
        return "cuda:0"
    return device
|
||||
|
||||
|
||||
def setup_track_time(*args, **kwargs):
    """Seed NumPy's and PyTorch's random generators with 42; arguments are ignored."""
    # fix random seed
    seed = 42
    np.random.seed(seed)
    torch.random.manual_seed(seed)
|
||||
|
||||
|
||||
def setup_track_acc(*args, **kwargs):
    """Seed NumPy's and PyTorch's random generators with 42; arguments are ignored."""
    # fix random seed
    seed = 42
    np.random.seed(seed)
    torch.random.manual_seed(seed)
|
||||
|
||||
|
||||
# Unit attached to a benchmark function, keyed by its track type.
TRACK_UNITS = {
    'time': 's',
    'acc': '%',
}
|
||||
|
||||
# Setup function attached to a benchmark function, keyed by its track type.
TRACK_SETUP = {
    'time': setup_track_time,
    'acc': setup_track_acc,
}
|
||||
|
||||
|
||||
def parametrize(param_name, params):
|
||||
"""Decorator for benchmarking over a set of parameters.
|
||||
|
||||
@@ -297,6 +333,40 @@ def parametrize(param_name, params):
|
||||
return func
|
||||
return _wrapper
|
||||
|
||||
|
||||
class TestFilter:
    """Decide which benchmark functions are enabled for the current machine.

    When the ``DGL_REG_CONF`` environment variable is set, it names a JSON
    file (resolved relative to two directories above this module) that maps
    an instance type to a ``"tests"`` list; ``INSTANCE_TYPE`` selects the
    entry. Without ``DGL_REG_CONF`` every function is enabled.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        """Load the configuration named by the environment, if any.

        Raises
        ------
        Exception
            If ``DGL_REG_CONF`` is set but ``INSTANCE_TYPE`` is not.
        KeyError
            If the configuration has no entry for ``INSTANCE_TYPE``.
        """
        # Parsed configuration, or None when filtering is disabled.
        self.conf = None
        # Always defined, so the attribute exists even when filtering is off.
        self.enabled_tests = []
        if "DGL_REG_CONF" in os.environ:
            current_dir = os.path.dirname(os.path.abspath(__file__))
            path = os.path.join(current_dir, "../../",
                                os.environ["DGL_REG_CONF"])
            with open(path, "r") as f:
                self.conf = json.load(f)
            if "INSTANCE_TYPE" in os.environ:
                instance_type = os.environ["INSTANCE_TYPE"]
            else:
                raise Exception(
                    "Must set both DGL_REG_CONF and INSTANCE_TYPE as env")
            if instance_type not in self.conf:
                raise KeyError(
                    "INSTANCE_TYPE {!r} has no entry in {}".format(
                        instance_type, path))
            self.enabled_tests = self.conf[instance_type]["tests"]
        else:
            import logging
            logging.warning("No regression test conf file specified")

    def check(self, func):
        """Return True if ``func`` is enabled.

        With no configuration loaded every function is enabled. Otherwise
        ``func`` is enabled when any configured name is a substring of
        ``"<module name>.<function name>"``.
        """
        if self.conf is None:
            return True
        module = inspect.getmodule(func)
        # inspect.getmodule may return None; fall back to the function's own
        # module attribute instead of failing on `None.__name__`.
        if module is not None:
            module_name = module.__name__
        else:
            module_name = getattr(func, "__module__", None) or ""
        funcfullname = module_name + "." + func.__name__
        return any(enabled_testname in funcfullname
                   for enabled_testname in self.enabled_tests)
|
||||
|
||||
|
||||
filter = TestFilter()
|
||||
|
||||
|
||||
def benchmark(track_type, timeout=60):
|
||||
"""Decorator for indicating the benchmark type.
|
||||
|
||||
@@ -319,9 +389,13 @@ def benchmark(track_type, timeout=60):
|
||||
pass
|
||||
"""
|
||||
assert track_type in ['time', 'acc']
|
||||
|
||||
def _wrapper(func):
|
||||
func.unit = TRACK_UNITS[track_type]
|
||||
func.setup = TRACK_SETUP[track_type]
|
||||
func.timeout = timeout
|
||||
if not filter.check(func):
|
||||
# skip if not enabled
|
||||
func.benchmark_name = "skip_" + func.__name__
|
||||
return func
|
||||
return _wrapper
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
. /opt/conda/etc/profile.d/conda.sh
|
||||
|
||||
# build
|
||||
CMAKE_VARS="-DUSE_CUDA=ON"
|
||||
mkdir -p build
|
||||
pushd build
|
||||
cmake $CMAKE_VARS ..
|
||||
make -j
|
||||
popd
|
||||
@@ -13,6 +13,7 @@ pip install asv
|
||||
pip uninstall -y dgl
|
||||
|
||||
export DGL_BENCH_DEVICE=$DEVICE
|
||||
echo "DGL_BENCH_DEVICE=$DGL_BENCH_DEVICE"
|
||||
pushd $ROOT/benchmarks
|
||||
cat asv.conf.json
|
||||
asv machine --yes
|
||||
|
||||
32
benchmarks/scripts/README.md
Normal file
32
benchmarks/scripts/README.md
Normal file
@@ -0,0 +1,32 @@
|
||||
Regression Test Suite
|
||||
========================
|
||||
|
||||
### Spec of task.json
|
||||
```json
|
||||
# Note the test will be run if the name specified below is a substring of the full test name.
|
||||
# The fullname of "benchmarks/model_acc/bench_sage_ns.track_acc" will be "model_acc.bench_sage_ns.track_acc". Test will be run if it contains any keyword.
|
||||
# For example, "model_acc" will run all the tests under "model_acc" folder
|
||||
# "bench_sage" will run both "bench_sage" and "bench_sage_ns"
|
||||
# "bench_sage." will only run "bench_sage"
|
||||
# "ns" will run any test whose name contains "ns"
|
||||
# "" will run all tests
|
||||
{
|
||||
"c5.9xlarge": { # The instance type to run the test
|
||||
"tests": [
|
||||
"bench_sage" # The test to be run on this instance
|
||||
],
|
||||
"env": {
|
||||
"DEVICE": "cpu" # The environment variable passed to publish.sh
|
||||
}
|
||||
},
|
||||
"g4dn.2xlarge": {
|
||||
...
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
### Environment variable
|
||||
- `MOUNT_PATH` specifies the directory on the host to be mapped into Docker. If it is set, `MOUNT_PATH` (on the host) is mounted at `/tmp/dataset` (in Docker)
|
||||
- `INSTANCE_TYPE` specify the current instance type
|
||||
- `DGL_REG_CONF` specifies the path to `task.json`, relative to the repo root. If it is set, `INSTANCE_TYPE` must also be set
|
||||
20
benchmarks/scripts/build_dgl_asv.sh
Normal file
20
benchmarks/scripts/build_dgl_asv.sh
Normal file
@@ -0,0 +1,20 @@
|
||||
#!/bin/bash
# Build DGL with CMake inside ./build.
# DGL_BENCH_DEVICE selects the build: "cpu" (the default) builds without
# extra CMake options, any other value builds with -DUSE_CUDA=ON.

set -e

. /opt/conda/etc/profile.d/conda.sh

# Default building only with cpu
DEVICE=${DGL_BENCH_DEVICE:-cpu}

# build
if [[ $DEVICE == "cpu" ]]; then
    CMAKE_VARS=""
else
    CMAKE_VARS="-DUSE_CUDA=ON"
fi
mkdir -p build
pushd build
# CMAKE_VARS is intentionally unquoted so an empty value expands to no argument.
cmake $CMAKE_VARS ..
# Cap parallelism at the core count; a bare `make -j` places no limit on the
# number of simultaneous jobs.
make -j"$(nproc)"
popd
|
||||
@@ -7,6 +7,7 @@ set -e
|
||||
pip install -r /asv/torch_gpu_pip.txt
|
||||
pip install pandas rdflib ogb
|
||||
|
||||
|
||||
# install
|
||||
pushd python
|
||||
rm -rf build *.egg-info dist
|
||||
@@ -17,7 +17,6 @@
|
||||
# the host machine.
|
||||
#
|
||||
|
||||
|
||||
if [ $# -eq 2 ]; then
|
||||
MACHINE=$1
|
||||
DEVICE=$2
|
||||
@@ -27,15 +26,51 @@ else
|
||||
fi
|
||||
|
||||
WS_ROOT=/asv/dgl
|
||||
docker pull dgllib/dgl-ci-gpu:conda
|
||||
# Assemble the extra `docker` options from optional environment variables.
# NOTE: `[` requires its closing `]` to be a separate word. The previous
# `"$VAR"]` form only worked by accident when the variable was empty and
# raised "[: missing `]'" whenever it was set.

# Forward DGL_REG_CONF (path to task.json) into the container when provided.
if [ -z "$DGL_REG_CONF" ]; then
    DOCKER_ENV_OPT="$DOCKER_ENV_OPT"
else
    DOCKER_ENV_OPT=" -e DGL_REG_CONF=$DGL_REG_CONF $DOCKER_ENV_OPT"
fi

# Forward INSTANCE_TYPE into the container when provided.
if [ -z "$INSTANCE_TYPE" ]; then
    DOCKER_ENV_OPT="$DOCKER_ENV_OPT"
else
    DOCKER_ENV_OPT=" -e INSTANCE_TYPE=$INSTANCE_TYPE $DOCKER_ENV_OPT"
fi

# Mount the host directory at /tmp/dataset and its dgl_home/ subdirectory at
# /root/.dgl/ when MOUNT_PATH is provided.
if [ -z "$MOUNT_PATH" ]; then
    DOCKER_MOUNT_OPT=""
else
    DOCKER_MOUNT_OPT="-v ${MOUNT_PATH}:/tmp/dataset -v ${MOUNT_PATH}/dgl_home/:/root/.dgl/"
fi
|
||||
|
||||
echo $HOME
|
||||
echo "Mount Point: ${DOCKER_MOUNT_OPT}"
|
||||
echo "Env opt: ${DOCKER_ENV_OPT}"
|
||||
echo "DEVICE: ${DEVICE}"
|
||||
|
||||
# Start the detached benchmark container "dgl-reg".
# Both device modes share every option except the NVIDIA runtime, which is
# appended only for non-cpu runs.
docker_run_args=(--name dgl-reg --rm)
if [[ $DEVICE != "cpu" ]]; then
    docker_run_args+=(--runtime=nvidia)
fi
# $DOCKER_MOUNT_OPT and $DOCKER_ENV_OPT are left unquoted on purpose so that
# each one word-splits into its individual flags (or into nothing when empty).
docker_run_args+=($DOCKER_MOUNT_OPT $DOCKER_ENV_OPT --shm-size="4g" --hostname=$MACHINE)
docker run "${docker_run_args[@]}" -dit dgllib/dgl-ci-gpu:conda /bin/bash
|
||||
|
||||
docker run --name dgl-reg \
|
||||
--rm --runtime=nvidia \
|
||||
--hostname=$MACHINE -dit dgllib/dgl-ci-gpu:conda /bin/bash
|
||||
docker exec dgl-reg mkdir -p $WS_ROOT
|
||||
docker cp ../.git dgl-reg:$WS_ROOT
|
||||
docker cp . dgl-reg:$WS_ROOT/benchmarks/
|
||||
docker cp ../../.git dgl-reg:$WS_ROOT
|
||||
docker cp ../ dgl-reg:$WS_ROOT/benchmarks/
|
||||
docker cp torch_gpu_pip.txt dgl-reg:/asv
|
||||
docker exec dgl-reg bash $WS_ROOT/benchmarks/run.sh $DEVICE
|
||||
docker cp dgl-reg:$WS_ROOT/benchmarks/results .
|
||||
docker cp dgl-reg:$WS_ROOT/benchmarks/html .
|
||||
docker exec $DOCKER_ENV_OPT dgl-reg bash $WS_ROOT/benchmarks/run.sh $DEVICE
|
||||
docker cp dgl-reg:$WS_ROOT/benchmarks/results ../
|
||||
docker cp dgl-reg:$WS_ROOT/benchmarks/html ../
|
||||
docker stop dgl-reg
|
||||
@@ -10,4 +10,7 @@ networkx
|
||||
matplotlib
|
||||
nltk
|
||||
requests[security]
|
||||
tqdm
|
||||
tqdm
|
||||
awscli
|
||||
# 0.6.0 is for pytorch 1.5
|
||||
torchtext==0.6.0
|
||||
18
benchmarks/task.json
Normal file
18
benchmarks/task.json
Normal file
@@ -0,0 +1,18 @@
|
||||
{
|
||||
"c5.9xlarge": {
|
||||
"tests": [
|
||||
""
|
||||
],
|
||||
"env": {
|
||||
"DEVICE": "cpu"
|
||||
}
|
||||
},
|
||||
"g4dn.2xlarge": {
|
||||
"tests": [
|
||||
""
|
||||
],
|
||||
"env": {
|
||||
"DEVICE": "gpu"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,5 @@
|
||||
import argparse, time
|
||||
import argparse
|
||||
import time
|
||||
import numpy as np
|
||||
import networkx as nx
|
||||
import torch
|
||||
@@ -12,6 +13,7 @@ from gcn import GCN
|
||||
#from gcn_mp import GCN
|
||||
#from gcn_spmv import GCN
|
||||
|
||||
|
||||
def evaluate(model, features, labels, mask):
|
||||
model.eval()
|
||||
with torch.no_grad():
|
||||
@@ -22,6 +24,7 @@ def evaluate(model, features, labels, mask):
|
||||
correct = torch.sum(indices == labels)
|
||||
return correct.item() * 1.0 / len(labels)
|
||||
|
||||
|
||||
def main(args):
|
||||
# load and preprocess dataset
|
||||
if args.dataset == 'cora':
|
||||
@@ -122,21 +125,21 @@ if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='GCN')
|
||||
register_data_args(parser)
|
||||
parser.add_argument("--dropout", type=float, default=0.5,
|
||||
help="dropout probability")
|
||||
help="dropout probability")
|
||||
parser.add_argument("--gpu", type=int, default=-1,
|
||||
help="gpu")
|
||||
help="gpu")
|
||||
parser.add_argument("--lr", type=float, default=1e-2,
|
||||
help="learning rate")
|
||||
help="learning rate")
|
||||
parser.add_argument("--n-epochs", type=int, default=200,
|
||||
help="number of training epochs")
|
||||
help="number of training epochs")
|
||||
parser.add_argument("--n-hidden", type=int, default=16,
|
||||
help="number of hidden gcn units")
|
||||
help="number of hidden gcn units")
|
||||
parser.add_argument("--n-layers", type=int, default=1,
|
||||
help="number of hidden gcn layers")
|
||||
help="number of hidden gcn layers")
|
||||
parser.add_argument("--weight-decay", type=float, default=5e-4,
|
||||
help="Weight for L2 loss")
|
||||
help="Weight for L2 loss")
|
||||
parser.add_argument("--self-loop", action='store_true',
|
||||
help="graph self-loop (default=False)")
|
||||
help="graph self-loop (default=False)")
|
||||
parser.set_defaults(self_loop=False)
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
Reference in New Issue
Block a user