1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
|
import os.path as osp
import time
import torch
import torch.nn.functional as F
import tqdm
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.multioutput import MultiOutputClassifier
import torch_geometric
from torch_geometric.data import Batch
from torch_geometric.datasets import PPI
from torch_geometric.loader import DataLoader, LinkNeighborLoader
from torch_geometric.nn import GraphSAGE
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'PPI')
train_dataset = PPI(path, split='train')
val_dataset = PPI(path, split='val')
test_dataset = PPI(path, split='test')
# Group all training graphs into a single graph to perform sampling:
train_data = Batch.from_data_list(train_dataset)
loader = LinkNeighborLoader(train_data, batch_size=2048, shuffle=True,
neg_sampling_ratio=1.0, num_neighbors=[10, 10],
num_workers=6, persistent_workers=True)
# Evaluation loaders (one datapoint corresponds to a graph)
train_loader = DataLoader(train_dataset, batch_size=2)
val_loader = DataLoader(val_dataset, batch_size=2)
test_loader = DataLoader(test_dataset, batch_size=2)
if torch.cuda.is_available():
device = torch.device('cuda')
elif torch_geometric.is_xpu_available():
device = torch.device('xpu')
else:
device = torch.device('cpu')
model = GraphSAGE(
in_channels=train_dataset.num_features,
hidden_channels=64,
num_layers=2,
out_channels=64,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
def train():
model.train()
total_loss = total_examples = 0
for data in tqdm.tqdm(loader):
data = data.to(device)
optimizer.zero_grad()
h = model(data.x, data.edge_index)
h_src = h[data.edge_label_index[0]]
h_dst = h[data.edge_label_index[1]]
link_pred = (h_src * h_dst).sum(dim=-1) # Inner product.
loss = F.binary_cross_entropy_with_logits(link_pred, data.edge_label)
loss.backward()
optimizer.step()
total_loss += float(loss) * link_pred.numel()
total_examples += link_pred.numel()
return total_loss / total_examples
@torch.no_grad()
def encode(loader):
model.eval()
xs, ys = [], []
for data in loader:
data = data.to(device)
xs.append(model(data.x, data.edge_index).cpu())
ys.append(data.y.cpu())
return torch.cat(xs, dim=0), torch.cat(ys, dim=0)
@torch.no_grad()
def test():
# Train classifier on training set:
x, y = encode(train_loader)
clf = MultiOutputClassifier(SGDClassifier(loss='log_loss', penalty='l2'))
clf.fit(x, y)
train_f1 = f1_score(y, clf.predict(x), average='micro')
# Evaluate on validation set:
x, y = encode(val_loader)
val_f1 = f1_score(y, clf.predict(x), average='micro')
# Evaluate on test set:
x, y = encode(test_loader)
test_f1 = f1_score(y, clf.predict(x), average='micro')
return train_f1, val_f1, test_f1
times = []
for epoch in range(1, 6):
start = time.time()
loss = train()
print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')
train_f1, val_f1, test_f1 = test()
print(f'Train F1: {train_f1:.4f}, Val F1: {val_f1:.4f}, '
f'Test F1: {test_f1:.4f}')
times.append(time.time() - start)
print(f"Median time per epoch: {torch.tensor(times).median():.4f}s")
|