1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
|
import os
import os.path as osp
from typing import Callable, List, Optional
import torch
from torch_geometric.data import (
HeteroData,
InMemoryDataset,
download_url,
extract_zip,
)
from torch_geometric.io import fs
# Column names applied to the pipe-separated movie file (``u.item``).
MOVIE_HEADERS = [
    # Descriptive columns.
    "movieId",
    "title",
    "releaseDate",
    "videoReleaseDate",
    "IMDb URL",
    # Genre columns. Everything from "Action" onwards (index 6 and up) is
    # what the dataset uses as the movie feature matrix.
    "unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]

# Column names applied to the pipe-separated user file (``u.user``).
USER_HEADERS = [
    "userId",
    "age",
    "gender",
    "occupation",
    "zipCode",
]

# Column names applied to the tab-separated rating files
# (``u1.base`` and ``u1.test``).
RATING_HEADERS = [
    "userId",
    "movieId",
    "rating",
    "timestamp",
]
class MovieLens100K(InMemoryDataset):
    r"""The MovieLens 100K heterogeneous rating dataset, assembled by GroupLens
    Research from the `MovieLens web site <https://movielens.org>`__,
    consisting of movies (1,682 nodes) and users (943 nodes) with 100K
    ratings between them.
    User ratings for movies are available as ground truth labels.
    Features of users and movies are encoded according to the `"Inductive
    Matrix Completion Based on Graph Neural Networks"
    <https://arxiv.org/abs/1904.12058>`__ paper.

    Args:
        root (str): Root directory where the dataset should be saved.
        transform (callable, optional): A function/transform that takes in an
            :obj:`torch_geometric.data.HeteroData` object and returns a
            transformed version. The data object will be transformed before
            every access. (default: :obj:`None`)
        pre_transform (callable, optional): A function/transform that takes in
            an :obj:`torch_geometric.data.HeteroData` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)
        force_reload (bool, optional): Whether to re-process the dataset.
            (default: :obj:`False`)

    **STATS:**

    .. list-table::
        :widths: 20 10 10 10
        :header-rows: 1

        * - Node/Edge Type
          - #nodes/#edges
          - #features
          - #tasks
        * - Movie
          - 1,682
          - 18
          -
        * - User
          - 943
          - 24
          -
        * - User-Movie
          - 80,000
          - 1
          - 1
    """

    url = 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'

    def __init__(
        self,
        root: str,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
        force_reload: bool = False,
    ) -> None:
        """Initialise the base dataset, then load the processed graph."""
        super().__init__(
            root,
            transform,
            pre_transform,
            force_reload=force_reload,
        )
        self.load(self.processed_paths[0], data_cls=HeteroData)

    @property
    def raw_file_names(self) -> List[str]:
        """Raw files in order: movies, users, training and test ratings."""
        return ['u.item', 'u.user', 'u1.base', 'u1.test']

    @property
    def processed_file_names(self) -> str:
        """Name of the single file the processed graph is saved to."""
        return 'data.pt'

    def download(self) -> None:
        """Fetch and unpack the archive, then move its folder to ``raw_dir``.

        The zip is downloaded into ``root``, extracted there and deleted.
        Any existing ``raw_dir`` is removed before the extracted
        ``ml-100k`` folder is renamed to take its place.
        """
        archive = download_url(self.url, self.root)
        extract_zip(archive, self.root)
        os.remove(archive)

        extracted_dir = osp.join(self.root, 'ml-100k')
        fs.rm(self.raw_dir)
        os.rename(extracted_dir, self.raw_dir)

    def process(self) -> None:
        """Build the heterogeneous graph from the raw files and save it.

        Movie features are the genre columns ``MOVIE_HEADERS[6:]``. User
        features are the age divided by the maximum age, followed by the
        dummy-encoded gender and occupation columns. Training ratings
        become ``('user', 'rates', 'movie')`` edges plus flipped
        ``('movie', 'rated_by', 'user')`` edges, each carrying ``rating``
        and ``time``. Test ratings are stored as ``edge_label_index`` and
        ``edge_label`` on the ``rates`` edge type.
        """
        import pandas as pd

        # Options shared by the two '|'-separated entity files and by the
        # two tab-separated rating files, respectively.
        entity_csv = dict(sep='|', header=None, encoding='ISO-8859-1')
        rating_csv = dict(sep='\t', header=None, names=RATING_HEADERS)

        graph = HeteroData()

        # Movies: genre columns become the node feature matrix.
        movies = pd.read_csv(
            self.raw_paths[0],
            names=MOVIE_HEADERS,
            index_col='movieId',
            **entity_csv,
        )
        # Raw movie id -> consecutive node index (row position in the file).
        movie_index = {
            raw_id: pos
            for pos, raw_id in enumerate(movies.index)
        }
        genres = movies[MOVIE_HEADERS[6:]].values
        graph['movie'].x = torch.from_numpy(genres).to(torch.float)

        # Users: scaled age, then dummy-encoded gender and occupation.
        users = pd.read_csv(
            self.raw_paths[1],
            names=USER_HEADERS,
            index_col='userId',
            **entity_csv,
        )
        # Raw user id -> consecutive node index (row position in the file).
        user_index = {
            raw_id: pos
            for pos, raw_id in enumerate(users.index)
        }
        scaled_age = users['age'].values / users['age'].values.max()
        user_features = [
            torch.from_numpy(scaled_age).to(torch.float).view(-1, 1),
            torch.from_numpy(
                users['gender'].str.get_dummies().values).to(torch.float),
            torch.from_numpy(
                users['occupation'].str.get_dummies().values).to(torch.float),
        ]
        graph['user'].x = torch.cat(user_features, dim=-1)

        def read_ratings(path):
            # Parse one rating file; return the frame together with the
            # [2, num_ratings] (user, movie) index tensor.
            frame = pd.read_csv(path, **rating_csv)
            user_ids = [user_index[raw_id] for raw_id in frame['userId']]
            movie_ids = [movie_index[raw_id] for raw_id in frame['movieId']]
            return frame, torch.tensor([user_ids, movie_ids])

        # Training ratings: message-passing edges in both directions.
        train, edge_index = read_ratings(self.raw_paths[2])
        rating = torch.from_numpy(train['rating'].values).to(torch.long)
        timestamp = torch.from_numpy(train['timestamp'].values)

        rates = graph['user', 'rates', 'movie']
        rates.edge_index = edge_index
        rates.rating = rating
        rates.time = timestamp

        rated_by = graph['movie', 'rated_by', 'user']
        rated_by.edge_index = edge_index.flip([0])
        rated_by.rating = rating
        rated_by.time = timestamp

        # Test ratings: supervision targets on the forward edge type only.
        test, edge_label_index = read_ratings(self.raw_paths[3])
        rates = graph['user', 'rates', 'movie']
        rates.edge_label_index = edge_label_index
        rates.edge_label = torch.from_numpy(
            test['rating'].values).to(torch.float)

        if self.pre_transform is not None:
            graph = self.pre_transform(graph)

        self.save([graph], self.processed_paths[0])
|