Here’s a detailed breakdown of the code snippets for each step to guide you through setting up and implementing the GNN model for the Movie Recommender System:
1. Setup and Data Preparation
Make sure all necessary packages are installed and load the dataset:
# Ensure necessary libraries are installed, add more libraries as needed
!pip install torch==1.10.0 torchvision==0.11.1 torchaudio==0.10.0
!pip install torch-geometric==2.0.1
!pip install pandas networkx
import pandas as pd
# Load MovieLens data.
# Read both MovieLens tables; only a missing file is handled here, any
# other read error still propagates.
try:
    movies = pd.read_csv('movies.csv')
    ratings = pd.read_csv('ratings.csv')
except FileNotFoundError:
    print("Data files not found. Please check the files' path and try again.")
else:
    # Reached only when both reads succeeded.
    print("Data loaded successfully!")
Steps for IGMC:
# Clone the IGMC repository
!git clone -b latest https://github.com/muhanzhang/IGMC.git
# Move necessary files 'util_functions.py', 'data_utils.py', 'preprocessing.py' from the cloned repository to your current directory
2. Building the Graph Structure
Create a graph from the loaded data and prepare it for use with PyTorch Geometric:
import networkx as nx
from torch_geometric.utils import from_networkx
# Create a bipartite graph from the ratings DataFrame.
# Nodes represent users and movies, edges represent user ratings for movies.
#
# User IDs and movie IDs are both plain integers, so feeding them to NetworkX
# unchanged would make e.g. user 1 and movie 1 the *same* node. Namespacing the
# labels ("u1" vs "m1") keeps the two node sets disjoint.
# NOTE(review): the stock MovieLens ratings.csv names these columns
# 'userId' / 'movieId'; confirm the column names match your file.
edges = ratings[['user_id', 'movie_id', 'rating']].copy()
edges['user_id'] = 'u' + edges['user_id'].astype(str)
edges['movie_id'] = 'm' + edges['movie_id'].astype(str)
G = nx.from_pandas_edgelist(edges, 'user_id', 'movie_id', ['rating'])

# Convert the NetworkX graph to a PyTorch Geometric graph.
# The 'rating' edge attribute is carried over onto the resulting data object.
G_torch = from_networkx(G)
Steps for IGMC:
import torch
from data_utils import load_official_trainvaltest_split
# Let IGMC's own loader build the MovieLens-100K split. The call returns a
# flat tuple of 13 values, unpacked here as: side features, the training
# adjacency, then (labels, user indices, item indices) for each of
# train / val / test, and finally the rating class values.
(
    u_features, v_features, adj_train,
    train_labels, train_u_indices, train_v_indices,
    val_labels, val_u_indices, val_v_indices,
    test_labels, test_u_indices, test_v_indices,
    class_values,
) = load_official_trainvaltest_split('ml_100k', testing=True)

# Confirmation message so the notebook shows the step finished.
print("Data loaded and preprocessed successfully using IGMC's tools.")
3. Implementing GNN Models
Define a basic GNN model architecture using PyTorch Geometric, making sure the number of input features and output classes is defined before instantiating it:
import torch
from torch_geometric.nn import GCNConv
class RecommenderGNN(torch.nn.Module):
    """Two-layer GCN that maps node features to ``num_classes`` outputs per node.

    Args:
        num_features: Size of each input node feature vector.
        num_classes: Size of each output vector (e.g. 1 for rating regression).
    """

    def __init__(self, num_features, num_classes):
        super().__init__()
        hidden_channels = 16
        # Input features -> hidden representation.
        self.conv1 = GCNConv(num_features, hidden_channels)
        # Hidden representation -> output scores.
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, data):
        """Run both GCN layers over ``data.x`` / ``data.edge_index``.

        The raw second-layer output is returned (no final activation), so it
        can be used directly for regression; apply a sigmoid outside for
        binary classification.
        """
        hidden = self.conv1(data.x, data.edge_index).relu()
        return self.conv2(hidden, data.edge_index)


# Example instantiation of the model.
# 'your_num_features' and 'your_num_classes' must be defined beforehand:
# for instance, 'your_num_features' could be the number of movie genres and
# 'your_num_classes' could be 1 for a rating prediction.
model = RecommenderGNN(num_features=your_num_features, num_classes=your_num_classes)
Steps for IGMC:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import RGCNConv, GraphConv
class RecommenderGNN(torch.nn.Module):
    """Relational GNN producing one prediction vector per graph.

    Architecture: RGCNConv -> GraphConv -> GraphConv -> mean pooling ->
    dropout -> two fully connected layers.

    Args:
        num_features: Size of each input node feature vector.
        num_classes: Size of the output vector for each graph.
        num_relations: Number of edge types handled by the RGCN layer.
        num_bases: Number of basis matrices used by the RGCN layer.
    """

    def __init__(self, num_features, num_classes, num_relations, num_bases):
        super(RecommenderGNN, self).__init__()
        self.conv1 = RGCNConv(num_features, 64, num_relations, num_bases=num_bases)
        self.conv2 = GraphConv(64, 128)
        self.conv3 = GraphConv(128, 256)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, data):
        """Compute graph-level predictions.

        Args:
            data: Object exposing ``x``, ``edge_index`` and ``edge_type``; an
                optional ``batch`` vector assigns each node to a graph.

        Returns:
            A tensor of shape ``(num_classes,)`` when ``data`` has no ``batch``
            vector, otherwise ``(num_graphs, num_classes)``.
        """
        x, edge_index, edge_type = data.x, data.edge_index, data.edge_type
        x = F.relu(self.conv1(x, edge_index, edge_type))
        x = F.relu(self.conv2(x, edge_index))
        x = F.relu(self.conv3(x, edge_index))

        # Global mean pooling. Averaging over dim 0 is only correct for a
        # single graph; with a mini-batch it would blend the nodes of every
        # graph together, so pool per graph when a batch vector is present.
        batch = getattr(data, 'batch', None)
        if batch is None:
            x = torch.mean(x, dim=0)
        else:
            num_graphs = int(batch.max()) + 1
            summed = x.new_zeros(num_graphs, x.size(1)).index_add_(0, batch, x)
            # clamp guards against a graph id that owns no nodes (avoids 0/0).
            counts = torch.bincount(batch, minlength=num_graphs).clamp(min=1)
            x = summed / counts.unsqueeze(1).to(x.dtype)

        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(self.fc1(x))
        return self.fc2(x)


# Example instantiation of the model.
# 'your_num_features', 'your_num_classes', 'your_num_relations' and
# 'your_num_bases' must be defined beforehand. For instance,
# 'your_num_relations' could be the number of edge types in your graph and
# 'your_num_bases' is a hyperparameter of the RGCN layer.
model = RecommenderGNN(num_features=your_num_features, num_classes=your_num_classes,
                       num_relations=your_num_relations, num_bases=your_num_bases)
4. Training and Testing the GNN Model
Set up the training loop and define the optimizer, loss function, and learning-rate scheduler:
from torch.optim import Adam
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
def should_stop_early(current_val_loss, patience=5, min_delta=0.0):
    """Report whether validation loss has stopped improving.

    Defined before the training loop because the loop calls it on its first
    validation check. The best loss seen so far and the number of consecutive
    non-improving checks are kept on the function itself; call
    ``should_stop_early.reset()`` before starting a new training run.

    Args:
        current_val_loss: Latest validation loss (a float or a 0-dim tensor).
        patience: Number of consecutive checks without improvement that
            triggers a stop.
        min_delta: Minimum decrease relative to the best loss that counts as
            an improvement.

    Returns:
        True when training should stop, False otherwise.
    """
    loss_value = float(current_val_loss)
    state = should_stop_early.state
    if loss_value < state['best_loss'] - min_delta:
        state['best_loss'] = loss_value
        state['stale_checks'] = 0
        return False
    # No improvement (a NaN loss also lands here, since NaN < x is False).
    state['stale_checks'] += 1
    if state['stale_checks'] >= patience:
        # Clear the history so a later run does not inherit this one's best loss.
        should_stop_early.reset()
        return True
    return False


def _reset_early_stopping():
    """Forget the best loss and the stale-check counter."""
    should_stop_early.state = {'best_loss': float('inf'), 'stale_checks': 0}


should_stop_early.reset = _reset_early_stopping
should_stop_early.reset()

# Set up the training loop
optimizer = Adam(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()  # Mean squared error, for rating regression
scheduler = StepLR(optimizer, step_size=50, gamma=0.1)  # Multiply the LR by 0.1 every 50 epochs

# Example of how validation and test data might be structured, with some data
# intentionally masked to simulate missing interactions
# val_data, test_data = ...

should_stop_early.reset()  # start this run with a clean early-stopping history
for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    # 'data' must already be loaded and prepared. Masking some interactions out
    # of it is what lets the model learn to predict unseen movie preferences.
    out = model(data)
    loss = criterion(out, data.y)  # Assuming data.y holds the actual ratings
    loss.backward()
    optimizer.step()
    scheduler.step()  # Adjust the learning rate

    # Validation: check performance every 10 epochs
    if epoch % 10 == 0:
        model.eval()
        with torch.no_grad():
            val_out = model(val_data)
            val_loss = criterion(val_out, val_data.y)
        print(f'Epoch {epoch}, Validation Loss: {val_loss.item()}')
        # Halt training once validation loss has stopped improving
        if should_stop_early(val_loss):
            print("Stopping early due to lack of improvement in validation loss.")
            break
    print(f'Epoch {epoch}, Training Loss: {loss.item()}')
# Testing logic to evaluate the model's ability to predict new or missing interactions
# This might involve measuring the accuracy of predictions against truly unseen data
# test_out = model(test_data)
# test_loss = criterion(test_out, test_data.y)
# print(f'Test Loss: {test_loss.item()}')
Steps for IGMC:
# NOTE(review): confirm that the cloned repository's models.py really exports a
# class named IGMC_R_GCN with the constructor and forward signature used below.
from IGMC.models import IGMC_R_GCN

# The model constructor needs the user/item counts, which were never defined.
# assumes adj_train is a (num_users x num_items) rating matrix — TODO confirm
num_users, num_items = adj_train.shape

# Define the IGMC model
igmc_model = IGMC_R_GCN(u_features, v_features, adj_train, num_users, num_items,
                        hidden_dim=64, num_relations=1, num_bases=4)

# Set up the training loop with IGMC-specific configurations
optimizer = Adam(igmc_model.parameters(), lr=0.01, weight_decay=1e-4)
criterion = torch.nn.MSELoss()
scheduler = StepLR(optimizer, step_size=50, gamma=0.1)

# MSELoss needs float tensors as targets; convert once, outside the loop.
train_targets = torch.as_tensor(train_labels, dtype=torch.float)
val_targets = torch.as_tensor(val_labels, dtype=torch.float)

# Start this run with a clean early-stopping history when the helper offers a
# reset hook, so results of an earlier training run cannot trigger a stop here.
_reset_hook = getattr(should_stop_early, 'reset', None)
if callable(_reset_hook):
    _reset_hook()

for epoch in range(200):
    igmc_model.train()
    optimizer.zero_grad()
    out = igmc_model(u_features, v_features, adj_train)
    # Only the (user, item) pairs in the training split contribute to the loss.
    loss = criterion(out[train_u_indices, train_v_indices], train_targets)
    loss.backward()
    optimizer.step()
    scheduler.step()

    # Validation logic for IGMC model: check every 10 epochs
    if epoch % 10 == 0:
        igmc_model.eval()
        with torch.no_grad():
            val_out = igmc_model(u_features, v_features, adj_train)
            val_loss = criterion(val_out[val_u_indices, val_v_indices], val_targets)
        print(f'Epoch {epoch}, Validation Loss: {val_loss.item()}')
        if should_stop_early(val_loss):
            print("Stopping early due to lack of improvement in validation loss.")
            break
    print(f'Epoch {epoch}, Training Loss: {loss.item()}')
import torch
def precision_at_k(predicted, truth, k):
    """Compute precision@k: the fraction of top-k recommendations that are relevant.

    Args:
        predicted: Score tensor of shape ``(batch_size, num_items)``.
        truth: Either one relevant item index per row (``batch_size`` elements),
            or a relevance matrix with the same shape as ``predicted`` where
            entries greater than zero mark relevant items. A tensor whose shape
            equals ``predicted``'s is always read as a relevance matrix.
        k: Number of top-scoring items to consider per row; values larger than
            ``num_items`` are clamped to ``num_items``.

    Returns:
        A 0-dim float tensor: total hits divided by ``batch_size * k``
        (0 for an empty batch or a non-positive ``k``).
    """
    batch_size, num_items = predicted.size(0), predicted.size(1)
    k = min(k, num_items)  # topk raises if asked for more items than exist
    if batch_size == 0 or k <= 0:
        return torch.zeros((), device=predicted.device)

    _, top_idx = torch.topk(predicted, k, dim=1)
    if truth.dim() == 2 and truth.shape == predicted.shape:
        # Relevance matrix: look up the relevance flag of each recommended item.
        hits = truth.gather(1, top_idx) > 0
    else:
        # Index form: a hit is a recommended index equal to the row's true item.
        hits = top_idx.eq(truth.view(-1, 1))
    return hits.sum().float() / (batch_size * k)
def recall_at_k(predicted, truth, k):
    """Compute recall@k: the fraction of relevant items that appear in the top k.

    Args:
        predicted: Score tensor of shape ``(batch_size, num_items)``.
        truth: Either one relevant item index per row (``batch_size`` elements),
            or a relevance matrix with the same shape as ``predicted`` where
            entries greater than zero mark relevant items. A tensor whose shape
            equals ``predicted``'s is always read as a relevance matrix.
        k: Number of top-scoring items to consider per row; values larger than
            ``num_items`` are clamped to ``num_items``.

    Returns:
        A 0-dim float tensor: total hits divided by the total number of
        relevant items across the batch (0 when there are none).
    """
    batch_size, num_items = predicted.size(0), predicted.size(1)
    k = min(k, num_items)  # topk raises if asked for more items than exist
    if batch_size == 0 or k <= 0:
        return torch.zeros((), device=predicted.device)

    _, top_idx = torch.topk(predicted, k, dim=1)
    if truth.dim() == 2 and truth.shape == predicted.shape:
        relevant = truth > 0
        hits = relevant.gather(1, top_idx)
        total_relevant = int(relevant.sum())
    else:
        hits = top_idx.eq(truth.view(-1, 1))
        total_relevant = batch_size  # index form: exactly one relevant item per row

    if total_relevant == 0:
        # Nothing to retrieve; report 0 rather than dividing by zero.
        return torch.zeros((), device=predicted.device)
    return hits.sum().float() / total_relevant
def f1_at_k(predicted, truth, k):
    """Compute F1@k, the harmonic mean of precision@k and recall@k.

    Args:
        predicted: Score tensor passed through to ``precision_at_k`` and
            ``recall_at_k``.
        truth: Ground-truth tensor passed through to both metrics.
        k: Number of top-scoring items to consider per row.

    Returns:
        ``2 * P * R / (P + R)``, or 0 when both precision and recall are 0
        (the plain formula would yield NaN in that case).
    """
    precision = precision_at_k(predicted, truth, k)
    recall = recall_at_k(predicted, truth, k)
    denominator = precision + recall
    if denominator == 0:
        return torch.zeros_like(denominator)
    return 2 * (precision * recall) / denominator
# Example usage
# Evaluate in inference mode: eval() switches off dropout, and no_grad() avoids
# building an autograd graph for a forward pass that is never back-propagated.
model.eval()
with torch.no_grad():
    test_out = model(test_data)

# NOTE(review): the metrics match `truth` against the indices of the top-k
# scores, so test_data.y must hold relevant-item labels rather than raw
# ratings — confirm before relying on these numbers.
precision_k = precision_at_k(test_out, test_data.y, k=10)
recall_k = recall_at_k(test_out, test_data.y, k=10)
f1_k = f1_at_k(test_out, test_data.y, k=10)
print(f'Precision@10: {precision_k}, Recall@10: {recall_k}, F1@10: {f1_k}')