🔸 Code Along for A-TE-1: GNN-Based Recommender Systems

stemaway · June 2, 2024, 6:16pm

Here’s a detailed breakdown of the code snippets for each step to guide you through setting up and implementing the GNN model for the Movie Recommender System:

1. Setup and Data Preparation

Make sure all necessary packages are installed and load the dataset:

# Install the pinned dependencies used in this walkthrough; add more libraries as needed.
# The leading "!" runs each line as a shell command from a notebook cell
# (Jupyter/Colab); drop it when running the commands in a terminal.
!pip install torch==1.10.0 torchvision==0.11.1 torchaudio==0.10.0
!pip install torch-geometric==2.0.1
!pip install pandas networkx

import pandas as pd

# Read the two MovieLens tables from the current working directory.
try:
    movies = pd.read_csv('movies.csv')
    ratings = pd.read_csv('ratings.csv')
except FileNotFoundError:
    # A missing file is only reported here; `movies` and/or `ratings` may be
    # left undefined, so later steps fail until the paths are corrected.
    print("Data files not found. Please check the files' path and try again.")
else:
    print("Data loaded successfully!")

Steps for IGMC:

# Clone the "latest" branch of the IGMC repository into ./IGMC
!git clone -b latest https://github.com/muhanzhang/IGMC.git

# Move the files 'util_functions.py', 'data_utils.py', and 'preprocessing.py' from the cloned repository to your current directory so they can be imported as top-level modules

2. Building the Graph Structure

Create a graph from the loaded data and prepare it for use with PyTorch Geometric:

import networkx as nx
from torch_geometric.utils import from_networkx

# Create a bipartite graph from the ratings DataFrame.
# Nodes represent users and movies, edges represent user ratings for movies.
#
# The stock MovieLens ratings.csv names its id columns 'userId'/'movieId';
# normalise them to the 'user_id'/'movie_id' names used below (columns that
# already use the snake_case names are left untouched by rename()).
edges = ratings.rename(columns={'userId': 'user_id', 'movieId': 'movie_id'})
edges = edges[['user_id', 'movie_id', 'rating']].copy()

# User ids and movie ids are both plain integers, so user 1 and movie 1 would
# collapse into a single node. Prefix them to keep the two node sets disjoint.
edges['user_id'] = 'u' + edges['user_id'].astype(str)
edges['movie_id'] = 'm' + edges['movie_id'].astype(str)

G = nx.from_pandas_edgelist(edges, 'user_id', 'movie_id', ['rating'])

# Convert the NetworkX graph to a PyTorch Geometric graph.
# from_networkx relabels nodes to consecutive integers, so string labels are fine.
G_torch = from_networkx(G)

Steps for IGMC:

import torch

# NOTE(review): in the upstream IGMC repository this loader appears to be
# defined in preprocessing.py rather than data_utils.py. Try that module first
# and fall back to the original import so either layout works.
try:
    from preprocessing import load_official_trainvaltest_split
except ImportError:
    from data_utils import load_official_trainvaltest_split

# Load and preprocess the MovieLens dataset using IGMC's utilities.
# The call returns 13 values: user/item side features, the training adjacency
# matrix, then (labels, user indices, item indices) for each of the train,
# validation and test splits, and finally the set of rating classes.
(u_features, v_features, adj_train, train_labels, train_u_indices, train_v_indices, 
 val_labels, val_u_indices, val_v_indices, test_labels, test_u_indices, test_v_indices, 
 class_values) = load_official_trainvaltest_split('ml_100k', testing=True)

# Print to confirm the data is ready for further steps
print("Data loaded and preprocessed successfully using IGMC's tools.")

3. Implementing GNN Models

Define a basic GNN model architecture using PyTorch Geometric, making sure to specify the number of input features and output classes:

import torch
from torch_geometric.nn import GCNConv

class RecommenderGNN(torch.nn.Module):
    """Two-layer GCN that maps node features to per-node outputs.

    Args:
        num_features: Size of each input node feature vector.
        num_classes: Size of each output node vector (e.g. 1 for a single
            predicted rating).
        hidden_channels: Width of the intermediate node representation.
            Defaults to 16, the value that was previously hard-coded.
    """

    def __init__(self, num_features, num_classes, hidden_channels=16):
        super().__init__()
        # First GCN layer transforms the raw features, the second one
        # projects the hidden representation to the output size.
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, data):
        """Run both GCN layers on ``data.x`` / ``data.edge_index``.

        Returns the raw output of the second layer (no final activation), so
        it can be used directly for regression; apply a sigmoid outside the
        model for binary classification.
        """
        x, edge_index = data.x, data.edge_index
        x = torch.relu(self.conv1(x, edge_index))  # non-linearity between layers
        return self.conv2(x, edge_index)

# Example instantiation of the model.
# 'your_num_features' and 'your_num_classes' are placeholders: define them before running this line.
# For instance, 'your_num_features' could be the number of movie genres, and 'your_num_classes' could be 1 for a rating prediction
model = RecommenderGNN(num_features=your_num_features, num_classes=your_num_classes)

Steps for IGMC:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import RGCNConv, GraphConv

class RecommenderGNN(torch.nn.Module):
    """Relational GNN that pools node embeddings into a graph-level prediction.

    One RGCN layer (which uses the edge types) is followed by two GraphConv
    layers, mean pooling over nodes, dropout, and a two-layer MLP head.

    Args:
        num_features: Size of each input node feature vector.
        num_classes: Size of the output vector.
        num_relations: Number of distinct edge types seen by the RGCN layer.
        num_bases: Number of bases used by the RGCN layer's weight
            decomposition.
    """

    def __init__(self, num_features, num_classes, num_relations, num_bases):
        super().__init__()
        self.conv1 = RGCNConv(num_features, 64, num_relations, num_bases=num_bases)
        self.conv2 = GraphConv(64, 128)
        self.conv3 = GraphConv(128, 256)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, data):
        """Return the prediction for ``data``.

        ``data`` must provide ``x``, ``edge_index`` and ``edge_type``. For a
        single graph (no ``batch`` attribute) the result has shape
        ``[num_classes]``. For a mini-batch of graphs (``data.batch`` set, as
        produced by a PyG DataLoader) the result has shape
        ``[num_graphs, num_classes]``.
        """
        x, edge_index, edge_type = data.x, data.edge_index, data.edge_type
        x = F.relu(self.conv1(x, edge_index, edge_type))
        x = F.relu(self.conv2(x, edge_index))
        x = F.relu(self.conv3(x, edge_index))

        # Global mean pooling. Averaging over dim 0 is only correct for one
        # graph; with a mini-batch it would blend every graph into a single
        # vector, so pool per graph when a batch assignment is available.
        batch = getattr(data, 'batch', None)
        if batch is None:
            x = torch.mean(x, dim=0)
        else:
            from torch_geometric.nn import global_mean_pool
            x = global_mean_pool(x, batch)

        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# Example instantiation of the model.
# 'your_num_features', 'your_num_classes', 'your_num_relations', and 'your_num_bases' are
# placeholders: define them before running this statement.
# For instance, 'your_num_relations' could be the number of edge types in your graph
# and 'your_num_bases' could be a hyperparameter for the RGCN layer
model = RecommenderGNN(num_features=your_num_features, num_classes=your_num_classes,
                       num_relations=your_num_relations, num_bases=your_num_bases)

4. Training and Testing the GNN Model

Set up the training loop and define the optimizer and loss function:

from torch.optim import Adam
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR

# Set up the optimizer, loss and learning-rate schedule for the training loop.
# `model` must already be instantiated before this point.
optimizer = Adam(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()  # Using Mean Squared Error for regression
scheduler = StepLR(optimizer, step_size=50, gamma=0.1)  # Multiplies the learning rate by 0.1 every 50 scheduler steps (one step per epoch below)

# Example of how validation and test data might be structured, with some data intentionally masked to simulate missing interactions.
# The line below is only a placeholder: `val_data` must be assigned before the loop runs.
# val_data, test_data = ...

# NOTE: `should_stop_early` is defined further down in this listing; it has to be
# defined (i.e. that cell/section run) before this loop is executed.
for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    # Simulate missing interactions by masking some data entries
    # This is critical for ensuring the model learns to predict unseen movie preferences
    out = model(data)  # Ensure 'data' is properly loaded and prepared
    loss = criterion(out, data.y)  # Assuming data.y are the actual ratings and match the shape of `out`
    loss.backward()
    optimizer.step()
    scheduler.step()  # Adjust the learning rate (called after optimizer.step())

    # Validation logic
    if epoch % 10 == 0:  # Check validation performance every 10 epochs
        model.eval()  # model.train() at the top of the next iteration switches back
        with torch.no_grad():
            val_out = model(val_data)
            val_loss = criterion(val_out, val_data.y)
            print(f'Epoch {epoch}, Validation Loss: {val_loss.item()}')

        # Early stopping logic to halt training if no improvement in validation loss
        if should_stop_early(val_loss):
            print("Stopping early due to lack of improvement in validation loss.")
            break

    # Not reached for the epoch in which early stopping breaks out of the loop.
    print(f'Epoch {epoch}, Training Loss: {loss.item()}')

def should_stop_early(current_val_loss, patience=5, min_delta=0.0):
    """Report whether training should stop because validation loss stalled.

    The best loss seen so far and the number of consecutive checks without
    improvement are remembered between calls (as attributes on this
    function), so call it once per validation pass.

    Args:
        current_val_loss: Latest validation loss; anything ``float()``
            accepts (Python number or single-element tensor). Pass ``None``
            to reset the tracker, e.g. before starting a new training run.
        patience: Number of consecutive non-improving checks that triggers
            stopping.
        min_delta: Minimum decrease over the best loss that counts as an
            improvement.

    Returns:
        ``True`` once ``patience`` consecutive checks failed to improve on the
        best loss, otherwise ``False``.
    """
    if current_val_loss is None:
        should_stop_early._best_loss = None
        should_stop_early._stale_checks = 0
        return False

    loss_value = float(current_val_loss)
    best_loss = getattr(should_stop_early, '_best_loss', None)

    if best_loss is None or loss_value < best_loss - min_delta:
        should_stop_early._best_loss = loss_value
        should_stop_early._stale_checks = 0
        return False

    # A NaN loss never compares as smaller, so it counts as "no improvement".
    stale_checks = getattr(should_stop_early, '_stale_checks', 0) + 1
    should_stop_early._stale_checks = stale_checks
    return stale_checks >= patience

# Testing logic to evaluate the model's ability to predict new or missing interactions
# This might involve measuring the accuracy of predictions against truly unseen data
# test_out = model(test_data)
# test_loss = criterion(test_out, test_data.y)
# print(f'Test Loss: {test_loss.item()}')

Steps for IGMC:

from IGMC.models import IGMC_R_GCN

# Define the IGMC model
# NOTE(review): confirm that IGMC.models really exports a class named IGMC_R_GCN with this
# constructor signature; the class name and arguments may differ in the cloned repository.
# NOTE(review): `num_users` and `num_items` are not defined anywhere in this listing and must be
# set beforehand. `num_relations=1` treats all ratings as a single edge type; verify that is intended.
igmc_model = IGMC_R_GCN(u_features, v_features, adj_train, num_users, num_items, hidden_dim=64, num_relations=1, num_bases=4)

# Set up the training loop with IGMC-specific configurations
# (`Adam` and `StepLR` are the names imported in the previous training section).
optimizer = Adam(igmc_model.parameters(), lr=0.01, weight_decay=1e-4)
criterion = torch.nn.MSELoss()
scheduler = StepLR(optimizer, step_size=50, gamma=0.1)

for epoch in range(200):
    igmc_model.train()
    optimizer.zero_grad()
    out = igmc_model(u_features, v_features, adj_train)
    # The loss is computed only on the (user, item) pairs of the training split.
    # Presumably `out` is a users-by-items score matrix and `train_labels` is a float tensor
    # of matching length - TODO confirm against the model and the loader output.
    loss = criterion(out[train_u_indices, train_v_indices], train_labels)
    loss.backward()
    optimizer.step()
    scheduler.step()

    # Validation logic for IGMC model (every 10 epochs, on the validation pairs)
    if epoch % 10 == 0:
        igmc_model.eval()
        with torch.no_grad():
            val_out = igmc_model(u_features, v_features, adj_train)
            val_loss = criterion(val_out[val_u_indices, val_v_indices], val_labels)
            print(f'Epoch {epoch}, Validation Loss: {val_loss.item()}')

        # `should_stop_early` is the helper defined in the previous training section.
        if should_stop_early(val_loss):
            print("Stopping early due to lack of improvement in validation loss.")
            break

    print(f'Epoch {epoch}, Training Loss: {loss.item()}')

import torch

def _topk_hits(predicted, truth, k):
    """Count the relevant items among each row's top-k predictions.

    Args:
        predicted: 2-D score tensor of shape ``[batch, num_items]``.
        truth: Either a relevance matrix with the same shape as ``predicted``
            (non-zero entries mark relevant items), or a tensor holding one
            relevant item index per row (shape ``[batch]`` or ``[batch, 1]``).
        k: Requested cut-off; clamped to ``num_items``.

    Returns:
        ``(hits, relevant, k_eff)``: float tensors with the number of hits and
        the number of relevant items per row, plus the cut-off actually used.
    """
    k_eff = min(k, predicted.size(1))
    _, top_idx = torch.topk(predicted, k_eff, dim=1)
    if truth.dim() == 2 and truth.shape == predicted.shape:
        relevance = truth != 0
        hits = relevance.gather(1, top_idx).sum(dim=1)
        relevant = relevance.sum(dim=1)
    else:
        # One relevant item per row, given by its index.
        hits = top_idx.eq(truth.reshape(-1, 1)).sum(dim=1)
        relevant = torch.ones_like(hits)
    return hits.float(), relevant.float(), k_eff


def precision_at_k(predicted, truth, k):
    """
    Computes precision@k: the fraction of the top-k recommended items that are
    relevant, aggregated over all rows of ``predicted``.

    See ``_topk_hits`` for the accepted formats of ``truth``. Returns a
    0-dim float tensor; 0 when there is nothing to rank.
    """
    hits, _, k_eff = _topk_hits(predicted, truth, k)
    slots = hits.numel() * k_eff  # total number of recommended positions
    if slots == 0:
        return hits.new_zeros(())
    return hits.sum() / slots


def recall_at_k(predicted, truth, k):
    """
    Computes recall@k: the fraction of all relevant items that appear in the
    top-k recommendations, aggregated over all rows of ``predicted``.

    See ``_topk_hits`` for the accepted formats of ``truth``. Returns a
    0-dim float tensor; 0 when there are no relevant items at all.
    """
    hits, relevant, _ = _topk_hits(predicted, truth, k)
    total = relevant.sum()
    if total == 0:
        return hits.new_zeros(())
    return hits.sum() / total


def f1_at_k(predicted, truth, k):
    """
    Computes F1@k, the harmonic mean of precision@k and recall@k.

    Returns a 0-dim float tensor; 0 when both precision and recall are 0.
    """
    precision = precision_at_k(predicted, truth, k)
    recall = recall_at_k(predicted, truth, k)
    denom = precision + recall
    if denom == 0:
        return torch.zeros_like(denom)
    return 2 * (precision * recall) / denom

# Example usage
# The training loop leaves the model in training mode, so switch to evaluation
# mode (disables dropout) and turn off gradient tracking before scoring the
# held-out data.
model.eval()
with torch.no_grad():
    test_out = model(test_data)
precision_k = precision_at_k(test_out, test_data.y, k=10)
recall_k = recall_at_k(test_out, test_data.y, k=10)
f1_k = f1_at_k(test_out, test_data.y, k=10)
print(f'Precision@10: {precision_k}, Recall@10: {recall_k}, F1@10: {f1_k}')