🔸 Code Along for A-TE-2: Building an NLP Pipeline - Recommender Systems

1. Setup and Data Preparation

# Install necessary libraries
!pip install numpy pandas scikit-learn tensorflow gensim openai

# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import openai

# Read the MovieLens table from disk
movielens_data = pd.read_csv('path/to/movielens.csv')

# Cast both identifier columns to strings in a single astype call
movielens_data = movielens_data.astype({'userId': str, 'movieId': str})

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
train_data, test_data = train_test_split(
    movielens_data,
    test_size=0.2,
    random_state=42,
)

2. Embedding the Data

# Basic embedding with OpenAI API
import os

# Prefer the OPENAI_API_KEY environment variable so a real secret never has to
# be pasted into the notebook; the original placeholder remains the fallback.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'your-api-key')

def get_embedding(text):
    """Return the embedding vector for ``text`` from the OpenAI embeddings API.

    Works with both generations of the ``openai`` package: releases >= 1.0
    removed ``openai.Embedding`` in favour of ``openai.embeddings`` and return
    response objects instead of dicts, so the call is dispatched on which
    interface the installed package exposes.

    Args:
        text: The string to embed.

    Returns:
        The embedding of ``text`` as returned by the API for the
        "text-embedding-ada-002" model.
    """
    model = "text-embedding-ada-002"
    if hasattr(openai, "OpenAI"):
        # openai >= 1.0: module-level client, attribute-style response.
        response = openai.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    # openai < 1.0: legacy resource class, dict-style response.
    response = openai.Embedding.create(input=[text], model=model)
    return response['data'][0]['embedding']

# Example: Get embedding for a movie description
# NOTE: get_embedding() sends a request to the OpenAI API, so this cell needs
# network access and a working API key to run.
movie_description = "A young wizard's journey to defeat the dark forces."
embedding = get_embedding(movie_description)

3a. Developing the Recommender System (Cosine Similarity)

# Basic recommender system using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# Assume embeddings_df is a DataFrame with movieId and corresponding embeddings
def recommend_movies(movie_id, embeddings_df, top_n=5):
    """Return the rows of ``embeddings_df`` most similar to ``movie_id``.

    Similarity is the cosine similarity between the query movie's embedding
    and every embedding in ``embeddings_df``. Rows belonging to the query
    movie itself are excluded, since a movie is trivially most similar to
    itself and would otherwise occupy the first recommendation slot.

    Args:
        movie_id: Value to look up in the ``movieId`` column.
        embeddings_df: DataFrame with a ``movieId`` column and an
            ``embedding`` column holding one vector per row (vectors are
            assumed to share the same length, as cosine_similarity requires).
        top_n: Maximum number of recommendations to return; values below
            zero are treated as zero.

    Returns:
        The selected rows of ``embeddings_df``, ordered from most to least
        similar. Fewer than ``top_n`` rows are returned when there are not
        enough other movies.

    Raises:
        IndexError: If ``movie_id`` does not occur in ``embeddings_df``.
    """
    is_query = (embeddings_df['movieId'] == movie_id).to_numpy()
    if not is_query.any():
        # Same exception type the bare ``.values[0]`` lookup raised before,
        # but with a message that names the actual problem.
        raise IndexError(f"movieId {movie_id!r} not found in embeddings_df")

    all_embeddings = embeddings_df['embedding'].tolist()
    # argmax on a boolean mask yields the position of the first match.
    query_embedding = all_embeddings[int(np.argmax(is_query))]
    similarities = cosine_similarity([query_embedding], all_embeddings)[0]

    # Positions sorted by descending similarity, with the query rows removed.
    ranked = np.argsort(similarities)[::-1]
    ranked = ranked[~is_query[ranked]]
    return embeddings_df.iloc[ranked[:max(top_n, 0)]]

# Example: Recommend movies similar to a given movieId
# NOTE(review): embeddings_df is not defined anywhere in the visible code; it
# has to be built beforehand as a DataFrame with the 'movieId' and 'embedding'
# columns that recommend_movies reads.
# The id is passed as a string — presumably to match movieId values that were
# cast to str during preprocessing; TODO confirm against how embeddings_df is built.
recommendations = recommend_movies('12345', embeddings_df)

3b. Developing the Recommender System (Neural Collaborative Filtering)

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

class NCF(nn.Module):
    """Neural collaborative filtering model scoring (user, item) index pairs.

    Each user index and item index is mapped to a learned embedding; the two
    embeddings are concatenated and passed through a three-layer MLP whose
    final output is squashed by a sigmoid.
    """

    def __init__(self, num_users, num_items, embedding_dim):
        super().__init__()
        # Lookup tables: one learnable vector per user / item index.
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        # MLP over the concatenated pair: 2 * embedding_dim -> 128 -> 64 -> 1.
        concat_dim = 2 * embedding_dim
        self.fc1 = nn.Linear(concat_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, user, item):
        """Return sigmoid scores for the given user and item index tensors."""
        hidden = torch.cat(
            (self.user_embedding(user), self.item_embedding(item)),
            dim=-1,
        )
        # Both hidden layers use a ReLU activation.
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        return self.sigmoid(self.fc3(hidden))

# Minimal illustrative dataset wrapping three parallel sequences
class MovieLensDataset(Dataset):
    """Dataset yielding ``(user_id, item_id, rating)`` triples by position."""

    def __init__(self, user_ids, item_ids, ratings):
        # The three sequences are stored as given; no copy is made.
        self.user_ids, self.item_ids, self.ratings = user_ids, item_ids, ratings

    def __len__(self):
        # The length is taken from the user id sequence.
        return len(self.user_ids)

    def __getitem__(self, idx):
        # Index every column at the same position and bundle the results.
        columns = (self.user_ids, self.item_ids, self.ratings)
        return tuple(column[idx] for column in columns)

# Synthetic interactions: random user/item indices with random 0/1 labels
num_users = 1000
num_items = 1700
user_ids = torch.randint(low=0, high=num_users, size=(10000,))
item_ids = torch.randint(low=0, high=num_items, size=(10000,))
ratings = torch.randint(low=0, high=2, size=(10000,)).to(torch.float32)

dataset = MovieLensDataset(user_ids, item_ids, ratings)
dataloader = DataLoader(dataset=dataset, batch_size=64, shuffle=True)

# Build the model, then its binary cross-entropy loss and Adam optimizer
embedding_dim = 50
model = NCF(
    num_users=num_users,
    num_items=num_items,
    embedding_dim=embedding_dim,
)
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Training loop
# Each batch performs a full optimisation step: clear stale gradients, run the
# forward pass, backpropagate the loss, then update the parameters. Without
# the backward/step calls the model weights would never change.
model.train()
for epoch in range(5):
    epoch_loss = 0.0
    num_batches = 0
    for user, item, rating in dataloader:
        optimizer.zero_grad()
        outputs = model(user, item)
        # squeeze(-1) drops only the trailing size-1 dimension, so a batch
        # containing a single sample still matches the shape of `rating`.
        loss = criterion(outputs.squeeze(-1), rating)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        num_batches += 1

    # Report the mean batch loss for the epoch; max() guards an empty loader.
    print(f'Epoch {epoch+1}, Loss: {epoch_loss / max(num_batches, 1)}')

For another advanced approach, refer to 🔸 A-TE-1: GNN-Based Recommender Systems.

4. Model Evaluation and Refinement

# Evaluation metrics
from sklearn.metrics import precision_score, recall_score, f1_score

# Toy labels used to demonstrate the metric calls
y_true = [1, 0, 1, 1, 0]  # Example ground truth labels
y_pred = [1, 0, 1, 0, 0]  # Example predicted labels

# Evaluate the three scorers in order: precision, recall, then F1
precision, recall, f1 = (
    score_fn(y_true, y_pred)
    for score_fn in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision}, Recall: {recall}, F1-Score: {f1}")