1. Setup and Data Preparation
# Install necessary libraries
!pip install numpy pandas scikit-learn tensorflow gensim openai
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import openai
# Load the MovieLens ratings and do minimal preprocessing.
movielens_data = pd.read_csv('path/to/movielens.csv')
movielens_data.head()

# Discard incomplete rows, then treat both id columns as opaque string keys
# (they are categorical identifiers, not numbers).
movielens_data = movielens_data.dropna()
movielens_data = movielens_data.astype({'userId': str, 'movieId': str})

# Hold out 20% of the ratings for evaluation; fixed seed for reproducibility.
train_data, test_data = train_test_split(movielens_data, test_size=0.2, random_state=42)
2. Embedding the Data
# Basic embedding with OpenAI API
openai.api_key = 'your-api-key'

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector (list of floats) for a single text string.

    Parameters
    ----------
    text : str
        The text to embed.
    model : str, optional
        OpenAI embedding model name. Defaults to "text-embedding-ada-002",
        matching the original hard-coded value, but can now be overridden.
    """
    # NOTE(review): this is the pre-1.0 openai SDK surface
    # (openai.Embedding.create and dict-style response access); openai>=1.0
    # renamed it to client.embeddings.create — confirm the pinned SDK version.
    response = openai.Embedding.create(input=[text], model=model)
    return response['data'][0]['embedding']

# Example: Get embedding for a movie description
movie_description = "A young wizard's journey to defeat the dark forces."
embedding = get_embedding(movie_description)
print(embedding)
3a. Developing the Recommender System (Cosine Similarity)
# Basic recommender system using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
# Assume embeddings_df is a DataFrame with movieId and corresponding embeddings
def recommend_movies(movie_id, embeddings_df, top_n=5):
    """Return the top_n movies most similar to movie_id by cosine similarity.

    Parameters
    ----------
    movie_id : str
        Id of the query movie; must appear in embeddings_df['movieId'].
    embeddings_df : pandas.DataFrame
        Must have a 'movieId' column and an 'embedding' column holding
        equal-length numeric vectors.
    top_n : int, optional
        Number of recommendations to return (default 5).

    Returns
    -------
    pandas.DataFrame
        Rows of embeddings_df for the most similar movies, best first.
        The query movie itself is excluded.

    Raises
    ------
    IndexError
        If movie_id is not present in embeddings_df (same as the original).
    """
    query = np.asarray(
        embeddings_df.loc[embeddings_df['movieId'] == movie_id, 'embedding'].values[0],
        dtype=float,
    )
    matrix = np.asarray(embeddings_df['embedding'].tolist(), dtype=float)
    # Cosine similarity computed directly with numpy; guard against
    # zero-norm vectors so we never divide by zero.
    denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    sims = (matrix @ query) / np.where(denom == 0.0, 1.0, denom)
    ranked = np.argsort(sims)[::-1]
    # Bug fix: the original always returned the query movie as its own best
    # match (similarity 1.0), wasting one of the top_n slots — drop it.
    ranked = ranked[embeddings_df['movieId'].to_numpy()[ranked] != movie_id]
    return embeddings_df.iloc[ranked[:top_n]]
# Example: Recommend movies similar to a given movieId
# NOTE(review): embeddings_df is assumed to exist already (a DataFrame with
# 'movieId' and 'embedding' columns, per the comment above recommend_movies) —
# it is never constructed in this file; confirm it is built upstream.
recommendations = recommend_movies('12345', embeddings_df)
print(recommendations)
3b. Developing the Recommender System (Neural Collaborative Filtering)
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
class NCF(nn.Module):
    """Neural collaborative filtering model.

    Embeds users and items, concatenates the two embeddings, and passes
    them through a small MLP ending in a sigmoid, producing a score in
    (0, 1) per (user, item) pair.
    """

    def __init__(self, num_users, num_items, embedding_dim):
        super(NCF, self).__init__()
        # Submodule names and creation order are kept so checkpoints load.
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim * 2, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, user, item):
        # Look up both embeddings and join them along the feature axis.
        features = torch.cat(
            [self.user_embedding(user), self.item_embedding(item)], dim=-1
        )
        hidden = torch.relu(self.fc1(features))
        hidden = torch.relu(self.fc2(hidden))
        return self.sigmoid(self.fc3(hidden))
# Dummy dataset for illustration
class MovieLensDataset(Dataset):
    """Wraps parallel sequences of user ids, item ids, and ratings.

    All three sequences are assumed to have the same length; each index
    yields one (user, item, rating) training example.
    """

    def __init__(self, user_ids, item_ids, ratings):
        self.user_ids, self.item_ids, self.ratings = user_ids, item_ids, ratings

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):
        return self.user_ids[idx], self.item_ids[idx], self.ratings[idx]
# Example data: 10k random (user, item, binary-rating) triples.
# The three randint calls stay in the original order so the RNG stream
# produces identical tensors.
num_users, num_items = 1000, 1700
user_ids = torch.randint(0, num_users, (10000,))
item_ids = torch.randint(0, num_items, (10000,))
ratings = torch.randint(0, 2, (10000,)).float()

dataset = MovieLensDataset(user_ids, item_ids, ratings)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Instantiate model, loss function, and optimizer.
embedding_dim = 50
model = NCF(num_users, num_items, embedding_dim)
criterion = nn.BCELoss()  # binary targets, sigmoid output
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Training loop: 5 epochs of mini-batch SGD over the dataloader.
for epoch in range(5):
    for user, item, rating in dataloader:
        optimizer.zero_grad()
        # squeeze the (batch, 1) output to match the (batch,) targets
        predictions = model(user, item).squeeze()
        loss = criterion(predictions, rating)
        loss.backward()
        optimizer.step()
    # NOTE(review): source indentation was lost; this reports the loss of the
    # last mini-batch once per epoch, the conventional reading of the message.
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')
4. Model Evaluation and Refinement
# Evaluation metrics
from sklearn.metrics import precision_score, recall_score, f1_score

# Example: Calculate Precision, Recall, and F1-Score on toy binary labels.
y_true = [1, 0, 1, 1, 0]  # Example ground truth labels
y_pred = [1, 0, 1, 0, 0]  # Example predicted labels

# Apply each scorer to the same label pair.
precision, recall, f1 = (
    metric(y_true, y_pred)
    for metric in (precision_score, recall_score, f1_score)
)
print(f"Precision: {precision}, Recall: {recall}, F1-Score: {f1}")