Hi all,
We hope you’ve had enough time to experiment with different models! This week, let’s move to a more structured deep dive: we are sharing full sample code, not just snippets. Please remember that simply copying the code won’t lead to much learning. It’s crucial that you put in your best effort to understand the bigger picture, both in terms of the domain and the machine learning concepts involved.
Task 1: Data Preparation and Graph Creation
This code prepares the dataset and creates a graph for disease-gene association prediction. Focus on adjusting these key parameters (a consolidated sketch follows the list):
- Global Score Cutoff
  - Purpose: controls the number of positive edges
  - Adjust: increase it to reduce the positive edges if there are too many
- Max Number of PPI Interactions
  - Purpose: limits the number of protein-protein interaction edges
  - Adjust: reduce it if you are facing resource constraints
- Negative-to-Positive Sample Ratio
  - Purpose: determines the balance between negative and positive edges
  - Adjust: increase it for a larger overall dataset
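For reference, here are those three knobs in one place. A minimal sketch: max_ppi_interactions and desired_ratio match the variable names in the script below, while global_score_cutoff is a name I’m introducing for the threshold that appears hardcoded as 0.1.
# Key tunables for Task 1; adjust and re-run until the models behave as described below
global_score_cutoff = 0.1       # minimum OpenTargets globalScore for a positive disease-gene edge
max_ppi_interactions = 5000000  # cap on the number of protein-protein interaction edges
desired_ratio = 10              # negatives per positive in the final edge set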
Overall Objective: Tune the parameters so that logistic regression shows low-to-medium performance, while the other models (Random Forest, Gradient Boosting, SVM) perform better. Avoid near-perfect performance, as it may indicate oversimplification or overfitting.
Even if you can’t attend the meeting, please make sure to submit your results for this piece of code (Task 1), and stick to the disease you’ve chosen.
import pandas as pd
import networkx as nx
import random
from tqdm import tqdm
# Load data
print("Loading data...")
ot_df = pd.read_csv('/content/drive/MyDrive/ot.tsv', sep='\t')
ppi_df = pd.read_csv('/content/drive/MyDrive/ppi.csv')
# Filter PPI data
print("Filtering PPI data...")
ppi_df = ppi_df.dropna()
ppi_df = ppi_df[(ppi_df['GeneName1'] != '') & (ppi_df['GeneName2'] != '')]
print(f"Filtered PPI data shape: {ppi_df.shape}")
# Sample PPI data if necessary
max_ppi_interactions = 5000000
if len(ppi_df) > max_ppi_interactions:
    ppi_filtered = ppi_df.sample(n=max_ppi_interactions, random_state=42)
else:
    ppi_filtered = ppi_df
print(f"Final PPI data shape: {ppi_filtered.shape}")
# Create graph
print("Creating graph...")
G = nx.Graph()
# Add OpenTargets edges
diseases = ['lung cancer']  # replace 'lung cancer' with your chosen disease
positive_edges = []
for disease in diseases:
    for _, row in ot_df.iterrows():
        if row['globalScore'] > 0.1:  # global score cutoff, one of the parameters to tune
            G.add_edge(disease, row['symbol'], weight=row['globalScore'], type='disease-gene')
            positive_edges.append((disease, row['symbol']))
print(f"Number of positive edges: {len(positive_edges)}")
# Add PPI edges
ppi_edges_added = 0
for _, row in tqdm(ppi_filtered.iterrows(), total=len(ppi_filtered), desc="Adding PPI edges"):
    if row['GeneName1'] != row['GeneName2']:  # Avoid self-loops
        G.add_edge(row['GeneName1'], row['GeneName2'], type='ppi')
        ppi_edges_added += 1
print(f"Number of PPI edges added: {ppi_edges_added}")
# Create negative examples
print("Creating negative examples...")
all_genes = set(ppi_filtered['GeneName1']).union(set(ppi_filtered['GeneName2']))
print(f"Total unique genes in PPI network: {len(all_genes)}")
negative_edges = []
for disease in diseases:
    associated_genes = set(G.neighbors(disease))
    print(f"Genes associated with {disease}: {len(associated_genes)}")
    non_associated_genes = all_genes - associated_genes
    print(f"Genes not associated with {disease}: {len(non_associated_genes)}")
    negative_edges.extend([(disease, gene) for gene in non_associated_genes])
print(f"Number of potential negative edges: {len(negative_edges)}")
# Maintain imbalance
desired_ratio = 10 # 10 times more negative than positive
num_negative_samples = min(len(negative_edges), desired_ratio * len(positive_edges))
negative_edges = random.sample(negative_edges, num_negative_samples)
print(f"Final number of positive edges: {len(positive_edges)}")
print(f"Final number of negative edges: {len(negative_edges)}")
# Additional diagnostics
print("\nAdditional Diagnostics:")
print(f"Total nodes in graph: {G.number_of_nodes()}")
print(f"Total edges in graph: {G.number_of_edges()}")
print(f"Nodes with 'lung cancer' as neighbor: {len(list(G.neighbors('lung cancer')))}")
ot_genes = set(ot_df['symbol'])
ppi_genes = set(ppi_filtered['GeneName1']).union(set(ppi_filtered['GeneName2']))
print(f"Genes in OpenTargets: {len(ot_genes)}")
print(f"Genes in PPI network: {len(ppi_genes)}")
Task 2: Model Implementation and Tuning
This code introduces simplified node embeddings based on basic graph features. These embeddings are used to create edge features for machine learning models.
Instructions:
- Choose at least one model: Logistic Regression, Random Forest, Gradient Boosting, or SVM.
- Experiment with hyperparameters:
  - Logistic Regression: adjust C
  - Random Forest: vary n_estimators and max_depth
  - Gradient Boosting: modify learning_rate and n_estimators
  - SVM: tweak C and kernel
- Document changes in F1-score, precision, recall, and accuracy as you adjust parameters.
- Compare tuned model performance with default settings.
- Explain the impact of hyperparameter changes in the context of gene-disease association.
Focus on understanding how these simpler embeddings and different hyperparameters affect model performance in predicting disease-gene associations.
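One detail worth noticing in the code below: edge features are built as the element-wise (Hadamard) product of the two node embeddings. Other standard edge operators are worth trying too; here is a minimal sketch of the common options (my own illustration, not part of the provided pipeline):
import numpy as np

# Edge operators: ways to combine two node embeddings into one edge feature vector
def hadamard(u, v):
    return u * v             # element-wise product (what the pipeline below uses)

def average(u, v):
    return (u + v) / 2.0     # element-wise mean

def l1_distance(u, v):
    return np.abs(u - v)     # element-wise absolute difference

def l2_distance(u, v):
    return (u - v) ** 2      # element-wise squared difference

# Quick demo on two toy embeddings
u, v = np.array([1.0, 2.0]), np.array([3.0, 4.0])
print(hadamard(u, v), average(u, v), l1_distance(u, v), l2_distance(u, v))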
# Import necessary libraries
import networkx as nx
import numpy as np
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
def simple_node_embedding(G, dim=64):
    print("Generating simple node embeddings...")
    embeddings = {}
    for node in G.nodes():
        # Use node degree as a feature
        degree = G.degree(node)
        # Use average neighbor degree as another feature
        neighbor_degrees = [G.degree(n) for n in G.neighbors(node)]
        avg_neighbor_degree = np.mean(neighbor_degrees) if neighbor_degrees else 0
        # Create a simple embedding vector
        embedding = np.zeros(dim)
        embedding[0] = degree
        embedding[1] = avg_neighbor_degree
        # Fill the rest with random values (you could add more graph-based features here)
        embedding[2:] = np.random.randn(dim - 2)
        embeddings[node] = embedding / np.linalg.norm(embedding)  # Normalize
    return embeddings
# Replace Node2Vec with this simpler embedding
node_embeddings = simple_node_embedding(G, dim=64)
# Use the embeddings in your existing pipeline
def get_edge_features(edge):
    # Element-wise (Hadamard) product of the two node embeddings
    return node_embeddings[edge[0]] * node_embeddings[edge[1]]
X_positive = np.array([get_edge_features(edge) for edge in tqdm(positive_edges, desc="Processing positive edges")])
X_negative = np.array([get_edge_features(edge) for edge in tqdm(negative_edges, desc="Processing negative edges")])
X = np.vstack((X_positive, X_negative))
y = np.array([1] * len(positive_edges) + [0] * len(negative_edges))
# First, let's split our data into training+validation set and a separate test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
def cross_validate_and_evaluate(model, X_train_val, y_train_val, X_test, y_test, model_name, cv=5):
    # Perform cross-validation
    cv_scores = cross_val_score(model, X_train_val, y_train_val, cv=cv, scoring='f1')
    print(f"\n{model_name} Cross-Validation Results:")
    print(f"Mean F1-score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    # Train on full training set and evaluate on test set
    model.fit(X_train_val, y_train_val)
    y_pred = model.predict(X_test)
    print(f"\n{model_name} Test Set Evaluation:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1-score: {f1_score(y_test, y_pred):.4f}")
# Define models; remember, these hyperparameters may not be the best for your dataset!
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced'),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=10, class_weight='balanced', random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='linear', C=1.0, class_weight='balanced', random_state=42)
}
# Perform cross-validation and evaluation for each model
for name, model in models.items():
    cross_validate_and_evaluate(model, X_train_val, y_train_val, X_test, y_test, name)
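To document how F1-score, precision, recall, and accuracy shift as you adjust a hyperparameter, you can sweep one value at a time and collect the results in a table. A minimal sketch for Logistic Regression’s C, assuming the X_train_val, y_train_val, X_test, y_test arrays defined above; adapt it to whichever model and hyperparameters you chose:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Sweep one hyperparameter and record the metrics for your write-up
results = []
for C in [0.01, 0.1, 1.0, 10.0]:
    clf = LogisticRegression(C=C, class_weight='balanced', max_iter=1000)
    clf.fit(X_train_val, y_train_val)
    y_pred = clf.predict(X_test)
    results.append({
        'C': C,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
    })
print(pd.DataFrame(results))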
Pinging members active in meetings/forums: @Prasun_Sharma @Moh_Saiger @ayahashim16 @ahmedsalim @Huikun_Li @Thuraya_Ayman @hahaharsini.
Please forward this to your team channel to include other members who are working on the code but are unable to make mentor meetings.