Analyzing Music Rhythms: A Data Science Approach in R
Objective
Build a comprehensive data analysis pipeline to explore and visualize rhythmic patterns across different music genres using R. This project combines data collection through the Spotify API, statistical analysis, and interactive visualization to uncover patterns in musical rhythms.
Learning Outcomes
By completing this project, you will:
- Master data collection using REST APIs in R
- Implement statistical analysis techniques for music data
- Create interactive visualizations using modern R libraries
- Understand rhythm patterns across music genres
- Build production-ready R Shiny applications
Skills Gained
- Building data collection pipelines with APIs
- Implementing statistical analysis in R
- Creating interactive data visualizations
- Using modern R packages and tools
- Handling real-world music data
- Building web applications with Shiny
Tools Required
# Install required packages
install.packages(c(
  # Core packages
  "tidyverse",      # Data manipulation
  "httr",           # API requests
  "spotifyr",       # Spotify API wrapper
  "ggplot2",        # Visualization
  "plotly",         # Interactive plots
  "shiny",          # Web applications
  "shinydashboard", # Dashboard layouts
  "DT",             # Interactive tables
  "scales",         # Better plot scaling
  "R6",             # Classes used by the collector, analyzer and dashboard
  # Packages loaded by the advanced examples
  "future",         # Parallel back-end
  "furrr",          # Parallel purrr-style mapping
  "recipes",        # Preprocessing recipes
  "lubridate",      # Date handling
  "broom",          # Tidy model output
  "car",            # Levene test, variance inflation factors
  "effectsize",     # Effect sizes (eta squared)
  "moments",        # Skewness and kurtosis
  "patchwork",      # Combining ggplot2 plots
  "viridis"         # Colour palettes
))
Project Structure
music_analysis/
│
├── data/
│ ├── raw/
│ │ └── spotify_tracks.csv
│ └── processed/
│ └── genre_analysis.csv
│
├── R/
│ ├── data_collection.R
│ ├── preprocessing.R
│ ├── analysis.R
│ └── visualization.R
│
├── shiny/
│ ├── ui.R
│ └── server.R
│
└── app.R
Prerequisites and Theoretical Foundations for Music Genre Analysis
1. R Programming Foundations
- Data manipulation with R
- Statistical analysis basics
- Data visualization fundamentals
- Function and package usage
Click to view R prerequisites code examples
# Data manipulation example
library(dplyr)
library(ggplot2)

# Basic data operations.
# Each genre gets several tracks: with a single track per genre, sd() returns
# NA and every boxplot collapses to a single line.
music_data <- data.frame(
  song = paste0("Song", seq_len(9)),
  bpm = c(120, 128, 135, 95, 88, 104, 140, 126, 132),
  genre = rep(c("rock", "jazz", "electronic"), each = 3)
)

# Data summarization: one row per genre
summary_stats <- music_data %>%
  group_by(genre) %>%
  summarize(
    avg_bpm = mean(bpm),
    sd_bpm = sd(bpm),
    count = n()
  )

# Basic visualization: tempo distribution per genre
ggplot(music_data, aes(x = genre, y = bpm)) +
  geom_boxplot() +
  theme_minimal()
2. Statistical Analysis Skills
- Descriptive statistics
- Hypothesis testing
- ANOVA fundamentals
- Data distribution analysis
Click to view statistics code examples
# Statistical analysis examples
library(stats)
#' Descriptive tempo statistics, overall or per genre
#'
#' @param data A data frame. Only consulted when `bpm_col` (and optionally
#'   `genre_col`) is given as a column name.
#' @param genre_col Either a vector of genre labels (ignored, kept for
#'   backward compatibility) or the name of the genre column in `data`.
#' @param bpm_col Either a numeric vector of tempos or the name of the tempo
#'   column in `data`.
#' @return When `bpm_col` is a vector, or `genre_col` is not a column name:
#'   a list with elements `mean`, `median`, `sd` and `iqr`. When both
#'   arguments are column names: a named list with one such list per genre.
describe_genre <- function(data, genre_col, bpm_col) {
  # TRUE only for a single string naming a column of `data`. The type check
  # comes first so that `data` is never touched for plain vector input.
  is_column_name <- function(x) {
    is.character(x) && length(x) == 1L && !is.na(x) && x %in% names(data)
  }

  summarise_bpm <- function(bpm) {
    list(
      mean = mean(bpm),
      median = median(bpm),
      sd = sd(bpm),
      iqr = IQR(bpm)
    )
  }

  # Vector input such as `data$bpm`: same result as before.
  if (!is_column_name(bpm_col)) {
    return(summarise_bpm(bpm_col))
  }

  bpm <- data[[bpm_col]]
  if (missing(genre_col) || !is_column_name(genre_col)) {
    return(summarise_bpm(bpm))
  }

  lapply(split(bpm, data[[genre_col]]), summarise_bpm)
}
#' One-way ANOVA of tempo across genres
#'
#' @param data A data frame with a numeric `bpm` column and a `genre` column.
#' @return The `summary()` of the fitted `aov` model.
perform_anova <- function(data) {
  fit <- aov(bpm ~ genre, data = data)
  summary(fit)
}
# Distribution test
#
# Runs shapiro.test() (the Shapiro-Wilk normality test) on the `bpm` column
# of `data` and returns its result unchanged.
#
# data: an object whose `bpm` element is a numeric vector. Per the
#       shapiro.test() documentation, it needs between 3 and 5000
#       non-missing values.
check_normality <- function(data) {
# Keep the argument as the literal expression `data$bpm`: shapiro.test()
# records the deparsed argument in the `data.name` field of its result.
shapiro.test(data$bpm)
}
3. Required Libraries and Skills
- R 4.0+
- tidyverse ecosystem
- Basic music theory understanding
- API usage experience
4. List of Theoretical Concepts
Music Theory
- Rhythm Fundamentals
  - Beats per minute (BPM)
  - Time signatures
  - Rhythm patterns
  - Tempo classifications
- Genre Characteristics
  - Typical tempo ranges
  - Rhythmic patterns
  - Common time signatures
  - Sub-genre variations
Statistical Analysis
- Descriptive Statistics
  - Central tendency measures
  - Dispersion measures
  - Distribution analysis
  - Outlier detection
- Inferential Statistics
  - ANOVA
  - Post-hoc tests
  - Effect size calculations
  - Confidence intervals
Data Visualization
- Plot Types
  - Box plots for distribution
  - Violin plots for density
  - Bar plots for comparisons
  - Time series for trends
- Interactive Elements
  - User inputs
  - Dynamic updates
  - Filtering options
  - Zoom and pan features
Data Processing
- Audio Analysis
  - BPM detection
  - Feature extraction
  - Signal processing
  - Audio metadata
- Data Cleaning
  - Outlier handling
  - Missing value treatment
  - Data normalization
  - Genre categorization
API Integration
- Spotify API
  - Authentication
  - Data retrieval
  - Rate limiting
  - Error handling
- Data Structure
  - JSON parsing
  - Data formatting
  - Response handling
  - Cache management
Steps and Tasks
1. Data Collection
Set up Spotify API authentication and data collection:
library(spotifyr)
library(tidyverse)
# Register the Spotify credentials as environment variables. Both are set in
# one call; replace the placeholders with the values from your Spotify
# developer dashboard.
Sys.setenv(
  SPOTIFY_CLIENT_ID = "your_client_id",
  SPOTIFY_CLIENT_SECRET = "your_client_secret"
)

# Request an access token
access_token <- get_spotify_access_token()
#' Get the tracks of a genre playlist together with their tempo
#'
#' @param genre Genre label. It is passed to `get_genre_playlist()` and
#'   stored in the `genre` column of the result.
#' @param limit Maximum number of playlist tracks to request.
#' @return A tibble with columns `track_id`, `name`, `artist`, `tempo` and
#'   `genre`; zero rows when the playlist yields no usable tracks.
#'
#' NOTE(review): `get_genre_playlist()` is not defined in this document; it
#' is assumed to return a Spotify playlist id for `genre` -- confirm.
#' NOTE(review): the `track.id`, `track.name` and `track.artists` columns
#' follow spotifyr's flattened playlist response -- confirm against the
#' installed version.
get_genre_tracks <- function(genre, limit = 50) {
  empty_result <- tibble(
    track_id = character(),
    name = character(),
    artist = character(),
    tempo = numeric(),
    genre = character()
  )

  playlist <- get_playlist_tracks(
    playlist_id = get_genre_playlist(genre),
    limit = limit
  )
  if (NROW(playlist) == 0) {
    return(empty_result)
  }

  # Entries without a track id cannot be matched to audio features.
  playlist <- playlist[!is.na(playlist$track.id), , drop = FALSE]
  if (nrow(playlist) == 0) {
    return(empty_result)
  }

  tracks <- tibble(
    track_id = playlist$track.id,
    name = playlist$track.name,
    artist = map_chr(
      playlist$track.artists,
      function(artists) paste(artists$name, collapse = ", ")
    )
  ) %>%
    distinct(track_id, .keep_all = TRUE)

  # Tempo is not part of the playlist response; it comes from the audio
  # features endpoint, which is queried in batches of at most 100 ids.
  id_batches <- split(
    tracks$track_id,
    ceiling(seq_along(tracks$track_id) / 100)
  )
  features <- map_dfr(id_batches, get_track_audio_features)
  if (NROW(features) == 0) {
    return(empty_result)
  }
  tempo_by_track <- features %>%
    filter(!is.na(id)) %>%
    distinct(id, .keep_all = TRUE) %>%
    select(track_id = id, tempo)

  tracks <- left_join(tracks, tempo_by_track, by = "track_id")
  tracks$genre <- rep(genre, nrow(tracks))
  tracks
}
Click to view complete data collection implementation
library(spotifyr)
library(tidyverse)
library(future)
library(furrr)
# Setup parallel processing
plan(multisession)

# SpotifyDataCollector
#
# R6 class that searches Spotify for playlists of a fixed set of genres and
# downloads their tracks together with the audio features used by the later
# analysis steps (tempo, danceability, energy, valence, key, mode and time
# signature).
#
# NOTE(review): the column names taken from spotifyr results (`id`, `name`,
# `track.id`, `track.name`, `track.artists`) follow the flattened Web API
# responses -- confirm them against the installed spotifyr version.
SpotifyDataCollector <- R6::R6Class(
  "SpotifyDataCollector",
  public = list(
    # Store the credentials and request an access token for them.
    initialize = function(client_id, client_secret) {
      private$client_id <- client_id
      private$client_secret <- client_secret
      private$access_token <- get_spotify_access_token(
        client_id = client_id,
        client_secret = client_secret
      )
    },

    # Search up to five US-market playlists per genre.
    # Returns a tibble with columns `playlist_id`, `playlist_name`, `genre`.
    get_genre_playlists = function() {
      map_dfr(private$genres, function(genre_name) {
        found <- search_spotify(
          q = genre_name,
          type = "playlist",
          market = "US",
          limit = 5,
          authorization = private$access_token
        )
        if (NROW(found) == 0) {
          return(tibble())
        }
        found %>%
          filter(!is.na(id)) %>%
          transmute(
            playlist_id = id,
            playlist_name = name,
            genre = genre_name
          )
      })
    },

    # Download the tracks of one playlist and attach their audio features.
    # Returns one row per distinct track, or an empty tibble when the
    # playlist yields no usable tracks.
    get_tracks_with_features = function(playlist_id) {
      playlist <- get_playlist_tracks(
        playlist_id,
        authorization = private$access_token
      )
      if (NROW(playlist) == 0) {
        return(tibble())
      }

      tracks <- playlist %>%
        filter(!is.na(track.id)) %>%
        transmute(
          track_id = track.id,
          name = track.name,
          artist = map_chr(
            track.artists,
            function(artists) paste(artists$name, collapse = ", ")
          )
        ) %>%
        distinct(track_id, .keep_all = TRUE)
      if (nrow(tracks) == 0) {
        return(tibble())
      }

      # The audio features endpoint accepts at most 100 ids per request.
      id_batches <- split(
        tracks$track_id,
        ceiling(seq_along(tracks$track_id) / 100)
      )
      features <- map_dfr(id_batches, function(ids) {
        get_track_audio_features(ids, authorization = private$access_token)
      })
      if (NROW(features) == 0) {
        return(tibble())
      }

      features <- features %>%
        filter(!is.na(id)) %>%
        distinct(id, .keep_all = TRUE) %>%
        select(
          track_id = id,
          tempo,
          danceability,
          energy,
          valence,
          key,
          mode,
          time_signature
        )

      left_join(tracks, features, by = "track_id")
    },

    # Collect tracks for every genre and keep a random sample of at most
    # `n_tracks_per_genre` tracks per genre.
    collect_all_data = function(n_tracks_per_genre = 100) {
      playlists <- self$get_genre_playlists()
      if (NROW(playlists) == 0) {
        return(tibble())
      }

      # The genre is known per playlist, not per track, so it is attached
      # here; without it the grouping below has no `genre` column.
      tracks <- future_map2_dfr(
        playlists$playlist_id,
        playlists$genre,
        function(playlist_id, genre_name) {
          found <- self$get_tracks_with_features(playlist_id)
          if (nrow(found) == 0) {
            return(found)
          }
          mutate(found, genre = genre_name)
        },
        # Workers start as fresh sessions and need the packages attached.
        .options = furrr_options(
          packages = c("spotifyr", "dplyr", "purrr", "tibble")
        )
      )
      if (NROW(tracks) == 0) {
        return(tibble())
      }

      tracks %>%
        # A track can appear in several playlists of the same genre.
        distinct(track_id, genre, .keep_all = TRUE) %>%
        group_by(genre) %>%
        slice_sample(n = n_tracks_per_genre) %>%
        ungroup()
    }
  ),
  private = list(
    client_id = NULL,
    client_secret = NULL,
    access_token = NULL,
    genres = c(
      "rock", "pop", "hip-hop",
      "jazz", "classical", "electronic"
    )
  )
)

# Usage example
collector <- SpotifyDataCollector$new(
  client_id = "your_client_id",
  client_secret = "your_client_secret"
)
data <- collector$collect_all_data(n_tracks_per_genre = 100)
2. Data Preprocessing
Clean and prepare the collected data:
library(tidyverse)
#' Clean collected track data and add a tempo category
#'
#' Drops rows containing missing values, removes per-genre tempo outliers
#' with the 1.5 * IQR rule, and labels each remaining track as "Slow"
#' (tempo below 80), "Medium" (below 120) or "Fast".
#'
#' @param data A data frame with at least `genre` and `tempo` columns.
#' @return The cleaned, ungrouped data with an added `tempo_category` column.
preprocess_music_data <- function(data) {
  # TRUE for values lying within the 1.5 * IQR fences of `x`.
  inside_fences <- function(x) {
    lower_fence <- quantile(x, 0.25) - 1.5 * IQR(x)
    upper_fence <- quantile(x, 0.75) + 1.5 * IQR(x)
    x >= lower_fence & x <= upper_fence
  }

  complete_rows <- drop_na(data)

  without_outliers <- complete_rows %>%
    group_by(genre) %>%
    filter(inside_fences(tempo)) %>%
    ungroup()

  # findInterval() yields 0 below 80, 1 in [80, 120) and 2 from 120 upwards.
  category_labels <- c("Slow", "Medium", "Fast")
  mutate(
    without_outliers,
    tempo_category = category_labels[findInterval(tempo, c(80, 120)) + 1L]
  )
}
Click to view advanced preprocessing
library(tidyverse)
library(lubridate)
library(recipes)
# MusicDataPreprocessor
#
# R6 class that cleans the collected track data and derives additional
# features. A `recipes` preprocessing recipe is built at construction time;
# it is not applied by `process()` and is exposed through `get_recipe()`.
MusicDataPreprocessor <- R6::R6Class(
  "MusicDataPreprocessor",
  public = list(
    # Store the raw data and define the preprocessing recipe for it.
    initialize = function(data) {
      private$data <- data

      # Create preprocessing recipe
      private$recipe <- recipe(
        formula = tempo ~ .,
        data = data
      ) %>%
        step_normalize(all_numeric_predictors()) %>%
        step_dummy(all_nominal_predictors()) %>%
        step_zv(all_predictors()) %>%
        step_corr(all_numeric_predictors())
    },

    # Return the (unprepped) recipe created by `initialize()`.
    get_recipe = function() {
      private$recipe
    },

    # Drop incomplete and duplicated rows, then remove per-genre tempo
    # outliers with the 1.5 * IQR rule.
    clean_data = function() {
      private$data %>%
        drop_na() %>%
        distinct() %>%
        group_by(genre) %>%
        filter(
          tempo >= quantile(tempo, 0.25) - 1.5 * IQR(tempo),
          tempo <= quantile(tempo, 0.75) + 1.5 * IQR(tempo)
        ) %>%
        ungroup()
    },

    # Add derived columns to `data`.
    create_features = function(data) {
      data %>%
        mutate(
          tempo_category = case_when(
            tempo < 80 ~ "Slow",
            tempo < 120 ~ "Medium",
            TRUE ~ "Fast"
          ),
          # Additional musical features
          is_major = mode == 1,
          beats_per_bar = time_signature,
          # scale() returns a one-column matrix with attributes; store a
          # plain numeric vector instead of a matrix column.
          normalized_tempo = as.numeric(scale(tempo)),
          # Interaction features
          energy_tempo = energy * tempo,
          dance_energy = danceability * energy
        )
    },

    # Run cleaning followed by feature creation.
    process = function() {
      self$create_features(self$clean_data())
    }
  ),
  private = list(
    data = NULL,
    recipe = NULL
  )
)

# Usage
# `data` is the result of `collect_all_data()` from the data collection step.
preprocessor <- MusicDataPreprocessor$new(data)
processed_data <- preprocessor$process()
3. Statistical Analysis
Perform comprehensive statistical analysis:
library(tidyverse)
library(broom)
#' Summarise tempo per genre and test for differences between genres
#'
#' @param data A data frame with `genre` and `tempo` columns.
#' @return A list with `summary` (per-genre mean, median, standard deviation
#'   and count of tempo) and `anova` (tidied one-way ANOVA of tempo by genre).
analyze_rhythms <- function(data) {
  tracks_by_genre <- group_by(data, genre)
  tempo_summary <- summarise(
    tracks_by_genre,
    mean_tempo = mean(tempo),
    median_tempo = median(tempo),
    sd_tempo = sd(tempo),
    n = n()
  )

  tempo_anova <- tidy(aov(tempo ~ genre, data = data))

  list(
    summary = tempo_summary,
    anova = tempo_anova
  )
}
Click to view advanced statistical analysis
library(tidyverse)
library(broom)
library(car)
library(effectsize)
# MusicAnalyzer
#
# R6 class bundling the statistical analyses of the tempo data: per-genre
# descriptive statistics, one-way ANOVA with assumption checks, effect size
# and post-hoc test, feature correlations and a multiple regression.
# Fitted models are kept in the private `models` list.
MusicAnalyzer <- R6::R6Class(
  "MusicAnalyzer",
  public = list(
    initialize = function(data) {
      private$data <- data
    },

    # Per-genre tempo statistics, ordered by descending mean tempo.
    descriptive_statistics = function() {
      private$data %>%
        group_by(genre) %>%
        summarise(
          mean_tempo = mean(tempo),
          median_tempo = median(tempo),
          sd_tempo = sd(tempo),
          iqr_tempo = IQR(tempo),
          skewness = moments::skewness(tempo),
          kurtosis = moments::kurtosis(tempo),
          n = n()
        ) %>%
        arrange(desc(mean_tempo))
    },

    # One-way ANOVA of tempo by genre.
    # Returns a list with the tidied model, the assumption tests, the effect
    # size and the Tukey HSD post-hoc comparisons.
    perform_anova = function() {
      # Make the grouping variable an explicit factor for the tests below.
      anova_data <- private$data
      anova_data$genre <- as.factor(anova_data$genre)

      model <- aov(tempo ~ genre, data = anova_data)
      private$models$anova <- model

      # Assumptions
      assumptions <- list(
        normality = shapiro.test(residuals(model)),
        homogeneity = car::leveneTest(tempo ~ genre, data = anova_data)
      )

      # Effect size. The result gets its own name so that it does not
      # shadow the `eta_squared()` function.
      effect_size <- effectsize::eta_squared(model)

      # Post-hoc
      tukey <- TukeyHSD(model)

      list(
        model = tidy(model),
        assumptions = assumptions,
        effect_size = effect_size,
        post_hoc = tukey
      )
    },

    # Correlation matrix of tempo and the audio features that are present
    # in the data.
    correlation_analysis = function() {
      feature_cols <- intersect(
        c("tempo", "danceability", "energy", "valence"),
        names(private$data)
      )
      cor(as.data.frame(private$data)[, feature_cols, drop = FALSE])
    },

    # Multiple regression of tempo on the audio features that are present
    # in the data.
    regression_analysis = function() {
      predictors <- intersect(
        c("danceability", "energy", "valence", "key", "mode"),
        names(private$data)
      )
      if (length(predictors) == 0) {
        stop("No predictor columns found in the data.", call. = FALSE)
      }

      model <- lm(
        reformulate(predictors, response = "tempo"),
        data = private$data
      )
      private$models$regression <- model

      # Model diagnostics. Variance inflation factors need at least two
      # predictors.
      diagnostics <- list(
        vif = if (length(predictors) >= 2) car::vif(model) else NULL,
        residuals = augment(model)
      )

      list(
        summary = tidy(model),
        diagnostics = diagnostics
      )
    }
  ),
  private = list(
    data = NULL,
    models = list()
  )
)

# Usage
analyzer <- MusicAnalyzer$new(processed_data)
stats <- analyzer$descriptive_statistics()
anova_results <- analyzer$perform_anova()
correlations <- analyzer$correlation_analysis()
regression_results <- analyzer$regression_analysis()
4. Data Visualization
Create static and interactive visualizations:
library(ggplot2)
library(plotly)
#' Build the basic tempo plots
#'
#' @param data A data frame with `genre` and `tempo` columns.
#' @return A list with `boxplot` (ggplot2 box plot of tempo by genre) and
#'   `histogram` (plotly histogram of tempo, coloured by genre).
create_visualizations <- function(data) {
  # Box plot of tempo per genre
  boxplot_labels <- labs(
    title = "Tempo Distribution by Genre",
    x = "Genre",
    y = "Tempo (BPM)"
  )
  tempo_boxplot <- ggplot(data, aes(x = genre, y = tempo, fill = genre)) +
    geom_boxplot() +
    theme_minimal() +
    boxplot_labels

  # Interactive histogram with one overlaid trace per genre
  tempo_histogram <- layout(
    plot_ly(data, x = ~tempo, color = ~genre, type = "histogram"),
    title = "Tempo Distribution",
    xaxis = list(title = "Tempo (BPM)"),
    yaxis = list(title = "Count"),
    barmode = "overlay"
  )

  list(
    boxplot = tempo_boxplot,
    histogram = tempo_histogram
  )
}
Click to view advanced visualization code
library(ggplot2)
library(plotly)
library(patchwork)
library(viridis)
# MusicVisualizer
#
# R6 class producing the interactive plots of the project. Every plot method
# takes an optional `data` argument: when it is NULL (the default) the data
# given to `$new()` is plotted, otherwise the supplied data frame is used.
# This lets callers plot a filtered subset.
MusicVisualizer <- R6::R6Class(
  "MusicVisualizer",
  public = list(
    initialize = function(data) {
      private$data <- data
    },

    # Violin plot with box plot overlay of tempo per genre.
    create_tempo_distribution = function(data = NULL) {
      plot_data <- private$resolve_data(data)
      p <- ggplot(plot_data, aes(x = genre, y = tempo, fill = genre)) +
        geom_violin(alpha = 0.6) +
        geom_boxplot(width = 0.2, alpha = 0.8) +
        scale_fill_viridis_d() +
        theme_minimal() +
        theme(
          legend.position = "none",
          axis.text.x = element_text(angle = 45, hjust = 1)
        ) +
        labs(
          title = "Tempo Distribution by Genre",
          x = "Genre",
          y = "Tempo (BPM)"
        )
      ggplotly(p)
    },

    # Scatter plot with 2D density contours of tempo vs. energy, one panel
    # per genre.
    create_rhythm_heatmap = function(data = NULL) {
      plot_data <- private$resolve_data(data)
      p <- ggplot(plot_data, aes(x = tempo, y = energy, color = genre)) +
        geom_point(alpha = 0.6) +
        geom_density_2d() +
        facet_wrap(~genre) +
        scale_color_viridis_d() +
        theme_minimal() +
        labs(
          title = "Tempo vs. Energy Relationship by Genre",
          x = "Tempo (BPM)",
          y = "Energy"
        )
      ggplotly(p)
    },

    # Heatmap of the correlations between tempo and the audio features that
    # are present in the data.
    create_feature_correlation = function(data = NULL) {
      plot_data <- private$resolve_data(data)
      feature_cols <- intersect(
        c("tempo", "danceability", "energy", "valence"),
        names(plot_data)
      )
      corr_data <- cor(
        as.data.frame(plot_data)[, feature_cols, drop = FALSE]
      )

      plot_ly(
        x = colnames(corr_data),
        y = rownames(corr_data),
        z = corr_data,
        type = "heatmap",
        colorscale = "Viridis"
      ) %>%
        layout(
          title = "Feature Correlation Matrix",
          xaxis = list(title = ""),
          yaxis = list(title = "")
        )
    },

    # Smoothed tempo over release date, one line per genre.
    # Requires a `release_date` column in the data.
    create_tempo_evolution = function(data = NULL) {
      plot_data <- private$resolve_data(data)
      if (!"release_date" %in% names(plot_data)) {
        stop(
          "create_tempo_evolution() needs a `release_date` column.",
          call. = FALSE
        )
      }
      p <- ggplot(
        plot_data,
        aes(x = release_date, y = tempo, color = genre)
      ) +
        geom_smooth(se = FALSE) +
        scale_color_viridis_d() +
        theme_minimal() +
        labs(
          title = "Tempo Evolution Over Time",
          x = "Release Date",
          y = "Tempo (BPM)"
        )
      ggplotly(p)
    },

    # All plots as a list. The tempo evolution plot is included only when
    # the data has a `release_date` column.
    create_dashboard = function() {
      plots <- list(
        self$create_tempo_distribution(),
        self$create_rhythm_heatmap(),
        self$create_feature_correlation()
      )
      if ("release_date" %in% names(private$data)) {
        plots <- c(plots, list(self$create_tempo_evolution()))
      }
      plots
    }
  ),
  private = list(
    data = NULL,
    color_palette = viridis::viridis_pal(option = "D")(8),

    # Use the caller-supplied data when given, else the stored data.
    resolve_data = function(data) {
      if (is.null(data)) private$data else data
    }
  )
)
5. Shiny Application
Create an interactive web application to explore the music data:
library(shiny)
library(shinydashboard)
library(DT)
# UI definition, assembled from named pieces inside local() so that only
# `ui` is created in the calling environment.
ui <- local({
  header <- dashboardHeader(title = "Music Rhythm Analysis")

  # Navigation entries
  sidebar <- dashboardSidebar(
    sidebarMenu(
      menuItem("Overview", tabName = "overview", icon = icon("dashboard")),
      menuItem("Genre Analysis", tabName = "genre", icon = icon("music")),
      menuItem("Comparisons", tabName = "compare", icon = icon("chart-bar"))
    )
  )

  # Overview tab: summary table on top, tempo distribution below
  summary_box <- box(
    title = "Dataset Summary",
    status = "primary",
    solidHeader = TRUE,
    width = 12,
    DT::dataTableOutput("summary_table")
  )
  tempo_box <- box(
    title = "Tempo Distribution",
    status = "primary",
    solidHeader = TRUE,
    plotlyOutput("tempo_dist")
  )
  overview_tab <- tabItem(
    tabName = "overview",
    fluidRow(summary_box),
    fluidRow(tempo_box)
  )

  body <- dashboardBody(tabItems(overview_tab))

  dashboardPage(header, sidebar, body)
})
Click to view complete Shiny application code
library(shiny)
library(shinydashboard)
library(DT)
library(plotly)
# MusicAnalysisDashboard
#
# R6 class wrapping the Shiny dashboard. It owns the data plus one
# MusicAnalyzer and one MusicVisualizer built from it, and provides the UI
# definition, the server function and a helper that combines them into a
# Shiny app object.
MusicAnalysisDashboard <- R6::R6Class(
  "MusicAnalysisDashboard",
  public = list(
    initialize = function(data) {
      private$data <- data
      private$analyzer <- MusicAnalyzer$new(data)
      private$visualizer <- MusicVisualizer$new(data)
    },

    # Build the dashboard page; each tab comes from a private helper.
    create_ui = function() {
      dashboardPage(
        dashboardHeader(title = "Music Rhythm Analysis"),
        dashboardSidebar(
          sidebarMenu(
            menuItem("Overview", tabName = "overview",
                     icon = icon("dashboard")),
            menuItem("Genre Analysis", tabName = "genre",
                     icon = icon("music")),
            menuItem("Comparisons", tabName = "compare",
                     icon = icon("chart-bar")),
            menuItem("Statistical Tests", tabName = "stats",
                     icon = icon("calculator"))
          )
        ),
        dashboardBody(
          tabItems(
            private$overview_tab(),
            private$genre_tab(),
            private$compare_tab(),
            private$stats_tab()
          )
        )
      )
    },

    # Build the server function. Every output declared in the UI has a
    # renderer here.
    create_server = function() {
      function(input, output, session) {
        # Tracks of the selected genre within the selected tempo range.
        genre_tracks <- reactive({
          req(input$genre, input$tempo_range)
          dplyr::filter(
            private$data,
            genre == input$genre,
            tempo >= input$tempo_range[1],
            tempo <= input$tempo_range[2]
          )
        })

        # Tracks of the genres ticked on the comparison tab.
        compared_tracks <- reactive({
          req(input$compare_genres)
          dplyr::filter(private$data, genre %in% input$compare_genres)
        })

        # The ANOVA is computed once and shared by both outputs using it.
        anova_results <- reactive({
          private$analyzer$perform_anova()
        })

        # Overview Tab
        output$total_tracks <- renderValueBox({
          valueBox(
            nrow(private$data),
            "Total Tracks",
            icon = icon("music")
          )
        })
        output$total_genres <- renderValueBox({
          valueBox(
            length(unique(private$data$genre)),
            "Genres",
            icon = icon("list")
          )
        })
        output$avg_tempo <- renderValueBox({
          valueBox(
            round(mean(private$data$tempo, na.rm = TRUE), 1),
            "Average Tempo (BPM)",
            icon = icon("drum")
          )
        })
        output$summary_table <- DT::renderDataTable({
          private$analyzer$descriptive_statistics()
        })
        output$tempo_dist <- renderPlotly({
          private$visualizer$create_tempo_distribution()
        })
        output$correlations <- renderPlotly({
          private$visualizer$create_feature_correlation()
        })

        # Genre Analysis Tab
        output$genre_analysis <- renderPlotly({
          tracks <- genre_tracks()
          validate(need(nrow(tracks) > 0, "No tracks match the selection."))
          # A visualizer built on the filtered data is used because the
          # plot methods are called without arguments.
          MusicVisualizer$new(tracks)$create_rhythm_heatmap()
        })
        output$top_tracks <- DT::renderDataTable({
          tracks <- as.data.frame(genre_tracks())
          shown_cols <- intersect(
            c("name", "artist", "tempo", "energy", "danceability"),
            names(tracks)
          )
          tracks[, shown_cols, drop = FALSE]
        })

        # Comparisons Tab
        output$genre_comparison <- renderPlotly({
          tracks <- compared_tracks()
          validate(need(nrow(tracks) > 0, "No tracks match the selection."))
          comparison <- MusicVisualizer$new(tracks)
          # The time-series view needs a release date; otherwise compare
          # the tempo distributions of the selected genres.
          if ("release_date" %in% names(tracks)) {
            comparison$create_tempo_evolution()
          } else {
            comparison$create_tempo_distribution()
          }
        })
        output$stat_summary <- renderPrint({
          tracks <- compared_tracks()
          print(tapply(tracks$tempo, tracks$genre, summary))
        })

        # Statistical Tests Tab
        output$anova_results <- renderPrint({
          print(anova_results()$model)
        })
        output$posthoc_results <- renderPrint({
          print(anova_results()$post_hoc)
        })
      }
    },

    # Combine UI and server into a Shiny app object.
    run_app = function() {
      shinyApp(
        ui = self$create_ui(),
        server = self$create_server()
      )
    }
  ),
  private = list(
    data = NULL,
    analyzer = NULL,
    visualizer = NULL,

    # Overview Tab
    overview_tab = function() {
      tabItem(
        tabName = "overview",
        fluidRow(
          valueBoxOutput("total_tracks"),
          valueBoxOutput("total_genres"),
          valueBoxOutput("avg_tempo")
        ),
        fluidRow(
          box(
            title = "Dataset Summary",
            status = "primary",
            solidHeader = TRUE,
            width = 12,
            DT::dataTableOutput("summary_table")
          )
        ),
        fluidRow(
          box(
            title = "Tempo Distribution",
            status = "primary",
            plotlyOutput("tempo_dist")
          ),
          box(
            title = "Feature Correlations",
            status = "primary",
            plotlyOutput("correlations")
          )
        )
      )
    },

    # Genre Analysis Tab
    genre_tab = function() {
      # Extend the slider when the data contains tempos above 200 BPM.
      slider_max <- max(
        200,
        ceiling(max(c(private$data$tempo, 0), na.rm = TRUE))
      )
      tabItem(
        tabName = "genre",
        fluidRow(
          box(
            title = "Select Genre",
            status = "primary",
            selectInput("genre", "Genre:",
                        choices = unique(private$data$genre))
          ),
          box(
            title = "Tempo Range",
            status = "primary",
            sliderInput("tempo_range", "BPM Range:",
                        min = 0, max = slider_max,
                        value = c(60, 180))
          )
        ),
        fluidRow(
          box(
            title = "Genre Characteristics",
            status = "primary",
            plotlyOutput("genre_analysis")
          ),
          box(
            title = "Top Tracks",
            status = "primary",
            DT::dataTableOutput("top_tracks")
          )
        )
      )
    },

    # Comparisons Tab
    compare_tab = function() {
      tabItem(
        tabName = "compare",
        fluidRow(
          box(
            title = "Select Genres to Compare",
            status = "primary",
            checkboxGroupInput("compare_genres",
                               "Select Genres:",
                               choices = unique(private$data$genre))
          )
        ),
        fluidRow(
          box(
            title = "Tempo Comparison",
            status = "primary",
            plotlyOutput("genre_comparison")
          ),
          box(
            title = "Statistical Summary",
            status = "primary",
            verbatimTextOutput("stat_summary")
          )
        )
      )
    },

    # Statistical Tests Tab
    stats_tab = function() {
      tabItem(
        tabName = "stats",
        fluidRow(
          box(
            title = "ANOVA Results",
            status = "primary",
            verbatimTextOutput("anova_results")
          ),
          box(
            title = "Post-hoc Tests",
            status = "primary",
            verbatimTextOutput("posthoc_results")
          )
        )
      )
    }
  )
)

# Usage
dashboard <- MusicAnalysisDashboard$new(processed_data)
dashboard$run_app()
6. Best Practices and Optimization
Performance Optimization:
- Use data.table for large datasets
- Implement caching for expensive computations
- Optimize plot rendering
- Use reactive expressions in Shiny
Code Organization:
- Modularize Shiny components
- Create reusable functions
- Implement proper error handling
- Document code thoroughly
Analysis Best Practices:
- Validate statistical assumptions
- Use appropriate tests
- Handle outliers appropriately
- Document analysis decisions
Visualization Guidelines:
- Maintain consistent styling
- Use appropriate color schemes
- Add proper labels and titles
- Make plots interactive when useful