#Loading R Packages
library(statsr)
library(dplyr)
library(ggplot2)
library(fitdistrplus)
1. Data Set
Dataset Info:Source: https://www.kaggle.com/datasets/iamsumat/spotify-top-2000s-mega-dataset
#Import CSV File and store data frame in variable df
df <- read.csv("Spotify-2000.csv")
#Display the first 5 rows to confirm import looks correct
head(df, n = 5)
## Index Title Artist Top.Genre Year
## 1 1 Sunrise Norah Jones adult standards 2004
## 2 2 Black Night Deep Purple album rock 2000
## 3 3 Clint Eastwood Gorillaz alternative hip hop 2001
## 4 4 The Pretender Foo Fighters alternative metal 2007
## 5 5 Waitin' On A Sunny Day Bruce Springsteen classic rock 2002
## Beats.Per.Minute..BPM. Energy Danceability Loudness..dB. Liveness Valence
## 1 157 30 53 -14 11 68
## 2 135 79 50 -11 17 81
## 3 168 69 66 -9 7 52
## 4 173 96 43 -4 3 37
## 5 106 82 58 -5 10 87
## Length..Duration. Acousticness Speechiness Popularity
## 1 201 94 3 71
## 2 207 17 7 39
## 3 341 2 17 69
## 4 269 0 4 76
## 5 256 1 3 59
Contents:
#Number of variables in the data set
ncol(df)
## [1] 15
#Number of entries in the data set
nrow(df)
## [1] 1994
Interest/importance in data set:
Music is an important aspect
of culture and has been a part of human history for centuries. It can
play an important part in things like social gatherings, religion, and
storytelling. With the invention of music streaming services, like
Spotify, we now have easy access to large amounts of data that can be
investigated to understand aspects of human nature and culture.
2. Visualization
Variables of interest:Energy
#Energy: Histogram
ggplot(data = df, aes(x = Energy)) +
geom_histogram(bins = 15) +
ggtitle("Distribution of Energy") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(face="bold"))
#Determine range of energy to determine breaks for cum. rel. freq. plot
range(df$Energy)
## [1] 3 100
#Energy: Cumulative Relative Frequency
breaks <- seq(0,100,1)
energy.cut <- cut(df$Energy, breaks, right=FALSE)
energy.freq <- table(energy.cut)
energy.cumfreq <- cumsum(energy.freq)
energy.cumrelfreq <- energy.cumfreq / nrow(df)
cumrelfreq0_energy <- c(0, energy.cumrelfreq)
plot(breaks, cumrelfreq0_energy, main = "Cumulative Relative Frequency Plot of Energy", xlab = "Energy", ylab = "Cumulative Relative Frequency (%)")
lines(breaks, cumrelfreq0_energy)
#Energy: QQ Plot
qqnorm(df$Energy, main="Normal Q-Q Plot of Energy")
qqline(df$Energy, col = "red")
Danceability
#Danceability: Histogram
ggplot(data = df, aes(x = Danceability)) +
geom_histogram(bins = 15) +
ggtitle("Distribution of Danceability") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(face="bold"))
#Determine range of danceability to determine breaks for cum. rel. freq. plot
range(df$Danceability)
## [1] 10 96
#Danceability: Cumulative Relative Frequency
breaks <- seq(10,100,1)
dance.cut <- cut(df$Danceability, breaks, right=FALSE)
dance.freq <- table(dance.cut)
dance.cumfreq <- cumsum(dance.freq)
dance.cumrelfreq <- dance.cumfreq / nrow(df)
cumrelfreq0_dance <- c(0, dance.cumrelfreq)
plot(breaks, cumrelfreq0_dance, main = "Cumulative Relative Frequency Plot of Danceability", xlab = "Danceability", ylab = "Cumulative Relative Frequency (%)")
lines(breaks, cumrelfreq0_dance)
#Danceability: QQ Plot
qqnorm(df$Danceability, main="Normal Q-Q Plot of Danceability")
qqline(df$Danceability, col = "red")
Speechiness
#Speechiness: Histogram
ggplot(data = df, aes(x = Speechiness)) +
geom_histogram(bins = 20) +
ggtitle("Distribution of Speechiness") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(face="bold"))
#Determine range of Speechiness to determine breaks for cum. rel. freq. plot
range(df$Speechiness)
## [1] 2 55
#Speechiness: Cumulative Relative Frequency
breaks <- seq(2,55,1)
Speechiness.cut <- cut(df$Speechiness, breaks, right=FALSE)
Speechiness.freq <- table(Speechiness.cut)
Speechiness.cumfreq <- cumsum(Speechiness.freq)
Speechiness.cumrelfreq <- Speechiness.cumfreq / nrow(df)
cumrelfreq0_Speechiness <- c(0, Speechiness.cumrelfreq)
plot(breaks, cumrelfreq0_Speechiness, main = "Cumulative Relative Frequency Plot of Speechiness", xlab = "Speechiness", ylab = "Cumulative Relative Frequency (%)")
lines(breaks, cumrelfreq0_Speechiness)
#Speechiness: QQ Plot
qqnorm(df$Speechiness, main="Normal Q-Q Plot of Speechiness")
qqline(df$Speechiness, col = "red")
3. Point Estimates
#Function to determine mode
mode <- function(x) {
uniqueVal <- unique(x)
uniqueVal[which.max(tabulate(match(x, uniqueVal)))]
}
Energy
#Energy: mean, median, variance, standard deviation
df %>%
summarise(sample_mu = mean(df$Energy),
sample_med = median(df$Energy),
sample_mode = mode(df$Energy),
sample_var = var(df$Energy),
sample_sd = sd(df$Energy))
## sample_mu sample_med sample_mode sample_var sample_sd
## 1 59.67954 61 76 490.814 22.15432
Danceability
#Danceability: mean, median, variance, standard deviation
df %>%
summarise(sample_mu = mean(df$Danceability),
sample_med = median(df$Danceability),
sample_mode = mode(df$Danceability),
sample_var = var(df$Danceability),
sample_sd = sd(df$Danceability))
## sample_mu sample_med sample_mode sample_var sample_sd
## 1 53.23821 53 53 235.6688 15.35151
Speechiness
#Speechiness: mean, median, variance, standard deviation
df %>%
summarise(sample_mu = mean(df$Speechiness),
sample_med = median(df$Speechiness),
sample_mode = mode(df$Speechiness),
sample_var = var(df$Speechiness),
sample_sd = sd(df$Speechiness))
## sample_mu sample_med sample_mode sample_var sample_sd
## 1 4.994985 4 3 19.37378 4.401566
#Function to determine proportion of data values that lies within 1.5IQR
data_within1_5IQR <- function(x,lbound,ubound){
length(x[(x>lbound)&(x<ubound)])/length(df$Energy)
}
Energy
#Proportion of data within 1.5IQR for Energy
lowerBound = quantile(df$Energy,0.25) - 1.5*IQR(df$Energy)
upperBound = quantile(df$Energy,0.75) + 1.5*IQR(df$Energy)
data_within1_5IQR(df$Energy,lowerBound,upperBound)
## [1] 1
Danceability
#Proportion of data within 1.5IQR for Danceability
lowerBound = quantile(df$Danceability,0.25) - 1.5*IQR(df$Danceability)
upperBound = quantile(df$Danceability,0.75) + 1.5*IQR(df$Danceability)
data_within1_5IQR(df$Danceability,lowerBound,upperBound)
## [1] 0.9984955
Speechiness
#Proportion of data within 1.5IQR for Speechiness
lowerBound = quantile(df$Speechiness,0.25) - 1.5*IQR(df$Speechiness)
upperBound = quantile(df$Speechiness,0.75) + 1.5*IQR(df$Speechiness)
data_within1_5IQR(df$Speechiness,lowerBound,upperBound)
## [1] 0.887663
Energy
# critical value for a 95% confidence interval
z_star_95 <- qnorm(0.975)
# Confidence Interval for population mean based on the entire data set and on the portion determined in b.
df %>%
summarise(lower = mean(df$Energy) - z_star_95 * (sd(df$Energy) / sqrt(length(df$Energy))),
upper = mean(df$Energy) + z_star_95 * (sd(df$Energy) / sqrt(length(df$Energy))))
## lower upper
## 1 58.70714 60.65194
From part b, the entire population of the data set is included within 1.5IQR, so there will be one CI calculation for both the population and sample; therefore, they are not different.
Danceability
# 95% Confidence Interval for population mean based on the entire data set
df %>%
summarise(lower = mean(df$Danceability) - z_star_95 * (sd(df$Danceability) / sqrt(length(df$Danceability))),
upper = mean(df$Danceability) + z_star_95 * (sd(df$Danceability) / sqrt(length(df$Danceability))))
## lower upper
## 1 52.56441 53.91202
# 95% Confidence Interval for population mean based on the portion determined in b
lowerBound = quantile(df$Danceability,0.25) - 1.5*IQR(df$Danceability)
upperBound = quantile(df$Danceability,0.75) + 1.5*IQR(df$Danceability)
df_sD <- df %>% filter((Danceability>lowerBound)&(Danceability<upperBound))
df_sD %>%
summarise(lower = mean(df_sD$Danceability) - z_star_95 * (sd(df$Danceability) / sqrt(length(df_sD$Danceability))),
upper = mean(df_sD$Danceability) + z_star_95 * (sd(df$Danceability) / sqrt(length(df_sD$Danceability))))
## lower upper
## 1 52.54266 53.89129
The 95% confidence intervals are slightly different between the CI calculated on the entire data set and on the portion within 1.5IQR. This is due to the changes that occur to the mean and number of values that result from removing the outliers from the data set. For Danceability, since the proportion of data values that lies within 1.5IQR is 0.998, there will only be very slight changes to the 95% CI calculation.
Speechiness
# 95% Confidence Interval for population mean based on the entire data set
df %>%
summarise(lower = mean(df$Speechiness) - z_star_95 * (sd(df$Speechiness) / sqrt(length(df$Speechiness))),
upper = mean(df$Speechiness) + z_star_95 * (sd(df$Speechiness) / sqrt(length(df$Speechiness))))
## lower upper
## 1 4.801791 5.188179
# 95% Confidence Interval for population mean based on the portion determined in b
lowerBound = quantile(df$Speechiness,0.25) - 1.5*IQR(df$Speechiness)
upperBound = quantile(df$Speechiness,0.75) + 1.5*IQR(df$Speechiness)
df_sS <- df %>% filter((Speechiness>lowerBound)&(Speechiness<upperBound))
df_sS %>%
summarise(lower = mean(df_sS$Speechiness) - z_star_95 * (sd(df$Speechiness) / sqrt(length(df_sS$Speechiness))),
upper = mean(df_sS$Speechiness) + z_star_95 * (sd(df$Speechiness) / sqrt(length(df_sS$Speechiness))))
## lower upper
## 1 3.623194 4.033303
The 95% confidence intervals are different between the CI calculated on the entire data set and on the proportion within 1.5IQR. Since the proportion of data values that lies within 1.5IQR for speechiness is 0.888, there will be more significant differences in the mean and number of values that are used to calculate the CI, when compared to the danceability calculations.
Energy
# t-critical value for a 95% confidence interval
t_star_95 <- qt(0.975, df = length(df$Energy) - 1)
#95% CI for population mean, assuming population variance is unknown
df %>%
summarise(lower = mean(df$Energy) - t_star_95 * (sd(df$Energy) / sqrt(length(df$Energy))),
upper = mean(df$Energy) + t_star_95 * (sd(df$Energy) / sqrt(length(df$Energy))))
## lower upper
## 1 58.70655 60.65253
Danceability
# t-critical value for a 95% confidence interval
t_star_95 <- qt(0.975, df = length(df_sD$Danceability) - 1)
#95% CI for population mean, assuming population variance is unknown
df_sD %>%
summarise(lower = mean(df_sD$Danceability) - t_star_95 * (sd(df_sD$Danceability) / sqrt(length(df_sD$Danceability))),
upper = mean(df_sD$Danceability) + t_star_95 * (sd(df_sD$Danceability) / sqrt(length(df_sD$Danceability))))
## lower upper
## 1 52.54573 53.88823
Speechiness
# t-critical value for a 95% confidence interval
t_star_95 <- qt(0.975, df = length(df_sS$Speechiness) - 1)
#95% CI for population mean, assuming population variance is unknown
df_sS %>%
summarise(lower = mean(df_sS$Speechiness) - t_star_95 * (sd(df_sS$Speechiness) / sqrt(length(df_sS$Speechiness))),
upper = mean(df_sS$Speechiness) + t_star_95 * (sd(df_sS$Speechiness) / sqrt(length(df_sS$Speechiness))))
## lower upper
## 1 3.776405 3.880092
Yes, the CI calculated for danceability and energy, assuming population variance is unknown, is different than in part c (when it is known). This is because we now are using the t-critical value instead of z-critical value, because we no longer know the population standard deviation. These two critical values will be different, since t is a function of the p-value and degrees of freedom. When calculating the CI using the t-value, we will be replacing the population standard deviation with the sample standard deviation.
The CI for energy is also different from part c. However, since the entire data set is included within the 1.5IQR range, the difference is solely driven by the difference between the t and z-values used.4. Modeling
For each of the selected variables assume that it follows one of the theoretical distributions (choose from Continuous or Discrete).
Energy Approximate distribution: Weibull Distribution Parameters: shape (k), scale (\(\lambda\))
#Estimated parameters for the energy distribution approximated as the weibull distribution
fit_w <- fitdist(df$Energy, "weibull")
fit_w
## Fitting of the distribution ' weibull ' by maximum likelihood
## Parameters:
## estimate Std. Error
## shape 3.013653 0.05573883
## scale 66.810679 0.51998742
Danceability Approximate distribution: Normal Distribution Parameters: mean (\(\mu\)), standard deviation (\(\sigma^2\))
#Estimated parameters for the danceability distribution approximated as the normal distribution
fit_n <- fitdist(df$Danceability, "norm")
fit_n
## Fitting of the distribution ' norm ' by maximum likelihood
## Parameters:
## estimate Std. Error
## mean 53.23821 0.3436997
## sd 15.34766 0.2430324
Speechiness Approximate distribution: Exponential Parameters: rate (\(\lambda\))
#Estimated parameters for the speechiness distribution approximated as the exponential distribution
fit_e <- fitdist(df$Speechiness, "exp")
fit_e
## Fitting of the distribution ' exp ' by maximum likelihood
## Parameters:
## estimate Std. Error
## rate 0.2002008 0.004483245
Energy
#Relative Frequency Plot and Theoretical Distribution
denscomp(list(fit_w))
Danceability
#Relative Frequency Plot and Theoretical Distribution
denscomp(list(fit_n))
Speechiness
#Relative Frequency Plot and Theoretical Distribution
denscomp(list(fit_e))
5. CLT
Use a random number generator to pick N samples of size n from the original data set, where N = 10, 100, 500 and n = 2, 5, 10. Analysis for 5 will be focused on the energy variable in this data set.
n = 2, N = 10
# Random Sample Generator for Energy
s_means_energy2_10 <- df %>%
rep_sample_n(size = 2, reps = 10, replace = TRUE) %>%
summarise(x_bar_energy = mean(Energy))
# Compute Average x_bar for Energy
mean(s_means_energy2_10$x_bar_energy)
## [1] 62.6
n = 2, N = 100
# Random Sample Generator for Energy
s_means_energy2_100 <- df %>%
rep_sample_n(size = 2, reps = 100, replace = TRUE) %>%
summarise(x_bar_energy = mean(Energy))
# Compute Average x_bar for Energy
mean(s_means_energy2_100$x_bar_energy)
## [1] 58.12
n = 2, N = 500
# Random Sample Generator for Energy
s_means_energy2_500 <- df %>%
rep_sample_n(size = 2, reps = 500, replace = TRUE) %>%
summarise(x_bar_energy = mean(Energy))
# Compute Average x_bar for Energy
mean(s_means_energy2_500$x_bar_energy)
## [1] 59.747
n = 5, N = 10
# Random Sample Generator for Energy
s_means_energy5_10 <- df %>%
rep_sample_n(size = 5, reps = 10, replace = TRUE) %>%
summarise(x_bar_energy = mean(Energy))
# Compute Average x_bar for Energy
mean(s_means_energy5_10$x_bar_energy)
## [1] 57.54
n = 5, N = 100
# Random Sample Generator for Energy
s_means_energy5_100 <- df %>%
rep_sample_n(size = 5, reps = 100, replace = TRUE) %>%
summarise(x_bar_energy = mean(Energy))
# Compute Average x_bar for Energy
mean(s_means_energy5_100$x_bar_energy)
## [1] 58.928
n = 5, N = 500
# Random Sample Generator for Energy
s_means_energy5_500 <- df %>%
rep_sample_n(size = 5, reps = 500, replace = TRUE) %>%
summarise(x_bar_energy = mean(Energy))
# Compute Average x_bar for Energy
mean(s_means_energy5_500$x_bar_energy)
## [1] 60.6692
n = 10, N = 10
# Random Sample Generator for Energy
s_means_energy10_10 <- df %>%
rep_sample_n(size = 10, reps = 10, replace = TRUE) %>%
summarise(x_bar_energy = mean(Energy))
# Compute Average x_bar for Energy
mean(s_means_energy10_10$x_bar_energy)
## [1] 55.48
n = 10, N = 100
# Random Sample Generator for Energy
s_means_energy10_100 <- df %>%
rep_sample_n(size = 10, reps = 100, replace = TRUE) %>%
summarise(x_bar_energy = mean(Energy))
# Compute Average x_bar for Energy
mean(s_means_energy10_100$x_bar_energy)
## [1] 58.954
n = 10, N = 500
# Random Sample Generator for Energy
s_means_energy10_500 <- df %>%
rep_sample_n(size = 10, reps = 500, replace = TRUE) %>%
summarise(x_bar_energy = mean(Energy))
# Compute Average x_bar for Energy
mean(s_means_energy10_500$x_bar_energy)
## [1] 59.491
The theoretical distribution of \(\overline{x}\) energy is approximately normal depending on the size of the sample. From the calculations and histograms above, we see that we need a sample number of at least 100 for the distribution to approximate a normal distribution. In addition, we can see that with increasing sample numbers (N) and size (n), the mean of the sample distribution gets closer to the true population mean of the entire data set, since extreme values have less impact on the sample mean. In addition, the standard deviation should decrease with increasing samples.
n = 2, N = 10
# Histogram of x_bar_energy
ggplot(data = s_means_energy2_10, aes(x = x_bar_energy)) +
geom_histogram(bins = 23) +
ggtitle("Distribution of x_bar_energy for n = 2, N = 10") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(face="bold"))
n = 2, N = 100
# Histogram of x_bar_energy
ggplot(data = s_means_energy2_100, aes(x = x_bar_energy)) +
geom_histogram(bins = 23) +
ggtitle("Distribution of x_bar_energy for n = 2, N = 100") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(face="bold"))
n = 2, N = 500
# Histogram of x_bar_energy
ggplot(data = s_means_energy2_500, aes(x = x_bar_energy)) +
geom_histogram(bins = 23) +
ggtitle("Distribution of x_bar_energy for n = 2, N = 500") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(face="bold"))
n = 5, N = 10
# Histogram of x_bar_energy
ggplot(data = s_means_energy5_10, aes(x = x_bar_energy)) +
geom_histogram(bins = 23) +
ggtitle("Distribution of x_bar_energy for n = 5, N = 10") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(face="bold"))
n = 5, N = 100
# Histogram of x_bar_energy
ggplot(data = s_means_energy5_100, aes(x = x_bar_energy)) +
geom_histogram(bins = 23) +
ggtitle("Distribution of x_bar_energy for n = 5, N = 100") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(face="bold"))
n = 5, N = 500
# Histogram of x_bar_energy
ggplot(data = s_means_energy5_500, aes(x = x_bar_energy)) +
geom_histogram(bins = 23) +
ggtitle("Distribution of x_bar_energy for n = 5, N = 500") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(face="bold"))
n = 10, N = 10
# Histogram of x_bar_energy
ggplot(data = s_means_energy10_10, aes(x = x_bar_energy)) +
geom_histogram() +
ggtitle("Distribution of x_bar_energy for n = 10, N = 10") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(face="bold"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
n = 10, N = 100
# Histogram of x_bar_energy
ggplot(data = s_means_energy10_100, aes(x = x_bar_energy)) +
geom_histogram(bins = 23) +
ggtitle("Distribution of x_bar_energy for n = 10, N = 100") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(face="bold"))
n = 10, N = 500
# Histogram of x_bar_energy
ggplot(data = s_means_energy10_500, aes(x = x_bar_energy)) +
geom_histogram(bins = 23) +
ggtitle("Distribution of x_bar_energy for n = 10, N = 500") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(face="bold"))
#Relative Frequency Plot for n = 2, N = 500
ggplot(data = s_means_energy2_500, aes(x = x_bar_energy)) +
geom_histogram(aes(y=..density..) ,bins = 23) +
ggtitle("Distribution of x_bar_energy for n = 2, N = 500") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(face="bold")) +
ylab("Density") +
stat_function(fun = dnorm, args = list(mean = mean(s_means_energy2_500$x_bar_energy), sd = sd(s_means_energy2_500$x_bar_energy)))
#Relative Frequency Plot for n = 5, N = 500
ggplot(data = s_means_energy5_500, aes(x = x_bar_energy)) +
geom_histogram(aes(y=..density..) ,bins = 23) +
ggtitle("Distribution of x_bar_energy for n = 5, N = 500") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(face="bold")) +
ylab("Density") +
stat_function(fun = dnorm, args = list(mean = mean(s_means_energy5_500$x_bar_energy), sd = sd(s_means_energy5_500$x_bar_energy)))
#Relative Frequency Plot for n = 10, N = 500
ggplot(data = s_means_energy10_500, aes(x = x_bar_energy)) +
geom_histogram(aes(y=..density..) ,bins = 23) +
ggtitle("Distribution of x_bar_energy for n = 10, N = 500") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(face="bold")) +
ylab("Density") +
stat_function(fun = dnorm, args = list(mean = mean(s_means_energy10_500$x_bar_energy), sd = sd(s_means_energy10_500$x_bar_energy)))
6. Relationship between two variables
#Rename Beats Per Minute column to BPM for ease of coding
names(df)[names(df)=="Beats.Per.Minute..BPM."] <- "BPM"
#Scatter plot of Valence and BPM
ggplot(df, aes(x = BPM, y = Valence)) +
geom_point(alpha = 0.5) +
ggtitle("Plot of Valence and Beats Per Minute") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(face="bold"))
#Sample Correlation Coefficient
r <- cov(df$BPM, df$Valence)/(sd(df$BPM)*sd(df$Valence))
r
## [1] 0.05965322
`
# Critical Value for 95% CI
z_star_95 <- qnorm(0.975)
# Calculation for the CI for the correlation coefficient
z_r <- log((1+r)/(1-r)) / 2
L <- z_r - (z_star_95 / sqrt(nrow(df)-3))
U <- z_r + (z_star_95 / sqrt(nrow(df)-3))
lower <- (exp(2*L)-1)/(exp(2*L)+1)
upper <- (exp(2*U)-1)/(exp(2*U)+1)
CI <- data.frame(lower,upper)
CI
## lower upper
## 1 0.01579775 0.1032796
Null Hypothesis: There is no association between Valence and BPM
Alternative Hypothesis: There is an association between Valence
and BPM
Significance Level: 0.05
#Chi-Square Test for BPM and Valence
chisq.test(df$BPM, df$Valence, correct = FALSE, simulate.p.value=TRUE)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: df$BPM and df$Valence
## X-squared = 13807, df = NA, p-value = 0.4443
#Box Plot of Popularity by Year
ggplot(df, aes(y = Popularity, x = Year, group = Year)) +
geom_boxplot() +
xlab("Release Year") +
ylab("Popularity") +
ggtitle("Popularity of Song by Release Year") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(face="bold"))
The box plots allow us to visually see if there are outliers of
popularity within each Year category, as seen by the black dots. There
are only outliers outside of the lower 1.5IQR bound. The box plots also
increase in range with increasing year, which may be caused by several
factors. For example, later years may have more data points, thus having
a higher chance of having a larger range or earlier years may have songs
that consistently have a popularity within a certain range.
#Adding a column, stratifying songs as recent or not recent based on whether it was released within the last 10 years.
df <- df %>%
mutate(Recent = case_when(
Year > 2011 ~ "Recent",
Year < 2012 ~ "Not Recent"))
#Hypothesis testing with the above parameters and null/alternative hypotheses
inference(y = Popularity, x = Recent, data = df, statistic = "mean", type = "ht", null = 0, alternative = "twosided", method = "theoretical")
## Response variable: numerical
## Explanatory variable: categorical (2 levels)
## n_Not Recent = 1674, y_bar_Not Recent = 59.9086, s_Not Recent = 13.0307
## n_Recent = 320, y_bar_Recent = 57.5281, s_Recent = 19.7871
## H0: mu_Not Recent = mu_Recent
## HA: mu_Not Recent != mu_Recent
## t = 2.0681, df = 319
## p_value = 0.0394
Conclusion: The p-value is 0.0394 which is smaller than alpha = 0.05. This indicates that we can reject the null hypothesis of there being no difference in the average popularity value between recent and older songs.