# Load the health and sleep data set from the working directory.
df <- read.csv(file = "Health_Sleep_Statistics.csv")

# Univariate summary of Sleep.Quality: centre (mean, median) and spread (sd),
# with missing values ignored.
mean(df[["Sleep.Quality"]], na.rm = TRUE)
median(df[["Sleep.Quality"]], na.rm = TRUE)
sd(df[["Sleep.Quality"]], na.rm = TRUE)

(This is the univariate analysis for the y variable, Sleep.Quality.)


```{r}
# Bar chart of the number of rows at each physical activity level.
barplot(
  height = table(df[["Physical.Activity.Level"]]),
  main = "Physical Activity Levels",
  xlab = "Activity Level",
  ylab = "Number of People",
  col = c("lightblue", "lightgreen", "salmon")
)
```
(This is the univariate analysis for the x variable, Physical.Activity.Level.)

library(dplyr)
# Numerical summary of Sleep.Quality within each Physical.Activity.Level group:
# centre (Mean, Median), spread (SD), and group size (n).
summary_stats <- summarise(
  group_by(df, Physical.Activity.Level),
  Mean = mean(Sleep.Quality, na.rm = TRUE),
  Median = median(Sleep.Quality, na.rm = TRUE),
  SD = sd(Sleep.Quality, na.rm = TRUE),
  n = n()  # rows in the group, including rows with missing Sleep.Quality
)
summary_stats

(This is the numerical summary of Sleep.Quality within each Physical.Activity.Level group, for the bivariate analysis.)

library(ggplot2)
# Side-by-side boxplots of Sleep.Quality for each Physical.Activity.Level,
# with one fill colour per activity level.
ggplot(
  df,
  aes(
    x = Physical.Activity.Level,
    y = Sleep.Quality,
    fill = Physical.Activity.Level
  )
) +
  geom_boxplot() +
  labs(
    title = "Sleep Quality vs Physical Activity Level",
    x = "Physical Activity Level",
    # The y aesthetic is Sleep.Quality, so the axis is labelled as quality
    # rather than hours.
    y = "Sleep Quality",
    # Readable legend title instead of the raw column name.
    fill = "Physical Activity Level"
  ) +
  scale_fill_manual(values = c("lightblue", "lightgreen", "salmon"))

(This is a visualization of Sleep.Quality by Physical.Activity.Level, for the bivariate analysis.)

# Two-sided permutation test for the difference in mean Sleep.Quality between
# the "high" and "low" Physical.Activity.Level groups.
#
# Results: `obs_diff` (observed high - low difference in means), `perm_diffs`
# (the same statistic under `n_perm` random relabellings), and `p_value`.
set.seed(123)  # fixed seed so the permutation p-value is reproducible
n_perm <- 10000

# Keep only complete rows from the two groups being compared. Shuffling labels
# within this subset exchanges "high" and "low" only, so rows from any other
# activity level do not influence the null distribution, and both group sizes
# stay fixed across permutations.
in_test <- df$Physical.Activity.Level %in% c("high", "low") &
  !is.na(df$Sleep.Quality)
quality <- df$Sleep.Quality[in_test]
activity <- as.character(df$Physical.Activity.Level[in_test])

# Fail loudly instead of silently producing NaN/NA when a group is empty
# (for example, if the labels are spelled or capitalised differently).
if (!any(activity == "high") || !any(activity == "low")) {
  stop(
    "Both \"high\" and \"low\" Physical.Activity.Level groups need at least ",
    "one non-missing Sleep.Quality value.",
    call. = FALSE
  )
}

# Test statistic: mean of the "high" group minus mean of the "low" group.
mean_diff <- function(labels) {
  mean(quality[labels == "high"]) - mean(quality[labels == "low"])
}

obs_diff <- mean_diff(activity)
perm_diffs <- replicate(n_perm, mean_diff(sample(activity)))

# Count the observed labelling as one of the permutations (+1 in numerator and
# denominator) so the estimated p-value can never be exactly zero.
p_value <- (sum(abs(perm_diffs) >= abs(obs_diff)) + 1) / (n_perm + 1)
print(p_value)