knitr::opts_chunk$set(include = FALSE)
# Load libraries
# Install tidyverse only when it is not available yet. requireNamespace()
# checks for the package without attaching it as a side effect; the package
# is attached by the library(tidyverse) call further down.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Install gridExtra only when it is not available yet. requireNamespace()
# checks for the package without attaching it as a side effect; the package
# is attached by the library(gridExtra) call further down.
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  install.packages("gridExtra")
}
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(tidyverse)
library(readr)
library(dplyr)
library(ggplot2)
library(gridExtra)
This case study investigates the relationship between calorie burn and various parameters collected by a fitness tracking app. The primary goal is to understand which factors are most influential in calorie expenditure and demonstrate the importance of calorie tracking for overall health management. We will explore how different activities, duration, intensity, steps, sleep, and weight contribute to calorie burn. Ultimately, this analysis aims to provide actionable insights for users to optimize their fitness routines and achieve their health goals.
The data for this analysis comes from a publicly available Fitbit dataset. It includes various metrics related to daily activity, hourly activity, sleep, and weight. The following datasets were used:
dailyActivity_merged.csv: Daily summary of activity.
dailyCalories_merged.csv: Daily calorie estimates.
dailyIntensities_merged.csv: Daily intensity levels.
dailySteps_merged.csv: Daily step counts.
hourlyCalories_merged.csv: Hourly calorie estimates.
hourlyIntensities_merged.csv: Hourly intensity levels.
hourlySteps_merged.csv: Hourly step counts.
sleepDay_merged.csv: Daily sleep records.
heartrate_seconds_merged.csv: Second-by-second heart rate data (not used in this analysis due to complexity).
weightLogInfo_merged.csv: Weight logs.
minuteCaloriesNarrow_merged.csv,
minuteIntensitiesNarrow_merged.csv, minuteMETsNarrow_merged.csv,
minuteStepsNarrow_merged.csv, minuteSleep_merged.csv:
Minute-level data (not used in this initial analysis due to aggregation
requirements).
# Load every CSV export into its own data frame.
# Daily, hourly, sleep, heart-rate and weight files are read with base
# read.csv().
activity <- read.csv(file = "dailyActivity_merged.csv")
calories <- read.csv(file = "dailyCalories_merged.csv")
intensities <- read.csv(file = "dailyIntensities_merged.csv")
steps <- read.csv(file = "dailySteps_merged.csv")
hcalories <- read.csv(file = "hourlyCalories_merged.csv")
hintensities <- read.csv(file = "hourlyIntensities_merged.csv")
hsteps <- read.csv(file = "hourlySteps_merged.csv")
dsleep <- read.csv(file = "sleepDay_merged.csv")
heartrate <- read.csv(file = "heartrate_seconds_merged.csv")
msleep <- read.csv(file = "minuteSleep_merged.csv")
weight <- read.csv(file = "weightLogInfo_merged.csv")
# The minute-level "narrow" files are read with readr::read_csv(); the console
# output recorded for each call is kept right after it.
mcalories <- readr::read_csv(file = "minuteCaloriesNarrow_merged.csv")
## Rows: 1325580 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityMinute
## dbl (2): Id, Calories
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mintensities <- readr::read_csv(file = "minuteIntensitiesNarrow_merged.csv")
## Rows: 1325580 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityMinute
## dbl (2): Id, Intensity
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mmets <- readr::read_csv(file = "minuteMETsNarrow_merged.csv")
## Rows: 1325580 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityMinute
## dbl (2): Id, METs
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
msteps <- readr::read_csv(file = "minuteStepsNarrow_merged.csv")
## Rows: 1325580 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityMinute
## dbl (2): Id, Steps
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Check the columns of every data frame to validate the data and to see what
# each table has to offer for the analysis.
# Column names per table (recorded output follows each call).
names(activity)
## [1] "Id" "ActivityDate"
## [3] "TotalSteps" "TotalDistance"
## [5] "TrackerDistance" "LoggedActivitiesDistance"
## [7] "VeryActiveDistance" "ModeratelyActiveDistance"
## [9] "LightActiveDistance" "SedentaryActiveDistance"
## [11] "VeryActiveMinutes" "FairlyActiveMinutes"
## [13] "LightlyActiveMinutes" "SedentaryMinutes"
## [15] "Calories"
names(calories)
## [1] "Id" "ActivityDay" "Calories"
names(intensities)
## [1] "Id" "ActivityDay"
## [3] "SedentaryMinutes" "LightlyActiveMinutes"
## [5] "FairlyActiveMinutes" "VeryActiveMinutes"
## [7] "SedentaryActiveDistance" "LightActiveDistance"
## [9] "ModeratelyActiveDistance" "VeryActiveDistance"
names(steps)
## [1] "Id" "ActivityDay" "StepTotal"
names(heartrate)
## [1] "Id" "Time" "Value"
names(hcalories)
## [1] "Id" "ActivityHour" "Calories"
names(hintensities)
## [1] "Id" "ActivityHour" "TotalIntensity" "AverageIntensity"
names(hsteps)
## [1] "Id" "ActivityHour" "StepTotal"
names(mcalories)
## [1] "Id" "ActivityMinute" "Calories"
names(mintensities)
## [1] "Id" "ActivityMinute" "Intensity"
names(mmets)
## [1] "Id" "ActivityMinute" "METs"
names(msleep)
## [1] "Id" "date" "value" "logId"
names(msteps)
## [1] "Id" "ActivityMinute" "Steps"
names(weight)
## [1] "Id" "Date" "WeightKg" "WeightPounds"
## [5] "Fat" "BMI" "IsManualReport" "LogId"
names(dsleep)
## [1] "Id" "SleepDay" "TotalSleepRecords"
## [4] "TotalMinutesAsleep" "TotalTimeInBed"
# Number of distinct Id values per table.
dplyr::n_distinct(activity$Id)
## [1] 33
dplyr::n_distinct(heartrate$Id)
## [1] 14
dplyr::n_distinct(hcalories$Id)
## [1] 33
dplyr::n_distinct(hintensities$Id)
## [1] 33
dplyr::n_distinct(hsteps$Id)
## [1] 33
dplyr::n_distinct(mcalories$Id)
## [1] 33
dplyr::n_distinct(mintensities$Id)
## [1] 33
dplyr::n_distinct(mmets$Id)
## [1] 33
dplyr::n_distinct(msleep$Id)
## [1] 24
dplyr::n_distinct(msteps$Id)
## [1] 33
dplyr::n_distinct(weight$Id)
## [1] 8
dplyr::n_distinct(calories$Id)
## [1] 33
dplyr::n_distinct(intensities$Id)
## [1] 33
dplyr::n_distinct(steps$Id)
## [1] 33
dplyr::n_distinct(dsleep$Id)
## [1] 24
# Preview the first rows of the candidate tables to choose which ones to
# work with (recorded output follows each call).
activity %>% head()
## Id ActivityDate TotalSteps TotalDistance TrackerDistance
## 1 1503960366 4/12/2016 13162 8.50 8.50
## 2 1503960366 4/13/2016 10735 6.97 6.97
## 3 1503960366 4/14/2016 10460 6.74 6.74
## 4 1503960366 4/15/2016 9762 6.28 6.28
## 5 1503960366 4/16/2016 12669 8.16 8.16
## 6 1503960366 4/17/2016 9705 6.48 6.48
## LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## 1 0 1.88 0.55
## 2 0 1.57 0.69
## 3 0 2.44 0.40
## 4 0 2.14 1.26
## 5 0 2.71 0.41
## 6 0 3.19 0.78
## LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## 1 6.06 0 25
## 2 4.71 0 21
## 3 3.91 0 30
## 4 2.83 0 29
## 5 5.04 0 36
## 6 2.51 0 38
## FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories
## 1 13 328 728 1985
## 2 19 217 776 1797
## 3 11 181 1218 1776
## 4 34 209 726 1745
## 5 10 221 773 1863
## 6 20 164 539 1728
calories %>% head()
## Id ActivityDay Calories
## 1 1503960366 4/12/2016 1985
## 2 1503960366 4/13/2016 1797
## 3 1503960366 4/14/2016 1776
## 4 1503960366 4/15/2016 1745
## 5 1503960366 4/16/2016 1863
## 6 1503960366 4/17/2016 1728
dsleep %>% head()
## Id SleepDay TotalSleepRecords TotalMinutesAsleep
## 1 1503960366 4/12/2016 12:00:00 AM 1 327
## 2 1503960366 4/13/2016 12:00:00 AM 2 384
## 3 1503960366 4/15/2016 12:00:00 AM 1 412
## 4 1503960366 4/16/2016 12:00:00 AM 2 340
## 5 1503960366 4/17/2016 12:00:00 AM 1 700
## 6 1503960366 4/19/2016 12:00:00 AM 1 304
## TotalTimeInBed
## 1 346
## 2 407
## 3 442
## 4 367
## 5 712
## 6 320
steps %>% head()
## Id ActivityDay StepTotal
## 1 1503960366 4/12/2016 13162
## 2 1503960366 4/13/2016 10735
## 3 1503960366 4/14/2016 10460
## 4 1503960366 4/15/2016 9762
## 5 1503960366 4/16/2016 12669
## 6 1503960366 4/17/2016 9705
weight %>% head()
## Id Date WeightKg WeightPounds Fat BMI
## 1 1503960366 5/2/2016 11:59:59 PM 52.6 115.9631 22 22.65
## 2 1503960366 5/3/2016 11:59:59 PM 52.6 115.9631 NA 22.65
## 3 1927972279 4/13/2016 1:08:52 AM 133.5 294.3171 NA 47.54
## 4 2873212765 4/21/2016 11:59:59 PM 56.7 125.0021 NA 21.45
## 5 2873212765 5/12/2016 11:59:59 PM 57.3 126.3249 NA 21.69
## 6 4319703577 4/17/2016 11:59:59 PM 72.4 159.6147 25 27.45
## IsManualReport LogId
## 1 True 1.462234e+12
## 2 True 1.462320e+12
## 3 False 1.460510e+12
## 4 True 1.461283e+12
## 5 True 1.463098e+12
## 6 True 1.460938e+12
# Summary statistics of the chosen tables, to understand their value ranges
# before deciding how to use them (recorded output follows each call).
activity %>% summary()
## Id ActivityDate TotalSteps TotalDistance
## Min. :1.504e+09 Length:940 Min. : 0 Min. : 0.000
## 1st Qu.:2.320e+09 Class :character 1st Qu.: 3790 1st Qu.: 2.620
## Median :4.445e+09 Mode :character Median : 7406 Median : 5.245
## Mean :4.855e+09 Mean : 7638 Mean : 5.490
## 3rd Qu.:6.962e+09 3rd Qu.:10727 3rd Qu.: 7.713
## Max. :8.878e+09 Max. :36019 Max. :28.030
## TrackerDistance LoggedActivitiesDistance VeryActiveDistance
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 2.620 1st Qu.:0.0000 1st Qu.: 0.000
## Median : 5.245 Median :0.0000 Median : 0.210
## Mean : 5.475 Mean :0.1082 Mean : 1.503
## 3rd Qu.: 7.710 3rd Qu.:0.0000 3rd Qu.: 2.053
## Max. :28.030 Max. :4.9421 Max. :21.920
## ModeratelyActiveDistance LightActiveDistance SedentaryActiveDistance
## Min. :0.0000 Min. : 0.000 Min. :0.000000
## 1st Qu.:0.0000 1st Qu.: 1.945 1st Qu.:0.000000
## Median :0.2400 Median : 3.365 Median :0.000000
## Mean :0.5675 Mean : 3.341 Mean :0.001606
## 3rd Qu.:0.8000 3rd Qu.: 4.782 3rd Qu.:0.000000
## Max. :6.4800 Max. :10.710 Max. :0.110000
## VeryActiveMinutes FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:127.0 1st Qu.: 729.8
## Median : 4.00 Median : 6.00 Median :199.0 Median :1057.5
## Mean : 21.16 Mean : 13.56 Mean :192.8 Mean : 991.2
## 3rd Qu.: 32.00 3rd Qu.: 19.00 3rd Qu.:264.0 3rd Qu.:1229.5
## Max. :210.00 Max. :143.00 Max. :518.0 Max. :1440.0
## Calories
## Min. : 0
## 1st Qu.:1828
## Median :2134
## Mean :2304
## 3rd Qu.:2793
## Max. :4900
calories %>% summary()
## Id ActivityDay Calories
## Min. :1.504e+09 Length:940 Min. : 0
## 1st Qu.:2.320e+09 Class :character 1st Qu.:1828
## Median :4.445e+09 Mode :character Median :2134
## Mean :4.855e+09 Mean :2304
## 3rd Qu.:6.962e+09 3rd Qu.:2793
## Max. :8.878e+09 Max. :4900
dsleep %>% summary()
## Id SleepDay TotalSleepRecords TotalMinutesAsleep
## Min. :1.504e+09 Length:413 Min. :1.000 Min. : 58.0
## 1st Qu.:3.977e+09 Class :character 1st Qu.:1.000 1st Qu.:361.0
## Median :4.703e+09 Mode :character Median :1.000 Median :433.0
## Mean :5.001e+09 Mean :1.119 Mean :419.5
## 3rd Qu.:6.962e+09 3rd Qu.:1.000 3rd Qu.:490.0
## Max. :8.792e+09 Max. :3.000 Max. :796.0
## TotalTimeInBed
## Min. : 61.0
## 1st Qu.:403.0
## Median :463.0
## Mean :458.6
## 3rd Qu.:526.0
## Max. :961.0
steps %>% summary()
## Id ActivityDay StepTotal
## Min. :1.504e+09 Length:940 Min. : 0
## 1st Qu.:2.320e+09 Class :character 1st Qu.: 3790
## Median :4.445e+09 Mode :character Median : 7406
## Mean :4.855e+09 Mean : 7638
## 3rd Qu.:6.962e+09 3rd Qu.:10727
## Max. :8.878e+09 Max. :36019
weight %>% summary()
## Id Date WeightKg WeightPounds
## Min. :1.504e+09 Length:67 Min. : 52.60 Min. :116.0
## 1st Qu.:6.962e+09 Class :character 1st Qu.: 61.40 1st Qu.:135.4
## Median :6.962e+09 Mode :character Median : 62.50 Median :137.8
## Mean :7.009e+09 Mean : 72.04 Mean :158.8
## 3rd Qu.:8.878e+09 3rd Qu.: 85.05 3rd Qu.:187.5
## Max. :8.878e+09 Max. :133.50 Max. :294.3
##
## Fat BMI IsManualReport LogId
## Min. :22.00 Min. :21.45 Length:67 Min. :1.460e+12
## 1st Qu.:22.75 1st Qu.:23.96 Class :character 1st Qu.:1.461e+12
## Median :23.50 Median :24.39 Mode :character Median :1.462e+12
## Mean :23.50 Mean :25.19 Mean :1.462e+12
## 3rd Qu.:24.25 3rd Qu.:25.56 3rd Qu.:1.462e+12
## Max. :25.00 Max. :47.54 Max. :1.463e+12
## NA's :65
# Prepare the tables for merging: give each one a `date` column of class Date
# so they can be joined on (Id, date).
#
# The raw stamps carry a four-digit year ("4/12/2016", "4/12/2016 12:00:00 AM",
# "5/2/2016 11:59:59 PM"), so the format has to use %Y. With %y only the first
# two digits ("20") are consumed and every row is parsed as year 2020.
# as.Date() ignores any characters after the part matched by the format, which
# is what drops the time of day from the sleep and weight stamps.
fitbit_date_format <- "%m/%d/%Y"

activity <- activity %>%
  rename(date = ActivityDate) %>%
  mutate(date = as.Date(date, format = fitbit_date_format))

calories <- calories %>%
  rename(date = ActivityDay) %>%
  mutate(date = as.Date(date, format = fitbit_date_format))

dsleep <- dsleep %>%
  rename(date = SleepDay) %>%
  mutate(date = as.Date(date, format = fitbit_date_format))

steps <- steps %>%
  rename(date = ActivityDay) %>%
  mutate(date = as.Date(date, format = fitbit_date_format))

weight <- weight %>%
  rename(date = Date) %>%
  mutate(date = as.Date(date, format = fitbit_date_format))

# NOTE(review): ActivityHour is presumably stamped like the other tables
# (month/day/four-digit year followed by a time) -- confirm against the file.
hcalories <- hcalories %>%
  rename(date = ActivityHour) %>%
  mutate(date = as.Date(date, format = fitbit_date_format))
# Original author's note: the join failed on memory allocation until the date
# columns were adjusted, so all tables are joined on (Id, date).
#
# hcalories holds one row per hour. Joining it as-is repeats every daily
# activity row once per hourly record, which multiplies the points in every
# plot and makes the smoothers look more certain than the data allows.
# Collapse it to one row per Id and date before joining.
hourly_daily <- hcalories %>%
  group_by(Id, date) %>%
  summarise(Calories = sum(Calories), .groups = "drop")

# Column names match the previous output: Calories.x (from activity),
# Calories.y (now the daily sum of the hourly records) and Calories (from the
# daily calories table, which is the column the plots use).
# relationship = "many-to-many" is kept on the sleep join so that repeated
# (Id, date) keys on either side do not raise a warning.
finaltable <- activity %>%
  left_join(hourly_daily, by = c("Id", "date")) %>%
  left_join(calories, by = c("Id", "date")) %>%
  left_join(steps, by = c("Id", "date")) %>%
  left_join(weight, by = c("Id", "date")) %>%
  left_join(dsleep, by = c("Id", "date"), relationship = "many-to-many")

# Export the merged table so the visualisation work can start from a clean
# file. write_csv() never writes row names, so no extra argument is needed.
write_csv(finaltable, "FinalWorkingTable.csv")
# Visualising relationships
# Distance versus calories burned: one scatter panel per intensity band,
# points coloured by TotalSteps, each with a smoothed trend line.
T1 <- ggplot(finaltable, aes(x = VeryActiveDistance, y = Calories)) +
  geom_point(aes(color = TotalSteps)) +
  geom_smooth() +
  labs(x = "Active Distance Covered", y = "Calories Burned")

T2 <- ggplot(finaltable, aes(x = ModeratelyActiveDistance, y = Calories)) +
  geom_point(aes(color = TotalSteps)) +
  geom_smooth() +
  labs(x = "Moderate Activity Distance Covered", y = "Calories Burned")

T3 <- ggplot(finaltable, aes(x = LightActiveDistance, y = Calories)) +
  geom_point(aes(color = TotalSteps)) +
  geom_smooth() +
  labs(x = "Light Activity Distance Covered", y = "Calories Burned")

# T4 uses an explicit linear smoother. A previous run with the default
# smoother (method = 'gam', formula y ~ s(x, bs = "cs")) logged
# "Failed to fit group -1 ... x has insufficient unique values to support
# 10 knots: reduce k.", which left a panel without a trend line.
# SedentaryActiveDistance is the only x variable here whose quartiles are all
# 0 (range 0 to 0.11), so it is the one that cannot support the spline.
T4 <- ggplot(finaltable, aes(x = SedentaryActiveDistance, y = Calories)) +
  geom_point(aes(color = TotalSteps)) +
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(x = "Sedentary Activity Distance Covered", y = "Calories Burned")

grid.arrange(T1, T2, T3, T4)
# Time spent per intensity band versus calories burned: one panel per band,
# points coloured by the same minutes variable that is on the x axis.
M1 <- ggplot(
  data = finaltable,
  mapping = aes(x = VeryActiveMinutes, y = Calories)
) +
  geom_point(mapping = aes(color = VeryActiveMinutes)) +
  geom_smooth() +
  labs(
    x = "Active Minutes Consumed",
    y = "Calories Burned",
    color = "Minutes"
  )

M2 <- ggplot(
  data = finaltable,
  mapping = aes(x = FairlyActiveMinutes, y = Calories)
) +
  geom_point(mapping = aes(color = FairlyActiveMinutes)) +
  geom_smooth() +
  labs(
    x = "Fairly Active Minutes Consumed",
    y = "Calories Burned",
    color = "Minutes"
  )

M3 <- ggplot(
  data = finaltable,
  mapping = aes(x = LightlyActiveMinutes, y = Calories)
) +
  geom_point(mapping = aes(color = LightlyActiveMinutes)) +
  geom_smooth() +
  labs(
    x = "Lightly Active Minutes Consumed",
    y = "Calories Burned",
    color = "Minutes"
  )

M4 <- ggplot(
  data = finaltable,
  mapping = aes(x = SedentaryMinutes, y = Calories)
) +
  geom_point(mapping = aes(color = SedentaryMinutes)) +
  geom_smooth() +
  labs(
    x = "Sedentary Minutes Consumed",
    y = "Calories Burned",
    color = "Minutes"
  )

# Lay the four panels out on one page.
grid.arrange(M1, M2, M3, M4)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Other parameters versus calories burned: steps, distance, sleep, time in
# bed and weight, plus time in bed versus time asleep.
Tsteps <- ggplot(finaltable, aes(x = TotalSteps, y = Calories)) +
  geom_point(aes(color = TotalSteps)) +
  geom_smooth() +
  labs(x = "Steps", y = "Calories Burned", color = "Steps")

TDistance <- ggplot(finaltable, aes(x = TotalDistance, y = Calories)) +
  geom_point(aes(color = TotalDistance)) +
  geom_smooth() +
  labs(x = "Total Distance", y = "Calories Burned", color = "Distance")

TMS <- ggplot(finaltable, aes(x = TotalMinutesAsleep, y = Calories)) +
  geom_point(aes(color = TotalMinutesAsleep)) +
  geom_smooth() +
  labs(
    x = "Total Sleep in Minutes",
    y = "Calories Burned",
    color = "Total Sleeping in Minutes"
  )

TTB <- ggplot(finaltable, aes(x = TotalTimeInBed, y = Calories)) +
  geom_point(aes(color = TotalTimeInBed)) +
  geom_smooth() +
  labs(
    x = "Total Time in Bed",
    y = "Calories Burned",
    color = "Total Time In Bed"
  )

WC <- ggplot(finaltable, aes(x = WeightKg, y = Calories)) +
  geom_point(aes(color = WeightKg)) +
  geom_smooth() +
  labs(x = "Weight", y = "Calories Burned", color = "Weight")

# No colour aesthetic is mapped in this plot, so the former `color` label was
# never displayed and has been dropped; the y-axis label typo
# ("Total Time A Sleep") is fixed.
# NOTE(review): geom_step() joins the observations in x order; confirm a step
# line is intended here rather than a scatter of points.
TTZ <- ggplot(finaltable, aes(x = TotalTimeInBed, y = TotalMinutesAsleep)) +
  geom_step(direction = "hv") +
  geom_smooth() +
  labs(x = "Total Time in Bed", y = "Total Time Asleep")

# Rows without a sleep or weight record have NA in those columns and are
# dropped by ggplot2, as the recorded warnings below show.
grid.arrange(Tsteps, TDistance, TMS, WC, TTB, TTZ)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 12406 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 12406 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 20598 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 20598 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 12406 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 12406 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 12406 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 856 rows containing missing values or values outside the scale range
## (`geom_step()`).