Interactive mode: click a code block or Show Plot button to reveal/hide its corresponding plot.
We’ll use the NHANES dataset, which contains demographic, health, and lifestyle data from a U.S. population survey.
library("NHANES")
library(tidyverse)
library(ggplot2)
library(haven)
# Load the NHANES dataset object and copy it into a plain data frame called
# `data`, which the rest of this tutorial works from.
data("NHANES")
data <- data.frame(NHANES)

# Compact overview of every column: name, type, and the first few values.
data %>%
  glimpse()
## Rows: 10,000
## Columns: 76
## $ ID <int> 51624, 51624, 51624, 51625, 51630, 51638, 51646, 51647, 51647, 51647, 516…
## $ SurveyYr <fct> 2009_10, 2009_10, 2009_10, 2009_10, 2009_10, 2009_10, 2009_10, 2009_10, 2…
## $ Gender <fct> male, male, male, male, female, male, male, female, female, female, male,…
## $ Age <int> 34, 34, 34, 4, 49, 9, 8, 45, 45, 45, 66, 58, 54, 10, 58, 50, 9, 33, 60, 1…
## $ AgeDecade <fct> 30-39, 30-39, 30-39, 0-9, 40-49, 0-9, 0-9, 40-49, 40-49, 40-49,…
## $ AgeMonths <int> 409, 409, 409, 49, 596, 115, 101, 541, 541, 541, 795, 707, 654, 123, 700,…
## $ Race1 <fct> White, White, White, Other, White, White, White, White, White, White, Whi…
## $ Race3 <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Education <fct> High School, High School, High School, NA, Some College, NA, NA, College …
## $ MaritalStatus <fct> Married, Married, Married, NA, LivePartner, NA, NA, Married, Married, Mar…
## $ HHIncome <fct> 25000-34999, 25000-34999, 25000-34999, 20000-24999, 35000-44999, 75000-99…
## $ HHIncomeMid <int> 30000, 30000, 30000, 22500, 40000, 87500, 60000, 87500, 87500, 87500, 300…
## $ Poverty <dbl> 1.36, 1.36, 1.36, 1.07, 1.91, 1.84, 2.33, 5.00, 5.00, 5.00, 2.20, 5.00, 2…
## $ HomeRooms <int> 6, 6, 6, 9, 5, 6, 7, 6, 6, 6, 5, 10, 6, 10, 10, 4, 3, 11, 5, 7, 10, 10, 9…
## $ HomeOwn <fct> Own, Own, Own, Own, Rent, Rent, Own, Own, Own, Own, Own, Rent, Rent, Own,…
## $ Work <fct> NotWorking, NotWorking, NotWorking, NA, NotWorking, NA, NA, Working, Work…
## $ Weight <dbl> 87.4, 87.4, 87.4, 17.0, 86.7, 29.8, 35.2, 75.7, 75.7, 75.7, 68.0, 78.4, 7…
## $ Length <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ HeadCirc <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Height <dbl> 164.7, 164.7, 164.7, 105.4, 168.4, 133.1, 130.6, 166.7, 166.7, 166.7, 169…
## $ BMI <dbl> 32.22, 32.22, 32.22, 15.30, 30.57, 16.82, 20.64, 27.24, 27.24, 27.24, 23.…
## $ BMICatUnder20yrs <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ BMI_WHO <fct> 30.0_plus, 30.0_plus, 30.0_plus, 12.0_18.5, 30.0_plus, 12.0_18.5, 18.5_to…
## $ Pulse <int> 70, 70, 70, NA, 86, 82, 72, 62, 62, 62, 60, 62, 76, 80, 94, 74, 92, 96, 8…
## $ BPSysAve <int> 113, 113, 113, NA, 112, 86, 107, 118, 118, 118, 111, 104, 134, 104, 127, …
## $ BPDiaAve <int> 85, 85, 85, NA, 75, 47, 37, 64, 64, 64, 63, 74, 85, 68, 83, 68, 63, 74, 1…
## $ BPSys1 <int> 114, 114, 114, NA, 118, 84, 114, 106, 106, 106, 124, 108, 136, 102, NA, 1…
## $ BPDia1 <int> 88, 88, 88, NA, 82, 50, 46, 62, 62, 62, 64, 76, 86, 66, NA, 66, 56, 80, 9…
## $ BPSys2 <int> 114, 114, 114, NA, 108, 84, 108, 118, 118, 118, 108, 104, 132, 102, 134, …
## $ BPDia2 <int> 88, 88, 88, NA, 74, 50, 36, 68, 68, 68, 62, 72, 88, 66, 82, 74, 64, 74, 9…
## $ BPSys3 <int> 112, 112, 112, NA, 116, 88, 106, 118, 118, 118, 114, 104, 136, 106, 120, …
## $ BPDia3 <int> 82, 82, 82, NA, 76, 44, 38, 60, 60, 60, 64, 76, 82, 70, 84, 62, 62, NA, 1…
## $ Testosterone <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ DirectChol <dbl> 1.29, 1.29, 1.29, NA, 1.16, 1.34, 1.55, 2.12, 2.12, 2.12, 0.67, 0.96, 1.1…
## $ TotChol <dbl> 3.49, 3.49, 3.49, NA, 6.70, 4.86, 4.09, 5.82, 5.82, 5.82, 4.99, 4.24, 6.4…
## $ UrineVol1 <int> 352, 352, 352, NA, 77, 123, 238, 106, 106, 106, 113, 163, 215, 7, 29, 64,…
## $ UrineFlow1 <dbl> NA, NA, NA, NA, 0.094, 1.538, 1.322, 1.116, 1.116, 1.116, 0.489, NA, 0.90…
## $ UrineVol2 <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ UrineFlow2 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Diabetes <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, No, No, No, No, N…
## $ DiabetesAge <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ HealthGen <fct> Good, Good, Good, NA, Good, NA, NA, Vgood, Vgood, Vgood, Vgood, Vgood, Fa…
## $ DaysPhysHlthBad <int> 0, 0, 0, NA, 0, NA, NA, 0, 0, 0, 10, 0, 4, NA, NA, 0, NA, 3, 7, 0, 3, 3, …
## $ DaysMentHlthBad <int> 15, 15, 15, NA, 10, NA, NA, 3, 3, 3, 0, 0, 0, NA, NA, 0, NA, 7, 0, 20, 0,…
## $ LittleInterest <fct> Most, Most, Most, NA, Several, NA, NA, None, None, None, None, None, None…
## $ Depressed <fct> Several, Several, Several, NA, Several, NA, NA, None, None, None, None, N…
## $ nPregnancies <int> NA, NA, NA, NA, 2, NA, NA, 1, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ nBabies <int> NA, NA, NA, NA, 2, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ Age1stBaby <int> NA, NA, NA, NA, 27, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ SleepHrsNight <int> 4, 4, 4, NA, 8, NA, NA, 8, 8, 8, 7, 5, 4, NA, 5, 7, NA, 6, 6, 6, 7, 7, 8,…
## $ SleepTrouble <fct> Yes, Yes, Yes, NA, Yes, NA, NA, No, No, No, No, No, Yes, NA, No, No, NA, …
## $ PhysActive <fct> No, No, No, NA, No, NA, NA, Yes, Yes, Yes, Yes, Yes, Yes, NA, Yes, Yes, N…
## $ PhysActiveDays <int> NA, NA, NA, NA, NA, NA, NA, 5, 5, 5, 7, 5, 1, NA, 2, 7, NA, NA, NA, 3, 7,…
## $ TVHrsDay <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ CompHrsDay <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ TVHrsDayChild <int> NA, NA, NA, 4, NA, 5, 1, NA, NA, NA, NA, NA, NA, 4, NA, NA, 0, NA, NA, NA…
## $ CompHrsDayChild <int> NA, NA, NA, 1, NA, 0, 6, NA, NA, NA, NA, NA, NA, 3, NA, NA, 1, NA, NA, NA…
## $ Alcohol12PlusYr <fct> Yes, Yes, Yes, NA, Yes, NA, NA, Yes, Yes, Yes, Yes, Yes, Yes, NA, NA, No,…
## $ AlcoholDay <int> NA, NA, NA, NA, 2, NA, NA, 3, 3, 3, 1, 2, 6, NA, NA, NA, NA, 3, 6, NA, 1,…
## $ AlcoholYear <int> 0, 0, 0, NA, 20, NA, NA, 52, 52, 52, 100, 104, 364, NA, NA, 0, NA, 104, 3…
## $ SmokeNow <fct> No, No, No, NA, Yes, NA, NA, NA, NA, NA, No, NA, NA, NA, Yes, NA, NA, No,…
## $ Smoke100 <fct> Yes, Yes, Yes, NA, Yes, NA, NA, No, No, No, Yes, No, No, NA, Yes, No, NA,…
## $ Smoke100n <fct> Smoker, Smoker, Smoker, NA, Smoker, NA, NA, Non-Smoker, Non-Smoker, Non-S…
## $ SmokeAge <int> 18, 18, 18, NA, 38, NA, NA, NA, NA, NA, 13, NA, NA, NA, 17, NA, NA, NA, 1…
## $ Marijuana <fct> Yes, Yes, Yes, NA, Yes, NA, NA, Yes, Yes, Yes, NA, Yes, Yes, NA, NA, No, …
## $ AgeFirstMarij <int> 17, 17, 17, NA, 18, NA, NA, 13, 13, 13, NA, 19, 15, NA, NA, NA, NA, NA, N…
## $ RegularMarij <fct> No, No, No, NA, No, NA, NA, No, No, No, NA, Yes, Yes, NA, NA, No, NA, No,…
## $ AgeRegMarij <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 20, 15, NA, NA, NA, NA, NA, N…
## $ HardDrugs <fct> Yes, Yes, Yes, NA, Yes, NA, NA, No, No, No, No, Yes, Yes, NA, NA, No, NA,…
## $ SexEver <fct> Yes, Yes, Yes, NA, Yes, NA, NA, Yes, Yes, Yes, Yes, Yes, Yes, NA, NA, Yes…
## $ SexAge <int> 16, 16, 16, NA, 12, NA, NA, 13, 13, 13, 17, 22, 12, NA, NA, NA, NA, 27, 2…
## $ SexNumPartnLife <int> 8, 8, 8, NA, 10, NA, NA, 20, 20, 20, 15, 7, 100, NA, NA, 9, NA, 1, 1, NA,…
## $ SexNumPartYear <int> 1, 1, 1, NA, 1, NA, NA, 0, 0, 0, NA, 1, 1, NA, NA, 1, NA, 1, NA, NA, 1, 1…
## $ SameSex <fct> No, No, No, NA, Yes, NA, NA, Yes, Yes, Yes, No, No, No, NA, NA, No, NA, N…
## $ SexOrientation <fct> Heterosexual, Heterosexual, Heterosexual, NA, Heterosexual, NA, NA, Bisex…
## $ PregnantNow <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# Summarise the eight analysis variables in the raw data. The NA counts in
# the output show how many values are missing for each variable.
data %>%
  select(
    BMI, MaritalStatus, Depressed, HHIncomeMid,
    Gender, Age, SmokeNow, PhysActive
  ) %>%
  summary()
## BMI MaritalStatus Depressed HHIncomeMid Gender Age
## Min. :12.88 Divorced : 707 None :5246 Min. : 2500 female:5020 Min. : 0.00
## 1st Qu.:21.58 LivePartner : 560 Several:1009 1st Qu.: 30000 male :4980 1st Qu.:17.00
## Median :25.98 Married :3945 Most : 418 Median : 50000 Median :36.00
## Mean :26.66 NeverMarried:1380 NA's :3327 Mean : 57206 Mean :36.74
## 3rd Qu.:30.89 Separated : 183 3rd Qu.: 87500 3rd Qu.:54.00
## Max. :81.25 Widowed : 456 Max. :100000 Max. :80.00
## NA's :366 NA's :2769 NA's :811
## SmokeNow PhysActive
## No :1745 No :3677
## Yes :1466 Yes :4649
## NA's:6789 NA's:1674
##
##
##
##
# Keep only the rows that have no missing value in any of the listed
# variables. Poverty is part of this completeness filter even though it is
# not shown in the summary below.
data_clean <- drop_na(
  data,
  BMI, MaritalStatus, Depressed, Poverty, Gender,
  Age, SmokeNow, PhysActive, HHIncomeMid
)

# Re-run the same summary on the cleaned data to confirm the NAs are gone.
data_clean %>%
  select(
    BMI, MaritalStatus, Depressed, HHIncomeMid,
    Gender, Age, SmokeNow, PhysActive
  ) %>%
  summary()
## BMI MaritalStatus Depressed HHIncomeMid Gender Age
## Min. :15.02 Divorced : 325 None :1965 Min. : 2500 female:1127 Min. :20.00
## 1st Qu.:24.07 LivePartner : 305 Several: 473 1st Qu.: 30000 male :1503 1st Qu.:35.00
## Median :27.71 Married :1345 Most : 192 Median : 50000 Median :49.00
## Mean :28.64 NeverMarried: 440 Mean : 54744 Mean :49.07
## 3rd Qu.:32.00 Separated : 56 3rd Qu.: 87500 3rd Qu.:62.00
## Max. :67.83 Widowed : 159 Max. :100000 Max. :80.00
## SmokeNow PhysActive
## No :1445 No :1355
## Yes:1185 Yes:1275
##
##
##
##
Advantage: Quick to produce with a single function call.
Limitation: Hard to customize further or overlay additional elements.
# Base R histogram: distribution of BMI in the cleaned data.
hist(
  data_clean$BMI,
  col = "lightblue",
  border = "black",
  main = "Histogram of BMI",
  xlab = "BMI"
)

# Base R boxplot: the formula BMI ~ Gender draws one box per Gender level.
boxplot(
  BMI ~ Gender,
  data = data_clean,
  xlab = "Gender",
  ylab = "BMI",
  main = "Boxplot of BMI by Gender"
)

# Base R scatter plot: BMI (y) against Age (x), drawn with solid blue points.
plot(
  BMI ~ Age,
  data = data_clean,
  pch = 16,
  col = "blue",
  xlab = "Age",
  ylab = "BMI",
  main = "Scatter Plot of BMI vs Age"
)
Layering: ggplot2 allows adding multiple layers (points, lines, etc.).
Faceting: Easily split data across categories.
Customization: Control over themes, labels, and scales.
Aesthetics: Consistent look and feel with better visualization options.
# Minimal ggplot2 scatter plot: Age on the x-axis, BMI on the y-axis.
data_clean %>%
  ggplot(mapping = aes(x = Age, y = BMI)) +
  geom_point()
# Scatter plot with points coloured by Gender, plus an explicit title and
# axis labels.
data_clean %>%
  ggplot(aes(x = Age, y = BMI, colour = Gender)) +
  geom_point() +
  ggtitle("Scatter Plot of BMI vs Age by Gender") +
  xlab("Age") +
  ylab("BMI")
# Gender-coloured scatter plot with default labels; the aesthetic mapping is
# supplied on the point layer, which is the only layer in this plot.
ggplot(data = data_clean) +
  geom_point(mapping = aes(x = Age, y = BMI, colour = Gender))
geom_smooth() adds a fitted trend line that helps visualize trends in the data.
Alpha transparency avoids overplotting by making points semi-transparent.
# Scatter plot with a smooth trend line
# - alpha = 0.6 makes points semi-transparent to reduce overplotting.
# - colour is mapped to Gender in the plot-level aes(), so geom_smooth()
#   fits one LOESS curve per gender.
# - se = TRUE draws the confidence band around each curve. It is spelled
#   out as TRUE because the shorthand `T` is an ordinary variable that can
#   be reassigned.
ggplot(data_clean, aes(x = Age, y = BMI, color = Gender)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = "BMI vs Age with Trend Line", x = "Age", y = "BMI") +
  theme_classic()
Locally estimated scatterplot smoothing (loess)
Locally estimated scatterplot smoothing, or LOESS, is a nonparametric method for smoothing a series of data in which no assumptions are made about the underlying structure of the data.
# One panel per marital status: BMI against Age, coloured by Gender, with a
# straight-line (lm) fit per gender and no confidence band.
data_clean %>%
  ggplot(aes(x = Age, y = BMI, colour = Gender)) +
  geom_point(alpha = 0.6) +
  geom_smooth(se = FALSE, method = "lm") +
  facet_wrap(vars(MaritalStatus)) +
  ggtitle("BMI vs Age by Marital Status") +
  xlab("Age") +
  ylab("BMI") +
  theme_classic()
# Bar chart of the number of rows per gender; geom_bar() does the counting.
data_clean %>%
  ggplot(aes(x = Gender)) +
  geom_bar(fill = "blue") +
  ggtitle("Gender Distribution") +
  xlab("Gender") +
  ylab("Count") +
  theme_bw()
geom_bar() automatically counts the observations in each category of a categorical variable.
# Box plots of BMI for each level of Depressed, with side-by-side boxes
# filled by Gender.
data_clean %>%
  ggplot(aes(x = Depressed, y = BMI, fill = Gender)) +
  geom_boxplot() +
  ggtitle("BMI by Depression Levels") +
  xlab("Depression Level") +
  ylab("BMI") +
  theme_classic()
Create a histogram of Age using
ggplot2.
Customize it by changing the binwidth and color.
Create a scatter plot of BMI vs
HHIncomeMid.
Color the points by Gender and use alpha transparency to avoid overplotting.
# BMI against household income midpoint, coloured by Gender.
# geom_jitter() adds a small random offset to each point so that points
# sharing the same income midpoint do not sit exactly on top of each other.
data_clean %>%
  ggplot(aes(x = HHIncomeMid, y = BMI, colour = Gender)) +
  geom_jitter(alpha = 0.6) +
  ggtitle("Scatter Plot of BMI vs Household Income Midpoint") +
  xlab("Household Income Midpoint") +
  ylab("BMI")
Using the scatter plot from Exercise 2, add a trend line with geom_smooth().
Use method = "lm" to fit a linear trend, and set se = FALSE to disable the confidence interval.
# Scatter plot with a linear trend line
# - method = "lm" fits a straight line, which is what the exercise asks for
#   and what the plot title says.
# - se = FALSE turns off the confidence band.
# - colour is mapped to Gender in the plot-level aes(), so one line is
#   fitted per gender.
ggplot(data_clean, aes(x = HHIncomeMid, y = BMI, color = Gender)) +
  geom_jitter(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "BMI vs Income with Linear Trend",
       x = "Household Income Midpoint", y = "BMI")
Create a scatter plot of Age vs BMI and facet it
by Smoking Status (SmokeNow).
Use different colors for Gender and add a smooth trend line.
# One panel per current-smoking status (SmokeNow): BMI against Age, coloured
# by Gender, with a LOESS curve per gender and no confidence band.
data_clean %>%
  ggplot(aes(x = Age, y = BMI, colour = Gender)) +
  geom_point(alpha = 0.6) +
  geom_smooth(se = FALSE, method = "loess") +
  facet_wrap(vars(SmokeNow)) +
  ggtitle("BMI vs Age by Smoking Status") +
  xlab("Age") +
  ylab("BMI")