Interactive mode: click a code block or Show Plot button to reveal/hide its corresponding plot.
library(dplyr)
library(readr)
library(haven)
data <- haven::read_dta("nyc_schools.dta")
# Source: New York City Department of Education records, assembled by Nathan Favero
head(data)
# Rename a variable
data <- data %>% rename(school = schoolname)
# See if changes made
head(data)
summary(data$overallscore[data$schooltype == "Elementary"])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 12.70 44.83 53.80 54.54 64.70 102.20 8
summary(data$overallscore[data$schooltype == "Middle"])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13.40 45.90 55.30 55.94 65.47 102.20 25
summary(data$overallscore[data$schooltype == "K-8"])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 26.20 44.22 53.95 54.98 64.27 91.90 1
#Guess what will following codes generate
data$schooltype == "Elementary"
## [1] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## [16] TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE TRUE
## [46] FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE TRUE
## [76] TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [91] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
## [106] FALSE FALSE TRUE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE
## [121] FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [136] TRUE FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [151] FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE
## [166] TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [181] TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE
## [196] FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## [211] FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
## [226] TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE
## [241] FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [256] FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [271] TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## [286] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE
## [301] FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE
## [316] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## [331] TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE FALSE
## [346] TRUE FALSE TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
## [361] TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [376] TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## [391] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
## [406] TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [436] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
## [451] TRUE TRUE FALSE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [466] FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE
## [481] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [496] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE
## [511] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [526] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [541] FALSE TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
## [556] FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE TRUE TRUE TRUE
## [571] FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE TRUE FALSE
## [586] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
## [601] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE
## [616] TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [631] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
## [646] TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
## [661] FALSE FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE FALSE
## [676] TRUE FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE
## [691] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
## [706] FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [721] FALSE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
## [736] FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [751] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
## [766] TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE
## [781] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [796] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## [811] FALSE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [826] TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
## [841] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
## [856] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [871] TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE
## [886] FALSE TRUE TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [901] TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [916] FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## [931] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE
## [946] TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [961] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [976] TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [991] TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE
## [1006] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE
## [1021] TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [1036] FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE
## [1051] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE
## [1066] FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [1081] FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE
## [1096] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## [1111] TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE
## [1126] FALSE FALSE FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE
## [1141] FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [1156] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
Breaking Down the Command:
data$overallscore: This selects
the overallscore column from
the data dataframe.
data$schooltype == "Elementary": This
creates a logical vector (TRUE/FALSE) that is TRUE for rows
where the schooltype column equals “Elementary”
and FALSE otherwise.
data$overallscore[data$schooltype == "Elementary"]: This
uses the logical vector to subset overallscore, selecting
only those values where schooltype is
“Elementary.”
summary(): The summary() function
then computes summary statistics (such as the minimum, 1st quartile,
median, mean, 3rd quartile, and maximum) for the selected subset
of overallscore.
data%>%
filter(schooltype=="Elementary")%>%
select(overallscore)%>%
summary()
## overallscore
## Min. : 12.70
## 1st Qu.: 44.83
## Median : 53.80
## Mean : 54.54
## 3rd Qu.: 64.70
## Max. :102.20
## NA's :8
data%>%
filter(schooltype=="Middle")%>%
.$overallscore%>% # see the break-downs below
summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13.40 45.90 55.30 55.94 65.47 102.20 25
Breaking Down the Command:
data %>%: Starts a pipeline
where the data dataframe is passed into the next
function.
filter(schooltype == "Middle"): Filters
the data dataframe to include only rows where
the schooltypecolumn is equal to “Middle.” The result is a
smaller dataframe containing only middle schools.
.$overallscore: Extracts
the overallscore column from the filtered dataframe.
The . refers to the dataframe that results from the
previous step in the pipeline.
summary(): Applies
the summary() function to the
extracted overallscore column, generating summary
statistics such as the minimum, 1st quartile, median, mean, 3rd
quartile, and maximum for middle schools.
head(data)
# Rescale the overall score to range from 0 to 1
data <- data %>%
mutate(overallscore = overallscore / 100)
summary(data$overallscore)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.1270 0.4497 0.5415 0.5505 0.6490 1.0220 34
table(data$overallgrade)
##
## A B C D F
## 34 287 399 348 76 22
# Create dummy variables for letter grades
data <- data %>% mutate(
gradeA = ifelse(overallgrade == "A", 1, ifelse(overallgrade == "", NA, 0)),
gradeB = ifelse(overallgrade == "B", 1, ifelse(overallgrade == "", NA, 0)),
gradeC = ifelse(overallgrade == "C", 1, ifelse(overallgrade == "", NA, 0)),
gradeD = ifelse(overallgrade == "D", 1, ifelse(overallgrade == "", NA, 0)),
gradeF = ifelse(overallgrade == "F", 1, ifelse(overallgrade == "", NA, 0)),
grade_NA = ifelse(overallgrade == "", 1, 0)
)
head(data)
table(data$gradeA,data$overallgrade)
##
## A B C D F
## 0 0 0 399 348 76 22
## 1 0 287 0 0 0 0
Breaking Down the Command:
gradeA = ifelse(overallgrade == "A", 1, ifelse(overallgrade == "", NA, 0)):
ifelse(overallgrade == "A", 1, ifelse(overallgrade == "", NA, 0)):
Checks if overallgrade is “A”. If true,
assigns 1 to gradeA.
If overallgrade is missing (empty string),
assigns NA to gradeA.
If neither condition is true,
assigns 0 to gradeA.
This pattern is repeated
for gradeB, gradeC, gradeD,
and gradeF, where the check is for “B”, “C”, “D”,
and “F”, respectively.
grade_missing = ifelse(overallgrade == "", 1, 0):
1 if overallgrade is missing (empty
string) and 0 otherwise.Now, let’s create an index of the progress grade and the performance grade. We first convert the grades to numeric variables.
We assign a score of 4 to schools with an A, 3 for a B, etc.
# Creating an index of progress grade and performance grade
data <- data %>% mutate(
progress = case_when(
progressgrade == "A" ~ 4,
progressgrade == "B" ~ 3,
progressgrade == "C" ~ 2,
progressgrade == "D" ~ 1,
progressgrade == "F" ~ 0,
TRUE ~ NA_real_
),
performance = case_when(
performancegrade == "A" ~ 4,
performancegrade == "B" ~ 3,
performancegrade == "C" ~ 2,
performancegrade == "D" ~ 1,
performancegrade == "F" ~ 0,
TRUE ~ NA_real_
),
index = (progress + performance)/2
)
head(data)
# Note: NA + 10 = ?
table(data$progressgrade,data$progress)
##
## 0 1 2 3 4
## 0 0 0 0 0
## A 0 0 0 0 185
## B 0 0 0 316 0
## C 0 0 367 0 0
## D 0 162 0 0 0
## F 102 0 0 0 0
table(data$performancegrade,data$performance)
##
## 0 1 2 3 4
## 0 0 0 0 0
## A 0 0 0 0 439
## B 0 0 0 302 0
## C 0 0 247 0 0
## D 0 95 0 0 0
## F 49 0 0 0 0
NA and NA_real_ |
|---|
|
Convert Percentage Variables to Numeric
# We want to know how many black or hispanic students those schools have
summary(data$blackhispanic)
## Length Class Mode
## 1166 character character
#We need to convert this variable to just numbers.
# Convert blackhispanic and ell variables to numeric
data <- data %>% mutate(
blackhispanic = as.numeric(gsub("%", "", blackhispanic))
)
data <- data %>% mutate(
blackhispanic = gsub("%", "", blackhispanic),
blackhispanic = as.numeric(blackhispanic)
)
summary(data$blackhispanic)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.60 52.52 92.35 74.99 97.40 100.00
Breaking Down the Command:
mutate(blackhispanic = as.numeric(gsub("%", "", blackhispanic))):
gsub("%", "", blackhispanic): This
function removes the percentage signs (%) from
the blackhispanicvariable. The gsub function
replaces each occurrence of % with an empty string
("").
as.numeric(...): Converts the
cleaned blackhispanic values (now just numbers in string
form) into numeric data.
mutate(...): Creates a new version
of the blackhispanic variable within
the data dataframe, replacing the original values with the
cleaned numeric values.
The variable dbn contains the district, borough, and
school number. The first 2 digits are the district number. The third
digit is the borough. And the fourth through sixth digits are the school
number.
# Extract district, borough, and school number from the dbn variable
data <- data %>% mutate(
distnum = substr(dbn, 1, 2),
borough = substr(dbn, 3, 3),
schoolnum = substr(dbn, 4, 6)
)
substr("abcdef", 2, 5)
## [1] "bcde"
# Save the cleaned data
saveRDS(data, "nyc_schools_cleaned.RDS")