Interactive mode: click a code block or Show Plot button to reveal/hide its corresponding plot.

Practice 1

library(dplyr)
library(readr)
library(haven)
data <- haven::read_dta("nyc_schools.dta")
# Source: New York City Department of Education records, assembled by Nathan Favero

Renaming Variables

head(data)
# Rename a variable
data <- data %>% rename(school = schoolname)

# See if changes made
head(data)

Summary Statistics for Subsets

Method 1

summary(data$overallscore[data$schooltype == "Elementary"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   12.70   44.83   53.80   54.54   64.70  102.20       8
summary(data$overallscore[data$schooltype == "Middle"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   13.40   45.90   55.30   55.94   65.47  102.20      25
summary(data$overallscore[data$schooltype == "K-8"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   26.20   44.22   53.95   54.98   64.27   91.90       1
#Guess what will following codes generate

data$schooltype == "Elementary"
##    [1]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
##   [16]  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
##   [31]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE
##   [46] FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE
##   [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE
##   [76]  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
##   [91] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE
##  [106] FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE
##  [121] FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
##  [136]  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [151] FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
##  [166]  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
##  [181]  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
##  [196] FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE
##  [211] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE
##  [226]  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
##  [241] FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [256] FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [271]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE
##  [286] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE
##  [301] FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE
##  [316]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE
##  [331]  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE
##  [346]  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE
##  [361]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [376]  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE
##  [391] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [406]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE
##  [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
##  [436]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE
##  [451]  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [466] FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE
##  [481] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
##  [496]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
##  [511]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
##  [526] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
##  [541] FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE
##  [556] FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE
##  [571] FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE
##  [586] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE
##  [601]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE
##  [616]  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
##  [631] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE
##  [646]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE
##  [661] FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE
##  [676]  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
##  [691]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
##  [706] FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
##  [721] FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE
##  [736] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [751] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
##  [766]  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE
##  [781] FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
##  [796]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
##  [811] FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [826]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE
##  [841]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE
##  [856] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
##  [871]  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE
##  [886] FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
##  [901]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [916] FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
##  [931]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE
##  [946]  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [961] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
##  [976]  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [991]  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
## [1006] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE
## [1021]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [1036] FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
## [1051] FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE
## [1066] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [1081] FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE
## [1096] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE
## [1111]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE
## [1126] FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE
## [1141] FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [1156]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE

Breaking Down the Command:

  1. data$overallscore: This selects the overallscore column from the data dataframe.

  2. data$schooltype == "Elementary": This creates a logical vector (TRUE/FALSE) that is TRUE for rows where the schooltype column equals “Elementary” and FALSE otherwise.

  3. data$overallscore[data$schooltype == "Elementary"]: This uses the logical vector to subset overallscore, selecting only those values where schooltype is “Elementary.”

  4. summary(): The summary() function then computes summary statistics (such as the minimum, 1st quartile, median, mean, 3rd quartile, and maximum) for the selected subset of overallscore.

Method 2

data%>%
  filter(schooltype=="Elementary")%>%
  select(overallscore)%>%
  summary()
##   overallscore   
##  Min.   : 12.70  
##  1st Qu.: 44.83  
##  Median : 53.80  
##  Mean   : 54.54  
##  3rd Qu.: 64.70  
##  Max.   :102.20  
##  NA's   :8
data%>%
  filter(schooltype=="Middle")%>%
  .$overallscore%>% # see the break-downs below
  summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   13.40   45.90   55.30   55.94   65.47  102.20      25

Breaking Down the Command:

  1. data %>%: Starts a pipeline where the data dataframe is passed into the next function.

  2. filter(schooltype == "Middle"): Filters the data dataframe to include only rows where the schooltypecolumn is equal to “Middle.” The result is a smaller dataframe containing only middle schools.

  3. .$overallscore: Extracts the overallscore column from the filtered dataframe. The . refers to the dataframe that results from the previous step in the pipeline.

  4. summary(): Applies the summary() function to the extracted overallscore column, generating summary statistics such as the minimum, 1st quartile, median, mean, 3rd quartile, and maximum for middle schools.

Rescaling Variables

head(data)
# Rescale the overall score to range from 0 to 1
data <- data %>% 
  mutate(overallscore = overallscore / 100)

summary(data$overallscore)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.1270  0.4497  0.5415  0.5505  0.6490  1.0220      34

Creating Dummy Variables for Grades

table(data$overallgrade)
## 
##       A   B   C   D   F 
##  34 287 399 348  76  22
# Create dummy variables for letter grades
data <- data %>% mutate(
  gradeA = ifelse(overallgrade == "A", 1, ifelse(overallgrade == "", NA, 0)),
  gradeB = ifelse(overallgrade == "B", 1, ifelse(overallgrade == "", NA, 0)),
  gradeC = ifelse(overallgrade == "C", 1, ifelse(overallgrade == "", NA, 0)),
  gradeD = ifelse(overallgrade == "D", 1, ifelse(overallgrade == "", NA, 0)),
  gradeF = ifelse(overallgrade == "F", 1, ifelse(overallgrade == "", NA, 0)),
  grade_NA = ifelse(overallgrade == "", 1, 0)
)

head(data)
table(data$gradeA,data$overallgrade)
##    
##           A   B   C   D   F
##   0   0   0 399 348  76  22
##   1   0 287   0   0   0   0

Breaking Down the Command:

  • gradeA = ifelse(overallgrade == "A", 1, ifelse(overallgrade == "", NA, 0)):

    • ifelse(overallgrade == "A", 1, ifelse(overallgrade == "", NA, 0)):

      • Checks if overallgrade is “A”. If true, assigns 1 to gradeA.

      • If overallgrade is missing (empty string), assigns NA to gradeA.

      • If neither condition is true, assigns 0 to gradeA.

    • This pattern is repeated for gradeBgradeCgradeD, and gradeF, where the check is for “B”, “C”, “D”, and “F”, respectively.

  • grade_missing = ifelse(overallgrade == "", 1, 0):

    • This creates a binary indicator that assigns 1 if overallgrade is missing (empty string) and 0 otherwise.

Creating an Index Variable

Now, let’s create an index of the progress grade and the performance grade. We first convert the grades to numeric variables.

We assign a score of 4 to schools with an A, 3 for a B, etc.

# Creating an index of progress grade and performance grade
data <- data %>% mutate(
  progress = case_when(
    progressgrade == "A" ~ 4,
    progressgrade == "B" ~ 3,
    progressgrade == "C" ~ 2,
    progressgrade == "D" ~ 1,
    progressgrade == "F" ~ 0,
    TRUE ~ NA_real_
  ),
  performance = case_when(
    performancegrade == "A" ~ 4,
    performancegrade == "B" ~ 3,
    performancegrade == "C" ~ 2,
    performancegrade == "D" ~ 1,
    performancegrade == "F" ~ 0,
    TRUE ~ NA_real_
  ),
  index = (progress + performance)/2
)

head(data)
# Note: NA + 10 = ?

table(data$progressgrade,data$progress)
##    
##       0   1   2   3   4
##       0   0   0   0   0
##   A   0   0   0   0 185
##   B   0   0   0 316   0
##   C   0   0 367   0   0
##   D   0 162   0   0   0
##   F 102   0   0   0   0
table(data$performancegrade,data$performance)
##    
##       0   1   2   3   4
##       0   0   0   0   0
##   A   0   0   0   0 439
##   B   0   0   0 302   0
##   C   0   0 247   0   0
##   D   0  95   0   0   0
##   F  49   0   0   0   0

NA and NA_real_

NA

  • Type: Generic missing value.

  • Behavior: When used in expressions, NA can adapt to the expected type of the output (integer, numeric, character, etc.). For example, if you’re working with a numeric vector and use NA, it will automatically be treated as NA_real_.

  • Usage: NA is flexible and can be used in various contexts, including vectors of different types.

NA_real_

  • Type: Specifically a missing value of type double (real numbers).

  • Behavior: Explicitly indicates that the missing value is numeric and of type double. This is important when you need to ensure that the data type remains consistent, especially in functions like mutate() where type consistency is crucial.

  • Usage: Typically used in numeric calculations or when creating variables where the type must be explicitly double.

Why Use NA_real_?

  • Type Consistency: By explicitly using NA_real_, you ensure that the elementary variable is always treated as a numeric vector. If you used NA instead, R would still work correctly in this case, but using NA_real_ makes the intent clear and avoids potential issues if the type needs to be consistent, especially in more complex operations.

  • Prevents Implicit Type Conversion: If the context changes or if additional types are introduced, NA_real_ ensures that R does not implicitly convert the vector to another type, which might happen with NA.

Handling String Variables

Convert Percentage Variables to Numeric

# We want to know how many black or hispanic students those schools have
summary(data$blackhispanic)
##    Length     Class      Mode 
##      1166 character character
#We need to convert this variable to just numbers.
# Convert blackhispanic and ell variables to numeric

data <- data %>% mutate(
  blackhispanic = as.numeric(gsub("%", "", blackhispanic))
)

data <- data %>% mutate(
  blackhispanic = gsub("%", "", blackhispanic),
  blackhispanic = as.numeric(blackhispanic)
)

summary(data$blackhispanic)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.60   52.52   92.35   74.99   97.40  100.00

Breaking Down the Command:

mutate(blackhispanic = as.numeric(gsub("%", "", blackhispanic))):

  • gsub("%", "", blackhispanic): This function removes the percentage signs (%) from the blackhispanicvariable. The gsub function replaces each occurrence of % with an empty string ("").

  • as.numeric(...): Converts the cleaned blackhispanic values (now just numbers in string form) into numeric data.

  • mutate(...): Creates a new version of the blackhispanic variable within the data dataframe, replacing the original values with the cleaned numeric values.

Extracting Substrings

The variable dbn contains the district, borough, and school number. The first 2 digits are the district number. The third digit is the borough. And the fourth through sixth digits are the school number.

# Extract district, borough, and school number from the dbn variable
data <- data %>% mutate(
  distnum = substr(dbn, 1, 2),
  borough = substr(dbn, 3, 3),
  schoolnum = substr(dbn, 4, 6)
)

substr("abcdef", 2, 5)
## [1] "bcde"

Save the data

# Save the cleaned data
saveRDS(data, "nyc_schools_cleaned.RDS")