Revised section on looping through files

devnich · devnich · commit 290134e0b71d · 2025-04-17T14:32:21.000-07:00
diff --git a/README.md b/README.md
@@ -1,3 +1,8 @@
+---
+author: Derek Devnich
+title: Programming in R
+---
+
 - [<span class="toc-section-number">1</span> Fundamentals (Week 1)](#fundamentals-week-1)
   - [<span class="toc-section-number">1.1</span> Introduction to RStudio](#introduction-to-rstudio)
   - [<span class="toc-section-number">1.2</span> Introduction to R](#introduction-to-r)
@@ -859,20 +864,6 @@ See /scripts/curriculum.Rmd
     }
     ```
 
-5.  Vectorize your tests
-
-    ``` r
-    x <- 1:4
-
-    if (any(x < 2)) {
-      print("Some x less than 2")
-    }
-
-    if (all(x < 2)){
-      print("All x less than 2")
-    }
-    ```
-
 ### Review Subsetting section
 
 Subsetting is frequently an alternative to if-else statements in R
@@ -936,7 +927,21 @@ x + z
     all(a)
     ```
 
-2.  Can you detect missing data?
+2.  Vectorize your tests
+
+    ``` r
+    x <- 1:4
+
+    if (any(x < 2)) {
+      print("Some x less than 2")
+    }
+
+    if (all(x < 2)){
+      print("All x less than 2")
+    }
+    ```
+
+3.  Can you detect missing data?
 
     ``` r
     nan_vec <- c(1, 3, NaN)
@@ -1222,77 +1227,63 @@ See data/curriculum.Rmd
 
 ## Reading and writing data
 
-### Create sample data sets and write them to the \`processed\` directory
+### How to find files
 
-1.  Preliminaries
+Get all regional files
 
-    ``` r
-    if (!dir.exists("../processed")) {
-      dir.create("../processed")
-    }
+``` r
+## Get matching files from the current working directory (path = ".")
+dir(path = ".", pattern = "gapminder_gdp.*.csv")
+```
 
-    north_america <- c("Canada", "Mexico", "United States")
-    ```
+### Read files using a for loop
 
-2.  Version 1: Use `calcGDP` function
+1.  Read each file into a data frame and add it to a list
 
     ``` r
-    for (year in unique(gapminder$year)) {
-      df <- calcGDP(gapminder, year = year, country = north_america)
+    ## Create an empty list
+    df_list <- list()
 
-      ## Generate a file name. This will fail if "processed" doesn't exist
-      fname <- paste("../processed/north_america_", as.character(year), ".csv", sep = "")
+    # Get list of files to read
+    file_names <- dir(path = "../data", pattern = "gapminder_gdp.*.csv")
 
-      ## Write the file
-      write.csv(x = df, file = fname, row.names = FALSE)
+    # Read each file into a data frame, stored in df_list under its file name
+    for (f in file_names){
+      df_list[[f]] <- read.csv(file = file.path("../data", f), stringsAsFactors = TRUE)
     }
     ```
 
-3.  (Optional) Version 2: Bypass `calcGDP` function
+2.  Check our data for compatibility
 
     ``` r
-    for (year in unique(gapminder$year)) {
-      df <- gapminder[gapminder$year == year, ]
-      df <- df[df$country %in% north_america, ]
-      fname <- paste("processed/north_america_", as.character(year), ".csv", sep="")
-      write.csv(x = df, file = fname, row.names = FALSE)
+    # Print each file name followed by its data frame's dimensions (rows, columns)
+    for (name in names(df_list)) {
+      print(name)
+      print(dim(df_list[[name]]))
     }
-    ```
-
-### How to find files
-
-``` r
-## Get matching files from the `processed` subdirectory
-dir(path = "../processed", pattern = "north_america_[1-9]*.csv")
-```
 
-### Read files using a for loop
+    # What's going on with the Americas? Also print each file's column names
+    for (name in names(df_list)) {
+      print(name)
+      print(dim(df_list[[name]]))
+      print(colnames(df_list[[name]]))
+    }
+    ```
 
-1.  Read each file into a data frame and add it to a list
+3.  Drop the continent column for Americas
 
     ``` r
-    ## Create an empty list
-    df_list <- list()
-
-    ## Get the locations of the matching files
-    file_names <- dir(path = "../processed", pattern = "north_america_[1-9]*.csv")
-    file_paths <- file.path("../processed", file_names)
-
-    for (f in file_paths){
-      df_list[[f]] <- read.csv(f, stringsAsFactors = TRUE)
-    }
+    americas <- df_list[["gapminder_gdp_americas.csv"]]  # logical mask below keeps every column except "continent"
+    df_list[["gapminder_gdp_americas.csv"]] <- americas[, ! colnames(americas) %in% c("continent")]
     ```
 
-2.  Access the list items to view the individual data frames
+4.  Concatenate data frames
 
     ``` r
-    length(df_list)
-    names(df_list)
-    lapply(df_list, length)
-    df_list[["north_america_1952.csv"]]
+    df <- do.call(rbind, df_list)  # call rbind() with every data frame in the list as an argument
     ```
 
-### Read files using apply
+### (Optional) Read files using apply
 
 1.  Instead of a for loop that handles each file individually, use a single vectorized function.
 
diff --git a/README.org b/README.org
@@ -1098,7 +1098,7 @@ north_america <- c("Canada", "Mexico", "United States")
 See data/curriculum.Rmd
 
 ** Reading and writing data
-*** Create sample data sets and write them to the `processed` directory
+*** COMMENT Create sample data sets and write them to the `processed` directory
 1. Preliminaries
    #+BEGIN_SRC R
    if (!dir.exists("../processed")) {
@@ -1121,7 +1121,7 @@ See data/curriculum.Rmd
    }
    #+END_SRC
 
-3. (Optional) Version 2: Bypass ~calcGDP~ function
+3. Version 2: Bypass ~calcGDP~ function
    #+BEGIN_SRC R
    for (year in unique(gapminder$year)) {
      df <- gapminder[gapminder$year == year, ]
@@ -1132,9 +1132,10 @@ See data/curriculum.Rmd
    #+END_SRC
 
 *** How to find files
+Get all regional files
 #+BEGIN_SRC R
 ## Get matching files from the `processed` subdirectory
-dir(path = "../processed", pattern = "north_america_[1-9]*.csv")
+dir(path = ".", pattern = "gapminder_gdp.*.csv")
 #+END_SRC
 
 *** Read files using a for loop
@@ -1143,24 +1144,43 @@ dir(path = "../processed", pattern = "north_america_[1-9]*.csv")
    ## Create an empty list
    df_list <- list()
 
-   ## Get the locations of the matching files
-   file_names <- dir(path = "../processed", pattern = "north_america_[1-9]*.csv")
-   file_paths <- file.path("../processed", file_names)
+   # Get list of files to read
+   file_names <- dir(path = "../data", pattern = "gapminder_gdp.*.csv")
 
-   for (f in file_paths){
-     df_list[[f]] <- read.csv(f, stringsAsFactors = TRUE)
+   # Read each file into a data frame, stored in df_list under its file name
+   for (f in file_names){
+     df_list[[f]] <- read.csv(file = file.path("../data", f), stringsAsFactors = TRUE)
    }
    #+END_SRC
 
-2. Access the list items to view the individual data frames
+2. Check our data for compatibility
    #+BEGIN_SRC R
-   length(df_list)
-   names(df_list)
-   lapply(df_list, length)
-   df_list[["north_america_1952.csv"]]
+   # Print each file name followed by its data frame's dimensions (rows, columns)
+   for (name in names(df_list)) {
+     print(name)
+     print(dim(df_list[[name]]))
+   }
+
+   # What's going on with the Americas? Also print each file's column names
+   for (name in names(df_list)) {
+     print(name)
+     print(dim(df_list[[name]]))
+     print(colnames(df_list[[name]]))
+   }
+   #+END_SRC
+
+3. Drop the continent column for Americas
+   #+BEGIN_SRC R
+   americas <- df_list[["gapminder_gdp_americas.csv"]]  # logical mask below keeps every column except "continent"
+   df_list[["gapminder_gdp_americas.csv"]] <- americas[, ! colnames(americas) %in% c("continent")]
+   #+END_SRC
+
+4. Concatenate data frames
+   #+BEGIN_SRC R
+   df <- do.call(rbind, df_list)  # call rbind() with every data frame in the list as an argument
    #+END_SRC
 
-*** Read files using apply
+*** (Optional) Read files using apply
 1. Instead of a for loop that handles each file individually, use a single vectorized function.
    #+BEGIN_SRC R
    df_list <- lapply(file_paths, read.csv, stringsAsFactors = TRUE)