Skip to content

Commit 290134e

Browse files
committed
Revised section on looping through files
1 parent 1b8e16b commit 290134e

File tree

2 files changed

+87
-76
lines changed

2 files changed

+87
-76
lines changed

README.md

Lines changed: 53 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
---
2+
author: Derek Devnich
3+
title: Programming in R
4+
---
5+
16
- [<span class="toc-section-number">1</span> Fundamentals (Week 1)](#fundamentals-week-1)
27
- [<span class="toc-section-number">1.1</span> Introduction to RStudio](#introduction-to-rstudio)
38
- [<span class="toc-section-number">1.2</span> Introduction to R](#introduction-to-r)
@@ -859,20 +864,6 @@ See /scripts/curriculum.Rmd
859864
}
860865
```
861866

862-
5. Vectorize your tests
863-
864-
``` r
865-
x <- 1:4
866-
867-
if (any(x < 2)) {
868-
print("Some x less than 2")
869-
}
870-
871-
if (all(x < 2)){
872-
print("All x less than 2")
873-
}
874-
```
875-
876867
### Review Subsetting section
877868

878869
Subsetting is frequently an alternative to if-else statements in R
@@ -936,7 +927,21 @@ x + z
936927
all(a)
937928
```
938929
939-
2. Can you detect missing data?
930+
2. Vectorize your tests
931+
932+
``` r
933+
x <- 1:4
934+
935+
if (any(x < 2)) {
936+
print("Some x less than 2")
937+
}
938+
939+
if (all(x < 2)){
940+
print("All x less than 2")
941+
}
942+
```
943+
944+
3. Can you detect missing data?
940945
941946
``` r
942947
nan_vec <- c(1, 3, NaN)
@@ -1222,77 +1227,63 @@ See data/curriculum.Rmd
12221227
12231228
## Reading and writing data
12241229
1225-
### Create sample data sets and write them to the \`processed\` directory
1230+
### How to find files
12261231
1227-
1. Preliminaries
1232+
Get all regional files
12281233
1229-
``` r
1230-
if (!dir.exists("../processed")) {
1231-
dir.create("../processed")
1232-
}
1234+
``` r
1235+
## Get matching files from the current working directory
1236+
dir(path = ".", pattern = "gapminder_gdp.*.csv")
1237+
```
12331238
1234-
north_america <- c("Canada", "Mexico", "United States")
1235-
```
1239+
### Read files using a for loop
12361240
1237-
2. Version 1: Use `calcGDP` function
1241+
1. Read each file into a data frame and add it to a list
12381242
12391243
``` r
1240-
for (year in unique(gapminder$year)) {
1241-
df <- calcGDP(gapminder, year = year, country = north_america)
1244+
## Create an empty list
1245+
df_list <- list()
12421246
1243-
## Generate a file name. This will fail if "processed" doesn't exist
1244-
fname <- paste("../processed/north_america_", as.character(year), ".csv", sep = "")
1247+
# Get list of files to read
1248+
file_names <- dir(path = "../data", pattern = "gapminder_gdp.*.csv")
12451249
1246-
## Write the file
1247-
write.csv(x = df, file = fname, row.names = FALSE)
1250+
# Read files into data frames
1251+
for (f in file_names){
1252+
# Build the full path with file.path(); stringsAsFactors is an argument to read.csv()
df_list[[f]] <- read.csv(file = file.path("../data", f), stringsAsFactors = TRUE)
12481253
}
12491254
```
12501255
1251-
3. (Optional) Version 2: Bypass `calcGDP` function
1256+
2. Check our data for compatibility
12521257
12531258
``` r
1254-
for (year in unique(gapminder$year)) {
1255-
df <- gapminder[gapminder$year == year, ]
1256-
df <- df[df$country %in% north_america, ]
1257-
fname <- paste("processed/north_america_", as.character(year), ".csv", sep="")
1258-
write.csv(x = df, file = fname, row.names = FALSE)
1259+
# Check data frame dimensions
1260+
for (name in names(df_list)) {
1261+
print(name)
1262+
print(dim(df_list[[name]]))
12591263
}
1260-
```
1261-
1262-
### How to find files
1263-
1264-
``` r
1265-
## Get matching files from the `processed` subdirectory
1266-
dir(path = "../processed", pattern = "north_america_[1-9]*.csv")
1267-
```
12681264
1269-
### Read files using a for loop
1265+
# What's going on with the Americas?
1266+
for (name in names(df_list)) {
1267+
print(name)
1268+
print(dim(df_list[[name]]))
1269+
print(colnames(df_list[[name]]))
1270+
}
1271+
```
12701272

1271-
1. Read each file into a data frame and add it to a list
1273+
3. Drop the continent column for Americas
12721274

12731275
``` r
1274-
## Create an empty list
1275-
df_list <- list()
1276-
1277-
## Get the locations of the matching files
1278-
file_names <- dir(path = "../processed", pattern = "north_america_[1-9]*.csv")
1279-
file_paths <- file.path("../processed", file_names)
1280-
1281-
for (f in file_paths){
1282-
df_list[[f]] <- read.csv(f, stringsAsFactors = TRUE)
1283-
}
1276+
americas <- df_list[["gapminder_gdp_americas.csv"]]
1277+
df_list[["gapminder_gdp_americas.csv"]] <- americas[, ! colnames(americas) %in% c("continent")]
12841278
```
12851279

1286-
2. Access the list items to view the individual data frames
1280+
4. Concatenate data frames
12871281

12881282
``` r
1289-
length(df_list)
1290-
names(df_list)
1291-
lapply(df_list, length)
1292-
df_list[["north_america_1952.csv"]]
1283+
df <- do.call(rbind, df_list)
12931284
```
12941285

1295-
### Read files using apply
1286+
### (Optional) Read files using apply
12961287

12971288
1. Instead of a for loop that handles each file individually, use a single vectorized function.
12981289

README.org

Lines changed: 34 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1098,7 +1098,7 @@ north_america <- c("Canada", "Mexico", "United States")
10981098
See data/curriculum.Rmd
10991099

11001100
** Reading and writing data
1101-
*** Create sample data sets and write them to the `processed` directory
1101+
*** COMMENT Create sample data sets and write them to the `processed` directory
11021102
1. Preliminaries
11031103
#+BEGIN_SRC R
11041104
if (!dir.exists("../processed")) {
@@ -1121,7 +1121,7 @@ See data/curriculum.Rmd
11211121
}
11221122
#+END_SRC
11231123

1124-
3. (Optional) Version 2: Bypass ~calcGDP~ function
1124+
3. Version 2: Bypass ~calcGDP~ function
11251125
#+BEGIN_SRC R
11261126
for (year in unique(gapminder$year)) {
11271127
df <- gapminder[gapminder$year == year, ]
@@ -1132,9 +1132,10 @@ See data/curriculum.Rmd
11321132
#+END_SRC
11331133

11341134
*** How to find files
1135+
Get all regional files
11351136
#+BEGIN_SRC R
11361137
## Get matching files from the current working directory
1137-
dir(path = "../processed", pattern = "north_america_[1-9]*.csv")
1138+
dir(path = ".", pattern = "gapminder_gdp.*.csv")
11381139
#+END_SRC
11391140

11401141
*** Read files using a for loop
@@ -1143,24 +1144,43 @@ dir(path = "../processed", pattern = "north_america_[1-9]*.csv")
11431144
## Create an empty list
11441145
df_list <- list()
11451146

1146-
## Get the locations of the matching files
1147-
file_names <- dir(path = "../processed", pattern = "north_america_[1-9]*.csv")
1148-
file_paths <- file.path("../processed", file_names)
1147+
# Get list of files to read
1148+
file_names <- dir(path = "../data", pattern = "gapminder_gdp.*.csv")
11491149

1150-
for (f in file_paths){
1151-
df_list[[f]] <- read.csv(f, stringsAsFactors = TRUE)
1150+
# Read files into data frames
1151+
for (f in file_names){
1152+
# Build the full path with file.path(); stringsAsFactors is an argument to read.csv()
df_list[[f]] <- read.csv(file = file.path("../data", f), stringsAsFactors = TRUE)
11521153
}
11531154
#+END_SRC
11541155

1155-
2. Access the list items to view the individual data frames
1156+
2. Check our data for compatibility
11561157
#+BEGIN_SRC R
1157-
length(df_list)
1158-
names(df_list)
1159-
lapply(df_list, length)
1160-
df_list[["north_america_1952.csv"]]
1158+
# Check data frame dimensions
1159+
for (name in names(df_list)) {
1160+
print(name)
1161+
print(dim(df_list[[name]]))
1162+
}
1163+
1164+
# What's going on with the Americas?
1165+
for (name in names(df_list)) {
1166+
print(name)
1167+
print(dim(df_list[[name]]))
1168+
print(colnames(df_list[[name]]))
1169+
}
1170+
#+END_SRC
1171+
1172+
3. Drop the continent column for Americas
1173+
#+BEGIN_SRC R
1174+
americas <- df_list[["gapminder_gdp_americas.csv"]]
1175+
df_list[["gapminder_gdp_americas.csv"]] <- americas[, ! colnames(americas) %in% c("continent")]
1176+
#+END_SRC
1177+
1178+
4. Concatenate data frames
1179+
#+BEGIN_SRC R
1180+
df <- do.call(rbind, df_list)
11611181
#+END_SRC
11621182

1163-
*** Read files using apply
1183+
*** (Optional) Read files using apply
11641184
1. Instead of a for loop that handles each file individually, use a single vectorized function.
11651185
#+BEGIN_SRC R
11661186
# file_names holds bare file names, so prepend the data directory before reading
df_list <- lapply(file.path("../data", file_names), read.csv, stringsAsFactors = TRUE)

0 commit comments

Comments
 (0)