pollutantmean <- function(directory, pollutant, id = 1:332) {
  ## Mean of one pollutant pooled across a set of monitor CSV files.
  ##
  ## 'directory' is a character vector of length 1 indicating
  ## the location of the CSV files
  ## 'pollutant' is a character vector of length 1 indicating
  ## the name of the pollutant for which we will calculate the
  ## mean; either "sulfate" or "nitrate".
  ## 'id' is an integer vector indicating the monitor ID numbers
  ## to be used
  ## Return the mean of the pollutant across all monitors listed
  ## in the 'id' vector (ignoring NA values)

  ## Monitor files are named by zero-padded ID ("001.csv", "002.csv", ...).
  ## file.path() supplies the platform's separator instead of a
  ## hard-coded backslash, so the lookup also works outside Windows.
  paths <- file.path(directory, sprintf("%03d.csv", id))

  ## Read the requested column from every monitor file.
  readings <- lapply(paths, function(path) {
    read.csv(path)[[pollutant]]
  })

  ## Pool all readings so the mean is taken over observations, not over
  ## per-monitor means. Seeding with numeric() keeps the result numeric
  ## (NaN rather than a warning) when no values were collected.
  pooled <- c(numeric(), unlist(readings, use.names = FALSE))
  mean(pooled, na.rm = TRUE)
}
complete <- function(directory, id = 1:332) {
  ## Count the fully observed rows in each requested monitor file.
  ##
  ## 'directory' is a character vector of length 1 indicating
  ## the location of the CSV files
  ## 'id' is an integer vector indicating the monitor ID numbers
  ## to be used
  ## Return a data frame of the form:
  ## id nobs
  ## 1 117
  ## 2 1041
  ## ...
  ## where 'id' is the monitor ID number and 'nobs' is the
  ## number of complete cases

  ## Monitor files are named by zero-padded ID ("001.csv", "002.csv", ...).
  ## file.path() supplies the platform's separator instead of a
  ## hard-coded backslash, so the lookup also works outside Windows.
  paths <- file.path(directory, sprintf("%03d.csv", id))

  ## A complete case is a row with no NA in any column; na.omit() drops
  ## the others, so the remaining row count is the number of complete cases.
  ## FUN.VALUE = numeric(1) keeps 'nobs' a double column.
  nobs <- vapply(
    paths,
    function(path) nrow(na.omit(read.csv(path))),
    numeric(1),
    USE.NAMES = FALSE
  )

  ## Rows follow the order of 'id' as supplied by the caller.
  data.frame(id = as.numeric(id), nobs = nobs)
}
corr <- function(directory, threshold = 0) {
## 'directory' is a character vector of length 1 indicating
## the location of the CSV files
## 'threshold' is a numeric vector of length 1 indicating the
## number of completely observed observations (on all
## variables) required to compute the correlation between
## nitrate and sulfate; the default is 0
## Return a numeric vector of correlations
corVec <- numeric()
for(i in 1:332)
dataPath <- sprintf("%s\\%03d.csv", directory, i)
data <- read.csv(dataPath)
completeData <- na.omit(data)
if( nrow(completeData) >= threshold &&
nrow(completeData) != 0)
corVec <- c(corVec, cor(x = completeData$sulfate, y = completeData$nitrate))