一些使用R統計分散csv檔資料的函數－資料科學(Data Science)學習筆記

醒來寫了一下作業…

#計算指定資料表欄位的mean

pollutantmean <- function(directory, pollutant, id = 1:332) {
  ## Mean of one pollutant column pooled over a set of monitor CSV files.
  ##
  ## 'directory' is a character vector of length 1 indicating the
  ## location of the CSV files.
  ##
  ## 'pollutant' is a character vector of length 1 naming the column to
  ## average (e.g. "sulfate" or "nitrate").
  ##
  ## 'id' is an integer vector of monitor ID numbers; monitor i is read
  ## from the file "<directory>/<i zero-padded to 3 digits>.csv".
  ##
  ## Returns a single number: the mean of every non-NA value of
  ## 'pollutant' across all requested monitors (observations are pooled,
  ## it is not a mean of per-monitor means). NaN when there is no
  ## non-NA value at all, e.g. when 'id' is empty.

  ## file.path() uses a separator that works on every platform; a
  ## literal backslash only resolves on Windows.
  paths <- file.path(directory, sprintf("%03d.csv", id))

  ## Read the requested column from each file in one pass instead of
  ## growing a vector inside a loop.
  columns <- lapply(paths, function(path) read.csv(path)[[pollutant]])

  ## Seeding with numeric(0) keeps the result numeric (and the mean NaN
  ## rather than a warning) when nothing was read.
  pooled <- c(numeric(0), unlist(columns, use.names = FALSE))

  mean(pooled, na.rm = TRUE)
}

#計算指定資料表的完整資料數

complete <- function(directory, id = 1:332) {
  ## Count the completely observed rows in each requested monitor file.
  ##
  ## 'directory' is a character vector of length 1 indicating the
  ## location of the CSV files.
  ##
  ## 'id' is an integer vector of monitor ID numbers; monitor i is read
  ## from the file "<directory>/<i zero-padded to 3 digits>.csv".
  ##
  ## Returns a data frame with one row per element of 'id', in the same
  ## order as 'id':
  ##   id nobs
  ##   1  117
  ##   2  1041
  ##   ...
  ## where 'id' is the monitor ID number and 'nobs' is the number of
  ## rows of that file with no NA in any column.

  ## file.path() uses a separator that works on every platform; a
  ## literal backslash only resolves on Windows.
  paths <- file.path(directory, sprintf("%03d.csv", id))

  ## vapply() fixes the result type up front, so an empty 'id' still
  ## yields a numeric column instead of an empty list.
  counts <- vapply(
    paths,
    function(path) sum(complete.cases(read.csv(path))),
    numeric(1),
    USE.NAMES = FALSE
  )

  data.frame(id = as.numeric(id), nobs = counts)
}

#計算二個變數間的correlation

corr <- function(directory, threshold = 0, id = 1:332) {
  ## Sulfate/nitrate correlation for every monitor with enough complete
  ## observations.
  ##
  ## 'directory' is a character vector of length 1 indicating the
  ## location of the CSV files.
  ##
  ## 'threshold' is a numeric vector of length 1 indicating the number
  ## of completely observed observations (on all variables) required to
  ## compute the correlation between nitrate and sulfate; the default
  ## is 0.
  ##
  ## 'id' is an integer vector of monitor ID numbers to consider;
  ## monitor i is read from "<directory>/<i zero-padded to 3 digits>.csv".
  ## It defaults to 1:332, the range this function always used before
  ## the argument existed.
  ##
  ## Returns a numeric vector of correlations, one per qualifying
  ## monitor in the order of 'id'. A monitor qualifies when it has at
  ## least one complete row and at least 'threshold' complete rows. The
  ## result is numeric(0) when no monitor qualifies.

  ## file.path() uses a separator that works on every platform; a
  ## literal backslash only resolves on Windows.
  paths <- file.path(directory, sprintf("%03d.csv", id))

  per_monitor <- lapply(paths, function(path) {
    observed <- na.omit(read.csv(path))
    n_complete <- nrow(observed)

    if (n_complete >= threshold && n_complete != 0) {
      ## [[ ]] matches the column name exactly; `$` would also accept a
      ## partial match.
      cor(x = observed[["sulfate"]], y = observed[["nitrate"]])
    } else {
      NULL
    }
  })

  ## unlist() drops the NULL placeholders of skipped monitors; seeding
  ## with numeric(0) keeps the result numeric when every one is skipped.
  c(numeric(0), unlist(per_monitor, use.names = FALSE))
}