Re: [R] : Quantile and rowMean from multiple files in a folder

arun Mon, 14 Apr 2014 20:54:08 -0700



Hi,
Q1 solution already sent.

Regarding Q2, one of the files in the new Observed folder doesn't contain any
data (it has only the Year column).

That may be the reason for the error you are seeing.

### Q1: working directory: Observed
### Only one file per Site.  Assuming this is the case for the full dataset,
### then I guess there is no need to average.
dir.create("final")

# Group the CSV files by site, i.e. the part of the file name before the
# first underscore.  `pattern` is a regular expression, so the dot is escaped
# and the match is anchored to the end of the name; the directory is listed
# only once.
lst1 <- local({
    csv_files <- list.files(pattern = "\\.csv$")
    split(csv_files, gsub("\\_.*", "", csv_files))
})

# Read every file of every site.  The first two lines of a file are treated
# as header rows: each is split on commas and the pieces are pasted together
# element-wise to form the column names.  The last two data rows are dropped.
lst2 <- lapply(lst1, function(x1) lapply(x1, function(x2) {
    lines1 <- readLines(x2)
    header1 <- lines1[1:2]
    dat1 <- read.table(text = lines1, header = FALSE, sep = ",",
        stringsAsFactors = FALSE, skip = 2)
    colnames(dat1) <- Reduce(paste, strsplit(header1, ","))
    # Deliberately no `drop = FALSE`: a one-column table (e.g. a file holding
    # only the Year column) collapses to a plain vector here, and that is
    # what the filter below detects.
    dat1[-c(nrow(dat1), nrow(dat1) - 1), ]
}))

# Keep only the sites for which every file is still two-dimensional
# (ncol() is NULL, hence not integer, for a collapsed vector).  Using
# vapply()/all() keeps the selector a plain logical vector even when a site
# has more than one file.
lst3 <- lst2[vapply(lst2, function(site) {
    all(vapply(site, function(x) is.integer(ncol(x)), logical(1)))
}, logical(1))]


# Column count of every retained file: the sites differ in number of columns
sapply(seq_along(lst3), function(k) sapply(lst3[[k]], ncol))
# 
#[1] 157 258 258  98 157 258 256 258 250 258 258 147 157 250 250 256 249 240
# [19] 181 188 256 146 117 258 153 256 255 246 255 256 258 257 145 258 258 255
# [37] 258 157 164 144 265 258 254 258 258 157 258 176 258 256 257 258 258 258
# [55] 248 258 156 258 157 157 258 258 258 258 258 148 258 258 258 258 257 258
# [73] 258 258 157 154 153 258 248 255 257 256 258 258 157 256 256 257 257 250
# [91] 257 139 155 256 256 257 257 256 258 258 257 258 258 258 258 157 157 157
#[109] 258 258 258 258 256 258 157 258 258 256 258

library(plyr)
library(stringr)

# Take the first (only) file of each retained site and tag every column
# except the first with the site name, e.g. "<variable>_<site>".
# The site name must come from lst3 (the filtered list being iterated), not
# lst1: once a site has been filtered out, positions in lst1 and lst3 no
# longer line up and the columns would be labelled with the wrong site.
lst4 <- setNames(lapply(seq_along(lst3), function(i) {
    x <- lst3[[i]][[1]]
    names(x)[-1] <- paste(names(x)[-1], names(lst3)[i], sep = "_")
    names(x) <- str_trim(names(x))
    x
}), names(lst3))

# Merge all sites into one wide table keyed on the "Year" column
df1 <- join_all(lst4, by = "Year")
dim(df1)
# [1] 9 27311

# Sanity check: for every site (the suffix after the last underscore in the
# column names) build the 0%..100% quantile table and report its number of
# columns.  `drop = FALSE` keeps a data frame even when a site contributes a
# single column.
vapply(split(names(df1)[-1], gsub(".*\\_", "", names(df1)[-1])), function(x) {
    df2 <- df1[, x, drop = FALSE]
    df3 <- data.frame(Percentiles = paste0(seq(0, 100, by = 1), "%"),
        numcolwise(function(y) quantile(y, seq(0, 1, by = 0.01),
            na.rm = TRUE))(df2), stringsAsFactors = FALSE)
    ncol(df3)
}, integer(1))
# 
#G100 G101 G102 G103 G104 G105 G106 G107 G108 G109 G110 G111 G112 G113 G114 G115
# 157  258  258   98  157  258  256  258  250  258  258  147  157  250  250  256
#G116 G117 G118 G119 G120 GG10 GG11 GG12 GG13 GG14 GG15 GG16 GG17 GG18 GG19 GG20
# 249  240  181  188  256  146  117  258  153  256  255  246  255  256  258  257
#GG21 GG22 GG23 GG24 GG25 GG26 GG27 GG28 GG29 GG30 GG31 GG32 GG33 GG34 GG35 GG36
# 145  258  258  255  258  157  164  144  265  258  254  258  258  157  258  176
#GG37 GG38 GG39 GG40 GG41 GG42 GG43 GG44 GG45 GG46 GG47 GG48 GG49 GG50 GG51 GG52
# 258  256  257  258  258  258  248  258  156  258  157  157  258  258  258  258
#GG53 GG54 GG55 GG56 GG57 GG58 GG59 GG60 GG61 GG62 GG63 GG64 GG65 GG66 GG67 GG68
# 258  148  258  258  258  258  257  258  258  258  157  154  153  258  248  255
#GG69 GG70 GG71 GG72 GG73 GG74 GG75 GG76 GG77 GG78 GG79 GG80 GG81 GG82 GG83 GG84
# 257  256  258  258  157  256  256  257  257  250  257  139  155  256  256  257
#GG85 GG86 GG87 GG88 GG89 GG90 GG91 GG92 GG93 GG94 GG95 GG96 GG97 GG98 GG99 GGG1
# 257  256  258  258  257  258  258  258  258  157  157  157  258  258  258  258
#GGG2 GGG3 GGG4 GGG5 GGG6 GGG7 GGG8 
# 256  258  157  258  258  256  258 



# Column names of df1 (without Year) grouped by site suffix
lst5 <- split(names(df1)[-1], gsub(".*\\_", "", names(df1)[-1]))

# Write one "<site>_Quantile.csv" per site into the "final" directory.
# The file is named after names(lst5)[[i]] -- the group actually being
# processed -- so the name always agrees with the columns it contains.
invisible(lapply(seq_along(lst5), function(i) {
    # drop = FALSE: keep a data frame even for a single-column site
    df2 <- df1[, lst5[[i]], drop = FALSE]
    df3 <- data.frame(Percentiles = paste0(seq(0, 100, by = 1), "%"),
        numcolwise(function(y) quantile(y, seq(0, 1, by = 0.01),
            na.rm = TRUE))(df2), stringsAsFactors = FALSE)
    write.csv(df3,
        file.path(getwd(), "final",
            paste0(names(lst5)[[i]], "_Quantile.csv")),
        row.names = FALSE, quote = FALSE)
}))

# Read back every file whose path below the working directory contains
# "Quantile"
ReadOut1 <- lapply(
    grep("Quantile", list.files(recursive = TRUE), value = TRUE),
    read.csv, header = TRUE, stringsAsFactors = FALSE)

# Dimensions (rows, columns) of the first three tables
sapply(ReadOut1, dim)[, 1:3]
#     [,1] [,2] [,3]
#[1,]  101  101  101
#[2,]  157  258  258


# Top-left corner of the first three tables
lapply(ReadOut1, function(tab) tab[1:2, 1:3])[1:3]
#[[1]]
#  Percentiles pav.DJF_G100 pav.MAM_G100
#1          0%            0     0.640500
#2          1%            0     0.664604
#
#[[2]]
#  Percentiles txav.DJF_G101 txav.MAM_G101
#1          0%      -13.8756      4.742400
#2          1%      -13.8140      4.817184
#
#[[3]]
#  Percentiles txav.DJF_G102 txav.MAM_G102
#1          0%     -15.05000      4.520700
#2          1%     -14.96833      4.543828


### Q2: Observed data

dir.create("Indices")

# Collect the distinct index names across all quantile tables: strip the
# "_<site>" suffix and turn the dots that data.frame()/read.csv() put into
# the column names back into spaces.
names1 <- unlist(lapply(ReadOut1, function(x) names(x)[-1]))
names2 <- gsub("\\_.*", "", names1)
names3 <- unique(gsub("[.]", " ", names2))

# One row per site (in lst5 order), one column per index in names3; an index
# a site does not have stays NA.
res <- do.call(rbind, lapply(seq_along(lst5), function(i) {
    # drop = FALSE: colMeans() needs a data frame even for a single column
    df2 <- df1[, lst5[[i]], drop = FALSE]
    vec1 <- colMeans(df2, na.rm = TRUE)
    vec2 <- setNames(rep(NA_real_, length(names3)),
        paste(names3, names(lst5)[[i]], sep = "_"))
    # Place the means by name, not by position: the site's columns need not
    # appear in the same order as names3.
    idx <- match(names(vec1), names(vec2))
    vec2[idx[!is.na(idx)]] <- vec1[!is.na(idx)]
    # names3 holds no underscore, so this equals stripping the site suffix
    names(vec2) <- names3
    vec2
}))
dim(res)
#[1] 119 264

# Write one CSV per index (column of `res`) into "Indices": a single row
# with one column per site.  The rows of `res` follow lst5, so the site
# labels are taken from names(lst5).
invisible(lapply(seq_len(ncol(res)), function(i) {
    mat1 <- t(res[, i, drop = FALSE])
    colnames(mat1) <- names(lst5)
    # The row name of mat1 is the index name; spaces become underscores in
    # the file name.
    write.csv(mat1,
        file.path(getwd(), "Indices",
            paste0(gsub(" ", "_", rownames(mat1)), ".csv")),
        row.names = FALSE, quote = FALSE)
}))

## Output2:
# Read back every file whose path below the working directory contains
# "Indices"
ReadOut2 <- lapply(
    grep("Indices", list.files(recursive = TRUE), value = TRUE),
    read.csv, header = TRUE, stringsAsFactors = FALSE)
length(ReadOut2)
# [1]  264

# First matching file, to compare with the corresponding column of `res`
grep("Indices", list.files(recursive = TRUE), value = TRUE)[1]
# [1] 'Indices/pav_ANN.csv'
res[, "pav ANN", drop = FALSE]


ReadOut2[[1]]


Attached is the updated Quantilecode2.txt.

A.K.


On Monday, April 14, 2014 10:41 PM, Zilefac Elvis <zilefacel...@yahoo.com> 
wrote:
Hi AK,
Q1) Please try to correct the error using the larger data set (Sample.zip). The 
issue is that when the code is written for and tested on smaller data sets, I 
find it difficult to generalize it to larger data sets.

Q2) In the Quantilecode2.txt you just sent, you forgot to do the following 
section using the Observed.zip file. I tried to run the code to section Q1 in 
Quantilecode2.txt using a larger data set and received the same error: "Error 
in 2:nrow(lstNew) : argument of length 0". I have attached a larger data set 
too, so that you can generalize the code to suit it. Please do not 
forget to include the code below in the final code of Q2.


Once you fix these two, I should be able to fix the rest following these 
examples.

Thanks AK. Sorry for overloading you with much work.
Atem.

#==============================================================================================================
dir.create("Indices")
names1 <- lapply(ReadOut1, function(x) names(x))[[1]]
lstNew <- simplify2array(ReadOut1)
# simplify2array() returns its input list unchanged unless all data frames
# have the same number of columns.  nrow() of that list is NULL, and
# 2:nrow(lstNew) then fails with "argument of length 0".  Fail early with a
# clearer condition instead.
stopifnot(is.matrix(lstNew), nrow(lstNew) >= 2)
# One CSV per row of lstNew except the first: that row's column from every
# file, bound side by side next to the first cell of lstNew.
lapply(2:nrow(lstNew), function(i) {
    dat1 <- data.frame(lstNew[1], do.call(cbind, lstNew[i, ]),
        stringsAsFactors = FALSE)
    colnames(dat1) <- c(rownames(lstNew)[1], paste(names(lst1),
        rep(rownames(lstNew)[i], length(lst1)), sep = "_"))
    write.csv(dat1, paste0(paste(getwd(), "Indices", rownames(lstNew)[i],
        sep = "/"), ".csv"), row.names = FALSE, quote = FALSE)
})
## Output2:
ReadOut2 <- lapply(list.files(recursive = TRUE)[grep("Indices",
    list.files(recursive = TRUE))], function(x) read.csv(x, header = TRUE,
    stringsAsFactors = FALSE))
length(ReadOut2)
# [1] 257
head(ReadOut2[[1]], 2)

#==============================================================================================================


=================================================

### Q1: working directory: Observed
### Only one file per Site.  Assuming this is the case for the full dataset,
### then I guess there is no need to average.
dir.create("final")

# Group the CSV files by site, i.e. the part of the file name before the
# first underscore.  `pattern` is a regular expression, so the dot is escaped
# and the match is anchored to the end of the name; the directory is listed
# only once.
lst1 <- local({
    csv_files <- list.files(pattern = "\\.csv$")
    split(csv_files, gsub("\\_.*", "", csv_files))
})

# Read every file of every site.  The first two lines of a file are treated
# as header rows: each is split on commas and the pieces are pasted together
# element-wise to form the column names.  The last two data rows are dropped.
lst2 <- lapply(lst1, function(x1) lapply(x1, function(x2) {
    lines1 <- readLines(x2)
    header1 <- lines1[1:2]
    dat1 <- read.table(text = lines1, header = FALSE, sep = ",",
        stringsAsFactors = FALSE, skip = 2)
    colnames(dat1) <- Reduce(paste, strsplit(header1, ","))
    # Deliberately no `drop = FALSE`: a one-column table (e.g. a file holding
    # only the Year column) collapses to a plain vector here, and that is
    # what the filter below detects.
    dat1[-c(nrow(dat1), nrow(dat1) - 1), ]
}))

# Keep only the sites for which every file is still two-dimensional
# (ncol() is NULL, hence not integer, for a collapsed vector).  Using
# vapply()/all() keeps the selector a plain logical vector even when a site
# has more than one file.
lst3 <- lst2[vapply(lst2, function(site) {
    all(vapply(site, function(x) is.integer(ncol(x)), logical(1)))
}, logical(1))]


# Column count of every retained file: the sites differ in number of columns
sapply(seq_along(lst3), function(k) sapply(lst3[[k]], ncol))
# 
#[1] 157 258 258  98 157 258 256 258 250 258 258 147 157 250 250 256 249 240
# [19] 181 188 256 146 117 258 153 256 255 246 255 256 258 257 145 258 258 255
# [37] 258 157 164 144 265 258 254 258 258 157 258 176 258 256 257 258 258 258
# [55] 248 258 156 258 157 157 258 258 258 258 258 148 258 258 258 258 257 258
# [73] 258 258 157 154 153 258 248 255 257 256 258 258 157 256 256 257 257 250
# [91] 257 139 155 256 256 257 257 256 258 258 257 258 258 258 258 157 157 157
#[109] 258 258 258 258 256 258 157 258 258 256 258

library(plyr)
library(stringr)

# Take the first (only) file of each retained site and tag every column
# except the first with the site name, e.g. "<variable>_<site>".
# The site name must come from lst3 (the filtered list being iterated), not
# lst1: once a site has been filtered out, positions in lst1 and lst3 no
# longer line up and the columns would be labelled with the wrong site.
lst4 <- setNames(lapply(seq_along(lst3), function(i) {
    x <- lst3[[i]][[1]]
    names(x)[-1] <- paste(names(x)[-1], names(lst3)[i], sep = "_")
    names(x) <- str_trim(names(x))
    x
}), names(lst3))

# Merge all sites into one wide table keyed on the "Year" column
df1 <- join_all(lst4, by = "Year")
dim(df1)
# [1] 9 27311

# Sanity check: for every site (the suffix after the last underscore in the
# column names) build the 0%..100% quantile table and report its number of
# columns.  `drop = FALSE` keeps a data frame even when a site contributes a
# single column.
vapply(split(names(df1)[-1], gsub(".*\\_", "", names(df1)[-1])), function(x) {
    df2 <- df1[, x, drop = FALSE]
    df3 <- data.frame(Percentiles = paste0(seq(0, 100, by = 1), "%"),
        numcolwise(function(y) quantile(y, seq(0, 1, by = 0.01),
            na.rm = TRUE))(df2), stringsAsFactors = FALSE)
    ncol(df3)
}, integer(1))
# 
#G100 G101 G102 G103 G104 G105 G106 G107 G108 G109 G110 G111 G112 G113 G114 G115
# 157  258  258   98  157  258  256  258  250  258  258  147  157  250  250  256
#G116 G117 G118 G119 G120 GG10 GG11 GG12 GG13 GG14 GG15 GG16 GG17 GG18 GG19 GG20
# 249  240  181  188  256  146  117  258  153  256  255  246  255  256  258  257
#GG21 GG22 GG23 GG24 GG25 GG26 GG27 GG28 GG29 GG30 GG31 GG32 GG33 GG34 GG35 GG36
# 145  258  258  255  258  157  164  144  265  258  254  258  258  157  258  176
#GG37 GG38 GG39 GG40 GG41 GG42 GG43 GG44 GG45 GG46 GG47 GG48 GG49 GG50 GG51 GG52
# 258  256  257  258  258  258  248  258  156  258  157  157  258  258  258  258
#GG53 GG54 GG55 GG56 GG57 GG58 GG59 GG60 GG61 GG62 GG63 GG64 GG65 GG66 GG67 GG68
# 258  148  258  258  258  258  257  258  258  258  157  154  153  258  248  255
#GG69 GG70 GG71 GG72 GG73 GG74 GG75 GG76 GG77 GG78 GG79 GG80 GG81 GG82 GG83 GG84
# 257  256  258  258  157  256  256  257  257  250  257  139  155  256  256  257
#GG85 GG86 GG87 GG88 GG89 GG90 GG91 GG92 GG93 GG94 GG95 GG96 GG97 GG98 GG99 GGG1
# 257  256  258  258  257  258  258  258  258  157  157  157  258  258  258  258
#GGG2 GGG3 GGG4 GGG5 GGG6 GGG7 GGG8 
# 256  258  157  258  258  256  258 



# Column names of df1 (without Year) grouped by site suffix
lst5 <- split(names(df1)[-1], gsub(".*\\_", "", names(df1)[-1]))

# Write one "<site>_Quantile.csv" per site into the "final" directory.
# The file is named after names(lst5)[[i]] -- the group actually being
# processed -- so the name always agrees with the columns it contains.
invisible(lapply(seq_along(lst5), function(i) {
    # drop = FALSE: keep a data frame even for a single-column site
    df2 <- df1[, lst5[[i]], drop = FALSE]
    df3 <- data.frame(Percentiles = paste0(seq(0, 100, by = 1), "%"),
        numcolwise(function(y) quantile(y, seq(0, 1, by = 0.01),
            na.rm = TRUE))(df2), stringsAsFactors = FALSE)
    write.csv(df3,
        file.path(getwd(), "final",
            paste0(names(lst5)[[i]], "_Quantile.csv")),
        row.names = FALSE, quote = FALSE)
}))

# Read back every file whose path below the working directory contains
# "Quantile"
ReadOut1 <- lapply(
    grep("Quantile", list.files(recursive = TRUE), value = TRUE),
    read.csv, header = TRUE, stringsAsFactors = FALSE)

# Dimensions (rows, columns) of the first three tables
sapply(ReadOut1, dim)[, 1:3]
#     [,1] [,2] [,3]
#[1,]  101  101  101
#[2,]  157  258  258


# Top-left corner of the first three tables
lapply(ReadOut1, function(tab) tab[1:2, 1:3])[1:3]
#[[1]]
#  Percentiles pav.DJF_G100 pav.MAM_G100
#1          0%            0     0.640500
#2          1%            0     0.664604
#
#[[2]]
#  Percentiles txav.DJF_G101 txav.MAM_G101
#1          0%      -13.8756      4.742400
#2          1%      -13.8140      4.817184
#
#[[3]]
#  Percentiles txav.DJF_G102 txav.MAM_G102
#1          0%     -15.05000      4.520700
#2          1%     -14.96833      4.543828


### Q2: Observed data

dir.create("Indices")

# Collect the distinct index names across all quantile tables: strip the
# "_<site>" suffix and turn the dots that data.frame()/read.csv() put into
# the column names back into spaces.
names1 <- unlist(lapply(ReadOut1, function(x) names(x)[-1]))
names2 <- gsub("\\_.*", "", names1)
names3 <- unique(gsub("[.]", " ", names2))

# One row per site (in lst5 order), one column per index in names3; an index
# a site does not have stays NA.
res <- do.call(rbind, lapply(seq_along(lst5), function(i) {
    # drop = FALSE: colMeans() needs a data frame even for a single column
    df2 <- df1[, lst5[[i]], drop = FALSE]
    vec1 <- colMeans(df2, na.rm = TRUE)
    vec2 <- setNames(rep(NA_real_, length(names3)),
        paste(names3, names(lst5)[[i]], sep = "_"))
    # Place the means by name, not by position: the site's columns need not
    # appear in the same order as names3.
    idx <- match(names(vec1), names(vec2))
    vec2[idx[!is.na(idx)]] <- vec1[!is.na(idx)]
    # names3 holds no underscore, so this equals stripping the site suffix
    names(vec2) <- names3
    vec2
}))
dim(res)
#[1] 119 264

# Write one CSV per index (column of `res`) into "Indices": a single row
# with one column per site.  The rows of `res` follow lst5, so the site
# labels are taken from names(lst5).
invisible(lapply(seq_len(ncol(res)), function(i) {
    mat1 <- t(res[, i, drop = FALSE])
    colnames(mat1) <- names(lst5)
    # The row name of mat1 is the index name; spaces become underscores in
    # the file name.
    write.csv(mat1,
        file.path(getwd(), "Indices",
            paste0(gsub(" ", "_", rownames(mat1)), ".csv")),
        row.names = FALSE, quote = FALSE)
}))

## Output2:
# Read back every file whose path below the working directory contains
# "Indices"
ReadOut2 <- lapply(
    grep("Indices", list.files(recursive = TRUE), value = TRUE),
    read.csv, header = TRUE, stringsAsFactors = FALSE)
length(ReadOut2)
# [1]  264

# First matching file, to compare with the corresponding column of `res`
grep("Indices", list.files(recursive = TRUE), value = TRUE)[1]
# [1] 'Indices/pav_ANN.csv'
res[, "pav ANN", drop = FALSE]


ReadOut2[[1]]

### Sample data Working directory changed to 'sample'
dir.create("Indices_colMeans")

# Group the CSV files by site (file-name part before the first underscore).
# `pattern` is a regular expression: escape the dot and anchor the match.
lst1 <- local({
    csv_files <- list.files(pattern = "\\.csv$")
    split(csv_files, gsub("\\_.*", "", csv_files))
})

# Read every file of every site; the first two lines form the column names
# (pasted element-wise) and the last two data rows are dropped.
lst2 <- lapply(lst1, function(x1) lapply(x1, function(x2) {
    lines1 <- readLines(x2)
    header1 <- lines1[1:2]
    dat1 <- read.table(text = lines1, header = FALSE, sep = ",",
        stringsAsFactors = FALSE, skip = 2)
    colnames(dat1) <- Reduce(paste, strsplit(header1, ","))
    # drop = FALSE: there is no filtering step in this section, so keep the
    # result a data frame even when the file has a single column
    dat1[-c(nrow(dat1), nrow(dat1) - 1), , drop = FALSE]
}))

# One row per site: column means of each file (first column excluded),
# averaged across the files of the site.  seq_along() keeps the rows
# unnamed, as before.
res1 <- do.call(rbind, lapply(seq_along(lst2), function(i) {
    rowMeans(do.call(cbind, lapply(lst2[[i]], function(x) {
        # drop = FALSE: colMeans() needs two dimensions even when only one
        # column remains after removing the first
        colMeans(x[, -1, drop = FALSE], na.rm = TRUE)
    })), na.rm = TRUE)
}))

# One CSV per column of res1, written to "Indices_colMeans": a single row
# with one column per site; spaces in the index name become underscores.
lapply(seq_len(ncol(res1)), function(j) {
    out <- t(res1[, j, drop = FALSE])
    colnames(out) <- names(lst2)
    target <- paste0(paste(getwd(), "Indices_colMeans",
        gsub(" ", "_", rownames(out)), sep = "/"), ".csv")
    write.csv(out, target, row.names = FALSE, quote = FALSE)
})

## Output2 Sample
# Read back every file whose path below the working directory contains
# "Indices"
ReadOut2S <- lapply(
    grep("Indices", list.files(recursive = TRUE), value = TRUE),
    read.csv, header = TRUE, stringsAsFactors = FALSE)
length(ReadOut2S)
# [1] 257
grep("Indices", list.files(recursive = TRUE), value = TRUE)[1]
# [1] 'Indices_colMeans/pav_ANN.csv'
res1[, "pav ANN", drop = FALSE]
# pav ANN [1,] 1.545620 [2,] 1.518553

ReadOut2S[[1]]
# G100 G101 1 1.54562 1.518553

______________________________________________
R-help@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

Re: [R] : Quantile and rowMean from multiple files in a folder

Reply via email to