I do not fully understand the code below. But when I want to check whether all the elements in a row/column are the same, I usually check the variance or range and see whether it is nearly zero.
v.row <- apply( mat, 1, var ) v.col <- apply( mat, 2, var ) tol <- 0 good.row <- which( v.row > tol ) good.col <- which( v.col > tol ) Regards, Adai On Tue, 2005-08-09 at 12:22 +0200, Torsten Schindler wrote: > Hi, > > I'm a R newbie and want to accelerate the following pre-filtering > step of a data set with more than 115,000 rows : > > #----------------- > # Function to filter out constant data columns > filter.const<-function(X, vectors=c('column', 'row'), tol=0){ > realdata=c() > filteredX<-matrix() > if( vectors[1] == 'row' ){ > for( row in (1:nrow(X)) ){ > if( length(which(X[row,]!=median(X[row,])))>tol ){ > realdata[length(realdata)+1]=row > } > } > filteredX=X[realdata,] > } else if( vectors[1] == 'column' ){ > for( col in (1:ncol(X)) ){ > if( length(which(X[,col]!=median(X[,col])))>tol ){ > realdata[length(realdata)+1]=col > } > } > filteredX=X[,realdata] > } > return(list(x=filteredX, ix=realdata)) > } > > #----------------- > # Filter out all all-constant columns in my training data set > # > # Read training data set with class information in the first column > training <- read.csv('training_data.txt') > dim(training) # => 49 rows and 525 columns > > # Prepare column names by stripping the underline and the number at > the end > colnames(training) <- sub('_\\d+$', '', colnames(training), perl=TRUE) > > # Filter out the all-constant columns, exclude column 1, the class > column called myclass > training.filter <- filter.const(training[,-1]) > > # The filtered data frame is > training.filtered <- cbind(myclass=training[,1], training.filter$x) > dim(training.filtered) # => 49 rows and 250 columns > > # Save the filtered training set for later use in classification > filtered.data <- 'training_set_filtered.Rdata' > save(training.filtered, file=filtered.data) > > #----------------- > # THE FOLLOWING FILTERING STEP TAKES 3 HOUR ON MY PowerBook > # AND CONSUMES ABOUT 600 Mb MEMORY. > # > # I WOULD BE HAPPY ABOUT ANY HINT HOW TO IMPROVE THIS. 
> > # Pre-filter the big data set (more than 115,000 rows and 524 > columns) for later class predictions. > # The big data set contains the same column names as the training > set, but in a different order. > > input.file <- 'big_data_set.txt' > filtered.file <- 'big_data_set_filtered.txt' > > # Read header with first row > prediction.set <- read.csv(input.file, header=TRUE, skip=0, nrow=1) > > # Prepare column names by stripping the underline and the number at > the end > colnames(prediction.set) <- sub('_\\d+$', '', colnames > (prediction.set), perl=TRUE) > prediction.set.header <- colnames(prediction.set) > > # Get descriptor columns of the training data set without the > Activity_Class column > training.filtered.property.colnames <- colnames(training.filtered)[-1] > > # Filter out the all-constant columns from the training set > prediction.set.filtered <- prediction.set > [training.filtered.property.colnames] > dim(prediction.set.filtered) # => 1 row and 249 columns > > # Write header and the first filtered row > write.csv(prediction.set.filtered, file=filtered.file, > append=FALSE, > col.names=training.filtered.property.colnames) > > blocksize <- 1000 > for (lineid in (0:120)*blocksize) { > cat('lineid: ', lineid, '\n') > > # Read block of data > # We have to add an dummy colname "x" in the col.names, when the > header is not read! > prediction.set <- try(read.csv(input.file, header=FALSE, > col.names=c('x',prediction.set.header), > row.names=1, > skip=lineid+2, nrow=blocksize)) > if (class(prediction.set) == "try-error") break > > # Filter out all-constant training set columns from the block > prediction.set.filtered <- prediction.set > [training.filtered.property.colnames] > > # Append the data > # (I know this function is slow, but I couldn't figure out how to > do it faster, so far.) 
> write.table(prediction.set.filtered, file=filtered.file, > append=TRUE, col.names=FALSE, sep=",") > } > > #------------- > # Now read in the filtered data set and save it for later use in > classification > prediction.set.filtered <- read.csv(filtered.file, header=TRUE, > row.names=1) > filtered.data <- 'prediction_set_filtered.Rdata' > save(prediction.set.filtered, file=filtered.data) > > > > I would be very happy about any hints how to improve the code above!!! > > Best regards, > > Torsten > > ______________________________________________ > R-help@stat.math.ethz.ch mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide! http://www.R-project.org/posting-guide.html > ______________________________________________ R-help@stat.math.ethz.ch mailing list https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide! http://www.R-project.org/posting-guide.html