Re:_: how to replace values in x by means in subgroups created in ...(not loops)


Thanks, below some code and reply:
#_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_#
#-------------------------------------------------------
# my slow function with loops:
# replace_x_by_locallyMean_x_4_0s_in_y.f(x,y)
#-------------------------------------------------------
#-------------------------------------------------------
# replace_x_by_locallyMean_x_4_0s_in_y.f
  # Arguments: function takes vectors x, y
  #            these vector have the same length: length(x)==length(y)
  # function replaces in x
  #     values that have in indexes in y
  #     at least one zero in the same position in y
  #     by locally compted mean of values in x
  #        from all local x that also have continous 000 in their positions 
close to this x
  #     in y vector
  #x   # 0   1   2   3   4   5   6   7   8   9
  #y   # 0   0   0   1   0   0   0   1   1   1
  # Value expected:
  #x_m # 1.5 1.5 1.5 3.0 5.0 5.0 5.0 7.0 8.0 9.0
  #-------------------------------------------------------
  # author: me
  # replace_x_by_locallyMean_x_4_0s_in_y.f(x,y)
  #-------------------------------------------------------
 #initialisation

Dff<-NULL
index_Dff.start_0<-NULL
index_Dff.stop_0<-NULL
x<-NULL
y<-NULL
replace_x_by_locallyMean_x_4_0s_in_y.f<-NULL
replace_x_by_locallyMean_x_4_0s_in_y.f<-function(x=Df$x,
                                        y=Df$y){
  Dff<-as.data.frame(cbind(as.vector(x),as.vector(y)))
  colnames(Dff)<-c('x','y')
  index_Dff.start_0<-which( ( (Dff$y==0) &
                         rev(rev(c(0,Dff$y))[-1]) !=0) )
  if (Dff$y[1]==0) {
    index_Dff.start_0=c(1,index_Dff.start_0)
  }
  index_Dff.stop_0<-which( ( (Dff$y==0) &
                         c(Dff$y[-1],rev(Dff$y)[1]) !=0) )
  if (rev(Dff$y)[1]==0) {
     index_Dff.stop_0=c(index_Dff.stop_0,length(x))
  }
  Dff$x_m<-Dff$x
  for (i in ( 1:length(index_Dff.start_0) ) ) {
     Dff$x_m[index_Dff.start_0[i]:index_Dff.stop_0[i]]<-mean(
          Dff$x[index_Dff.start_0[i]:index_Dff.stop_0[i]],na.rm=T)
     }
  for (i in ( 1:length(index_Dff.start_0) ) ) {
     Dff$x_m[index_Dff.start_0[i]:index_Dff.stop_0[i]]<-mean(
          Dff$x[index_Dff.start_0[i]:index_Dff.stop_0[i]],na.rm=T)
     }
  Dff$x_m
  }
#replace_x_by_locallyMean_x_4_0s_in_y.f()


#-------------------------------------------------------
# replace_x_by_locallyMean_x_4_0s_in_y.f__Thierry
  # Arguments: function takes vectors x, y
  #            these vector have the same length: length(x)==length(y)
  # function replaces in x
  #     values that have in indexes in y
  #     at least one zero in the same position in y
  #     by locally compted mean of values in x
  #        from all local x that also have continous 000 in their positions 
close to this x
  #     in y vector
  #x   # 0   1   2   3   4   5   6   7   8   9
  #y   # 0   0   0   1   0   0   0   1   1   1
  # Value expected:
  #x_m # 1.5 1.5 1.5 3.0 5.0 5.0 5.0 7.0 8.0 9.0
  #-------------------------------------------------------
  # author: Thierry
  # replace_x_by_Avgx_for000iny_Thierry.f(x,y)
  #-------------------------------------------------------
 #initialisation
  dataset <- NULL
  tmp <- NULL
  result <- NULL
  replace_x_by_locallyMean_x_4_0s_in_y.f__Thierry<-NULL
library(plyr)

replace_x_by_locallyMean_x_4_0s_in_y.f__Thierry<-function(x=x,
                                        y=y){
  dataset <- data.frame(x = x, y = y)
  dataset$Group <- cumsum(c(0, diff(!is.na(dataset$y) & dataset$y == 0)) == 1)
  #library(plyr)
  tmp <- ddply(subset(dataset, y == 0), .(Group), function(z){c(Mean = 
mean(z$x, na.rm = TRUE))})
  result <- merge(dataset, tmp)
  result$Mean[is.na(result$y) | result$y != 0] <- result$x[is.na(result$y) | 
result$y != 0]
  #result
  result$Mean
} #replace_x_by_locallyMean_x_4_0s_in_y.f__Thierry(x,y)


# Let's test it for longer vector
x<-1:(50*1000)
 y<-sample(c(0,1), length(x), replace = TRUE)
 # for Thierry we must start by 1 if it start by 0,
y<-c(0,1,sample(c(0,1), length(x)-2, replace = TRUE)  )
paste(length(x),length(y))
# let's comapre times:
# slow solution with loops, my function x_by_locallyMean_x_4_0s_in_y__loop()
    #  system.time(
    #     x_by_locallyMean_x_4_0s_in_y__loop<-
    #        replace_x_by_locallyMean_x_4_0s_in_y.f(x,y)
    #     )
    # _for_ x<-1:(50*1000) #
    #  user  system elapsed
    # 760.34    0.86  769.16
    #---------------------------------------------------------------
# time of Thierry's function :
        system.time(
        x_by_locallyMean_x_4_0s_in_y__Thierry<-
           replace_x_by_locallyMean_x_4_0s_in_y.f__Thierry(x,y)
           )
#Df.x_m_Thierry (10x faster) then my loop
# _for_ x<-1:500000
# user  system elapsed
#  11.61    0.00   11.68

# good news:
# 769.16 sec / 11.68 sec => Thierry's solution is 65 (x) times faster than 
loops,
# great!

length(x_by_locallyMean_x_4_0s_in_y__Thierry)
cbind(x,y,x_by_locallyMean_x_4_0s_in_y__Thierry) [1:15,]
paste(' good, faster than my loops ')

#1. Thierry, your function is fast, ()
#    and it does perfectly the problem that I proposed.

#2. Could we ask for a little modification in that code,
#    to make it more general,
#    more generaal, i.e. to do it working when y starts from "1" y=c(1,...

 x<-1:(50*1000)
 y<-c(1,sample(c(0,1), length(x)-1, replace = TRUE)  )

#-------------------------------------------------------------------------


    #---------------------------------------------------------------------
    # next to try in looking for faster solutions mayby:
    #require(sqldf)
    # centered moving average of length 7
    #set.seed(1)
    #DF <- data.frame(x = rnorm(15, 1:15))
    #s18 <- sqldf("select a.x x, avg(b.x) movavgx from DF a, DF b
    #   where a.row_names - b.row_names between -3 and 3
    #   group by a.row_names having count(*) = 7
    #   order by a.row_names+0",
    # row.names = TRUE)
    #r18 <- data.frame(x = DF[4:12,], movavgx = rowMeans(embed(DF$x, 7)))
    #row.names(r18) <- NULL
    #all.equal(r18, s18)
    #---------------------------------------------------------------------






 
#_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_#

-----Wiadomo¶æ oryginalna-----
>Od: ONKELINX, Thierry [mailto:thierry.onkel...@inbo.be]
>Wys³ano: ¦r 2011-07-27 11:36
>Do: Eugeniusz Ka³u¿a; r-help@r-project.org
>Temat: RE: how to replace values in x by means in subgroups created in ...(not 
>loops)
> 
>Something like this?
>
>dataset <- data.frame(x = x, y = y)
>dataset$Group <- cumsum(c(0, diff(!is.na(dataset$y) & dataset$y == 0)) == 1)
>library(plyr)
>tmp <- ddply(subset(dataset, y == 0), .(Group), function(z){c(Mean = mean(z$x, 
>na.rm = TRUE))})
>result <- merge(dataset, tmp)
>result$Mean[is.na(result$y) | result$y != 0] <- result$x[is.na(result$y) | 
>result$y != 0]
>
>Best regards,
>
>Thierry

>> -----Oorspronkelijk bericht-----
>> Van: r-help-boun...@r-project.org [mailto:r-help-boun...@r-project.org]
>> Namens Eugeniusz Kaluza
>> Verzonden: woensdag 27 juli 2011 11:21
>> Aan: r-help@r-project.org
>> Onderwerp: [R] how to replace values in x by means in subgroups created in 
>> ...
>> (not loops)
>> 
>> 
>> # Dear all,
>> # how to replace values in x by means in subgroups created in ...
>> # replace only these values where y=0 in continous sequence # replace by mean
>> calculated locally for each subgroup created by # continous sequence of 
>> 0,0,0 in
>> parallel y vector, i.e.
>> # where there is continous sequence of 0 in data frame vector y
>> #    but we do not replace values in x[i], if y[i]!=0
>> # we do not want use loops we do not use apply (not very fast)
>> 
>> 
>> x      <-c(0 ,1,2,3,4,5,6,7,8,NA,NA,1  ,1 ,NA,2  ,2)
>> y      <-c(0 ,0,0,1,0,0,0,1,1,1 ,NA,0  ,0 ,0 ,0  ,1)
>> Must_be<-c(1 ,1,1,3,5,5,5,7,8,NA,NA,1.5,1 ,NA,1.5,2)
>> 
>> (df<-as.data.frame(cbind(x,y))  )
>> 
>> # I have traied many bad colusions based on cumsum, pmin, pmax, ...
>> (mean_dfx_if_yIs0<-y*cumsum(x*y)/(cumsum(y)*y) )
>> 
>> # how to do this?
>> # thans for any advice
>> # E


        [[alternative HTML version deleted]]

______________________________________________
R-help@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

Reply via email to