Hi, Try this:
final3New<-read.table(file="real_data_cecilia.txt",sep="\t") final3New1<-read.csv("real_data_cecilia_new.csv") fun2<-function(dat){ indx<- duplicated(dat)|duplicated(dat,fromLast=TRUE) dat1<- subset(dat[indx,],dummy==1) dat2<- dat1[order(dat1$dimension),] indx1<- as.numeric(row.names(dat2)) names(indx1)<- (seq_along(indx1)-1)%/%2+1 dat3<- dat[c(indx1,indx1+1),] dat3$id<- names(c(indx1,indx1+1)) lst1<- lapply(split(dat3,dat3$id),function(x){ x1<- x[-1,] x2<- x1[which.min(abs(x1$dimension[1]-x1$dimension[-1]))+1,] x3<- subset(x,dummy==1) rowNx2<- as.numeric(row.names(x2)) rowNx3<- as.numeric(row.names(x3)) x4<- x3[which.min(abs(rowNx2-rowNx3)),] x5<- rbind(x4,x2) x6<- x[is.na(match(row.names(x),row.names(x5))),] }) dat4<- do.call(rbind,lst1) row.names(dat4)<- gsub(".*\\.","",row.names(dat4)) indxNew1<- sort(as.numeric(unique(row.names(dat4)))) dat0<- subset(dat[indx,],dummy==0) if(nrow(dat0)>0){ dat20<-dat0[order(dat0$dimension),] indx0<- as.numeric(row.names(dat20)) names(indx0)<- (seq_along(indx0)-1)%/%2+1 dat30<- dat[c(indx0-1,indx0),] dat30$id<- names(c(indx0-1,indx0)) lst0<- lapply(split(dat30,dat30$id),function(x) { x1<- subset(x,dummy==1) x2<- subset(x,dummy==0) x3<- x1[which.min(abs(x1$dimension- unique(x2$dimension))),] rowNx2<- as.numeric(row.names(x2)) rowNx3<- as.numeric(row.names(x3)) x4<- x2[which.min(abs(rowNx2-rowNx3)),] x5<- rbind(x3,x4) x6<- x[is.na(match(row.names(x),row.names(x5))),] }) dat40<- do.call(rbind,lst0) row.names(dat40)<- gsub(".*\\.","",row.names(dat40)) indxNew0<- sort(as.numeric(unique(row.names(dat40)))) res1Del<-dat[indxNew1,] res0Del<-dat[indxNew0,] indx10<-sort(as.numeric(union(row.names(res0Del),row.names(res1Del)))) if(length(indx10)%%2==1){ res10Del<-unique(rbind(res1Del,res0Del)) indx10New<- sort(as.numeric(row.names(res10Del))) resF<- dat[-indx10New,] resF } else{ resF<- dat[-indx10,] resF } } else{ resF<- dat[-indxNew1,] } } ###Old Function fun3<- function(dat){ indx<- duplicated(dat) dat1<- subset(dat[indx,],dummy==1) dat0<- subset(dat[indx,],dummy==0) indx1<- as.numeric(row.names(dat1)) indx11<- sort(c(indx1,indx1+1)) indx0<- as.numeric(row.names(dat0)) indx00<- sort(c(indx0,indx0-1)) indx10<- sort(c(indx11,indx00)) res <- dat[-indx10,] res } ##Applying fun1() (from previous post) res5Percent<- fun1(final3New,0.05,50) res5Percent1<- fun1(final3New1,0.05,50) res10Percent<- fun1(final3New,0.10,200) res10Percent1<- fun1(final3New1,0.10,200) res20Percent<- fun1(final3New,0.20,100) res20Percent1<- fun1(final3New1,0.20,100) ###Applying fun2() res5F2<- fun2(res5Percent) res5F2_1<- fun2(res5Percent1) res10F2<- fun2(res10Percent) res10F2_1<- fun2(res10Percent1) res20F2<- fun2(res20Percent) res20F2_1<- fun2(res20Percent1) #Applying fun3() res5F3<- fun3(res5Percent) res5F3_1<- fun3(res5Percent1) res10F3<- fun3(res10Percent) res10F3_1<- fun3(res10Percent1) res20F3<- fun3(res20Percent) res20F3_1<- fun3(res20Percent1) vec1<- rep(c("res5F2","res10F2","res20F2"),2) vec2<- rep(c("res5F3","res10F3","res20F3"),2) vec1[4:6]<-paste(vec1[4:6],"_1",sep="") vec2[4:6]<-paste(vec2[4:6],"_1",sep="") resTbl<-data.frame( Dataset=rep(rep(c("final3New","final3New1"),each=3),2),Funct=rep(c("fun2","fun3"),each=6),do.call(rbind,lapply(as.list(c(vec1,vec2)),function(x) {x1<-get(x);c(N_row=nrow(x1),Sub0_Nrow=nrow(subset(x1,dummy==0)),Sub1_Nrow=nrow(subset(x1,dummy==1)),Uniq_Nrow=nrow(unique(x1)))})),stringsAsFactors=FALSE) row.names(resTbl)<- c(vec1,vec2) resTbl # Dataset Funct N_row Sub0_Nrow Sub1_Nrow Uniq_Nrow #res5F2 final3New fun2 276 138 138 276 #res10F2 final3New fun2 454 227 227 454 #res20F2 final3New fun2 284 142 142 284 #res5F2_1 final3New1 fun2 288 144 144 288 #res10F2_1 final3New1 fun2 488 244 244 488 #res20F2_1 final3New1 fun2 310 155 155 310 #res5F3 final3New fun3 276 138 138 276 #res10F3 final3New fun3 452 226 226 452 #res20F3 final3New fun3 284 142 142 284 #res5F3_1 final3New1 fun3 288 144 144 288 #res10F3_1 final3New1 fun3 488 244 244 488 #res20F3_1 final3New1 fun3 310 155 155 310 head(res5F2_1,4) # firm year industry dummy dimension #1 500622043 2004 1 1 1172 #2 501611886 2004 1 0 1183 #3 500778787 2004 1 1 5680 #4 500047006 2004 1 0 5692 A.K. ________________________________ From: Cecilia Carmo <cecilia.ca...@ua.pt> To: arun <smartpink...@yahoo.com> Sent: Tuesday, June 11, 2013 4:36 PM Subject: new data Here it is. Cecília ______________________________________________ R-help@r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code.