Re: [Rd] a fast table() for the 1D case
Any chance some improvements can be made on table()? table() is probably one of the most used R functions when working interactively. Unfortunately it can be incredibly slow, especially on a logical vector where a simple sum() is hundred times faster (I actually got into the habit of using sum() instead of table()). The table1D() proposal below doesn't go as far as using sum() on a logical vector but it already provides significant speedups for most use cases. Thanks, H. On 08/09/2013 01:19 AM, Hervé Pagès wrote: Hi, table1D() below can be up to 60x faster than base::table() for the 1D case. Here are the detailed speedups compared to base::table(). o With a logical vector of length 5M: 11x faster (or more if 'useNA=always') o With factor/integer/numeric/character of length 1M and 9 levels (or 9 distinct values for non-factors): - factor: 60x faster - integer/numeric vector: 12x faster - character vector: 2.4x faster o With factor/integer/numeric/character of length 1M and no duplicates: - factor: 5x faster - integer vector: 2x faster - numeric vector:1.7x faster - character vector: no significant speedup Would be great if this improvement could make it into base::table(). Thanks, H. ## A fast table() implementation for the 1D case (replacing the '...' ## arg with 'x' and omitting the 'dnn' and 'deparse.level' arguments ## which are unrelated to performance). table1D - function(x, exclude = if (useNA == no) c(NA, NaN), useNA = c(no, ifany, always)) { if (!missing(exclude) is.null(exclude)) { useNA - always } else { useNA - match.arg(useNA) } if (useNA == always !missing(exclude)) exclude - setdiff(exclude, NA) if (is.factor(x)) { x2 - levels(x) append_NA - (useNA == always || useNA == ifany any(is.na(x))) !any(is.na(x2)) if (append_NA) { x2 - c(x2, NA) x - factor(x, levels=x2, exclude=NULL) } t2 - tabulate(x, nbins=length(x2)) if (!is.null(exclude)) { keep_idx - which(!(x2 %in% exclude)) x2 - x2[keep_idx] t2 - t2[keep_idx] } } else { xx - match(x, x) t - tabulate(xx, nbins=length(xx)) keep_idx - which(t != 0L) x2 - x[keep_idx] t2 - t[keep_idx] if (!is.null(exclude)) { exclude - as.vector(exclude, typeof(x)) keep_idx - which(!(x2 %in% exclude)) x2 - x2[keep_idx] t2 - t2[keep_idx] } oo - order(x2) x2 - x2[oo] t2 - t2[oo] append_NA - useNA == always !any(is.na(x2)) if (append_NA) { x2 - c(x2, NA) t2 - c(t2, 0L) } } ans - array(t2) dimnames(ans) - list(as.character(x2)) names(dimnames(ans)) - x # always set to 'x' class(ans) - table ans } table1D() also fixes some issues with base::table() that can be exposed by running the tests below. test_table - function(FUN_NAME) { FUN - match.fun(FUN_NAME) .make_target - function(target_names, target_data) { ans - array(target_data) dimnames(ans) - list(as.character(target_names)) names(dimnames(ans)) - x class(ans) - table ans } .check_identical - function(target, current, varname, extra_args) { if (identical(target, current)) return() if (extra_args != ) extra_args - paste0(, , extra_args) cat(unexpected result for ', FUN_NAME, (x=, varname, extra_args, )'\n, sep=) } .test_exclude - function(x, varname, target_names0, target_data0, exclude) { extra_args - paste0(exclude=, deparse(exclude)) current - FUN(x=x, exclude=exclude) target_names - target_names0 target_data - target_data0 if (is.null(exclude)) { if (!any(is.na(target_names))) { target_names - c(target_names, NA) target_data - c(target_data, 0L) } } else { if (!is.factor(x)) { exclude - as.vector(exclude, typeof(x)) } else if (!any(is.na(levels(x { exclude - union(exclude, NA) } exclude_idx - match(exclude, target_names, nomatch=0L) if (any(exclude_idx != 0L)) { target_names - target_names[-exclude_idx] target_data - target_data[-exclude_idx] } } target - .make_target(target_names, target_data) .check_identical(target,
[Rd] a fast table() for the 1D case
Hi, table1D() below can be up to 60x faster than base::table() for the 1D case. Here are the detailed speedups compared to base::table(). o With a logical vector of length 5M: 11x faster (or more if 'useNA=always') o With factor/integer/numeric/character of length 1M and 9 levels (or 9 distinct values for non-factors): - factor: 60x faster - integer/numeric vector: 12x faster - character vector: 2.4x faster o With factor/integer/numeric/character of length 1M and no duplicates: - factor: 5x faster - integer vector: 2x faster - numeric vector:1.7x faster - character vector: no significant speedup Would be great if this improvement could make it into base::table(). Thanks, H. ## A fast table() implementation for the 1D case (replacing the '...' ## arg with 'x' and omitting the 'dnn' and 'deparse.level' arguments ## which are unrelated to performance). table1D - function(x, exclude = if (useNA == no) c(NA, NaN), useNA = c(no, ifany, always)) { if (!missing(exclude) is.null(exclude)) { useNA - always } else { useNA - match.arg(useNA) } if (useNA == always !missing(exclude)) exclude - setdiff(exclude, NA) if (is.factor(x)) { x2 - levels(x) append_NA - (useNA == always || useNA == ifany any(is.na(x))) !any(is.na(x2)) if (append_NA) { x2 - c(x2, NA) x - factor(x, levels=x2, exclude=NULL) } t2 - tabulate(x, nbins=length(x2)) if (!is.null(exclude)) { keep_idx - which(!(x2 %in% exclude)) x2 - x2[keep_idx] t2 - t2[keep_idx] } } else { xx - match(x, x) t - tabulate(xx, nbins=length(xx)) keep_idx - which(t != 0L) x2 - x[keep_idx] t2 - t[keep_idx] if (!is.null(exclude)) { exclude - as.vector(exclude, typeof(x)) keep_idx - which(!(x2 %in% exclude)) x2 - x2[keep_idx] t2 - t2[keep_idx] } oo - order(x2) x2 - x2[oo] t2 - t2[oo] append_NA - useNA == always !any(is.na(x2)) if (append_NA) { x2 - c(x2, NA) t2 - c(t2, 0L) } } ans - array(t2) dimnames(ans) - list(as.character(x2)) names(dimnames(ans)) - x # always set to 'x' class(ans) - table ans } table1D() also fixes some issues with base::table() that can be exposed by running the tests below. test_table - function(FUN_NAME) { FUN - match.fun(FUN_NAME) .make_target - function(target_names, target_data) { ans - array(target_data) dimnames(ans) - list(as.character(target_names)) names(dimnames(ans)) - x class(ans) - table ans } .check_identical - function(target, current, varname, extra_args) { if (identical(target, current)) return() if (extra_args != ) extra_args - paste0(, , extra_args) cat(unexpected result for ', FUN_NAME, (x=, varname, extra_args, )'\n, sep=) } .test_exclude - function(x, varname, target_names0, target_data0, exclude) { extra_args - paste0(exclude=, deparse(exclude)) current - FUN(x=x, exclude=exclude) target_names - target_names0 target_data - target_data0 if (is.null(exclude)) { if (!any(is.na(target_names))) { target_names - c(target_names, NA) target_data - c(target_data, 0L) } } else { if (!is.factor(x)) { exclude - as.vector(exclude, typeof(x)) } else if (!any(is.na(levels(x { exclude - union(exclude, NA) } exclude_idx - match(exclude, target_names, nomatch=0L) if (any(exclude_idx != 0L)) { target_names - target_names[-exclude_idx] target_data - target_data[-exclude_idx] } } target - .make_target(target_names, target_data) .check_identical(target, current, varname, extra_args) } .do_exclude_tests - function(x, varname, target_names0, target_data0, more_excludes=NULL) { .BASIC_EXCLUDES - list(c(NA, NaN), NULL, numeric(0), NA, NaN) excludes - c(.BASIC_EXCLUDES, more_excludes) for (exclude in excludes) .test_exclude(x, varname, target_names0, target_data0, exclude) } ## Test on a numeric vector. x0 - numeric(0) .do_exclude_tests(x0, x0, character(0), integer(0), list(5.3)) x1_target_names0 - c(-9, 4, 5.3, NaN, NA) x1_target_data0 - c(1L, 2L, 1L, 2L, 3L) x1 -