Re: [Rd] a fast table() for the 1D case

2013-09-16 Thread Hervé Pagès

Any chance some improvements can be made on table()?

table() is probably one of the most used R functions when working
interactively. Unfortunately it can be incredibly slow, especially
on a logical vector where a simple sum() is hundred times faster
(I actually got into the habit of using sum() instead of table()).
The table1D() proposal below doesn't go as far as using sum() on a
logical vector but it already provides significant speedups for most
use cases.

Thanks,
H.


On 08/09/2013 01:19 AM, Hervé Pagès wrote:

Hi,

table1D() below can be up to 60x faster than base::table() for the 1D
case. Here are the detailed speedups compared to base::table().

   o With a logical vector of length 5M: 11x faster
 (or more if 'useNA=always')

   o With factor/integer/numeric/character of length 1M and 9 levels
 (or 9 distinct values for non-factors):
  - factor:  60x faster
  - integer/numeric vector:  12x faster
  - character vector:   2.4x faster

   o With factor/integer/numeric/character of length 1M and no
 duplicates:
   - factor:  5x faster
   - integer vector:  2x faster
   - numeric vector:1.7x faster
   - character vector:   no significant speedup

Would be great if this improvement could make it into base::table().

Thanks,
H.

   ## A fast table() implementation for the 1D case (replacing the '...'
   ## arg with 'x' and omitting the 'dnn' and 'deparse.level' arguments
   ## which are unrelated to performance).

   table1D - function(x, exclude = if (useNA == no) c(NA, NaN),
   useNA = c(no, ifany, always))
   {
 if (!missing(exclude)  is.null(exclude)) {
 useNA - always
 } else {
 useNA - match.arg(useNA)
 }
 if (useNA == always  !missing(exclude))
 exclude - setdiff(exclude, NA)
 if (is.factor(x)) {
 x2 - levels(x)
 append_NA - (useNA == always ||
   useNA == ifany  any(is.na(x))) 
  !any(is.na(x2))
 if (append_NA) {
 x2 - c(x2, NA)
 x - factor(x, levels=x2, exclude=NULL)
 }
 t2 - tabulate(x, nbins=length(x2))
 if (!is.null(exclude)) {
 keep_idx - which(!(x2 %in% exclude))
 x2 - x2[keep_idx]
 t2 - t2[keep_idx]
 }
 } else {
 xx - match(x, x)
 t - tabulate(xx, nbins=length(xx))
 keep_idx - which(t != 0L)
 x2 - x[keep_idx]
 t2 - t[keep_idx]
 if (!is.null(exclude)) {
 exclude - as.vector(exclude, typeof(x))
 keep_idx - which(!(x2 %in% exclude))
 x2 - x2[keep_idx]
 t2 - t2[keep_idx]
 }
 oo - order(x2)
 x2 - x2[oo]
 t2 - t2[oo]
 append_NA - useNA == always  !any(is.na(x2))
 if (append_NA) {
 x2 - c(x2, NA)
 t2 - c(t2, 0L)
 }
 }
 ans - array(t2)
 dimnames(ans) - list(as.character(x2))
 names(dimnames(ans)) - x  # always set to 'x'
 class(ans) - table
 ans
   }

table1D() also fixes some issues with base::table() that can be exposed
by running the tests below.

   test_table - function(FUN_NAME)
   {
 FUN - match.fun(FUN_NAME)

 .make_target - function(target_names, target_data)
 {
 ans - array(target_data)
 dimnames(ans) - list(as.character(target_names))
 names(dimnames(ans)) - x
 class(ans) - table
 ans
 }

 .check_identical - function(target, current, varname, extra_args)
 {
 if (identical(target, current))
 return()
 if (extra_args != )
 extra_args - paste0(, , extra_args)
 cat(unexpected result for ', FUN_NAME,
 (x=, varname, extra_args, )'\n, sep=)
 }

 .test_exclude - function(x, varname, target_names0, target_data0,
exclude)
 {
 extra_args - paste0(exclude=, deparse(exclude))
 current - FUN(x=x, exclude=exclude)
 target_names - target_names0
 target_data - target_data0
 if (is.null(exclude)) {
 if (!any(is.na(target_names))) {
 target_names - c(target_names, NA)
 target_data - c(target_data, 0L)
 }
 } else {
 if (!is.factor(x)) {
 exclude - as.vector(exclude, typeof(x))
 } else if (!any(is.na(levels(x {
 exclude - union(exclude, NA)
 }
 exclude_idx - match(exclude, target_names, nomatch=0L)
 if (any(exclude_idx != 0L)) {
 target_names - target_names[-exclude_idx]
 target_data - target_data[-exclude_idx]
 }
 }
 target - .make_target(target_names, target_data)
 .check_identical(target, 

[Rd] a fast table() for the 1D case

2013-08-09 Thread Hervé Pagès

Hi,

table1D() below can be up to 60x faster than base::table() for the 1D
case. Here are the detailed speedups compared to base::table().

  o With a logical vector of length 5M: 11x faster
(or more if 'useNA=always')

  o With factor/integer/numeric/character of length 1M and 9 levels
(or 9 distinct values for non-factors):
 - factor:  60x faster
 - integer/numeric vector:  12x faster
 - character vector:   2.4x faster

  o With factor/integer/numeric/character of length 1M and no
duplicates:
  - factor:  5x faster
  - integer vector:  2x faster
  - numeric vector:1.7x faster
  - character vector:   no significant speedup

Would be great if this improvement could make it into base::table().

Thanks,
H.

  ## A fast table() implementation for the 1D case (replacing the '...'
  ## arg with 'x' and omitting the 'dnn' and 'deparse.level' arguments
  ## which are unrelated to performance).

  table1D - function(x, exclude = if (useNA == no) c(NA, NaN),
  useNA = c(no, ifany, always))
  {
if (!missing(exclude)  is.null(exclude)) {
useNA - always
} else {
useNA - match.arg(useNA)
}
if (useNA == always  !missing(exclude))
exclude - setdiff(exclude, NA)
if (is.factor(x)) {
x2 - levels(x)
append_NA - (useNA == always ||
  useNA == ifany  any(is.na(x))) 
 !any(is.na(x2))
if (append_NA) {
x2 - c(x2, NA)
x - factor(x, levels=x2, exclude=NULL)
}
t2 - tabulate(x, nbins=length(x2))
if (!is.null(exclude)) {
keep_idx - which(!(x2 %in% exclude))
x2 - x2[keep_idx]
t2 - t2[keep_idx]
}
} else {
xx - match(x, x)
t - tabulate(xx, nbins=length(xx))
keep_idx - which(t != 0L)
x2 - x[keep_idx]
t2 - t[keep_idx]
if (!is.null(exclude)) {
exclude - as.vector(exclude, typeof(x))
keep_idx - which(!(x2 %in% exclude))
x2 - x2[keep_idx]
t2 - t2[keep_idx]
}
oo - order(x2)
x2 - x2[oo]
t2 - t2[oo]
append_NA - useNA == always  !any(is.na(x2))
if (append_NA) {
x2 - c(x2, NA)
t2 - c(t2, 0L)
}
}
ans - array(t2)
dimnames(ans) - list(as.character(x2))
names(dimnames(ans)) - x  # always set to 'x'
class(ans) - table
ans
  }

table1D() also fixes some issues with base::table() that can be exposed
by running the tests below.

  test_table - function(FUN_NAME)
  {
FUN - match.fun(FUN_NAME)

.make_target - function(target_names, target_data)
{
ans - array(target_data)
dimnames(ans) - list(as.character(target_names))
names(dimnames(ans)) - x
class(ans) - table
ans
}

.check_identical - function(target, current, varname, extra_args)
{
if (identical(target, current))
return()
if (extra_args != )
extra_args - paste0(, , extra_args)
cat(unexpected result for ', FUN_NAME,
(x=, varname, extra_args, )'\n, sep=)
}

.test_exclude - function(x, varname, target_names0, target_data0, 
exclude)

{
extra_args - paste0(exclude=, deparse(exclude))
current - FUN(x=x, exclude=exclude)
target_names - target_names0
target_data - target_data0
if (is.null(exclude)) {
if (!any(is.na(target_names))) {
target_names - c(target_names, NA)
target_data - c(target_data, 0L)
}
} else {
if (!is.factor(x)) {
exclude - as.vector(exclude, typeof(x))
} else if (!any(is.na(levels(x {
exclude - union(exclude, NA)
}
exclude_idx - match(exclude, target_names, nomatch=0L)
if (any(exclude_idx != 0L)) {
target_names - target_names[-exclude_idx]
target_data - target_data[-exclude_idx]
}
}
target - .make_target(target_names, target_data)
.check_identical(target, current, varname, extra_args)
}

.do_exclude_tests - function(x, varname, target_names0, target_data0,
  more_excludes=NULL)
{
.BASIC_EXCLUDES - list(c(NA, NaN), NULL, numeric(0), NA, NaN)
excludes - c(.BASIC_EXCLUDES, more_excludes)
for (exclude in excludes)
.test_exclude(x, varname, target_names0, target_data0, exclude)
}

## Test on a numeric vector.
x0 - numeric(0)
.do_exclude_tests(x0, x0, character(0), integer(0), list(5.3))

x1_target_names0 - c(-9, 4, 5.3, NaN, NA)
x1_target_data0 - c(1L, 2L, 1L, 2L, 3L)
x1 -