tapply1 <- tapply <- function (X, INDEX, FUN = NULL, ..., simplify = TRUE) { FUN <- if (!is.null(FUN)) match.fun(FUN) if (!is.list(INDEX)) INDEX <- list(INDEX) nI <- length(INDEX) if (!nI) stop("'INDEX' is of length zero") namelist <- vector("list", nI) names(namelist) <- names(INDEX) extent <- integer(nI) nx <- length(X) one <- 1L group <- rep.int(one, nx) #- to contain the splitting vector ngroup <- one for (i in seq_along(INDEX)) { index <- as.factor(INDEX[[i]]) if (length(index) != nx) stop("arguments must have same length") namelist[[i]] <- levels(index)#- all of them, yes ! extent[i] <- nlevels(index) group <- group + ngroup * (as.integer(index) - one) ngroup <- ngroup * nlevels(index) } if (is.null(FUN)) return(group) ans <- lapply(X = split(X, group), FUN = FUN, ...) index <- as.integer(names(ans)) if (simplify && all(lengths(ans) == 1L)) { ansmat <- array(dim = extent, dimnames = namelist) ans <- unlist(ans, recursive = FALSE) } else { ansmat <- array(vector("list", prod(extent)), dim = extent, dimnames = namelist) } if(length(index)) { names(ans) <- NULL ansmat[index] <- ans } ansmat } tapply5 <- tapply <- function (X, INDEX, FUN = NULL, ..., simplify = TRUE) { FUN <- if (!is.null(FUN)) match.fun(FUN) if (!is.list(INDEX)) INDEX <- list(INDEX) INDEX <- lapply(INDEX, as.factor) nI <- length(INDEX) # now, 'INDEX' is not classed if (!nI) stop("'INDEX' is of length zero") if (!all(lengths(INDEX) == length(X))) stop("arguments must have same length") namelist <- lapply(INDEX, levels)#- all of them, yes ! extent <- lengths(namelist, use.names = FALSE) cumextent <- cumprod(extent) if (cumextent[nI] > .Machine$integer.max) stop("total number of levels >= 2^31") storage.mode(cumextent) <- "integer" ngroup <- cumextent[nI] group <- as.integer(INDEX[[1L]]) #- to contain the splitting vector if (nI > 1L) for (i in 2L:nI) group <- group + cumextent[i - 1L] * (as.integer(INDEX[[i]]) - 1L) if (is.null(FUN)) return(group) levels(group) <- as.character(seq_len(ngroup)) class(group) <- "factor" ans <- split(X, group) # use generic, e.g. for 'Date' names(ans) <- NULL index <- as.logical(lengths(ans)) # equivalently, lengths(ans) > 0L ans <- lapply(X = ans[index], FUN = FUN, ...) if (simplify && all(lengths(ans) == 1L)) { ansmat <- array(dim = extent, dimnames = namelist) ans <- unlist(ans, recursive = FALSE) } else { ansmat <- array(vector("list", prod(extent)), dim = extent, dimnames = namelist) } if(length(ans)) { ansmat[index] <- ans } ansmat } ## Use byte-compiled versions to be "realistic": require(compiler) tapply1 <- cmpfun(tapply1) tapply5 <- cmpfun(tapply5) set.seed(17) n <- 10000 k1 <- 12; k2 <- 7 x <- rnorm(n) f1 <- factor(sample(k1, size=n, replace=TRUE)) f2 <- factor(sample(k2, size=n, replace=TRUE)) I <- list(f1, f2) I3 <- list(f1, f2, factor(sample(47, size=n, replace=TRUE))) ## plus extreme case for triggering integer overflow ## 2^32 = 4 ^ 16 I32 <- lapply(1:16, function(i) factor(sample(4, size=n, replace=TRUE))) memory.limit() for (nI in 7:11) print(system.time(tapply1(x, I32[1:nI], sum))) for (nI in 7:11) print(system.time(tapply5(x, I32[1:nI], sum))) sessionInfo()