Generating a model matrix with very large numbers of columns overflows the stack and/or runs very slowly, due to the implementation of TrimRepeats().
This patch modifies it to use Rf_duplicated() to find the duplicates. This makes the running time linear in the number of columns and eliminates the recursive function calls. Thanks
Index: src/library/stats/src/model.c =================================================================== --- src/library/stats/src/model.c (revision 70230) +++ src/library/stats/src/model.c (working copy) @@ -1259,11 +1259,12 @@ static int TermZero(SEXP term) { - int i, val; - val = 1; - for (i = 0; i < nwords; i++) - val = val && (INTEGER(term)[i] == 0); - return val; + for (int i = 0; i < nwords; i++) { + if (INTEGER(term)[i] != 0) { + return 0; + } + } + return 1; } @@ -1271,11 +1272,12 @@ static int TermEqual(SEXP term1, SEXP term2) { - int i, val; - val = 1; - for (i = 0; i < nwords; i++) - val = val && (INTEGER(term1)[i] == INTEGER(term2)[i]); - return val; + for (int i = 0; i < nwords; i++) { + if (INTEGER(term1)[i] != INTEGER(term2)[i]) { + return 0; + } + } + return 1; } @@ -1303,18 +1305,37 @@ /* TrimRepeats removes duplicates of (bit string) terms - in a model formula by repeated use of ``StripTerm''. + in a model formula. Also drops zero terms. */ static SEXP TrimRepeats(SEXP list) { - if (list == R_NilValue) - return R_NilValue; - /* Highly recursive */ - R_CheckStack(); - if (TermZero(CAR(list))) - return TrimRepeats(CDR(list)); - SETCDR(list, TrimRepeats(StripTerm(CAR(list), CDR(list)))); + // Drop zero terms at the start of the list. + while (list != R_NilValue && TermZero(CAR(list))) { + list = CDR(list); + } + if (list == R_NilValue || CDR(list) == R_NilValue) + return list; + + // Find out which terms are duplicates. + SEXP all_terms = PROTECT(Rf_PairToVectorList(list)); + SEXP duplicate_sexp = PROTECT(Rf_duplicated(all_terms, FALSE)); + int* is_duplicate = LOGICAL(duplicate_sexp); + int i = 0; + + // Remove the zero terms and duplicates from the list. + for (SEXP current = list; CDR(current) != R_NilValue; i++) { + SEXP next = CDR(current); + + if (is_duplicate[i + 1] || TermZero(CAR(next))) { + // Remove the node from the list. + SETCDR(current, CDR(next)); + } else { + current = next; + } + } + + UNPROTECT(2); return list; }
______________________________________________ R-devel@r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-devel