Generating a model matrix with very large numbers of columns overflows
the stack and/or runs very slowly, due to the implementation of
TrimRepeats().

This patch modifies it to use Rf_duplicated() to find the duplicates.
This makes the running time linear in the number of columns and
eliminates the recursive function calls.

Thanks
Index: src/library/stats/src/model.c
===================================================================
--- src/library/stats/src/model.c	(revision 70230)
+++ src/library/stats/src/model.c	(working copy)
@@ -1259,11 +1259,12 @@
 
 static int TermZero(SEXP term)
 {
-    int i, val;
-    val = 1;
-    for (i = 0; i < nwords; i++)
-	val = val && (INTEGER(term)[i] == 0);
-    return val;
+    for (int i = 0; i < nwords; i++) {
+        if (INTEGER(term)[i] != 0) {
+            return 0;
+        }
+    }
+    return 1;
 }
 
 
@@ -1271,11 +1272,12 @@
 
 static int TermEqual(SEXP term1, SEXP term2)
 {
-    int i, val;
-    val = 1;
-    for (i = 0; i < nwords; i++)
-	val = val && (INTEGER(term1)[i] == INTEGER(term2)[i]);
-    return val;
+    for (int i = 0; i < nwords; i++) {
+        if (INTEGER(term1)[i] != INTEGER(term2)[i]) {
+            return 0;
+        }
+    }
+    return 1;
 }
 
 
@@ -1303,18 +1305,37 @@
 
 
 /* TrimRepeats removes duplicates of (bit string) terms 
-   in a model formula by repeated use of ``StripTerm''.
+   in a model formula.
    Also drops zero terms. */
 
 static SEXP TrimRepeats(SEXP list)
 {
-    if (list == R_NilValue)
-	return R_NilValue;
-    /* Highly recursive */
-    R_CheckStack();
-    if (TermZero(CAR(list)))
-	return TrimRepeats(CDR(list));
-    SETCDR(list, TrimRepeats(StripTerm(CAR(list), CDR(list))));
+    // Drop zero terms at the start of the list.
+    while (list != R_NilValue && TermZero(CAR(list))) {
+	list = CDR(list);
+    }
+    if (list == R_NilValue || CDR(list) == R_NilValue)
+	return list;
+
+    // Find out which terms are duplicates.
+    SEXP all_terms = PROTECT(Rf_PairToVectorList(list));
+    SEXP duplicate_sexp = PROTECT(Rf_duplicated(all_terms, FALSE));
+    int* is_duplicate = LOGICAL(duplicate_sexp);
+    int i = 0;
+
+    // Remove the zero terms and duplicates from the list.
+    for (SEXP current = list; CDR(current) != R_NilValue; i++) {
+	SEXP next = CDR(current);
+
+	if (is_duplicate[i + 1] || TermZero(CAR(next))) {
+	    // Remove the node from the list.
+	    SETCDR(current, CDR(next));
+	} else {
+	    current = next;
+	}
+    }
+
+    UNPROTECT(2);
     return list;
 }
 
______________________________________________
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

Reply via email to