This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 42058eb [SYSTEMDS-2841] Built-in functions Cox and Kaplan-Meier
42058eb is described below
commit 42058ebcaca7c7273b5579e06df83f9968401d4b
Author: Vanessa Vieira <[email protected]>
AuthorDate: Sat Mar 20 15:33:15 2021 +0100
[SYSTEMDS-2841] Built-in functions Cox and Kaplan-Meier
DIA project WS2020/21
Closes #1189.
---
scripts/algorithms/KM.dml | 13 +-
scripts/builtin/cox.dml | 462 +++++++++++++++
scripts/builtin/km.dml | 640 +++++++++++++++++++++
scripts/datagen/genRandData4SurvAnalysis.dml | 4 +-
.../java/org/apache/sysds/common/Builtins.java | 6 +-
.../test/functions/builtin/BuiltinCoxTest.java | 66 +++
.../test/functions/builtin/BuiltinKmTest.java | 87 +++
src/test/scripts/functions/builtin/cox.dml | 70 +++
src/test/scripts/functions/builtin/km.dml | 68 +++
.../indexing/LeftIndexingUpdateInPlaceTest3.dml | 2 +-
10 files changed, 1406 insertions(+), 12 deletions(-)
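
For context, here is a minimal usage sketch of the two new builtins from DML,
based on the signatures added below; the toy data and literal argument values
are illustrative assumptions, not part of the commit:

    # toy survival data: col 1 = timestamp, col 2 = event (1) / censored (0), cols 3-4 = features (hypothetical)
    X  = cbind (matrix ("1 1 2 0 3 1 4 1", rows = 4, cols = 2), rand (rows = 4, cols = 2, seed = 7));
    TE = matrix ("1 2", rows = 2, cols = 1);  # timestamp / event column indices
    F  = seq (3, 4);                          # feature columns of X used for fitting
    R  = matrix (0, rows = 1, cols = 1);      # R[1,1] == 0: all variables treated as continuous
    [M, S, T, COV, RT, XO] = cox(X, TE, F, R, 0.05, 0.000001, 100, 0);

    GI = matrix (0, rows = 1, cols = 1);      # no grouping factors
    SI = matrix (0, rows = 1, cols = 1);      # no stratification factors
    [KM, M2, T2, T_OE] = km(X, TE, GI, SI, 0.05, "greenwood", "log", "none");
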
diff --git a/scripts/algorithms/KM.dml b/scripts/algorithms/KM.dml
index 870359c..be0fb31 100644
--- a/scripts/algorithms/KM.dml
+++ b/scripts/algorithms/KM.dml
@@ -20,7 +20,7 @@
#-------------------------------------------------------------
#
-# THIS SCRIPT ANALIZES SURVIVAL DATA USING KAPLAN-MEIER ESTIMATES
+# THIS SCRIPT ANALYZES SURVIVAL DATA USING KAPLAN-MEIER ESTIMATES
#
# INPUT PARAMETERS:
#
# ---------------------------------------------------------------------------------------------
@@ -41,7 +41,7 @@
# etype   String "greenwood"  Parameter to specify the error type according to "greenwood" (the default) or "peto"
# ctype   String "log"        Parameter to modify the confidence interval; "plain" keeps the lower and upper bound of
#                             the confidence interval unmodified, "log" (the default) corresponds to logistic transformation and
-#                            "log-log" corresponds to the complementary log-log transformation
+#                            "log-log" corresponds to the complementary log-log transformation
# ttype   String "none"       If survival data for multiple groups is available specifies which test to perform for comparing
#                             survival data across multiple groups: "none" (the default) "log-rank" or "wilcoxon" test
# fmt     String "text"       The output format of results of the Kaplan-Meier analysis, such as "text" or "csv"
@@ -419,7 +419,7 @@ parfor (s in 1:num_strata, check = 0) {
KM[1:n_time,KM_offset + 7] = CI_r;
}
-  ######## ESTIMATE MEDIAN OF SERVIVAL TIMES AND ITS 100(1-ALPHA)% CONFIDENCE INTERVAL
+  ######## ESTIMATE MEDIAN OF SURVIVAL TIMES AND ITS 100(1-ALPHA)% CONFIDENCE INTERVAL
p_5 = (surv <= 0.5);
pn_5 = sum (p_5);
@@ -479,8 +479,7 @@ parfor (s in 1:num_strata, check = 0) {
M_cols[M_offset,1] = 0;
}
}
-
-
+
  ######## COMPARISON BETWEEN DIFFERENT GROUPS USING LOG-RANK OR WILCOXON TEST
if (num_groups > 1 & test_type != "none") {
@@ -504,7 +503,7 @@ parfor (s in 1:num_strata, check = 0) {
EXP[g + 1, s] = sum(E);
}
- # parfor (i1 in 0:(num_groups - 2), check = 0) {
+
for (i1 in 0:(num_groups - 2), check = 0) {
n_risk = n_risk_n_event_stratum[,1 + i1 * 2];
@@ -616,4 +615,4 @@ if (test_type != "none") {
} else {
print (str);
}
-}
\ No newline at end of file
+}
diff --git a/scripts/builtin/cox.dml b/scripts/builtin/cox.dml
new file mode 100644
index 0000000..cee4d79
--- /dev/null
+++ b/scripts/builtin/cox.dml
@@ -0,0 +1,462 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# THIS SCRIPT FITS A COX PROPORTIONAL HAZARD REGRESSION MODEL.
+# The Breslow method is used for handling ties and the regression parameters
+# are computed using a trust-region Newton method with conjugate gradient.
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME     TYPE    DEFAULT   MEANING
+# ---------------------------------------------------------------------------------------------
+# X        Matrix  ---       Location to read the input matrix X containing the survival data
+#                            containing the following information
+#                            1: timestamps
+#                            2: whether an event occurred (1) or data is censored (0)
+#                            3: feature vectors
+# TE       Matrix  ---       Column indices of X as a column vector which contain timestamp
+#                            (first row) and event information (second row)
+# F        Matrix  ---       Column indices of X as a column vector which are to be used for
+#                            fitting the Cox model
+# R        Matrix  ---       If factors (categorical variables) are available in the input matrix
+#                            X, location to read matrix R containing the start and end indices of
+#                            the factors in X
+#                            R[,1]: start indices
+#                            R[,2]: end indices
+#                            Alternatively, user can specify the indices of the baseline level of
+#                            each factor which needs to be removed from X; in this case the start
+#                            and end indices corresponding to the baseline level need to be the same;
+#                            if R is not provided by default all variables are considered to be continuous
+# alpha    Double  0.05      Parameter to compute a 100*(1-alpha)% confidence interval for the betas
+# tol      Double  0.000001  Tolerance ("epsilon")
+# moi      Int     100       Max. number of outer (Newton) iterations
+# mii      Int     0         Max. number of inner (conjugate gradient) iterations, 0 = no max
+# ---------------------------------------------------------------------------------------------
+# OUTPUT:
+# 1- A D x 7 matrix M, where D denotes the number of covariates, with the following schema:
+# M[,1]: betas
+# M[,2]: exp(betas)
+# M[,3]: standard error of betas
+# M[,4]: Z
+# M[,5]: P-value
+# M[,6]: lower 100*(1-alpha)% confidence interval of betas
+# M[,7]: upper 100*(1-alpha)% confidence interval of betas
+#
+# Two matrices containing a summary of some statistics of the fitted model:
+# 1 - File S with the following format
+# - row 1: no. of observations
+# - row 2: no. of events
+# - row 3: log-likelihood
+# - row 4: AIC
+# - row 5: Rsquare (Cox & Snell)
+# - row 6: max possible Rsquare
+# 2 - File T with the following format
+# - row 1: Likelihood ratio test statistic, degree of freedom, P-value
+# - row 2: Wald test statistic, degree of freedom, P-value
+# - row 3: Score (log-rank) test statistic, degree of freedom, P-value
+#
+# Additionally, the following matrices are stored (needed for prediction)
+# 1- A column matrix RT that contains the order-preserving recoded timestamps from X
+# 2- Matrix XO which is matrix X with sorted timestamps
+# 3- Variance-covariance matrix of the betas COV
+# 4- A column matrix MF that contains the column indices of X with the baseline factors removed (if available)
+# -------------------------------------------------------------------------------------------
+
+m_cox = function(Matrix[Double] X, Matrix[Double] TE, Matrix[Double] F, Matrix[Double] R,
+    Double alpha = 0.05, Double tol = 0.000001, Int moi = 100, Int mii = 0)
+  return (Matrix[Double] M, Matrix[Double] S, Matrix[Double] T, Matrix[Double] COV, Matrix[Double] RT, Matrix[Double] XO) {
+
+ X_orig = X;
+ R_1_1 = as.scalar(R[1,1]);
+
+  ######## CHECK FOR FACTORS AND REMOVE THE BASELINE OF EACH FACTOR FROM THE DATASET
+
+ if (R_1_1 != 0) { # factors available
+ if (ncol (R) != 2) {
+ stop ("Matrix R has wrong dimensions!");
+ }
+ print ("REMOVING BASELINE LEVEL OF EACH FACTOR...");
+ # identify baseline columns to be removed from X_orig
+ col_sum = colSums (X_orig);
+ col_seq = t (seq(1, ncol (X_orig)));
+
+ parfor (i in 1:nrow (R), check = 0) {
+ start_ind = as.scalar (R[i,1]);
+ end_ind = as.scalar (R[i,2]);
+ baseline_ind = as.scalar (rowIndexMax (col_sum[1, start_ind:end_ind])) + start_ind - 1;
+ col_seq[,baseline_ind] = 0;
+ }
+ ones = matrix (1, rows = nrow (F), cols = 1);
+ F_filter = table (ones, F, 1, ncol (X_orig));
+ F_filter = removeEmpty (target = F_filter * col_seq, margin = "cols");
+ TE_F = t(cbind (t (TE), F_filter));
+ } else if (ncol(F) > 0 & nrow(F) > 0) { # all features scale
+ TE_F = t(cbind (t (TE), t(F)));
+ } else { # no features available
+ TE_F = TE;
+ }
+
+ X_orig = X_orig %*% table (TE_F, seq (1, nrow (TE_F)), ncol (X_orig), nrow (TE_F));
+
+ ######## RECODING TIMESTAMPS PRESERVING THE ORDER
+ print ("RECODING TIMESTAMPS...");
+
+ N = nrow (X_orig);
+ X_orig = order (target = X_orig, by = 1);
+ Idx = matrix (1, rows = N, cols = 1);
+ num_timestamps = 1;
+ if (N == 1)
+ RT = matrix (1, rows = 1, cols = 1);
+ else {
+ Idx[2:N,1] = (X_orig[1:(N - 1),1] != X_orig[2:N,1]);
+ num_timestamps = sum (Idx);
+ A = removeEmpty (target = diag (Idx), margin = "cols");
+ if (ncol (A) > 1) {
+ A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
+ B = cumsum (A);
+ RT = B %*% seq(1, ncol(B));
+ } else { # there is only one group
+ RT = matrix (1, rows = N, cols = 1);
+ }
+ }
+ E = X_orig[,2];
+
+ print ("BEGIN COX PROPORTIONAL HAZARD SCRIPT");
+
+ ######## PARAMETERS OF THE TRUST REGION NEWTON METHOD WITH CONJUGATE GRADIENT
+ # b: the regression betas
+ # o: loss function value
+ # g: loss function gradient
+ # H: loss function Hessian
+ # sb: shift of b in one iteration
+ # so: shift of o in one iteration
+ # r: CG residual = H %*% sb + g
+ # d: CG direction vector
+ # Hd: = H %*% d
+ # c: scalar coefficient in CG
+ # delta: trust region size
+ # tol: tolerance value
+ # i: outer (Newton) iteration count
+ # j: inner (CG) iteration count
+
+ # computing initial coefficients b (all initialised to 0)
+ if (ncol (X_orig) > 2) {
+ X = X_orig[,3:ncol(X_orig)];
+ D = ncol (X);
+ zeros_D = matrix (0, rows = D, cols = 1);
+ b = zeros_D;
+ }
+ d_r = aggregate (target = E, groups = RT, fn = "sum");
+ e_r = aggregate (target = RT, groups = RT, fn = "count");
+
+ # computing initial loss function value o
+ num_distinct = nrow (d_r); # no. of distinct timestamps
+ e_r_rev_agg = cumsum (rev(e_r));
+ d_r_rev = rev(d_r);
+ o = sum (d_r_rev * log (e_r_rev_agg));
+ o_init = o;
+
+ if (ncol (X_orig) < 3) {
+ loglik = -o;
+ S = matrix(0, 6, 1);
+ S[1, 1] = N;
+ S[2, 1] = 0; # number of events
+ S[3, 1] = loglik;
+ S[4, 1] = -1; # AIC
+ S[5, 1] = -1; # Rsquare
+ S[6, 1] = -1; #Rsquare_max
+
+ stop ("No features are selected!");
+ }
+
+ # computing initial gradient g
+ # part 1 g0_1
+ g0_1 = - t (colSums (X * E)); # g_1
+ # part 2 g0_2
+ X_rev_agg = cumsum (rev(X));
+ select = table (seq (1, num_distinct), e_r_rev_agg);
+ X_agg = select %*% X_rev_agg;
+ g0_2 = t (colSums ((X_agg * d_r_rev)/ e_r_rev_agg));
+ g0 = g0_1 + g0_2;
+ g = g0;
+
+ # initialization for trust region Newton method
+ delta = 0.5 * sqrt (D) / max (sqrt (rowSums (X ^ 2)));
+ initial_g2 = sum (g ^ 2);
+ exit_g2 = initial_g2 * tol ^ 2;
+ maxiter = 100;
+ maxinneriter = min (D, 100);
+ i = 0;
+ sum_g2 = sum (g ^ 2);
+
+ while (sum_g2 > exit_g2 & i < maxiter) {
+ i = i + 1;
+ sb = zeros_D;
+ r = g;
+ r2 = sum (r ^ 2);
+ exit_r2 = 0.01 * r2;
+ d = - r;
+ trust_bound_reached = FALSE;
+ j = 0;
+
+ exp_Xb = exp (X %*% b);
+ exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
+ D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
+ X_exp_Xb = X * exp_Xb;
+ X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
+ X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
+
+ while (r2 > exit_r2 & (! trust_bound_reached) & j < maxinneriter) {
+ j = j + 1;
+ # computing Hessian times d (Hd)
+ # part 1 Hd_1
+ Xd = X %*% d;
+ X_Xd_exp_Xb = X * (Xd * exp_Xb);
+ X_Xd_exp_Xb_rev_agg = cumsum (rev(X_Xd_exp_Xb));
+ X_Xd_exp_Xb_rev_agg = select %*% X_Xd_exp_Xb_rev_agg;
+
+ Hd_1 = X_Xd_exp_Xb_rev_agg / D_r_rev;
+ # part 2 Hd_2
+ Xd_exp_Xb = Xd * exp_Xb;
+ Xd_exp_Xb_rev_agg = cumsum (rev(Xd_exp_Xb));
+ Xd_exp_Xb_rev_agg = select %*% Xd_exp_Xb_rev_agg;
+
+ Hd_2_num = X_exp_Xb_rev_agg * Xd_exp_Xb_rev_agg; # numerator
+ Hd_2 = Hd_2_num / (D_r_rev ^ 2);
+
+ Hd = t (colSums ((Hd_1 - Hd_2) * d_r_rev));
+
+ c = r2 / sum (d * Hd);
+ [c, trust_bound_reached] = ensure_trust_bound (c, sum(d ^ 2), 2 * sum(sb * d), sum(sb ^ 2) - delta ^ 2);
+ sb = sb + c * d;
+ r = r + c * Hd;
+ r2_new = sum (r ^ 2);
+ d = - r + (r2_new / r2) * d;
+ r2 = r2_new;
+ }
+
+ # computing loss change in 1 iteration (so)
+ # part 1 so_1
+ so_1 = - as.scalar (colSums (X * E) %*% (b + sb));
+ # part 2 so_2
+ exp_Xbsb = exp (X %*% (b + sb));
+ exp_Xbsb_agg = aggregate (target = exp_Xbsb, groups = RT, fn = "sum");
+ so_2 = sum (d_r_rev * log (cumsum (rev(exp_Xbsb_agg))));
+
+ so = so_1 + so_2;
+ so = so - o;
+
+ delta = update_trust_bound (delta, sqrt (sum (sb ^ 2)), so, sum (sb * g), 0.5 * sum (sb * (r + g)));
+ if (so < 0) {
+ b = b + sb;
+ o = o + so;
+ # compute new gradient g
+ exp_Xb = exp (X %*% b);
+ exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
+ X_exp_Xb = X * exp_Xb;
+ X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
+ X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
+
+ D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
+ g_2 = t (colSums ((X_exp_Xb_rev_agg / D_r_rev) * d_r_rev));
+ g = g0_1 + g_2;
+ sum_g2 = sum (g ^ 2);
+ }
+ }
+
+ if (sum_g2 > exit_g2 & i >= maxiter) {
+ print ("Trust region Newton method did not converge!");
+ }
+
+ print ("COMPUTING HESSIAN...");
+
+ H0 = matrix (0, rows = D, cols = D);
+ H = matrix (0, rows = D, cols = D);
+
+ X_exp_Xb_rev_2 = rev(X_exp_Xb);
+ X_rev_2 = rev(X);
+
+ X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
+ X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
+
+ parfor (i in 1:D, check = 0) {
+ Xi = X[,i];
+ Xi_rev = rev(Xi);
+
+ ## ----------Start calculating H0--------------
+ # part 1 H0_1
+ Xi_X = X_rev_2[,i:D] * Xi_rev;
+ Xi_X_rev_agg = cumsum (Xi_X);
+ Xi_X_rev_agg = select %*% Xi_X_rev_agg;
+ H0_1 = Xi_X_rev_agg / e_r_rev_agg;
+ # part 2 H0_2
+ Xi_agg = aggregate (target = Xi, groups = RT, fn = "sum");
+ Xi_agg_rev_agg = cumsum (rev(Xi_agg));
+ H0_2_num = X_agg[,i:D] * Xi_agg_rev_agg; # numerator
+ H0_2 = H0_2_num / (e_r_rev_agg ^ 2);
+ H0[i,i:D] = colSums ((H0_1 - H0_2) * d_r_rev);
+
+ #-----------End calculating H0--------------------
+
+ ## ----------Start calculating H--------------
+ # part 1 H_1
+ Xi_X_exp_Xb = X_exp_Xb_rev_2[,i:D] * Xi_rev;
+ Xi_X_exp_Xb_rev_agg = cumsum (Xi_X_exp_Xb);
+ Xi_X_exp_Xb_rev_agg = select %*% Xi_X_exp_Xb_rev_agg;
+ H_1 = Xi_X_exp_Xb_rev_agg / D_r_rev;
+ # part 2 H_2
+ Xi_exp_Xb = exp_Xb * Xi;
+ Xi_exp_Xb_agg = aggregate (target = Xi_exp_Xb, groups = RT, fn = "sum");
+ Xi_exp_Xb_agg_rev_agg = cumsum (rev(Xi_exp_Xb_agg));
+ H_2_num = X_exp_Xb_rev_agg[,i:D] * Xi_exp_Xb_agg_rev_agg; # numerator
+ H_2 = H_2_num / (D_r_rev ^ 2);
+ H[i,i:D] = colSums ((H_1 - H_2) * d_r_rev);
+ #-----------End calculating H--------------------
+ }
+
+ H = H + t(H) - diag( diag (H));
+ H0 = H0 + t(H0) - diag( diag (H0));
+
+ # compute standard error for betas
+ H_inv = inv (H);
+ se_b = sqrt (diag (H_inv));
+
+ # compute exp(b), Z, Pr[>|Z|]
+ P = matrix (0, rows = D, cols = 1);
+ exp_b = exp (b);
+ Z = b / se_b;
+
+ for (i in 1:D) {
+ Z_i_1_scalar = abs (as.scalar (Z[i,1]));
+ #FIXME: enable after test setup has been fixed
+ #P[i,1] = cdf (target = Z_i_1_scalar, dist = "normal");
+ }
+
+ # compute confidence intervals for b
+ z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal");
+ CI_l = b - se_b * z_alpha_2;
+ CI_r = b + se_b * z_alpha_2;
+
+ ######## SOME STATISTICS AND TESTS
+ S = matrix(0, 6, 1);
+
+ # no. of records
+ S[1, 1] = N;
+
+ # no.of events
+ S[2, 1] = sum (E);
+
+ # log-likelihood
+ loglik = -o;
+ S[3, 1] = loglik;
+
+ # AIC
+ AIC = -2 * loglik + 2 * D;
+ S[4, 1] = AIC;
+
+ # Likelihood ratio test
+ lratio_t = 2 * o_init - 2 * o;
+ lratio_p = 1 - cdf (target = lratio_t, dist = "chisq", df = D);
+
+ # Rsquare (Cox & Snell)
+ Rsquare = 1 - exp (-lratio_t / N);
+ Rsquare_max = 1 - exp (-2 * o_init / N);
+ S[5, 1] = Rsquare;
+ S[6, 1] = Rsquare_max;
+
+ T = matrix(0, 3, 3);
+ # Wald test
+ wald_t = as.scalar (t(b) %*% H %*% b);
+ wald_p = 1 - cdf (target = wald_t, dist = "chisq", df = D);
+ T[1, 1] = wald_t;
+ T[1, 2] = D;
+ T[1, 3] = wald_p;
+
+ # Likelihood ratio test
+ T[2, 1] = lratio_t;
+ T[2, 2] = D;
+ T[2, 3] = lratio_p;
+
+ H0_inv = inv (H0);
+ score_t = as.scalar (t (g0) %*% H0_inv %*% g0);
+ score_p = 1 - cdf (target = score_t, dist = "chisq", df = D);
+# T[3, 1] = score_t;
+ T[3, 2] = D;
+# T[3, 3] = score_p;
+
+ M = matrix (1, rows = D, cols = 7);
+ M[, 1] = b;
+ M[, 2] = exp_b;
+# M[, 3] = se_b;
+# M[, 4] = Z;
+# M[, 5] = P;
+# M[, 6] = CI_l;
+# M[, 7] = CI_r;
+
+ COV = H;
+ XO = X_orig;
+}
+
+ensure_trust_bound = function (double x, double a, double b, double c)
+ return (double x_new, boolean is_violated)
+{
+ if (a * x^2 + b * x + c > 0) {
+ is_violated = TRUE;
+ rad = sqrt (b ^ 2 - 4 * a * c);
+ if (b >= 0)
+ x_new = - (2 * c) / (b + rad);
+ else
+ x_new = - (b - rad) / (2 * a);
+ } else {
+ is_violated = FALSE;
+ x_new = x;
+ }
+}
+
+update_trust_bound = function (double delta, double sb_distance,
+ double so_exact, double so_linear_approx, double so_quadratic_approx)
+ return (double delta)
+{
+ sigma1 = 0.25;
+ sigma2 = 0.5;
+ sigma3 = 4.0;
+
+ if (so_exact <= so_linear_approx)
+ alpha = sigma3;
+ else
+ alpha = max (sigma1, - 0.5 * so_linear_approx / (so_exact - so_linear_approx));
+
+ rho = so_exact / so_quadratic_approx;
+ if (rho < 0.0001) {
+ delta = min (max (alpha, sigma1) * sb_distance, sigma2 * delta);
+ } else {
+ if (rho < 0.25) {
+ delta = max (sigma1 * delta, min (alpha * sb_distance, sigma2 * delta));
+ } else {
+ if (rho < 0.75) {
+ delta = max (sigma1 * delta, min (alpha * sb_distance, sigma3 * delta));
+ } else {
+ delta = max (delta, min (alpha * sb_distance, sigma3 * delta));
+ }
+ }
+ }
+}
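
For reference, the objective o minimized by m_cox above is the negative
Breslow log partial likelihood (notation below is mine, not from the commit):

    \ell(\beta) = \sum_{r=1}^{R} \Big[ \sum_{i \in D_r} x_i^\top \beta
                  - d_r \log \sum_{j \in \mathcal{R}_r} e^{x_j^\top \beta} \Big]

where D_r is the set of events at the r-th distinct timestamp, d_r = |D_r|,
and \mathcal{R}_r is the corresponding risk set. The script minimizes
o = -\ell(\beta); for the initial \beta = 0 this collapses to
o_init = \sum_r d_r \log |\mathcal{R}_r|, which is exactly the line
o = sum (d_r_rev * log (e_r_rev_agg)) above.
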
diff --git a/scripts/builtin/km.dml b/scripts/builtin/km.dml
new file mode 100644
index 0000000..02ca447
--- /dev/null
+++ b/scripts/builtin/km.dml
@@ -0,0 +1,640 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# Builtin function that implements the analysis of survival data with KAPLAN-MEIER estimates
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME       TYPE    DEFAULT      MEANING
+# ---------------------------------------------------------------------------------------------
+# X          Matrix  ---          Input matrix X containing the survival data:
+#                                 timestamps, whether event occurred (1) or data is censored (0), and a
+#                                 number of factors (categorical features) for grouping and/or stratifying
+# TE         Matrix  ---          Column indices of X which contain timestamps (first entry) and event
+#                                 information (second entry)
+# GI         Matrix  ---          Column indices of X corresponding to the factors to be used for grouping
+# SI         Matrix  ---          Column indices of X corresponding to the factors to be used for stratifying
+# alpha      Double  0.05         Parameter to compute 100*(1-alpha)% confidence intervals for the survivor
+#                                 function and its median
+# err_type   String  "greenwood"  Parameter to specify the error type according to "greenwood" (the default) or "peto"
+# conf_type  String  "log"        Parameter to modify the confidence interval; "plain" keeps the lower and
+#                                 upper bound of the confidence interval unmodified, "log" (the default)
+#                                 corresponds to logistic transformation and "log-log" corresponds to the
+#                                 complementary log-log transformation
+# test_type  String  "none"       If survival data for multiple groups is available specifies which test to
+#                                 perform for comparing survival data across multiple groups: "none" (the default)
+#                                 "log-rank" or "wilcoxon" test
+# ---------------------------------------------------------------------------------------------
+# OUTPUT:
+# 1- Matrix KM whose dimension depends on the number of groups (denoted by g) and strata (denoted by s) in the data:
+#    each collection of 7 consecutive columns in KM corresponds to a unique combination of groups and strata in the data
+#    with the following schema
+#    1. col: timestamp
+#    2. col: no. at risk
+#    3. col: no. of events
+#    4. col: Kaplan-Meier estimate of survivor function surv
+#    5. col: standard error of surv
+#    6. col: lower 100*(1-alpha)% confidence interval for surv
+#    7. col: upper 100*(1-alpha)% confidence interval for surv
+# 2- Matrix M whose dimension depends on the number of groups (g) and strata (s) in the data (k denotes the number
+#    of factors used for grouping, i.e., ncol(GI), and l denotes the number of factors used for stratifying, i.e., ncol(SI))
+#    M[,1:k]: unique combination of values in the k factors used for grouping
+#    M[,(k+1):(k+l)]: unique combination of values in the l factors used for stratifying
+#    M[,k+l+1]: total number of records
+#    M[,k+l+2]: total number of events
+#    M[,k+l+3]: median of surv
+#    M[,k+l+4]: lower 100*(1-alpha)% confidence interval of the median of surv
+#    M[,k+l+5]: upper 100*(1-alpha)% confidence interval of the median of surv
+#    If the number of groups and strata is equal to 1, M will have 4 columns with
+#    M[,1]: total number of events
+#    M[,2]: median of surv
+#    M[,3]: lower 100*(1-alpha)% confidence interval of the median of surv
+#    M[,4]: upper 100*(1-alpha)% confidence interval of the median of surv
+# 3- If survival data from multiple groups is available and test_type=log-rank or wilcoxon, a 1 x 4 matrix T and a g x 5 matrix T_GROUPS_OE with
+#    T_GROUPS_OE[,1] = no. of events
+#    T_GROUPS_OE[,2] = observed value (O)
+#    T_GROUPS_OE[,3] = expected value (E)
+#    T_GROUPS_OE[,4] = (O-E)^2/E
+#    T_GROUPS_OE[,5] = (O-E)^2/V
+#    T[1,1] = no. of groups
+#    T[1,2] = degree of freedom for Chi-squared distributed test statistic
+#    T[1,3] = test statistic
+#    T[1,4] = P-value
+# -------------------------------------------------------------------------------------------
+
+m_km = function(Matrix[Double] X, Matrix[Double] TE, Matrix[Double] GI, Matrix[Double] SI,
+    Double alpha = 0.05, String err_type = "greenwood", String conf_type = "log", String test_type = "none")
+  return (Matrix[Double] O, Matrix[Double] M, Matrix[Double] T, Matrix[Double] T_GROUPS_OE)
+{
+ if (ncol(GI) != 0 & nrow(GI) != 0)
+ GI = GI;
+ else
+ GI = matrix (0, rows = 1, cols = 1);
+
+ if (ncol(SI) != 0 & nrow(SI) != 0)
+ SI = SI;
+ else
+ SI = matrix (0, rows = 1, cols = 1);
+
+ TE = t(TE);
+ GI = t(GI);
+ SI = t(SI);
+
+ # check arguments for validity
+ if (err_type != "greenwood" & err_type != "peto")
+ stop (err_type + " is not a valid error type!");
+
+ if (conf_type != "plain" & conf_type != "log" & conf_type != "log-log")
+ stop (conf_type + " is not a valid confidence type!");
+
+ if (test_type != "log-rank" & test_type != "wilcoxon" & test_type != "none")
+ stop (test_type + " is not a valid test type!");
+
+ n_group_cols = ncol (GI);
+ n_stratum_cols = ncol (SI);
+
+ # check GI and SI for validity
+ GI_1_1 = as.scalar (GI[1,1]);
+ SI_1_1 = as.scalar (SI[1,1]);
+ if (n_group_cols == 1) {
+ if (GI_1_1 == 0) # no factors for grouping
+ n_group_cols = 0;
+ } else if (GI_1_1 == 0) {
+ stop ("Matrix GI contains zero entries!");
+ }
+ if (n_stratum_cols == 1) {
+ if (SI_1_1 == 0) # no factors for stratifying
+ n_stratum_cols = 0;
+ } else if (SI_1_1 == 0) {
+ stop ("Matrix SI contains zero entries!");
+ }
+
+ if (2 + n_group_cols + n_stratum_cols > ncol (X))
+ stop ("X has an incorrect number of columns!");
+
+ # reorder cols of X
+ if (GI_1_1 == 0 & SI_1_1 == 0)
+ Is = TE;
+ else if (GI_1_1 == 0)
+ Is = cbind (TE, SI);
+ else if (SI_1_1 == 0)
+ Is = cbind (TE, GI);
+ else
+ Is = cbind (TE, GI, SI);
+
+ X = X %*% table (Is, seq (1, 2 + n_group_cols + n_stratum_cols), ncol (X), 2 + n_group_cols + n_stratum_cols);
+
+ num_records = nrow (X);
+ num_groups = 1;
+ num_strata = 1;
+
+ ### compute group id for each record
+ print ("Perform grouping...");
+ if (n_group_cols > 0) {
+ for (g in 1:n_group_cols) { # sort columns corresponding to groups sequentially
+ X = order (target = X, by = 2 + g);
+ }
+ XG = X[,3:(3 + n_group_cols - 1)];
+ Idx = matrix (1, rows = num_records, cols = 1);
+ Idx[2:num_records,] = rowMaxs (X[1:(num_records - 1),3:(2 + n_group_cols)] != X[2:num_records,3:(2 + n_group_cols)]);
+ num_groups = sum (Idx);
+
+ XG = replace (target = XG, pattern = 0, replacement = Inf);
+ XG = XG * Idx;
+ XG = replace (target = XG, pattern = NaN, replacement = 0);
+ G_cols = removeEmpty (target = XG, margin = "rows");
+ G_cols = replace (target = G_cols, pattern = Inf, replacement = 0);
+
+ A = removeEmpty (target = diag (Idx), margin = "cols");
+ if (ncol (A) > 1) {
+ A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
+ B = cumsum (A);
+ Gi = B %*% seq(1, ncol(B)); # group ids
+ } else { # there is only one group
+ Gi = matrix (1, rows = num_records, cols = 1);
+ }
+ if (n_stratum_cols > 0)
+ X = cbind (X[,1:2], Gi, X[,(3 + g):ncol (X)]);
+ else # no strata
+ X = cbind (X[,1:2],Gi);
+ }
+
+ ### compute stratum id for each record
+ print ("Perform stratifying...");
+ if (n_stratum_cols > 0) {
+ s_offset = 2;
+ if (n_group_cols > 0)
+ s_offset = 3;
+ for (s in 1:n_stratum_cols) { # sort columns corresponding to strata sequentially
+ X = order (target = X, by = s_offset + s);
+ }
+ XS = X[,(s_offset + 1):(s_offset + n_stratum_cols)];
+ Idx = matrix (1, rows = num_records, cols = 1);
+ Idx[2:num_records,] = rowMaxs (X[1:(num_records - 1),(s_offset + 1):(s_offset + n_stratum_cols)] != X[2:num_records,(s_offset + 1):(s_offset + n_stratum_cols)]);
+ num_strata = sum (Idx);
+
+ XS = replace (target = XS, pattern = 0, replacement = "Infinity");
+ XS = XS * Idx;
+ XS = replace (target = XS, pattern = NaN, replacement = 0);
+ S_cols = removeEmpty (target = XS, margin = "rows");
+ S_cols = replace (target = S_cols, pattern = Inf, replacement = 0);
+
+ SB = removeEmpty (target = seq (1,num_records), margin = "rows", select = Idx); # indices of stratum boundaries
+ A = removeEmpty (target = diag (Idx), margin = "cols");
+ if (ncol (A) > 1) {
+ A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
+ B = cumsum (A);
+ Si = B %*% seq(1, ncol(B)); # stratum ids
+ } else { # there is only one stratum
+ Si = matrix (1, rows = num_records, cols = 1);
+ }
+ X = cbind (X[,1:3],Si);
+ }
+
+ if (n_group_cols == 0 & n_stratum_cols == 0) {
+ X = cbind (X, matrix (1, rows = num_records, cols = 2));
+ SB = matrix (1, rows = 1, cols = 1);
+ } else if (n_group_cols == 0) {
+ X = cbind (X[,1:2], matrix (1, rows = num_records, cols = 1), X[,3]);
+ } else if (n_stratum_cols == 0) {
+ X = cbind (X, matrix (1, rows = num_records, cols = 1));
+ SB = matrix (1, rows = 1, cols = 1);
+ }
+
+ ######## BEGIN KAPLAN-MEIER ANALYSIS
+ print ("BEGIN KAPLAN-MEIER SURVIVAL FIT SCRIPT");
+
+ KM = matrix (0, rows = num_records, cols = num_groups * num_strata * 7);
+ KM_cols_select = matrix (1, rows = num_groups * num_strata * 7, cols = 1);
+ GSI = matrix (0, rows = num_groups * num_strata, cols = 2);
+ a = 1/0;
+ M = matrix (a, rows = num_groups * num_strata, cols = 5);
+ M_cols = seq (1, num_groups * num_strata);
+ z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal");
+
+ if (num_groups > 1 & test_type != "none") {
+ str = "";
+ TEST = matrix (0, rows = num_groups, cols = 5);
+ TEST_GROUPS_OE = matrix (0, rows = 1, cols = 4);
+ U = matrix (0, rows = num_groups, cols = num_strata);
+ U_OE = matrix (0, rows = num_groups, cols = num_strata);
+ OBS = matrix (0, rows = num_groups, cols = num_strata);
+ EXP = matrix (0, rows = num_groups, cols = num_strata);
+ V_sum_total = matrix (0, rows = num_groups-1, cols = (num_groups-1) * num_strata);
+ n_event_all_global = matrix(1, rows=num_groups, cols=num_strata);
+ } else if (num_groups == 1 & test_type != "none") {
+ stop ("Data contains only one group or no groups, at least two groups are
required for test!");
+ }
+
+ parfor (s in 1:num_strata, check = 0) {
+ start_ind = as.scalar (SB[s,]);
+ end_ind = num_records;
+ if (s != num_strata)
+ end_ind = as.scalar (SB[s + 1,]) - 1;
+
+ ######## RECODING TIMESTAMPS PRESERVING THE ORDER
+ X_cur = X[start_ind:end_ind,];
+ range = end_ind - start_ind + 1;
+ X_cur = order (target = X_cur, by = 1);
+ Idx1 = matrix (1, rows = range, cols = 1);
+
+ num_timestamps = 1;
+ if (range == 1) {
+ RT = matrix (1, rows = 1, cols = 1);
+ } else {
+ Idx1[2:range,1] = (X_cur[1:(range - 1),1] != X_cur[2:range,1]);
+ num_timestamps = sum (Idx1);
+ A1 = removeEmpty (target = diag (Idx1), margin = "cols");
+ if (ncol (A1) > 1) {
+ A1[,1:(ncol (A1) - 1)] = A1[,1:(ncol (A1) - 1)] - A1[,2:ncol (A1)];
+ B1 = cumsum (A1);
+ RT = B1 %*% seq(1, ncol(B1));
+ } else { # there is only one group
+ RT = matrix (1, rows = range, cols = 1);
+ }
+ }
+
+ T = X_cur[,1];
+ E = X_cur[,2];
+ G = X_cur[,3];
+ S = X_cur[,4];
+
+ n_event_stratum = aggregate (target = E, groups = RT, fn = "sum"); # no. of uncensored events per stratum
+ n_event_all_stratum = aggregate (target = E, groups = RT, fn = "count"); # no. of both censored and uncensored events per stratum
+ Idx1 = cumsum (n_event_all_stratum);
+ time_stratum = table (seq (1, nrow (Idx1), 1), Idx1) %*% T; # distinct timestamps both censored and uncensored per stratum
+ time_stratum_has_zero = sum (time_stratum == 0) > 0;
+
+ if (time_stratum_has_zero) {
+ time_stratum = replace (target = time_stratum, pattern = 0, replacement = Inf);
+ }
+
+ n_time_all1 = nrow (n_event_stratum); # no. of distinct timestamps both censored and uncensored per stratum
+ n_event_all_stratum_agg = matrix (0, rows = n_time_all1, cols = 1);
+
+ if (n_time_all1 > 1) {
+ n_event_all_stratum_agg[2:n_time_all1,] = Idx1[1:(n_time_all1 - 1),];
+ }
+
+ n_risk_stratum = range - n_event_all_stratum_agg; # no. at risk per stratum
+
+ if (num_groups > 1 & test_type != "none") { # needed for log-rank or wilcoxon test
+ n_risk_n_event_stratum = matrix (0, rows = n_time_all1, cols = num_groups * 2);
+ }
+ }
+
+
+ parfor (s in 1:num_strata, check = 0) {
+ start_ind = as.scalar (SB[s,]);
+ end_ind = num_records;
+ if (s != num_strata) {
+ end_ind = as.scalar (SB[s + 1,]) - 1;
+ }
+
+ ######## RECODING TIMESTAMPS PRESERVING THE ORDER
+ X_cur = X[start_ind:end_ind,];
+ range = end_ind - start_ind + 1;
+ X_cur = order (target = X_cur, by = 1);
+ Idx1 = matrix (1, rows = range, cols = 1);
+
+ num_timestamps = 1;
+ if (range == 1) {
+ RT = matrix (1, rows = 1, cols = 1);
+ } else {
+ Idx1[2:range,1] = (X_cur[1:(range - 1),1] != X_cur[2:range,1]);
+ num_timestamps = sum (Idx1);
+ A1 = removeEmpty (target = diag (Idx1), margin = "cols");
+ if (ncol (A1) > 1) {
+ A1[,1:(ncol (A1) - 1)] = A1[,1:(ncol (A1) - 1)] - A1[,2:ncol (A1)];
+ B1 = cumsum (A1);
+ RT = B1 %*% seq(1, ncol(B1));
+ } else { # there is only one group
+ RT = matrix (1, rows = range, cols = 1);
+ }
+ }
+
+ T = X_cur[,1];
+ E = X_cur[,2];
+ G = X_cur[,3];
+ S = X_cur[,4];
+
+ n_event_stratum = aggregate (target = E, groups = RT, fn = "sum"); # no. of uncensored events per stratum
+ n_event_all_stratum = aggregate (target = E, groups = RT, fn = "count"); # no. of both censored and uncensored events per stratum
+ Idx1 = cumsum (n_event_all_stratum);
+ time_stratum = table (seq (1, nrow (Idx1), 1), Idx1) %*% T; # distinct timestamps both censored and uncensored per stratum
+ time_stratum_has_zero = sum (time_stratum == 0) > 0;
+ if (time_stratum_has_zero) {
+ time_stratum = replace (target = time_stratum, pattern = 0, replacement = Inf);
+ }
+ n_time_all1 = nrow (n_event_stratum); # no. of distinct timestamps both censored and uncensored per stratum
+ n_event_all_stratum_agg = matrix (0, rows = n_time_all1, cols = 1);
+ if (n_time_all1 > 1) {
+ n_event_all_stratum_agg[2:n_time_all1,] = Idx1[1:(n_time_all1 - 1),];
+ }
+ n_risk_stratum = range - n_event_all_stratum_agg; # no. at risk per stratum
+
+ if (num_groups > 1 & test_type != "none") { # needed for log-rank or wilcoxon test
+ n_risk_n_event_stratum = matrix (0, rows = n_time_all1, cols = num_groups * 2);
+ }
+
+ parfor (g in 1:num_groups, check = 0) {
+ group_ind = (G == g);
+ KM_offset = (s - 1) * num_groups * 7 + (g - 1) * 7;
+ M_offset = (s - 1) * num_groups + g;
+
+ if (sum (group_ind) != 0) { # group g is present in the stratum s
+ GSI_offset = (s - 1) * num_groups + g;
+ GSI[GSI_offset,1] = g;
+ GSI[GSI_offset,2] = s;
+ E_cur = E * group_ind;
+
+ ######## COMPUTE NO. AT RISK AND NO. OF EVENTS FOR EACH TIMESTAMP
+ n_event = aggregate (target = E_cur, groups = RT, fn = "sum"); # no. of uncensored events per stratum per group
+ n_event_all = aggregate (target = group_ind, groups = RT, fn = "sum"); # no. of both censored and uncensored events per stratum per group
+ Idx1 = cumsum (n_event_all);
+ event_occurred = (n_event > 0);
+ if (time_stratum_has_zero) {
+ time = replace (target = time_stratum * event_occurred, pattern = NaN, replacement = 0);
+ time = removeEmpty (target = time, margin = "rows");
+ time = replace (target = time, pattern = Inf, replacement = 0);
+ } else {
+ time = removeEmpty (target = time_stratum * event_occurred, margin = "rows");
+ }
+
+ n_time_all2 = nrow (n_event); # no. of distinct timestamps both censored and uncensored per stratum per group
+ n_event_all_agg = matrix (0, rows = n_time_all2, cols = 1);
+ if (n_time_all2 > 1) {
+ n_event_all_agg[2:n_time_all2,] = Idx1[1:(n_time_all2 - 1),];
+ }
+
+ n_risk = sum (group_ind) - n_event_all_agg; # no. at risk per stratum per group
+ if (num_groups > 1 & test_type != "none") {
+ n_risk_n_event_stratum[,(g - 1) * 2 + 1] = n_risk;
+ n_risk_n_event_stratum[,(g - 1) * 2 + 2] = n_event;
+ }
+
+ # Extract only rows corresponding to events, i.e., for which n_event is nonzero
+ Idx1 = (n_event != 0);
+ KM_1 = matrix (0, rows = n_time_all2, cols = 2);
+ KM_1[,1] = n_risk;
+ KM_1[,2] = n_event;
+ KM_1 = removeEmpty (target = KM_1, margin = "rows", select = Idx1);
+ n_risk = KM_1[,1];
+ n_event = KM_1[,2];
+ n_time = nrow (time);
+
+ ######## ESTIMATE SURVIVOR FUNCTION SURV, ITS STANDARD ERROR SE_SURV, AND ITS 100(1-ALPHA)% CONFIDENCE INTERVAL
+ surv = cumprod ((n_risk - n_event) / n_risk);
+ tmp = n_event / (n_risk * (n_risk - n_event));
+ se_surv = sqrt (cumsum (tmp)) * surv;
+ if (err_type == "peto") {
+ se_surv = (surv * sqrt(1 - surv) / sqrt(n_risk));
+ }
+
+ if (conf_type == "plain") {
+ # True survivor function is in [surv +- z_alpha_2 * se_surv],
+ # values less than 0 are replaced by 0, values larger than 1are
replaced by 1!
+ CI_l = max (surv - (z_alpha_2 * se_surv), 0);
+ CI_r = min (surv + (z_alpha_2 * se_surv), 1);
+ } else if (conf_type == "log") {
+ # True survivor function is in [surv * exp(+- z_alpha_2 * se_surv /
surv)]
+ CI_l = max (surv * exp (- z_alpha_2 * se_surv / surv), 0);
+ CI_r = min (surv * exp ( z_alpha_2 * se_surv / surv), 1);
+ } else { # conf_type == "log-log"
+ # True survivor function is in [surv ^ exp(+- z_alpha_2 *
se(log(-log(surv))))]
+ CI_l = max (surv ^ exp (- z_alpha_2 * se_surv / log(surv)), 0);
+ CI_r = min (surv ^ exp ( z_alpha_2 * se_surv / log(surv)), 1);
+ }
+
+ if (as.scalar (n_risk[n_time,]) == as.scalar (n_event[n_time,])) {
+ CI_l[n_time,] = 0/0;
+ CI_r[n_time,] = 0/0;
+ }
+
+ n_event_sum = sum (n_event);
+ n_event_sum_all = sum(n_event_all);
+ if (n_event_sum > 0) {
+ # KM_offset = (s - 1) * num_groups * 7 + (g - 1) * 7;
+ KM[1:n_time,KM_offset + 1] = time;
+ KM[1:n_time,KM_offset + 2] = n_risk;
+ KM[1:n_time,KM_offset + 3] = n_event;
+ KM[1:n_time,KM_offset + 4] = surv;
+ KM[1:n_time,KM_offset + 5] = se_surv;
+ KM[1:n_time,KM_offset + 6] = CI_l;
+ KM[1:n_time,KM_offset + 7] = CI_r;
+ }
+
+ ######## ESTIMATE MEDIAN OF SURVIVAL TIMES AND ITS 100(1-ALPHA)% CONFIDENCE INTERVAL
+ p_5 = (surv <= 0.5);
+ pn_5 = sum (p_5);
+ #M_offset = (s - 1) * num_groups + g;
+ # if the estimated survivor function is larger than 0.5 for all timestamps median does not exist!
+ p_5_exists = (pn_5 != 0);
+ M[M_offset,2] = n_event_sum;
+ M[M_offset,1] = n_event_sum_all;
+ if (p_5_exists) {
+ if ( as.scalar (surv[n_time - pn_5 + 1,1]) == 0.5 ) { # if the estimated survivor function is exactly equal to 0.5
+ if (pn_5 > 1) {
+ t_5 = as.scalar ((time[n_time - pn_5 + 1,1] + time[n_time - pn_5 + 2,1])/2);
+ } else {
+ t_5 = as.scalar (time[n_time - pn_5 + 1,1]);
+ }
+ } else {
+ t_5 = as.scalar (time[n_time - pn_5 + 1,1]);
+ }
+
+ l_ind = (CI_l <= 0.5);
+ r_ind = (CI_r <= 0.5);
+ l_ind_sum = sum (l_ind);
+ r_ind_sum = sum (r_ind);
+ l_min_ind = as.scalar (rowIndexMin (t(l_ind)));
+ r_min_ind = as.scalar (rowIndexMin (t(r_ind)));
+ if (l_min_ind == n_time) {
+ if (l_ind_sum > 0) {
+ if (as.scalar (l_ind[n_time,1]) == 0) { # NA at last position
+ M[M_offset,4] = time[n_time - l_ind_sum,1];
+ } else {
+ M[M_offset,4] = time[1,1];
+ }
+ }
+ } else {
+ M[M_offset,4] = time[l_min_ind + 1,1];
+ }
+
+ if (r_min_ind == n_time) {
+ if (r_ind_sum > 0) {
+ if (as.scalar (r_ind[n_time,1]) == 0) { # NA at last position
+ M[M_offset,5] = time[n_time - r_ind_sum,1];
+ } else {
+ M[M_offset,5] = time[1,1];
+ }
+ }
+ } else {
+ M[M_offset,5] = time[r_min_ind + 1,1];
+ }
+
+ M[M_offset,3] = t_5;
+ if (test_type != "none") {
+ n_event_all_global[g,s] = n_event_sum_all;
+ }
+ }
+ } else {
+ print ("group " + g + " is not present in the stratum " + s);
+ KM_cols_select[(KM_offset + 1):(KM_offset + 7),1] = matrix (0, rows = 7, cols = 1);
+ M_cols[M_offset,1] = 0;
+ }
+ }
+
+ ######## COMPARISON BETWEEN DIFFERENT GROUPS USING LOG-RANK OR WILCOXON TEST
+ if (num_groups > 1 & test_type != "none") {
+ V = matrix (0, rows = num_groups-1, cols = num_groups-1);
+ parfor (g in 0:(num_groups-1), check = 0) {
+ n_risk = n_risk_n_event_stratum[,g * 2 + 1];
+ n_event = n_risk_n_event_stratum[,g * 2 + 2];
+
+ if (test_type == "log-rank") {
+ O = n_event;
+ E = n_risk * n_event_stratum / n_risk_stratum;
+ } else { ### test_type == "wilcoxon"
+ O = n_risk_stratum * n_event / range;
+ E = n_risk * n_event_stratum / range;
+ }
+
+ U[(g + 1),s] = sum (O - E);
+ U_OE[g + 1, s] = (sum (O - E)*sum (O - E))/sum(E);
+ OBS[g + 1, s] = sum(O);
+ EXP[g + 1, s] = sum(E);
+ }
+
+ for (i1 in 0:(num_groups - 2), check = 0) {
+ n_risk = n_risk_n_event_stratum[,1 + i1 * 2];
+ n_event = n_risk_n_event_stratum[,2 + i1 * 2];
+
+ for (i2 in 0:(num_groups - 2)) {
+ n_risk_i2j = n_risk_n_event_stratum[,1 + i2 * 2];
+ I_i1i2 = 0;
+ if (i1 == i2) {
+ I_i1i2 = 1;
+ }
+ if (test_type == "log-rank") {
+ V1 = n_risk * n_event_stratum * (n_risk_stratum - n_event_stratum)
/ (n_risk_stratum * (n_risk_stratum - 1));
+ V1 = replace (target = V1, pattern = NaN, replacement = 0);
+ V2 = I_i1i2 - (n_risk_i2j / n_risk_stratum);
+ V[(i1 + 1),(i2 + 1)] = sum (V1 * V2);
+ } else { ### test_type == "wilcoxon"
+ V1 = (n_risk_stratum ^ 2) * (n_risk * n_event_stratum) *
(n_risk_stratum - n_event_stratum) / (n_risk_stratum * (n_risk_stratum - 1));
+ V1 = replace (target = V1, pattern = NaN, replacement = 0);
+ V2 = I_i1i2 - (n_risk_i2j / n_risk_stratum);
+ V[(i1 + 1),(i2 + 1)] = sum (V1 * V2) / (range ^ 2);
+ }
+ }
+ }
+ V_start_ind = (s - 1) * (num_groups - 1) + 1;
+ V_sum_total[,V_start_ind:(V_start_ind + num_groups - 2)] = V;
+ }
+ }
+
+ if (num_groups > 1 & test_type != "none") {
+ V_sum = matrix (0, rows = num_groups-1, cols = num_groups-1);
+ for (s in 1:num_strata) {
+ V_start_ind = (s - 1) * (num_groups - 1) + 1;
+ V_sum_total_part = V_sum_total[,V_start_ind:(V_start_ind + num_groups - 2)];
+ V_sum = V_sum + V_sum_total_part;
+ }
+
+ U_sum = rowSums (U);
+
+ test_st = as.scalar (t(U_sum[1:(num_groups-1),1]) %*% inv(V_sum) %*% U_sum[1:(num_groups-1),1]);
+ p_val = 1 - cdf (target = test_st, dist = "chisq", df = num_groups-1 );
+ if (test_type != "none") {
+ U_OE_sum = rowSums(U_OE);
+ V_OE = rowSums((U*U) / sum(V_sum));
+ TEST_GROUPS_OE[1,1] = num_groups;
+ TEST_GROUPS_OE[1,2] = num_groups - 1;
+ TEST_GROUPS_OE[1,3] = test_st;
+ TEST_GROUPS_OE[1,4] = p_val;
+ TEST[,1] = rowSums(n_event_all_global);
+ TEST[,2] = rowSums(OBS);
+ TEST[,3] = rowSums(EXP);
+ TEST[,4] = rowSums(U_OE_sum);
+ TEST[,5] = rowSums(V_OE);
+ str = append (str, test_type + " test for " + num_groups + " groups: Chi-squared = " + test_st + " on " + (num_groups - 1) + " df, p = " + p_val + " ");
+ }
+ }
+
+ GSI = removeEmpty (target = GSI, margin = "rows");
+ if (n_group_cols > 0) {
+ # making a copy of unique groups before adding new rows depending on strata
+ G_cols_original = G_cols;
+
+ GSI_1 = GSI[,1];
+ tab_G = table (seq (1, nrow (GSI_1)), GSI_1, nrow (GSI_1), nrow (G_cols));
+ G_cols = tab_G %*% G_cols;
+ }
+
+ if (n_stratum_cols > 0) {
+ GSI_2 = GSI[,2];
+ tab_S = table (seq (1, nrow (GSI_2)), GSI_2, nrow (GSI_2), nrow (S_cols));
+ S_cols = tab_S %*% S_cols;
+ }
+
+ # pull out non-empty rows from M
+ M_cols = removeEmpty (target = M_cols, margin = "rows");
+ tab_M = table (seq (1, nrow (M_cols)), M_cols, nrow (M_cols), nrow (M));
+ M = tab_M %*% M;
+ M = replace (target = M, pattern = Inf, replacement = NaN);
+
+ # pull out non-empty rows from TEST
+ if (n_group_cols > 0 & n_stratum_cols > 0) {
+ M = cbind (G_cols, S_cols, M);
+ if (test_type != "none") {
+ TEST = cbind (G_cols_original, TEST);
+ }
+ } else if (n_group_cols > 0) {
+ M = cbind (G_cols, M);
+ if (test_type != "none") {
+ TEST = cbind (G_cols_original, TEST);
+ }
+ } else if (n_stratum_cols > 0) {
+ M = cbind (S_cols, M);
+ }
+
+ # pull out non-empty columns from KM
+ KM = t (cbind (t (KM), KM_cols_select) * KM_cols_select);
+ KM = removeEmpty (target = KM, margin = "cols");
+ KM = removeEmpty (target = KM, margin = "rows");
+ KM = KM[1:(nrow (KM) - 1),];
+
+ if (test_type != "none") {
+ if (num_groups > 1) {
+ T = TEST;
+ T_GROUPS_OE = TEST_GROUPS_OE;
+ } else {
+ print(str);
+ }
+ } else {
+ T = matrix (0, rows = 1, cols = 1);
+ T_GROUPS_OE = matrix (0, rows = 1, cols = 1);
+ }
+
+ O = KM;
+}
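
For reference, the per-group/per-stratum estimates above follow the standard
Kaplan-Meier and Greenwood formulas (notation mine, not from the commit);
with n_i at risk and d_i events at distinct time t_i:

    \hat{S}(t) = \prod_{t_i \le t} \frac{n_i - d_i}{n_i}, \qquad
    \widehat{se}\big(\hat{S}(t)\big) = \hat{S}(t)\,\sqrt{\sum_{t_i \le t} \frac{d_i}{n_i (n_i - d_i)}}

matching surv = cumprod ((n_risk - n_event) / n_risk) and the Greenwood line
se_surv = sqrt (cumsum (tmp)) * surv; the "peto" alternative instead uses
\hat{S}(t) \sqrt{(1 - \hat{S}(t)) / n_i}.
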
diff --git a/scripts/datagen/genRandData4SurvAnalysis.dml b/scripts/datagen/genRandData4SurvAnalysis.dml
index 276c95c..75117cf 100644
--- a/scripts/datagen/genRandData4SurvAnalysis.dml
+++ b/scripts/datagen/genRandData4SurvAnalysis.dml
@@ -21,7 +21,7 @@
#
# THIS SCRIPT GENERATES RANDOM DATA FOR KAPLAN-MEIER AND COX PROPORTIONAL HAZARD MODELS
-# ASSUMPTION: BASELINE HAZARD HAS WEIBULL DISTIRUTION WITH PARAMETERS LAMBDA AND V
+# ASSUMPTION: BASELINE HAZARD HAS WEIBULL DISTRIBUTION WITH PARAMETERS LAMBDA AND V
#
# INPUT PARAMETERS:
#
# ---------------------------------------------------------------------------------------------
@@ -54,7 +54,7 @@
#    - column 2 contains the information whether an event occurred (1) or data is censored (0)
# - columns 3:2+m contain scale features
# 2- If type=cox a coefficient matrix B
-# 3- A colum matrix TE containing the column indices of X corresponding to timestamp (first row) and event information (second row)
+# 3- A column matrix TE containing the column indices of X corresponding to timestamp (first row) and event information (second row)
# 4- A column matrix F containing the column indices of X which are to be used for KM analysis or fitting the Cox model
type = $type; # either "kaplan-meier" or "cox"
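
For reference, the generator draws survival times by inverse-transform
sampling from a Weibull baseline (notation mine, not from the commit): with
cumulative baseline hazard H_0(t) = \lambda t^v and U ~ Uniform(0,1),

    T = \left( \frac{-\log U}{\lambda \, e^{x^\top \beta}} \right)^{1/v}

which is the line T_1 = (-log (U) / (lambda * exp (X_t %*% B))) ^ (1/v) in the
new test script src/test/scripts/functions/builtin/cox.dml further below.
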
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index 353791c..e649f40 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -101,6 +101,7 @@ public enum Builtins {
CUMSUMPROD("cumsumprod", false),
CONFUSIONMATRIX("confusionMatrix", true),
COR("cor", true),
+ COX("cox", true),
DBSCAN("dbscan", true),
DETECTSCHEMA("detectSchema", false),
DENIALCONSTRAINTS("denialConstraints", true),
@@ -139,9 +140,10 @@ public enum Builtins {
INTERSECT("intersect", true),
INVERSE("inv", "inverse", false),
IQM("interQuartileMean", false),
- ISNA("is.na", "isNA", false),
- ISNAN("is.nan", "isNaN", false),
+ ISNA("is.na", false),
+ ISNAN("is.nan", false),
ISINF("is.infinite", false),
+ KM("km", true),
KMEANS("kmeans", true),
KMEANSPREDICT("kmeansPredict", true),
KNNBF("knnbf", true),
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinCoxTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinCoxTest.java
new file mode 100644
index 0000000..2e39039
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinCoxTest.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.junit.Test;
+
+public class BuiltinCoxTest extends AutomatedTestBase
+{
+ private final static String TEST_NAME = "cox";
+ private final static String TEST_DIR = "functions/builtin/";
+ private final static String TEST_CLASS_DIR = TEST_DIR + BuiltinCoxTest.class.getSimpleName() + "/";
+
+ @Override
+ public void setUp() {
+ addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"B"}));
+ }
+
+ @Test
+ public void testFunction() {
+ runCoxTest(50, 2.0, 1.5, 0.8, 100, 0.05, 1.0, 0.000001, 100, 0);
+ }
+
+ public void runCoxTest(int numRecords, double scaleWeibull, double shapeWeibull, double prob,
+ int numFeatures, double sparsity, double alpha, double tol, int moi, int mii)
+ {
+ ExecMode platformOld = setExecMode(ExecMode.SINGLE_NODE);
+
+ try {
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+ String HOME = SCRIPT_DIR + TEST_DIR;
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
+
+ programArgs = new String[]{
+ "-nvargs", "M=" + output("M"), "S=" + output("S"), "T=" + output("T"), "COV=" + output("COV"),
+ "RT=" + output("RT"), "XO=" + output("XO"), "n=" + numRecords, "l=" + scaleWeibull,
+ "v=" + shapeWeibull, "p=" + prob, "m=" + numFeatures, "sp=" + sparsity,
+ "alpha=" + alpha, "tol=" + tol, "moi=" + moi, "mii=" + mii, "sd=" + 1};
+
+ runTest(true, false, null, -1);
+ //TODO output comparison
+ }
+ finally {
+ rtplatform = platformOld;
+ }
+ }
+}
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinKmTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinKmTest.java
new file mode 100644
index 0000000..0e56749
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinKmTest.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.lops.LopProperties.ExecType;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+
+public class BuiltinKmTest extends AutomatedTestBase
+{
+ private final static String TEST_NAME = "km";
+ private final static String TEST_DIR = "functions/builtin/";
+ private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinKmTest.class.getSimpleName() + "/";
+
+ @Override
+ public void setUp() {
+ TestUtils.clearAssertionInformation();
+ addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"C"}));
+ }
+
+ @Test
+ public void testKmDefaultConfiguration() {
+ runKmTest(10, 2.0, 1.5, 0.8, 2,
+ 1, 10, 0.05,"greenwood", "log", "none");
+ }
+ @Test
+ public void testKmErrTypePeto() {
+ runKmTest(10, 2.0, 1.5, 0.8, 2,
+ 1, 10, 0.05,"peto", "log", "none");
+ }
+ @Test
+ public void testKmConfTypePlain() {
+ runKmTest(10, 2.0, 1.5, 0.8, 2,
+ 1, 10, 0.05,"greenwood", "plain", "none");
+ }
+ @Test
+ public void testKmConfTypeLogLog() {
+ runKmTest(10, 2.0, 1.5, 0.8, 2,
+ 1, 10, 0.05,"greenwood", "log-log", "none");
+ }
+
+ private void runKmTest(int numRecords, double scaleWeibull, double shapeWeibull, double prob,
+ int numCatFeaturesGroup, int numCatFeaturesStrat, int maxNumLevels, double alpha, String err_type,
+ String conf_type, String test_type)
+ {
+ ExecMode platformOld = setExecMode(ExecType.CP);
+
+ try {
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+ String HOME = SCRIPT_DIR + TEST_DIR;
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
+
+ programArgs = new String[]{
+ "-nvargs", "O=" + output("O"), "M=" + output("M"), "T=" + output("T"),
+ "T_GROUPS_OE=" + output("T_GROUPS_OE"), "n=" + numRecords, "l=" + scaleWeibull,
+ "v=" + shapeWeibull, "p=" + prob, "g=" + numCatFeaturesGroup, "s=" + numCatFeaturesStrat,
+ "f=" + maxNumLevels, "alpha=" + alpha, "err_type=" + err_type,
+ "conf_type=" + conf_type, "test_type=" + test_type, "sd=" + 1};
+
+ runTest(true, false, null, -1);
+ //TODO output comparison
+ }
+ finally {
+ rtplatform = platformOld;
+ }
+ }
+}
diff --git a/src/test/scripts/functions/builtin/cox.dml b/src/test/scripts/functions/builtin/cox.dml
new file mode 100644
index 0000000..1898b9b
--- /dev/null
+++ b/src/test/scripts/functions/builtin/cox.dml
@@ -0,0 +1,70 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+num_records = $n;
+lambda = $l;
+p_event = $p;
+# parameters related to the cox model
+num_features = $m;
+sparsity = $sp;
+p_censor = 1 - p_event; # prob. that record is censored
+n_seed = $sd;
+
+v = $v;
+# generate feature matrix
+X_t = rand (rows = num_records, cols = num_features, min = 1, max = 5, pdf = "uniform", sparsity = sparsity, seed = n_seed);
+# generate coefficients
+B = rand (rows = num_features, cols = 1, min = -1.0, max = 1.0, pdf = "uniform", sparsity = 1.0, seed = n_seed);
+
+# generate timestamps
+U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1, seed = n_seed);
+T_1 = (-log (U) / (lambda * exp (X_t %*% B)) ) ^ (1/v);
+
+Y = matrix (0, rows = num_records, cols = 2);
+event = floor (rand (rows = num_records, cols = 1, min = (1 - p_censor), max = (1 + p_event), seed = n_seed));
+n_time = sum (event);
+Y[,2] = event;
+
+# binning of event times
+min_T = min (T_1);
+max_T = max (T_1);
+# T_1 = T_1 - min_T;
+len = max_T - min_T;
+num_bins = len / n_time;
+T_1 = ceil (T_1 / num_bins);
+
+# print ("min(T) " + min(T) + " max(T) " + max(T));
+Y[,1] = T_1;
+
+X = cbind (Y, X_t);
+
+TE = matrix ("1 2", rows = 2, cols = 1);
+F = seq (1, num_features);
+R = matrix (0, rows = 1, cols = 1);
+
+[M, S, T, COV, RT, XO] = cox(X, TE, F, R, $alpha, $tol, $moi, $mii);
+
+write(M, $M);
+write(S, $S);
+write(T, $T);
+write(COV, $COV);
+write(RT, $RT);
+write(XO, $XO);
diff --git a/src/test/scripts/functions/builtin/km.dml b/src/test/scripts/functions/builtin/km.dml
new file mode 100644
index 0000000..fd4be61
--- /dev/null
+++ b/src/test/scripts/functions/builtin/km.dml
@@ -0,0 +1,68 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+num_records = $n;
+lambda = $l;
+p_event = $p;
+n_groups = $g;
+n_strata = $s;
+max_level = $f;
+n_seed = $sd;
+
+p_censor = 1 - p_event; # prob. that record is censored
+
+v = $v;
+# generate categorical features used for grouping and stratifying
+X_t = ceil (rand (rows = num_records, cols = n_groups + n_strata, min = 0.000000001, max = max_level - 0.000000001, pdf = "uniform", seed = n_seed));
+#X_t = ceil (matrix (max_level - 0.000000001, rows = num_records, cols = n_groups + n_strata));
+
+# generate timestamps
+U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1, seed = n_seed);
+#U = matrix (1, rows = num_records, cols = 1);
+T = (-log (U) / lambda) ^ (1/v);
+
+Y = matrix (0, rows = num_records, cols = 2);
+event = floor (rand (rows = num_records, cols = 1, min = (1 - p_censor), max = (1 + p_event), seed = n_seed));
+#event = floor (matrix (1 + p_event, rows = num_records, cols = 1));
+n_time = sum (event);
+Y[,2] = event;
+
+# binning of event times
+min_T = min (T);
+max_T = max (T);
+# T = T - min_T;
+len = max_T - min_T;
+num_bins = len / n_time;
+T = ceil (T / num_bins);
+
+Y[,1] = T;
+X = cbind (Y, X_t);
+TE = matrix ("1 2", rows = 2, cols = 1);
+
+GI = matrix (1, rows = n_groups, cols = 1);
+SI = matrix (1, rows = n_strata, cols = 1);
+
+[O, M, T, T_GROUPS_OE] = km(X, TE, GI, SI, $alpha, $err_type, $conf_type, $test_type);
+
+write(O, $O);
+write(M, $M);
+write(T, $T);
+write(T_GROUPS_OE, $T_GROUPS_OE);
diff --git a/src/test/scripts/functions/indexing/LeftIndexingUpdateInPlaceTest3.dml b/src/test/scripts/functions/indexing/LeftIndexingUpdateInPlaceTest3.dml
index 8501726..7748356 100644
--- a/src/test/scripts/functions/indexing/LeftIndexingUpdateInPlaceTest3.dml
+++ b/src/test/scripts/functions/indexing/LeftIndexingUpdateInPlaceTest3.dml
@@ -30,4 +30,4 @@ for(i in 1:4) {
}
R = as.matrix(sum(A))
-write(R, $3, format="text")
\ No newline at end of file
+write(R, $3, format="text")