christinadionysio commented on code in PR #1946: URL: https://github.com/apache/systemds/pull/1946#discussion_r1696547077
########## scripts/builtin/shapExplainer.dml: ########## @@ -0,0 +1,730 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Computes shapley values for multiple instances in parallel using antithetic permutation sampling. +# The resulting matrix phis holds the shapley values for each feature in the column given by the index of the feature in the sample. +# +# This method first creates two large matrices for masks and masked background data for all permutations and +# then runs in paralell on all instances in x. +# While the prepared matrices can become very large (2 * #features * #permuations * #n_samples * #features), +# the preparation of a row for the model call breaks down to a single element-wise multiplication of this mask with the row and +# an addition to the masked background data, since masks can be reused for each instance. +# +# INPUT: +# --------------------------------------------------------------------------------------- +# model_function The function of the model to be evaluated as a String. This function has to take a matrix of samples +# and return a vector of predictions. +# It might be usefull to wrap the model into a function the takes and returns the desired shapes and +# use this wrapper here. +# model_args Arguments in order for the model, if desired. This will be prepended by the created instances-matrix. +# x_instances Multiple instances as rows for which to compute the shapley values. +# X_bg The background dataset from which to pull the random samples to perform Monte Carlo integration. +# n_permutations The number of permutaions. Defaults to 10. Theoretical 1 should already be enough for models with up +# to second order interaction effects. +# n_samples Number of samples from X_bg used for marginalization. +# remove_non_var EXPERIMENTAL: If set, for every instance the varaince of each feature is checked against this feature in the +# background data. If it does not change, we do not run any model cals for it. +# seed A seed, in case the sampling has to be deterministic. +# verbose A boolean to enable logging of each step of the function. +# --------------------------------------------------------------------------------------- +# +# OUTPUT: +# ----------------------------------------------------------------------------- +# S Matrix holding the shapley values along the cols, one row per instance. +# expected Double holding the average prediction of all instances. +# ----------------------------------------------------------------------------- +s_shapExplainer = function(String model_function, list[unknown] model_args, Matrix[Double] x_instances, Review Comment: Please reformat the function declarations so that you use 4 spaces for every parameter that is introduced in a new line and 2 spaces for the return value and move the opening bracket `{` to a new line. Please double check for all other functions in this file. ``` example = function(String x, Double y, ... Matrix[Double] z) return Double r { } ``` ########## src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinShapExplainerTest.java: ########## @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.builtin.part2; + + +import org.junit.Test; + +import java.util.HashMap; + +import org.apache.sysds.common.Types.ExecMode; +import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.apache.sysds.test.TestUtils; + +public class BuiltinShapExplainerTest extends AutomatedTestBase +{ + private static final String TEST_NAME = "shapExplainer"; + private static final String TEST_DIR = "functions/builtin/"; + private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinShapExplainerTest.class.getSimpleName() + "/"; + private static final boolean VERBOSE = true; + + @Override + public void setUp() { + addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"R"})); + } + + @Test + public void testPrepareMaskForPermutation() { + runShapExplainerUnitTest("prepare_mask_for_permutation"); Review Comment: We use these tests to test the whole functionality of the scripts (more like your component test). The test cases should cover different input parameters and edge cases. It would be great if you could modify the tests in that regard to keep them consistent with our other tests. Please add additional component tests for the different modes (HYBRID, SPARK). ########## scripts/builtin/shapExplainer.dml: ########## @@ -0,0 +1,730 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Computes shapley values for multiple instances in parallel using antithetic permutation sampling. +# The resulting matrix phis holds the shapley values for each feature in the column given by the index of the feature in the sample. +# +# This method first creates two large matrices for masks and masked background data for all permutations and +# then runs in paralell on all instances in x. +# While the prepared matrices can become very large (2 * #features * #permuations * #n_samples * #features), +# the preparation of a row for the model call breaks down to a single element-wise multiplication of this mask with the row and +# an addition to the masked background data, since masks can be reused for each instance. +# +# INPUT: +# --------------------------------------------------------------------------------------- +# model_function The function of the model to be evaluated as a String. This function has to take a matrix of samples +# and return a vector of predictions. +# It might be usefull to wrap the model into a function the takes and returns the desired shapes and +# use this wrapper here. +# model_args Arguments in order for the model, if desired. This will be prepended by the created instances-matrix. +# x_instances Multiple instances as rows for which to compute the shapley values. +# X_bg The background dataset from which to pull the random samples to perform Monte Carlo integration. +# n_permutations The number of permutaions. Defaults to 10. Theoretical 1 should already be enough for models with up +# to second order interaction effects. +# n_samples Number of samples from X_bg used for marginalization. +# remove_non_var EXPERIMENTAL: If set, for every instance the varaince of each feature is checked against this feature in the +# background data. If it does not change, we do not run any model cals for it. +# seed A seed, in case the sampling has to be deterministic. +# verbose A boolean to enable logging of each step of the function. +# --------------------------------------------------------------------------------------- +# +# OUTPUT: +# ----------------------------------------------------------------------------- +# S Matrix holding the shapley values along the cols, one row per instance. +# expected Double holding the average prediction of all instances. +# ----------------------------------------------------------------------------- +s_shapExplainer = function(String model_function, list[unknown] model_args, Matrix[Double] x_instances, + Matrix[Double] X_bg, Integer n_permutations = 10, Integer n_samples = 100, Integer remove_non_var=0, + Matrix[Double] partitions=as.matrix(-1), Integer seed = -1, Integer verbose = 0) +return (Matrix[Double] row_phis, Double expected){ + u_printShapMessage("Parallel Permutation Explainer for "+nrow(x_instances)+" rows.", verbose) + u_printShapMessage("Number of Features: "+ncol(x_instances), verbose ) + total_preds=ncol(x_instances)*2*n_permutations*n_samples*nrow(x_instances) + u_printShapMessage("Number of predictions: "+toString(total_preds)+" in "+nrow(x_instances)+ + " parallel cals.", verbose ) + + #start with all features + features=u_range(1, ncol(x_instances)) + + #handle partitions + if(sum(partitions) != -1){ + if(remove_non_var != 0){ + stop("shapley_permutations_by_row:ERROR: Can't use n_non_varying_inds and partitions at the same time.") + } + features=removePartitionsFromFeatures(features, partitions) + reduced_total_preds=ncol(features)*2*n_permutations*n_samples*nrow(x_instances) + u_printShapMessage("Using Partitions reduces number of features to "+ncol(features)+".", verbose ) + u_printShapMessage("Total number of predictions reduced by "+(total_preds-reduced_total_preds)/total_preds+" to "+reduced_total_preds+".", verbose ) + } + + #lengths and offsets + total_features = ncol(x_instances) + perm_length = ncol(features) + full_mask_offset = perm_length * 2 * n_samples + n_partition_features = total_features - perm_length + + #sample from X_bg + u_printShapMessage("Sampling from X_bg", verbose ) + # could use new samples for each permutation by sampling n_samples*n_permutations + X_bg_samples = u_sample_with_potential_replace(X_bg=X_bg, samples=n_samples, seed=seed ) + row_phis = matrix(0, rows=nrow(x_instances), cols=total_features) + expected_m = matrix(0, rows=nrow(x_instances), cols=1) + + #prepare masks for all permutations, since it stays the same for every row + u_printShapMessage("Preparing reusable intermediate masks.", verbose ) + permutations = matrix(0, rows=n_permutations, cols=perm_length) + masks_for_permutations = matrix(0, rows=perm_length*2*n_permutations*n_samples, cols=total_features) + + parfor (i in 1:n_permutations, check=0){ + #shuffle features to get permutation + permutations[i] = t(u_shuffle(t(features))) + perm_mask = prepare_mask_for_permutation(permutation=permutations[i], partitions=partitions) + + offset_masks = (i-1) * full_mask_offset + 1 + masks_for_permutations[offset_masks:offset_masks+full_mask_offset-1]=prepare_full_mask(perm_mask, n_samples) + } + + #replicate background and mask it, since it also can stay the same for every row + # could use new samples for each permutation by sampling n_samples*n_permutations and telling this function about it + masked_bg_for_permutations = prepare_masked_X_bg(masks_for_permutations, X_bg_samples, 0) + u_printShapMessage("Computing phis in parallel.", verbose ) + + #enable spark execution for parfor if desired + #TODO allow spark mode via parameter? + #parfor (i in 1:nrow(x_instances), opt=CONSTRAINED, mode=REMOTE_SPARK){ + + parfor (i in 1:nrow(x_instances)){ + if(remove_non_var == 1){ + # try to remove inds that do not vary from the background + non_var_inds = get_non_varying_inds(x_instances[i], X_bg_samples) + # only remove if more than 2 features remain, less then two breaks removal procedure + if (ncol(x_instances) > length(non_var_inds)+2){ + #remove samples and masks for non varying features + [i_masks_for_permutations, i_masked_bg_for_permutations] = remove_inds(masks_for_permutations, masked_bg_for_permutations, permutations, non_var_inds, n_samples) + }else{ + # we would remove all but two features, whichs breaks the removal algorithm + non_var_inds = as.matrix(-1) + i_masks_for_permutations = masks_for_permutations + i_masked_bg_for_permutations = masked_bg_for_permutations + } + } else { + non_var_inds = as.matrix(-1) + i_masks_for_permutations = masks_for_permutations + i_masked_bg_for_permutations = masked_bg_for_permutations + } + + #apply masks and bg data for all permutations at once + X_test = apply_full_mask(x_instances[i], i_masks_for_permutations, i_masked_bg_for_permutations) + + #generate args for call to model + X_arg = append(list(X=X_test), model_args) + + #call model + P = eval(model_function, X_arg) + + #compute means, deviding n_rows by n_samples + P = compute_means_from_predictions(P=P, n_samples=n_samples) + + #compute phis + [phis, e] = compute_phis_from_prediction_means(P=P, permutations=permutations, non_var_inds=non_var_inds, n_partition_features=n_partition_features) + expected_m[i] = e + + #compute phis for this row from all permutations + row_phis[i] = t(phis) + } + #compute expected of model from all rows + expected = mean(expected_m) +} + +# Computes which indices do not vary from the background. +# Uses the appraoch from numpy.isclose() and compares to the largest diff of each feature in the bg data. +# In the futere, more advanced techniques like using std-dev of bg data as a tollerance could be used. +# +# INPUT: +# ----------------------------------------------------------------------------- +# x One single instance. +# X_bg Background dataset. +# ----------------------------------------------------------------------------- +# OUTPUT: +# ----------------------------------------------------------------------------- +# non_varying_inds A row-vector with all the indices that do not vary from the background dataset. +# ----------------------------------------------------------------------------- +get_non_varying_inds = function(Matrix[Double] x, Matrix[Double] X_bg) +return (Matrix[Double] non_varying_inds){ + #from numpy.isclose but adapted to fit MSE of shap, which is within the same scale + rtol = 1e-04 + atol = 1e-05 + + # compute distance metrics + diff = colMaxs(abs(X_bg -x)) + rdist = atol + rtol * colMaxs(abs(X_bg)) + + non_varying_inds = (diff <= rdist) + # translate to indices + non_varying_inds = t(seq(1,ncol(x))) * non_varying_inds + # remove the ones that do vary + non_varying_inds = removeEmpty(target=non_varying_inds, margin="cols") +} + +# Prepares a boolean mask for removing features according to permutaion. +# The resulting matrix needs to be inflated to a sample set by using prepare_samples_from_mask() before calling the model. +# +# INPUT: +# --------------------------------------------------------------------------------------- +# permutation A single permutation of varying features. +# If using partitions, remove them beforhand by using removePartitionsFromFeatures() from the utils. +# n_non_varying_inds The number of feature that do not vary in the background data. +# Can be retrieved e.g. by looking at std.dev +# partitions Matrix with first elemnt of partition in first row and last element of partition in second row. +# Used to treat partitions as one feature when creating masks. Useful for one-hot-encoded features. +# --------------------------------------------------------------------------------------- +# +# OUTPUT: +# ----------------------------------------------------------------------------- +# mask Boolean mask. +# ----------------------------------------------------------------------------- +prepare_mask_for_permutation = function(Matrix[Double] permutation, Integer n_non_varying_inds=0, + Matrix[Double] partitions=as.matrix(-1)) +return (Matrix[Double] masks){ + if(sum(partitions)!=-1){ + #can't use n_non_varying_inds and partitions at the same time + if(n_non_varying_inds > 0){ + stop("shap-explainer::prepare_mask_for_permutation:ERROR: Can't use n_non_varying_inds and partitions at the same time.") + } + #number of features not in permutation is diff between start and end of partitions, since first feature remains in permutation + skip_inds = partitions[2,] - partitions[1,] + + #skip these inds by treating them as non varying + n_non_varying_inds = sum(skip_inds) + } + + #total number of features + perm_len = ncol(permutation)+n_non_varying_inds + if(n_non_varying_inds > 0){ + #prep full constructor with placeholders + mask_constructor = matrix(perm_len+1, rows=1, cols = perm_len) + mask_constructor[1,1:ncol(permutation)] = permutation + }else{ + mask_constructor=permutation + } + + perm_cols = ncol(mask_constructor) + + # we compute mask on reverse permutation wnd reverse it later to get desired shape + + # create row indicator vector ctable + perm_mask_rows = seq(1,perm_cols) + #TODO: col-vector and matrix mult? Review Comment: Is this TODO still needed? If not please remove it and also double check for the other occurrences in this script. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
