Repository: incubator-systemml Updated Branches: refs/heads/master 28c92b93f -> 19eed8f38
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java new file mode 100644 index 0000000..40a39f0 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java @@ -0,0 +1,541 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.matrix.data; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.concurrent.Callable; + +import org.apache.sysml.hops.OptimizerUtils; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.instructions.InstructionUtils; +import org.apache.sysml.runtime.util.ConvolutionUtils; +import org.apache.sysml.utils.NativeHelper; + + +public class LibMatrixDNNHelper { + + // *********************************** low-level runtime operator selection *********************************************** + // *********************************** based on runtime properties (sparsity, native, etc) ******************************** + // These methods help reduce branch miss predictions and instruction-cache misses. + // Also, they simplify the design of LibMatrixDNN and help in code-maintenance. + + /** + * Factory method that returns list of callable tasks for performing maxpooling operation + * + * @param params convolution parameters + * @return list of callable tasks for performing maxpooling operation + * @throws DMLRuntimeException if error occurs + */ + public static ArrayList<Callable<Long>> getMaxPoolingWorkers(ConvolutionParameters params) throws DMLRuntimeException { + ArrayList<Callable<Long>> ret = new ArrayList<Callable<Long>>(); + int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); + int taskSize = (int)(Math.ceil((double)params.N / k)); + for(int i = 0; i*taskSize < params.N; i++) { + if(params.input1.isInSparseFormat()) + ret.add(new LibMatrixDNNPoolingHelper.SparseMaxPooling(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + else + ret.add(new LibMatrixDNNPoolingHelper.DenseMaxPooling(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + } + return ret; + } + + /** + * Factory method that returns list of callable tasks for performing maxpooling backward operation + * + * @param params convolution parameters + * @return list of callable tasks for performing maxpooling backward operation + * @throws DMLRuntimeException if error occurs + */ + public static ArrayList<Callable<Long>> getMaxPoolingBackwardWorkers(ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException { + ArrayList<Callable<Long>> ret = new ArrayList<Callable<Long>>(); + int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); + int taskSize = (int)(Math.ceil((double)params.N / k)); + for(int i = 0; i*taskSize < params.N; i++) { + if(!params.input1.isInSparseFormat()) { + if(!params.input2.isInSparseFormat()) + ret.add(new LibMatrixDNNPoolingBackwardHelper.PoolingBackwardDenseDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward)); + else + ret.add(new LibMatrixDNNPoolingBackwardHelper.PoolingBackwardDenseSparse(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward)); + } + else { + if(!params.input2.isInSparseFormat()) + ret.add(new LibMatrixDNNPoolingBackwardHelper.PoolingBackwardSparseDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward)); + else + ret.add(new LibMatrixDNNPoolingBackwardHelper.PoolingBackwardSparseSparse(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward)); + } + } + return ret; + } + + /** + * Factory method that returns list of callable tasks for performing relu backward operation + * + * @param params convolution parameters + * @return list of callable tasks for performing relu backward operation + * @throws DMLRuntimeException if error occurs + */ + public static ArrayList<Callable<Long>> getReluBackwardWorkers(ConvolutionParameters params) throws DMLRuntimeException { + ArrayList<Callable<Long>> ret = new ArrayList<Callable<Long>>(); + int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); + int taskSize = (int)(Math.ceil((double)params.N / k)); + for(int i = 0; i*taskSize < params.N; i++) { + ret.add(new ReluBackward(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + } + return ret; + } + + /** + * Factory method that returns list of callable tasks for performing conv2d + * + * @param params convolution parameters + * @return list of callable tasks for performing conv2d + * @throws DMLRuntimeException if error occurs + */ + public static ArrayList<Callable<Long>> getConv2dWorkers(ConvolutionParameters params) throws DMLRuntimeException { + ArrayList<Callable<Long>> ret = new ArrayList<Callable<Long>>(); + + // Try to create as many tasks as threads. + // Creating more tasks will help in tail, but would have additional overhead of maintaining the intermediate + // data structures such as im2col blocks. + int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); + int taskSize = (int)(Math.ceil((double)params.N / k)); + + // TODO: Decide here based on params whether to use LoopedIm2ColConv2dAllChannels or LoopedIm2ColConv2dOneChannel + // For now, let's stick to the existing approach of converting [1, CHW] to [CRS, PQ] as it allows matrix multiplication large enough matrix. + boolean allChannels = true; ArrayList<MatrixBlock> filters = null; + if(!allChannels) { + filters = splitFilter(params); + } + + boolean isEmptyDenseInput = !params.input1.isInSparseFormat() && params.input1.denseBlock == null; + + for(int i = 0; i*taskSize < params.N; i++) { + if(LibMatrixDNN.isEligibleForConv2dSparse(params)) + ret.add(new LibMatrixDNNConv2dHelper.SparseNativeConv2d(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + else if(!isEmptyDenseInput && allChannels) + ret.add(new LibMatrixDNNConv2dHelper.LoopedIm2ColConv2dAllChannels(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + else if(!isEmptyDenseInput && !allChannels) + ret.add(new LibMatrixDNNConv2dHelper.LoopedIm2ColConv2dOneChannel(i*taskSize, Math.min((i+1)*taskSize, params.N), params, filters)); + else + throw new DMLRuntimeException("Unsupported operator"); + } + return ret; + } + + /** + * Factory method that returns list of callable tasks for performing conv2d backward filter + * + * @param params convolution parameters + * @return list of callable tasks for performing conv2d backward filter + * @throws DMLRuntimeException if error occurs + */ + public static ArrayList<Callable<Long>> getConv2dBackwardFilterWorkers(ConvolutionParameters params) throws DMLRuntimeException { + ArrayList<Callable<Long>> ret = new ArrayList<Callable<Long>>(); + // Try to create as many tasks as threads. + // Creating more tasks will help in tail, but would have additional overhead of maintaining the intermediate + // data structures such as im2col blocks. + int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); + int taskSize = (int)(Math.ceil((double)params.N / k)); + + boolean isEmptyDenseInput = (!params.input1.isInSparseFormat() && params.input1.denseBlock == null) || + (!params.input2.isInSparseFormat() && params.input2.denseBlock == null); + + for(int i = 0; i*taskSize < params.N; i++) { + if(LibMatrixDNN.isEligibleForConv2dBackwardFilterSparseDense(params)) + ret.add(new LibMatrixDNNConv2dBackwardFilterHelper.SparseNativeConv2dBackwardFilterDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + else if(!isEmptyDenseInput) + ret.add(new LibMatrixDNNConv2dBackwardFilterHelper.Conv2dBackwardFilter(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + else + throw new DMLRuntimeException("Unsupported operator"); + } + return ret; + } + + /** + * Factory method that returns list of callable tasks for performing conv2d backward data + * + * @param params convolution parameters + * @return list of callable tasks for performing conv2d backward data + * @throws DMLRuntimeException if error occurs + */ + public static ArrayList<Callable<Long>> getConv2dBackwardDataWorkers(ConvolutionParameters params) throws DMLRuntimeException { + ArrayList<Callable<Long>> ret = new ArrayList<Callable<Long>>(); + + // Try to create as many tasks as threads. + // Creating more tasks will help in tail, but would have additional overhead of maintaining the intermediate + // data structures such as im2col blocks. + int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); + int taskSize = (int)(Math.ceil((double)params.N / k)); + + boolean isEmptyDenseInput = (!params.input1.isInSparseFormat() && params.input1.denseBlock == null) || + (!params.input2.isInSparseFormat() && params.input2.denseBlock == null); + + for(int i = 0; i*taskSize < params.N; i++) { + if(LibMatrixDNN.isEligibleForConv2dBackwardDataDense(params)) + ret.add(new LibMatrixDNNConv2dBackwardDataHelper.SparseNativeConv2dBackwardDataDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + else if(!isEmptyDenseInput) + ret.add(new LibMatrixDNNConv2dBackwardDataHelper.Conv2dBackwardData(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + else + throw new DMLRuntimeException("Unsupported operator"); + } + + return ret; + } + + // *********************************** relu backward operator ****************************************************** + + /** + * Performs the operation: (X gt 0) * dout + */ + public static class ReluBackward implements Callable<Long> + { + public int _rl; public int _ru; + private final ConvolutionParameters _params; + double [] outputArray; int numOutCols; + public ReluBackward(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = ru; + _params = params; + outputArray= params.output.getDenseBlock(); + numOutCols = params.input1.getNumColumns(); + } + + @Override + public Long call() throws Exception { + if(!_params.input1.isInSparseFormat() && !_params.input2.isInSparseFormat()) { + double [] inputArr = _params.input1.getDenseBlock(); + double [] doutArr = _params.input2.getDenseBlock(); + for(int i = _rl*numOutCols; i < _ru*numOutCols; i++) { + outputArray[i] = inputArr[i] > 0 ? doutArr[i] : 0; + } + } + else { + // Perform (X > 0) + ConvolutionUtils.scalarOperations(_params.input1, outputArray, _rl*numOutCols, numOutCols, _rl, _ru, + InstructionUtils.parseScalarBinaryOperator(">", false, 0)); + // Then perform (X > 0) * dout + ConvolutionUtils.binaryOperationInPlace(_params.input2, outputArray, _rl*numOutCols, numOutCols, _rl, _ru, + LibMatrixDNN._binaryElementWiseMultiplication); + } + return 0L; + } + } + + // *********************************** utility methods ****************************************************** + + /** + * Computes tensor indexes from column index such that column index is equal to ret[0]*HW + ret[1]*W + ret[2] + * + * @param j column index + * @param ret tensor indexes + * @param H second last dimension + * @param W last dimension + */ + static void computeTensorIndexes(int j, int [] ret, int H, int W) { + ret[0] = j / (H*W); + ret[1] = (j - ret[0]*(H*W))/W; + ret[2] = j % W; + } + + //Split a filter of size [K, CRS] into c filters of [K, RS] + private static ArrayList<MatrixBlock> splitFilter(ConvolutionParameters _params) { + ArrayList<MatrixBlock> ret = new ArrayList<MatrixBlock>(); + int RS = _params.R*_params.S; int CRS = _params.C*_params.R*_params.S; + double [] filter = _params.input2.getDenseBlock(); int S = _params.S; + for(int c = 0; c < _params.C; c++) { + MatrixBlock mb = new MatrixBlock(_params.K, RS, false); + mb.allocateDenseBlock(); long nnz = 0; + double [] outputArr = mb.getDenseBlock(); + if(filter != null) { + for(int k = 0; k < _params.K; k++) { + for(int rs = 0; rs < RS; rs++) { + outputArr[k*RS + rs] = filter[k*CRS + c*RS + rs]; + nnz += outputArr[k*RS + rs] != 0 ? 1 : 0; + } + } + } + else { + for(int k = 0; k < _params.K; k++) { + if( !_params.input2.sparseBlock.isEmpty(k) ) { + int [] tensorIndexes = new int[3]; + // Find maxIndex + int apos = _params.input2.sparseBlock.pos(k); + int alen = _params.input2.sparseBlock.size(k); + int[] aix = _params.input2.sparseBlock.indexes(k); + double[] avals = _params.input2.sparseBlock.values(k); + for(int j=apos; j<apos+alen; j++) { + computeTensorIndexes(aix[j], tensorIndexes, _params.R, _params.S); + if(c != tensorIndexes[0]) + continue; + int r = tensorIndexes[1]; + int s = tensorIndexes[2]; + outputArr[k*RS + r*S + s] = avals[j]; + nnz += outputArr[k*RS + r*S + s] != 0 ? 1 : 0; + } + } + } + } + mb.setNonZeros(nnz); + ret.add(mb); + } + return ret; + } + + // Single-threaded matrix multiplication + static void singleThreadedMatMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, + boolean recomputeNNZM1, boolean recomputeNNZM2, ConvolutionParameters params) throws DMLRuntimeException { + if(!params.enableNative || m1.isInSparseFormat() || m2.isInSparseFormat()) { + if(recomputeNNZM1) + m1.recomputeNonZeros(); + if(recomputeNNZM2) + m2.recomputeNonZeros(); + LibMatrixMult.matrixMult(m1, m2, ret, false); + } + else { + ret.sparse = false; + if(ret.getDenseBlock() == null) + ret.allocateDenseBlock(); + NativeHelper.matrixMultDenseDense(m1.denseBlock, m2.denseBlock, + ret.denseBlock, m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), 1); + ret.recomputeNonZeros(); + } + } + + static void addBias(int _rl, int _ru, double [] outputArr, double [] biasArr, int K, int PQ) { + // double [] biasArr = _params.bias.getDenseBlock(); + + int index = _rl*K*PQ; + for(int n = _rl; n < _ru; n++) { + for(int k = 0; k < K; k++) { + for(int pq = 0; pq < PQ; pq++, index++) { + outputArr[index] += biasArr[k]; + } + } + } + } + + /** + * Returns the index of cell with maximum value. This method is optimized for dense input + * + * @param p output feature map height + * @param q output feature map width + * @param inputOffset offset to be used for input index + * @param inputArray input array + * @param params convolution parameters + * @param performReluBackward perform ReLU backward + * @return index of cell with maximum value + */ + static int getMaxIndex(int p, int q, int inputOffset, double [] inputArray, ConvolutionParameters params, boolean performReluBackward) { + int start_index_h = params.start_indexes_h[p]; + int end_index_h = params.end_indexes_h[p]; + int start_index_w = params.start_indexes_w[q]; + int end_index_w = params.end_indexes_w[q]; + + int maxIndex = -1; + double maxVal = -Double.MAX_VALUE; + + // Note: We do not treat pad as zero and hence we don't do: + // maxVal = 0 + // if start_index_h < 0 || start_index_w < 0 || end_index_h >= params.H || end_index_w >= params.W + + // Find maxIndex + double currDoutVal = -1; + for (int h = start_index_h; h < end_index_h; h++) { + for (int w = start_index_w; w < end_index_w; w++) { + currDoutVal = inputArray[inputOffset + h*params.W + w]; + currDoutVal = performReluBackward && currDoutVal < 0 ? 0 : currDoutVal; + if(maxVal < currDoutVal) { + maxIndex = inputOffset + h*params.W + w; + maxVal = currDoutVal; + } + } + } + return maxIndex; + } + + /** + * Returns the index of cell with maximum value. This method is optimized for sparse input + * + * @param p output feature map height + * @param q output feature map width + * @param inputOffset offset to be used for input index + * @param n number of images + * @param c number of channels + * @param input input matrix + * @param params convolution parameters + * @param performReluBackward perform ReLU on input + * @return index of the cell with maximum value + * @throws DMLRuntimeException if error occurs + */ + static int getMaxIndexSparse(int p, int q, int inputOffset, int n, int c, MatrixBlock input, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException { + if(!input.isInSparseFormat()) + throw new DMLRuntimeException("Incorrect usage: Only sparse format supported"); + + int [] tensorIndexes = new int[3]; + + int start_index_h = params.start_indexes_h[p]; + int end_index_h = params.end_indexes_h[p]; + int start_index_w = params.start_indexes_w[q]; + int end_index_w = params.end_indexes_w[q]; + + int maxIndex = -1; + double maxVal = -Double.MAX_VALUE; + + // Note: We do not treat pad as zero and hence we don't do: + // maxVal = 0 + // if start_index_h < 0 || start_index_w < 0 || end_index_h >= params.H || end_index_w >= params.W + + // input.isEmptyBlock() check is done by the caller + if( !input.sparseBlock.isEmpty(n) ) { + // Find maxIndex + int apos = input.sparseBlock.pos(n); + int alen = input.sparseBlock.size(n); + int[] aix = input.sparseBlock.indexes(n); + double[] avals = input.sparseBlock.values(n); + for(int j=apos; j<apos+alen; j++) { + computeTensorIndexes(aix[j], tensorIndexes, params.H, params.W); + if(c != tensorIndexes[0]) + continue; + int h = tensorIndexes[1]; + int w = tensorIndexes[2]; + if(h >= start_index_h && h < end_index_h && w >= start_index_w && w < end_index_w) { + double val = performReluBackward && avals[j] < 0 ? 0 : avals[j]; + if(maxVal < val) { + maxIndex = inputOffset + h*params.W + w; + maxVal = val; + } + } + } + } + else { + maxIndex = inputOffset; + } + return maxIndex; + } + + // Returns the row of matrix in dense format + static void getRowInDenseFormat(MatrixBlock input, int n, double [] ret) throws DMLRuntimeException { + if(input.getNumColumns() != ret.length) { + throw new DMLRuntimeException("Invalid parameters"); + } + // Use temporary array to avoid binary search + if(input.isInSparseFormat()) { + Arrays.fill(ret, 0); + if( !input.sparseBlock.isEmpty(n) ) { + int apos = input.sparseBlock.pos(n); + int alen = input.sparseBlock.size(n); + int[] aix = input.sparseBlock.indexes(n); + double[] avals = input.sparseBlock.values(n); + for(int j=apos; j<apos+alen; j++) + ret[ aix[j] ] = avals[j]; + } + } + else { + System.arraycopy(input.getDenseBlock(), n*input.getNumColumns(), ret, 0, input.getNumColumns()); + } + } + + // ------------------------------------------------------------------------------------------------------ + // Since col2im always operates on intermediate generated as part of matmult, it is not clear which operator to select apriori. + // Therefore, it is provided as utility function rather than an operator (like im2col or rotate180) + + //Converts input: PQ X CRS matrix and writes to 1 X CHW + static void doCol2imOverSingleImage(int outputN, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException { + if(input.rlen != params.P*params.Q || input.clen != params.C*params.R*params.S) { + throw new DMLRuntimeException("Incorrect input dimensions"); + } + + double [] outputArray = null; + if (!params.output.isInSparseFormat()) + outputArray = params.output.getDenseBlock(); + else { + throw new DMLRuntimeException("Only dense output is implemented"); + } + + if(!input.isInSparseFormat()) { + double [] inputArray = input.getDenseBlock(); + doCol2IMDenseInput(0, outputN, inputArray, outputArray, params); + } + else { + if(!input.isEmptyBlock()) { + int [] tensorIndexes = new int[3]; + for(int i = 0; i < input.getNumRows(); i++) { + if( !input.sparseBlock.isEmpty(i) ) { + computeTensorIndexes(i, tensorIndexes, params.P, params.Q); + int p = tensorIndexes[1]; + int q = tensorIndexes[2]; + if(tensorIndexes[0] != 0) + throw new DMLRuntimeException("Incorrect tensor indexes: " + tensorIndexes[0] + " != 0 <" + p + " " + q + " " + tensorIndexes[0] + params.P + " " + params.Q + ">"); + + int apos = input.sparseBlock.pos(i); + int alen = input.sparseBlock.size(i); + int[] aix = input.sparseBlock.indexes(i); + double[] avals = input.sparseBlock.values(i); + for(int j = apos; j < apos+alen; j++) { + computeTensorIndexes(aix[j], tensorIndexes, params.R, params.S); + int c = tensorIndexes[0]; + int r = tensorIndexes[1]; + int s = tensorIndexes[2]; + int h = p*params.stride_h + r - params.pad_h; + int w = q*params.stride_w + s - params.pad_w; + if(h >= 0 && h < params.H && w >= 0 && w < params.W) { + int outIndex = outputN*params.C*params.H*params.W + c*params.H*params.W + h*params.W + w; + outputArray[outIndex] += avals[j]; + } + } + } + } + } + } + } + + // Converts input: PQ X CRS matrix and writes to 1 X CHW if inputN == 0 + // Or converts input: NPQ X CRS matrix and writes to N X CHW + private static void doCol2IMDenseInput(int inputN, int outputN, double [] inputArray, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException { + final int outputNOffset = outputN*params.C*params.H*params.W; + for (int p = 0; p < params.P; p++) { + // h = p*params.stride_h + r - params.pad_h + // = r + hOffset + // Based on restrictions: h >= 0 and r >= 0 and h < params.H and r < params.R, we get + // max(0, - hOffset) <= r < min(params.R, params.H - hOffset) + final int hOffset = p*params.stride_h - params.pad_h; + final int rStart = Math.max(0, - hOffset); + final int rEnd = Math.min(params.R, params.H - hOffset); + for (int q = 0; q < params.Q; q++) { + // Using the same logic as above on following: + // w = q*params.stride_w + s - params.pad_w + final int wOffset = q*params.stride_w - params.pad_w; + final int sStart = Math.max(0, - wOffset); + final int sEnd = Math.min(params.S, params.W - wOffset); + final int tempOffset = (inputN*params.P*params.Q + p*params.Q + q)*params.C*params.R*params.S; + for (int c = 0; c < params.C; c++) { + final int outOffset = outputNOffset + c*params.H*params.W; + final int inputOffset = tempOffset + c*params.R*params.S; + for (int r = rStart; r < rEnd; r++) { + for (int s = sStart; s < sEnd; s++) { + int inputIndex = inputOffset + r*params.S + s; + int outIndex = outOffset + (hOffset + r)*params.W + wOffset + s; + outputArray[outIndex] += inputArray[inputIndex]; + } + } + } + } + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java new file mode 100644 index 0000000..9ae39bf --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java @@ -0,0 +1,386 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.matrix.data; + +import java.util.Arrays; + +/** + * This class contains the different implementation of im2col operation + */ +public class LibMatrixDNNIm2ColHelper { + + static interface Im2colWorker { + public void execute(int n); + public void execute(int n, int c); + public static Im2colWorker getWorker(MatrixBlock input, MatrixBlock im2ColOutBlock, ConvolutionParameters params, boolean allChannels) { + if(im2ColOutBlock.isInSparseFormat() || im2ColOutBlock.getDenseBlock() == null) + throw new RuntimeException("im2col output is always in dense format"); + if(allChannels) { + if(!input.isInSparseFormat()) { + if (params.stride_h == 1 && params.stride_w == 1 && params.pad_h == 0 && params.pad_w == 0) + return new DenseIm2colWorkerStride1Pad0AllChannels(input.getDenseBlock(), im2ColOutBlock.getDenseBlock(), params); + else + return new DenseIm2colWorkerAllChannels(input.getDenseBlock(), im2ColOutBlock.getDenseBlock(), params); + } + else + return new SparseIm2colWorkerAllChannels(input, im2ColOutBlock, params); + } + else { + if(!input.isInSparseFormat()) { + if (params.stride_h == 1 && params.stride_w == 1 && params.pad_h == 0 && params.pad_w == 0) + return new DenseIm2colWorkerStride1Pad0(input.getDenseBlock(), im2ColOutBlock.getDenseBlock(), params); + else + return new DenseIm2colWorker(input.getDenseBlock(), im2ColOutBlock.getDenseBlock(), params); + } + else + return new SparseIm2colWorker(input, im2ColOutBlock, params); + } + } + } + + /** + * Special case operator for performing dense im2col when stride = [1, 1] and pad = [0, 0] by using System.arraycopy + */ + static class DenseIm2colWorkerStride1Pad0 implements Im2colWorker { + double [] inputArray; double [] outputArray; + int CRS; int S; int R; int P; int Q; int CHW; int H; int W; + public DenseIm2colWorkerStride1Pad0(double [] inputArray, double [] outputArray, ConvolutionParameters params) { + this.inputArray = inputArray; + this.outputArray = outputArray; + this.CRS = params.C * params.R * params.S; + this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q; + this.CHW = params.C*params.H*params.W; + } + + @Override + public void execute(int n) { + throw new RuntimeException("Not supported"); + } + + @Override + public void execute(int n, int cInput) { + int nOffset = n * CHW; + int RS = R*S; + for (int rs = 0; rs < RS; ++rs) { + int wOffset = rs % S; + int hOffset = rs / S; + for (int h = 0; h < P; ++h) { + int hPadded = h + hOffset; + int outOffset = (rs * P + h) * Q; + int inputOffset = nOffset + (cInput * H + hPadded) * W; + System.arraycopy(inputArray, inputOffset + wOffset, outputArray, outOffset, Q); + int w = Q - 1; + int wPadded = w + wOffset; + if (hPadded < H && wPadded < W) + outputArray[outOffset + w] = inputArray[inputOffset + wPadded]; + else + outputArray[outOffset + w] = 0; + } + } + } + } + + + + /** + * Special case operator for performing dense im2col when stride = [1, 1] and pad = [0, 0] by using System.arraycopy + */ + static class DenseIm2colWorkerStride1Pad0AllChannels implements Im2colWorker { + double [] inputArray; double [] outputArray; + int CRS; int S; int R; int P; int Q; int CHW; int H; int W; + public DenseIm2colWorkerStride1Pad0AllChannels(double [] inputArray, double [] outputArray, ConvolutionParameters params) { + this.inputArray = inputArray; + this.outputArray = outputArray; + this.CRS = params.C * params.R * params.S; + this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q; + this.CHW = params.C*params.H*params.W; + } + + @Override + public void execute(int n, int c) { + throw new RuntimeException("Not supported"); + } + + @Override + public void execute(int n) { + int nOffset = n * CHW; + for (int c = 0; c < CRS; ++c) { + int wOffset = c % S; + int hOffset = (c / S) % R; + int cInput = c / R / S; + for (int h = 0; h < P; ++h) { + int hPadded = h + hOffset; + int outOffset = (c * P + h) * Q; + int inputOffset = nOffset + (cInput * H + hPadded) * W; + System.arraycopy(inputArray, inputOffset + wOffset, outputArray, outOffset, Q); + int w = Q - 1; + int wPadded = w + wOffset; + if (hPadded < H && wPadded < W) + outputArray[outOffset + w] = inputArray[inputOffset + wPadded]; + else + outputArray[outOffset + w] = 0; + } + } + } + } + + /** + * Performing dense im2col (general case) + */ + static class DenseIm2colWorker implements Im2colWorker { + double [] inputArray; double [] outputArray; + int CRS; int S; int R; int P; int Q; int CHW; int H; int W; + int stride_h; int stride_w; int pad_h; int pad_w; + public DenseIm2colWorker(double [] inputArray, double [] outputArray, ConvolutionParameters params) { + this.inputArray = inputArray; + this.outputArray = outputArray; + this.CRS = params.C * params.R * params.S; + this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q; + this.CHW = params.C*params.H*params.W; + this.stride_h = params.stride_h; this.stride_w = params.stride_w; + this.pad_h = params.pad_h; this.pad_w = params.pad_w; + } + + @Override + public void execute(int n) { + throw new RuntimeException("Not supported"); + } + + @Override + public void execute(int n, int cInput) { + int nOffset = n * CHW; int RS = R*S; + for (int rs = 0; rs < RS; ++rs) { + int wOffset = rs % S; + int hOffset = rs / S; + for (int h = 0; h < P; ++h) { + int outOffset = (rs * P + h) * Q; + int hPadded = h * stride_h - pad_h + hOffset; + int inputOffset = nOffset + (cInput * H + hPadded) * W; + if (hPadded < 0 || hPadded >= H) { + Arrays.fill(outputArray, outOffset, outOffset+Q, 0); + } else { + for (int w = 0; w < Q; ++w) { + int wPadded = w * stride_w - pad_w + wOffset; + if (wPadded >= 0 && wPadded < W) + outputArray[outOffset + w] = inputArray[inputOffset + wPadded]; + else + outputArray[outOffset + w] = 0; + } + } + } + } + } + } + + /** + * Performing dense im2col (general case) + */ + static class DenseIm2colWorkerAllChannels implements Im2colWorker { + double [] inputArray; double [] outputArray; + int CRS; int S; int R; int P; int Q; int CHW; int H; int W; + int stride_h; int stride_w; int pad_h; int pad_w; + public DenseIm2colWorkerAllChannels(double [] inputArray, double [] outputArray, ConvolutionParameters params) { + this.inputArray = inputArray; + this.outputArray = outputArray; + this.CRS = params.C * params.R * params.S; + this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q; + this.CHW = params.C*params.H*params.W; + this.stride_h = params.stride_h; this.stride_w = params.stride_w; + this.pad_h = params.pad_h; this.pad_w = params.pad_w; + } + + @Override + public void execute(int n, int c) { + throw new RuntimeException("Not supported"); + } + + @Override + public void execute(int n) { + int nOffset = n * CHW; + for (int c = 0; c < CRS; ++c) { + int wOffset = c % S; + int hOffset = (c / S) % R; + int cInput = c / R / S; + for (int h = 0; h < P; ++h) { + int outOffset = (c * P + h) * Q; + int hPadded = h * stride_h - pad_h + hOffset; + int inputOffset = nOffset + (cInput * H + hPadded) * W; + if (hPadded < 0 || hPadded >= H) { + Arrays.fill(outputArray, outOffset, outOffset+Q, 0); + } else { + for (int w = 0; w < Q; ++w) { + int wPadded = w * stride_w - pad_w + wOffset; + if (wPadded >= 0 && wPadded < W) + outputArray[outOffset + w] = inputArray[inputOffset + wPadded]; + else + outputArray[outOffset + w] = 0; + } + } + } + } + } + } + + /** + * Performing dense im2col (general case) + */ + static class SparseIm2colWorkerAllChannels implements Im2colWorker { + MatrixBlock input; double [] outputArray; + int CRS; int S; int R; int P; int Q; int H; int W; + int stride_h; int stride_w; int pad_h; int pad_w; double [] temp; + public SparseIm2colWorkerAllChannels(MatrixBlock input, MatrixBlock im2ColOutBlock, ConvolutionParameters params) { + this.input = input; + this.outputArray = im2ColOutBlock.getDenseBlock(); + this.CRS = params.C * params.R * params.S; + this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q; + this.stride_h = params.stride_h; this.stride_w = params.stride_w; + this.pad_h = params.pad_h; this.pad_w = params.pad_w; + temp = new double[input.getNumColumns()]; + } + + @Override + public void execute(int n, int c) { + throw new RuntimeException("Not supported"); + } + + @Override + public void execute(int n) { + // Using a temporary array improves performance by not requiring binary search for getValue + // Since the access pattern depends on ConvolutionParameters, this serves as a temporary fix. + fillTemp(input, n); + // final int nOffset = n * params.C*params.H*params.W; + for (int c = 0; c < CRS; ++c) { + int wOffset = c % S; + int hOffset = (c / S) % R; + int cInput = c / R / S; + for (int h = 0; h < P; ++h) { + int outOffset = (c * P + h) * Q; + int hPadded = h * stride_h - pad_h + hOffset; + int tempOffset = (cInput * H + hPadded) * W; + // int inputOffset = nOffset + tempOffset; + if (hPadded < 0 || hPadded >= H) { + Arrays.fill(outputArray, outOffset, outOffset+Q, 0); + } else { + for (int w = 0; w < Q; ++w) { + int wPadded = w * stride_w - pad_w + wOffset; + if (wPadded >= 0 && wPadded < W) + outputArray[outOffset + w] = temp[tempOffset + wPadded]; + else + outputArray[outOffset + w] = 0; + } + } + } + } + } + // Returns the row of matrix in dense format + private void fillTemp(MatrixBlock input, int n) { + if(input.getNumColumns() != temp.length) { + throw new RuntimeException("Invalid parameters"); + } + // Use temporary array to avoid binary search + if(input.isInSparseFormat()) { + Arrays.fill(temp, 0); + if( !input.sparseBlock.isEmpty(n) ) { + int apos = input.sparseBlock.pos(n); + int alen = input.sparseBlock.size(n); + int[] aix = input.sparseBlock.indexes(n); + double[] avals = input.sparseBlock.values(n); + for(int j=apos; j<apos+alen; j++) + temp[ aix[j] ] = avals[j]; + } + } + else { + System.arraycopy(input.getDenseBlock(), n*input.getNumColumns(), temp, 0, input.getNumColumns()); + } + } + } + + /** + * Performing dense im2col (general case) + */ + static class SparseIm2colWorker implements Im2colWorker { + MatrixBlock input; double [] outputArray; + int CRS; int S; int R; int P; int Q; int H; int W; + int stride_h; int stride_w; int pad_h; int pad_w; double [] temp; + public SparseIm2colWorker(MatrixBlock input, MatrixBlock im2ColOutBlock, ConvolutionParameters params) { + this.input = input; + this.outputArray = im2ColOutBlock.getDenseBlock(); + this.CRS = params.C * params.R * params.S; + this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q; + this.stride_h = params.stride_h; this.stride_w = params.stride_w; + this.pad_h = params.pad_h; this.pad_w = params.pad_w; + temp = new double[input.getNumColumns()]; + } + + @Override + public void execute(int n) { + throw new RuntimeException("Not supported"); + } + + @Override + public void execute(int n, int cInput) { + // Using a temporary array improves performance by not requiring binary search for getValue + // Since the access pattern depends on ConvolutionParameters, this serves as a temporary fix. + fillTemp(input, n); int RS = R*S; + for (int rs = 0; rs < RS; ++rs) { + int wOffset = rs % S; + int hOffset = rs / S; + for (int h = 0; h < P; ++h) { + int outOffset = (rs * P + h) * Q; + int hPadded = h * stride_h - pad_h + hOffset; + int tempOffset = (cInput * H + hPadded) * W; + // int inputOffset = nOffset + tempOffset; + if (hPadded < 0 || hPadded >= H) { + Arrays.fill(outputArray, outOffset, outOffset+Q, 0); + } else { + for (int w = 0; w < Q; ++w) { + int wPadded = w * stride_w - pad_w + wOffset; + if (wPadded >= 0 && wPadded < W) + outputArray[outOffset + w] = temp[tempOffset + wPadded]; + else + outputArray[outOffset + w] = 0; + } + } + } + } + } + // Returns the row of matrix in dense format + private void fillTemp(MatrixBlock input, int n) { + if(input.getNumColumns() != temp.length) { + throw new RuntimeException("Invalid parameters"); + } + // Use temporary array to avoid binary search + if(input.isInSparseFormat()) { + Arrays.fill(temp, 0); + if( !input.sparseBlock.isEmpty(n) ) { + int apos = input.sparseBlock.pos(n); + int alen = input.sparseBlock.size(n); + int[] aix = input.sparseBlock.indexes(n); + double[] avals = input.sparseBlock.values(n); + for(int j=apos; j<apos+alen; j++) + temp[ aix[j] ] = avals[j]; + } + } + else { + System.arraycopy(input.getDenseBlock(), n*input.getNumColumns(), temp, 0, input.getNumColumns()); + } + } + } + +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingBackwardHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingBackwardHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingBackwardHelper.java new file mode 100644 index 0000000..b400105 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingBackwardHelper.java @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.matrix.data; + +import java.util.concurrent.Callable; + +/** + * This class contains the set of operators used for performing pooling backward + */ +public class LibMatrixDNNPoolingBackwardHelper { + /** + * Performs the maxpooling backward operation for dense input and dense error (dout) + */ + public static class PoolingBackwardDenseDense implements Callable<Long> + { + public int _rl; public int _ru; + private final ConvolutionParameters _params; + double [] outputArray; boolean performReluBackward; + double [] inputArray; double [] doutArray; + int C; int CHW; int P; int Q; int HW; int CPQ; int PQ; + public PoolingBackwardDenseDense(int rl, int ru, ConvolutionParameters params, boolean performReluBackward) { + _rl = rl; _ru = ru; + _params = params; + this.performReluBackward = performReluBackward; + inputArray = params.input1.getDenseBlock(); + doutArray = params.input2.getDenseBlock(); + outputArray = params.output.getDenseBlock(); + C = params.C; CHW = params.C*params.H*params.W; HW = params.H*params.W; + P = params.P; Q = params.Q; CPQ = params.C*params.P*params.Q; + PQ = params.P*params.Q; + if (inputArray == null || doutArray == null || outputArray == null ) + throw new RuntimeException("Incorrect usage: empty inputs"); + } + + @Override + public Long call() throws Exception { + for(int n = _rl; n < _ru; n++) { + for (int c = 0; c < C; c++) { + final int inputOffset = n*CHW + c*HW; + final int outputOffset = n*CPQ + c*PQ; + for (int p = 0; p < P; p++) { + for (int q = 0; q < Q; q++) { + int maxIndex = LibMatrixDNNHelper.getMaxIndex(p, q, inputOffset, inputArray, _params, performReluBackward); + if(maxIndex != -1) + outputArray[maxIndex] += doutArray[outputOffset + p * Q + q]; + } + } + } + } + return 0L; + } + } + + /** + * Performs the maxpooling backward operation for dense input and sparse error (dout) + */ + public static class PoolingBackwardDenseSparse implements Callable<Long> + { + public int _rl; public int _ru; + private final ConvolutionParameters _params; + double [] outputArray; boolean performReluBackward; + double [] inputArray; MatrixBlock dout; + int C; int CHW; int P; int Q; int HW; + public PoolingBackwardDenseSparse(int rl, int ru, ConvolutionParameters params, boolean performReluBackward) { + _rl = rl; _ru = ru; + _params = params; + this.performReluBackward = performReluBackward; + inputArray = params.input1.getDenseBlock(); + dout = params.input2; + outputArray = params.output.getDenseBlock(); + C = params.C; CHW = params.C*params.H*params.W; HW = params.H*params.W; + P = params.P; Q = params.Q; + if (inputArray == null || outputArray == null ) + throw new RuntimeException("Incorrect usage: empty inputs"); + if (!params.input2.isInSparseFormat()) + throw new RuntimeException("Incorrect usage: Call optimized versions"); + } + + @Override + public Long call() throws Exception { + for(int n = _rl; n < _ru; n++) { + if( !dout.sparseBlock.isEmpty(n) ) { + int [] tensorIndexes = new int[3]; + int apos = dout.sparseBlock.pos(n); + int alen = dout.sparseBlock.size(n); + int[] aix = dout.sparseBlock.indexes(n); + double[] avals = dout.sparseBlock.values(n); + for(int j = apos; j < apos+alen; j++) { + LibMatrixDNNHelper.computeTensorIndexes(aix[j], tensorIndexes, P, Q); + int c = tensorIndexes[0]; + int p = tensorIndexes[1]; + int q = tensorIndexes[2]; + final int inputOffset = n*CHW + c*HW; + int maxIndex = LibMatrixDNNHelper.getMaxIndex(p, q, inputOffset, inputArray, _params, performReluBackward); + if(maxIndex != -1) + outputArray[maxIndex] += avals[j]; + } + } + } + return 0L; + } + } + + /** + * Performs the maxpooling backward operation for sparse input and dense error (dout) + */ + public static class PoolingBackwardSparseDense implements Callable<Long> + { + public int _rl; public int _ru; + private final ConvolutionParameters _params; + double [] outputArray; boolean performReluBackward; + double [] doutArray; + int C; int CHW; int P; int Q; int HW; int CPQ; int PQ; + public PoolingBackwardSparseDense(int rl, int ru, ConvolutionParameters params, boolean performReluBackward) { + _rl = rl; _ru = ru; + _params = params; + this.performReluBackward = performReluBackward; + doutArray = params.input2.getDenseBlock(); + outputArray = params.output.getDenseBlock(); + C = params.C; CHW = params.C*params.H*params.W; HW = params.H*params.W; + P = params.P; Q = params.Q; CPQ = params.C*params.P*params.Q; + PQ = params.P*params.Q; + if (doutArray == null || outputArray == null ) + throw new RuntimeException("Incorrect usage: empty inputs"); + if (!params.input1.isInSparseFormat()) + throw new RuntimeException("Incorrect usage: Call optimized versions"); + } + + @Override + public Long call() throws Exception { + for(int n = _rl; n < _ru; n++) { + for (int c = 0; c < C; c++) { + for (int p = 0; p < P; p++) { + for (int q = 0; q < Q; q++) { + double inVal = doutArray[n*CPQ + c*PQ + p * Q + q]; + if(inVal != 0) { + final int inputOffset = n*CHW + c*HW; + int maxIndex = LibMatrixDNNHelper.getMaxIndexSparse(p, q, inputOffset, n, c, _params.input1, _params, performReluBackward); + if(maxIndex != -1) + outputArray[maxIndex] += inVal; + } + } + } + } + } + return 0L; + } + } + + /** + * Performs the maxpooling backward operation for sparse input and sparse error (dout) + */ + public static class PoolingBackwardSparseSparse implements Callable<Long> + { + public int _rl; public int _ru; + private final ConvolutionParameters _params; + double [] outputArray; boolean performReluBackward; + int C; int CHW; int P; int Q; int HW; + public PoolingBackwardSparseSparse(int rl, int ru, ConvolutionParameters params, boolean performReluBackward) { + _rl = rl; _ru = ru; + _params = params; + this.performReluBackward = performReluBackward; + outputArray = params.output.getDenseBlock(); + C = params.C; CHW = params.C*params.H*params.W; HW = params.H*params.W; + P = params.P; Q = params.Q; + if (outputArray == null ) + throw new RuntimeException("Incorrect usage: empty outputs"); + if (!params.input1.isInSparseFormat() || !params.input2.isInSparseFormat()) + throw new RuntimeException("Incorrect usage: Call optimized versions"); + } + + @Override + public Long call() throws Exception { + for(int n = _rl; n < _ru; n++) { + if( !_params.input2.sparseBlock.isEmpty(n) ) { + int [] tensorIndexes = new int[3]; + int apos = _params.input2.sparseBlock.pos(n); + int alen = _params.input2.sparseBlock.size(n); + int[] aix = _params.input2.sparseBlock.indexes(n); + double[] avals = _params.input2.sparseBlock.values(n); + for(int j = apos; j < apos+alen; j++) { + LibMatrixDNNHelper.computeTensorIndexes(aix[j], tensorIndexes, P, Q); + int c = tensorIndexes[0]; + int p = tensorIndexes[1]; + int q = tensorIndexes[2]; + final int inputOffset = n*CHW + c*HW; + int maxIndex = LibMatrixDNNHelper.getMaxIndexSparse(p, q, inputOffset, n, c, _params.input1, _params, performReluBackward); + if(maxIndex != -1) + outputArray[maxIndex] += avals[j]; + } + } + } + return 0L; + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java new file mode 100644 index 0000000..c6aaee2 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.matrix.data; + +import java.util.Arrays; +import java.util.concurrent.Callable; + +/** + * This class contains the set of operators used for performing pooling + */ +public class LibMatrixDNNPoolingHelper { + + /** + * Performs the dense maxpooling + */ + public static class DenseMaxPooling implements Callable<Long> + { + public int _rl; public int _ru; + private final ConvolutionParameters _params; + double [] inputArray; double [] outputArray; + int C; int P; int Q; int W; + public DenseMaxPooling(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = ru; + _params = params; + inputArray = params.input1.getDenseBlock(); + outputArray = params.output.getDenseBlock(); + C = params.C; P = params.P; Q = params.Q; W = params.W; + } + + @Override + public Long call() throws Exception { + final int HW = _params.H*_params.W; + final int CHW = _params.C*_params.H*_params.W; + final int CPQ = C*P*Q; + for(int n = _rl; n < _ru; n++) { + final int inOffset = n*CHW; + int out_index = n*CPQ; + for (int c = 0; c < C; c++) { + final int inOffset1 = inOffset + c*HW; + for (int p = 0; p < P; p++) { + for (int q = 0; q < Q; q++, out_index++) { + for (int h = _params.start_indexes_h[p]; h < _params.end_indexes_h[p]; h++) { + for (int w = _params.start_indexes_w[q]; w < _params.end_indexes_w[q]; w++) { + outputArray[out_index] = Math.max(outputArray[out_index], inputArray[inOffset1 + h*W + w]); + } + } + } + } + } + } + return 0L; + } + } + + /** + * Performs the sparse maxpooling + */ + public static class SparseMaxPooling implements Callable<Long> + { + public int _rl; public int _ru; + private final ConvolutionParameters _params; + int HW; + double [] outputArray; + int C; int P; int Q; int W; + public SparseMaxPooling(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = ru; + _params = params; + outputArray = params.output.getDenseBlock(); + C = params.C; P = params.P; Q = params.Q; W = params.W; + HW = _params.H*_params.W; + } + + boolean isNthRowEmpty = false; + int apos; int alen; int[] aix; double[] avals; + private void getNthSparseRow(int n) { + if( !_params.input1.sparseBlock.isEmpty(n) ) { + apos = _params.input1.sparseBlock.pos(n); + alen = _params.input1.sparseBlock.size(n); + aix = _params.input1.sparseBlock.indexes(n); + avals = _params.input1.sparseBlock.values(n); + isNthRowEmpty = false; + } + else + isNthRowEmpty = true; + } + int fromIndex = -1; // as per C + int toIndex = -1; // as per C + private int setSearchIndex(int from, int searchVal) { + for(int j = from; j < apos+alen; j++) { + if(aix[j] > searchVal) + return Math.max(from, j-1); + } + return apos+alen; + } + private double getValue(int col) { + if( !isNthRowEmpty ) { + int index = Arrays.binarySearch(aix, fromIndex, toIndex, col); + return index > 0 ? avals[index] : 0; + } + return 0; + } + + @Override + public Long call() throws Exception { + final int CPQ = C*P*Q; + for(int n = _rl; n < _ru; n++) { + getNthSparseRow(n); + int out_index = n*CPQ; + for (int c = 0; c < C; c++) { + // This allows for binary search in getValue to be more efficient + fromIndex = setSearchIndex(apos, c*HW); + toIndex = Math.min(apos+alen, setSearchIndex(fromIndex, (c+1)*HW)); + for (int p = 0; p < P; p++) { + for (int q = 0; q < Q; q++, out_index++) { + for (int h = _params.start_indexes_h[p]; h < _params.end_indexes_h[p]; h++) { + for (int w = _params.start_indexes_w[q]; w < _params.end_indexes_w[q]; w++) { + outputArray[out_index] = Math.max(outputArray[out_index], getValue(c*HW + h*W + w)); + } + } + } + } + } + } + return 0L; + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java new file mode 100644 index 0000000..c003756 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.matrix.data; + +import java.util.Arrays; + +/** + * This class contains the different implementation of rotate180 operation + */ +public class LibMatrixDNNRotate180Helper { + + static interface Rotate180Worker { + public void execute(int inputN, int outputN); + public static Rotate180Worker getWorker(MatrixBlock input, double [] outputArray, ConvolutionParameters params, boolean zeroOutSparseOutput) { + if(!input.isInSparseFormat()) + return new DenseRotate180Worker(input, outputArray, params); + else + return new SparseRotate180Worker(input, outputArray, params, zeroOutSparseOutput); + } + } + + /** + * Performing dense rotate180 (general case) + */ + static class DenseRotate180Worker implements Rotate180Worker { + + double [] inputArray; double [] outputArray; + ConvolutionParameters params; + public DenseRotate180Worker(MatrixBlock input, double [] outputArray, ConvolutionParameters params) { + this.outputArray = outputArray; + this.params = params; + inputArray = input.getDenseBlock(); + if(inputArray == null || outputArray == null) + throw new RuntimeException("Incorrect usage: empty inputs"); + } + + @Override + public void execute(int inputN, int outputN) { + int outputOffset = outputN*params.K*params.P*params.Q; + for (int k = 0; k < params.K; k++) { + for (int p = 0; p < params.P; p++) { + for (int q = 0; q < params.Q; q++) { + outputArray[outputOffset + p*params.Q*params.K + q*params.K + k] = + inputArray[inputN*params.K*params.P*params.Q + k*params.P*params.Q + p*params.Q + q]; + } + } + } + } + } + + /** + * Performing rotate180 when input is sparse (general case) + */ + static class SparseRotate180Worker implements Rotate180Worker { + + double [] outputArray; MatrixBlock input; + ConvolutionParameters params; boolean zeroOutSparseOutput; + public SparseRotate180Worker(MatrixBlock input, double [] outputArray, ConvolutionParameters params, boolean zeroOutSparseOutput) { + this.outputArray = outputArray; + this.params = params; + this.zeroOutSparseOutput = zeroOutSparseOutput; + this.input = input; + if(outputArray == null) + throw new RuntimeException("Incorrect usage: empty inputs"); + } + + @Override + public void execute(int inputN, int outputN) { + if(zeroOutSparseOutput) + Arrays.fill(outputArray, 0); + + int outputOffset = outputN*params.K*params.P*params.Q; + if(!input.isEmptyBlock()) { + if( !input.sparseBlock.isEmpty(inputN) ) { + int [] tensorIndexes = new int[3]; + int apos = input.sparseBlock.pos(inputN); + int alen = input.sparseBlock.size(inputN); + int[] aix = input.sparseBlock.indexes(inputN); + double[] avals = input.sparseBlock.values(inputN); + for(int j = apos; j < apos+alen; j++) { + LibMatrixDNNHelper.computeTensorIndexes(aix[j], tensorIndexes, params.P, params.Q); + int k = tensorIndexes[0]; + int p = tensorIndexes[1]; + int q = tensorIndexes[2]; + outputArray[outputOffset + p*params.Q*params.K + q*params.K + k] = avals[j]; + } + } + } + } + } +}