This is an automated email from the ASF dual-hosted git repository.

arnabp20 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 4eb1db5  [SYSTEMDS-3284] Avoid binary search for equi-width binning 
apply
4eb1db5 is described below

commit 4eb1db5b59e954a133271ae30b4e17e3cfc303f0
Author: arnabp <[email protected]>
AuthorDate: Mon Jan 31 19:56:16 2022 +0100

    [SYSTEMDS-3284] Avoid binary search for equi-width binning apply
    
    This patch replaces the binary search with basic derivation
    for finding the right bin for a value. This change yields
    10% speed-up for a single-threaded binning of 10 columns
    with 100K bins per column.
---
 .../runtime/transform/encode/ColumnEncoderBin.java | 28 ++++++++++++++++++----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java 
b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
index d6e79c4..fdc1ad6 100644
--- 
a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
+++ 
b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
@@ -106,6 +106,7 @@ public class ColumnEncoderBin extends ColumnEncoder {
 
        protected double getCode(CacheBlock in, int row){
                // find the right bucket for a single row
+               double bin = 0;
                if( _binMins.length == 0 || _binMaxs.length == 0 ) {
                        LOG.warn("ColumnEncoderBin: applyValue without bucket 
boundaries, assign 1");
                        return 1; //robustness in case of missing bins
@@ -114,15 +115,24 @@ public class ColumnEncoderBin extends ColumnEncoder {
                double inVal = in.getDoubleNaN(row, _colID - 1);
                if (Double.isNaN(inVal) || inVal < _binMins[0] || inVal > 
_binMaxs[_binMaxs.length-1])
                        return Double.NaN;
-               int ix = Arrays.binarySearch(_binMaxs, inVal);
-               return ((ix < 0) ? Math.abs(ix + 1) : ix) + 1;
+               if (_binMethod == BinMethod.EQUI_HEIGHT) {
+                       int ix = Arrays.binarySearch(_binMaxs, inVal);
+                       bin = ((ix < 0) ? Math.abs(ix + 1) : ix) + 1;
+               }
+               if (_binMethod == BinMethod.EQUI_WIDTH) {
+                       //TODO: Skip computing bin boundaries for equi-width
+                       double binWidth = (_binMaxs[_binMaxs.length - 1] - 
_binMins[0]) / _numBin;
+                       double code = Math.ceil((inVal - _binMins[0]) / 
binWidth);
+                       bin = (code == 0) ? code + 1 : code;
+               }
+               return bin;
        }
        
        @Override
        protected double[] getCodeCol(CacheBlock in, int startInd, int blkSize) 
{
                // find the right bucket for a block of rows
                int endInd = getEndIndex(in.getNumRows(), startInd, blkSize);
-               double codes[] = new double[endInd-startInd];
+               double[] codes = new double[endInd-startInd];
                for (int i=startInd; i<endInd; i++) {
                        if (_binMins.length == 0 || _binMaxs.length == 0) {
                                LOG.warn("ColumnEncoderBin: applyValue without 
bucket boundaries, assign 1");
@@ -134,8 +144,16 @@ public class ColumnEncoderBin extends ColumnEncoder {
                                codes[i-startInd] = Double.NaN;
                                continue;
                        }
-                       int ix = Arrays.binarySearch(_binMaxs, inVal);
-                       codes[i-startInd] = ((ix < 0) ? Math.abs(ix + 1) : ix) 
+ 1;
+                       if (_binMethod == BinMethod.EQUI_HEIGHT) {
+                               int ix = Arrays.binarySearch(_binMaxs, inVal);
+                               codes[i-startInd] = ((ix < 0) ? Math.abs(ix + 
1) : ix) + 1;
+                       }
+                       if (_binMethod == BinMethod.EQUI_WIDTH) {
+                               //TODO: Skip computing bin boundaries for 
equi-width
+                               double binWidth = (_binMaxs[_binMaxs.length - 
1] - _binMins[0]) / _numBin;
+                               double bin = Math.ceil((inVal - _binMins[0]) / 
binWidth);
+                               codes[i - startInd] = bin == 0 ? bin + 1 : bin;
+                       }
                }
                return codes;
        }

Reply via email to