This is an automated email from the ASF dual-hosted git repository.
baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new d5182a7164 [SYSTEMDS-3485] Precompile detect type patterns
d5182a7164 is described below
commit d5182a7164ce99df0a0bc46d3a5ea8fbfadc9fdd
Author: baunsgaard <[email protected]>
AuthorDate: Fri Jan 6 01:46:08 2023 +0100
[SYSTEMDS-3485] Precompile detect type patterns
This commit changes the detect type to precompile the regex expressions.
The contained changes optimize the end-to-end sequence of
read - detectSchema - applySchema - write from 129.36 to 46.9 sec
on the first 10 million lines of criteo on the su1 node, through these optimizations:
- Precompile
- Shallow serialize
- CSV reading with reuse cache arrays
Closes #1760
---
.gitignore | 2 +
.../apache/sysds/conf/ConfigurationManager.java | 4 +
src/main/java/org/apache/sysds/hops/UnaryOp.java | 1 -
.../controlprogram/caching/FrameObject.java | 41 ++--
.../sysds/runtime/frame/data/FrameBlock.java | 98 +++++----
.../sysds/runtime/frame/data/columns/Array.java | 12 +-
.../runtime/frame/data/columns/BitSetArray.java | 20 +-
.../runtime/frame/data/columns/BooleanArray.java | 28 ++-
.../runtime/frame/data/columns/DoubleArray.java | 12 +-
.../runtime/frame/data/columns/FloatArray.java | 13 +-
.../runtime/frame/data/columns/IntegerArray.java | 20 +-
.../runtime/frame/data/columns/LongArray.java | 16 +-
.../runtime/frame/data/columns/StringArray.java | 147 ++++++++++----
.../runtime/frame/data/iterators/RowIterator.java | 5 +
.../runtime/frame/data/lib/FrameLibAppend.java | 1 -
.../frame/data/lib/FrameLibApplySchema.java | 7 +
.../sysds/runtime/frame/data/lib/FrameUtil.java | 133 +++++++++---
.../cp/BinaryFrameFrameCPInstruction.java | 4 +-
.../instructions/cp/VariableCPInstruction.java | 1 -
.../spark/ComputationSPInstruction.java | 2 -
.../spark/utils/FrameRDDConverterUtils.java | 1 -
.../sysds/runtime/io/FrameReaderTextCSV.java | 90 ++++++---
.../runtime/io/FrameReaderTextCSVParallel.java | 56 +++---
.../org/apache/sysds/runtime/io/FrameWriter.java | 7 +-
.../apache/sysds/runtime/io/IOUtilFunctions.java | 204 ++++++++++---------
.../runtime/transform/encode/EncoderFactory.java | 1 -
.../org/apache/sysds/utils/MemoryEstimates.java | 19 +-
.../frame/{FrameAppendTest.java => FrameTest.java} | 18 +-
.../sysds/test/component/frame/FrameUtilTest.java | 127 ++++++++++++
.../component/frame/array/CustomArrayTests.java | 12 +-
.../component/frame/array/FrameArrayTests.java | 4 +-
.../test/component/misc/IOUtilFunctionsTest.java | 222 +++++++++++++++++++++
.../test/functions/codegen/CellwiseTmplTest.java | 2 +-
.../functions/codegen/DAGCellwiseTmplTest.java | 2 +-
.../test/functions/codegen/MiscPatternTest.java | 2 +-
.../test/functions/codegen/MultiAggTmplTest.java | 2 +-
.../test/functions/codegen/OuterProdTmplTest.java | 2 +-
.../test/functions/codegen/RowAggTmplTest.java | 2 +-
.../functions/codegen/RowConv2DOperationsTest.java | 2 +-
.../functions/codegen/RowVectorComparisonTest.java | 2 +-
.../functions/codegen/SumProductChainTest.java | 2 +-
.../FederatedLineageTraceReuseTest.java | 7 +
.../multitenant/FederatedReuseReadTest.java | 4 +-
.../sysds/test/functions/frame/FrameMapTest.java | 15 +-
.../transform/TransformCSVFrameEncodeReadTest.java | 16 +-
.../TransformFrameEncodeDecodeTokenTest.java | 5 +-
46 files changed, 1030 insertions(+), 363 deletions(-)
diff --git a/.gitignore b/.gitignore
index 89df46df01..3328942ceb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -112,3 +112,5 @@ scripts/perftest/temp
scripts/perftest/fed/results
scripts/perftest/fed/scratch_space
scripts/perftest/fed/temp
+
+src/test/scripts/functions/iogen/*.raw
\ No newline at end of file
diff --git a/src/main/java/org/apache/sysds/conf/ConfigurationManager.java
b/src/main/java/org/apache/sysds/conf/ConfigurationManager.java
index 18bd83e959..a2855e0eed 100644
--- a/src/main/java/org/apache/sysds/conf/ConfigurationManager.java
+++ b/src/main/java/org/apache/sysds/conf/ConfigurationManager.java
@@ -262,6 +262,10 @@ public class ConfigurationManager{
|| OptimizerUtils.MAX_PARALLELIZE_ORDER);
}
+ public static boolean isParallelIOEnabled(){
+ return getDMLConfig().getBooleanValue(DMLConfig.CP_PARALLEL_IO);
+ }
+
public static boolean isBroadcastEnabled() {
return
(getDMLConfig().getBooleanValue(DMLConfig.ASYNC_SPARK_BROADCAST)
|| OptimizerUtils.ASYNC_BROADCAST_SPARK);
diff --git a/src/main/java/org/apache/sysds/hops/UnaryOp.java
b/src/main/java/org/apache/sysds/hops/UnaryOp.java
index 97b3b4096d..b0fb1c24f8 100644
--- a/src/main/java/org/apache/sysds/hops/UnaryOp.java
+++ b/src/main/java/org/apache/sysds/hops/UnaryOp.java
@@ -551,7 +551,6 @@ public class UnaryOp extends MultiThreadedHop
setDim2(1);
}
else if(_op == OpOp1.TYPEOF || _op == OpOp1.DETECTSCHEMA || _op
== OpOp1.COLNAMES) {
- //TODO these three builtins should rather be moved to
unary aggregates
setDim1(1);
setDim2(input.getDim2());
}
diff --git
a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java
b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java
index 9d5c1af84f..cf745af50a 100644
---
a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java
+++
b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java
@@ -185,32 +185,27 @@ public class FrameObject extends CacheableData<FrameBlock>
}
@Override
- protected FrameBlock readBlobFromHDFS(String fname, long[] dims)
- throws IOException
- {
+ protected FrameBlock readBlobFromHDFS(String fname, long[] dims) throws
IOException {
long clen = dims[1];
MetaDataFormat iimd = (MetaDataFormat) _metaData;
DataCharacteristics dc = iimd.getDataCharacteristics();
-
- //handle missing schema if necessary
- ValueType[] lschema = (_schema!=null) ? _schema :
- UtilFunctions.nCopies(clen>=1 ? (int)clen : 1,
ValueType.STRING);
-
- //read the frame block
- FrameBlock data = null;
- try {
- data = isFederated() ? acquireReadAndRelease() :
-
FrameReaderFactory.createFrameReader(iimd.getFileFormat(),
getFileFormatProperties())
- .readFrameFromHDFS(fname, lschema,
dc.getRows(), dc.getCols());
- }
- catch( DMLRuntimeException ex ) {
- throw new IOException(ex);
- }
-
- //sanity check correct output
- if( data == null )
- throw new IOException("Unable to load frame from file:
"+fname);
-
+
+ // handle missing schema if necessary
+ ValueType[] lschema = (_schema != null) ? _schema :
UtilFunctions.nCopies(clen >= 1 ? (int) clen : 1,
+ ValueType.STRING);
+
+ // read the frame block
+ FrameBlock data = isFederated() ? acquireReadAndRelease() :
FrameReaderFactory
+ .createFrameReader(iimd.getFileFormat(),
getFileFormatProperties())
+ .readFrameFromHDFS(fname, lschema, dc.getRows(),
dc.getCols());
+
+ if(iimd.getFileFormat() == FileFormat.CSV)
+ _metaData = _metaData instanceof MetaDataFormat ? new
MetaDataFormat(data.getDataCharacteristics(),
+ iimd.getFileFormat()) : new
MetaData(data.getDataCharacteristics());
+
+ // sanity check correct output
+ if(data == null)
+ throw new IOException("Unable to load frame from file:
" + fname);
return data;
}
diff --git a/src/main/java/org/apache/sysds/runtime/frame/data/FrameBlock.java
b/src/main/java/org/apache/sysds/runtime/frame/data/FrameBlock.java
index 4bf3495946..5e62723ed2 100644
--- a/src/main/java/org/apache/sysds/runtime/frame/data/FrameBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/frame/data/FrameBlock.java
@@ -33,6 +33,8 @@ import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
import java.util.concurrent.ThreadLocalRandom;
import java.util.function.Function;
import java.util.function.IntFunction;
@@ -46,9 +48,11 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.api.DMLException;
import org.apache.sysds.common.Types.ValueType;
+import org.apache.sysds.conf.ConfigurationManager;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.codegen.CodegenUtils;
import org.apache.sysds.runtime.controlprogram.caching.CacheBlock;
+import
org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysds.runtime.controlprogram.parfor.util.IDSequence;
import org.apache.sysds.runtime.frame.data.columns.Array;
import org.apache.sysds.runtime.frame.data.columns.ArrayFactory;
@@ -71,6 +75,7 @@ import
org.apache.sysds.runtime.matrix.operators.BinaryOperator;
import org.apache.sysds.runtime.meta.DataCharacteristics;
import org.apache.sysds.runtime.meta.MatrixCharacteristics;
import org.apache.sysds.runtime.transform.encode.ColumnEncoderRecode;
+import org.apache.sysds.runtime.util.CommonThreadPool;
import org.apache.sysds.runtime.util.DMVUtils;
import org.apache.sysds.runtime.util.DataConverter;
import org.apache.sysds.runtime.util.EMAUtils;
@@ -90,9 +95,6 @@ public class FrameBlock implements CacheBlock<FrameBlock>,
Externalizable {
/** internal configuration */
private static final boolean REUSE_RECODE_MAPS = true;
- /** The number of rows of the FrameBlock */
- private int _numRows = -1;
-
/** The schema of the data frame as an ordered list of value types */
private ValueType[] _schema = null;
@@ -109,7 +111,6 @@ public class FrameBlock implements CacheBlock<FrameBlock>,
Externalizable {
private long _msize = -1;
public FrameBlock() {
- _numRows = 0;
}
/**
@@ -157,7 +158,6 @@ public class FrameBlock implements CacheBlock<FrameBlock>,
Externalizable {
}
public FrameBlock(ValueType[] schema, String[] names, String[][] data) {
- _numRows = 0; // maintained on append
_schema = schema;
_colnames = names;
ensureAllocateMeta();
@@ -167,7 +167,6 @@ public class FrameBlock implements CacheBlock<FrameBlock>,
Externalizable {
}
public FrameBlock(ValueType[] schema, String[] colNames,
ColumnMetadata[] meta, Array<?>[] data) {
- _numRows = data[0].size();
_schema = schema;
_colnames = colNames;
_colmeta = meta;
@@ -181,7 +180,7 @@ public class FrameBlock implements CacheBlock<FrameBlock>,
Externalizable {
*/
@Override
public int getNumRows() {
- return _numRows;
+ return (_coldata == null || _coldata.length == 0 || _coldata[0]
== null) ? 0 : _coldata[0].size();
}
@Override
@@ -210,10 +209,6 @@ public class FrameBlock implements CacheBlock<FrameBlock>,
Externalizable {
return s;
}
- public void setNumRows(int numRows) {
- _numRows = numRows;
- }
-
/**
* Get the number of columns of the frame block, that is the number of
columns defined in the schema.
*
@@ -341,15 +336,15 @@ public class FrameBlock implements
CacheBlock<FrameBlock>, Externalizable {
*/
public void ensureAllocatedColumns(int numRows) {
_msize = -1;
-
+ final int nRow = getNumRows();
// allocate column meta data if necessary
ensureAllocateMeta();
// early abort if already allocated
if(_coldata != null && _schema.length == _coldata.length) {
// handle special case that to few rows allocated
- if(_numRows < numRows) {
+ if(nRow < numRows) {
String[] tmp = new String[getNumColumns()];
- int len = numRows - _numRows;
+ int len = numRows - nRow;
// TODO: Add append N function.
for(int i = 0; i < len; i++)
appendRow(tmp);
@@ -361,8 +356,6 @@ public class FrameBlock implements CacheBlock<FrameBlock>,
Externalizable {
_coldata = new Array[_schema.length];
for(int j = 0; j < _schema.length; j++)
_coldata[j] = ArrayFactory.allocate(_schema[j],
numRows);
-
- _numRows = numRows;
}
private void ensureAllocateMeta() {
@@ -381,9 +374,9 @@ public class FrameBlock implements CacheBlock<FrameBlock>,
Externalizable {
* @param newLen number of rows to compare with existing number of rows
*/
public void ensureColumnCompatibility(int newLen) {
- if(_coldata != null && _coldata.length > 0 && _numRows !=
newLen)
- throw new RuntimeException("Mismatch in number of rows:
" + newLen + " (expected: " + _numRows + ")");
- _numRows = newLen;
+ final int nRow = getNumRows();
+ if(_coldata != null && _coldata.length > 0 && nRow != newLen)
+ throw new RuntimeException("Mismatch in number of rows:
" + newLen + " (expected: " + nRow + ")");
}
public static String[] createColNames(int size) {
@@ -447,6 +440,18 @@ public class FrameBlock implements CacheBlock<FrameBlock>,
Externalizable {
_coldata[c].set(r, UtilFunctions.objectToObject(_schema[c],
val));
}
+ /**
+ * Sets the value in position (r,c), to the input string value, and at
the individual arrays, convert to correct
+ * type.
+ *
+ * @param r row index
+ * @param c column index
+ * @param val value to set at specified position
+ */
+ public void set(int r, int c, String val) {
+ _coldata[c].set(r, val);
+ }
+
public void reset(int nrow, boolean clearMeta) {
if(clearMeta) {
_schema = null;
@@ -477,7 +482,6 @@ public class FrameBlock implements CacheBlock<FrameBlock>,
Externalizable {
ensureAllocatedColumns(0);
for(int j = 0; j < row.length; j++)
_coldata[j].append(row[j]);
- _numRows++;
_msize = -1;
}
@@ -490,7 +494,6 @@ public class FrameBlock implements CacheBlock<FrameBlock>,
Externalizable {
ensureAllocatedColumns(0);
for(int j = 0; j < row.length; j++)
_coldata[j].append(row[j]);
- _numRows++;
}
/**
@@ -596,9 +599,7 @@ public class FrameBlock implements CacheBlock<FrameBlock>,
Externalizable {
// modification
_schema = empty ? tmpSchema : (ValueType[])
ArrayUtils.addAll(_schema, tmpSchema);
_coldata = empty ? tmpData : (Array[])
ArrayUtils.addAll(_coldata, tmpData);
- _numRows = cols[0].length;
_msize = -1;
-
}
public static FrameBlock convertToFrameBlock(MatrixBlock mb,
ValueType[] schema, int k) {
@@ -670,7 +671,7 @@ public class FrameBlock implements CacheBlock<FrameBlock>,
Externalizable {
@Override
public void readFields(DataInput in) throws IOException {
// read head (rows, cols)
- _numRows = in.readInt();
+ final int nRow = in.readInt();
final int numCols = in.readInt();
final boolean isDefaultMeta = in.readBoolean();
// allocate schema/meta data arrays
@@ -691,7 +692,7 @@ public class FrameBlock implements CacheBlock<FrameBlock>,
Externalizable {
_colmeta[j] = new ColumnMetadata(); // must be
allocated.
}
if(type >= 0) // if in allocated column data then read
it
- _coldata[j] = ArrayFactory.read(in, _numRows);
+ _coldata[j] = ArrayFactory.read(in, nRow);
}
_msize = -1;
}
@@ -718,7 +719,7 @@ public class FrameBlock implements CacheBlock<FrameBlock>,
Externalizable {
return _msize;
// frame block header
- long size = 16 + 4 + 4; // object, num rows, msize
+ double size = 16 + 4; // object, msize
final int clen = getNumColumns();
@@ -735,14 +736,33 @@ public class FrameBlock implements
CacheBlock<FrameBlock>, Externalizable {
// data array
size += MemoryEstimates.objectArrayCost(clen);
+
if(_coldata == null) // not allocated estimate if allocated
for(int j = 0; j < clen; j++)
- ArrayFactory.getInMemorySize(_schema[j],
_numRows);
- else // allocated
- for(Array<?> aa : _coldata)
- size += aa.getInMemorySize();
-
- return _msize = size;
+ size +=
ArrayFactory.getInMemorySize(_schema[j], getNumRows());
+ else {// allocated
+ if(getNumRows() > 1000 && getNumColumns() > 10 &&
ConfigurationManager.isParallelIOEnabled()) {
+ final ExecutorService pool =
CommonThreadPool.get(InfrastructureAnalyzer.getLocalParallelism());
+ try {
+ size += pool.submit(() -> {
+ return
Arrays.stream(_coldata).parallel() // parallel columns
+ .map(x ->
x.getInMemorySize()).reduce(0L, Long::sum);
+ }).get();
+ pool.shutdown();
+ }
+ catch(InterruptedException | ExecutionException
e) {
+ pool.shutdown();
+ for(Array<?> aa : _coldata)
+ size += aa.getInMemorySize();
+ }
+ }
+ else {
+ for(Array<?> aa : _coldata)
+ size += aa.getInMemorySize();
+ }
+ }
+
+ return _msize = (long) size;
}
@Override
@@ -776,7 +796,7 @@ public class FrameBlock implements CacheBlock<FrameBlock>,
Externalizable {
// is always dense but strings have large array overhead per
cell
boolean ret = true;
for(int j = 0; j < _schema.length && ret; j++)
- ret &= (_schema[j] != ValueType.STRING);
+ ret &= _coldata[j].isShallowSerialize();
return ret;
}
@@ -896,7 +916,7 @@ public class FrameBlock implements CacheBlock<FrameBlock>,
Externalizable {
// allocate output frame (incl deep copy schema)
if(ret == null)
ret = new FrameBlock();
- ret._numRows = _numRows;
+
ret._schema = _schema.clone();
ret._colnames = (_colnames != null) ? _colnames.clone() : null;
ret._colmeta = _colmeta.clone();
@@ -973,12 +993,11 @@ public class FrameBlock implements
CacheBlock<FrameBlock>, Externalizable {
if(!isDefNames)
ret._colnames[j - cl] = getColumnName(j);
}
- ret._numRows = ru - rl + 1;
if(ret._coldata == null)
ret._coldata = new Array[numCols];
// fast-path: shallow copy column indexing
- if(ret._numRows == _numRows && !deep) {
+ if(ret.getNumRows() == getNumRows() && !deep) {
// this shallow copy does not only avoid an array copy,
but
// also allows for bi-directional reuses of recodemaps
for(int j = cl; j <= cu; j++)
@@ -1041,7 +1060,6 @@ public class FrameBlock implements
CacheBlock<FrameBlock>, Externalizable {
}
public void copy(FrameBlock src) {
- _numRows = src._numRows;
int nCol = src.getNumColumns();
_schema = Arrays.copyOf(src._schema, nCol);
if(src._colnames != null)
@@ -1388,7 +1406,7 @@ public class FrameBlock implements
CacheBlock<FrameBlock>, Externalizable {
// appropriate one by using the
similarity score
if(probColList.size() > 1) {
for(int w : probColList) {
- int randomIndex =
ThreadLocalRandom.current().nextInt(0, _numRows - 1);
+ int randomIndex =
ThreadLocalRandom.current().nextInt(0, getNumRows() - 1);
Object value =
this.get(randomIndex, w);
if(value != null) {
dataValue2 =
value.toString();
@@ -1575,7 +1593,7 @@ public class FrameBlock implements
CacheBlock<FrameBlock>, Externalizable {
if(select == null) {
Object[] row = new Object[getNumColumns()];
- for(int i = 0; i < _numRows; i++) {
+ for(int i = 0; i < getNumRows(); i++) {
boolean isEmpty = true;
for(int j = 0; j < getNumColumns(); j++) {
row[j] = _coldata[j].get(i);
@@ -1648,7 +1666,7 @@ public class FrameBlock implements
CacheBlock<FrameBlock>, Externalizable {
}
if(ret.getNumColumns() == 0 && emptyReturn) {
- String[][] arr = new String[_numRows][];
+ String[][] arr = new String[getNumRows()][];
Arrays.fill(arr, new String[] {null});
return new FrameBlock(new ValueType[]
{ValueType.STRING}, arr);
}
diff --git
a/src/main/java/org/apache/sysds/runtime/frame/data/columns/Array.java
b/src/main/java/org/apache/sysds/runtime/frame/data/columns/Array.java
index 0bdd8453fc..c757e3c7e8 100644
--- a/src/main/java/org/apache/sysds/runtime/frame/data/columns/Array.java
+++ b/src/main/java/org/apache/sysds/runtime/frame/data/columns/Array.java
@@ -121,6 +121,14 @@ public abstract class Array<T> implements Writable {
*/
public abstract void set(int index, double value);
+ /**
+ * Set index to the given value of the string parsed.
+ *
+ * @param index The index to set
+ * @param value The value to assign
+ */
+ public abstract void set(int index, String value);
+
/**
* Set range to given arrays value
*
@@ -215,7 +223,7 @@ public abstract class Array<T> implements Writable {
/**
* Slice out the sub range and return new array with the specified type.
*
- * If the conversion fails fallback to normal slice
+ * If the conversion fails fallback to normal slice.
*
* @param rl row start
* @param ru row end (not included)
@@ -390,6 +398,8 @@ public abstract class Array<T> implements Writable {
*/
public abstract void fill(T val);
+ public abstract boolean isShallowSerialize();
+
/**
* Overwrite of the java internal clone function for arrays, return a
clone of underlying data that is mutable, (not
* immutable data.)
diff --git
a/src/main/java/org/apache/sysds/runtime/frame/data/columns/BitSetArray.java
b/src/main/java/org/apache/sysds/runtime/frame/data/columns/BitSetArray.java
index 69fa75a6e0..a648b8b3fd 100644
--- a/src/main/java/org/apache/sysds/runtime/frame/data/columns/BitSetArray.java
+++ b/src/main/java/org/apache/sysds/runtime/frame/data/columns/BitSetArray.java
@@ -47,6 +47,8 @@ public class BitSetArray extends Array<Boolean> {
public BitSetArray(boolean[] data) {
super(data.length);
+ if(data.length == 11)
+ throw new DMLRuntimeException("Invalid length");
_data = new long[_size / 64 + 1];
// set bits.
for(int i = 0; i < data.length; i++)
@@ -101,6 +103,11 @@ public class BitSetArray extends Array<Boolean> {
set(index, value == 1.0);
}
+ @Override
+ public void set(int index, String value) {
+ set(index, BooleanArray.parseBoolean(value));
+ }
+
@Override
public void set(int rl, int ru, Array<Boolean> value) {
set(rl, ru, value, 0);
@@ -241,7 +248,7 @@ public class BitSetArray extends Array<Boolean> {
final int endSize = this._size + other.size();
final BitSetArray retBS = new BitSetArray(endSize);
retBS.set(0, this._size - 1, this, 0);
- retBS.set(this._size, endSize - 1, other,0);
+ retBS.set(this._size, endSize - 1, other, 0);
return retBS;
}
@@ -286,8 +293,8 @@ public class BitSetArray extends Array<Boolean> {
}
private BitSetArray sliceSimple(int rl, int ru) {
- final boolean[] ret = new boolean[ru - rl + 1];
- for(int i = rl, off = 0; i <= ru; i++, off++)
+ final boolean[] ret = new boolean[ru - rl];
+ for(int i = rl, off = 0; i < ru; i++, off++)
ret[off] = get(i);
return new BitSetArray(ret);
}
@@ -380,7 +387,7 @@ public class BitSetArray extends Array<Boolean> {
@Override
protected Array<Boolean> changeTypeBitSet() {
- return clone();
+ return this;
}
@Override
@@ -460,6 +467,11 @@ public class BitSetArray extends Array<Boolean> {
return get(i) ? 1.0 : 0.0;
}
+ @Override
+ public boolean isShallowSerialize() {
+ return true;
+ }
+
public static String longToBits(long l) {
String bits = Long.toBinaryString(l);
StringBuilder sb = new StringBuilder(64);
diff --git
a/src/main/java/org/apache/sysds/runtime/frame/data/columns/BooleanArray.java
b/src/main/java/org/apache/sysds/runtime/frame/data/columns/BooleanArray.java
index f8a5f61ebc..d14cc23428 100644
---
a/src/main/java/org/apache/sysds/runtime/frame/data/columns/BooleanArray.java
+++
b/src/main/java/org/apache/sysds/runtime/frame/data/columns/BooleanArray.java
@@ -63,6 +63,11 @@ public class BooleanArray extends Array<Boolean> {
_data[index] = value == 1.0;
}
+ @Override
+ public void set(int index, String value) {
+ set(index, BooleanArray.parseBoolean(value));
+ }
+
@Override
public void set(int rl, int ru, Array<Boolean> value) {
set(rl, ru, value, 0);
@@ -222,7 +227,7 @@ public class BooleanArray extends Array<Boolean> {
@Override
protected Array<Boolean> changeTypeBoolean() {
- return clone();
+ return this;
}
@Override
@@ -278,14 +283,8 @@ public class BooleanArray extends Array<Boolean> {
}
@Override
- public String toString() {
- StringBuilder sb = new StringBuilder(_data.length * 5 + 2);
- sb.append(super.toString() + ":[");
- for(int i = 0; i < _size - 1; i++)
- sb.append((_data[i] ? 1 : 0) + ",");
- sb.append(_data[_size - 1] ? 1 : 0);
- sb.append("]");
- return sb.toString();
+ public boolean isShallowSerialize() {
+ return true;
}
@Override
@@ -296,4 +295,15 @@ public class BooleanArray extends Array<Boolean> {
protected static boolean parseBoolean(String value) {
return value != null && (Boolean.parseBoolean(value) ||
value.equals("1") || value.equals("1.0"));
}
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder(_data.length * 5 + 2);
+ sb.append(super.toString() + ":[");
+ for(int i = 0; i < _size - 1; i++)
+ sb.append((_data[i] ? 1 : 0) + ",");
+ sb.append(_data[_size - 1] ? 1 : 0);
+ sb.append("]");
+ return sb.toString();
+ }
}
diff --git
a/src/main/java/org/apache/sysds/runtime/frame/data/columns/DoubleArray.java
b/src/main/java/org/apache/sysds/runtime/frame/data/columns/DoubleArray.java
index a7026bb747..6c37faf8b3 100644
--- a/src/main/java/org/apache/sysds/runtime/frame/data/columns/DoubleArray.java
+++ b/src/main/java/org/apache/sysds/runtime/frame/data/columns/DoubleArray.java
@@ -61,6 +61,11 @@ public class DoubleArray extends Array<Double> {
_data[index] = value;
}
+ @Override
+ public void set(int index, String value){
+ set(index, parseDouble(value) );
+ }
+
@Override
public void set(int rl, int ru, Array<Double> value) {
set(rl, ru, value, 0);
@@ -256,7 +261,7 @@ public class DoubleArray extends Array<Double> {
@Override
protected Array<Double> changeTypeDouble() {
- return clone();
+ return this;
}
@Override
@@ -320,6 +325,11 @@ public class DoubleArray extends Array<Double> {
return Double.parseDouble(value);
}
+ @Override
+ public boolean isShallowSerialize() {
+ return true;
+ }
+
@Override
public String toString() {
StringBuilder sb = new StringBuilder(_data.length * 5 + 2);
diff --git
a/src/main/java/org/apache/sysds/runtime/frame/data/columns/FloatArray.java
b/src/main/java/org/apache/sysds/runtime/frame/data/columns/FloatArray.java
index 90072a86a9..5c740310e8 100644
--- a/src/main/java/org/apache/sysds/runtime/frame/data/columns/FloatArray.java
+++ b/src/main/java/org/apache/sysds/runtime/frame/data/columns/FloatArray.java
@@ -60,6 +60,12 @@ public class FloatArray extends Array<Float> {
_data[index] = (float) value;
}
+
+ @Override
+ public void set(int index, String value){
+ set(index,parseFloat(value) );
+ }
+
@Override
public void set(int rl, int ru, Array<Float> value) {
set(rl, ru, value, 0);
@@ -240,7 +246,7 @@ public class FloatArray extends Array<Float> {
@Override
protected Array<Float> changeTypeFloat() {
- return clone();
+ return this;
}
@Override
@@ -274,6 +280,11 @@ public class FloatArray extends Array<Float> {
return Float.parseFloat(value);
}
+ @Override
+ public boolean isShallowSerialize() {
+ return true;
+ }
+
@Override
public String toString() {
StringBuilder sb = new StringBuilder(_data.length * 5 + 2);
diff --git
a/src/main/java/org/apache/sysds/runtime/frame/data/columns/IntegerArray.java
b/src/main/java/org/apache/sysds/runtime/frame/data/columns/IntegerArray.java
index 71c6a17051..a94987ea15 100644
---
a/src/main/java/org/apache/sysds/runtime/frame/data/columns/IntegerArray.java
+++
b/src/main/java/org/apache/sysds/runtime/frame/data/columns/IntegerArray.java
@@ -60,6 +60,11 @@ public class IntegerArray extends Array<Integer> {
_data[index] = (int) value;
}
+ @Override
+ public void set(int index, String value) {
+ set(index, parseInt(value));
+ }
+
@Override
public void set(int rl, int ru, Array<Integer> value) {
set(rl, ru, value, 0);
@@ -113,7 +118,7 @@ public class IntegerArray extends Array<Integer> {
final int endSize = this._size + other.size();
final int[] ret = new int[endSize];
System.arraycopy(_data, 0, ret, 0, this._size);
- System.arraycopy((int[])other.get(), 0, ret, this._size,
other.size());
+ System.arraycopy((int[]) other.get(), 0, ret, this._size,
other.size());
return new IntegerArray(ret);
}
@@ -144,7 +149,7 @@ public class IntegerArray extends Array<Integer> {
public void reset(int size) {
if(_data.length < size || _data.length > 2 * size)
_data = new int[size];
- else
+ else
for(int i = 0; i < size; i++)
_data[i] = 0;
_size = size;
@@ -228,7 +233,7 @@ public class IntegerArray extends Array<Integer> {
@Override
protected Array<Integer> changeTypeInteger() {
- return clone();
+ return this;
}
@Override
@@ -259,7 +264,7 @@ public class IntegerArray extends Array<Integer> {
}
@Override
- public double getAsDouble(int i){
+ public double getAsDouble(int i) {
return _data[i];
}
@@ -270,7 +275,7 @@ public class IntegerArray extends Array<Integer> {
return Integer.parseInt(s);
}
catch(NumberFormatException e) {
- if(s.contains(".")){
+ if(s.contains(".")) {
return Integer.parseInt(s.split("\\.")[0]);
}
else
@@ -278,6 +283,11 @@ public class IntegerArray extends Array<Integer> {
}
}
+ @Override
+ public boolean isShallowSerialize() {
+ return true;
+ }
+
@Override
public String toString() {
StringBuilder sb = new StringBuilder(_data.length * 5 + 2);
diff --git
a/src/main/java/org/apache/sysds/runtime/frame/data/columns/LongArray.java
b/src/main/java/org/apache/sysds/runtime/frame/data/columns/LongArray.java
index a705b3a095..4dbdc91d32 100644
--- a/src/main/java/org/apache/sysds/runtime/frame/data/columns/LongArray.java
+++ b/src/main/java/org/apache/sysds/runtime/frame/data/columns/LongArray.java
@@ -60,6 +60,11 @@ public class LongArray extends Array<Long> {
_data[index] = (long) value;
}
+ @Override
+ public void set(int index, String value) {
+ set(index, parseLong(value));
+ }
+
@Override
public void set(int rl, int ru, Array<Long> value) {
set(rl, ru, value, 0);
@@ -112,7 +117,7 @@ public class LongArray extends Array<Long> {
final int endSize = this._size + other.size();
final long[] ret = new long[endSize];
System.arraycopy(_data, 0, ret, 0, this._size);
- System.arraycopy((long[])other.get(), 0, ret, this._size,
other.size());
+ System.arraycopy((long[]) other.get(), 0, ret, this._size,
other.size());
return new LongArray(ret);
}
@@ -144,7 +149,7 @@ public class LongArray extends Array<Long> {
public void reset(int size) {
if(_data.length < size || _data.length > 2 * size)
_data = new long[size];
- else
+ else
for(int i = 0; i < size; i++)
_data[i] = 0;
_size = size;
@@ -239,7 +244,7 @@ public class LongArray extends Array<Long> {
@Override
protected Array<Long> changeTypeLong() {
- return clone();
+ return this;
}
@Override
@@ -280,6 +285,11 @@ public class LongArray extends Array<Long> {
}
}
+ @Override
+ public boolean isShallowSerialize() {
+ return true;
+ }
+
@Override
public String toString() {
StringBuilder sb = new StringBuilder(_data.length * 5 + 2);
diff --git
a/src/main/java/org/apache/sysds/runtime/frame/data/columns/StringArray.java
b/src/main/java/org/apache/sysds/runtime/frame/data/columns/StringArray.java
index 908509b9f7..896225b6f2 100644
--- a/src/main/java/org/apache/sysds/runtime/frame/data/columns/StringArray.java
+++ b/src/main/java/org/apache/sysds/runtime/frame/data/columns/StringArray.java
@@ -37,11 +37,18 @@ import org.apache.sysds.utils.MemoryEstimates;
public class StringArray extends Array<String> {
private String[] _data;
+ private long materializedSize = -1L;
+
public StringArray(String[] data) {
super(data.length);
_data = data;
}
+ private StringArray(String[] data, long materializedSize) {
+ this(data);
+ this.materializedSize = materializedSize;
+ }
+
public String[] get() {
return _data;
}
@@ -54,16 +61,19 @@ public class StringArray extends Array<String> {
@Override
public void set(int index, String value) {
_data[index] = value;
+ materializedSize = -1;
}
@Override
public void set(int index, double value) {
_data[index] = Double.toString(value);
+ materializedSize = -1;
}
@Override
public void set(int rl, int ru, Array<String> value) {
set(rl, ru, value, 0);
+ materializedSize = -1;
}
@Override
@@ -75,11 +85,13 @@ public class StringArray extends Array<String> {
else
_data[i] = null;
}
+ materializedSize = -1;
}
@Override
public void set(int rl, int ru, Array<String> value, int rlSrc) {
System.arraycopy(((StringArray) value)._data, rlSrc, _data, rl,
ru - rl + 1);
+ materializedSize = -1;
}
@Override
@@ -88,6 +100,7 @@ public class StringArray extends Array<String> {
for(int i = rl; i <= ru; i++)
if(data2[i] != null)
_data[i] = data2[i];
+ materializedSize = -1;
}
@Override
@@ -97,6 +110,7 @@ public class StringArray extends Array<String> {
if(v != null)
_data[i] = v.toString();
}
+ materializedSize = -1;
}
@Override
@@ -104,6 +118,7 @@ public class StringArray extends Array<String> {
if(_data.length <= _size)
_data = Arrays.copyOf(_data, newSize());
_data[_size++] = value;
+ materializedSize = -1;
}
@Override
@@ -112,13 +127,14 @@ public class StringArray extends Array<String> {
final String[] ret = new String[endSize];
System.arraycopy(_data, 0, ret, 0, this._size);
System.arraycopy((String[]) other.get(), 0, ret, this._size,
other.size());
- ;
+ materializedSize = -1;
return new StringArray(ret);
}
@Override
public void write(DataOutput out) throws IOException {
out.writeByte(FrameArrayType.STRING.ordinal());
+ out.writeLong(getInMemorySize());
for(int i = 0; i < _size; i++)
out.writeUTF((_data[i] != null) ? _data[i] : "");
}
@@ -126,6 +142,7 @@ public class StringArray extends Array<String> {
@Override
public void readFields(DataInput in) throws IOException {
_size = _data.length;
+ materializedSize = in.readLong();
for(int i = 0; i < _size; i++) {
String tmp = in.readUTF();
_data[i] = (!tmp.isEmpty()) ? tmp : null;
@@ -134,7 +151,7 @@ public class StringArray extends Array<String> {
@Override
public Array<String> clone() {
- return new StringArray(Arrays.copyOf(_data, _size));
+ return new StringArray(Arrays.copyOf(_data, _size),
materializedSize);
}
@Override
@@ -150,6 +167,7 @@ public class StringArray extends Array<String> {
for(int i = 0; i < size; i++)
_data[i] = null;
_size = size;
+ materializedSize = -1;
}
@Override
@@ -175,50 +193,56 @@ public class StringArray extends Array<String> {
return ValueType.STRING;
}
+ private static final ValueType getHighest(ValueType state, ValueType c)
{
+ switch(state) {
+ case FP32:
+ switch(c) {
+ case FP64:
+ return c;
+ default:
+ }
+ break;
+ case INT64:
+ switch(c) {
+ case FP64:
+ case FP32:
+ return c;
+ default:
+ }
+ break;
+ case INT32:
+ switch(c) {
+ case FP64:
+ case FP32:
+ case INT64:
+ return c;
+ default:
+ }
+ break;
+ case BOOLEAN:
+ switch(c) {
+ case FP64:
+ case FP32:
+ case INT64:
+ case INT32:
+ return c;
+ default:
+ }
+ break;
+ default:
+
+ }
+ return state;
+ }
+
@Override
public ValueType analyzeValueType() {
ValueType state = FrameUtil.isType(_data[0]);
for(int i = 1; i < _size; i++) {
- ValueType c = FrameUtil.isType(_data[i]);
+ ValueType c = FrameUtil.isType(_data[i], state);
if(c == ValueType.STRING || c == ValueType.UNKNOWN)
return ValueType.STRING;
- switch(state) {
- case FP32:
- switch(c) {
- case FP64:
- state = c;
- default:
- }
- break;
- case INT64:
- switch(c) {
- case FP64:
- case FP32:
- state = c;
- default:
- }
- break;
- case INT32:
- switch(c) {
- case FP64:
- case FP32:
- case INT64:
- state = c;
- default:
- }
- break;
- case BOOLEAN:
- switch(c) {
- case FP64:
- case FP32:
- case INT64:
- case INT32:
- state = c;
- default:
- }
- break;
- default:
- }
+ state = getHighest(state, c);
}
return state;
}
@@ -230,9 +254,12 @@ public class StringArray extends Array<String> {
@Override
public long getInMemorySize() {
+ if(materializedSize != -1)
+ return materializedSize;
long size = super.getInMemorySize(); // object header + object
reference
size += MemoryEstimates.stringArrayCost(_data);
- return size;
+ size += 8; // estimated size cache
+ return materializedSize = size;
}
@Override
@@ -253,8 +280,10 @@ public class StringArray extends Array<String> {
// detect type of transform.
if(_data[0].toLowerCase().equals("true") ||
_data[0].toLowerCase().equals("false"))
return changeTypeBooleanStandard();
- if(_data[0].equals("0") || _data[0].equals("1"))
+ else if(_data[0].equals("0") || _data[0].equals("1"))
return changeTypeBooleanNumeric();
+ else if(_data[0].toLowerCase().equals("t") ||
_data[0].toLowerCase().equals("f"))
+ return changeTypeBooleanCharacter();
else
throw new DMLRuntimeException("Not supported type of
Strings to change to Booleans value: " + _data[0]);
}
@@ -281,6 +310,31 @@ public class StringArray extends Array<String> {
return new BooleanArray(ret);
}
+ protected Array<Boolean> changeTypeBooleanCharacter() {
+ if(size() > ArrayFactory.bitSetSwitchPoint)
+ return changeTypeBooleanCharacterBitSet();
+ else
+ return changeTypeBooleanCharacterArray();
+ }
+
+ protected Array<Boolean> changeTypeBooleanCharacterBitSet() {
+ BitSet ret = new BitSet(size());
+ for(int i = 0; i < size(); i++)
+ ret.set(i, isTrueCharacter(_data[i].charAt(0)));
+ return new BitSetArray(ret, size());
+ }
+
+ protected Array<Boolean> changeTypeBooleanCharacterArray() {
+ boolean[] ret = new boolean[size()];
+ for(int i = 0; i < size(); i++)
+ ret[i] = isTrueCharacter(_data[i].charAt(0));
+ return new BooleanArray(ret);
+ }
+
+ private boolean isTrueCharacter(char a) {
+ return a == 'T' || a == 't';
+ }
+
protected Array<Boolean> changeTypeBooleanNumeric() {
if(size() > ArrayFactory.bitSetSwitchPoint)
return changeTypeBooleanNumericBitSet();
@@ -368,7 +422,7 @@ public class StringArray extends Array<String> {
@Override
public Array<String> changeTypeString() {
- return clone();
+ return this;
}
@Override
@@ -390,6 +444,7 @@ public class StringArray extends Array<String> {
public void fill(String value) {
for(int i = 0; i < _size; i++)
_data[i] = value;
+ materializedSize = -1;
}
@Override
@@ -397,6 +452,12 @@ public class StringArray extends Array<String> {
return Double.parseDouble(_data[i]);
}
+ @Override
+ public boolean isShallowSerialize() {
+ long s = getInMemorySize();
+ return s / _size < 100;
+ }
+
@Override
public String toString() {
StringBuilder sb = new StringBuilder(_data.length * 5 + 2);
diff --git
a/src/main/java/org/apache/sysds/runtime/frame/data/iterators/RowIterator.java
b/src/main/java/org/apache/sysds/runtime/frame/data/iterators/RowIterator.java
index d62a5d6cd9..68266fd9fa 100644
---
a/src/main/java/org/apache/sysds/runtime/frame/data/iterators/RowIterator.java
+++
b/src/main/java/org/apache/sysds/runtime/frame/data/iterators/RowIterator.java
@@ -21,10 +21,15 @@ package org.apache.sysds.runtime.frame.data.iterators;
import java.util.Iterator;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.sysds.runtime.frame.data.FrameBlock;
import org.apache.sysds.runtime.util.UtilFunctions;
public abstract class RowIterator<T> implements Iterator<T[]> {
+
+ protected static final Log LOG =
LogFactory.getLog(RowIterator.class.getName());
+
protected final FrameBlock _fb;
protected final int[] _cols;
protected final T[] _curRow;
diff --git
a/src/main/java/org/apache/sysds/runtime/frame/data/lib/FrameLibAppend.java
b/src/main/java/org/apache/sysds/runtime/frame/data/lib/FrameLibAppend.java
index 54475b2640..6fd22ca4bc 100644
--- a/src/main/java/org/apache/sysds/runtime/frame/data/lib/FrameLibAppend.java
+++ b/src/main/java/org/apache/sysds/runtime/frame/data/lib/FrameLibAppend.java
@@ -83,7 +83,6 @@ public class FrameLibAppend {
else if(b.getNumRows() == 0)
return a;
- // ret._schema = a.getSchema().clone();
String[] retColNames = (a.getColumnNames(false) != null) ?
a.getColumnNames().clone() : null;
ColumnMetadata[] retColMeta = new
ColumnMetadata[a.getNumColumns()];
for(int j = 0; j < nCol; j++)
diff --git
a/src/main/java/org/apache/sysds/runtime/frame/data/lib/FrameLibApplySchema.java
b/src/main/java/org/apache/sysds/runtime/frame/data/lib/FrameLibApplySchema.java
index 6fc4515b53..136afc9589 100644
---
a/src/main/java/org/apache/sysds/runtime/frame/data/lib/FrameLibApplySchema.java
+++
b/src/main/java/org/apache/sysds/runtime/frame/data/lib/FrameLibApplySchema.java
@@ -71,6 +71,13 @@ public class FrameLibApplySchema {
applySingleThread();
else
applyMultiThread();
+
+ boolean same = true;
+ for(int i = 0; i < columnsIn.length && same; i++)
+ same = columnsIn[i] == columnsOut[i];
+
+ if(same)
+ return this.fb;
final String[] colNames = fb.getColumnNames(false);
final ColumnMetadata[] meta = fb.getColumnMetadata();
diff --git
a/src/main/java/org/apache/sysds/runtime/frame/data/lib/FrameUtil.java
b/src/main/java/org/apache/sysds/runtime/frame/data/lib/FrameUtil.java
index b60f8a4fad..97130c5999 100644
--- a/src/main/java/org/apache/sysds/runtime/frame/data/lib/FrameUtil.java
+++ b/src/main/java/org/apache/sysds/runtime/frame/data/lib/FrameUtil.java
@@ -21,6 +21,7 @@ package org.apache.sysds.runtime.frame.data.lib;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -34,6 +35,12 @@ import org.apache.sysds.runtime.util.UtilFunctions;
public interface FrameUtil {
public static final Log LOG =
LogFactory.getLog(FrameUtil.class.getName());
+ public static final Pattern booleanPattern =
Pattern.compile("([tT]((rue)|(RUE))?|[fF]((alse)|(ALSE))?|0\\.0+|1\\.0+|0|1)");
+ public static final Pattern integerFloatPattern =
Pattern.compile("[-+]?\\d+(\\.0+)?");
+ public static final Pattern floatPattern =
Pattern.compile("[-+]?[0-9]*\\.?[0-9]*([eE][-+]?[0-9]+)?");
+
+ public static final Pattern dotSplitPattern = Pattern.compile("\\.");
+
public static Array<?>[] add(Array<?>[] ar, Array<?> e) {
if(ar == null)
return new Array[] {e};
@@ -43,59 +50,125 @@ public interface FrameUtil {
return ret;
}
- public static ValueType isType(String val) {
- if (val == null)
- return ValueType.UNKNOWN;
- val = val.trim().toLowerCase().replaceAll("\"", "");
- if(val.matches("(true|false|t|f|0|1|0\\.0+|1\\.0+)"))
+ private static ValueType isBooleanType(final String val, int len) {
+ if(val.length() <= 16 && booleanPattern.matcher(val).matches())
return ValueType.BOOLEAN;
- else if(val.matches("[-+]?\\d+\\.0+")) { // 11.00000000
1313241.0
- long maxValue = Long.parseLong(val.split("\\.")[0]);
- if((maxValue >= Integer.MIN_VALUE) && (maxValue <=
Integer.MAX_VALUE))
- return ValueType.INT32;
- else
- return ValueType.INT64;
+ return null;
+ }
+
+ private static boolean simpleIntMatch(final String val, final int len) {
+ for(int i = 0; i < len; i++) {
+ final char c = val.charAt(i);
+ if(c < '0' || c > '9')
+ return false;
}
- else if(val.matches("[-+]?\\d+")) { // 1 3 6 192 14152131
- long maxValue = Long.parseLong(val);
- if((maxValue >= Integer.MIN_VALUE) && (maxValue <=
Integer.MAX_VALUE))
- return ValueType.INT32;
- else
- return ValueType.INT64;
+ return true;
+ }
+
+ private static ValueType intType(final long value) {
+ if(value >= Integer.MIN_VALUE && value <= Integer.MAX_VALUE)
+ return ValueType.INT32;
+ else
+ return ValueType.INT64;
+ }
+
+ private static ValueType isIntType(final String val, final int len) {
+ if(len <= 22) {
+ if(simpleIntMatch(val, len)){
+ if(len < 8)
+ return ValueType.INT32;
+ return intType(Long.parseLong(val));
+ }
+ else if(integerFloatPattern.matcher(val).matches()) {
+ // 11.00000000 1313241.0 13 2415 -22
+ final long value =
Long.parseLong(val.contains(".") ? dotSplitPattern.split(val)[0] : val);
+ return intType(value);
+ }
}
- else if(val.matches("[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"))
{
- // parse float, and make back to string if equivalent
use float.
+ return null;
+ }
+
+ private static ValueType isFloatType(final String val, final int len) {
+
+ if(len <= 25 && floatPattern.matcher(val).matches()) {
+ // return isFloatType(v);
+ // parse float and double,
+ // and make back to string if equivalent use float.
+ // This is expensive but accurate.
float f = Float.parseFloat(val);
- if(val.equals(Float.toString(f)))
+ double d = Double.parseDouble(val);
+ String v1 = Float.toString(f);
+ String v2 = Double.toString(d);
+ if(v1.equals(v2))
return ValueType.FP32;
else
return ValueType.FP64;
}
else if(val.equals("infinity") || val.equals("-infinity") ||
val.equals("nan"))
return ValueType.FP64;
- else
- return ValueType.STRING;
+ return null;
+ }
+
+ /**
+ * Get the value type of a string, subject to a minimum type.
+ *
+ * This enables skipping the check for boolean type if floats are already
found.
+ *
+ * @param val The string value to check
+ * @param minType the minimum type to check.
+ * @return ValueType subject to restriction
+ */
+ public static ValueType isType(String val, ValueType minType) {
+ if(val == null)
+ return ValueType.UNKNOWN;
+ final int len = val.length();
+ if(len == 0)
+ return ValueType.UNKNOWN;
+ ValueType r = null;
+ switch(minType) {
+ case UNKNOWN:
+ case BOOLEAN:
+ if(isBooleanType(val, len) != null)
+ return ValueType.BOOLEAN;
+ case UINT8:
+ case INT32:
+ case INT64:
+ r = isIntType(val, len);
+ if(r != null)
+ return r;
+ case FP32:
+ case FP64:
+ r = isFloatType(val, len);
+ if(r != null)
+ return r;
+ case STRING:
+ return ValueType.STRING;
+ default:
+ throw new DMLRuntimeException("");
+ }
+ }
+
+ public static ValueType isType(String val) {
+ return isType(val, ValueType.BOOLEAN);
}
- public static ValueType isType(double val){
+ public static ValueType isType(double val) {
if(val == 1.0d || val == 0.0d)
return ValueType.BOOLEAN;
- else if((long)(val) == val){
- if((int)val == val)
+ else if((long) (val) == val) {
+ if((int) val == val)
return ValueType.INT32;
else
return ValueType.INT64;
}
- else if((double)((float) val) == val )
+ else if((double) ((float) val) == val)
// Detecting FP32 could use some extra work.
- return ValueType.FP32;
-
-
+ return ValueType.FP32;
+
return ValueType.FP64;
}
-
public static FrameBlock mergeSchema(FrameBlock temp1, FrameBlock
temp2) {
String[] rowTemp1 =
IteratorFactory.getStringRowIterator(temp1).next();
String[] rowTemp2 =
IteratorFactory.getStringRowIterator(temp2).next();
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/cp/BinaryFrameFrameCPInstruction.java
b/src/main/java/org/apache/sysds/runtime/instructions/cp/BinaryFrameFrameCPInstruction.java
index fd749f8430..f7a8ccb54f 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/cp/BinaryFrameFrameCPInstruction.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/cp/BinaryFrameFrameCPInstruction.java
@@ -62,7 +62,9 @@ public class BinaryFrameFrameCPInstruction extends
BinaryCPInstruction {
ValueType[] schema = new
ValueType[inBlock2.getNumColumns()];
for(int i=0; i<inBlock2.getNumColumns(); i++)
schema[i] =
ValueType.fromExternalString(inBlock2.get(0, i).toString());
- ec.setFrameOutput(output.getName(),
inBlock1.applySchema(schema, ((MultiThreadedOperator)_optr).getNumThreads()));
+ final int k =
((MultiThreadedOperator)_optr).getNumThreads();
+ final FrameBlock out = inBlock1.applySchema(schema, k);
+ ec.setFrameOutput(output.getName(), out);
}
else {
// Execute binary operations
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java
b/src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java
index 6026f6692a..4c55c7207b 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java
@@ -59,7 +59,6 @@ import org.apache.sysds.runtime.lineage.LineageItem;
import org.apache.sysds.runtime.lineage.LineageItemUtils;
import org.apache.sysds.runtime.lineage.LineageTraceable;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
-import org.apache.sysds.runtime.matrix.operators.MultiThreadedOperator;
import org.apache.sysds.runtime.meta.DataCharacteristics;
import org.apache.sysds.runtime.meta.MatrixCharacteristics;
import org.apache.sysds.runtime.meta.MetaData;
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/spark/ComputationSPInstruction.java
b/src/main/java/org/apache/sysds/runtime/instructions/spark/ComputationSPInstruction.java
index 465ce35820..90312e0731 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/spark/ComputationSPInstruction.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/spark/ComputationSPInstruction.java
@@ -41,8 +41,6 @@ import org.apache.sysds.runtime.matrix.data.MatrixIndexes;
import org.apache.sysds.runtime.matrix.operators.Operator;
import org.apache.sysds.runtime.meta.DataCharacteristics;
-import java.util.Map;
-
public abstract class ComputationSPInstruction extends SPInstruction
implements LineageTraceable {
public CPOperand output;
public CPOperand input1, input2, input3;
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
b/src/main/java/org/apache/sysds/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
index d26b2fcd9e..51d47af4f3 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
@@ -620,7 +620,6 @@ public class FrameRDDConverterUtils
//preallocate physical columns (to avoid re-allocations)
fb.ensureAllocatedColumns(_maxRowsPerBlock);
fb.reset(0, false); //reset data but keep schema
- fb.setNumRows(0); //reset num rows to allow for append
//handle meta data
if( _colnames != null )
diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextCSV.java
b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextCSV.java
index e87c1bf1d9..00a6fac5ec 100644
--- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextCSV.java
+++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextCSV.java
@@ -41,7 +41,6 @@ import org.apache.sysds.runtime.frame.data.FrameBlock;
import org.apache.sysds.runtime.matrix.data.Pair;
import org.apache.sysds.runtime.transform.TfUtils;
import org.apache.sysds.runtime.util.InputStreamInputFormat;
-import org.apache.sysds.runtime.util.UtilFunctions;
/**
* Single-threaded frame text csv reader.
@@ -117,6 +116,10 @@ public class FrameReaderTextCSV extends FrameReader {
protected final int readCSVFrameFromInputSplit(InputSplit split,
InputFormat<LongWritable, Text> informat,
JobConf job, FrameBlock dest, ValueType[] schema, String[]
names, long rlen, long clen, int rl, boolean first)
throws IOException {
+
+ if( rl > rlen) // in case this method is called wrongly
+ throw new DMLRuntimeException("Invalid offset");
+ // return (int) rlen;
boolean hasHeader = _props.hasHeader();
boolean isFill = _props.isFill();
double dfillValue = _props.getFillValue();
@@ -129,7 +132,7 @@ public class FrameReaderTextCSV extends FrameReader {
LongWritable key = new LongWritable();
Text value = new Text();
int row = rl;
- int col = -1;
+ final int nCol = dest.getNumColumns();
// handle header if existing
if(first && hasHeader) {
@@ -138,46 +141,54 @@ public class FrameReaderTextCSV extends FrameReader {
}
// Read the data
- boolean emptyValuesFound = false;
try {
+ String[] parts = null; // cache array for line reading.
while(reader.next(key, value)) // foreach line
{
- String cellStr = value.toString().trim();
- emptyValuesFound = false;
- col = 0;
- String[] parts =
IOUtilFunctions.splitCSV(cellStr, delim);
+ String cellStr = value.toString();
+ boolean emptyValuesFound = false;
+ cellStr = IOUtilFunctions.trim(cellStr);
+ parts = IOUtilFunctions.splitCSV(cellStr,
delim, parts);
+ // sanity checks for empty values and number of
columns
+ final boolean mtdP =
parts[0].equals(TfUtils.TXMTD_MVPREFIX);
+ final boolean mtdx =
parts[0].equals(TfUtils.TXMTD_NDPREFIX);
// parse frame meta data (missing values / num
distinct)
- if(parts[0].equals(TfUtils.TXMTD_MVPREFIX) ||
parts[0].equals(TfUtils.TXMTD_NDPREFIX)) {
-
if(parts[0].equals(TfUtils.TXMTD_MVPREFIX))
+ if(mtdP || mtdx) {
+ parts =
IOUtilFunctions.splitCSV(cellStr, delim);
+ if(parts.length != dest.getNumColumns()
+ 1){
+ LOG.warn("Invalid metadata ");
+ parts = null;
+ continue;
+ }
+ if(mtdP)
for(int j = 0; j <
dest.getNumColumns(); j++)
dest.getColumnMetadata(j).setMvValue(parts[j + 1]);
- else
if(parts[0].equals(TfUtils.TXMTD_NDPREFIX))
+ else if(mtdx)
for(int j = 0; j <
dest.getNumColumns(); j++)
dest.getColumnMetadata(j).setNumDistinct(Long.parseLong(parts[j + 1]));
+ parts = null;
continue;
}
- for(String part : parts) // foreach cell
- {
- part = part.trim();
+ for(int col = 0; col < nCol; col++) {
+ String part =
IOUtilFunctions.trim(parts[col]);
if(part.isEmpty() || (naValues != null
&& naValues.contains(part))) {
if(isFill && dfillValue != 0)
- dest.set(row, col,
UtilFunctions.stringToObject(schema[col], sfillValue));
+ dest.set(row, col,
sfillValue);
emptyValuesFound = true;
}
- else {
- dest.set(row, col,
UtilFunctions.stringToObject(schema[col], part));
- }
- col++;
+ else
+ dest.set(row, col, part);
}
-
- // sanity checks for empty values and number of
columns
IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, isFill,
emptyValuesFound);
IOUtilFunctions.checkAndRaiseErrorCSVNumColumns("", cellStr, parts, clen);
row++;
}
}
+ catch(Exception e){
+ throw new DMLRuntimeException("Failed parsing string:
\"" + value +"\"", e);
+ }
finally {
IOUtilFunctions.closeSilently(reader);
}
@@ -198,19 +209,8 @@ public class FrameReaderTextCSV extends FrameReader {
int nrow = 0;
for(int i = 0; i < splits.length; i++) {
RecordReader<LongWritable, Text> reader =
informat.getRecordReader(splits[i], job, Reporter.NULL);
- LongWritable key = new LongWritable();
- Text value = new Text();
-
try {
- // ignore header of first split
- if(i == 0 && _props.hasHeader())
- reader.next(key, value);
-
- // count remaining number of rows, ignore meta
data
- while(reader.next(key, value)) {
- String val = value.toString();
- nrow +=
(val.startsWith(TfUtils.TXMTD_MVPREFIX) ||
val.startsWith(TfUtils.TXMTD_NDPREFIX)) ? 0 : 1;
- }
+ nrow = countLinesInReader(reader, ncol , i == 0
&& _props.hasHeader());
}
finally {
IOUtilFunctions.closeSilently(reader);
@@ -218,4 +218,30 @@ public class FrameReaderTextCSV extends FrameReader {
}
return new Pair<>(nrow, ncol);
}
+
+ protected static int countLinesInReader(RecordReader<LongWritable,
Text> reader, int ncol, boolean header)
+ throws IOException {
+ final LongWritable key = new LongWritable();
+ final Text value = new Text();
+
+ int nrow = 0;
+ try {
+ // ignore header of first split
+ if(header)
+ reader.next(key, value);
+ while(reader.next(key, value)) {
+ // note: the metadata can be located at any row
when using spark.
+ nrow += containsMetaTag(value.toString()) ? 0 :
1;
+ }
+ return nrow;
+ }
+ finally {
+ IOUtilFunctions.closeSilently(reader);
+ }
+ }
+
+ private final static boolean containsMetaTag(String val) {
+ return val.startsWith(TfUtils.TXMTD_MVPREFIX)//
+ || val.startsWith(TfUtils.TXMTD_NDPREFIX);
+ }
}
diff --git
a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextCSVParallel.java
b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextCSVParallel.java
index ec80b7d256..9d58f13825 100644
--- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextCSVParallel.java
+++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextCSVParallel.java
@@ -37,9 +37,9 @@ import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.sysds.common.Types.ValueType;
import org.apache.sysds.hops.OptimizerUtils;
+import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.frame.data.FrameBlock;
import org.apache.sysds.runtime.matrix.data.Pair;
-import org.apache.sysds.runtime.transform.TfUtils;
import org.apache.sysds.runtime.util.CommonThreadPool;
/**
@@ -66,19 +66,19 @@ public class FrameReaderTextCSVParallel extends
FrameReaderTextCSV
try
{
- ExecutorService pool = CommonThreadPool.get(
- Math.min(numThreads, splits.length));
+ // get a pool of numThreads threads from the common thread
pool.
+ ExecutorService pool = CommonThreadPool.get(numThreads);
//compute num rows per split
ArrayList<CountRowsTask> tasks = new ArrayList<>();
for( int i=0; i<splits.length; i++ )
tasks.add(new CountRowsTask(splits[i],
informat, job, _props.hasHeader(), i==0));
- List<Future<Long>> cret = pool.invokeAll(tasks);
+ List<Future<Integer>> cret = pool.invokeAll(tasks);
//compute row offset per split via cumsum on row counts
long offset = 0;
List<Long> offsets = new ArrayList<>();
- for( Future<Long> count : cret ) {
+ for( Future<Integer> count : cret ) {
offsets.add(offset);
offset += count.get();
}
@@ -108,14 +108,14 @@ public class FrameReaderTextCSVParallel extends
FrameReaderTextCSV
int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat,
job, _props.getDelim());
//compute number of rows
- int nrow = 0;
+ long nrow = 0;
ExecutorService pool = CommonThreadPool.get(numThreads);
try {
ArrayList<CountRowsTask> tasks = new ArrayList<>();
for( int i=0; i<splits.length; i++ )
tasks.add(new CountRowsTask(splits[i],
informat, job, _props.hasHeader(), i==0));
- List<Future<Long>> cret = pool.invokeAll(tasks);
- for( Future<Long> count : cret )
+ List<Future<Integer>> cret = pool.invokeAll(tasks);
+ for( Future<Integer> count : cret )
nrow += count.get().intValue();
}
catch (Exception e) {
@@ -124,10 +124,13 @@ public class FrameReaderTextCSVParallel extends
FrameReaderTextCSV
finally {
pool.shutdown();
}
- return new Pair<>(nrow, ncol);
+ if(nrow > (long)Integer.MAX_VALUE)
+ throw new DMLRuntimeException("invalid read with over
Integer number of rows");
+
+ return new Pair<>((int)nrow, ncol);
}
- private static class CountRowsTask implements Callable<Long>
+ private static class CountRowsTask implements Callable<Integer>
{
private InputSplit _split = null;
private TextInputFormat _informat = null;
@@ -144,29 +147,17 @@ public class FrameReaderTextCSVParallel extends
FrameReaderTextCSV
}
@Override
- public Long call()
+ public Integer call()
throws Exception
{
- RecordReader<LongWritable, Text> reader =
_informat.getRecordReader(_split, _job, Reporter.NULL);
- LongWritable key = new LongWritable();
- Text value = new Text();
- long nrows = 0;
-
- // count rows from the first non-header row
+ RecordReader<LongWritable, Text> reader =
_informat.getRecordReader(_split, _job, Reporter.NULL);
try {
- if ( _firstSplit && _hasHeader )
- reader.next(key, value);
- while ( reader.next(key, value) ) {
- String val = value.toString();
- nrows += (
val.startsWith(TfUtils.TXMTD_MVPREFIX)
- ||
val.startsWith(TfUtils.TXMTD_NDPREFIX)) ? 0 : 1;
- }
+ // it is assumed that if we count the number of rows in
parallel, there are at least two columns.
+ return countLinesInReader(reader, 2 ,
_firstSplit && _hasHeader);
}
finally {
IOUtilFunctions.closeSilently(reader);
}
-
- return nrows;
}
}
@@ -195,9 +186,16 @@ public class FrameReaderTextCSVParallel extends
FrameReaderTextCSV
public Object call()
throws Exception
{
- readCSVFrameFromInputSplit(_split, _informat, _job,
_dest, _dest.getSchema(),
- _dest.getColumnNames(),
_dest.getNumRows(), _dest.getNumColumns(), _offset, _isFirstSplit);
- return null;
+ try{
+
+ readCSVFrameFromInputSplit(_split, _informat,
_job, _dest, _dest.getSchema(),
+ _dest.getColumnNames(),
_dest.getNumRows(), _dest.getNumColumns(), _offset, _isFirstSplit);
+ return null;
+ }
+ catch(Exception e){
+ e.printStackTrace();
+ throw e;
+ }
}
}
}
diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameWriter.java
b/src/main/java/org/apache/sysds/runtime/io/FrameWriter.java
index 9cec9ed6b0..02c941701c 100644
--- a/src/main/java/org/apache/sysds/runtime/io/FrameWriter.java
+++ b/src/main/java/org/apache/sysds/runtime/io/FrameWriter.java
@@ -21,6 +21,8 @@ package org.apache.sysds.runtime.io;
import java.io.IOException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.sysds.common.Types.ValueType;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.frame.data.FrameBlock;
@@ -32,9 +34,8 @@ import org.apache.sysds.runtime.frame.data.FrameBlock;
* for creating format-specific writers.
*
*/
-public abstract class FrameWriter
-{
-
+public abstract class FrameWriter {
+ protected static final Log LOG =
LogFactory.getLog(FrameWriter.class.getName());
public abstract void writeFrameToHDFS( FrameBlock src, String fname,
long rlen, long clen )
throws IOException, DMLRuntimeException;
diff --git a/src/main/java/org/apache/sysds/runtime/io/IOUtilFunctions.java
b/src/main/java/org/apache/sysds/runtime/io/IOUtilFunctions.java
index 9ad3f0aeee..b36f4497d1 100644
--- a/src/main/java/org/apache/sysds/runtime/io/IOUtilFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/io/IOUtilFunctions.java
@@ -75,7 +75,7 @@ public class IOUtilFunctions
//for empty text lines we use 0-0 despite for 1-based indexing in order
//to allow matrices with zero rows and columns (consistent with R)
public static final String EMPTY_TEXT_LINE = "0 0 0\n";
- private static final char CSV_QUOTE_CHAR = '"';
+ public static final char CSV_QUOTE_CHAR = '"';
public static final String LIBSVM_DELIM = " ";
public static final String LIBSVM_INDEX_DELIM = ":";
@@ -163,7 +163,7 @@ public class IOUtilFunctions
if( realncol != ncol ) {
throw new IOException("Invalid number of columns (" +
realncol + ", expected=" + ncol + ") "
- + "found in delimited file (" + fname +
") for line: " + line);
+ + "found in delimited file (" + fname +
") for line: " + line + "\n" + Arrays.toString(parts));
}
}
@@ -193,50 +193,111 @@ public class IOUtilFunctions
* @param delim delimiter
* @return string array of tokens
*/
- public static String[] splitCSV(String str, String delim)
- {
- // check for empty input
- if( str == null || str.isEmpty() )
- return new String[]{""};
-
- // scan string and create individual tokens
- ArrayList<String> tokens = new ArrayList<>();
- int from = 0, to = 0;
- int len = str.length();
- int dlen = delim.length();
- while( from < len ) { // for all tokens
- if( str.charAt(from) == CSV_QUOTE_CHAR
- && str.indexOf(CSV_QUOTE_CHAR, from+1) > 0 ) {
- to = str.indexOf(CSV_QUOTE_CHAR, from+1);
- // handle escaped inner quotes, e.g. "aa""a"
- while( to+1 < len &&
str.charAt(to+1)==CSV_QUOTE_CHAR )
- to = str.indexOf(CSV_QUOTE_CHAR, to+2);
// to + ""
- to += 1; // last "
- // handle remaining non-quoted characters "aa"a
- if( to<len-1 && !str.regionMatches(to, delim,
0, dlen) )
- to = str.indexOf(delim, to+1);
- }
- else if( str.regionMatches(from, delim, 0, dlen) ) {
- to = from; // empty string
- }
- else { // default: unquoted non-empty
- to = str.indexOf(delim, from+1);
- }
-
- // slice out token and advance position
- to = (to >= 0) ? to : len;
+ public static String[] splitCSV(String str, String delim){
+ if(str == null || str.isEmpty())
+ return new String[] {""};
+
+ int from = 0, to = 0;
+ final int len = str.length();
+ final int delimLen = delim.length();
+ final ArrayList<String> tokens = new ArrayList<>();
+
+ while(from < len) { // for all tokens
+ to = getTo(str, from, delim);
tokens.add(str.substring(from, to));
- from = to + delim.length();
+ from = to + delimLen;
}
-
+
// handle empty string at end
- if( from == len )
+ if(from == len)
tokens.add("");
-
- // return tokens
+
return tokens.toArray(new String[0]);
}
+ /**
+ * Splits a string by a specified delimiter into all tokens, including
empty
+ * while respecting the rules for quotes and escapes defined in RFC4180,
+ * with robustness for various special cases.
+ *
+ * @param str string to split
+ * @param delim delimiter
+ * @param cache cachedReturnArray
+ * @return string array of tokens
+ */
+ public static String[] splitCSV(String str, String delim, String[]
cache) {
+ // check for empty input
+ final boolean empty = str == null || str.isEmpty();
+ if(cache == null)
+ if(empty)
+ return new String[] {""};
+ else
+ return splitCSV(str, delim);
+ else if(empty) {
+ Arrays.fill(cache, "");
+ return cache;
+ }
+ else
+ return splitCSVNonNullWithCache(str,delim,cache);
+ }
+
+ private static String[] splitCSVNonNullWithCache(final String str,
final String delim, final String[] cache) {
+ final int len = str.length();
+ final int delimLen = delim.length();
+
+ int from = 0;
+ int id = 0;
+ while(from < len) { // for all tokens
+ final int to = getTo(str, from, delim);
+ cache[id++] =str.substring(from, to);
+ from = to + delimLen;
+ }
+
+ if(from == len)
+ cache[id] = "";
+ return cache;
+ }
+
+ private static boolean isEmptyMatch(final String str, final int from,
final String delim, final int dLen,
+ final int strLen) {
+ // like str.regionMatches(from, delim, 0, dLen), except that a delimiter
cut off by the end of str still counts as a match
+ for(int i = from, off = 0; off < dLen && i < strLen; i++, off++)
+ if(str.charAt(i) != delim.charAt(off))
+ return false;
+
+ return true;
+ }
+
+ private static int getTo(final String str, final int from, final String
delim) {
+ final int len = str.length();
+ final int dLen = delim.length();
+ final char cq = CSV_QUOTE_CHAR;
+ final int fromP1 = from + 1;
+ int to;
+
+ if(str.charAt(from) == cq && str.indexOf(cq, fromP1) > 0) {
+ to = str.indexOf(cq, fromP1);
+ // handle escaped inner quotes, e.g. "aa""a"
+ while(to + 1 < len && str.charAt(to + 1) == cq)
+ to = str.indexOf(cq, to + 2); // to + ""
+ to += 1; // last "
+ // handle remaining non-quoted characters "aa"a
+ if(to < len - 1 && !str.regionMatches(to, delim, 0,
dLen))
+ to = str.indexOf(delim, to + 1);
+ }
+ else if(isEmptyMatch(str, from, delim, dLen, len))
+ return to = from; // empty string
+ else // default: unquoted non-empty
+ to = str.indexOf(delim, fromP1);
+
+ // slice out token and advance position
+ return to >= 0 ? to : len;
+ }
+
+ public static String trim(String str) {
+ return str.trim();
+ }
+
/**
* Splits a string by a specified delimiter into all tokens, including
empty
* while respecting the rules for quotes and escapes defined in RFC4180,
@@ -253,37 +314,19 @@ public class IOUtilFunctions
// check for empty input
if( str == null || str.isEmpty() )
return new String[]{""};
+ else if (naStrings == null)
+ return splitCSV(str, delim, tokens);
// scan string and create individual tokens
- int from = 0, to = 0;
- int len = str.length();
- int dlen = delim.length();
- String curString;
+ final int len = str.length();
+ final int dLen = delim.length();
+ int from = 0;
int pos = 0;
while( from < len ) { // for all tokens
- if( str.charAt(from) == CSV_QUOTE_CHAR
- && str.indexOf(CSV_QUOTE_CHAR, from+1) > 0 ) {
- to = str.indexOf(CSV_QUOTE_CHAR, from+1);
- // handle escaped inner quotes, e.g. "aa""a"
- while( to+1 < len &&
str.charAt(to+1)==CSV_QUOTE_CHAR )
- to = str.indexOf(CSV_QUOTE_CHAR, to+2);
// to + ""
- to += 1; // last "
- // handle remaining non-quoted characters "aa"a
- if( to<len-1 && !str.regionMatches(to, delim,
0, dlen) )
- to = str.indexOf(delim, to+1);
- }
- else if( str.regionMatches(from, delim, 0, dlen) ) {
- to = from; // empty string
- }
- else { // default: unquoted non-empty
- to = str.indexOf(delim, from+1);
- }
-
- // slice out token and advance position
- to = (to >= 0) ? to : len;
- curString = str.substring(from, to);
- tokens[pos++] = naStrings!= null ?
((naStrings.contains(curString)) ? null: curString): curString;
- from = to + delim.length();
+ final int to = getTo(str, from, delim);
+ final String curString = str.substring(from, to);
+ tokens[pos++] = naStrings.contains(curString) ? null :
curString;
+ from = to + dLen;
}
// handle empty string at end
@@ -310,32 +353,13 @@ public class IOUtilFunctions
return 1;
// scan string and compute num tokens
+ final int len = str.length();
+ final int dlen = delim.length();
int numTokens = 0;
- int from = 0, to = 0;
- int len = str.length();
- int dlen = delim.length();
+ int from = 0;
while( from < len ) { // for all tokens
- if( str.charAt(from) == CSV_QUOTE_CHAR
- && str.indexOf(CSV_QUOTE_CHAR, from+1) > 0 ) {
- to = str.indexOf(CSV_QUOTE_CHAR, from+1);
- // handle escaped inner quotes, e.g. "aa""a"
- while( to+1 < len &&
str.charAt(to+1)==CSV_QUOTE_CHAR )
- to = str.indexOf(CSV_QUOTE_CHAR, to+2);
// to + ""
- to += 1; // last "
- // handle remaining non-quoted characters "aa"a
- if( to<len-1 && !str.regionMatches(to, delim,
0, dlen) )
- to = str.indexOf(delim, to+1);
- }
- else if( str.regionMatches(from, delim, 0, dlen) ) {
- to = from; // empty string
- }
- else { // default: unquoted non-empty
- to = str.indexOf(delim, from+1);
- }
-
- //increase counter and advance position
- to = (to >= 0) ? to : len;
- from = to + delim.length();
+ int to = getTo(str, from, delim);
+ from = to + dlen;
numTokens++;
}
diff --git
a/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderFactory.java
b/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderFactory.java
index f76a83e362..ca8a5d2b8b 100644
---
a/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderFactory.java
+++
b/src/main/java/org/apache/sysds/runtime/transform/encode/EncoderFactory.java
@@ -160,7 +160,6 @@ public class EncoderFactory {
HashMap<String, Integer> colPos =
getColumnPositions(colnames2);
// create temporary meta frame block w/ shallow column copy
FrameBlock meta2 = new
FrameBlock(meta.getSchema(), colnames2);
- meta2.setNumRows(meta.getNumRows());
for(int i = 0; i < colnames.length;
i++) {
if(!colPos.containsKey(colnames[i])) {
throw new
DMLRuntimeException("Column name not found in meta data: " + colnames[i]
diff --git a/src/main/java/org/apache/sysds/utils/MemoryEstimates.java
b/src/main/java/org/apache/sysds/utils/MemoryEstimates.java
index 59d86a850e..43e9fddd25 100644
--- a/src/main/java/org/apache/sysds/utils/MemoryEstimates.java
+++ b/src/main/java/org/apache/sysds/utils/MemoryEstimates.java
@@ -189,7 +189,7 @@ public class MemoryEstimates {
* @return The array memory cost
*/
public static final double stringArrayCost(String[] strings) {
- double size = 0;
+ long size = 0;
for(int i = 0; i < strings.length; i++)
size += stringCost(strings[i]);
return size;
@@ -203,8 +203,9 @@ public class MemoryEstimates {
* @param value The String to measure
* @return The size in memory.
*/
- public static double stringCost(String value) {
- return value == null ? 16 + 8 : stringCost(value.length()); //
char array
+ public static long stringCost(String value) {
+ // if null 16 object + 8 array ref
+ return value == null ? 24L : stringCost(value.length()); //
char array
}
/**
@@ -213,11 +214,13 @@ public class MemoryEstimates {
* @param length The length of the string
* @return The size in memory
*/
- public static double stringCost(int length) {
- return 16 // object
- + 4 // hash
- + 8 // array ref
- + 32 + length; // char array
+ public static long stringCost(long length) {
+ // 16 object
+ // 4 hash
+ // 8 array ref
+ // 32 char array
+ // total base : 60
+ return 60L + length * 2L; // char array
}
}
diff --git
a/src/test/java/org/apache/sysds/test/component/frame/FrameAppendTest.java
b/src/test/java/org/apache/sysds/test/component/frame/FrameTest.java
similarity index 93%
rename from
src/test/java/org/apache/sysds/test/component/frame/FrameAppendTest.java
rename to src/test/java/org/apache/sysds/test/component/frame/FrameTest.java
index b3348b0f02..5d542215b2 100644
--- a/src/test/java/org/apache/sysds/test/component/frame/FrameAppendTest.java
+++ b/src/test/java/org/apache/sysds/test/component/frame/FrameTest.java
@@ -25,10 +25,12 @@ import static org.junit.Assert.fail;
import java.util.ArrayList;
import java.util.Collection;
+import java.util.Iterator;
import org.apache.sysds.common.Types.ValueType;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.frame.data.FrameBlock;
+import org.apache.sysds.runtime.frame.data.iterators.IteratorFactory;
import org.apache.sysds.runtime.util.UtilFunctions;
import org.apache.sysds.test.TestUtils;
import org.junit.Test;
@@ -37,7 +39,7 @@ import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;
@RunWith(value = Parameterized.class)
-public class FrameAppendTest {
+public class FrameTest {
public FrameBlock f;
@Parameters
@@ -62,7 +64,7 @@ public class FrameAppendTest {
return tests;
}
- public FrameAppendTest(FrameBlock f) {
+ public FrameTest(FrameBlock f) {
this.f = f;
}
@@ -182,6 +184,18 @@ public class FrameAppendTest {
assertEquals(ff.get(r + 240, c).toString(),
f.get(r, c).toString());
}
+ @Test
+ public void testIterator() {
+
+ Iterator<Object[]> it = IteratorFactory.getObjectRowIterator(f);
+
+ for(int r = 0; r < f.getNumRows(); r++){
+ Object[] row = it.next();
+ for(int c = 0; c < f.getNumColumns(); c++)
+ assertEquals(f.get(r, c).toString(),
row[c].toString());
+ }
+ }
+
private static FrameBlock append(FrameBlock a, FrameBlock b, boolean
cBind) {
try {
return a.append(b, cBind);
diff --git
a/src/test/java/org/apache/sysds/test/component/frame/FrameUtilTest.java
b/src/test/java/org/apache/sysds/test/component/frame/FrameUtilTest.java
new file mode 100644
index 0000000000..acc4614afa
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/component/frame/FrameUtilTest.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.component.frame;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.sysds.common.Types.ValueType;
+import org.apache.sysds.runtime.frame.data.lib.FrameUtil;
+import org.junit.Test;
+
+public class FrameUtilTest {
+
+ @Test
+ public void testIsTypeMinimumFloat_1() {
+ assertEquals(ValueType.FP32, FrameUtil.isType("1",
ValueType.FP32));
+ }
+
+ @Test
+ public void testIsTypeMinimumFloat_2() {
+ assertEquals(ValueType.FP32, FrameUtil.isType("32.",
ValueType.FP32));
+ }
+
+ @Test
+ public void testIsTypeMinimumFloat_3() {
+ assertEquals(ValueType.FP32, FrameUtil.isType(".9",
ValueType.FP32));
+ }
+
+ @Test
+ public void testIsTypeMinimumFloat_4() {
+ assertEquals(ValueType.FP32, FrameUtil.isType(".9",
ValueType.FP64));
+ }
+
+ @Test
+ public void testIsTypeMinimumFloat_5() {
+ assertEquals(ValueType.FP64, FrameUtil.isType(".999999999999",
ValueType.FP64));
+ }
+
+ @Test
+ public void testIsTypeMinimumFloat_6() {
+ assertEquals(ValueType.FP64, FrameUtil.isType(".999999999999",
ValueType.FP32));
+ }
+
+ @Test
+ public void testIsTypeMinimumBoolean_1() {
+ assertEquals(ValueType.BOOLEAN, FrameUtil.isType("TRUE",
ValueType.UNKNOWN));
+ }
+
+ @Test
+ public void testIsTypeMinimumBoolean_2() {
+ assertEquals(ValueType.BOOLEAN, FrameUtil.isType("True",
ValueType.UNKNOWN));
+ }
+
+ @Test
+ public void testIsTypeMinimumBoolean_3() {
+ assertEquals(ValueType.BOOLEAN, FrameUtil.isType("true",
ValueType.UNKNOWN));
+ }
+
+ @Test
+ public void testIsTypeMinimumBoolean_4() {
+ assertEquals(ValueType.BOOLEAN, FrameUtil.isType("t",
ValueType.UNKNOWN));
+ }
+
+ @Test
+ public void testIsTypeMinimumBoolean_5() {
+ assertEquals(ValueType.BOOLEAN, FrameUtil.isType("f",
ValueType.UNKNOWN));
+ }
+
+ @Test
+ public void testIsTypeMinimumBoolean_6() {
+ assertEquals(ValueType.BOOLEAN, FrameUtil.isType("false",
ValueType.UNKNOWN));
+ }
+
+ @Test
+ public void testIsTypeMinimumBoolean_7() {
+ assertEquals(ValueType.BOOLEAN, FrameUtil.isType("False",
ValueType.UNKNOWN));
+ }
+
+ @Test
+ public void testIsTypeMinimumBoolean_8() {
+ assertEquals(ValueType.BOOLEAN, FrameUtil.isType("FALSE",
ValueType.UNKNOWN));
+ }
+
+
+ @Test
+ public void testIsTypeMinimumString_1() {
+ assertEquals(ValueType.STRING, FrameUtil.isType("FALSEE",
ValueType.UNKNOWN));
+ }
+
+
+ @Test
+ public void testIsTypeMinimumString_2() {
+ assertEquals(ValueType.STRING, FrameUtil.isType("falsse",
ValueType.UNKNOWN));
+ }
+
+ @Test
+ public void testIsTypeMinimumString_3() {
+ assertEquals(ValueType.STRING, FrameUtil.isType("agsss",
ValueType.UNKNOWN));
+ }
+
+ @Test
+ public void testIsTypeMinimumString_4() {
+ assertEquals(ValueType.STRING, FrameUtil.isType("AAGss",
ValueType.UNKNOWN));
+ }
+
+
+ @Test
+ public void testIsTypeMinimumString_5() {
+ assertEquals(ValueType.STRING, FrameUtil.isType("ttrue",
ValueType.UNKNOWN));
+ }
+}
diff --git
a/src/test/java/org/apache/sysds/test/component/frame/array/CustomArrayTests.java
b/src/test/java/org/apache/sysds/test/component/frame/array/CustomArrayTests.java
index d5276d9975..53d29c053f 100644
---
a/src/test/java/org/apache/sysds/test/component/frame/array/CustomArrayTests.java
+++
b/src/test/java/org/apache/sysds/test/component/frame/array/CustomArrayTests.java
@@ -168,7 +168,7 @@ public class CustomArrayTests {
public void analyzeValueTypeStringFP32() {
StringArray a = ArrayFactory.create(new String[] {"132",
"131.1", "-142"});
ValueType t = a.analyzeValueType();
- assertTrue(t == ValueType.FP32);
+ assertEquals(ValueType.FP32, t);
}
@Test
@@ -182,7 +182,7 @@ public class CustomArrayTests {
public void analyzeValueTypeStringFP32_string() {
StringArray a = ArrayFactory.create(new String[] {"\"132\"",
"131.1", "-142"});
ValueType t = a.analyzeValueType();
- assertTrue(t == ValueType.FP32);
+ assertEquals(ValueType.STRING, t);
}
@Test
@@ -468,22 +468,22 @@ public class CustomArrayTests {
}
@Test
- public void LongToBits_0(){
+ public void LongToBits_0() {
assertEquals(BitSetArray.longToBits(0),
"0000000000000000000000000000000000000000000000000000000000000000");
}
@Test
- public void LongToBits_2(){
+ public void LongToBits_2() {
assertEquals(BitSetArray.longToBits(2),
"0000000000000000000000000000000000000000000000000000000000000010");
}
@Test
- public void LongToBits_5(){
+ public void LongToBits_5() {
assertEquals(BitSetArray.longToBits(5),
"0000000000000000000000000000000000000000000000000000000000000101");
}
@Test
- public void LongToBits_minusOne(){
+ public void LongToBits_minusOne() {
assertEquals(BitSetArray.longToBits(-1),
"1111111111111111111111111111111111111111111111111111111111111111");
}
diff --git
a/src/test/java/org/apache/sysds/test/component/frame/array/FrameArrayTests.java
b/src/test/java/org/apache/sysds/test/component/frame/array/FrameArrayTests.java
index 22b8066355..080061302f 100644
---
a/src/test/java/org/apache/sysds/test/component/frame/array/FrameArrayTests.java
+++
b/src/test/java/org/apache/sysds/test/component/frame/array/FrameArrayTests.java
@@ -442,7 +442,7 @@ public class FrameArrayTests {
case STRING:
String vs = "1324L";
- ((Array<String>) a).set(0, vs);
+ a.set(0,vs);
assertEquals(((Array<String>) a).get(0), vs);
return;
@@ -584,7 +584,7 @@ public class FrameArrayTests {
@Test
public void setNull() {
// should not crash
- a.set(0, null);
+ a.set(0, (String)null);
}
@Test
diff --git
a/src/test/java/org/apache/sysds/test/component/misc/IOUtilFunctionsTest.java
b/src/test/java/org/apache/sysds/test/component/misc/IOUtilFunctionsTest.java
new file mode 100644
index 0000000000..e39c5413f9
--- /dev/null
+++
b/src/test/java/org/apache/sysds/test/component/misc/IOUtilFunctionsTest.java
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.component.misc;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.fail;
+
+import org.apache.sysds.runtime.io.IOUtilFunctions;
+import org.junit.Test;
+
+public class IOUtilFunctionsTest {
+
+ @Test
+ public void splitCSV_1() {
+ String in = "\"1\",\"Prof\",\"B\",19,18,\"Male\",139750";
+ String[] ret = IOUtilFunctions.splitCSV(in, ",");
+ assertArrayEquals(new String[] {"\"1\"", "\"Prof\"", "\"B\"",
"19", "18", "\"Male\"", "139750"}, ret);
+ }
+
+ @Test
+ public void splitCSV_2() {
+ String in = "\"1,,,2\",139750";
+ String[] ret = IOUtilFunctions.splitCSV(in, ",");
+ assertArrayEquals(new String[] {"\"1,,,2\"", "139750"}, ret);
+ }
+
+ @Test
+ public void splitCSV_3() {
+ String in = "\"1,,,\"2,139750";
+ String[] ret = IOUtilFunctions.splitCSV(in, ",");
+ assertArrayEquals(new String[] {"\"1,,,\"2", "139750"}, ret);
+ }
+
+ @Test
+ public void splitCSV_4() {
+ String in = "\"1,,\",2,139750";
+ String[] ret = IOUtilFunctions.splitCSV(in, ",");
+ assertArrayEquals(new String[] {"\"1,,\"", "2", "139750"}, ret);
+ }
+
+ @Test
+ public void splitCSV_5() {
+ String in = "\"1,,\"aaaa,2,139750";
+ String[] ret = IOUtilFunctions.splitCSV(in, ",");
+ assertArrayEquals(new String[] {"\"1,,\"aaaa", "2", "139750"},
ret);
+ }
+
+ @Test
+ public void splitCustom() {
+ String in =
"0.0,-3.756323061556275,0.0,0.0,9.360046523539289,-2.8007958172584324,-6.233057304650478";
+ String[] ret = IOUtilFunctions.splitCSV(in, ",");
+ assertArrayEquals(new String[] {"0.0", "-3.756323061556275",
"0.0", "0.0", "9.360046523539289",
+ "-2.8007958172584324", "-6.233057304650478"}, ret);
+
+ }
+
+ @Test
+ public void splitCustom_2() {
+ String in = "aaaaaa \" abb";
+ String[] ret = IOUtilFunctions.splitCSV(in, " ");
+ assertArrayEquals(new String[] {"aaaaaa", "\"", "abb"}, ret);
+ }
+
+ @Test(expected = StringIndexOutOfBoundsException.class)
+ public void splitCustom_3() {
+ // try{
+ String in = "aaaaaa \"\"\" abb";
+ String[] ret = IOUtilFunctions.splitCSV(in, " ");
+ assertArrayEquals(new String[] {"aaaaaa", "\"\"\"",
"abb"}, ret);
+
+ // }
+ // catch(Exception e){
+ // e.printStackTrace();
+ // throw e;
+ // fail(e.getMessage());
+ // }
+ }
+
+ @Test
+ public void splitCustom_4() {
+ String in = "aaaaaa \"\"\"\" abb";
+ String[] ret = IOUtilFunctions.splitCSV(in, " ");
+ assertArrayEquals(new String[] {"aaaaaa", "\"\"\"\"", "abb"},
ret);
+ }
+
+ @Test
+ public void splitCustom_5() {
+ String in = "aaaaaa \"\"\"\"";
+ String[] ret = IOUtilFunctions.splitCSV(in, " ");
+ assertArrayEquals(new String[] {"aaaaaa", "\"\"\"\""}, ret);
+ }
+
+ // @Test
+ // public void splitCustom_6() {
+ // String in = "aaaaaa \"\"\"";
+ // String[] ret = IOUtilFunctions.splitCSV(in, " ");
+ // assertArrayEquals(new String[] {"aaaaaa", "\""}, ret);
+ // }
+
+ @Test
+ public void splitCustom_7() {
+ String in = "aaaaaa \"";
+ String[] ret = IOUtilFunctions.splitCSV(in, " ");
+ assertArrayEquals(new String[] {"aaaaaa", "\""}, ret);
+ }
+
+ @Test
+ public void splitRFC4180Standard_1() {
+ String in = "\"aaa\",\"b \r\n\",\"ccc\"";
+ String[] ret = IOUtilFunctions.splitCSV(in, ",");
+ assertArrayEquals(new String[] {"\"aaa\"", "\"b \r\n\"",
"\"ccc\""}, ret);
+ }
+
+ @Test
+ public void splitRFC4180Standard_BreakRule_1() {
+ String in = "\"aaa\",\"b\"\"bb\",\"ccc\"";
+ String[] ret = IOUtilFunctions.splitCSV(in, ",");
+ assertArrayEquals(new String[] {"\"aaa\"", "\"b\"\"bb\"",
"\"ccc\""}, ret);
+ }
+
+ @Test
+ public void splitRFC4180Standard_BreakRule_2() {
+ String in = "\"aaa\",\"b\"\"bb\"";
+ String[] ret = IOUtilFunctions.splitCSV(in, ",");
+ assertArrayEquals(new String[] {"\"aaa\"", "\"b\"\"bb\""}, ret);
+ }
+
+ @Test
+ public void splitEmptyMatch_1() {
+ String in = "\"aaa\",,,";
+ String[] ret = IOUtilFunctions.splitCSV(in, ",");
+ assertArrayEquals(new String[] {"\"aaa\"", "", "", ""}, ret);
+ }
+
+ @Test
+ public void splitEmptyMatch_2() {
+ String in = "\"aaa\",,a";
+ String[] ret = IOUtilFunctions.splitCSV(in, ",,");
+ assertArrayEquals(new String[] {"\"aaa\"", "a"}, ret);
+ }
+
+ @Test
+ public void split_cache_1() {
+ String in = "\"aaa\",,a";
+ String[] ret = IOUtilFunctions.splitCSV(in, ",,", new
String[2]);
+ assertArrayEquals(new String[] {"\"aaa\"", "a"}, ret);
+ }
+
+ @Test
+ public void split_cache_2() {
+ String in = "\"aaa\",,";
+ String[] ret = IOUtilFunctions.splitCSV(in, ",,", new
String[2]);
+ assertArrayEquals(new String[] {"\"aaa\"", ""}, ret);
+ }
+
+ @Test
+ public void split_cache_3() {
+ String in = "";
+ String[] ret = IOUtilFunctions.splitCSV(in, ",,", new
String[2]);
+ assertArrayEquals(new String[] {"", ""}, ret);
+ }
+
+ @Test
+ public void split_empty_1() {
+ String in = "";
+ String[] ret = IOUtilFunctions.splitCSV(in, ",,", null);
+ assertArrayEquals(new String[] {""}, ret);
+ }
+
+ @Test
+ public void split_empty_2() {
+ String in = "";
+ String[] ret = IOUtilFunctions.splitCSV(in, ",,");
+ assertArrayEquals(new String[] {""}, ret);
+ }
+
+ @Test
+ public void split_null() {
+ String in = null;
+ String[] ret = IOUtilFunctions.splitCSV(in, ",,", null);
+ assertArrayEquals(new String[] {""}, ret);
+ }
+
+ @Test
+ public void split_null_2() {
+ String in = null;
+ String[] ret = IOUtilFunctions.splitCSV(in, ",,");
+ assertArrayEquals(new String[] {""}, ret);
+ }
+
+ @Test
+ public void splitEmptyMatch_cache_2() {
+ String in = "\"aaa\",,a";
+ String[] ret = IOUtilFunctions.splitCSV(in, ",,", null);
+ assertArrayEquals(new String[] {"\"aaa\"", "a"}, ret);
+ }
+
+ @Test
+ public void splitCustom_fromRddTest(){
+ String in = "aaa,\"\"\",,,b,,\",\"c,c,c\"";
+
+ String[] ret = IOUtilFunctions.splitCSV(in, ",", null);
+ assertArrayEquals(new String[] {"aaa", "\"\"\",,,b,,\"",
"\"c,c,c\""}, ret);
+ }
+}
diff --git
a/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java
b/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java
index 4d47497daa..5e77189859 100644
---
a/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java
@@ -536,7 +536,7 @@ public class CellwiseTmplTest extends AutomatedTestBase
protected File getConfigTemplateFile() {
// Instrumentation in this test's output log to show custom configuration file used for template.
File TEST_CONF_FILE = new File(SCRIPT_DIR + TEST_DIR,
TEST_CONF);
- LOG.info("This test case overrides default configuration with "
+ TEST_CONF_FILE.getPath());
+ LOG.debug("This test case overrides default configuration with
" + TEST_CONF_FILE.getPath());
return TEST_CONF_FILE;
}
}
diff --git
a/src/test/java/org/apache/sysds/test/functions/codegen/DAGCellwiseTmplTest.java
b/src/test/java/org/apache/sysds/test/functions/codegen/DAGCellwiseTmplTest.java
index d2b69c1e16..299f88b7ea 100644
---
a/src/test/java/org/apache/sysds/test/functions/codegen/DAGCellwiseTmplTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/codegen/DAGCellwiseTmplTest.java
@@ -165,7 +165,7 @@ public class DAGCellwiseTmplTest extends AutomatedTestBase
@Override
protected File getConfigTemplateFile() {
// Instrumentation in this test's output log to show custom configuration file used for template.
- LOG.info("This test case overrides default configuration with "
+ TEST_CONF_FILE.getPath());
+ LOG.debug("This test case overrides default configuration with
" + TEST_CONF_FILE.getPath());
return TEST_CONF_FILE;
}
}
diff --git
a/src/test/java/org/apache/sysds/test/functions/codegen/MiscPatternTest.java
b/src/test/java/org/apache/sysds/test/functions/codegen/MiscPatternTest.java
index 3310a4373c..9fa91c9d44 100644
--- a/src/test/java/org/apache/sysds/test/functions/codegen/MiscPatternTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/codegen/MiscPatternTest.java
@@ -174,7 +174,7 @@ public class MiscPatternTest extends AutomatedTestBase
@Override
protected File getConfigTemplateFile() {
// Instrumentation in this test's output log to show custom configuration file used for template.
- LOG.info("This test case overrides default configuration with "
+ TEST_CONF_FILE.getPath());
+ LOG.debug("This test case overrides default configuration with
" + TEST_CONF_FILE.getPath());
return TEST_CONF_FILE;
}
}
diff --git
a/src/test/java/org/apache/sysds/test/functions/codegen/MultiAggTmplTest.java
b/src/test/java/org/apache/sysds/test/functions/codegen/MultiAggTmplTest.java
index 6900433329..e58c320457 100644
---
a/src/test/java/org/apache/sysds/test/functions/codegen/MultiAggTmplTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/codegen/MultiAggTmplTest.java
@@ -210,7 +210,7 @@ public class MultiAggTmplTest extends AutomatedTestBase
@Override
protected File getConfigTemplateFile() {
// Instrumentation in this test's output log to show custom configuration file used for template.
- LOG.info("This test case overrides default configuration with "
+ TEST_CONF_FILE.getPath());
+ LOG.debug("This test case overrides default configuration with
" + TEST_CONF_FILE.getPath());
return TEST_CONF_FILE;
}
}
diff --git
a/src/test/java/org/apache/sysds/test/functions/codegen/OuterProdTmplTest.java
b/src/test/java/org/apache/sysds/test/functions/codegen/OuterProdTmplTest.java
index bbc57154d3..893b6c1d73 100644
---
a/src/test/java/org/apache/sysds/test/functions/codegen/OuterProdTmplTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/codegen/OuterProdTmplTest.java
@@ -313,7 +313,7 @@ public class OuterProdTmplTest extends AutomatedTestBase
@Override
protected File getConfigTemplateFile() {
// Instrumentation in this test's output log to show custom configuration file used for template.
- LOG.info("This test case overrides default configuration with "
+ TEST_CONF_FILE.getPath());
+ LOG.debug("This test case overrides default configuration with
" + TEST_CONF_FILE.getPath());
return TEST_CONF_FILE;
}
}
diff --git
a/src/test/java/org/apache/sysds/test/functions/codegen/RowAggTmplTest.java
b/src/test/java/org/apache/sysds/test/functions/codegen/RowAggTmplTest.java
index 5ae31967cf..d704b4f674 100644
--- a/src/test/java/org/apache/sysds/test/functions/codegen/RowAggTmplTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/codegen/RowAggTmplTest.java
@@ -872,7 +872,7 @@ public class RowAggTmplTest extends AutomatedTestBase
@Override
protected File getConfigTemplateFile() {
// Instrumentation in this test's output log to show custom configuration file used for template.
- LOG.info("This test case overrides default configuration with "
+ TEST_CONF_FILE.getPath());
+ LOG.debug("This test case overrides default configuration with
" + TEST_CONF_FILE.getPath());
return TEST_CONF_FILE;
}
}
diff --git
a/src/test/java/org/apache/sysds/test/functions/codegen/RowConv2DOperationsTest.java
b/src/test/java/org/apache/sysds/test/functions/codegen/RowConv2DOperationsTest.java
index 48611ef9f6..b79ff828f0 100644
---
a/src/test/java/org/apache/sysds/test/functions/codegen/RowConv2DOperationsTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/codegen/RowConv2DOperationsTest.java
@@ -125,7 +125,7 @@ public class RowConv2DOperationsTest extends
AutomatedTestBase
@Override
protected File getConfigTemplateFile() {
// Instrumentation in this test's output log to show custom configuration file used for template.
- LOG.info("This test case overrides default configuration with "
+ TEST_CONF_FILE.getPath());
+ LOG.debug("This test case overrides default configuration with
" + TEST_CONF_FILE.getPath());
return TEST_CONF_FILE;
}
}
diff --git
a/src/test/java/org/apache/sysds/test/functions/codegen/RowVectorComparisonTest.java
b/src/test/java/org/apache/sysds/test/functions/codegen/RowVectorComparisonTest.java
index a75e3b1295..3f287bdc07 100644
---
a/src/test/java/org/apache/sysds/test/functions/codegen/RowVectorComparisonTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/codegen/RowVectorComparisonTest.java
@@ -168,7 +168,7 @@ public class RowVectorComparisonTest extends
AutomatedTestBase
@Override
protected File getConfigTemplateFile() {
// Instrumentation in this test's output log to show custom
configuration file used for template.
- LOG.info("This test case overrides default configuration with "
+ TEST_CONF_FILE.getPath());
+ LOG.debug("This test case overrides default configuration with
" + TEST_CONF_FILE.getPath());
return TEST_CONF_FILE;
}
}
diff --git
a/src/test/java/org/apache/sysds/test/functions/codegen/SumProductChainTest.java
b/src/test/java/org/apache/sysds/test/functions/codegen/SumProductChainTest.java
index 41d3c62f9e..d2631a7c9f 100644
---
a/src/test/java/org/apache/sysds/test/functions/codegen/SumProductChainTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/codegen/SumProductChainTest.java
@@ -153,7 +153,7 @@ public class SumProductChainTest extends AutomatedTestBase
@Override
protected File getConfigTemplateFile() {
// Instrumentation in this test's output log to show custom configuration file used for template.
- LOG.info("This test case overrides default configuration with "
+ TEST_CONF_FILE.getPath());
+ LOG.debug("This test case overrides default configuration with
" + TEST_CONF_FILE.getPath());
return TEST_CONF_FILE;
}
}
diff --git
a/src/test/java/org/apache/sysds/test/functions/federated/multitenant/FederatedLineageTraceReuseTest.java
b/src/test/java/org/apache/sysds/test/functions/federated/multitenant/FederatedLineageTraceReuseTest.java
index f01c196a63..5d249abfb6 100644
---
a/src/test/java/org/apache/sysds/test/functions/federated/multitenant/FederatedLineageTraceReuseTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/federated/multitenant/FederatedLineageTraceReuseTest.java
@@ -148,6 +148,13 @@ public class FederatedLineageTraceReuseTest extends
MultiTenantTestBase {
int[] workerPorts = startFedWorkers(4, new String[]{"-lineage",
"reuse"});
+ try {
+ Thread.sleep(4000);
+ }
+ catch(InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
rtplatform = execMode;
if(rtplatform == ExecMode.SPARK) {
DMLScript.USE_LOCAL_SPARK_CONFIG = true;
diff --git
a/src/test/java/org/apache/sysds/test/functions/federated/multitenant/FederatedReuseReadTest.java
b/src/test/java/org/apache/sysds/test/functions/federated/multitenant/FederatedReuseReadTest.java
index 047a98fd85..522c9aafc1 100644
---
a/src/test/java/org/apache/sysds/test/functions/federated/multitenant/FederatedReuseReadTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/federated/multitenant/FederatedReuseReadTest.java
@@ -104,8 +104,7 @@ public class FederatedReuseReadTest extends
MultiTenantTestBase {
@Test
public void testModifiedValCP() {
- //TODO with 4 runs sporadically into non-terminating state
- runReuseReadTest(OpType.MODIFIED_VAL, 3, ExecMode.SINGLE_NODE,
false);
+ runReuseReadTest(OpType.MODIFIED_VAL, 2, ExecMode.SINGLE_NODE,
false);
}
@Test
@@ -117,7 +116,6 @@ public class FederatedReuseReadTest extends
MultiTenantTestBase {
@Test
@Ignore
public void testModifiedValLineageCP() {
- //TODO with 4 runs sporadically into non-terminating state
runReuseReadTest(OpType.MODIFIED_VAL, 3, ExecMode.SINGLE_NODE,
true);
}
diff --git
a/src/test/java/org/apache/sysds/test/functions/frame/FrameMapTest.java
b/src/test/java/org/apache/sysds/test/functions/frame/FrameMapTest.java
index bb993de8a2..1d2a798382 100644
--- a/src/test/java/org/apache/sysds/test/functions/frame/FrameMapTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/frame/FrameMapTest.java
@@ -20,8 +20,8 @@
package org.apache.sysds.test.functions.frame;
import org.apache.sysds.common.Types;
-import org.apache.sysds.common.Types.FileFormat;
import org.apache.sysds.common.Types.ExecType;
+import org.apache.sysds.common.Types.FileFormat;
import org.apache.sysds.runtime.frame.data.FrameBlock;
import org.apache.sysds.runtime.io.FileFormatPropertiesCSV;
import org.apache.sysds.runtime.io.FrameWriterFactory;
@@ -133,12 +133,13 @@ public class FrameMapTest extends AutomatedTestBase {
String[][] date = new String[rows2][1];
for(int i = 0; i<rows2; i++)
date[i][0] =
(i%30)+"/"+(i%12)+"/200"+(i%20);
+ FrameBlock a = new FrameBlock(schemaStrings1,
date);
FrameWriterFactory.createFrameWriter(FileFormat.CSV).
- writeFrameToHDFS(new
FrameBlock(schemaStrings1, date), input("A"), rows2, 1);
+ writeFrameToHDFS(a, input("A"), rows2,
1);
}
else if(type == TestType.SHERLOCK_PREP) {
String[][] data = new String[1][1];
- data[0][0] = "\"['Global', 'United States',
'Australia']\"";
+ data[0][0] = "'Global', 'United States',
'Australia'";
FileFormatPropertiesCSV ffp = new
FileFormatPropertiesCSV();
ffp.setDelim(";");
FrameWriterFactory.createFrameWriter(FileFormat.CSV, ffp).
@@ -157,7 +158,7 @@ public class FrameMapTest extends AutomatedTestBase {
String[] output =
(String[])outputFrame.getColumnData(0);
String[] input = (String[])inputFrame.getColumnData(0);
-
+
switch (type) {
case SPLIT:
for(int i = 0; i<input.length; i++)
@@ -177,8 +178,10 @@ public class FrameMapTest extends AutomatedTestBase {
TestUtils.compareScalars(String.valueOf(input[i].toUpperCase()), output[i]);
break;
case DATE_UTILS:
- for(int i =0; i<input.length; i++)
-
TestUtils.compareScalars(String.valueOf(UtilFunctions.toMillis(input[i])),
output[i]);
+ for(int i =0; i<input.length; i++){
+ String c =
String.valueOf(UtilFunctions.toMillis(input[i]));
+ TestUtils.compareScalars(c,
output[i]);
+ }
break;
case SHERLOCK_PREP:
for(int i =0; i<input.length; i++)
diff --git
a/src/test/java/org/apache/sysds/test/functions/transform/TransformCSVFrameEncodeReadTest.java
b/src/test/java/org/apache/sysds/test/functions/transform/TransformCSVFrameEncodeReadTest.java
index fa9c1f020d..f66fc1db3c 100644
---
a/src/test/java/org/apache/sysds/test/functions/transform/TransformCSVFrameEncodeReadTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/transform/TransformCSVFrameEncodeReadTest.java
@@ -19,7 +19,8 @@
package org.apache.sysds.test.functions.transform;
-import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.common.Types.ExecMode;
import org.apache.sysds.runtime.frame.data.FrameBlock;
@@ -143,8 +144,17 @@ public class TransformCSVFrameEncodeReadTest extends
AutomatedTestBase
FrameBlock fb2 = reader2.readFrameFromHDFS(output("R"),
-1L, -1L);
String[] fromDisk =
DataConverter.toString(fb2).split("\n");
String[] printed = stdOut.split("\n");
- for(int i = 0; i < fromDisk.length; i++)
- assertEquals(fromDisk[i], printed[i]);
+ boolean equal = true;
+ String err = "";
+ for(int i = 0; i < fromDisk.length; i++){
+ if(! fromDisk[i].equals(printed[i])){
+ err += "\n not equal: \n"+ (fromDisk[i]
+ "\n" + printed[i]);
+ equal = false;
+ }
+
+ }
+ if(!equal)
+ fail(err);
}
catch(Exception ex) {
diff --git
a/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeDecodeTokenTest.java
b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeDecodeTokenTest.java
index f51aa327b4..c9b2b1e566 100644
---
a/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeDecodeTokenTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeDecodeTokenTest.java
@@ -86,16 +86,17 @@ public class TransformFrameEncodeDecodeTokenTest extends
AutomatedTestBase
try
{
getAndLoadTestConfiguration(TEST_NAME1);
+ setOutputBuffering(true);
String HOME = SCRIPT_DIR + TEST_DIR;
fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
- programArgs = new String[]{"-explain","-nvargs",
+ programArgs = new String[]{"-nvargs",
"DATA=" + DATASET_DIR + DATASET1,
"TFSPEC=" + DATASET_DIR+ SPEC1,
"TFDATA=" + output("tfout"), "SEP= ",
"OFMT=" + ofmt, "OSEP= " };
- runTest(true, false, null, -1);
+ runTest(null);
//read input/output and compare
FrameReader reader1 =
FrameReaderFactory.createFrameReader(FileFormat.CSV,