[ 
https://issues.apache.org/jira/browse/ARROW-2144?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16369366#comment-16369366
 ] 

ASF GitHub Bot commented on ARROW-2144:
---------------------------------------

TheNeuralBit closed pull request #1599: ARROW-2144: [JS] Don't repeat 
dictionary lookups in DataFrame ops
URL: https://github.com/apache/arrow/pull/1599
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/js/package.json b/js/package.json
index d36e87263..1c22fc117 100644
--- a/js/package.json
+++ b/js/package.json
@@ -19,7 +19,7 @@
     "clean:testdata": "gulp clean:testdata",
     "create:testdata": "gulp create:testdata",
     "test:coverage": "gulp test -t ts --coverage",
-    "doc": "shx rm -rf ./doc && esdoc",
+    "doc": "shx rm -rf ./doc && typedoc --mode file --out doc src/Arrow.ts",
     "lint": "run-p lint:*",
     "lint:src": "tslint --fix --project -p tsconfig.json -c tslint.json 
\"src/**/*.ts\"",
     "lint:test": "tslint --fix --project -p test/tsconfig.json -c tslint.json 
\"test/**/*.ts\"",
@@ -70,8 +70,6 @@
     "benchmark": "2.1.4",
     "coveralls": "3.0.0",
     "del": "3.0.0",
-    "esdoc": "1.0.4",
-    "esdoc-standard-plugin": "1.0.0",
     "glob": "7.1.2",
     "google-closure-compiler": "20180101.0.0",
     "gulp": "github:gulpjs/gulp#6d71a658c61edb3090221579d8f97dbe086ba2ed",
@@ -97,6 +95,7 @@
     "trash": "4.2.1",
     "ts-jest": "22.0.1",
     "tslint": "5.9.1",
+    "typedoc": "0.10.0",
     "typescript": "2.7.1",
     "uglifyjs-webpack-plugin": "1.1.6",
     "webpack": "3.10.0",
diff --git a/js/src/predicate.ts b/js/src/predicate.ts
index 9d55274bd..981ffb166 100644
--- a/js/src/predicate.ts
+++ b/js/src/predicate.ts
@@ -125,6 +125,10 @@ export class Or extends CombinationPredicate {
 }
 
 export class Equals extends ComparisonPredicate {
+    // Helpers used to cache dictionary reverse lookups between calls to bind
+    private lastDictionary: Vector|undefined;
+    private lastKey: number|undefined;
+
     protected _bindLitLit(_batch: RecordBatch, left: Literal, right: Literal): 
PredicateFunc {
         const rtrn: boolean = left.v == right.v;
         return () => rtrn;
@@ -139,20 +143,17 @@ export class Equals extends ComparisonPredicate {
     protected _bindColLit(batch: RecordBatch, col: Col, lit: Literal): 
PredicateFunc {
         const col_func = col.bind(batch);
         if (col.vector instanceof DictionaryVector) {
-            // Assume that there is only one key with the value `lit.v`
-            // TODO: add lazily-computed reverse dictionary lookups, associated
-            // with col.vector.data so that we only have to do this once per
-            // dictionary
-            let key = -1;
-            let dict = col.vector;
-            let data = dict.dictionary!;
-            for (let len = data.length; ++key < len;) {
-                if (data.get(key) === lit.v) {
-                    break;
-                }
+            let key: any;
+            const vector = col.vector as DictionaryVector;
+            if (vector.dictionary !== this.lastDictionary) {
+                key = vector.reverseLookup(lit.v);
+                this.lastDictionary = vector.dictionary;
+                this.lastKey = key;
+            } else {
+                key = this.lastKey;
             }
 
-            if (key == data.length) {
+            if (key === -1) {
                 // the value doesn't exist in the dictionary - always return
                 // false
                 // TODO: special-case of PredicateFunc that encapsulates this
@@ -161,7 +162,7 @@ export class Equals extends ComparisonPredicate {
                 return () => false;
             } else {
                 return (idx: number) => {
-                    return dict.getKey(idx) === key;
+                    return vector.getKey(idx) === key;
                 };
             }
         } else {
diff --git a/js/src/vector.ts b/js/src/vector.ts
index d9ca97b5f..fa1d16efc 100644
--- a/js/src/vector.ts
+++ b/js/src/vector.ts
@@ -28,6 +28,7 @@ export interface View<T extends DataType> {
     get(index: number): T['TValue'] | null;
     set(index: number, value: T['TValue']): void;
     toArray(): IterableArrayLike<T['TValue'] | null>;
+    indexOf(search: T['TValue']): number;
     [Symbol.iterator](): IterableIterator<T['TValue'] | null>;
 }
 
@@ -77,6 +78,9 @@ export class Vector<T extends DataType = any> implements 
VectorLike, View<T>, Vi
     public toArray(): IterableArrayLike<T['TValue'] | null> {
         return this.view.toArray();
     }
+    public indexOf(value: T['TValue']) {
+        return this.view.indexOf(value);
+    }
     public [Symbol.iterator](): IterableIterator<T['TValue'] | null> {
         return this.view[Symbol.iterator]();
     }
@@ -414,6 +418,7 @@ export class DictionaryVector<T extends DataType = 
DataType> extends Vector<Dict
     }
     public getKey(index: number) { return this.indicies.get(index); }
     public getValue(key: number) { return this.dictionary.get(key); }
+    public reverseLookup(value: T) { return this.dictionary.indexOf(value); }
 }
 
 export const createVector = ((VectorLoader: new <T extends DataType>(data: 
Data<T>) => TypeVisitor) => (
diff --git a/js/src/vector/chunked.ts b/js/src/vector/chunked.ts
index 2eaf99c7c..7876bbae5 100644
--- a/js/src/vector/chunked.ts
+++ b/js/src/vector/chunked.ts
@@ -103,6 +103,16 @@ export class ChunkedView<T extends DataType> implements 
View<T> {
         }
         return target;
     }
+    public indexOf(search: T['TValue']) {
+        let offset = 0, result;
+        for (const vector of this.chunkVectors) {
+            result = vector.indexOf(search);
+            if (result !== -1) { return result + offset; }
+            offset += vector.length;
+        }
+
+        return -1;
+    }
 }
 
 function typedArraySet(source: TypedArray, target: TypedArray, index: number) {
diff --git a/js/src/vector/dictionary.ts b/js/src/vector/dictionary.ts
index 385729814..f4de810b0 100644
--- a/js/src/vector/dictionary.ts
+++ b/js/src/vector/dictionary.ts
@@ -47,4 +47,12 @@ export class DictionaryView<T extends DataType> implements 
View<T> {
             yield values.get(indicies.get(index));
         }
     }
+    public indexOf(search: T['TValue']) {
+        // First find the dictionary key for the desired value...
+        const key = this.dictionary.indexOf(search);
+        if (key === -1) { return key; }
+
+        // ... then find the first occurence of that key in indicies
+        return this.indicies.indexOf(key!);
+    }
 }
diff --git a/js/src/vector/flat.ts b/js/src/vector/flat.ts
index a32bd9d39..acc2f1af9 100644
--- a/js/src/vector/flat.ts
+++ b/js/src/vector/flat.ts
@@ -43,6 +43,15 @@ export class FlatView<T extends FlatType> implements View<T> 
{
     public toArray(): IterableArrayLike<T['TValue']> {
         return this.values.subarray(0, this.length);
     }
+    public indexOf(search: T['TValue']) {
+        let index = 0;
+        for (let value of this) {
+            if (value === search) { return index; }
+            ++index;
+        }
+
+        return -1;
+    }
     public [Symbol.iterator](): IterableIterator<T['TValue']> {
         return this.values.subarray(0, this.length)[Symbol.iterator]() as 
IterableIterator<T['TValue']>;
     }
@@ -64,6 +73,10 @@ export class NullView implements View<Null> {
     public toArray(): IterableArrayLike<null> {
         return [...this];
     }
+    public indexOf(search: any) {
+        // if you're looking for nulls and the view isn't empty, we've got 'em!
+        return search === null && this.length > 0 ? 0 : -1;
+    }
     public *[Symbol.iterator](): IterableIterator<null> {
         for (let index = -1, length = this.length; ++index < length;) {
             yield null;
@@ -107,6 +120,15 @@ export class ValidityView<T extends DataType> implements 
View<T> {
     public toArray(): IterableArrayLike<T['TValue'] | null> {
         return [...this];
     }
+    public indexOf(search: T['TValue']) {
+        let index = 0;
+        for (let value of this) {
+            if (value === search) { return index; }
+            ++index;
+        }
+
+        return -1;
+    }
     public isValid(index: number): boolean {
         const nullBitIndex = this.offset + index;
         return getBool(null, index, this.nullBitmap[nullBitIndex >> 3], 
nullBitIndex % 8);
@@ -169,6 +191,15 @@ export class FixedSizeView<T extends PrimitiveType> 
extends PrimitiveView<T> {
     public toArray(): IterableArrayLike<T['TValue']> {
         return this.values;
     }
+    public indexOf(search: T['TValue']) {
+        let index = 0;
+        for (let value of this) {
+            if (value.every((d: number, i: number) => d === search[i])) { 
return index; }
+            ++index;
+        }
+
+        return -1;
+    }
     protected getValue(values: T['TArray'], index: number, size: number): 
T['TValue'] {
         return values.subarray(index * size, index * size + size);
     }
diff --git a/js/src/vector/list.ts b/js/src/vector/list.ts
index 3d365ceac..8561c66ba 100644
--- a/js/src/vector/list.ts
+++ b/js/src/vector/list.ts
@@ -59,6 +59,15 @@ export abstract class ListViewBase<T extends (ListType | 
FlatListType | FixedSiz
             yield get(values, index, valueOffsets);
         }
     }
+    public indexOf(search: T['TValue']) {
+        let index = 0;
+        for (let value of this) {
+            if (value === search) { return index; }
+            ++index;
+        }
+
+        return -1;
+    }
     protected abstract getList(values: T['TArray'], index: number, 
valueOffsets?: Int32Array): T['TValue'];
     protected abstract setList(values: T['TArray'], index: number, value: 
T['TValue'], valueOffsets?: Int32Array): void;
 }
diff --git a/js/src/vector/nested.ts b/js/src/vector/nested.ts
index d0fb24ca9..a45a912c9 100644
--- a/js/src/vector/nested.ts
+++ b/js/src/vector/nested.ts
@@ -40,6 +40,9 @@ export abstract class NestedView<T extends NestedType> 
implements View<T> {
     public toArray(): IterableArrayLike<T['TValue']> {
         return [...this];
     }
+    public indexOf(_: T['TValue']): number {
+        throw new Error(`Not implemented yet`);
+    }
     public toJSON(): any { return this.toArray(); }
     public toString() {
         return [...this].map((x) => stringify(x)).join(', ');
diff --git a/js/test/unit/vector-tests.ts b/js/test/unit/vector-tests.ts
index 81676b003..e2be22983 100644
--- a/js/test/unit/vector-tests.ts
+++ b/js/test/unit/vector-tests.ts
@@ -15,13 +15,16 @@
 // specific language governing permissions and limitations
 // under the License.
 
+import { TextEncoder } from 'text-encoding-utf-8';
 import Arrow from '../Arrow';
 import { type, TypedArray, TypedArrayConstructor } from '../../src/Arrow';
 
-const { BoolData, FlatData } = Arrow.data;
-const { IntVector, FloatVector, BoolVector } = Arrow.vector;
+const utf8Encoder = new TextEncoder('utf-8');
+
+const { BoolData, FlatData, FlatListData } = Arrow.data;
+const { IntVector, FloatVector, BoolVector, Utf8Vector } = Arrow.vector;
 const {
-    Bool,
+    Utf8, Bool,
     Float16, Float32, Float64,
     Int8, Int16, Int32, Int64,
     Uint8, Uint16, Uint32, Uint64,
@@ -45,11 +48,14 @@ const FixedWidthVectors = {
 
 const fixedSizeVectors = toMap(FixedSizeVectors, 
Object.keys(FixedSizeVectors));
 const fixedWidthVectors = toMap(FixedWidthVectors, 
Object.keys(FixedWidthVectors));
+const randomBytes = (n: number) => Uint8Array.from(
+    { length: n },
+    () => Math.random() * 255 | 0
+);
 const bytes = Array.from(
     { length: 5 },
-    () => Uint8Array.from(
-        { length: 64 },
-        () => Math.random() * 255 | 0));
+    () => randomBytes(64)
+);
 
 describe(`BoolVector`, () => {
     const values = [true, true, false, true, true, false, false, false], n = 
values.length;
@@ -67,6 +73,16 @@ describe(`BoolVector`, () => {
             expect(v).toEqual(values[i]);
         }
     });
+    test(`indexOf returns expected values`, () => {
+        for (let test_value of [true, false]) {
+            const expected = values.indexOf(test_value);
+            expect(vector.indexOf(test_value)).toEqual(expected);
+        }
+    });
+    test(`indexOf returns -1 when value not found`, () => {
+        const v = new BoolVector(new BoolData(new Bool(), 3, null, new 
Uint8Array([0xFF])));
+        expect(v.indexOf(false)).toEqual(-1);
+    });
     test(`can set values to true and false`, () => {
         const v = new BoolVector(new BoolData(new Bool(), n, null, new 
Uint8Array([27, 0, 0, 0, 0, 0, 0, 0])));
         const expected1 = [true, true, false, true, true, false, false, false];
@@ -145,6 +161,13 @@ describe('Float16Vector', () => {
             expect(v).toEqual(clamp(values[i]));
         }
     });
+    test(`indexOf returns expected values`, () => {
+        const randomValues = new Uint16Array(randomBytes(64).buffer);
+        for (let value of [...values, ...randomValues]) {
+            const expected = values.indexOf(value);
+            expect(vector.indexOf(clamp(value))).toEqual(expected);
+        }
+    });
     test(`slices the entire array`, () => {
         expect(vector.slice().toArray()).toEqual(float16s);
     });
@@ -187,6 +210,21 @@ for (const [VectorName, [VectorType, DataType]] of 
fixedSizeVectors) {
                 expect(v).toEqual(values.slice(2 * i, 2 * (i + 1)));
             }
         });
+        test(`indexOf returns expected values`, () => {
+            // Create a set of test data composed of all of the actual values
+            // and a few random values
+            let testValues = concatTyped(
+                type.ArrayType,
+                ...bytes,
+                ...[randomBytes(8 * 2 * type.ArrayType.BYTES_PER_ELEMENT)]
+            );
+
+            for (let i = -1, n = testValues.length / 2 | 0; ++i < n;) {
+                const value = testValues.slice(2 * i, 2 * (i + 1));
+                const expected = values.findIndex((d, i) => i % 2 === 0 && d 
=== value[0] && testValues[i + 1] === value[1]);
+                expect(vector.indexOf(value)).toEqual(expected >= 0 ? expected 
/ 2 : -1);
+            }
+        });
         test(`slices the entire array`, () => {
             expect(vector.slice().toArray()).toEqual(values);
         });
@@ -232,6 +270,20 @@ for (const [VectorName, [VectorType, DataType]] of 
fixedWidthVectors) {
                 expect(v).toEqual(values[i]);
             }
         });
+        test(`indexOf returns expected values`, () => {
+            // Create a set of test data composed of all of the actual values
+            // and a few random values
+            let testValues = concatTyped(
+                type.ArrayType,
+                ...bytes,
+                ...[randomBytes(8 * type.ArrayType.BYTES_PER_ELEMENT)]
+            );
+
+            for (const value of testValues) {
+                const expected = values.indexOf(value);
+                expect(vector.indexOf(value)).toEqual(expected);
+            }
+        });
         test(`slices the entire array`, () => {
             expect(vector.slice().toArray()).toEqual(values);
         });
@@ -253,6 +305,35 @@ for (const [VectorName, [VectorType, DataType]] of 
fixedWidthVectors) {
     });
 }
 
+describe(`Utf8Vector`, () => {
+    const values = ['foo', 'bar', 'baz', 'foo bar', 'bar'], n = values.length;
+    let offset = 0;
+    const offsets = Uint32Array.of(0, ...values.map((d) => { offset += 
d.length; return offset; }));
+    const vector = new Utf8Vector(new FlatListData(new Utf8(), n, null, 
offsets, utf8Encoder.encode(values.join(''))));
+    test(`gets expected values`, () => {
+        let i = -1;
+        while (++i < n) {
+            expect(vector.get(i)).toEqual(values[i]);
+        }
+    });
+    test(`iterates expected values`, () => {
+        expect.hasAssertions();
+        let i = -1;
+        for (let v of vector) {
+            expect(++i).toBeLessThan(n);
+            expect(v).toEqual(values[i]);
+        }
+    });
+    test(`indexOf returns expected values`, () => {
+        let testValues = values.concat(['abc', '12345']);
+
+        for (const value of testValues) {
+            const expected = values.indexOf(value);
+            expect(vector.indexOf(value)).toEqual(expected);
+        }
+    });
+});
+
 function toMap<T>(entries: Record<string, T>, keys: string[]) {
     return keys.reduce((map, key) => {
         map.set(key, entries[key] as T);


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [JS] Don't repeat dictionary lookups in DataFrame ops
> -----------------------------------------------------
>
>                 Key: ARROW-2144
>                 URL: https://issues.apache.org/jira/browse/ARROW-2144
>             Project: Apache Arrow
>          Issue Type: Improvement
>          Components: JavaScript
>            Reporter: Brian Hulette
>            Priority: Major
>              Labels: pull-request-available
>
> Currently we repeat dictionary lookups every time we bind a new record batch 
> when doing an equality check in a DataFrame op 
> (https://github.com/apache/arrow/blob/master/js/src/predicate.ts#L143).
> In most cases the dictionary won't be changing between record batches, so we 
> should remember these reverse dictionary lookups, either permanently, or at 
> least for the duration of the current operation.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to