[
https://issues.apache.org/jira/browse/ASTERIXDB-1544?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15391382#comment-15391382
]
Wenhai commented on ASTERIXDB-1544:
-----------------------------------
Done.
> Omit the fuzzyjoin on inverted index
> ------------------------------------
>
> Key: ASTERIXDB-1544
> URL: https://issues.apache.org/jira/browse/ASTERIXDB-1544
> Project: Apache AsterixDB
> Issue Type: Bug
> Environment: MAC/linux
> Reporter: Wenhai
> Assignee: Wenhai
> Priority: Critical
>
> In the current master, we have NO test cases covering a fuzzy join on an
> (inverted) indexed field. Once we trigger a fuzzy join "~=" on an indexed
> field, we always get an error with the following log.
> Schema
> {noformat}
> drop dataverse fuzzyjointest if exists;
> create dataverse fuzzyjointest;
> use dataverse fuzzyjointest;
> create type DBLPType as open {
> tid: uuid,
> id: int64,
> dblpid: string?,
> title: string?,
> authors: string?,
> misc: string?
> }
> create type CSXType as closed {
> tid: uuid,
> id: int64,
> csxid: string?,
> title: string?,
> authors: string?,
> misc: string?
> }
> create dataset DBLP(DBLPType) primary key tid autogenerated;
> create dataset CSX(CSXType) primary key tid autogenerated;
> load dataset DBLP
> using localfs
> (("path"="127.0.0.1:///Users/michael/Research/asterixdb-src/asterixdb-fuzzy/asterixdb/asterixdb/asterix-app/data/dblp-small/dblp-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"))
> pre-sorted;
> load dataset CSX
> using localfs
> (("path"="127.0.0.1:///Users/michael/Research/asterixdb-src/asterixdb-fuzzy/asterixdb/asterixdb/asterix-app/data/pub-small/csx-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
> use dataverse fuzzyjointest;
> drop index DBLP.title_index if exists;
> create index title_index on DBLP(title) type keyword;
> drop index DBLP.author_index if exists;
> create index author_index on DBLP(authors) type keyword;
> drop index CSX.csx_author_index if exists;
> create index csx_author_index on CSX(authors) type keyword;
> {noformat}
> Query
> {noformat}
> use dataverse fuzzyjointest;
> set simthreshold '.7f'
> for $o in dataset('DBLP')
> for $t in dataset('CSX')
> where word-tokens($o.authors) ~= word-tokens($t.authors)
> return {"cid": $t.id, "did": $o.id}
> {noformat}
> Plan
> {noformat}
> distribute result [%0->$$9]
> -- DISTRIBUTE_RESULT |PARTITIONED|
> exchange
> -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
> project ([$$9])
> -- STREAM_PROJECT |PARTITIONED|
> assign [$$9] <- [function-call: asterix:closed-record-constructor,
> Args:[AString: {cid}, %0->$$18, AString: {did}, %0->$$19]]
> -- ASSIGN |PARTITIONED|
> project ([$$18, $$19])
> -- STREAM_PROJECT |PARTITIONED|
> exchange
> -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
> join (function-call: algebricks:eq, Args:[%0->$$22, %0->$$12])
> -- HYBRID_HASH_JOIN [$$22][$$12] |PARTITIONED|
> exchange
> -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
> project ([$$19, $$22])
> -- STREAM_PROJECT |PARTITIONED|
> assign [$$19] <- [function-call:
> asterix:field-access-by-index, Args:[%0->$$0, AInt32: {1}]]
> -- ASSIGN |PARTITIONED|
> exchange
> -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
> data-scan []<-[$$22, $$0] <- fuzzyjointest:DBLP
> -- DATASOURCE_SCAN |PARTITIONED|
> exchange
> -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
> empty-tuple-source
> -- EMPTY_TUPLE_SOURCE |PARTITIONED|
> exchange
> -- HASH_PARTITION_EXCHANGE [$$12] |PARTITIONED|
> project ([$$18, $$12])
> -- STREAM_PROJECT |PARTITIONED|
> select (function-call: asterix:get-item,
> Args:[function-call: asterix:similarity-jaccard-check, Args:[%0->$$14,
> function-call: asterix:word-tokens, Args:[%0->$$17], AFloat: {0.7}], AInt32:
> {0}])
> -- STREAM_SELECT |PARTITIONED|
> project ([$$17, $$18, $$12, $$14])
> -- STREAM_PROJECT |PARTITIONED|
> assign [$$18, $$17] <- [function-call:
> asterix:field-access-by-index, Args:[%0->$$1, AInt32: {1}], function-call:
> asterix:field-access-by-index, Args:[%0->$$1, AInt32: {4}]]
> -- ASSIGN |PARTITIONED|
> project ([$$1, $$12, $$14])
> -- STREAM_PROJECT |PARTITIONED|
> exchange
> -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
> unnest-map [$$13, $$1] <- function-call:
> asterix:index-search, Args:[AString: {CSX}, AInt32: {0}, AString:
> {fuzzyjointest}, AString: {CSX}, ABoolean: {true}, ABoolean: {false}, AInt32:
> {1}, %0->$$26, AInt32: {1}, %0->$$26, TRUE, TRUE, TRUE]
> -- BTREE_SEARCH |PARTITIONED|
> exchange
> -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
> order (ASC, %0->$$26)
> -- STABLE_SORT [$$26(ASC)] |PARTITIONED|
> exchange
> -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
> unnest-map [$$26] <- function-call:
> asterix:index-search, Args:[AString: {csx_author_index}, AInt32: {4},
> AString: {fuzzyjointest}, AString: {CSX}, ABoolean: {true}, ABoolean: {true},
> AInt32: {1}, AFloat: {0.7}, AInt32: {25}, AInt32: {1}, %0->$$14]
> --
> LENGTH_PARTITIONED_INVERTED_INDEX_SEARCH |PARTITIONED|
> exchange
> -- BROADCAST_EXCHANGE |PARTITIONED|
> project ([$$12, $$14])
> -- STREAM_PROJECT |PARTITIONED|
> assign [$$14] <- [function-call:
> asterix:word-tokens, Args:[function-call: asterix:field-access-by-index,
> Args:[%0->$$25, AInt32: {4}]]]
> -- ASSIGN |PARTITIONED|
> exchange
> -- ONE_TO_ONE_EXCHANGE
> |PARTITIONED|
> data-scan []<-[$$12, $$25] <-
> fuzzyjointest:DBLP
> -- DATASOURCE_SCAN
> |PARTITIONED|
> exchange
> -- ONE_TO_ONE_EXCHANGE
> |PARTITIONED|
> empty-tuple-source
> -- EMPTY_TUPLE_SOURCE
> |PARTITIONED|
> {noformat}
> The error that occurs is similar to the one in issue-1487.
> {noformat}
> org.apache.hyracks.api.exceptions.HyracksException: Job failed on account of:
> HYR0002: null
> at
> org.apache.hyracks.control.cc.job.JobRun.waitForCompletion(JobRun.java:212)
> at
> org.apache.hyracks.control.cc.work.WaitForJobCompletionWork$1.run(WaitForJobCompletionWork.java:48)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
> at java.lang.Thread.run(Thread.java:745)
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: HYR0002:
> null
> at
> org.apache.hyracks.control.common.utils.ExceptionUtils.setNodeIds(ExceptionUtils.java:62)
> at org.apache.hyracks.control.nc.Task.run(Task.java:319)
> ... 3 more
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: null
> at org.apache.hyracks.control.nc.Task.pushFrames(Task.java:365)
> at org.apache.hyracks.control.nc.Task.run(Task.java:297)
> ... 3 more
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: null
> at
> org.apache.hyracks.storage.am.common.dataflow.IndexSearchOperatorNodePushable.open(IndexSearchOperatorNodePushable.java:143)
> at org.apache.hyracks.control.nc.Task.pushFrames(Task.java:341)
> ... 4 more
> Caused by: java.lang.NullPointerException
> at
> org.apache.hyracks.storage.am.lsm.invertedindex.dataflow.LSMInvertedIndexSearchOperatorNodePushable.createSearchPredicate(LSMInvertedIndexSearchOperatorNodePushable.java:56)
> at
> org.apache.hyracks.storage.am.common.dataflow.IndexSearchOperatorNodePushable.open(IndexSearchOperatorNodePushable.java:131)
> ... 5 more
> org.apache.hyracks.api.exceptions.HyracksException: Job failed on account of:
> HYR0002: null
> at
> org.apache.hyracks.control.cc.job.JobRun.waitForCompletion(JobRun.java:212)
> at
> org.apache.hyracks.control.cc.work.WaitForJobCompletionWork$1.run(WaitForJobCompletionWork.java:48)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
> at java.lang.Thread.run(Thread.java:745)
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: HYR0002:
> null
> at
> org.apache.hyracks.control.common.utils.ExceptionUtils.setNodeIds(ExceptionUtils.java:62)
> at org.apache.hyracks.control.nc.Task.run(Task.java:319)
> ... 3 more
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: null
> at org.apache.hyracks.control.nc.Task.pushFrames(Task.java:365)
> at org.apache.hyracks.control.nc.Task.run(Task.java:297)
> ... 3 more
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: null
> at
> org.apache.hyracks.storage.am.common.dataflow.IndexSearchOperatorNodePushable.open(IndexSearchOperatorNodePushable.java:143)
> at org.apache.hyracks.control.nc.Task.pushFrames(Task.java:341)
> ... 4 more
> Caused by: java.lang.NullPointerException
> at
> org.apache.hyracks.storage.am.lsm.invertedindex.dataflow.LSMInvertedIndexSearchOperatorNodePushable.createSearchPredicate(LSMInvertedIndexSearchOperatorNodePushable.java:56)
> at
> org.apache.hyracks.storage.am.common.dataflow.IndexSearchOperatorNodePushable.open(IndexSearchOperatorNodePushable.java:131)
> ... 5 more
> Jul 24, 2016 8:31:29 AM org.apache.asterix.api.http.servlet.APIServlet doPost
> SEVERE: Job failed on account of:
> HYR0002: null
> org.apache.hyracks.api.exceptions.HyracksException: Job failed on account of:
> HYR0002: null
> at
> org.apache.hyracks.control.cc.job.JobRun.waitForCompletion(JobRun.java:212)
> at
> org.apache.hyracks.control.cc.work.WaitForJobCompletionWork$1.run(WaitForJobCompletionWork.java:48)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
> at java.lang.Thread.run(Thread.java:745)
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: HYR0002:
> null
> at
> org.apache.hyracks.control.common.utils.ExceptionUtils.setNodeIds(ExceptionUtils.java:62)
> at org.apache.hyracks.control.nc.Task.run(Task.java:319)
> ... 3 more
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: null
> at org.apache.hyracks.control.nc.Task.pushFrames(Task.java:365)
> at org.apache.hyracks.control.nc.Task.run(Task.java:297)
> ... 3 more
> Caused by: org.apache.hyracks.api.exceptions.HyracksDataException: null
> at
> org.apache.hyracks.storage.am.common.dataflow.IndexSearchOperatorNodePushable.open(IndexSearchOperatorNodePushable.java:143)
> at org.apache.hyracks.control.nc.Task.pushFrames(Task.java:341)
> ... 4 more
> Caused by: java.lang.NullPointerException
> at
> org.apache.hyracks.storage.am.lsm.invertedindex.dataflow.LSMInvertedIndexSearchOperatorNodePushable.createSearchPredicate(LSMInvertedIndexSearchOperatorNodePushable.java:56)
> at
> org.apache.hyracks.storage.am.common.dataflow.IndexSearchOperatorNodePushable.open(IndexSearchOperatorNodePushable.java:131)
> ... 5 more
> {noformat}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)