[
https://issues.apache.org/jira/browse/ASTERIXDB-1365?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Yingyi Bu updated ASTERIXDB-1365:
---------------------------------
Description:
[~wangsaeu],[~javierjia],
It seems there are two possible causes for the bug:
1. the inverted index generates malformed UTF8 strings;
2. UTF8StringUtil has some issue.
However, since UTF8StringUtil has been widely used elsewhere, it's very
likely that the issue is in the inverted index. Thus, I am assigning this to
Taewoo. Please re-assign the owner if you think the assignment is not right.
DDL (closed data):
{noformat}
create type EmploymentType as {
organization: string,
start_date: date,
end_date: date?
}
create type GleambookUserType as {
id: int64,
alias: string,
name: string,
user_since: datetime,
friend_ids: {{ int64 }},
employment: [EmploymentType]
}
create type GleambookMessageType as {
message_id: int64,
author_id: int64,
in_response_to: int64?,
sender_location: point,
send_time: datetime,
message: string
}
create dataset GleambookUsers(GleambookUserType)
primary key id;
create dataset GleambookMessages(GleambookMessageType)
primary key message_id;
create index twmSndTmIx on GleambookMessages(send_time);
{noformat}
DDL (open data):
{noformat}
create type GleambookUserType as {
id: string
}
create type GleambookMessageType as {
message_id: int64
}
create dataset GleambookUsers(GleambookUserType)
primary key id;
create dataset GleambookMessages(GleambookMessageType)
primary key message_id;
create index twmSndTmIx on GleambookMessages(send_time:datetime) enforced;
{noformat}
This is the query:
{noformat}
use SocialNetworkData;
select distinct element message.message
from GleambookMessages as message,
"word-tokens"(message) as token,
(
select distinct element emp.organization
from GleambookUsers as user,
user.employment emp
) as org
where org=token
and message.send_time >= datetime('2000-06-07T12:05:32') and
message.send_time < datetime('2000-06-08T12:05:32');
{noformat}
This is the stack trace:
{noformat}
Caused by: java.lang.IllegalArgumentException
at
org.apache.hyracks.util.string.UTF8StringUtil.charAt(UTF8StringUtil.java:60)
at
org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer.hasNext(DelimitedUTF8StringBinaryTokenizer.java:47)
at
org.apache.asterix.runtime.evaluators.common.WordTokensEvaluator.evaluate(WordTokensEvaluator.java:61)
at
org.apache.asterix.runtime.unnestingfunctions.std.ScanCollectionDescriptor$ScanCollectionUnnestingFunctionFactory$1.init(ScanCollectionDescriptor.java:88)
at
org.apache.hyracks.algebricks.runtime.operators.std.UnnestRuntimeFactory$1.nextFrame(UnnestRuntimeFactory.java:121)
at
org.apache.hyracks.dataflow.common.comm.io.AbstractFrameAppender.write(AbstractFrameAppender.java:93)
at
org.apache.hyracks.algebricks.runtime.operators.base.AbstractOneInputOneOutputOneFramePushRuntime.flushAndReset(AbstractOneInputOneOutputOneFramePushRuntime.java:63)
at
org.apache.hyracks.algebricks.runtime.operators.base.AbstractOneInputOneOutputOneFramePushRuntime.flushIfNotFailed(AbstractOneInputOneOutputOneFramePushRuntime.java:69)
at
org.apache.hyracks.algebricks.runtime.operators.base.AbstractOneInputOneOutputOneFramePushRuntime.close(AbstractOneInputOneOutputOneFramePushRuntime.java:55)
at
org.apache.hyracks.algebricks.runtime.operators.std.StreamSelectRuntimeFactory$1.close(StreamSelectRuntimeFactory.java:125)
at
org.apache.hyracks.algebricks.runtime.operators.base.AbstractOneInputOneOutputOneFramePushRuntime.close(AbstractOneInputOneOutputOneFramePushRuntime.java:57)
at
org.apache.hyracks.algebricks.runtime.operators.std.AssignRuntimeFactory$1.close(AssignRuntimeFactory.java:122)
at
org.apache.hyracks.algebricks.runtime.operators.base.AbstractOneInputOneOutputOneFramePushRuntime.close(AbstractOneInputOneOutputOneFramePushRuntime.java:57)
at
org.apache.hyracks.algebricks.runtime.operators.meta.AlgebricksMetaOperatorDescriptor$2.close(AlgebricksMetaOperatorDescriptor.java:153)
at
org.apache.hyracks.storage.am.common.dataflow.IndexSearchOperatorNodePushable.close(IndexSearchOperatorNodePushable.java:227)
... 9 more
{noformat}
was:
[~wangsaeu],[~javierjia],
It seems there are two possible causes for the bug:
1. the inverted index generates malformed UTF8 strings;
2. UTF8StringUtil has some issue.
However, since UTF8StringUtil has been widely used elsewhere, it's very
likely that the issue is in the inverted index. Thus, I am assigning this to
Taewoo. Please re-assign the owner if you think the assignment is not right.
This is the query:
{noformat}
use SocialNetworkData;
select distinct element message.message
from GleambookMessages as message,
"word-tokens"(message) as token,
(
select distinct element emp.organization
from GleambookUsers as user,
user.employment emp
) as org
where org=token
and message.send_time >= datetime('2000-06-07T12:05:32') and
message.send_time < datetime('2000-06-08T12:05:32');
{noformat}
This is the stack trace:
{noformat}
Caused by: java.lang.IllegalArgumentException
at
org.apache.hyracks.util.string.UTF8StringUtil.charAt(UTF8StringUtil.java:60)
at
org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer.hasNext(DelimitedUTF8StringBinaryTokenizer.java:47)
at
org.apache.asterix.runtime.evaluators.common.WordTokensEvaluator.evaluate(WordTokensEvaluator.java:61)
at
org.apache.asterix.runtime.unnestingfunctions.std.ScanCollectionDescriptor$ScanCollectionUnnestingFunctionFactory$1.init(ScanCollectionDescriptor.java:88)
at
org.apache.hyracks.algebricks.runtime.operators.std.UnnestRuntimeFactory$1.nextFrame(UnnestRuntimeFactory.java:121)
at
org.apache.hyracks.dataflow.common.comm.io.AbstractFrameAppender.write(AbstractFrameAppender.java:93)
at
org.apache.hyracks.algebricks.runtime.operators.base.AbstractOneInputOneOutputOneFramePushRuntime.flushAndReset(AbstractOneInputOneOutputOneFramePushRuntime.java:63)
at
org.apache.hyracks.algebricks.runtime.operators.base.AbstractOneInputOneOutputOneFramePushRuntime.flushIfNotFailed(AbstractOneInputOneOutputOneFramePushRuntime.java:69)
at
org.apache.hyracks.algebricks.runtime.operators.base.AbstractOneInputOneOutputOneFramePushRuntime.close(AbstractOneInputOneOutputOneFramePushRuntime.java:55)
at
org.apache.hyracks.algebricks.runtime.operators.std.StreamSelectRuntimeFactory$1.close(StreamSelectRuntimeFactory.java:125)
at
org.apache.hyracks.algebricks.runtime.operators.base.AbstractOneInputOneOutputOneFramePushRuntime.close(AbstractOneInputOneOutputOneFramePushRuntime.java:57)
at
org.apache.hyracks.algebricks.runtime.operators.std.AssignRuntimeFactory$1.close(AssignRuntimeFactory.java:122)
at
org.apache.hyracks.algebricks.runtime.operators.base.AbstractOneInputOneOutputOneFramePushRuntime.close(AbstractOneInputOneOutputOneFramePushRuntime.java:57)
at
org.apache.hyracks.algebricks.runtime.operators.meta.AlgebricksMetaOperatorDescriptor$2.close(AlgebricksMetaOperatorDescriptor.java:153)
at
org.apache.hyracks.storage.am.common.dataflow.IndexSearchOperatorNodePushable.close(IndexSearchOperatorNodePushable.java:227)
... 9 more
{noformat}
> word-tokens function gets malformed strings from the inverted index
> -------------------------------------------------------------------
>
> Key: ASTERIXDB-1365
> URL: https://issues.apache.org/jira/browse/ASTERIXDB-1365
> Project: Apache AsterixDB
> Issue Type: Bug
> Components: Functions - AQL
> Reporter: Yingyi Bu
> Assignee: Taewoo Kim
> Priority: Critical
>
> [~wangsaeu],[~javierjia],
> It seems there are two possible causes for the bug:
> 1. the inverted index generates malformed UTF8 strings;
> 2. UTF8StringUtil has some issue.
> However, since UTF8StringUtil has been widely used elsewhere, it's very
> likely that the issue is in the inverted index. Thus, I am assigning this to
> Taewoo. Please re-assign the owner if you think the assignment is not right.
> DDL (closed data):
> {noformat}
> create type EmploymentType as {
> organization: string,
> start_date: date,
> end_date: date?
> }
> create type GleambookUserType as {
> id: int64,
> alias: string,
> name: string,
> user_since: datetime,
> friend_ids: {{ int64 }},
> employment: [EmploymentType]
> }
> create type GleambookMessageType as {
> message_id: int64,
> author_id: int64,
> in_response_to: int64?,
> sender_location: point,
> send_time: datetime,
> message: string
> }
> create dataset GleambookUsers(GleambookUserType)
> primary key id;
> create dataset GleambookMessages(GleambookMessageType)
> primary key message_id;
> create index twmSndTmIx on GleambookMessages(send_time);
> {noformat}
> DDL (open data):
> {noformat}
> create type GleambookUserType as {
> id: string
> }
> create type GleambookMessageType as {
> message_id: int64
> }
> create dataset GleambookUsers(GleambookUserType)
> primary key id;
> create dataset GleambookMessages(GleambookMessageType)
> primary key message_id;
> create index twmSndTmIx on GleambookMessages(send_time:datetime) enforced;
> {noformat}
> This is the query:
> {noformat}
> use SocialNetworkData;
> select distinct element message.message
> from GleambookMessages as message,
> "word-tokens"(message) as token,
> (
> select distinct element emp.organization
> from GleambookUsers as user,
> user.employment emp
> ) as org
> where org=token
> and message.send_time >= datetime('2000-06-07T12:05:32') and
> message.send_time < datetime('2000-06-08T12:05:32');
> {noformat}
> This is the stack trace:
> {noformat}
> Caused by: java.lang.IllegalArgumentException
> at
> org.apache.hyracks.util.string.UTF8StringUtil.charAt(UTF8StringUtil.java:60)
> at
> org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer.hasNext(DelimitedUTF8StringBinaryTokenizer.java:47)
> at
> org.apache.asterix.runtime.evaluators.common.WordTokensEvaluator.evaluate(WordTokensEvaluator.java:61)
> at
> org.apache.asterix.runtime.unnestingfunctions.std.ScanCollectionDescriptor$ScanCollectionUnnestingFunctionFactory$1.init(ScanCollectionDescriptor.java:88)
> at
> org.apache.hyracks.algebricks.runtime.operators.std.UnnestRuntimeFactory$1.nextFrame(UnnestRuntimeFactory.java:121)
> at
> org.apache.hyracks.dataflow.common.comm.io.AbstractFrameAppender.write(AbstractFrameAppender.java:93)
> at
> org.apache.hyracks.algebricks.runtime.operators.base.AbstractOneInputOneOutputOneFramePushRuntime.flushAndReset(AbstractOneInputOneOutputOneFramePushRuntime.java:63)
> at
> org.apache.hyracks.algebricks.runtime.operators.base.AbstractOneInputOneOutputOneFramePushRuntime.flushIfNotFailed(AbstractOneInputOneOutputOneFramePushRuntime.java:69)
> at
> org.apache.hyracks.algebricks.runtime.operators.base.AbstractOneInputOneOutputOneFramePushRuntime.close(AbstractOneInputOneOutputOneFramePushRuntime.java:55)
> at
> org.apache.hyracks.algebricks.runtime.operators.std.StreamSelectRuntimeFactory$1.close(StreamSelectRuntimeFactory.java:125)
> at
> org.apache.hyracks.algebricks.runtime.operators.base.AbstractOneInputOneOutputOneFramePushRuntime.close(AbstractOneInputOneOutputOneFramePushRuntime.java:57)
> at
> org.apache.hyracks.algebricks.runtime.operators.std.AssignRuntimeFactory$1.close(AssignRuntimeFactory.java:122)
> at
> org.apache.hyracks.algebricks.runtime.operators.base.AbstractOneInputOneOutputOneFramePushRuntime.close(AbstractOneInputOneOutputOneFramePushRuntime.java:57)
> at
> org.apache.hyracks.algebricks.runtime.operators.meta.AlgebricksMetaOperatorDescriptor$2.close(AlgebricksMetaOperatorDescriptor.java:153)
> at
> org.apache.hyracks.storage.am.common.dataflow.IndexSearchOperatorNodePushable.close(IndexSearchOperatorNodePushable.java:227)
> ... 9 more
> {noformat}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)