Hyoungjun Kim created TAJO-925:
----------------------------------
Summary: Child ExecutionBlock of JOIN node has different number of
shuffle keys.
Key: TAJO-925
URL: https://issues.apache.org/jira/browse/TAJO-925
Project: Tajo
Issue Type: Bug
Reporter: Hyoungjun Kim
Priority: Minor
If both sides of a join node are not SCAN but SUBQUERY nodes, each side has a
different number of shuffle keys.
In that case the JOIN query returns a wrong result. I tested this with the test
code below.
{code}
@Test
public void testJoinWithDifferentShuffleKey() throws Exception {
KeyValueSet tableOptions = new KeyValueSet();
tableOptions.put(StorageConstants.CSVFILE_DELIMITER,
StorageConstants.DEFAULT_FIELD_DELIMITER);
tableOptions.put(StorageConstants.CSVFILE_NULL, "\\\\N");
Schema schema = new Schema();
schema.addColumn("id", Type.INT4);
schema.addColumn("name", Type.TEXT);
List<String> data = new ArrayList<String>();
int bytes = 0;
for (int i = 0; i < 1000000; i++) {
String row = i + "|" + i +
"name012345678901234567890123456789012345678901234567890";
bytes += row.getBytes().length;
data.add(row);
if (bytes > 2 * 1024 * 1024) {
break;
}
}
TajoTestingCluster.createTable("large_table", schema, tableOptions,
data.toArray(new String[]{}));
int originConfValue =
conf.getIntVar(ConfVars.DIST_QUERY_JOIN_PARTITION_VOLUME);
testingCluster.setAllTajoDaemonConfValue(ConfVars.DIST_QUERY_JOIN_PARTITION_VOLUME.varname,
"1");
ResultSet res = executeString(
"select count(b.id) " +
"from (select id, count(*) as cnt from large_table group by id) a " +
"left outer join (select id, count(*) as cnt from large_table where id
< 200 group by id) b " +
"on a.id = b.id"
);
try {
String expected =
"?count\n" +
"-------------------------------\n" +
"200\n";
assertEquals(expected, resultSetToString(res));
} finally {
testingCluster.setAllTajoDaemonConfValue(ConfVars.DIST_QUERY_JOIN_PARTITION_VOLUME.varname,
"" + originConfValue);
cleanupQuery(res);
executeString("DROP TABLE large_table PURGE").close();
}
}
{code}
--
This message was sent by Atlassian JIRA
(v6.2#6252)