abhishekrb19 opened a new issue, #17951:
URL: https://github.com/apache/druid/issues/17951

   I have the following SQL query involving unnest and two CTEs that doesn't 
yield correct results. The query appears to be generating an incorrect native 
query plan:
   ```sql
   WITH cte1 AS (
     SELECT 
       c1, 
       TIME_FLOOR(__time, 'PT5M') AS floored_time,
       STRLEN(c1) AS c1_len
     FROM unnest_ds
     GROUP BY 1, 2
   ),
   cte2 AS (
     SELECT 
       c1,
       ARRAY_AGG(ARRAY[floored_time, c1_len]) AS pair
     FROM cte1
     GROUP BY 1
   )
   
   SELECT 
     tr.c1, 
     tv
   FROM cte2 AS tr
   CROSS JOIN UNNEST(pair) AS tv
   ```
   
   The above SQL query results in a native plan:
   <details>
     <summary>Native JSON query</summary>
   
   ```json
   {
     "queryType": "scan",
     "dataSource": {
       "type": "unnest",
       "base": {
         "type": "query",
         "query": {
           "queryType": "groupBy",
           "dataSource": {
             "type": "query",
             "query": {
               "queryType": "groupBy",
               "dataSource": {
                 "type": "table",
                 "name": "unnest_ds"
               },
               "intervals": {
                 "type": "intervals",
                 "intervals": [
                   
"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
                 ]
               },
               "virtualColumns": [
                 {
                   "type": "expression",
                   "name": "v0",
                   "expression": 
"timestamp_floor(\"__time\",'PT5M',null,'UTC')",
                   "outputType": "LONG"
                 }
               ],
               "granularity": {
                 "type": "all"
               },
               "dimensions": [
                 {
                   "type": "default",
                   "dimension": "c1",
                   "outputName": "d0",
                   "outputType": "STRING"
                 },
                 {
                   "type": "default",
                   "dimension": "v0",
                   "outputName": "d1",
                   "outputType": "LONG"
                 }
               ],
               "limitSpec": {
                 "type": "NoopLimitSpec"
               },
               "context": {
                 "queryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
                 "sqlOuterLimit": 1001,
                 "sqlQueryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
                 "sqlStringifyArrays": false,
                 "timestampResultField": "d1",
                 "timestampResultFieldGranularity": "\"FIVE_MINUTE\"",
                 "timestampResultFieldInOriginalDimensions": 1,
                 "useNativeQueryExplain": true
               }
             }
           },
           "intervals": {
             "type": "intervals",
             "intervals": [
               "-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
             ]
           },
           "virtualColumns": [
             {
               "type": "expression",
               "name": "v0",
               "expression": "array(\"d1\",strlen(\"d0\"))",
               "outputType": "ARRAY<LONG>"
             }
           ],
           "granularity": {
             "type": "all"
           },
           "dimensions": [
             {
               "type": "default",
               "dimension": "d0",
               "outputName": "_d0",
               "outputType": "STRING"
             }
           ],
           "aggregations": [
             {
               "type": "expression",
               "name": "a0",
               "fields": [
                 "v0"
               ],
               "accumulatorIdentifier": "__acc",
               "initialValue": "ARRAY<ARRAY<LONG>>[]",
               "initialCombineValue": "ARRAY<ARRAY<LONG>>[]",
               "isNullUnlessAggregated": true,
               "shouldAggregateNullInputs": true,
               "shouldCombineAggregateNullInputs": false,
               "fold": "array_append(\"__acc\", \"v0\")",
               "combine": "array_concat(\"__acc\", \"a0\")",
               "maxSizeBytes": 1024
             }
           ],
           "limitSpec": {
             "type": "NoopLimitSpec"
           },
           "context": {
             "queryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
             "sqlOuterLimit": 1001,
             "sqlQueryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
             "sqlStringifyArrays": false,
             "useNativeQueryExplain": true
           }
         }
       },
       "virtualColumn": {
         "type": "expression",
         "name": "j0.unnest",
         "expression": "timestamp_floor(\"__time\",'PT5M',null,'UTC')",
         "outputType": "LONG"
       },
       "unnestFilter": null
     },
     "intervals": {
       "type": "intervals",
       "intervals": [
         "-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
       ]
     },
     "resultFormat": "compactedList",
     "limit": 1001,
     "columns": [
       "_d0",
       "j0.unnest"
     ],
     "context": {
       "queryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
       "sqlOuterLimit": 1001,
       "sqlQueryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
       "sqlStringifyArrays": false,
       "useNativeQueryExplain": true
     },
     "columnTypes": [
       "STRING",
       "ARRAY<LONG>"
     ],
     "granularity": {
       "type": "all"
     },
     "legacy": false
   }
   ```
   </details>
   
   The problematic part of the plan is:
   
   ```
       "virtualColumn": {
         "type": "expression",
         "name": "j0.unnest",
         "expression": "timestamp_floor(\"__time\",'PT5M',null,'UTC')",
         "outputType": "LONG"
       },
   ```
   
   The unnest column is using the `timestamp_floor` expression from the first 
CTE instead of the `a0` expression from the second CTE whose output type is 
`"ARRAY<ARRAY<LONG>>"`.
   
   Updating that in the plan and running it yields correct results.
   <details>
     <summary>Updated plan with the correct unnest expression</summary>
   
   ```json
   {
     "queryType": "scan",
     "dataSource": {
       "type": "unnest",
       "base": {
         "type": "query",
         "query": {
           "queryType": "groupBy",
           "dataSource": {
             "type": "query",
             "query": {
               "queryType": "groupBy",
               "dataSource": {
                 "type": "table",
                 "name": "unnest_ds"
               },
               "intervals": {
                 "type": "intervals",
                 "intervals": [
                   
"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
                 ]
               },
               "virtualColumns": [
                 {
                   "type": "expression",
                   "name": "v0",
                   "expression": 
"timestamp_floor(\"__time\",'PT5M',null,'UTC')",
                   "outputType": "LONG"
                 }
               ],
               "granularity": {
                 "type": "all"
               },
               "dimensions": [
                 {
                   "type": "default",
                   "dimension": "c1",
                   "outputName": "d0",
                   "outputType": "STRING"
                 },
                 {
                   "type": "default",
                   "dimension": "v0",
                   "outputName": "d1",
                   "outputType": "LONG"
                 }
               ],
               "limitSpec": {
                 "type": "NoopLimitSpec"
               },
               "context": {
                 "queryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
                 "sqlOuterLimit": 1001,
                 "sqlQueryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
                 "sqlStringifyArrays": false,
                 "timestampResultField": "d1",
                 "timestampResultFieldGranularity": "\"FIVE_MINUTE\"",
                 "timestampResultFieldInOriginalDimensions": 1,
                 "useNativeQueryExplain": true
               }
             }
           },
           "intervals": {
             "type": "intervals",
             "intervals": [
               "-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
             ]
           },
           "virtualColumns": [
             {
               "type": "expression",
               "name": "v0",
               "expression": "array(\"d1\",strlen(\"d0\"))",
               "outputType": "ARRAY<LONG>"
             }
           ],
           "granularity": {
             "type": "all"
           },
           "dimensions": [
             {
               "type": "default",
               "dimension": "d0",
               "outputName": "_d0",
               "outputType": "STRING"
             }
           ],
           "aggregations": [
             {
               "type": "expression",
               "name": "a0",
               "fields": [
                 "v0"
               ],
               "accumulatorIdentifier": "__acc",
               "initialValue": "ARRAY<ARRAY<LONG>>[]",
               "initialCombineValue": "ARRAY<ARRAY<LONG>>[]",
               "isNullUnlessAggregated": true,
               "shouldAggregateNullInputs": true,
               "shouldCombineAggregateNullInputs": false,
               "fold": "array_append(\"__acc\", \"v0\")",
               "combine": "array_concat(\"__acc\", \"a0\")",
               "maxSizeBytes": 1024
             }
           ],
           "limitSpec": {
             "type": "NoopLimitSpec"
           },
           "context": {
             "queryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
             "sqlOuterLimit": 1001,
             "sqlQueryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
             "sqlStringifyArrays": false,
             "useNativeQueryExplain": true
           }
         }
       },
       "virtualColumn": {
         "type": "expression",
         "name": "j0.unnest",
         "expression": "a0",
         "outputType": "ARRAY<ARRAY<LONG>>"
       },
       "unnestFilter": null
     },
     "intervals": {
       "type": "intervals",
       "intervals": [
         "-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
       ]
     },
     "resultFormat": "compactedList",
     "limit": 1001,
     "columns": [
       "_d0",
       "j0.unnest"
     ],
     "context": {
       "queryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
       "sqlOuterLimit": 1001,
       "sqlQueryId": "cde89494-705c-4edd-a8a1-dda7cc7f5457",
       "sqlStringifyArrays": false,
       "useNativeQueryExplain": true
     },
     "columnTypes": [
       "STRING",
       "ARRAY<LONG>"
     ],
     "granularity": {
       "type": "all"
     },
     "legacy": false
   }
   ```
   </details>
   
   ```diff
   <       "expression": "timestamp_floor(\"__time\",'PT5M',null,'UTC')",
   <       "outputType": "LONG"
   ---
   >       "expression": "a0",
   >       "outputType": "ARRAY<ARRAY<LONG>>"
   ```
   
   ### Affected Version
   
   32.0.0 & 2c46787d8c399b96f88dcf3a14c68b6753b12464 (tip of master)
   
   ### Description
   
   The issue appears to be related to how virtual columns are generated and 
handled during the native query planning stage. Specifically, when a `GROUP BY` 
clause is present in the first CTE, the planner incorrectly references virtual 
columns from the first native query in the unnest phase, instead of those 
generated by the second native query. Removing the `GROUP BY` from the first 
CTE and modifying the structure of the second CTE resolves the issue, but this 
approach loses the original grouping context.
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to