[ 
https://issues.apache.org/jira/browse/PHOENIX-3112?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15814579#comment-15814579
 ] 

Venkata Mohan commented on PHOENIX-3112:
----------------------------------------

I am also encountering the same issue.
Applying order by is not working on filtered data.
After order by on whole data it is working.
But we have large volume of data, so can't load whole data into memory before 
filter.
Please suggest any workaround.
Below is the code what I am using to get correct results.

  def readTable(sqlContext: SQLContext, tableName: String): DataFrame = {
    val connectionProperties: Map[String, String] = Map[String, String](
      "driver" -> sqlContext.sparkContext.getConf.get("spark.phoenix.driver"),
      "url" -> sqlContext.sparkContext.getConf.get("spark.phoenix.db_url"),
      "user" -> sqlContext.sparkContext.getConf.get("spark.phoenix.db_user"),
      "password" -> 
sqlContext.sparkContext.getConf.get("spark.phoenix.db_password"),
      "dbtable" -> tableName)

    val dataFrame: DataFrame = 
sqlContext.read.format("jdbc").options(connectionProperties).load();
    dataFrame
  }

 val dfTempTgt = readTable(sqlContext, "select * from TEST order by ID")
 dfTempTgt.persist(StorageLevel.MEMORY_AND_DISK)
 val dfTgt = dfTempTgt.filter("REGION_KEY = 'ABC'")
 dfTgt.persist(StorageLevel.MEMORY_AND_DISK)
 dfTempTgt.unpersist()
 dfTgt.registerTempTable("TEST")

> Partial row scan not handled correctly
> --------------------------------------
>
>                 Key: PHOENIX-3112
>                 URL: https://issues.apache.org/jira/browse/PHOENIX-3112
>             Project: Phoenix
>          Issue Type: Bug
>    Affects Versions: 4.7.0
>            Reporter: Pierre Lacave
>
> When doing a select of a relatively large table (a few touthands rows) some 
> rows return partially missing.
> When increasing the fitler to return those specific rows, the values appear 
> as expected
> {noformat}
> CREATE TABLE IF NOT EXISTS TEST (
>         BUCKET VARCHAR,
>         TIMESTAMP_DATE TIMESTAMP,
>         TIMESTAMP UNSIGNED_LONG NOT NULL,
>         SRC VARCHAR,
>         DST VARCHAR,
>         ID VARCHAR,
>         ION VARCHAR,
>         IC BOOLEAN NOT NULL,
>         MI UNSIGNED_LONG,
>         AV UNSIGNED_LONG,
>         MA UNSIGNED_LONG,
>         CNT UNSIGNED_LONG,
>         DUMMY VARCHAR
>     CONSTRAINT pk PRIMARY KEY (BUCKET, TIMESTAMP DESC, SRC, DST, ID, ION, IC)
> );{noformat}
> using a python script to generate a CSV with 5000 rows
> {noformat}
> for i in xrange(5000):
>     print "5SEC,2016-07-21 
> 07:25:35.{i},146908593500{i},WWWWWWWW,AAA,BBBB,CCCCCCCC,false,{i}1181000,1788000{i},2497001{i},{i},aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa{i}".format(i=i)
> {noformat}
> bulk inserting the csv in the table
> {noformat}
> phoenix/bin/psql.py localhost -t TEST large.csv
> {noformat}
> here we can see one row that contains no TIMESTAMP_DATE and null values in MI 
> and MA
> {noformat}
> 0: jdbc:phoenix:localhost:2181> select * from TEST 
> ....
> +---------+--------------------------+-------------------+-----------+------+-------+-----------+--------+--------------+--------------+--------------+-------+----------------------------------------------------------------------------+
> | BUCKET  |      TIMESTAMP_DATE      |     TIMESTAMP     |    SRC    | DST  | 
>  ID   |    ION    |   IC   |      MI      |      AV      |      MA      |  
> CNT  |                                   DUMMY                                
>     |
> +---------+--------------------------+-------------------+-----------+------+-------+-----------+--------+--------------+--------------+--------------+-------+----------------------------------------------------------------------------+
> | 5SEC    | 2016-07-21 07:25:35.100  | 1469085935001000  | WWWWWWWW  | AAA  | 
> BBBB  | CCCCCCCC  | false  | 10001181000  | 17880001000  | 24970011000  | 
> 1000  | 
> aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa1000  |
> | 5SEC    | 2016-07-21 07:25:35.999  | 146908593500999   | WWWWWWWW  | AAA  | 
> BBBB  | CCCCCCCC  | false  | 9991181000   | 1788000999   | 2497001999   | 999 
>   | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa999  
>  |
> | 5SEC    | 2016-07-21 07:25:35.998  | 146908593500998   | WWWWWWWW  | AAA  | 
> BBBB  | CCCCCCCC  | false  | 9981181000   | 1788000998   | 2497001998   | 998 
>   | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa998  
>  |
> | 5SEC    |                          | 146908593500997   | WWWWWWWW  | AAA  | 
> BBBB  | CCCCCCCC  | false  | null         | 1788000997   | null         | 997 
>   |                                                                           
>  |
> | 5SEC    | 2016-07-21 07:25:35.996  | 146908593500996   | WWWWWWWW  | AAA  | 
> BBBB  | CCCCCCCC  | false  | 9961181000   | 1788000996   | 2497001996   | 996 
>   | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa996  
>  |
> | 5SEC    | 2016-07-21 07:25:35.995  | 146908593500995   | WWWWWWWW  | AAA  | 
> BBBB  | CCCCCCCC  | false  | 9951181000   | 1788000995   | 2497001995   | 995 
>   | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa995  
>  |
> | 5SEC    | 2016-07-21 07:25:35.994  | 146908593500994   | WWWWWWWW  | AAA  | 
> BBBB  | CCCCCCCC  | false  | 9941181000   | 1788000994   | 2497001994   | 994 
>   | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa994  
>  |
> ....
> {noformat}
> but when selecting that row specifically the values are correct
> {noformat}
> 0: jdbc:phoenix:localhost:2181> select * from TEST where timestamp = 
> 146908593500997;
> +---------+--------------------------+------------------+-----------+------+-------+-----------+--------+-------------+-------------+-------------+------+---------------------------------------------------------------------------+
> | BUCKET  |      TIMESTAMP_DATE      |    TIMESTAMP     |    SRC    | DST  |  
> ID   |    ION    |   IC   |     MI      |     AV      |     MA      | CNT  |  
>                                  DUMMY                                   |
> +---------+--------------------------+------------------+-----------+------+-------+-----------+--------+-------------+-------------+-------------+------+---------------------------------------------------------------------------+
> | 5SEC    | 2016-07-21 07:25:35.997  | 146908593500997  | WWWWWWWW  | AAA  | 
> BBBB  | CCCCCCCC  | false  | 9971181000  | 1788000997  | 2497001997  | 997  | 
> aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa997  |
> +---------+--------------------------+------------------+-----------+------+-------+-----------+--------+-------------+-------------+-------------+------+---------------------------------------------------------------------------+
> 1 row selected (0.159 seconds){noformat}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to