[ https://issues.apache.org/jira/browse/CARBONDATA-4334?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Xinyu Zeng updated CARBONDATA-4334: ----------------------------------- Description: Hi, I follow the instructions of <Installing and Configuring CarbonData to run locally with Spark Shell> in Quick Start guide, using apache-carbondata-2.3.0-bin-spark2.3.4-hadoop2.7.2.jar. The scala script I am running is : {code:java} import java.io.File import org.apache.spark.sql.{CarbonEnv, SaveMode, SparkSession} import org.apache.carbondata.core.constants.{CarbonCommonConstants, CarbonV3DataFormatConstants} import org.apache.carbondata.core.util.CarbonProperties def createSparkSession(): SparkSession = { CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.ENABLE_UNSAFE_COLUMN_PAGE, "true") .addProperty(CarbonV3DataFormatConstants.BLOCKLET_SIZE_IN_MB, "64") .addProperty(CarbonCommonConstants.CARBON_BADRECORDS_LOC, "") valspark=SparkSession.builder().config(sc.getConf).enableHiveSupport.config("spark.sql.extensions","org.apache.spark.sql.CarbonExtensions").getOrCreate() CarbonEnv.getInstance(spark) spark.sparkContext.setLogLevel("ERROR") spark } val spark = createSparkSession() spark.sql("DROP TABLE IF EXISTS lineitem_0509") spark.sql(s""" | CREATE TABLE IF NOT EXISTS lineitem_0509( | L_ORDERKEY BIGINT, | L_PARTKEY BIGINT, | L_SUPPKEY BIGINT, | L_LINENUMBER INTEGER, | L_QUANTITY DECIMAL, | L_EXTENDEDPRICE DECIMAL, | L_DISCOUNT DECIMAL, | L_TAX DECIMAL, | L_RETURNFLAG CHAR(1), | L_LINESTATUS CHAR(1), | L_SHIPDATE DATE, | L_COMMITDATE DATE, | L_RECEIPTDATE DATE, | L_SHIPINSTRUCT CHAR(25), | L_SHIPMODE CHAR(10), | L_COMMENT VARCHAR(44)) | STORED AS carbondata |TBLPROPERTIES ('SORT_COLUMNS'='L_SHIPDATE', 'COLUMN_META_CACHE'='L_SHIPDATE') """.stripMargin)spark.sql("LOAD DATA INPATH '/root/lineitem_1M_shipdate.csv' INTO TABLE lineitem_0509") spark.sql("LOAD DATA INPATH '/root/lineitem_1M_shipdate.csv' INTO TABLE lineitem_0509") val sqlContext = new org.apache.spark.sql.SQLContext(sc) // this is used to implicitly convert an RDD to a DataFrame. 
import sqlContext.implicits._ spark.time { val df = sqlContext.sql("SELECT l_extendedprice, l_discount, l_quantity FROM lineitem_0509 WHERE l_shipdate >= to_date('1994-01-01') AND l_shipdate < to_date('1995-01-01')") } {code} However, it seems like the blocklet index is not effective because whether I add the shipdate filter or not the select query takes the same time to run. Does anyone have an idea? Also, I feel the documentation of CarbonData is not as good as other open source project. The lack of response from community prevents it to be used by others. was: Hi, I follow the instructions of <Installing and Configuring CarbonData to run locally with Spark Shell> in Quick Start guide, using apache-carbondata-2.3.0-bin-spark2.3.4-hadoop2.7.2.jar. The scala script I am running is : {code:java} import java.io.File import org.apache.spark.sql.{CarbonEnv, SaveMode, SparkSession} import org.apache.carbondata.core.constants.{CarbonCommonConstants, CarbonV3DataFormatConstants} import org.apache.carbondata.core.util.CarbonProperties def createSparkSession(): SparkSession = { CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.ENABLE_UNSAFE_COLUMN_PAGE, "true") .addProperty(CarbonV3DataFormatConstants.BLOCKLET_SIZE_IN_MB, "64") .addProperty(CarbonCommonConstants.CARBON_BADRECORDS_LOC, "") valspark=SparkSession.builder().config(sc.getConf).enableHiveSupport.config("spark.sql.extensions","org.apache.spark.sql.CarbonExtensions").getOrCreate() CarbonEnv.getInstance(spark) spark.sparkContext.setLogLevel("ERROR") spark } val spark = createSparkSession() spark.sql("DROP TABLE IF EXISTS lineitem_0509") spark.sql(s""" | CREATE TABLE IF NOT EXISTS lineitem_0509( | L_ORDERKEY BIGINT, | L_PARTKEY BIGINT, | L_SUPPKEY BIGINT, | L_LINENUMBER INTEGER, | L_QUANTITY DECIMAL, | L_EXTENDEDPRICE DECIMAL, | L_DISCOUNT DECIMAL, | L_TAX DECIMAL, | L_RETURNFLAG CHAR(1), | L_LINESTATUS CHAR(1), | L_SHIPDATE DATE, | L_COMMITDATE DATE, | L_RECEIPTDATE DATE, | L_SHIPINSTRUCT CHAR(25), | 
L_SHIPMODE CHAR(10), | L_COMMENT VARCHAR(44)) | STORED AS carbondata |TBLPROPERTIES ('SORT_COLUMNS'='L_SHIPDATE', 'COLUMN_META_CACHE'='L_SHIPDATE') """.stripMargin)spark.sql("LOAD DATA INPATH '/root/lineitem_1M_shipdate.csv' INTO TABLE lineitem_0509") spark.sql("LOAD DATA INPATH '/root/lineitem_1M_shipdate.csv' INTO TABLE lineitem_0509") val sqlContext = new org.apache.spark.sql.SQLContext(sc) // this is used to implicitly convert an RDD to a DataFrame. import sqlContext.implicits._ spark.time { val df = sqlContext.sql("SELECT l_extendedprice, l_discount, l_quantity FROM lineitem_0509 WHERE l_shipdate >= to_date('1994-01-01') AND l_shipdate < to_date('1995-01-01')") } {code} However, it seems like the blocklet index is not effective because whether I add the shipdate filter or not the select query takes the same time to run > Blocklet skipping not works? > ---------------------------- > > Key: CARBONDATA-4334 > URL: https://issues.apache.org/jira/browse/CARBONDATA-4334 > Project: CarbonData > Issue Type: Improvement > Reporter: Xinyu Zeng > Priority: Major > > Hi, I follow the instructions of <Installing and Configuring CarbonData to > run locally with Spark Shell> in Quick Start guide, using > apache-carbondata-2.3.0-bin-spark2.3.4-hadoop2.7.2.jar. 
> The scala script I am running is : > > {code:java} > import java.io.File > import org.apache.spark.sql.{CarbonEnv, SaveMode, SparkSession} > import org.apache.carbondata.core.constants.{CarbonCommonConstants, > CarbonV3DataFormatConstants} > import org.apache.carbondata.core.util.CarbonProperties > def createSparkSession(): SparkSession = > { > CarbonProperties.getInstance() > .addProperty(CarbonCommonConstants.ENABLE_UNSAFE_COLUMN_PAGE, "true") > .addProperty(CarbonV3DataFormatConstants.BLOCKLET_SIZE_IN_MB, "64") > .addProperty(CarbonCommonConstants.CARBON_BADRECORDS_LOC, "") > valspark=SparkSession.builder().config(sc.getConf).enableHiveSupport.config("spark.sql.extensions","org.apache.spark.sql.CarbonExtensions").getOrCreate() > CarbonEnv.getInstance(spark) spark.sparkContext.setLogLevel("ERROR") spark } > val spark = createSparkSession() > spark.sql("DROP TABLE IF EXISTS lineitem_0509") > spark.sql(s""" | CREATE TABLE IF NOT EXISTS lineitem_0509( > | L_ORDERKEY BIGINT, | L_PARTKEY BIGINT, | > L_SUPPKEY BIGINT, | L_LINENUMBER INTEGER, | L_QUANTITY > DECIMAL, | L_EXTENDEDPRICE DECIMAL, | L_DISCOUNT DECIMAL, > | L_TAX DECIMAL, | L_RETURNFLAG CHAR(1), | L_LINESTATUS > CHAR(1), | L_SHIPDATE DATE, | L_COMMITDATE DATE, | > L_RECEIPTDATE DATE, | L_SHIPINSTRUCT CHAR(25), | L_SHIPMODE > CHAR(10), | L_COMMENT VARCHAR(44)) | STORED AS > carbondata |TBLPROPERTIES ('SORT_COLUMNS'='L_SHIPDATE', > 'COLUMN_META_CACHE'='L_SHIPDATE') """.stripMargin)spark.sql("LOAD > DATA INPATH '/root/lineitem_1M_shipdate.csv' INTO TABLE lineitem_0509") > spark.sql("LOAD DATA INPATH '/root/lineitem_1M_shipdate.csv' INTO TABLE > lineitem_0509") > val sqlContext = new org.apache.spark.sql.SQLContext(sc) > // this is used to implicitly convert an RDD to a DataFrame. 
> import sqlContext.implicits._ > spark.time > { val df = sqlContext.sql("SELECT l_extendedprice, l_discount, l_quantity > FROM lineitem_0509 WHERE l_shipdate >= to_date('1994-01-01') AND l_shipdate < > to_date('1995-01-01')") } {code} > > However, it seems like the blocklet index is not effective because whether I > add the shipdate filter or not the select query takes the same time to run. > Does anyone have an idea? > > Also, I feel the documentation of CarbonData is not as good as that of other open > source projects. The lack of response from the community prevents it from being used by > others. -- This message was sent by Atlassian Jira (v8.20.7#820007)