[
https://issues.apache.org/jira/browse/CARBONDATA-4334?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Xinyu Zeng updated CARBONDATA-4334:
-----------------------------------
Description:
Hi, I follow the instructions of <Installing and Configuring CarbonData to run
locally with Spark Shell> in Quick Start guide, using
apache-carbondata-2.3.0-bin-spark2.3.4-hadoop2.7.2.jar.
The scala script I am running is :
{code:java}
import java.io.File
import org.apache.spark.sql.{CarbonEnv, SaveMode, SparkSession}
import org.apache.carbondata.core.constants.{CarbonCommonConstants,
CarbonV3DataFormatConstants}
import org.apache.carbondata.core.util.CarbonProperties
def createSparkSession(): SparkSession =
{
CarbonProperties.getInstance()
.addProperty(CarbonCommonConstants.ENABLE_UNSAFE_COLUMN_PAGE, "true")
.addProperty(CarbonV3DataFormatConstants.BLOCKLET_SIZE_IN_MB, "64")
.addProperty(CarbonCommonConstants.CARBON_BADRECORDS_LOC, "")
val spark = SparkSession.builder().config(sc.getConf).enableHiveSupport.config("spark.sql.extensions", "org.apache.spark.sql.CarbonExtensions").getOrCreate()
CarbonEnv.getInstance(spark) spark.sparkContext.setLogLevel("ERROR") spark }
val spark = createSparkSession()
spark.sql("DROP TABLE IF EXISTS lineitem_0509")
spark.sql(s""" | CREATE TABLE IF NOT EXISTS lineitem_0509(
| L_ORDERKEY BIGINT, | L_PARTKEY BIGINT, | L_SUPPKEY
BIGINT, | L_LINENUMBER INTEGER, | L_QUANTITY DECIMAL,
| L_EXTENDEDPRICE DECIMAL, | L_DISCOUNT DECIMAL, | L_TAX
DECIMAL, | L_RETURNFLAG CHAR(1), | L_LINESTATUS CHAR(1), |
L_SHIPDATE DATE, | L_COMMITDATE DATE, | L_RECEIPTDATE DATE,
| L_SHIPINSTRUCT CHAR(25), | L_SHIPMODE CHAR(10), | L_COMMENT
VARCHAR(44)) | STORED AS carbondata
|TBLPROPERTIES ('SORT_COLUMNS'='L_SHIPDATE', 'COLUMN_META_CACHE'='L_SHIPDATE')
""".stripMargin)
spark.sql("LOAD DATA INPATH
'/root/lineitem_1M_shipdate.csv' INTO TABLE lineitem_0509")
spark.sql("LOAD DATA INPATH '/root/lineitem_1M_shipdate.csv' INTO TABLE
lineitem_0509")
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
// this is used to implicitly convert an RDD to a DataFrame.
import sqlContext.implicits._
spark.time
{ val df = sqlContext.sql("SELECT l_extendedprice, l_discount, l_quantity FROM
lineitem_0509 WHERE l_shipdate >= to_date('1994-01-01') AND l_shipdate <
to_date('1995-01-01')") } {code}
However, it seems like the blocklet index is not effective because whether I
add the shipdate filter or not the select query takes the same time to run.
Does anyone have an idea?
Also, I feel the documentation of CarbonData is not as good as that of other
open source projects. The lack of response from the community prevents it from
being used by others.
was:
Hi, I follow the instructions of <Installing and Configuring CarbonData to run
locally with Spark Shell> in Quick Start guide, using
apache-carbondata-2.3.0-bin-spark2.3.4-hadoop2.7.2.jar.
The scala script I am running is :
{code:java}
import java.io.File
import org.apache.spark.sql.{CarbonEnv, SaveMode, SparkSession}
import org.apache.carbondata.core.constants.{CarbonCommonConstants,
CarbonV3DataFormatConstants}
import org.apache.carbondata.core.util.CarbonProperties
def createSparkSession(): SparkSession =
{
CarbonProperties.getInstance()
.addProperty(CarbonCommonConstants.ENABLE_UNSAFE_COLUMN_PAGE, "true")
.addProperty(CarbonV3DataFormatConstants.BLOCKLET_SIZE_IN_MB, "64")
.addProperty(CarbonCommonConstants.CARBON_BADRECORDS_LOC, "")
val spark = SparkSession.builder().config(sc.getConf).enableHiveSupport.config("spark.sql.extensions", "org.apache.spark.sql.CarbonExtensions").getOrCreate()
CarbonEnv.getInstance(spark) spark.sparkContext.setLogLevel("ERROR") spark }
val spark = createSparkSession()
spark.sql("DROP TABLE IF EXISTS lineitem_0509")
spark.sql(s""" | CREATE TABLE IF NOT EXISTS lineitem_0509(
| L_ORDERKEY BIGINT, | L_PARTKEY BIGINT, | L_SUPPKEY
BIGINT, | L_LINENUMBER INTEGER, | L_QUANTITY DECIMAL,
| L_EXTENDEDPRICE DECIMAL, | L_DISCOUNT DECIMAL, | L_TAX
DECIMAL, | L_RETURNFLAG CHAR(1), | L_LINESTATUS CHAR(1), |
L_SHIPDATE DATE, | L_COMMITDATE DATE, | L_RECEIPTDATE DATE,
| L_SHIPINSTRUCT CHAR(25), | L_SHIPMODE CHAR(10), | L_COMMENT
VARCHAR(44)) | STORED AS carbondata
|TBLPROPERTIES ('SORT_COLUMNS'='L_SHIPDATE', 'COLUMN_META_CACHE'='L_SHIPDATE')
""".stripMargin)
spark.sql("LOAD DATA INPATH
'/root/lineitem_1M_shipdate.csv' INTO TABLE lineitem_0509")
spark.sql("LOAD DATA INPATH '/root/lineitem_1M_shipdate.csv' INTO TABLE
lineitem_0509")
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
// this is used to implicitly convert an RDD to a DataFrame.
import sqlContext.implicits._
spark.time
{ val df = sqlContext.sql("SELECT l_extendedprice, l_discount, l_quantity FROM
lineitem_0509 WHERE l_shipdate >= to_date('1994-01-01') AND l_shipdate <
to_date('1995-01-01')") } {code}
However, it seems like the blocklet index is not effective because whether I
add the shipdate filter or not the select query takes the same time to run.
> Blocklet skipping not works?
> ----------------------------
>
> Key: CARBONDATA-4334
> URL: https://issues.apache.org/jira/browse/CARBONDATA-4334
> Project: CarbonData
> Issue Type: Improvement
> Reporter: Xinyu Zeng
> Priority: Major
>
> Hi, I follow the instructions of <Installing and Configuring CarbonData to
> run locally with Spark Shell> in Quick Start guide, using
> apache-carbondata-2.3.0-bin-spark2.3.4-hadoop2.7.2.jar.
> The scala script I am running is :
>
> {code:java}
> import java.io.File
> import org.apache.spark.sql.{CarbonEnv, SaveMode, SparkSession}
> import org.apache.carbondata.core.constants.{CarbonCommonConstants,
> CarbonV3DataFormatConstants}
> import org.apache.carbondata.core.util.CarbonProperties
> def createSparkSession(): SparkSession =
> {
> CarbonProperties.getInstance()
> .addProperty(CarbonCommonConstants.ENABLE_UNSAFE_COLUMN_PAGE, "true")
> .addProperty(CarbonV3DataFormatConstants.BLOCKLET_SIZE_IN_MB, "64")
> .addProperty(CarbonCommonConstants.CARBON_BADRECORDS_LOC, "")
> val spark = SparkSession.builder().config(sc.getConf).enableHiveSupport.config("spark.sql.extensions", "org.apache.spark.sql.CarbonExtensions").getOrCreate()
> CarbonEnv.getInstance(spark) spark.sparkContext.setLogLevel("ERROR") spark }
> val spark = createSparkSession()
> spark.sql("DROP TABLE IF EXISTS lineitem_0509")
> spark.sql(s""" | CREATE TABLE IF NOT EXISTS lineitem_0509(
> | L_ORDERKEY BIGINT, | L_PARTKEY BIGINT, |
> L_SUPPKEY BIGINT, | L_LINENUMBER INTEGER, | L_QUANTITY
> DECIMAL, | L_EXTENDEDPRICE DECIMAL, | L_DISCOUNT DECIMAL,
> | L_TAX DECIMAL, | L_RETURNFLAG CHAR(1), | L_LINESTATUS
> CHAR(1), | L_SHIPDATE DATE, | L_COMMITDATE DATE, |
> L_RECEIPTDATE DATE, | L_SHIPINSTRUCT CHAR(25), | L_SHIPMODE
> CHAR(10), | L_COMMENT VARCHAR(44)) | STORED AS
> carbondata |TBLPROPERTIES ('SORT_COLUMNS'='L_SHIPDATE',
> 'COLUMN_META_CACHE'='L_SHIPDATE') """.stripMargin) spark.sql("LOAD
> DATA INPATH '/root/lineitem_1M_shipdate.csv' INTO TABLE lineitem_0509")
> spark.sql("LOAD DATA INPATH '/root/lineitem_1M_shipdate.csv' INTO TABLE
> lineitem_0509")
> val sqlContext = new org.apache.spark.sql.SQLContext(sc)
> // this is used to implicitly convert an RDD to a DataFrame.
> import sqlContext.implicits._
> spark.time
> { val df = sqlContext.sql("SELECT l_extendedprice, l_discount, l_quantity
> FROM lineitem_0509 WHERE l_shipdate >= to_date('1994-01-01') AND l_shipdate <
> to_date('1995-01-01')") } {code}
>
> However, it seems like the blocklet index is not effective because whether I
> add the shipdate filter or not the select query takes the same time to run.
> Does anyone have an idea?
>
> Also, I feel the documentation of CarbonData is not as good as that of other
> open source projects. The lack of response from the community prevents it from
> being used by others.
--
This message was sent by Atlassian Jira
(v8.20.7#820007)