[ https://issues.apache.org/jira/browse/SPARK-17396?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15467956#comment-15467956 ]
Ryan Blue commented on SPARK-17396: ----------------------------------- I'll put together a patch for this with a shared executor service. Although ForkJoinPool isn't to blame, from what I've read about it we probably should be using a different implementation. We don't need the fork/join task pattern and it would be much better to have reliable (and documented) semantics. > Threads number keep increasing when query on external CSV partitioned table > --------------------------------------------------------------------------- > > Key: SPARK-17396 > URL: https://issues.apache.org/jira/browse/SPARK-17396 > Project: Spark > Issue Type: Bug > Components: Spark Core > Affects Versions: 2.0.0 > Reporter: pin_zhang > > 1. Create a external partitioned table row format CSV > 2. Add 16 partitions to the table > 3. Run SQL "select count(*) from test_csv" > 4. ForkJoinThread number keep increasing > This happened when table partitions number greater than 10. > 5. Test Code > {code:lang=java} > import org.apache.spark.SparkConf > import org.apache.spark.SparkContext > import org.apache.spark.sql.hive.HiveContext > object Bugs { > def main(args: Array[String]): Unit = { > val location = "file:///g:/home/test/csv" > val create = s"""CREATE EXTERNAL TABLE test_csv > (ID string, SEQ string ) > PARTITIONED BY(index int) > ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' > LOCATION "${location}" > """ > val add_part = s""" > ALTER TABLE test_csv ADD > PARTITION (index=1)LOCATION '${location}/index=1' > PARTITION (index=2)LOCATION '${location}/index=2' > PARTITION (index=3)LOCATION '${location}/index=3' > PARTITION (index=4)LOCATION '${location}/index=4' > PARTITION (index=5)LOCATION '${location}/index=5' > PARTITION (index=6)LOCATION '${location}/index=6' > PARTITION (index=7)LOCATION '${location}/index=7' > PARTITION (index=8)LOCATION '${location}/index=8' > PARTITION (index=9)LOCATION '${location}/index=9' > PARTITION (index=10)LOCATION '${location}/index=10' > 
PARTITION (index=11)LOCATION '${location}/index=11' > PARTITION (index=12)LOCATION '${location}/index=12' > PARTITION (index=13)LOCATION '${location}/index=13' > PARTITION (index=14)LOCATION '${location}/index=14' > PARTITION (index=15)LOCATION '${location}/index=15' > PARTITION (index=16)LOCATION '${location}/index=16' > """ > val conf = new SparkConf().setAppName("scala").setMaster("local[2]") > conf.set("spark.sql.warehouse.dir", "file:///g:/home/warehouse") > val ctx = new SparkContext(conf) > val hctx = new HiveContext(ctx) > hctx.sql(create) > hctx.sql(add_part) > for (i <- 1 to 6) { > new Query(hctx).start() > } > } > class Query(htcx: HiveContext) extends Thread { > setName("Query-Thread") > override def run = { > while (true) { > htcx.sql("select count(*) from test_csv").show() > Thread.sleep(100) > } > } > } > } > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org