Sure, the code is quite simple; the main function should make it clear.
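(A note on the two non-Spark classes below: RAWF.reader(path) is a local helper of ours that returns a BufferedReader over a local file, and Pair is our own serializable (i, j) key class. For anyone trying to run this without the helper, I assume this plain-JDK line is a drop-in replacement for the first line of createBroadcastPoints, plus an import java.io.FileReader:

    // assumed drop-in replacement for RAWF.reader(localPointPath)
    BufferedReader br = new BufferedReader(new FileReader(localPointPath));

and any small serializable two-int class should work in place of Pair.)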
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.broadcast.Broadcast;

import scala.Tuple2;

public class Test1 {

    // Reads the local CSV into a row x col matrix that will be broadcast.
    public static double[][] createBroadcastPoints(String localPointPath, int row, int col) throws IOException {
        BufferedReader br = RAWF.reader(localPointPath); // RAWF: local helper, returns a BufferedReader
        String line = null;
        int rowIndex = 0;
        double[][] pointFeatures = new double[row][col];
        while ((line = br.readLine()) != null) {
            List<String> point = Arrays.asList(line.split(","));
            int colIndex = 0;
            for (String pointFeature : point) {
                pointFeatures[rowIndex][colIndex] = Double.valueOf(pointFeature);
                colIndex++;
            }
            rowIndex++;
        }
        br.close();
        return pointFeatures;
    }

    public static void main(String[] args) throws IOException {
        /* ---- Parameter settings ---- */
        String localPointPath = "/home/hduser/skyrock/skyrockImageFeatures.csv";
        String remoteFilePath =
                "hdfs://HadoopV26Master:9000/user/skyrock/skyrockImageIndexedFeatures.csv"; // this csv file is only 468MB
        final int row = 133433;
        final int col = 458;
        /* ---------------------------- */

        SparkConf conf = new SparkConf()
                .setAppName("distance")
                .setMaster("spark://HadoopV26Master:7077")
                .set("spark.executor.memory", "4g")
                .set("spark.eventLog.enabled", "true")
                .set("spark.eventLog.dir", "/usr/local/spark/logs/spark-events")
                .set("spark.local.dir", "/tmp/spark-temp");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> textFile = sc.textFile(remoteFilePath);

        // Broadcast variable: the full point matrix, shipped to every executor.
        final Broadcast<double[][]> broadcastPoints =
                sc.broadcast(createBroadcastPoints(localPointPath, row, col));

        /*
         * Compute the distance from each broadcast point to each input instance.
         * distance list: index = n(i-1) - i*(i-1)/2 + j-i-1
         */
        JavaPairRDD<Pair, Double> distance = textFile.flatMapToPair(
                new PairFlatMapFunction<String, Pair, Double>() {
                    public Iterable<Tuple2<Pair, Double>> call(String v1) throws Exception {
                        // First field of each line is the point index, the rest are its features.
                        List<String> al = Arrays.asList(v1.split(","));
                        double[] featureVals = new double[al.size()];
                        for (int j = 0; j < al.size() - 1; j++)
                            featureVals[j] = Double.valueOf(al.get(j + 1));
                        int jIndex = Integer.valueOf(al.get(0));
                        double[][] allPoints = broadcastPoints.getValue();
                        double sum = 0;
                        List<Tuple2<Pair, Double>> list = new ArrayList<Tuple2<Pair, Double>>();
                        // Euclidean distance from every broadcast point to this instance.
                        for (int i = 0; i < row; i++) {
                            sum = 0;
                            for (int j = 0; j < al.size() - 1; j++) {
                                sum += (allPoints[i][j] - featureVals[j]) * (allPoints[i][j] - featureVals[j]);
                            }
                            list.add(new Tuple2<Pair, Double>(new Pair(i, jIndex), Math.sqrt(sum)));
                        }
                        return list;
                    }
                });

        distance.saveAsTextFile("hdfs://HadoopV26Master:9000/user/" + args[0]);
    }
}
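One sanity check on the sizes, since they matter for the question below: the broadcast itself is row x col doubles, i.e. 133,433 x 458 x 8 bytes, about 466 MB, which matches the 468 MB file. But flatMapToPair emits one tuple per broadcast row for every input line, so if the input file also has 133,433 lines, saveAsTextFile writes roughly row^2 records. A back-of-envelope sketch (the ~50 bytes per output text record is only my guess, not a measurement):

    public class SizeCheck {
        public static void main(String[] args) {
            long n = 133_433L;                    // row: points in the broadcast matrix
            long broadcastBytes = n * 458L * 8L;  // the double[][] payload
            long outputRecords = n * n;           // one (Pair, Double) per (i, jIndex)
            long bytesPerRecord = 50L;            // assumption: average size of one text record
            System.out.printf("broadcast ~%d MB, output ~%d records, ~%d GB%n",
                    broadcastBytes >> 20, outputRecords, (outputRecords * bytesPerRecord) >> 30);
            // prints: broadcast ~466 MB, output ~17804365489 records, ~829 GB
        }
    }

If that arithmetic is right, it is the saved distance output, not the broadcast, that fills the disks (far beyond the ~50 GB the HDFS has), which would also fit the DiskOutOfSpaceException in the datanode logs below.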
On 21 May 2015 at 16:44, Akhil Das <ak...@sigmoidanalytics.com> wrote:

> Can you share the code? Maybe I or someone else can help you out.
>
> Thanks
> Best Regards
>
> On Thu, May 21, 2015 at 1:45 PM, Allan Jie <allanmcgr...@gmail.com> wrote:
>
>> Hi,
>>
>> I just checked the datanode logs; they look like this:
>>
>> 2015-05-20 11:42:14,605 INFO org.apache.hadoop.hdfs.server.datanode.DataNode.clienttrace: src: /10.9.0.48:50676, dest: /10.9.0.17:50010, bytes: 134217728, op: HDFS_WRITE, cliID: DFSClient_NONMAPREDUCE_804680172_54, offset: 0, srvID: 39fb78d5-828a-4319-8303-c704fab526e3, blockid: BP-436159032-10.9.0.16-1431330007172:blk_1073742096_1273, duration: 16994466261
>> 2015-05-20 11:42:14,606 INFO org.apache.hadoop.hdfs.server.datanode.DataNode: PacketResponder: BP-436159032-10.9.0.16-1431330007172:blk_1073742096_1273, type=LAST_IN_PIPELINE, downstreams=0:[] terminating
>> 2015-05-20 11:42:17,788 INFO org.apache.hadoop.hdfs.server.datanode.DataNode.clienttrace: src: /10.9.0.17:49046, dest: /10.9.0.17:50010, bytes: 134217728, op: HDFS_WRITE, cliID: DFSClient_NONMAPREDUCE_102926009_54, offset: 0, srvID: 39fb78d5-828a-4319-8303-c704fab526e3, blockid: BP-436159032-10.9.0.16-1431330007172:blk_1073742099_1276, duration: 17829554438
>> 2015-05-20 11:42:17,788 INFO org.apache.hadoop.hdfs.server.datanode.DataNode: PacketResponder: BP-436159032-10.9.0.16-1431330007172:blk_1073742099_1276, type=HAS_DOWNSTREAM_IN_PIPELINE terminating
>> 2015-05-20 11:42:17,904 INFO org.apache.hadoop.hdfs.server.datanode.DataNode: Receiving BP-436159032-10.9.0.16-1431330007172:blk_1073742103_1280 src: /10.9.0.17:49049 dest: /10.9.0.17:50010
>> 2015-05-20 11:42:17,904 WARN org.apache.hadoop.hdfs.server.datanode.DataNode: IOException in BlockReceiver constructor. Cause is
>> 2015-05-20 11:42:17,904 INFO org.apache.hadoop.hdfs.server.datanode.DataNode: opWriteBlock BP-436159032-10.9.0.16-1431330007172:blk_1073742103_1280 received exception org.apache.hadoop.util.DiskChecker$DiskOutOfSpaceException: Out of space: The volume with the most available space (=114409472 B) is less than the block size (=134217728 B).
>> 2015-05-20 11:42:17,905 ERROR org.apache.hadoop.hdfs.server.datanode.DataNode: HadoopV26Slave1:50010:DataXceiver error processing WRITE_BLOCK operation src: /10.9.0.17:49049 dst: /10.9.0.17:50010
>> org.apache.hadoop.util.DiskChecker$DiskOutOfSpaceException: Out of space: The volume with the most available space (=114409472 B) is less than the block size (=134217728 B).
>>         at org.apache.hadoop.hdfs.server.datanode.fsdataset.RoundRobinVolumeChoosingPolicy.chooseVolume(RoundRobinVolumeChoosingPolicy.java:67)
>>         at org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsVolumeList.getNextVolume(FsVolumeList.java:69)
>>         at org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetImpl.createRbw(FsDatasetImpl.java:1084)
>>         at org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetImpl.createRbw(FsDatasetImpl.java:114)
>>         at org.apache.hadoop.hdfs.server.datanode.BlockReceiver.<init>(BlockReceiver.java:183)
>>         at org.apache.hadoop.hdfs.server.datanode.DataXceiver.writeBlock(DataXceiver.java:615)
>>         at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.opWriteBlock(Receiver.java:137)
>>         at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.processOp(Receiver.java:74)
>>         at org.apache.hadoop.hdfs.server.datanode.DataXceiver.run(DataXceiver.java:235)
>>         at java.lang.Thread.run(Thread.java:745)
>> 2015-05-20 11:43:59,669 INFO org.apache.hadoop.hdfs.server.datanode.BlockPoolSliceScanner: Verification succeeded for BP-436159032-10.9.0.16-1431330007172:blk_1073741999_1176
>> 2015-05-20 11:46:10,214 INFO org.apache.hadoop.hdfs.server.datanode.BlockPoolSliceScanner: Verification succeeded for BP-436159032-10.9.0.16-1431330007172:blk_1073742000_1177
>> 2015-05-20 11:48:35,445 INFO org.apache.hadoop.hdfs.server.datanode.BlockPoolSliceScanner: Verification succeeded for BP-436159032-10.9.0.16-1431330007172:blk_1073741990_1167
>> 2015-05-20 11:50:04,043 INFO org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetAsyncDiskService: Scheduling blk_1073742080_1257 file /tmp/hadoop-hduser/dfs/data/current/BP-436159032-10.9.0.16-1431330007172/current/finalized/subdir0/subdir1/blk_1073742080 for deletion
>> 2015-05-20 11:50:04,136 INFO org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetAsyncDiskService: Scheduling blk_1073742081_1258 file /tmp/hadoop-hduser/dfs/data/current/BP-436159032-10.9.0.16-1431330007172/current/finalized/subdir0/subdir1/blk_1073742081 for deletion
>> 2015-05-20 11:50:04,136 INFO org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetAsyncDiskService: Scheduling blk_1073742082_1259 file /tmp/hadoop-hduser/dfs/data/current/BP-436159032-10.9.0.16-1431330007172/current/finalized/subdir0/subdir1/blk_1073742082 for deletion
>> 2015-05-20 11:50:04,136 INFO org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetAsyncDiskService: Scheduling blk_1073742083_1260 file /tmp/hadoop-hduser/dfs/data/current/BP-436159032-10.9.0.16-1431330007172/current/finalized/subdir0/subdir1/blk_1073742083 for deletion
>> 2015-05-20 11:50:04,136 INFO org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetAsyncDiskService: Scheduling blk_1073742084_1261 file /tmp/hadoop-hduser/dfs/data/current/BP-436159032-10.9.0.16-1431330007172/current/finalized/subdir0/subdir1/blk_1073742084 for deletion
>>
>> But why is this an HDFS issue? I thought Spark broadcasts the variable in memory. Also, the datanode logs suggest I simply have no space left to store anything, which brings the question back: I originally have 50 GB for HDFS, so why does broadcasting one variable end up dominating the whole HDFS?
>>
>> P.S. When I reduce the data to only 10 lines (~200 KB), it works without a problem.
>>
>> Best Regards,
>> Allan
>>
>> On 21 May 2015 at 01:30, Akhil Das <ak...@sigmoidanalytics.com> wrote:
>>
>>> This looks more like an issue with your HDFS setup; can you check the datanode logs? Also try putting a new file into HDFS and see whether that works.
>>>
>>> Thanks
>>> Best Regards
>>>
>>> On Wed, May 20, 2015 at 11:47 AM, allanjie <allanmcgr...@gmail.com> wrote:
>>>
>>>> Hi All,
>>>>
>>>> The variable I need to broadcast is just 468 MB.
>>>>
>>>> When broadcasting, it just "stops" here:
>>>>
>>>> 15/05/20 11:36:14 INFO Configuration.deprecation: mapred.tip.id is deprecated. Instead, use mapreduce.task.id
>>>> 15/05/20 11:36:14 INFO Configuration.deprecation: mapred.task.id is deprecated. Instead, use mapreduce.task.attempt.id
>>>> 15/05/20 11:36:14 INFO Configuration.deprecation: mapred.task.is.map is deprecated. Instead, use mapreduce.task.ismap
>>>> 15/05/20 11:36:14 INFO Configuration.deprecation: mapred.task.partition is deprecated. Instead, use mapreduce.task.partition
>>>> 15/05/20 11:36:14 INFO Configuration.deprecation: mapred.job.id is deprecated. Instead, use mapreduce.job.id
>>>> 15/05/20 11:36:14 INFO mapred.FileInputFormat: Total input paths to process : 1
>>>> 15/05/20 11:36:14 INFO spark.SparkContext: Starting job: saveAsTextFile at Test1.java:90
>>>> 15/05/20 11:36:15 INFO scheduler.DAGScheduler: Got job 0 (saveAsTextFile at Test1.java:90) with 4 output partitions (allowLocal=false)
>>>> 15/05/20 11:36:15 INFO scheduler.DAGScheduler: Final stage: Stage 0(saveAsTextFile at Test1.java:90)
>>>> 15/05/20 11:36:15 INFO scheduler.DAGScheduler: Parents of final stage: List()
>>>> 15/05/20 11:36:15 INFO scheduler.DAGScheduler: Missing parents: List()
>>>> 15/05/20 11:36:15 INFO scheduler.DAGScheduler: Submitting Stage 0 (MapPartitionsRDD[3] at saveAsTextFile at Test1.java:90), which has no missing parents
>>>> 15/05/20 11:36:15 INFO storage.MemoryStore: ensureFreeSpace(129264) called with curMem=988453294, maxMem=2061647216
>>>> 15/05/20 11:36:15 INFO storage.MemoryStore: Block broadcast_2 stored as values in memory (estimated size 126.2 KB, free 1023.4 MB)
>>>> 15/05/20 11:36:15 INFO storage.MemoryStore: ensureFreeSpace(78190) called with curMem=988582558, maxMem=2061647216
>>>> 15/05/20 11:36:15 INFO storage.MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 76.4 KB, free 1023.3 MB)
>>>> 15/05/20 11:36:15 INFO storage.BlockManagerInfo: Added broadcast_2_piece0 in memory on HadoopV26Master:44855 (size: 76.4 KB, free: 1492.4 MB)
>>>> 15/05/20 11:36:15 INFO storage.BlockManagerMaster: Updated info of block broadcast_2_piece0
>>>> 15/05/20 11:36:15 INFO spark.SparkContext: Created broadcast 2 from broadcast at DAGScheduler.scala:839
>>>> 15/05/20 11:36:15 INFO scheduler.DAGScheduler: Submitting 4 missing tasks from Stage 0 (MapPartitionsRDD[3] at saveAsTextFile at Test1.java:90)
>>>> 15/05/20 11:36:15 INFO scheduler.TaskSchedulerImpl: Adding task set 0.0 with 4 tasks
>>>> 15/05/20 11:36:15 INFO scheduler.TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, HadoopV26Slave5, NODE_LOCAL, 1387 bytes)
>>>> 15/05/20 11:36:15 INFO scheduler.TaskSetManager: Starting task 1.0 in stage 0.0 (TID 1, HadoopV26Slave3, NODE_LOCAL, 1387 bytes)
>>>> 15/05/20 11:36:15 INFO scheduler.TaskSetManager: Starting task 2.0 in stage 0.0 (TID 2, HadoopV26Slave4, NODE_LOCAL, 1387 bytes)
>>>> 15/05/20 11:36:15 INFO scheduler.TaskSetManager: Starting task 3.0 in stage 0.0 (TID 3, HadoopV26Slave1, NODE_LOCAL, 1387 bytes)
>>>> 15/05/20 11:36:15 INFO storage.BlockManagerInfo: Added broadcast_2_piece0 in memory on HadoopV26Slave5:45357 (size: 76.4 KB, free: 2.1 GB)
>>>> 15/05/20 11:36:15 INFO storage.BlockManagerInfo: Added broadcast_2_piece0 in memory on HadoopV26Slave3:57821 (size: 76.4 KB, free: 2.1 GB)
>>>> ...
>>>> 15/05/20 11:36:28 INFO storage.BlockManagerInfo: Added broadcast_1_piece1 in memory on HadoopV26Slave5:45357 (size: 4.0 MB, free: 1646.3 MB)
>>>>
>>>> It never went any further, however long I waited; it didn't exactly stop, it just got stuck.
>>>>
>>>> I have 6 workers/VMs, each with 8 GB of memory and 12 GB of disk storage.
>>>> After a few minutes the program failed and showed something like this:
>>>>
>>>> 15/05/20 11:42:45 WARN scheduler.TaskSetManager: Lost task 1.0 in stage 0.0 (TID 1, HadoopV26Slave3): org.apache.hadoop.ipc.RemoteException(java.io.IOException): File /user/output/_temporary/0/_temporary/attempt_201505201136_0000_m_000001_1/part-00001 could only be replicated to 0 nodes instead of minReplication (=1). There are 6 datanode(s) running and no node(s) are excluded in this operation.
>>>>         at org.apache.hadoop.hdfs.server.blockmanagement.BlockManager.chooseTarget4NewBlock(BlockManager.java:1549)
>>>>         at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getAdditionalBlock(FSNamesystem.java:3200)
>>>>         at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.addBlock(NameNodeRpcServer.java:641)
>>>>         at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.addBlock(ClientNamenodeProtocolServerSideTranslatorPB.java:482)
>>>>         at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
>>>>         at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:619)
>>>>         at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:962)
>>>>         at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2039)
>>>>         at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2035)
>>>>         at java.security.AccessController.doPrivileged(Native Method)
>>>>         at javax.security.auth.Subject.doAs(Subject.java:415)
>>>>         at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1628)
>>>>         at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2033)
>>>>
>>>>         at org.apache.hadoop.ipc.Client.call(Client.java:1468)
>>>>         at org.apache.hadoop.ipc.Client.call(Client.java:1399)
>>>>         at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:232)
>>>>         at com.sun.proxy.$Proxy13.addBlock(Unknown Source)
>>>>         at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.addBlock(ClientNamenodeProtocolTranslatorPB.java:399)
>>>>         at sun.reflect.GeneratedMethodAccessor7.invoke(Unknown Source)
>>>>         at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>>>>         at java.lang.reflect.Method.invoke(Method.java:606)
>>>>         at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:187)
>>>>         at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102)
>>>>         at com.sun.proxy.$Proxy14.addBlock(Unknown Source)
>>>>         at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.locateFollowingBlock(DFSOutputStream.java:1532)
>>>>         at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.nextBlockOutputStream(DFSOutputStream.java:1349)
>>>>         at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.run(DFSOutputStream.java:588)
>>>>
>>>> I then checked the disk volume on each slave: almost all of the storage has been used up, yet the variable I broadcast is just 468 MB.
>>>>
>>>> The variable is originally saved in HDFS; in the Java program I read it from HDFS and then broadcast it.
>>>>
>>>> Can anyone help? Many thanks.
>>>>
>>>> --
>>>> View this message in context: http://apache-spark-user-list.1001560.n3.nabble.com/java-program-Get-Stuck-at-broadcasting-tp22953.html
>>>> Sent from the Apache Spark User List mailing list archive at Nabble.com.
>>>>
>>>> ---------------------------------------------------------------------
>>>> To unsubscribe, e-mail: user-unsubscr...@spark.apache.org
>>>> For additional commands, e-mail: user-h...@spark.apache.org
>>>
>>
>> --
>> PhD student,
>> Social Media Laboratory <http://smedia.ust.hk/>,
>> Department of Electronic & Computer Engineering <http://www.ece.ust.hk/ece.php>,
>> The Hong Kong University of Science and Technology <http://www.ust.hk/>.
>> Website: http://www.allanjie.net
>

--
PhD student,
Social Media Laboratory <http://smedia.ust.hk/>,
Department of Electronic & Computer Engineering <http://www.ece.ust.hk/ece.php>,
The Hong Kong University of Science and Technology <http://www.ust.hk/>.
Website: http://www.allanjie.net