This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 6a759ce18f [SYSTEMDS-3482] Parallel Hadoop IO Startup
6a759ce18f is described below

commit 6a759ce18f08d184d98d652f2e68c952f6a610a9
Author: baunsgaard <[email protected]>
AuthorDate: Tue Jan 3 12:37:29 2023 +0100

    [SYSTEMDS-3482] Parallel Hadoop IO Startup
    
    I observed that the compile time increases to ~0.6 sec if we include IO
    operations, while it is ~0.2 sec if we do not have IO operations. This
    is due to the Hadoop IO we are using taking up to 70% of the compile time
    in cases where we have simple scripts with only a read and a single operation.
    This is a constant overhead on the first IO operation that does not affect
    subsequent IO operations. To improve this, I have moved it to a parallel
    operation when we construct the JobConfiguration. This improves the
    compile time of SystemDS in general from ~0.6 sec when using IO to ~0.2 sec.
    
    Closes #1757
---
 .../apache/sysds/conf/ConfigurationManager.java    | 23 ++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/apache/sysds/conf/ConfigurationManager.java 
b/src/main/java/org/apache/sysds/conf/ConfigurationManager.java
index 12764eacf4..18bd83e959 100644
--- a/src/main/java/org/apache/sysds/conf/ConfigurationManager.java
+++ b/src/main/java/org/apache/sysds/conf/ConfigurationManager.java
@@ -19,11 +19,18 @@
 
 package org.apache.sysds.conf;
 
+import java.util.concurrent.ExecutorService;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.sysds.conf.CompilerConfig.ConfigType;
 import org.apache.sysds.hops.OptimizerUtils;
 import org.apache.sysds.lops.Compression.CompressConfig;
 import org.apache.sysds.lops.compile.linearization.ILinearize;
+import 
org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
+import org.apache.sysds.runtime.io.IOUtilFunctions;
+import org.apache.sysds.runtime.util.CommonThreadPool;
 
 /**
  * Singleton for accessing the parsed and merged system configuration.
@@ -31,8 +38,9 @@ import org.apache.sysds.lops.compile.linearization.ILinearize;
  * NOTE: parallel execution of multiple DML scripts (in the same JVM) with 
different configurations  
  *       would require changes/extensions of this class. 
  */
-public class ConfigurationManager
-{
+public class ConfigurationManager{
+       private static final Log LOG = 
LogFactory.getLog(ConfigurationManager.class.getName());
+
        /** Global cached job conf for read-only operations */
        private static JobConf _rJob = null; 
 
@@ -56,6 +64,17 @@ public class ConfigurationManager
                //ConfigManager -> OptimizerUtils -> InfrastructureAnalyer -> 
ConfigManager 
                _dmlconf = new DMLConfig();
                _cconf = new CompilerConfig();
+
+               final ExecutorService pool = 
CommonThreadPool.get(InfrastructureAnalyzer.getLocalParallelism());
+               pool.submit(() ->{
+                       try{
+                               IOUtilFunctions.getFileSystem(_rJob);
+                       }
+                       catch(Exception e){
+                               LOG.warn(e.getMessage());
+                       }
+               });
+               pool.shutdown();
        }
        
        

Reply via email to