All the changes to make managing and running the benchmark easier. An XML 
configuration file holds all the data so partitions can be created with one 
argument to the CLI. Also it's easy to rebuild the data since the configuration 
is saved.


Project: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/commit/a253e2f0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/tree/a253e2f0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/diff/a253e2f0

Branch: refs/heads/master
Commit: a253e2f0805ad7781720d9542ef3f4c295570f6c
Parents: 1ea5f1a
Author: Preston Carman <[email protected]>
Authored: Thu Feb 13 09:38:04 2014 -0800
Committer: Preston Carman <[email protected]>
Committed: Thu Feb 13 09:38:04 2014 -0800

----------------------------------------------------------------------
 .../resources/noaa-ghcn-daily/scripts/README.md |  15 ++-
 .../noaa-ghcn-daily/scripts/run_benchmark.sh    |  20 +++
 .../noaa-ghcn-daily/scripts/weather_cli.py      | 121 ++++++++-----------
 .../noaa-ghcn-daily/scripts/weather_config.py   | 115 ++++++++++++++++++
 .../scripts/weather_convert_to_xml.py           |   2 +-
 .../scripts/weather_data_files.py               | 109 +++++++++++------
 .../noaa-ghcn-daily/scripts/weather_example.xml |  39 ++++++
 7 files changed, 308 insertions(+), 113 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/a253e2f0/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/README.md
----------------------------------------------------------------------
diff --git 
a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/README.md 
b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/README.md
index 2d9c2eb..585f54f 100644
--- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/README.md
+++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/README.md
@@ -18,6 +18,17 @@ Detailed GHDN-DAILY information:
 The process takes a save folder for the data. The folder contains a several 
 folders:
 
+ - all_xml_files (The generated xml files for a given package)
  - downloads (All files taken from the NOAA HTTP site)
- - 1\_node\_{package}\_xml{\_compression} (The generated xml files for a given 
-     package)
\ No newline at end of file
+ - dataset-[name] (all files related to a single dataset)
+     
+     
+# Examples commands
+
+Building
+
+
+Partitioning
+python weather_cli.py -x weather_example.xml
+
+Linking
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/a253e2f0/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh
----------------------------------------------------------------------
diff --git 
a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh 
b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh
new file mode 100644
index 0000000..ff34a88
--- /dev/null
+++ 
b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh
@@ -0,0 +1,20 @@
+# Run all the queries and save a log. 
+# First argument: Supply the folder which houses all the queries (recursive).
+# Second argument: adds options to the VXQuery CLI.
+#
+# run_benchmark.sh ./noaa-ghcn-daily/benchmarks/local_speed_up/queries/
+# run_benchmark.sh ./noaa-ghcn-daily/benchmarks/local_speed_up/queries/ 
"-client-net-ip-address 169.235.27.138"
+#
+
+if [ -z "${1}" ]
+then
+    echo "Please supply a directory for query files to be found."
+    exit
+fi
+
+for j in $(find ${1} -name '*.xq')
+do
+       echo "Running query: ${j}"
+       time sh ./vxquery-cli/target/appassembler/bin/vxq ${j} ${2} -timing 
-showquery -frame-size 1000000 -repeatexec 10 > ${j}.log 2>&1
+done
+

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/a253e2f0/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py
----------------------------------------------------------------------
diff --git 
a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py 
b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py
index d9f83eb..c69133e 100644
--- 
a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py
+++ 
b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py
@@ -20,6 +20,7 @@ import sys, getopt
 from weather_data_files import *
 from weather_download_files import *
 from weather_convert_to_xml import *
+from weather_config import *
 
 DEBUG_OUTPUT = False
 COMPRESSED = False
@@ -31,21 +32,17 @@ COMPRESSED = False
 #
 def main(argv):
     append = False
-    download_path = ""
     max_records = 0
     package = "ghcnd_gsn"
-    partitions = 0
-    nodes = -1
     process_file_name = ""
     reset = False
-    save_path = ""
     section = "all"
-    test = ""
     token = ""
     update = False
+    xml_config_path = ""
     
     try:
-        opts, args = getopt.getopt(argv, "acd:f:hl:m:n:p:rs:t:uvw:", 
["download_directory=", "file=", "locality=", "max_station_files=", "nodes=", 
"save_directory=", "package=", "partitions=", "web_service="])
+        opts, args = getopt.getopt(argv, "acf:hl:m:ruvw:x:", ["file=", 
"locality=", "max_station_files=", "web_service=", "xml_config="])
     except getopt.GetoptError:
         print 'The file options for weather_cli.py were not correctly 
specified.'
         print 'To see a full list of options try:'
@@ -56,42 +53,23 @@ def main(argv):
             print 'Converting weather daily files to xml options:'
             print '    -a        Append the results to the progress file.'
             print '    -c        Compress the produced XML file with .gz.'
-            print '    --download_directory (str)  The directory for saving 
the downloaded files. (default: downloads)'
-            print '    -d (str)  The directory for saving the downloaded files 
and generated XML files.'
             print '    -f (str)  The file name of a specific station to 
process.'
             print '              * Helpful when testing a single stations XML 
file output.'
             print '    -l (str)  Select the locality of the scripts execution 
(download, progress_file, sensor_build, station_build, partition, statistics).'
             print '    -m (int)  Limits the number of files created for each 
station.'
             print '              * Helpful when testing to make sure all 
elements are supported for each station.'
             print '              Alternate form: --max_station_files=(int)'
-            print '    -n (int)  The numeric node id starting at 0.'
-            print '    --partitions (int)  The number of partitions (sections) 
for creating split up generated data.'
-            print '    -p (str)  The package used to generate files. (all, 
gsn, hcn)'
-            print '    -t (str)  The benchmark test used to partition files. 
(speed_up, batch_scale_up)'
             print '    -r        Reset the build process. (For one section or 
all sections depending on other parameters.)'
             print '    -u        Recalculate the file count and data size for 
each data source file.'
             print '    -v        Extra debug information.'
             print '    -w (str)  Downloads the station XML file form the web 
service.'
+            print '    -x (str)  XML config file for weather data.'
             sys.exit()
         elif opt in ('-a', "--append"):
             append = True
         elif opt == '-c':
             global COMPRESSED
             COMPRESSED = True
-        elif opt == '--download_directory':
-            # check if file exists.
-            if os.path.exists(arg):
-                download_path = arg
-            else:
-                print 'Error: Argument must be a directory for 
--download_directory.'
-                sys.exit()
-        elif opt in ('-d', "--save_directory"):
-            # check if file exists.
-            if os.path.exists(arg):
-                save_path = arg
-            else:
-                print 'Error: Argument must be a directory for 
--save_directory (-s).'
-                sys.exit()
         elif opt in ('-f', "--file"):
             # check if file exists.
             if os.path.exists(arg):
@@ -111,30 +89,12 @@ def main(argv):
             else:
                 print 'Error: Argument must be an integer for 
--max_station_files (-m).'
                 sys.exit()
-        elif opt in ('-n'):
-            if arg.isdigit():
-                nodes = int(arg)
-            else:
-                print 'Error: Argument must be an integer for -n.'
-                sys.exit()
-        elif opt == "--partitions":
-            if arg.isdigit():
-                partitions = int(arg)
-            else:
-                print 'Error: Argument must be an integer for --partitions.'
-                sys.exit()
         elif opt in ('-p', "--package"):
             if arg in ("all", "gsn", "hcn"):
                 package = "ghcnd_" + arg
             else:
                 print 'Error: Argument must be an string for one of the known 
weather packages: "all", "gsn", "hcn"'
                 sys.exit()
-        elif opt in ('-t'):
-            if arg in ("speed_up", "batch_scale_up"):
-                test = arg
-            else:
-                print 'Error: Argument must be an string for one of the known 
benchmark tests: "speed_up", "batch_scale_up"'
-                sys.exit()
         elif opt == '-r':
             reset = True
         elif opt == '-u':
@@ -149,15 +109,27 @@ def main(argv):
             else:
                 print 'Error: Argument must be a string --web_service (-w).'
                 sys.exit()
+        elif opt in ('-x', "--xml_config"):
+            # check if file exists.
+            if os.path.exists(arg):
+                xml_config_path = arg
+            else:
+                print 'Error: Argument must be a xml file for --xml_config 
(-x).'
+                sys.exit()
 
     # Required fields to run the script.
-    if save_path == "" or not os.path.exists(save_path):
+    if xml_config_path == "" or not os.path.exists(xml_config_path):
+        print 'Error: The xml config option must be supplied: --xml_config 
(-x).'
+        sys.exit()
+    config = WeatherConfig(xml_config_path)
+    
+    # Required fields to run the script.
+    if config.get_save_path() == "" or not 
os.path.exists(config.get_save_path()):
         print 'Error: The save directory option must be supplied: 
--save_directory (-d).'
         sys.exit()
 
     # Set up downloads folder.
-    if download_path == "":
-        download_path = save_path + "/downloads"
+    download_path = config.get_save_path() + "/downloads"
     if section in ("all", "download"):
         print 'Processing the download section.'
         download = WeatherDownloadFiles(download_path)
@@ -168,19 +140,13 @@ def main(argv):
 
 
     # Create some basic paths for save files and references.
-    ghcnd_data_dly_path = download_path + '/' + package + '/' + package
-    ghcnd_xml_path = save_path + "/1.0_partition_" + package + '_xml/'
-    ghcnd_xml_gz_path = save_path + "/1.0_partition_" + package + '_xml_gz/'
-    if COMPRESSED:
-        xml_data_save_path = ghcnd_xml_gz_path
-    else:
-        xml_data_save_path = ghcnd_xml_path
+    ghcnd_data_dly_path = download_path + '/' + config.get_package() + '/' + 
config.get_package()
+    xml_data_save_path = config.get_save_path() + '/all_xml_files/'
 
     # Make sure the xml folder is available.
     if not os.path.isdir(xml_data_save_path):
         os.makedirs(xml_data_save_path)
 
-
     # Set up the XML build objects.
     convert = WeatherWebServiceMonthlyXMLFile(download_path, 
xml_data_save_path, COMPRESSED, DEBUG_OUTPUT)
     progress_file = xml_data_save_path + "_data_progress.csv"
@@ -233,24 +199,35 @@ def main(argv):
                 data.update_file_station_status(file_name, status)
             else:
                 data.update_file_station_status(file_name, 
WeatherDataFiles.DATA_FILE_MISSING)
-                
-    if section in ("all", "partition") and partitions > 1:
-        print 'Processing the partition section.'
-        data.reset()
-        data.copy_to_n_partitions(xml_data_save_path, partitions)
+                    
+    for dataset in config.get_dataset_list():
+        # Set up the setting for each dataset.
+        dataset_folder = "/dataset-" + dataset.get_name()
+        progress_file = config.get_save_path() + dataset_folder + 
"/_data_progress.csv"
+        data = WeatherDataFiles(ghcnd_data_dly_path, progress_file)
+        base_paths = []
+        for paths in dataset.get_save_paths():
+            base_paths.append(paths + dataset_folder + "/")
 
-    if section in ("test_links"):
-        if test and partitions > 0 and nodes > -1:
-            print 'Processing the test links section.'
-            data.reset()
-            data.create_test_links(save_path, test, nodes, partitions)
-        else:
-            print 'Error: Not enough information for this section.'
-            sys.exit()
+        if section in ("all", "partition"):
+            for partition in dataset.get_partitions():
+                print 'Processing the partition section (' + 
dataset.get_name() + ':d' + str(len(base_paths)) + ':p' + str(partition) + ').'
+                data.reset()
+                data.copy_to_n_partitions(xml_data_save_path, partition, 
base_paths)
+    
+        if section in ("all", "test_links"):
+            # TODO determine current node 
+            if test and partitions > 0 and virtual_partitions > 0 and nodes > 
-1:
+                print 'Processing the test links section.'
+                data.reset()
+                data.create_test_links(config.get_save_path(), 
xml_data_save_path, test, nodes, partitions, virtual_partitions, base_paths)
+            else:
+                print 'Error: Not enough information for this section.'
+                sys.exit()
 
-    if section in ("all", "statistics"):
-        print 'Processing the statistics section.'
-        data.print_progress_file_stats(convert)
-                
+#     if section in ("statistics"):
+#         print 'Processing the statistics section.'
+#         data.print_progress_file_stats(convert)
+                  
 if __name__ == "__main__":
     main(sys.argv[1:])

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/a253e2f0/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config.py
----------------------------------------------------------------------
diff --git 
a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config.py
 
b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config.py
new file mode 100644
index 0000000..fa12342
--- /dev/null
+++ 
b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from xml.dom.minidom import parse
+
+class WeatherConfig:
+    def __init__(self, config_xml_file):
+        self.config_xml_file = config_xml_file
+        
+        self.config = parse(self.config_xml_file)
+
+    def get_save_path(self):
+        return self.get_text(self.config.getElementsByTagName("save_path")[0])
+
+    def get_package(self):
+        return self.get_text(self.config.getElementsByTagName("package")[0])
+
+    def get_node_machine_list(self):
+        nodes = []
+        for node in self.config.getElementsByTagName("node"):
+            id = self.get_node_name(node)
+            ip = self.get_node_ip(node)
+            nodes.append(Machine(id, ip))
+        return nodes
+
+    def get_dataset_list(self):
+        nodes = []
+        for node in self.config.getElementsByTagName("dataset"):
+            name = self.get_dataset_name(node)
+            save_paths = self.get_dataset_save_paths(node)
+            partitions = self.get_dataset_partitions(node)
+            nodes.append(Dataset(name, save_paths, partitions))
+        return nodes
+
+
+    # 
--------------------------------------------------------------------------
+    # Node Specific Functions
+    # 
--------------------------------------------------------------------------
+    def get_node_ip(self, node):
+        return self.get_text(node.getElementsByTagName("ip_address")[0])
+
+    def get_node_name(self, node):
+        return self.get_text(node.getElementsByTagName("name")[0])
+
+    
+    # 
--------------------------------------------------------------------------
+    # Dataset Specific Functions
+    # 
--------------------------------------------------------------------------
+    def get_dataset_name(self, node):
+        return self.get_text(node.getElementsByTagName("name")[0])
+
+    def get_dataset_save_paths(self, node):
+        paths = []
+        for node in node.getElementsByTagName("save_path"):
+            paths.append(self.get_text(node))
+        return paths
+
+    def get_dataset_partitions(self, node):
+        paths = []
+        for node in node.getElementsByTagName("partitions_per_path"):
+            paths.append(int(self.get_text(node)))
+        return paths
+
+    def get_text(self, xml_node):
+        rc = []
+        for node in xml_node.childNodes:
+            if node.nodeType == node.TEXT_NODE:
+                rc.append(node.data)
+        return ''.join(rc)
+
+class Machine:
+    def __init__(self, id, ip):
+        self.id = id
+        self.ip = ip
+    
+    def get_node_name(self):
+        return self.id
+    
+    def get_node_ip(self):
+        return self.ip
+    
+    def __repr__(self):
+        return self.id + "(" + self.ip + ")"
+    
+class Dataset:
+    def __init__(self, name, save_paths, partitions):
+        self.name = name
+        self.save_paths = save_paths
+        self.partitions = partitions
+    
+    def get_name(self):
+        return self.name
+    
+    def get_save_paths(self):
+        return self.save_paths
+    
+    def get_partitions(self):
+        return self.partitions
+    
+    def __repr__(self):
+        return self.name + ":" + str(self.save_paths) + ":" + 
str(self.partitions)
+    

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/a253e2f0/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
----------------------------------------------------------------------
diff --git 
a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
 
b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
index 0f62195..514f5f9 100644
--- 
a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
+++ 
b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
@@ -128,7 +128,7 @@ class WeatherConvertToXML:
                 # process set
                 file_count += self.process_one_month_sensor_set(records, page)
                 records = []
-                if sensor_count >= sensor_max:
+                if sensor_count >= sensor_max and month == month_last and year 
== year_last:
                     # start a new page.
                     page += 1
                 else:

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/a253e2f0/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
----------------------------------------------------------------------
diff --git 
a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
 
b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
index c85fe86..be85c69 100644
--- 
a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
+++ 
b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
@@ -20,6 +20,7 @@ import linecache
 import distutils.core
 
 from weather_convert_to_xml import *
+from collections import OrderedDict
 
 # Weather data files created to manage the conversion process.
 # Allows partition and picking up where you left off.
@@ -91,57 +92,79 @@ class WeatherDataFiles:
             self.close_progress_data(True)
         self.reset()
         
-    def create_test_links(self, save_path, test, node, partitions):
+    def create_test_links(self, save_path, xml_save_path, test, node, 
partitions, virtual_partitions, base_paths=[]):
+        if (len(base_paths) == 0):
+            base_paths.append(os.path.dirname(save_path))
+        partition_list = sorted(get_partition_paths(partitions, base_paths))
+        
         test_path = save_path + "/" + test
         if not os.path.isdir(test_path):
             os.makedirs(test_path)
-        for i in range(partitions):
-            test_partition_path = test_path + "/partition" + str(i+1)
-            if (node <= i):
-                # link
-                if test == "speed_up":
-                    os.symlink(save_path + "/" + str(i+1) + "." + str(node) + 
"_partition_ghcnd_all_xml_gz", test_partition_path)
-                if test == "batch_scale_up":
-                    os.symlink(save_path + "/" + str(partitions) + "." + 
str(node) + "_partition_ghcnd_all_xml_gz", test_partition_path)
-            else:
-                # fake directories
-                os.makedirs(test_partition_path + "/sensors")
-                os.makedirs(test_partition_path + "/stations")
+        for i in range(virtual_partitions):
+            # one virtual partition per disk
+            for j in range(len(base_paths)):
+                for index, path in enumerate(partition_list):
+                    offset = partitions * j
+                    test_partition_path = test_path + "/partition" + str(i + 
1) + "_disk" + str(j + 1)
+                    if not os.path.isdir(test_partition_path):
+                        os.makedirs(test_partition_path)
+                    if (node <= i):
+                        if test == "speed_up":
+                            group = partitions / (i + 1)
+                        elif test == "batch_scale_up":
+                            group = partitions / virtual_partitions
+                        else:
+                            group = -1
+                        # link
+                        if (group) * node + offset <= index and index < 
(group) * (node + 1) + offset:
+                            os.symlink(path, test_partition_path + "/index" + 
str(index))
+                    else:
+                        # fake directories
+                        os.makedirs(test_partition_path + "/sensors")
+                        os.makedirs(test_partition_path + "/stations")
             
         
     # Once the initial data has been generated, the data can be copied into a 
set number of partitions. 
-    def copy_to_n_partitions(self, save_path, partitions):
+    def copy_to_n_partitions(self, save_path, partitions, base_paths=[]):
+        if (len(base_paths) == 0):
+            return
         
         # Initialize the partition paths.
-        partition_paths = []
-        for i in range(0, partitions):
-            new_partition_path = save_path.replace("1.0_partition", 
str(partitions) + "." + str(i) + "_partition")
-            partition_paths.append(new_partition_path)
-            
+        partition_sizes = []
+        partition_paths = get_partition_paths(partitions, base_paths)
+        for path in partition_paths:
+            partition_sizes.append(0)
             # Make sure the xml folder is available.
-            if not os.path.isdir(new_partition_path):
-                os.makedirs(new_partition_path)
+            if not os.path.isdir(path):
+                os.makedirs(path)
 
-        # copy stations and sensors into each partition round robin
+        # copy stations and sensors into each partition
         current_partition = 0
         csv_sorted = self.get_csv_in_partition_order()
-        for item in csv_sorted:
-            row_contents = item.rsplit(self.SEPERATOR)
-            file_name = row_contents[self.INDEX_DATA_FILE_NAME]
-            station_id = os.path.basename(file_name).split('.')[0]
-
-            # Copy station files
-            for type in ("sensors", "stations"):
-                file_path = build_base_save_folder(save_path, station_id, type)
-                new_file_path = 
build_base_save_folder(partition_paths[current_partition], station_id, type)
-                if os.path.isdir(file_path):
-                    distutils.dir_util.copy_tree(file_path, new_file_path)
+        for item, size in csv_sorted.iteritems():
+            station_id = item.split('.')[0]
+            # Update partition bases on smallest current size.
+            current_partition = partition_sizes.index(min(partition_sizes))
             
-            # Update partition
-            current_partition += 1
-            if current_partition >= partitions:
-                current_partition = 0
+            # Copy sensor files
+            type = "sensors"
+            file_path = build_base_save_folder(save_path, station_id, type) + 
station_id
+            new_file_path = 
build_base_save_folder(partition_paths[current_partition], station_id, type) + 
station_id
+            if os.path.isdir(file_path):
+                distutils.dir_util.copy_tree(file_path, new_file_path)
+            partition_sizes[current_partition] += size
 
+        
+            # Copy station files
+            type = "stations"
+            file_path = build_base_save_folder(save_path, station_id, type) + 
station_id + ".xml"
+            new_file_base = 
build_base_save_folder(partition_paths[current_partition], station_id, type)
+            new_file_path = new_file_base + station_id + ".xml"
+            if os.path.isfile(file_path):
+                if not os.path.isdir(new_file_base):
+                    os.makedirs(new_file_base)
+                shutil.copyfile(file_path, new_file_path)
+    
     def get_csv_in_partition_order(self):
         self.open_progress_data()
         row_count = len(self.progress_data)
@@ -156,7 +179,7 @@ class WeatherDataFiles:
             csv_dict[file_name] = folder_data
         
         # New sorted list.
-        return sorted(csv_dict, key=csv_dict.get, reverse=True)
+        return OrderedDict(sorted(csv_dict.items(), key=lambda x: x[1], 
reverse=True))
         
     def get_file_row(self, file_name):
         for i in range(0, len(self.progress_data)):
@@ -323,3 +346,13 @@ class WeatherDataFiles:
             elif self.type == "station" and 
(columns[self.INDEX_DATA_STATION_STATUS].strip() != self.DATA_FILE_DOWNLOADED 
or self.data_reset):
                 break
         return columns[self.INDEX_DATA_FILE_NAME]
+    
+def get_partition_paths(partitions, base_paths):        
+    partition_paths = []
+    for i in range(0, partitions):
+        for j in range(0, len(base_paths)):
+            new_partition_path = base_paths[j] + "partitions/" + 
str(len(base_paths)) + "disk/d" + str(j) +"_p" + str(partitions) + "_i" + 
str(i) + "/"
+            partition_paths.append(new_partition_path)
+    return partition_paths
+
+

http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/a253e2f0/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_example.xml
----------------------------------------------------------------------
diff --git 
a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_example.xml
 
b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_example.xml
new file mode 100644
index 0000000..b8e60fd
--- /dev/null
+++ 
b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_example.xml
@@ -0,0 +1,39 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<data xmlns="data">
+    <name>Local Example</name>
+    <save_path>/data</save_path>
+    <package>all</package>
+    <node>
+        <name>localhost</name>
+        <ip_address>127.0.0.1</ip_address>
+    </node>
+    <dataset>
+        <name>tiny</name>
+        <save_path>/data</save_path>
+        <partitions_per_path>1</partitions_per_path>
+        <partitions_per_path>2</partitions_per_path>
+        <partitions_per_path>4</partitions_per_path>
+        <partitions_per_path>8</partitions_per_path>
+    </dataset>
+    <dataset>
+        <name>small</name>
+        <save_path>/data</save_path>
+        <partitions_per_path>4</partitions_per_path>
+        <partitions_per_path>8</partitions_per_path>
+    </dataset>
+</data>

Reply via email to