Removed the custom frame size from the benchmark scripts and included some previously uncommitted testing changes.
Project: http://git-wip-us.apache.org/repos/asf/vxquery/repo Commit: http://git-wip-us.apache.org/repos/asf/vxquery/commit/cee27a70 Tree: http://git-wip-us.apache.org/repos/asf/vxquery/tree/cee27a70 Diff: http://git-wip-us.apache.org/repos/asf/vxquery/diff/cee27a70 Branch: refs/heads/prestonc/benchmark_updates Commit: cee27a704805c40dba5a0d778fea0aa1397cb2a9 Parents: a08b97f Author: Preston Carman <[email protected]> Authored: Thu Jul 17 14:23:27 2014 -0700 Committer: Preston Carman <[email protected]> Committed: Thu Jul 17 14:23:27 2014 -0700 ---------------------------------------------------------------------- .../noaa-ghcn-daily/scripts/run_benchmark.sh | 7 +- .../scripts/run_benchmark_cluster.sh | 7 +- .../scripts/weather_benchmark.py | 15 +-- .../scripts/weather_data_files.py | 103 +++++++++---------- 4 files changed, 66 insertions(+), 66 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/vxquery/blob/cee27a70/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh index ff2d761..b82f0be 100755 --- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark.sh @@ -27,7 +27,7 @@ # REPEAT=5 FRAME_SIZE=10000 -BUFFER_SIZE=$((8*1024*1024)) +BUFFER_SIZE=$((32*1024*1024)) if [ -z "${1}" ] then @@ -46,9 +46,10 @@ do log_file="$(basename ${j}).$(date +%Y%m%d%H%M).log" log_base_path=$(dirname ${j/queries/query_logs}) mkdir -p ${log_base_path} - time sh ./vxquery-cli/target/appassembler/bin/vxq ${j} ${2} -timing -showquery -showoet -showrp -frame-size ${FRAME_SIZE} -buffer-size ${BUFFER_SIZE} -repeatexec ${REPEAT} > 
${log_base_path}/${log_file} 2>&1 + time sh ./vxquery-cli/target/appassembler/bin/vxq ${j} ${2} -timing -showquery -showoet -showrp -buffer-size ${BUFFER_SIZE} -repeatexec ${REPEAT} > ${log_base_path}/${log_file} 2>&1 + #time sh ./vxquery-cli/target/appassembler/bin/vxq ${j} ${2} -timing -showquery -showoet -showrp -frame-size ${FRAME_SIZE} -buffer-size ${BUFFER_SIZE} -repeatexec ${REPEAT} > ${log_base_path}/${log_file} 2>&1 echo "Buffer Size: ${BUFFER_SIZE}" >> ${log_base_path}/${log_file} - echo "Frame Size: ${FRAME_SIZE}" >> ${log_base_path}/${log_file} + #echo "Frame Size: ${FRAME_SIZE}" >> ${log_base_path}/${log_file} fi; done http://git-wip-us.apache.org/repos/asf/vxquery/blob/cee27a70/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark_cluster.sh ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark_cluster.sh b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark_cluster.sh index 67b7ca7..6c19713 100644 --- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark_cluster.sh +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/run_benchmark_cluster.sh @@ -27,7 +27,7 @@ # REPEAT=5 FRAME_SIZE=10000 -BUFFER_SIZE=1*1024*1024 +BUFFER_SIZE=$((32*1024*1024)) if [ -z "${1}" ] then @@ -63,9 +63,10 @@ do log_file="$(basename ${j}).$(date +%Y%m%d%H%M).log" log_base_path=$(dirname ${j/queries/query_logs}) mkdir -p ${log_base_path} - time sh ./vxquery-cli/target/appassembler/bin/vxq ${j} ${3} -timing -showquery -showoet -showrp -frame-size ${FRAME_SIZE} -buffer-size ${BUFFER_SIZE} -repeatexec ${REPEAT} > ${log_base_path}/${log_file} 2>&1 + #time sh ./vxquery-cli/target/appassembler/bin/vxq ${j} ${3} -timing -showquery -showoet -showrp -frame-size ${FRAME_SIZE} -buffer-size ${BUFFER_SIZE} -repeatexec ${REPEAT} > ${log_base_path}/${log_file} 2>&1 + time sh 
./vxquery-cli/target/appassembler/bin/vxq ${j} ${3} -timing -showquery -showoet -showrp -buffer-size ${BUFFER_SIZE} -repeatexec ${REPEAT} > ${log_base_path}/${log_file} 2>&1 echo "\nBuffer Size: ${BUFFER_SIZE}" >> ${log_base_path}/${log_file} - echo "\nFrame Size: ${FRAME_SIZE}" >> ${log_base_path}/${log_file} + #echo "\nFrame Size: ${FRAME_SIZE}" >> ${log_base_path}/${log_file} fi; fi; done http://git-wip-us.apache.org/repos/asf/vxquery/blob/cee27a70/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py index f3c9e68..3b0f9b3 100644 --- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_benchmark.py @@ -47,6 +47,7 @@ class WeatherBenchmark: "q07.xq" ] QUERY_UTILITY_LIST = [ + "no_result.xq", "sensor_count.xq", "station_count.xq", "q04_join_count.xq", @@ -92,6 +93,7 @@ class WeatherBenchmark: def print_local_partition_schemes(self, test): node_index = 0 virtual_partitions = get_local_virtual_partitions(self.partitions) + virtual_partitions_per_disk = virtual_partitions / len(self.base_paths) for p in self.partitions: scheme = self.get_local_partition_scheme(test, p) self.print_partition_schemes(virtual_partitions, scheme, test, p, node_index) @@ -99,6 +101,7 @@ class WeatherBenchmark: def print_cluster_partition_schemes(self, test): node_index = self.get_current_node_index() virtual_partitions = get_cluster_virtual_partitions(self.nodes, self.partitions) + virtual_partitions_per_disk = virtual_partitions / len(self.base_paths) for p in self.partitions: scheme = self.get_cluster_partition_scheme(test, p) self.print_partition_schemes(virtual_partitions, scheme, test, p, node_index) @@ 
-112,7 +115,7 @@ class WeatherBenchmark: print " Partitions: " + str(partitions) print " Node Id: " + str(node_id) - if len(scheme) > 0: + if isinstance(scheme, (tuple, list, dict, set)) and len(scheme) > 0: folder_length = len(scheme[0][3]) + 5 row_format = "{:>5} {:>5} {:>5} {:<" + str(folder_length) + "} {:<" + str(folder_length) + "}" HEADER = ("Disk", "Index", "Link", "Data Path", "Link Path") @@ -127,7 +130,7 @@ class WeatherBenchmark: scheme = [] virtual_partitions = get_local_virtual_partitions(self.partitions) data_schemes = get_partition_scheme(0, virtual_partitions, self.base_paths) - link_base_schemes = get_partition_scheme(0, partition, self.base_paths, self.DATA_LINKS_FOLDER + test) + link_base_schemes = get_partition_scheme(0, virtual_partitions, self.base_paths, self.DATA_LINKS_FOLDER + test) # Match link paths to real data paths. group_size = len(data_schemes) / len(link_base_schemes) @@ -155,20 +158,20 @@ class WeatherBenchmark: scheme = [] local_virtual_partitions = get_local_virtual_partitions(self.partitions) virtual_partitions = get_cluster_virtual_partitions(self.nodes, self.partitions) + virtual_partitions_per_disk = virtual_partitions / len(self.base_paths) data_schemes = get_partition_scheme(node_index, virtual_partitions, self.base_paths) - link_base_schemes = get_cluster_link_scheme(len(self.nodes), partition, self.base_paths, self.DATA_LINKS_FOLDER + test) + link_base_schemes = get_cluster_link_scheme(len(self.nodes), virtual_partitions, self.base_paths, self.DATA_LINKS_FOLDER + test) # Match link paths to real data paths. for link_node, link_disk, link_virtual, link_index, link_path in link_base_schemes: # Prep if test == "speed_up": - group_size = virtual_partitions / (link_node + 1) + group_size = virtual_partitions_per_disk / (link_node + 1) elif test == "batch_scale_out": - group_size = virtual_partitions / len(self.nodes) + group_size = virtual_partitions_per_disk / len(self.nodes) else: print "Unknown test." 
return - group_size = group_size / link_virtual node_offset = group_size * (node_index * partition) node_offset += group_size * link_index has_data = True http://git-wip-us.apache.org/repos/asf/vxquery/blob/cee27a70/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py index 1c9f129..b39f934 100644 --- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py @@ -150,62 +150,56 @@ class WeatherDataFiles: XML_START = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>" - # Initialize the partition paths. partition_paths = get_partition_paths(0, partitions, base_paths) - sensors_partition_files = [] - stations_partition_files = [] - for path in partition_paths: - # Make sure the xml folder is available. 
- prepare_path(path, reset) - prepare_path(path + "sensors/", False) - prepare_path(path + "stations/", False) - sensors_partition_files.append(open(path + "sensors/partition.xml", 'w')) - stations_partition_files.append(open(path + "stations/partition.xml", 'w')) - - for row in range(0, len(partition_paths)): - sensors_partition_files[row].write(XML_START + "<" + self.LARGE_FILE_ROOT_TAG + ">\n") - stations_partition_files[row].write(XML_START + "<" + self.LARGE_FILE_ROOT_TAG + ">\n") import fnmatch import os - # copy stations and sensors into each partition - current_sensor_partition = 0 - current_station_partition = 0 - self.open_progress_data() - row_count = len(self.progress_data) - for row in range(0, row_count): - row_contents = self.progress_data[row].rsplit(self.SEPERATOR) - file_name = row_contents[self.INDEX_DATA_FILE_NAME] - station_id = os.path.basename(file_name).split('.')[0] - - # Copy sensor files - type = "sensors" - file_path = build_base_save_folder(save_path, station_id, type) + station_id - for root, dirnames, filenames in os.walk(file_path): - for filename in fnmatch.filter(filenames, '*.xml'): - xml_path = os.path.join(root, filename) - xml_data = file_get_contents(xml_path).replace(XML_START, "") + "\n" - sensors_partition_files[current_sensor_partition].write(xml_data) - current_sensor_partition += 1 - if current_sensor_partition >= len(sensors_partition_files): - current_sensor_partition = 0 - - # Copy station files - type = "stations" - file_path = build_base_save_folder(save_path, station_id, type) + station_id + ".xml" - xml_path = os.path.join(root, file_path) - xml_data = file_get_contents(xml_path).replace(XML_START, "") + "\n" - stations_partition_files[current_station_partition].write(xml_data) - current_station_partition += 1 - if current_station_partition >= len(partition_paths): - current_station_partition = 0 + for path in partition_paths: + prepare_path(path, reset) + + # Initialize the partition paths. 
+ types = ["sensors", "stations"] + for type in types: + partition_files = [] + for path in partition_paths: + # Make sure the xml folder is available. + prepare_path(path + type + "/", False) + partition_files.append(open(path + type + "/partition.xml", 'w')) + partition_files[-1].write(XML_START + "<" + self.LARGE_FILE_ROOT_TAG + ">\n") + + # copy into each partition + current_partition = 0 + self.open_progress_data() + row_count = len(self.progress_data) + for row in range(0, row_count): + row_contents = self.progress_data[row].rsplit(self.SEPERATOR) + file_name = row_contents[self.INDEX_DATA_FILE_NAME] + station_id = os.path.basename(file_name).split('.')[0] - for row in range(0, len(partition_paths)): - sensors_partition_files[row].write("</" + self.LARGE_FILE_ROOT_TAG + ">\n") - sensors_partition_files[row].close() - stations_partition_files[row].write("</" + self.LARGE_FILE_ROOT_TAG + ">\n") - stations_partition_files[row].close() + # Copy files + if type == "sensors": + file_path = build_base_save_folder(save_path, station_id, type) + station_id + for root, dirnames, filenames in os.walk(file_path): + for filename in fnmatch.filter(filenames, '*.xml'): + xml_path = os.path.join(root, filename) + xml_data = file_get_contents(xml_path).replace(XML_START, "") + "\n" + partition_files[current_partition].write(xml_data) + current_partition += 1 + if current_partition >= len(partition_files): + current_partition = 0 + elif type == "stations": + file_path = build_base_save_folder(save_path, station_id, type) + station_id + ".xml" + xml_path = os.path.join(root, file_path) + xml_data = file_get_contents(xml_path).replace(XML_START, "") + "\n" + partition_files[current_partition].write(xml_data) + current_partition += 1 + if current_partition >= len(partition_paths): + current_partition = 0 + + for row in range(0, len(partition_paths)): + partition_files[row].write("</" + self.LARGE_FILE_ROOT_TAG + ">\n") + partition_files[row].close() def get_file_row(self, 
file_name): for i in range(0, len(self.progress_data)): @@ -388,12 +382,13 @@ def get_partition_paths(node_id, partitions, base_paths, key="partitions"): partition_paths.append(scheme[PARTITION_INDEX_PATH]) return partition_paths -def get_partition_scheme(node_id, partitions, base_paths, key="partitions"): +def get_partition_scheme(node_id, virtual_partitions, base_paths, key="partitions"): partition_scheme = [] - for i in range(0, partitions): + partitions_per_disk = virtual_partitions / len(base_paths) + for i in range(0, partitions_per_disk): for j in range(0, len(base_paths)): - new_partition_path = base_paths[j] + key + "/" + get_partition_folder(j, partitions, i) + "/" - partition_scheme.append((node_id, j, partitions, i, new_partition_path)) + new_partition_path = base_paths[j] + key + "/" + get_partition_folder(j, partitions_per_disk, i) + "/" + partition_scheme.append((node_id, j, partitions_per_disk, i, new_partition_path)) return partition_scheme def get_partition_folder(disks, partitions, index):
