Repository: vxquery Updated Branches: refs/heads/master 72fd5c645 -> e97888ed8
http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py deleted file mode 100644 index 5db090a..0000000 --- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py +++ /dev/null @@ -1,554 +0,0 @@ -#!/usr/bin/env python -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import textwrap -from datetime import date -import os -from collections import OrderedDict - -# Custom modules. -from weather_config_ghcnd import * -from weather_config_mshr import * -from weather_download_files import * - -class WeatherConvertToXML: - - STATES = OrderedDict({ - 'AK': 'Alaska', - 'AL': 'Alabama', - 'AR': 'Arkansas', - 'AS': 'American Samoa', - 'AZ': 'Arizona', - 'CA': 'California', - 'CO': 'Colorado', - 'CT': 'Connecticut', - 'DC': 'District of Columbia', - 'DE': 'Delaware', - 'FL': 'Florida', - 'GA': 'Georgia', - 'GU': 'Guam', - 'HI': 'Hawaii', - 'IA': 'Iowa', - 'ID': 'Idaho', - 'IL': 'Illinois', - 'IN': 'Indiana', - 'KS': 'Kansas', - 'KY': 'Kentucky', - 'LA': 'Louisiana', - 'MA': 'Massachusetts', - 'MD': 'Maryland', - 'ME': 'Maine', - 'MI': 'Michigan', - 'MN': 'Minnesota', - 'MO': 'Missouri', - 'MP': 'Northern Mariana Islands', - 'MS': 'Mississippi', - 'MT': 'Montana', - 'NA': 'National', - 'NC': 'North Carolina', - 'ND': 'North Dakota', - 'NE': 'Nebraska', - 'NH': 'New Hampshire', - 'NJ': 'New Jersey', - 'NM': 'New Mexico', - 'NV': 'Nevada', - 'NY': 'New York', - 'OH': 'Ohio', - 'OK': 'Oklahoma', - 'OR': 'Oregon', - 'PA': 'Pennsylvania', - 'PR': 'Puerto Rico', - 'RI': 'Rhode Island', - 'SC': 'South Carolina', - 'SD': 'South Dakota', - 'TN': 'Tennessee', - 'TX': 'Texas', - 'UT': 'Utah', - 'VA': 'Virginia', - 'VI': 'Virgin Islands', - 'VT': 'Vermont', - 'WA': 'Washington', - 'WI': 'Wisconsin', - 'WV': 'West Virginia', - 'WY': 'Wyoming' - }) - - MONTHS = [ - "January", - "February", - "March", - "April", - "May", - "June", - "July", - "August", - "September", - "October", - "November", - "December" - ] - - token = "" - - def __init__(self, base_path, save_path, debug_output): - self.save_path = save_path - self.debug_output = debug_output - - # Extra support files. - self.ghcnd_countries = base_path + '/ghcnd-countries.txt' - self.ghcnd_inventory = base_path + '/ghcnd-inventory.txt' - self.ghcnd_states = base_path + '/ghcnd-states.txt' - self.ghcnd_stations = base_path + '/ghcnd-stations.txt' - - # MSHR support files. - self.mshr_stations = base_path + '/mshr_enhanced_201402.txt' - - def set_token(self, token): - self.token = token - - def get_field_from_definition(self, row, field_definition): - return row[(field_definition[FIELD_INDEX_START] - 1):field_definition[FIELD_INDEX_END]] - - def get_field(self, fields_array, row, index): - return row[(fields_array[index][FIELD_INDEX_START] - 1):fields_array[index][FIELD_INDEX_END]] - - def get_dly_field(self, row, index): - return self.get_field(DLY_FIELDS, row, index) - - def print_row_files(self, row): - for field in DLY_FIELDS: - print str(field[FIELD_INDEX_NAME]) + " = '" + row[(field[FIELD_INDEX_START] - 1):field[FIELD_INDEX_END]] + "'" - - def save_file(self, filename, contents): - file = open(filename, 'w') - file.write(contents) - file.close() - return filename - - def get_folder_size(self, folder_name): - total_size = 0 - for dirpath, dirnames, filenames in os.walk(folder_name): - for f in filenames: - fp = os.path.join(dirpath, f) - total_size += os.path.getsize(fp) - return total_size - - def process_one_month_sensor_set(self, records, page): - # Default - return 0 - - def process_station_data(self, row): - # Default - return 0 - - def get_base_folder(self, station_id, data_type="sensors"): - return build_base_save_folder(self.save_path, station_id, data_type) - - def process_inventory_file(self): - print "Processing inventory file" - file_stream = open(self.ghcnd_inventory, 'r') - - csv_header = ['ID', 'SENSORS', 'SENSORS_COUNT', 'MAX_YEARS', 'TOTAL_YEARS_FOR_ALL_SENSORS'] - row = file_stream.readline() - csv_inventory = {} - for row in file_stream: - id = self.get_field_from_definition(row, INVENTORY_FIELDS['ID']) - sensor_id = self.get_field_from_definition(row, INVENTORY_FIELDS['ELEMENT']) - start = int(self.get_field_from_definition(row, INVENTORY_FIELDS['FIRSTYEAR'])) - end = int(self.get_field_from_definition(row, INVENTORY_FIELDS['LASTYEAR'])) - if id in csv_inventory: - new_count = str(int(csv_inventory[id][2]) + 1) - new_max = str(max(int(csv_inventory[id][3]), (end - start))) - new_total = str(int(csv_inventory[id][3]) + end - start) - csv_inventory[id] = [id, (csv_inventory[id][1] + "," + sensor_id), new_count, new_max, new_total] - else: - csv_inventory[id] = [id, sensor_id, str(1), str(end - start), str(end - start)] - - path = self.save_path + "/inventory.csv" - self.save_csv_file(path, csv_inventory, csv_header) - - def save_csv_file(self, path, csv_inventory, header): - csv_content = "|".join(header) + "\n" - for row_id in csv_inventory: - csv_content += "|".join(csv_inventory[row_id]) + "\n" - self.save_file(path, csv_content) - - - def process_station_file(self, file_name): - print "Processing station file: " + file_name - file_stream = open(file_name, 'r') - - row = file_stream.readline() - return self.process_station_data(row) - - def process_sensor_file(self, file_name, max_files, sensor_max=99): - print "Processing sensor file: " + file_name - file_stream = open(file_name, 'r') - - month_last = 0 - year_last = 0 - records = [] - page = 0 - sensor_count = 0 - - file_count = 0 - for row in file_stream: - month = self.get_dly_field(row, DLY_FIELD_MONTH) - year = self.get_dly_field(row, DLY_FIELD_YEAR) - - if (month_last != 0 and year_last != 0) and (sensor_count >= sensor_max or month != month_last or year != year_last): - # process set - file_count += self.process_one_month_sensor_set(records, page) - records = [] - if sensor_count >= sensor_max and month == month_last and year == year_last: - # start a new page. - page += 1 - else: - # start over. - page = 0 - sensor_count = 0 - - records.append(row) - sensor_count += 1 - if max_files != 0 and file_count >= max_files: - # Stop creating more files after the max is reached. - break - - month_last = month - year_last = year - - station_id = self.get_dly_field(records[0], DLY_FIELD_ID) - data_size = self.get_folder_size(self.get_base_folder(station_id) + "/" + station_id) - print "Created " + str(file_count) + " XML files for a data size of " + str(data_size) + "." - - return (file_count, data_size) - - def convert_c2f(self, c): - return (9 / 5 * c) + 32 - - def default_xml_web_service_start(self): - field_xml = "" - field_xml += "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n" - return field_xml - - def default_xml_data_start(self, total_records): - field_xml = "" - field_xml += "<dataCollection pageCount=\"1\" totalCount=\"" + str(total_records) + "\">\n" - return field_xml - - def default_xml_station_start(self): - field_xml = "" - field_xml = "<stationCollection pageSize=\"100\" pageCount=\"1\" totalCount=\"1\">\n" - return field_xml - - def default_xml_field_date(self, report_date, indent=2): - field_xml = "" - field_xml += self.get_indent_space(indent) + "<date>" + str(report_date.year) + "-" + str(report_date.month).zfill(2) + "-" + str(report_date.day).zfill(2) + "T00:00:00.000</date>\n" - return field_xml - - def default_xml_mshr_station_additional(self, station_id): - """The web service station data is generate from the MSHR data supplemented with GHCN-Daily.""" - station_mshr_row = "" - stations_mshr_file = open(self.mshr_stations, 'r') - for line in stations_mshr_file: - if station_id == self.get_field_from_definition(line, MSHR_FIELDS['GHCND_ID']).strip(): - station_mshr_row = line - break - - if station_mshr_row == "": - return "" - - additional_xml = "" - - county = self.get_field_from_definition(station_mshr_row, MSHR_FIELDS['COUNTY']).strip() - if county != "": - additional_xml += self.default_xml_location_labels("CNTY", "FIPS:-9999", county) - - country_code = self.get_field_from_definition(station_mshr_row, MSHR_FIELDS['FIPS_COUNTRY_CODE']).strip() - country_name = self.get_field_from_definition(station_mshr_row, MSHR_FIELDS['FIPS_COUNTRY_NAME']).strip() - if country_code != "" and country_name != "": - additional_xml += self.default_xml_location_labels("CNTRY", "FIPS:" + country_code, country_name) - - return additional_xml - - def default_xml_location_labels(self, type, id, display_name): - label_xml = "" - label_xml += self.default_xml_start_tag("locationLabels", 2) - label_xml += self.default_xml_element("type", type, 3) - label_xml += self.default_xml_element("id", id, 3) - label_xml += self.default_xml_element("displayName", display_name, 3) - label_xml += self.default_xml_end_tag("locationLabels", 2) - return label_xml - - - def default_xml_web_service_station(self, station_id): - """The web service station data is generate from available historical sources.""" - station_ghcnd_row = "" - stations_ghcnd_file = open(self.ghcnd_stations, 'r') - for line in stations_ghcnd_file: - if station_id == self.get_field_from_definition(line, STATIONS_FIELDS['ID']): - station_ghcnd_row = line - break - - xml_station = "" - xml_station += self.default_xml_start_tag("station", 1) - - xml_station += self.default_xml_element("id", "GHCND:" + station_id, 2) - xml_station += self.default_xml_element("displayName", self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['NAME']).strip(), 2) - xml_station += self.default_xml_element("latitude", self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['LATITUDE']).strip(), 2) - xml_station += self.default_xml_element("longitude", self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['LONGITUDE']).strip(), 2) - - elevation = self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['ELEVATION']).strip() - if elevation != "-999.9": - xml_station += self.default_xml_element("elevation", elevation, 2) - - state_code = self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['STATE']).strip() - if state_code != "" and state_code in self.STATES: - xml_station += self.default_xml_location_labels("ST", "FIPS:" + str(self.STATES.keys().index(state_code)), self.STATES[state_code]) - - # Add the MSHR data to the station generated information. - xml_station += self.default_xml_mshr_station_additional(station_id) - - xml_station += self.default_xml_end_tag("station", 1) - return xml_station - - def default_xml_day_reading_as_field(self, row, day): - day_index = DLY_FIELD_DAY_OFFSET + ((day - 1) * DLY_FIELD_DAY_FIELDS) - value = self.get_dly_field(row, day_index); - if value == "-9999": - return "" - - field_xml = "" - field_id = self.get_dly_field(row, DLY_FIELD_ELEMENT) - if field_id in ("MDTN", "MDTX", "MNPN", "MXPN", "TMAX", "TMIN", "TOBS",): - # Add both the celcius and fahrenheit temperatures. - celcius = float(value) / 10 - field_xml += " <" + field_id + "_c>" + str(celcius) + "</" + field_id + "_c>\n" - fahrenheit = self.convert_c2f(celcius) - field_xml += " <" + field_id + "_f>" + str(fahrenheit) + "</" + field_id + "_f>\n" - elif field_id in ("AWND", "EVAP", "PRCP", "THIC", "WESD", "WESF", "WSF1", "WSF2", "WSF5", "WSFG", "WSFI", "WSFM",): - # Field values that are in tenths. - converted_value = float(value) / 10 - field_xml += " <" + field_id + ">" + str(converted_value) + "</" + field_id + ">\n" - elif field_id in ("ACMC", "ACMH", "ACSC", "ACSH", "PSUN",): - # Fields is a percentage. - field_xml += " <" + field_id + ">" + value.strip() + "</" + field_id + ">\n" - elif field_id in ("FMTM", "PGTM",): - # Fields is a time value HHMM. - field_xml += " <" + field_id + ">" + value.strip() + "</" + field_id + ">\n" - elif field_id in ("DAEV", "DAPR", "DASF", "DATN", "DATX", "DAWM", "DWPR", "FRGB", "FRGT", "FRTH", "GAHT", "MDSF", "MDWM", "MDEV", "MDPR", "SNOW", "SNWD", "TSUN", "WDF1", "WDF2", "WDF5", "WDFG", "WDFI", "WDFM", "WDMV",): - # Fields with no alternation needed. - field_xml += " <" + field_id + ">" + value.strip() + "</" + field_id + ">\n" - else: - field_xml += " <unknown>" + field_id + "</unknown>\n" - - # print field_xml - return field_xml - - def default_xml_day_reading(self, row, day, indent=2): - day_index = DLY_FIELD_DAY_OFFSET + ((day - 1) * DLY_FIELD_DAY_FIELDS) - value = self.get_dly_field(row, day_index); - mflag = self.get_dly_field(row, day_index + 1); - qflag = self.get_dly_field(row, day_index + 2); - sflag = self.get_dly_field(row, day_index + 3); - - if value == "-9999": - return "" - - indent_space = self.get_indent_space(indent) - field_id = self.get_dly_field(row, DLY_FIELD_ELEMENT) - station_id = "GHCND:" + self.get_dly_field(row, DLY_FIELD_ID) - - field_xml = "" - field_xml += indent_space + "<dataType>" + field_id + "</dataType>\n" - field_xml += indent_space + "<station>" + station_id + "</station>\n" - field_xml += indent_space + "<value>" + value.strip() + "</value>\n" - field_xml += indent_space + "<attributes>\n" - field_xml += indent_space + indent_space + "<attribute>" + mflag.strip() + "</attribute>\n" - field_xml += indent_space + indent_space + "<attribute>" + qflag.strip() + "</attribute>\n" - field_xml += indent_space + indent_space + "<attribute>" + sflag.strip() + "</attribute>\n" - field_xml += indent_space + indent_space + "<attribute></attribute>\n" - field_xml += indent_space + "</attributes>\n" - - # print field_xml - return field_xml - - def default_xml_end(self): - return textwrap.dedent("""\ - </ghcnd_observation>""") - - def default_xml_data_end(self): - return self.default_xml_end_tag("dataCollection", 0) - - def default_xml_station_end(self): - return self.default_xml_end_tag("stationCollection", 0) - - def default_xml_element(self, tag, data, indent=1): - return self.get_indent_space(indent) + "<" + tag + ">" + data + "</" + tag + ">\n" - - def default_xml_start_tag(self, tag, indent=1): - return self.get_indent_space(indent) + "<" + tag + ">\n" - - def default_xml_end_tag(self, tag, indent=1): - return self.get_indent_space(indent) + "</" + tag + ">\n" - - def get_indent_space(self, indent): - return (" " * (4 * indent)) - - -class WeatherWebServiceMonthlyXMLFile(WeatherConvertToXML): - """The web service class details how to create files similar to the NOAA web service.""" - skip_downloading = False - # Station data - def process_station_data(self, row): - """Adds a single station record file either from downloading the data or generating a similar record.""" - station_id = self.get_dly_field(row, DLY_FIELD_ID) - download = 0 - if self.token is not "" and not self.skip_downloading: - download = self.download_station_data(station_id, self.token, True) - if download == 0: - self.skip_downloading = True - - # If not downloaded, generate. - if download != 0: - return download - else: - # Information for each daily file. - station_xml_file = self.default_xml_web_service_start() - station_xml_file += self.default_xml_station_start() - station_xml_file += self.default_xml_web_service_station(station_id) - station_xml_file += self.default_xml_station_end() - - # Remove white space. - station_xml_file = station_xml_file.replace("\n", ""); - station_xml_file = station_xml_file.replace(self.get_indent_space(1), ""); - - # Make sure the station folder is available. - ghcnd_xml_station_path = self.get_base_folder(station_id, "stations") - if not os.path.isdir(ghcnd_xml_station_path): - os.makedirs(ghcnd_xml_station_path) - - # Save XML string to disk. - save_file_name = ghcnd_xml_station_path + station_id + ".xml" - save_file_name = self.save_file(save_file_name, station_xml_file) - - if save_file_name is not "": - if self.debug_output: - print "Wrote file: " + save_file_name - return 1 - else: - return 0 - - # Station data - def download_station_data(self, station_id, token, reset=False): - """Downloads the station data from the web service.""" - import time - time.sleep(2) - # Make sure the station folder is available. - ghcnd_xml_station_path = self.get_base_folder(station_id, "stations") - if not os.path.isdir(ghcnd_xml_station_path): - os.makedirs(ghcnd_xml_station_path) - - # Build download URL. - url = "http://www.ncdc.noaa.gov/cdo-services/services/datasets/GHCND/stations/GHCND:" + station_id + ".xml?token=" + token - url_file = urllib.urlopen(url) - station_xml_file = "" - while (True): - line = url_file.readline() - if not line: - break - station_xml_file += line - - if station_xml_file.find("<cdoError>") != -1: - if self.debug_output: - print "Error in station download" - return 0 - - # Save XML string to disk. - save_file_name = ghcnd_xml_station_path + station_id + ".xml" - save_file_name = self.save_file(save_file_name, station_xml_file) - - if save_file_name is not "": - if self.debug_output: - print "Wrote file: " + save_file_name - return 2 - else: - return 0 - - # Sensor data - def process_one_month_sensor_set(self, records, page): - """Generates records for a station using the web service xml layout.""" - found_data = False - year = int(self.get_dly_field(records[0], DLY_FIELD_YEAR)) - month = int(self.get_dly_field(records[0], DLY_FIELD_MONTH)) - - station_id = self.get_dly_field(records[0], DLY_FIELD_ID) - - # Information for each daily file. - count = 0 - daily_xml_file = "" - - for day in range(1, 32): - try: - # TODO find out what is a valid python date range? 1889? - # Attempt to see if this is valid date. - report_date = date(year, month, day) - - for record in records: - record_xml_snip = self.default_xml_day_reading(record, report_date.day) - if record_xml_snip is not "": - daily_xml_file += self.default_xml_start_tag("data") - daily_xml_file += self.default_xml_field_date(report_date) - daily_xml_file += record_xml_snip - daily_xml_file += self.default_xml_end_tag("data") - found_data = True - count += 1 - - except ValueError: - pass - - daily_xml_file = self.default_xml_web_service_start() + self.default_xml_data_start(count) + daily_xml_file + self.default_xml_data_end() - daily_xml_file = daily_xml_file.replace("\n", ""); - daily_xml_file = daily_xml_file.replace(self.get_indent_space(1), ""); - - if not found_data: - return 0 - - # Make sure the station folder is available. - ghcnd_xml_station_path = self.get_base_folder(station_id) + "/" + station_id + "/" + str(report_date.year) + "/" - if not os.path.isdir(ghcnd_xml_station_path): - os.makedirs(ghcnd_xml_station_path) - - # Save XML string to disk. - save_file_name = ghcnd_xml_station_path + build_sensor_save_filename(station_id, report_date, page) - save_file_name = self.save_file(save_file_name, daily_xml_file) - - if save_file_name is not "": - if self.debug_output: - print "Wrote file: " + save_file_name - return 1 - else: - return 0 - -def build_base_save_folder(save_path, station_id, data_type="sensors"): - # Default - station_prefix = station_id[:3] - return save_path + data_type + "/" + station_prefix + "/" - -def build_sensor_save_filename(station_id, report_date, page): - # Default - return station_id + "_" + str(report_date.year).zfill(4) + str(report_date.month).zfill(2) + "_" + str(page) + ".xml" - http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py deleted file mode 100644 index 4877120..0000000 --- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py +++ /dev/null @@ -1,416 +0,0 @@ -#!/usr/bin/env python -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import glob -import os.path -import linecache -import distutils.core - -from weather_convert_to_xml import * -from collections import OrderedDict - -# Weather data files created to manage the conversion process. -# Allows partition and picking up where you left off. -class WeatherDataFiles: - - LARGE_FILE_ROOT_TAG = "root" - - INDEX_DATA_FILE_NAME = 0 - INDEX_DATA_SENSORS_STATUS = 1 - INDEX_DATA_STATION_STATUS = 2 - INDEX_DATA_FILE_COUNT = 3 - INDEX_DATA_FOLDER_DATA = 4 - - DATA_FILE_START_INDEX = 0 - DATA_FILE_EXTENSION = ".dly" - DATA_FILE_MISSING = "missing" - DATA_FILE_INITIAL = "initialized" - DATA_FILE_DOWNLOADED = "downloaded" - DATA_FILE_GENERATED = "generated" - SEPERATOR = "," - - type = "sensor" - data_reset = False - - def __init__(self, base_path, progress_file_name="/tmp/_weather_data.csv"): - self.base_path = base_path - - self.progress_file_name = progress_file_name - - self.current = self.DATA_FILE_START_INDEX - self.progress_data = [] - - def get_file_list_iterator(self): - """Return the list of files one at a time.""" - return glob.iglob(self.base_path + "/*" + self.DATA_FILE_EXTENSION) - - # Save Functions - def build_progress_file(self, options, convert): - if not os.path.isfile(self.progress_file_name) or 'reset' in options: - # Build a new file. - file = open(self.progress_file_name, 'w') - contents = self.get_default_progress_file_csv() - file.write(contents) - file.close() - elif 'append' in options or 'recalculate' in options: - self.open_progress_data() - row_count = len(self.progress_data) - for row in range(0, row_count): - row_contents = self.progress_data[row].rsplit(self.SEPERATOR) - file_name = row_contents[self.INDEX_DATA_FILE_NAME] - if self.get_file_row(file_name) < 0 and 'append' in options: - self.progress_data.append(self.get_progress_csv_row(file_name, self.DATA_FILE_INITIAL, self.DATA_FILE_INITIAL)) - elif 'recalculate' in options: - # The folder is hard coded - station_id = os.path.basename(file_name).split('.')[0] - folder_name = convert.get_base_folder(station_id) - if os.path.exists(folder_name): - row_contents = self.progress_data[row].rsplit(self.SEPERATOR) - sensor_status = row_contents[self.INDEX_DATA_SENSORS_STATUS] - station_status = row_contents[self.INDEX_DATA_STATION_STATUS] - file_count = self.get_file_count(folder_name) - data_size = self.get_folder_size(folder_name) - self.progress_data[row] = self.get_progress_csv_row(file_name, sensor_status, station_status, file_count, data_size) - else: - self.progress_data[row] = self.get_progress_csv_row(file_name, self.DATA_FILE_INITIAL, self.DATA_FILE_INITIAL) - # Save file - self.close_progress_data(True) - self.reset() - - def copy_to_n_partitions(self, save_path, partitions, base_paths, reset): - """Once the initial data has been generated, the data can be copied into a set number of partitions. """ - if (len(base_paths) == 0): - return - - # Initialize the partition paths. - partition_paths = get_partition_paths(0, partitions, base_paths) - for path in partition_paths: - # Make sure the xml folder is available. - prepare_path(path, reset) - - import fnmatch - import os - - # copy stations and sensors into each partition - current_sensor_partition = 0 - current_station_partition = 0 - self.open_progress_data() - row_count = len(self.progress_data) - for row in range(0, row_count): - row_contents = self.progress_data[row].rsplit(self.SEPERATOR) - file_name = row_contents[self.INDEX_DATA_FILE_NAME] - station_id = os.path.basename(file_name).split('.')[0] - - # Copy sensor files - type = "sensors" - file_path = build_base_save_folder(save_path, station_id, type) + station_id - for root, dirnames, filenames in os.walk(file_path): - for filename in fnmatch.filter(filenames, '*.xml'): - xml_path = os.path.join(root, filename) - new_file_base = build_base_save_folder(partition_paths[current_sensor_partition], station_id, type) + station_id - if not os.path.isdir(new_file_base): - os.makedirs(new_file_base) - shutil.copyfile(xml_path, new_file_base + "/" + filename) - current_sensor_partition += 1 - if current_sensor_partition >= len(partition_paths): - current_sensor_partition = 0 - - # Copy station files - type = "stations" - file_path = build_base_save_folder(save_path, station_id, type) + station_id + ".xml" - new_file_base = build_base_save_folder(partition_paths[current_station_partition], station_id, type) - new_file_path = new_file_base + station_id + ".xml" - if os.path.isfile(file_path): - if not os.path.isdir(new_file_base): - os.makedirs(new_file_base) - shutil.copyfile(file_path, new_file_path) - current_station_partition += 1 - if current_station_partition >= len(partition_paths): - current_station_partition = 0 - - def build_to_n_partition_files(self, save_path, partitions, base_paths, reset): - """Once the initial data has been generated, the data can be divided into partitions - and stored in single files. - """ - if (len(base_paths) == 0): - return - - XML_START = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>" - - partition_paths = get_partition_paths(0, partitions, base_paths) - - import fnmatch - import os - - for path in partition_paths: - prepare_path(path, reset) - - # Initialize the partition paths. - types = ["sensors", "stations"] - for type in types: - partition_files = [] - for path in partition_paths: - # Make sure the xml folder is available. - prepare_path(path + type + "/", False) - partition_files.append(open(path + type + "/partition.xml", 'w')) - partition_files[-1].write(XML_START + "<" + self.LARGE_FILE_ROOT_TAG + ">\n") - - # copy into each partition - current_partition = 0 - self.open_progress_data() - row_count = len(self.progress_data) - for row in range(0, row_count): - row_contents = self.progress_data[row].rsplit(self.SEPERATOR) - file_name = row_contents[self.INDEX_DATA_FILE_NAME] - station_id = os.path.basename(file_name).split('.')[0] - - # Copy files - if type == "sensors": - file_path = build_base_save_folder(save_path, station_id, type) + station_id - for root, dirnames, filenames in os.walk(file_path): - for filename in fnmatch.filter(filenames, '*.xml'): - xml_path = os.path.join(root, filename) - xml_data = file_get_contents(xml_path).replace(XML_START, "") + "\n" - partition_files[current_partition].write(xml_data) - current_partition += 1 - if current_partition >= len(partition_files): - current_partition = 0 - elif type == "stations": - file_path = build_base_save_folder(save_path, station_id, type) + station_id + ".xml" - xml_path = os.path.join(root, file_path) - xml_data = file_get_contents(xml_path).replace(XML_START, "") + "\n" - partition_files[current_partition].write(xml_data) - current_partition += 1 - if current_partition >= len(partition_paths): - current_partition = 0 - - for row in range(0, len(partition_paths)): - partition_files[row].write("</" + self.LARGE_FILE_ROOT_TAG + ">\n") - partition_files[row].close() - - def get_file_row(self, file_name): - for i in range(0, len(self.progress_data)): - if self.progress_data[i].startswith(file_name): - return i - return -1 - - def get_default_progress_file_csv(self): - contents = "" - for path in self.get_file_list_iterator(): - file_name = os.path.basename(path) - contents += self.get_progress_csv_row(file_name, self.DATA_FILE_INITIAL, self.DATA_FILE_INITIAL) - return contents - - def print_progress_file_stats(self, convert): - sensor_count_missing = 0 - sensor_count = 0 - file_count = 0 - data_size = 0 - - sensor_count_actual = 0 - file_count_actual = 0 - data_size_actual = 0 - - station_count_missing = 0 - station_count_generated = 0 - station_count_downloaded = 0 - - self.open_progress_data() - row_count = len(self.progress_data) - for row in range(0, row_count): - row_contents = self.progress_data[row].rsplit(self.SEPERATOR) - if int(row_contents[self.INDEX_DATA_FILE_COUNT]) != -1 and int(row_contents[self.INDEX_DATA_FOLDER_DATA]) != -1: - sensor_count += 1 - file_count += int(row_contents[self.INDEX_DATA_FILE_COUNT]) - data_size += int(row_contents[self.INDEX_DATA_FOLDER_DATA]) - else: - sensor_count_missing += 1 - - if row_contents[self.INDEX_DATA_STATION_STATUS] == "generated": - station_count_generated += 1 - if row_contents[self.INDEX_DATA_STATION_STATUS] == "downloaded": - station_count_downloaded += 1 - else: - station_count_missing += 1 - - file_name = row_contents[self.INDEX_DATA_FILE_NAME] - station_id = os.path.basename(file_name).split('.')[0] - folder_name = convert.get_base_folder(station_id) - if os.path.exists(folder_name): - sensor_count_actual += 1 - file_count_actual += self.get_file_count(folder_name) - data_size_actual += self.get_folder_size(folder_name) - - - print "Progress File:\t" + self.progress_file_name + "\n" - - print "CSV DETAILS OF PROCESSED SENSORS" - print "Number of stations:\t" + "{:,}".format(sensor_count) - print "Number of files:\t" + "{:,}".format(file_count) - print "Data size:\t\t" + "{:,}".format(data_size) + " Bytes\n" - - print "CSV DETAILS OF unPROCESSED SENSORS" - print "Number of stations:\t" + "{:,}".format(sensor_count_missing) + "\n" - - print "CSV DETAILS OF PROCESSED STATIONS" - print "Generated:\t\t" + "{:,}".format(station_count_generated) - print "Downloaded:\t\t" + "{:,}".format(station_count_downloaded) - print "Missing:\t\t" + "{:,}".format(station_count_missing) + "\n" - - print "FOLDER DETAILS" - print "Number of stations:\t" + "{:,}".format(sensor_count_actual) - print "Number of files:\t" + "{:,}".format(file_count_actual) - print "Data size:\t\t" + "{:,}".format(data_size_actual) + " Bytes\n" - - - def get_progress_csv_row(self, file_name, sensors_status, station_status, file_count=-1, data_size=-1): - return file_name + self.SEPERATOR + sensors_status + self.SEPERATOR + station_status + self.SEPERATOR + str(file_count) + self.SEPERATOR + str(data_size) + "\n" - - def update_file_sensor_status(self, file_name, sensors_status, file_count=-1, data_size=-1): - for row in range(0, len(self.progress_data)): - if self.progress_data[row].startswith(file_name): - station_status = self.progress_data[row].rsplit(self.SEPERATOR)[self.INDEX_DATA_STATION_STATUS] - self.progress_data[row] = self.get_progress_csv_row(file_name, sensors_status, station_status, file_count, data_size) - break - - # Save the file - self.close_progress_data(True) - - def update_file_station_status(self, file_name, station_status): - for row in range(0, len(self.progress_data)): - if self.progress_data[row].startswith(file_name): - row_contents = self.progress_data[row].rsplit(self.SEPERATOR) - sensors_status = row_contents[self.INDEX_DATA_SENSORS_STATUS] - file_count = int(row_contents[self.INDEX_DATA_FILE_COUNT]) - data_size = int(row_contents[self.INDEX_DATA_FOLDER_DATA]) - self.progress_data[row] = self.get_progress_csv_row(file_name, sensors_status, station_status, file_count, data_size) - break - - # Save the file - self.close_progress_data(True) - - def get_file_count(self, folder_name): - count = 0 - for dirpath, dirnames, filenames in os.walk(folder_name): - for f in filenames: - count += 1 - return count - - def get_folder_size(self, folder_name): - total_size = 0 - for dirpath, dirnames, filenames in os.walk(folder_name): - for f in filenames: - fp = os.path.join(dirpath, f) - total_size += os.path.getsize(fp) - return total_size - - def get_station_status(self, return_value): - if return_value == 2: - return self.DATA_FILE_DOWNLOADED - elif return_value == 1: - return self.DATA_FILE_GENERATED - return self.DATA_FILE_MISSING - - - def open_progress_data(self): - with open(self.progress_file_name, 'r') as file: - self.progress_data = file.readlines() - - def close_progress_data(self, force=False): - if len(self.progress_data) > 0 or force: - with open(self.progress_file_name, 'w') as file: - file.writelines(self.progress_data) - - - def reset(self): - self.close_progress_data() - - self.current = self.DATA_FILE_START_INDEX - self.open_progress_data() - - def set_type(self, type): - self.type = type - - def set_data_reset(self, data_reset): - self.data_reset = data_reset - - - # Iterator Functions - def __iter__(self): - return self - - def next(self): - columns = [] - while True: - # find a row that has not been created. - if self.current >= len(self.progress_data): - raise StopIteration - row = self.progress_data[self.current] - self.current += 1 - columns = row.rsplit(self.SEPERATOR) - if self.type == "sensor" and (columns[self.INDEX_DATA_SENSORS_STATUS].strip() != self.DATA_FILE_GENERATED or self.data_reset): - break - elif self.type == "station" and (columns[self.INDEX_DATA_STATION_STATUS].strip() != self.DATA_FILE_DOWNLOADED or self.data_reset): - break - return columns[self.INDEX_DATA_FILE_NAME] - - -# Index values of each field details. -PARTITION_INDEX_NODE = 0 -PARTITION_INDEX_DISK = 1 -PARTITION_INDEX_VIRTUAL = 2 -PARTITION_INDEX = 3 -PARTITION_INDEX_PATH = 4 -PARTITION_HEADER = ("Node", "Disk", "Virtual", "Index", "Path") - -def get_partition_paths(node_id, partitions, base_paths, key="partitions"): - partition_paths = [] - for scheme in get_partition_scheme(node_id, partitions, base_paths, key): - partition_paths.append(scheme[PARTITION_INDEX_PATH]) - return partition_paths - -def get_partition_scheme(node_id, virtual_partitions, base_paths, key="partitions"): - partitions_per_disk = virtual_partitions / len(base_paths) - return get_disk_partition_scheme(node_id, partitions_per_disk, base_paths, key) - -def get_disk_partition_paths(node_id, partitions, base_paths, key="partitions"): - partition_paths = [] - for scheme in get_disk_partition_scheme(node_id, partitions, base_paths, key): - partition_paths.append(scheme[PARTITION_INDEX_PATH]) - return partition_paths - -def get_disk_partition_scheme(node_id, virtual_disk_partitions, base_paths, key="partitions"): - partition_scheme = [] - for i in range(0, virtual_disk_partitions): - for j in range(0, len(base_paths)): - new_partition_path = base_paths[j] + key + "/" + get_partition_folder(j, virtual_disk_partitions, i) + "/" - partition_scheme.append((node_id, j, virtual_disk_partitions, i, new_partition_path)) - return partition_scheme - -def get_partition_folder(disks, partitions, index): - return "d" + str(disks) + "_p" + str(partitions) + "_i" + str(index) - -def prepare_path(path, reset): - """Ensures the directory is available. If reset, then its a brand new directory.""" - if os.path.isdir(path) and reset: - shutil.rmtree(path) - - if not os.path.isdir(path): - os.makedirs(path) - -def file_get_contents(filename): - with open(filename) as f: - return f.read() http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py deleted file mode 100644 index fb59b50..0000000 --- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env python -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import sys -import os.path -import shutil -import tarfile -import urllib -import zipfile - -# Custom modules. -from weather_config_ghcnd import * -from weather_config_mshr import * - -class WeatherDownloadFiles: - - def __init__(self, save_path): - self.save_path = save_path - - if not os.path.isdir(save_path): - os.makedirs(save_path) - - - def download_ghcnd_files(self, reset=False): - """Download the complete list.""" - for file_name in FILE_NAMES: - url = BASE_DOWNLOAD_URL + file_name - self.download_file(url, reset) - - def download_mshr_files(self, reset=False): - for url in MSHR_URLS: - self.download_file(url, reset) - - def download_file(self, url, reset=False): - """Download the file, unless it exists.""" - file_name = self.save_path + "/" + url.split('/')[-1] - - if not os.path.isfile(file_name) or reset: - print "Downloading: " + url - urllib.urlretrieve(url, file_name, report_download_status) - print - - def unzip_ghcnd_package(self, package, reset=False): - """Unzip the package file, unless it exists.""" - file_name = self.save_path + "/" + package + ".tar.gz" - unzipped_path = self.save_path + "/" + package - - if os.path.isdir(unzipped_path) and reset: - shutil.rmtree(unzipped_path) - - if not os.path.isdir(unzipped_path): - print "Unzipping: " + file_name - tar_file = tarfile.open(file_name, 'r:gz') - tar_file.extractall(unzipped_path) - - def unzip_mshr_files(self, reset=False): - """Unzip the package file, unless it exists.""" - for url in MSHR_URLS: - if url.endswith('.zip'): - file_name = self.save_path + "/" + url.split('/')[-1] - print "Unzipping: " + file_name - with zipfile.ZipFile(file_name, 'r') as myzip: - myzip.extractall(self.save_path) - -def report_download_status(count, block, size): - """Report download status.""" - line_size = 50 - erase = "\b" * line_size - sys.stdout.write(erase) - report = get_report_line((float(count) * block / size), line_size) - sys.stdout.write(report) - -def get_report_line(percentage, line_size): - """Creates a string to be used in reporting the percentage done.""" - report = "" - for i in range(0, line_size): - if (float(i) / line_size < percentage): - report += "=" - else: - report += "-" - return report - -def download_file_save_as(url, new_file_name, reset=False): - """Download the file, unless it exists.""" - if not os.path.isfile(new_file_name) or reset: - print "Downloading: " + url - urllib.urlretrieve(url, new_file_name, report_download_status) - print - http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/util/README.md ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/util/README.md b/vxquery-benchmark/src/main/resources/util/README.md deleted file mode 100644 index 8e2a204..0000000 --- a/vxquery-benchmark/src/main/resources/util/README.md +++ /dev/null @@ -1,28 +0,0 @@ -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -Utilities for Benchmark Operations -===================== - -# Introduction - -Helpful scripts or configuration document to work with the benchmarks. - -## Saxon Collection - -To test the data with other XQuery processors, the saxon script helps with -creating a collection.xml file. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/util/build_saxon_collection_xml.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/util/build_saxon_collection_xml.py b/vxquery-benchmark/src/main/resources/util/build_saxon_collection_xml.py deleted file mode 100644 index 02f39ee..0000000 --- a/vxquery-benchmark/src/main/resources/util/build_saxon_collection_xml.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import getopt, glob, os, sys - -def main(argv): - xml_folder = "" - - # Get the base folder - try: - opts, args = getopt.getopt(argv, "f:h", ["folder="]) - except getopt.GetoptError: - print 'The file options for build_saxon_collection_xml.py were not correctly specified.' - print 'To see a full list of options try:' - print ' $ python build_saxon_collection_xml.py -h' - sys.exit(2) - for opt, arg in opts: - if opt == '-h': - print 'Options:' - print ' -f The base folder to create collection XML file.' - sys.exit() - elif opt in ('-f', "--folder"): - # check if file exists. - if os.path.exists(arg): - xml_folder = arg - else: - print 'Error: Argument must be a folder name for --folder (-f).' - sys.exit() - - # Required fields to run the script. - if xml_folder == "" or not os.path.exists(xml_folder): - print 'Error: The folder path option must be supplied: --folder (-f).' - sys.exit() - - # find all XML files in folder - collection_xml = "<collection>" - for i in range(1, 5): - # Search the ith directory level. - search_pattern = xml_folder + ('/*' * i) + '.xml' - for file_path in glob.iglob(search_pattern): - collection_xml += '<doc href="' + str.replace(file_path, xml_folder, '') + '"/>' - collection_xml += "</collection>" - - # create collection XML - file = open('collection.xml', 'w') - file.write(collection_xml) - file.close() - -if __name__ == "__main__": - main(sys.argv[1:]) http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/util/find_averages_in_logs.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/util/find_averages_in_logs.py b/vxquery-benchmark/src/main/resources/util/find_averages_in_logs.py deleted file mode 100644 index 1cd7939..0000000 --- a/vxquery-benchmark/src/main/resources/util/find_averages_in_logs.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import fnmatch -import getopt -import glob -import os -import sys -import csv - -SEARCH_STRING = 'Average execution time:' - -def find_files(directory, pattern): - for root, dirs, files in os.walk(directory): - for basename in files: - if fnmatch.fnmatch(basename, pattern): - yield (root, basename) - - -def main(argv): - ''' Same as bash: find $FOLDER -type f -name "*.xml" -exec basename {} \; > list_xml.csv - ''' - log_folder = "" - save_file = "" - data_type = "" - - # Get the base folder - try: - opts, args = getopt.getopt(argv, "f:hs:t:", ["folder=", "save_file=", "data_type="]) - except getopt.GetoptError: - print 'The file options for list_xml_files.py were not correctly specified.' - print 'To see a full list of options try:' - print ' $ python list_xml_files.py -h' - sys.exit(2) - for opt, arg in opts: - if opt == '-h': - print 'Options:' - print ' -f The base folder to build XML file list.' - print ' -s The save file.' - sys.exit() - elif opt in ('-f', "--folder"): - # check if file exists. - if os.path.exists(arg): - log_folder = arg - else: - print 'Error: Argument must be a folder name for --folder (-f).' - sys.exit() - elif opt in ('-s', "--save_file"): - save_file = arg - elif opt in ('-t', "--data_type"): - data_type = arg - - # Required fields to run the script. - if log_folder == "" or not os.path.exists(log_folder): - print 'Error: The folder path option must be supplied: --folder (-f).' - sys.exit() - if save_file == "": - print 'Error: The folder path option must be supplied: --save_file (-s).' - sys.exit() - - list_xml_csv = '' - with open(save_file, 'w') as outfile: - csvfile = csv.writer(outfile) - for path, filename in find_files(log_folder, '*.log'): - # Only write out a specific type of data xml documents found in a specific path. - with open(path + "/" + filename) as infile: - folders = path.replace(log_folder, "") - for line in infile: - # Skip the root tags. - if line.startswith(SEARCH_STRING): - time_split = line.split(" ") - name_split = filename.split(".") - folder_split = folders.split("/") - - # Build data row - row = folder_split - row.append(name_split[0]) - row.append(time_split[3]) - row.append(name_split[2]) - csvfile.writerow(row) - - -if __name__ == "__main__": - main(sys.argv[1:]) http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/util/log_top.sh ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/util/log_top.sh b/vxquery-benchmark/src/main/resources/util/log_top.sh deleted file mode 100755 index 4a2f7e1..0000000 --- a/vxquery-benchmark/src/main/resources/util/log_top.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -LOG_FILE=logs/top.log - -# Reset counters -iostat >> /dev/null -sar -n DEV 1 1 >> /dev/null - -# Save IO, CPU and Network snapshot to a log file. -while (sleep 7) -do - echo "---------------------------------------------" >> ${LOG_FILE} - date >> ${LOG_FILE} - echo >> ${LOG_FILE} - iostat -y 1 1 >> ${LOG_FILE} - top -n 1 -b | head -11 | tail -6 >> ${LOG_FILE} - sar -n DEV 1 1 >> ${LOG_FILE} -done; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/util/merge_xml_files.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/util/merge_xml_files.py b/vxquery-benchmark/src/main/resources/util/merge_xml_files.py deleted file mode 100644 index 9238a19..0000000 --- a/vxquery-benchmark/src/main/resources/util/merge_xml_files.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env python -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import fnmatch -import getopt -import glob -import os -import sys - -XML_PREFIX = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?><root>' + "\n" -XML_SUFFIX = '</root>' + "\n" - -def find_files(directory, pattern): - for root, dirs, files in os.walk(directory): - for basename in files: - if fnmatch.fnmatch(basename, pattern): - yield (root, basename) - - -def main(argv): - ''' Same as bash: find $FOLDER -type f -name "*.xml" -exec basename {} \; > list_xml.csv - ''' - xml_folder = "" - save_file = "" - data_type = "" - - # Get the base folder - try: - opts, args = getopt.getopt(argv, "f:hs:t:", ["folder=", "save_file=", "data_type="]) - except getopt.GetoptError: - print 'The file options for list_xml_files.py were not correctly specified.' - print 'To see a full list of options try:' - print ' $ python merge_xml_files.py -f /path/to/folder -s new.xml -t sensors' - sys.exit(2) - for opt, arg in opts: - if opt == '-h': - print 'Options:' - print ' -f The base folder to build XML file list.' - print ' -s The save file.' - sys.exit() - elif opt in ('-f', "--folder"): - # check if file exists. - if os.path.exists(arg): - xml_folder = arg - else: - print 'Error: Argument must be a folder name for --folder (-f).' - sys.exit() - elif opt in ('-s', "--save_file"): - save_file = arg - elif opt in ('-t', "--data_type"): - data_type = arg - - # Required fields to run the script. - if xml_folder == "" or not os.path.exists(xml_folder): - print 'Error: The folder path option must be supplied: --folder (-f).' - sys.exit() - if save_file == "": - print 'Error: The folder path option must be supplied: --save_file (-s).' - sys.exit() - - list_xml_csv = '' - with open(save_file, 'w') as outfile: - outfile.write(XML_PREFIX) - for path, filename in find_files(xml_folder, '*.xml'): - # Only write out a specific type of data xml documents found in a specific path. - if data_type in path: - with open(path + "/" + filename) as infile: - for line in infile: - # Skip the root tags. - if line != XML_PREFIX and line != XML_SUFFIX: - outfile.write(line) - outfile.write(XML_SUFFIX) - -if __name__ == "__main__": - main(sys.argv[1:]) http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/util/vxquery_functions.xq ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/util/vxquery_functions.xq b/vxquery-benchmark/src/main/resources/util/vxquery_functions.xq deleted file mode 100644 index d0621eb..0000000 --- a/vxquery-benchmark/src/main/resources/util/vxquery_functions.xq +++ /dev/null @@ -1,27 +0,0 @@ -(: Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. :) - -(: XQuery Function List :) -(: VXQuery function list in csv with arguments and return types :) -let $list := "../../../../../vxquery-core/src/main/java/org/apache/vxquery/functions/builtin-functions.xml" -let $r := - for $f in fn:doc($list)/functions/function - let $pl := - for $p in $f/param - return $p/@type - return fn:string-join(($f/@name, fn:string-join($pl, ' '), $f/return/@type), ',') -return fn:string-join($r , '|') \ No newline at end of file http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/util/vxquery_operators.xq ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/util/vxquery_operators.xq b/vxquery-benchmark/src/main/resources/util/vxquery_operators.xq deleted file mode 100644 index f485807..0000000 --- a/vxquery-benchmark/src/main/resources/util/vxquery_operators.xq +++ /dev/null @@ -1,27 +0,0 @@ -(: Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. :) - -(: XQuery Function List :) -(: VXQuery function list in csv with arguments and return types :) -let $list := "../../../../../vxquery-core/src/main/java/org/apache/vxquery/functions/builtin-operators.xml" -let $r := - for $f in fn:doc($list)/operators/operator - let $pl := - for $p in $f/param - return $p/@type - return fn:string-join(($f/@name, fn:string-join($pl, ' '), $f/return/@type), ',') -return fn:string-join($r , '|') \ No newline at end of file
