[01/14] Remove benchmark files to allow easy copy from other branch.

prestonc Tue, 21 Oct 2014 12:36:24 -0700

Repository: vxquery
Updated Branches:
  refs/heads/master 72fd5c645 -> e97888ed8



http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
deleted file mode 100644
index 5db090a..0000000
--- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py
+++ /dev/null
@@ -1,554 +0,0 @@
-#!/usr/bin/env python
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import textwrap
-from datetime import date
-import os
-from collections import OrderedDict
-
-# Custom modules.
-from weather_config_ghcnd import *
-from weather_config_mshr import *
-from weather_download_files import *
-
-class WeatherConvertToXML:
-    
-    STATES = OrderedDict({
-        'AK': 'Alaska',
-        'AL': 'Alabama',
-        'AR': 'Arkansas',
-        'AS': 'American Samoa',
-        'AZ': 'Arizona',
-        'CA': 'California',
-        'CO': 'Colorado',
-        'CT': 'Connecticut',
-        'DC': 'District of Columbia',
-        'DE': 'Delaware',
-        'FL': 'Florida',
-        'GA': 'Georgia',
-        'GU': 'Guam',
-        'HI': 'Hawaii',
-        'IA': 'Iowa',
-        'ID': 'Idaho',
-        'IL': 'Illinois',
-        'IN': 'Indiana',
-        'KS': 'Kansas',
-        'KY': 'Kentucky',
-        'LA': 'Louisiana',
-        'MA': 'Massachusetts',
-        'MD': 'Maryland',
-        'ME': 'Maine',
-        'MI': 'Michigan',
-        'MN': 'Minnesota',
-        'MO': 'Missouri',
-        'MP': 'Northern Mariana Islands',
-        'MS': 'Mississippi',
-        'MT': 'Montana',
-        'NA': 'National',
-        'NC': 'North Carolina',
-        'ND': 'North Dakota',
-        'NE': 'Nebraska',
-        'NH': 'New Hampshire',
-        'NJ': 'New Jersey',
-        'NM': 'New Mexico',
-        'NV': 'Nevada',
-        'NY': 'New York',
-        'OH': 'Ohio',
-        'OK': 'Oklahoma',
-        'OR': 'Oregon',
-        'PA': 'Pennsylvania',
-        'PR': 'Puerto Rico',
-        'RI': 'Rhode Island',
-        'SC': 'South Carolina',
-        'SD': 'South Dakota',
-        'TN': 'Tennessee',
-        'TX': 'Texas',
-        'UT': 'Utah',
-        'VA': 'Virginia',
-        'VI': 'Virgin Islands',
-        'VT': 'Vermont',
-        'WA': 'Washington',
-        'WI': 'Wisconsin',
-        'WV': 'West Virginia',
-        'WY': 'Wyoming'
-    })
-    
-    MONTHS = [
-        "January",
-        "February",
-        "March",
-        "April",
-        "May",
-        "June",
-        "July",
-        "August",
-        "September",
-        "October",
-        "November",
-        "December"
-    ]
-    
-    token = ""
-    
-    def __init__(self, base_path, save_path, debug_output):
-        self.save_path = save_path
-        self.debug_output = debug_output
-
-        # Extra support files.
-        self.ghcnd_countries = base_path + '/ghcnd-countries.txt'
-        self.ghcnd_inventory = base_path + '/ghcnd-inventory.txt'
-        self.ghcnd_states = base_path + '/ghcnd-states.txt'
-        self.ghcnd_stations = base_path + '/ghcnd-stations.txt'
-        
-        # MSHR support files.
-        self.mshr_stations = base_path + '/mshr_enhanced_201402.txt'
-        
-    def set_token(self, token):
-        self.token = token
-        
-    def get_field_from_definition(self, row, field_definition):
-        return row[(field_definition[FIELD_INDEX_START] - 
1):field_definition[FIELD_INDEX_END]]
-    
-    def get_field(self, fields_array, row, index):
-        return row[(fields_array[index][FIELD_INDEX_START] - 
1):fields_array[index][FIELD_INDEX_END]]
-    
-    def get_dly_field(self, row, index):
-        return self.get_field(DLY_FIELDS, row, index)
-    
-    def print_row_files(self, row):
-        for field in DLY_FIELDS:
-            print str(field[FIELD_INDEX_NAME]) + " = '" + 
row[(field[FIELD_INDEX_START] - 1):field[FIELD_INDEX_END]] + "'"
-    
-    def save_file(self, filename, contents):
-        file = open(filename, 'w')
-        file.write(contents)
-        file.close()
-        return filename
-    
-    def get_folder_size(self, folder_name):
-        total_size = 0
-        for dirpath, dirnames, filenames in os.walk(folder_name):
-            for f in filenames:
-                fp = os.path.join(dirpath, f)
-                total_size += os.path.getsize(fp)
-        return total_size
-
-    def process_one_month_sensor_set(self, records, page):
-        # Default
-        return 0
-    
-    def process_station_data(self, row):
-        # Default
-        return 0
-    
-    def get_base_folder(self, station_id, data_type="sensors"):
-        return build_base_save_folder(self.save_path, station_id, data_type) 
-    
-    def process_inventory_file(self):
-        print "Processing inventory file"
-        file_stream = open(self.ghcnd_inventory, 'r')
-        
-        csv_header = ['ID', 'SENSORS', 'SENSORS_COUNT', 'MAX_YEARS', 
'TOTAL_YEARS_FOR_ALL_SENSORS']
-        row = file_stream.readline()
-        csv_inventory = {}
-        for row in file_stream:
-            id = self.get_field_from_definition(row, INVENTORY_FIELDS['ID'])
-            sensor_id = self.get_field_from_definition(row, 
INVENTORY_FIELDS['ELEMENT'])
-            start = int(self.get_field_from_definition(row, 
INVENTORY_FIELDS['FIRSTYEAR']))
-            end = int(self.get_field_from_definition(row, 
INVENTORY_FIELDS['LASTYEAR']))
-            if id in csv_inventory:
-                new_count = str(int(csv_inventory[id][2]) + 1)
-                new_max = str(max(int(csv_inventory[id][3]), (end - start)))
-                new_total = str(int(csv_inventory[id][3]) + end - start)
-                csv_inventory[id] = [id, (csv_inventory[id][1] + "," + 
sensor_id), new_count, new_max, new_total]
-            else:
-                csv_inventory[id] = [id, sensor_id, str(1), str(end - start), 
str(end - start)]
-                
-        path = self.save_path + "/inventory.csv"
-        self.save_csv_file(path, csv_inventory, csv_header)
-    
-    def save_csv_file(self, path, csv_inventory, header):
-        csv_content = "|".join(header) + "\n"
-        for row_id in csv_inventory:
-            csv_content += "|".join(csv_inventory[row_id]) + "\n"
-        self.save_file(path, csv_content)
-        
-
-    def process_station_file(self, file_name):
-        print "Processing station file: " + file_name
-        file_stream = open(file_name, 'r')
-        
-        row = file_stream.readline()
-        return self.process_station_data(row)
-
-    def process_sensor_file(self, file_name, max_files, sensor_max=99):
-        print "Processing sensor file: " + file_name
-        file_stream = open(file_name, 'r')
-    
-        month_last = 0
-        year_last = 0
-        records = []
-        page = 0
-        sensor_count = 0
-    
-        file_count = 0
-        for row in file_stream:
-            month = self.get_dly_field(row, DLY_FIELD_MONTH)
-            year = self.get_dly_field(row, DLY_FIELD_YEAR)
-            
-            if (month_last != 0 and year_last != 0) and (sensor_count >= 
sensor_max or month != month_last or year != year_last):
-                # process set
-                file_count += self.process_one_month_sensor_set(records, page)
-                records = []
-                if sensor_count >= sensor_max and month == month_last and year 
== year_last:
-                    # start a new page.
-                    page += 1
-                else:
-                    # start over.
-                    page = 0
-                sensor_count = 0
-            
-            records.append(row)
-            sensor_count += 1
-            if max_files != 0 and file_count >= max_files:
-                # Stop creating more files after the max is reached.
-                break
-
-            month_last = month
-            year_last = year
-        
-        station_id = self.get_dly_field(records[0], DLY_FIELD_ID)
-        data_size = self.get_folder_size(self.get_base_folder(station_id) + 
"/" + station_id)
-        print "Created " + str(file_count) + " XML files for a data size of " + str(data_size) + "."
-        
-        return (file_count, data_size)
-    
-    def convert_c2f(self, c):
-        return (9 / 5 * c) + 32
-    
-    def default_xml_web_service_start(self):
-        field_xml = ""
-        field_xml += "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n"
-        return field_xml
-    
-    def default_xml_data_start(self, total_records):
-        field_xml = ""
-        field_xml += "<dataCollection pageCount=\"1\" totalCount=\"" + 
str(total_records) + "\">\n"
-        return field_xml
-    
-    def default_xml_station_start(self):
-        field_xml = ""
-        field_xml = "<stationCollection pageSize=\"100\" pageCount=\"1\" totalCount=\"1\">\n"
-        return field_xml
-    
-    def default_xml_field_date(self, report_date, indent=2):
-        field_xml = ""
-        field_xml += self.get_indent_space(indent) + "<date>" + 
str(report_date.year) + "-" + str(report_date.month).zfill(2) + "-" + 
str(report_date.day).zfill(2) + "T00:00:00.000</date>\n"
-        return field_xml
-    
-    def default_xml_mshr_station_additional(self, station_id):
-        """The web service station data is generate from the MSHR data supplemented with GHCN-Daily."""
-        station_mshr_row = ""
-        stations_mshr_file = open(self.mshr_stations, 'r')
-        for line in stations_mshr_file:
-            if station_id == self.get_field_from_definition(line, 
MSHR_FIELDS['GHCND_ID']).strip():
-                station_mshr_row = line
-                break
-        
-        if station_mshr_row == "":
-            return ""
-
-        additional_xml = ""
-
-        county = self.get_field_from_definition(station_mshr_row, 
MSHR_FIELDS['COUNTY']).strip()
-        if county != "":
-            additional_xml += self.default_xml_location_labels("CNTY", 
"FIPS:-9999", county)
-            
-        country_code = self.get_field_from_definition(station_mshr_row, 
MSHR_FIELDS['FIPS_COUNTRY_CODE']).strip()
-        country_name = self.get_field_from_definition(station_mshr_row, 
MSHR_FIELDS['FIPS_COUNTRY_NAME']).strip()
-        if country_code != "" and country_name != "":
-            additional_xml += self.default_xml_location_labels("CNTRY", 
"FIPS:" + country_code, country_name)
-        
-        return additional_xml
-
-    def default_xml_location_labels(self, type, id, display_name):
-        label_xml = ""
-        label_xml += self.default_xml_start_tag("locationLabels", 2)
-        label_xml += self.default_xml_element("type", type, 3)
-        label_xml += self.default_xml_element("id", id, 3)
-        label_xml += self.default_xml_element("displayName", display_name, 3)
-        label_xml += self.default_xml_end_tag("locationLabels", 2)
-        return label_xml
-        
-
-    def default_xml_web_service_station(self, station_id):
-        """The web service station data is generate from available historical sources."""
-        station_ghcnd_row = ""
-        stations_ghcnd_file = open(self.ghcnd_stations, 'r')
-        for line in stations_ghcnd_file:
-            if station_id == self.get_field_from_definition(line, 
STATIONS_FIELDS['ID']):
-                station_ghcnd_row = line
-                break
-    
-        xml_station = ""
-        xml_station += self.default_xml_start_tag("station", 1)
-        
-        xml_station += self.default_xml_element("id", "GHCND:" + station_id, 2)
-        xml_station += self.default_xml_element("displayName", 
self.get_field_from_definition(station_ghcnd_row, 
STATIONS_FIELDS['NAME']).strip(), 2)
-        xml_station += self.default_xml_element("latitude", 
self.get_field_from_definition(station_ghcnd_row, 
STATIONS_FIELDS['LATITUDE']).strip(), 2)
-        xml_station += self.default_xml_element("longitude", 
self.get_field_from_definition(station_ghcnd_row, 
STATIONS_FIELDS['LONGITUDE']).strip(), 2)
-        
-        elevation = self.get_field_from_definition(station_ghcnd_row, 
STATIONS_FIELDS['ELEVATION']).strip()
-        if elevation != "-999.9":
-            xml_station += self.default_xml_element("elevation", elevation, 2)
-        
-        state_code = self.get_field_from_definition(station_ghcnd_row, 
STATIONS_FIELDS['STATE']).strip()
-        if state_code != "" and state_code in self.STATES:
-            xml_station += self.default_xml_location_labels("ST", "FIPS:" + 
str(self.STATES.keys().index(state_code)), self.STATES[state_code])
-        
-        # Add the MSHR data to the station generated information.
-        xml_station += self.default_xml_mshr_station_additional(station_id)
-            
-        xml_station += self.default_xml_end_tag("station", 1)
-        return xml_station
-        
-    def default_xml_day_reading_as_field(self, row, day):
-        day_index = DLY_FIELD_DAY_OFFSET + ((day - 1) * DLY_FIELD_DAY_FIELDS)
-        value = self.get_dly_field(row, day_index);
-        if value == "-9999":
-            return ""
-    
-        field_xml = ""
-        field_id = self.get_dly_field(row, DLY_FIELD_ELEMENT)
-        if field_id in ("MDTN", "MDTX", "MNPN", "MXPN", "TMAX", "TMIN", 
"TOBS",):
-            # Add both the celcius and fahrenheit temperatures.
-            celcius = float(value) / 10
-            field_xml += "            <" + field_id + "_c>" + str(celcius) + 
"</" + field_id + "_c>\n"
-            fahrenheit = self.convert_c2f(celcius)
-            field_xml += "            <" + field_id + "_f>" + str(fahrenheit) + "</" + field_id + "_f>\n"
-        elif field_id in ("AWND", "EVAP", "PRCP", "THIC", "WESD", "WESF", 
"WSF1", "WSF2", "WSF5", "WSFG", "WSFI", "WSFM",):
-            # Field values that are in tenths.
-            converted_value = float(value) / 10
-            field_xml += "            <" + field_id + ">" + 
str(converted_value) + "</" + field_id + ">\n"
-        elif field_id in ("ACMC", "ACMH", "ACSC", "ACSH", "PSUN",):
-            # Fields is a percentage.
-            field_xml += "            <" + field_id + ">" + value.strip() + 
"</" + field_id + ">\n"
-        elif field_id in ("FMTM", "PGTM",):
-            # Fields is a time value HHMM.
-            field_xml += "            <" + field_id + ">" + value.strip() + 
"</" + field_id + ">\n"
-        elif field_id in ("DAEV", "DAPR", "DASF", "DATN", "DATX", "DAWM", 
"DWPR", "FRGB", "FRGT", "FRTH", "GAHT", "MDSF", "MDWM", "MDEV", "MDPR", "SNOW", 
"SNWD", "TSUN", "WDF1", "WDF2", "WDF5", "WDFG", "WDFI", "WDFM", "WDMV",):
-            # Fields with no alternation needed.
-            field_xml += "            <" + field_id + ">" + value.strip() + 
"</" + field_id + ">\n"
-        else:
-            field_xml += "            <unknown>" + field_id + "</unknown>\n"
-            
-        # print field_xml
-        return field_xml
-    
-    def default_xml_day_reading(self, row, day, indent=2):
-        day_index = DLY_FIELD_DAY_OFFSET + ((day - 1) * DLY_FIELD_DAY_FIELDS)
-        value = self.get_dly_field(row, day_index);
-        mflag = self.get_dly_field(row, day_index + 1);
-        qflag = self.get_dly_field(row, day_index + 2);
-        sflag = self.get_dly_field(row, day_index + 3);
-
-        if value == "-9999":
-            return ""
-
-        indent_space = self.get_indent_space(indent)
-        field_id = self.get_dly_field(row, DLY_FIELD_ELEMENT)
-        station_id = "GHCND:" + self.get_dly_field(row, DLY_FIELD_ID)
-    
-        field_xml = ""
-        field_xml += indent_space + "<dataType>" + field_id + "</dataType>\n"
-        field_xml += indent_space + "<station>" + station_id + "</station>\n"
-        field_xml += indent_space + "<value>" + value.strip() + "</value>\n"
-        field_xml += indent_space + "<attributes>\n"
-        field_xml += indent_space + indent_space + "<attribute>" + 
mflag.strip() + "</attribute>\n"
-        field_xml += indent_space + indent_space + "<attribute>" + 
qflag.strip() + "</attribute>\n"
-        field_xml += indent_space + indent_space + "<attribute>" + 
sflag.strip() + "</attribute>\n"
-        field_xml += indent_space + indent_space + "<attribute></attribute>\n"
-        field_xml += indent_space + "</attributes>\n"
-
-        # print field_xml
-        return field_xml
-    
-    def default_xml_end(self):
-        return textwrap.dedent("""\
-            </ghcnd_observation>""")
-
-    def default_xml_data_end(self):
-        return self.default_xml_end_tag("dataCollection", 0)
-
-    def default_xml_station_end(self):
-        return self.default_xml_end_tag("stationCollection", 0)
-
-    def default_xml_element(self, tag, data, indent=1):
-        return self.get_indent_space(indent) + "<" + tag + ">" + data + "</" + 
tag + ">\n"
-
-    def default_xml_start_tag(self, tag, indent=1):
-        return self.get_indent_space(indent) + "<" + tag + ">\n"
-
-    def default_xml_end_tag(self, tag, indent=1):
-        return self.get_indent_space(indent) + "</" + tag + ">\n"
-
-    def get_indent_space(self, indent):
-        return (" " * (4 * indent))
-    
-
-class WeatherWebServiceMonthlyXMLFile(WeatherConvertToXML):
-    """The web service class details how to create files similar to the NOAA web service."""
-    skip_downloading = False
-    # Station data
-    def process_station_data(self, row):
-        """Adds a single station record file either from downloading the data or generating a similar record."""
-        station_id = self.get_dly_field(row, DLY_FIELD_ID)
-        download = 0
-        if self.token is not "" and not self.skip_downloading:
-            download = self.download_station_data(station_id, self.token, True)
-            if download == 0:
-                self.skip_downloading = True
-        
-        # If not downloaded, generate.
-        if download != 0:
-            return download
-        else:
-            # Information for each daily file.
-            station_xml_file = self.default_xml_web_service_start()
-            station_xml_file += self.default_xml_station_start()
-            station_xml_file += 
self.default_xml_web_service_station(station_id)
-            station_xml_file += self.default_xml_station_end()
-            
-            # Remove white space.
-            station_xml_file = station_xml_file.replace("\n", "");
-            station_xml_file = 
station_xml_file.replace(self.get_indent_space(1), "");
-
-            # Make sure the station folder is available.
-            ghcnd_xml_station_path = self.get_base_folder(station_id, 
"stations")
-            if not os.path.isdir(ghcnd_xml_station_path):
-                os.makedirs(ghcnd_xml_station_path)
-                    
-            # Save XML string to disk.
-            save_file_name = ghcnd_xml_station_path + station_id + ".xml"
-            save_file_name = self.save_file(save_file_name, station_xml_file)
-    
-            if save_file_name is not "":
-                if self.debug_output:
-                    print "Wrote file: " + save_file_name
-                return 1
-            else:
-                return 0
-
-    # Station data
-    def download_station_data(self, station_id, token, reset=False):
-        """Downloads the station data from the web service."""
-        import time
-        time.sleep(2)
-        # Make sure the station folder is available.
-        ghcnd_xml_station_path = self.get_base_folder(station_id, "stations")
-        if not os.path.isdir(ghcnd_xml_station_path):
-            os.makedirs(ghcnd_xml_station_path)
-                
-        # Build download URL.
-        url = "http://www.ncdc.noaa.gov/cdo-services/services/datasets/GHCND/stations/GHCND:" + station_id + ".xml?token=" + token
-        url_file = urllib.urlopen(url)
-        station_xml_file = ""
-        while (True):
-            line = url_file.readline()
-            if not line:
-                break
-            station_xml_file += line
-        
-        if station_xml_file.find("<cdoError>") != -1:
-            if self.debug_output:
-                print "Error in station download"
-            return 0
-        
-        # Save XML string to disk.
-        save_file_name = ghcnd_xml_station_path + station_id + ".xml"
-        save_file_name = self.save_file(save_file_name, station_xml_file)
-    
-        if save_file_name is not "":
-            if self.debug_output:
-                print "Wrote file: " + save_file_name
-            return 2
-        else:
-            return 0
-
-    # Sensor data
-    def process_one_month_sensor_set(self, records, page):
-        """Generates records for a station using the web service xml layout."""
-        found_data = False        
-        year = int(self.get_dly_field(records[0], DLY_FIELD_YEAR))
-        month = int(self.get_dly_field(records[0], DLY_FIELD_MONTH))
-    
-        station_id = self.get_dly_field(records[0], DLY_FIELD_ID)
-
-        # Information for each daily file.
-        count = 0
-        daily_xml_file = ""
-        
-        for day in range(1, 32):
-            try:
-                # TODO find out what is a valid python date range? 1889?
-                # Attempt to see if this is valid date.
-                report_date = date(year, month, day)
-
-                for record in records:
-                    record_xml_snip = self.default_xml_day_reading(record, 
report_date.day)
-                    if record_xml_snip is not "":
-                        daily_xml_file += self.default_xml_start_tag("data")
-                        daily_xml_file += 
self.default_xml_field_date(report_date)
-                        daily_xml_file += record_xml_snip
-                        daily_xml_file += self.default_xml_end_tag("data")
-                        found_data = True
-                        count += 1
-
-            except ValueError:
-                pass
-
-        daily_xml_file = self.default_xml_web_service_start() + 
self.default_xml_data_start(count) + daily_xml_file + 
self.default_xml_data_end()
-        daily_xml_file = daily_xml_file.replace("\n", "");
-        daily_xml_file = daily_xml_file.replace(self.get_indent_space(1), "");
-
-        if not found_data:
-            return 0
-
-        # Make sure the station folder is available.
-        ghcnd_xml_station_path = self.get_base_folder(station_id) + "/" + 
station_id + "/" + str(report_date.year) + "/"
-        if not os.path.isdir(ghcnd_xml_station_path):
-            os.makedirs(ghcnd_xml_station_path)
-                
-        # Save XML string to disk.
-        save_file_name = ghcnd_xml_station_path + 
build_sensor_save_filename(station_id, report_date, page)
-        save_file_name = self.save_file(save_file_name, daily_xml_file)
-
-        if save_file_name is not "":
-            if self.debug_output:
-                print "Wrote file: " + save_file_name
-            return 1
-        else:
-            return 0
-
-def build_base_save_folder(save_path, station_id, data_type="sensors"):
-    # Default
-    station_prefix = station_id[:3]
-    return save_path + data_type + "/" + station_prefix + "/"
-
-def build_sensor_save_filename(station_id, report_date, page):
-    # Default
-    return station_id + "_" + str(report_date.year).zfill(4) + 
str(report_date.month).zfill(2) + "_" + str(page) + ".xml"
-

http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
deleted file mode 100644
index 4877120..0000000
--- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py
+++ /dev/null
@@ -1,416 +0,0 @@
-#!/usr/bin/env python
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import glob
-import os.path
-import linecache
-import distutils.core
-
-from weather_convert_to_xml import *
-from collections import OrderedDict
-
-# Weather data files created to manage the conversion process.
-# Allows partition and picking up where you left off.
-class WeatherDataFiles:
-
-    LARGE_FILE_ROOT_TAG = "root"
-
-    INDEX_DATA_FILE_NAME = 0
-    INDEX_DATA_SENSORS_STATUS = 1
-    INDEX_DATA_STATION_STATUS = 2
-    INDEX_DATA_FILE_COUNT = 3
-    INDEX_DATA_FOLDER_DATA = 4
-
-    DATA_FILE_START_INDEX = 0
-    DATA_FILE_EXTENSION = ".dly"
-    DATA_FILE_MISSING = "missing"
-    DATA_FILE_INITIAL = "initialized"
-    DATA_FILE_DOWNLOADED = "downloaded"
-    DATA_FILE_GENERATED = "generated"
-    SEPERATOR = ","
-    
-    type = "sensor"
-    data_reset = False
-    
-    def __init__(self, base_path, progress_file_name="/tmp/_weather_data.csv"):
-        self.base_path = base_path
-
-        self.progress_file_name = progress_file_name
-        
-        self.current = self.DATA_FILE_START_INDEX
-        self.progress_data = []
-
-    def get_file_list_iterator(self):
-        """Return the list of files one at a time."""
-        return glob.iglob(self.base_path + "/*" + self.DATA_FILE_EXTENSION)
-
-    # Save Functions
-    def build_progress_file(self, options, convert):
-        if not os.path.isfile(self.progress_file_name) or 'reset' in options:
-            # Build a new file.
-            file = open(self.progress_file_name, 'w')
-            contents = self.get_default_progress_file_csv()
-            file.write(contents)
-            file.close()
-        elif 'append' in options or 'recalculate' in options:
-            self.open_progress_data()
-            row_count = len(self.progress_data)
-            for row in range(0, row_count):
-                row_contents = self.progress_data[row].rsplit(self.SEPERATOR)
-                file_name = row_contents[self.INDEX_DATA_FILE_NAME]
-                if self.get_file_row(file_name) < 0 and 'append' in options: 
-                    
self.progress_data.append(self.get_progress_csv_row(file_name, 
self.DATA_FILE_INITIAL, self.DATA_FILE_INITIAL))
-                elif 'recalculate' in options:
-                    # The folder is hard coded
-                    station_id = os.path.basename(file_name).split('.')[0]
-                    folder_name = convert.get_base_folder(station_id)
-                    if os.path.exists(folder_name):
-                        row_contents = 
self.progress_data[row].rsplit(self.SEPERATOR)
-                        sensor_status = 
row_contents[self.INDEX_DATA_SENSORS_STATUS]
-                        station_status = 
row_contents[self.INDEX_DATA_STATION_STATUS]
-                        file_count = self.get_file_count(folder_name)
-                        data_size = self.get_folder_size(folder_name)
-                        self.progress_data[row] = 
self.get_progress_csv_row(file_name, sensor_status, station_status, file_count, 
data_size)
-                    else:
-                        self.progress_data[row] = 
self.get_progress_csv_row(file_name, self.DATA_FILE_INITIAL, 
self.DATA_FILE_INITIAL)
-            # Save file
-            self.close_progress_data(True)
-        self.reset()
-        
-    def copy_to_n_partitions(self, save_path, partitions, base_paths, reset):
-        """Once the initial data has been generated, the data can be copied into a set number of partitions. """
-        if (len(base_paths) == 0):
-            return
-        
-        # Initialize the partition paths.
-        partition_paths = get_partition_paths(0, partitions, base_paths)
-        for path in partition_paths:
-            # Make sure the xml folder is available.
-            prepare_path(path, reset)
-
-        import fnmatch
-        import os
-        
-        # copy stations and sensors into each partition
-        current_sensor_partition = 0
-        current_station_partition = 0
-        self.open_progress_data()
-        row_count = len(self.progress_data)
-        for row in range(0, row_count):
-            row_contents = self.progress_data[row].rsplit(self.SEPERATOR)
-            file_name = row_contents[self.INDEX_DATA_FILE_NAME]
-            station_id = os.path.basename(file_name).split('.')[0]
-               
-            # Copy sensor files
-            type = "sensors"
-            file_path = build_base_save_folder(save_path, station_id, type) + 
station_id
-            for root, dirnames, filenames in os.walk(file_path):
-                for filename in fnmatch.filter(filenames, '*.xml'):
-                    xml_path = os.path.join(root, filename)
-                    new_file_base = 
build_base_save_folder(partition_paths[current_sensor_partition], station_id, 
type) + station_id
-                    if not os.path.isdir(new_file_base):
-                        os.makedirs(new_file_base)
-                    shutil.copyfile(xml_path, new_file_base + "/" + filename)
-                    current_sensor_partition += 1
-                    if current_sensor_partition >= len(partition_paths):
-                        current_sensor_partition = 0
-            
-            # Copy station files
-            type = "stations"
-            file_path = build_base_save_folder(save_path, station_id, type) + 
station_id + ".xml"
-            new_file_base = 
build_base_save_folder(partition_paths[current_station_partition], station_id, 
type)
-            new_file_path = new_file_base + station_id + ".xml"
-            if os.path.isfile(file_path):
-                if not os.path.isdir(new_file_base):
-                    os.makedirs(new_file_base)
-                shutil.copyfile(file_path, new_file_path)
-            current_station_partition += 1
-            if current_station_partition >= len(partition_paths):
-                current_station_partition = 0
-
-    def build_to_n_partition_files(self, save_path, partitions, base_paths, 
reset):
-        """Once the initial data has been generated, the data can be divided into partitions 
-        and stored in single files.
-        """
-        if (len(base_paths) == 0):
-            return
-        
-        XML_START = "<?xml version=\"1.0\" encoding=\"UTF-8\" 
standalone=\"yes\"?>"
-        
-        partition_paths = get_partition_paths(0, partitions, base_paths)
-
-        import fnmatch
-        import os
-        
-        for path in partition_paths:
-            prepare_path(path, reset)
-
-        # Initialize the partition paths.
-        types = ["sensors", "stations"]
-        for type in types:
-            partition_files = []
-            for path in partition_paths:
-                # Make sure the xml folder is available.
-                prepare_path(path + type + "/", False)
-                partition_files.append(open(path + type + "/partition.xml", 
'w'))
-                partition_files[-1].write(XML_START + "<" + 
self.LARGE_FILE_ROOT_TAG + ">\n")
-
-            # copy into each partition
-            current_partition = 0
-            self.open_progress_data()
-            row_count = len(self.progress_data)
-            for row in range(0, row_count):
-                row_contents = self.progress_data[row].rsplit(self.SEPERATOR)
-                file_name = row_contents[self.INDEX_DATA_FILE_NAME]
-                station_id = os.path.basename(file_name).split('.')[0]
-                
-                # Copy files
-                if type == "sensors":
-                    file_path = build_base_save_folder(save_path, station_id, 
type) + station_id
-                    for root, dirnames, filenames in os.walk(file_path):
-                        for filename in fnmatch.filter(filenames, '*.xml'):
-                            xml_path = os.path.join(root, filename)
-                            xml_data = 
file_get_contents(xml_path).replace(XML_START, "") + "\n"
-                            partition_files[current_partition].write(xml_data)
-                            current_partition += 1
-                            if current_partition >= len(partition_files):
-                                current_partition = 0
-                elif type == "stations":
-                    file_path = build_base_save_folder(save_path, station_id, 
type) + station_id + ".xml"
-                    xml_path = os.path.join(root, file_path)
-                    xml_data = file_get_contents(xml_path).replace(XML_START, 
"") + "\n"
-                    partition_files[current_partition].write(xml_data)
-                    current_partition += 1
-                    if current_partition >= len(partition_paths):
-                        current_partition = 0
-
-            for row in range(0, len(partition_paths)):
-                partition_files[row].write("</" + self.LARGE_FILE_ROOT_TAG + 
">\n")
-                partition_files[row].close()
-
-    def get_file_row(self, file_name):
-        for i in range(0, len(self.progress_data)):
-            if self.progress_data[i].startswith(file_name):
-                return i
-        return -1
-        
-    def get_default_progress_file_csv(self):
-        contents = ""
-        for path in self.get_file_list_iterator():
-            file_name = os.path.basename(path)
-            contents += self.get_progress_csv_row(file_name, 
self.DATA_FILE_INITIAL, self.DATA_FILE_INITIAL)
-        return contents
-    
-    def print_progress_file_stats(self, convert):
-        sensor_count_missing = 0
-        sensor_count = 0
-        file_count = 0
-        data_size = 0
-        
-        sensor_count_actual = 0
-        file_count_actual = 0
-        data_size_actual = 0
-        
-        station_count_missing = 0
-        station_count_generated = 0
-        station_count_downloaded = 0
-        
-        self.open_progress_data()
-        row_count = len(self.progress_data)
-        for row in range(0, row_count):
-            row_contents = self.progress_data[row].rsplit(self.SEPERATOR)
-            if int(row_contents[self.INDEX_DATA_FILE_COUNT]) != -1 and  
int(row_contents[self.INDEX_DATA_FOLDER_DATA]) != -1:
-                sensor_count += 1
-                file_count += int(row_contents[self.INDEX_DATA_FILE_COUNT])
-                data_size += int(row_contents[self.INDEX_DATA_FOLDER_DATA])
-            else:
-                sensor_count_missing += 1
-                
-            if row_contents[self.INDEX_DATA_STATION_STATUS] == "generated":
-                station_count_generated += 1
-            if row_contents[self.INDEX_DATA_STATION_STATUS] == "downloaded":
-                station_count_downloaded += 1
-            else:
-                station_count_missing += 1
-
-            file_name = row_contents[self.INDEX_DATA_FILE_NAME]
-            station_id = os.path.basename(file_name).split('.')[0]
-            folder_name = convert.get_base_folder(station_id)
-            if os.path.exists(folder_name):
-                sensor_count_actual += 1
-                file_count_actual += self.get_file_count(folder_name)
-                data_size_actual += self.get_folder_size(folder_name)
-
-
-        print "Progress File:\t" + self.progress_file_name + "\n"
-        
-        print "CSV DETAILS OF PROCESSED SENSORS"
-        print "Number of stations:\t" + "{:,}".format(sensor_count)
-        print "Number of files:\t" + "{:,}".format(file_count)
-        print "Data size:\t\t" + "{:,}".format(data_size) + " Bytes\n"
-
-        print "CSV DETAILS OF unPROCESSED SENSORS"
-        print "Number of stations:\t" + "{:,}".format(sensor_count_missing) + 
"\n"
-
-        print "CSV DETAILS OF PROCESSED STATIONS"
-        print "Generated:\t\t" + "{:,}".format(station_count_generated)
-        print "Downloaded:\t\t" + "{:,}".format(station_count_downloaded)
-        print "Missing:\t\t" + "{:,}".format(station_count_missing) + "\n"
-
-        print "FOLDER DETAILS"
-        print "Number of stations:\t" + "{:,}".format(sensor_count_actual)
-        print "Number of files:\t" + "{:,}".format(file_count_actual)
-        print "Data size:\t\t" + "{:,}".format(data_size_actual) + " Bytes\n"
-
-    
-    def get_progress_csv_row(self, file_name, sensors_status, station_status, 
file_count=-1, data_size=-1):
-        return file_name + self.SEPERATOR + sensors_status + self.SEPERATOR + 
station_status + self.SEPERATOR + str(file_count) + self.SEPERATOR + 
str(data_size) + "\n"
-    
-    def update_file_sensor_status(self, file_name, sensors_status, 
file_count=-1, data_size=-1):
-        for row in range(0, len(self.progress_data)):
-            if self.progress_data[row].startswith(file_name):
-                station_status = 
self.progress_data[row].rsplit(self.SEPERATOR)[self.INDEX_DATA_STATION_STATUS]
-                self.progress_data[row] = self.get_progress_csv_row(file_name, 
sensors_status, station_status, file_count, data_size)
-                break
-
-        # Save the file            
-        self.close_progress_data(True)
-
-    def update_file_station_status(self, file_name, station_status):
-        for row in range(0, len(self.progress_data)):
-            if self.progress_data[row].startswith(file_name):
-                row_contents = self.progress_data[row].rsplit(self.SEPERATOR)
-                sensors_status = row_contents[self.INDEX_DATA_SENSORS_STATUS]
-                file_count = int(row_contents[self.INDEX_DATA_FILE_COUNT])
-                data_size = int(row_contents[self.INDEX_DATA_FOLDER_DATA])
-                self.progress_data[row] = self.get_progress_csv_row(file_name, 
sensors_status, station_status, file_count, data_size)
-                break
-
-        # Save the file            
-        self.close_progress_data(True)
-
-    def get_file_count(self, folder_name):
-        count = 0
-        for dirpath, dirnames, filenames in os.walk(folder_name):
-            for f in filenames:
-                count += 1
-        return count
-
-    def get_folder_size(self, folder_name):
-        total_size = 0
-        for dirpath, dirnames, filenames in os.walk(folder_name):
-            for f in filenames:
-                fp = os.path.join(dirpath, f)
-                total_size += os.path.getsize(fp)
-        return total_size
-
-    def get_station_status(self, return_value):
-        if return_value == 2:
-            return self.DATA_FILE_DOWNLOADED
-        elif return_value == 1:
-            return self.DATA_FILE_GENERATED
-        return self.DATA_FILE_MISSING
-        
-    
-    def open_progress_data(self):
-        with open(self.progress_file_name, 'r') as file:
-            self.progress_data = file.readlines()
-
-    def close_progress_data(self, force=False):
-        if len(self.progress_data) > 0 or force:
-            with open(self.progress_file_name, 'w') as file:
-                file.writelines(self.progress_data)
-
-    
-    def reset(self):
-        self.close_progress_data()
-
-        self.current = self.DATA_FILE_START_INDEX
-        self.open_progress_data()
-
-    def set_type(self, type):
-        self.type = type
-
-    def set_data_reset(self, data_reset):
-        self.data_reset = data_reset
-
-
-    # Iterator Functions
-    def __iter__(self):
-        return self
-
-    def next(self):
-        columns = []
-        while True:
-            # find a row that has not been created.
-            if self.current >= len(self.progress_data):
-                raise StopIteration
-            row = self.progress_data[self.current]
-            self.current += 1
-            columns = row.rsplit(self.SEPERATOR)
-            if self.type == "sensor" and 
(columns[self.INDEX_DATA_SENSORS_STATUS].strip() != self.DATA_FILE_GENERATED or 
self.data_reset):
-                break
-            elif self.type == "station" and 
(columns[self.INDEX_DATA_STATION_STATUS].strip() != self.DATA_FILE_DOWNLOADED 
or self.data_reset):
-                break
-        return columns[self.INDEX_DATA_FILE_NAME]
-    
-    
-# Index values of each field details.
-PARTITION_INDEX_NODE = 0
-PARTITION_INDEX_DISK = 1
-PARTITION_INDEX_VIRTUAL = 2
-PARTITION_INDEX = 3
-PARTITION_INDEX_PATH = 4
-PARTITION_HEADER = ("Node", "Disk", "Virtual", "Index", "Path")
-            
-def get_partition_paths(node_id, partitions, base_paths, key="partitions"):
-    partition_paths = []
-    for scheme in get_partition_scheme(node_id, partitions, base_paths, key):
-        partition_paths.append(scheme[PARTITION_INDEX_PATH])
-    return partition_paths
-
-def get_partition_scheme(node_id, virtual_partitions, base_paths, 
key="partitions"):
-    partitions_per_disk = virtual_partitions / len(base_paths)
-    return get_disk_partition_scheme(node_id, partitions_per_disk, base_paths, 
key)
-
-def get_disk_partition_paths(node_id, partitions, base_paths, 
key="partitions"):
-    partition_paths = []
-    for scheme in get_disk_partition_scheme(node_id, partitions, base_paths, 
key):
-        partition_paths.append(scheme[PARTITION_INDEX_PATH])
-    return partition_paths
-
-def get_disk_partition_scheme(node_id, virtual_disk_partitions, base_paths, 
key="partitions"):
-    partition_scheme = []
-    for i in range(0, virtual_disk_partitions):
-        for j in range(0, len(base_paths)):
-            new_partition_path = base_paths[j] + key + "/" + 
get_partition_folder(j, virtual_disk_partitions, i) + "/"
-            partition_scheme.append((node_id, j, virtual_disk_partitions, i, 
new_partition_path))
-    return partition_scheme
-
-def get_partition_folder(disks, partitions, index):        
-    return "d" + str(disks) + "_p" + str(partitions) + "_i" + str(index)
-
-def prepare_path(path, reset):
-    """Ensures the directory is available. If reset, then its a brand new 
directory."""
-    if os.path.isdir(path) and reset:
-        shutil.rmtree(path)
-                
-    if not os.path.isdir(path):
-        os.makedirs(path)
-
-def file_get_contents(filename):
-    with open(filename) as f:
-        return f.read()

http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py
----------------------------------------------------------------------
diff --git 
a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py
 
b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py
deleted file mode 100644
index fb59b50..0000000
--- 
a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py
+++ /dev/null
@@ -1,102 +0,0 @@
-#!/usr/bin/env python
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import sys
-import os.path
-import shutil
-import tarfile
-import urllib
-import zipfile
-
-# Custom modules.
-from weather_config_ghcnd import *
-from weather_config_mshr import *
-
-class WeatherDownloadFiles:
-
-    def __init__(self, save_path):
-        self.save_path = save_path
-        
-        if not os.path.isdir(save_path):
-            os.makedirs(save_path)
-
-
-    def download_ghcnd_files(self, reset=False):
-        """Download the complete list."""
-        for file_name in FILE_NAMES:
-            url = BASE_DOWNLOAD_URL + file_name
-            self.download_file(url, reset)
-
-    def download_mshr_files(self, reset=False):
-        for url in MSHR_URLS:
-            self.download_file(url, reset)
-
-    def download_file(self, url, reset=False):
-        """Download the file, unless it exists."""
-        file_name = self.save_path + "/" + url.split('/')[-1]
-
-        if not os.path.isfile(file_name) or reset:
-            print "Downloading: " + url
-            urllib.urlretrieve(url, file_name, report_download_status)
-            print
-
-    def unzip_ghcnd_package(self, package, reset=False):
-        """Unzip the package file, unless it exists."""
-        file_name = self.save_path + "/" + package + ".tar.gz"
-        unzipped_path = self.save_path + "/" + package
-        
-        if os.path.isdir(unzipped_path) and reset:
-            shutil.rmtree(unzipped_path)
-            
-        if not os.path.isdir(unzipped_path):
-            print "Unzipping: " + file_name
-            tar_file = tarfile.open(file_name, 'r:gz')
-            tar_file.extractall(unzipped_path)
- 
-    def unzip_mshr_files(self, reset=False):
-        """Unzip the package file, unless it exists."""
-        for url in MSHR_URLS:
-            if url.endswith('.zip'):
-                file_name = self.save_path + "/" + url.split('/')[-1]
-                print "Unzipping: " + file_name
-                with zipfile.ZipFile(file_name, 'r') as myzip:
-                    myzip.extractall(self.save_path)
- 
-def report_download_status(count, block, size):
-    """Report download status."""
-    line_size = 50
-    erase = "\b" * line_size
-    sys.stdout.write(erase)
-    report = get_report_line((float(count) * block / size), line_size)
-    sys.stdout.write(report)
-
-def get_report_line(percentage, line_size):
-    """Creates a string to be used in reporting the percentage done."""
-    report = ""
-    for i in range(0, line_size):
-        if (float(i) / line_size < percentage):
-            report += "="
-        else:
-            report += "-"
-    return report
-            
-def download_file_save_as(url, new_file_name, reset=False):
-    """Download the file, unless it exists."""
-    if not os.path.isfile(new_file_name) or reset:
-        print "Downloading: " + url
-        urllib.urlretrieve(url, new_file_name, report_download_status)
-        print
-

http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/util/README.md
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/util/README.md 
b/vxquery-benchmark/src/main/resources/util/README.md
deleted file mode 100644
index 8e2a204..0000000
--- a/vxquery-benchmark/src/main/resources/util/README.md
+++ /dev/null
@@ -1,28 +0,0 @@
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
--->
-
-Utilities for Benchmark Operations
-=====================
-
-# Introduction
-
-Helpful scripts or configuration document to work with the benchmarks.
-
-## Saxon Collection
-
-To test the data with other XQuery processors, the saxon script helps with 
-creating a collection.xml file.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/util/build_saxon_collection_xml.py
----------------------------------------------------------------------
diff --git 
a/vxquery-benchmark/src/main/resources/util/build_saxon_collection_xml.py 
b/vxquery-benchmark/src/main/resources/util/build_saxon_collection_xml.py
deleted file mode 100644
index 02f39ee..0000000
--- a/vxquery-benchmark/src/main/resources/util/build_saxon_collection_xml.py
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/usr/bin/env python
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import getopt, glob, os, sys
- 
-def main(argv):
-    xml_folder = ""
-     
-    # Get the base folder
-    try:
-        opts, args = getopt.getopt(argv, "f:h", ["folder="])
-    except getopt.GetoptError:
-        print 'The file options for build_saxon_collection_xml.py were not 
correctly specified.'
-        print 'To see a full list of options try:'
-        print '  $ python build_saxon_collection_xml.py -h'
-        sys.exit(2)
-    for opt, arg in opts:
-        if opt == '-h':
-            print 'Options:'
-            print '    -f        The base folder to create collection XML 
file.'
-            sys.exit()
-        elif opt in ('-f', "--folder"):
-            # check if file exists.
-            if os.path.exists(arg):
-                xml_folder = arg
-            else:
-                print 'Error: Argument must be a folder name for --folder 
(-f).'
-                sys.exit()
-  
-    # Required fields to run the script.
-    if xml_folder == "" or not os.path.exists(xml_folder):
-        print 'Error: The folder path option must be supplied:  --folder (-f).'
-        sys.exit()
-      
-    # find all XML files in folder
-    collection_xml = "<collection>"
-    for i in range(1, 5):
-        # Search the ith directory level.
-        search_pattern = xml_folder + ('/*' * i) + '.xml'
-        for file_path in glob.iglob(search_pattern):
-            collection_xml += '<doc href="' + str.replace(file_path, 
xml_folder, '') + '"/>'
-    collection_xml += "</collection>"
-          
-    # create collection XML
-    file = open('collection.xml', 'w')
-    file.write(collection_xml)
-    file.close()
-
-if __name__ == "__main__":
-    main(sys.argv[1:])

http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/util/find_averages_in_logs.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/util/find_averages_in_logs.py 
b/vxquery-benchmark/src/main/resources/util/find_averages_in_logs.py
deleted file mode 100644
index 1cd7939..0000000
--- a/vxquery-benchmark/src/main/resources/util/find_averages_in_logs.py
+++ /dev/null
@@ -1,97 +0,0 @@
-#!/usr/bin/env python
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import fnmatch
-import getopt
-import glob
-import os
-import sys
-import csv
-
-SEARCH_STRING = 'Average execution time:'
-
-def find_files(directory, pattern):
-    for root, dirs, files in os.walk(directory):
-        for basename in files:
-            if fnmatch.fnmatch(basename, pattern):
-                yield (root, basename)
-    
-    
-def main(argv):
-    ''' Same as bash: find $FOLDER -type f -name "*.xml" -exec basename {} \; 
> list_xml.csv
-    '''
-    log_folder = ""
-    save_file = ""
-    data_type = ""
-    
-    # Get the base folder
-    try:
-        opts, args = getopt.getopt(argv, "f:hs:t:", ["folder=", "save_file=", 
"data_type="])
-    except getopt.GetoptError:
-        print 'The file options for list_xml_files.py were not correctly 
specified.'
-        print 'To see a full list of options try:'
-        print '  $ python list_xml_files.py -h'
-        sys.exit(2)
-    for opt, arg in opts:
-        if opt == '-h':
-            print 'Options:'
-            print '    -f        The base folder to build XML file list.'
-            print '    -s        The save file.'
-            sys.exit()
-        elif opt in ('-f', "--folder"):
-            # check if file exists.
-            if os.path.exists(arg):
-                log_folder = arg
-            else:
-                print 'Error: Argument must be a folder name for --folder 
(-f).'
-                sys.exit()
-        elif opt in ('-s', "--save_file"):
-            save_file = arg
-        elif opt in ('-t', "--data_type"):
-            data_type = arg
-  
-    # Required fields to run the script.
-    if log_folder == "" or not os.path.exists(log_folder):
-        print 'Error: The folder path option must be supplied:  --folder (-f).'
-        sys.exit()
-    if save_file == "":
-        print 'Error: The folder path option must be supplied:  --save_file 
(-s).'
-        sys.exit()
-      
-    list_xml_csv = ''
-    with open(save_file, 'w') as outfile:
-        csvfile = csv.writer(outfile)
-        for path, filename in find_files(log_folder, '*.log'):
-            # Only write out a specific type of data xml documents found in a 
specific path.
-            with open(path + "/" + filename) as infile:
-                folders = path.replace(log_folder, "")
-                for line in infile:
-                    # Skip the root tags.
-                    if line.startswith(SEARCH_STRING):
-                        time_split = line.split(" ")
-                        name_split = filename.split(".")
-                        folder_split = folders.split("/")
-
-                        # Build data row
-                        row = folder_split
-                        row.append(name_split[0])
-                        row.append(time_split[3])
-                        row.append(name_split[2])
-                        csvfile.writerow(row)
-        
-          
-if __name__ == "__main__":
-    main(sys.argv[1:])

http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/util/log_top.sh
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/util/log_top.sh 
b/vxquery-benchmark/src/main/resources/util/log_top.sh
deleted file mode 100755
index 4a2f7e1..0000000
--- a/vxquery-benchmark/src/main/resources/util/log_top.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOG_FILE=logs/top.log
-
-# Reset counters
-iostat >> /dev/null
-sar -n DEV 1 1  >> /dev/null
-
-# Save IO, CPU and Network snapshot to a log file.
-while (sleep 7)
-do
-       echo "---------------------------------------------" >> ${LOG_FILE}
-       date >> ${LOG_FILE}
-       echo >> ${LOG_FILE}
-       iostat -y 1 1 >> ${LOG_FILE}
-       top -n 1 -b | head -11 | tail -6 >> ${LOG_FILE}
-       sar -n DEV 1 1 >> ${LOG_FILE}
-done;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/util/merge_xml_files.py
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/util/merge_xml_files.py 
b/vxquery-benchmark/src/main/resources/util/merge_xml_files.py
deleted file mode 100644
index 9238a19..0000000
--- a/vxquery-benchmark/src/main/resources/util/merge_xml_files.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/env python
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import fnmatch
-import getopt
-import glob
-import os
-import sys
-
-XML_PREFIX = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?><root>' + 
"\n"
-XML_SUFFIX = '</root>' + "\n"
-
-def find_files(directory, pattern):
-    for root, dirs, files in os.walk(directory):
-        for basename in files:
-            if fnmatch.fnmatch(basename, pattern):
-                yield (root, basename)
-    
-    
-def main(argv):
-    ''' Same as bash: find $FOLDER -type f -name "*.xml" -exec basename {} \; 
> list_xml.csv
-    '''
-    xml_folder = ""
-    save_file = ""
-    data_type = ""
-     
-    # Get the base folder
-    try:
-        opts, args = getopt.getopt(argv, "f:hs:t:", ["folder=", "save_file=", 
"data_type="])
-    except getopt.GetoptError:
-        print 'The file options for list_xml_files.py were not correctly 
specified.'
-        print 'To see a full list of options try:'
-        print '  $ python merge_xml_files.py -f /path/to/folder -s new.xml -t 
sensors'
-        sys.exit(2)
-    for opt, arg in opts:
-        if opt == '-h':
-            print 'Options:'
-            print '    -f        The base folder to build XML file list.'
-            print '    -s        The save file.'
-            sys.exit()
-        elif opt in ('-f', "--folder"):
-            # check if file exists.
-            if os.path.exists(arg):
-                xml_folder = arg
-            else:
-                print 'Error: Argument must be a folder name for --folder 
(-f).'
-                sys.exit()
-        elif opt in ('-s', "--save_file"):
-            save_file = arg
-        elif opt in ('-t', "--data_type"):
-            data_type = arg
-  
-    # Required fields to run the script.
-    if xml_folder == "" or not os.path.exists(xml_folder):
-        print 'Error: The folder path option must be supplied:  --folder (-f).'
-        sys.exit()
-    if save_file == "":
-        print 'Error: The folder path option must be supplied:  --save_file 
(-s).'
-        sys.exit()
-      
-    list_xml_csv = ''
-    with open(save_file, 'w') as outfile:
-        outfile.write(XML_PREFIX)
-        for path, filename in find_files(xml_folder, '*.xml'):
-            # Only write out a specific type of data xml documents found in a 
specific path.
-            if data_type in path:
-                with open(path + "/" + filename) as infile:
-                    for line in infile:
-                        # Skip the root tags.
-                        if line != XML_PREFIX and line != XML_SUFFIX:
-                            outfile.write(line)
-        outfile.write(XML_SUFFIX)
-          
-if __name__ == "__main__":
-    main(sys.argv[1:])

http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/util/vxquery_functions.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/util/vxquery_functions.xq 
b/vxquery-benchmark/src/main/resources/util/vxquery_functions.xq
deleted file mode 100644
index d0621eb..0000000
--- a/vxquery-benchmark/src/main/resources/util/vxquery_functions.xq
+++ /dev/null
@@ -1,27 +0,0 @@
-(: Licensed to the Apache Software Foundation (ASF) under one
-   or more contributor license agreements.  See the NOTICE file
-   distributed with this work for additional information
-   regarding copyright ownership.  The ASF licenses this file
-   to you under the Apache License, Version 2.0 (the
-   "License"); you may not use this file except in compliance
-   with the License.  You may obtain a copy of the License at
-   
-     http://www.apache.org/licenses/LICENSE-2.0
-   
-   Unless required by applicable law or agreed to in writing,
-   software distributed under the License is distributed on an
-   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-   KIND, either express or implied.  See the License for the
-   specific language governing permissions and limitations
-   under the License. :)
-
-(: XQuery Function List :)
-(: VXQuery function list in csv with arguments and return types :)
-let $list := "../../../../../vxquery-core/src/main/java/org/apache/vxquery/functions/builtin-functions.xml"
-let $r :=
-    for $f in fn:doc($list)/functions/function
-        let $pl := 
-            for $p in $f/param
-            return $p/@type
-        return fn:string-join(($f/@name, fn:string-join($pl, ' '), $f/return/@type), ',')
-return fn:string-join($r , '|')
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/vxquery/blob/3167366d/vxquery-benchmark/src/main/resources/util/vxquery_operators.xq
----------------------------------------------------------------------
diff --git a/vxquery-benchmark/src/main/resources/util/vxquery_operators.xq b/vxquery-benchmark/src/main/resources/util/vxquery_operators.xq
deleted file mode 100644
index f485807..0000000
--- a/vxquery-benchmark/src/main/resources/util/vxquery_operators.xq
+++ /dev/null
@@ -1,27 +0,0 @@
-(: Licensed to the Apache Software Foundation (ASF) under one
-   or more contributor license agreements.  See the NOTICE file
-   distributed with this work for additional information
-   regarding copyright ownership.  The ASF licenses this file
-   to you under the Apache License, Version 2.0 (the
-   "License"); you may not use this file except in compliance
-   with the License.  You may obtain a copy of the License at
-   
-     http://www.apache.org/licenses/LICENSE-2.0
-   
-   Unless required by applicable law or agreed to in writing,
-   software distributed under the License is distributed on an
-   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-   KIND, either express or implied.  See the License for the
-   specific language governing permissions and limitations
-   under the License. :)
-
-(: XQuery Function List :)
-(: VXQuery function list in csv with arguments and return types :)
-let $list := "../../../../../vxquery-core/src/main/java/org/apache/vxquery/functions/builtin-operators.xml"
-let $r :=
-    for $f in fn:doc($list)/operators/operator
-        let $pl := 
-            for $p in $f/param
-            return $p/@type
-        return fn:string-join(($f/@name, fn:string-join($pl, ' '), $f/return/@type), ',')
-return fn:string-join($r , '|')
\ No newline at end of file

[01/14] Remove benchmark files to allow easy copy from other branch.

Reply via email to