Repository: incubator-vxquery Updated Branches: refs/heads/prestonc/benchmarks_staging 1e7880caf -> eaed030b6
Updated the generated station xml to mirror web service. Project: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/commit/9456d4ad Tree: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/tree/9456d4ad Diff: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/diff/9456d4ad Branch: refs/heads/prestonc/benchmarks_staging Commit: 9456d4ad6074b0c1b60a2105395a84a12885fd80 Parents: 1e7880c Author: Preston Carman <[email protected]> Authored: Wed Mar 5 21:56:13 2014 -0800 Committer: Preston Carman <[email protected]> Committed: Wed Mar 5 21:56:13 2014 -0800 ---------------------------------------------------------------------- .../noaa-ghcn-daily/scripts/weather_cli.py | 6 +- .../scripts/weather_config_ghcnd.py | 96 ++++++++++ .../scripts/weather_config_mshr.py | 78 ++++++++ .../scripts/weather_convert_to_xml.py | 183 +++++++++++++++++-- .../scripts/weather_data_files.py | 2 +- .../scripts/weather_dly_config.py | 96 ---------- .../scripts/weather_download_files.py | 33 +++- 7 files changed, 374 insertions(+), 120 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/9456d4ad/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py index 103c0d1..8d18607 100644 --- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py @@ -127,10 +127,12 @@ def main(argv): if section in ("all", "download"): print 'Processing the download section.' 
download = WeatherDownloadFiles(download_path) - download.download_all_files(reset) + download.download_ghcnd_files(reset) + download.download_mshr_files(reset) # Unzip the required file. - download.unzip_package(config.get_package(), reset) + download.unzip_ghcnd_package(config.get_package(), reset) + download.unzip_mshr_files(reset) # Create some basic paths for save files and references. http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/9456d4ad/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config_ghcnd.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config_ghcnd.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config_ghcnd.py new file mode 100644 index 0000000..801e748 --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config_ghcnd.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Base URL used to get all the required files. +BASE_DOWNLOAD_URL = 'http://www1.ncdc.noaa.gov/pub/data/ghcn/daily/' + +# List of required files for a build. 
+FILE_NAMES = [] +FILE_NAMES.append('ghcnd-countries.txt') +FILE_NAMES.append('ghcnd-inventory.txt') +FILE_NAMES.append('ghcnd-states.txt') +FILE_NAMES.append('ghcnd-stations.txt') +FILE_NAMES.append('ghcnd-version.txt') +FILE_NAMES.append('ghcnd_all.tar.gz') +FILE_NAMES.append('ghcnd_gsn.tar.gz') +FILE_NAMES.append('ghcnd_hcn.tar.gz') +FILE_NAMES.append('readme.txt') +FILE_NAMES.append('status.txt') + +# Store the row details here. + +# Index values of each field details. +FIELD_INDEX_NAME = 0 +FIELD_INDEX_START = 1 +FIELD_INDEX_END = 2 +FIELD_INDEX_TYPE = 3 + +DLY_FIELD_ID = 0 +DLY_FIELD_YEAR = 1 +DLY_FIELD_MONTH = 2 +DLY_FIELD_ELEMENT = 3 + +DLY_FIELD_DAY_OFFSET = 4 +DLY_FIELD_DAY_FIELDS = 4 + +DLY_FIELDS = [] + +# Details about the row. +DLY_FIELDS.append(['ID', 1, 11, 'Character']) +DLY_FIELDS.append(['YEAR', 12, 15, 'Integer']) +DLY_FIELDS.append(['MONTH', 16, 17, 'Integer']) +DLY_FIELDS.append(['ELEMENT', 18, 21, 'Character']) + +# Days in each row. +for i in range(1, 32): + start = 22 + ((i - 1) * 8) + DLY_FIELDS.append(['VALUE' + str(i), (start + 0), (start + 4), 'Integer']) + DLY_FIELDS.append(['MFLAG' + str(i), (start + 5), (start + 5), 'Character']) + DLY_FIELDS.append(['QFLAG' + str(i), (start + 6), (start + 6), 'Character']) + DLY_FIELDS.append(['SFLAG' + str(i), (start + 7), (start + 7), 'Character']) + +# Details about the row. +STATIONS_FIELDS = {} +STATIONS_FIELDS['ID'] = ['ID', 1, 11, 'Character'] +STATIONS_FIELDS['LATITUDE'] = ['LATITUDE', 13, 20, 'Real'] +STATIONS_FIELDS['LONGITUDE'] = ['LONGITUDE', 22, 30, 'Real'] +STATIONS_FIELDS['ELEVATION'] = ['ELEVATION', 32, 37, 'Real'] +STATIONS_FIELDS['STATE'] = ['STATE', 39, 40, 'Character'] +STATIONS_FIELDS['NAME'] = ['NAME', 42, 71, 'Character'] +STATIONS_FIELDS['GSNFLAG'] = ['GSNFLAG', 73, 75, 'Character'] +STATIONS_FIELDS['HCNFLAG'] = ['HCNFLAG', 77, 79, 'Character'] +STATIONS_FIELDS['WMOID'] = ['WMOID', 81, 85, 'Character'] + +# Details about the row. 
+COUNTRIES_FIELDS = {} +COUNTRIES_FIELDS['CODE'] = ['CODE', 1, 2, 'Character'] +COUNTRIES_FIELDS['NAME'] = ['NAME', 4, 50, 'Character'] + +# Details about the row. +STATES_FIELDS = {} +STATES_FIELDS['CODE'] = ['CODE', 1, 2, 'Character'] +STATES_FIELDS['NAME'] = ['NAME', 4, 50, 'Character'] + +# Details about the row. +INVENTORY_FIELDS = [] +INVENTORY_FIELDS.append(['ID', 1, 11, 'Character']) +INVENTORY_FIELDS.append(['LATITUDE', 13, 20, 'Real']) +INVENTORY_FIELDS.append(['LONGITUDE', 22, 30, 'Real']) +INVENTORY_FIELDS.append(['ELEMENT', 32, 35, 'Character']) +INVENTORY_FIELDS.append(['FIRSTYEAR', 37, 40, 'Integer']) +INVENTORY_FIELDS.append(['LASTYEAR', 42, 45, 'Integer']) + http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/9456d4ad/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config_mshr.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config_mshr.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config_mshr.py new file mode 100644 index 0000000..7b1434f --- /dev/null +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config_mshr.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# List of required files for a build. +MSHR_URLS = [] +MSHR_URLS.append('ftp://ftp.ncdc.noaa.gov/pub/data/homr/docs/MSHR_Enhanced_Table.txt') +MSHR_URLS.append('http://www.ncdc.noaa.gov/homr/file/mshr_enhanced.txt.zip') + +# Index values of each field details. +MSHR_FIELD_INDEX_NAME = 0 +MSHR_FIELD_INDEX_START = 1 +MSHR_FIELD_INDEX_END = 2 +MSHR_FIELD_INDEX_TYPE = 3 + +# Store the row details here. +MSHR_FIELDS = {} + +# Details about the row. +MSHR_FIELDS['SOURCE_ID'] = ['SOURCE_ID', 1, 20, 'X(20)'] +MSHR_FIELDS['SOURCE'] = ['SOURCE', 22, 31, 'X(10)'] +MSHR_FIELDS['BEGIN_DATE'] = ['BEGIN_DATE', 33, 40, 'YYYYMMDD'] +MSHR_FIELDS['END_DATE'] = ['END_DATE', 42, 49, 'YYYYMMDD'] +MSHR_FIELDS['STATION_STATUS'] = ['STATION_STATUS', 51, 70, 'X(20)'] +MSHR_FIELDS['NCDCSTN_ID'] = ['NCDCSTN_ID', 72, 91, 'X(20)'] +MSHR_FIELDS['ICAO_ID'] = ['ICAO_ID', 93, 112, 'X(20)'] +MSHR_FIELDS['WBAN_ID'] = ['WBAN_ID', 114, 133, 'X(20)'] +MSHR_FIELDS['FAA_ID'] = ['FAA_ID', 135, 154, 'X(20)'] +MSHR_FIELDS['NWSLI_ID'] = ['NWSLI_ID', 156, 175, 'X(20)'] +MSHR_FIELDS['WMO_ID'] = ['WMO_ID', 177, 196, 'X(20)'] +MSHR_FIELDS['COOP_ID'] = ['COOP_ID', 198, 217, 'X(20)'] +MSHR_FIELDS['TRANSMITTAL_ID'] = ['TRANSMITTAL_ID', 219, 238, 'X(20)'] +MSHR_FIELDS['GHCND_ID'] = ['GHCND_ID', 240, 259, 'X(20)'] +MSHR_FIELDS['NAME_PRINCIPAL'] = ['NAME_PRINCIPAL', 261, 360, 'X(100)'] +MSHR_FIELDS['NAME_PRINCIPAL_SHORT'] = ['NAME_PRINCIPAL_SHORT', 362, 391, 'X(30)'] +MSHR_FIELDS['NAME_COOP'] = ['NAME_COOP', 393, 492, 'X(100)'] +MSHR_FIELDS['NAME_COOP_SHORT'] = ['NAME_COOP_SHORT', 494, 523, 'X(30)'] +MSHR_FIELDS['NAME_PUBLICATION'] = ['NAME_PUBLICATION', 525, 624, 'X(100)'] +MSHR_FIELDS['NAME_ALIAS'] = ['NAME_ALIAS', 626, 725, 'X(100)'] +MSHR_FIELDS['NWS_CLIM_DIV'] = ['NWS_CLIM_DIV', 727, 736, 'X(10)'] +MSHR_FIELDS['NWS_CLIM_DIV_NAME'] = ['NWS_CLIM_DIV_NAME', 738, 777, 'X(40)'] +MSHR_FIELDS['STATE_PROV'] = 
['STATE_PROV', 779, 788, 'X(10)'] +MSHR_FIELDS['COUNTY'] = ['COUNTY', 790, 839, 'X(50)'] +MSHR_FIELDS['NWS_ST_CODE'] = ['NWS_ST_CODE', 841, 842, 'X(2)'] +MSHR_FIELDS['FIPS_COUNTRY_CODE'] = ['FIPS_COUNTRY_CODE', 844, 845, 'X(2)'] +MSHR_FIELDS['FIPS_COUNTRY_NAME'] = ['FIPS_COUNTRY_NAME', 847, 946, 'X(100)'] +MSHR_FIELDS['NWS_REGION'] = ['NWS_REGION', 948, 977, 'X(30)'] +MSHR_FIELDS['NWS_WFO'] = ['NWS_WFO', 979, 988, 'X(10)'] +MSHR_FIELDS['ELEV_GROUND'] = ['ELEV_GROUND', 990, 1029, 'X(40)'] +MSHR_FIELDS['ELEV_GROUND_UNIT'] = ['ELEV_GROUND_UNIT', 1031, 1050, 'X(20)'] +MSHR_FIELDS['ELEV_BAROM'] = ['ELEV_BAROM', 1052, 1091, 'X(40)'] +MSHR_FIELDS['ELEV_BAROM_UNIT'] = ['ELEV_BAROM_UNIT', 1093, 1112, 'X(20)'] +MSHR_FIELDS['ELEV_AIR'] = ['ELEV_AIR', 1114, 1153, 'X(40)'] +MSHR_FIELDS['ELEV_AIR_UNIT'] = ['ELEV_AIR_UNIT', 1155, 1174, 'X(20)'] +MSHR_FIELDS['ELEV_ZERODAT'] = ['ELEV_ZERODAT', 1176, 1215, 'X(40)'] +MSHR_FIELDS['ELEV_ZERODAT_UNIT'] = ['ELEV_ZERODAT_UNIT', 1217, 1236, 'X(20)'] +MSHR_FIELDS['ELEV_UNK'] = ['ELEV_UNK', 1238, 1277, 'X(40)'] +MSHR_FIELDS['ELEV_UNK_UNIT'] = ['ELEV_UNK_UNIT', 1279, 1298, 'X(20)'] +MSHR_FIELDS['LAT_DEC'] = ['LAT_DEC', 1300, 1319, 'X(20)'] +MSHR_FIELDS['LON_DEC'] = ['LON_DEC', 1321, 1340, 'X(20)'] +MSHR_FIELDS['LAT_LON_PRECISION'] = ['LAT_LON_PRECISION', 1342, 1351, 'X(10)'] +MSHR_FIELDS['RELOCATION'] = ['RELOCATION', 1353, 1414, 'X(62)'] +MSHR_FIELDS['UTC_OFFSET'] = ['UTC_OFFSET', 1416, 1431, '9(16)'] +MSHR_FIELDS['OBS_ENV'] = ['OBS_ENV', 1433, 1472, 'X(40) '] +MSHR_FIELDS['PLATFORM'] = ['PLATFORM', 1474, 1573, 'X(100)'] http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/9456d4ad/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py index 
36aff16..1aee4a7 100644 --- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py @@ -18,13 +18,75 @@ import textwrap from datetime import date import os import gzip +from collections import OrderedDict # Custom modules. -from weather_dly_config import * +from weather_config_ghcnd import * +from weather_config_mshr import * from weather_download_files import * class WeatherConvertToXML: + STATES = OrderedDict({ + 'AK': 'Alaska', + 'AL': 'Alabama', + 'AR': 'Arkansas', + 'AS': 'American Samoa', + 'AZ': 'Arizona', + 'CA': 'California', + 'CO': 'Colorado', + 'CT': 'Connecticut', + 'DC': 'District of Columbia', + 'DE': 'Delaware', + 'FL': 'Florida', + 'GA': 'Georgia', + 'GU': 'Guam', + 'HI': 'Hawaii', + 'IA': 'Iowa', + 'ID': 'Idaho', + 'IL': 'Illinois', + 'IN': 'Indiana', + 'KS': 'Kansas', + 'KY': 'Kentucky', + 'LA': 'Louisiana', + 'MA': 'Massachusetts', + 'MD': 'Maryland', + 'ME': 'Maine', + 'MI': 'Michigan', + 'MN': 'Minnesota', + 'MO': 'Missouri', + 'MP': 'Northern Mariana Islands', + 'MS': 'Mississippi', + 'MT': 'Montana', + 'NA': 'National', + 'NC': 'North Carolina', + 'ND': 'North Dakota', + 'NE': 'Nebraska', + 'NH': 'New Hampshire', + 'NJ': 'New Jersey', + 'NM': 'New Mexico', + 'NV': 'Nevada', + 'NY': 'New York', + 'OH': 'Ohio', + 'OK': 'Oklahoma', + 'OR': 'Oregon', + 'PA': 'Pennsylvania', + 'PR': 'Puerto Rico', + 'RI': 'Rhode Island', + 'SC': 'South Carolina', + 'SD': 'South Dakota', + 'TN': 'Tennessee', + 'TX': 'Texas', + 'UT': 'Utah', + 'VA': 'Virginia', + 'VI': 'Virgin Islands', + 'VT': 'Vermont', + 'WA': 'Washington', + 'WI': 'Wisconsin', + 'WV': 'West Virginia', + 'WY': 'Wyoming' + }) + MONTHS = [ "January", "February", @@ -51,6 +113,9 @@ class WeatherConvertToXML: self.ghcnd_countries = base_path + '/ghcnd-countries.txt' self.ghcnd_states = base_path + '/ghcnd-states.txt' self.ghcnd_stations = base_path + '/ghcnd-stations.txt' + + 
# MSHR support files. + self.mshr_stations = base_path + '/mshr_enhanced_201402.txt' def set_token(self, token): self.token = token @@ -109,7 +174,7 @@ class WeatherConvertToXML: row = file_stream.readline() return self.process_station_data(row) - def process_sensor_file(self, file_name, max_files, sensor_max = 99): + def process_sensor_file(self, file_name, max_files, sensor_max=99): print "Processing sensor file: " + file_name file_stream = open(file_name, 'r') @@ -164,17 +229,30 @@ class WeatherConvertToXML: <credit_URL>http://www.ncdc.noaa.gov/</credit_URL> """) - def default_xml_web_service_start(self, total_records): + def default_xml_web_service_start(self): field_xml = "" field_xml += "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n" + return field_xml + + def default_xml_data_start(self, total_records): + field_xml = "" field_xml += "<dataCollection pageCount=\"1\" totalCount=\"" + str(total_records) + "\">\n" return field_xml + def default_xml_station_start(self): + field_xml = "" + field_xml = "<stationCollection pageSize=\"100\" pageCount=\"1\" totalCount=\"1\">\n" + return field_xml + def default_xml_field_date(self, report_date, indent=2): field_xml = "" field_xml += self.get_indent_space(indent) + "<date>" + str(report_date.year) + "-" + str(report_date.month).zfill(2) + "-" + str(report_date.day).zfill(2) + "T00:00:00.000</date>\n" return field_xml + def get_date_from_field(self, row, field): + report_date = self.get_field_from_definition(row, field) + return str(report_date.year) + "-" + str(report_date.month).zfill(2) + "-" + str(report_date.day).zfill(2) + def default_xml_field_date_iso8601(self, report_date): field_xml = "" field_xml += " <observation_date>" + self.MONTHS[report_date.month - 1] + " " + str(report_date.day) + ", " + str(report_date.year) + "</observation_date>\n" @@ -241,6 +319,72 @@ class WeatherConvertToXML: return field_xml + def default_xml_mshr_station_additional(self, station_id): + """The web service 
station data is generated from the MSHR data supplemented with GHCN-Daily.""" + station_mshr_row = "" + stations_mshr_file = open(self.mshr_stations, 'r') + for line in stations_mshr_file: + if station_id == self.get_field_from_definition(line, MSHR_FIELDS['GHCND_ID']).strip(): + station_mshr_row = line + break + + if station_mshr_row == "": + return "" + + additional_xml = "" + + county = self.get_field_from_definition(station_mshr_row, MSHR_FIELDS['COUNTY']).strip() + if county != "": + additional_xml += self.default_xml_location_labels("CNTY", "FIPS:-9999", county) + + country_code = self.get_field_from_definition(station_mshr_row, MSHR_FIELDS['FIPS_COUNTRY_CODE']).strip() + country_name = self.get_field_from_definition(station_mshr_row, MSHR_FIELDS['FIPS_COUNTRY_NAME']).strip() + if country_code != "" and country_name != "": + additional_xml += self.default_xml_location_labels("CNTRY", "FIPS:"+country_code, country_name) + + return additional_xml + + def default_xml_location_labels(self, type, id, display_name): + label_xml = "" + label_xml += self.default_xml_start_tag("locationLabels", 2) + label_xml += self.default_xml_element("type", type, 3) + label_xml += self.default_xml_element("id", id, 3) + label_xml += self.default_xml_element("displayName", display_name, 3) + label_xml += self.default_xml_end_tag("locationLabels", 2) + return label_xml + + + def default_xml_web_service_station(self, station_id): + """The web service station data is generated from available historical sources.""" + station_ghcnd_row = "" + stations_ghcnd_file = open(self.ghcnd_stations, 'r') + for line in stations_ghcnd_file: + if station_id == self.get_field_from_definition(line, STATIONS_FIELDS['ID']): + station_ghcnd_row = line + break + + xml_station = "" + xml_station += self.default_xml_start_tag("station", 1) + + xml_station += self.default_xml_element("id", "GHCND:" + station_id, 2) + xml_station += self.default_xml_element("displayName", 
self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['NAME']).strip(), 2) + xml_station += self.default_xml_element("latitude", self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['LATITUDE']).strip(), 2) + xml_station += self.default_xml_element("longitude", self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['LONGITUDE']).strip(), 2) + + elevation = self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['ELEVATION']).strip() + if elevation != "-999.9": + xml_station += self.default_xml_element("elevation", elevation, 2) + + state_code = self.get_field_from_definition(station_ghcnd_row, STATIONS_FIELDS['STATE']).strip() + if state_code != "": + xml_station += self.default_xml_location_labels("ST", "FIPS:" + str(self.STATES.keys().index(state_code)), self.STATES[state_code]) + + # Add the MSHR data to the station generated information. + xml_station += self.default_xml_mshr_station_additional(station_id) + + xml_station += self.default_xml_end_tag("station", 1) + return xml_station + def default_xml_day_reading_as_field(self, row, day): day_index = DLY_FIELD_DAY_OFFSET + ((day - 1) * DLY_FIELD_DAY_FIELDS) value = self.get_dly_field(row, day_index); @@ -306,8 +450,14 @@ class WeatherConvertToXML: return textwrap.dedent("""\ </ghcnd_observation>""") - def default_xml_web_service_end(self): - return "</dataCollection>" + def default_xml_data_end(self): + return self.default_xml_end_tag("dataCollection", 0) + + def default_xml_station_end(self): + return self.default_xml_end_tag("stationCollection", 0) + + def default_xml_element(self, tag, data, indent=1): + return self.get_indent_space(indent) + "<" + tag + ">" + data + "</" + tag + ">\n" def default_xml_start_tag(self, tag, indent=1): return self.get_indent_space(indent) + "<" + tag + ">\n" @@ -434,9 +584,11 @@ class WeatherMonthlyXMLFile(WeatherConvertToXML): return 0 class WeatherWebServiceMonthlyXMLFile(WeatherConvertToXML): + """The web service class details 
how to create files similar to the NOAA web service.""" skip_downloading = False # Station data def process_station_data(self, row): + """Adds a single station record file either from downloading the data or generating a similar record.""" station_id = self.get_dly_field(row, DLY_FIELD_ID) download = 0 if self.token is not "" and not self.skip_downloading: @@ -444,15 +596,20 @@ class WeatherWebServiceMonthlyXMLFile(WeatherConvertToXML): if download == 0: self.skip_downloading = True - # If not downloaded generate. + # If not downloaded, generate. if download != 0: return download else: # Information for each daily file. - station_xml_file = self.default_xml_start() - station_xml_file += self.default_xml_field_station(station_id) - station_xml_file += self.default_xml_end() + station_xml_file = self.default_xml_web_service_start() + station_xml_file += self.default_xml_station_start() + station_xml_file += self.default_xml_web_service_station(station_id) + station_xml_file += self.default_xml_station_end() + # Remove white space. + station_xml_file = station_xml_file.replace("\n", ""); + station_xml_file = station_xml_file.replace(self.get_indent_space(1), ""); + # Make sure the station folder is available. ghcnd_xml_station_path = self.get_base_folder(station_id, "stations") if not os.path.isdir(ghcnd_xml_station_path): @@ -470,9 +627,10 @@ class WeatherWebServiceMonthlyXMLFile(WeatherConvertToXML): return 0 # Station data - def download_station_data(self, station_id, token, reset = False): + def download_station_data(self, station_id, token, reset=False): + """Downloads the station data from the web service.""" import time - time.sleep(10) + time.sleep(2) # Make sure the station folder is available. 
ghcnd_xml_station_path = self.get_base_folder(station_id, "stations") if not os.path.isdir(ghcnd_xml_station_path): @@ -506,6 +664,7 @@ class WeatherWebServiceMonthlyXMLFile(WeatherConvertToXML): # Sensor data def process_one_month_sensor_set(self, records, page): + """Generates records for a station using the web service xml layout.""" found_data = False year = int(self.get_dly_field(records[0], DLY_FIELD_YEAR)) month = int(self.get_dly_field(records[0], DLY_FIELD_MONTH)) @@ -535,7 +694,7 @@ class WeatherWebServiceMonthlyXMLFile(WeatherConvertToXML): except ValueError: pass - daily_xml_file = self.default_xml_web_service_start(count) + daily_xml_file + self.default_xml_web_service_end() + daily_xml_file = self.default_xml_web_service_start() + self.default_xml_data_start(count) + daily_xml_file + self.default_xml_data_end() daily_xml_file = daily_xml_file.replace("\n", ""); daily_xml_file = daily_xml_file.replace(self.get_indent_space(1), ""); http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/9456d4ad/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py index 8e26e99..da2afcc 100644 --- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_data_files.py @@ -92,8 +92,8 @@ class WeatherDataFiles: self.close_progress_data(True) self.reset() - # Once the initial data has been generated, the data can be copied into a set number of partitions. def copy_to_n_partitions(self, save_path, partitions, base_paths=[]): + """Once the initial data has been generated, the data can be copied into a set number of partitions. 
""" if (len(base_paths) == 0): return http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/9456d4ad/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_dly_config.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_dly_config.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_dly_config.py deleted file mode 100644 index 801e748..0000000 --- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_dly_config.py +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env python -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Base URL used to get all the required files. -BASE_DOWNLOAD_URL = 'http://www1.ncdc.noaa.gov/pub/data/ghcn/daily/' - -# List of required files for a build. 
-FILE_NAMES = [] -FILE_NAMES.append('ghcnd-countries.txt') -FILE_NAMES.append('ghcnd-inventory.txt') -FILE_NAMES.append('ghcnd-states.txt') -FILE_NAMES.append('ghcnd-stations.txt') -FILE_NAMES.append('ghcnd-version.txt') -FILE_NAMES.append('ghcnd_all.tar.gz') -FILE_NAMES.append('ghcnd_gsn.tar.gz') -FILE_NAMES.append('ghcnd_hcn.tar.gz') -FILE_NAMES.append('readme.txt') -FILE_NAMES.append('status.txt') - -# Store the row details here. - -# Index values of each field details. -FIELD_INDEX_NAME = 0 -FIELD_INDEX_START = 1 -FIELD_INDEX_END = 2 -FIELD_INDEX_TYPE = 3 - -DLY_FIELD_ID = 0 -DLY_FIELD_YEAR = 1 -DLY_FIELD_MONTH = 2 -DLY_FIELD_ELEMENT = 3 - -DLY_FIELD_DAY_OFFSET = 4 -DLY_FIELD_DAY_FIELDS = 4 - -DLY_FIELDS = [] - -# Details about the row. -DLY_FIELDS.append(['ID', 1, 11, 'Character']) -DLY_FIELDS.append(['YEAR', 12, 15, 'Integer']) -DLY_FIELDS.append(['MONTH', 16, 17, 'Integer']) -DLY_FIELDS.append(['ELEMENT', 18, 21, 'Character']) - -# Days in each row. -for i in range(1, 32): - start = 22 + ((i - 1) * 8) - DLY_FIELDS.append(['VALUE' + str(i), (start + 0), (start + 4), 'Integer']) - DLY_FIELDS.append(['MFLAG' + str(i), (start + 5), (start + 5), 'Character']) - DLY_FIELDS.append(['QFLAG' + str(i), (start + 6), (start + 6), 'Character']) - DLY_FIELDS.append(['SFLAG' + str(i), (start + 7), (start + 7), 'Character']) - -# Details about the row. -STATIONS_FIELDS = {} -STATIONS_FIELDS['ID'] = ['ID', 1, 11, 'Character'] -STATIONS_FIELDS['LATITUDE'] = ['LATITUDE', 13, 20, 'Real'] -STATIONS_FIELDS['LONGITUDE'] = ['LONGITUDE', 22, 30, 'Real'] -STATIONS_FIELDS['ELEVATION'] = ['ELEVATION', 32, 37, 'Real'] -STATIONS_FIELDS['STATE'] = ['STATE', 39, 40, 'Character'] -STATIONS_FIELDS['NAME'] = ['NAME', 42, 71, 'Character'] -STATIONS_FIELDS['GSNFLAG'] = ['GSNFLAG', 73, 75, 'Character'] -STATIONS_FIELDS['HCNFLAG'] = ['HCNFLAG', 77, 79, 'Character'] -STATIONS_FIELDS['WMOID'] = ['WMOID', 81, 85, 'Character'] - -# Details about the row. 
-COUNTRIES_FIELDS = {} -COUNTRIES_FIELDS['CODE'] = ['CODE', 1, 2, 'Character'] -COUNTRIES_FIELDS['NAME'] = ['NAME', 4, 50, 'Character'] - -# Details about the row. -STATES_FIELDS = {} -STATES_FIELDS['CODE'] = ['CODE', 1, 2, 'Character'] -STATES_FIELDS['NAME'] = ['NAME', 4, 50, 'Character'] - -# Details about the row. -INVENTORY_FIELDS = [] -INVENTORY_FIELDS.append(['ID', 1, 11, 'Character']) -INVENTORY_FIELDS.append(['LATITUDE', 13, 20, 'Real']) -INVENTORY_FIELDS.append(['LONGITUDE', 22, 30, 'Real']) -INVENTORY_FIELDS.append(['ELEMENT', 32, 35, 'Character']) -INVENTORY_FIELDS.append(['FIRSTYEAR', 37, 40, 'Integer']) -INVENTORY_FIELDS.append(['LASTYEAR', 42, 45, 'Integer']) - http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/9456d4ad/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py index 87adb11..fb59b50 100644 --- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_download_files.py @@ -19,9 +19,11 @@ import os.path import shutil import tarfile import urllib +import zipfile # Custom modules. -from weather_dly_config import * +from weather_config_ghcnd import * +from weather_config_mshr import * class WeatherDownloadFiles: @@ -32,14 +34,18 @@ class WeatherDownloadFiles: os.makedirs(save_path) - # Download the complete list - def download_all_files(self, reset=False): + def download_ghcnd_files(self, reset=False): + """Download the complete list.""" for file_name in FILE_NAMES: url = BASE_DOWNLOAD_URL + file_name self.download_file(url, reset) - # Download the file, unless it exists. 
+ def download_mshr_files(self, reset=False): + for url in MSHR_URLS: + self.download_file(url, reset) + def download_file(self, url, reset=False): + """Download the file, unless it exists.""" file_name = self.save_path + "/" + url.split('/')[-1] if not os.path.isfile(file_name) or reset: @@ -47,8 +53,8 @@ class WeatherDownloadFiles: urllib.urlretrieve(url, file_name, report_download_status) print - # Unzip the package file, unless it exists. - def unzip_package(self, package, reset=False): + def unzip_ghcnd_package(self, package, reset=False): + """Unzip the package file, unless it exists.""" file_name = self.save_path + "/" + package + ".tar.gz" unzipped_path = self.save_path + "/" + package @@ -60,16 +66,25 @@ class WeatherDownloadFiles: tar_file = tarfile.open(file_name, 'r:gz') tar_file.extractall(unzipped_path) -# Report download status. + def unzip_mshr_files(self, reset=False): + """Unzip the package file, unless it exists.""" + for url in MSHR_URLS: + if url.endswith('.zip'): + file_name = self.save_path + "/" + url.split('/')[-1] + print "Unzipping: " + file_name + with zipfile.ZipFile(file_name, 'r') as myzip: + myzip.extractall(self.save_path) + def report_download_status(count, block, size): + """Report download status.""" line_size = 50 erase = "\b" * line_size sys.stdout.write(erase) report = get_report_line((float(count) * block / size), line_size) sys.stdout.write(report) -# Creates a string to be used in reporting the percentage done. def get_report_line(percentage, line_size): + """Creates a string to be used in reporting the percentage done.""" report = "" for i in range(0, line_size): if (float(i) / line_size < percentage): @@ -78,8 +93,8 @@ def get_report_line(percentage, line_size): report += "-" return report -# Download the file, unless it exists. 
def download_file_save_as(url, new_file_name, reset=False): + """Download the file, unless it exists.""" if not os.path.isfile(new_file_name) or reset: print "Downloading: " + url urllib.urlretrieve(url, new_file_name, report_download_status)
