Added the option to create an inventory CSV file. The inventory holds the number of sensors and readings for each station.
Project: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/commit/f60f8858 Tree: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/tree/f60f8858 Diff: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/diff/f60f8858 Branch: refs/heads/prestonc/hash_join Commit: f60f885885409dd2781cbfda1baaf18c89911b53 Parents: a50cf44 Author: Preston Carman <[email protected]> Authored: Tue Mar 25 16:38:28 2014 -0700 Committer: Preston Carman <[email protected]> Committed: Tue Apr 1 20:56:25 2014 -0700 ---------------------------------------------------------------------- .../noaa-ghcn-daily/scripts/weather_cli.py | 8 +++-- .../scripts/weather_config_ghcnd.py | 14 ++++---- .../scripts/weather_convert_to_xml.py | 35 ++++++++++++++++++-- 3 files changed, 46 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/f60f8858/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py index 52945e5..5bfa698 100644 --- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_cli.py @@ -53,7 +53,7 @@ def main(argv): print ' -a Append the results to the progress file.' print ' -f (str) The file name of a specific station to process.' print ' * Helpful when testing a single stations XML file output.' - print ' -l (str) Select the locality of the scripts execution (download, progress_file, sensor_build, station_build, partition, partition_scheme, statistics).' 
+ print ' -l (str) Select the locality of the scripts execution (download, progress_file, sensor_build, station_build, partition, partition_scheme, inventory, statistics).' print ' -m (int) Limits the number of files created for each station.' print ' * Helpful when testing to make sure all elements are supported for each station.' print ' Alternate form: --max_station_files=(int)' @@ -73,7 +73,7 @@ def main(argv): print 'Error: Argument must be a file name for --file (-f).' sys.exit() elif opt in ('-l', "--locality"): - if arg in ("download", "progress_file", "sensor_build", "station_build", "partition", "partition_scheme", "test_links", "queries", "statistics"): + if arg in ("download", "progress_file", "sensor_build", "station_build", "partition", "partition_scheme", "test_links", "queries", "inventory", "statistics"): section = arg else: print 'Error: Argument must be a string for --locality (-l) and a valid locality.' @@ -221,6 +221,10 @@ def main(argv): print 'Processing the queries section (' + dataset.get_name() + ').' benchmark.copy_query_files(reset) + if section in ("inventory"): + print 'Processing the inventory section.' + convert.process_inventory_file() + # if section in ("statistics"): # print 'Processing the statistics section.' 
# data.print_progress_file_stats(convert) http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/f60f8858/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config_ghcnd.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config_ghcnd.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config_ghcnd.py index 6d3bd9c..04fff52 100644 --- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config_ghcnd.py +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_config_ghcnd.py @@ -86,10 +86,10 @@ STATES_FIELDS['CODE'] = ['CODE', 1, 2, 'Character'] STATES_FIELDS['NAME'] = ['NAME', 4, 50, 'Character'] # Details about the row. -INVENTORY_FIELDS = [] -INVENTORY_FIELDS.append(['ID', 1, 11, 'Character']) -INVENTORY_FIELDS.append(['LATITUDE', 13, 20, 'Real']) -INVENTORY_FIELDS.append(['LONGITUDE', 22, 30, 'Real']) -INVENTORY_FIELDS.append(['ELEMENT', 32, 35, 'Character']) -INVENTORY_FIELDS.append(['FIRSTYEAR', 37, 40, 'Integer']) -INVENTORY_FIELDS.append(['LASTYEAR', 42, 45, 'Integer']) +INVENTORY_FIELDS = {} +INVENTORY_FIELDS['ID'] = ['ID', 1, 11, 'Character'] +INVENTORY_FIELDS['LATITUDE'] = ['LATITUDE', 13, 20, 'Real'] +INVENTORY_FIELDS['LONGITUDE'] = ['LONGITUDE', 22, 30, 'Real'] +INVENTORY_FIELDS['ELEMENT'] = ['ELEMENT', 32, 35, 'Character'] +INVENTORY_FIELDS['FIRSTYEAR'] = ['FIRSTYEAR', 37, 40, 'Integer'] +INVENTORY_FIELDS['LASTYEAR'] = ['LASTYEAR', 42, 45, 'Integer'] http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/f60f8858/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py ---------------------------------------------------------------------- diff --git a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py index 
c115efa..a4f33a1 100644 --- a/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py +++ b/vxquery-benchmark/src/main/resources/noaa-ghcn-daily/scripts/weather_convert_to_xml.py @@ -109,9 +109,10 @@ class WeatherConvertToXML: # Extra support files. self.ghcnd_countries = base_path + '/ghcnd-countries.txt' + self.ghcnd_inventory = base_path + '/ghcnd-inventory.txt' self.ghcnd_states = base_path + '/ghcnd-states.txt' self.ghcnd_stations = base_path + '/ghcnd-stations.txt' - + # MSHR support files. self.mshr_stations = base_path + '/mshr_enhanced_201402.txt' @@ -160,6 +161,36 @@ class WeatherConvertToXML: def get_base_folder(self, station_id, data_type="sensors"): return build_base_save_folder(self.save_path, station_id, data_type) + def process_inventory_file(self): + print "Processing inventory file" + file_stream = open(self.ghcnd_inventory, 'r') + + csv_header = ['ID', 'SENSORS', 'SENSORS_COUNT', 'MAX_YEARS', 'TOTAL_YEARS_FOR_ALL_SENSORS'] + row = file_stream.readline() + csv_inventory = {} + for row in file_stream: + id = self.get_field_from_definition(row, INVENTORY_FIELDS['ID']) + sensor_id = self.get_field_from_definition(row, INVENTORY_FIELDS['ELEMENT']) + start = int(self.get_field_from_definition(row, INVENTORY_FIELDS['FIRSTYEAR'])) + end = int(self.get_field_from_definition(row, INVENTORY_FIELDS['LASTYEAR'])) + if id in csv_inventory: + new_count = str(int(csv_inventory[id][2]) + 1) + new_max = str(max(int(csv_inventory[id][3]), (end - start))) + new_total = str(int(csv_inventory[id][3]) + end - start) + csv_inventory[id] = [id, (csv_inventory[id][1] + "," + sensor_id), new_count, new_max, new_total] + else: + csv_inventory[id] = [id, sensor_id, str(1), str(end - start), str(end - start)] + + path = self.save_path + "/inventory.csv" + self.save_csv_file(path, csv_inventory, csv_header) + + def save_csv_file(self, path, csv_inventory, header): + csv_content = "|".join(header) + "\n" + for row_id in csv_inventory: + 
csv_content += "|".join(csv_inventory[row_id]) + "\n" + self.save_file(path, csv_content) + + def process_station_file(self, file_name): print "Processing station file: " + file_name file_stream = open(file_name, 'r') @@ -333,7 +364,7 @@ class WeatherConvertToXML: country_code = self.get_field_from_definition(station_mshr_row, MSHR_FIELDS['FIPS_COUNTRY_CODE']).strip() country_name = self.get_field_from_definition(station_mshr_row, MSHR_FIELDS['FIPS_COUNTRY_NAME']).strip() if country_code != "" and country_name != "": - additional_xml += self.default_xml_location_labels("CNTRY", "FIPS:"+country_code, country_name) + additional_xml += self.default_xml_location_labels("CNTRY", "FIPS:" + country_code, country_name) return additional_xml
