ngachung commented on code in PR #260:
URL: https://github.com/apache/incubator-sdap-nexus/pull/260#discussion_r1256085439
##########
tools/cdms/cdms_reader.py:
##########
@@ -1,250 +1,316 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import string
-from netCDF4 import Dataset, num2date
-import sys
-import datetime
-import csv
-from collections import OrderedDict
-import logging
-
-#TODO: Get rid of numpy errors?
-#TODO: Update big SDAP README
-
-LOGGER = logging.getLogger("cdms_reader")
-
-def assemble_matches(filename):
-    """
-    Read a CDMS netCDF file and return a list of matches.
-
-    Parameters
-    ----------
-    filename : str
-        The CDMS netCDF file name.
-
-    Returns
-    -------
-    matches : list
-        List of matches. Each list element is a dictionary.
-        For match m, netCDF group GROUP (PrimaryData or SecondaryData), and
-        group variable VARIABLE:
-        matches[m][GROUP]['matchID']: MatchedRecords dimension ID for the match
-        matches[m][GROUP]['GROUPID']: GROUP dim dimension ID for the record
-        matches[m][GROUP][VARIABLE]: variable value
-    """
-
-    try:
-        # Open the netCDF file
-        with Dataset(filename, 'r') as cdms_nc:
-            # Check that the number of groups is consistent w/ the MatchedGroups
-            # dimension
-            assert len(cdms_nc.groups) == cdms_nc.dimensions['MatchedGroups'].size,\
-                ("Number of groups isn't the same as MatchedGroups dimension.")
-
-            matches = []
-            matched_records = cdms_nc.dimensions['MatchedRecords'].size
-
-            # Loop through the match IDs to assemble matches
-            for match in range(0, matched_records):
-                match_dict = OrderedDict()
-                # Grab the data from each platform (group) in the match
-                for group_num, group in enumerate(cdms_nc.groups):
-                    match_dict[group] = OrderedDict()
-                    match_dict[group]['matchID'] = match
-                    ID = cdms_nc.variables['matchIDs'][match][group_num]
-                    match_dict[group][group + 'ID'] = ID
-                    for var in cdms_nc.groups[group].variables.keys():
-                        match_dict[group][var] = cdms_nc.groups[group][var][ID]
-
-                    # Create a UTC datetime field from timestamp
-                    dt = num2date(match_dict[group]['time'],
-                                  cdms_nc.groups[group]['time'].units)
-                    match_dict[group]['datetime'] = dt
-                LOGGER.info(match_dict)
-                matches.append(match_dict)
-
-            return matches
-    except (OSError, IOError) as err:
-        LOGGER.exception("Error reading netCDF file " + filename)
-        raise err
-
-def matches_to_csv(matches, csvfile):
-    """
-    Write the CDMS matches to a CSV file. Include a header of column names
-    which are based on the group and variable names from the netCDF file.
-
-    Parameters
-    ----------
-    matches : list
-        The list of dictionaries containing the CDMS matches as returned from
-        assemble_matches.
-    csvfile : str
-        The name of the CSV output file.
-    """
-    # Create a header for the CSV. Column names are GROUP_VARIABLE or
-    # GROUP_GROUPID.
-    header = []
-    for key, value in matches[0].items():
-        for otherkey in value.keys():
-            header.append(key + "_" + otherkey)
-
-    try:
-        # Write the CSV file
-        with open(csvfile, 'w') as output_file:
-            csv_writer = csv.writer(output_file)
-            csv_writer.writerow(header)
-            for match in matches:
-                row = []
-                for group, data in match.items():
-                    for value in data.values():
-                        row.append(value)
-                csv_writer.writerow(row)
-    except (OSError, IOError) as err:
-        LOGGER.exception("Error writing CSV file " + csvfile)
-        raise err
-
-def get_globals(filename):
-    """
-    Write the CDMS global attributes to a text file. Additionally,
-    within the file there will be a description of where all the different
-    outputs go and how to best utlize this program.
-
-    Parameters
-    ----------
-    filename : str
-        The name of the original '.nc' input file.
-
-    """
-    x0 = "README / cdms_reader.py Program Use and Description:\n"
-    x1 = "\nThe cdms_reader.py program reads a CDMS netCDF (a NETCDF file with a matchIDs variable)\n"
-    x2 = "file into memory, assembles a list of matches of primary and secondary data\n"
-    x3 = "and optionally\n"
-    x4 = "output the matches to a CSV file. Each matched pair contains one primary\n"
-    x5 = "data record and one secondary data record.\n"
-    x6 = "\nBelow, this file wil list the global attributes of the .nc (NETCDF) file.\n"
-    x7 = "If you wish to see a full dump of the data from the .nc file,\n"
-    x8 = "please utilize the ncdump command from NETCDF (or look at the CSV file).\n"
-    try:
-        with Dataset(filename, "r", format="NETCDF4") as ncFile:
-            txtName = filename.replace(".nc", ".txt")
-            with open(txtName, "w") as txt:
-                txt.write(x0 + x1 +x2 +x3 + x4 + x5 + x6 + x7 + x8)
-                txt.write("\nGlobal Attributes:")
-                for x in ncFile.ncattrs():
-                    txt.write(f'\t :{x} = "{ncFile.getncattr(x)}" ;\n')
-
-
-    except (OSError, IOError) as err:
-        LOGGER.exception("Error reading netCDF file " + filename)
-        print("Error reading file!")
-        raise err
-
-def create_logs(user_option, logName):
-    """
-    Write the CDMS log information to a file. Additionally, the user may
-    opt to print this information directly to stdout, or discard it entirely.
-
-    Parameters
-    ----------
-    user_option : str
-        The result of the arg.log 's interpretation of
-        what option the user selected.
-    logName : str
-        The name of the log file we wish to write to,
-        assuming the user did not use the -l option.
-    """
-    if user_option == 'N':
-        print("** Note: No log was created **")
-
-
-    elif user_option == '1':
-        #prints the log contents to stdout
-        logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
-                            level=logging.INFO,
-                            datefmt='%Y-%m-%d %H:%M:%S',
-                            handlers=[
-                                logging.StreamHandler(sys.stdout)
-                            ])
-
-    else:
-        #prints log to a .log file
-        logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
-                            level=logging.INFO,
-                            datefmt='%Y-%m-%d %H:%M:%S',
-                            handlers=[
-                                logging.FileHandler(logName)
-                            ])
-        if user_option != 1 and user_option != 'Y':
-            print(f"** Bad usage of log option. Log will print to {logName} **")
-
-
-
-
-
-if __name__ == '__main__':
-    """
-    Execution:
-        python cdms_reader.py filename
-        OR
-        python3 cdms_reader.py filename
-        OR
-        python3 cdms_reader.py filename -c -g
-        OR
-        python3 cdms_reader.py filename --csv --meta
-
-    Note (For Help Try):
-        python3 cdms_reader.py -h
-        OR
-        python3 cdms_reader.py --help
-
-    """
-
-    u0 = '\n%(prog)s -h OR --help \n'
-    u1 = '%(prog)s filename -c -g\n%(prog)s filename --csv --meta\n'
-    u2 ='Use -l OR -l1 to modify destination of logs'
-    p = argparse.ArgumentParser(usage= u0 + u1 + u2)
-
-    #below block is to customize user options
-    p.add_argument('filename', help='CDMS netCDF file to read')
-    p.add_argument('-c', '--csv', nargs='?', const= 'Y', default='N',
-                   help='Use -c or --csv to retrieve CSV output')
-    p.add_argument('-g', '--meta', nargs='?', const='Y', default='N',
-                   help='Use -g or --meta to retrieve global attributes / metadata')
-    p.add_argument('-l', '--log', nargs='?', const='N', default='Y',
-                   help='Use -l or --log to AVOID creating log files, OR use -l1 to print to stdout/console')
-
-    #arguments are processed by the next line
-    args = p.parse_args()
-
-    logName = args.filename.replace(".nc", ".log")
-    create_logs(args.log, logName)
-
-    cdms_matches = assemble_matches(args.filename)
-
-    if args.csv == 'Y' :
-        matches_to_csv(cdms_matches, args.filename.replace(".nc",".csv"))
-
-    if args.meta == 'Y' :
-        get_globals(args.filename)
-
-
-
-
-
-
-
-
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import string
+from netCDF4 import Dataset, num2date
+import sys
+import datetime
+import csv
+from collections import OrderedDict
+import logging
+import xarray as xr
+
+
+#TODO: Get rid of numpy errors?
+#TODO: Update big SDAP README
+
+LOGGER = logging.getLogger("cdms_reader")
+
+def assemble_matches(filename):
+    """
+    Read a CDMS netCDF file and return a list of matches.
+
+    Parameters
+    ----------
+    filename : str
+        The CDMS netCDF file name.
+
+    Returns
+    -------
+    matches : list
+        List of matches. Each list element is a dictionary.
+        For match m, netCDF group GROUP (PrimaryData or SecondaryData), and
+        group variable VARIABLE:
+        matches[m][GROUP]['matchID']: MatchedRecords dimension ID for the match
+        matches[m][GROUP]['GROUPID']: GROUP dim dimension ID for the record
+        matches[m][GROUP][VARIABLE]: variable value
+    """
+
+    try:
+        # Open the netCDF file
+
+        with Dataset(filename, 'r') as cdms_nc:
+            # Check that the number of groups is consistent w/ the MatchedGroups
+            # dimension
+            assert len(cdms_nc.groups) == cdms_nc.dimensions['MatchedGroups'].size,\
+                ("Number of groups isn't the same as MatchedGroups dimension.")
+
+            matches = []
+            matched_records = cdms_nc.dimensions['MatchedRecords'].size
+
+            # Loop through the match IDs to assemble matches
+            for match in range(0, matched_records):
+                match_dict = OrderedDict()
+                # Grab the data from each platform (group) in the match
+                for group_num, group in enumerate(cdms_nc.groups):
+                    match_dict[group] = OrderedDict()
+                    match_dict[group]['matchID'] = match
+                    ID = cdms_nc.variables['matchIDs'][match][group_num]
+                    match_dict[group][group + 'ID'] = ID
+                    for var in cdms_nc.groups[group].variables.keys():
+                        match_dict[group][var] = cdms_nc.groups[group][var][ID]
+
+                    # Create a UTC datetime field from timestamp
+                    dt = num2date(match_dict[group]['time'],
+                                  cdms_nc.groups[group]['time'].units)
+                    match_dict[group]['datetime'] = dt
+                LOGGER.info(match_dict)
+                matches.append(match_dict)
+
+            return matches
+    except (OSError, IOError) as err:
+        LOGGER.exception("Error reading netCDF file " + filename)
+        raise err
+
+def return_matches_array(filename):
+    """
+    Read a CDMS netCDF file and return a list of matches in a more rudimentary format.
+
+    Parameters
+    ----------
+    filename : str
+        The CDMS netCDF file name.
+
+    Returns
+    -------
+    matches : list
+        List of matches.
+        For match m:
+        matches[m][0 - 8]: Contains either floats or arrays representing data for
+        PrimaryData and its associated SecondaryData.
+
+        Please refer to the code in this function as well as the README to obtain information
+        regarding the ordering of this data.
+ """ + + match_ids = xr.open_dataset(filename) + primary = xr.open_dataset(filename, group= "PrimaryData") + secondary = xr.open_dataset(filename, group= "SecondaryData") + + #set up primary arrays + matches = [[0.0, 0.0, 0.0, 0.0, [], [], [], [], []] for x in range(primary.sizes['dim'])] + + #set up secondary arrays + sec_lon = [] + sec_lat = [] + sec_time = [] + sec_speed = [] + sec_dir = [] + prev_x = 0 + + #load values into the arrays + matches[0][0] = float(primary.lon[0].values) + matches[0][1] = float(primary.lat[0].values) + matches[0][2] = float(primary.time[0].values) + matches[0][3] = float(primary.sea_surface_foundation_temperature[0].values) + + for x, y in match_ids.matchIDs.values: + if prev_x != int(x): + matches[prev_x][4] = (sec_lon) + matches[prev_x][5] = (sec_lat) + matches[prev_x][6] = (sec_time) + matches[prev_x][7] = (sec_speed) + matches[prev_x][8] = (sec_dir) + + matches[int(x)][0] = float(primary.lon[int(x)].values) + matches[int(x)][1] = float(primary.lat[int(x)].values) + matches[int(x)][2] = float(primary.time[int(x)].values) + matches[int(x)][3] = float(primary.sea_surface_foundation_temperature[int(x)].values) + + sec_lon = [] + sec_lat = [] + sec_time = [] + sec_speed = [] + sec_dir = [] + + sec_lon.append(float(secondary.lon[int(y)].values)) + sec_lat.append(float(secondary.lat[int(y)].values)) + sec_time.append(float(secondary.time[int(y)].values)) + sec_speed.append(float(secondary.wind_speed[int(y)].values)) + sec_dir.append(float(secondary.wind_to_direction[int(y)].values)) Review Comment: @alovett-COAPS This change to the reader appears to be specific just for the primary = sst and secondary = wind use case. This reader should be generic to support any variable(s). In the use case where secondary is an in situ source there can be also be multiple variables returned. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@sdap.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org