ngachung commented on code in PR #260:
URL: https://github.com/apache/incubator-sdap-nexus/pull/260#discussion_r1256085439
##########
tools/cdms/cdms_reader.py:
##########
@@ -1,250 +1,316 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import string
-from netCDF4 import Dataset, num2date
-import sys
-import datetime
-import csv
-from collections import OrderedDict
-import logging
-
-#TODO: Get rid of numpy errors?
-#TODO: Update big SDAP README
-
-LOGGER = logging.getLogger("cdms_reader")
-
-def assemble_matches(filename):
-    """
-    Read a CDMS netCDF file and return a list of matches.
-
-    Parameters
-    ----------
-    filename : str
-        The CDMS netCDF file name.
-
-    Returns
-    -------
-    matches : list
-        List of matches. Each list element is a dictionary.
-        For match m, netCDF group GROUP (PrimaryData or SecondaryData), and
-        group variable VARIABLE:
-        matches[m][GROUP]['matchID']: MatchedRecords dimension ID for the match
-        matches[m][GROUP]['GROUPID']: GROUP dim dimension ID for the record
-        matches[m][GROUP][VARIABLE]: variable value
-    """
-
-    try:
-        # Open the netCDF file
-        with Dataset(filename, 'r') as cdms_nc:
-            # Check that the number of groups is consistent w/ the MatchedGroups
-            # dimension
-            assert len(cdms_nc.groups) == cdms_nc.dimensions['MatchedGroups'].size,\
-                ("Number of groups isn't the same as MatchedGroups dimension.")
-
-            matches = []
-            matched_records = cdms_nc.dimensions['MatchedRecords'].size
-
-            # Loop through the match IDs to assemble matches
-            for match in range(0, matched_records):
-                match_dict = OrderedDict()
-                # Grab the data from each platform (group) in the match
-                for group_num, group in enumerate(cdms_nc.groups):
-                    match_dict[group] = OrderedDict()
-                    match_dict[group]['matchID'] = match
-                    ID = cdms_nc.variables['matchIDs'][match][group_num]
-                    match_dict[group][group + 'ID'] = ID
-                    for var in cdms_nc.groups[group].variables.keys():
-                        match_dict[group][var] = cdms_nc.groups[group][var][ID]
-
-                    # Create a UTC datetime field from timestamp
-                    dt = num2date(match_dict[group]['time'],
-                                  cdms_nc.groups[group]['time'].units)
-                    match_dict[group]['datetime'] = dt
-                LOGGER.info(match_dict)
-                matches.append(match_dict)
-
-            return matches
-    except (OSError, IOError) as err:
-        LOGGER.exception("Error reading netCDF file " + filename)
-        raise err
-
-def matches_to_csv(matches, csvfile):
-    """
-    Write the CDMS matches to a CSV file. Include a header of column names
-    which are based on the group and variable names from the netCDF file.
-
-    Parameters
-    ----------
-    matches : list
-        The list of dictionaries containing the CDMS matches as returned from
-        assemble_matches.
-    csvfile : str
-        The name of the CSV output file.
-    """
-    # Create a header for the CSV. Column names are GROUP_VARIABLE or
-    # GROUP_GROUPID.
-    header = []
-    for key, value in matches[0].items():
-        for otherkey in value.keys():
-            header.append(key + "_" + otherkey)
-
-    try:
-        # Write the CSV file
-        with open(csvfile, 'w') as output_file:
-            csv_writer = csv.writer(output_file)
-            csv_writer.writerow(header)
-            for match in matches:
-                row = []
-                for group, data in match.items():
-                    for value in data.values():
-                        row.append(value)
-                csv_writer.writerow(row)
-    except (OSError, IOError) as err:
-        LOGGER.exception("Error writing CSV file " + csvfile)
-        raise err
-
-def get_globals(filename):
-    """
-    Write the CDMS global attributes to a text file. Additionally,
-    within the file there will be a description of where all the different
-    outputs go and how to best utlize this program.
-
-    Parameters
-    ----------
-    filename : str
-        The name of the original '.nc' input file.
-
-    """
-    x0 = "README / cdms_reader.py Program Use and Description:\n"
-    x1 = "\nThe cdms_reader.py program reads a CDMS netCDF (a NETCDF file with a matchIDs variable)\n"
-    x2 = "file into memory, assembles a list of matches of primary and secondary data\n"
-    x3 = "and optionally\n"
-    x4 = "output the matches to a CSV file. Each matched pair contains one primary\n"
-    x5 = "data record and one secondary data record.\n"
-    x6 = "\nBelow, this file wil list the global attributes of the .nc (NETCDF) file.\n"
-    x7 = "If you wish to see a full dump of the data from the .nc file,\n"
-    x8 = "please utilize the ncdump command from NETCDF (or look at the CSV file).\n"
-    try:
-        with Dataset(filename, "r", format="NETCDF4") as ncFile:
-            txtName = filename.replace(".nc", ".txt")
-            with open(txtName, "w") as txt:
-                txt.write(x0 + x1 +x2 +x3 + x4 + x5 + x6 + x7 + x8)
-                txt.write("\nGlobal Attributes:")
-                for x in ncFile.ncattrs():
-                    txt.write(f'\t :{x} = "{ncFile.getncattr(x)}" ;\n')
-
-
-    except (OSError, IOError) as err:
-        LOGGER.exception("Error reading netCDF file " + filename)
-        print("Error reading file!")
-        raise err
-
-def create_logs(user_option, logName):
-    """
-    Write the CDMS log information to a file. Additionally, the user may
-    opt to print this information directly to stdout, or discard it entirely.
-
-    Parameters
-    ----------
-    user_option : str
-        The result of the arg.log 's interpretation of
-        what option the user selected.
-    logName : str
-        The name of the log file we wish to write to,
-        assuming the user did not use the -l option.
-    """
-    if user_option == 'N':
-        print("** Note: No log was created **")
-
-
-    elif user_option == '1':
-        #prints the log contents to stdout
-        logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
-                            level=logging.INFO,
-                            datefmt='%Y-%m-%d %H:%M:%S',
-                            handlers=[
-                                logging.StreamHandler(sys.stdout)
-                            ])
-
-    else:
-        #prints log to a .log file
-        logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
-                            level=logging.INFO,
-                            datefmt='%Y-%m-%d %H:%M:%S',
-                            handlers=[
-                                logging.FileHandler(logName)
-                            ])
-        if user_option != 1 and user_option != 'Y':
-            print(f"** Bad usage of log option. Log will print to {logName} **")
-
-
-
-
-
-if __name__ == '__main__':
-    """
-    Execution:
-        python cdms_reader.py filename
-        OR
-        python3 cdms_reader.py filename
-        OR
-        python3 cdms_reader.py filename -c -g
-        OR
-        python3 cdms_reader.py filename --csv --meta
-
-    Note (For Help Try):
-        python3 cdms_reader.py -h
-        OR
-        python3 cdms_reader.py --help
-
-    """
-
-    u0 = '\n%(prog)s -h OR --help \n'
-    u1 = '%(prog)s filename -c -g\n%(prog)s filename --csv --meta\n'
-    u2 ='Use -l OR -l1 to modify destination of logs'
-    p = argparse.ArgumentParser(usage= u0 + u1 + u2)
-
-    #below block is to customize user options
-    p.add_argument('filename', help='CDMS netCDF file to read')
-    p.add_argument('-c', '--csv', nargs='?', const= 'Y', default='N',
-                   help='Use -c or --csv to retrieve CSV output')
-    p.add_argument('-g', '--meta', nargs='?', const='Y', default='N',
-                   help='Use -g or --meta to retrieve global attributes / metadata')
-    p.add_argument('-l', '--log', nargs='?', const='N', default='Y',
-                   help='Use -l or --log to AVOID creating log files, OR use -l1 to print to stdout/console')
-
-    #arguments are processed by the next line
-    args = p.parse_args()
-
-    logName = args.filename.replace(".nc", ".log")
-    create_logs(args.log, logName)
-
-    cdms_matches = assemble_matches(args.filename)
-
-    if args.csv == 'Y' :
-        matches_to_csv(cdms_matches, args.filename.replace(".nc",".csv"))
-
-    if args.meta == 'Y' :
-        get_globals(args.filename)
-
-
-
-
-
-
-
-
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import string
+from netCDF4 import Dataset, num2date
+import sys
+import datetime
+import csv
+from collections import OrderedDict
+import logging
+import xarray as xr
+
+
+#TODO: Get rid of numpy errors?
+#TODO: Update big SDAP README
+
+LOGGER = logging.getLogger("cdms_reader")
+
+def assemble_matches(filename):
+    """
+    Read a CDMS netCDF file and return a list of matches.
+
+    Parameters
+    ----------
+    filename : str
+        The CDMS netCDF file name.
+
+    Returns
+    -------
+    matches : list
+        List of matches. Each list element is a dictionary.
+        For match m, netCDF group GROUP (PrimaryData or SecondaryData), and
+        group variable VARIABLE:
+        matches[m][GROUP]['matchID']: MatchedRecords dimension ID for the match
+        matches[m][GROUP]['GROUPID']: GROUP dim dimension ID for the record
+        matches[m][GROUP][VARIABLE]: variable value
+    """
+
+    try:
+        # Open the netCDF file
+
+        with Dataset(filename, 'r') as cdms_nc:
+            # Check that the number of groups is consistent w/ the MatchedGroups
+            # dimension
+            assert len(cdms_nc.groups) == cdms_nc.dimensions['MatchedGroups'].size,\
+                ("Number of groups isn't the same as MatchedGroups dimension.")
+
+            matches = []
+            matched_records = cdms_nc.dimensions['MatchedRecords'].size
+
+            # Loop through the match IDs to assemble matches
+            for match in range(0, matched_records):
+                match_dict = OrderedDict()
+                # Grab the data from each platform (group) in the match
+                for group_num, group in enumerate(cdms_nc.groups):
+                    match_dict[group] = OrderedDict()
+                    match_dict[group]['matchID'] = match
+                    ID = cdms_nc.variables['matchIDs'][match][group_num]
+                    match_dict[group][group + 'ID'] = ID
+                    for var in cdms_nc.groups[group].variables.keys():
+                        match_dict[group][var] = cdms_nc.groups[group][var][ID]
+
+                    # Create a UTC datetime field from timestamp
+                    dt = num2date(match_dict[group]['time'],
+                                  cdms_nc.groups[group]['time'].units)
+                    match_dict[group]['datetime'] = dt
+                LOGGER.info(match_dict)
+                matches.append(match_dict)
+
+            return matches
+    except (OSError, IOError) as err:
+        LOGGER.exception("Error reading netCDF file " + filename)
+        raise err
+
+def return_matches_array(filename):
+    """
+    Read a CDMS netCDF file and return a list of matches in a more rudimentary format.
+
+    Parameters
+    ----------
+    filename : str
+        The CDMS netCDF file name.
+
+    Returns
+    -------
+    matches : list
+        List of matches.
+        For match m:
+        matches[m][0 - 8]: Contains either floats or arrays representing data for
+        PrimaryData and its associated SecondaryData.
+
+        Please refer to the code in this function as well as the README to obtain information
+        regarding the ordering of this data.
+ """ + + match_ids = xr.open_dataset(filename) + primary = xr.open_dataset(filename, group= "PrimaryData") + secondary = xr.open_dataset(filename, group= "SecondaryData") + + #set up primary arrays + matches = [[0.0, 0.0, 0.0, 0.0, [], [], [], [], []] for x in range(primary.sizes['dim'])] + + #set up secondary arrays + sec_lon = [] + sec_lat = [] + sec_time = [] + sec_speed = [] + sec_dir = [] + prev_x = 0 + + #load values into the arrays + matches[0][0] = float(primary.lon[0].values) + matches[0][1] = float(primary.lat[0].values) + matches[0][2] = float(primary.time[0].values) + matches[0][3] = float(primary.sea_surface_foundation_temperature[0].values) + + for x, y in match_ids.matchIDs.values: + if prev_x != int(x): + matches[prev_x][4] = (sec_lon) + matches[prev_x][5] = (sec_lat) + matches[prev_x][6] = (sec_time) + matches[prev_x][7] = (sec_speed) + matches[prev_x][8] = (sec_dir) + + matches[int(x)][0] = float(primary.lon[int(x)].values) + matches[int(x)][1] = float(primary.lat[int(x)].values) + matches[int(x)][2] = float(primary.time[int(x)].values) + matches[int(x)][3] = float(primary.sea_surface_foundation_temperature[int(x)].values) + + sec_lon = [] + sec_lat = [] + sec_time = [] + sec_speed = [] + sec_dir = [] + + sec_lon.append(float(secondary.lon[int(y)].values)) + sec_lat.append(float(secondary.lat[int(y)].values)) + sec_time.append(float(secondary.time[int(y)].values)) + sec_speed.append(float(secondary.wind_speed[int(y)].values)) + sec_dir.append(float(secondary.wind_to_direction[int(y)].values)) Review Comment: @alovett-COAPS This change to the reader appears to be specific just for the primary = sst and secondary = wind use case. This reader should be generic to support any variable(s). In the use case where secondary is an in situ source there can be also be multiple variables returned. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@sdap.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org