reductionista commented on a change in pull request #462: DL: Add asymmetric
cluster support for fit and evaluate
URL: https://github.com/apache/madlib/pull/462#discussion_r352939750
##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in
##########
@@ -266,3 +281,47 @@ def create_summary_view(module_name, model_table,
mst_key):
WHERE mst_key = {mst_key}
""".format(**locals()))
return tmp_view_summary
+
+def get_gpus_per_seg(schema_madlib, segments_per_host, module_name):
+
+ if is_platform_pg():
+ gpus = GPUInfoFunctions.get_gpu_info_from_tensorflow()
+ return [len(gpus)]
+ else:
+ gpu_table_name = unique_string(desp = 'gpu_table')
+ gpu_table_query = """
+ SELECT {schema_madlib}.gpu_configuration('{gpu_table_name}')
+ """.format(**locals())
+ plpy.execute(gpu_table_query)
+ gpu_query = """
+ SELECT hostname, count(*) AS count FROM {gpu_table_name} GROUP BY
hostname
+ """.format(**locals())
+ gpu_query_result = plpy.execute(gpu_query)
+
+ if not gpu_query_result:
+ plpy.error("{0} error: No GPUs configured on
hosts.".format(module_name))
+
+ host_dict = {}
+ for i in gpu_query_result:
+ host_dict[i['hostname']] = int(i['count'])
+
+ seg_query = """
+ SELECT hostname, content AS segment_id
+ FROM gp_segment_configuration
+ WHERE content != -1 AND role = 'p'
+ """
+ seg_query_result = plpy.execute(seg_query)
+
+ gpus_per_seg = [0] * len(seg_query_result)
+ warning_flag = True
+ for i in seg_query_result:
+ if i['hostname'] in host_dict.keys():
+ gpus_per_seg[i['segment_id']] = host_dict[i['hostname']]
+ if 0 < gpus_per_seg[i['segment_id']] < segments_per_host and
warning_flag:
+ plpy.warning(
+ 'The number of GPUs per segment host is less than the
number of '
+ 'segments per segment host. When different segments share
the '
+ 'same GPU, this may fail in some scenarios. The current '
+ 'recommended configuration is to have 1 GPU available per
segment.')
+ warning_flag = False
+ return gpus_per_seg
Review comment:
Here's a good example of why I find it misleading:
`gpus_per_seg[seg_id]` is the number of GPUs on that segment's host. We're
checking whether this is less than the number of segments per host,
i.e., `gpus_per_host < segments_per_host`. But the bottom line is, we're calling
the number of GPUs per host "gpus_per_segment".
To the user, we refer to the GPUs per host as "number of GPUs per segment
host", which is more accurate (we could just say host instead of segment host,
but I think either is fine).
Later we take `gpus_per_seg` and divide it by the number of segments per host to
get the actual number of GPUs per segment (which, per the user message, is
recommended to be 1).
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services