TheNeuralBit commented on a change in pull request #12882:
URL: https://github.com/apache/beam/pull/12882#discussion_r492330184



##########
File path: sdks/python/apache_beam/dataframe/schemas.py
##########
@@ -55,17 +159,149 @@ def expand(self, pcoll):
         lambda batch: pd.DataFrame.from_records(batch, columns=columns))
 
 
-def _make_empty_series(name, typ):
-  try:
-    return pd.Series(name=name, dtype=typ)
-  except TypeError:
-    raise TypeError("Unable to convert type '%s' for field '%s'" % (name, typ))
+def _make_proxy_series(name, typehint):
+  # Default to np.object. This is lossy, we won't be able to recover the type
+  # at the output.
+  dtype = BEAM_TO_PANDAS.get(typehint, np.object)
+
+  return pd.Series(name=name, dtype=dtype)
 
 
 def generate_proxy(element_type):
   # type: (type) -> pd.DataFrame
-  return pd.DataFrame({
-      name: _make_empty_series(name, typ)
-      for name,
-      typ in named_fields_from_element_type(element_type)
-  })
+
+  """ Generate a proxy pandas object for the given PCollection element_type.
+
+  Currently only supports generating a DataFrame proxy from a schema-aware
+  PCollection."""
+  fields = named_fields_from_element_type(element_type)
+  return pd.DataFrame(
+      {name: _make_proxy_series(name, typehint)
+       for name, typehint in fields},
+      columns=[name for name, _ in fields])
+
+
+def element_type_from_proxy(proxy):
+  # type: (pd.DataFrame) -> type
+
+  """ Generate an element_type for an element-wise PCollection from a proxy
+  pandas object. Currently only supports converting the element_type for
+  a schema-aware PCollection to a proxy DataFrame.
+
+  Currently only supports generating a DataFrame proxy from a schema-aware
+  PCollection."""
+  indices = [] if proxy.index.names == (None, ) else [

Review comment:
       I added an `included_indexes` option on `DataframeTransform`,
`to_pcollection`, and `UnbatchPandas`. It raises an exception if it is used
when the index names are not unique or when any index level is unnamed. PTAL.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to