Re: [PR] feat: dataframe string formatter [datafusion-python]

2025-06-25 Thread via GitHub


timsaucer merged PR #1170:
URL: https://github.com/apache/datafusion-python/pull/1170


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] feat: dataframe string formatter [datafusion-python]

2025-06-25 Thread via GitHub


timsaucer commented on code in PR #1170:
URL: https://github.com/apache/datafusion-python/pull/1170#discussion_r2166481205


##
python/datafusion/dataframe_formatter.py:
##
@@ -0,0 +1,739 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""HTML formatting utilities for DataFusion DataFrames."""
+
+from __future__ import annotations
+
+from typing import (
+Any,
+Callable,
+Optional,
+Protocol,
+runtime_checkable,
+)
+
+from datafusion._internal import DataFrame as DataFrameInternal
+
+
+def _validate_positive_int(value: Any, param_name: str) -> None:
+"""Validate that a parameter is a positive integer.
+
+Args:
+value: The value to validate
+param_name: Name of the parameter (used in error message)
+
+Raises:
+ValueError: If the value is not a positive integer
+"""
+if not isinstance(value, int) or value <= 0:
+msg = f"{param_name} must be a positive integer"
+raise ValueError(msg)
+
+
+def _validate_bool(value: Any, param_name: str) -> None:
+"""Validate that a parameter is a boolean.
+
+Args:
+value: The value to validate
+param_name: Name of the parameter (used in error message)
+
+Raises:
+TypeError: If the value is not a boolean
+"""
+if not isinstance(value, bool):
+msg = f"{param_name} must be a boolean"
+raise TypeError(msg)
+
+
+@runtime_checkable
+class CellFormatter(Protocol):
+"""Protocol for cell value formatters."""
+
+def __call__(self, value: Any) -> str:
+"""Format a cell value to string representation."""
+...
+
+
+@runtime_checkable
+class StyleProvider(Protocol):
+"""Protocol for HTML style providers."""
+
+def get_cell_style(self) -> str:
+"""Get the CSS style for table cells."""
+...
+
+def get_header_style(self) -> str:
+"""Get the CSS style for header cells."""
+...
+
+
+class DefaultStyleProvider:
+"""Default implementation of StyleProvider."""
+
+def get_cell_style(self) -> str:
+"""Get the CSS style for table cells.
+
+Returns:
+CSS style string
+"""
+return (
+"border: 1px solid black; padding: 8px; text-align: left; "
+"white-space: nowrap;"
+)
+
+def get_header_style(self) -> str:
+"""Get the CSS style for header cells.
+
+Returns:
+CSS style string
+"""
+return (
+"border: 1px solid black; padding: 8px; text-align: left; "
+"background-color: #f2f2f2; white-space: nowrap; min-width: fit-content; "
+"max-width: fit-content;"
+)
+
+
+class DataFrameHtmlFormatter:
+"""Configurable HTML formatter for DataFusion DataFrames.
+
+This class handles the HTML rendering of DataFrames for display in
+Jupyter notebooks and other rich display contexts.
+
+This class supports extension through composition. Key extension points:
+- Provide a custom StyleProvider for styling cells and headers
+- Register custom formatters for specific types
+- Provide custom cell builders for specialized cell rendering
+
+Args:
+max_cell_length: Maximum characters to display in a cell before truncation
+max_width: Maximum width of the HTML table in pixels
+max_height: Maximum height of the HTML table in pixels
+max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB)
+min_rows_display: Minimum number of rows to display
+repr_rows: Default number of rows to display in repr output
+enable_cell_expansion: Whether to add expand/collapse buttons for long cell
+  values
+custom_css: Additional CSS to include in the HTML output
+show_truncation_message: Whether to display a message when data is truncated
+style_provider: Custom provider for cell and header styles
+use_shared_styles: Whether to load styles and scripts only once per notebook
+  session
+"""
+
+# Class variable to track if styles have been loaded in the notebook
+_styles_loaded = False
+
+def __init__(
+self,
+max_cell_length: 

Re: [PR] feat: dataframe string formatter [datafusion-python]

2025-06-24 Thread via GitHub


kosiew commented on code in PR #1170:
URL: https://github.com/apache/datafusion-python/pull/1170#discussion_r2165554718


##
python/datafusion/dataframe.py:
##
@@ -1112,3 +,17 @@ def fill_null(self, value: Any, subset: list[str] | None = None) -> DataFrame:
 - For columns not in subset, the original column is kept unchanged
 """
 return DataFrame(self.df.fill_null(value, subset))
+
+@staticmethod
+def default_str_repr(
+batches: list[pa.RecordBatch],
+schema: pa.Schema,
+has_more: bool,
+table_uuid: str | None = None,
+) -> str:
+"""Return the default string representation of a DataFrame.
+
+This method is used by the default formatter and implemented in Rust for
+performance reasons.
+"""
+return DataFrameInternal.default_str_repr(batches, schema, has_more, table_uuid)

Review Comment:
   default_str_repr is at the bottom of the class.
   
   It might be clearer to place it alongside other repr/formatter methods (__repr__, _repr_html_) for cohesion.



##
python/datafusion/dataframe_formatter.py:
##
@@ -0,0 +1,739 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""HTML formatting utilities for DataFusion DataFrames."""
+
+from __future__ import annotations
+
+from typing import (
+Any,
+Callable,
+Optional,
+Protocol,
+runtime_checkable,
+)
+
+from datafusion._internal import DataFrame as DataFrameInternal
+
+
+def _validate_positive_int(value: Any, param_name: str) -> None:
+"""Validate that a parameter is a positive integer.
+
+Args:
+value: The value to validate
+param_name: Name of the parameter (used in error message)
+
+Raises:
+ValueError: If the value is not a positive integer
+"""
+if not isinstance(value, int) or value <= 0:
+msg = f"{param_name} must be a positive integer"
+raise ValueError(msg)
+
+
+def _validate_bool(value: Any, param_name: str) -> None:
+"""Validate that a parameter is a boolean.
+
+Args:
+value: The value to validate
+param_name: Name of the parameter (used in error message)
+
+Raises:
+TypeError: If the value is not a boolean
+"""
+if not isinstance(value, bool):
+msg = f"{param_name} must be a boolean"
+raise TypeError(msg)
+
+
+@runtime_checkable
+class CellFormatter(Protocol):
+"""Protocol for cell value formatters."""
+
+def __call__(self, value: Any) -> str:
+"""Format a cell value to string representation."""
+...
+
+
+@runtime_checkable
+class StyleProvider(Protocol):
+"""Protocol for HTML style providers."""
+
+def get_cell_style(self) -> str:
+"""Get the CSS style for table cells."""
+...
+
+def get_header_style(self) -> str:
+"""Get the CSS style for header cells."""
+...
+
+
+class DefaultStyleProvider:
+"""Default implementation of StyleProvider."""
+
+def get_cell_style(self) -> str:
+"""Get the CSS style for table cells.
+
+Returns:
+CSS style string
+"""
+return (
+"border: 1px solid black; padding: 8px; text-align: left; "
+"white-space: nowrap;"
+)
+
+def get_header_style(self) -> str:
+"""Get the CSS style for header cells.
+
+Returns:
+CSS style string
+"""
+return (
+"border: 1px solid black; padding: 8px; text-align: left; "
+"background-color: #f2f2f2; white-space: nowrap; min-width: fit-content; "
+"max-width: fit-content;"
+)
+
+
+class DataFrameHtmlFormatter:
+"""Configurable HTML formatter for DataFusion DataFrames.
+
+This class handles the HTML rendering of DataFrames for display in
+Jupyter notebooks and other rich display contexts.
+
+This class supports extension through composition. Key extension points:
+- Provide a custom StyleProvider for styling cells and headers
+- Register custom formatters for specific types
+- Provide custom cell builders for specialized cell rendering
+
+Args:
+max_cell_length: Maximum characters to displ