Fokko commented on code in PR #2291:
URL: https://github.com/apache/iceberg-python/pull/2291#discussion_r2286101986


##########
pyiceberg/io/pyarrow.py:
##########
@@ -381,21 +381,38 @@ def to_input_file(self) -> PyArrowFile:
 
 class PyArrowFileIO(FileIO):
     fs_by_scheme: Callable[[str, Optional[str]], FileSystem]
+    config: Config
 
     def __init__(self, properties: Properties = EMPTY_DICT):
         self.fs_by_scheme: Callable[[str, Optional[str]], FileSystem] = 
lru_cache(self._initialize_fs)
+        self.config = Config()
         super().__init__(properties=properties)
 
     @staticmethod
-    def parse_location(location: str) -> Tuple[str, str, str]:
-        """Return the path without the scheme."""
+    def parse_location(location: str, config: Config) -> Tuple[str, str, str]:
+        """Return (scheme, netloc, path) for the given location.
+
+        Uses environment variables DEFAULT_SCHEME and DEFAULT_NETLOC
+        if scheme/netloc are missing.
+        """
         uri = urlparse(location)
-        if not uri.scheme:
-            return "file", uri.netloc, os.path.abspath(location)
-        elif uri.scheme in ("hdfs", "viewfs"):
-            return uri.scheme, uri.netloc, uri.path
+
+        # Load defaults from environment
+        default_scheme = config.get_str("default-scheme") or "file"
+        default_netloc = config.get_str("default-netloc") or ""
+

Review Comment:
   Thanks for sticking with us here, @mccormickt12. I think there is some 
miscommunication.
   
   How are you opening up the catalog? `load_catalog` is the recommended way of 
doing this: https://py.iceberg.apache.org/api/
   
   Let's consider the following `parse_location`:
   
   ```python
       def parse_location(self, location: str) -> Tuple[str, str, str]:
           """Return (scheme, netloc, path) for the given location.
   
           Uses the default-scheme and default-netloc properties
           if scheme/netloc are missing.
           """
           uri = urlparse(location)
   
           # Apply logic
           scheme = uri.scheme or self.properties.get("default-scheme")
           netloc = uri.netloc or self.properties.get("default-netloc")
   
           if scheme in ("hdfs", "viewfs"):
               return scheme, netloc, uri.path
           else:
               # For non-HDFS URIs, include netloc in the path if present
               path = uri.path if uri.scheme else os.path.abspath(location)
               if netloc and not path.startswith(netloc):
                   path = f"{netloc}{path}"
               return scheme, netloc, path
   ```
   
   You can inject the `properties` through:
   
   ```python
   load_catalog('default', properties={
       'default-scheme': 'hdfs',
       'default-netloc': 
'ltx1-yugioh-cluster01.linkfs.prod-ltx1.atd.prod.linkedin.com:9000',
   })
   ```
   
   If you use `load_catalog`, it will also pick up the configuration and the 
environment variables:
   
   ```yaml
   catalog:
     default:
       default-scheme: hdfs
       default-netloc: ltx1-yugioh-cluster01.linkfs.prod-ltx1.atd.prod.linkedin.com:9000
   ```
   
   ```sh
   export PYICEBERG_CATALOG__DEFAULT__DEFAULT_SCHEME=hdfs
   ```
   
   Or use the `FileIO` directly:
   
   ```python
   PyArrowFileIO(properties={
       'default-scheme': 'hdfs',
       'default-netloc': 
'ltx1-yugioh-cluster01.linkfs.prod-ltx1.atd.prod.linkedin.com:9000',
   })
   ```
   
   What do you think? Does this align with the way you're using PyIceberg?
   
   



##########
pyiceberg/io/pyarrow.py:
##########
@@ -381,21 +381,38 @@ def to_input_file(self) -> PyArrowFile:
 
 class PyArrowFileIO(FileIO):
     fs_by_scheme: Callable[[str, Optional[str]], FileSystem]
+    config: Config
 
     def __init__(self, properties: Properties = EMPTY_DICT):
         self.fs_by_scheme: Callable[[str, Optional[str]], FileSystem] = 
lru_cache(self._initialize_fs)
+        self.config = Config()
         super().__init__(properties=properties)
 
     @staticmethod
-    def parse_location(location: str) -> Tuple[str, str, str]:
-        """Return the path without the scheme."""
+    def parse_location(location: str, config: Config) -> Tuple[str, str, str]:
+        """Return (scheme, netloc, path) for the given location.
+
+        Uses environment variables DEFAULT_SCHEME and DEFAULT_NETLOC
+        if scheme/netloc are missing.
+        """
         uri = urlparse(location)
-        if not uri.scheme:
-            return "file", uri.netloc, os.path.abspath(location)
-        elif uri.scheme in ("hdfs", "viewfs"):
-            return uri.scheme, uri.netloc, uri.path
+
+        # Load defaults from environment
+        default_scheme = config.get_str("default-scheme") or "file"
+        default_netloc = config.get_str("default-netloc") or ""
+

Review Comment:
   Or, in the case of a `StaticTable`:
   
   ```python
   static_table = StaticTable.from_metadata(
       
"hdfs://warehouse/wh/nyc.db/taxis/metadata/00002-6ea51ce3-62aa-4197-9cf8-43d07c3440ca.metadata.json",
       properties={
           'default-scheme': 'hdfs',
           'default-netloc': 
'ltx1-yugioh-cluster01.linkfs.prod-ltx1.atd.prod.linkedin.com:9000',
       }
   )
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to