This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow-steward.git


The following commit(s) were added to refs/heads/main by this push:
     new e0a0508  fix(privacy-llm-redactor): correct --field help text and force UTF-8 mapping reads (#231)
e0a0508 is described below

commit e0a0508ffee6fa390fe4a1948fc9c638d6fba29e
Author: André Ahlert <[email protected]>
AuthorDate: Tue May 19 19:49:43 2026 -0300

    fix(privacy-llm-redactor): correct --field help text and force UTF-8 mapping reads (#231)
    
    Two critical correctness bugs in the redactor package.
    
    1. The --field help text listed a type name "reporter" and code "R"
       that the parser does not accept (valid names come from TYPE_CODES:
       name/email/phone/ip/handle/address, codes N/E/P/IP/H/A). A user
       copying the documented form got SystemExit and their PII flowed to
       the LLM unredacted. Help text now lists the real names and codes.
    
    2. load_mapping read the mapping file with the locale-default encoding
       while save_mapping_atomic writes UTF-8. On a non-UTF-8 host this
       corrupts non-ASCII PII values (accented names, IDN domains) on the
       round-trip, so pii-reveal substitutes wrong text. load_mapping now
       reads with encoding="utf-8".
    
    Adds a regression test for each fix.
---
 tools/privacy-llm/redactor/src/redactor/mapping.py |  2 +-
 tools/privacy-llm/redactor/src/redactor/redact.py  |  2 +-
 tools/privacy-llm/redactor/tests/test_mapping.py   | 17 +++++++++++++++++
 tools/privacy-llm/redactor/tests/test_redact.py    | 19 ++++++++++++++++++-
 4 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/tools/privacy-llm/redactor/src/redactor/mapping.py b/tools/privacy-llm/redactor/src/redactor/mapping.py
index 6248d47..2926d75 100644
--- a/tools/privacy-llm/redactor/src/redactor/mapping.py
+++ b/tools/privacy-llm/redactor/src/redactor/mapping.py
@@ -90,7 +90,7 @@ def load_mapping(path: pathlib.Path) -> dict[str, Entry]:
     """
     if not path.exists():
         return {}
-    raw = json.loads(path.read_text())
+    raw = json.loads(path.read_text(encoding="utf-8"))
     if not isinstance(raw, dict):
         raise ValueError(f"{path}: expected a JSON object at the top level")
     version = raw.get("version")
diff --git a/tools/privacy-llm/redactor/src/redactor/redact.py b/tools/privacy-llm/redactor/src/redactor/redact.py
index eccd6f2..6fb7d37 100644
--- a/tools/privacy-llm/redactor/src/redactor/redact.py
+++ b/tools/privacy-llm/redactor/src/redactor/redact.py
@@ -130,7 +130,7 @@ def main(argv: list[str] | None = None) -> int:
         help=(
             "PII to redact, declared as type:value. "
             "Repeat for each field. Type is one of: "
-            "reporter, email, phone, ip, handle, address (or codes R, E, P, IP, H, A)."
+            "name, email, phone, ip, handle, address (or codes N, E, P, IP, H, A)."
         ),
     )
     parser.add_argument(
diff --git a/tools/privacy-llm/redactor/tests/test_mapping.py b/tools/privacy-llm/redactor/tests/test_mapping.py
index 8afec1a..f9d8b55 100644
--- a/tools/privacy-llm/redactor/tests/test_mapping.py
+++ b/tools/privacy-llm/redactor/tests/test_mapping.py
@@ -148,6 +148,23 @@ def test_save_and_load_round_trip(tmp_path: pathlib.Path):
     assert loaded == mapping
 
 
+def test_load_round_trips_non_ascii_values(tmp_path: pathlib.Path):
+    """Non-ASCII PII values must survive a save/load round-trip.
+
+    Regression: ``load_mapping`` read the file with the locale-default
+    encoding while ``save_mapping_atomic`` writes UTF-8, corrupting
+    non-ASCII values (accented names, IDN domains) on non-UTF-8 hosts.
+    """
+    path = tmp_path / "pii.json"
+    mapping: dict[str, Entry] = {}
+    upsert(mapping, "N", "José Müller")
+    upsert(mapping, "E", "renée@exámple.com")
+
+    save_mapping_atomic(path, mapping)
+    loaded = load_mapping(path)
+    assert loaded == mapping
+
+
 def test_save_creates_parent_dir(tmp_path: pathlib.Path):
     path = tmp_path / "deeper" / "nested" / "pii.json"
     mapping: dict[str, Entry] = {}
diff --git a/tools/privacy-llm/redactor/tests/test_redact.py b/tools/privacy-llm/redactor/tests/test_redact.py
index 8527d74..7feb317 100644
--- a/tools/privacy-llm/redactor/tests/test_redact.py
+++ b/tools/privacy-llm/redactor/tests/test_redact.py
@@ -77,6 +77,23 @@ def test_parse_field_rejects_empty_value():
         redact.parse_field("name:")
 
 
+def test_field_help_text_lists_real_type_names(monkeypatch):
+    """The ``--field`` help must name types the parser accepts.
+
+    Regression: the help listed ``reporter`` / code ``R``, neither of
+    which exists. A user copying the help got ``SystemExit`` and their
+    PII flowed to the LLM unredacted.
+    """
+    stdout = io.StringIO()
+    monkeypatch.setattr("sys.stdout", stdout)
+    with pytest.raises(SystemExit):
+        redact.main(["--help"])
+    # argparse wraps the help line; collapse whitespace before matching.
+    help_text = " ".join(stdout.getvalue().split())
+    assert "reporter" not in help_text
+    assert "name, email, phone, ip, handle, address" in help_text
+
+
 # -- end-to-end redaction ------------------------------------------------
 
 
@@ -102,7 +119,7 @@ def test_redact_persists_mapping(mapping_path, monkeypatch):
     )
     assert rc == 0
     mapping = load_mapping(mapping_path)
-    # Exactly one entry, of type reporter, value "Jane Smith".
+    # Exactly one entry, of type name, value "Jane Smith".
     assert len(mapping) == 1
     [entry] = mapping.values()
     assert entry.type == "name"

Reply via email to