Re: [PR] feat: add length unit support in FileSystem limits [commons-io]

via GitHub Sun, 07 Sep 2025 01:46:08 -0700


ppkarwasz commented on code in PR #781:
URL: https://github.com/apache/commons-io/pull/781#discussion_r2328576906



##########
src/main/java/org/apache/commons/io/FileSystem.java:
##########
@@ -624,75 +631,102 @@ CharSequence trimExtension(final CharSequence cs) {
         return index < 0 ? cs : cs.subSequence(0, index);
     }
 
-    private boolean isLegalFileLength(final CharSequence candidate, final 
Charset charset) {
-        if (candidate == null || candidate.length() == 0) {
-            return false;
-        }
-        if (lengthUnit == LengthUnit.CHARS) {
-            return candidate.length() <= getMaxFileNameLength();
-        }
-        final CharsetEncoder encoder = charset.newEncoder();
-        try {
-            final ByteBuffer buffer = 
encoder.encode(CharBuffer.wrap(candidate));
-            return buffer.remaining() <= getMaxFileNameLength();
-        } catch (CharacterCodingException e) {
-            // If we can't encode, it's not legal
-            return false;
-        }
-    }
+    /**
+     * Strategy for measuring and truncating file or path names in different 
units.
+     * Implementations measure length and can truncate to a specified limit.
+     */
+    enum NameLengthStrategy {
+        /** Length measured as encoded bytes. */
+        BYTES {
+            @Override
+            int getLength(final CharSequence value, final Charset charset) {
+                final CharsetEncoder enc = charset.newEncoder()
+                        .onMalformedInput(CodingErrorAction.REPORT)
+                        .onUnmappableCharacter(CodingErrorAction.REPORT);
+                try {
+                    return enc.encode(CharBuffer.wrap(value)).remaining();
+                } catch (CharacterCodingException e) {
+                    // Unencodable ⇒ does not fit any byte limit.
+                    return Integer.MAX_VALUE;
+                }
+            }
 
-    CharSequence truncateFileName(final CharSequence candidate, final Charset 
charset) {
-        final int maxFileNameLength = getMaxFileNameLength();
-        // Character-based limit: simple substring if needed.
-        if (lengthUnit == LengthUnit.CHARS) {
-            return candidate.length() <= maxFileNameLength ? candidate : 
candidate.subSequence(0, maxFileNameLength);
-        }
+            @Override
+            CharSequence truncate(final CharSequence value, final int limit, 
final Charset charset) {
+                final CharsetEncoder encoder = charset.newEncoder()
+                        .onMalformedInput(CodingErrorAction.REPORT)
+                        .onUnmappableCharacter(CodingErrorAction.REPORT);
 
-        // Byte-based limit
-        return truncateByBytes(candidate, charset, maxFileNameLength);
-    }
+                if (!encoder.canEncode(value)) {
+                    throw new IllegalArgumentException(
+                            "The value " + value + " cannot be encoded using " 
+ charset.name());
+                }
 
-    static CharSequence truncateByBytes(final CharSequence candidate, final 
Charset charset, final int maxBytes) {
-        // Byte-based limit
-        final CharsetEncoder encoder = charset.newEncoder()
-                .onMalformedInput(CodingErrorAction.REPORT)
-                .onUnmappableCharacter(CodingErrorAction.REPORT);
+                // Fast path: if even the worst-case expansion fits, we're 
done.
+                if (value.length() <= Math.floor(limit / 
encoder.maxBytesPerChar())) {
+                    return value;
+                }
 
-        if (!encoder.canEncode(candidate)) {
-            throw new IllegalArgumentException(
-                    "File name contains characters that cannot be encoded with 
charset " + charset.name());
-        }
+                // Slow path: encode into a fixed-size byte buffer.
+                final ByteBuffer out = ByteBuffer.allocate(limit);
+                final CharBuffer in = CharBuffer.wrap(value);
 
-        // Fast path: if even the worst-case expansion fits, we're done.
-        if (candidate.length() <= Math.floor(maxBytes / 
encoder.maxBytesPerChar())) {
-            return candidate;
-        }
+                // Encode until the first character that would exceed the byte 
budget.
+                final CoderResult cr = encoder.encode(in, out, true);
+
+                if (cr.isUnderflow()) {
+                    // The entire value fits within the byte limit.
+                    return value;
+                }
 
-        // Slow path: encode into a fixed-size byte buffer.
-        final ByteBuffer out = ByteBuffer.allocate(maxBytes);
-        final CharBuffer in = CharBuffer.wrap(candidate);
+                // We ran out of space mid-encode: truncate BEFORE the 
offending character.
+                return value.subSequence(0, in.position());
+            }
+        },
 
-        // Encode until the first character that would exceed the byte budget.
-        final CoderResult cr = encoder.encode(in, out, true);
+        /** Length measured as UTF-16 code units (i.e., {@code 
CharSequence.length()}). */
+        UTF16_CHARS {
+            @Override
+            int getLength(final CharSequence value, final Charset charset) {
+                return value.length();
+            }
 
-        if (cr.isUnderflow()) {
-            // Entire candidate fit within maxFileNameLength bytes.
-            return candidate;
+            @Override
+            CharSequence truncate(final CharSequence value, final int limit, 
final Charset charset) {
+                return value.length() <= limit ? value : value.subSequence(0, 
limit);
+            }
+        };
+
+        /**
+         * Gets the measured length in this strategy’s unit.
+         *
+         * @param value The value to measure, not null.
+         * @param charset The charset to use when measuring in bytes.
+         * @return The length in this strategy’s unit.
+         */
+        abstract int getLength(CharSequence value, Charset charset);
+
+        /**
+         * Tests if the measured length is less than or equal to the {@code limit}.
+         *
+         * @param value The value to measure, not null.
+         * @param limit The limit to compare to.
+         * @param charset The charset to use when measuring in bytes.
+         * @return {@code true} if the measured length is less than or equal to the 
{@code limit}, {@code false} otherwise.
+         */
+        final boolean isWithinLimit(final CharSequence value, final int limit, 
final Charset charset) {
+            return getLength(value, charset) <= limit;
         }
 
-        // We ran out of space mid-encode: truncate BEFORE the offending 
character.
-        return candidate.subSequence(0, in.position());
+        /**
+         * Truncates to {@code limit} in this strategy’s unit (no-op if 
already within limit).
+         *
+         * @param value The value to truncate, not null.
+         * @param limit The limit to truncate to.
+         * @param charset The charset to use when measuring in bytes.
+         * @return The truncated value, not null.
+         */
+        abstract CharSequence truncate(CharSequence value, int limit, Charset 
charset);

Review Comment:
   Good point, I will fix it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat: add length unit support in FileSystem limits [commons-io]

Reply via email to