YARN-5167. Escape occurences of encodedValues. (Sangjin Lee via Varun Saxena)
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/2cab3fc0 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/2cab3fc0 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/2cab3fc0 Branch: refs/heads/YARN-2928 Commit: 2cab3fc03b80ae2e784acbd38a97d2e8b4cb221a Parents: 4df6d9b Author: Varun Saxena <varunsax...@apache.org> Authored: Mon Jun 6 09:39:59 2016 +0530 Committer: Vrushali <vrush...@twitter.com> Committed: Sun Jun 19 00:20:13 2016 -0700 ---------------------------------------------------------------------- .../storage/common/Separator.java | 123 ++++++++++++++----- .../storage/common/TestSeparator.java | 8 +- 2 files changed, 101 insertions(+), 30 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/2cab3fc0/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/storage/common/Separator.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/storage/common/Separator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/storage/common/Separator.java index 8a178db..5090b4d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/storage/common/Separator.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/storage/common/Separator.java @@ -20,12 +20,13 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; import java.util.List; +import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.hadoop.hbase.util.Bytes; /** - * Used to separate row qualifiers, column qualifiers and compount fields. + * Used to separate row qualifiers, column qualifiers and compound fields. */ public enum Separator { @@ -53,25 +54,39 @@ public enum Separator { */ TAB("\t", "%3$"); + // a reserved character that starts each of the encoded values and is encoded + // first in order to escape naturally occurring instances of encoded values + // although it can be expressed as an enum instance, we define them as private + // variables to hide it from callers + private static final String PERCENT = "%"; + private static final String PERCENT_ENCODED = "%9$"; + + private static final Pattern PERCENT_PATTERN = + Pattern.compile(PERCENT, Pattern.LITERAL); + private static final String PERCENT_REPLACEMENT = + Matcher.quoteReplacement(PERCENT); + + private static final Pattern PERCENT_ENCODED_PATTERN = + Pattern.compile(PERCENT_ENCODED, Pattern.LITERAL); + private static final String PERCENT_ENCODED_REPLACEMENT = + Matcher.quoteReplacement(PERCENT_ENCODED); + /** * The string value of this separator. */ private final String value; /** - * The URLEncoded version of this separator. - */ - private final String encodedValue; - - /** * The bye representation of value. */ private final byte[] bytes; - /** - * The value quoted so that it can be used as a safe regex. - */ - private final String quotedValue; + // pre-compiled patterns and quoted replacements for optimization + private final Pattern valuePattern; + private final String valueReplacement; + + private final Pattern encodedValuePattern; + private final String encodedValueReplacement; /** * Indicator for variable size of an individual segment in a split. The @@ -97,7 +112,6 @@ public enum Separator { */ private Separator(String value, String encodedValue) { this.value = value; - this.encodedValue = encodedValue; // validation if (value == null || value.length() == 0 || encodedValue == null @@ -107,7 +121,11 @@ public enum Separator { } this.bytes = Bytes.toBytes(value); - this.quotedValue = Pattern.quote(value); + this.valuePattern = Pattern.compile(value, Pattern.LITERAL); + this.valueReplacement = Matcher.quoteReplacement(value); + + this.encodedValuePattern = Pattern.compile(encodedValue, Pattern.LITERAL); + this.encodedValueReplacement = Matcher.quoteReplacement(encodedValue); } /** @@ -119,6 +137,13 @@ public enum Separator { /** * Used to make token safe to be used with this separator without collisions. + * It <em>must</em> be paired with {@link #decode(String)} for it to be + * decoded correctly. + * <p> + * If you need to encode a given string for multiple separators, + * {@link #encode(String, Separator...)} should be used over successive + * invocations of this method. It will result in a more compact version of the + * encoded value. * * @param token Token to be encoded. * @return the token with any occurrences of this separator URLEncoded. @@ -128,11 +153,29 @@ public enum Separator { // Nothing to replace return token; } - return token.replace(value, encodedValue); + // first encode the percent to escape naturally occurring encoded values + String escaped = encodePercent(token); + return encodeSingle(escaped, this); + } + + private static String replace(String token, Pattern pattern, + String replacement) { + return pattern.matcher(token).replaceAll(replacement); + } + + private static String encodeSingle(String token, Separator separator) { + return replace(token, separator.valuePattern, + separator.encodedValueReplacement); + } + + private static String encodePercent(String token) { + return replace(token, PERCENT_PATTERN, PERCENT_ENCODED_REPLACEMENT); } /** - * Decode the token encoded using {@link #encode}. + * Decode the token encoded using {@link #encode(String)}. It <em>must</em> be + * used for the result encoded with {@link #encode(String)} to be able to + * recover the original. * * @param token Token to be decoded. * @return the token with any occurrences of the encoded separator replaced by @@ -143,13 +186,30 @@ public enum Separator { // Nothing to replace return token; } - return token.replace(encodedValue, value); + String escaped = decodeSingle(token, this); + // decode percent to de-escape + return decodePercent(escaped); + } + + private static String decodeSingle(String token, Separator separator) { + return replace(token, separator.encodedValuePattern, + separator.valueReplacement); + } + + private static String decodePercent(String token) { + return replace(token, PERCENT_ENCODED_PATTERN, PERCENT_REPLACEMENT); } /** - * Encode the given separators in the token with their encoding equivalent. - * This means that when encoding is already present in the token itself, this - * is not a reversible process. See also {@link #decode(String, Separator...)} + * Encode the given separators in the token with their encoding equivalents. + * It <em>must</em> be paired with {@link #decode(byte[], Separator...)} or + * {@link #decode(String, Separator...)} with the same separators for it to be + * decoded correctly. + * <p> + * If you need to encode a given string for multiple separators, this form of + * encoding should be used over successive invocations of + * {@link #encode(String)}. It will result in a more compact version of the + * encoded value. * * @param token containing possible separators that need to be encoded. * @param separators to be encoded in the token with their URLEncoding @@ -158,22 +218,25 @@ public enum Separator { * separators encoded. */ public static byte[] encode(String token, Separator... separators) { - if (token == null) { + if (token == null || token.length() == 0) { return EMPTY_BYTES; } String result = token; + // first encode the percent to escape naturally occurring encoded values + result = encodePercent(token); for (Separator separator : separators) { if (separator != null) { - result = separator.encode(result); + result = encodeSingle(result, separator); } } return Bytes.toBytes(result); } /** - * Decode the given separators in the token with their decoding equivalent. - * This means that when encoding is already present in the token itself, this - * is not a reversible process. + * Decode the given separators in the token with their decoding equivalents. + * It <em>must</em> be used for the result encoded with + * {@link #encode(String, Separator...)} with the same separators to be able + * to recover the original. * * @param token containing possible separators that need to be encoded. * @param separators to be encoded in the token with their URLEncoding @@ -189,9 +252,10 @@ public enum Separator { } /** - * Decode the given separators in the token with their decoding equivalent. - * This means that when encoding is already present in the token itself, this - * is not a reversible process. + * Decode the given separators in the token with their decoding equivalents. + * It <em>must</em> be used for the result encoded with + * {@link #encode(String, Separator...)} with the same separators to be able + * to recover the original. * * @param token containing possible separators that need to be encoded. * @param separators to be encoded in the token with their URLEncoding @@ -206,10 +270,11 @@ public enum Separator { String result = token; for (Separator separator : separators) { if (separator != null) { - result = separator.decode(result); + result = decodeSingle(result, separator); } } - return result; + // decode percent to de-escape + return decodePercent(result); } /** @@ -309,7 +374,7 @@ public enum Separator { public Collection<String> splitEncoded(String compoundValue) { List<String> result = new ArrayList<String>(); if (compoundValue != null) { - for (String val : compoundValue.split(quotedValue)) { + for (String val : valuePattern.split(compoundValue)) { result.add(decode(val)); } } http://git-wip-us.apache.org/repos/asf/hadoop/blob/2cab3fc0/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/test/java/org/apache/hadoop/yarn/server/timelineservice/storage/common/TestSeparator.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/test/java/org/apache/hadoop/yarn/server/timelineservice/storage/common/TestSeparator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/test/java/org/apache/hadoop/yarn/server/timelineservice/storage/common/TestSeparator.java index 0cda97c..27750f3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/test/java/org/apache/hadoop/yarn/server/timelineservice/storage/common/TestSeparator.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/test/java/org/apache/hadoop/yarn/server/timelineservice/storage/common/TestSeparator.java @@ -34,7 +34,7 @@ public class TestSeparator { private static String villain = "Dr. Heinz Doofenshmirtz"; private static String special = - ". * | ? + \t ( ) [ ] { } ^ $ \\ \""; + ". * | ? + \t ( ) [ ] { } ^ $ \\ \" %"; /** * @@ -81,6 +81,12 @@ public class TestSeparator { Separator.VALUES, Separator.SPACE); } + @Test + public void testEncodedValues() { + testEncodeDecode("Double-escape %2$ and %9$ or %%2$ or %%3$, nor %%%2$" + + "= no problem!", + Separator.QUALIFIERS, Separator.VALUES, Separator.SPACE, Separator.TAB); + } @Test public void testSplits() { --------------------------------------------------------------------- To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org For additional commands, e-mail: common-commits-h...@hadoop.apache.org