Repository: spark
Updated Branches:
  refs/heads/branch-2.3 b8dbfcc57 -> 50cdb4138


[SPARK-24542][SQL] UDF series UDFXPathXXXX allow users to pass carefully 
crafted XML to access arbitrary files

## What changes were proposed in this pull request?

UDF series UDFXPathXXXX allow users to pass carefully crafted XML to access 
arbitrary files. Spark does not have built-in access control. When users use 
the external access control library, users might bypass them and access the 
file contents.

This PR basically patches the Hive fix to Apache Spark. 
https://issues.apache.org/jira/browse/HIVE-18879

## How was this patch tested?

A unit test case

Author: Xiao Li <gatorsm...@gmail.com>

Closes #21549 from gatorsmile/xpathSecurity.

(cherry picked from commit 9a75c18290fff7d116cf88a44f9120bf67d8bd27)
Signed-off-by: Wenchen Fan <wenc...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/50cdb413
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/50cdb413
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/50cdb413

Branch: refs/heads/branch-2.3
Commit: 50cdb4138e5cb0e0d15f739db8066f3ea86ef037
Parents: b8dbfcc
Author: Xiao Li <gatorsm...@gmail.com>
Authored: Mon Jun 18 20:17:04 2018 -0700
Committer: Wenchen Fan <wenc...@databricks.com>
Committed: Mon Jun 18 20:17:32 2018 -0700

----------------------------------------------------------------------
 .../catalyst/expressions/xml/UDFXPathUtil.java  | 28 +++++++++++++++++++-
 .../expressions/xml/UDFXPathUtilSuite.scala     | 21 +++++++++++++++
 .../expressions/xml/XPathExpressionSuite.scala  |  5 ++--
 3 files changed, 51 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/50cdb413/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtil.java
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtil.java
 
b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtil.java
index d224332..023ec13 100644
--- 
a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtil.java
+++ 
b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtil.java
@@ -21,6 +21,9 @@ import java.io.IOException;
 import java.io.Reader;
 
 import javax.xml.namespace.QName;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.xpath.XPath;
 import javax.xml.xpath.XPathConstants;
 import javax.xml.xpath.XPathExpression;
@@ -37,9 +40,15 @@ import org.xml.sax.InputSource;
  * This is based on Hive's UDFXPathUtil implementation.
  */
 public class UDFXPathUtil {
+  public static final String SAX_FEATURE_PREFIX = 
"http://xml.org/sax/features/";;
+  public static final String EXTERNAL_GENERAL_ENTITIES_FEATURE = 
"external-general-entities";
+  public static final String EXTERNAL_PARAMETER_ENTITIES_FEATURE = 
"external-parameter-entities";
+  private DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+  private DocumentBuilder builder = null;
   private XPath xpath = XPathFactory.newInstance().newXPath();
   private ReusableStringReader reader = new ReusableStringReader();
   private InputSource inputSource = new InputSource(reader);
+
   private XPathExpression expression = null;
   private String oldPath = null;
 
@@ -65,14 +74,31 @@ public class UDFXPathUtil {
       return null;
     }
 
+    if (builder == null){
+      try {
+        initializeDocumentBuilderFactory();
+        builder = dbf.newDocumentBuilder();
+      } catch (ParserConfigurationException e) {
+        throw new RuntimeException(
+          "Error instantiating DocumentBuilder, cannot build xml parser", e);
+      }
+    }
+
     reader.set(xml);
     try {
-      return expression.evaluate(inputSource, qname);
+      return expression.evaluate(builder.parse(inputSource), qname);
     } catch (XPathExpressionException e) {
       throw new RuntimeException("Invalid XML document: " + e.getMessage() + 
"\n" + xml, e);
+    } catch (Exception e) {
+      throw new RuntimeException("Error loading expression '" + oldPath + "'", 
e);
     }
   }
 
+  private void initializeDocumentBuilderFactory() throws 
ParserConfigurationException {
+    dbf.setFeature(SAX_FEATURE_PREFIX + EXTERNAL_GENERAL_ENTITIES_FEATURE, 
false);
+    dbf.setFeature(SAX_FEATURE_PREFIX + EXTERNAL_PARAMETER_ENTITIES_FEATURE, 
false);
+  }
+
   public Boolean evalBoolean(String xml, String path) throws 
XPathExpressionException {
     return (Boolean) eval(xml, path, XPathConstants.BOOLEAN);
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/50cdb413/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtilSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtilSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtilSuite.scala
index c4cde70..0fec15b 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtilSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtilSuite.scala
@@ -77,6 +77,27 @@ class UDFXPathUtilSuite extends SparkFunSuite {
     assert(ret == "foo")
   }
 
+  test("embedFailure") {
+    import org.apache.commons.io.FileUtils
+    import java.io.File
+    val secretValue = String.valueOf(Math.random)
+    val tempFile = File.createTempFile("verifyembed", ".tmp")
+    tempFile.deleteOnExit()
+    val fname = tempFile.getAbsolutePath
+
+    FileUtils.writeStringToFile(tempFile, secretValue)
+
+    val xml =
+      s"""<?xml version="1.0" encoding="utf-8"?>
+        |<!DOCTYPE test [
+        |    <!ENTITY embed SYSTEM "$fname">
+        |]>
+        |<foo>&embed;</foo>
+      """.stripMargin
+    val evaled = new UDFXPathUtil().evalString(xml, "/foo")
+    assert(evaled.isEmpty)
+  }
+
   test("number eval") {
     var ret =
       
util.evalNumber("<a><b>true</b><b>false</b><b>b3</b><c>c1</c><c>-77</c></a>", 
"a/c[2]")

http://git-wip-us.apache.org/repos/asf/spark/blob/50cdb413/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/XPathExpressionSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/XPathExpressionSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/XPathExpressionSuite.scala
index bfa18a0..c6f6d3a 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/XPathExpressionSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/XPathExpressionSuite.scala
@@ -40,8 +40,9 @@ class XPathExpressionSuite extends SparkFunSuite with 
ExpressionEvalHelper {
 
     // Test error message for invalid XML document
     val e1 = intercept[RuntimeException] { testExpr("<a>/a>", "a", 
null.asInstanceOf[T]) }
-    assert(e1.getCause.getMessage.contains("Invalid XML document") &&
-      e1.getCause.getMessage.contains("<a>/a>"))
+    assert(e1.getCause.getCause.getMessage.contains(
+      "XML document structures must start and end within the same entity."))
+    assert(e1.getMessage.contains("<a>/a>"))
 
     // Test error message for invalid xpath
     val e2 = intercept[RuntimeException] { testExpr("<a></a>", "!#$", 
null.asInstanceOf[T]) }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to