Repository: hive
Updated Branches:
  refs/heads/master 542eaf6bc -> 7df62023f
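In the hunks below, PartitionExpressionProxy loses the ORC-specific applySargToFileMetadata and instead resolves a FileFormatProxy per FileMetadataExprType. A minimal sketch of how a metastore-side caller could drive that surface to filter a cached footer with a SARG; the interface and method names are taken from the diffs, while the wrapper class and method here (FooterSargFilter, filterFooter) are hypothetical and not part of this commit:

```java
// Illustrative only: wires together the PartitionExpressionProxy methods added below
// (getMetadataType, getFileFormatProxy, createSarg) with FileFormatProxy.applySargToMetadata.
// FooterSargFilter itself is a hypothetical helper, not code from this commit.
import java.io.IOException;
import java.nio.ByteBuffer;

import org.apache.hadoop.hive.metastore.FileFormatProxy;
import org.apache.hadoop.hive.metastore.PartitionExpressionProxy;
import org.apache.hadoop.hive.metastore.api.FileMetadataExprType;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;

public class FooterSargFilter {
  private final PartitionExpressionProxy expressionProxy;

  public FooterSargFilter(PartitionExpressionProxy expressionProxy) {
    this.expressionProxy = expressionProxy;
  }

  /**
   * Applies a serialized SARG to one cached footer; returns null when every stripe is
   * eliminated, mirroring OrcFileFormatProxy.applySargToMetadata in the diff below.
   */
  public ByteBuffer filterFooter(String inputFormatClass, byte[] serializedSarg,
      ByteBuffer cachedFooter) throws IOException {
    FileMetadataExprType type = expressionProxy.getMetadataType(inputFormatClass);
    if (type == null) {
      // Format has no metadata SARG support (getMetadataType returned null); keep the footer.
      return cachedFooter;
    }
    FileFormatProxy formatProxy = expressionProxy.getFileFormatProxy(type);
    SearchArgument sarg = expressionProxy.createSarg(serializedSarg);
    return formatProxy.applySargToMetadata(sarg, cachedFooter);
  }
}
```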
http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/metastore/src/test/org/apache/hadoop/hive/metastore/TestObjectStore.java ---------------------------------------------------------------------- diff --git a/metastore/src/test/org/apache/hadoop/hive/metastore/TestObjectStore.java b/metastore/src/test/org/apache/hadoop/hive/metastore/TestObjectStore.java index 9089d1c..1157033 100644 --- a/metastore/src/test/org/apache/hadoop/hive/metastore/TestObjectStore.java +++ b/metastore/src/test/org/apache/hadoop/hive/metastore/TestObjectStore.java @@ -17,7 +17,6 @@ */ package org.apache.hadoop.hive.metastore; -import java.nio.ByteBuffer; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -25,6 +24,7 @@ import java.util.List; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.api.Database; import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.FileMetadataExprType; import org.apache.hadoop.hive.metastore.api.Function; import org.apache.hadoop.hive.metastore.api.InvalidInputException; import org.apache.hadoop.hive.metastore.api.InvalidObjectException; @@ -71,12 +71,17 @@ public class TestObjectStore { } @Override + public FileMetadataExprType getMetadataType(String inputFormat) { + return null; + } + + @Override public SearchArgument createSarg(byte[] expr) { return null; } @Override - public ByteBuffer applySargToFileMetadata(SearchArgument sarg, ByteBuffer byteBuffer) { + public FileFormatProxy getFileFormatProxy(FileMetadataExprType type) { return null; } } http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/MockUtils.java ---------------------------------------------------------------------- diff --git a/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/MockUtils.java b/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/MockUtils.java index 983129a..784648a 100644 --- a/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/MockUtils.java +++ b/metastore/src/test/org/apache/hadoop/hive/metastore/hbase/MockUtils.java @@ -19,6 +19,8 @@ package org.apache.hadoop.hive.metastore.hbase; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.client.Delete; import org.apache.hadoop.hbase.client.Get; @@ -29,7 +31,9 @@ import org.apache.hadoop.hbase.client.ResultScanner; import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; +import org.apache.hadoop.hive.metastore.FileFormatProxy; import org.apache.hadoop.hive.metastore.PartitionExpressionProxy; +import org.apache.hadoop.hive.metastore.api.FileMetadataExprType; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; @@ -73,10 +77,14 @@ public class MockUtils { } @Override - public ByteBuffer applySargToFileMetadata(SearchArgument sarg, ByteBuffer byteBuffer) { + public FileMetadataExprType getMetadataType(String inputFormat) { return null; } + @Override + public FileFormatProxy getFileFormatProxy(FileMetadataExprType type) { + return null; + } } static HBaseStore init(Configuration conf, HTableInterface htable, http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java 
---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java index 290f489..30cae88 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java @@ -104,6 +104,7 @@ import org.apache.hadoop.hive.ql.plan.AlterTableDesc; import org.apache.hadoop.hive.ql.plan.AlterTableDesc.AlterTableTypes; import org.apache.hadoop.hive.ql.plan.AlterTableExchangePartition; import org.apache.hadoop.hive.ql.plan.AlterTableSimpleDesc; +import org.apache.hadoop.hive.ql.plan.CacheMetadataDesc; import org.apache.hadoop.hive.ql.plan.CreateDatabaseDesc; import org.apache.hadoop.hive.ql.plan.CreateIndexDesc; import org.apache.hadoop.hive.ql.plan.CreateTableDesc; @@ -518,6 +519,11 @@ public class DDLTask extends Task<DDLWork> implements Serializable { if (alterTableExchangePartition != null) { return exchangeTablePartition(db, alterTableExchangePartition); } + + CacheMetadataDesc cacheMetadataDesc = work.getCacheMetadataDesc(); + if (cacheMetadataDesc != null) { + return cacheMetadata(db, cacheMetadataDesc); + } } catch (Throwable e) { failed(e); return 1; @@ -526,6 +532,12 @@ public class DDLTask extends Task<DDLWork> implements Serializable { return 0; } + private int cacheMetadata(Hive db, CacheMetadataDesc desc) throws HiveException { + db.cacheFileMetadata(desc.getDbName(), desc.getTableName(), + desc.getPartName(), desc.isAllParts()); + return 0; + } + private void failed(Throwable e) { while (e.getCause() != null && e.getClass() == RuntimeException.class) { e = e.getCause(); http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFileFormatProxy.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFileFormatProxy.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFileFormatProxy.java new file mode 100644 index 0000000..ef76723 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFileFormatProxy.java @@ -0,0 +1,74 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.io.orc; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.FileFormatProxy; +import org.apache.hadoop.hive.metastore.Metastore.SplitInfo; +import org.apache.hadoop.hive.metastore.Metastore.SplitInfos; +import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; +import org.apache.orc.OrcProto; +import org.apache.orc.StripeInformation; + +/** File format proxy for ORC. */ +public class OrcFileFormatProxy implements FileFormatProxy { + + @Override + public ByteBuffer applySargToMetadata( + SearchArgument sarg, ByteBuffer byteBuffer) throws IOException { + // TODO: ideally we should store shortened representation of only the necessary fields + // in HBase; it will probably require custom SARG application code. + ReaderImpl.FooterInfo fi = ReaderImpl.extractMetaInfoFromFooter(byteBuffer, null); + OrcProto.Footer footer = fi.getFooter(); + int stripeCount = footer.getStripesCount(); + boolean[] result = OrcInputFormat.pickStripesViaTranslatedSarg( + sarg, fi.getFileMetaInfo().getWriterVersion(), + footer.getTypesList(), fi.getMetadata(), stripeCount); + // For ORC case, send the boundaries of the stripes so we don't have to send the footer. + SplitInfos.Builder sb = SplitInfos.newBuilder(); + List<StripeInformation> stripes = fi.getStripes(); + boolean isEliminated = true; + for (int i = 0; i < result.length; ++i) { + if (result != null && !result[i]) continue; + isEliminated = false; + StripeInformation si = stripes.get(i); + sb.addInfos(SplitInfo.newBuilder().setIndex(i) + .setOffset(si.getOffset()).setLength(si.getLength())); + } + return isEliminated ? null : ByteBuffer.wrap(sb.build().toByteArray()); + } + + public ByteBuffer[] getAddedColumnsToCache() { + return null; // Nothing so far. + } + + public ByteBuffer[][] getAddedValuesToCache(List<ByteBuffer> metadata) { + throw new UnsupportedOperationException(); // Nothing so far (and shouldn't be called). + } + + public ByteBuffer getMetadataToCache( + FileSystem fs, Path path, ByteBuffer[] addedVals) throws IOException { + // For now, there's nothing special to return in addedVals. Just return the footer. 
+ return OrcFile.createReader(fs, path).getSerializedFileFooter(); + } +} http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java index c682df2..29df4f9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java @@ -3401,4 +3401,17 @@ private void constructOneLBLocationMap(FileStatus fSta, throw new HiveException(e); } } + + public void cacheFileMetadata( + String dbName, String tableName, String partName, boolean allParts) throws HiveException { + try { + boolean willCache = getMSC().cacheFileMetadata(dbName, tableName, partName, allParts); + if (!willCache) { + throw new HiveException( + "Caching file metadata is not supported by metastore or for this file format"); + } + } catch (TException e) { + throw new HiveException(e); + } + } }; http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionExpressionForMetastore.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionExpressionForMetastore.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionExpressionForMetastore.java index 7cddcc9..96910e3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionExpressionForMetastore.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionExpressionForMetastore.java @@ -18,18 +18,16 @@ package org.apache.hadoop.hive.ql.optimizer.ppr; -import java.io.IOException; -import java.nio.ByteBuffer; +import org.apache.hadoop.hive.metastore.api.FileMetadataExprType; + import java.util.List; -import org.apache.hadoop.hive.metastore.Metastore.SplitInfo; -import org.apache.hadoop.hive.metastore.Metastore.SplitInfos; +import org.apache.hadoop.hive.metastore.FileFormatProxy; import org.apache.hadoop.hive.metastore.PartitionExpressionProxy; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.ql.exec.SerializationUtilities; +import org.apache.hadoop.hive.ql.io.orc.OrcFileFormatProxy; import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; -import org.apache.hadoop.hive.ql.io.orc.ReaderImpl; -import org.apache.orc.StripeInformation; import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.hive.ql.metadata.HiveException; @@ -37,7 +35,6 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.orc.OrcProto; /** * The basic implementation of PartitionExpressionProxy that uses ql package classes. 
@@ -83,32 +80,29 @@ public class PartitionExpressionForMetastore implements PartitionExpressionProxy } @Override - public SearchArgument createSarg(byte[] expr) { - return ConvertAstToSearchArg.create(expr); + public FileFormatProxy getFileFormatProxy(FileMetadataExprType type) { + switch (type) { + case ORC_SARG: return new OrcFileFormatProxy(); + default: throw new RuntimeException("Unsupported format " + type); + } } @Override - public ByteBuffer applySargToFileMetadata( - SearchArgument sarg, ByteBuffer byteBuffer) throws IOException { - // TODO: ideally we should store shortened representation of only the necessary fields - // in HBase; it will probably require custom SARG application code. - ReaderImpl.FooterInfo fi = ReaderImpl.extractMetaInfoFromFooter(byteBuffer, null); - OrcProto.Footer footer = fi.getFooter(); - int stripeCount = footer.getStripesCount(); - boolean[] result = OrcInputFormat.pickStripesViaTranslatedSarg( - sarg, fi.getFileMetaInfo().getWriterVersion(), - footer.getTypesList(), fi.getMetadata(), stripeCount); - // For ORC case, send the boundaries of the stripes so we don't have to send the footer. - SplitInfos.Builder sb = SplitInfos.newBuilder(); - List<StripeInformation> stripes = fi.getStripes(); - boolean isEliminated = true; - for (int i = 0; i < result.length; ++i) { - if (result != null && !result[i]) continue; - isEliminated = false; - StripeInformation si = stripes.get(i); - sb.addInfos(SplitInfo.newBuilder().setIndex(i) - .setOffset(si.getOffset()).setLength(si.getLength())); + public FileMetadataExprType getMetadataType(String inputFormat) { + try { + Class<?> ifClass = Class.forName(inputFormat); + if (OrcInputFormat.class.isAssignableFrom(ifClass)) { + return FileMetadataExprType.ORC_SARG; + } + return null; + } catch (Throwable t) { + LOG.warn("Can't create the class for input format " + inputFormat, t); + return null; } - return isEliminated ? null : ByteBuffer.wrap(sb.build().toByteArray()); + } + + @Override + public SearchArgument createSarg(byte[] expr) { + return ConvertAstToSearchArg.create(expr); } } http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/ql/src/java/org/apache/hadoop/hive/ql/parse/AnalyzeCommandUtils.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/AnalyzeCommandUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/AnalyzeCommandUtils.java new file mode 100644 index 0000000..ac1383c --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/AnalyzeCommandUtils.java @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.parse; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.session.SessionState; + +public class AnalyzeCommandUtils { + public static boolean isPartitionLevelStats(ASTNode tree) { + boolean isPartitioned = false; + ASTNode child = (ASTNode) tree.getChild(0); + if (child.getChildCount() > 1) { + child = (ASTNode) child.getChild(1); + if (child.getToken().getType() == HiveParser.TOK_PARTSPEC) { + isPartitioned = true; + } + } + return isPartitioned; + } + + public static Table getTable(ASTNode tree, BaseSemanticAnalyzer sa) throws SemanticException { + String tableName = ColumnStatsSemanticAnalyzer.getUnescapedName((ASTNode) tree.getChild(0).getChild(0)); + String currentDb = SessionState.get().getCurrentDatabase(); + String [] names = Utilities.getDbTableName(currentDb, tableName); + return sa.getTable(names[0], names[1], true); + } + + public static Map<String,String> getPartKeyValuePairsFromAST(Table tbl, ASTNode tree, + HiveConf hiveConf) throws SemanticException { + ASTNode child = ((ASTNode) tree.getChild(0).getChild(1)); + Map<String,String> partSpec = new HashMap<String, String>(); + if (child != null) { + partSpec = DDLSemanticAnalyzer.getValidatedPartSpec(tbl, child, hiveConf, false); + } //otherwise, it is the case of analyze table T compute statistics for columns; + return partSpec; + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java index 832a5bc..1f30cbd 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.hadoop.hive.ql.parse; import java.io.IOException; -import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -83,35 +82,6 @@ public class ColumnStatsSemanticAnalyzer extends SemanticAnalyzer { return rwt; } - private boolean isPartitionLevelStats(ASTNode tree) { - boolean isPartitioned = false; - ASTNode child = (ASTNode) tree.getChild(0); - if (child.getChildCount() > 1) { - child = (ASTNode) child.getChild(1); - if (child.getToken().getType() == HiveParser.TOK_PARTSPEC) { - isPartitioned = true; - } - } - return isPartitioned; - } - - private Table getTable(ASTNode tree) throws SemanticException { - String tableName = getUnescapedName((ASTNode) tree.getChild(0).getChild(0)); - String currentDb = SessionState.get().getCurrentDatabase(); - String [] names = Utilities.getDbTableName(currentDb, tableName); - return getTable(names[0], names[1], true); - } - - private Map<String,String> getPartKeyValuePairsFromAST(Table tbl, ASTNode tree, - HiveConf hiveConf) throws SemanticException { - ASTNode child = ((ASTNode) tree.getChild(0).getChild(1)); - Map<String,String> partSpec = new HashMap<String, String>(); - if (child != null) { - partSpec = DDLSemanticAnalyzer.getValidatedPartSpec(tbl, child, hiveConf, false); - } //otherwise, it is the case of analyze table T compute statistics for columns; - return partSpec; 
- } - private List<String> getColumnName(ASTNode tree) throws SemanticException{ switch (tree.getChildCount()) { @@ -405,11 +375,11 @@ public class ColumnStatsSemanticAnalyzer extends SemanticAnalyzer { * an aggregation. */ if (shouldRewrite(ast)) { - tbl = getTable(ast); + tbl = AnalyzeCommandUtils.getTable(ast, this); colNames = getColumnName(ast); // Save away the original AST originalTree = ast; - boolean isPartitionStats = isPartitionLevelStats(ast); + boolean isPartitionStats = AnalyzeCommandUtils.isPartitionLevelStats(ast); Map<String,String> partSpec = null; checkForPartitionColumns( colNames, Utilities.getColumnNamesFromFieldSchema(tbl.getPartitionKeys())); @@ -420,7 +390,7 @@ public class ColumnStatsSemanticAnalyzer extends SemanticAnalyzer { if (isPartitionStats) { isTableLevel = false; - partSpec = getPartKeyValuePairsFromAST(tbl, ast, conf); + partSpec = AnalyzeCommandUtils.getPartKeyValuePairsFromAST(tbl, ast, conf); handlePartialPartitionSpec(partSpec); } else { isTableLevel = true; http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java index c407aae..5e6b606 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java @@ -26,6 +26,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.FileUtils; import org.apache.hadoop.hive.common.JavaUtils; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; @@ -79,10 +80,12 @@ import org.apache.hadoop.hive.ql.plan.AlterTableDesc; import org.apache.hadoop.hive.ql.plan.AlterTableDesc.AlterTableTypes; import org.apache.hadoop.hive.ql.plan.AlterTableExchangePartition; import org.apache.hadoop.hive.ql.plan.AlterTableSimpleDesc; +import org.apache.hadoop.hive.ql.plan.CacheMetadataDesc; import org.apache.hadoop.hive.ql.plan.ColumnStatsDesc; import org.apache.hadoop.hive.ql.plan.ColumnStatsUpdateWork; import org.apache.hadoop.hive.ql.plan.CreateDatabaseDesc; import org.apache.hadoop.hive.ql.plan.CreateIndexDesc; +import org.apache.hadoop.hive.ql.plan.DDLDesc; import org.apache.hadoop.hive.ql.plan.DDLWork; import org.apache.hadoop.hive.ql.plan.DescDatabaseDesc; import org.apache.hadoop.hive.ql.plan.DescFunctionDesc; @@ -492,7 +495,10 @@ public class DDLSemanticAnalyzer extends BaseSemanticAnalyzer { case HiveParser.TOK_SHOW_SET_ROLE: analyzeSetShowRole(ast); break; - default: + case HiveParser.TOK_CACHE_METADATA: + analyzeCacheMetadata(ast); + break; + default: throw new SemanticException("Unsupported command."); } if (fetchTask != null && !rootTasks.isEmpty()) { @@ -500,6 +506,24 @@ public class DDLSemanticAnalyzer extends BaseSemanticAnalyzer { } } + private void analyzeCacheMetadata(ASTNode ast) throws SemanticException { + Table tbl = AnalyzeCommandUtils.getTable(ast, this); + Map<String,String> partSpec = null; + CacheMetadataDesc desc; + // In 2 cases out of 3, we could pass the path and type directly to metastore... 
+ if (AnalyzeCommandUtils.isPartitionLevelStats(ast)) { + partSpec = AnalyzeCommandUtils.getPartKeyValuePairsFromAST(tbl, ast, conf); + Partition part = getPartition(tbl, partSpec, true); + desc = new CacheMetadataDesc(tbl.getDbName(), tbl.getTableName(), part.getName()); + inputs.add(new ReadEntity(part)); + } else { + // Should we get all partitions for a partitioned table? + desc = new CacheMetadataDesc(tbl.getDbName(), tbl.getTableName(), tbl.isPartitioned()); + inputs.add(new ReadEntity(tbl)); + } + rootTasks.add(TaskFactory.get(new DDLWork(getInputs(), getOutputs(), desc), conf)); + } + private void analyzeAlterTableUpdateStats(ASTNode ast, String tblName, Map<String, String> partSpec) throws SemanticException { String colName = getUnescapedName((ASTNode) ast.getChild(0)); @@ -511,7 +535,6 @@ public class DDLSemanticAnalyzer extends BaseSemanticAnalyzer { try { partName = Warehouse.makePartName(partSpec, false); } catch (MetaException e) { - // TODO Auto-generated catch block throw new SemanticException("partition " + partSpec.toString() + " not found"); } http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g index 1c72b1c..4c4470b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g @@ -312,6 +312,7 @@ KW_ISOLATION: 'ISOLATION'; KW_LEVEL: 'LEVEL'; KW_SNAPSHOT: 'SNAPSHOT'; KW_AUTOCOMMIT: 'AUTOCOMMIT'; +KW_CACHE: 'CACHE'; // Operators // NOTE: if you add a new function/operator, add it to sysFuncNames so that describe function _FUNC_ will work. http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g index d5051ce..5f14c6b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g @@ -364,6 +364,7 @@ TOK_TXN_READ_WRITE; TOK_COMMIT; TOK_ROLLBACK; TOK_SET_AUTOCOMMIT; +TOK_CACHE_METADATA; } @@ -1370,9 +1371,14 @@ descStatement analyzeStatement @init { pushMsg("analyze statement", state); } @after { popMsg(state); } - : KW_ANALYZE KW_TABLE (parttype=tableOrPartition) KW_COMPUTE KW_STATISTICS ((noscan=KW_NOSCAN) | (partialscan=KW_PARTIALSCAN) + : KW_ANALYZE KW_TABLE (parttype=tableOrPartition) + ( + (KW_COMPUTE) => KW_COMPUTE KW_STATISTICS ((noscan=KW_NOSCAN) | (partialscan=KW_PARTIALSCAN) | (KW_FOR KW_COLUMNS (statsColumnName=columnNameList)?))? -> ^(TOK_ANALYZE $parttype $noscan? $partialscan? KW_COLUMNS? $statsColumnName?) 
+ | + (KW_CACHE) => KW_CACHE KW_METADATA -> ^(TOK_CACHE_METADATA $parttype) + ) ; showStatement http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzerFactory.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzerFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzerFactory.java index 0affe84..98860c6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzerFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzerFactory.java @@ -110,6 +110,7 @@ public final class SemanticAnalyzerFactory { commandType.put(HiveParser.TOK_DESCDATABASE, HiveOperation.DESCDATABASE); commandType.put(HiveParser.TOK_ALTERTABLE_SKEWED, HiveOperation.ALTERTABLE_SKEWED); commandType.put(HiveParser.TOK_ANALYZE, HiveOperation.ANALYZE_TABLE); + commandType.put(HiveParser.TOK_CACHE_METADATA, HiveOperation.CACHE_METADATA); commandType.put(HiveParser.TOK_ALTERTABLE_PARTCOLTYPE, HiveOperation.ALTERTABLE_PARTCOLTYPE); commandType.put(HiveParser.TOK_SHOW_COMPACTIONS, HiveOperation.SHOW_COMPACTIONS); commandType.put(HiveParser.TOK_SHOW_TRANSACTIONS, HiveOperation.SHOW_TRANSACTIONS); @@ -259,6 +260,7 @@ public final class SemanticAnalyzerFactory { case HiveParser.TOK_ALTERDATABASE_OWNER: case HiveParser.TOK_TRUNCATETABLE: case HiveParser.TOK_SHOW_SET_ROLE: + case HiveParser.TOK_CACHE_METADATA: return new DDLSemanticAnalyzer(conf); case HiveParser.TOK_CREATEFUNCTION: http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/ql/src/java/org/apache/hadoop/hive/ql/plan/CacheMetadataDesc.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/CacheMetadataDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/CacheMetadataDesc.java new file mode 100644 index 0000000..1649b40 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/CacheMetadataDesc.java @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.plan; + + +@SuppressWarnings("serial") +public class CacheMetadataDesc extends DDLDesc { + private final String dbName, tableName, partName; + private final boolean isAllParts; + + public CacheMetadataDesc(String dbName, String tableName, String partName) { + this(dbName, tableName, partName, false); + } + + public CacheMetadataDesc(String dbName, String tableName, boolean isAllParts) { + this(dbName, tableName, null, isAllParts); + } + + private CacheMetadataDesc(String dbName, String tableName, String partName, boolean isAllParts) { + super(); + this.dbName = dbName; + this.tableName = tableName; + this.partName = partName; + this.isAllParts = isAllParts; + } + + public boolean isAllParts() { + return isAllParts; + } + + public String getPartName() { + return partName; + } + + public String getDbName() { + return dbName; + } + + public String getTableName() { + return tableName; + } +} http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/ql/src/java/org/apache/hadoop/hive/ql/plan/DDLWork.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/DDLWork.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/DDLWork.java index a4c3db1..7bb818c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/DDLWork.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/DDLWork.java @@ -91,6 +91,7 @@ public class DDLWork implements Serializable { */ protected HashSet<WriteEntity> outputs; private AlterTablePartMergeFilesDesc mergeFilesDesc; + private CacheMetadataDesc cacheMetadataDesc; public DDLWork() { } @@ -510,6 +511,12 @@ public class DDLWork implements Serializable { this.alterTableExchangePartition = alterTableExchangePartition; } + public DDLWork(HashSet<ReadEntity> inputs, HashSet<WriteEntity> outputs, + CacheMetadataDesc cacheMetadataDesc) { + this(inputs, outputs); + this.cacheMetadataDesc = cacheMetadataDesc; + } + /** * @return Create Database descriptor */ @@ -1140,6 +1147,13 @@ public class DDLWork implements Serializable { } /** + * @return information about the metadata to be cached + */ + public CacheMetadataDesc getCacheMetadataDesc() { + return this.cacheMetadataDesc; + } + + /** * @param alterTableExchangePartition * set the value of the table partition to be exchanged */ http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/ql/src/java/org/apache/hadoop/hive/ql/plan/HiveOperation.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/HiveOperation.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/HiveOperation.java index af7e43e..07134b3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/HiveOperation.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/HiveOperation.java @@ -55,6 +55,7 @@ public enum HiveOperation { ALTERTABLE_CLUSTER_SORT("ALTERTABLE_CLUSTER_SORT", new Privilege[]{Privilege.ALTER_METADATA}, null), ANALYZE_TABLE("ANALYZE_TABLE", null, null), + CACHE_METADATA("CACHE_METADATA", new Privilege[]{Privilege.SELECT}, null), ALTERTABLE_BUCKETNUM("ALTERTABLE_BUCKETNUM", new Privilege[]{Privilege.ALTER_METADATA}, null), ALTERPARTITION_BUCKETNUM("ALTERPARTITION_BUCKETNUM", http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/ql/src/test/queries/clientpositive/stats_filemetadata.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/stats_filemetadata.q 
b/ql/src/test/queries/clientpositive/stats_filemetadata.q new file mode 100644 index 0000000..dc9f242 --- /dev/null +++ b/ql/src/test/queries/clientpositive/stats_filemetadata.q @@ -0,0 +1,17 @@ +set hive.mapred.mode=nonstrict; + +CREATE TABLE many_files(key string, value string) +partitioned by (ds string) +clustered by (key) into 4 buckets +stored as orc; + +insert overwrite table many_files partition (ds='1') select * from src; +insert overwrite table many_files partition (ds='2') select * from src; + +dfs -ls -R ${hiveconf:hive.metastore.warehouse.dir}/many_files/; + +analyze table many_files cache metadata; + +set hive.fetch.task.conversion=none; + +select sum(hash(*)) from many_files; http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/ql/src/test/results/clientpositive/tez/stats_filemetadata.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/stats_filemetadata.q.out b/ql/src/test/results/clientpositive/tez/stats_filemetadata.q.out new file mode 100644 index 0000000..a86e1c1 --- /dev/null +++ b/ql/src/test/results/clientpositive/tez/stats_filemetadata.q.out @@ -0,0 +1,54 @@ +PREHOOK: query: CREATE TABLE many_files(key string, value string) +partitioned by (ds string) +clustered by (key) into 4 buckets +stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@many_files +POSTHOOK: query: CREATE TABLE many_files(key string, value string) +partitioned by (ds string) +clustered by (key) into 4 buckets +stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@many_files +PREHOOK: query: insert overwrite table many_files partition (ds='1') select * from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@many_files@ds=1 +POSTHOOK: query: insert overwrite table many_files partition (ds='1') select * from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@many_files@ds=1 +POSTHOOK: Lineage: many_files PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: many_files PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert overwrite table many_files partition (ds='2') select * from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@many_files@ds=2 +POSTHOOK: query: insert overwrite table many_files partition (ds='2') select * from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@many_files@ds=2 +POSTHOOK: Lineage: many_files PARTITION(ds=2).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: many_files PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +#### A masked pattern was here #### +PREHOOK: query: analyze table many_files cache metadata +PREHOOK: type: CACHE_METADATA +PREHOOK: Input: default@many_files +POSTHOOK: query: analyze table many_files cache metadata +POSTHOOK: type: CACHE_METADATA +POSTHOOK: Input: default@many_files +PREHOOK: query: select sum(hash(*)) from many_files +PREHOOK: type: QUERY +PREHOOK: Input: default@many_files +PREHOOK: Input: default@many_files@ds=1 +PREHOOK: Input: default@many_files@ds=2 +#### A masked pattern was here #### +POSTHOOK: query: select sum(hash(*)) from many_files +POSTHOOK: type: QUERY +POSTHOOK: Input: default@many_files +POSTHOOK: 
Input: default@many_files@ds=1 +POSTHOOK: Input: default@many_files@ds=2 +#### A masked pattern was here #### +73724366848 http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/service/src/gen/thrift/gen-py/hive_service/ThriftHive-remote ---------------------------------------------------------------------- diff --git a/service/src/gen/thrift/gen-py/hive_service/ThriftHive-remote b/service/src/gen/thrift/gen-py/hive_service/ThriftHive-remote index e167d5b..4bd7a5a 100755 --- a/service/src/gen/thrift/gen-py/hive_service/ThriftHive-remote +++ b/service/src/gen/thrift/gen-py/hive_service/ThriftHive-remote @@ -54,6 +54,7 @@ if len(sys.argv) <= 1 or sys.argv[1] == '--help': print(' void drop_table(string dbname, string name, bool deleteData)') print(' void drop_table_with_environment_context(string dbname, string name, bool deleteData, EnvironmentContext environment_context)') print(' get_tables(string db_name, string pattern)') + print(' get_table_meta(string db_patterns, string tbl_patterns, tbl_types)') print(' get_all_tables(string db_name)') print(' Table get_table(string dbname, string tbl_name)') print(' get_table_objects_by_name(string dbname, tbl_names)') @@ -77,6 +78,7 @@ if len(sys.argv) <= 1 or sys.argv[1] == '--help': print(' DropPartitionsResult drop_partitions_req(DropPartitionsRequest req)') print(' Partition get_partition(string db_name, string tbl_name, part_vals)') print(' Partition exchange_partition( partitionSpecs, string source_db, string source_table_name, string dest_db, string dest_table_name)') + print(' exchange_partitions( partitionSpecs, string source_db, string source_table_name, string dest_db, string dest_table_name)') print(' Partition get_partition_with_auth(string db_name, string tbl_name, part_vals, string user_name, group_names)') print(' Partition get_partition_by_name(string db_name, string tbl_name, string part_name)') print(' get_partitions(string db_name, string tbl_name, i16 max_parts)') @@ -162,6 +164,7 @@ if len(sys.argv) <= 1 or sys.argv[1] == '--help': print(' GetFileMetadataResult get_file_metadata(GetFileMetadataRequest req)') print(' PutFileMetadataResult put_file_metadata(PutFileMetadataRequest req)') print(' ClearFileMetadataResult clear_file_metadata(ClearFileMetadataRequest req)') + print(' CacheFileMetadataResult cache_file_metadata(CacheFileMetadataRequest req)') print(' string getName()') print(' string getVersion()') print(' fb_status getStatus()') @@ -411,6 +414,12 @@ elif cmd == 'get_tables': sys.exit(1) pp.pprint(client.get_tables(args[0],args[1],)) +elif cmd == 'get_table_meta': + if len(args) != 3: + print('get_table_meta requires 3 args') + sys.exit(1) + pp.pprint(client.get_table_meta(args[0],args[1],eval(args[2]),)) + elif cmd == 'get_all_tables': if len(args) != 1: print('get_all_tables requires 1 args') @@ -549,6 +558,12 @@ elif cmd == 'exchange_partition': sys.exit(1) pp.pprint(client.exchange_partition(eval(args[0]),args[1],args[2],args[3],args[4],)) +elif cmd == 'exchange_partitions': + if len(args) != 5: + print('exchange_partitions requires 5 args') + sys.exit(1) + pp.pprint(client.exchange_partitions(eval(args[0]),args[1],args[2],args[3],args[4],)) + elif cmd == 'get_partition_with_auth': if len(args) != 5: print('get_partition_with_auth requires 5 args') @@ -1059,6 +1074,12 @@ elif cmd == 'clear_file_metadata': sys.exit(1) pp.pprint(client.clear_file_metadata(eval(args[0]),)) +elif cmd == 'cache_file_metadata': + if len(args) != 1: + print('cache_file_metadata requires 1 args') + sys.exit(1) + 
pp.pprint(client.cache_file_metadata(eval(args[0]),)) + elif cmd == 'getName': if len(args) != 0: print('getName requires 0 args') http://git-wip-us.apache.org/repos/asf/hive/blob/7df62023/shims/common/src/main/java/org/apache/hadoop/hive/io/HdfsUtils.java ---------------------------------------------------------------------- diff --git a/shims/common/src/main/java/org/apache/hadoop/hive/io/HdfsUtils.java b/shims/common/src/main/java/org/apache/hadoop/hive/io/HdfsUtils.java new file mode 100644 index 0000000..c90b34c --- /dev/null +++ b/shims/common/src/main/java/org/apache/hadoop/hive/io/HdfsUtils.java @@ -0,0 +1,61 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.io; + +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.apache.hadoop.hive.shims.HadoopShims; +import org.apache.hadoop.hive.shims.ShimLoader; + +public class HdfsUtils { + private static final HadoopShims SHIMS = ShimLoader.getHadoopShims(); + private static final Log LOG = LogFactory.getLog(HdfsUtils.class); + + public static long getFileId(FileSystem fileSystem, Path path) throws IOException { + String pathStr = path.toUri().getPath(); + if (fileSystem instanceof DistributedFileSystem) { + return SHIMS.getFileId(fileSystem, pathStr); + } + // If we are not on DFS, we just hash the file name + size and hope for the best. + // TODO: we assume it only happens in tests. Fix? + int nameHash = pathStr.hashCode(); + long fileSize = fileSystem.getFileStatus(path).getLen(); + long id = ((fileSize ^ (fileSize >>> 32)) << 32) | ((long)nameHash & 0xffffffffL); + LOG.warn("Cannot get unique file ID from " + + fileSystem.getClass().getSimpleName() + "; using " + id + "(" + pathStr + + "," + nameHash + "," + fileSize + ")"); + return id; + } + + // TODO: this relies on HDFS not changing the format; we assume if we could get inode ID, this + // is still going to work. Otherwise, file IDs can be turned off. Later, we should use + // as public utility method in HDFS to obtain the inode-based path. + private static String HDFS_ID_PATH_PREFIX = "/.reserved/.inodes/"; + + public static Path getFileIdPath( + FileSystem fileSystem, Path path, long fileId) { + return (fileSystem instanceof DistributedFileSystem) + ? new Path(HDFS_ID_PATH_PREFIX + fileId) : path; + } +}
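The new HdfsUtils shim above exposes getFileId and getFileIdPath. A small usage sketch, assuming only those two static methods as shown in the diff; the command-line framing is illustrative:

```java
// Usage sketch for the HdfsUtils shim added above. Only HdfsUtils.getFileId and
// HdfsUtils.getFileIdPath come from this commit; the rest of this class is illustrative.
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.io.HdfsUtils;

public class FileIdExample {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path(args[0]);
    FileSystem fs = path.getFileSystem(conf);

    // On HDFS this is the inode-based file ID from the shim; on other file systems it
    // falls back to the (name hash, length) composite built in getFileId, so it is a
    // best-effort key rather than a guaranteed-unique one.
    long fileId = HdfsUtils.getFileId(fs, path);

    // On HDFS the returned path points at /.reserved/.inodes/<fileId>, which stays valid
    // across renames; on other file systems the original path is returned unchanged.
    Path idPath = HdfsUtils.getFileIdPath(fs, path, fileId);
    System.out.println("fileId=" + fileId + " readPath=" + idPath);
  }
}
```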