Repository: hadoop Updated Branches: refs/heads/HDFS-6584 91f6ddeb3 -> b014e83bc
HDFS-6864. Archival Storage: add user documentation. Contributed by Tsz Wo Nicholas Sze. Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/b014e83b Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/b014e83b Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/b014e83b Branch: refs/heads/HDFS-6584 Commit: b014e83bc5899ec135b1e7a54ca1902c970047a5 Parents: 91f6dde Author: Jing Zhao <j...@hortonworks.com> Authored: Wed Sep 17 09:40:17 2014 -0700 Committer: Jing Zhao <j...@hortonworks.com> Committed: Wed Sep 17 09:40:17 2014 -0700 ---------------------------------------------------------------------- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 2 + .../hadoop/hdfs/DistributedFileSystem.java | 6 + .../apache/hadoop/hdfs/server/mover/Mover.java | 6 +- .../src/site/apt/ArchivalStorage.apt.vm | 302 +++++++++++++++++++ .../src/site/apt/HDFSCommands.apt.vm | 43 ++- hadoop-project/src/site/site.xml | 1 + 6 files changed, 349 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/b014e83b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index e859ca2..7a9c723 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -71,6 +71,8 @@ HDFS-6584: Archival Storage HDFS-7072. Fix TestBlockManager and TestStorageMover. (jing9 via szetszwo) + HDFS-6864. Archival Storage: add user documentation. 
(szetszwo via jing9) + Trunk (Unreleased) INCOMPATIBLE CHANGES http://git-wip-us.apache.org/repos/asf/hadoop/blob/b014e83b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java index 1c60e7b..6bce8b9 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java @@ -472,6 +472,12 @@ public class DistributedFileSystem extends FileSystem { }.resolve(this, absF); } + /** + * Set the source path to the specified storage policy. + * + * @param src The source path referring to either a directory or a file. + * @param policyName The name of the storage policy. 
+ */ public void setStoragePolicy(final Path src, final String policyName) throws IOException { statistics.incrementWriteOps(1); http://git-wip-us.apache.org/repos/asf/hadoop/blob/b014e83b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/Mover.java ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/Mover.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/Mover.java index 0812c03..f1837ae 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/Mover.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/Mover.java @@ -498,9 +498,9 @@ public class Mover { static class Cli extends Configured implements Tool { private static final String USAGE = "Usage: java " - + Mover.class.getSimpleName() - + " [-p <space separated files/dirs> specify a list of files/dirs to migrate]" - + " [-f <local file name> specify a local file containing files/dirs to migrate]"; + + Mover.class.getSimpleName() + " [-p <files/dirs> | -f <local file>]" + + "\n\t-p <files/dirs>\ta space separated list of HDFS files/dirs to migrate." 
+ + "\n\t-f <local file>\ta local file containing a list of HDFS files/dirs to migrate."; private static Options buildCliOptions() { Options opts = new Options(); http://git-wip-us.apache.org/repos/asf/hadoop/blob/b014e83b/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/ArchivalStorage.apt.vm ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/ArchivalStorage.apt.vm b/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/ArchivalStorage.apt.vm new file mode 100644 index 0000000..5301d52 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/ArchivalStorage.apt.vm @@ -0,0 +1,302 @@ +~~ Licensed under the Apache License, Version 2.0 (the "License"); +~~ you may not use this file except in compliance with the License. +~~ You may obtain a copy of the License at +~~ +~~ http://www.apache.org/licenses/LICENSE-2.0 +~~ +~~ Unless required by applicable law or agreed to in writing, software +~~ distributed under the License is distributed on an "AS IS" BASIS, +~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +~~ See the License for the specific language governing permissions and +~~ limitations under the License. See accompanying LICENSE file. + + --- + HDFS Archival Storage + --- + --- + ${maven.build.timestamp} + +HDFS Archival Storage + +%{toc|section=1|fromDepth=0} + +* {Introduction} + + <Archival Storage> is a solution to decouple growing storage capacity from compute capacity. + Nodes with higher density and less expensive storage with low compute power are becoming available + and can be used as cold storage in the clusters. + Based on policy, data can be moved from hot storage to cold storage. + Adding more nodes to the cold storage can grow the storage independently of the compute capacity + in the cluster. 
+ +* {Storage Types and Storage Policies} + +** {Storage Types: DISK, SSD and ARCHIVE} + + The first phase of + {{{https://issues.apache.org/jira/browse/HDFS-2832}Heterogeneous Storage (HDFS-2832)}} + changed the datanode storage model from a single storage, + which may correspond to multiple physical storage media, + to a collection of storages with each storage corresponding to a physical storage medium. + It also added the notion of storage types, DISK and SSD, + where DISK is the default storage type. + + A new storage type <ARCHIVE>, + which has high storage density (petabytes of storage) but little compute power, + is added for supporting archival storage. + +** {Storage Policies: Hot, Warm and Cold} + + A new concept of storage policies is introduced in order to allow files to be stored + in different storage types according to the storage policy. + + We have the following storage policies: + + * <<Hot>> - for both storage and compute. + The data that is popular and still being used for processing will stay in this policy. + When a block is hot, all replicas are stored in DISK. + + * <<Cold>> - only for storage with limited compute. + The data that is no longer being used, or data that needs to be archived, is moved + from hot storage to cold storage. + When a block is cold, all replicas are stored in ARCHIVE. + + * <<Warm>> - partially hot and partially cold. + When a block is warm, some of its replicas are stored in DISK + and the remaining replicas are stored in ARCHIVE. + + [] + + More formally, a storage policy consists of the following fields: + + [[1]] Policy ID + + [[2]] Policy name + + [[3]] A list of storage types for block placement + + [[4]] A list of fallback storage types for file creation + + [[5]] A list of fallback storage types for replication + + [] + + When there is enough space, + block replicas are stored according to the storage type list specified in #3. 
+ When some of the storage types in list #3 are running out of space, + the fallback storage type lists specified in #4 and #5 are used + to replace the out-of-space storage types for file creation and replication, respectively. + + The following is a typical storage policy table. + +*--------+---------------+-------------------------+-----------------------+-----------------------+ +| <<Policy>> | <<Policy>>| <<Block Placement>> | <<Fallback storages>> | <<Fallback storages>> | +| <<ID>> | <<Name>> | <<(n\ replicas)>> | <<for creation>> | <<for replication>> | +*--------+---------------+-------------------------+-----------------------+-----------------------+ +| 12 | Hot (default) | DISK: <n> | \<none\> | ARCHIVE | +*--------+---------------+-------------------------+-----------------------+-----------------------+ +| 8 | Warm | DISK: 1, ARCHIVE: <n>-1 | ARCHIVE, DISK | ARCHIVE, DISK | +*--------+---------------+-------------------------+-----------------------+-----------------------+ +| 4 | Cold | ARCHIVE: <n> | \<none\> | \<none\> | +*--------+---------------+-------------------------+-----------------------+-----------------------+ + + Note that cluster administrators may change the storage policy table + according to the characteristics of the cluster. + For example, in order to prevent losing archival data, + administrators may want to use DISK as fallback storage for replication in the Cold policy. + A drawback of such a setting is that the DISK storages could be filled up with archival data. + As a result, the entire cluster may become full and no longer be able to serve hot data. + +** {Configurations} + +*** {Setting The List of All Storage Policies} + + * <<dfs.block.storage.policies>> + - a list of block storage policy names and IDs. + The syntax is + + NAME_1:ID_1, NAME_2:ID_2, ..., NAME_<n>:ID_<n> + + where ID is an integer in the closed range [1,15] and NAME is case insensitive. + The first element is the <default policy>. An empty list is not allowed. 
+ + The default value is shown below. + ++------------------------------------------+ +<property> + <name>dfs.block.storage.policies</name> + <value>HOT:12, WARM:8, COLD:4</value> +</property> ++------------------------------------------+ + + [] + +*** {Setting Storage Policy Details} + + The following configuration properties are for setting the details of each storage policy, + where <<<\<ID\>>>> is the actual policy ID. + + * <<dfs.block.storage.policy.\<ID\>>> + - a list of storage types for storing the block replicas. + The syntax is + + STORAGE_TYPE_1, STORAGE_TYPE_2, ..., STORAGE_TYPE_<n> + + When creating a block, the <i>-th replica is stored using the <i>-th storage type + for <i> less than or equal to <n>, and + the <j>-th replica is stored using the <n>-th storage type for <j> greater than <n>. + + An empty list is not allowed. + + Examples: + ++------------------------------------------+ +DISK : all replicas stored using DISK. +DISK, ARCHIVE : the first replica is stored using DISK and all the + remaining replicas are stored using ARCHIVE. ++------------------------------------------+ + + * <<dfs.block.storage.policy.creation-fallback.\<ID\>>> + - a list of storage types for creation fallback storage. + The syntax is + + STORAGE_TYPE_1, STORAGE_TYPE_2, ..., STORAGE_TYPE_<n> + + When creating a block, if a particular storage type specified in the policy + is unavailable, the fallback STORAGE_TYPE_1 is used. Further, if + STORAGE_TYPE_<i> is also unavailable, the fallback STORAGE_TYPE_<(i+1)> is used. + In case all fallback storages are unavailable, the block will be created + with fewer replicas than the specified replication factor. + + An empty list indicates that there is no fallback storage. + + * <<dfs.block.storage.policy.replication-fallback.\<ID\>>> + - a list of storage types for replication fallback storage. 
+ The usage of this configuration property is similar to + <<<dfs.block.storage.policy.creation-fallback.\<ID\>>>> + except that it takes effect on replication but not block creation. + + [] + + The following are the default configuration values for Hot, Warm and Cold storage policies. + + * Block Storage Policy <<HOT:12>> + ++------------------------------------------+ +<property> + <name>dfs.block.storage.policy.12</name> + <value>DISK</value> +</property> +<property> + <name>dfs.block.storage.policy.creation-fallback.12</name> + <value></value> +</property> +<property> + <name>dfs.block.storage.policy.replication-fallback.12</name> + <value>ARCHIVE</value> +</property> ++------------------------------------------+ + + * Block Storage Policy <<WARM:8>> + ++------------------------------------------+ +<property> + <name>dfs.block.storage.policy.8</name> + <value>DISK, ARCHIVE</value> +</property> +<property> + <name>dfs.block.storage.policy.creation-fallback.8</name> + <value>DISK, ARCHIVE</value> +</property> +<property> + <name>dfs.block.storage.policy.replication-fallback.8</name> + <value>DISK, ARCHIVE</value> +</property> ++------------------------------------------+ + + * Block Storage Policy <<COLD:4>> + ++------------------------------------------+ +<property> + <name>dfs.block.storage.policy.4</name> + <value>ARCHIVE</value> +</property> +<property> + <name>dfs.block.storage.policy.creation-fallback.4</name> + <value></value> +</property> +<property> + <name>dfs.block.storage.policy.replication-fallback.4</name> + <value></value> +</property> ++------------------------------------------+ + + [] + +* {Mover - A New Data Migration Tool} + + A new data migration tool is added for archiving data. + The tool is similar to Balancer. + It periodically scans the files in HDFS to check if the block placement satisfies the storage policy. 
+ For the blocks violating the storage policy, + it moves the replicas to a different storage type + in order to fulfill the storage policy requirement. + + * Command: + ++------------------------------------------+ +hdfs mover [-p <files/dirs> | -f <local file name>] ++------------------------------------------+ + + * Arguments: + +*-------------------------+--------------------------------------------------------+ +| <<<-p \<files/dirs\>>>> | Specify a space separated list of HDFS files/dirs to migrate. +*-------------------------+--------------------------------------------------------+ +| <<<-f \<local file\>>>> | Specify a local file containing a list of HDFS files/dirs to migrate. +*-------------------------+--------------------------------------------------------+ + + Note that, when both -p and -f options are omitted, the default path is the root directory. + + [] + + +* {<<<DFSAdmin>>> Commands} + +** {Set Storage Policy} + + Set a storage policy to a file or a directory. + + * Command: + ++------------------------------------------+ +hdfs dfsadmin -setStoragePolicy <path> <policyName> ++------------------------------------------+ + + * Arguments: + +*----------------------+-----------------------------------------------------+ +| <<<\<path\>>>> | The path referring to either a directory or a file. | +*----------------------+-----------------------------------------------------+ +| <<<\<policyName\>>>> | The name of the storage policy. | +*----------------------+-----------------------------------------------------+ + + [] + +** {Get Storage Policy} + + Get the storage policy of a file or a directory. + + * Command: + ++------------------------------------------+ +hdfs dfsadmin -getStoragePolicy <path> ++------------------------------------------+ + + * Arguments: + +*----------------------+-----------------------------------------------------+ +| <<<\<path\>>>> | The path referring to either a directory or a file. 
| +*----------------------+-----------------------------------------------------+ + + [] http://git-wip-us.apache.org/repos/asf/hadoop/blob/b014e83b/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/HDFSCommands.apt.vm ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/HDFSCommands.apt.vm b/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/HDFSCommands.apt.vm index 6eb60f0..170f352 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/HDFSCommands.apt.vm +++ b/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/HDFSCommands.apt.vm @@ -147,18 +147,19 @@ HDFS Commands Guide *-----------------+-----------------------------------------------------------+ | -regular | Normal datanode startup (default). *-----------------+-----------------------------------------------------------+ -| -rollback | Rollsback the datanode to the previous version. This should +| -rollback | Rollback the datanode to the previous version. This should | | be used after stopping the datanode and distributing the | | old hadoop version. *-----------------+-----------------------------------------------------------+ -| -rollingupgrade rollback | Rollsback a rolling upgrade operation. +| -rollingupgrade rollback | Rollback a rolling upgrade operation. *-----------------+-----------------------------------------------------------+ ** <<<dfsadmin>>> Runs a HDFS dfsadmin client. 
- Usage: <<<hdfs dfsadmin [GENERIC_OPTIONS] ++------------------------------------------+ + Usage: hdfs dfsadmin [GENERIC_OPTIONS] [-report [-live] [-dead] [-decommissioning]] [-safemode enter | leave | get | wait] [-saveNamespace] @@ -169,6 +170,8 @@ HDFS Commands Guide [-clrQuota <dirname>...<dirname>] [-setSpaceQuota <quota> <dirname>...<dirname>] [-clrSpaceQuota <dirname>...<dirname>] + [-setStoragePolicy <path> <policyName>] + [-getStoragePolicy <path>] [-finalizeUpgrade] [-rollingUpgrade [<query>|<prepare>|<finalize>]] [-metasave filename] @@ -186,7 +189,8 @@ HDFS Commands Guide [-fetchImage <local directory>] [-shutdownDatanode <datanode_host:ipc_port> [upgrade]] [-getDatanodeInfo <datanode_host:ipc_port>] - [-help [cmd]]>>> + [-help [cmd]] ++------------------------------------------+ *-----------------+-----------------------------------------------------------+ || COMMAND_OPTION || Description @@ -236,6 +240,10 @@ HDFS Commands Guide | {{{../hadoop-hdfs/HdfsQuotaAdminGuide.html#Administrative_Commands}HDFS Quotas Guide}} | for the detail. *-----------------+-----------------------------------------------------------+ +| -setStoragePolicy \<path\> \<policyName\> | Set a storage policy to a file or a directory. +*-----------------+-----------------------------------------------------------+ +| -getStoragePolicy \<path\> | Get the storage policy of a file or a directory. +*-----------------+-----------------------------------------------------------+ | -finalizeUpgrade| Finalize upgrade of HDFS. Datanodes delete their previous | version working directories, followed by Namenode doing the | same. This completes the upgrade process. @@ -250,7 +258,7 @@ HDFS Commands Guide | <filename> will contain one line for each of the following\ | 1. Datanodes heart beating with Namenode\ | 2. Blocks waiting to be replicated\ - | 3. Blocks currrently being replicated\ + | 3. Blocks currently being replicated\ | 4. 
Blocks waiting to be deleted *-----------------+-----------------------------------------------------------+ | -refreshServiceAcl | Reload the service-level authorization policy file. @@ -312,12 +320,30 @@ HDFS Commands Guide | is specified. *-----------------+-----------------------------------------------------------+ +** <<<mover>>> + + Runs the data migration utility. + See {{{./ArchivalStorage.html#Mover_-_A_New_Data_Migration_Tool}Mover}} for more details. + + Usage: <<<hdfs mover [-p <files/dirs> | -f <local file name>]>>> + +*--------------------+--------------------------------------------------------+ +|| COMMAND_OPTION || Description +*--------------------+--------------------------------------------------------+ +| -p \<files/dirs\> | Specify a space separated list of HDFS files/dirs to migrate. +*--------------------+--------------------------------------------------------+ +| -f \<local file\> | Specify a local file containing a list of HDFS files/dirs to migrate. +*--------------------+--------------------------------------------------------+ + + Note that, when both -p and -f options are omitted, the default path is the root directory. + ** <<<namenode>>> Runs the namenode. More info about the upgrade, rollback and finalize is at {{{./HdfsUserGuide.html#Upgrade_and_Rollback}Upgrade Rollback}}. 
- Usage: <<<hdfs namenode [-backup] | ++------------------------------------------+ + Usage: hdfs namenode [-backup] | [-checkpoint] | [-format [-clusterid cid ] [-force] [-nonInteractive] ] | [-upgrade [-clusterid cid] [-renameReserved<k-v pairs>] ] | @@ -329,7 +355,8 @@ HDFS Commands Guide [-initializeSharedEdits] | [-bootstrapStandby] | [-recover [-force] ] | - [-metadataVersion ]>>> + [-metadataVersion ] ++------------------------------------------+ *--------------------+--------------------------------------------------------+ || COMMAND_OPTION || Description @@ -351,7 +378,7 @@ HDFS Commands Guide | -upgradeOnly [-clusterid cid] [-renameReserved\<k-v pairs\>] | Upgrade the | specified NameNode and then shutdown it. *--------------------+--------------------------------------------------------+ -| -rollback | Rollsback the NameNode to the previous version. This +| -rollback | Rollback the NameNode to the previous version. This | should be used after stopping the cluster and | distributing the old Hadoop version. *--------------------+--------------------------------------------------------+ http://git-wip-us.apache.org/repos/asf/hadoop/blob/b014e83b/hadoop-project/src/site/site.xml ---------------------------------------------------------------------- diff --git a/hadoop-project/src/site/site.xml b/hadoop-project/src/site/site.xml index a42aff0..991447f 100644 --- a/hadoop-project/src/site/site.xml +++ b/hadoop-project/src/site/site.xml @@ -93,6 +93,7 @@ <item name="Extended Attributes" href="hadoop-project-dist/hadoop-hdfs/ExtendedAttributes.html"/> <item name="Transparent Encryption" href="hadoop-project-dist/hadoop-hdfs/TransparentEncryption.html"/> <item name="HDFS Support for Multihoming" href="hadoop-project-dist/hadoop-hdfs/HdfsMultihoming.html"/> + <item name="Archival Storage" href="hadoop-project-dist/hadoop-hdfs/ArchivalStorage.html"/> </menu> <menu name="MapReduce" inherit="top">