http://git-wip-us.apache.org/repos/asf/incubator-crail-website/blob/c782b73a/content/css/group.css
----------------------------------------------------------------------
diff --git a/content/css/group.css b/content/css/group.css
deleted file mode 100644
index b03d722..0000000
--- a/content/css/group.css
+++ /dev/null
@@ -1,215 +0,0 @@
-/* Space out content a bit */
- html {
-    overflow:scroll;    
-} 
-
-body {
-  padding-top: 20px;
-  padding-bottom: 20px;
-}
-
-/* Everything but the jumbotron gets side spacing for mobile first views */
-.header,
-.footer {
-  padding-left: 15px;
-  padding-right: 15px;
-}
-
-/* Custom page header */
-.header {
-  border-bottom: 1px solid #e5e5e5;
-}
-
-/* Custom page footer */
-.footer {
-  padding-top: 19px;
-  color: #777;
-  border-top: 1px solid #e5e5e5;
-  text-align: center;
-  font-size: 0.8em;
-  clear: both;
-}
-.footer p {
-  margin: 0.1em 0;
-}
-.footer a:link, .footer a:visited {
-  color: #777;
-}
-
-/* Customize container */
-@media (min-width: 768px) {
-  .container {
-    max-width: 767px;
-  }
-}
-.container-narrow > hr {
-  margin: 30px 0;
-}
-
-/* Jumbotron */
-.jumbotron {
-  text-align: center;
-  border-bottom: 1px solid #e5e5e5;
-}
-.jumbotron p {
-    font-size: 15px;
-}
-.jumbotron {
-    padding-top: 18px;
-    padding-bottom: 10px;
-}
-
-/* Group logo */
-#logo {
-    height: 35px;
-    margin: 2px 0 8px 0;
-}
-
-/* Responsive: Portrait tablets and up */
-@media screen and (min-width: 768px) {
-  /* Remove the padding we set earlier */
-  .header,
-  .footer {
-    clear: both;
-    padding-left: 0;
-    padding-right: 0;
-  }
-  /* Space out the masthead */
-  .header {
-    margin-bottom: 30px;
-  }
-  /* Remove the bottom border on the jumbotron for visual effect */
-  .jumbotron {
-    border-bottom: 0;
-  }
-}
-
-/* Slightly better-looking header on mobile. */
-@media screen and (max-width: 767px) {
-    .nav-pills {
-        display: block;
-        margin: 0 auto 1em;
-    }
-    #logo {
-        display: block;
-        clear: both;
-        margin: 1em auto;
-    }
-}
-
-/* Photo */
-.inset-image {
-    width: 60%;
-    margin: 0 auto;
-}
-
-/* People list */
-
-dl.people dl {
-  width: 100%;
-  clear: both;
-  overflow: hidden; 
-  margin: 0;
-}
-
-dl.people dt {
-  clear: both;
-  float: left;
-  content: " ";
-  width: 15%;
-  margin-bottom: 2em;
-  margin: 0;
-}
-
-dl.people dd {
-  width: 85%; 
-  min-height: 6.5em;
-  margin-bottom: 1em;
-}
-
-
-
-/* Front page news. */
-ul.news .date {
-    color: #999;
-    font-weight: bold;
-    display: block;
-}
-ul.news > li {
-    margin-top: 1em;
-}
-ul.news .shortnews .date {
-    float: left;
-    margin-right: 1em;
-}
-ul.news .bloglink a {
-    font-size: 1.4em;
-}
-
-.illustration {
-    float: right;
-    margin: 0 0 1em 1em;
-}
-
-ul.double-col {
-   columns: 2;
-  -webkit-columns: 2;
-  -moz-columns: 2;
-   overflow:hidden;
-   width: 800px;
-}
-
-div.bio {
-    margin: 0 0 1em 1em;
-}
-
-ul.double-col li {
-    clear: both;
-    height: auto;
-    display: inline;
-    vertical-align: middle;
-    width: 100%;
-    margin: .5rem 0rem;
-    float: left;
-}
-
-.col-md-8 {
-}
-
-.col-md-4 {
-}
-
-table tr:nth-child(odd) {
-    background-color: #FFFFFF;
-}
-
-table tr:nth-child(even) {
-    background-color: #F5F5F5;
-}
-
-table tr:first-child th {
-  border-top: 1;
-}
-
-table tr:last-child td {
-  border-bottom: 1;
-}
-
-table tr td:first-child, table tr th:first-child {
-  border-left: 1;
-}
-
-table tr td:last-child, table tr th:last-child {
-  border-right: 1;
-}
-table, th {
-    background-color: #F5F5F5;
-    border-color: lightgrey;
-}
-
-th, td {
-    border: 0.75px solid grey;
-    padding: 7px;
-    line-height: 24px;
-    border-color: lightgrey;
-}

http://git-wip-us.apache.org/repos/asf/incubator-crail-website/blob/c782b73a/content/css/print.css
----------------------------------------------------------------------
diff --git a/content/css/print.css b/content/css/print.css
deleted file mode 100644
index e95929e..0000000
--- a/content/css/print.css
+++ /dev/null
@@ -1,36 +0,0 @@
-.container {
-    -moz-column-count: 2;
-    -webkit-column-count: 2;
-    column-count: 2;
-
-    font-size: 0.8em;
-}
-
-.header {
-    display: none;
-}
-
-h2 {
-    font-size: 1.3em;
-    margin: 0;
-}
-h3 {
-    font-size: 1.2em;
-    margin-top: 0;
-}
-
-.noprint {
-    display: none;
-}
-
-body {
-    padding: -2em 0 0 0;
-}
-
-/* Disable Bootstrap's link display. */
-@media print {
-    a:link:after,
-    a:visited:after {
-        content: "" !important;
-    }
-}

http://git-wip-us.apache.org/repos/asf/incubator-crail-website/blob/c782b73a/content/documentation/index.html
----------------------------------------------------------------------
diff --git a/content/documentation/index.html b/content/documentation/index.html
deleted file mode 100644
index 68ee8c8..0000000
--- a/content/documentation/index.html
+++ /dev/null
@@ -1,358 +0,0 @@
-<!DOCTYPE html>
-<html>
-    <head>
-        <meta charset="utf-8">
-        <title>The Apache Crail (Incubating) Project: Documentation</title>
-        <meta name="viewport" content="width=device-width, initial-scale=1.0">
-        <link href="http://crail.incubator.apache.org/css/bootstrap.min.css"; 
rel="stylesheet">
-        <link href="http://crail.incubator.apache.org/css/group.css"; 
rel="stylesheet">
-        <link rel="alternate" type="application/atom+xml" title="Atom"
-            href="http://crail.incubator.apache.org/blog/blog.xml";>
-        
-        <meta property="og:image" 
content="http://crail.incubator.apache.org/img/blog/preview/documentation-summary.png";
 />
-        <meta property="og:image:secure_url" 
content="http://crail.incubator.apache.org/img/blog/preview/documentation-summary.png";
 />
-    </head>
-
-    <body>
-        <div class="container">
-          <div class="header">
-            <ul class="nav nav-pills pull-right">
-              
-              
-                
-                <li >
-                  <a href="http://crail.incubator.apache.org/";>
-                    Home
-                  </a>
-                </li>
-              
-                
-                <li >
-                  <a href="http://crail.incubator.apache.org/overview/";>
-                    Overview
-                  </a>
-                </li>
-              
-                
-                <li >
-                  <a href="http://crail.incubator.apache.org/blog/";>
-                    Blog
-                  </a>
-                </li>
-              
-                
-                <li >
-                  <a href="http://crail.incubator.apache.org/community/";>
-                    Community
-                  </a>
-                </li>
-              
-                
-                <li class="active">
-                  <a href="http://crail.incubator.apache.org/documentation/";>
-                    Documentation
-                  </a>
-                </li>
-              
-            </ul>
-            <a href="http://crail.incubator.apache.org/";>
-                <img src="http://crail.incubator.apache.org/img/crail_logo.png";
-                    
srcset="http://crail.incubator.apache.org/img/crail_logo.png";
-                    alt="Crail" id="logo">
-            </a>
-          </div>
-
-          
-          
-          <h2>Documentation</h2>   
-          
-
-          <p>Apache Crail (Incubating) is a fast multi-tiered distributed storage system designed from the ground up for high-performance network and storage hardware. The unique features of Crail include:</p>
-
-<ul>
-  <li>Zero-copy network access from userspace</li>
-  <li>Integration of multiple storage tiers such as DRAM, flash, and disaggregated shared storage</li>
-  <li>Ultra-low latencies for both metadata and data operations. For instance, opening, reading and closing a small file residing in the distributed DRAM tier takes less than 10 microseconds, which is in the same ballpark as some of the fastest RDMA-based key/value stores</li>
-  <li>High-performance sequential read/write operations. For instance, read operations on large files residing in the distributed DRAM tier are typically limited only by the performance of the network</li>
-  <li>Very low CPU consumption: a single core shared by both the application and the file system client can drive sequential read/write operations at speeds of 100Gbps and more</li>
-  <li>Asynchronous API leveraging the asynchronous nature of RDMA-based 
networking hardware</li>
-  <li>Extensible plugin architecture: new storage tiers tailored to specific 
hardware can be added easily</li>
-</ul>
-
-<p>Crail is implemented in Java and offers a Java API which integrates directly with Java off-heap memory. Crail is designed for performance-critical temporary data within the scope of a rack or two.</p>
-
-<h2 id="requirements">Requirements</h2>
-
-<ul>
-  <li>Java 8 or higher</li>
-  <li>RDMA-based network, e.g., InfiniBand, iWARP, RoCE. There are two options to run Crail without RDMA networking hardware: (a) use SoftiWARP, or (b) use the TCP/DRAM storage tier</li>
-  <li>Libdisni.so, available as part of <a 
href="https://github.com/zrlio/disni";>DiSNI</a></li>
-</ul>
-
-<h2 id="building">Building</h2>
-
-<p>To build Crail from source using <a href="http://maven.apache.org/";>Apache 
Maven</a>, execute the following steps:</p>
-
-<ol>
-  <li>Obtain a copy of <a 
href="https://github.com/apache/incubator-crail";>Crail</a> from Github</li>
-  <li>Run: mvn -DskipTests install</li>
-  <li>Copy the tarball to the cluster and unpack it using tar xvfz crail-1.0-bin.tar.gz</li>
-</ol>
-
-<p>Note: later, when deploying Crail, make sure libdisni.so is part of your LD_LIBRARY_PATH. The easiest way to make this work is to copy libdisni.so into crail-1.0/lib.</p>
-
-<h2 id="configuration">Configuration</h2>
-
-<p>To configure Crail, use crail-site.conf.template as a basis and modify it to match your environment.</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>cd crail-1.0/conf
-mv crail-site.conf.template crail-site.conf
-</code></pre></div></div>
-
-<p>There are general file system properties and specific properties for the different storage tiers. A typical configuration for the general file system section may look as follows:</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>crail.namenode.address                
crail://namenode:9060
-crail.storage.types                   
org.apache.crail.storage.rdma.RdmaStorageTier
-crail.cachepath                       /dev/hugepages/cache
-crail.cachelimit                      12884901888
-crail.blocksize                       1048576
-crail.buffersize                      1048576
-</code></pre></div></div>
-
-<p>In this configuration the namenode is configured to run using port 9060 on 
host ‘namenode’, which must be a valid host in the cluster. We further 
configure a single storage tier, in this case the RDMA-based DRAM tier. The 
cachepath property needs to point to a directory that is used by the file 
system to allocate memory for the client cache. Up to cachelimit size, all the 
memory that is used by Crail will be allocated via mmap from this location. 
Ideally, the directory specified in cachepath points to a hugetlbfs mountpoint. 
Aside from the general properties, each storage tier needs to be configured 
separately.</p>
-
-<h3 id="rdmadram-storage">RDMA/DRAM Storage</h3>
-
-<p>For the RDMA/DRAM tier we need to specify the interface that should be used 
by the storage nodes.</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>crail.storage.rdma.interface         eth0
-</code></pre></div></div>
-
-<p>The datapath property specifies a path from which the storage nodes will 
allocate blocks of memory via mmap. Again, that path best points to a hugetlbfs 
mountpoint.</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>crail.storage.rdma.datapath          /memory/data
-</code></pre></div></div>
-
-<p>You want to specify how much DRAM each datanode should donate into the file 
system pool using the <code class="highlighter-rouge">storagelimit</code> 
property. DRAM is allocated in chunks of <code 
class="highlighter-rouge">allocationsize</code>, which needs to be a multiple 
of <code class="highlighter-rouge">crail.blocksize</code>.</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>crail.storage.rdma.allocationsize    1073741824
-crail.storage.rdma.storagelimit      75161927680
-</code></pre></div></div>
-
-<p>Crail supports optimized local operations via memcpy (instead of RDMA) in case a given file operation is backed by a local storage node. The indexpath specifies where Crail will store the metadata that makes these optimizations possible. Important: the indexpath must NOT point to a hugetlbfs mountpoint, because index files will be updated, which is not possible in hugetlbfs.</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>crail.storage.rdma.localmap          true
-crail.storage.rdma.indexpath         /index
-</code></pre></div></div>
-
-<h3 id="nvmfflash-storage">NVMf/Flash Storage</h3>
-
-<p>Crail is a multi-tiered storage system. Additional tiers can be enabled by adding them to the configuration as follows.</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>crail.storage.types                  
org.apache.crail.storage.rdma.RdmaStorageTier,org.apache.crail.storage.nvmf.NvmfStorageTier
-</code></pre></div></div>
-
-<p>For the NVMf storage tier we need to configure the server IP that is used 
when listening for new connections. We also need to configure the PCI address 
of the flash device we want to use, as well as the huge page mount point to be 
used for allocating memory.</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>crail.storage.nvmf.bindip           10.40.0.XX
-crail.storage.nvmf.pcieaddr         0000:11:00.0
-crail.storage.nvmf.hugedir          /dev/hugepages
-crail.storage.nvmf.servermempool    512
-crail.storage.nvmf.clientmempool    512
-</code></pre></div></div>
-
-<h2 id="deploying">Deploying</h2>
-
-<p>For all deployments, make sure you define CRAIL_HOME on each machine to 
point to the top level Crail directory.</p>
-
-<h3 id="starting-crail-manually">Starting Crail manually</h3>
-
-<p>The simplest way to run Crail is to start it manually on just a handful of nodes. You will need to start the Crail namenode, plus at least one datanode. To start the namenode, execute the following command on the host that is configured to be the namenode:</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>cd crail-1.0/
-./bin/crail namenode
-</code></pre></div></div>
-
-<p>To start a datanode run the following command on a host in the cluster 
(ideally this is a different physical machine than the one running the 
namenode):</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>./bin/crail datanode
-</code></pre></div></div>
-
-<p>Now you should have a small deployment up with just one datanode. In this case the datanode is of type RDMA/DRAM, which is the default datanode type. If you want to start a different storage tier, you can do so by passing a specific datanode class as follows:</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>./bin/crail datanode -t 
org.apache.crail.storage.nvmf.NvmfStorageTier
-</code></pre></div></div>
-
-<p>This would start the shared storage datanode. Note that the configuration in crail-site.conf needs to have the specific properties of this type of datanode set in order for this to work.</p>
-
-<h3 id="larger-deployments">Larger deployments</h3>
-
-<p>To run larger deployments, start Crail using</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>./bin/start-crail.sh
-</code></pre></div></div>
-
-<p>Similarly, Crail can be stopped by using</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>./bin/stop-crail.sh
-</code></pre></div></div>
-
-<p>For this to work, include the list of machines on which to start datanodes in conf/slaves. You can start multiple datanodes of different types on the same host as follows:</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>host02-ib
-host02-ib -t org.apache.crail.storage.nvmf.NvmfStorageTier
-host03-ib
-</code></pre></div></div>
-
-<p>In this example, we are configuring a Crail cluster with two physical hosts but three datanodes and two different storage tiers.</p>
-
-<h2 id="crail-shell">Crail Shell</h2>
-
-<p>Crail provides an HDFS adaptor; thus, you can interact with Crail using the HDFS shell:</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>./bin/crail fs
-</code></pre></div></div>
-
-<p>Crail, however, does not implement the full HDFS shell functionality. The basic commands to copy files to/from Crail, or to move and delete files, will work.</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>./bin/crail fs -mkdir /test
-./bin/crail fs -ls /
-./bin/crail fs -copyFromLocal &lt;path-to-local-file&gt; /test
-./bin/crail fs -cat /test/&lt;file-name&gt;
-</code></pre></div></div>
-
-<p>For the Crail shell to work properly, the HDFS configuration in 
crail-1.0/conf/core-site.xml needs to be configured accordingly:</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>&lt;configuration&gt;
-  &lt;property&gt;
-   &lt;name&gt;fs.crail.impl&lt;/name&gt;
-   &lt;value&gt;org.apache.crail.hdfs.CrailHadoopFileSystem&lt;/value&gt;
-  &lt;/property&gt;
-  &lt;property&gt;
-    &lt;name&gt;fs.defaultFS&lt;/name&gt;
-    &lt;value&gt;crail://namenode:9060&lt;/value&gt;
-  &lt;/property&gt;
-  &lt;property&gt;
-    &lt;name&gt;fs.AbstractFileSystem.crail.impl&lt;/name&gt;
-    &lt;value&gt;org.apache.crail.hdfs.CrailHDFS&lt;/value&gt;
-  &lt;/property&gt;
- &lt;/configuration&gt;
-</code></pre></div></div>
-
-<p>Note that the Crail HDFS interface currently cannot provide the full performance of Crail due to limitations of the HDFS API. In particular, the HDFS <code class="highlighter-rouge">FSDataOutputStream</code> API only supports heap-based <code class="highlighter-rouge">byte[]</code> arrays, which requires a data copy. Moreover, HDFS operations are synchronous, preventing efficient pipelining of operations. Applications that seek the best performance should instead use the Crail interface directly, as shown next.</p>
-
-<h2 id="programming-against-crail">Programming against Crail</h2>
-
-<p>The best way to program against Crail is to use Maven. Make sure you have 
the Crail dependency specified in your application pom.xml file:</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>&lt;dependency&gt;
-  &lt;groupId&gt;org.apache.crail&lt;/groupId&gt;
-  &lt;artifactId&gt;crail-client&lt;/artifactId&gt;
-  &lt;version&gt;1.0&lt;/version&gt;
-&lt;/dependency&gt;
-</code></pre></div></div>
-
-<p>Then, create a Crail client as follows:</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>CrailConfiguration conf = new CrailConfiguration();
-CrailStore store = CrailStore.newInstance(conf);
-</code></pre></div></div>
-
-<p>Make sure the crail-1.0/conf directory is part of the classpath.</p>
-
-<p>Crail supports different file types. The simplest way to create a file in 
Crail is as follows:</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>CrailFile file = store.create(filename, 
CrailNodeType.DATAFILE, CrailStorageClass.DEFAULT, 
CrailLocationClass.DEFAULT).get().syncDir();
-</code></pre></div></div>
-
-<p>Aside from the actual filename, the ‘create()’ call takes as input the storage and location classes, which are preferences for the storage tier and physical location that this file should be created in. Crail tries to satisfy these preferences later when the file is written. In this example we do not request any particular storage or location affinity.</p>
-
-<p>This ‘create()’ command is non-blocking; calling ‘get()’ on the returned future object awaits the completion of the call. At that time, the file has been created, but its directory entry may not be visible. Therefore, the file may not yet show up in a file enumeration of the given parent directory. Calling ‘syncDir()’ waits for the directory entry to be completed. Both the ‘get()’ and the ‘syncDir()’ operations can be deferred to a later time, at which point they may have become non-blocking operations.</p>
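-
-<p>To make this deferral concrete, the snippet below splits the chained call from above into its non-blocking and blocking parts. This is only a sketch: it assumes, as the chained call suggests, that ‘get()’ already yields the CrailFile (depending on the Crail version an intermediate conversion such as asFile() may be needed) and that ‘syncDir()’ can be invoked on the file object at a later point.</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// sketch: await only the create RPC for now
-CrailFile file = store.create(filename, CrailNodeType.DATAFILE,
-    CrailStorageClass.DEFAULT, CrailLocationClass.DEFAULT).get();
-
-// ... other work; the directory entry may still be in flight here ...
-
-// wait for the directory entry only once it actually matters,
-// e.g. before enumerating the parent directory
-file.syncDir();
-</code></pre></div></div>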
-
-<p>Once the file is created, a file stream can be obtained for writing:</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>CrailBufferedOutputStream outstream = 
file.getBufferedOutputStream(1024);   
-</code></pre></div></div>
-
-<p>Here, we create a buffered stream so that we can pass heap byte arrays as 
well. We could also create a non-buffered stream using</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>CrailOutputStream outstream = 
file.getDirectOutputStream(1024);
-</code></pre></div></div>
-
-<p>In both cases, we pass a write hint (1024 in the example) that indicates to Crail how much data we intend to write. This allows Crail to optimize metadata node lookups. Crail never prefetches data, but it may fetch the metadata of the very next operation concurrently with the current data operation if the write hint allows it to do so.</p>
-
-<p>Once the stream has been obtained, there are various ways to write a file. The code snippet below shows the use of the asynchronous interface:</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>CrailBuffer dataBuf = fs.allocateBuffer();
-Future&lt;DataResult&gt; future = outputStream.write(dataBuf);
-...
-future.get();
-</code></pre></div></div>
-
-<p>Reading files works very similarly to writing. There are various examples in org.apache.crail.tools.CrailBenchmark.</p>
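-
-<p>As a rough sketch of the read path: the lookup(), asFile() and getBufferedInputStream() calls below do not appear elsewhere on this page and are assumptions that mirror the write-side API shown above; see the CrailBenchmark sources for authoritative examples.</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// sketch: look up an existing file and read it through a buffered stream
-CrailFile file = store.lookup(filename).get().asFile();
-CrailBufferedInputStream instream = file.getBufferedInputStream(1024); // read hint
-byte[] buf = new byte[1024];
-int n = instream.read(buf); // buffered streams accept plain heap arrays
-instream.close();
-</code></pre></div></div>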
-
-<h2 id="tcp-storage-tiers-and-rpc-binding">TCP Storage Tiers and RPC 
binding</h2>
-
-<p>Crail is designed for user-level networking and storage. It does, however, also provide plain TCP-based backends for both storage and RPC and, thus, can be run easily on any machine without requiring special hardware support. The TCP storage backend can be enabled as follows:</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>crail.storage.types         
org.apache.crail.storage.tcp.TcpStorageTier
-</code></pre></div></div>
-
-<p>The TCP RPC binding can be enabled as follows:</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>crail.namenode.rpctype      
org.apache.crail.namenode.rpc.tcp.TcpNameNode
-</code></pre></div></div>
-
-<h2 id="benchmarks">Benchmarks</h2>
-
-<p>Crail provides a set of benchmark tools to measure performance. Type</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>./bin/crail iobench
-</code></pre></div></div>
-
-<p>to get an overview of the available benchmarks. For instance, to benchmark 
the sequential write performance, type</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>./bin/crail iobench -t write -s 1048576 -k 102400 -f 
/tmp.dat
-</code></pre></div></div>
-
-<p>This will create a file of size 100G, written sequentially using 1MB operations.</p>
-
-<p>To read a file sequentially, type</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>./bin/crail iobench -t read -s 1048576 -k 102400 -f 
/tmp.dat
-</code></pre></div></div>
-
-<p>This command issues 102400 read operations of 1MB each.</p>
-
-<p>The tool also contains benchmarks to read files randomly, or to measure the 
performance of opening files, etc.</p>
-
-<h2 id="applications">Applications</h2>
-
-<p>Crail is used by <a 
href="https://github.com/zrlio/crail-spark-io";>Crail-Spark-IO</a>, a 
high-performance shuffle engine for Spark. <a 
href="https://github.com/zrlio/crail-terasort";>Crail-Terasort</a> is a fast 
sorting benchmark for Spark based on Crail.</p>
-
-<h2 id="contributions">Contributions</h2>
-
-<p>PRs are always welcome. Please fork the repository, make the modifications you propose, and let us know.</p>
-
-<h2 id="contact">Contact</h2>
-
-<p>Please join the Crail developer mailing list for discussions and 
notifications. The list is at:</p>
-
-<p>d...@crail.incubator.apache.org.</p>
-
-
-        <br>
-       <br> 
-          <div class="footer">
-            <p>Apache Crail is an effort undergoing <a 
href="https://incubator.apache.org/";>incubation</a> at <a 
href="https://www.apache.org/";>The Apache Software Foundation (ASF)</a>, 
sponsored by the Apache Incubator PMC. Incubation is required of all newly 
accepted projects until a further review indicates that the infrastructure, 
communications, and decision making process have stabilized in a manner 
consistent with other successful ASF projects. While incubation status is not 
necessarily a reflection of the completeness or stability of the code, it does 
indicate that the project has yet to be fully endorsed by the ASF.
-            </p>
-          </div>
-
-        </div> <!-- /container -->
-
-        <!-- Support retina images. -->
-        <script type="text/javascript"
-            
src="http://crail.incubator.apache.org/js/srcset-polyfill.js";></script>
-    </body>
-</html>

http://git-wip-us.apache.org/repos/asf/incubator-crail-website/blob/c782b73a/content/feed.xml
----------------------------------------------------------------------
diff --git a/content/feed.xml b/content/feed.xml
index 9ec960d..247c017 100644
--- a/content/feed.xml
+++ b/content/feed.xml
@@ -1,670 +1 @@
-<?xml version="1.0" encoding="utf-8"?><feed 
xmlns="http://www.w3.org/2005/Atom"; ><generator uri="https://jekyllrb.com/"; 
version="3.6.2">Jekyll</generator><link 
href="http://crail.incubator.apache.org//feed.xml"; rel="self" 
type="application/atom+xml" /><link href="http://crail.incubator.apache.org//"; 
rel="alternate" type="text/html" 
/><updated>2018-06-05T11:03:53+02:00</updated><id>http://crail.incubator.apache.org//</id><title
 type="html">The Apache Crail (Incubating) Project</title><entry><title 
type="html">Sparksummit</title><link 
href="http://crail.incubator.apache.org//blog/2018/06/sparksummit.html"; 
rel="alternate" type="text/html" title="Sparksummit" 
/><published>2018-06-05T00:00:00+02:00</published><updated>2018-06-05T00:00:00+02:00</updated><id>http://crail.incubator.apache.org//blog/2018/06/sparksummit</id><content
 type="html" 
xml:base="http://crail.incubator.apache.org//blog/2018/06/sparksummit.html";>&lt;p&gt;A
 Spark serverless architecture powered by Crail will be presented today at the &lt;a 
href=&quot;https://databricks.com/session/serverless-machine-learning-on-modern-hardware-using-apache-spark&quot;&gt;Spark
 Summit&lt;/a&gt;&lt;/p&gt;</content><author><name></name></author><category 
term="news" /><summary type="html">A Spark serverless architecture powered by 
Crail will be presented today at the Spark 
Summit</summary></entry><entry><title type="html">Dataworks</title><link 
href="http://crail.incubator.apache.org//blog/2018/06/dataworks.html"; 
rel="alternate" type="text/html" title="Dataworks" 
/><published>2018-06-05T00:00:00+02:00</published><updated>2018-06-05T00:00:00+02:00</updated><id>http://crail.incubator.apache.org//blog/2018/06/dataworks</id><content
 type="html" 
xml:base="http://crail.incubator.apache.org//blog/2018/06/dataworks.html";>&lt;p&gt;Apache
 Crail (incubating) to feature in the &lt;a 
href=&quot;https://dataworkssummit.com/san-jose-2018/session/data-processing-at-the-speed-of-100-gbpsapache-crail-incubating/&quot;&gt;DataWorks
 Summit&lt;/a&gt; on June 
21st&lt;/p&gt;</content><author><name></name></author><category term="news" 
/><summary type="html">Apache Crail (incubating) to feature in the DataWorks 
Summit on June 21st</summary></entry><entry><title type="html">Apache 
Release</title><link 
href="http://crail.incubator.apache.org//blog/2018/06/apache-release.html"; 
rel="alternate" type="text/html" title="Apache Release" 
/><published>2018-06-04T00:00:00+02:00</published><updated>2018-06-04T00:00:00+02:00</updated><id>http://crail.incubator.apache.org//blog/2018/06/apache-release</id><content
 type="html" 
xml:base="http://crail.incubator.apache.org//blog/2018/06/apache-release.html";>&lt;p&gt;Apache
 Crail 1.0 incubator &lt;a 
href=&quot;https://dist.apache.org/repos/dist/release/incubator/crail/1.0-incubating/&quot;&gt;release&lt;/a&gt;&lt;/p&gt;</content><author><name></name></author><category
 term="news" /><summary type="html">Apache Crail 1.0 incubator 
 release</summary></entry><entry><title type="html">Apache</title><link href="http://crail.incubator.apache.org//blog/2018/01/apache.html"; 
rel="alternate" type="text/html" title="Apache" 
/><published>2018-01-22T00:00:00+01:00</published><updated>2018-01-22T00:00:00+01:00</updated><id>http://crail.incubator.apache.org//blog/2018/01/apache</id><content
 type="html" 
xml:base="http://crail.incubator.apache.org//blog/2018/01/apache.html";>&lt;p&gt;Crail
 is now an Apache Incubator 
project!&lt;/p&gt;</content><author><name></name></author><category term="news" 
/><summary type="html">Crail is now an Apache Incubator 
project!</summary></entry><entry><title type="html">Iops</title><link 
href="http://crail.incubator.apache.org//blog/2017/11/iops.html"; 
rel="alternate" type="text/html" title="Iops" 
/><published>2017-11-23T00:00:00+01:00</published><updated>2017-11-23T00:00:00+01:00</updated><id>http://crail.incubator.apache.org//blog/2017/11/iops</id><content
 type="html" 
xml:base="http://crail.incubator.apache.org//blog/2017/11/iops.html";>&lt;p&gt;New
 blog
  &lt;a 
href=&quot;http://crail.incubator.apache.org/blog/2017/11/crail-metadata.html&quot;&gt;post&lt;/a&gt;
 about Crail’s metadata performance and 
scalability&lt;/p&gt;</content><author><name></name></author><category 
term="news" /><summary type="html">New blog post about Crail’s metadata 
performance and scalability</summary></entry><entry><title type="html">Crail 
Storage Performance – Part III: Metadata</title><link 
href="http://crail.incubator.apache.org//blog/2017/11/crail-metadata.html"; 
rel="alternate" type="text/html" title="Crail Storage Performance -- Part III: 
Metadata" 
/><published>2017-11-21T00:00:00+01:00</published><updated>2017-11-21T00:00:00+01:00</updated><id>http://crail.incubator.apache.org//blog/2017/11/crail-metadata</id><content
 type="html" 
xml:base="http://crail.incubator.apache.org//blog/2017/11/crail-metadata.html";>&lt;div
 style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-This is part III of our series of posts discussing Crail's raw storage 
performance. This part is about Crail's metadata performance and scalability.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;h3 id=&quot;hardware-configuration&quot;&gt;Hardware 
Configuration&lt;/h3&gt;
-
-&lt;p&gt;The specific cluster configuration used for the experiments in this 
blog:&lt;/p&gt;
-
-&lt;ul&gt;
-  &lt;li&gt;Cluster
-    &lt;ul&gt;
-      &lt;li&gt;8 node x86_64 cluster&lt;/li&gt;
-    &lt;/ul&gt;
-  &lt;/li&gt;
-  &lt;li&gt;Node configuration
-    &lt;ul&gt;
-      &lt;li&gt;CPU: 2 x Intel(R) Xeon(R) CPU E5-2690 0 @ 2.90GHz&lt;/li&gt;
-      &lt;li&gt;DRAM: 96GB DDR3&lt;/li&gt;
-      &lt;li&gt;Network: 1x100Gbit/s Mellanox ConnectX-5&lt;/li&gt;
-    &lt;/ul&gt;
-  &lt;/li&gt;
-  &lt;li&gt;Software
-    &lt;ul&gt;
-      &lt;li&gt;Ubuntu 16.04.3 LTS (Xenial Xerus) with Linux kernel version 
4.10.0-33-generic&lt;/li&gt;
-      &lt;li&gt;Crail 1.0, internal version 2993&lt;/li&gt;
-    &lt;/ul&gt;
-  &lt;/li&gt;
-&lt;/ul&gt;
-
-&lt;h3 id=&quot;crail-metadata-operation-overview&quot;&gt;Crail Metadata 
Operation Overview&lt;/h3&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-As described in &lt;a 
href=&quot;http://crail.incubator.apache.org/blog/2017/08/crail-memory.html&quot;&gt;part
 I&lt;/a&gt;, Crail data operations are composed of actual data transfers and 
metadata operations. Examples of metadata operations are operations for 
creating or modifying the state of a file, or operations to lookup the storage 
server that stores a particular range (block) of a file. In Crail, all the 
metadata is managed by the namenode(s) (as opposed to the data which is managed 
by the storage nodes). Clients interact with Crail namenodes via Remote 
Procedure Calls (RPCs). Crail supports multiple RPC protocols for different 
types of networks and also offers a pluggable RPC interface so that new RPC 
bindings can be implemented easily. On RDMA networks, the default DaRPC (&lt;a 
href=&quot;https://dl.acm.org/citation.cfm?id=2670994&quot;&gt;DaRPC 
paper&lt;/a&gt;, &lt;a href=&quot;http://github.com/zrlio/darpc&quot;&gt;DaRPC 
GitHub&lt;/a&gt;) based RPC binding provides the best performance. The figure below gives an overview of the Crail metadata 
processing in a DaRPC configuration. 
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;div style=&quot;text-align:center&quot;&gt;&lt;img 
src=&quot;http://crail.incubator.apache.org/img/blog/crail-metadata/rpc.png&quot;
 width=&quot;480&quot; /&gt;&lt;/div&gt;
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-Crail supports partitioning of metadata across several namenodes. Thereby, metadata operations issued by clients are hashed to a particular namenode depending on the name of the object the operation attempts to create or retrieve. 
With the DaRPC binding, RPC messages are exchanged using RDMA send/recv 
operations. At the server, RPC processing is parallelized across different 
cores. To minimize locking and cache contention, each core handles a disjoint 
set of client connections. Connections assigned to the same core share the same 
RDMA completion queue which is processed exclusively by that given core. All 
the network queues, including send-, recv- and completion queues are mapped 
into user-space and accessed directly from within the JVM process. Since Crail 
offers a hierarchical storage namespace, metadata operations to create, delete 
or rename new storage resources effectively result in modifications to a 
tree-like data structure at the namenode. These structural operations require a 
somewhat more expensive locking than the more lightweight operations used to 
look up the file status or to extend a file with a new storage block. 
Consequently, Crail namenodes use two separate data structures to manage 
metadata: (a) a basic tree data structure that requires directory-based 
locking, and (b) a fast lock-free map to look up storage resources that are currently being read or written.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;h3 id=&quot;experimental-setup&quot;&gt;Experimental Setup&lt;/h3&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-In two of the previous blogs (&lt;a 
href=&quot;http://crail.incubator.apache.org/blog/2017/08/crail-memory.html&quot;&gt;DRAM&lt;/a&gt;,
 &lt;a 
href=&quot;http://crail.incubator.apache.org/blog/2017/08/crail-nvme-fabrics-v1.html&quot;&gt;NVMf&lt;/a&gt;)
 we have already shown that Crail metadata operations are very low latency. 
Essentially a single metadata operation issued by a remote client takes 5-6 
microseconds, which is only slightly more than the raw network latency of the 
RDMA network fabric. In this blog, we want to explore the scalability of 
Crail's metadata management, that is, the number of clients Crail can support, 
or how Crail scales as the cluster size increases. The level of scalability of 
Crail is mainly determined by the number of metadata operations Crail can 
process concurrently, a metric that is often referred to as IOPS. The higher 
the number of IOPS the system can handle, the more clients can concurrently use 
Crail without performance loss. 
-&lt;/p&gt;
-&lt;p&gt;
-An important metadata operation is ''getFile()'', which is used by clients to 
lookup the status of a file (whether the file exists, what size it has, etc.). 
The ''getFile()'' operation is served by Crail's fast lock-free map and in 
spirit is very similar to the ''getBlock()'' metadata operation (used by 
clients to query which storage nodes holds a particular block). In a typical 
Crail use case, ''getFile()'' and ''getBlock()'' are responsible for the peak 
metadata load at a namenode. In this experiment, we measure the achievable IOPS 
on the server side in an artificial configuration with many clients distributed 
across the cluster issuing ''getFile()'' in a tight loop. Note that the client 
side RPC interface in Crail is asynchronous, thus, clients can issue multiple 
metadata operations without blocking while asynchronously waiting for the 
result. In the experiments below, each client may have a maximum of 128 
''getFile()'' operations outstanding at any point in time. In a practical 
 scenario, Crail clients may also have multiple metadata operations in flight 
either because clients are shared by different cores, or because Crail 
interleaves metadata and data operations (see &lt;a 
href=&quot;http://crail.incubator.apache.org/blog/2017/08/crail-memory.html&quot;&gt;DRAM&lt;/a&gt;).
 What makes the benchmark artificial is that clients exclusively focus on 
generating load for the namenode and thereby are neither performing data 
operations nor are they doing any compute. The basic command of the benchmark 
as executed by each of the individual clients is given by the following command:
-&lt;/p&gt;
-&lt;/div&gt;
-&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div 
class=&quot;highlight&quot;&gt;&lt;pre 
class=&quot;highlight&quot;&gt;&lt;code&gt;./bin/crail iobench -t 
getMultiFileAsync -f / -k 10000000 -b 128
-&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-Where ''-t'' specifies the benchmark to run, ''-f'' specifies the path on the
-Crail file system to be used for the benchmark, ''-k'' specifies the number of
-iterations to be performed by the benchmark
-(how many times the benchmark will execute ''getFile()'') and
-''-b'' specifies the maximum number of requests in flight.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;h3 id=&quot;single-namenode-scalability&quot;&gt;Single Namenode 
Scalability&lt;/h3&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-In the first experiment, we measure the aggregated number of metadata 
operations a single Crail namenode can handle per second. The namenode runs on 
8 physical cores with hyper-threading disabled. The result is shown in the 
first graph below, labeled ''Namenode IOPS''. The namenode only gets saturated 
with more than 16 clients. The graph shows that the namenode can handle close 
to 10 million ''getFile()'' operations per second. With significantly more 
clients, the overall number of IOPS drops slightly, as more resources are being allocated on the single RDMA card, which basically creates contention on hardware resources.
-&lt;/p&gt;
-&lt;p&gt; 
-For comparison, we measure the raw number of IOPS that can be executed on the RDMA network. We measure the raw number using ib_send_bw. We configured ib_send_bw with the same parameters in terms of RDMA configuration as the namenode. This means we instructed ib_send_bw not to do CQ moderation, and to use a receive queue and a send queue of length 32, which equals the length of the namenode queues. Note that the default configuration of ib_send_bw uses CQ moderation and does preposting of send operations, which can only be done if the operation is known in advance. This is not the case in a real system, like Crail's namenode. The basic ib_send_bw command is given below:
-&lt;/p&gt;
-&lt;/div&gt;
-&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div 
class=&quot;highlight&quot;&gt;&lt;pre 
class=&quot;highlight&quot;&gt;&lt;code&gt;ib_send_bw -s 1 -Q 1 -r 32 -t 32 -n 
10000000
-&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-Where ''-s 1'' specifies to send packets with a payload of 1 (we don't want to
-measure the transmission time of data, just the number of I/O operations),
-''-Q 1'' specifies not to do CQ moderation, ''-r 32'' specifies the receive
-queue length to be 32, ''-t 32'' specifies the send queue length to be 32
-and ''-n'' specifies the number of
-iterations to be performed by ib_send_bw.
-&lt;/p&gt;
-&lt;/div&gt;
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-The line showing the raw number of IOPS, labeled ''ib send'', is shown in the same graph. With this measurement we show that Crail's namenode IOPS are similar to the raw ib_send_bw IOPS with the same configuration.
-&lt;/p&gt;
-&lt;/div&gt;
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-&lt;div style=&quot;text-align:center&quot;&gt;&lt;img 
src=&quot;http://crail.incubator.apache.org/img/blog/crail-metadata/namenode_ibsend_iops64.svg&quot;
 width=&quot;550&quot; /&gt;&lt;/div&gt;
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-If one starts ib_send_bw without specifying the queue sizes or whether or not to use CQ moderation, the raw number of IOPS might be higher. This is due to the fact that the default values of ib_send_bw use a receive queue of 512, a send queue of 128 and CQ moderation of 100, meaning that a new completion is generated only after 100 sends. For comparison, we did this measurement too and show the result, labeled 'ib_send CQ mod', in the same graph. Fine-tuning of receive and send queue sizes, CQ moderation size, postlists, etc. might lead to a higher number of IOPS. 
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;h3 id=&quot;multiple-namenode-scalability&quot;&gt;Multiple Namenode 
Scalability&lt;/h3&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-To increase the number of IOPS the overall system can handle, we allow 
starting multiple namenode instances. Hot metadata operations, such as 
''getFile()'', are distributed over all running instances of the namenode. 
''getFile()'' is implemented such that no synchronization among the namenodes 
is required. As such, we expect good scalability. The graph below compares the 
overall IOPS of a system with one namenode to a system with two namenodes and 
four namenodes.
-&lt;/p&gt;
-&lt;/div&gt;
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-&lt;div style=&quot;text-align:center&quot;&gt;&lt;img 
src=&quot;http://crail.incubator.apache.org/img/blog/crail-metadata/namenode_multi64.svg&quot;
 width=&quot;550&quot; /&gt;&lt;/div&gt;
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-This graph shows that the system can handle around 17Mio IOPS with two namenodes and 28Mio IOPS with four namenodes (with more than 64 clients we measured the number of IOPS to be slightly higher than 30Mio IOPS). Having multiple namenode instances matters especially with a higher number of clients. In the graph we see that the more clients we have, the more we benefit from a second namenode instance, or from even more instances.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-We only have 7 physical nodes available to run the client processes. This
-means that beyond 7 client processes, processes start sharing a physical machine.
-With 64 client processes, each machine runs 9 (10 in one case) client
-instances, which share the cores and the resources of the RDMA hardware.
-We believe this is the reason why the graphs do not appear to scale linearly.
-The total number of IOPS is client-bound, not namenode-bound.
-With more physical machines, we believe that scalability could be demonstrated
-much better. Again, there is absolutely no communication happening among the
-namenodes, which should lead to linear scalability.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;h3 id=&quot;cluster-sizes&quot;&gt;Cluster sizes&lt;/h3&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-Let us look at a concrete application, which ideally runs on a large cluster:
-TeraSort. In a previous blog, &lt;a 
href=&quot;http://crail.incubator.apache.org/blog/2017/01/sorting.html&quot;&gt;sorting&lt;/a&gt;,
-we analyze performance characteristics of TeraSort on Crail on a big cluster
-of 128 nodes, where we run 384 executors in total. This already proves that
-Crail can at least handle 384 clients. Now we analyze the theoretical number
-of clients that can be served without performance loss at the namenode. Still, this theoretical
-number is not a hard limit on the number of clients. Just adding more
-clients would start dropping the number of IOPS per client (not at the
-namenode).
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-In contrast to the benchmarks above, a real-world application like TeraSort
-does not issue RPC requests in a tight loop. It rather does sorting
-(computation), file reading and writing, and of course a certain amount of
-RPCs to manage the files.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-We would like to know how many RPCs a run of TeraSort generates and therefore
-how large the load at the namenode is, in terms of IOPS, for a
-real-world application.
-We ran TeraSort on a data set of 200GB and measured the
-number of IOPS at the namenode with 4 executors, 8 executors and 12 executors.
-Every executor runs 12 cores. For this experiment, we use a single namenode
-instance. We plot the distribution of the number of IOPS measured at the
-namenode over the elapsed runtime of the TeraSort application.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-&lt;div style=&quot;text-align:center&quot;&gt;&lt;img 
src=&quot;http://crail.incubator.apache.org/img/blog/crail-metadata/terasort_iops.svg&quot;
 width=&quot;550&quot; /&gt;&lt;/div&gt;
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-From the graph we pick the peak number of IOPS measured
-throughout the execution time for all three cases. The following table
-shows the three peak IOPS numbers:
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-&lt;center&gt;
-&lt;table&gt;
-  &lt;thead&gt;
-    &lt;tr&gt;
-      &lt;th&gt;#Executor nodes&lt;/th&gt;
-      &lt;th&gt;Measured IOPS&lt;/th&gt;
-      &lt;th&gt;% of single namenode&lt;/th&gt;
-    &lt;/tr&gt;
-  &lt;/thead&gt;
-  &lt;tbody&gt;
-    &lt;tr&gt;
-      &lt;td align=&quot;right&quot;&gt;4&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;32k&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;0.32%&lt;/td&gt;
-    &lt;/tr&gt;
-    &lt;tr&gt;
-      &lt;td align=&quot;right&quot;&gt;8&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;67k&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;0.67%&lt;/td&gt;
-    &lt;/tr&gt;
-    &lt;tr&gt;
-      &lt;td align=&quot;right&quot;&gt;12&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;107k&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;1.07%&lt;/td&gt;
-    &lt;/tr&gt;
-  &lt;/tbody&gt;
-&lt;/table&gt;
-&lt;/center&gt;
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-From this table we see that the load scales linearly. Even more importantly,
-we notice that with 12 nodes we still use only around 1% of the
-number of IOPS a single namenode can handle.
-If we extrapolate this to
-100%, we can handle a cluster size of almost 1200 nodes (1121 clients being just
-below 10Mio IOPS at the namenode). The
-extrapolated numbers would look like this:
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-&lt;center&gt;
-&lt;table&gt;
-  &lt;thead&gt;
-    &lt;tr&gt;
-      &lt;th&gt;#Namenodes&lt;/th&gt;
-      &lt;th&gt;Max IOPS by  namenodes&lt;/th&gt;
-      &lt;th&gt;#Executor nodes&lt;/th&gt;
-      &lt;th&gt;Extrapolated IOPS&lt;/th&gt;
-      &lt;th&gt;% of all namenodes&lt;/th&gt;
-    &lt;/tr&gt;
-  &lt;/thead&gt;
-  &lt;tbody&gt;
-    &lt;tr&gt;
-      &lt;td align=&quot;right&quot;&gt;1&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;10000k&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;1121&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;9996k&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;99.96%&lt;/td&gt;
-    &lt;/tr&gt;
-    &lt;tr&gt;
-      &lt;td align=&quot;right&quot;&gt;1&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;10000k&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;1200&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;10730k&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;107.3%&lt;/td&gt;
-    &lt;/tr&gt;
-    &lt;tr&gt;
-      &lt;td align=&quot;right&quot;&gt;2&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;17000k&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;1906&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;16995k&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;99.97%&lt;/td&gt;
-    &lt;/tr&gt;
-    &lt;tr&gt;
-      &lt;td align=&quot;right&quot;&gt;4&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;30000k&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;3364&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;29995k&lt;/td&gt;
-      &lt;td align=&quot;right&quot;&gt;99.98%&lt;/td&gt;
-    &lt;/tr&gt;
-&lt;/tbody&gt;
-&lt;/table&gt;
-&lt;/center&gt;
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-Of course we know that there is no system with perfect linear scalability.
-But even if we lost 50% of the number of IOPS (compared to the
-theoretical maximum) on a big cluster, Crail could still handle a cluster size
-of 600 nodes and a single namenode without any performance loss at the
-namenode.
-Should we still want to run an application like TeraSort on a bigger cluster,
-we can add a second namenode or have even more instances of namenodes
-to ensure that clients do not suffer from contention in terms of IOPS at
-the namenode.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-We believe that the combination of the benchmarks above, the scalability
-experiments and the real-world
-application of TeraSort shows clearly that Crail and Crail's namenode can handle
-a big cluster of at least several hundred nodes, theoretically up to
-1200 nodes with a single namenode and even more with multiple namenodes.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;h3 id=&quot;system-comparison&quot;&gt;System comparison&lt;/h3&gt;
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-In this section we compare the number of IOPS Crail can handle to
-two other systems:
-&lt;a href=&quot;http://hadoop.apache.org/&quot;&gt;Hadoop's HDFS 
namenode&lt;/a&gt; and
-&lt;a 
href=&quot;https://ramcloud.atlassian.net/wiki/spaces/RAM/overview&quot;&gt;RAMCloud&lt;/a&gt;.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-HDFS is a well-known distributed file system. Like Crail, HDFS runs
-a namenode and several datanodes. The namenode implements functionality similar
-to that of Crail's namenode, while HDFS's datanodes provide additional functionality,
-like replication, for example. We are interested in the
-number of IOPS the namenode can handle. As such, the datanode's functionality
-is not relevant for this experiment. HDFS is implemented in Java like Crail.
-Due to this high similarity in terms of functionality and language used to
-implement the system, HDFS is a good candidate to compare Crail to.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-HDFS does not use RDMA to send RPCs. Instead, RPCs are sent over a regular
-IP network. In our case, it is the same 100Gbit/s ethernet-based RoCE network.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-To measure the number of IOPS HDFS's namenode can handle, we run the same
-experiment as for Crail. The clients issue a ''getFile()'' RPC to the
-namenode and we vary the number of clients from 1 to 64. The following
-plot shows the number of IOPS relative to the number of clients.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-&lt;div style=&quot;text-align:center&quot;&gt;&lt;img 
src=&quot;http://crail.incubator.apache.org/img/blog/crail-metadata/namenode_hdfs_iops.svg&quot;
 width=&quot;550&quot; /&gt;&lt;/div&gt;
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-The graph shows that the namenode can handle around 200000 IOPS. One reason
-for the difference to Crail's number of IOPS is surely that HDFS does not
-use the capabilities offered by the RDMA network, while Crail does. However,
-this cannot be the only reason why the namenode cannot handle more than
-200000 IOPS. We would need to analyze more deeply where the bottleneck is
-to find an answer. We believe that the amount of code which
-gets executed, probably at various layers of the software stack,
-is too large to achieve high performance in terms of IOPS.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-RAMCloud is a fast key-value store, which makes use of the RDMA network
-to reach low latency and high throughput. It runs one master coordinator
-and optionally several slave coordinators, which can take over if the master
-coordinator fails. Coordinator persistence can be achieved
-by external persistent storage, like Zookeeper or LogCabin.
-RAMCloud runs several storage servers, which
-store key-value pairs in RAM. Optionally, replicas can be stored on secondary
-storage, which provides persistence. RAMCloud is implemented in C++. Therefore
-it is natively compiled code.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-We are interested in the number of IOPS RAMCloud can handle. We decided
-to run the readThroughput benchmark of RAMCloud's ClusterPerf program, which
-measures the number of object reads per second. This is probably the closest
-benchmark to the RPC benchmark of Crail and HDFS.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-For a fair comparison, we run RAMCloud without any persistence, so without
-Zookeeper and without replicas to secondary storage. We run one coordinator
-and one storage server, which is somewhat similar to running one namenode
-in the Crail and HDFS cases. Also, we wanted to vary the number of clients
-from 1 to 64. At the moment we can only get results for up to 16 clients.
-We asked the RAMCloud developers for possible reasons and learned that the
-cause is a starvation bug in the benchmark (not in the RAMCloud system
-itself). The RAMCloud developers are looking into this issue. We will update
-the blog with the latest numbers as soon as the bug is fixed.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-&lt;div style=&quot;text-align:center&quot;&gt;&lt;img 
src=&quot;http://crail.incubator.apache.org/img/blog/crail-metadata/ramcloud_iops.svg&quot;
 width=&quot;550&quot; /&gt;&lt;/div&gt;
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-RAMCloud reaches a peak of 1.12Mio IOPS with 14 clients. The utilization of the
-dispatcher thread is at 100% already with 10 clients. Even with more clients,
-the number of IOPS won't get higher than 1.12Mio, because the
-dispatcher thread is the bottleneck, as can be seen in the graph.
-In addition, we got a confirmation from the developers that more than
-10 clients will not increase the number of IOPS.
-So we think that the measurements are not unfair, even if we do not have
-results for more than 16 clients. Again, we will update the blog
-with a higher number of clients as soon as the bug is fixed.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-Let us now summarize the number of IOPS of all three systems in one plot
-below. For a fair comparison, Crail runs only one namenode for this
-experiment, and we compare the results to RAMCloud with one coordinator and
-one storage server (without replication as described above) and the one
-namenode instance of HDFS. We see that Crail's single namenode can handle
-a much bigger number of RPCs compared to the other two systems (remember
-that Crail can run multiple namenodes and we measured a number of IOPS
-of 30Mio/s with 4 namenodes).
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-&lt;div style=&quot;text-align:center&quot;&gt;&lt;img 
src=&quot;http://crail.incubator.apache.org/img/blog/crail-metadata/max_iops_crail_hdfs_ramcloud.svg&quot;
 width=&quot;550&quot; /&gt;&lt;/div&gt;
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-HDFS is deployed on production clusters and handles real workloads
-at roughly 200000 IOPS. We believe that Crail, which can handle a much
-higher number of IOPS, is able to run real workloads on very large
-clusters. A common assumption is that Java-based implementations suffer from
-a performance penalty. We show that a Java-based system can sustain a high
-rate of operations even when compared to a C++-based system like RAMCloud.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;h3 id=&quot;summary&quot;&gt;Summary&lt;/h3&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-In this blog we show three key points of Crail: First, Crail's namenode
performs the same as ib_send_bw with realistic parameters in terms of IOPS.
This shows that the actual processing of the RPC is implemented efficiently.
Second, with only one namenode, Crail performs 10x to 50x better than RAMCloud
and HDFS, two popular systems, where RAMCloud is RDMA-based and implemented
natively. Third, Crail's metadata service can be scaled out to serve a large
number of clients. We have shown that Crail offers near-linear scaling with up
to 4 namenodes, offering performance sufficient to serve several thousands of
clients.
-&lt;/p&gt;
-&lt;/div&gt;</content><author><name>Adrian Schuepbach and Patrick 
Stuedi</name></author><category term="blog" /><summary type="html">This is part 
III of our series of posts discussing Crail's raw storage performance. This 
part is about Crail's metadata performance and 
scalability.</summary></entry><entry><title type="html">Floss</title><link 
href="http://crail.incubator.apache.org//blog/2017/11/floss.html"; 
rel="alternate" type="text/html" title="Floss" 
/><published>2017-11-17T00:00:00+01:00</published><updated>2017-11-17T00:00:00+01:00</updated><id>http://crail.incubator.apache.org//blog/2017/11/floss</id><content
 type="html" 
xml:base="http://crail.incubator.apache.org//blog/2017/11/floss.html";>&lt;p&gt;Crail
 features in the &lt;a 
href=&quot;https://twit.tv/shows/floss-weekly/episodes/458?autostart=false&quot;&gt;FLOSS
 weekly 
podcast&lt;/a&gt;&lt;/p&gt;</content><author><name></name></author><category 
term="news" /><summary type="html">Crail features in the FLOSS weekly 
podcast</summary></entry><entry><title type="html">Blog</title><link 
href="http://crail.incubator.apache.org//blog/2017/11/blog.html"; 
rel="alternate" type="text/html" title="Blog" 
/><published>2017-11-17T00:00:00+01:00</published><updated>2017-11-17T00:00:00+01:00</updated><id>http://crail.incubator.apache.org//blog/2017/11/blog</id><content
 type="html" 
xml:base="http://crail.incubator.apache.org//blog/2017/11/blog.html";>&lt;p&gt;New
 blog &lt;a 
href=&quot;http://crail.incubator.apache.org/blog/2017/11/rdmashuffle.html&quot;&gt;post&lt;/a&gt;
 about SparkRDMA and Crail shuffle 
plugins&lt;/p&gt;</content><author><name></name></author><category term="news" 
/><summary type="html">New blog post about SparkRDMA and Crail shuffle 
plugins</summary></entry><entry><title type="html">Spark Shuffle: SparkRDMA vs 
Crail</title><link 
href="http://crail.incubator.apache.org//blog/2017/11/rdmashuffle.html"; 
rel="alternate" type="text/html" title="Spark Shuffle: SparkRDMA vs Crail" 
/><published>2017-11-17T00:00:00+01:00</published><updated>2017-11-17T00:00:00+01:00</updated><id>http://crail.incubator.apache.org//blog/2017/11/rdmashuffle</id><content
 type="html" 
xml:base="http://crail.incubator.apache.org//blog/2017/11/rdmashuffle.html";>&lt;div
 style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-This blog compares the shuffle performance of Crail with SparkRDMA, an
alternative RDMA-based shuffle plugin for Spark.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;h3 id=&quot;hardware-configuration&quot;&gt;Hardware 
Configuration&lt;/h3&gt;
-
-&lt;p&gt;The specific cluster configuration used for the experiments in this 
blog:&lt;/p&gt;
-
-&lt;ul&gt;
-  &lt;li&gt;Cluster
-    &lt;ul&gt;
-      &lt;li&gt;8 compute + 1 management node x86_64 cluster&lt;/li&gt;
-    &lt;/ul&gt;
-  &lt;/li&gt;
-  &lt;li&gt;Node configuration
-    &lt;ul&gt;
-      &lt;li&gt;CPU: 2 x Intel(R) Xeon(R) CPU E5-2690 0 @ 2.90GHz&lt;/li&gt;
-      &lt;li&gt;DRAM: 96GB DDR3&lt;/li&gt;
-      &lt;li&gt;Network: 1x100Gbit/s Mellanox ConnectX-5&lt;/li&gt;
-    &lt;/ul&gt;
-  &lt;/li&gt;
-  &lt;li&gt;Software
-    &lt;ul&gt;
-      &lt;li&gt;Ubuntu 16.04.3 LTS (Xenial Xerus) with Linux kernel version 
4.10.0-33-generic&lt;/li&gt;
-      &lt;li&gt;&lt;a href=&quot;https://github.com/zrlio/crail&quot;&gt;Crail 
1.0&lt;/a&gt;, commit a45c8382050f471e9342e1c6cf25f9f2001af6b5&lt;/li&gt;
-      &lt;li&gt;&lt;a href=&quot;&quot;&gt;Crail Shuffle plugin&lt;/a&gt;, 
commit 2273b5dd53405cab3389f5c1fc2ee4cd30f02ae6&lt;/li&gt;
-      &lt;li&gt;&lt;a 
href=&quot;https://github.com/Mellanox/SparkRDMA&quot;&gt;SparkRDMA&lt;/a&gt;, 
commit d95ce3e370a8e3b5146f4e0ab5e67a19c6f405a5 (latest master on 8th of 
November 2017)&lt;/li&gt;
-    &lt;/ul&gt;
-  &lt;/li&gt;
-&lt;/ul&gt;
-
-&lt;h3 id=&quot;overview&quot;&gt;Overview&lt;/h3&gt;
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-Lately there has been an increasing interest in the community to include RDMA
networking into data processing frameworks like Spark and Hadoop. One natural
spot to integrate RDMA is the shuffle operation, which involves an all-to-all
network communication pattern. Naturally, due to its performance requirements
the shuffle operation is of interest to us as well, and we have developed a
Spark plugin for shuffle. In our previous blog posts, we have already shown
that the Crail Shuffler achieves great workload-level speedups compared to
vanilla Spark. In this blog post, we take a look at another recently proposed
design called &lt;a
href=&quot;https://github.com/Mellanox/SparkRDMA&quot;&gt;SparkRDMA&lt;/a&gt;
(&lt;a
href=&quot;https://issues.apache.org/jira/browse/SPARK-22229&quot;&gt;SPARK-22229
JIRA&lt;/a&gt;). SparkRDMA proposes to improve the shuffle performance of
Spark by performing data transfers over RDMA. For this, the code manages its
own off-heap memory which needs to be registered with the NIC for RDMA use.
It supports two ways to store shuffle data between the stages: (1) shuffle
data is stored in regular files (just like vanilla Spark) but the data
transfer is implemented via RDMA, or (2) data is stored in memory (allocated
and registered for RDMA transfer) and the data transfer is implemented via
RDMA. We call this the &quot;last-mile&quot; approach, where just the
networking operations are replaced by RDMA operations.
-&lt;/p&gt;
-&lt;p&gt;
-In contrast, the Crail shuffler plugin takes a more holistic approach and
leverages the high performance of the Crail distributed data store to deliver
gains. It uses Crail store to efficiently manage I/O resources, storage and
networking devices, memory registrations, client sessions, data distribution,
etc. Consequently, the shuffle operation becomes as simple as writing and
reading files. Recall that Crail store is designed as a fast data bus for
intermediate data. The shuffle operation is just one of many operations that
can be accelerated using Crail store. Beyond these operations, the modular
architecture of Crail store enables us to seamlessly leverage different
storage types (DRAM, NVMe, and more), perform tiering, support disaggregation,
share inter-job data, jointly optimize I/O resources for various workloads,
etc. These capabilities and performance gains give us confidence in the design
choices we made for the Crail project.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;h3 id=&quot;performance-comparison&quot;&gt;Performance 
comparison&lt;/h3&gt;
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;Let's start by quantitatively assessing the performance gains from
the Crail shuffle plugin and SparkRDMA. As described above, SparkRDMA can be
operated in two different modes. Users decide which mode to use by selecting a
particular type of shuffle writer (spark.shuffle.rdma.shuffleWriterMethod).
The Wrapper shuffle writer writes shuffle data to files between the stages,
while the Chunked shuffle writer stores shuffle data in memory. We evaluate
both writer methods for terasort and SQL equijoin; a small configuration
sketch follows below.
-&lt;/p&gt;
-&lt;/div&gt;
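-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-For readers who want to try both shuffle paths themselves, the configuration
-sketch below shows how the two systems might be selected from a Java Spark
-application. The property spark.shuffle.rdma.shuffleWriterMethod is the one
-mentioned above; the exact value strings and the Crail shuffle manager class
-name are assumptions on our side and should be checked against the respective
-plugin documentation.
-&lt;/p&gt;
-&lt;/div&gt;
-&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import org.apache.spark.SparkConf;
-
-public class ShuffleSelectionSketch {
-  public static void main(String[] args) {
-    // SparkRDMA: choose the shuffle writer via the property discussed above.
-    // The value strings are assumptions derived from the writer names.
-    SparkConf rdmaConf = new SparkConf()
-        .set(&quot;spark.shuffle.rdma.shuffleWriterMethod&quot;, &quot;Wrapper&quot;);  // or &quot;Chunked&quot;
-
-    // Crail shuffle plugin: replace Spark's shuffle manager. The class name
-    // below is an assumption and not taken from this post.
-    SparkConf crailConf = new SparkConf()
-        .set(&quot;spark.shuffle.manager&quot;,
-            &quot;org.apache.spark.shuffle.crail.CrailShuffleManager&quot;);
-
-    System.out.println(rdmaConf.toDebugString());
-    System.out.println(crailConf.toDebugString());
-  }
-}
-&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;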
-&lt;div style=&quot;text-align:center&quot;&gt;&lt;img 
src=&quot;http://crail.incubator.apache.org/img/blog/rdma-shuffle/terasort.svg&quot;
 width=&quot;550&quot; /&gt;&lt;/div&gt;
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-First we run &lt;a
href=&quot;https://github.com/zrlio/crail-spark-terasort&quot;&gt;terasort&lt;/a&gt;
on our 8+1 machine cluster (see above). We sort 200GB, so each node gets 25GB
of data (equal distribution). We further did a basic search of the parameter
space for each of the systems to find the best possible configuration. In all
the experiments we use 8 executors with 12 cores each. Note that in a typical
Spark run more CPU cores than assigned are engaged because of garbage
collection, etc. In our test runs, assigning 12 cores led to the best
performance.
-&lt;/p&gt;
-&lt;p&gt;
-The plot above shows the runtimes of the various configurations we ran with
terasort. SparkRDMA with the Wrapper shuffle writer performs slightly better
(3-4%) than vanilla Spark, whereas the Chunked shuffle writer shows a 30%
overhead. On a quick inspection we found that this overhead stems from memory
allocation and registration for the shuffle data that is kept in memory
between the stages. Compared to vanilla Spark, Crail's shuffle plugin shows a
performance improvement of around 235%.
-&lt;/p&gt;
-&lt;/div&gt;
-&lt;div style=&quot;text-align:center&quot;&gt;&lt;img 
src=&quot;http://crail.incubator.apache.org/img/blog/rdma-shuffle/sql.svg&quot; 
width=&quot;550&quot; /&gt;&lt;/div&gt;
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-For our second workload we chose the &lt;a
href=&quot;https://github.com/zrlio/sql-benchmarks&quot;&gt;SQL
equijoin&lt;/a&gt; with a &lt;a
href=&quot;https://github.com/zrlio/spark-nullio-fileformat&quot;&gt;special
fileformat&lt;/a&gt; that allows data to be generated on the fly. By
generating data on the fly we eliminate any costs for reading data from
storage and focus entirely on the shuffle performance. The shuffle data size
is around 148GB. Here the Wrapper shuffle writer is slightly slower than
vanilla Spark, while the Chunked shuffle writer is faster by roughly the same
margin. The Crail shuffle plugin again delivers a substantial performance
increase over vanilla Spark.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;Please let us know if you have recommendations about how these
experiments can be improved.&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;h3 id=&quot;summary&quot;&gt;Summary&lt;/h3&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-These benchmarks validate our belief that a &quot;last-mile&quot; integration
cannot deliver the same performance gains as a holistic approach, i.e., one
has to look at the whole picture of how to integrate RDMA into Spark
applications (and, for that matter, into any framework or application).
Replacing the data transfer alone does not lead to the anticipated performance
increase. We learned this the hard way when we initially started working on
Crail.
-&lt;/p&gt;
-
-&lt;/div&gt;</content><author><name>Jonas Pfefferle, Patrick Stuedi, Animesh 
Trivedi, Bernard Metzler, Adrian Schuepbach</name></author><category 
term="blog" /><summary type="html">This blog is comparing the shuffle 
performance of Crail with SparkRDMA, an alternative RDMA-based shuffle plugin 
for Spark.</summary></entry><entry><title type="html">Crail Storage Performance 
– Part II: NVMf</title><link 
href="http://crail.incubator.apache.org//blog/2017/08/crail-nvme-fabrics-v1.html";
 rel="alternate" type="text/html" title="Crail Storage Performance -- Part II: 
NVMf" 
/><published>2017-08-22T00:00:00+02:00</published><updated>2017-08-22T00:00:00+02:00</updated><id>http://crail.incubator.apache.org//blog/2017/08/crail-nvme-fabrics-v1</id><content
 type="html" 
xml:base="http://crail.incubator.apache.org//blog/2017/08/crail-nvme-fabrics-v1.html";>&lt;div
 style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-This is part II of our series of posts discussing Crail's raw storage 
performance. This part is about Crail's NVMe storage tier, a low-latency flash 
storage backend for Crail completely based on user-level storage access.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;h3 id=&quot;hardware-configuration&quot;&gt;Hardware 
Configuration&lt;/h3&gt;
-
-&lt;p&gt;The specific cluster configuration used for the experiments in this 
blog:&lt;/p&gt;
-
-&lt;ul&gt;
-  &lt;li&gt;Cluster
-    &lt;ul&gt;
-      &lt;li&gt;8 node OpenPower cluster&lt;/li&gt;
-    &lt;/ul&gt;
-  &lt;/li&gt;
-  &lt;li&gt;Node configuration
-    &lt;ul&gt;
-      &lt;li&gt;CPU: 2x OpenPOWER Power8 10-core @2.9Ghz&lt;/li&gt;
-      &lt;li&gt;DRAM: 512GB DDR4&lt;/li&gt;
-      &lt;li&gt;4x 512 GB Samsung 960Pro NVMe SSDs (512Byte sector size, no 
metadata)&lt;/li&gt;
-      &lt;li&gt;Network: 1x100Gbit/s Mellanox ConnectX-4 IB&lt;/li&gt;
-    &lt;/ul&gt;
-  &lt;/li&gt;
-  &lt;li&gt;Software
-    &lt;ul&gt;
-      &lt;li&gt;RedHat 7.3 with Linux kernel version 3.10&lt;/li&gt;
-      &lt;li&gt;Crail 1.0, internal version 2843&lt;/li&gt;
-      &lt;li&gt;SPDK git commit 5109f56ea5e85b99207556c4ff1d48aa638e7ceb with 
patches for POWER support&lt;/li&gt;
-      &lt;li&gt;DPDK git commit 
bb7927fd2179d7482de58d87352ecc50c69da427&lt;/li&gt;
-    &lt;/ul&gt;
-  &lt;/li&gt;
-&lt;/ul&gt;
-
-&lt;h3 id=&quot;the-crail-nvmf-storage-tier&quot;&gt;The Crail NVMf Storage 
Tier&lt;/h3&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-Crail is a framework that allows arbitrary storage backends to be added by
implementing the Crail storage interface. A storage backend manages the
point-to-point data transfers at per-block granularity between a Crail client
and a set of storage servers. The Crail storage interface essentially consists
of three virtual functions, which, in simplified form, look like this:
-&lt;/p&gt;
-&lt;/div&gt;
-&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div 
class=&quot;highlight&quot;&gt;&lt;pre 
class=&quot;highlight&quot;&gt;&lt;code&gt;//Server-side interface: donate 
storage resources to Crail
-StorageResource allocateResource();
-//Client-side interface: read/write remote/local storage resources
-writeBlock(BlockInfo, ByteBuffer);
-readBlock(BlockInfo, ByteBuffer);
-&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-A specific implementation of this interface provides an efficient mapping of
Crail storage operations to the actual storage and network hardware the
backend exports. Crail comes with two native storage backends, an RDMA-based
DRAM backend and an RDMA-based NVMe backend, but other storage backends are
available as well (e.g., Netty), and we plan to provide more custom backends
in the future as new storage and network technologies emerge.
-&lt;/p&gt;
-&lt;p&gt;
-The Crail NVMf storage backend we evaluate in this blog provides user-level 
access to local and remote flash through the NVMe over Fabrics protocol. Crail 
NVMf is implemented using &lt;a 
href=&quot;https://github.com/zrlio/disni&quot;&gt;DiSNI&lt;/a&gt;, a 
user-level network and storage interface for Java offering both RDMA and NVMf 
APIs. DiSNI itself is based on &lt;a 
href=&quot;http://www.spdk.io&quot;&gt;SPDK&lt;/a&gt; for its NVMf APIs. 
-&lt;/p&gt;
-&lt;p&gt;
-The server side of the NVMf backend is designed such that each server process
manages exactly one NVMe drive. On hosts with multiple NVMe drives one may
start several Crail NVMf servers. A server sets up an NVMf target through
DiSNI and implements the allocateResource() storage interface by allocating
storage regions from the NVMe drive (basically splitting the NVMe namespace
into smaller segments). The Crail storage runtime makes information about
these storage regions available to the Crail namenode, where regions are
further broken down into smaller units called blocks that make up files in
Crail.
-&lt;/p&gt;
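-&lt;/div&gt;
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-To make the region splitting more concrete, here is a small conceptual sketch
-(not the actual Crail NVMf server code; names and sizes are illustrative
-assumptions) of a server handing out fixed-size regions of an NVMe namespace,
-one region per allocateResource() call:
-&lt;/p&gt;
-&lt;/div&gt;
-&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;// Conceptual sketch only: split an NVMe namespace into fixed-size regions.
-public class RegionAllocatorSketch {
-  static final long SECTOR_SIZE = 512;                  // bytes per sector
-  static final long REGION_SIZE = 1024L * 1024 * 1024;  // assumed region size (1GB)
-
-  private final long namespaceBytes;  // total size of the NVMe namespace
-  private long nextOffset = 0;        // next free byte offset in the namespace
-
-  public RegionAllocatorSketch(long namespaceSectors) {
-    this.namespaceBytes = namespaceSectors * SECTOR_SIZE;
-  }
-
-  // Stand-in for allocateResource(): returns {offset, length} of the next
-  // region donated to Crail, or null once the namespace is exhausted.
-  public long[] allocateResource() {
-    if (namespaceBytes - nextOffset &amp;lt; REGION_SIZE) {
-      return null;
-    }
-    long[] region = new long[] { nextOffset, REGION_SIZE };
-    nextOffset += REGION_SIZE;
-    return region;
-  }
-}
-&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
-&lt;div style=&quot;text-align: justify&quot;&gt; 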
-&lt;p&gt;
-The Crail client runtime invokes the NVMf client interface during file 
read/write operations for all data transfers on NVMf blocks. Using the block 
information provided by the namenode, the NVMf storage client implementation is 
able to connect to the appropriate NVMf target and perform the data operations 
using DiSNI's NVMf API.
-&lt;/p&gt;
-&lt;p&gt;
-One downside of the NVMe interface is that byte-level access is prohibited.
Instead, data operations have to be issued for entire drive sectors, which are
typically 512Byte or 4KB in size (we used a 512Byte sector size in all the
experiments shown in this blog). As we wanted to use the standard NVMf
protocol (and Crail has a client-driven philosophy) we needed to implement
byte-level access at the client side. For reads this can be achieved in a
straightforward way by reading the whole sector and copying out the requested
part. For writes that modify a subrange of a sector that has already been
written before, we need to perform a read-modify-write operation.
-&lt;/p&gt;
-&lt;/div&gt;
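-
-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-The following is a minimal sketch of this client-side byte-level access logic
-(illustrative only, not the actual Crail NVMf client code): reads fetch whole
-sectors and copy out the requested range, while sub-sector writes perform a
-read-modify-write at sector granularity.
-&lt;/p&gt;
-&lt;/div&gt;
-&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import java.nio.ByteBuffer;
-
-// Conceptual sketch of byte-level access on top of fixed-size sectors.
-public class SectorAccessSketch {
-  static final int SECTOR = 512;  // sector size in bytes
-
-  // Hypothetical sector-granular device interface (stand-in for the NVMf target).
-  interface SectorDevice {
-    void readSectors(long firstSector, ByteBuffer dst);
-    void writeSectors(long firstSector, ByteBuffer src);
-  }
-
-  // Read 'length' bytes starting at an arbitrary byte offset.
-  static byte[] readBytes(SectorDevice dev, long offset, int length) {
-    long firstSector = offset / SECTOR;
-    long lastSector = (offset + length - 1) / SECTOR;
-    ByteBuffer raw = ByteBuffer.allocate((int) (lastSector - firstSector + 1) * SECTOR);
-    dev.readSectors(firstSector, raw);                 // whole-sector read
-    raw.position((int) (offset - firstSector * SECTOR));
-    byte[] out = new byte[length];
-    raw.get(out);                                      // copy out the requested part
-    return out;
-  }
-
-  // Write into a subrange of sectors that may already hold data: read-modify-write.
-  static void writeBytes(SectorDevice dev, long offset, byte[] data) {
-    long firstSector = offset / SECTOR;
-    long lastSector = (offset + data.length - 1) / SECTOR;
-    ByteBuffer buf = ByteBuffer.allocate((int) (lastSector - firstSector + 1) * SECTOR);
-    dev.readSectors(firstSector, buf);                 // read the affected sectors
-    buf.position((int) (offset - firstSector * SECTOR));
-    buf.put(data);                                     // modify the subrange
-    buf.rewind();
-    dev.writeSectors(firstSector, buf);                // write the sectors back
-  }
-}
-&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;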
-
-&lt;h3 
id=&quot;performance-comparison-to-native-spdk-nvmf&quot;&gt;Performance 
comparison to native SPDK NVMf&lt;/h3&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-We perform latency and throughput measurements of our Crail NVMf storage tier
against a native SPDK NVMf benchmark to determine how much overhead our
implementation adds. The first plot shows random read latency on a single
512GB Samsung 960Pro accessed remotely through SPDK. For Crail we also show
the time it takes to perform a metadata operation. You can run the Crail
benchmark from the command line like this:
-&lt;/p&gt;
-&lt;/div&gt;
-&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div 
class=&quot;highlight&quot;&gt;&lt;pre 
class=&quot;highlight&quot;&gt;&lt;code&gt;./bin/crail iobench -t readRandom -b 
false -s &amp;lt;size&amp;gt; -k &amp;lt;iterations&amp;gt; -w 32 -f /tmp.dat
-&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
-&lt;p&gt;and SPDK:&lt;/p&gt;
-&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div 
class=&quot;highlight&quot;&gt;&lt;pre 
class=&quot;highlight&quot;&gt;&lt;code&gt;./perf -q 1 -s &amp;lt;size&amp;gt; 
-w randread -r 'trtype:RDMA adrfam:IPv4 traddr:&amp;lt;ip&amp;gt; 
trsvcid:&amp;lt;port&amp;gt;' -t &amp;lt;time in seconds&amp;gt;
-&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-The main takeaway from this plot is that a random read operation on an
NVMe-backed file in Crail takes only about 7 microseconds longer than fetching
the same amount of data over a point-to-point SPDK connection. This is
impressive because it shows that with Crail a set of NVMe drives can be turned
into a fully distributed storage space at almost no extra cost. The 7
microseconds are due to Crail having to look up the specific NVMe storage node
that holds the data -- an operation which requires one extra network roundtrip
(client to namenode). The experiment represents an extreme case where no
metadata is cached at the client. In practice, file blocks are often accessed
multiple times, in which case the read latency is further reduced. Also note
that unlike SPDK, which is a native library, Crail delivers data directly into
Java off-heap memory.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;div style=&quot;text-align:center&quot;&gt;&lt;img 
src=&quot;http://crail.incubator.apache.org/img/blog/crail-nvmf/latency.svg&quot;
 width=&quot;550&quot; /&gt;&lt;/div&gt;
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-The second plot shows sequential read and write throughput with a transfer 
size of 64KB and 128 outstanding operations. The Crail throughput benchmark can 
be run like this:
-&lt;/p&gt;
-&lt;/div&gt;
-&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div 
class=&quot;highlight&quot;&gt;&lt;pre 
class=&quot;highlight&quot;&gt;&lt;code&gt;./bin/crail iobench -t readAsync -s 
65536 -k &amp;lt;iterations&amp;gt; -b 128 -w 32 -f /tmp.dat
-&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
-&lt;p&gt;and SPDK:&lt;/p&gt;
-&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div 
class=&quot;highlight&quot;&gt;&lt;pre 
class=&quot;highlight&quot;&gt;&lt;code&gt;./perf -q 128 -s 65536 -w read -r 
'trtype:RDMA adrfam:IPv4 traddr:&amp;lt;ip&amp;gt; 
trsvcid:&amp;lt;port&amp;gt;' -t &amp;lt;time in seconds&amp;gt;
-&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-For sequential operations in Crail, metadata fetching is inlined with data
operations as described in the &lt;a
href=&quot;http://crail.incubator.apache.org/blog/2017/08/crail-memory.html&quot;&gt;DRAM&lt;/a&gt;
blog. This hides the metadata latency as long as the metadata RPC completes
faster than the ongoing data transfer, which is typically the case. As a
consequence, our NVMf storage tier reaches the same throughput as the native
SPDK benchmark (device limit).
-&lt;/p&gt;
-&lt;/div&gt;
-&lt;div style=&quot;text-align:center&quot;&gt;&lt;img 
src=&quot;http://crail.incubator.apache.org/img/blog/crail-nvmf/throughput.svg&quot;
 width=&quot;550&quot; /&gt;&lt;/div&gt;
-
-&lt;h3 id=&quot;sequential-throughput&quot;&gt;Sequential Throughput&lt;/h3&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-Let us look at the sequential read and write throughput for buffered and
direct streams and compare them to a buffered Crail stream on DRAM. All
benchmarks are single-thread/single-client and run against 8 storage nodes
with 4 drives each (cf. the configuration above). In this benchmark we use 32
outstanding operations for the NVMf storage tier buffered stream experiments,
with a buffer size of 16MB and a slice size of 512KB, cf. &lt;a
href=&quot;http://crail.incubator.apache.org/blog/2017/07/crail-memory.html&quot;&gt;part
I&lt;/a&gt;. The buffered stream reaches line speed at a transfer size of
around 1KB and shows only slightly slower performance when compared to the
DRAM tier buffered stream. Note, however, that we only use 2 outstanding
operations with the DRAM tier to achieve these results. For sizes smaller than
1KB the buffered stream is limited by the copy speed needed to fill the
application buffer. The direct stream reaches line speed at around 128KB with
128 outstanding operations. Here no copy operation is performed for transfer
sizes greater than 512Byte (the sector size). The command to run the Crail
buffered stream benchmark:
-&lt;/p&gt;
-&lt;/div&gt;
-&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div 
class=&quot;highlight&quot;&gt;&lt;pre 
class=&quot;highlight&quot;&gt;&lt;code&gt;./bin/crail iobench -t read -s 
&amp;lt;size&amp;gt; -k &amp;lt;iterations&amp;gt; -w 32 -f /tmp.dat
-&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
-&lt;p&gt;The direct stream benchmark:&lt;/p&gt;
-&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div 
class=&quot;highlight&quot;&gt;&lt;pre 
class=&quot;highlight&quot;&gt;&lt;code&gt;./bin/crail iobench -t readAsync -s 
&amp;lt;size&amp;gt; -k &amp;lt;iterations&amp;gt; -b 128 -w 32 -f /tmp.dat
-&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
-
-&lt;div style=&quot;text-align:center&quot;&gt;&lt;img 
src=&quot;http://crail.incubator.apache.org/img/blog/crail-nvmf/throughput2.svg&quot;
 width=&quot;550&quot; /&gt;&lt;/div&gt;
-
-&lt;h3 id=&quot;random-read-latency&quot;&gt;Random Read Latency&lt;/h3&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-Random read latency is limited by the flash technology; we currently see
around 70us when performing sector-sized accesses to the device with the Crail
NVMf backend. In comparison, remote DRAM latencies with Crail are about 7-8x
lower. However, we believe that this will change in the near future with new
technologies like PCM. Intel's Optane drives can already deliver random read
latencies of around 10us. Considering that there is an overhead of around 10us
to access a drive with Crail from anywhere in the cluster, using such a device
would put random read latencies somewhere around 20us, which is only about
half the performance of our DRAM tier.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;div style=&quot;text-align:center&quot;&gt;&lt;img 
src=&quot;http://crail.incubator.apache.org/img/blog/crail-nvmf/latency2.svg&quot;
 width=&quot;550&quot; /&gt;&lt;/div&gt;
-
-&lt;h3 id=&quot;tiering-dram---nvmf&quot;&gt;Tiering DRAM - NVMf&lt;/h3&gt;
-
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-In this section we show how Crail can leverage flash memory when there is not
enough DRAM available in the cluster to hold all the data. As described in the
&lt;a
href=&quot;http://crail.incubator.apache.org/overview/&quot;&gt;overview&lt;/a&gt;
section, if multiple storage tiers are deployed in Crail, e.g., the DRAM tier
and the NVMf tier, Crail by default first uses up all available resources of
the faster tier. Basically, a remote resource of a faster tier (e.g., remote
DRAM) is preferred over a slower local resource (e.g., local flash), motivated
by the fast network. This is what we call horizontal tiering; a minimal sketch
of this allocation policy is shown after the figure below.
-&lt;/p&gt;
-&lt;/div&gt;
-&lt;div style=&quot;text-align:center&quot;&gt;&lt;img 
src=&quot;http://crail.incubator.apache.org/img/blog/crail-nvmf/crail_tiering.png&quot;
 width=&quot;500&quot; vspace=&quot;10&quot; /&gt;&lt;/div&gt;
-&lt;p&gt;&lt;br /&gt;&lt;/p&gt;
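-&lt;div style=&quot;text-align: justify&quot;&gt;
-&lt;p&gt;
-The sketch below illustrates the horizontal tiering policy described above in
-a simplified form (hypothetical names, not the actual Crail block allocation
-code): tiers are ordered by speed, and a block is taken from the fastest tier
-that still has free capacity, regardless of whether that resource is local or
-remote.
-&lt;/p&gt;
-&lt;/div&gt;
-&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import java.util.List;
-
-// Conceptual sketch of horizontal tiering: prefer the faster tier, even remotely.
-public class HorizontalTieringSketch {
-  static class Tier {
-    final String name;   // e.g. &quot;DRAM&quot; or &quot;NVMf&quot;
-    long freeBlocks;     // remaining capacity in blocks
-    Tier(String name, long freeBlocks) { this.name = name; this.freeBlocks = freeBlocks; }
-  }
-
-  // Tiers are passed in order of decreasing speed (DRAM before NVMf).
-  static Tier pickTierForNextBlock(List&amp;lt;Tier&amp;gt; tiersBySpeed) {
-    for (Tier t : tiersBySpeed) {
-      if (t.freeBlocks &amp;gt; 0) {
-        t.freeBlocks--;
-        return t;        // fastest tier with free capacity wins
-      }
-    }
-    return null;         // cluster is full
-  }
-}
-&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;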
-&lt;div style=&quot;text-align: justify&quot;&gt; 
-&lt;p&gt;
-In the following 200GB Terasort experiment we gradually limit the DRAM
resources in Crail while adding more flash to the Crail NVMf storage tier.
Note that here Crail is used for both input/output as well as shuffle data.
The figure shows that by putting all the data in flash we only increase the
sorting time by around 48% compared to the configuration where all the data
resides in DRAM. Considering the cost of DRAM and the advances in technology
described above, we believe cheaper NVM storage can replace DRAM for most
applications with only a minor performance decrease. Also, note that even with
100% of the data in NVMe, Spark/Crail is still faster than vanilla Spark with
all the data in memory. The vanilla Spark experiment uses Alluxio for
input/output and RamFS for the shuffle data.
-&lt;/p&gt;
-&lt;/div&gt;
-
-&lt;div style=&quot;text-align:center&quot;&gt;&lt;img 
src=&quot;http://crail.incubator.apache.org/img/blog/crail-nvmf/tiering.svg&quot;
 width=&quot;550&quot; /&gt;&lt;/div&gt;
-
-&lt;p&gt;To summarize, in this blog we have shown that the NVMf storage 
backend for Crail – due to its efficient user-level implementation – offers 
latencies and throughput very close to the hardware speed. The Crail NVMf 
storage tier can be used conveniently in combination with the Crail DRAM tier 
to either save cost or to handle situations where the available DRAM is not 
sufficient to store the working set of a data processing 
workload.&lt;/p&gt;</content><author><name>Jonas 
Pfefferle</name></author><category term="blog" /><summary type="html">This is 
part II of our series of posts discussing Crail's raw storage performance. This 
part is about Crail's NVMe storage tier, a low-latency flash storage backend 
for Crail completely based on user-level storage 
access.</summary></entry></feed>
\ No newline at end of file
+<?xml version="1.0" encoding="utf-8"?><feed 
xmlns="http://www.w3.org/2005/Atom"; ><generator uri="https://jekyllrb.com/"; 
version="3.8.3">Jekyll</generator><link href="http://localhost:4000/feed.xml"; 
rel="self" type="application/atom+xml" /><link href="http://localhost:4000/"; 
rel="alternate" type="text/html" 
/><updated>2018-06-07T13:27:00+02:00</updated><id>http://localhost:4000/</id><title
 type="html">The Apache Crail (Incubating) Project</title></feed>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-crail-website/blob/c782b73a/content/img/apache_incubator.png
----------------------------------------------------------------------
diff --git a/content/img/apache_incubator.png b/content/img/apache_incubator.png
deleted file mode 100644
index 987c79e..0000000
Binary files a/content/img/apache_incubator.png and /dev/null differ
