http://git-wip-us.apache.org/repos/asf/zookeeper/blob/74d00962/_released_docs/r3.5.4-beta/zookeeperOver.html ---------------------------------------------------------------------- diff --git a/_released_docs/r3.5.4-beta/zookeeperOver.html b/_released_docs/r3.5.4-beta/zookeeperOver.html new file mode 100644 index 0000000..04da3b5 --- /dev/null +++ b/_released_docs/r3.5.4-beta/zookeeperOver.html @@ -0,0 +1,692 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> +<html> +<head> +<META http-equiv="Content-Type" content="text/html; charset=UTF-8"> +<meta content="Apache Forrest" name="Generator"> +<meta name="Forrest-version" content="0.9"> +<meta name="Forrest-skin-name" content="pelt"> +<title>ZooKeeper</title> +<link type="text/css" href="skin/basic.css" rel="stylesheet"> +<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet"> +<link media="print" type="text/css" href="skin/print.css" rel="stylesheet"> +<link type="text/css" href="skin/profile.css" rel="stylesheet"> +<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script> +<link rel="shortcut icon" href="images/favicon.ico"> +</head> +<body onload="init()"> +<script type="text/javascript">ndeSetTextSize();</script> +<div id="top"> +<!--+ + |breadtrail + +--> +<div class="breadtrail"> +<a href="http://www.apache.org/">Apache</a> > <a href="http://zookeeper.apache.org/">ZooKeeper</a> > <a href="http://zookeeper.apache.org/">ZooKeeper</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script> +</div> +<!--+ + |header + +--> +<div class="header"> +<!--+ + |start group logo + +--> +<div class="grouplogo"> +<a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" 
title="Apache Hadoop"></a> +</div> +<!--+ + |end group logo + +--> +<!--+ + |start Project Logo + +--> +<div class="projectlogo"> +<a href="http://zookeeper.apache.org/"><img class="logoImage" alt="ZooKeeper" src="images/zookeeper_small.gif" title="ZooKeeper: distributed coordination"></a> +</div> +<!--+ + |end Project Logo + +--> +<!--+ + |start Search + +--> +<div class="searchbox"> +<form action="http://www.google.com/search" method="get" class="roundtopsmall"> +<input value="zookeeper.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google"> + <input name="Search" value="Search" type="submit"> +</form> +</div> +<!--+ + |end search + +--> +<!--+ + |start Tabs + +--> +<ul id="tabs"> +<li> +<a class="unselected" href="http://zookeeper.apache.org/">Project</a> +</li> +<li> +<a class="unselected" href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/">Wiki</a> +</li> +<li class="current"> +<a class="selected" href="index.html">ZooKeeper 3.5 Documentation</a> +</li> +</ul> +<!--+ + |end Tabs + +--> +</div> +</div> +<div id="main"> +<div id="publishedStrip"> +<!--+ + |start Subtabs + +--> +<div id="level2tabs"></div> +<!--+ + |end Endtabs + +--> +<script type="text/javascript"><!-- +document.write("Last Published: " + document.lastModified); +// --></script> +</div> +<!--+ + |breadtrail + +--> +<div class="breadtrail"> + + + </div> +<!--+ + |start Menu, mainarea + +--> +<!--+ + |start Menu + +--> +<div id="menu"> +<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Overview</div> +<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;"> +<div class="menuitem"> +<a href="index.html">Welcome</a> +</div> +<div class="menupage"> +<div class="menupagetitle">Overview</div> +</div> +<div class="menuitem"> 
+<a href="zookeeperStarted.html">Getting Started</a> +</div> +<div class="menuitem"> +<a href="releasenotes.html">Release Notes</a> +</div> +</div> +<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Developer</div> +<div id="menu_1.2" class="menuitemgroup"> +<div class="menuitem"> +<a href="api/index.html">API Docs</a> +</div> +<div class="menuitem"> +<a href="zookeeperProgrammers.html">Programmer's Guide</a> +</div> +<div class="menuitem"> +<a href="javaExample.html">Java Example</a> +</div> +<div class="menuitem"> +<a href="zookeeperTutorial.html">Barrier and Queue Tutorial</a> +</div> +<div class="menuitem"> +<a href="recipes.html">Recipes</a> +</div> +</div> +<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Admin & Ops</div> +<div id="menu_1.3" class="menuitemgroup"> +<div class="menuitem"> +<a href="zookeeperAdmin.html">Administrator's Guide</a> +</div> +<div class="menuitem"> +<a href="zookeeperQuotas.html">Quota Guide</a> +</div> +<div class="menuitem"> +<a href="zookeeperJMX.html">JMX</a> +</div> +<div class="menuitem"> +<a href="zookeeperObservers.html">Observers Guide</a> +</div> +<div class="menuitem"> +<a href="zookeeperReconfig.html">Dynamic Reconfiguration</a> +</div> +</div> +<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Contributor</div> +<div id="menu_1.4" class="menuitemgroup"> +<div class="menuitem"> +<a href="zookeeperInternals.html">ZooKeeper Internals</a> +</div> +</div> +<div onclick="SwitchMenu('menu_1.5', 'skin/')" id="menu_1.5Title" class="menutitle">Miscellaneous</div> +<div id="menu_1.5" class="menuitemgroup"> +<div class="menuitem"> +<a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER">Wiki</a> +</div> +<div class="menuitem"> +<a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/FAQ">FAQ</a> +</div> +<div class="menuitem"> +<a href="http://zookeeper.apache.org/mailing_lists.html">Mailing Lists</a> +</div> 
+</div> +<div id="credit"></div> +<div id="roundbottom"> +<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div> +<!--+ + |alternative credits + +--> +<div id="credit2"></div> +</div> +<!--+ + |end Menu + +--> +<!--+ + |start content + +--> +<div id="content"> +<div title="Portable Document Format" class="pdflink"> +<a class="dida" href="zookeeperOver.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br> + PDF</a> +</div> +<h1>ZooKeeper</h1> +<div id="front-matter"> +<div id="minitoc-area"> +<ul class="minitoc"> +<li> +<a href="#ch_DesignOverview">ZooKeeper: A Distributed Coordination Service for Distributed + Applications</a> +<ul class="minitoc"> +<li> +<a href="#sc_designGoals">Design Goals</a> +</li> +<li> +<a href="#sc_dataModelNameSpace">Data model and the hierarchical namespace</a> +</li> +<li> +<a href="#Nodes+and+ephemeral+nodes">Nodes and ephemeral nodes</a> +</li> +<li> +<a href="#Conditional+updates+and+watches">Conditional updates and watches</a> +</li> +<li> +<a href="#Guarantees">Guarantees</a> +</li> +<li> +<a href="#Simple+API">Simple API</a> +</li> +<li> +<a href="#Implementation">Implementation</a> +</li> +<li> +<a href="#Uses">Uses</a> +</li> +<li> +<a href="#Performance">Performance</a> +</li> +<li> +<a href="#Reliability">Reliability</a> +</li> +<li> +<a href="#The+ZooKeeper+Project">The ZooKeeper Project</a> +</li> +</ul> +</li> +</ul> +</div> +</div> + + + + + +<a name="ch_DesignOverview"></a> +<h2 class="h3">ZooKeeper: A Distributed Coordination Service for Distributed + Applications</h2> +<div class="section"> +<p>ZooKeeper is a distributed, open-source coordination service for + distributed applications. It exposes a simple set of primitives that + distributed applications can build upon to implement higher level services + for synchronization, configuration maintenance, and groups and naming. 
It + is designed to be easy to program to, and uses a data model styled after + the familiar directory tree structure of file systems. It runs in Java and + has bindings for both Java and C.</p> +<p>Coordination services are notoriously hard to get right. They are + especially prone to errors such as race conditions and deadlock. The + motivation behind ZooKeeper is to relieve distributed applications of the + responsibility of implementing coordination services from scratch.</p> +<a name="sc_designGoals"></a> +<h3 class="h4">Design Goals</h3> +<p> +<strong>ZooKeeper is simple.</strong> ZooKeeper + allows distributed processes to coordinate with each other through a + shared hierarchical namespace which is organized similarly to a standard + file system. The name space consists of data registers - called znodes, + in ZooKeeper parlance - and these are similar to files and directories. + Unlike a typical file system, which is designed for storage, ZooKeeper + data is kept in-memory, which means ZooKeeper can achieve high + throughput and low latency numbers.</p> +<p>The ZooKeeper implementation puts a premium on high performance, + highly available, strictly ordered access. The performance aspects of + ZooKeeper mean it can be used in large, distributed systems. The + reliability aspects keep it from being a single point of failure. The + strict ordering means that sophisticated synchronization primitives can + be implemented at the client.</p> +<p> +<strong>ZooKeeper is replicated.</strong> Like the + distributed processes it coordinates, ZooKeeper itself is intended to be + replicated over a set of hosts called an ensemble.</p> +<table class="ForrestTable" cellspacing="1" cellpadding="4"> +<tr> +<td>ZooKeeper Service</td> +</tr> +<tr> +<td> + + <img alt="" src="images/zkservice.jpg"> + + </td> +</tr> +</table> +<p>The servers that make up the ZooKeeper service must all know about + each other. 
They maintain an in-memory image of state, along with + transaction logs and snapshots in a persistent store. As long as a + majority of the servers are available, the ZooKeeper service will be + available.</p> +<p>Clients connect to a single ZooKeeper server. The client maintains + a TCP connection through which it sends requests, gets responses, gets + watch events, and sends heartbeats. If the TCP connection to the server + breaks, the client will connect to a different server.</p> +<p> +<strong>ZooKeeper is ordered.</strong> ZooKeeper + stamps each update with a number that reflects the order of all + ZooKeeper transactions. Subsequent operations can use the order to + implement higher-level abstractions, such as synchronization + primitives.</p> +<p> +<strong>ZooKeeper is fast.</strong> It is + especially fast in "read-dominant" workloads. ZooKeeper applications run + on thousands of machines, and it performs best where reads are more + common than writes, at ratios of around 10:1.</p> +<a name="sc_dataModelNameSpace"></a> +<h3 class="h4">Data model and the hierarchical namespace</h3> +<p>The name space provided by ZooKeeper is much like that of a + standard file system. A name is a sequence of path elements separated by + a slash (/). Every node in ZooKeeper's name space is identified by a + path.</p> +<table class="ForrestTable" cellspacing="1" cellpadding="4"> +<tr> +<td>ZooKeeper's Hierarchical Namespace</td> +</tr> +<tr> +<td> + + <img alt="" src="images/zknamespace.jpg"> + + </td> +</tr> +</table> +<a name="Nodes+and+ephemeral+nodes"></a> +<h3 class="h4">Nodes and ephemeral nodes</h3> +<p>Unlike standard file systems, each node in a ZooKeeper + namespace can have data associated with it as well as children. It is + like having a file-system that allows a file to also be a directory. 
+ (ZooKeeper was designed to store coordination data: status information, + configuration, location information, etc., so the data stored at each + node is usually small, in the byte to kilobyte range.) We use the term + <em>znode</em> to make it clear that we are talking about + ZooKeeper data nodes.</p> +<p>Znodes maintain a stat structure that includes version numbers for + data changes, ACL changes, and timestamps, to allow cache validations + and coordinated updates. Each time a znode's data changes, the version + number increases. For instance, whenever a client retrieves data it also + receives the version of the data.</p> +<p>The data stored at each znode in a namespace is read and written + atomically. Reads get all the data bytes associated with a znode and a + write replaces all the data. Each node has an Access Control List (ACL) + that restricts who can do what.</p> +<p>ZooKeeper also has the notion of ephemeral nodes. These znodes + exist as long as the session that created the znode is active. When the + session ends the znode is deleted. Ephemeral nodes are useful when you + want to implement <em>[tbd]</em>.</p> +<a name="Conditional+updates+and+watches"></a> +<h3 class="h4">Conditional updates and watches</h3> +<p>ZooKeeper supports the concept of <em>watches</em>. + Clients can set a watch on a znode. A watch will be triggered and + removed when the znode changes. When a watch is triggered, the client + receives a packet saying that the znode has changed. If the + connection between the client and one of the ZooKeeper servers is + broken, the client will receive a local notification. These can be used + to <em>[tbd]</em>.</p> +<a name="Guarantees"></a> +<h3 class="h4">Guarantees</h3> +<p>ZooKeeper is very fast and very simple. Since its goal, though, is + to be a basis for the construction of more complicated services, such as + synchronization, it provides a set of guarantees. 
These are:</p> +<ul> + +<li> + +<p>Sequential Consistency - Updates from a client will be applied + in the order that they were sent.</p> + +</li> + + +<li> + +<p>Atomicity - Updates either succeed or fail. No partial + results.</p> + +</li> + + +<li> + +<p>Single System Image - A client will see the same view of the + service regardless of the server that it connects to.</p> + +</li> + +</ul> +<ul> + +<li> + +<p>Reliability - Once an update has been applied, it will persist + from that time forward until a client overwrites the update.</p> + +</li> + +</ul> +<ul> + +<li> + +<p>Timeliness - The client's view of the system is guaranteed to + be up-to-date within a certain time bound.</p> + +</li> + +</ul> +<p>For more information on these, and how they can be used, see + <em>[tbd]</em> +</p> +<a name="Simple+API"></a> +<h3 class="h4">Simple API</h3> +<p>One of the design goals of ZooKeeper is to provide a very simple + programming interface. As a result, it supports only these + operations:</p> +<dl> + +<dt> +<term>create</term> +</dt> +<dd> +<p>creates a node at a location in the tree</p> +</dd> + + +<dt> +<term>delete</term> +</dt> +<dd> +<p>deletes a node</p> +</dd> + + +<dt> +<term>exists</term> +</dt> +<dd> +<p>tests if a node exists at a location</p> +</dd> + + +<dt> +<term>get data</term> +</dt> +<dd> +<p>reads the data from a node</p> +</dd> + + +<dt> +<term>set data</term> +</dt> +<dd> +<p>writes data to a node</p> +</dd> + + +<dt> +<term>get children</term> +</dt> +<dd> +<p>retrieves a list of children of a node</p> +</dd> + + +<dt> +<term>sync</term> +</dt> +<dd> +<p>waits for data to be propagated</p> +</dd> + +</dl> +<p>For a more in-depth discussion on these, and how they can be used + to implement higher level operations, please refer to + <em>[tbd]</em> +</p> +<a name="Implementation"></a> +<h3 class="h4">Implementation</h3> +<p> +<a href="#fg_zkComponents">ZooKeeper Components</a> shows the high-level components + of the ZooKeeper service. 
With the exception of the request processor, + each of + the servers that make up the ZooKeeper service replicates its own copy + of each of the components.</p> +<table class="ForrestTable" cellspacing="1" cellpadding="4"> +<tr> +<td><a name="fg_zkComponents"></a>ZooKeeper Components</td> +</tr> +<tr> +<td> + + <img alt="" src="images/zkcomponents.jpg"> + + </td> +</tr> +</table> +<p>The replicated database is an in-memory database containing the + entire data tree. Updates are logged to disk for recoverability, and + writes are serialized to disk before they are applied to the in-memory + database.</p> +<p>Every ZooKeeper server services clients. Clients connect to + exactly one server to submit requests. Read requests are serviced from + the local replica of each server database. Requests that change the + state of the service, write requests, are processed by an agreement + protocol.</p> +<p>As part of the agreement protocol all write requests from clients + are forwarded to a single server, called the + <em>leader</em>. The rest of the ZooKeeper servers, called + <em>followers</em>, receive message proposals from the + leader and agree upon message delivery. The messaging layer takes care + of replacing leaders on failures and syncing followers with + leaders.</p> +<p>ZooKeeper uses a custom atomic messaging protocol. Since the + messaging layer is atomic, ZooKeeper can guarantee that the local + replicas never diverge. When the leader receives a write request, it + calculates what the state of the system is when the write is to be + applied and transforms this into a transaction that captures this new + state.</p> +<a name="Uses"></a> +<h3 class="h4">Uses</h3> +<p>The programming interface to ZooKeeper is deliberately simple. + With it, however, you can implement higher order operations, such as + synchronization primitives, group membership, ownership, etc. 
Some + distributed applications have used it to: <em>[tbd: add uses from + white paper and video presentation.]</em> For more information, see + <em>[tbd]</em> +</p> +<a name="Performance"></a> +<h3 class="h4">Performance</h3> +<p>ZooKeeper is designed to be highly performant. But is it? The + results of the ZooKeeper development team at Yahoo! Research indicate + that it is. (See <a href="#fg_zkPerfRW">ZooKeeper Throughput as the Read-Write Ratio Varies</a>.) It is especially high + performance in applications where reads outnumber writes, since writes + involve synchronizing the state of all servers. (Reads outnumbering + writes is typically the case for a coordination service.)</p> +<table class="ForrestTable" cellspacing="1" cellpadding="4"> +<tr> +<td><a name="fg_zkPerfRW"></a>ZooKeeper Throughput as the Read-Write Ratio Varies</td> +</tr> +<tr> +<td> + + <img alt="" src="images/zkperfRW-3.2.jpg"> + + </td> +</tr> +</table> +<p>The figure <a href="#fg_zkPerfRW">ZooKeeper Throughput as the Read-Write Ratio Varies</a> is a throughput + graph of ZooKeeper release 3.2 running on servers with dual 2GHz + Xeon and two SATA 15K RPM drives. One drive was used as a + dedicated ZooKeeper log device. The snapshots were written to + the OS drive. Write requests were 1K writes and the reads were + 1K reads. "Servers" indicate the size of the ZooKeeper + ensemble, the number of servers that make up the + service. Approximately 30 other servers were used to simulate + the clients. The ZooKeeper ensemble was configured such that + leaders do not allow connections from clients.</p> +<div class="note"> +<div class="label">Note</div> +<div class="content"> +<p>In version 3.2 r/w performance improved by ~2x + compared to the <a href="http://zookeeper.apache.org/docs/r3.1.1/zookeeperOver.html#Performance">previous + 3.1 release</a>.</p> +</div> +</div> +<p>Benchmarks also indicate that it is reliable. 
<a href="#fg_zkPerfReliability">Reliability in the Presence of Errors</a> shows how a deployment responds to + various failures. The events marked in the figure are the + following:</p> +<ol> + +<li> + +<p>Failure and recovery of a follower</p> + +</li> + + +<li> + +<p>Failure and recovery of a different follower</p> + +</li> + + +<li> + +<p>Failure of the leader</p> + +</li> + + +<li> + +<p>Failure and recovery of two followers</p> + +</li> + + +<li> + +<p>Failure of another leader</p> + +</li> + +</ol> +<a name="Reliability"></a> +<h3 class="h4">Reliability</h3> +<p>To show the behavior of the system over time as + failures are injected we ran a ZooKeeper service made up of + 7 machines. We ran the same saturation benchmark as before, + but this time we kept the write percentage at a constant + 30%, which is a conservative ratio of our expected + workloads. + </p> +<table class="ForrestTable" cellspacing="1" cellpadding="4"> +<tr> +<td><a name="fg_zkPerfReliability"></a>Reliability in the Presence of Errors</td> +</tr> +<tr> +<td> + + <img alt="" src="images/zkperfreliability.jpg"> + + </td> +</tr> +</table> +<p>There are a few important observations from this graph. First, if + followers fail and recover quickly, then ZooKeeper is able to sustain a + high throughput despite the failure. Second, and maybe more importantly, the + leader election algorithm allows for the system to recover fast enough + to prevent throughput from dropping substantially. In our observations, + ZooKeeper takes less than 200ms to elect a new leader. Third, as + followers recover, ZooKeeper is able to raise throughput again once they + start processing requests.</p> +<a name="The+ZooKeeper+Project"></a> +<h3 class="h4">The ZooKeeper Project</h3> +<p>ZooKeeper has been + <a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/PoweredBy"> + successfully used + </a> + in many industrial applications. It is used at Yahoo! as the + coordination and failure recovery service for Yahoo! 
Message + Broker, which is a highly scalable publish-subscribe system + managing thousands of topics for replication and data + delivery. It is used by the Fetching Service for Yahoo! + crawler, where it also manages failure recovery. A number of + Yahoo! advertising systems also use ZooKeeper to implement + reliable services. + </p> +<p>All users and developers are encouraged to join the + community and contribute their expertise. See the + <a href="http://zookeeper.apache.org/"> + ZooKeeper Project on Apache + </a> + for more information. + </p> +</div> + +<p align="right"> +<font size="-2"></font> +</p> +</div> +<!--+ + |end content + +--> +<div class="clearboth"> </div> +</div> +<div id="footer"> +<!--+ + |start bottomstrip + +--> +<div class="lastmodified"> +<script type="text/javascript"><!-- +document.write("Last Published: " + document.lastModified); +// --></script> +</div> +<div class="copyright"> + Copyright © + <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a> +</div> +<!--+ + |end bottomstrip + +--> +</div> +</body> +</html>
http://git-wip-us.apache.org/repos/asf/zookeeper/blob/74d00962/_released_docs/r3.5.4-beta/zookeeperOver.pdf ---------------------------------------------------------------------- diff --git a/_released_docs/r3.5.4-beta/zookeeperOver.pdf b/_released_docs/r3.5.4-beta/zookeeperOver.pdf new file mode 100644 index 0000000..f48c7d6 Binary files /dev/null and b/_released_docs/r3.5.4-beta/zookeeperOver.pdf differ