Author: rgoers
Date: Fri Jul 13 16:24:48 2012
New Revision: 1361287
URL: http://svn.apache.org/viewvc?rev=1361287&view=rev
Log:
Initial Flume Site
Added:
flume/site/trunk/
flume/site/trunk/content/
flume/site/trunk/content/filtered-resources/
flume/site/trunk/content/filtered-resources/.htaccess
flume/site/trunk/content/resources/
flume/site/trunk/content/resources/extpaths.txt
flume/site/trunk/content/site.xml
flume/site/trunk/content/sphinx/
flume/site/trunk/content/sphinx/FlumeDeveloperGuide.rst
flume/site/trunk/content/sphinx/FlumeUserGuide.rst
flume/site/trunk/content/sphinx/_templates/
flume/site/trunk/content/sphinx/_templates/extranav.html
flume/site/trunk/content/sphinx/_themes/
flume/site/trunk/content/sphinx/_themes/flume/
flume/site/trunk/content/sphinx/_themes/flume/globaltoc.html
flume/site/trunk/content/sphinx/_themes/flume/layout.html
flume/site/trunk/content/sphinx/_themes/flume/localtoc.html
flume/site/trunk/content/sphinx/_themes/flume/static/
flume/site/trunk/content/sphinx/_themes/flume/static/flume.css_t
flume/site/trunk/content/sphinx/_themes/flume/theme.conf
flume/site/trunk/content/sphinx/conf.py
flume/site/trunk/content/sphinx/contents.rsx
flume/site/trunk/content/sphinx/documentation.rst
flume/site/trunk/content/sphinx/download.rst
flume/site/trunk/content/sphinx/getinvolved.rst
flume/site/trunk/content/sphinx/images/
flume/site/trunk/content/sphinx/images/DevGuide_image00.png (with props)
flume/site/trunk/content/sphinx/images/DevGuide_image01.png (with props)
flume/site/trunk/content/sphinx/images/UserGuide_image00.png (with props)
flume/site/trunk/content/sphinx/images/UserGuide_image01.png (with props)
flume/site/trunk/content/sphinx/images/UserGuide_image02.png (with props)
flume/site/trunk/content/sphinx/images/UserGuide_image03.png (with props)
flume/site/trunk/content/sphinx/images/feather-small.png (with props)
flume/site/trunk/content/sphinx/images/flume-logo.png (with props)
flume/site/trunk/content/sphinx/index.rst
flume/site/trunk/content/sphinx/license.rst
flume/site/trunk/content/sphinx/mailinglists.rst
flume/site/trunk/content/sphinx/releases/
flume/site/trunk/content/sphinx/releases/1.0.0.rst
flume/site/trunk/content/sphinx/releases/1.1.0.rst
flume/site/trunk/content/sphinx/releases/1.2.0.rst
flume/site/trunk/content/sphinx/releases/content/
flume/site/trunk/content/sphinx/releases/index.rst
flume/site/trunk/content/sphinx/source.rst
flume/site/trunk/content/sphinx/team.rst
flume/site/trunk/pom.xml
Added: flume/site/trunk/content/filtered-resources/.htaccess
URL:
http://svn.apache.org/viewvc/flume/site/trunk/content/filtered-resources/.htaccess?rev=1361287&view=auto
==============================================================================
--- flume/site/trunk/content/filtered-resources/.htaccess (added)
+++ flume/site/trunk/content/filtered-resources/.htaccess Fri Jul 13 16:24:48
2012
@@ -0,0 +1,3 @@
+ErrorDocument 404 /errors/404.html
+
+
Added: flume/site/trunk/content/resources/extpaths.txt
URL:
http://svn.apache.org/viewvc/flume/site/trunk/content/resources/extpaths.txt?rev=1361287&view=auto
==============================================================================
--- flume/site/trunk/content/resources/extpaths.txt (added)
+++ flume/site/trunk/content/resources/extpaths.txt Fri Jul 13 16:24:48 2012
@@ -0,0 +1,5 @@
+# File format: lines starting with # or space chars are ignored.
+# directories in /www/logging.apache.org/ but not in any svn tree: dtd,
project, xsd
+# space indentation = directories not yet committed to svn production tree
+
+releases/content
\ No newline at end of file
Added: flume/site/trunk/content/site.xml
URL:
http://svn.apache.org/viewvc/flume/site/trunk/content/site.xml?rev=1361287&view=auto
==============================================================================
--- flume/site/trunk/content/site.xml (added)
+++ flume/site/trunk/content/site.xml Fri Jul 13 16:24:48 2012
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="Flume">
+ <skin>
+ <groupId>org.apache.maven.skins</groupId>
+ <artifactId>maven-fluido-skin</artifactId>
+ <version>1.0</version>
+ </skin>
+
+ <publishDate position="right"/>
+ <version position="none"/>
+
+ <poweredBy>
+ <logo name="Flume" href="http://incubator.apache.org/flume"
+ img="_static/flume-logo.png" />
+ </poweredBy>
+
+ <body>
+
+
+ </body>
+</project>
Added: flume/site/trunk/content/sphinx/FlumeDeveloperGuide.rst
URL:
http://svn.apache.org/viewvc/flume/site/trunk/content/sphinx/FlumeDeveloperGuide.rst?rev=1361287&view=auto
==============================================================================
--- flume/site/trunk/content/sphinx/FlumeDeveloperGuide.rst (added)
+++ flume/site/trunk/content/sphinx/FlumeDeveloperGuide.rst Fri Jul 13 16:24:48
2012
@@ -0,0 +1,370 @@
+.. Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+
+=========================
+Flume 1.x Developer Guide
+=========================
+
+Introduction
+============
+
+Overview
+--------
+
+Apache Flume is a distributed, reliable, and available system for
+efficiently collecting, aggregating and moving large amounts of log
+data from many different sources to a centralized data store.
+
+At the moment Flume is an incubating Apache project. There are
+currently two release code lines available, version 0.9.x and 1.x.x. This guide
+is specific to 1.x (more specifically 1.1.0 release). Please click here for
+`the Flume 0.9.x Developer Guide
+<http://archive.cloudera.com/cdh/3/flume/DeveloperGuide/>`_.
+
+Architecture
+------------
+
+Data flow model
+~~~~~~~~~~~~~~~
+
+A unit of data flow is called an event, which is a byte payload that is
+accompanied by an optional set of string attributes. A Flume agent is a process
+(JVM) that hosts the components through which events flow from an external
+source to the next destination.
+
+.. figure:: images/DevGuide_image00.png
+ :align: center
+ :alt: Agent component diagram
+
+A source consumes events delivered to it by an external source like web server
+in a specific format. For example, an Avro source can be used to receive Avro
+events from clients or other agents in the flow. When a source receives an
+event, it stores it into one or more channels. The channel is a passive store
+that keeps the event until its consumed by a sink. An example of channel is
+the JDBC channel that uses a file-system backed embedded database. The sink
+removes the event from channel and puts it into an external repository like
+HDFS or forwards it to the source in next hop of the flow. The source and sink
+within the given agent run asynchronously with the events staged in the
+channel.
+
+Reliability
+~~~~~~~~~~~
+
+The events are staged in the channel on each agent. Then they are delivered to
+the next agent or terminal repository (like HDFS) in the flow. The events are
+removed from the channel only after they are stored in the channel of the next
+agent or in the terminal repository. This is how the single-hop message
+delivery semantics in Flume provide end-to-end reliability of the flow. Flume
+uses a transactional approach to guarantee the reliable delivery of the events.
+The sources and sinks encapsulate the store/retrieval of the events in a
+transaction provided by the channel. This ensures that the set of events are
+reliably passed from point to point in the flow. In case of multi hop flow, the
+sink on previous hop and source on next hop both have their transactions
+running to ensure that the data is safely stored in the channel of the next
+hop.
+
+Building Flume
+--------------
+
+Getting the source
+~~~~~~~~~~~~~~~~~~
+
+Check out the code using Subversion. Click here for
+`the SVN repository root <http://svn.apache.org/repos/asf/incubator/flume/>`_.
+
+The Flume 1.x development happens under the branch "trunk" so this command line
+can be used::
+
+ svn checkout http://svn.apache.org/repos/asf/incubator/flume/trunk
flume-trunk
+
+Alternatively, if you prefer using Git, you may use::
+
+ git clone git://git.apache.org/flume.git
+ cd flume
+ git checkout trunk
+
+Compile/test Flume
+~~~~~~~~~~~~~~~~~~
+
+The Flume build is mavenized. You can compile Flume using the standard Maven
+commands:
+
+#. Compile only: ``mvn clean compile``
+#. Compile and run unit tests: ``mvn clean test``
+#. Run individual test(s): ``mvn clean test -Dtest=<Test1>,<Test2>,...
-DfailIfNoTests=false``
+#. Create tarball package: ``mvn clean install``
+#. Create tarball package (skip unit tests): ``mvn clean install -DskipTests``
+
+
+Developing custom components
+----------------------------
+
+Client
+~~~~~~
+
+The client operates at the point of origin of events and delivers them to a
+Flume agent. Clients typically operate in the process space of the application
+they are consuming data from. Currently Flume supports Avro, log4j and syslog
+as ways to transfer data from a remote source. Additionally there's an Exec
+source that can consume the output of a local process as input to Flume.
+
+It's quite possible to have a use case where these existing options are not
+sufficient. In this case you can build a custom mechanism to send data to
+Flume. There are two ways of achieving this. The first is to create a custom
+client that communicates with one of Flume's existing sources like Avro or
+syslog. Here the client should convert its data into messages understood by these
+Flume sources. The other option is to write a custom Flume source that directly
+talks to your existing client application using some IPC or RPC protocols, and
+then convert the data into flume events to send it upstream.
+
+
+Client SDK
+''''''''''
+
+Though Flume contains a number of built-in mechanisms to ingest data, often one
+wants the ability to communicate with Flume directly from a custom application.
+The Client SDK is a library that enables applications to connect to Flume and
+send data into Flume's data flow over RPC.
+
+
+RPC Client interface
+''''''''''''''''''''
+
+This is an interface to wrap the user data and attributes into an
+``Event``, which is Flume's unit of flow. This encapsulates the RPC mechanism
+supported by Flume. The application can simply call ``append()`` or
+``appendBatch()`` to send data and not worry about the underlying message
+exchanges.
+
+
+Avro RPC Client
+'''''''''''''''
+
+As of Flume 1.1.0, Avro is the only supported RPC protocol. The
+``NettyAvroRpcClient`` implements the ``RpcClient`` interface. The client needs
+to create this object with the host and port of the Flume agent and use it to
+send data into flume. The following example shows how to use the Client SDK
+API:
+
+.. code-block:: java
+
+ import org.apache.flume.Event;
+ import org.apache.flume.EventDeliveryException;
+ import org.apache.flume.FlumeException;
+ import org.apache.flume.api.RpcClient;
+ import org.apache.flume.api.RpcClientFactory;
+ import org.apache.flume.event.EventBuilder;
+
+ public void myInit () {
+ // setup the RPC connection to Flume agent at hostname/port
+ RpcClient rpcClient = RpcClientFactory.getDefaultInstance(hostname, port);
+ ...
+ }
+
+ public void sendDataToFlume(String data) {
+ // Create flume event object
+ Event event = EventBuilder.withBody(data, Charset.forName("UTF-8"));
+ try {
+ rpcClient.append(event);
+ } catch (EventDeliveryException e) {
+ // clean up and recreate rpcClient
+ rpcClient.close();
+ rpcClient = null;
+ rpcClient = RpcClientFactory.getDefaultInstance(hostname, port);
+ }
+ ...
+ }
+
+ public void cleanUp () {
+ // close the rpc connection
+ rpcClient.close();
+ ...
+ }
+
+
+Failover handler
+''''''''''''''''
+
+This class wraps the Avro RPC client to provide failover handling capability to
+clients. This takes a list of host/ports of the Flume agents. If there's an
+error in communicating with the current agent, then it automatically falls back
+to the next agent in the list:
+
+.. code-block:: java
+
+ // Setup properties for the failover
+ Properties props = new Properties();
+ props.put("client.type", "default_failover");
+
+ // list of hosts
+ props.put("hosts", "host1 host2 host3");
+
+ // address/port pair for each host
+  props.put("hosts.host1", host1 + ":" + port1);
+  props.put("hosts.host2", host2 + ":" + port2);
+  props.put("hosts.host3", host3 + ":" + port3);
+
+  // create the client with failover properties
+  client = (FailoverRpcClient)
+      RpcClientFactory.getInstance(props);
+
+
+Transaction interface
+~~~~~~~~~~~~~~~~~~~~~
+
+The ``Transaction`` interface is the basis of reliability for Flume. All the
+major components, i.e. sources, sinks and channels, need to interface with the
+Flume transaction.
+
+.. figure:: images/DevGuide_image01.png
+ :align: center
+ :alt: Transaction sequence diagram
+
+The transaction interface is implemented by a channel implementation. The
+source and sink connected to channel obtain a transaction object. The sources
+actually use a channel selector interface that encapsulate the transaction
+(discussed in later sections). The operations to stage or extract an event is
+done inside an active transaction. For example:
+
+.. code-block:: java
+
+ Channel ch = ...
+ Transaction tx = ch.getTransaction();
+ try {
+ tx.begin();
+ ...
+ // ch.put(event) or ch.take()
+ ...
+ tx.commit();
+ } catch (ChannelException ex) {
+ tx.rollback();
+ ...
+ } finally {
+ tx.close();
+ }
+
+Here we get hold of a transaction from a channel. After the begin method is
+executed, the event is put in the channel and transaction is committed.
+
+
+Sink
+~~~~
+
+The purpose of a sink is to extract events from the channel and forward them to
+the next agent in the flow or store them in an external repository. A sink is
+linked to a channel instance as per the flow configuration. There's a sink
+runner thread that gets created for every configured sink, which manages the
+sink's lifecycle. The sink needs to implement ``start()`` and ``stop()`` methods that
+are part of the ``LifecycleAware`` interface. The ``start()`` method should
+initialize the sink and bring it to a state where it can forward the events to
+its next destination. The ``process()`` method from the ``Sink`` interface
+should do the core processing of extracting the event from channel and
+forwarding it. The ``stop()`` method should do the necessary cleanup. The sink
+also needs to implement a ``Configurable`` interface for processing its own
+configuration settings:
+
+.. code-block:: java
+
+ // foo sink
+ public class FooSink extends AbstractSink implements Configurable {
+ @Override
+ public void configure(Context context) {
+ some_Param = context.get("some_param", String.class);
+      // process some_param ...
+ }
+ @Override
+ public void start() {
+ // initialize the connection to foo repository ..
+ }
+ @Override
+ public void stop () {
+ // cleanup and disconnect from foo repository ..
+ }
+ @Override
+ public Status process() throws EventDeliveryException {
+ // Start transaction
+ ch = getChannel();
+ tx = ch.getTransaction();
+ try {
+ tx.begin();
+ Event e = ch.take();
+ // send the event to foo
+ // foo.some_operation(e);
+ tx.commit();
+        status = Status.READY;
+      } catch (ChannelException e) {
+        tx.rollback();
+        status = Status.BACKOFF;
+      } finally {
+        tx.close();
+      }
+      return status;
+    }
+  }
+
+
+Source
+~~~~~~
+
+The purpose of a Source is to receive data from an external client and store it
+in the channel. As mentioned above, for sources the ``Transaction`` interface
+is encapsulated by the ``ChannelSelector``. Similar to ``SinkRunner``, there's
+a ``SourceRunner`` thread that gets created for every configured source that
+manages the source's lifecycle. The source needs to implement ``start()`` and
+``stop()`` methods that are part of the ``LifecycleAware`` interface. There are
+two types of sources, pollable and event-driven. The runner of a pollable
+source invokes a ``process()`` method from the pollable source. The
+``process()`` method should check for new data and store it in the channel. The
+event-driven source needs to have its own callback mechanism that captures the
+new data:
+
+.. code-block:: java
+
+ // bar source
+ public class BarSource extends AbstractSource implements Configurable,
EventDrivenSource{
+ @Override
+ public void configure(Context context) {
+ some_Param = context.get("some_param", String.class);
+      // process some_param ...
+ }
+ @Override
+ public void start() {
+ // initialize the connection to bar client ..
+ }
+ @Override
+ public void stop () {
+ // cleanup and disconnect from bar client ..
+ }
+ @Override
+ public Status process() throws EventDeliveryException {
+ try {
+ // receive new data
+ Event e = get_some_data();
+ // store the event to underlying channels(s)
+        getChannelProcessor().processEvent(e);
+ } catch (ChannelException ex) {
+ return Status.BACKOFF;
+ }
+ return Status.READY;
+ }
+ }
+
+
+Channel
+~~~~~~~
+
+TBD