Modified: samza/site/learn/documentation/latest/jobs/samza-configurations.html
URL: 
http://svn.apache.org/viewvc/samza/site/learn/documentation/latest/jobs/samza-configurations.html?rev=1906774&r1=1906773&r2=1906774&view=diff
==============================================================================
--- samza/site/learn/documentation/latest/jobs/samza-configurations.html 
(original)
+++ samza/site/learn/documentation/latest/jobs/samza-configurations.html Wed 
Jan 18 19:33:25 2023
@@ -227,6 +227,12 @@
     
       
         
+      <a class="side-navigation__group-item" data-match-active="" 
href="/releases/1.8.0">1.8.0</a>
+      
+        
+      <a class="side-navigation__group-item" data-match-active="" 
href="/releases/1.7.0">1.7.0</a>
+      
+        
       <a class="side-navigation__group-item" data-match-active="" 
href="/releases/1.6.0">1.6.0</a>
       
         
@@ -538,6 +544,14 @@
               
               
 
+              <li class="hide"><a 
href="/learn/documentation/1.8.0/jobs/samza-configurations">1.8.0</a></li>
+
+              
+
+              <li class="hide"><a 
href="/learn/documentation/1.7.0/jobs/samza-configurations">1.7.0</a></li>
+
+              
+
               <li class="hide"><a 
href="/learn/documentation/1.6.0/jobs/samza-configurations">1.6.0</a></li>
 
               
@@ -639,1405 +653,857 @@
    limitations under the License.
 -->
 
-<p>The following table lists the complete set of properties that can be 
included in a Samza job configuration file.<br></p>
+<p>The following table lists the complete set of properties that can be 
included in a Samza job configuration file.<br /></p>
 
 <ul>
-<li><a href="#application-configurations">1. Application Configurations</a>
-
-<ul>
-<li><a href="#advanced-application-configurations">1.1 Advanced Application 
Configurations</a></li>
-</ul></li>
-<li><a href="#checkpointing">2. Checkpointing</a>
-
-<ul>
-<li><a href="#advanced-checkpointing-configuration">2.1 Advanced Checkpointing 
Configurations</a></li>
-</ul></li>
-<li><a href="#systems-streams">3. Systems &amp; Streams</a>
-
-<ul>
-<li><a href="#advanced-system-stream-configurations">3.1 Advanced System &amp; 
Stream Configuration</a></li>
-<li><a href="#kafka">3.2 Kafka</a></li>
-<li><a href="#hdfs">3.3 HDFS</a></li>
-<li><a href="#eventhubs">3.4 Event Hubs</a></li>
-<li><a href="#kinesis">3.5 Kinesis</a></li>
-<li><a href="#elasticsearch">3.6 ElasticSearch</a></li>
-<li><a href="#azure-blob-storage">3.7 Azure Blob Storage</a></li>
-</ul></li>
-<li><a href="#state-storage">4. State Storage</a>
-
-<ul>
-<li><a href="#advanced-storage-configurations">4.1 Advanced Storage 
Configurations</a></li>
-</ul></li>
-<li><a href="#deployment">5. Deployment</a>
-
-<ul>
-<li><a href="#yarn-cluster-deployment">5.1 YARN Cluster Deployment</a>
-
-<ul>
-<li><a href="#advanced-cluster-configurations">5.1.1 Advanced Cluster 
Configurations</a></li>
-</ul></li>
-<li><a href="#standalone-deployment">5.2 Standalone Deployment</a>
-
-<ul>
-<li><a href="#advanced-standalone-configurations">5.2.1 Advanced Standalone 
Configurations</a></li>
-</ul></li>
-</ul></li>
-<li><a href="#metrics">6. Metrics</a></li>
+  <li><a href="#application-configurations">1. Application Configurations</a>
+    <ul>
+      <li><a href="#advanced-application-configurations">1.1 Advanced 
Application Configurations</a></li>
+    </ul>
+  </li>
+  <li><a href="#checkpointing">2. Checkpointing</a>
+    <ul>
+      <li><a href="#advanced-checkpointing-configuration">2.1 Advanced 
Checkpointing Configurations</a></li>
+    </ul>
+  </li>
+  <li><a href="#systems-streams">3. Systems &amp; Streams</a>
+    <ul>
+      <li><a href="#advanced-system-stream-configurations">3.1 Advanced System 
&amp; Stream Configuration</a></li>
+      <li><a href="#kafka">3.2 Kafka</a></li>
+      <li><a href="#hdfs">3.3 HDFS</a></li>
+      <li><a href="#eventhubs">3.4 Event Hubs</a></li>
+      <li><a href="#kinesis">3.5 Kinesis</a></li>
+      <li><a href="#elasticsearch">3.6 ElasticSearch</a></li>
+      <li><a href="#azure-blob-storage">3.7 Azure Blob Storage</a></li>
+    </ul>
+  </li>
+  <li><a href="#state-storage">4. State Storage</a>
+    <ul>
+      <li><a href="#advanced-storage-configurations">4.1 Advanced Storage 
Configurations</a></li>
+    </ul>
+  </li>
+  <li><a href="#deployment">5. Deployment</a>
+    <ul>
+      <li><a href="#yarn-cluster-deployment">5.1 YARN Cluster Deployment</a>
+        <ul>
+          <li><a href="#advanced-cluster-configurations">5.1.1 Advanced 
Cluster Configurations</a></li>
+        </ul>
+      </li>
+      <li><a href="#standalone-deployment">5.2 Standalone Deployment</a>
+        <ul>
+          <li><a href="#advanced-standalone-configurations">5.2.1 Advanced 
Standalone Configurations</a></li>
+        </ul>
+      </li>
+    </ul>
+  </li>
+  <li><a href="#metrics">6. Metrics</a></li>
 </ul>
 
-<h3 id="1-application-configurations"><a 
name="application-configurations"></a> <a href="#application-configurations">1. 
Application Configurations</a></h3>
-
+<h3 id="-1-application-configurations"><a 
name="application-configurations"></a> <a href="#application-configurations">1. 
Application Configurations</a></h3>
 <p>These are the basic properties for setting up a Samza application.</p>
 
-<table><thead>
-<tr>
-<th>Name</th>
-<th>Default</th>
-<th>Description</th>
-</tr>
-</thead><tbody>
-<tr>
-<td>app.name</td>
-<td></td>
-<td><strong>Required:</strong> The name of your application.</td>
-</tr>
-<tr>
-<td>app.id</td>
-<td>1</td>
-<td>If you run several instances of your application at the same time, you 
need to give each instance a different app.id. This is important, since 
otherwise the applications will overwrite each others&rsquo; checkpoints, and 
perhaps interfere with each other in other ways.</td>
-</tr>
-<tr>
-<td>app.class</td>
-<td></td>
-<td>This is <strong>required if running on YARN</strong>. The application to 
run. The value is a fully-qualified Java classname, which must implement 
StreamApplication. A StreamApplication describes a series of transformations 
on the streams.</td>
-</tr>
-<tr>
-<td>job.factory.class</td>
-<td></td>
-<td>This is <strong>required if running on YARN</strong>. The job factory to 
use for running this job. <br> The value is a fully-qualified Java classname, 
which must implement StreamJobFactory.<br> Samza ships with three 
implementations:<br><br><code>org.apache.samza.job.yarn.YarnJobFactory</code><br>Runs
 your job on a YARN grid. See below for YARN-specific 
configuration.<br><br><code>org.apache.samza.job.local.ThreadJobFactory</code><br><strong>For
 dev deployments only.</strong> Runs your job on your local machine using 
threads.<br><br><code>org.apache.samza.job.local.ProcessJobFactory</code><br><strong>For
 dev deployments only.</strong> Runs your job on your local machine as a 
subprocess. An optional command builder property can also be specified (see 
task.command.class for details).</td>
-</tr>
-<tr>
-<td>job.name</td>
-<td></td>
-<td><em>(Deprecated in favor of app.name)</em>  The name of your job. This 
name appears on the Samza dashboard, and it is used to tell apart this 
job&rsquo;s checkpoints from other jobs&rsquo; checkpoints.</td>
-</tr>
-<tr>
-<td>job.id</td>
-<td>1</td>
-<td><em>(Deprecated in favor of app.id)</em> If you run several instances of 
your job at the same time, you need to give each execution a different job.id. 
This is important, since otherwise the jobs will overwrite each others&rsquo; 
checkpoints, and perhaps interfere with each other in other ways.</td>
-</tr>
-<tr>
-<td>job.default.system</td>
-<td></td>
-<td><strong>Required:</strong> The system-name to use for creating input or 
output streams for which the system is not explicitly configured. This property 
will also be used as default for <code>job.coordinator.system</code>, 
<code>task.checkpoint.system</code> and <code>job.changelog.system</code> if 
none are defined.</td>
-</tr>
-<tr>
-<td>task.class</td>
-<td></td>
-<td>Used for legacy purposes; replace with <code>app.class</code> in new jobs. 
The fully-qualified name of the Java class which processes incoming messages 
from input streams. The class must implement <a 
href="../api/javadocs/org/apache/samza/task/StreamTask.html">StreamTask</a> or 
<a 
href="../api/javadocs/org/apache/samza/task/AsyncStreamTask.html">AsyncStreamTask</a>,
 and may optionally implement <a 
href="../api/javadocs/org/apache/samza/task/InitableTask.html">InitableTask</a>,
 <a 
href="../api/javadocs/org/apache/samza/task/ClosableTask.html">ClosableTask</a> 
and/or <a 
href="../api/javadocs/org/apache/samza/task/WindowableTask.html">WindowableTask</a>.
 The class will be instantiated several times, once for every input stream 
partition.</td>
-</tr>
-<tr>
-<td>job.host-affinity.enabled</td>
-<td>false</td>
-<td>This property indicates whether host-affinity is enabled or not. 
Host-affinity refers to the ability of Samza to request and allocate a 
container on the same host every time the job is deployed. When host-affinity 
is enabled, Samza makes a &ldquo;best-effort&rdquo; to honor the host-affinity 
constraint. The property 
<code>cluster-manager.container.request.timeout.ms</code> determines how long 
to wait before de-prioritizing the host-affinity constraint and assigning the 
container to any available resource.</td>
-</tr>
-<tr>
-<td>job.jmx.enabled</td>
-<td>true</td>
-<td>Determines whether a JMX server should be started on the job&rsquo;s 
JobCoordinator and Container. (true or false).</td>
-</tr>
-<tr>
-<td>task.window.ms</td>
-<td>-1</td>
-<td>If task.class implements <a 
href="../api/javadocs/org/apache/samza/task/WindowableTask.html">WindowableTask</a>,
 it can receive a windowing callback in regular intervals. This property 
specifies the time between window() calls, in milliseconds. If the number is 
negative (the default), window() is never called. A <code>window()</code> call 
will never  occur concurrently with the processing of a message. If a message 
is being processed when a window() call is due, the invocation of window 
happens after processing the message. This property is set automatically when 
using join or window operators in a High Level API StreamApplication. Note: 
task.window.ms should be set to be much larger than average process or window 
call duration to avoid starving regular processing.</td>
-</tr>
-<tr>
-<td>task.log4j.system</td>
-<td></td>
-<td>Specify the system name for the StreamAppender. If this property is not 
specified in the config, an exception will be thrown. (See <a 
href="logging.html#stream-log4j-appender">Stream Log4j Appender</a>) Example: 
task.log4j.system=kafka</td>
-</tr>
-<tr>
-<td>serializers.registry.<br><strong><em>serde-name</em></strong>.class</td>
-<td></td>
-<td>Use this property to register a serializer/deserializer, which defines a 
way of encoding data as an array of bytes (used for messages in streams, and 
for data in persistent storage). You can give a serde any serde-name you want, 
and reference that name in properties like systems.*.samza.key.serde, 
systems.*.samza.msg.serde, streams.*.samza.key.serde, 
streams.*.samza.msg.serde, stores.*.key.serde and stores.*.msg.serde. The value 
of this property is the fully-qualified name of a Java class that implements 
SerdeFactory. Samza ships with the following serde 
implementations:<br><br><code>org.apache.samza.serializers.ByteSerdeFactory</code><br>A
 no-op serde which passes through the undecoded byte array. 
<br><br><code>org.apache.samza.serializers.ByteBufferSerdeFactory</code><br>Encodes
 <code>java.nio.ByteBuffer</code> objects. 
<br><br><code>org.apache.samza.serializers.IntegerSerdeFactory</code><br>Encodes
 <code>java.lang.Integer</code> objects as binary (4 bytes fixed-length big-endian 
encoding).<br><br><code>org.apache.samza.serializers.StringSerdeFactory</code><br>Encodes
 <code>java.lang.String</code> objects as UTF-8. 
<br><br><code>org.apache.samza.serializers.JsonSerdeFactory</code><br>Encodes 
nested structures of <code>java.util.Map</code>, <code>java.util.List</code> 
etc. as JSON. Note: This Serde enforces a dash-separated property naming 
convention, while JsonSerdeV2 doesn&rsquo;t. This serde is primarily meant for 
Samza&rsquo;s internal usage, and is publicly available for backwards 
compatibility.<br><br><code>org.apache.samza.serializers.JsonSerdeV2Factory</code><br>Encodes
 nested structures of <code>java.util.Map</code>, <code>java.util.List</code> 
etc. as JSON. Note: This Serde uses Jackson&rsquo;s default (camelCase) 
property naming convention. This serde should be preferred over JsonSerde, 
especially in High Level API, unless the dasherized naming convention is 
required (e.g., for backwards 
compatibility).<br><br><code>org.apache.samza.serializers.LongSerdeFactory</code><br>Encodes <code>java.lang.Long</code> as binary (8 
bytes fixed-length big-endian 
encoding).<br><br><code>org.apache.samza.serializers.DoubleSerdeFactory</code><br>Encodes
 <code>java.lang.Double</code> as binary (8 bytes double-precision floating point). 
<br><br><code>org.apache.samza.serializers.UUIDSerdeFactory</code><br>Encodes 
<code>java.util.UUID</code> 
objects.<br><br><code>org.apache.samza.serializers.SerializableSerdeFactory</code><br>Encodes
 <code>java.io.Serializable</code> 
objects.<br><br><code>org.apache.samza.serializers.MetricsSnapshotSerdeFactory</code><br>Encodes
 <code>org.apache.samza.metrics.reporter.MetricsSnapshot</code> objects (which 
are used for reporting metrics) as 
JSON.<br><br><code>org.apache.samza.serializers.KafkaSerdeFactory</code><br>Adapter
 which allows existing <code>kafka.serializer.Encoder</code> and 
<code>kafka.serializer.Decoder</code> implementations to be used as Samza 
serdes. Set <code>serializers.registry.serde-name.encoder</code> and <code>serializers.registry.serde-name.decoder</code> to the 
appropriate class names.</td>
-</tr>
-</tbody></table>
-
-<h4 id="1-1-advanced-application-configurations"><a 
name="advanced-application-configurations"></a> <a 
href="#advanced-application-configurations">1.1 Advanced Application 
Configurations</a></h4>
-
-<table><thead>
-<tr>
-<th>Name</th>
-<th>Default</th>
-<th>Description</th>
-</tr>
-</thead><tbody>
-<tr>
-<td>job.changelog.system</td>
-<td>inherited from job.default.system</td>
-<td>This property is required if you would like to override the system defined 
in <code>job.default.system</code> for the changelog. The changelog will be 
used with the stream specified in <code>stores.store-name.changelog</code> 
config. You can override this system by specifying both the system and the 
stream in <code>stores.store-name.changelog</code>.</td>
-</tr>
-<tr>
-<td>job.coordinator.system</td>
-<td>inherited from job.default.system</td>
-<td>This property is required if you would like to override the system defined 
in <code>job.default.system</code> for coordination. The 
<strong><em>system-name</em></strong> to use for creating and maintaining the 
Coordinator Stream.</td>
-</tr>
-<tr>
-<td>job.config.rewriter.<br><strong><em>rewriter-name</em></strong>.class</td>
-<td>(none)</td>
-<td>You can optionally define configuration rewriters, which have the 
opportunity to dynamically modify the job configuration before the job is 
started. For example, this can be useful for pulling configuration from an 
external configuration management system, or for determining the set of input 
streams dynamically at runtime. The value of this property is a fully-qualified 
Java classname which must implement <a 
href="../api/javadocs/org/apache/samza/config/ConfigRewriter.html">ConfigRewriter</a>.
 Samza ships with these rewriters by 
default:<br><br><code>org.apache.samza.config.RegExTopicGenerator</code><br>When
 consuming from Kafka, this allows you to consume all Kafka topics that match 
some regular expression (rather than having to list each topic explicitly). 
This rewriter has additional 
configuration.<br><br><code>org.apache.samza.config.EnvironmentConfigRewriter</code><br>This
 rewriter takes environment variables that are prefixed with 
<code>SAMZA_</code> and adds them to the configuration, overriding previous values where they exist. The keys are 
lowercased and underscores are converted to dots.</td>
-</tr>
-<tr>
-<td>job.config.rewriters</td>
-<td>(none)</td>
-<td>If you have defined configuration rewriters, you need to list them here, 
in the order in which they should be applied. The value of this property is a 
comma-separated list of <strong><em>rewriter-name</em></strong> tokens.</td>
-</tr>
-<tr>
-<td>job.config.rewriter.<br><strong><em>rewriter-name</em></strong>.system</td>
-<td>(none)</td>
-<td>Set this property to the <code>system-name</code> of the Kafka system from 
which you want to consume all matching topics.</td>
-</tr>
-<tr>
-<td>job.config.rewriter.<br><strong><em>rewriter-name</em></strong>.regex</td>
-<td>(none)</td>
-<td>A regular expression specifying which topics you want to consume within 
the Kafka system <code>job.config.rewriter.*.system</code>. Any topics matched 
by this regular expression will be consumed in addition to any topics you 
specify in your application.</td>
-</tr>
-<tr>
-<td>job.config.rewriter.<br><strong><em>rewriter-name</em></strong>.config.*</td>
-<td></td>
-<td>Any properties specified within this namespace are applied to the 
configuration of streams that match the regex in 
<code>job.config.rewriter.*.regex</code>. For example, you can set 
<code>job.config.rewriter.*.config.samza.msg.serde</code> to configure the 
deserializer for messages in the matching streams, which is equivalent to 
setting <code>systems.*.streams.*.samza.msg.serde</code> for each topic that 
matches the regex.</td>
-</tr>
-<tr>
-<td>job.container.thread.<br>pool.size</td>
-<td>0</td>
-<td>If configured, the container thread pool will be used to run synchronous 
operations of each task <a href="#../container/event-loop.html">in 
parallel</a>. The operations include StreamTask.process(), 
WindowableTask.window(), and internally Task.commit(). If not configured and 
the default value of 0 is used, all task operations will run in a single 
thread.</td>
-</tr>
-<tr>
-<td>job.coordinator.<br>monitor-partition-change.<br>frequency.ms</td>
-<td>300000</td>
-<td>The frequency at which the input streams&rsquo; partition count change 
should be detected. When the input partition count change is detected, Samza 
will automatically restart a stateless job or fail a stateful job. A longer 
time interval is recommended for jobs w/ large number of input system stream 
partitions, since gathering partition count may incur measurable overhead to 
the job. You can completely disable partition count monitoring by setting this 
value to 0 or a negative integer, which will also disable auto-restart/failing 
behavior of a Samza job on partition count changes.</td>
-</tr>
-<tr>
-<td>job.coordinator.segment.<br>bytes</td>
-<td>26214400</td>
-<td>If you are using a Kafka system for coordinator stream, this is the 
segment size to be used for the coordinator topic&rsquo;s log segments. Keeping 
this number small is useful because it increases the frequency that Kafka will 
garbage collect old messages.</td>
-</tr>
-<tr>
-<td>job.coordinator.replication.<br>factor</td>
-<td>300000</td>
-<td>The frequency at which the input streams&rsquo; partition count change 
should be detected. When the input partition count change is detected, Samza 
will automatically restart a stateless job or fail a stateful job. A longer 
time interval is recommended for jobs w/ large number of input system stream 
partitions, since gathering partition count may incur measurable overhead to 
the job. You can completely disable partition count monitoring by setting this 
value to 0 or a negative integer, which will also disable auto-restart/failing 
behavior of a Samza job on partition count changes.</td>
-</tr>
-<tr>
-<td>job.systemstreampartition.<br>grouper.factory</td>
-<td><code>org.apache.samza.</code><br><code>container.grouper.stream.</code><br><code>GroupByPartitionFactory</code></td>
-<td>A factory class that is used to determine how input SystemStreamPartitions 
are grouped together for processing in individual StreamTask instances. The 
factory must implement the SystemStreamPartitionGrouperFactory interface. Once 
this configuration is set, it can&rsquo;t be changed, since doing so could 
violate state semantics, and lead to a loss of 
data.<br><br><code>org.apache.samza.container.grouper.stream.</code><br><code>GroupByPartitionFactory</code><br>Groups
 input stream partitions according to their partition number. This grouping 
leads to a single StreamTask processing all messages for a single partition 
(e.g. partition 0) across all input streams that have a partition 0. Therefore, 
the default is that you get one StreamTask for all input partitions with the 
same partition number. Using this strategy, if two input streams have a 
partition 0, then messages from both partitions will be routed to a single 
StreamTask. This partitioning strategy is useful for joining and aggregating 
streams.<br><br><code>org.apache.samza.container.grouper.stream.</code><br><code>GroupBySystemStreamPartitionFactory</code><br>Assigns
 each SystemStreamPartition to its own unique StreamTask. The 
GroupBySystemStreamPartitionFactory is useful in cases where you want increased 
parallelism (more containers), and don&rsquo;t care about co-locating 
partitions for grouping or joins, since it allows for a greater number of 
StreamTasks to be divided up amongst Samza containers.</td>
-</tr>
-<tr>
-<td>job.systemstreampartition.<br>matcher.class</td>
-<td></td>
-<td>If you want to enable static partition assignment, then this is a required 
configuration. The value of this property is a fully-qualified Java class name 
that implements the interface 
org.apache.samza.system.SystemStreamPartitionMatcher. Samza ships with two 
matcher 
classes:<br><br><code>org.apache.samza.system.RangeSystemStreamPartitionMatcher</code><br>This
 class uses a comma-separated list of range(s) to determine which partitions 
match, and are thus statically assigned to the job. For example, 
&ldquo;2,3,1-2&rdquo; statically assigns partitions 1, 2, and 3 for all the 
specified systems and streams (topics in case of Kafka) to the job. For config 
validation, each element in the comma-separated list must conform to one of the 
following regexes:<br>&ldquo;<code>(\\d+)</code>&rdquo; 
or &ldquo;<code>(\\d+-\\d+)</code>&rdquo;<br>The <code>JobConfig.SSP_MATCHER_CLASS_RANGE</code>
 constant has the canonical name of this 
class.<br><br><code>org.apache.samza.system.RegexSystemStreamPartitionMatcher</code><br>This 
class uses a standard Java-supported regex to determine which 
partitions match, and are thus statically assigned to the job. For example, 
&ldquo;[1-2]&rdquo; statically assigns partitions 1 and 2 for all the specified 
systems and streams (topics in case of Kafka) to the job. The 
<code>JobConfig.SSP_MATCHER_CLASS_REGEX</code> constant has the canonical name of 
this class.</td>
-</tr>
-<tr>
-<td>job.systemstreampartition.<br>matcher.config.<br>range</td>
-<td></td>
-<td>If <code>job.systemstreampartition.matcher.class</code> is specified, and 
the value of this property is 
<code>org.apache.samza.system.RangeSystemStreamPartitionMatcher</code>, then 
this property is a required configuration. Specify a comma-separated list of 
range(s) to determine which partitions match, and are thus statically assigned to 
the job. For example, &ldquo;2,3,11-20&rdquo; statically assigns partitions 2, 3, 
and 11 to 20 for all the specified systems and streams (topics in case of Kafka) 
to the job. A single configuration value like &ldquo;19&rdquo; is valid as 
well; this statically assigns partition 19. For config validation, each element 
in the comma-separated list must conform to one of the following 
regexes:<br>&ldquo;<code>(\\d+)</code>&rdquo; or 
&ldquo;<code>(\\d+-\\d+)</code>&rdquo;</td>
-</tr>
-<tr>
-<td>job.systemstreampartition.<br>matcher.config.<br>regex</td>
-<td></td>
-<td>If <code>job.systemstreampartition.matcher.class</code> is specified, and 
the value of this property is 
<code>org.apache.samza.system.RegexSystemStreamPartitionMatcher</code>, then 
this property is a required configuration. The value should be a valid Java 
supported regex. For example &ldquo;[1-2]&rdquo;, statically assigns partition 
1 and 2 for all the specified systems and streams (topics in case of Kafka) to 
the job.</td>
-</tr>
-<tr>
-<td>job.systemstreampartition.<br>matcher.config.<br>job.factory.regex</td>
-<td></td>
-<td>This configuration can be used to specify the Java supported regex to 
match the StreamJobFactory for which the static partition assignment should be 
enabled. This configuration enables the partition assignment feature to be used 
for custom StreamJobFactory(ies) as well.<br>This config defaults to the 
following value: 
&ldquo;<code>org\.apache\.samza\.job\.local(.*ProcessJobFactory|.*ThreadJobFactory)</code>&rdquo;, which enables static partition assignment 
when job.factory.class is set to 
<code>org.apache.samza.job.local.ProcessJobFactory</code> or 
<code>org.apache.samza.job.local.ThreadJobFactory</code>.</td>
-</tr>
-<tr>
-<td>job.systemstreampartition.<br>input.expansion.enabled</td>
-<td>true</td>
-<td>When enabled, this allows stateful jobs to expand or contract their 
partition count by a multiple of the previous count so that events from an 
input stream partition are processed on the same task as before. This will 
prevent erroneous results. This feature is disabled if the configuration is set 
to false or if the job is stateless. See <a 
href="https://cwiki.apache.org/confluence/display/SAMZA/SEP-5%3A+Enable+partition+expansion+of+input+streams";>SEP-5</a>
 for more details.</td>
-</tr>
-<tr>
-<td>job.security.manager.<br>factory</td>
-<td>(none)</td>
-<td>This is the factory class used to create the proper SecurityManager to 
handle security for Samza containers when running in a secure environment, such 
as Yarn with Kerberos enabled. Samza ships with one security manager by 
default:<br><br><code>org.apache.samza.job.yarn.SamzaYarnSecurityManagerFactory</code><br>Supports
 Samza containers to run properly in a Kerberos enabled Yarn cluster. Each 
Samza container, once started, will create a SamzaContainerSecurityManager. 
SamzaContainerSecurityManager runs on its own separate thread and updates 
user&rsquo;s delegation tokens at the interval specified by 
yarn.token.renewal.interval.seconds. See Yarn Security for details.</td>
-</tr>
-<tr>
-<td>task.callback.timeout.ms</td>
-<td>-1(no timeout)</td>
-<td>For an AsyncStreamTask, this defines the max allowed time for a 
processAsync callback to complete. For a StreamTask, this is the max allowed 
time for a process call to complete. When the timeout happens, the container is 
shut down. Default is no timeout.</td>
-</tr>
-<tr>
-<td>task.chooser.class</td>
-<td><code>org.apache.samza.</code><br><code>system.chooser.</code><br><code>RoundRobinChooserFactory</code></td>
-<td>This property can be optionally set to override the default <a 
href="../container/streams.html#messagechooser">message chooser</a>, which 
determines the order in which messages from multiple input streams are 
processed. The value of this property is the fully-qualified name of a Java 
class that implements <a 
href="../api/javadocs/org/apache/samza/system/chooser/MessageChooserFactory.html">MessageChooserFactory</a>.</td>
-</tr>
-<tr>
-<td>task.command.class</td>
-<td><code>org.apache.samza.job.</code><br><code>ShellCommandBuilder</code></td>
-<td>The fully-qualified name of the Java class which determines the command 
line and environment variables for a <a 
href="../container/samza-container.html">container</a>. It must be a subclass 
of <a 
href="../api/javadocs/org/apache/samza/job/CommandBuilder.html">CommandBuilder</a>.
 This defaults to 
task.command.class=<code>org.apache.samza.job.ShellCommandBuilder</code>.</td>
-</tr>
-<tr>
-<td>task.drop.deserialization.errors</td>
-<td>false</td>
-<td>This property is to define how the system deals with deserialization 
failure situation. If set to true, the system will skip the error messages and 
keep running. If set to false, the system will throw exceptions and fail the 
container.</td>
-</tr>
-<tr>
-<td>task.drop.serialization.errors</td>
-<td>false</td>
-<td>This property is to define how the system deals with serialization failure 
situation. If set to true, the system will drop the error messages and keep 
running. If set to false, the system will throw exceptions and fail the 
container.</td>
-</tr>
-<tr>
-<td>task.drop.producer.errors</td>
-<td>false</td>
-<td>If true, producer errors will be logged and ignored. The only exceptions 
that will be thrown are those which are likely caused by the application itself 
(e.g. serialization errors). If false, the producer will be closed and producer 
errors will be propagated upward until the container ultimately fails. Failing 
the container is a safety precaution to ensure the latest checkpoints only 
reflect the events that have been completely and successfully processed. 
However, some applications prefer to remain running at all costs, even if that 
means lost messages. Setting this property to true will enable applications to 
recover from producer errors at the expense of one or many (in the case of 
batching producers) dropped messages. If you enable this, it is highly 
recommended that you also configure alerting on the 
&lsquo;producer-send-failed&rsquo; metric, since the producer might drop 
messages indefinitely. The logic for this property is specific to each 
SystemProducer implementation. It will have no effect for SystemProducers that ignore the property.</td>
-</tr>
-<tr>
-<td>task.ignored.exceptions</td>
-<td></td>
-<td>This property specifies which exceptions should be ignored if thrown in a 
task&rsquo;s process or window methods. The exceptions to be ignored should be 
a comma-separated list of fully-qualified class names of the exceptions or * to 
ignore all exceptions.</td>
-</tr>
-<tr>
-<td>task.log4j.location.info.enabled</td>
-<td>false</td>
-<td>Defines whether or not to include log4j&rsquo;s LocationInfo data in Log4j 
StreamAppender messages. LocationInfo includes information such as the file, 
class, and line that wrote a log message. This setting is only active if the 
Log4j stream appender is being used. (See <a 
href="../logging.html#stream-log4j-appender">Stream Log4j Appender</a>)</td>
-</tr>
-<tr>
-<td>task.max.idle.ms</td>
-<td>10</td>
-<td>The maximum time to wait for a task worker to complete when there are no 
new messages to handle before resuming the main loop and potentially polling 
for more messages. See <code>task.poll.interval.ms</code>. This timeout value 
prevents the main loop from spinning when there is nothing for it to do. 
Increasing this value will reduce the background load of the thread, but, also 
potentially increase message latency. It should not be set greater than the 
<code>task.poll.interval.ms</code>.</td>
-</tr>
-<tr>
-<td>task.max.concurrency</td>
-<td>1</td>
-<td>Max number of outstanding messages being processed per task at a time, and 
it’s applicable to both StreamTask and AsyncStreamTask. The values can 
be:<br><br><code>1</code><br>Each task processes one message at a time. Next 
message will wait until the current message process completes. This ensures 
strict in-order processing.<br><br><code>&gt;1</code><br>Multiple outstanding 
messages are allowed to be processed per task at a time. The completion can be 
out of order. This option increases the parallelism within a task, but may 
result in out-of-order processing.</td>
-</tr>
-<tr>
-<td>task.name.grouper.factory</td>
-<td><code>org.apache.samza.</code><br><code>container.grouper.task.</code><br><code>GroupByContainerCountFactory</code></td>
-<td>The fully-qualified name of the Java class which determines the factory 
class which will build the TaskNameGrouper. The default configuration value if 
the property is not present is 
task.name.grouper.factory=<code>org.apache.samza.container.grouper.task.</code><br><code>GroupByContainerCountFactory</code>. The
 user can specify a custom implementation of the TaskNameGrouperFactory where a 
custom logic is implemented for grouping the tasks.<br>Note: For non-cluster 
applications (ones using coordination service) one must use 
<code>org.apache.samza.container.grouper.</code><br><code>task.GroupByContainerIdsFactory</code></td>
-</tr>
-<tr>
-<td>task.opts</td>
-<td></td>
-<td>Any JVM options to include in the command line when executing Samza 
containers. For example, this can be used to set the JVM heap size, to tune the 
garbage collector, or to enable remote debugging. This cannot be used when 
running with ThreadJobFactory. Anything you put in task.opts gets forwarded 
directly to the commandline as part of the JVM invocation.<br>Example: 
<code>task.opts=-XX:+HeapDumpOnOutOfMemoryError 
-XX:+UseConcMarkSweepGC</code></td>
-</tr>
-<tr>
-<td>task.poll.interval.ms</td>
-<td>50</td>
-<td>Samza&rsquo;s container polls for more messages under two conditions. The 
first condition arises when there are simply no remaining buffered messages to 
process for any input SystemStreamPartition. The second condition arises when 
some input SystemStreamPartitions have empty buffers, but some do not. In the 
latter case, a polling interval is defined to determine how often to refresh 
the empty SystemStreamPartition buffers. By default, this interval is 50ms, 
which means that any empty SystemStreamPartition buffer will be refreshed at 
least every 50ms. A higher value here means that empty SystemStreamPartitions 
will be refreshed less often, which means more latency is introduced, but less 
CPU and network will be used. Decreasing this value means that empty 
SystemStreamPartitions are refreshed more frequently, thereby introducing less 
latency, but increasing CPU and network utilization.</td>
-</tr>
-<tr>
-<td>task.shutdown.ms</td>
-<td>30000</td>
-<td>This property controls how long the Samza container will wait for an 
orderly shutdown of task instances.</td>
-</tr>
-</tbody></table>
-
-<h3 id="2-checkpointing"><a name="checkpointing"></a> <a 
href="#checkpointing">2. Checkpointing</a></h3>
-
-<p><a href="../container/checkpointing.html">Checkpointing</a> is not 
required, but recommended for most jobs. If you don&rsquo;t configure 
checkpointing, and a job or container restarts, it does not remember which 
messages it has already processed. Without checkpointing, consumer behavior on 
startup is determined by the &hellip;samza.offset.default setting. 
Checkpointing allows a job to start up where it previously left off.</p>
-
-<table><thead>
-<tr>
-<th>Name</th>
-<th>Default</th>
-<th>Description</th>
-</tr>
-</thead><tbody>
-<tr>
-<td>task.checkpoint.factory</td>
-<td></td>
-<td>To enable <a href="../container/checkpointing.html">checkpointing</a>, you 
must set this property to the fully-qualified name of a Java class that 
implements <a 
href="../api/javadocs/org/apache/samza/checkpoint/CheckpointManagerFactory.html">CheckpointManagerFactory</a>.
 Samza ships with two checkpoint managers by default: 
<br><br><code>org.apache.samza.checkpoint.kafka.KafkaCheckpointManagerFactory</code>
 <br>Writes checkpoints to a dedicated topic on a Kafka cluster. This is the 
recommended option if you are already using Kafka for input or output streams. 
Use the task.checkpoint.system property to configure which Kafka cluster to use 
for 
checkpoints.<br><br><code>org.apache.samza.checkpoint.file.FileSystemCheckpointManagerFactory</code>
 <br><strong>For dev deployments only.</strong> Writes checkpoints to files on 
the local filesystem. You can configure the file path with the 
task.checkpoint.path property. This is a simple option if your job always runs 
on the same machine. On a multi-machine cluster, this would require a network filesystem mount.</td>
-</tr>
-<tr>
-<td>task.commit.ms</td>
-<td>60000</td>
-<td>If task.checkpoint.factory is configured, this property determines how 
often a checkpoint is written. The value is the time between checkpoints, in 
milliseconds. The frequency of checkpointing affects failure recovery: if a 
container fails unexpectedly (e.g. due to crash or machine failure) and is 
restarted, it resumes processing at the last checkpoint. Any messages processed 
since the last checkpoint on the failed container are processed again. 
Checkpointing more frequently reduces the number of messages that may be 
processed twice, but also uses more resources.</td>
-</tr>
-</tbody></table>
-
-<h5 id="2-1-advanced-checkpointing-configurations"><a 
name="advanced-checkpointing-configuration"></a><a 
href="#advanced-checkpointing-configuration">2.1 Advanced Checkpointing 
Configurations</a></h5>
-
-<table><thead>
-<tr>
-<th>Name</th>
-<th>Default</th>
-<th>Description</th>
-</tr>
-</thead><tbody>
-<tr>
-<td>task.checkpoint.system</td>
-<td>inherited from job.default.system</td>
-<td>This property is required if you would like to override the system defined 
in <code>job.default.system</code> for checkpointing. You must set it to the 
<em><strong>system-name</strong></em> of the desired checkpointing system. The 
stream name (topic name) within that system is automatically determined from 
the job name and ID: <code>__samza_checkpoint_${job.name}_${job.id}</code> 
(with underscores in the job name and ID replaced by hyphens).</td>
-</tr>
-<tr>
-<td>job.checkpoint.validation.enabled</td>
-<td>true</td>
-<td>This setting controls whether the job should fail (true) or just warn (false) in 
case validation of the checkpoint topic fails.<br><strong>CAUTION:</strong> this 
configuration needs to be used w/ care. It should only be used as a work-around 
if the checkpoint topic was created with the wrong number of partitions, 
its contents have been corrupted, or the 
<code>SystemStreamPartitionGrouperFactory</code> for the job needs to be 
changed.</td>
-</tr>
-<tr>
-<td>task.checkpoint.path</td>
-<td></td>
-<td>Required if you are using the filesystem for checkpoints. Set this to the 
path on your local filesystem where checkpoint files should be stored.</td>
-</tr>
-<tr>
-<td>task.checkpoint.<br>replication.factor</td>
-<td>2</td>
-<td>If you are using Kafka for checkpoints, this is the number of Kafka nodes 
to which you want the checkpoint topic replicated for durability.</td>
-</tr>
-<tr>
-<td>task.checkpoint.<br>segment.bytes</td>
-<td>26214400</td>
-<td>If you are using Kafka for checkpoints, this is the segment size to be 
used for the checkpoint topic&rsquo;s log segments. Keeping this number small 
is useful because it increases the frequency that Kafka will garbage collect 
old checkpoints.</td>
-</tr>
-</tbody></table>
-
-<h3 id="3-systems-streams"><a name="systems-streams"></a><a 
href="#systems-streams">3. Systems &amp; Streams</a></h3>
+<table>
+  <thead>
+    <tr>
+      <th>Name</th>
+      <th>Default</th>
+      <th>Description</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>app.name</td>
+      <td> </td>
+      <td><strong>Required:</strong> The name of your application.</td>
+    </tr>
+    <tr>
+      <td>app.id</td>
+      <td>1</td>
+      <td>If you run several instances of your application at the same time, 
you need to give each instance a different app.id. This is important, since 
otherwise the applications will overwrite each other’s checkpoints, and 
perhaps interfere with each other in other ways.</td>
+    </tr>
+    <tr>
+      <td>app.class</td>
+      <td> </td>
+      <td>This is <strong>required if running on YARN</strong>. The 
application to run. The value is a fully-qualified Java classname, which must 
implement StreamApplication. A StreamApplication describes a series of 
transformations on the streams.</td>
+    </tr>
+    <tr>
+      <td>job.factory.class</td>
+      <td> </td>
+      <td>This is <strong>required if running on YARN</strong>. The job 
factory to use for running this job. <br /> The value is a fully-qualified Java 
classname, which must implement StreamJobFactory.<br /> Samza ships with three 
implementations:<br /><br /><code class="language-plaintext 
highlighter-rouge">org.apache.samza.job.yarn.YarnJobFactory</code><br />Runs 
your job on a YARN grid. See below for YARN-specific configuration.<br /><br 
/><code class="language-plaintext 
highlighter-rouge">org.apache.samza.job.local.ThreadJobFactory</code><br 
/><strong>For dev deployments only.</strong> Runs your job on your local 
machine using threads.<br /><br /><code class="language-plaintext 
highlighter-rouge">org.apache.samza.job.local.ProcessJobFactory</code><br 
/><strong>For dev deployments only.</strong> Runs your job on your local 
machine as a subprocess. An optional command builder property can also be 
specified (see task.command.class for details).</td>
+    </tr>
+    <tr>
+      <td>job.name</td>
+      <td> </td>
+      <td><em>(Deprecated in favor of app.name)</em>  The name of your job. 
This name appears on the Samza dashboard, and it is used to tell apart this 
job’s checkpoints from other jobs’ checkpoints.</td>
+    </tr>
+    <tr>
+      <td>job.id</td>
+      <td>1</td>
+      <td><em>(Deprecated in favor of app.id)</em> If you run several 
instances of your job at the same time, you need to give each execution a 
different job.id. This is important, since otherwise the jobs will overwrite 
each other’s checkpoints, and perhaps interfere with each other in other 
ways.</td>
+    </tr>
+    <tr>
+      <td>job.default.system</td>
+      <td> </td>
+      <td><strong>Required:</strong> The system-name to use for creating input 
or output streams for which the system is not explicitly configured. This 
property will also be used as default for <code class="language-plaintext 
highlighter-rouge">job.coordinator.system</code>, <code 
class="language-plaintext highlighter-rouge">task.checkpoint.system</code> and 
<code class="language-plaintext highlighter-rouge">job.changelog.system</code> 
if none are defined.</td>
+    </tr>
+    <tr>
+      <td>task.class</td>
+      <td> </td>
+      <td>Used for legacy purposes; replace with <code 
class="language-plaintext highlighter-rouge">app.class</code> in new jobs. The 
fully-qualified name of the Java class which processes incoming messages from 
input streams. The class must implement <a 
href="../api/javadocs/org/apache/samza/task/StreamTask.html">StreamTask</a> or 
<a 
href="../api/javadocs/org/apache/samza/task/AsyncStreamTask.html">AsyncStreamTask</a>,
 and may optionally implement <a 
href="../api/javadocs/org/apache/samza/task/InitableTask.html">InitableTask</a>,
 <a 
href="../api/javadocs/org/apache/samza/task/ClosableTask.html">ClosableTask</a> 
and/or <a 
href="../api/javadocs/org/apache/samza/task/WindowableTask.html">WindowableTask</a>.
 The class will be instantiated several times, once for every input stream 
partition.</td>
+    </tr>
+    <tr>
+      <td>job.host-affinity.enabled</td>
+      <td>false</td>
+      <td>This property indicates whether host-affinity is enabled or not. 
Host-affinity refers to the ability of Samza to request and allocate a 
container on the same host every time the job is deployed. When host-affinity 
is enabled, Samza makes a “best-effort” to honor the host-affinity 
constraint. The property <code class="language-plaintext 
highlighter-rouge">cluster-manager.container.request.timeout.ms</code> 
determines how long to wait before de-prioritizing the host-affinity constraint 
and assigning the container to any available resource.</td>
+    </tr>
+    <tr>
+      <td>job.jmx.enabled</td>
+      <td>true</td>
+      <td>Determines whether a JMX server should be started on the job’s 
JobCoordinator and Container. (true or false).</td>
+    </tr>
+    <tr>
+      <td>task.window.ms</td>
+      <td>-1</td>
+      <td>If task.class implements <a 
href="../api/javadocs/org/apache/samza/task/WindowableTask.html">WindowableTask</a>,
 it can receive a windowing callback in regular intervals. This property 
specifies the time between window() calls, in milliseconds. If the number is 
negative (the default), window() is never called. A <code 
class="language-plaintext highlighter-rouge">window()</code> call will never  
occur concurrently with the processing of a message. If a message is being 
processed when a window() call is due, the invocation of window happens after 
processing the message. This property is set automatically when using join or 
window operators in a High Level API StreamApplication. Note: task.window.ms 
should be set to be much larger than average process or window call duration to 
avoid starving regular processing.</td>
+    </tr>
+    <tr>
+      <td>task.log4j.system</td>
+      <td> </td>
+      <td>Specify the system name for the StreamAppender. If this property is 
not specified in the config, an exception will be thrown. (See <a 
href="logging.html#stream-log4j-appender">Stream Log4j Appender</a>) Example: 
task.log4j.system=kafka</td>
+    </tr>
+    <tr>
+      <td>serializers.registry.<br 
/><strong><em>serde-name</em></strong>.class</td>
+      <td> </td>
+      <td>Use this property to register a serializer/deserializer, which 
defines a way of encoding data as an array of bytes (used for messages in 
streams, and for data in persistent storage). You can give a serde any 
serde-name you want, and reference that name in properties like 
systems.*.samza.key.serde, systems.*.samza.msg.serde, 
streams.*.samza.key.serde, streams.*.samza.msg.serde, stores.*.key.serde and 
stores.*.msg.serde. The value of this property is the fully-qualified name of a 
Java class that implements SerdeFactory. Samza ships with the following serde 
implementations:<br /><br /><code class="language-plaintext 
highlighter-rouge">org.apache.samza.serializers.ByteSerdeFactory</code><br />A 
no-op serde which passes through the undecoded byte array. <br /><br /><code 
class="language-plaintext 
highlighter-rouge">org.apache.samza.serializers.ByteBufferSerdeFactory</code><br
 />Encodes <code class="language-plaintext 
highlighter-rouge">java.nio.ByteBuffer</code> objects. <br />
 <br /><code class="language-plaintext 
highlighter-rouge">org.apache.samza.serializers.IntegerSerdeFactory</code><br 
/>Encodes <code class="language-plaintext 
highlighter-rouge">java.lang.Integer</code> objects as binary (4 bytes 
fixed-length big-endian encoding).<br /><br /><code class="language-plaintext 
highlighter-rouge">org.apache.samza.serializers.StringSerdeFactory</code><br 
/>Encodes <code class="language-plaintext 
highlighter-rouge">java.lang.String</code> objects as UTF-8. <br /><br /><code 
class="language-plaintext 
highlighter-rouge">org.apache.samza.serializers.JsonSerdeFactory</code><br 
/>Encodes nested structures of <code class="language-plaintext 
highlighter-rouge">java.util.Map</code>, <code class="language-plaintext 
highlighter-rouge">java.util.List</code> etc. as JSON. Note: This Serde 
enforces a dash-separated property naming convention, while JsonSerdeV2 
doesn’t. This serde is primarily meant for Samza’s internal usage, and is 
publicly available for backwards compatibility.<br /><br /><code class="language-plaintext 
highlighter-rouge">org.apache.samza.serializers.JsonSerdeV2Factory</code><br 
/>Encodes nested structures of <code class="language-plaintext 
highlighter-rouge">java.util.Map</code>, <code class="language-plaintext 
highlighter-rouge">java.util.List</code> etc. as JSON. Note: This Serde uses 
Jackson’s default (camelCase) property naming convention. This serde should 
be preferred over JsonSerde, especially in High Level API, unless the 
dasherized naming convention is required (e.g., for backwards 
compatibility).<br /><br /><code class="language-plaintext 
highlighter-rouge">org.apache.samza.serializers.LongSerdeFactory</code><br 
/>Encodes <code class="language-plaintext 
highlighter-rouge">java.lang.Long</code> as binary (8 bytes fixed-length 
big-endian encoding).<br /><br /><code class="language-plaintext 
highlighter-rouge">org.apache.samza.serializers.DoubleSerdeFactory</code><br 
/>Encodes <code class="language-plainte
 xt highlighter-rouge">java.lang.Double</code> as binary (8 bytes 
double-precision float point). <br /><br /><code class="language-plaintext 
highlighter-rouge">org.apache.samza.serializers.UUIDSerdeFactory</code><br 
/>Encodes <code class="language-plaintext 
highlighter-rouge">java.util.UUID</code> objects.<br /><br /><code 
class="language-plaintext 
highlighter-rouge">org.apache.samza.serializers.SerializableSerdeFactory</code><br
 />Encodes <code class="language-plaintext 
highlighter-rouge">java.io.Serializable</code> objects.<br /><br /><code 
class="language-plaintext 
highlighter-rouge">org.apache.samza.serializers.MetricsSnapshotSerdeFactory</code><br
 />Encodes <code class="language-plaintext 
highlighter-rouge">org.apache.samza.metrics.reporter.MetricsSnapshot</code> 
objects (which are used for reporting metrics) as JSON.<br /><br /><code 
class="language-plaintext 
highlighter-rouge">org.apache.samza.serializers.KafkaSerdeFactory</code><br 
/>Adapter which allows existing <code class="language-plaintext highlighter-rouge">kafka.serializer.Encoder</code> and 
<code class="language-plaintext 
highlighter-rouge">kafka.serializer.Decoder</code> implementations to be used 
as Samza serdes. Set <code class="language-plaintext 
highlighter-rouge">serializers.registry.serde-name.encoder</code> and  <code 
class="language-plaintext 
highlighter-rouge">serializers.registry.serde-name.decoder</code> to the 
appropriate class names.</td>
+    </tr>
+  </tbody>
+</table>
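+
+<p>To tie the required properties above together, here is a minimal sketch of 
+a job configuration file (the application name, class, and serde name are 
+illustrative placeholders, not defaults):</p>
+
+<pre><code># Required basics
+app.name=my-samza-app
+app.id=1
+# Fully-qualified StreamApplication implementation (required on YARN)
+app.class=com.example.MySamzaApp
+job.factory.class=org.apache.samza.job.yarn.YarnJobFactory
+# Fallback system for input/output, coordinator, checkpoint and changelog streams
+job.default.system=kafka
+# Register a serde under the name "json"; reference it from stream/store serde properties
+serializers.registry.json.class=org.apache.samza.serializers.JsonSerdeV2Factory
+</code></pre>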
+
+<h4 id="-11-advanced-application-configurations"><a 
name="advanced-application-configurations"></a> <a 
href="#advanced-application-configurations">1.1 Advanced Application 
Configurations</a></h4>
+
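+<p>As a worked example of the rewriter properties documented in the table 
+below, this sketch wires up the built-in <code>RegExTopicGenerator</code> (the 
+rewriter name <code>regex-input</code>, the system name, and the topic pattern 
+are illustrative):</p>
+
+<pre><code># Apply the named rewriters in this order
+job.config.rewriters=regex-input
+job.config.rewriter.regex-input.class=org.apache.samza.config.RegExTopicGenerator
+# Consume every topic on the "kafka" system matching this pattern
+job.config.rewriter.regex-input.system=kafka
+job.config.rewriter.regex-input.regex=events-.*
+# Stream-scoped config applied to each matched topic
+job.config.rewriter.regex-input.config.samza.msg.serde=json
+</code></pre>
+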
+<table>
+  <thead>
+    <tr>
+      <th>Name</th>
+      <th>Default</th>
+      <th>Description</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>job.changelog.system</td>
+      <td>inherited from job.default.system</td>
+      <td>This property is required if you would like to override the system 
defined in <code class="language-plaintext 
highlighter-rouge">job.default.system</code> for the changelog. The changelog 
will be used with the stream specified in <code class="language-plaintext 
highlighter-rouge">stores.store-name.changelog</code> config. You can override 
this system by specifying both the system and the stream in <code 
class="language-plaintext 
highlighter-rouge">stores.store-name.changelog</code>.</td>
+    </tr>
+    <tr>
+      <td>job.coordinator.system</td>
+      <td>inherited from job.default.system</td>
+      <td>This property is required if you would like to override the system 
defined in <code class="language-plaintext 
highlighter-rouge">job.default.system</code> for coordination. The 
<strong><em>system-name</em></strong> to use for creating and maintaining the 
Coordinator Stream.</td>
+    </tr>
+    <tr>
+      <td>job.coordinator.segment.<br />bytes</td>
+      <td>26214400</td>
+      <td>If you are using a Kafka system for coordinator stream, this is the 
segment size to be used for the coordinator topic’s log segments. Keeping 
this number small is useful because it increases the frequency that Kafka will 
garbage collect old messages.</td>
+    </tr>
+    <tr>
+      <td>job.coordinator.replication.<br />factor</td>
+      <td>2</td>
+      <td>If you are using a Kafka system for coordinator stream, this is the 
replication factor to be used for the coordinator topic.</td>
+    </tr>
+    <tr>
+      <td>job.coordinator.<br />monitor-partition-change.<br 
/>frequency.ms</td>
+      <td>300000</td>
+      <td>The frequency at which the input streams’ partition count change 
should be detected. When the input partition count change is detected, Samza 
will automatically restart a stateless job or fail a stateful job. A longer 
time interval is recommended for jobs w/ large number of input system stream 
partitions, since gathering partition count may incur measurable overhead to 
the job. You can completely disable partition count monitoring by setting this 
value to 0 or a negative integer, which will also disable auto-restart/failing 
behavior of a Samza job on partition count changes.</td>
+    </tr>
+    <tr>
+      <td>job.coordinator.execute</td>
+      <td>bin/run-jc.sh</td>
+      <td>The command that starts a Samza job coordinator. The script must be 
included in the job package. There is usually no need to customize this.</td>
+    </tr>
+    <tr>
+      <td>job.config.rewriter.<br 
/><strong><em>rewriter-name</em></strong>.class</td>
+      <td>(none)</td>
+      <td>You can optionally define configuration rewriters, which have the 
opportunity to dynamically modify the job configuration before the job is 
started. For example, this can be useful for pulling configuration from an 
external configuration management system, or for determining the set of input 
streams dynamically at runtime. The value of this property is a fully-qualified 
Java classname which must implement <a 
href="../api/javadocs/org/apache/samza/config/ConfigRewriter.html">ConfigRewriter</a>.
 Samza ships with these rewriters by default:<br /><br /><code 
class="language-plaintext 
highlighter-rouge">org.apache.samza.config.RegExTopicGenerator</code><br />When 
consuming from Kafka, this allows you to consume all Kafka topics that match 
some regular expression (rather than having to list each topic explicitly). 
This rewriter has additional configuration.<br /><br /><code 
class="language-plaintext 
highlighter-rouge">org.apache.samza.config.EnvironmentConfigRewriter</code><br /
 >This rewriter takes environment variables that are prefixed with <code 
 >class="language-plaintext highlighter-rouge">SAMZA_</code> and adds them to 
 >the configuration, overriding previous values where they exist. The keys are 
 >lowercased and underscores are converted to dots.</td>
+    </tr>
+    <tr>
+      <td>job.config.rewriters</td>
+      <td>(none)</td>
+      <td>If you have defined configuration rewriters, you need to list them 
here, in the order in which they should be applied. The value of this property 
is a comma-separated list of <strong><em>rewriter-name</em></strong> 
tokens.</td>
+    </tr>
+    <tr>
+      <td>job.config.rewriter.<br 
/><strong><em>rewriter-name</em></strong>.system</td>
+      <td>(none)</td>
+      <td>Set this property to the <code class="language-plaintext 
highlighter-rouge">system-name</code> of the Kafka system from which you want 
to consume all matching topics.</td>
+    </tr>
+    <tr>
+      <td>job.config.rewriter.<br 
/><strong><em>rewriter-name</em></strong>.regex</td>
+      <td>(none)</td>
+      <td>A regular expression specifying which topics you want to consume 
within the Kafka system <code class="language-plaintext 
highlighter-rouge">job.config.rewriter.*.system</code>. Any topics matched by 
this regular expression will be consumed in addition to any topics you specify 
in your application.</td>
+    </tr>
+    <tr>
+      <td>job.config.rewriter.<br 
/><strong><em>rewriter-name</em></strong>.config.*</td>
+      <td> </td>
+      <td>Any properties specified within this namespace are applied to the 
configuration of streams that match the regex in <code 
class="language-plaintext 
highlighter-rouge">job.config.rewriter.*.regex</code>. For example, you can set 
<code class="language-plaintext 
highlighter-rouge">job.config.rewriter.*.config.samza.msg.serde</code> to 
configure the deserializer for messages in the matching streams, which is 
equivalent to setting <code class="language-plaintext 
highlighter-rouge">systems.*.streams.*.samza.msg.serde</code> for each topic 
that matches the regex.</td>
+    </tr>
+    <tr>
+      <td>job.container.thread.<br />pool.size</td>
+      <td>0</td>
+      <td>If configured, the container thread pool will be used to run 
synchronous operations of each task <a href="../container/event-loop.html">in 
parallel</a>. The operations include StreamTask.process(), 
WindowableTask.window(), and internally Task.commit(). If not configured and 
the default value of 0 is used, all task operations will run in a single 
thread.</td>
+    </tr>
+    <tr>
+      <td>job.systemstreampartition.<br />grouper.factory</td>
+      <td><code class="language-plaintext 
highlighter-rouge">org.apache.samza.</code><br /><code 
class="language-plaintext 
highlighter-rouge">container.grouper.stream.</code><br /><code 
class="language-plaintext highlighter-rouge">GroupByPartitionFactory</code></td>
+      <td>A factory class that is used to determine how input 
SystemStreamPartitions are grouped together for processing in individual 
StreamTask instances. The factory must implement the 
SystemStreamPartitionGrouperFactory interface. Once this configuration is set, 
it can’t be changed, since doing so could violate state semantics, and lead 
to a loss of data.<br /><br /><code class="language-plaintext 
highlighter-rouge">org.apache.samza.container.grouper.stream.</code><br /><code 
class="language-plaintext highlighter-rouge">GroupByPartitionFactory</code><br 
/>Groups input stream partitions according to their partition number. This 
grouping leads to a single StreamTask processing all messages for a single 
partition (e.g. partition 0) across all input streams that have a partition 0. 
Therefore, the default is that you get one StreamTask for all input partitions 
with the same partition number. Using this strategy, if two input streams have 
a partition 0, then messages from both partitions will be routed to a single 
StreamTask. This partitioning strategy is 
useful for joining and aggregating streams.<br /><br /><code 
class="language-plaintext 
highlighter-rouge">org.apache.samza.container.grouper.stream.</code><br /><code 
class="language-plaintext 
highlighter-rouge">GroupBySystemStreamPartitionFactory</code><br />Assigns each 
SystemStreamPartition to its own unique StreamTask. The 
GroupBySystemStreamPartitionFactory is useful in cases where you want increased 
parallelism (more containers), and don’t care about co-locating partitions 
for grouping or joins, since it allows for a greater number of StreamTasks to 
be divided up amongst Samza containers.</td>
+    </tr>
+    <tr>
+      <td>job.systemstreampartition.<br />matcher.class</td>
+      <td> </td>
+      <td>If you want to enable static partition assignment, then this is a 
required configuration. The value of this property is a fully-qualified Java 
class name that implements the interface 
org.apache.samza.system.SystemStreamPartitionMatcher. Samza ships with two 
matcher classes:<br /><br /><code class="language-plaintext 
highlighter-rouge">org.apache.samza.system.RangeSystemStreamPartitionMatcher</code><br />This 
class uses a comma-separated list of ranges to determine which partitions 
match and are thus statically assigned to the job. For example, “2,3,1-2” 
statically assigns partitions 1, 2, and 3 for all the specified systems and 
streams (topics in the case of Kafka) to the job. For config validation, each 
element in the comma-separated list must conform to one of the following 
regexes: <code class="language-plaintext highlighter-rouge">(\\d+)</code> or 
<code class="language-plaintext highlighter-rouge">(\\d+-\\d+)</code>. The 
<code class="language-plaintext 
highlighter-rouge">JobConfig.SSP_MATCHER_CLASS_RANGE</code> constant holds the 
canonical name of this class.<br /><br /><code 
class="language-plaintext 
highlighter-rouge">org.apache.samza.system.RegexSystemStreamPartitionMatcher</code><br />This 
class uses a standard Java regex to determine which partitions match and are 
thus statically assigned to the job. For example, “[1-2]” statically assigns 
partitions 1 and 2 for all the specified systems and streams (topics in the 
case of Kafka) to the job. The <code class="language-plaintext 
highlighter-rouge">JobConfig.SSP_MATCHER_CLASS_REGEX</code> constant holds the 
canonical name of this class.</td>
+    </tr>
+    <tr>
+      <td>job.systemstreampartition.<br />matcher.config.<br />range</td>
+      <td> </td>
+      <td>If <code class="language-plaintext 
highlighter-rouge">job.systemstreampartition.matcher.class</code> is specified, 
and the value of this property is <code class="language-plaintext 
highlighter-rouge">org.apache.samza.system.RangeSystemStreamPartitionMatcher</code>,
 then this property is a required configuration. Specify a comma-separated list 
of ranges to determine which partitions are statically assigned to the job. For 
example, “2,3,11-20” statically assigns partitions 2, 3, and 11 through 20 
for all the specified systems and streams (topics in the case of Kafka) to the 
job. A single value like “19” is valid as well, and statically assigns 
partition 19. For config validation, each element in the comma-separated list 
must conform to one of the following regexes: <code 
class="language-plaintext highlighter-rouge">(\\d+)</code> or <code 
class="language-plaintext highlighter-rouge">(\\d+-\\d+)</code></td>
+    </tr>
+    <tr>
+      <td>job.systemstreampartition.<br />matcher.config.<br />regex</td>
+      <td> </td>
+      <td>If <code class="language-plaintext 
highlighter-rouge">job.systemstreampartition.matcher.class</code> is specified, 
and the value of this property is <code class="language-plaintext 
highlighter-rouge">org.apache.samza.system.RegexSystemStreamPartitionMatcher</code>,
 then this property is a required configuration. The value should be a valid 
Java regex. For example, “[1-2]” statically assigns partitions 1 and 2 for 
all the specified systems and streams (topics in the case of Kafka) to the 
job.</td>
+    </tr>
+    <tr>
+      <td>job.systemstreampartition.<br />matcher.config.<br 
/>job.factory.regex</td>
+      <td> </td>
+      <td>This configuration can be used to specify a Java regex to match the 
StreamJobFactory for which static partition assignment should be enabled. It 
also allows the static partition assignment feature to be used with custom 
StreamJobFactory implementations.<br />This config defaults to the 
following value: “<em>org\\.apache\\.samza\\.job\\.local(.*ProcessJobFactory 
| .*ThreadJobFactory)</em>”, which enables static partition assignment when 
job.factory.class is set to <code class="language-plaintext 
highlighter-rouge">org.apache.samza.job.local.ProcessJobFactory</code> or <code 
class="language-plaintext 
highlighter-rouge">org.apache.samza.job.local.ThreadJobFactory</code>.</td>
+    </tr>
+    <tr>
+      <td>job.systemstreampartition.<br />input.expansion.enabled</td>
+      <td>true</td>
+      <td>When enabled, this allows stateful jobs to expand or contract their 
partition count by a multiple of the previous count so that events from an 
input stream partition are processed on the same task as before. This will 
prevent erroneous results. This feature is disabled if the configuration is set 
to false or if the job is stateless. See <a 
href="https://cwiki.apache.org/confluence/display/SAMZA/SEP-5%3A+Enable+partition+expansion+of+input+streams";>SEP-5</a>
 for more details.</td>
+    </tr>
+    <tr>
+      <td>job.security.manager.<br />factory</td>
+      <td>(none)</td>
+      <td>This is the factory class used to create the proper SecurityManager 
to handle security for Samza containers when running in a secure environment, 
such as YARN with Kerberos enabled. Samza ships with one security manager by 
default:<br /><br /><code class="language-plaintext 
highlighter-rouge">org.apache.samza.job.yarn.SamzaYarnSecurityManagerFactory</code><br />Allows 
Samza containers to run properly in a Kerberos-enabled YARN cluster. Each 
Samza container, once started, will create a SamzaContainerSecurityManager. 
SamzaContainerSecurityManager runs on its own thread and updates the user’s 
delegation tokens at the interval specified by 
yarn.token.renewal.interval.seconds. See YARN Security for details.</td>
+    </tr>
+    <tr>
+      <td>task.callback.timeout.ms</td>
+      <td>-1 (no timeout)</td>
+      <td>For an AsyncStreamTask, this defines the maximum allowed time for a 
processAsync callback to complete. For a StreamTask, this is the maximum 
allowed time for a process call to complete. When the timeout occurs, the 
container is shut down. Default is no timeout.</td>
+    </tr>
+    <tr>
+      <td>task.chooser.class</td>
+      <td><code class="language-plaintext 
highlighter-rouge">org.apache.samza.</code><br /><code 
class="language-plaintext highlighter-rouge">system.chooser.</code><br /><code 
class="language-plaintext 
highlighter-rouge">RoundRobinChooserFactory</code></td>
+      <td>This property can be optionally set to override the default <a 
href="../container/streams.html#messagechooser">message chooser</a>, which 
determines the order in which messages from multiple input streams are 
processed. The value of this property is the fully-qualified name of a Java 
class that implements <a 
href="../api/javadocs/org/apache/samza/system/chooser/MessageChooserFactory.html">MessageChooserFactory</a>.</td>
+    </tr>
+    <tr>
+      <td>task.command.class</td>
+      <td><code class="language-plaintext 
highlighter-rouge">org.apache.samza.job.</code><br /><code 
class="language-plaintext highlighter-rouge">ShellCommandBuilder</code></td>
+      <td>The fully-qualified name of the Java class which determines the 
command line and environment variables for a <a 
href="../container/samza-container.html">container</a>. It must be a subclass 
of <a 
href="../api/javadocs/org/apache/samza/job/CommandBuilder.html">CommandBuilder</a>.
 This defaults to task.command.class=<code class="language-plaintext 
highlighter-rouge">org.apache.samza.job.ShellCommandBuilder</code>.</td>
+    </tr>
+    <tr>
+      <td>task.drop.deserialization.errors</td>
+      <td>false</td>
+      <td>This property defines how the system deals with deserialization 
failures. If set to true, the system will skip messages that fail 
deserialization and keep running. If set to false, the system will throw 
exceptions and fail the container.</td>
+    </tr>
+    <tr>
+      <td>task.drop.serialization.errors</td>
+      <td>false</td>
+      <td>This property defines how the system deals with serialization 
failures. If set to true, the system will drop messages that fail 
serialization and keep running. If set to false, the system will throw 
exceptions and fail the container.</td>
+    </tr>
+    <tr>
+      <td>task.drop.producer.errors</td>
+      <td>false</td>
+      <td>If true, producer errors will be logged and ignored. The only 
exceptions that will be thrown are those which are likely caused by the 
application itself (e.g. serialization errors). If false, the producer will be 
closed and producer errors will be propagated upward until the container 
ultimately fails. Failing the container is a safety precaution to ensure the 
latest checkpoints only reflect the events that have been completely and 
successfully processed. However, some applications prefer to remain running at 
all costs, even if that means lost messages. Setting this property to true will 
enable applications to recover from producer errors at the expense of one or 
many (in the case of batching producers) dropped messages. If you enable this, 
it is highly recommended that you also configure alerting on the 
‘producer-send-failed’ metric, since the producer might drop messages 
indefinitely. The logic for this property is specific to each SystemProducer 
implementation. It will have no effect for SystemProducers that ignore the 
property.</td>
+    </tr>
+    <tr>
+      <td>task.ignored.exceptions</td>
+      <td> </td>
+      <td>This property specifies which exceptions should be ignored if thrown 
in a task’s process or window methods. The value is a comma-separated list of 
fully-qualified exception class names, or * to ignore all exceptions.</td>
+    </tr>
+    <tr>
+      <td>task.log4j.location.info.enabled</td>
+      <td>false</td>
+      <td>Defines whether or not to include log4j’s LocationInfo data in 
Log4j StreamAppender messages. LocationInfo includes information such as the 
file, class, and line that wrote a log message. This setting is only active if 
the Log4j stream appender is being used. (See <a 
href="../logging.html#stream-log4j-appender">Stream Log4j Appender</a>)</td>
+    </tr>
+    <tr>
+      <td>task.max.idle.ms</td>
+      <td>10</td>
+      <td>The maximum time to wait for a task worker to complete when there 
are no new messages to handle before resuming the main loop and potentially 
polling for more messages (see <code class="language-plaintext 
highlighter-rouge">task.poll.interval.ms</code>). This timeout value prevents 
the main loop from spinning when there is nothing for it to do. Increasing this 
value will reduce the background load of the thread, but also potentially 
increase message latency. It should not be set greater than <code 
class="language-plaintext highlighter-rouge">task.poll.interval.ms</code>.</td>
+    </tr>
+    <tr>
+      <td>task.max.concurrency</td>
+      <td>1</td>
+      <td>Maximum number of outstanding messages being processed per task at a 
time; applicable to both StreamTask and AsyncStreamTask. The values can 
be:<br /><br /><code class="language-plaintext 
highlighter-rouge">1</code><br />Each task processes one message at a time. 
The next message will wait until processing of the current message completes. 
This ensures strict in-order processing.<br /><br /><code class="language-plaintext 
highlighter-rouge">&gt;1</code><br />Multiple outstanding messages may be 
processed per task at a time, and completion can be out of order. This option 
increases the parallelism within a task, but may result in out-of-order 
processing.</td>
+    </tr>
+    <tr>
+      <td>task.name.grouper.factory</td>
+      <td><code class="language-plaintext 
highlighter-rouge">org.apache.samza.</code><br /><code 
class="language-plaintext highlighter-rouge">container.grouper.task.</code><br 
/><code class="language-plaintext 
highlighter-rouge">GroupByContainerCountFactory</code></td>
+      <td>The fully-qualified name of the factory class that builds the 
TaskNameGrouper. The default value if the property is not present is 
task.name.grouper.factory=<code 
class="language-plaintext 
highlighter-rouge">org.apache.samza.container.grouper.task.</code><br /><code 
class="language-plaintext 
highlighter-rouge">GroupByContainerCountFactory</code>. The user can specify a 
custom implementation of TaskNameGrouperFactory that implements custom logic 
for grouping the tasks.<br />Note: for non-cluster applications (those using 
the coordination service) one must use <code class="language-plaintext 
highlighter-rouge">org.apache.samza.container.grouper.</code><br /><code 
class="language-plaintext 
highlighter-rouge">task.GroupByContainerIdsFactory</code>.</td>
+    </tr>
+    <tr>
+      <td>task.opts</td>
+      <td> </td>
+      <td>Any JVM options to include in the command line when executing Samza 
containers. For example, this can be used to set the JVM heap size, to tune the 
garbage collector, or to enable remote debugging. This cannot be used when 
running with ThreadJobFactory. Anything you put in task.opts gets forwarded 
directly to the command line as part of the JVM invocation.<br />Example: <code 
class="language-plaintext 
highlighter-rouge">task.opts=-XX:+HeapDumpOnOutOfMemoryError 
-XX:+UseConcMarkSweepGC</code></td>
+    </tr>
+    <tr>
+      <td>task.poll.interval.ms</td>
+      <td>50</td>
+      <td>Samza’s container polls for more messages under two conditions. 
The first condition arises when there are simply no remaining buffered messages 
to process for any input SystemStreamPartition. The second condition arises 
when some input SystemStreamPartitions have empty buffers, but some do not. In 
the latter case, a polling interval is defined to determine how often to 
refresh the empty SystemStreamPartition buffers. By default, this interval is 
50ms, which means that any empty SystemStreamPartition buffer will be refreshed 
at least every 50ms. A higher value here means that empty 
SystemStreamPartitions will be refreshed less often, which means more latency 
is introduced, but less CPU and network will be used. Decreasing this value 
means that empty SystemStreamPartitions are refreshed more frequently, thereby 
introducing less latency, but increasing CPU and network utilization.</td>
+    </tr>
+    <tr>
+      <td>task.shutdown.ms</td>
+      <td>30000</td>
+      <td>This property controls how long the Samza container will wait for an 
orderly shutdown of task instances.</td>
+    </tr>
+  </tbody>
+</table>
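+
+<p>As a concrete illustration of the rewriter properties above, the following 
minimal sketch wires up the RegExTopicGenerator. The rewriter name 
<em>regex-rewriter</em>, the system name <em>kafka</em>, the topic pattern, and 
the <em>json</em> serde are placeholder values for illustration, not 
defaults:</p>
+
+<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># register the rewriter and apply it (the class is set via job.config.rewriter.*.class, described above)
+job.config.rewriters=regex-rewriter
+job.config.rewriter.regex-rewriter.class=org.apache.samza.config.RegExTopicGenerator
+# consume every topic on the "kafka" system matching the pattern
+job.config.rewriter.regex-rewriter.system=kafka
+job.config.rewriter.regex-rewriter.regex=page-view-.*
+# apply a serde to every matched stream (the "json" serde must be registered separately)
+job.config.rewriter.regex-rewriter.config.samza.msg.serde=json
+</code></pre></div></div>
+
+<p>Likewise, a minimal sketch of static partition assignment using the range 
matcher described above; the range value is a placeholder:</p>
+
+<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># statically assign partitions 0 through 3 of the input streams to this job
+job.systemstreampartition.matcher.class=org.apache.samza.system.RangeSystemStreamPartitionMatcher
+job.systemstreampartition.matcher.config.range=0-3
+</code></pre></div></div>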
+
+<h3 id="-2-checkpointing"><a name="checkpointing"></a> <a 
href="#checkpointing">2. Checkpointing</a></h3>
+<p><a href="../container/checkpointing.html">Checkpointing</a> is not 
required, but recommended for most jobs. If you don’t configure 
checkpointing, and a job or container restarts, it does not remember which 
messages it has already processed. Without checkpointing, consumer behavior on 
startup is determined by the …samza.offset.default setting. Checkpointing 
allows a job to start up where it previously left off.</p>
+
+<table>
+  <thead>
+    <tr>
+      <th>Name</th>
+      <th>Default</th>
+      <th>Description</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>task.checkpoint.factory</td>
+      <td> </td>
+      <td>To enable <a 
href="../container/checkpointing.html">checkpointing</a>, you must set this 
property to the fully-qualified name of a Java class that implements <a 
href="../api/javadocs/org/apache/samza/checkpoint/CheckpointManagerFactory.html">CheckpointManagerFactory</a>.
 Samza ships with two checkpoint managers by default: <br /><br /><code 
class="language-plaintext 
highlighter-rouge">org.apache.samza.checkpoint.kafka.KafkaCheckpointManagerFactory</code>
 <br />Writes checkpoints to a dedicated topic on a Kafka cluster. This is the 
recommended option if you are already using Kafka for input or output streams. 
Use the task.checkpoint.system property to configure which Kafka cluster to use 
for checkpoints.<br /><br /><code class="language-plaintext 
highlighter-rouge">org.apache.samza.checkpoint.file.FileSystemCheckpointManagerFactory</code>
 <br /><strong>For dev deployments only.</strong> Writes checkpoints to files 
on the local filesystem. You can configure the file path with the 
task.checkpoint.path property. This is a simple option if your job 
always runs on the same machine. On a multi-machine cluster, this would require 
a network filesystem mount.</td>
+    </tr>
+    <tr>
+      <td>task.commit.ms</td>
+      <td>60000</td>
+      <td>If task.checkpoint.factory is configured, this property determines 
how often a checkpoint is written. The value is the time between checkpoints, 
in milliseconds. The frequency of checkpointing affects failure recovery: if a 
container fails unexpectedly (e.g. due to crash or machine failure) and is 
restarted, it resumes processing at the last checkpoint. Any messages processed 
since the last checkpoint on the failed container are processed again. 
Checkpointing more frequently reduces the number of messages that may be 
processed twice, but also uses more resources.</td>
+    </tr>
+  </tbody>
+</table>
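+
+<p>For example, a minimal sketch that enables Kafka-backed checkpointing, 
assuming a Kafka system named <em>kafka</em> has already been defined under 
systems.* (see task.checkpoint.system in section 2.1 below):</p>
+
+<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># write checkpoints to a topic on the "kafka" system
+task.checkpoint.factory=org.apache.samza.checkpoint.kafka.KafkaCheckpointManagerFactory
+task.checkpoint.system=kafka
+# write a checkpoint once a minute (the default)
+task.commit.ms=60000
+</code></pre></div></div>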
+
+<h5 id="21-advanced-checkpointing-configurations"><a 
name="advanced-checkpointing-configuration"></a><a 
href="#advanced-checkpointing-configuration">2.1 Advanced Checkpointing 
Configurations</a></h5>
+<table>
+  <thead>
+    <tr>
+      <th>Name</th>
+      <th>Default</th>
+      <th>Description</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>task.checkpoint.system</td>
+      <td>inherited from job.default.system</td>
+      <td>This property is required if you would like to override the system 
defined in <code class="language-plaintext 
highlighter-rouge">job.default.system</code> for checkpointing. You must set 
it to the <strong><em>system-name</em></strong> of the desired checkpointing 
system. The stream name (topic name) within that system is automatically 
determined from the job name and ID: <code class="language-plaintext 
highlighter-rouge">__samza_checkpoint_${job.name}_${job.id}</code> (with 
underscores in the job name and ID replaced by hyphens).</td>
+    </tr>
+    <tr>
+      <td>job.checkpoint.validation.enabled</td>
+      <td>true</td>
+      <td>This setting controls whether the job should fail (true) or just 
warn (false) if validation of the checkpoint topic fails.<br 
/><strong>CAUTION:</strong> this configuration needs to be used with care. It 
should only be used as a work-around if the checkpoint topic was created with 
the wrong number of partitions, its contents have been corrupted, or the <code 
class="language-plaintext 
highlighter-rouge">SystemStreamPartitionGrouperFactory</code> for the job needs 
to be changed.</td>
+    </tr>
+    <tr>
+      <td>task.checkpoint.path</td>
+      <td> </td>
+      <td>Required if you are using the filesystem for checkpoints. Set this 
to the path on your local filesystem where checkpoint files should be 
stored.</td>
+    </tr>
+    <tr>
+      <td>task.checkpoint.<br />replication.factor</td>
+      <td>2</td>
+      <td>If you are using Kafka for checkpoints, this is the number of Kafka 
nodes to which you want the checkpoint topic replicated for durability.</td>
+    </tr>
+    <tr>
+      <td>task.checkpoint.<br />segment.bytes</td>
+      <td>26214400</td>
+      <td>If you are using Kafka for checkpoints, this is the segment size to 
be used for the checkpoint topic’s log segments. Keeping this number small is 
useful because it increases the frequency with which Kafka will garbage-collect 
old checkpoints.</td>
+    </tr>
+  </tbody>
+</table>
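+
+<p>As a short sketch, the following tunes the Kafka checkpoint topic using the 
properties above; the values shown are illustrative, not recommendations:</p>
+
+<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># replicate the checkpoint topic to 3 Kafka nodes and keep log segments small
+task.checkpoint.replication.factor=3
+task.checkpoint.segment.bytes=26214400
+</code></pre></div></div>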
 
+<h3 id="3-systems--streams"><a name="systems-streams"></a><a 
href="#systems-streams">3. Systems &amp; Streams</a></h3>
 <p>Samza consumes from and produces to <a 
href="../container/streams.html">Streams</a> and has support for a variety of 
Systems including Kafka, HDFS, Azure Event Hubs, Kinesis and ElasticSearch.</p>
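+
+<p>Before the individual properties, a minimal end-to-end sketch of defining a 
system and binding a stream to it; the system name <em>my-kafka</em> and the 
stream and topic names are placeholders:</p>
+
+<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># define a Kafka system named "my-kafka"
+systems.my-kafka.samza.factory=org.apache.samza.system.kafka.KafkaSystemFactory
+# consume one topic from it (legacy task applications only)
+task.inputs=my-kafka.PageViewEvent
+# bind the logical stream-id "page-views" to the physical topic "PageViewEvent"
+streams.page-views.samza.system=my-kafka
+streams.page-views.samza.physical.name=PageViewEvent
+</code></pre></div></div>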
 
-<table><thead>
-<tr>
-<th>Name</th>
-<th>Default</th>
-<th>Description</th>
-</tr>
-</thead><tbody>
-<tr>
-<td>task.inputs</td>
-<td></td>
-<td>This configuration is only required for legacy task applications. A 
comma-separated list of streams that are consumed by this job. Each stream is 
given in the format system-name.stream-name. For example, if you have one input 
system called my-kafka, and want to consume two Kafka topics called 
PageViewEvent and UserActivityEvent, then you would set 
task.inputs=my-kafka.PageViewEvent, my-kafka.UserActivityEvent.</td>
-</tr>
-<tr>
-<td>task.broadcast.inputs</td>
-<td></td>
-<td>This property specifies the partitions that all tasks should consume. The 
systemStreamPartitions you put here will be sent to all the tasks. <br>Format: 
system-name.stream-name#partitionId or 
system-name.stream-name#[startingPartitionId-endingPartitionId] <br>Example: 
task.broadcast.inputs=mySystem.broadcastStream#[0-2], 
mySystem.broadcastStream#0</td>
-</tr>
-<tr>
-<td>systems.<strong><em>system-name</em></strong>.samza.factory</td>
-<td></td>
-<td>The fully-qualified name of a Java class which provides a system. A system 
can provide input streams which you can consume in your Samza job, or output 
streams to which you can write, or both. The requirements on a system are very 
flexible — it may connect to a message broker, or read and write files, or 
use a database, or anything else. The class must implement <a 
href="../api/javadocs/org/apache/samza/system/SystemFactory.html">SystemFactory</a>.
 Alternatively, the user may define the system factory in code using 
SystemDescriptors. Samza ships with the following implementations: 
<br><br><code>org.apache.samza.system.kafka.KafkaSystemFactory</code> <a 
href="#kafka">(Configs)</a><br><code>org.apache.samza.system.hdfs.HdfsSystemFactory</code>
 <a href="#hdfs">(Configs)</a> 
<br><code>org.apache.samza.system.eventhub.EventHubSystemFactory</code> <a 
href="#eventhubs">(Configs)</a><br><code>org.apache.samza.system.kinesis.KinesisSystemFactory</code>
 <a href="#kinesis">(Configs)</
 
a><br><code>org.apache.samza.system..elasticsearch.ElasticsearchSystemFactory</code>
 <a href="#elasticsearch">(Configs)</a></td>
-</tr>
-<tr>
-<td>systems.<strong><em>system-name</em></strong>.default.stream.*</td>
-<td></td>
-<td>A set of default properties for any stream associated with the system. For 
example, if 
&ldquo;systems.kafka-system.default.stream.replication.factor&rdquo;=2 was 
configured, then every Kafka stream created on the kafka-system will have a 
replication factor of 2 unless the property is explicitly overridden at the 
stream scope using streams properties.</td>
-</tr>
-<tr>
-<td>systems.<strong><em>system-name</em></strong>.default.stream.samza.key.serde</td>
-<td></td>
-<td>The <a href="../container/serialization.html">serde</a> which will be used 
to deserialize the key of messages on input streams, and to serialize the key 
of messages on output streams. This property defines the serde for all 
streams in the system. See the stream-scoped property to define the serde for 
an individual stream. If both are defined, the stream-level definition takes 
precedence. The value of this property must be a serde-name that is registered 
with serializers.registry.*.class. If this property is not set, messages are 
passed unmodified between the input stream consumer, the task and the output 
stream producer.</td>
-</tr>
-<tr>
-<td>systems.<strong><em>system-name</em></strong>.default.stream.samza.msg.serde</td>
-<td></td>
-<td>The <a href="../container/serialization.html">serde</a> which will be used 
to deserialize the value of messages on input streams, and to serialize the 
value of messages on output streams. This property defines the serde for 
all streams in the system. See the stream-scoped property to define the serde 
for an individual stream. If both are defined, the stream-level definition 
takes precedence. The value of this property must be a serde-name that is 
registered with serializers.registry.*.class. If this property is not set, 
messages are passed unmodified between the input stream consumer, the task and 
the output stream producer.</td>
-</tr>
-<tr>
-<td>systems.<strong><em>system-name</em></strong>.default.stream.samza.offset.default</td>
-<td><code>upcoming</code></td>
-<td>If a container starts up without a <a 
href="../container/checkpointing.html">checkpoint</a>,  this property 
determines where in the input stream we should start consuming. The value must 
be an <a 
href="../api/javadocs/org/apache/samza/system/SystemStreamMetadata.OffsetType.html">OffsetType</a>,
 one of the following: <br><br><code>upcoming</code> <br>Start processing 
messages that are published after the job starts. Any messages published while 
the job was not running are not processed. <br><br><code>oldest</code> 
<br>Start processing at the oldest available message in the system, and <a 
href="reprocessing.html">reprocess</a> the entire available message history. 
<br><br>This property is for all streams within a system. To set it for an 
individual stream, see streams.stream-id.samza.offset.default. If both are 
defined, the stream-level definition takes precedence.</td>
-</tr>
-<tr>
-<td>streams.<strong><em>stream-id</em></strong>.samza.system</td>
-<td></td>
-<td>The system-name of the system on which this stream will be accessed. This 
property binds the stream to one of the systems defined with the property 
systems.system-name.samza.factory. If this property isn&rsquo;t specified, it 
is inherited from job.default.system.</td>
-</tr>
-<tr>
-<td>streams.<strong><em>stream-id</em></strong>.samza.physical.name</td>
-<td></td>
-<td>The physical name of the stream on the system on which this stream will be 
accessed. This is opposed to the stream-id which is the logical name that Samza 
uses to identify the stream. A physical name could be a Kafka topic name, an 
HDFS file URN or any other system-specific identifier.</td>
-</tr>
-<tr>
-<td>streams.<strong><em>stream-id</em></strong>.samza.key.serde</td>
-<td></td>
-<td>The <a href="../container/serialization.html">serde</a> which will be used 
to deserialize the key of messages on input streams, and to serialize the key 
of messages on output streams. This property defines the serde for an 
individual stream. See the system-scoped property to define the serde for all 
streams within a system. If both are defined, the stream-level definition takes 
precedence. The value of this property must be a serde-name that is registered 
with serializers.registry.*.class. If this property is not set, messages are 
passed unmodified between the input stream consumer, the task and the output 
stream producer.</td>
-</tr>
-<tr>
-<td>streams.<strong><em>stream-id</em></strong>.samza.msg.serde</td>
-<td></td>
-<td>The <a href="../container/serialization.html">serde</a> which will be used 
to deserialize the value of messages on input streams, and to serialize the 
value of messages on output streams. This property defines the serde for an 
individual stream. See the system-scoped property to define the serde for all 
streams within a system. If both are defined, the stream-level definition takes 
precedence. The value of this property must be a serde-name that is registered 
with serializers.registry.*.class. If this property is not set, messages are 
passed unmodified between the input stream consumer, the task and the output 
stream producer.</td>
-</tr>
-<tr>
-<td>streams.<strong><em>stream-id</em></strong>.samza.offset.default</td>
-<td><code>upcoming</code></td>
-<td>If a container starts up without a <a 
href="../container/checkpointing.html">checkpoint</a>, this property determines 
where in the input stream we should start consuming. The value must be an 
<a 
href="../api/javadocs/org/apache/samza/system/SystemStreamMetadata.OffsetType.html">OffsetType</a>, 
one of the following: <br><br><code>upcoming</code> <br>Start processing 
messages that are published after the job starts. Any messages published while 
the job was not running are not processed. <br><br><code>oldest</code> 
<br>Start processing at the oldest available message in the system, and <a 
href="reprocessing.html">reprocess</a> the entire available message history. 
<br><br>This property is for an individual stream. To set it for all streams 
within a system, see  systems.system-name.samza.offset.default. If both are 
defined, the stream-level definition takes precedence.</td>
-</tr>
-</tbody></table>
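+
+<p>A minimal sketch of the stream-scoped serde and offset properties from the 
table above; the <em>json</em> serde name, its factory registration, and the 
stream-id are assumptions for illustration:</p>
+
+<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># register a serde named "json" and apply it to the "page-views" stream
+serializers.registry.json.class=org.apache.samza.serializers.JsonSerdeFactory
+streams.page-views.samza.msg.serde=json
+# with no checkpoint, reprocess the full available history
+streams.page-views.samza.offset.default=oldest
+</code></pre></div></div>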
-
-<h5 id="3-1-advanced-system-stream-configuration"><a 
name="advanced-system-stream-configurations"></a><a 
href="#advanced-system-stream-configurations">3.1 Advanced System &amp; Stream 
Configuration</a></h5>
-
-<table><thead>
-<tr>
-<th>Name</th>
-<th>Default</th>
-<th>Description</th>
-</tr>
-</thead><tbody>
-<tr>
-<td>streams.<strong><em>stream-id</em></strong>.*</td>
-<td></td>
-<td>Any properties of the stream. These are typically system-specific and can 
be used by the system for stream creation or validation. Note that the other 
properties are prefixed with <code>samza.</code>, which distinguishes them as 
Samza properties that are not system-specific.</td>
-</tr>
-<tr>
-<td>streams.<strong><em>stream-id</em></strong>.<br>samza.delete.committed.messages</td>
-<td>false</td>
-<td>If set to true, committed messages of this stream can be deleted. 
Committed messages of this stream will be deleted if 
<code>systems.system-name.samza.delete.committed.messages</code> is also set to 
true.</td>
-</tr>
-<tr>
-<td>streams.<strong><em>stream-id</em></strong>.<br>samza.reset.offset</td>
-<td>false</td>
-<td>If set to true, when a Samza container starts up, it ignores any <a 
href="../container/checkpointing.html">checkpointed offset</a> for this 
particular input stream. Its behavior is thus determined by the 
<code>samza.offset.default</code> setting. Note that the reset takes effect 
every time a container is started, which may be every time you restart your 
job, or more frequently if a container fails and is restarted by the 
framework.</td>
-</tr>
-<tr>
-<td>streams.<strong><em>stream-id</em></strong>.<br>samza.priority</td>
-<td>-1</td>
-<td>If one or more streams have a priority set (any positive integer), they 
will be processed with <a 
href="../container/streams.html#prioritizing-input-streams">higher priority</a> 
than the other streams. You can set several streams to the same priority, or 
define multiple priority levels by assigning a higher number to the 
higher-priority streams. If a higher-priority stream has any messages 
available, they will always be processed first; messages from lower-priority 
streams are only processed when there are no new messages on higher-priority 
inputs.</td>
-</tr>
-<tr>
-<td>streams.<strong><em>stream-id</em></strong>.<br>samza.bootstrap</td>
-<td>false</td>
-<td>If set to true, this stream will be processed as a <a 
href="../container/streams.html#bootstrapping">bootstrap stream</a>. This means 
that every time a Samza container starts up, this stream will be fully consumed 
before messages from any other stream are processed.</td>
-</tr>
-<tr>
-<td>streams.<strong><em>stream-id</em></strong>.<br>samza.broadcast</td>
-<td>false</td>
-<td>If set to true, this stream will be processed as a <a 
href="../container/samza-container.html#broadcast-streams">broadcast 
stream</a>. This means that ALL the partitions of this stream will be delivered 
to all the tasks.</td>
-</tr>
-<tr>
-<td>task.consumer.batch.size</td>
-<td>1</td>
-<td>If set to a positive integer, the task will try to consume batches with 
the given number of messages from each input stream, rather than consuming 
round-robin from all the input streams on each individual message. Setting this 
property can improve performance in some cases.</td>
-</tr>
-</tbody></table>
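+
+<p>A brief sketch combining the advanced stream properties above; the 
stream-ids are hypothetical:</p>
+
+<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># fully consume the "profiles" stream on startup before any other stream
+streams.profiles.samza.bootstrap=true
+# deliver every partition of "config-updates" to every task
+streams.config-updates.samza.broadcast=true
+# process "alerts" messages first whenever any are available
+streams.alerts.samza.priority=2
+</code></pre></div></div>
+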
+<table>
+  <thead>
+    <tr>
+      <th>Name</th>
+      <th>Default</th>
+      <th>Description</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>task.inputs</td>
+      <td> </td>
+      <td>This configuration is only required for legacy task applications. A 
comma-separated list of streams that are consumed by this job. Each stream is 
given in the format system-name.stream-name. For example, if you have one input 
system called my-kafka, and want to consume two Kafka topics called 
PageViewEvent and UserActivityEvent, then you would set 
task.inputs=my-kafka.PageViewEvent, my-kafka.UserActivityEvent.</td>
+    </tr>

[... 1394 lines stripped ...]

