http://git-wip-us.apache.org/repos/asf/metron/blob/ae1d3eb9/site/current-book/metron-platform/metron-data-management/index.html
----------------------------------------------------------------------
diff --git
a/site/current-book/metron-platform/metron-data-management/index.html
b/site/current-book/metron-platform/metron-data-management/index.html
index df34389..dea600c 100644
--- a/site/current-book/metron-platform/metron-data-management/index.html
+++ b/site/current-book/metron-platform/metron-data-management/index.html
@@ -1,359 +1,190 @@
<!DOCTYPE html>
<!--
- | Generated by Apache Maven Doxia at 2018-01-03
- | Rendered using Apache Maven Fluido Skin 1.3.0
+ | Generated by Apache Maven Doxia Site Renderer 1.8 from
src/site/markdown/metron-platform/metron-data-management/index.md at 2018-06-07
+ | Rendered using Apache Maven Fluido Skin 1.7
-->
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
- <meta name="Date-Revision-yyyymmdd" content="20180103" />
+ <meta name="Date-Revision-yyyymmdd" content="20180607" />
<meta http-equiv="Content-Language" content="en" />
<title>Metron – Resource Data Management</title>
- <link rel="stylesheet" href="../../css/apache-maven-fluido-1.3.0.min.css"
/>
+ <link rel="stylesheet" href="../../css/apache-maven-fluido-1.7.min.css" />
<link rel="stylesheet" href="../../css/site.css" />
<link rel="stylesheet" href="../../css/print.css" media="print" />
-
-
- <script type="text/javascript"
src="../../js/apache-maven-fluido-1.3.0.min.js"></script>
-
-
-
-<script type="text/javascript">$( document ).ready( function() { $(
'.carousel' ).carousel( { interval: 3500 } ) } );</script>
-
- </head>
- <body class="topBarDisabled">
-
-
-
-
- <div class="container-fluid">
- <div id="banner">
- <div class="pull-left">
- <a href="http://metron.apache.org/"
id="bannerLeft">
-
<img src="../../images/metron-logo.png" alt="Apache Metron"
width="148px" height="48px"/>
- </a>
- </div>
- <div class="pull-right"> </div>
+ <script type="text/javascript"
src="../../js/apache-maven-fluido-1.7.min.js"></script>
+<script type="text/javascript">
+ $( document ).ready( function() { $( '.carousel' ).carousel( {
interval: 3500 } ) } );
+ </script>
+ </head>
+ <body class="topBarDisabled">
+ <div class="container-fluid">
+ <div id="banner">
+ <div class="pull-left"><a href="http://metron.apache.org/"
id="bannerLeft"><img src="../../images/metron-logo.png" alt="Apache Metron"
width="148px" height="48px"/></a></div>
+ <div class="pull-right"></div>
<div class="clear"><hr/></div>
</div>
<div id="breadcrumbs">
<ul class="breadcrumb">
-
-
- <li class="">
- <a href="http://www.apache.org" class="externalLink"
title="Apache">
- Apache</a>
- </li>
- <li class="divider ">/</li>
- <li class="">
- <a href="http://metron.apache.org/" class="externalLink"
title="Metron">
- Metron</a>
- </li>
- <li class="divider ">/</li>
- <li class="">
- <a href="../../index.html" title="Documentation">
- Documentation</a>
- </li>
- <li class="divider ">/</li>
- <li class="">Resource Data Management</li>
-
-
-
- <li id="publishDate" class="pull-right">Last Published:
2018-01-03</li> <li class="divider pull-right">|</li>
- <li id="projectVersion" class="pull-right">Version: 0.4.2</li>
-
- </ul>
+ <li class=""><a href="http://www.apache.org" class="externalLink"
title="Apache">Apache</a><span class="divider">/</span></li>
+ <li class=""><a href="http://metron.apache.org/" class="externalLink"
title="Metron">Metron</a><span class="divider">/</span></li>
+ <li class=""><a href="../../index.html"
title="Documentation">Documentation</a><span class="divider">/</span></li>
+ <li class="active ">Resource Data Management</li>
+ <li id="publishDate" class="pull-right"><span class="divider">|</span>
Last Published: 2018-06-07</li>
+ <li id="projectVersion" class="pull-right">Version: 0.5.0</li>
+ </ul>
</div>
-
-
<div class="row-fluid">
- <div id="leftColumn" class="span3">
+ <div id="leftColumn" class="span2">
<div class="well sidebar-nav">
-
-
- <ul class="nav nav-list">
- <li class="nav-header">User Documentation</li>
-
- <li>
-
- <a href="../../index.html" title="Metron">
- <i class="icon-chevron-down"></i>
- Metron</a>
- <ul class="nav nav-list">
-
- <li>
-
- <a href="../../Upgrading.html" title="Upgrading">
- <i class="none"></i>
- Upgrading</a>
- </li>
-
- <li>
-
- <a href="../../metron-analytics/index.html"
title="Analytics">
- <i class="icon-chevron-right"></i>
- Analytics</a>
- </li>
-
- <li>
-
- <a
href="../../metron-contrib/metron-docker/index.html" title="Docker">
- <i class="none"></i>
- Docker</a>
- </li>
-
- <li>
-
- <a href="../../metron-deployment/index.html"
title="Deployment">
- <i class="icon-chevron-right"></i>
- Deployment</a>
- </li>
-
- <li>
-
- <a
href="../../metron-interface/metron-alerts/index.html" title="Alerts">
- <i class="none"></i>
- Alerts</a>
- </li>
-
- <li>
-
- <a
href="../../metron-interface/metron-config/index.html" title="Config">
- <i class="none"></i>
- Config</a>
- </li>
-
- <li>
-
- <a
href="../../metron-interface/metron-rest/index.html" title="Rest">
- <i class="none"></i>
- Rest</a>
- </li>
-
- <li>
-
- <a href="../../metron-platform/index.html"
title="Platform">
- <i class="icon-chevron-down"></i>
- Platform</a>
- <ul class="nav nav-list">
-
- <li>
-
- <a
href="../../metron-platform/Performance-tuning-guide.html"
title="Performance-tuning-guide">
- <i class="none"></i>
- Performance-tuning-guide</a>
- </li>
-
- <li>
-
- <a
href="../../metron-platform/metron-api/index.html" title="Api">
- <i class="none"></i>
- Api</a>
- </li>
-
- <li>
-
- <a
href="../../metron-platform/metron-common/index.html" title="Common">
- <i class="none"></i>
- Common</a>
- </li>
-
- <li class="active">
-
- <a href="#"><i class="none"></i>Data-management</a>
- </li>
-
- <li>
-
- <a
href="../../metron-platform/metron-elasticsearch/index.html"
title="Elasticsearch">
- <i class="none"></i>
- Elasticsearch</a>
- </li>
-
- <li>
-
- <a
href="../../metron-platform/metron-enrichment/index.html" title="Enrichment">
- <i class="none"></i>
- Enrichment</a>
- </li>
-
- <li>
-
- <a
href="../../metron-platform/metron-indexing/index.html" title="Indexing">
- <i class="none"></i>
- Indexing</a>
- </li>
-
- <li>
-
- <a
href="../../metron-platform/metron-management/index.html" title="Management">
- <i class="none"></i>
- Management</a>
- </li>
-
- <li>
-
- <a
href="../../metron-platform/metron-parsers/index.html" title="Parsers">
- <i class="icon-chevron-right"></i>
- Parsers</a>
- </li>
-
- <li>
-
- <a
href="../../metron-platform/metron-pcap-backend/index.html"
title="Pcap-backend">
- <i class="none"></i>
- Pcap-backend</a>
- </li>
-
- <li>
-
- <a
href="../../metron-platform/metron-writer/index.html" title="Writer">
- <i class="none"></i>
- Writer</a>
- </li>
- </ul>
- </li>
-
- <li>
-
- <a href="../../metron-sensors/index.html"
title="Sensors">
- <i class="icon-chevron-right"></i>
- Sensors</a>
- </li>
-
- <li>
-
- <a
href="../../metron-stellar/stellar-3rd-party-example/index.html"
title="Stellar-3rd-party-example">
- <i class="none"></i>
- Stellar-3rd-party-example</a>
- </li>
-
- <li>
-
- <a
href="../../metron-stellar/stellar-common/index.html" title="Stellar-common">
- <i class="icon-chevron-right"></i>
- Stellar-common</a>
- </li>
-
- <li>
-
- <a href="../../use-cases/index.html"
title="Use-cases">
- <i class="icon-chevron-right"></i>
- Use-cases</a>
- </li>
- </ul>
- </li>
- </ul>
-
-
-
- <hr class="divider" />
-
- <div id="poweredBy">
- <div class="clear"></div>
- <div class="clear"></div>
- <div class="clear"></div>
- <a href="http://maven.apache.org/" title="Built
by Maven" class="poweredBy">
- <img class="builtBy" alt="Built by Maven"
src="../../images/logos/maven-feather.png" />
- </a>
- </div>
+ <ul class="nav nav-list">
+ <li class="nav-header">User Documentation</li>
+ <li><a href="../../index.html" title="Metron"><span
class="icon-chevron-down"></span>Metron</a>
+ <ul class="nav nav-list">
+ <li><a href="../../CONTRIBUTING.html" title="CONTRIBUTING"><span
class="none"></span>CONTRIBUTING</a></li>
+ <li><a href="../../Upgrading.html" title="Upgrading"><span
class="none"></span>Upgrading</a></li>
+ <li><a href="../../metron-analytics/index.html" title="Analytics"><span
class="icon-chevron-right"></span>Analytics</a></li>
+ <li><a href="../../metron-contrib/metron-docker/index.html"
title="Docker"><span class="none"></span>Docker</a></li>
+ <li><a href="../../metron-contrib/metron-performance/index.html"
title="Performance"><span class="none"></span>Performance</a></li>
+ <li><a href="../../metron-deployment/index.html" title="Deployment"><span
class="icon-chevron-right"></span>Deployment</a></li>
+ <li><a href="../../metron-interface/metron-alerts/index.html"
title="Alerts"><span class="none"></span>Alerts</a></li>
+ <li><a href="../../metron-interface/metron-config/index.html"
title="Config"><span class="none"></span>Config</a></li>
+ <li><a href="../../metron-interface/metron-rest/index.html"
title="Rest"><span class="none"></span>Rest</a></li>
+ <li><a href="../../metron-platform/index.html" title="Platform"><span
class="icon-chevron-down"></span>Platform</a>
+ <ul class="nav nav-list">
+ <li><a href="../../metron-platform/Performance-tuning-guide.html"
title="Performance-tuning-guide"><span
class="none"></span>Performance-tuning-guide</a></li>
+ <li><a href="../../metron-platform/metron-api/index.html"
title="Api"><span class="none"></span>Api</a></li>
+ <li><a href="../../metron-platform/metron-common/index.html"
title="Common"><span class="none"></span>Common</a></li>
+ <li class="active"><a href="#"><span
class="none"></span>Data-management</a></li>
+ <li><a href="../../metron-platform/metron-elasticsearch/index.html"
title="Elasticsearch"><span class="none"></span>Elasticsearch</a></li>
+ <li><a href="../../metron-platform/metron-enrichment/index.html"
title="Enrichment"><span class="icon-chevron-right"></span>Enrichment</a></li>
+ <li><a href="../../metron-platform/metron-indexing/index.html"
title="Indexing"><span class="none"></span>Indexing</a></li>
+ <li><a href="../../metron-platform/metron-management/index.html"
title="Management"><span class="none"></span>Management</a></li>
+ <li><a href="../../metron-platform/metron-parsers/index.html"
title="Parsers"><span class="icon-chevron-right"></span>Parsers</a></li>
+ <li><a href="../../metron-platform/metron-pcap-backend/index.html"
title="Pcap-backend"><span class="none"></span>Pcap-backend</a></li>
+ <li><a href="../../metron-platform/metron-writer/index.html"
title="Writer"><span class="none"></span>Writer</a></li>
+ </ul>
+</li>
+ <li><a href="../../metron-sensors/index.html" title="Sensors"><span
class="icon-chevron-right"></span>Sensors</a></li>
+ <li><a href="../../metron-stellar/stellar-3rd-party-example/index.html"
title="Stellar-3rd-party-example"><span
class="none"></span>Stellar-3rd-party-example</a></li>
+ <li><a href="../../metron-stellar/stellar-common/index.html"
title="Stellar-common"><span
class="icon-chevron-right"></span>Stellar-common</a></li>
+ <li><a href="../../metron-stellar/stellar-zeppelin/index.html"
title="Stellar-zeppelin"><span class="none"></span>Stellar-zeppelin</a></li>
+ <li><a href="../../use-cases/index.html" title="Use-cases"><span
class="icon-chevron-right"></span>Use-cases</a></li>
+ </ul>
+</li>
+</ul>
+ <hr />
+ <div id="poweredBy">
+ <div class="clear"></div>
+ <div class="clear"></div>
+ <div class="clear"></div>
+ <div class="clear"></div>
+<a href="http://maven.apache.org/" title="Built by Maven"
class="poweredBy"><img class="builtBy" alt="Built by Maven"
src="../../images/logos/maven-feather.png" /></a>
+ </div>
</div>
</div>
-
-
- <div id="bodyColumn" class="span9" >
-
- <h1>Resource Data Management</h1>
+ <div id="bodyColumn" class="span10" >
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<h1>Resource Data Management</h1>
<p><a name="Resource_Data_Management"></a></p>
-<p>This project is a collection of classes to assist with loading of various
enrichment and threat intelligence sources into Metron.</p>
<div class="section">
-<h2><a name="Simple_HBase_EnrichmentsThreat_Intelligence"></a>Simple HBase
Enrichments/Threat Intelligence</h2>
-<p>The vast majority of enrichments and threat intelligence processing tend
toward the following pattern:</p>
+<h2><a name="Table_of_Contents"></a>Table of Contents</h2>
+<ul>
+<li><a href="#Overview">Overview</a></li>
+<li><a href="#Simple_HBase_Enrichments.2FThreat_Intelligence">Simple HBase
Enrichments/Threat Intelligence</a></li>
+<li><a href="#Extractor_Framework">Extractor Framework</a></li>
+<li><a href="#Enrichment_Config">Enrichment Config</a></li>
+<li><a href="#Loading_Utilities">Loading Utilities</a></li>
+<li><a href="#Pruning_Data_from_Elasticsearch">Pruning Data from
Elasticsearch</a></li>
+</ul></div>
+<div class="section">
+<h2><a name="Overview"></a>Overview</h2>
+<p>This project is a collection of classes to assist with loading of various
enrichment and threat intelligence sources into Metron.</p></div>
+<div class="section">
+<h2><a name="Simple_HBase_Enrichments.2FThreat_Intelligence"></a>Simple HBase
Enrichments/Threat Intelligence</h2>
+<p>The vast majority of enrichments and threat intelligence processing tend
toward the following pattern:</p>
<ul>
-
+
<li>Take a field</li>
-
<li>Look up the field in a key/value store</li>
-
<li>If the key exists, then either it’s a threat to be alerted or it
should be enriched with the value associated with the key.</li>
</ul>
-<p>As such, we have created this capability as a default threat intel and
enrichment adapter. The basic primitive for simple enrichments and threat
intelligence sources is a complex key containing the following:</p>
-
+<p>As such, we have created this capability as a default threat intel and
enrichment adapter. The basic primitive for simple enrichments and threat
intelligence sources is a complex key containing the following:</p>
<ul>
-
+
<li>Type : The type of threat intel or enrichment (e.g. malicious_ip)</li>
-
<li>Indicator : The indicator in question</li>
-
-<li>Value : The value to associate with the type, indicator pair. This is a
JSON map.</li>
+<li>Value : The value to associate with the type, indicator pair. This is a
JSON map.</li>
</ul>
<p>At present, all of the data-loading utilities function by converting raw data
sources to this primitive key (type, indicator) and value to be placed in
HBase.</p>
<p>In the case of threat intel, a hit on the threat intel table will result
in:</p>
-
<ul>
-
+
<li>The <tt>is_alert</tt> field being set to <tt>true</tt> in the index</li>
-
<li>A field named
<tt>threatintels.hbaseThreatIntel.$field.$threatintel_type</tt> is set to
<tt>alert</tt>
-
<ul>
-
+
<li><tt>$field</tt> is the field in the original document that was a match
(e.g. <tt>src_ip_addr</tt>)</li>
-
<li><tt>$threatintel_type</tt> is the type of threat intel imported (defined
in the Extractor configuration below).</li>
- </ul></li>
</ul>
-<p>In the case of simple hbase enrichment, a hit on the enrichments table will
result in the following new field for each key in the
value:<tt>enrichments.hbaseEnrichment.$field.$enrichment_type.$key</tt> </p>
-
+</li>
+</ul>
+<p>In the case of simple hbase enrichment, a hit on the enrichments table will
result in the following new field for each key in the
value:<tt>enrichments.hbaseEnrichment.$field.$enrichment_type.$key</tt></p>
<ul>
-
-<li><tt>$field</tt> is the field in the original document that was a match
(e.g. <tt>src_ip_addr</tt>)</li>
-
+
+<li><tt>$field</tt> is the field in the original document that was a match
(e.g. <tt>src_ip_addr</tt>)</li>
<li><tt>$enrichment_type</tt> is the type of enrichment imported (defined in
the Extractor configuration below).</li>
-
<li><tt>$key</tt> is a key in the JSON map associated with the row in
HBase.</li>
</ul>
<p>For instance, in the situation where we had the following very silly
key/value in HBase in the enrichment table:</p>
-
<ul>
-
+
<li>indicator: <tt>127.0.0.1</tt></li>
-
<li>type : <tt>important_addresses</tt></li>
-
<li>value: <tt>{ "name" : "localhost",
"location" : "home" }</tt></li>
</ul>
<p>If we had a document whose <tt>ip_src_addr</tt> came through with a value
of <tt>127.0.0.1</tt>, we would have the following fields added to the indexed
document:</p>
-
<ul>
-
+
<li><tt>enrichments.hbaseEnrichment.ip_src_addr.important_addresses.name</tt>
: <tt>localhost</tt></li>
-
<li><tt>enrichments.hbaseEnrichment.ip_src_addr.important_addresses.location</tt>
: <tt>home</tt></li>
</ul></div>
<div class="section">
<h2><a name="Extractor_Framework"></a>Extractor Framework</h2>
-<p>For the purpose of ingesting data of a variety of formats, we have created
an Extractor framework which allows for common data formats to be interpreted
as enrichment or threat intelligence sources. The formats supported at present
are:</p>
-
+<p>For the purpose of ingesting data of a variety of formats, we have created
an Extractor framework which allows for common data formats to be interpreted
as enrichment or threat intelligence sources. The formats supported at present
are:</p>
<ul>
-
+
<li>CSV (both threat intel and enrichment)</li>
-
<li>STIX (threat intel only)</li>
-
<li>Custom (pass your own class)</li>
</ul>
-<p>All of the current utilities take a JSON file to configure how to interpret
input data. This JSON describes the type of data and the schema if necessary
for the data if it is not fixed (as in STIX, e.g.).</p>
+<p>All of the current utilities take a JSON file to configure how to interpret
input data. This JSON describes the type of data and the schema if necessary
for the data if it is not fixed (as in STIX, e.g.).</p>
<div class="section">
<h3><a name="CSV_Extractor"></a>CSV Extractor</h3>
<p>Consider the following example configuration file which describes how to
process a CSV file.</p>
-<div class="source">
-<div class="source">
-<pre>{
+<div>
+<div>
+<pre class="source">{
"config" : {
"columns" : {
"ip" : 0
@@ -366,129 +197,86 @@
,"extractor" : "CSV"
}
</pre></div></div>
-<p>In this example, we have instructed the extractor of the schema (i.e. the
columns field), two columns at the first and third position. We have indicated
that the <tt>ip</tt> column is the indicator type and that the enrichment type
is named <tt>malicious_ip</tt>. We have also indicated that the extractor to
use is the CSV Extractor. The other option is the STIX extractor or a fully
qualified classname for your own extractor.</p>
-<p>The meta column values will show up in the value in HBase because it is
called out as a non-indicator column. The key for the value will be
‘meta’. For instance, given an input string of
<tt>123.45.123.12,something,the grapevine</tt>, the following key, value would
be extracted:</p>
+<p>In this example, we have instructed the extractor of the schema (i.e. the
columns field), two columns at the first and third position. We have indicated
that the <tt>ip</tt> column is the indicator type and that the enrichment type
is named <tt>malicious_ip</tt>. We have also indicated that the extractor to
use is the CSV Extractor. The other option is the STIX extractor or a fully
qualified classname for your own extractor.</p>
+<p>The meta column values will show up in the value in HBase because it is
called out as a non-indicator column. The key for the value will be
‘meta’. For instance, given an input string of
<tt>123.45.123.12,something,the grapevine</tt>, the following key, value would
be extracted:</p>
<ul>
-
+
<li>Indicator : <tt>123.45.123.12</tt></li>
-
<li>Type : <tt>malicious_ip</tt></li>
-
<li>Value : <tt>{ "ip" : "123.45.123.12",
"source" : "the grapevine" }</tt></li>
</ul></div>
<div class="section">
<h3><a name="STIX_Extractor"></a>STIX Extractor</h3>
-<p>Consider the following config for importing STIX documents. This is a
threat intelligence interchange format, so it is particularly relevant and
attractive data to import for our purposes. Because STIX is a standard format,
there is no need to specify the schema or how to interpret the documents.</p>
+<p>Consider the following config for importing STIX documents. This is a
threat intelligence interchange format, so it is particularly relevant and
attractive data to import for our purposes. Because STIX is a standard format,
there is no need to specify the schema or how to interpret the documents.</p>
<p>We support the versions of Stix and Cybox supported by <a
class="externalLink"
href="https://github.com/STIXProject/java-stix/tree/v1.2.0.2">java-stix</a>:</p>
-
<ul>
-
+
<li>Stix - <a class="externalLink"
href="https://github.com/STIXProject/schemas/blob/356cc4f6b06625465f0808388eb166807313b4e0/stix_core.xsd">1.2</a>
and earlier</li>
-
<li>Cybox - <a class="externalLink"
href="https://github.com/CybOXProject/schemas/blob/97beb32c376a9223e91b52cb3e4c8d2af6baf786/cybox_core.xsd">2.1</a>
and earlier</li>
</ul>
<p>We support a subset of STIX messages for importation:</p>
-
<table border="0" class="table table-striped">
- <thead>
-
+<thead>
+
<tr class="a">
-
-<th>STIX Type </th>
-
-<th>Specific Type </th>
-
-<th>Enrichment Type Name </th>
- </tr>
- </thead>
- <tbody>
-
+<th> STIX Type </th>
+<th> Specific Type </th>
+<th> Enrichment Type Name </th></tr>
+</thead><tbody>
+
<tr class="b">
-
-<td>Address </td>
-
-<td>IPV_4_ADDR </td>
-
-<td>address:IPV_4_ADDR </td>
- </tr>
-
+<td> Address </td>
+<td> IPV_4_ADDR </td>
+<td> address:IPV_4_ADDR </td></tr>
<tr class="a">
-
-<td>Address </td>
-
-<td>IPV_6_ADDR </td>
-
-<td>address:IPV_6_ADDR </td>
- </tr>
-
+<td> Address </td>
+<td> IPV_6_ADDR </td>
+<td> address:IPV_6_ADDR </td></tr>
<tr class="b">
-
-<td>Address </td>
-
-<td>E_MAIL </td>
-
-<td>address:E_MAIL </td>
- </tr>
-
+<td> Address </td>
+<td> E_MAIL </td>
+<td> address:E_MAIL </td></tr>
<tr class="a">
-
-<td>Address </td>
-
-<td>MAC </td>
-
-<td>address:MAC </td>
- </tr>
-
+<td> Address </td>
+<td> MAC </td>
+<td> address:MAC </td></tr>
<tr class="b">
-
-<td>Domain </td>
-
-<td>FQDN </td>
-
-<td>domain:FQDN </td>
- </tr>
-
+<td> Domain </td>
+<td> FQDN </td>
+<td> domain:FQDN </td></tr>
<tr class="a">
-
-<td>Hostname </td>
-
-<td> </td>
-
-<td>hostname </td>
- </tr>
-
+<td> Hostname </td>
+<td> </td>
+<td> hostname </td></tr>
<tr class="b">
-
-<td>URI </td>
-
-<td> </td>
-
-<td>uriobjecttype </td>
- </tr>
- </tbody>
+<td> URI </td>
+<td> </td>
+<td> uriobjecttype </td></tr>
+</tbody>
</table>
<p>NOTE: The enrichment type will be used as the type above.</p>
<p>Consider the following configuration for an Extractor</p>
-<div class="source">
-<div class="source">
-<pre>{
+<div>
+<div>
+<pre class="source">{
"config" : {
"stix_address_categories" : "IPV_4_ADDR"
}
,"extractor" : "STIX"
}
</pre></div></div>
-<p>In here, we’re configuring the STIX extractor to load from a series
of STIX files, however we only want to bring in IPv4 addresses from the set of
all possible addresses. Note that if no categories are specified for import,
all are assumed. Also, only address and domain types allow filtering via
<tt>stix_address_categories</tt> and <tt>stix_domain_categories</tt> config
parameters.</p></div>
+
+<p>In here, we’re configuring the STIX extractor to load from a series
of STIX files, however we only want to bring in IPv4 addresses from the set of
all possible addresses. Note that if no categories are specified for import,
all are assumed. Also, only address and domain types allow filtering via
<tt>stix_address_categories</tt> and <tt>stix_domain_categories</tt> config
parameters.</p></div>
<div class="section">
<h3><a name="Common_Extractor_Properties"></a>Common Extractor Properties</h3>
<p>Users also have the ability to transform and filter enrichment and threat
intel data using Stellar as it is loaded into HBase. This feature is available
to all extractor types.</p>
<p>As an example, we will be providing a CSV list of top domains as an
enrichment and filtering the value metadata, as well as the indicator column,
with Stellar expressions.</p>
-<div class="source">
-<div class="source">
-<pre>{
+<div>
+<div>
+<pre class="source">{
"config" : {
"zk_quorum" : "node1:2181",
"columns" : {
@@ -510,154 +298,117 @@
"extractor" : "CSV"
}
</pre></div></div>
-<p>There are 2 property maps that work with full Stellar expressions, and 2
properties that will work with Stellar predicates.</p>
+<p>There are 2 property maps that work with full Stellar expressions, and 2
properties that will work with Stellar predicates.</p>
<table border="0" class="table table-striped">
- <thead>
-
+<thead>
+
+<tr class="a">
+<th> Property </th>
+<th> Description</th></tr>
+</thead><tbody>
+
+<tr class="b">
+<td> <tt>value_transform</tt> </td>
+<td> Transform fields defined in the “columns” mapping with
Stellar transformations. New keys introduced in the transform will be added to
the key metadata.</td></tr>
<tr class="a">
-
-<th>Property </th>
-
-<th>Description</th>
- </tr>
- </thead>
- <tbody>
-
+<td> <tt>value_filter</tt> </td>
+<td> Allows additional filtering with Stellar predicates based on results from
the value transformations. In this example, records whose domain property is
empty after removing the TLD will be omitted.</td></tr>
<tr class="b">
-
-<td>value_transform </td>
-
-<td>Transform fields defined in the “columns” mapping with
Stellar transformations. New keys introduced in the transform will be added to
the key metadata.</td>
- </tr>
-
+<td> <tt>indicator_transform</tt></td>
+<td> Transform the indicator column independent of the value transformations.
You can refer to the original indicator value by using
“indicator” as the variable name, as shown in the example above.
In addition, if you prefer to piggyback your transformations, you can refer to
the variable “domain”, which will allow your indicator transforms
to inherit transformations done to this value during the value
transformations.</td></tr>
<tr class="a">
-
-<td>value_filter </td>
-
-<td>Allows additional filtering with Stellar predicates based on results from
the value transformations. In this example, records whose domain property is
empty after removing the TLD will be omitted.</td>
- </tr>
-
+<td> <tt>indicator_filter</tt> </td>
+<td> Allows additional filtering with Stellar predicates based on results from
the value transformations. In this example, records whose indicator value is
empty after removing the TLD will be omitted.</td></tr>
<tr class="b">
-
-<td>indicator_transform </td>
-
-<td>Transform the indicator column independent of the value transformations.
You can refer to the original indicator value by using
“indicator” as the variable name, as shown in the example above.
In addition, if you prefer to piggyback your transformations, you can refer to
the variable “domain”, which will allow your indicator transforms
to inherit transformations done to this value during the value
transformations.</td>
- </tr>
-
+<td> <tt>state_init</tt> </td>
+<td> Allows a state object to be initialized. This is a string, so a single
expression is created. The output of this expression will be available as the
<tt>state</tt> variable. This is to be used with the
<tt>flatfile_summarizer.sh</tt> rather than the loader.</td></tr>
<tr class="a">
-
-<td>indicator_filter </td>
-
-<td>Allows additional filtering with Stellar predicates based on results from
the value transformations. In this example, records whose indicator value is
empty after removing the TLD will be omitted.</td>
- </tr>
- </tbody>
+<td> <tt>state_update</tt> </td>
+<td> Allows a state object to be updated. This is a map, so you can have
temporary variables here. Note that you can reference the <tt>state</tt>
variable from this. This is to be used with the
<tt>flatfile_summarizer.sh</tt> rather than the loader.</td></tr>
+<tr class="b">
+<td> <tt>state_merge</tt> </td>
+<td> Allows a list of states to be merged. This is a string, so a single
expression. There is a special field called <tt>states</tt> available, which
is a list of the states (one per thread). This is to be used with the
<tt>flatfile_summarizer.sh</tt> rather than the loader.</td></tr>
+</tbody>
</table>
<p>top-list.csv</p>
-<div class="source">
-<div class="source">
-<pre>1,google.com
+<div>
+<div>
+<pre class="source">1,google.com
2,yahoo.com
...
</pre></div></div>
-<p>Running a file import with the above data and extractor configuration would
result in the following 2 extracted data records:</p>
+<p>Running a file import with the above data and extractor configuration would
result in the following 2 extracted data records:</p>
<table border="0" class="table table-striped">
- <thead>
-
+<thead>
+
<tr class="a">
-
-<th>Indicator </th>
-
-<th>Type </th>
-
-<th>Value </th>
- </tr>
- </thead>
- <tbody>
-
+<th> Indicator </th>
+<th> Type </th>
+<th> Value </th></tr>
+</thead><tbody>
+
<tr class="b">
-
-<td>google </td>
-
-<td>top_domains </td>
-
-<td>{ “rank” : “1”, “domain” :
“google” } </td>
- </tr>
-
+<td> google </td>
+<td> top_domains </td>
+<td> { “rank” : “1”, “domain” :
“google” } </td></tr>
<tr class="a">
-
-<td>yahoo </td>
-
-<td>top_domains </td>
-
-<td>{ “rank” : “2”, “domain” :
“yahoo” } </td>
- </tr>
- </tbody>
+<td> yahoo </td>
+<td> top_domains </td>
+<td> { “rank” : “2”, “domain” :
“yahoo” } </td></tr>
+</tbody>
</table>
<p>Similar to the parser framework, providing a Zookeeper quorum via the
zk_quorum property will enable Stellar to access properties that reside in the
global config. Expanding on our example above, if the global config looks as
follows:</p>
-<div class="source">
-<div class="source">
-<pre>{
+<div>
+<div>
+<pre class="source">{
"global_property" : "metron-ftw"
}
</pre></div></div>
+
<p>And we expand our value_transform:</p>
-<div class="source">
-<div class="source">
-<pre>...
+<div>
+<div>
+<pre class="source">...
"value_transform" : {
"domain" : "DOMAIN_REMOVE_TLD(domain)",
"a-new-prop" : "global_property"
},
...
</pre></div></div>
-<p>The resulting value data would look like the following:</p>
+<p>The resulting value data would look like the following:</p>
<table border="0" class="table table-striped">
- <thead>
-
+<thead>
+
<tr class="a">
-
-<th>Indicator </th>
-
-<th>Type </th>
-
-<th>Value </th>
- </tr>
- </thead>
- <tbody>
-
+<th> Indicator </th>
+<th> Type </th>
+<th> Value
</th></tr>
+</thead><tbody>
+
<tr class="b">
-
-<td>google </td>
-
-<td>top_domains </td>
-
-<td>{ “rank” : “1”, “domain” :
“google”, “a-new-prop” : “metron-ftw”
} </td>
- </tr>
-
+<td> google </td>
+<td> top_domains </td>
+<td> { “rank” : “1”, “domain” :
“google”, “a-new-prop” : “metron-ftw”
} </td></tr>
<tr class="a">
-
-<td>yahoo </td>
-
-<td>top_domains </td>
-
-<td>{ “rank” : “2”, “domain” :
“yahoo”, “a-new-prop” : “metron-ftw”
} </td>
- </tr>
- </tbody>
+<td> yahoo </td>
+<td> top_domains </td>
+<td> { “rank” : “2”, “domain” :
“yahoo”, “a-new-prop” : “metron-ftw”
} </td></tr>
+</tbody>
</table></div></div>
<div class="section">
<h2><a name="Enrichment_Config"></a>Enrichment Config</h2>
-<p>In order to automatically add new enrichment and threat intel types to
existing, running enrichment topologies, you will need to add new fields and
new types to the zookeeper configuration. A convenience parameter has been made
to assist in this when doing an import. Namely, you can specify the enrichment
configs and how they associate with the fields of the documents flowing through
the enrichment topology.</p>
-<p>Consider the following Enrichment Configuration JSON. This one is for a
threat intelligence type:</p>
+<p>In order to automatically add new enrichment and threat intel types to
existing, running enrichment topologies, you will need to add new fields and
new types to the zookeeper configuration. A convenience parameter has been
made to assist in this when doing an import. Namely, you can specify the
enrichment configs and how they associate with the fields of the documents
flowing through the enrichment topology.</p>
+<p>Consider the following Enrichment Configuration JSON. This one is for a
threat intelligence type:</p>
-<div class="source">
-<div class="source">
-<pre>{
+<div>
+<div>
+<pre class="source">{
"zkQuorum" : "localhost:2181"
,"sensorToFieldList" : {
"bro" : {
@@ -670,35 +421,32 @@
}
}
</pre></div></div>
-<p>We have to specify the following:</p>
+<p>We have to specify the following:</p>
<ul>
-
+
<li>The zookeeper quorum which holds the cluster configuration</li>
-
<li>The mapping between the fields in the enriched documents and the
enrichment types.</li>
</ul>
<p>This configuration allows the ingestion tools to update zookeeper
post-ingestion so that the enrichment topology can take advantage immediately
of the new type.</p></div>
<div class="section">
<h2><a name="Loading_Utilities"></a>Loading Utilities</h2>
<p>The two configurations above are used in the three separate ingestion
tools:</p>
-
<ul>
-
+
<li>Taxii Loader</li>
-
<li>Bulk load from HDFS via MapReduce</li>
-
<li>Flat File ingestion</li>
</ul>
<div class="section">
<h3><a name="Taxii_Loader"></a>Taxii Loader</h3>
-<p>The shell script <tt>$METRON_HOME/bin/threatintel_taxii_load.sh</tt> can be
used to poll a Taxii server for STIX documents and ingest them into HBase.<br
/>It is quite common for this Taxii server to be an aggregation server such as
Soltra Edge.</p>
-<p>In addition to the Enrichment and Extractor configs described above, this
loader requires a configuration file describing the connection information to
the Taxii server. An illustrative example of such a configuration file is:</p>
+<p>The shell script <tt>$METRON_HOME/bin/threatintel_taxii_load.sh</tt> can be
used to poll a Taxii server for STIX documents and ingest them into HBase.<br />
+It is quite common for this Taxii server to be an aggregation server such as
Soltra Edge.</p>
+<p>In addition to the Enrichment and Extractor configs described above, this
loader requires a configuration file describing the connection information to
the Taxii server. An illustrative example of such a configuration file is:</p>
-<div class="source">
-<div class="source">
-<pre>{
+<div>
+<div>
+<pre class="source">{
"endpoint" :
"http://localhost:8282/taxii-discovery-service"
,"type" : "DISCOVER"
,"collection" : "guest.Abuse_ch"
@@ -707,363 +455,379 @@
,"allowedIndicatorTypes" : [ "domainname:FQDN",
"address:IPV_4_ADDR" ]
}
</pre></div></div>
-<p>As you can see, we are specifying the following information:</p>
+<p>As you can see, we are specifying the following information:</p>
<ul>
-
+
<li>endpoint : The URL of the endpoint</li>
-
<li>type : <tt>POLL</tt> or <tt>DISCOVER</tt> depending on the endpoint.</li>
-
<li>collection : The Taxii collection to ingest</li>
-
<li>table : The HBase table to import into</li>
-
<li>columnFamily : The column family to import into</li>
-
<li>allowedIndicatorTypes : an array of acceptable threat intel types (see the
“Enrichment Type Name” column of the Stix table above for the
possibilities).</li>
</ul>
<p>The parameters for the utility are as follows:</p>
-
<table border="0" class="table table-striped">
- <thead>
-
+<thead>
+
<tr class="a">
-
-<th>Short Code </th>
-
-<th>Long Code </th>
-
-<th>Is Required? </th>
-
-<th>Description </th>
- </tr>
- </thead>
- <tbody>
-
+<th> Short Code </th>
+<th> Long Code </th>
+<th> Is Required? </th>
+<th> Description
</th></tr>
+</thead><tbody>
+
<tr class="b">
-
-<td>-h </td>
-
-<td> </td>
-
-<td>No </td>
-
-<td>Generate the help screen/set of options </td>
- </tr>
-
+<td> -h </td>
+<td> </td>
+<td> No </td>
+<td> Generate the help screen/set of options
</td></tr>
<tr class="a">
-
-<td>-e </td>
-
-<td>–extractor_config </td>
-
-<td>Yes </td>
-
-<td>JSON Document describing the extractor for this input data source </td>
- </tr>
-
+<td> -e </td>
+<td> –extractor_config </td>
+<td> Yes </td>
+<td> JSON Document describing the extractor for this input data source
</td></tr>
<tr class="b">
-
-<td>-c </td>
-
-<td>–taxii_connection_config </td>
-
-<td>Yes </td>
-
-<td>The JSON config file to configure the connection </td>
- </tr>
-
+<td> -c </td>
+<td> –taxii_connection_config </td>
+<td> Yes </td>
+<td> The JSON config file to configure the connection
</td></tr>
<tr class="a">
-
-<td>-p </td>
-
-<td>–time_between_polls </td>
-
-<td>No </td>
-
-<td>The time between polling the Taxii server in milliseconds. (default: 1
hour) </td>
- </tr>
-
+<td> -p </td>
+<td> –time_between_polls </td>
+<td> No </td>
+<td> The time between polling the Taxii server in milliseconds. (default: 1
hour)
</td></tr>
<tr class="b">
-
-<td>-b </td>
-
-<td>–begin_time </td>
-
-<td>No </td>
-
-<td>Start time to poll the Taxii server (all data from that point will be
gathered in the first pull). The format for the date is yyyy-MM-dd HH:mm:ss
</td>
- </tr>
-
+<td> -b </td>
+<td> –begin_time </td>
+<td> No </td>
+<td> Start time to poll the Taxii server (all data from that point will be
gathered in the first pull). The format for the date is yyyy-MM-dd HH:mm:ss
</td></tr>
<tr class="a">
-
-<td>-l </td>
-
-<td>–log4j </td>
-
-<td>No </td>
-
-<td>The Log4j Properties to load </td>
- </tr>
-
+<td> -l </td>
+<td> –log4j </td>
+<td> No </td>
+<td> The Log4j Properties to load
</td></tr>
<tr class="b">
-
-<td>-n </td>
-
-<td>–enrichment_config </td>
-
-<td>No </td>
-
-<td>The JSON document describing the enrichments to configure. Unlike other
loaders, this is run first if specified. </td>
- </tr>
- </tbody>
+<td> -n </td>
+<td> –enrichment_config </td>
+<td> No </td>
+<td> The JSON document describing the enrichments to configure. Unlike other
loaders, this is run first if specified.
</td></tr>
+</tbody>
</table></div>
<div class="section">
<h3><a name="Flatfile_Loader"></a>Flatfile Loader</h3>
-<p>The shell script <tt>$METRON_HOME/bin/flatfile_loader.sh</tt> will read
data from local disk, HDFS or URLs and load the enrichment or threat intel data
into an HBase table.<br />Note: This utility works for enrichment as well as
threat intel due to the underlying infrastructure being the same.</p>
+<p>The shell script <tt>$METRON_HOME/bin/flatfile_loader.sh</tt> will read
data from local disk, HDFS or URLs and load the enrichment or threat intel data
into an HBase table.<br />
+Note: This utility works for enrichment as well as threat intel due to the
underlying infrastructure being the same.</p>
<p>One special thing to note here is that there is a special configuration
parameter to the Extractor config that is only considered during this
loader:</p>
-
<ul>
-
-<li>inputFormat : This specifies how to consider the data. The two
implementations are <tt>BY_LINE</tt> and <tt>WHOLE_FILE</tt>.</li>
+
+<li>inputFormat : This specifies how to consider the data. The two
implementations are <tt>BY_LINE</tt> and <tt>WHOLE_FILE</tt>.</li>
</ul>
<p>The default is <tt>BY_LINE</tt>, which makes sense for a list of CSVs where
each line indicates a unit of information which can be imported. However, if
you are importing a set of STIX documents, then you want each document to be
considered as input to the Extractor.</p>
<p>The parameters for the utility are as follows:</p>
-
<table border="0" class="table table-striped">
- <thead>
-
+<thead>
+
<tr class="a">
-
-<th>Short Code </th>
-
-<th>Long Code </th>
-
-<th>Is Required? </th>
-
-<th>Description </th>
- </tr>
- </thead>
- <tbody>
-
+<th> Short Code </th>
+<th> Long Code </th>
+<th> Is Required? </th>
+<th> Description
</th></tr>
+</thead><tbody>
+
<tr class="b">
-
-<td>-h </td>
-
-<td> </td>
-
-<td>No </td>
-
-<td>Generate the help screen/set of options </td>
- </tr>
-
+<td> -h </td>
+<td> </td>
+<td> No </td>
+<td> Generate the help screen/set of options
</td></tr>
<tr class="a">
-
-<td>-q </td>
-
-<td>–quiet </td>
-
-<td>No </td>
-
-<td>Do not update progress </td>
- </tr>
-
+<td> -q </td>
+<td> –quiet </td>
+<td> No </td>
+<td> Do not update progress
</td></tr>
<tr class="b">
-
-<td>-e </td>
-
-<td>–extractor_config </td>
-
-<td>Yes </td>
-
-<td>JSON Document describing the extractor for this input data source </td>
- </tr>
-
+<td> -e </td>
+<td> –extractor_config </td>
+<td> Yes </td>
+<td> JSON Document describing the extractor for this input data source
</td></tr>
<tr class="a">
-
-<td>-m </td>
-
-<td>–import_mode </td>
-
-<td>No </td>
-
-<td>The Import mode to use: LOCAL, MR. Default: LOCAL </td>
- </tr>
-
+<td> -m </td>
+<td> –import_mode </td>
+<td> No </td>
+<td> The Import mode to use: LOCAL, MR. Default: LOCAL
</td></tr>
<tr class="b">
-
-<td>-t </td>
-
-<td>–hbase_table </td>
-
-<td>Yes </td>
-
-<td>The HBase table to import into </td>
- </tr>
-
+<td> -t </td>
+<td> –hbase_table </td>
+<td> Yes </td>
+<td> The HBase table to import into
</td></tr>
<tr class="a">
-
-<td>-c </td>
-
-<td>–hbase_cf </td>
-
-<td>Yes </td>
-
-<td>The HBase table column family to import into </td>
- </tr>
-
+<td> -c </td>
+<td> –hbase_cf </td>
+<td> Yes </td>
+<td> The HBase table column family to import into
</td></tr>
<tr class="b">
-
-<td>-i </td>
-
-<td>–input </td>
-
-<td>Yes </td>
-
-<td>The input data location on local disk. If this is a file, then that file
will be loaded. If this is a directory, then the files will be loaded
recursively under that directory. </td>
- </tr>
-
+<td> -i </td>
+<td> –input </td>
+<td> Yes </td>
+<td> The input data location on local disk. If this is a file, then that file
will be loaded. If this is a directory, then the files will be loaded
recursively under that directory. </td></tr>
<tr class="a">
-
-<td>-l </td>
-
-<td>–log4j </td>
-
-<td>No </td>
-
-<td>The log4j properties file to load </td>
- </tr>
-
+<td> -l </td>
+<td> –log4j </td>
+<td> No </td>
+<td> The log4j properties file to load
</td></tr>
<tr class="b">
-
-<td>-n </td>
-
-<td>–enrichment_config </td>
-
-<td>No </td>
-
-<td>The JSON document describing the enrichments to configure. Unlike other
loaders, this is run first if specified. </td>
- </tr>
-
+<td> -n </td>
+<td> –enrichment_config </td>
+<td> No </td>
+<td> The JSON document describing the enrichments to configure. Unlike other
loaders, this is run first if specified.
</td></tr>
<tr class="a">
-
-<td>-p </td>
-
-<td>–threads </td>
-
-<td>No </td>
-
-<td>The number of threads to use when extracting data. The default is the
number of cores. </td>
- </tr>
-
+<td> -p </td>
+<td> –threads </td>
+<td> No </td>
+<td> The number of threads to use when extracting data. The default is the
number of cores.
</td></tr>
<tr class="b">
-
-<td>-b </td>
-
-<td>–batchSize </td>
-
-<td>No </td>
-
-<td>The batch size to use for HBase puts </td>
- </tr>
- </tbody>
+<td> -b </td>
+<td> –batchSize </td>
+<td> No </td>
+<td> The batch size to use for HBase puts
</td></tr>
+</tbody>
</table></div>
<div class="section">
<h3><a name="GeoLite2_Loader"></a>GeoLite2 Loader</h3>
<p>The shell script <tt>$METRON_HOME/bin/geo_enrichment_load.sh</tt> will
retrieve MaxMind GeoLite2 data and load data into HDFS, and update the
configuration.</p>
-<p>THIS SCRIPT WILL NOT UPDATE AMBARI’S GLOBAL.JSON, JUST THE ZK
CONFIGS. CHANGES WILL GO INTO EFFECT, BUT WILL NOT PERSIST PAST AN AMBARI
RESTART UNTIL UPDATED THERE.</p>
+<p>THIS SCRIPT WILL NOT UPDATE AMBARI’S GLOBAL.JSON, JUST THE ZK
CONFIGS. CHANGES WILL GO INTO EFFECT, BUT WILL NOT PERSIST PAST AN AMBARI
RESTART UNTIL UPDATED THERE.</p>
<p>The parameters for the utility are as follows:</p>
+<table border="0" class="table table-striped">
+<thead>
+<tr class="a">
+<th> Short Code </th>
+<th> Long Code </th>
+<th> Is Required? </th>
+<th> Description
</th></tr>
+</thead><tbody>
+
+<tr class="b">
+<td> -h </td>
+<td> </td>
+<td> No </td>
+<td> Generate the help screen/set of options
</td></tr>
+<tr class="a">
+<td> -g </td>
+<td> –geo_url </td>
+<td> No </td>
+<td> GeoIP URL - defaults to <a class="externalLink"
href="http://geolite.maxmind.com/download/geoip/database/GeoLite2-City.mmdb.gz">http://geolite.maxmind.com/download/geoip/database/GeoLite2-City.mmdb.gz</a>
</td></tr>
+<tr class="b">
+<td> -r </td>
+<td> –remote_dir </td>
+<td> No </td>
+<td> HDFS directory to land formatted GeoIP file - defaults to
/apps/metron/geo/&lt;epoch millis&gt;/ </td></tr>
+<tr class="a">
+<td> -t </td>
+<td> –tmp_dir </td>
+<td> No </td>
+<td> Directory for landing the temporary GeoIP data - defaults to /tmp
</td></tr>
+<tr class="b">
+<td> -z </td>
+<td> –zk_quorum </td>
+<td> Yes </td>
+<td> Zookeeper Quorum URL (zk1:port,zk2:port,…)
</td></tr>
+</tbody>
+</table></div>
+<div class="section">
+<h3><a name="Flatfile_Summarizer"></a>Flatfile Summarizer</h3>
+<p>The shell script <tt>$METRON_HOME/bin/flatfile_summarizer.sh</tt> will read
data from local disk, HDFS or URLs and generate a summary object. The object
will be serialized and written to disk, either HDFS or local disk depending on
the output mode specified.</p>
+<p>It should be noted that this utility uses the same extractor config as the
<tt>flatfile_loader.sh</tt>, but as the output target is not a key value store
(but rather a summary object), it is not necessary to specify certain
configs:</p>
+<ul>
+
+<li><tt>indicator</tt>, <tt>indicator_filter</tt> and
<tt>indicator_transform</tt> are not required, but will be executed if present.
As in the loader, there will be an indicator field available if you so specify
it (by using <tt>indicator</tt> in the config).</li>
+<li><tt>type</tt> is neither required nor used</li>
+</ul>
+<p>Instead, some new configs are expected:</p>
+<ul>
+
+<li><tt>state_init</tt> : Executed once to initialize the state object (the
object written out).</li>
+<li><tt>state_update</tt>: Called once per message. The fields available are
the fields for the row as well as
+<ul>
+
+<li><tt>indicator</tt> - the indicator value if you’ve specified it in
the config</li>
+<li><tt>state</tt> - the current state. Useful for adding to the state (e.g.
<tt>BLOOM_ADD(state, val)</tt> where <tt>val</tt> is the name of a field).</li>
+</ul>
+</li>
+<li><tt>state_merge</tt> : If you are running this multi-threaded and your
objects can be merged, this is the statement that will merge the state objects
created per thread. There is a special field available to this config:
+<ul>
+
+<li><tt>states</tt> - a list of the state objects</li>
+</ul>
+</li>
+</ul>
+<p>One special thing to note here is that there is a special configuration
parameter to the Extractor config that is only considered during this
loader:</p>
+<ul>
+
+<li>inputFormat : This specifies how to consider the data. The two
implementations are <tt>BY_LINE</tt> and <tt>WHOLE_FILE</tt>.</li>
+</ul>
+<p>The default is <tt>BY_LINE</tt>, which makes sense for a list of CSVs where
each line indicates a unit of information which can be imported. However, if
you are importing a set of STIX documents, then you want each document to be
considered as input to the Extractor.</p>
+<div class="section">
+<h4><a name="Example"></a>Example</h4>
+<p>Consider the possibility that you want to generate a bloom filter with all
of the domains in a CSV structured similarly to the Alexa top 1M domains, so
the columns are:</p>
+<ul>
+
+<li>rank</li>
+<li>domain name</li>
+</ul>
+<p>You want to generate a bloom filter with just the domains, not considering
the TLD. You would execute the following to:</p>
+<ul>
+
+<li>read data from <tt>./top-1m.csv</tt></li>
+<li>write data to <tt>./filter.ser</tt></li>
+<li>use 5 threads</li>
+</ul>
+
+<div>
+<div>
+<pre class="source">$METRON_HOME/bin/flatfile_summarizer.sh -i ./top-1m.csv -o
./filter.ser -e ./extractor.json -p 5 -b 128
+</pre></div></div>
+
+<p>To configure this, <tt>extractor.json</tt> would look like:</p>
+
+<div>
+<div>
+<pre class="source">{
+ "config" : {
+ "columns" : {
+ "rank" : 0,
+ "domain" : 1
+ },
+ "value_transform" : {
+ "domain" : "DOMAIN_REMOVE_TLD(domain)"
+ },
+ "value_filter" : "LENGTH(domain) > 0",
+ "state_init" : "BLOOM_INIT()",
+ "state_update" : {
+ "state" : "BLOOM_ADD(state, domain)"
+ },
+ "state_merge" : "BLOOM_MERGE(states)",
+ "separator" : ","
+ },
+ "extractor" : "CSV"
+}
+</pre></div></div>
+</div>
+<div class="section">
+<h4><a name="Parameters"></a>Parameters</h4>
+<p>The parameters for the utility are as follows:</p>
<table border="0" class="table table-striped">
- <thead>
-
+<thead>
+
<tr class="a">
-
-<th>Short Code </th>
-
-<th>Long Code </th>
-
-<th>Is Required? </th>
-
-<th>Description </th>
- </tr>
- </thead>
- <tbody>
-
+<th> Short Code </th>
+<th> Long Code </th>
+<th> Is Required? </th>
+<th> Description
</th></tr>
+</thead><tbody>
+
<tr class="b">
-
-<td>-h </td>
-
-<td> </td>
-
-<td>No </td>
-
-<td>Generate the help screen/set of options </td>
- </tr>
-
+<td> -h </td>
+<td> </td>
+<td> No </td>
+<td> Generate the help screen/set of options
</td></tr>
<tr class="a">
-
-<td>-g </td>
-
-<td>–geo_url </td>
-
-<td>No </td>
-
-<td>GeoIP URL - defaults to <a class="externalLink"
href="http://geolite.maxmind.com/download/geoip/database/GeoLite2-City.mmdb.gz">http://geolite.maxmind.com/download/geoip/database/GeoLite2-City.mmdb.gz</a>
</td>
- </tr>
-
+<td> -q </td>
+<td> –quiet </td>
+<td> No </td>
+<td> Do not update progress
</td></tr>
<tr class="b">
-
-<td>-r </td>
-
-<td>–remote_dir </td>
-
-<td>No </td>
-
-<td>HDFS directory to land formatted GeoIP file - defaults to
/apps/metron/geo/<epoch millis>/ </td>
- </tr>
-
+<td> -e </td>
+<td> –extractor_config </td>
+<td> Yes </td>
+<td> JSON Document describing the extractor for this input data source
</td></tr>
<tr class="a">
-
-<td>-t </td>
-
-<td>–tmp_dir </td>
-
-<td>No </td>
-
-<td>Directory for landing the temporary GeoIP data - defaults to /tmp </td>
- </tr>
-
+<td> -m </td>
+<td> –import_mode </td>
+<td> No </td>
+<td> The Import mode to use: LOCAL, MR. Default: LOCAL
</td></tr>
<tr class="b">
-
-<td>-z </td>
-
-<td>–zk_quorum </td>
-
-<td>Yes </td>
-
-<td>Zookeeper Quorum URL (zk1:port,zk2:port,…) </td>
- </tr>
- </tbody>
-</table></div></div>
- </div>
- </div>
- </div>
+<td> -om </td>
+<td> –output_mode </td>
+<td> No </td>
+<td> The Output mode to use: LOCAL, HDFS. Default: LOCAL
</td></tr>
+<tr class="a">
+<td> -i </td>
+<td> –input </td>
+<td> Yes </td>
+<td> The input data location on local disk. If this is a file, then that file
will be loaded. If this is a directory, then the files will be loaded
recursively under that directory. </td></tr>
+<tr class="b">
+<td> -o </td>
+<td> –output </td>
+<td> Yes </td>
+<td> The output data location. </td></tr>
+<tr class="a">
+<td> -l </td>
+<td> –log4j </td>
+<td> No </td>
+<td> The log4j properties file to load
</td></tr>
+<tr class="b">
+<td> -p </td>
+<td> –threads </td>
+<td> No </td>
+<td> The number of threads to use when extracting data. The default is the
number of cores.
</td></tr>
+<tr class="a">
+<td> -b </td>
+<td> –batchSize </td>
+<td> No </td>
+<td> The batch size to use for HBase puts
</td></tr>
+</tbody>
+</table></div></div></div>
+<div class="section">
+<h2><a name="Pruning_Data_from_Elasticsearch"></a>Pruning Data from
Elasticsearch</h2>
+<p><b>Note</b> - As of the Metron upgrade from Elasticsearch 2.3.3 to 5.6.2,
the included Data Pruner is no longer supported. It has been replaced by the
Curator utility provided by Elasticsearch. The current Curator version is 5.4
as of this version of Metron and does not match exactly with ES and Kibana.</p>
+<p>Elasticsearch provides tooling to prune index data through <a
class="externalLink"
href="https://www.elastic.co/guide/en/elasticsearch/client/curator/5.4/index.html">Curator</a>.</p>
+<p>Here is a sample invocation that you can configure through Cron to prune
indexes based on timestamp in the index name.</p>
+
+<div>
+<div>
+<pre class="source">/opt/elasticsearch-curator/curator_cli --host localhost
delete_indices --filter_list '
+ {
+ "filtertype": "age",
+ "source": "name",
+ "timestring": "%Y.%m.%d",
+ "unit": "days",
+ "unit_count": 10,
+ "direction": "older"
+ }'
+</pre></div></div>
- <hr/>
+<p>From the ES documentation:</p>
+<blockquote>
+
+<p>Using name as the source tells Curator to look for a timestring within the
index or snapshot name, and convert that into an epoch timestamp (epoch implies
UTC).</p>
+</blockquote>
+<p>You can also provide multiple filters as an array of JSON objects to
filter_list if you want finer-grained control over the indexes that will be
pruned. There is an implicit logical AND when chaining multiple filters.</p>
+
+<div>
+<div>
+<pre class="source">--filter_list
'[{"filtertype":"age","source":"creation_date","direction":"older","unit":"days","unit_count":13},{"filtertype":"pattern","kind":"prefix","value":"logstash"}]'
+</pre></div></div>
+<div class="section">
+<h3><a name="Reference"></a>Reference</h3>
+<ul>
+
+<li><a class="externalLink"
href="https://www.elastic.co/guide/en/elasticsearch/client/curator/5.4/index.html">https://www.elastic.co/guide/en/elasticsearch/client/curator/5.4/index.html</a></li>
+<li><a class="externalLink"
href="https://www.elastic.co/guide/en/elasticsearch/client/curator/5.4/filtertype_age.html">https://www.elastic.co/guide/en/elasticsearch/client/curator/5.4/filtertype_age.html</a></li>
+<li><a class="externalLink"
href="https://www.elastic.co/guide/en/elasticsearch/client/curator/5.4/singleton-cli.html">https://www.elastic.co/guide/en/elasticsearch/client/curator/5.4/singleton-cli.html</a></li>
+</ul></div></div>
+ </div>
+ </div>
+ </div>
+ <hr/>
<footer>
- <div class="container-fluid">
- <div class="row span12">Copyright © 2018
- <a href="https://www.apache.org">The Apache Software
Foundation</a>.
- All Rights Reserved.
-
+ <div class="container-fluid">
+ <div class="row-fluid">
+© 2015-2016 The Apache Software Foundation. Apache Metron, Metron, Apache,
the Apache feather logo,
+ and the Apache Metron project logo are trademarks of The Apache
Software Foundation.
+ </div>
</div>
-
-
-
- </div>
</footer>
</body>
</html>