drill git commit: DRILL-1325: Throw UnsupportedRelOperatorException for unequal joins, implicit cross joins
Repository: drill Updated Branches: refs/heads/master 471013836 -> d72d6030e DRILL-1325: Throw UnsupportedRelOperatorException for unequal joins, implicit cross joins Project: http://git-wip-us.apache.org/repos/asf/drill/repo Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/d72d6030 Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/d72d6030 Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/d72d6030 Branch: refs/heads/master Commit: d72d6030ed3961a5e4fa8839b4be5ec1065f4059 Parents: 4710138 Author: Hsuan-Yi Chu Authored: Tue Feb 24 19:08:40 2015 -0800 Committer: Hsuan-Yi Chu Committed: Wed Feb 25 17:49:26 2015 -0800 -- .../exec/physical/impl/join/JoinUtils.java | 42 + .../planner/sql/handlers/DefaultSqlHandler.java | 30 -- .../work/foreman/SqlUnsupportedException.java | 4 + .../foreman/UnsupportedDataTypeException.java | 4 + .../foreman/UnsupportedFunctionException.java | 6 +- .../UnsupportedRelOperatorException.java| 4 + .../apache/drill/TestDisabledFunctionality.java | 97 7 files changed, 177 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/drill/blob/d72d6030/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/join/JoinUtils.java -- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/join/JoinUtils.java b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/join/JoinUtils.java index 04f3bbe..b94289c 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/join/JoinUtils.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/join/JoinUtils.java @@ -19,6 +19,11 @@ package org.apache.drill.exec.physical.impl.join; import org.apache.drill.common.logical.data.JoinCondition; +import org.eigenbase.rel.JoinRelBase; +import org.eigenbase.rel.RelNode; +import org.eigenbase.relopt.RelOptUtil; + +import java.util.List; public class JoinUtils { public static enum JoinComparator { @@ -51,4 +56,41 @@ public class JoinUtils { throw new 
IllegalArgumentException("Invalid comparator supplied to this join."); } +/** + * Check if the given RelNode contains any Cartesian join. + * Return true if find one. Otherwise, return false. + * + * @param relNode the RelNode to be inspected. + * @param leftKeys a list used for the left input into the join which has + * equi-join keys. It can be empty or not (but not null), + * this method will clear this list before using it. + * @param rightKeys a list used for the right input into the join which has + * equi-join keys. It can be empty or not (but not null), + * this method will clear this list before using it. + * @return Return true if the given relNode contains Cartesian join. + * Otherwise, return false + */ + public static boolean checkCartesianJoin(RelNode relNode, List leftKeys, List rightKeys) { +if (relNode instanceof JoinRelBase) { + leftKeys.clear(); + rightKeys.clear(); + + JoinRelBase joinRel = (JoinRelBase) relNode; + RelNode left = joinRel.getLeft(); + RelNode right = joinRel.getRight(); + + RelOptUtil.splitJoinCondition(left, right, joinRel.getCondition(), leftKeys, rightKeys); + if(leftKeys.isEmpty() || rightKeys.isEmpty()) { +return true; + } +} + +for (RelNode child : relNode.getInputs()) { + if(checkCartesianJoin(child, leftKeys, rightKeys)) { +return true; + } +} + +return false; + } } http://git-wip-us.apache.org/repos/asf/drill/blob/d72d6030/exec/java-exec/src/main/java/org/apache/drill/exec/planner/sql/handlers/DefaultSqlHandler.java -- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/sql/handlers/DefaultSqlHandler.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/sql/handlers/DefaultSqlHandler.java index 0ac7c97..35e7f5c 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/sql/handlers/DefaultSqlHandler.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/sql/handlers/DefaultSqlHandler.java @@ -18,6 +18,7 @@ package org.apache.drill.exec.planner.sql.handlers; 
import java.io.IOException; +import java.util.ArrayList; import java.util.Collection; import java.util.List; @@ -35,6 +36,7 @@ import org.apache.drill.exec.ops.QueryContext; import org.apache.drill.exec.physical.PhysicalPlan; import org.apache.drill.exec.physical.base.AbstractPhysicalVisitor; import org.apache.drill.exec.physical.base.PhysicalOperator; +import org.apache.
drill git commit: DRILL-1378: Ctrl-C to cancel a query that has not returned with the first result set.
Repository: drill Updated Branches: refs/heads/master f7ef5ec78 -> 471013836 DRILL-1378: Ctrl-C to cancel a query that has not returned with the first result set. Project: http://git-wip-us.apache.org/repos/asf/drill/repo Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/47101383 Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/47101383 Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/47101383 Branch: refs/heads/master Commit: 471013836419185d51a2d57bf5b89c4087053255 Parents: f7ef5ec Author: Parth Chandra Authored: Wed Feb 25 09:56:12 2015 -0800 Committer: Parth Chandra Committed: Wed Feb 25 17:24:54 2015 -0800 -- .../java/org/apache/drill/jdbc/DrillResultSet.java | 17 + 1 file changed, 17 insertions(+) -- http://git-wip-us.apache.org/repos/asf/drill/blob/47101383/exec/jdbc/src/main/java/org/apache/drill/jdbc/DrillResultSet.java -- diff --git a/exec/jdbc/src/main/java/org/apache/drill/jdbc/DrillResultSet.java b/exec/jdbc/src/main/java/org/apache/drill/jdbc/DrillResultSet.java index 88a6c6d..77b2c37 100644 --- a/exec/jdbc/src/main/java/org/apache/drill/jdbc/DrillResultSet.java +++ b/exec/jdbc/src/main/java/org/apache/drill/jdbc/DrillResultSet.java @@ -76,6 +76,20 @@ public class DrillResultSet extends AvaticaResultSet { listener.close(); } + @Override + public boolean next() throws SQLException { +// Next may be called after close has been called (for example after a user cancel) which in turn +// sets the cursor to null. So we must check before we call next. +// TODO: handle next() after close is called in the Avatica code. +if(super.cursor!=null){ + return super.next(); +}else{ + return false; +} + + } + + @Override protected DrillResultSet execute() throws SQLException{ // Call driver's callback. It is permitted to throw a RuntimeException. 
DrillConnectionImpl connection = (DrillConnectionImpl) statement.getConnection(); @@ -200,6 +214,9 @@ public class DrillResultSet extends AvaticaResultSet { qrb.getData().release(); } } + // close may be called before the first result is received and the main thread is blocked waiting + // for the result. In that case we want to unblock the main thread. + latch.countDown(); completed = true; }
svn commit: r1662344 [1/8] - in /drill/site/trunk/content/drill: ./ blog/2014/12/11/apache-drill-qa-panelist-spotlight/ docs/ docs/2014-q1-drill-report/ docs/advanced-properties/ docs/analyzing-yelp-j
Author: adi Date: Thu Feb 26 01:16:43 2015 New Revision: 1662344 URL: http://svn.apache.org/r1662344 Log: DRILL-2315: Confluence conversion plus fixes (for Kristine Hahn) Added: drill/site/trunk/content/drill/README.md drill/site/trunk/content/drill/docs/advanced-properties/ drill/site/trunk/content/drill/docs/advanced-properties/index.html drill/site/trunk/content/drill/docs/apache-drill-contribution-ideas/ drill/site/trunk/content/drill/docs/apache-drill-contribution-ideas/index.html drill/site/trunk/content/drill/docs/configuring-odbc-connections-for-linux-and-mac-os-x/ drill/site/trunk/content/drill/docs/configuring-odbc-connections-for-linux-and-mac-os-x/index.html drill/site/trunk/content/drill/docs/drill-patch-review-tool/ drill/site/trunk/content/drill/docs/drill-patch-review-tool/index.html drill/site/trunk/content/drill/docs/driver-configuration-options/ drill/site/trunk/content/drill/docs/driver-configuration-options/index.html drill/site/trunk/content/drill/docs/img/58.png (with props) drill/site/trunk/content/drill/docs/img/BI_to_Drill_2.png (with props) drill/site/trunk/content/drill/docs/img/HbaseViewCreation0.png (with props) drill/site/trunk/content/drill/docs/img/HbaseViewDSN.png (with props) drill/site/trunk/content/drill/docs/img/Hbase_Browse.png (with props) drill/site/trunk/content/drill/docs/img/Hive_DSN.png (with props) drill/site/trunk/content/drill/docs/img/ODBC_CustomSQL.png (with props) drill/site/trunk/content/drill/docs/img/ODBC_HbasePreview2.png (with props) drill/site/trunk/content/drill/docs/img/ODBC_HbaseView.png (with props) drill/site/trunk/content/drill/docs/img/ODBC_HiveConnection.png (with props) drill/site/trunk/content/drill/docs/img/ODBC_to_Drillbit.png (with props) drill/site/trunk/content/drill/docs/img/ODBC_to_Quorum.png (with props) drill/site/trunk/content/drill/docs/img/Parquet_DSN.png (with props) drill/site/trunk/content/drill/docs/img/Parquet_Preview.png (with props) 
drill/site/trunk/content/drill/docs/img/RegionParquet_table.png (with props) drill/site/trunk/content/drill/docs/img/SelectHbaseView.png (with props) drill/site/trunk/content/drill/docs/img/Untitled.png (with props) drill/site/trunk/content/drill/docs/img/VoterContributions_hbaseview.png (with props) drill/site/trunk/content/drill/docs/img/ngram_plugin.png (with props) drill/site/trunk/content/drill/docs/img/ngram_plugin2.png (with props) drill/site/trunk/content/drill/docs/img/settings.png (with props) drill/site/trunk/content/drill/docs/img/student_hive.png (with props) drill/site/trunk/content/drill/docs/installing-the-mapr-drill-odbc-driver-on-linux/ drill/site/trunk/content/drill/docs/installing-the-mapr-drill-odbc-driver-on-linux/index.html drill/site/trunk/content/drill/docs/installing-the-mapr-drill-odbc-driver-on-mac-os-x/ drill/site/trunk/content/drill/docs/installing-the-mapr-drill-odbc-driver-on-mac-os-x/index.html drill/site/trunk/content/drill/docs/odbc-jdbc-interfaces/ drill/site/trunk/content/drill/docs/odbc-jdbc-interfaces/index.html drill/site/trunk/content/drill/docs/reserved-keywords/ drill/site/trunk/content/drill/docs/reserved-keywords/index.html drill/site/trunk/content/drill/docs/sql-reference/ drill/site/trunk/content/drill/docs/sql-reference/index.html drill/site/trunk/content/drill/docs/step-1-install-the-mapr-drill-odbc-driver-on-windows/ drill/site/trunk/content/drill/docs/step-1-install-the-mapr-drill-odbc-driver-on-windows/index.html drill/site/trunk/content/drill/docs/step-2-configure-odbc-connections-to-drill-data-sources/ drill/site/trunk/content/drill/docs/step-2-configure-odbc-connections-to-drill-data-sources/index.html drill/site/trunk/content/drill/docs/step-3-connect-to-drill-data-sources-from-a-bi-tool/ drill/site/trunk/content/drill/docs/step-3-connect-to-drill-data-sources-from-a-bi-tool/index.html drill/site/trunk/content/drill/docs/tableau-examples/ drill/site/trunk/content/drill/docs/tableau-examples/index.html 
drill/site/trunk/content/drill/docs/testing-the-odbc-connection-on-linux-and-mac-os-x/ drill/site/trunk/content/drill/docs/testing-the-odbc-connection-on-linux-and-mac-os-x/index.html drill/site/trunk/content/drill/docs/using-drill-explorer-to-browse-data-and-create-views/ drill/site/trunk/content/drill/docs/using-drill-explorer-to-browse-data-and-create-views/index.html drill/site/trunk/content/drill/docs/using-the-jdbc-driver/ drill/site/trunk/content/drill/docs/using-the-jdbc-driver/index.html drill/site/trunk/content/drill/docs/using-the-mapr-odbc-driver-on-linux-and-mac-os-x/ drill/site/trunk/content/drill/docs/using-the-mapr-odbc-driver-on-linux-and-mac-os-x/index.html drill/site/trunk/content/drill/docs/using-the-mapr-odbc-driver-
svn commit: r1662344 [5/8] - in /drill/site/trunk/content/drill: ./ blog/2014/12/11/apache-drill-qa-panelist-spotlight/ docs/ docs/2014-q1-drill-report/ docs/advanced-properties/ docs/analyzing-yelp-j
Modified: drill/site/trunk/content/drill/docs/install-drill/index.html URL: http://svn.apache.org/viewvc/drill/site/trunk/content/drill/docs/install-drill/index.html?rev=1662344&r1=1662343&r2=1662344&view=diff == --- drill/site/trunk/content/drill/docs/install-drill/index.html (original) +++ drill/site/trunk/content/drill/docs/install-drill/index.html Thu Feb 26 01:16:43 2015 @@ -74,16 +74,6 @@ clustered Hadoop environment, you can in Installing in distributed mode requires some configuration, however once you install you can connect Drill to your Hive, HBase, or distributed file system data sources and run queries on them. - -Click on any of the following links for more information about how to install -Drill in embedded or distributed mode: - - -Apache Drill in 10 Minutes -Deploying Apache Drill in a Clustered Environment -Installing Drill in Embedded Mode -Installing Drill in Distributed Mode - Modified: drill/site/trunk/content/drill/docs/installing-drill-in-distributed-mode/index.html URL: http://svn.apache.org/viewvc/drill/site/trunk/content/drill/docs/installing-drill-in-distributed-mode/index.html?rev=1662344&r1=1662343&r2=1662344&view=diff == --- drill/site/trunk/content/drill/docs/installing-drill-in-distributed-mode/index.html (original) +++ drill/site/trunk/content/drill/docs/installing-drill-in-distributed-mode/index.html Thu Feb 26 01:16:43 2015 @@ -106,11 +106,12 @@ tar xzf apache-drill-.tar If you are using external JAR files, edit drill-env.sh,located in /opt/drill/conf/, and define HADOOP_HOME: export HADOOP_HOME="~/hadoop/hadoop-0.20.2/" -In drill-override.conf,create a unique Drill cluster ID, and provide Zookeeper host names and port numbers to configure a connection to your Zookeeper quorum. +In drill-override.conf, create a unique Drill cluster ID, and provide Zookeeper host names and port numbers to configure a connection to your Zookeeper quorum. -a. Edit drill-override.conflocated in ~/drill/drill- /conf/. - -b. 
Provide a unique cluster-id and the Zookeeper host names and port numbers in zk.connect. If you install Drill on multiple nodes, assign the same cluster ID to each Drill node so that all Drill nodes share the same ID. The default Zookeeper port is 2181. + +Edit drill-override.conflocated in ~/drill/drill- /conf/. +Provide a unique cluster-id and the Zookeeper host names and port numbers in zk.connect. If you install Drill on multiple nodes, assign the same cluster ID to each Drill node so that all Drill nodes share the same ID. The default Zookeeper port is 2181. + Example drill.exec:{ @@ -123,7 +124,7 @@ tar xzf apache-drill- .tar -You can connect Drill to various types of data sources. Refer to https://cwiki.apache.org/confluence/display/DRIL%0AL/Connecting+to+Data+Sources";>Connect +You can connect Drill to various types of data sources. Refer to Connect Apache Drill to Data Sources to get configuration instructions for the particular type of data source that you want to connect to Drill. Modified: drill/site/trunk/content/drill/docs/installing-drill-in-embedded-mode/index.html URL: http://svn.apache.org/viewvc/drill/site/trunk/content/drill/docs/installing-drill-in-embedded-mode/index.html?rev=1662344&r1=1662343&r2=1662344&view=diff == --- drill/site/trunk/content/drill/docs/installing-drill-in-embedded-mode/index.html (original) +++ drill/site/trunk/content/drill/docs/installing-drill-in-embedded-mode/index.html Thu Feb 26 01:16:43 2015 @@ -78,22 +78,14 @@ running Linux, Mac OS X, or Windows. You must have the following software installed on your machine to run Drill: -SoftwareDescriptionhttp://www.oracle.com/technetwork/java/javase/downloads/jdk7-downloads-1880260.html"; rel="nofollow">Oracle JDK version 7A set of programming tools for developing Java applications. - -A set of programming tools for developing Java applications. 
+SoftwareDescriptionhttp://www.oracle.com/technetwork/java/javase/downloads/jdk7-downloads-1880260.html"; rel="nofollow">Oracle JDK version 7A set of programming tools for developing Java applications. You can run the following command to verify that the system meets the software prerequisite: -CommandExample Outputjava âversionjava version "1.7.0_65"Java(TM) SE Runtime Environment (build 1.7.0_65-b19)Java HotSpot(TM) 64-Bit Server VM (build 24.65-b04, mixed mode) - -Click on the installation link appropriate for your operating system: +CommandExample Outputjava âversionjava version "1.7.0_65"Java(TM) SE Runtime Environment (build 1.7.0_65-b19)Java HotSpot(TM) 64-Bit Server VM (build 24.65-b04, mixed mode) - -Installing Drill on Linux -Installing Drill on Mac OS X -Installing Drill on Windo
svn commit: r1662344 [3/8] - in /drill/site/trunk/content/drill: ./ blog/2014/12/11/apache-drill-qa-panelist-spotlight/ docs/ docs/2014-q1-drill-report/ docs/advanced-properties/ docs/analyzing-yelp-j
Modified: drill/site/trunk/content/drill/docs/explain-commands/index.html URL: http://svn.apache.org/viewvc/drill/site/trunk/content/drill/docs/explain-commands/index.html?rev=1662344&r1=1662343&r2=1662344&view=diff == --- drill/site/trunk/content/drill/docs/explain-commands/index.html (original) +++ drill/site/trunk/content/drill/docs/explain-commands/index.html Thu Feb 26 01:16:43 2015 @@ -91,7 +91,7 @@ conditions against the same data will re change a configuration option, for example, or update the tables or files that you are selecting from, you are likely to see plan changes. -EXPLAIN Syntax +EXPLAIN Syntax The EXPLAIN command supports the following syntax: explain plan [ including all attributes ] [ with implementation | without implementation ] for; @@ -108,7 +108,7 @@ physical and logical plans. These options return the physical and logical plan information, respectively. The default is physical (WITH IMPLEMENTATION). -EXPLAIN for Physical Plans +EXPLAIN for Physical Plans The EXPLAIN PLAN FOR command returns the chosen physical execution plan for a query statement without running the query. You can use this command @@ -173,7 +173,7 @@ for submitting the query via Drill APIs. }, -Costing Information +Costing Information Add the INCLUDING ALL ATTRIBUTES option to the EXPLAIN command to see cost estimates for the query plan. 
For example: @@ -192,7 +192,7 @@ select * from dfs.`/Users/brumsby/drill/ 00-04ProducerConsumer: rowcount = 1.0, cumulative cost = {1.0 rows, 1.0 cpu, 0.0 io, 0.0 network}, id = 3106 00-05 Scan(groupscan=[EasyGroupScan [selectionRoot=/Users/brumsby/drill/donuts.json, columns = null]]): rowcount = 1.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io, 0.0 network}, id = 3101 -EXPLAIN for Logical Plans +EXPLAIN for Logical Plans To return the logical plan for a query (again, without actually running the query), use the EXPLAIN PLAN WITHOUT IMPLEMENTATION syntax: Modified: drill/site/trunk/content/drill/docs/flatten-function/index.html URL: http://svn.apache.org/viewvc/drill/site/trunk/content/drill/docs/flatten-function/index.html?rev=1662344&r1=1662343&r2=1662344&view=diff == --- drill/site/trunk/content/drill/docs/flatten-function/index.html (original) +++ drill/site/trunk/content/drill/docs/flatten-function/index.html Thu Feb 26 01:16:43 2015 @@ -143,7 +143,7 @@ order by count(celltbl.catl) desc limit +---|+ A common use case for FLATTEN is its use in conjunction with the -KVGEN function. +KVGEN function. Modified: drill/site/trunk/content/drill/docs/flexibility/index.html URL: http://svn.apache.org/viewvc/drill/site/trunk/content/drill/docs/flexibility/index.html?rev=1662344&r1=1662343&r2=1662344&view=diff == --- drill/site/trunk/content/drill/docs/flexibility/index.html (original) +++ drill/site/trunk/content/drill/docs/flexibility/index.html Thu Feb 26 01:16:43 2015 @@ -69,7 +69,7 @@ The following features contribute to Drill's flexible architecture: -_Dynamic schema discovery _ +Dynamic schema discovery Drill does not require schema or type specification for the data in order to start the query execution process. 
Instead, Drill starts processing the data @@ -121,7 +121,7 @@ traditional DB (Databases->Tables/Vie through the ANSI standard INFORMATION_SCHEMA database For more information on how to configure and work various data sources with -Drill, refer to https://cwiki.apache.or%0Ag/confluence/display/DRILL/Connect+Apache+Drill+to+Data+Sources";>Connect Apache Drill to Data Sources. +Drill, refer to Connect Apache Drill to Data Sources. Extensibility Modified: drill/site/trunk/content/drill/docs/getting-to-know-the-drill-sandbox/index.html URL: http://svn.apache.org/viewvc/drill/site/trunk/content/drill/docs/getting-to-know-the-drill-sandbox/index.html?rev=1662344&r1=1662343&r2=1662344&view=diff == --- drill/site/trunk/content/drill/docs/getting-to-know-the-drill-sandbox/index.html (original) +++ drill/site/trunk/content/drill/docs/getting-to-know-the-drill-sandbox/index.html Thu Feb 26 01:16:43 2015 @@ -82,7 +82,7 @@ optimization rules for Drill to leverage Take a look at the pre-configured storage plugins by opening the Drill Web UI. -Feel free to skip this section and jump directly to the queries: Lesson 1: +Feel free to skip this section and jump directly to the queries: Lesson 1: Learn About the Data Set @@ -97,8 +97,7 @@ Set A storage plugin configuration for MapR-DB in the sandbox. Drill uses a single storage plugin for connecting to HBase as well as MapR-DB, which is an -enterprise grade in-Hadoop NoSQL databas
svn commit: r1662344 [6/8] - in /drill/site/trunk/content/drill: ./ blog/2014/12/11/apache-drill-qa-panelist-spotlight/ docs/ docs/2014-q1-drill-report/ docs/advanced-properties/ docs/analyzing-yelp-j
Modified: drill/site/trunk/content/drill/docs/planning-and-execution-options/index.html URL: http://svn.apache.org/viewvc/drill/site/trunk/content/drill/docs/planning-and-execution-options/index.html?rev=1662344&r1=1662343&r2=1662344&view=diff == --- drill/site/trunk/content/drill/docs/planning-and-execution-options/index.html (original) +++ drill/site/trunk/content/drill/docs/planning-and-execution-options/index.html Thu Feb 26 01:16:43 2015 @@ -88,17 +88,17 @@ persist across all sessions. The following table contains planning and execution options that you can set at the system or session level: -Option nameDefault valueDescriptionexec.errors.verbosefalseThis option enables or disables the verbose message that Drill returns when a query fails. When enabled, Drill provides additional information about failed queries.exec.max_hash_table_size1073741824The default maximum size for hash tables.exec.min_hash_table_size65536The default starting size for hash tables. Increasing this size is useful for very large aggregations or joins when you have large amounts of memory for Drill to use. Drill can spend a lot of time resizing the hash table as it finds new data. If you have large data sets, you can increase this hash table size to increase performance.planner.add_producer_consumerfalse This option enables or disables a secondary reading thread that works out of band of the rest of the scanning fragment to prefetch data from disk. If you interact with a certain type of storage medium that is slow or does not prefetch much data, this option tells Drill to add a producer consumer reading thread to the operati on. Drill can then assign one thread that focuses on a single reading fragment. If Drill is using memory, you can disable this option to get better performance. 
If Drill is using disk space, you should enable this option and set a reasonable queue size for the planner.producer_consumer_queue_size option.planner.broadcast_threshold100Threshold, in terms of a number of rows, that determines whether a broadcast join is chosen for a query. Regardless of the setting of the broadcast_join option (enabled or disabled), a broadcast join is not chosen unless the right side of the join is estimated to contain fewer rows than this threshold. The intent of this option is to avoid broadcasting too many rows for join purposes. Broadcasting involves sending data across nodes and is a network-intensive operation. (The "right side" of the join, which may itself be a join or simply a table, is determined by cost-based optimizations and heuristics during physical planning.)planner.enable_broadcast_joinplanner.enable_hashaggplanner.enable_hashjoinplanner.enable_mergejoinplanner.enable_multiphase_aggplanner.enable_streamaggtrueThese options enable or disable specific aggregation and join operators for queries. These operators are all enabled by default and in general should not be disabled.Hash aggregation and hash join are hash-based operations. Streaming aggregation and merge join are sort-based operations. Both hash-based and sort-based operations consume memory; however, currently, hash-based operations do not spill to disk as needed, but the sort-based operations do. If large hash operations do not fit in memory on your system, you may need to disable these operations. Queries will continue to run, using alternative plans.planner.producer_consumer_queue_size10Determines how much data to prefetch from disk (in record batches) out of band of query execution. The larger the queue size, the greater the amount of memory that the queue and overall query execution consumes.planner.slice_target10The number of records manipulated within a fragment before Drill parallelizes them.planner. 
width.max_per_node The default depends on the number of cores on each node.In this context "width" refers to fanout or distribution potential: the ability to run a query in parallel across the cores on a node and the nodes on a cluster.A physical plan consists of intermediate operations, known as query "fragments," that run concurrently, yielding opportunities for parallelism above and below each exchange operator in the plan. An exchange operator represents a breakpoint in the execution flow where processing can be distributed. For example, a single-process scan of a file may flow into an exchange operator, followed by a multi-process aggregation fragment. The maximum width per node defines the maximum degree of parallelism for any fragment of a query, but the setting applies at the level of a single node in the cluster.The default maximum degree of parallelism per node is calculated as follows, with the theoretical maximum automatically scaled back (and rounded down) so that only 70% of the actual available capacity is taken into account: +Option nameDefault valueDescriptionexec.errors.verbosefalseT
svn commit: r1662344 [8/8] - in /drill/site/trunk/content/drill: ./ blog/2014/12/11/apache-drill-qa-panelist-spotlight/ docs/ docs/2014-q1-drill-report/ docs/advanced-properties/ docs/analyzing-yelp-j
Modified: drill/site/trunk/content/drill/docs/supported-date-time-data-type-formats/index.html URL: http://svn.apache.org/viewvc/drill/site/trunk/content/drill/docs/supported-date-time-data-type-formats/index.html?rev=1662344&r1=1662343&r2=1662344&view=diff == --- drill/site/trunk/content/drill/docs/supported-date-time-data-type-formats/index.html (original) +++ drill/site/trunk/content/drill/docs/supported-date-time-data-type-formats/index.html Thu Feb 26 01:16:43 2015 @@ -136,14 +136,14 @@ Apache Drill does not support time - + ## Time Drill supports the `time` data type in the following format: HH:mm:ss.SSS (hour:minute:sec.milliseconds) -The following table provides some examples for the` time` data type: +The following table provides some examples for the `time` data type: Use @@ -162,7 +162,6 @@ The following table provides some exampl select cast(time_col as time) from dfs.`/tmp/input.json`; - Interval @@ -174,7 +173,7 @@ The following table provides some exampl supports the interval data type in the following format: P [qty] Y [qty] M -The following table provides examples for interval yeardata type: +The following table provides examples for interval year data type: Use @@ -190,7 +189,6 @@ supports the interval data select cast(col as interval year) from dfs.`/tmp/input.json`; - Interval Day @@ -201,15 +199,14 @@ supports the interval day d The following table provides examples for interval day data type: -UseExampleLiteralselect interval '1 10:20:30.123' day to second from dfs.`/tmp/input.json`;select interval '1 10' day to hour from dfs.`/tmp/input.json`;select interval '10' day from dfs.`/tmp/input.json`;select interval '10' hour from dfs.`/tmp/input.json`;select interval '10.999' second from dfs.`/tmp/input.json`;JSON Input{"col" : "P1DT10H20M30S"}{"col" : "P1DT 10H20M30.123S"}{"col" : "P1D"}{"col" : "PT10H"}{"col" : "PT10.10S"}{"col" : "PT20S"}{"col" : "PT10H10S"}CAST from VARCHARselect cast(col as interval day) from dfs.`/tmp/input.json`; - 
+UseExampleLiteralselect interval '1 10:20:30.123' day to second from dfs.`/tmp/input.json`;select interval '1 10' day to hour from dfs.`/tmp/input.json`;select interval '10' day from dfs.`/tmp/input.json`;select interval '10' hour from dfs.`/tmp/input.json`;select interval '10.999' second from dfs.`/tmp/input.json`;JSON Input{"col" : "P1DT10H20M30S"}{"col" : "P1DT10H20M30.123S"}{"col" : &q uot;P1D"}{"col" : "PT10H"}{"col" : "PT10.10S"}{"col" : "PT20S"}{"col" : "PT10H10S"}CAST from VARCHARselect cast(col as interval day) from dfs.`/tmp/input.json`; Literal -The following table provides a list ofdate/time literals that Drill +The following table provides a list of date/time literals that Drill supports with examples of each: -FormatInterpretationExampleinterval '1 10:20:30.123' day to second1 day, 10 hours, 20 minutes, 30 seconds, and 123 thousandths of a secondselect interval '1 10:20:30.123' day to second from dfs.`/tmp/input.json`;interval '1 10' day to hour1 day 10 hoursselect interval '1 10' day to hour from dfs.`/tmp/input.json`;interval '10' day10 daysselect interval '10' day from dfs.`/tmp/input.json`;interval '10' hour10 hoursselect interval '10' hour from dfs.`/tmp/input.json`;interval '10.999' second10.999 secondsselect interval '10.999' second from dfs.`/tmp/input.json`; +FormatInterpretationExampleinterval '1 10:20:30.123' day to second1 day, 10 hours, 20 minutes, 30 seconds, and 123 thousandths of a secondselect interval '1 10:20:30.123' day to second from dfs.`/tmp/input.json`;interval '1 10' day to hour1 day 10 hoursselect interval '1 10' day to hour from dfs.`/tmp/input.json`;interval '10' day10 daysselect interval '10' day from >dfs.`/tmp/input.json`;valign="top">interval '10' >hour10 >hoursselect interval '10' hour from >dfs.`/tmp/input.json`;valign="top">interval '10.999' >second10.999 >secondsselect interval '10.999' second from dfs.`/tmp/input.json`; > Added: drill/site/trunk/content/drill/docs/tableau-examples/index.html URL: 
http://svn.apache.org/viewvc/drill/site/trunk/content/drill/docs/tableau-examples/index.html?rev=1662344&view=auto == --- drill/site/trunk/content/drill/docs/tableau-examples/index.html (added) +++ drill/site/trunk/content/drill/docs/tableau-examples/index.html Thu Feb 26 01:16:43 2015 @@ -0,0 +1,343 @@ + + + + + + + + +Tableau Examples - Apache Drill + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Documentation + + Overview + https://cwiki.apache.org/confluence/display/DRILL/Apache+Drill+in+10+Minutes"; target="_blank">Drill in 10 Minutes + Why Drill? + Architecture + + + +Community + + Team + Events and Meetup
svn commit: r1662344 [4/8] - in /drill/site/trunk/content/drill: ./ blog/2014/12/11/apache-drill-qa-panelist-spotlight/ docs/ docs/2014-q1-drill-report/ docs/advanced-properties/ docs/analyzing-yelp-j
Modified: drill/site/trunk/content/drill/docs/index.html URL: http://svn.apache.org/viewvc/drill/site/trunk/content/drill/docs/index.html?rev=1662344&r1=1662343&r2=1662344&view=diff == --- drill/site/trunk/content/drill/docs/index.html (original) +++ drill/site/trunk/content/drill/docs/index.html Thu Feb 26 01:16:43 2015 @@ -71,7 +71,7 @@ -Apache Drill Documentation +Architectural Overview @@ -80,7 +80,48 @@ - Architectural Overview + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Core Modules within a Drillbit + + + + + + + Architectural Highlights @@ -123,226 +164,15 @@ -Core Modules within a Drillbit - - - -Architectural Highlights - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Apache Drill Tutorial - - - - - - - - - - - - - - - - - - - +Flexibility - +Performance @@ -534,28 +364,16 @@ -Installing the Apache Drill Sandbox - -Getting to Know the Drill Sandbox - -Lession 1: Learn about the Data Set - -Lession 2: Run Queri
svn commit: r1662344 [2/8] - in /drill/site/trunk/content/drill: ./ blog/2014/12/11/apache-drill-qa-panelist-spotlight/ docs/ docs/2014-q1-drill-report/ docs/advanced-properties/ docs/analyzing-yelp-j
Modified: drill/site/trunk/content/drill/docs/apache-drill-in-10-minutes/index.html URL: http://svn.apache.org/viewvc/drill/site/trunk/content/drill/docs/apache-drill-in-10-minutes/index.html?rev=1662344&r1=1662343&r2=1662344&view=diff == --- drill/site/trunk/content/drill/docs/apache-drill-in-10-minutes/index.html (original) +++ drill/site/trunk/content/drill/docs/apache-drill-in-10-minutes/index.html Thu Feb 26 01:16:43 2015 @@ -85,13 +85,13 @@ More Information -Objective +Objective Use Apache Drill to query sample data in 10 minutes. For simplicity, youâll run Drill in embedded mode rather than distributed mode to try out Drill without having to perform any setup tasks. -A Few Bits About Apache Drill +A Few Bits About Apache Drill Drill is a clustered, powerful MPP (Massively Parallel Processing) query engine for Hadoop that can process petabytes of data, fast. Drill is useful @@ -100,7 +100,7 @@ capable of querying nested data in forma performing dynamic schema discovery. Drill does not require a centralized metadata repository. -_Dynamic schema discovery _ +Dynamic schema discovery Drill does not require schema or type specification for data in order to start the query execution process. Drill starts data processing in record-batches @@ -144,7 +144,7 @@ extend the layer to a broader array of u classpath scanning and plugin concept to add additional storage plugins, functions, and operators with minimal configuration. -Process Overview +Process Overview Download the Apache Drill archive and extract the contents to a directory on your machine. The Apache Drill archive contains sample JSON and Parquet files @@ -159,19 +159,19 @@ commands. 
SQLLine is used as the shell f You must have the following software installed on your machine to run Drill: -SoftwareDescriptionhttp://www.oracle.com/technetwork/java/javase/downloads/jdk7-downloads-1880260.html"; class="external-link" rel="nofollow">Oracle JDK version 7A set of programming tools for developing Java applications. +SoftwareDescriptionhttp://www.oracle.com/technetwork/java/javase/downloads/jdk7-downloads-1880260.html"; class="external-link" rel="nofollow">Oracle JDK version 7A set of programming tools for developing Java applications. Prerequisite Validation Run the following command to verify that the system meets the software prerequisite: -Command Example Outputjava âversionjava version "1.7.0_65"Java(TM) SE Runtime Environment (build 1.7.0_65-b19)Java HotSpot(TM) 64-Bit Server VM (build 24.65-b04, mixed mode) +Command Example Outputjava âversionjava version "1.7.0_65"Java(TM) SE Runtime Environment (build 1.7.0_65-b19)Java HotSpot(TM) 64-Bit Server VM (build 24.65-b04, mixed mode) -Install Drill +Install Drill You can install Drill on a machine running Linux, Mac OS X, or Windows. -Installing Drill on Linux +Installing Drill on Linux Complete the following steps to install Drill: @@ -182,7 +182,7 @@ prerequisite: Issue the following command to create a new directory to which you can extract the contents of the Drill tar.gz file: sudo mkdir -p /opt/drill -Navigate to the directory where you downloaded the Drill tar.gz file. +Navigate to the directory where you downloaded the Drill tar.gz file. Issue the following command to extract the contents of the Drill tar.gz file: sudo tar -xvzf apache-drill-.tar.gz -C /opt/drill @@ -191,9 +191,9 @@ prerequisite: -At this point, you can https://cwiki.apache.org/confluence/displ%0Aay/DRILL/Apache+Drill+in+10+Minutes#ApacheDrillin10Minutes-StartDrill";>start Drill. +At this point, you can start Drill. 
-Installing Drill on Mac OS X +Installing Drill on Mac OS X Complete the following steps to install Drill: @@ -208,9 +208,8 @@ $ cd drill $ pwd /Users/max/drill -Click the following link to download the latest, stable version of Apache Drill: - -http://www.apache.org/dyn/closer.cgi/drill/drill-0.7.0/apache-drill-0.7.0.tar.gz";>http://www.apache.org/dyn/closer.cgi/drill/drill-0.7.0/apache-drill-0.7.0.tar.gz +Click the following link to download the latest, stable version of Apache Drill: + http://www.apache.org/dyn/closer.cgi/drill/drill-0.7.0/apache-drill-0.7.0.tar.gz";>http://www.apache.org/dyn/closer.cgi/drill/drill-0.7.0/apache-drill-0.7.0.tar.gz Open the downloaded TAR file with the Mac Archive utility or a similar tool for unzipping files. Move the resulting apache-drill- folder into the drill directory that you created. Issue the following command to navigate to the apache-drill- directory: @@ -218,9 +217,9 @@ $ pwd -At this point, you can https://cwiki.apache.org/confluence/displ%0Aay/DRILL/Apache+Drill+in+10+Minutes#ApacheDrillin10Minutes-StartDrill";>start Drill. +At this point, you can start Drill. -Installing Drill on Windows +Installing Drill on Windows You can install
svn commit: r1662344 [7/8] - in /drill/site/trunk/content/drill: ./ blog/2014/12/11/apache-drill-qa-panelist-spotlight/ docs/ docs/2014-q1-drill-report/ docs/advanced-properties/ docs/analyzing-yelp-j
Modified: drill/site/trunk/content/drill/docs/release-notes/index.html URL: http://svn.apache.org/viewvc/drill/site/trunk/content/drill/docs/release-notes/index.html?rev=1662344&r1=1662343&r2=1662344&view=diff == --- drill/site/trunk/content/drill/docs/release-notes/index.html (original) +++ drill/site/trunk/content/drill/docs/release-notes/index.html Thu Feb 26 01:16:43 2015 @@ -80,7 +80,7 @@ Drill has been tested against MapR, Clou distributions. There are associated build profiles and JIRAs that can help you run Drill against your preferred distribution -Apache Drill 0.7.0 Key Features +Apache Drill 0.7.0 Key Features No more dependency on UDP/Multicast - Making it possible for Drill to work well in the following scenarios: @@ -104,7 +104,7 @@ run Drill against your preferred distrib Stability improvements in ODBC and JDBC drivers -Apache Drill 0.7.0 Key Notes and Limitations +Apache Drill 0.7.0 Key Notes and Limitations The current release supports in-memory and beyond-memory execution. However, you must disable memory-intensive hash aggregate and hash join operations to leverage this functionality. @@ -123,18 +123,18 @@ against Apache Hadoop. Drill has been te Hortonworks Hadoop distributions. There are associated build profiles and JIRAs that can help you run Drill against your preferred distribution. 
-Apache Drill 0.6.0 Key Features +Apache Drill 0.6.0 Key Features This release is primarily a bug fix release, with https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=12313820&vers%0Aion=12327472";>more than 30 JIRAs closed, but there are some notable features: -Direct ANSI SQL access to MongoDB, using the latest MongoDB Plugin for Apache Drill +Direct ANSI SQL access to MongoDB, using the latest MongoDB Plugin for Apache Drill Filesystem query performance improvements with partition pruning Ability to use the file system as a persistent store for query profiles and diagnostic information Window function support (alpha) -Apache Drill 0.6.0 Key Notes and Limitations +Apache Drill 0.6.0 Key Notes and Limitations The current release supports in-memory and beyond-memory execution. However, you must disable memory-intensive hash aggregate and hash join operations to leverage this functionality. @@ -157,7 +157,7 @@ against Apache Hadoop. Drill has been te Hortonworks Hadoop distributions. There are associated build profiles and JIRAs that can help you run Drill against your preferred distribution. -Apache Drill 0.5.0 Key Notes and Limitations +Apache Drill 0.5.0 Key Notes and Limitations The current release supports in memory and beyond memory execution. However, you must disable memory-intensive hash aggregate and hash join operations to leverage this functionality. @@ -191,7 +191,7 @@ MapR, Cloudera and Hortonworks Hadoop di build profiles or JIRAs that can help you run against your preferred distribution. -Some Key Notes & Limitations +Some Key Notes & Limitations The current release supports in memory and beyond memory execution. However, users must disable memory-intensive hash aggregate and hash join operations to leverage this functionality. 
@@ -241,7 +241,7 @@ will be correct in a future milestone re Drill Alpha does not include, there are currently a couple of differences for how to write a query in In order to query against -UDFs +UDFs Drill currently supports simple and aggregate functions using scalar, repeated and Modified: drill/site/trunk/content/drill/docs/repeated-count-function/index.html URL: http://svn.apache.org/viewvc/drill/site/trunk/content/drill/docs/repeated-count-function/index.html?rev=1662344&r1=1662343&r2=1662344&view=diff == --- drill/site/trunk/content/drill/docs/repeated-count-function/index.html (original) +++ drill/site/trunk/content/drill/docs/repeated-count-function/index.html Thu Feb 26 01:16:43 2015 @@ -94,7 +94,7 @@ the count to be grouped by other columns this example). For another example of this function, see the following lesson in the Apache -Drill Tutorial for Hadoop: Lesson 3: Run Queries on Complex Data Types. +Drill Tutorial for Hadoop: Lesson 3: Run Queries on Complex Data Types. Added: drill/site/trunk/content/drill/docs/reserved-keywords/index.html URL: http://svn.apache.org/viewvc/drill/site/trunk/content/drill/docs/reserved-keywords/index.html?rev=1662344&view=auto == --- drill/site/trunk/content/drill/docs/reserved-keywords/index.html (added) +++ drill/site/trunk/content/drill/docs/reserved-keywords/index.html Thu Feb 26 01:16:43 2015 @@ -0,0 +1,102 @@ + + + + + + + + +Reserved Keywords - Apache Drill + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Documentation + + Overview +
[03/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/query/005-query-info-skema.md -- diff --git a/_docs/query/005-query-info-skema.md b/_docs/query/005-query-info-skema.md new file mode 100644 index 000..1ad0008 --- /dev/null +++ b/_docs/query/005-query-info-skema.md @@ -0,0 +1,109 @@ +--- +title: "Querying the INFORMATION SCHEMA" +parent: "Query Data" +--- +When you are using Drill to connect to multiple data sources, you need a +simple mechanism to discover what each data source contains. The information +schema is an ANSI standard set of metadata tables that you can query to return +information about all of your Drill data sources (or schemas). Data sources +may be databases or file systems; they are all known as "schemas" in this +context. You can query the following INFORMATION_SCHEMA tables: + + * SCHEMATA + * CATALOGS + * TABLES + * COLUMNS + * VIEWS + +## SCHEMATA + +The SCHEMATA table contains the CATALOG_NAME and SCHEMA_NAME columns. To allow +maximum flexibility inside BI tools, the only catalog that Drill supports is +`DRILL`. + +0: jdbc:drill:zk=local> select CATALOG_NAME, SCHEMA_NAME as all_my_data_sources from INFORMATION_SCHEMA.SCHEMATA order by SCHEMA_NAME; ++--+-+ +| CATALOG_NAME | all_my_data_sources | ++--+-+ +| DRILL| INFORMATION_SCHEMA | +| DRILL| cp.default | +| DRILL| dfs.default | +| DRILL| dfs.root| +| DRILL| dfs.tmp | +| DRILL| HiveTest.SalesDB| +| DRILL| maprfs.logs | +| DRILL| sys | ++--+-+ + +The INFORMATION_SCHEMA name and associated keywords are case-sensitive. You +can also return a list of schemas by running the SHOW DATABASES command: + +0: jdbc:drill:zk=local> show databases; ++-+ +| SCHEMA_NAME | ++-+ +| dfs.default | +| dfs.root| +| dfs.tmp | +... + +## CATALOGS + +The CATALOGS table returns only one row, with the hardcoded DRILL catalog name +and description. + +## TABLES + +The TABLES table returns the table name and type for each table or view in +your databases. (Type means TABLE or VIEW.) 
Note that Drill does not return +files available for querying in file-based data sources. Instead, use SHOW +FILES to explore these data sources. + +## COLUMNS + +The COLUMNS table returns the column name and other metadata (such as the data +type) for each column in each table or view. + +## VIEWS + +The VIEWS table returns the name and definition for each view in your +databases. Note that file schemas are the canonical repository for views in +Drill. Depending on how you create a view, the may only be displayed in Drill +after it has been used. + +## Useful Queries + +Run an ``INFORMATION_SCHEMA.`TABLES` ``query to view all of the tables and views +within a database. TABLES is a reserved word in Drill and requires back ticks +(`). + +For example, the following query identifies all of the tables and views that +Drill can access: + +SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE +FROM INFORMATION_SCHEMA.`TABLES` +ORDER BY TABLE_NAME DESC; + +TABLE_SCHEMA TABLE_NAMETABLE_TYPE + +HiveTest.CustomersDB Customers TABLE +HiveTest.SalesDB OrdersTABLE +HiveTest.SalesDB OrderLinesTABLE +HiveTest.SalesDB USOrders VIEW +dfs.default CustomerSocialProfile VIEW + + +**Note:** Currently, Drill only supports querying Drill views; Hive views are not yet supported. 
+ +You can run a similar query to identify columns in tables and the data types +of those columns: + +SELECT COLUMN_NAME, DATA_TYPE +FROM INFORMATION_SCHEMA.COLUMNS +WHERE TABLE_NAME = 'Orders' AND TABLE_SCHEMA = 'HiveTest.SalesDB' AND COLUMN_NAME LIKE '%Total'; ++-++ +| COLUMN_NAME | DATA_TYPE | ++-++ +| OrderTotal | Decimal| ++-++ + http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/query/006-query-sys-tbl.md -- diff --git a/_docs/query/006-query-sys-tbl.md b/_docs/query/006-query-sys-tbl.md new file mode 100644 index 000..9b853ec --- /dev/null +++ b/_docs/query/006-query-sys-tbl.md @@ -0,0 +1,159 @@ +--- +title: "Querying System Tables" +parent: "Query Data" +--- +Drill has a sys database that contains system tables. You can query the system +tables for in
[02/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/sql-ref/cmd-summary/003-select.md -- diff --git a/_docs/sql-ref/cmd-summary/003-select.md b/_docs/sql-ref/cmd-summary/003-select.md new file mode 100644 index 000..4a4 --- /dev/null +++ b/_docs/sql-ref/cmd-summary/003-select.md @@ -0,0 +1,85 @@ +--- +title: "SELECT Statements" +parent: "SQL Commands Summary" +--- +Drill supports the following ANSI standard clauses in the SELECT statement: + + * WITH clause + * SELECT list + * FROM clause + * WHERE clause + * GROUP BY clause + * HAVING clause + * ORDER BY clause (with an optional LIMIT clause) + +You can use the same SELECT syntax in the following commands: + + * CREATE TABLE AS (CTAS) + * CREATE VIEW + +INSERT INTO SELECT is not yet supported. + +## Column Aliases + +You can use named column aliases in the SELECT list to provide meaningful +names for regular columns and computed columns, such as the results of +aggregate functions. See the section on running queries for examples. + +You cannot reference column aliases in the following clauses: + + * WHERE + * GROUP BY + * HAVING + +Because Drill works with schema-less data sources, you cannot use positional +aliases (1, 2, etc.) to refer to SELECT list columns, except in the ORDER BY +clause. + +## UNION ALL Set Operator + +Drill supports the UNION ALL set operator to combine two result sets. The +distinct UNION operator is not yet supported. + +The EXCEPT, EXCEPT ALL, INTERSECT, and INTERSECT ALL operators are not yet +supported. + +## Joins + +Drill supports ANSI standard joins in the FROM and WHERE clauses: + + * Inner joins + * Left, full, and right outer joins + +The following types of join syntax are supported: + +Join type| Syntax +---|--- +Join condition in WHERE clause|FROM table1, table 2 WHERE table1.col1=table2.col1 +USING join in FROM clause|FROM table1 JOIN table2 USING(col1, ...) 
+ON join in FROM clause|FROM table1 JOIN table2 ON table1.col1=table2.col1 +NATURAL JOIN in FROM clause|FROM table 1 NATURAL JOIN table 2 + +Cross-joins are not yet supported. You must specify a join condition when more +than one table is listed in the FROM clause. + +Non-equijoins are supported if the join also contains an equality condition on +the same two tables as part of a conjunction: + +table1.col1 = table2.col1 AND table1.c2 < table2.c2 + +This restriction applies to both inner and outer joins. + +## Subqueries + +You can use the following subquery operators in Drill queries. These operators +all return Boolean results. + + * ALL + * ANY + * EXISTS + * IN + * SOME + +In general, correlated subqueries are supported. EXISTS and NOT EXISTS +subqueries that do not contain a correlation join are not yet supported. + http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/sql-ref/cmd-summary/004-show-files.md -- diff --git a/_docs/sql-ref/cmd-summary/004-show-files.md b/_docs/sql-ref/cmd-summary/004-show-files.md new file mode 100644 index 000..1fcf395 --- /dev/null +++ b/_docs/sql-ref/cmd-summary/004-show-files.md @@ -0,0 +1,65 @@ +--- +title: "SHOW FILES Command" +parent: "SQL Commands Summary" +--- +The SHOW FILES command provides a quick report of the file systems that are +visible to Drill for query purposes. This command is unique to Apache Drill. + +## Syntax + +The SHOW FILES command supports the following syntax. + +SHOW FILES [ FROM filesystem.directory_name | IN filesystem.directory_name ]; + +The FROM or IN clause is required if you do not specify a default file system +first. You can do this with the USE command. FROM and IN are synonyms. + +The directory name is optional. (If the directory name is a Drill reserved +word, you must use back ticks around the name.) + +The command returns standard Linux `stat` information for each file or +directory, such as permissions, owner, and group values. This information is +not specific to Drill. 
+ +## Examples + +The following example returns information about directories and files in the +local (`dfs`) file system. + + 0: jdbc:drill:> use dfs; + + +++ + | ok | summary | + +++ + | true | Default schema changed to 'dfs' | + +++ + 1 row selected (0.318 seconds) + + 0: jdbc:drill:> show files; + ++-+++++-++--+ + |name| isDirectory | isFile | length | owner| group| permissions | accessTime | modificationTime | + ++-+++++-++--+ + | user | true| false | 1
[06/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/sql-ref/nested/001-flatten.md -- diff --git a/_docs/drill-docs/sql-ref/nested/001-flatten.md b/_docs/drill-docs/sql-ref/nested/001-flatten.md deleted file mode 100644 index 124db91..000 --- a/_docs/drill-docs/sql-ref/nested/001-flatten.md +++ /dev/null @@ -1,89 +0,0 @@ -title: "FLATTEN Function" -parent: "Nested Data Functions" -The FLATTEN function is useful for flexible exploration of repeated data. -FLATTEN separates the elements in a repeated field into individual records. To -maintain the association between each flattened value and the other fields in -the record, all of the other columns are copied into each new record. A very -simple example would turn this data (one record): - -{ - "x" : 5, - "y" : "a string", - "z" : [ 1,2,3] -} - -into three distinct records: - -select flatten(z) from table; -| x | y | z | -+-++---+ -| 5 | "a string" | 1 | -| 5 | "a string" | 2 | -| 5 | "a string" | 3 | - -The function takes a single argument, which must be an array (the `z` column -in this example). - - - -For a more interesting example, consider the JSON data in the publicly -available [Yelp](https://www.yelp.com/dataset_challenge/dataset) data set. The -first query below returns three columns from the -`yelp_academic_dataset_business.json` file: `name`, `hours`, and `categories`. -The query is restricted to distinct rows where the name is `z``pizza`. 
The -query returns only one row that meets those criteria; however, note that this -row contains an array of four categories: - -0: jdbc:drill:zk=local> select distinct name, hours, categories -from dfs.yelp.`yelp_academic_dataset_business.json` -where name ='zpizza'; -++++ -|name| hours| categories | -++++ -| zpizza | {"Tuesday":{"close":"22:00","open":"10:00"},"Friday":{"close":"23:00","open":"10:00"},"Monday":{"close":"22:00","open":"10:00"},"Wednesday":{"close":"22:00","open":"10:00"},"Thursday":{"close":"22:00","open":"10:00"},"Sunday":{"close":"22:00","open":"10:00"},"Saturday":{"close":"23:00","open":"10:00"}} | ["Gluten-Free","Pizza","Vegan","Restaurants"] | - -The FLATTEN function can operate on this single row and return multiple rows, -one for each category: - -0: jdbc:drill:zk=local> select distinct name, flatten(categories) as categories -from dfs.yelp.`yelp_academic_dataset_business.json` -where name ='zpizza' order by 2; -++-+ -|name| categories | -++-+ -| zpizza | Gluten-Free | -| zpizza | Pizza | -| zpizza | Restaurants | -| zpizza | Vegan | -++-+ -4 rows selected (2.797 seconds) - -Having used the FLATTEN function to break down arrays into distinct rows, you -can run queries that do deeper analysis on the flattened result set. For -example, you can use FLATTEN in a subquery, then apply WHERE clause -constraints or aggregate functions to the results in the outer query. 
- -The following query uses the same data file as the previous query to flatten -the categories array, then run a COUNT function on the flattened result: - -select celltbl.catl, count(celltbl.catl) catcount -from (select flatten(categories) catl -from dfs.yelp.`yelp_academic_dataset_business.json`) celltbl -group by celltbl.catl -order by count(celltbl.catl) desc limit 5; - -+---++ -|catl | catcount | -+---++ -| Restaurants | 14303 | -| Shopping | 6428 | -| Food | 5209 | -| Beauty & Spas | 3421 | -| Nightlife | 2870 | -+---|+ - -A common use case for FLATTEN is its use in conjunction with the -[KVGEN](/confluence/display/DRILL/KVGEN+Function) function. - http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/sql-ref/nested/002-kvgen.md -- diff --git a/_docs/drill-docs/sql-ref/nested/002-kvgen.md b/_docs/drill-docs/sql-ref/nested/002-kvgen.md deleted file mode 100644 index a27a781..000 --- a/_docs/drill-docs/sql-ref/nested/002-kvgen.md +++ /dev/null @@ -1,150 +0,0 @@ -title: "KVGEN Function" -parent: "Nested Data Functions" -KVGEN stands for _key-value generation_. This function is useful when complex -data files contain arbitrary maps that consist of relatively "unknown" column -names. Instead of having to specify columns in the map to access the data, you -can use KVGEN to ret
[11/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/design/005-value.md -- diff --git a/_docs/design/005-value.md b/_docs/design/005-value.md new file mode 100644 index 000..828376a --- /dev/null +++ b/_docs/design/005-value.md @@ -0,0 +1,163 @@ +--- +title: "Value Vectors" +parent: "Design Docs" +--- +This document defines the data structures required for passing sequences of +columnar data between [Operators](https://docs.google.com/a/maprtech.com/document/d/1zaxkcrK9mYyfpGwX1kAV80z0PCi8abefL45zOzb97dI/edit#bookmark=id.iip15ful18mm). + +## Goals + +### Support Operators Written in Multiple Language + +ValueVectors should support operators written in C/C++/Assembly. To support +this, the underlying ByteBuffer will not require modification when passed +through the JNI interface. The ValueVector will be considered immutable once +constructed. Endianness has not yet been considered. + +### Access + +Reading a random element from a ValueVector must be a constant time operation. +To accomodate, elements are identified by their offset from the start of the +buffer. Repeated, nullable and variable width ValueVectors utilize in an +additional fixed width value vector to index each element. Write access is not +supported once the ValueVector has been constructed by the RecordBatch. + +### Efficient Subsets of Value Vectors + +When an operator returns a subset of values from a ValueVector, it should +reuse the original ValueVector. To accomplish this, a level of indirection is +introduced to skip over certain values in the vector. This level of +indirection is a sequence of offsets which reference an offset in the original +ValueVector and the count of subsequent values which are to be included in the +subset. + +### Pooled Allocation + +ValueVectors utilize one or more buffers under the covers. These buffers will +be drawn from a pool. Value vectors are themselves created and destroyed as a +schema changes during the course of record iteration. 
+ +### Homogenous Value Types + +Each value in a Value Vector is of the same type. The [Record Batch](https://docs.google.com/a/maprtech.com/document/d/1zaxkcrK9mYyfpGwX1kAV80z0PCi8abefL45zOzb97dI/edit#bookmark=kix.s2xuoqnr8obe) implementation is responsible for +creating a new Value Vector any time there is a change in schema. + +## Definitions + +Data Types + +The canonical source for value type definitions is the [Drill +Datatypes](http://bit.ly/15JO9bC) document. The individual types are listed +under the âBasic Data Typesâ tab, while the value vector types can be found +under the âValue Vectorsâ tab. + +Operators + +An operator is responsible for transforming a stream of fields. It operates on +Record Batches or constant values. + +Record Batch + +A set of field values for some range of records. The batch may be composed of +Value Vectors, in which case each batch consists of exactly one schema. + +Value Vector + +The value vector is comprised of one or more contiguous buffers; one which +stores a sequence of values, and zero or more which store any metadata +associated with the ValueVector. + +## Data Structure + +A ValueVector stores values in a ByteBuf, which is a contiguous region of +memory. Additional levels of indirection are used to support variable value +widths, nullable values, repeated values and selection vectors. These levels +of indirection are primarily lookup tables which consist of one or more fixed +width ValueVectors which may be combined (e.g. for nullable, variable width +values). A fixed width ValueVector of non-nullable, non-repeatable values does +not require an indirect lookup; elements can be accessed directly by +multiplying position by stride. + +Fixed Width Values + +Fixed width ValueVectors simply contain a packed sequence of values. Random +access is supported by accessing element n at ByteBuf[0] + Index * Stride, +where Index is 0-based. The following illustrates the underlying buffer of +INT4 values [1 .. 
6]: + +![drill query flow]({{ site.baseurl }}/docs/img/value1.png) + +Nullable Values + +Nullable values are represented by a vector of bit values. Each bit in the +vector corresponds to an element in the ValueVector. If the bit is not set, +the value is NULL. Otherwise the value is retrieved from the underlying +buffer. The following illustrates a NullableValueVector of INT4 values 2, 3 +and 6: + +![drill query flow]({{ site.baseurl }}/docs/img/value2.png) + +### Repeated Values + +A repeated ValueVector is used for elements which can contain multiple values +(e.g. a JSON array). A table of offset and count pairs is used to represent +each repeated element in the ValueVector. A count of zero means the element +has no values (note the offset field is unused in this case). The following +illustrates three fields; one with two values, one with no values, and one +with a single value: + +![drill query flow]({{ site.baseurl }}/docs/img/value3
[05/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/img/ngram_plugin2.png -- diff --git a/_docs/img/ngram_plugin2.png b/_docs/img/ngram_plugin2.png new file mode 100644 index 000..60d432d Binary files /dev/null and b/_docs/img/ngram_plugin2.png differ http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/img/settings.png -- diff --git a/_docs/img/settings.png b/_docs/img/settings.png new file mode 100644 index 000..dcff0d9 Binary files /dev/null and b/_docs/img/settings.png differ http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/img/student_hive.png -- diff --git a/_docs/img/student_hive.png b/_docs/img/student_hive.png new file mode 100644 index 000..7e22b88 Binary files /dev/null and b/_docs/img/student_hive.png differ http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/install/001-drill-in-10.md -- diff --git a/_docs/install/001-drill-in-10.md b/_docs/install/001-drill-in-10.md new file mode 100644 index 000..13d2410 --- /dev/null +++ b/_docs/install/001-drill-in-10.md @@ -0,0 +1,365 @@ +--- +title: "Apache Drill in 10 Minutes" +parent: "Install Drill" +--- +* Objective +* A Few Bits About Apache Drill +* Process Overview +* Install Drill + * Installing Drill on Linux + * Installing Drill on Mac OS X + * Installing Drill on Windows +* Start Drill +* Query Sample Data +* Summary +* Next Steps +* More Information + +## Objective + +Use Apache Drill to query sample data in 10 minutes. For simplicity, youâll +run Drill in _embedded_ mode rather than _distributed_ mode to try out Drill +without having to perform any setup tasks. + +## A Few Bits About Apache Drill + +Drill is a clustered, powerful MPP (Massively Parallel Processing) query +engine for Hadoop that can process petabytes of data, fast. Drill is useful +for short, interactive ad-hoc queries on large-scale data sets. Drill is +capable of querying nested data in formats like JSON and Parquet and +performing dynamic schema discovery. 
Drill does not require a centralized +metadata repository. + +### **_Dynamic schema discovery_** + +Drill does not require schema or type specification for data in order to start +the query execution process. Drill starts data processing in record-batches +and discovers the schema during processing. Self-describing data formats such +as Parquet, JSON, AVRO, and NoSQL databases have schema specified as part of +the data itself, which Drill leverages dynamically at query time. Because +schema can change over the course of a Drill query, all Drill operators are +designed to reconfigure themselves when schemas change. + +### **_Flexible data model_** + +Drill allows access to nested data attributes, just like SQL columns, and +provides intuitive extensions to easily operate on them. From an architectural +point of view, Drill provides a flexible hierarchical columnar data model that +can represent complex, highly dynamic and evolving data models. Drill allows +for efficient processing of these models without the need to flatten or +materialize them at design time or at execution time. Relational data in Drill +is treated as a special or simplified case of complex/multi-structured data. + +### **_De-centralized metadata_** + +Drill does not have a centralized metadata requirement. You do not need to +create and manage tables and views in a metadata repository, or rely on a +database administrator group for such a function. Drill metadata is derived +from the storage plugins that correspond to data sources. Storage plugins +provide a spectrum of metadata ranging from full metadata (Hive), partial +metadata (HBase), or no central metadata (files). De-centralized metadata +means that Drill is NOT tied to a single Hive repository. You can query +multiple Hive repositories at once and then combine the data with information +from HBase tables or with a file in a distributed file system. 
You can also +use SQL DDL syntax to create metadata within Drill, which gets organized just +like a traditional database. Drill metadata is accessible through the ANSI +standard INFORMATION_SCHEMA database. + +### **_Extensibility_** + +Drill provides an extensible architecture at all layers, including the storage +plugin, query, query optimization/execution, and client API layers. You can +customize any layer for the specific needs of an organization or you can +extend the layer to a broader array of use cases. Drill provides a built in +classpath scanning and plugin concept to add additional storage plugins, +functions, and operators with minimal configuration. + +## Process Overview + +Download the Apache Drill archive and extract the contents to a directory on +your machine. The Apache Drill archiv
[09/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/datasets/001-aol.md -- diff --git a/_docs/drill-docs/datasets/001-aol.md b/_docs/drill-docs/datasets/001-aol.md deleted file mode 100644 index 472f52f..000 --- a/_docs/drill-docs/datasets/001-aol.md +++ /dev/null @@ -1,47 +0,0 @@ -title: "AOL Search" -parent: "Sample Datasets" -## Quick Stats - -The [AOL Search dataset](http://en.wikipedia.org/wiki/AOL_search_data_leak) is -a collection of real query log data that is based on real users. - -## The Data Source - -The dataset consists of 20M Web queries from 650k users over a period of three -months, 440MB in total and available [for -download](http://zola.di.unipi.it/smalltext/datasets.html). The format used in -the dataset is: - -AnonID, Query, QueryTime, ItemRank, ClickURL - -... with: - - * AnonID, an anonymous user ID number. - * Query, the query issued by the user, case shifted with most punctuation removed. - * QueryTime, the time at which the query was submitted for search. - * ItemRank, if the user clicked on a search result, the rank of the item on which they clicked is listed. - * [ClickURL](http://www.dietkart.com/), if the user clicked on a search result, the domain portion of the URL in the clicked result is listed. - -Each line in the data represents one of two types of events - - * A query that was NOT followed by the user clicking on a result item. - * A click through on an item in the result list returned from a query. - -In the first case (query only) there is data in only the first three columns, -in the second case (click through), there is data in all five columns. For -click through events, the query that preceded the click through is included. -Note that if a user clicked on more than one result in the list returned from -a single query, there will be TWO lines in the data to represent the two -events. 
- -## The Queries - -Interesting queries, for example - - * Users querying for topic X - * Users that click on the first (second, third) ranked item - * TOP 10 domains searched - * TOP 10 domains clicked at - http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/datasets/002-enron.md -- diff --git a/_docs/drill-docs/datasets/002-enron.md b/_docs/drill-docs/datasets/002-enron.md deleted file mode 100644 index 2ddbef6..000 --- a/_docs/drill-docs/datasets/002-enron.md +++ /dev/null @@ -1,21 +0,0 @@ -title: "Enron Emails" -parent: "Sample Datasets" -## Quick Stats - -The [Enron Email dataset](http://www.cs.cmu.edu/~enron/) contains data from -about 150 users, mostly senior management of Enron. - -## The Data Source - -Totalling some 500,000 messages, the [raw -data](http://www.cs.cmu.edu/~enron/enron_mail_20110402.tgz) (2009 version of -the dataset; ~423MB) is available for download as well as a [MySQL -dump](ftp://ftp.isi.edu/sims/philpot/data/enron-mysqldump.sql.gz) (~177MB). - -## The Queries - -Interesting queries, for example - - * Via [Query Dataset for Email Search](https://dbappserv.cis.upenn.edu/spell/) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/datasets/003-wikipedia.md -- diff --git a/_docs/drill-docs/datasets/003-wikipedia.md b/_docs/drill-docs/datasets/003-wikipedia.md deleted file mode 100644 index 99e6e24..000 --- a/_docs/drill-docs/datasets/003-wikipedia.md +++ /dev/null @@ -1,105 +0,0 @@ -title: "Wikipedia Edit History" -parent: "Sample Datasets" -# Quick Stats - -The Wikipedia Edit History is a public dump of the website made available by -the wikipedia foundation. You can find details -[here](http://en.wikipedia.org/wiki/Wikipedia:Database_download). The dumps -are made available as SQL or XML dumps. 
You can find the entire schema drawn -together in this great [diagram](http://upload.wikimedia.org/wikipedia/commons -/thumb/4/42/MediaWiki_1.20_%2844edaa2%29_database_schema.svg/2193px- -MediaWiki_1.20_%2844edaa2%29_database_schema.svg.png). - -# Approach - -The _main_ distribution files are: - - * Current Pages: As of January 2013 this SQL dump was 9.0GB in its compressed format. - * Complete Archive: This is what we actually want, but at a size of multiple terabytes, clearly exceeds the storage available at home. - -To have some real historic data, it is recommended to download a _Special -Export_ using this -[link](http://en.wikipedia.org/w/index.php?title=Special:Export). Using this -tool you generate a category specific XML dump and configure various export -options. There are some limits like a maximum of 1000 revisions per export, -but otherwise this should work out just fine. - -![](../../img/Overview.png) - -The entities used in the query use cases. - -# Use Cases - -## Select Change Volume
[12/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/arch/001-core-mod.md -- diff --git a/_docs/arch/001-core-mod.md b/_docs/arch/001-core-mod.md new file mode 100644 index 000..17fa18d --- /dev/null +++ b/_docs/arch/001-core-mod.md @@ -0,0 +1,29 @@ +--- +title: "Core Modules within a Drillbit" +parent: "Architectural Overview" +--- +The following image represents components within each Drillbit: + +![drill query flow]({{ site.baseurl }}/docs/img/DrillbitModules.png) + +The following list describes the key components of a Drillbit: + + * **RPC end point**: Drill exposes a low overhead protobuf-based RPC protocol to communicate with the clients. Additionally, a C++ and Java API layers are also available for the client applications to interact with Drill. Clients can communicate to a specific Drillbit directly or go through a ZooKeeper quorum to discover the available Drillbits before submitting queries. It is recommended that the clients always go through ZooKeeper to shield clients from the intricacies of cluster management, such as the addition or removal of nodes. + + * **SQL parser**: Drill uses Optiq, the open source framework, to parse incoming queries. The output of the parser component is a language agnostic, computer-friendly logical plan that represents the query. + * **Storage plugin interfaces**: Drill serves as a query layer on top of several data sources. Storage plugins in Drill represent the abstractions that Drill uses to interact with the data sources. Storage plugins provide Drill with the following information: +* Metadata available in the source +* Interfaces for Drill to read from and write to data sources +* Location of data and a set of optimization rules to help with efficient and faster execution of Drill queries on a specific data source + +In the context of Hadoop, Drill provides storage plugins for files and +HBase/M7. 
Drill also integrates with Hive as a storage plugin since Hive +provides a metadata abstraction layer on top of files, HBase/M7, and provides +libraries to read data and operate on these sources (Serdes and UDFs). + +When users query files and HBase/M7 with Drill, they can do it directly or go +through Hive if they have metadata defined there. Drill integration with Hive +is only for metadata. Drill does not invoke the Hive execution engine for any +requests. + + * **Distributed cache**: Drill uses a distributed cache to manage metadata (not the data) and configuration information across various nodes. Sample metadata information that is stored in the cache includes query plan fragments, intermediate state of the query execution, and statistics. Drill uses Infinispan as its cache technology. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/arch/002-arch-hilite.md -- diff --git a/_docs/arch/002-arch-hilite.md b/_docs/arch/002-arch-hilite.md new file mode 100644 index 000..5ac51bc --- /dev/null +++ b/_docs/arch/002-arch-hilite.md @@ -0,0 +1,10 @@ +--- +title: "Architectural Highlights" +parent: "Architectural Overview" +--- +The goal for Drill is to bring the **SQL Ecosystem** and **Performance** of +the relational systems to **Hadoop scale** data **WITHOUT** compromising on +the **Flexibility** of Hadoop/NoSQL systems. There are several core +architectural elements in Apache Drill that make it a highly flexible and +efficient query engine. 
+ http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/arch/arch-hilite/001-flexibility.md -- diff --git a/_docs/arch/arch-hilite/001-flexibility.md b/_docs/arch/arch-hilite/001-flexibility.md new file mode 100644 index 000..0b5c5e3 --- /dev/null +++ b/_docs/arch/arch-hilite/001-flexibility.md @@ -0,0 +1,78 @@ +--- +title: "Flexibility" +parent: "Architectural Highlights" +--- +The following features contribute to Drill's flexible architecture: + +**_Dynamic schema discovery_** + +Drill does not require schema or type specification for the data in order to +start the query execution process. Instead, Drill starts processing the data +in units called record-batches and discovers the schema on the fly during +processing. Self-describing data formats such as Parquet, JSON, AVRO, and +NoSQL databases have schema specified as part of the data itself, which Drill +leverages dynamically at query time. Schema can change over the course of a +Drill query, so all of the Drill operators are designed to reconfigure +themselves when such schema changing events occur. + +**_Flexible data model_** + +Drill is purpose-built from the ground up for complex/multi-structured data +commonly seen in Hadoop/NoSQL applications such as social/mobile, clickstream, +logs, and sensor equipped IOT. From a user point of view, Drill a
[13/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
DRILL-2315: Confluence conversion plus fixes Project: http://git-wip-us.apache.org/repos/asf/drill/repo Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/d959a210 Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/d959a210 Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/d959a210 Branch: refs/heads/gh-pages-master Commit: d959a210053f02b5069f0a0cb9f0d34131640ffb Parents: 23f82db Author: Kristine Hahn Authored: Thu Jan 15 19:42:12 2015 -0800 Committer: Bridget Bevens Committed: Wed Feb 25 16:22:24 2015 -0800 -- .gitignore | 1 + _docs/001-arch.md | 49 +++ _docs/001-drill-docs.md | 4 - _docs/002-tutorial.md | 51 +++ _docs/003-yelp.md | 412 ++ _docs/004-install.md| 13 + _docs/005-connect.md| 41 ++ _docs/006-interfaces.md | 50 +++ _docs/007-query.md | 41 ++ _docs/008-sql-ref.md| 14 + _docs/009-dev-custom-func.md| 37 ++ _docs/010-manage.md | 14 + _docs/011-develop.md| 9 + _docs/012-rn.md | 191 + _docs/013-contribute.md | 9 + _docs/014-sample-ds.md | 10 + _docs/015-design.md | 13 + _docs/016-progress.md | 8 + _docs/017-archived-pages.md | 8 + _docs/018-bylaws.md | 170 _docs/arch/001-core-mod.md | 29 ++ _docs/arch/002-arch-hilite.md | 10 + _docs/arch/arch-hilite/001-flexibility.md | 78 _docs/arch/arch-hilite/002-performance.md | 55 +++ _docs/archive/001-how-to-demo.md| 309 ++ _docs/archive/002-meet-drill.md | 41 ++ _docs/connect/001-plugin-reg.md | 35 ++ _docs/connect/002-workspaces.md | 74 _docs/connect/003-reg-fs.md | 64 +++ _docs/connect/004-reg-hbase.md | 32 ++ _docs/connect/005-reg-hive.md | 83 _docs/connect/006-default-frmt.md | 60 +++ _docs/connect/007-mongo-plugin.md | 167 _docs/connect/008-mapr-db-plugin.md | 31 ++ _docs/contribute/001-guidelines.md | 229 ++ _docs/contribute/002-ideas.md | 158 +++ _docs/datasets/001-aol.md | 47 +++ _docs/datasets/002-enron.md | 19 + _docs/datasets/003-wikipedia.md | 105 + _docs/design/001-plan.md| 25 ++ _docs/design/002-rpc.md | 19 + _docs/design/003-query-stages.md| 42 ++ _docs/design/004-research.md| 48 +++ 
_docs/design/005-value.md | 163 +++ _docs/dev-custom-fcn/001-dev-simple.md | 50 +++ _docs/dev-custom-fcn/002-dev-aggregate.md | 55 +++ _docs/dev-custom-fcn/003-add-custom.md | 26 ++ _docs/dev-custom-fcn/004-use-custom.md | 55 +++ _docs/dev-custom-fcn/005-cust-interface.md | 8 + _docs/develop/001-compile.md| 37 ++ _docs/develop/002-setup.md | 5 + _docs/develop/003-patch-tool.md | 160 +++ _docs/drill-docs/001-arch.md| 58 --- _docs/drill-docs/002-tutorial.md| 58 --- _docs/drill-docs/003-yelp.md| 402 -- _docs/drill-docs/004-install.md | 20 - _docs/drill-docs/005-connect.md | 49 --- _docs/drill-docs/006-query.md | 57 --- _docs/drill-docs/006-sql-ref.md | 25 -- _docs/drill-docs/007-dev-custom-func.md | 47 --- _docs/drill-docs/008-manage.md | 23 - _docs/drill-docs/009-develop.md | 16 - _docs/drill-docs/010-rn.md | 192 - _docs/drill-docs/011-contribute.md | 11 - _docs/drill-docs/012-sample-ds.md | 11 - _docs/drill-docs/013-design.md | 14 - _docs/drill-docs/014-progress.md| 9 - _docs/drill-docs/015-archived-pages.md | 9 - _docs/drill-docs/016-bylaws.md | 171 _docs/drill-docs/arch/001-core-mod.md | 30 -- _docs/drill-docs/arch/002-arch-hilite.md| 15 - .../arch/arch-hilite/001-flexibility.md | 79 .../arch/arch-hilite/002-performance.md | 56 --- _docs/drill-docs/archive/001-how-to-demo.md |
[07/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/query/query-fs/001-query-json.md -- diff --git a/_docs/drill-docs/query/query-fs/001-query-json.md b/_docs/drill-docs/query/query-fs/001-query-json.md deleted file mode 100644 index 048903b..000 --- a/_docs/drill-docs/query/query-fs/001-query-json.md +++ /dev/null @@ -1,41 +0,0 @@ -title: "Querying JSON Files" -parent: "Querying a File System" -Your Drill installation includes a sample JSON file located in Drill's -classpath. The sample JSON file, `employee.json`, contains fictitious employee -data. Use SQL syntax to query the sample `JSON` file. - -To view the data in the `employee.json` file, submit the following SQL query -to Drill: - -``0: jdbc:drill:zk=local> SELECT * FROM cp.`employee.json`;`` - -The query returns the following results: - -**Example of partial output** - - +-++++-+---+ -| employee_id | full_name | first_name | last_name | position_id | position_ | - +-++++-+---+ -| 1101| Steve Eurich | Steve | Eurich | 16 | Store T | -| 1102| Mary Pierson | Mary | Pierson| 16 | Store T | -| 1103| Leo Jones | Leo| Jones | 16 | Store Tem | -| 1104| Nancy Beatty | Nancy | Beatty | 16 | Store T | -| 1105| Clara McNight | Clara | McNight| 16 | Store | -| 1106| Marcella Isaacs | Marcella | Isaacs | 17 | Stor | -| 1107| Charlotte Yonce | Charlotte | Yonce | 17 | Stor | -| 1108| Benjamin Foster | Benjamin | Foster | 17 | Stor | -| 1109| John Reed | John | Reed | 17 | Store Per | -| 1110| Lynn Kwiatkowski | Lynn | Kwiatkowski | 17 | St | -| | Donald Vann | Donald | Vann | 17 | Store Pe | -| 1112| William Smith | William| Smith | 17 | Store | -| 1113| Amy Hensley | Amy| Hensley| 17 | Store Pe | -| 1114| Judy Owens | Judy | Owens | 17 | Store Per | -| 1115| Frederick Castillo | Frederick | Castillo | 17 | S | -| 1116| Phil Munoz | Phil | Munoz | 17 | Store Per | -| 1117| Lori Lightfoot | Lori | Lightfoot | 17 | Store | -... 
- +-++++-+---+ -1,155 rows selected (0.762 seconds) -0: jdbc:drill:zk=local> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/query/query-fs/002-query-parquet.md -- diff --git a/_docs/drill-docs/query/query-fs/002-query-parquet.md b/_docs/drill-docs/query/query-fs/002-query-parquet.md deleted file mode 100644 index 9b4e874..000 --- a/_docs/drill-docs/query/query-fs/002-query-parquet.md +++ /dev/null @@ -1,99 +0,0 @@ -title: "Querying Parquet Files" -parent: "Querying a File System" -Your Drill installation includes a `sample-date` directory with Parquet files -that you can query. Use SQL syntax to query the `region.parquet` and -`nation.parquet` files in the `sample-data` directory. - -**Note:** Your Drill installation location may differ from the examples used here. The examples assume that Drill was installed in embedded mode on your machine following the [Apache Drill in 10 Minutes ](https://cwiki.apache.org/confluence/display/DRILL/Apache+Drill+in+10+Minutes)tutorial. If you installed Drill in distributed mode, or your `sample-data` directory differs from the location used in the examples, make sure to change the `sample-data` directory to the correct location before you run the queries. - - Region File - -If you followed the Apache Drill in 10 Minutes instructions to install Drill -in embedded mode, the path to the parquet file varies between operating -systems. - -To view the data in the `region.parquet` file, issue the query appropriate for -your operating system: - - * Linux -``SELECT * FROM dfs.`/opt/drill/apache-drill-0.4.0-incubating/sample- -data/region.parquet`; `` - - * Mac OS X -``SELECT * FROM dfs.`/Users/max/drill/apache-drill-0.4.0-incubating/sample- -data/region.parquet`;`` - - * Windows -``SELECT * FROM dfs.`C:\drill\apache-drill-0.4.0-incubating\sample- -data\region.parquet`;`` - -The query returns the following results: - -+++ -| EXPR$0 | EXPR$1 | -+++ -| AFRICA | lar deposits. 
blithely final packages cajole. regular waters ar | -
[01/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
Repository: drill Updated Branches: refs/heads/gh-pages-master 23f82db9f -> d959a2100 http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/tutorial/005-lesson3.md -- diff --git a/_docs/tutorial/005-lesson3.md b/_docs/tutorial/005-lesson3.md new file mode 100644 index 000..f6c7ae4 --- /dev/null +++ b/_docs/tutorial/005-lesson3.md @@ -0,0 +1,379 @@ +--- +title: "Lesson 3: Run Queries on Complex Data Types" +parent: "Apache Drill Tutorial" +--- +## Goal + +This lesson focuses on queries that exercise functions and operators on self- +describing data and complex data types. Drill offers intuitive SQL extensions +to work with such data and offers high query performance with an architecture +built from the ground up for complex data. + +## Queries in This Lesson + +Now that you have run ANSI SQL queries against different tables and files with +relational data, you can try some examples including complex types. + + * Access directories and subdirectories of files in a single SELECT statement. + * Demonstrate simple ways to access complex data in JSON files. + * Demonstrate the repeated_count function to aggregate values in an array. + +## Query Partitioned Directories + +You can use special variables in Drill to refer to subdirectories in your +workspace path: + + * dir0 + * dir1 + * … + +Note that these variables are dynamically determined based on the partitioning +of the file system. No up-front definitions are required on what partitions +exist. 
Here is a visual example of how this works: + +![drill query flow]({{ site.baseurl }}/docs/img/example_query.png) + +### Set workspace to dfs.logs: + +0: jdbc:drill:> use dfs.logs; ++++ +| ok | summary | ++++ +| true | Default schema changed to 'dfs.logs' | ++++ + +### Query logs data for a specific year: + +0: jdbc:drill:> select * from logs where dir0='2013' limit 10; + +++++++++++---++ +| dir0 | dir1 | trans_id | date | time | cust_id | device | state | camp_id | keywords | prod_id | purch_flag | + +++++++++++---++ +| 2013 | 11 | 12119 | 11/09/2013 | 02:24:51 | 262 | IOS5 | ny | 0 | chamber | 198 | false | +| 2013 | 11 | 12120 | 11/19/2013 | 09:37:43 | 0 | AOS4.4 | il | 2 | outside | 511 | false | +| 2013 | 11 | 12134 | 11/10/2013 | 23:42:47 | 60343 | IOS5 | ma | 4 | and | 421 | false | +| 2013 | 11 | 12135 | 11/16/2013 | 01:42:13 | 46762 | AOS4.3 | ca | 4 | here's | 349 | false | +| 2013 | 11 | 12165 | 11/26/2013 | 21:58:09 | 41987 | AOS4.2 | mn | 4 | he | 271 | false | +| 2013 | 11 | 12168 | 11/09/2013 | 23:41:48 | 8600 | IOS5 | in | 6 | i | 459 | false | +| 2013 | 11 | 12196 | 11/20/2013 | 02:23:06 | 15603 | IOS5 | tn | 1 | like | 324 | false | +| 2013 | 11 | 12203 | 11/25/2013 | 23:50:29 | 221 | IOS6 | tx | 10 | if | 323 | false | +| 2013 | 11 | 12206 | 11/09/2013 | 23:53:01 | 2488 | AOS4.2 | tx | 14 | unlike | 296 | false | +| 2013 | 11 | 12217 | 11/06/2013 | 23:51:56 | 0 | AOS4.2 | tx | 9 | can't | 54 | false | + +++++++++++++ + + +This query constrains files inside the subdirectory named 2013. The variable +dir0 refers to the first level down from logs, dir1 to the next level, and so +on. So this query returned 10 of the rows for February 2013. + +### Further constrain the results using multiple predicates in the query: + +This query returns a list of customer IDs for people who made a purchase via +an IOS5 device in August 2013. 
+ +0: jdbc:drill:> select dir0 as yr, dir1 as mth, cust_id from logs +where dir0='2013' and dir1='8' and device='IOS5' and purch_flag='true' +order by `date`; +++++ +| yr | mth | cust_id | +++++ +| 2013 | 8 | 4 | +| 2013 | 8 | 521 | +| 2013 | 8 | 1 | +| 2013 | 8 | 2 | +| 2013 | 8 | 4 | +| 2013 | 8 | 549 | +| 2013 | 8 | 72827 | +| 2013 | 8 | 38127 | +... + +### Return monthly counts per customer for a given year: + +0: jdbc:drill:> select cust_id, dir1 month_no, count(*) month_count from logs +where dir0=2014 group by cust_id, dir1 order by cust_id, month_no limit 10; ++++-+ +| cust_id | month_no | month_count | ++++-+ +| 0 | 1 | 143 | +| 0 | 2 | 118 | +| 0 | 3
[08/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/manage/004-partition-prune.md -- diff --git a/_docs/drill-docs/manage/004-partition-prune.md b/_docs/drill-docs/manage/004-partition-prune.md deleted file mode 100644 index fa81034..000 --- a/_docs/drill-docs/manage/004-partition-prune.md +++ /dev/null @@ -1,75 +0,0 @@ -title: "Partition Pruning" -parent: "Manage Drill" -Partition pruning is a performance optimization that limits the number of -files and partitions that Drill reads when querying file systems and Hive -tables. Drill only reads a subset of the files that reside in a file system or -a subset of the partitions in a Hive table when a query matches certain filter -criteria. - -For Drill to apply partition pruning to Hive tables, you must have created the -tables in Hive using the `PARTITION BY` clause: - -`CREATE TABLE () PARTITION BY ();` - -When you create Hive tables using the `PARTITION BY` clause, each partition of -data is automatically split out into different directories as data is written -to disk. For more information about Hive partitioning, refer to the [Apache -Hive wiki](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL/#LanguageManualDDL-PartitionedTables). - -Typically, table data in a file system is organized by directories and -subdirectories. Queries on table data may contain `WHERE` clause filters on -specific directories. - -Drill's query planner evaluates the filters as part of a Filter operator. If -no partition filters are present, the underlying Scan operator reads all files -in all directories and then sends the data to operators downstream, such as -Filter. - -When partition filters are present, the query planner determines if it can -push the filters down to the Scan such that the Scan only reads the -directories that match the partition filters, thus reducing disk I/O. 
- -## Partition Pruning Example - -The `/Users/max/data/logs` directory in a file system contains subdirectories -that span a few years. - -The following image shows the hierarchical structure of the `…/logs` directory -and (sub) directories: - -![](../../img/54.png) - -The following query requests log file data for 2013 from the `…/logs` -directory in the file system: - -SELECT * FROM dfs.`/Users/max/data/logs` WHERE cust_id < 10 and dir0 = 2013 limit 2; - -If you run the `EXPLAIN PLAN` command for the query, you can see that the -`…/logs` directory is filtered by the scan operator. - -EXPLAIN PLAN FOR SELECT * FROM dfs.`/Users/max/data/logs` WHERE cust_id < 10 and dir0 = 2013 limit 2; - -The following image shows a portion of the physical plan when partition -pruning is applied: - -![](../../img/21.png) - -## Filter Examples - -The following queries include examples of the types of filters eligible for -partition pruning optimization: - -**Example 1: Partition filters ANDed together** - -SELECT * FROM dfs.`/Users/max/data/logs` WHERE dir0 = '2014' AND dir1 = '1' - -**Example 2: Partition filter ANDed with regular column filter** - -SELECT * FROM dfs.`/Users/max/data/logs` WHERE cust_id < 10 AND dir0 = 2013 limit 2; - -**Example 3: Combination of AND, OR involving partition filters** - -SELECT * FROM dfs.`/Users/max/data/logs` WHERE (dir0 = '2013' AND dir1 = '1') OR (dir0 = '2014' AND dir1 = '2') - http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/manage/005-monitor-cancel.md -- diff --git a/_docs/drill-docs/manage/005-monitor-cancel.md b/_docs/drill-docs/manage/005-monitor-cancel.md deleted file mode 100644 index 6888eea..000 --- a/_docs/drill-docs/manage/005-monitor-cancel.md +++ /dev/null @@ -1,30 +0,0 @@ -title: "Monitoring and Canceling Queries in the Drill Web UI" -parent: "Manage Drill" -You can monitor and cancel queries from the Drill Web UI. 
To access the Drill -Web UI, the Drillbit process must be running on the Drill node that you use to -access the Drill Web UI. - -To monitor or cancel a query from the Drill Web UI, complete the following -steps: - - 1. Navigate to the Drill Web UI at `:8047.` -When you access the Drill Web UI, you see some general information about Drill -running in your cluster, such as the nodes running the Drillbit process, the -various ports Drill is using, and the amount of direct memory assigned to -Drill. -![](../../img/7.png) - - 2. Select **Profiles** in the toolbar. A list of running and completed queries appears. Drill assigns a query ID to each query and lists the Foreman node. The Foreman is the Drillbit node that receives the query from the client or application. The Foreman drives the entire query. -![](../../img/51.png) - - 3. Click the **Query ID** for the query that you want to monitor or cancel. The Query and Planning window appears. -![](../../img/4.png) - - 4. Selec
[10/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/016-bylaws.md -- diff --git a/_docs/drill-docs/016-bylaws.md b/_docs/drill-docs/016-bylaws.md deleted file mode 100644 index 6f2604f..000 --- a/_docs/drill-docs/016-bylaws.md +++ /dev/null @@ -1,171 +0,0 @@ -title: "Project Bylaws" -parent: "Apache Drill Documentation" -# Introduction - -This document defines the bylaws under which the Apache Drill project -operates. It defines the roles and responsibilities of the project, who may -vote, how voting works, how conflicts are resolved, etc. - -Drill is a project of the [Apache Software -Foundation](http://www.apache.org/foundation/). The foundation holds the -copyright on Apache code including the code in the Drill codebase. The -[foundation FAQ](http://www.apache.org/foundation/faq.html) explains the -operation and background of the foundation. - -Drill is typical of Apache projects in that it operates under a set of -principles, known collectively as the _Apache Way_. If you are new to Apache -development, please refer to the [Incubator -project](http://incubator.apache.org/) for more information on how Apache -projects operate. - -# Roles and Responsibilities - -Apache projects define a set of roles with associated rights and -responsibilities. These roles govern what tasks an individual may perform -within the project. The roles are defined in the following sections. - -## Users - -The most important participants in the project are people who use our -software. The majority of our contributors start out as users and guide their -development efforts from the user's perspective. - -Users contribute to the Apache projects by providing feedback to contributors -in the form of bug reports and feature suggestions. As well, users participate -in the Apache community by helping other users on mailing lists and user -support forums. 
- -## Contributors - -All of the volunteers who are contributing time, code, documentation, or -resources to the Drill Project. A contributor that makes sustained, welcome -contributions to the project may be invited to become a committer, though the -exact timing of such invitations depends on many factors. - -## Committers - -The project's committers are responsible for the project's technical -management. Committers have access to a specified set of subproject's code -repositories. Committers on subprojects may cast binding votes on any -technical discussion regarding that subproject. - -Committer access is by invitation only and must be approved by lazy consensus -of the active PMC members. A Committer is considered _emeritus_ by his or her -own declaration or by not contributing in any form to the project for over six -months. An emeritus committer may request reinstatement of commit access from -the PMC which will be sufficient to restore him or her to active committer -status. - -Commit access can be revoked by a unanimous vote of all the active PMC members -(except the committer in question if he or she is also a PMC member). - -All Apache committers are required to have a signed [Contributor License -Agreement (CLA)](http://www.apache.org/licenses/icla.txt) on file with the -Apache Software Foundation. There is a [Committer -FAQ](http://www.apache.org/dev/committers.html) which provides more details on -the requirements for committers. - -A committer who makes a sustained contribution to the project may be invited -to become a member of the PMC. The form of contribution is not limited to -code. It can also include code review, helping out users on the mailing lists, -documentation, etc. - -## Project Management Committee - -The PMC is responsible to the board and the ASF for the management and -oversight of the Apache Drill codebase. The responsibilities of the PMC -include - - * Deciding what is distributed as products of the Apache Drill project. 
In particular all releases must be approved by the PMC. - * Maintaining the project's shared resources, including the codebase repository, mailing lists, websites. - * Speaking on behalf of the project. - * Resolving license disputes regarding products of the project. - * Nominating new PMC members and committers. - * Maintaining these bylaws and other guidelines of the project. - -Membership of the PMC is by invitation only and must be approved by a lazy -consensus of active PMC members. A PMC member is considered _emeritus_ by his -or her own declaration or by not contributing in any form to the project for -over six months. An emeritus member may request reinstatement to the PMC, -which will be sufficient to restore him or her to active PMC member status. - -Membership of the PMC can be revoked by a unanimous vote of all the active -PMC members other than the member in question. - -The chair of the PMC is appointed by the ASF board. The chair is an office -holder of the Apache Software Foundation (Vice President, Apache D
[04/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/interfaces/odbc-win/002-conf-odbc-win.md -- diff --git a/_docs/interfaces/odbc-win/002-conf-odbc-win.md b/_docs/interfaces/odbc-win/002-conf-odbc-win.md new file mode 100644 index 000..636bd9f --- /dev/null +++ b/_docs/interfaces/odbc-win/002-conf-odbc-win.md @@ -0,0 +1,143 @@ +--- +title: "Step 2. Configure ODBC Connections to Drill Data Sources" +parent: "Using the MapR ODBC Driver on Windows" +--- +Complete one of the following steps to create an ODBC connection to Drill data +sources: + + * Create a Data Source Name + * Create an ODBC Connection String + +**Prerequisite:** An Apache Drill installation must be available that is configured to access the data sources that you want to connect to. For information about how to install Apache Drill, see [Install Drill](/drill/docs/install-drill). For information about configuring data sources, see the [Apache Drill documentation](/drill/docs). + +## Create a Data Source Name (DSN) + +Create a DSN that an application can use to connect to Drill data sources. If +you want to create a DSN for a 32-bit application, you must use the 32-bit +version of the ODBC Administrator to create the DSN. + + 1. To launch the ODBC Administrator, click **Start > All Programs > MapR Drill ODBC Driver 1.0 (32|64-bit) > (32|64-bit) ODBC Administrator**. +The ODBC Data Source Administrator window appears. + + To launch the 32-bit version of the ODBC driver on a 64-bit machine, run: +`C:\WINDOWS\SysWOW64\odbcad32.exe`. + 2. Click the **System DSN** tab to create a system DSN or click the **User DSN** tab to create a user DSN. A system DSN is available for all users who log in to the machine. A user DSN is available to the user who creates the DSN. + 3. Click **Add**. + 4. Select **MapR Drill ODBC Driver** and click **Finish**. + The _MapR Drill ODBC Driver DSN Setup_ window appears. + 5. In the **Data Source Name** field, enter a name for the DSN, + 6. 
Optionally, enter a description of the DSN in the Description field. + 7. In the Connection Type section, select a connection type and enter the associated connection details: + + Connection TypePropertiesDescriptionsZookeeper QuorumQuorumA comma-separated list of servers in a Zookeeper cluster.For example,:5181, :5181,â¦ClusterIDName of the drillbit cluster. The default is drillbits1. You may need to specify a different value if the cluster ID was changed in the drill-override.conf file.Direct to Drillbit Provide the IP address or host name of the Drill server and the port number that that the Drill server is listening on. The port number defaults to 31010. You may need to specify a different value if the port number was changed in the drill-override.conf file. + For information on selecting the appropriate connection type, see [Connection +Types](/drill/docs/step-2-configure-odbc-connections-to-drill-data-sources#connection-type). + 8. In the **Default Schema** field, select the default schema that you want to connect to. + For more information about the schemas that appear in this list, see Schemas. + 9. Optionally, perform one of the following operations: + + OptionActionUpdate the configuration of the advanced properties.Edit the default values in the Advanced Properties section. For more information, see Advanced Properties.Configure the types of events that you want the driver to log.Click Logging Options. For more information, see Logging Options.Create views or explore Drill sources.Click Drill Explorer. For more information, see Using Drill Explorer to Browse Data and Create Views. + 10. Click **OK** to save the DSN. + +## Configuration Options + +### Connection Type + +ODBC can connect directly to a Drillbit or to a ZooKeeper Quorum. Select your +connection type based on your environment and Drillbit configuration. 
+ +The following table lists the appropriate connection type for each scenario: + +ScenarioConnection TypeDrillbit is running in embedded mode.Direct to DrillbitDrillbit is registered with the ZooKeeper in a testing environment.ZooKeeper Quorum or Direct to DrillbitDrillbit is registered with the ZooKeeper in a production environment.ZooKeeper Quorum + + Connection to Zookeeper Quorum + +When you choose to connect to a ZooKeeper Quorum, the ODBC driver connects to +the ZooKeeper Quorum to get a list of available Drillbits in the specified +cluster. Then, the ODBC driver submits a query after selecting a Drillbit. All +Drillbits in the cluster process the query and the Drillbit that received the +query returns the query results. + +![ODBC to Quorum]({{ site.baseurl }}/docs/img/ODBC_to_Quorum.png) + +In a production environment, you should connect to a ZooKeeper Quorum for a +more reliable connection. If one Drillbit is not available, another Drillbit +that is
[09/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/datasets/001-aol.md -- diff --git a/_docs/drill-docs/datasets/001-aol.md b/_docs/drill-docs/datasets/001-aol.md deleted file mode 100644 index 472f52f..000 --- a/_docs/drill-docs/datasets/001-aol.md +++ /dev/null @@ -1,47 +0,0 @@ -title: "AOL Search" -parent: "Sample Datasets" -## Quick Stats - -The [AOL Search dataset](http://en.wikipedia.org/wiki/AOL_search_data_leak) is -a collection of real query log data that is based on real users. - -## The Data Source - -The dataset consists of 20M Web queries from 650k users over a period of three -months, 440MB in total and available [for -download](http://zola.di.unipi.it/smalltext/datasets.html). The format used in -the dataset is: - -AnonID, Query, QueryTime, ItemRank, ClickURL - -... with: - - * AnonID, an anonymous user ID number. - * Query, the query issued by the user, case shifted with most punctuation removed. - * QueryTime, the time at which the query was submitted for search. - * ItemRank, if the user clicked on a search result, the rank of the item on which they clicked is listed. - * [ClickURL](http://www.dietkart.com/), if the user clicked on a search result, the domain portion of the URL in the clicked result is listed. - -Each line in the data represents one of two types of events - - * A query that was NOT followed by the user clicking on a result item. - * A click through on an item in the result list returned from a query. - -In the first case (query only) there is data in only the first three columns, -in the second case (click through), there is data in all five columns. For -click through events, the query that preceded the click through is included. -Note that if a user clicked on more than one result in the list returned from -a single query, there will be TWO lines in the data to represent the two -events. 
- -## The Queries - -Interesting queries, for example - - * Users querying for topic X - * Users that click on the first (second, third) ranked item - * TOP 10 domains searched - * TOP 10 domains clicked at - http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/datasets/002-enron.md -- diff --git a/_docs/drill-docs/datasets/002-enron.md b/_docs/drill-docs/datasets/002-enron.md deleted file mode 100644 index 2ddbef6..000 --- a/_docs/drill-docs/datasets/002-enron.md +++ /dev/null @@ -1,21 +0,0 @@ -title: "Enron Emails" -parent: "Sample Datasets" -## Quick Stats - -The [Enron Email dataset](http://www.cs.cmu.edu/~enron/) contains data from -about 150 users, mostly senior management of Enron. - -## The Data Source - -Totalling some 500,000 messages, the [raw -data](http://www.cs.cmu.edu/~enron/enron_mail_20110402.tgz) (2009 version of -the dataset; ~423MB) is available for download as well as a [MySQL -dump](ftp://ftp.isi.edu/sims/philpot/data/enron-mysqldump.sql.gz) (~177MB). - -## The Queries - -Interesting queries, for example - - * Via [Query Dataset for Email Search](https://dbappserv.cis.upenn.edu/spell/) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/datasets/003-wikipedia.md -- diff --git a/_docs/drill-docs/datasets/003-wikipedia.md b/_docs/drill-docs/datasets/003-wikipedia.md deleted file mode 100644 index 99e6e24..000 --- a/_docs/drill-docs/datasets/003-wikipedia.md +++ /dev/null @@ -1,105 +0,0 @@ -title: "Wikipedia Edit History" -parent: "Sample Datasets" -# Quick Stats - -The Wikipedia Edit History is a public dump of the website made available by -the wikipedia foundation. You can find details -[here](http://en.wikipedia.org/wiki/Wikipedia:Database_download). The dumps -are made available as SQL or XML dumps. 
You can find the entire schema drawn -together in this great [diagram](http://upload.wikimedia.org/wikipedia/commons -/thumb/4/42/MediaWiki_1.20_%2844edaa2%29_database_schema.svg/2193px- -MediaWiki_1.20_%2844edaa2%29_database_schema.svg.png). - -# Approach - -The _main_ distribution files are: - - * Current Pages: As of January 2013 this SQL dump was 9.0GB in its compressed format. - * Complete Archive: This is what we actually want, but at a size of multiple terabytes, clearly exceeds the storage available at home. - -To have some real historic data, it is recommended to download a _Special -Export_ use this -[link](http://en.wikipedia.org/w/index.php?title=Special:Export). Using this -tool you generate a category specific XML dump and configure various export -options. There are some limits like a maximum of 1000 revisions per export, -but otherwise this should work out just fine. - -![](../../img/Overview.png) - -The entities used in the query use cases. - -# Use Cases - -## Select Change Volume
[07/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/query/query-fs/001-query-json.md -- diff --git a/_docs/drill-docs/query/query-fs/001-query-json.md b/_docs/drill-docs/query/query-fs/001-query-json.md deleted file mode 100644 index 048903b..000 --- a/_docs/drill-docs/query/query-fs/001-query-json.md +++ /dev/null @@ -1,41 +0,0 @@ -title: "Querying JSON Files" -parent: "Querying a File System" -Your Drill installation includes a sample JSON file located in Drill's -classpath. The sample JSON file, `employee.json`, contains fictitious employee -data. Use SQL syntax to query the sample `JSON` file. - -To view the data in the `employee.json` file, submit the following SQL query -to Drill: - -``0: jdbc:drill:zk=local> SELECT * FROM cp.`employee.json`;`` - -The query returns the following results: - -**Example of partial output** - - +-++++-+---+ -| employee_id | full_name | first_name | last_name | position_id | position_ | - +-++++-+---+ -| 1101| Steve Eurich | Steve | Eurich | 16 | Store T | -| 1102| Mary Pierson | Mary | Pierson| 16 | Store T | -| 1103| Leo Jones | Leo| Jones | 16 | Store Tem | -| 1104| Nancy Beatty | Nancy | Beatty | 16 | Store T | -| 1105| Clara McNight | Clara | McNight| 16 | Store | -| 1106| Marcella Isaacs | Marcella | Isaacs | 17 | Stor | -| 1107| Charlotte Yonce | Charlotte | Yonce | 17 | Stor | -| 1108| Benjamin Foster | Benjamin | Foster | 17 | Stor | -| 1109| John Reed | John | Reed | 17 | Store Per | -| 1110| Lynn Kwiatkowski | Lynn | Kwiatkowski | 17 | St | -| | Donald Vann | Donald | Vann | 17 | Store Pe | -| 1112| William Smith | William| Smith | 17 | Store | -| 1113| Amy Hensley | Amy| Hensley| 17 | Store Pe | -| 1114| Judy Owens | Judy | Owens | 17 | Store Per | -| 1115| Frederick Castillo | Frederick | Castillo | 17 | S | -| 1116| Phil Munoz | Phil | Munoz | 17 | Store Per | -| 1117| Lori Lightfoot | Lori | Lightfoot | 17 | Store | -... 
- +-++++-+---+ -1,155 rows selected (0.762 seconds) -0: jdbc:drill:zk=local> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/query/query-fs/002-query-parquet.md -- diff --git a/_docs/drill-docs/query/query-fs/002-query-parquet.md b/_docs/drill-docs/query/query-fs/002-query-parquet.md deleted file mode 100644 index 9b4e874..000 --- a/_docs/drill-docs/query/query-fs/002-query-parquet.md +++ /dev/null @@ -1,99 +0,0 @@ -title: "Querying Parquet Files" -parent: "Querying a File System" -Your Drill installation includes a `sample-date` directory with Parquet files -that you can query. Use SQL syntax to query the `region.parquet` and -`nation.parquet` files in the `sample-data` directory. - -**Note:** Your Drill installation location may differ from the examples used here. The examples assume that Drill was installed in embedded mode on your machine following the [Apache Drill in 10 Minutes ](https://cwiki.apache.org/confluence/display/DRILL/Apache+Drill+in+10+Minutes)tutorial. If you installed Drill in distributed mode, or your `sample-data` directory differs from the location used in the examples, make sure to change the `sample-data` directory to the correct location before you run the queries. - - Region File - -If you followed the Apache Drill in 10 Minutes instructions to install Drill -in embedded mode, the path to the parquet file varies between operating -systems. - -To view the data in the `region.parquet` file, issue the query appropriate for -your operating system: - - * Linux -``SELECT * FROM dfs.`/opt/drill/apache-drill-0.4.0-incubating/sample- -data/region.parquet`; `` - - * Mac OS X -``SELECT * FROM dfs.`/Users/max/drill/apache-drill-0.4.0-incubating/sample- -data/region.parquet`;`` - - * Windows -``SELECT * FROM dfs.`C:\drill\apache-drill-0.4.0-incubating\sample- -data\region.parquet`;`` - -The query returns the following results: - -+++ -| EXPR$0 | EXPR$1 | -+++ -| AFRICA | lar deposits. 
blithely final packages cajole. regular waters ar | -
[13/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
DRILL-2315: Confluence conversion plus fixes Project: http://git-wip-us.apache.org/repos/asf/drill/repo Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/d959a210 Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/d959a210 Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/d959a210 Branch: refs/heads/gh-pages Commit: d959a210053f02b5069f0a0cb9f0d34131640ffb Parents: 23f82db Author: Kristine Hahn Authored: Thu Jan 15 19:42:12 2015 -0800 Committer: Bridget Bevens Committed: Wed Feb 25 16:22:24 2015 -0800 -- .gitignore | 1 + _docs/001-arch.md | 49 +++ _docs/001-drill-docs.md | 4 - _docs/002-tutorial.md | 51 +++ _docs/003-yelp.md | 412 ++ _docs/004-install.md| 13 + _docs/005-connect.md| 41 ++ _docs/006-interfaces.md | 50 +++ _docs/007-query.md | 41 ++ _docs/008-sql-ref.md| 14 + _docs/009-dev-custom-func.md| 37 ++ _docs/010-manage.md | 14 + _docs/011-develop.md| 9 + _docs/012-rn.md | 191 + _docs/013-contribute.md | 9 + _docs/014-sample-ds.md | 10 + _docs/015-design.md | 13 + _docs/016-progress.md | 8 + _docs/017-archived-pages.md | 8 + _docs/018-bylaws.md | 170 _docs/arch/001-core-mod.md | 29 ++ _docs/arch/002-arch-hilite.md | 10 + _docs/arch/arch-hilite/001-flexibility.md | 78 _docs/arch/arch-hilite/002-performance.md | 55 +++ _docs/archive/001-how-to-demo.md| 309 ++ _docs/archive/002-meet-drill.md | 41 ++ _docs/connect/001-plugin-reg.md | 35 ++ _docs/connect/002-workspaces.md | 74 _docs/connect/003-reg-fs.md | 64 +++ _docs/connect/004-reg-hbase.md | 32 ++ _docs/connect/005-reg-hive.md | 83 _docs/connect/006-default-frmt.md | 60 +++ _docs/connect/007-mongo-plugin.md | 167 _docs/connect/008-mapr-db-plugin.md | 31 ++ _docs/contribute/001-guidelines.md | 229 ++ _docs/contribute/002-ideas.md | 158 +++ _docs/datasets/001-aol.md | 47 +++ _docs/datasets/002-enron.md | 19 + _docs/datasets/003-wikipedia.md | 105 + _docs/design/001-plan.md| 25 ++ _docs/design/002-rpc.md | 19 + _docs/design/003-query-stages.md| 42 ++ _docs/design/004-research.md| 48 +++ 
_docs/design/005-value.md | 163 +++ _docs/dev-custom-fcn/001-dev-simple.md | 50 +++ _docs/dev-custom-fcn/002-dev-aggregate.md | 55 +++ _docs/dev-custom-fcn/003-add-custom.md | 26 ++ _docs/dev-custom-fcn/004-use-custom.md | 55 +++ _docs/dev-custom-fcn/005-cust-interface.md | 8 + _docs/develop/001-compile.md| 37 ++ _docs/develop/002-setup.md | 5 + _docs/develop/003-patch-tool.md | 160 +++ _docs/drill-docs/001-arch.md| 58 --- _docs/drill-docs/002-tutorial.md| 58 --- _docs/drill-docs/003-yelp.md| 402 -- _docs/drill-docs/004-install.md | 20 - _docs/drill-docs/005-connect.md | 49 --- _docs/drill-docs/006-query.md | 57 --- _docs/drill-docs/006-sql-ref.md | 25 -- _docs/drill-docs/007-dev-custom-func.md | 47 --- _docs/drill-docs/008-manage.md | 23 - _docs/drill-docs/009-develop.md | 16 - _docs/drill-docs/010-rn.md | 192 - _docs/drill-docs/011-contribute.md | 11 - _docs/drill-docs/012-sample-ds.md | 11 - _docs/drill-docs/013-design.md | 14 - _docs/drill-docs/014-progress.md| 9 - _docs/drill-docs/015-archived-pages.md | 9 - _docs/drill-docs/016-bylaws.md | 171 _docs/drill-docs/arch/001-core-mod.md | 30 -- _docs/drill-docs/arch/002-arch-hilite.md| 15 - .../arch/arch-hilite/001-flexibility.md | 79 .../arch/arch-hilite/002-performance.md | 56 --- _docs/drill-docs/archive/001-how-to-demo.md | 309 ---
[01/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
Repository: drill Updated Branches: refs/heads/gh-pages 23f82db9f -> d959a2100 http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/tutorial/005-lesson3.md -- diff --git a/_docs/tutorial/005-lesson3.md b/_docs/tutorial/005-lesson3.md new file mode 100644 index 000..f6c7ae4 --- /dev/null +++ b/_docs/tutorial/005-lesson3.md @@ -0,0 +1,379 @@ +--- +title: "Lession 3: Run Queries on Complex Data Types" +parent: "Apache Drill Tutorial" +--- +## Goal + +This lesson focuses on queries that exercise functions and operators on self- +describing data and complex data types. Drill offers intuitive SQL extensions +to work with such data and offers high query performance with an architecture +built from the ground up for complex data. + +## Queries in This Lesson + +Now that you have run ANSI SQL queries against different tables and files with +relational data, you can try some examples including complex types. + + * Access directories and subdirectories of files in a single SELECT statement. + * Demonstrate simple ways to access complex data in JSON files. + * Demonstrate the repeated_count function to aggregate values in an array. + +## Query Partitioned Directories + +You can use special variables in Drill to refer to subdirectories in your +workspace path: + + * dir0 + * dir1 + * ⦠+ +Note that these variables are dynamically determined based on the partitioning +of the file system. No up-front definitions are required on what partitions +exist. 
Here is a visual example of how this works: + +![drill query flow]({{ site.baseurl }}/docs/img/example_query.png) + +### Set workspace to dfs.logs: + +0: jdbc:drill:> use dfs.logs; ++++ +| ok | summary | ++++ +| true | Default schema changed to 'dfs.logs' | ++++ + +### Query logs data for a specific year: + +0: jdbc:drill:> select * from logs where dir0='2013' limit 10; + +++++++++++---++ +| dir0 | dir1 | trans_id | date | time | cust_id | device | state | camp_id | keywords | prod_id | purch_flag | + +++++++++++---++ +| 2013 | 11 | 12119 | 11/09/2013 | 02:24:51 | 262 | IOS5 | ny | 0 | chamber | 198 | false | +| 2013 | 11 | 12120 | 11/19/2013 | 09:37:43 | 0 | AOS4.4 | il | 2 | outside | 511 | false | +| 2013 | 11 | 12134 | 11/10/2013 | 23:42:47 | 60343 | IOS5 | ma | 4 | and | 421 | false | +| 2013 | 11 | 12135 | 11/16/2013 | 01:42:13 | 46762 | AOS4.3 | ca | 4 | here's | 349 | false | +| 2013 | 11 | 12165 | 11/26/2013 | 21:58:09 | 41987 | AOS4.2 | mn | 4 | he | 271 | false | +| 2013 | 11 | 12168 | 11/09/2013 | 23:41:48 | 8600 | IOS5 | in | 6 | i | 459 | false | +| 2013 | 11 | 12196 | 11/20/2013 | 02:23:06 | 15603 | IOS5 | tn | 1 | like | 324 | false | +| 2013 | 11 | 12203 | 11/25/2013 | 23:50:29 | 221 | IOS6 | tx | 10 | if | 323 | false | +| 2013 | 11 | 12206 | 11/09/2013 | 23:53:01 | 2488 | AOS4.2 | tx | 14 | unlike | 296 | false | +| 2013 | 11 | 12217 | 11/06/2013 | 23:51:56 | 0 | AOS4.2 | tx | 9 | can't | 54 | false | + +++++++++++++ + + +This query constrains files inside the subdirectory named 2013. The variable +dir0 refers to the first level down from logs, dir1 to the next level, and so +on. So this query returned 10 of the rows for February 2013. + +### Further constrain the results using multiple predicates in the query: + +This query returns a list of customer IDs for people who made a purchase via +an IOS5 device in August 2013. 
+ +0: jdbc:drill:> select dir0 as yr, dir1 as mth, cust_id from logs +where dir0='2013' and dir1='8' and device='IOS5' and purch_flag='true' +order by `date`; +++++ +| yr | mth | cust_id | +++++ +| 2013 | 8 | 4 | +| 2013 | 8 | 521 | +| 2013 | 8 | 1 | +| 2013 | 8 | 2 | +| 2013 | 8 | 4 | +| 2013 | 8 | 549 | +| 2013 | 8 | 72827 | +| 2013 | 8 | 38127 | +... + +### Return monthly counts per customer for a given year: + +0: jdbc:drill:> select cust_id, dir1 month_no, count(*) month_count from logs +where dir0=2014 group by cust_id, dir1 order by cust_id, month_no limit 10; ++++-+ +| cust_id | month_no | month_count | ++++-+ +| 0 | 1 | 143 | +| 0 | 2 | 118 | +| 0 | 3
[03/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/query/005-query-info-skema.md -- diff --git a/_docs/query/005-query-info-skema.md b/_docs/query/005-query-info-skema.md new file mode 100644 index 000..1ad0008 --- /dev/null +++ b/_docs/query/005-query-info-skema.md @@ -0,0 +1,109 @@ +--- +title: "Querying the INFORMATION SCHEMA" +parent: "Query Data" +--- +When you are using Drill to connect to multiple data sources, you need a +simple mechanism to discover what each data source contains. The information +schema is an ANSI standard set of metadata tables that you can query to return +information about all of your Drill data sources (or schemas). Data sources +may be databases or file systems; they are all known as "schemas" in this +context. You can query the following INFORMATION_SCHEMA tables: + + * SCHEMATA + * CATALOGS + * TABLES + * COLUMNS + * VIEWS + +## SCHEMATA + +The SCHEMATA table contains the CATALOG_NAME and SCHEMA_NAME columns. To allow +maximum flexibility inside BI tools, the only catalog that Drill supports is +`DRILL`. + +0: jdbc:drill:zk=local> select CATALOG_NAME, SCHEMA_NAME as all_my_data_sources from INFORMATION_SCHEMA.SCHEMATA order by SCHEMA_NAME; ++--+-+ +| CATALOG_NAME | all_my_data_sources | ++--+-+ +| DRILL| INFORMATION_SCHEMA | +| DRILL| cp.default | +| DRILL| dfs.default | +| DRILL| dfs.root| +| DRILL| dfs.tmp | +| DRILL| HiveTest.SalesDB| +| DRILL| maprfs.logs | +| DRILL| sys | ++--+-+ + +The INFORMATION_SCHEMA name and associated keywords are case-sensitive. You +can also return a list of schemas by running the SHOW DATABASES command: + +0: jdbc:drill:zk=local> show databases; ++-+ +| SCHEMA_NAME | ++-+ +| dfs.default | +| dfs.root| +| dfs.tmp | +... + +## CATALOGS + +The CATALOGS table returns only one row, with the hardcoded DRILL catalog name +and description. + +## TABLES + +The TABLES table returns the table name and type for each table or view in +your databases. (Type means TABLE or VIEW.) 
Note that Drill does not return +files available for querying in file-based data sources. Instead, use SHOW +FILES to explore these data sources. + +## COLUMNS + +The COLUMNS table returns the column name and other metadata (such as the data +type) for each column in each table or view. + +## VIEWS + +The VIEWS table returns the name and definition for each view in your +databases. Note that file schemas are the canonical repository for views in +Drill. Depending on how you create a view, the view may only be displayed in Drill +after it has been used. + +## Useful Queries + +Run an ``INFORMATION_SCHEMA.`TABLES` ``query to view all of the tables and views +within a database. TABLES is a reserved word in Drill and requires back ticks +(`). + +For example, the following query identifies all of the tables and views that +Drill can access: + +SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE +FROM INFORMATION_SCHEMA.`TABLES` +ORDER BY TABLE_NAME DESC; + +TABLE_SCHEMA TABLE_NAMETABLE_TYPE + +HiveTest.CustomersDB Customers TABLE +HiveTest.SalesDB OrdersTABLE +HiveTest.SalesDB OrderLinesTABLE +HiveTest.SalesDB USOrders VIEW +dfs.default CustomerSocialProfile VIEW + + +**Note:** Currently, Drill only supports querying Drill views; Hive views are not yet supported. 
+ +You can run a similar query to identify columns in tables and the data types +of those columns: + +SELECT COLUMN_NAME, DATA_TYPE +FROM INFORMATION_SCHEMA.COLUMNS +WHERE TABLE_NAME = 'Orders' AND TABLE_SCHEMA = 'HiveTest.SalesDB' AND COLUMN_NAME LIKE '%Total'; ++-++ +| COLUMN_NAME | DATA_TYPE | ++-++ +| OrderTotal | Decimal| ++-++ + http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/query/006-query-sys-tbl.md -- diff --git a/_docs/query/006-query-sys-tbl.md b/_docs/query/006-query-sys-tbl.md new file mode 100644 index 000..9b853ec --- /dev/null +++ b/_docs/query/006-query-sys-tbl.md @@ -0,0 +1,159 @@ +--- +title: "Querying System Tables" +parent: "Query Data" +--- +Drill has a sys database that contains system tables. You can query the system +tables for in
[04/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/interfaces/odbc-win/002-conf-odbc-win.md -- diff --git a/_docs/interfaces/odbc-win/002-conf-odbc-win.md b/_docs/interfaces/odbc-win/002-conf-odbc-win.md new file mode 100644 index 000..636bd9f --- /dev/null +++ b/_docs/interfaces/odbc-win/002-conf-odbc-win.md @@ -0,0 +1,143 @@ +--- +title: "Step 2. Configure ODBC Connections to Drill Data Sources" +parent: "Using the MapR ODBC Driver on Windows" +--- +Complete one of the following steps to create an ODBC connection to Drill data +sources: + + * Create a Data Source Name + * Create an ODBC Connection String + +**Prerequisite:** An Apache Drill installation must be available that is configured to access the data sources that you want to connect to. For information about how to install Apache Drill, see [Install Drill](/drill/docs/install-drill). For information about configuring data sources, see the [Apache Drill documentation](/drill/docs). + +## Create a Data Source Name (DSN) + +Create a DSN that an application can use to connect to Drill data sources. If +you want to create a DSN for a 32-bit application, you must use the 32-bit +version of the ODBC Administrator to create the DSN. + + 1. To launch the ODBC Administrator, click **Start > All Programs > MapR Drill ODBC Driver 1.0 (32|64-bit) > (32|64-bit) ODBC Administrator**. +The ODBC Data Source Administrator window appears. + + To launch the 32-bit version of the ODBC driver on a 64-bit machine, run: +`C:\WINDOWS\SysWOW64\odbcad32.exe`. + 2. Click the **System DSN** tab to create a system DSN or click the **User DSN** tab to create a user DSN. A system DSN is available for all users who log in to the machine. A user DSN is available to the user who creates the DSN. + 3. Click **Add**. + 4. Select **MapR Drill ODBC Driver** and click **Finish**. + The _MapR Drill ODBC Driver DSN Setup_ window appears. + 5. In the **Data Source Name** field, enter a name for the DSN, + 6. 
Optionally, enter a description of the DSN in the Description field. + 7. In the Connection Type section, select a connection type and enter the associated connection details: + + Connection TypePropertiesDescriptionsZookeeper QuorumQuorumA comma-separated list of servers in a Zookeeper cluster.For example,:5181, :5181,â¦ClusterIDName of the drillbit cluster. The default is drillbits1. You may need to specify a different value if the cluster ID was changed in the drill-override.conf file.Direct to Drillbit Provide the IP address or host name of the Drill server and the port number that that the Drill server is listening on. The port number defaults to 31010. You may need to specify a different value if the port number was changed in the drill-override.conf file. + For information on selecting the appropriate connection type, see [Connection +Types](/drill/docs/step-2-configure-odbc-connections-to-drill-data-sources#connection-type). + 8. In the **Default Schema** field, select the default schema that you want to connect to. + For more information about the schemas that appear in this list, see Schemas. + 9. Optionally, perform one of the following operations: + + OptionActionUpdate the configuration of the advanced properties.Edit the default values in the Advanced Properties section. For more information, see Advanced Properties.Configure the types of events that you want the driver to log.Click Logging Options. For more information, see Logging Options.Create views or explore Drill sources.Click Drill Explorer. For more information, see Using Drill Explorer to Browse Data and Create Views. + 10. Click **OK** to save the DSN. + +## Configuration Options + +### Connection Type + +ODBC can connect directly to a Drillbit or to a ZooKeeper Quorum. Select your +connection type based on your environment and Drillbit configuration. 
+ +The following table lists the appropriate connection type for each scenario: + +ScenarioConnection TypeDrillbit is running in embedded mode.Direct to DrillbitDrillbit is registered with the ZooKeeper in a testing environment.ZooKeeper Quorum or Direct to DrillbitDrillbit is registered with the ZooKeeper in a production environment.ZooKeeper Quorum + + Connection to Zookeeper Quorum + +When you choose to connect to a ZooKeeper Quorum, the ODBC driver connects to +the ZooKeeper Quorum to get a list of available Drillbits in the specified +cluster. Then, the ODBC driver submits a query after selecting a Drillbit. All +Drillbits in the cluster process the query and the Drillbit that received the +query returns the query results. + +![ODBC to Quorum]({{ site.baseurl }}/docs/img/ODBC_to_Quorum.png) + +In a production environment, you should connect to a ZooKeeper Quorum for a +more reliable connection. If one Drillbit is not available, another Drillbit +that is
[12/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/arch/001-core-mod.md -- diff --git a/_docs/arch/001-core-mod.md b/_docs/arch/001-core-mod.md new file mode 100644 index 000..17fa18d --- /dev/null +++ b/_docs/arch/001-core-mod.md @@ -0,0 +1,29 @@ +--- +title: "Core Modules within a Drillbit" +parent: "Architectural Overview" +--- +The following image represents components within each Drillbit: + +![drill query flow]({{ site.baseurl }}/docs/img/DrillbitModules.png) + +The following list describes the key components of a Drillbit: + + * **RPC end point**: Drill exposes a low overhead protobuf-based RPC protocol to communicate with the clients. Additionally, a C++ and Java API layers are also available for the client applications to interact with Drill. Clients can communicate to a specific Drillbit directly or go through a ZooKeeper quorum to discover the available Drillbits before submitting queries. It is recommended that the clients always go through ZooKeeper to shield clients from the intricacies of cluster management, such as the addition or removal of nodes. + + * **SQL parser**: Drill uses Optiq, the open source framework, to parse incoming queries. The output of the parser component is a language agnostic, computer-friendly logical plan that represents the query. + * **Storage plugin interfaces**: Drill serves as a query layer on top of several data sources. Storage plugins in Drill represent the abstractions that Drill uses to interact with the data sources. Storage plugins provide Drill with the following information: +* Metadata available in the source +* Interfaces for Drill to read from and write to data sources +* Location of data and a set of optimization rules to help with efficient and faster execution of Drill queries on a specific data source + +In the context of Hadoop, Drill provides storage plugins for files and +HBase/M7. 
Drill also integrates with Hive as a storage plugin since Hive +provides a metadata abstraction layer on top of files, HBase/M7, and provides +libraries to read data and operate on these sources (Serdes and UDFs). + +When users query files and HBase/M7 with Drill, they can do it directly or go +through Hive if they have metadata defined there. Drill integration with Hive +is only for metadata. Drill does not invoke the Hive execution engine for any +requests. + + * **Distributed cache**: Drill uses a distributed cache to manage metadata (not the data) and configuration information across various nodes. Sample metadata information that is stored in the cache includes query plan fragments, intermediate state of the query execution, and statistics. Drill uses Infinispan as its cache technology. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/arch/002-arch-hilite.md -- diff --git a/_docs/arch/002-arch-hilite.md b/_docs/arch/002-arch-hilite.md new file mode 100644 index 000..5ac51bc --- /dev/null +++ b/_docs/arch/002-arch-hilite.md @@ -0,0 +1,10 @@ +--- +title: "Architectural Highlights" +parent: "Architectural Overview" +--- +The goal for Drill is to bring the **SQL Ecosystem** and **Performance** of +the relational systems to **Hadoop scale** data **WITHOUT** compromising on +the **Flexibility** of Hadoop/NoSQL systems. There are several core +architectural elements in Apache Drill that make it a highly flexible and +efficient query engine. 
+ http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/arch/arch-hilite/001-flexibility.md -- diff --git a/_docs/arch/arch-hilite/001-flexibility.md b/_docs/arch/arch-hilite/001-flexibility.md new file mode 100644 index 000..0b5c5e3 --- /dev/null +++ b/_docs/arch/arch-hilite/001-flexibility.md @@ -0,0 +1,78 @@ +--- +title: "Flexibility" +parent: "Architectural Highlights" +--- +The following features contribute to Drill's flexible architecture: + +**_Dynamic schema discovery_** + +Drill does not require schema or type specification for the data in order to +start the query execution process. Instead, Drill starts processing the data +in units called record-batches and discovers the schema on the fly during +processing. Self-describing data formats such as Parquet, JSON, AVRO, and +NoSQL databases have schema specified as part of the data itself, which Drill +leverages dynamically at query time. Schema can change over the course of a +Drill query, so all of the Drill operators are designed to reconfigure +themselves when such schema changing events occur. + +**_Flexible data model_** + +Drill is purpose-built from the ground up for complex/multi-structured data +commonly seen in Hadoop/NoSQL applications such as social/mobile, clickstream, +logs, and sensor equipped IOT. From a user point of view, Drill a
[06/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/sql-ref/nested/001-flatten.md -- diff --git a/_docs/drill-docs/sql-ref/nested/001-flatten.md b/_docs/drill-docs/sql-ref/nested/001-flatten.md deleted file mode 100644 index 124db91..000 --- a/_docs/drill-docs/sql-ref/nested/001-flatten.md +++ /dev/null @@ -1,89 +0,0 @@ -title: "FLATTEN Function" -parent: "Nested Data Functions" -The FLATTEN function is useful for flexible exploration of repeated data. -FLATTEN separates the elements in a repeated field into individual records. To -maintain the association between each flattened value and the other fields in -the record, all of the other columns are copied into each new record. A very -simple example would turn this data (one record): - -{ - "x" : 5, - "y" : "a string", - "z" : [ 1,2,3] -} - -into three distinct records: - -select flatten(z) from table; -| x | y | z | -+-++---+ -| 5 | "a string" | 1 | -| 5 | "a string" | 2 | -| 5 | "a string" | 3 | - -The function takes a single argument, which must be an array (the `z` column -in this example). - - - -For a more interesting example, consider the JSON data in the publicly -available [Yelp](https://www.yelp.com/dataset_challenge/dataset) data set. The -first query below returns three columns from the -`yelp_academic_dataset_business.json` file: `name`, `hours`, and `categories`. -The query is restricted to distinct rows where the name is `zpizza`. 
The -query returns only one row that meets those criteria; however, note that this -row contains an array of four categories: - -0: jdbc:drill:zk=local> select distinct name, hours, categories -from dfs.yelp.`yelp_academic_dataset_business.json` -where name ='zpizza'; -++++ -|name| hours| categories | -++++ -| zpizza | {"Tuesday":{"close":"22:00","open":"10:00"},"Friday":{"close":"23:00","open":"10:00"},"Monday":{"close":"22:00","open":"10:00"},"Wednesday":{"close":"22:00","open":"10:00"},"Thursday":{"close":"22:00","open":"10:00"},"Sunday":{"close":"22:00","open":"10:00"},"Saturday":{"close":"23:00","open":"10:00"}} | ["Gluten-Free","Pizza","Vegan","Restaurants"] | - -The FLATTEN function can operate on this single row and return multiple rows, -one for each category: - -0: jdbc:drill:zk=local> select distinct name, flatten(categories) as categories -from dfs.yelp.`yelp_academic_dataset_business.json` -where name ='zpizza' order by 2; -++-+ -|name| categories | -++-+ -| zpizza | Gluten-Free | -| zpizza | Pizza | -| zpizza | Restaurants | -| zpizza | Vegan | -++-+ -4 rows selected (2.797 seconds) - -Having used the FLATTEN function to break down arrays into distinct rows, you -can run queries that do deeper analysis on the flattened result set. For -example, you can use FLATTEN in a subquery, then apply WHERE clause -constraints or aggregate functions to the results in the outer query. 
- -The following query uses the same data file as the previous query to flatten -the categories array, then run a COUNT function on the flattened result: - -select celltbl.catl, count(celltbl.catl) catcount -from (select flatten(categories) catl -from dfs.yelp.`yelp_academic_dataset_business.json`) celltbl -group by celltbl.catl -order by count(celltbl.catl) desc limit 5; - -+---++ -|catl | catcount | -+---++ -| Restaurants | 14303 | -| Shopping | 6428 | -| Food | 5209 | -| Beauty & Spas | 3421 | -| Nightlife | 2870 | -+---++ - -A common use case for FLATTEN is its use in conjunction with the -[KVGEN](/confluence/display/DRILL/KVGEN+Function) function. - http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/sql-ref/nested/002-kvgen.md -- diff --git a/_docs/drill-docs/sql-ref/nested/002-kvgen.md b/_docs/drill-docs/sql-ref/nested/002-kvgen.md deleted file mode 100644 index a27a781..000 --- a/_docs/drill-docs/sql-ref/nested/002-kvgen.md +++ /dev/null @@ -1,150 +0,0 @@ -title: "KVGEN Function" -parent: "Nested Data Functions" -KVGEN stands for _key-value generation_. This function is useful when complex -data files contain arbitrary maps that consist of relatively "unknown" column -names. Instead of having to specify columns in the map to access the data, you -can use KVGEN to ret
[10/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/016-bylaws.md -- diff --git a/_docs/drill-docs/016-bylaws.md b/_docs/drill-docs/016-bylaws.md deleted file mode 100644 index 6f2604f..000 --- a/_docs/drill-docs/016-bylaws.md +++ /dev/null @@ -1,171 +0,0 @@ -title: "Project Bylaws" -parent: "Apache Drill Documentation" -# Introduction - -This document defines the bylaws under which the Apache Drill project -operates. It defines the roles and responsibilities of the project, who may -vote, how voting works, how conflicts are resolved, etc. - -Drill is a project of the [Apache Software -Foundation](http://www.apache.org/foundation/). The foundation holds the -copyright on Apache code including the code in the Drill codebase. The -[foundation FAQ](http://www.apache.org/foundation/faq.html) explains the -operation and background of the foundation. - -Drill is typical of Apache projects in that it operates under a set of -principles, known collectively as the _Apache Way_. If you are new to Apache -development, please refer to the [Incubator -project](http://incubator.apache.org/) for more information on how Apache -projects operate. - -# Roles and Responsibilities - -Apache projects define a set of roles with associated rights and -responsibilities. These roles govern what tasks an individual may perform -within the project. The roles are defined in the following sections. - -## Users - -The most important participants in the project are people who use our -software. The majority of our contributors start out as users and guide their -development efforts from the user's perspective. - -Users contribute to the Apache projects by providing feedback to contributors -in the form of bug reports and feature suggestions. As well, users participate -in the Apache community by helping other users on mailing lists and user -support forums. 
- -## Contributors - -All of the volunteers who are contributing time, code, documentation, or -resources to the Drill Project. A contributor that makes sustained, welcome -contributions to the project may be invited to become a committer, though the -exact timing of such invitations depends on many factors. - -## Committers - -The project's committers are responsible for the project's technical -management. Committers have access to a specified set of subproject's code -repositories. Committers on subprojects may cast binding votes on any -technical discussion regarding that subproject. - -Committer access is by invitation only and must be approved by lazy consensus -of the active PMC members. A Committer is considered _emeritus_ by his or her -own declaration or by not contributing in any form to the project for over six -months. An emeritus committer may request reinstatement of commit access from -the PMC which will be sufficient to restore him or her to active committer -status. - -Commit access can be revoked by a unanimous vote of all the active PMC members -(except the committer in question if he or she is also a PMC member). - -All Apache committers are required to have a signed [Contributor License -Agreement (CLA)](http://www.apache.org/licenses/icla.txt) on file with the -Apache Software Foundation. There is a [Committer -FAQ](http://www.apache.org/dev/committers.html) which provides more details on -the requirements for committers. - -A committer who makes a sustained contribution to the project may be invited -to become a member of the PMC. The form of contribution is not limited to -code. It can also include code review, helping out users on the mailing lists, -documentation, etc. - -## Project Management Committee - -The PMC is responsible to the board and the ASF for the management and -oversight of the Apache Drill codebase. The responsibilities of the PMC -include - - * Deciding what is distributed as products of the Apache Drill project. 
In particular all releases must be approved by the PMC. - * Maintaining the project's shared resources, including the codebase repository, mailing lists, websites. - * Speaking on behalf of the project. - * Resolving license disputes regarding products of the project. - * Nominating new PMC members and committers. - * Maintaining these bylaws and other guidelines of the project. - -Membership of the PMC is by invitation only and must be approved by a lazy -consensus of active PMC members. A PMC member is considered _emeritus_ by his -or her own declaration or by not contributing in any form to the project for -over six months. An emeritus member may request reinstatement to the PMC, -which will be sufficient to restore him or her to active PMC member. - -Membership of the PMC can be revoked by a unanimous vote of all the active -PMC members other than the member in question. - -The chair of the PMC is appointed by the ASF board. The chair is an office -holder of the Apache Software Foundation (Vice President, Apache D
[02/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/sql-ref/cmd-summary/003-select.md -- diff --git a/_docs/sql-ref/cmd-summary/003-select.md b/_docs/sql-ref/cmd-summary/003-select.md new file mode 100644 index 000..4a4 --- /dev/null +++ b/_docs/sql-ref/cmd-summary/003-select.md @@ -0,0 +1,85 @@ +--- +title: "SELECT Statements" +parent: "SQL Commands Summary" +--- +Drill supports the following ANSI standard clauses in the SELECT statement: + + * WITH clause + * SELECT list + * FROM clause + * WHERE clause + * GROUP BY clause + * HAVING clause + * ORDER BY clause (with an optional LIMIT clause) + +You can use the same SELECT syntax in the following commands: + + * CREATE TABLE AS (CTAS) + * CREATE VIEW + +INSERT INTO SELECT is not yet supported. + +## Column Aliases + +You can use named column aliases in the SELECT list to provide meaningful +names for regular columns and computed columns, such as the results of +aggregate functions. See the section on running queries for examples. + +You cannot reference column aliases in the following clauses: + + * WHERE + * GROUP BY + * HAVING + +Because Drill works with schema-less data sources, you cannot use positional +aliases (1, 2, etc.) to refer to SELECT list columns, except in the ORDER BY +clause. + +## UNION ALL Set Operator + +Drill supports the UNION ALL set operator to combine two result sets. The +distinct UNION operator is not yet supported. + +The EXCEPT, EXCEPT ALL, INTERSECT, and INTERSECT ALL operators are not yet +supported. + +## Joins + +Drill supports ANSI standard joins in the FROM and WHERE clauses: + + * Inner joins + * Left, full, and right outer joins + +The following types of join syntax are supported: + +Join type| Syntax +---|--- +Join condition in WHERE clause|FROM table1, table 2 WHERE table1.col1=table2.col1 +USING join in FROM clause|FROM table1 JOIN table2 USING(col1, ...) 
+ON join in FROM clause|FROM table1 JOIN table2 ON table1.col1=table2.col1 +NATURAL JOIN in FROM clause|FROM table 1 NATURAL JOIN table 2 + +Cross-joins are not yet supported. You must specify a join condition when more +than one table is listed in the FROM clause. + +Non-equijoins are supported if the join also contains an equality condition on +the same two tables as part of a conjunction: + +table1.col1 = table2.col1 AND table1.c2 < table2.c2 + +This restriction applies to both inner and outer joins. + +## Subqueries + +You can use the following subquery operators in Drill queries. These operators +all return Boolean results. + + * ALL + * ANY + * EXISTS + * IN + * SOME + +In general, correlated subqueries are supported. EXISTS and NOT EXISTS +subqueries that do not contain a correlation join are not yet supported. + http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/sql-ref/cmd-summary/004-show-files.md -- diff --git a/_docs/sql-ref/cmd-summary/004-show-files.md b/_docs/sql-ref/cmd-summary/004-show-files.md new file mode 100644 index 000..1fcf395 --- /dev/null +++ b/_docs/sql-ref/cmd-summary/004-show-files.md @@ -0,0 +1,65 @@ +--- +title: "SHOW FILES Command" +parent: "SQL Commands Summary" +--- +The SHOW FILES command provides a quick report of the file systems that are +visible to Drill for query purposes. This command is unique to Apache Drill. + +## Syntax + +The SHOW FILES command supports the following syntax. + +SHOW FILES [ FROM filesystem.directory_name | IN filesystem.directory_name ]; + +The FROM or IN clause is required if you do not specify a default file system +first. You can do this with the USE command. FROM and IN are synonyms. + +The directory name is optional. (If the directory name is a Drill reserved +word, you must use back ticks around the name.) + +The command returns standard Linux `stat` information for each file or +directory, such as permissions, owner, and group values. This information is +not specific to Drill. 
+ +## Examples + +The following example returns information about directories and files in the +local (`dfs`) file system. + + 0: jdbc:drill:> use dfs; + + +++ + | ok | summary | + +++ + | true | Default schema changed to 'dfs' | + +++ + 1 row selected (0.318 seconds) + + 0: jdbc:drill:> show files; + ++-+++++-++--+ + |name| isDirectory | isFile | length | owner| group| permissions | accessTime | modificationTime | + ++-+++++-++--+ + | user | true| false | 1
[05/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/img/ngram_plugin2.png -- diff --git a/_docs/img/ngram_plugin2.png b/_docs/img/ngram_plugin2.png new file mode 100644 index 000..60d432d Binary files /dev/null and b/_docs/img/ngram_plugin2.png differ http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/img/settings.png -- diff --git a/_docs/img/settings.png b/_docs/img/settings.png new file mode 100644 index 000..dcff0d9 Binary files /dev/null and b/_docs/img/settings.png differ http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/img/student_hive.png -- diff --git a/_docs/img/student_hive.png b/_docs/img/student_hive.png new file mode 100644 index 000..7e22b88 Binary files /dev/null and b/_docs/img/student_hive.png differ http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/install/001-drill-in-10.md -- diff --git a/_docs/install/001-drill-in-10.md b/_docs/install/001-drill-in-10.md new file mode 100644 index 000..13d2410 --- /dev/null +++ b/_docs/install/001-drill-in-10.md @@ -0,0 +1,365 @@ +--- +title: "Apache Drill in 10 Minutes" +parent: "Install Drill" +--- +* Objective +* A Few Bits About Apache Drill +* Process Overview +* Install Drill + * Installing Drill on Linux + * Installing Drill on Mac OS X + * Installing Drill on Windows +* Start Drill +* Query Sample Data +* Summary +* Next Steps +* More Information + +## Objective + +Use Apache Drill to query sample data in 10 minutes. For simplicity, you'll +run Drill in _embedded_ mode rather than _distributed_ mode to try out Drill +without having to perform any setup tasks. + +## A Few Bits About Apache Drill + +Drill is a clustered, powerful MPP (Massively Parallel Processing) query +engine for Hadoop that can process petabytes of data, fast. Drill is useful +for short, interactive ad-hoc queries on large-scale data sets. Drill is +capable of querying nested data in formats like JSON and Parquet and +performing dynamic schema discovery. 
Drill does not require a centralized +metadata repository. + +### **_Dynamic schema discovery_** + +Drill does not require schema or type specification for data in order to start +the query execution process. Drill starts data processing in record-batches +and discovers the schema during processing. Self-describing data formats such +as Parquet, JSON, AVRO, and NoSQL databases have schema specified as part of +the data itself, which Drill leverages dynamically at query time. Because +schema can change over the course of a Drill query, all Drill operators are +designed to reconfigure themselves when schemas change. + +### **_Flexible data model_** + +Drill allows access to nested data attributes, just like SQL columns, and +provides intuitive extensions to easily operate on them. From an architectural +point of view, Drill provides a flexible hierarchical columnar data model that +can represent complex, highly dynamic and evolving data models. Drill allows +for efficient processing of these models without the need to flatten or +materialize them at design time or at execution time. Relational data in Drill +is treated as a special or simplified case of complex/multi-structured data. + +### **_De-centralized metadata_** + +Drill does not have a centralized metadata requirement. You do not need to +create and manage tables and views in a metadata repository, or rely on a +database administrator group for such a function. Drill metadata is derived +from the storage plugins that correspond to data sources. Storage plugins +provide a spectrum of metadata ranging from full metadata (Hive), partial +metadata (HBase), or no central metadata (files). De-centralized metadata +means that Drill is NOT tied to a single Hive repository. You can query +multiple Hive repositories at once and then combine the data with information +from HBase tables or with a file in a distributed file system. 
You can also +use SQL DDL syntax to create metadata within Drill, which gets organized just +like a traditional database. Drill metadata is accessible through the ANSI +standard INFORMATION_SCHEMA database. + +### **_Extensibility_** + +Drill provides an extensible architecture at all layers, including the storage +plugin, query, query optimization/execution, and client API layers. You can +customize any layer for the specific needs of an organization or you can +extend the layer to a broader array of use cases. Drill provides a built in +classpath scanning and plugin concept to add additional storage plugins, +functions, and operators with minimal configuration. + +## Process Overview + +Download the Apache Drill archive and extract the contents to a directory on +your machine. The Apache Drill archiv
[11/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/design/005-value.md -- diff --git a/_docs/design/005-value.md b/_docs/design/005-value.md new file mode 100644 index 000..828376a --- /dev/null +++ b/_docs/design/005-value.md @@ -0,0 +1,163 @@ +--- +title: "Value Vectors" +parent: "Design Docs" +--- +This document defines the data structures required for passing sequences of +columnar data between [Operators](https://docs.google.com/a/maprtech.com/document/d/1zaxkcrK9mYyfpGwX1kAV80z0PCi8abefL45zOzb97dI/edit#bookmark=id.iip15ful18mm). + +## Goals + +### Support Operators Written in Multiple Languages + +ValueVectors should support operators written in C/C++/Assembly. To support +this, the underlying ByteBuffer will not require modification when passed +through the JNI interface. The ValueVector will be considered immutable once +constructed. Endianness has not yet been considered. + +### Access + +Reading a random element from a ValueVector must be a constant time operation. +To accommodate, elements are identified by their offset from the start of the +buffer. Repeated, nullable and variable width ValueVectors utilize an +additional fixed width value vector to index each element. Write access is not +supported once the ValueVector has been constructed by the RecordBatch. + +### Efficient Subsets of Value Vectors + +When an operator returns a subset of values from a ValueVector, it should +reuse the original ValueVector. To accomplish this, a level of indirection is +introduced to skip over certain values in the vector. This level of +indirection is a sequence of offsets which reference an offset in the original +ValueVector and the count of subsequent values which are to be included in the +subset. + +### Pooled Allocation + +ValueVectors utilize one or more buffers under the covers. These buffers will +be drawn from a pool. Value vectors are themselves created and destroyed as a +schema changes during the course of record iteration. 
+ +### Homogeneous Value Types + +Each value in a Value Vector is of the same type. The [Record Batch](https://docs.google.com/a/maprtech.com/document/d/1zaxkcrK9mYyfpGwX1kAV80z0PCi8abefL45zOzb97dI/edit#bookmark=kix.s2xuoqnr8obe) implementation is responsible for +creating a new Value Vector any time there is a change in schema. + +## Definitions + +Data Types + +The canonical source for value type definitions is the [Drill +Datatypes](http://bit.ly/15JO9bC) document. The individual types are listed +under the "Basic Data Types" tab, while the value vector types can be found +under the "Value Vectors" tab. + +Operators + +An operator is responsible for transforming a stream of fields. It operates on +Record Batches or constant values. + +Record Batch + +A set of field values for some range of records. The batch may be composed of +Value Vectors, in which case each batch consists of exactly one schema. + +Value Vector + +The value vector is comprised of one or more contiguous buffers; one which +stores a sequence of values, and zero or more which store any metadata +associated with the ValueVector. + +## Data Structure + +A ValueVector stores values in a ByteBuf, which is a contiguous region of +memory. Additional levels of indirection are used to support variable value +widths, nullable values, repeated values and selection vectors. These levels +of indirection are primarily lookup tables which consist of one or more fixed +width ValueVectors which may be combined (e.g. for nullable, variable width +values). A fixed width ValueVector of non-nullable, non-repeatable values does +not require an indirect lookup; elements can be accessed directly by +multiplying position by stride. + +Fixed Width Values + +Fixed width ValueVectors simply contain a packed sequence of values. Random +access is supported by accessing element n at ByteBuf[0] + Index * Stride, +where Index is 0-based. The following illustrates the underlying buffer of +INT4 values [1 .. 
6]: + +![drill query flow]({{ site.baseurl }}/docs/img/value1.png) + +Nullable Values + +Nullable values are represented by a vector of bit values. Each bit in the +vector corresponds to an element in the ValueVector. If the bit is not set, +the value is NULL. Otherwise the value is retrieved from the underlying +buffer. The following illustrates a NullableValueVector of INT4 values 2, 3 +and 6: + +![drill query flow]({{ site.baseurl }}/docs/img/value2.png) + +### Repeated Values + +A repeated ValueVector is used for elements which can contain multiple values +(e.g. a JSON array). A table of offset and count pairs is used to represent +each repeated element in the ValueVector. A count of zero means the element +has no values (note the offset field is unused in this case). The following +illustrates three fields; one with two values, one with no values, and one +with a single value: + +![drill query flow]({{ site.baseurl }}/docs/img/value3
[08/13] drill git commit: DRILL-2315: Confluence conversion plus fixes
http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/manage/004-partition-prune.md -- diff --git a/_docs/drill-docs/manage/004-partition-prune.md b/_docs/drill-docs/manage/004-partition-prune.md deleted file mode 100644 index fa81034..000 --- a/_docs/drill-docs/manage/004-partition-prune.md +++ /dev/null @@ -1,75 +0,0 @@ -title: "Partition Pruning" -parent: "Manage Drill" -Partition pruning is a performance optimization that limits the number of -files and partitions that Drill reads when querying file systems and Hive -tables. Drill only reads a subset of the files that reside in a file system or -a subset of the partitions in a Hive table when a query matches certain filter -criteria. - -For Drill to apply partition pruning to Hive tables, you must have created the -tables in Hive using the `PARTITION BY` clause: - -`CREATE TABLE () PARTITION BY ();` - -When you create Hive tables using the `PARTITION BY` clause, each partition of -data is automatically split out into different directories as data is written -to disk. For more information about Hive partitioning, refer to the [Apache -Hive wiki](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL/#LanguageManualDDL-PartitionedTables). - -Typically, table data in a file system is organized by directories and -subdirectories. Queries on table data may contain `WHERE` clause filters on -specific directories. - -Drillâs query planner evaluates the filters as part of a Filter operator. If -no partition filters are present, the underlying Scan operator reads all files -in all directories and then sends the data to operators downstream, such as -Filter. - -When partition filters are present, the query planner determines if it can -push the filters down to the Scan such that the Scan only reads the -directories that match the partition filters, thus reducing disk I/O. 
- -## Partition Pruning Example - -The /`Users/max/data/logs` directory in a file system contains subdirectories -that span a few years. - -The following image shows the hierarchical structure of the `â¦/logs` directory -and (sub) directories: - -![](../../img/54.png) - -The following query requests log file data for 2013 from the `â¦/logs` -directory in the file system: - -SELECT * FROM dfs.`/Users/max/data/logs` WHERE cust_id < 10 and dir0 = 2013 limit 2; - -If you run the `EXPLAIN PLAN` command for the query, you can see that the` -â¦/logs` directory is filtered by the scan operator. - -EXPLAIN PLAN FOR SELECT * FROM dfs.`/Users/max/data/logs` WHERE cust_id < 10 and dir0 = 2013 limit 2; - -The following image shows a portion of the physical plan when partition -pruning is applied: - -![](../../img/21.png) - -## Filter Examples - -The following queries include examples of the types of filters eligible for -partition pruning optimization: - -**Example 1: Partition filters ANDed together** - -SELECT * FROM dfs.`/Users/max/data/logs` WHERE dir0 = '2014' AND dir1 = '1' - -**Example 2: Partition filter ANDed with regular column filter** - -SELECT * FROM dfs.`/Users/max/data/logs` WHERE cust_id < 10 AND dir0 = 2013 limit 2; - -**Example 3: Combination of AND, OR involving partition filters** - -SELECT * FROM dfs.`/Users/max/data/logs` WHERE (dir0 = '2013' AND dir1 = '1') OR (dir0 = '2014' AND dir1 = '2') - http://git-wip-us.apache.org/repos/asf/drill/blob/d959a210/_docs/drill-docs/manage/005-monitor-cancel.md -- diff --git a/_docs/drill-docs/manage/005-monitor-cancel.md b/_docs/drill-docs/manage/005-monitor-cancel.md deleted file mode 100644 index 6888eea..000 --- a/_docs/drill-docs/manage/005-monitor-cancel.md +++ /dev/null @@ -1,30 +0,0 @@ -title: "Monitoring and Canceling Queries in the Drill Web UI" -parent: "Manage Drill" -You can monitor and cancel queries from the Drill Web UI. 
To access the Drill -Web UI, the Drillbit process must be running on the Drill node that you use to -access the Drill Web UI. - -To monitor or cancel a query from the Drill Web UI, complete the following -steps: - - 1. Navigate to the Drill Web UI at `:8047.` -When you access the Drill Web UI, you see some general information about Drill -running in your cluster, such as the nodes running the Drillbit process, the -various ports Drill is using, and the amount of direct memory assigned to -Drill. -![](../../img/7.png) - - 2. Select **Profiles** in the toolbar. A list of running and completed queries appears. Drill assigns a query ID to each query and lists the Foreman node. The Foreman is the Drillbit node that receives the query from the client or application. The Foreman drives the entire query. -![](../../img/51.png) - - 3. Click the **Query ID** for the query that you want to monitor or cancel. The Query and Planning window appears. -![](../../img/4.png) - - 4. Selec
Git Push Summary
Repository: drill Updated Branches: refs/heads/gh-pages-master [created] 23f82db9f
[1/2] drill git commit: DRILL-2130: Fixed JUnit/Hamcrest/Mockito/Paranamer class path problem.
Repository: drill Updated Branches: refs/heads/master 8bb6b08e5 -> f7ef5ec78 DRILL-2130: Fixed JUnit/Hamcrest/Mockito/Paranamer class path problem. Project: http://git-wip-us.apache.org/repos/asf/drill/repo Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/b0faf708 Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/b0faf708 Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/b0faf708 Branch: refs/heads/master Commit: b0faf708bdbeb53bc3a446d3782554640bdfd6df Parents: 8bb6b08 Author: dbarclay Authored: Sun Feb 22 00:45:42 2015 -0800 Committer: Aditya Kishore Committed: Wed Feb 25 11:08:20 2015 -0800 -- ...rill2130CommonHamcrestConfigurationTest.java | 46 ...30StorageHBaseHamcrestConfigurationTest.java | 46 ...torageHiveCoreHamcrestConfigurationTest.java | 46 ...130InterpreterHamcrestConfigurationTest.java | 46 exec/java-exec/pom.xml | 9 ...ll2130JavaExecHamcrestConfigurationTest.java | 46 ...ll2130JavaJdbcHamcrestConfigurationTest.java | 46 pom.xml | 8 8 files changed, 293 insertions(+) -- http://git-wip-us.apache.org/repos/asf/drill/blob/b0faf708/common/src/test/java/org/apache/drill/test/Drill2130CommonHamcrestConfigurationTest.java -- diff --git a/common/src/test/java/org/apache/drill/test/Drill2130CommonHamcrestConfigurationTest.java b/common/src/test/java/org/apache/drill/test/Drill2130CommonHamcrestConfigurationTest.java new file mode 100644 index 000..99643b1 --- /dev/null +++ b/common/src/test/java/org/apache/drill/test/Drill2130CommonHamcrestConfigurationTest.java @@ -0,0 +1,46 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.drill.test; + +import org.junit.Test; +import static org.junit.Assert.assertThat; +import static org.junit.Assert.fail; +import static org.hamcrest.CoreMatchers.equalTo; + + +public class Drill2130CommonHamcrestConfigurationTest { + + @SuppressWarnings("unused") + private org.hamcrest.MatcherAssert forCompileTimeCheckForNewEnoughHamcrest; + + @Test + public void testJUnitHamcrestMatcherFailureWorks() { +try { + assertThat( 1, equalTo( 2 ) ); +} +catch ( NoSuchMethodError e ) { + fail( "Class search path seems broken re new JUnit and old Hamcrest." + + " Got NoSuchMethodError; e: " + e ); +} +catch ( AssertionError e ) { + System.out.println( "Class path seems fine re new JUnit vs. old Hamcrest." + + " (Got AssertionError, not NoSuchMethodError.)" ); +} + } + +} http://git-wip-us.apache.org/repos/asf/drill/blob/b0faf708/contrib/storage-hbase/src/test/java/org/apache/drill/hbase/test/Drill2130StorageHBaseHamcrestConfigurationTest.java -- diff --git a/contrib/storage-hbase/src/test/java/org/apache/drill/hbase/test/Drill2130StorageHBaseHamcrestConfigurationTest.java b/contrib/storage-hbase/src/test/java/org/apache/drill/hbase/test/Drill2130StorageHBaseHamcrestConfigurationTest.java new file mode 100644 index 000..b52654d --- /dev/null +++ b/contrib/storage-hbase/src/test/java/org/apache/drill/hbase/test/Drill2130StorageHBaseHamcrestConfigurationTest.java @@ -0,0 +1,46 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */
[2/2] drill git commit: DRILL-1690: Issue with using HBase plugin to access row_key only
DRILL-1690: Issue with using HBase plugin to access row_key only Project: http://git-wip-us.apache.org/repos/asf/drill/repo Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/f7ef5ec7 Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/f7ef5ec7 Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/f7ef5ec7 Branch: refs/heads/master Commit: f7ef5ec784844a99b8b39fe10ab14f001ae149f2 Parents: b0faf70 Author: Aditya Kishore Authored: Wed Feb 25 01:10:48 2015 -0800 Committer: Aditya Kishore Committed: Wed Feb 25 11:17:06 2015 -0800 -- .../exec/store/hbase/HBaseRecordReader.java | 35 +++- 1 file changed, 19 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/drill/blob/f7ef5ec7/contrib/storage-hbase/src/main/java/org/apache/drill/exec/store/hbase/HBaseRecordReader.java -- diff --git a/contrib/storage-hbase/src/main/java/org/apache/drill/exec/store/hbase/HBaseRecordReader.java b/contrib/storage-hbase/src/main/java/org/apache/drill/exec/store/hbase/HBaseRecordReader.java index da38707..42038e8 100644 --- a/contrib/storage-hbase/src/main/java/org/apache/drill/exec/store/hbase/HBaseRecordReader.java +++ b/contrib/storage-hbase/src/main/java/org/apache/drill/exec/store/hbase/HBaseRecordReader.java @@ -72,6 +72,8 @@ public class HBaseRecordReader extends AbstractRecordReader implements DrillHBas private Configuration hbaseConf; private OperatorContext operatorContext; + private boolean rowKeyOnly; + public HBaseRecordReader(Configuration conf, HBaseSubScan.HBaseSubScanSpec subScanSpec, List projectedColumns, FragmentContext context) throws OutOfMemoryException { hbaseConf = conf; @@ -87,8 +89,8 @@ public class HBaseRecordReader extends AbstractRecordReader implements DrillHBas @Override protected Collection transformColumns(Collection columns) { Set transformed = Sets.newLinkedHashSet(); +rowKeyOnly = true; if (!isStarQuery()) { - boolean rowKeyOnly = true; for (SchemaPath column : columns) { if 
(column.getRootSegment().getPath().equalsIgnoreCase(ROW_KEY)) { transformed.add(ROW_KEY_PATH); @@ -116,6 +118,7 @@ public class HBaseRecordReader extends AbstractRecordReader implements DrillHBas HBaseUtils.andFilterAtIndex(hbaseScan.getFilter(), HBaseUtils.LAST_FILTER, new FirstKeyOnlyFilter())); } } else { + rowKeyOnly = false; transformed.add(ROW_KEY_PATH); } @@ -131,7 +134,6 @@ public class HBaseRecordReader extends AbstractRecordReader implements DrillHBas this.operatorContext = operatorContext; } - @Override public void setup(OutputMutator output) throws ExecutionSetupException { this.outputMutator = output; @@ -197,22 +199,23 @@ public class HBaseRecordReader extends AbstractRecordReader implements DrillHBas if (rowKeyVector != null) { rowKeyVector.getMutator().setSafe(rowCount, cells[0].getRowArray(), cells[0].getRowOffset(), cells[0].getRowLength()); } + if (!rowKeyOnly) { +for (Cell cell : cells) { + int familyOffset = cell.getFamilyOffset(); + int familyLength = cell.getFamilyLength(); + byte[] familyArray = cell.getFamilyArray(); + MapVector mv = getOrCreateFamilyVector(new String(familyArray, familyOffset, familyLength), true); - for (Cell cell : cells) { -int familyOffset = cell.getFamilyOffset(); -int familyLength = cell.getFamilyLength(); -byte[] familyArray = cell.getFamilyArray(); -MapVector mv = getOrCreateFamilyVector(new String(familyArray, familyOffset, familyLength), true); + int qualifierOffset = cell.getQualifierOffset(); + int qualifierLength = cell.getQualifierLength(); + byte[] qualifierArray = cell.getQualifierArray(); + NullableVarBinaryVector v = getOrCreateColumnVector(mv, new String(qualifierArray, qualifierOffset, qualifierLength)); -int qualifierOffset = cell.getQualifierOffset(); -int qualifierLength = cell.getQualifierLength(); -byte[] qualifierArray = cell.getQualifierArray(); -NullableVarBinaryVector v = getOrCreateColumnVector(mv, new String(qualifierArray, qualifierOffset, qualifierLength)); - -int valueOffset = 
cell.getValueOffset(); -int valueLength = cell.getValueLength(); -byte[] valueArray = cell.getValueArray(); -v.getMutator().setSafe(rowCount, valueArray, valueOffset, valueLength); + int valueOffset = cell.getValueOffset(); + int valueLength = cell.getValueLength(); + byte[] valueArray = cell.getValueArray(); + v.getMutator().setSafe(rowCount, valueArray, valueOffset, valueLength); +} } }