Merge branch cassandra-3.0 into cassandra-3.11
Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/5c9db9af Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/5c9db9af Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/5c9db9af Branch: refs/heads/trunk Commit: 5c9db9af96daeda02610623ccd8587b0a1510d4d Parents: 88fa169 e22cb27 Author: Benjamin Lerer <b.le...@gmail.com> Authored: Thu Jun 1 10:14:19 2017 +0200 Committer: Benjamin Lerer <b.le...@gmail.com> Committed: Thu Jun 1 10:17:09 2017 +0200 ---------------------------------------------------------------------- CHANGES.txt | 1 + NEWS.txt | 2 + doc/source/operating/metrics.rst | 2 +- .../cassandra/db/PartitionRangeReadCommand.java | 20 ++++- .../db/SinglePartitionReadCommand.java | 86 ++++++++++++++------ .../org/apache/cassandra/db/StorageHook.java | 64 +++++++++------ .../UnfilteredRowIteratorWithLowerBound.java | 10 ++- .../io/sstable/format/SSTableReader.java | 72 ++++++++++------ .../io/sstable/format/SSTableReadsListener.java | 82 +++++++++++++++++++ .../io/sstable/format/big/BigTableReader.java | 42 ++++++---- .../io/sstable/format/big/BigTableScanner.java | 41 ++++++++-- .../miscellaneous/SSTablesIteratedTest.java | 69 +++++++++++++++- .../cassandra/db/compaction/TTLExpiryTest.java | 6 +- .../sstable/SSTableCorruptionDetectionTest.java | 8 +- .../io/sstable/SSTableScannerTest.java | 3 +- .../cassandra/io/sstable/SSTableWriterTest.java | 8 +- .../sstable/format/ClientModeSSTableTest.java | 7 +- 17 files changed, 415 insertions(+), 108 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cassandra/blob/5c9db9af/CHANGES.txt ---------------------------------------------------------------------- diff --cc CHANGES.txt index b28f75f,4232084..2c96521 --- a/CHANGES.txt +++ b/CHANGES.txt @@@ -1,41 -1,5 +1,42 @@@ -3.0.14 +3.11.0 + * Fix formatting of duration columns in CQLSH (CASSANDRA-13549) + * Fix the 
problem with duplicated rows when using paging with SASI (CASSANDRA-13302) + * Allow CONTAINS statements filtering on the partition key and its parts (CASSANDRA-13275) + * Fall back to even ranges calculation in clusters with vnodes when tokens are distributed unevenly (CASSANDRA-13229) + * Fix duration type validation to prevent overflow (CASSANDRA-13218) + * Forbid unsupported creation of SASI indexes over partition key columns (CASSANDRA-13228) + * Reject multiple values for a key in CQL grammar. (CASSANDRA-13369) + * UDA fails without input rows (CASSANDRA-13399) + * Fix compaction-stress by using daemonInitialization (CASSANDRA-13188) + * V5 protocol flags decoding broken (CASSANDRA-13443) + * Use write lock not read lock for removing sstables from compaction strategies. (CASSANDRA-13422) + * Use corePoolSize equal to maxPoolSize in JMXEnabledThreadPoolExecutors (CASSANDRA-13329) + * Avoid rebuilding SASI indexes containing no values (CASSANDRA-12962) + * Add charset to Analyser input stream (CASSANDRA-13151) + * Fix testLimitSSTables flake caused by concurrent flush (CASSANDRA-12820) + * cdc column addition strikes again (CASSANDRA-13382) + * Fix static column indexes (CASSANDRA-13277) + * DataOutputBuffer.asNewBuffer broken (CASSANDRA-13298) + * unittest CipherFactoryTest failed on MacOS (CASSANDRA-13370) + * Forbid SELECT restrictions and CREATE INDEX over non-frozen UDT columns (CASSANDRA-13247) + * Default logging we ship will incorrectly print "?:?" for "%F:%L" pattern (CASSANDRA-13317) + * Possible AssertionError in UnfilteredRowIteratorWithLowerBound (CASSANDRA-13366) + * Support unaligned memory access for AArch64 (CASSANDRA-13326) + * Improve SASI range iterator efficiency on intersection with an empty range (CASSANDRA-12915). 
+ * Fix equality comparisons of columns using the duration type (CASSANDRA-13174) + * Obfuscate password in stress-graphs (CASSANDRA-12233) + * Move to FastThreadLocalThread and FastThreadLocal (CASSANDRA-13034) + * nodetool stopdaemon errors out (CASSANDRA-13030) + * Tables in system_distributed should not use gcgs of 0 (CASSANDRA-12954) + * Fix primary index calculation for SASI (CASSANDRA-12910) + * More fixes to the TokenAllocator (CASSANDRA-12990) + * NoReplicationTokenAllocator should work with zero replication factor (CASSANDRA-12983) + * Address message coalescing regression (CASSANDRA-12676) + * Delete illegal character from StandardTokenizerImpl.jflex (CASSANDRA-13417) + * Fix cqlsh automatic protocol downgrade regression (CASSANDRA-13307) + * Tracing payload not passed from QueryMessage to tracing session (CASSANDRA-12835) +Merged from 3.0: + * Fix the reported number of sstable data files accessed per read (CASSANDRA-13120) * Fix schema digest mismatch during rolling upgrades from versions before 3.0.12 (CASSANDRA-13559) * Upgrade JNA version to 4.4.0 (CASSANDRA-13072) * Interned ColumnIdentifiers should use minimal ByteBuffers (CASSANDRA-13533) http://git-wip-us.apache.org/repos/asf/cassandra/blob/5c9db9af/NEWS.txt ---------------------------------------------------------------------- diff --cc NEWS.txt index 1c14748,a92fc5d..a56ced6 --- a/NEWS.txt +++ b/NEWS.txt @@@ -26,139 -38,40 +26,141 @@@ Upgradin - In 2.1, the default for otc_coalescing_strategy was 'DISABLED'. In 2.2 and 3.0, it was changed to 'TIMEHORIZON', but that value was shown to be a performance regression. The default for 3.11.0 and newer has - been reverted to 'DISABLED'. Users upgrading to Cassandra 3.0 should - consider setting otc_coalescing_strategy to 'DISABLED'. - -3.0.11 -====== - -Upgrading ---------- - - Support for alter types of already defined tables and of UDTs fields has been disabled. - If it is necessary to return a different type, please use casting instead. 
See - CASSANDRA-12443 for more details. - - Specifying the default_time_to_live option when creating or altering a - materialized view was erroneously accepted (and ignored). It is now - properly rejected. - - Only Java and JavaScript are now supported UDF languages. - The sandbox in 3.0 already prevented the use of script languages except Java - and JavaScript. - - Compaction now correctly drops sstables out of CompactionTask when there - isn't enough disk space to perform the full compaction. This should reduce - pending compaction tasks on systems with little remaining disk space. - - Primary ranges in the system.size_estimates table are now based on the keyspace - replication settings and adjacent ranges are no longer merged (CASSANDRA-9639). + been reverted to 'DISABLED'. Users upgrading from Cassandra 2.2 or 3.0 should + be aware that the default has changed. ++ - The StorageHook interface has been modified to allow to retrieve read information from ++ SSTableReader (CASSANDRA-13120). -3.0.10 -====== +3.10 +==== -Upgrading ---------- - - memtable_allocation_type: offheap_buffers is no longer allowed to be specified in the 3.0 series. - This was an oversight that can cause segfaults. Offheap was re-introduced in 3.4 see CASSANDRA-11039 - and CASSANDRA-9472 for details. +New features +------------ + - New `DurationType` (cql duration). See CASSANDRA-11873 + - Runtime modification of concurrent_compactors is now available via nodetool + - Support for the assignment operators +=/-= has been added for update queries. + - An Index implementation may now provide a task which runs prior to joining + the ring. See CASSANDRA-12039 + - Filtering on partition key columns is now also supported for queries without + secondary indexes. + - A slow query log has been added: slow queries will be logged at DEBUG level. + For more details refer to CASSANDRA-12403 and slow_query_log_timeout_in_ms + in cassandra.yaml. + - Support for GROUP BY queries has been added. 
+ - A new compaction-stress tool has been added to test the throughput of compaction + for any cassandra-stress user schema. See compaction-stress help for how to use. + - Compaction can now take into account overlapping tables that don't take part + in the compaction to look for deleted or overwritten data in the compacted tables. + When such data is found, it can be safely discarded, which in turn should enable + the removal of tombstones over that data. + + The behavior can be engaged in two ways: + - as a "nodetool garbagecollect -g CELL/ROW" operation, which applies + single-table compaction on all sstables to discard deleted data in one step. + - as a "provide_overlapping_tombstones:CELL/ROW/NONE" compaction strategy flag, + which uses overlapping tables as a source of deletions/overwrites during all + compactions. + The argument specifies the granularity at which deleted data is to be found: + - If ROW is specified, only whole deleted rows (or sets of rows) will be + discarded. + - If CELL is specified, any columns whose value is overwritten or deleted + will also be discarded. + - NONE (default) specifies the old behavior, overlapping tables are not used to + decide when to discard data. + Which option to use depends on your workload, both ROW and CELL increase the + disk load on compaction (especially with the size-tiered compaction strategy), + with CELL being more resource-intensive. Both should lead to better read + performance if deleting rows (resp. overwriting or deleting cells) is common. + - Prepared statements are now persisted in the table prepared_statements in + the system keyspace. Upon startup, this table is used to preload all + previously prepared statements - i.e. in many cases clients do not need to + re-prepare statements against restarted nodes. + - cqlsh can now connect to older Cassandra versions by downgrading the native + protocol version. 
Please note that this is currently not part of our release + testing and, as a consequence, it is not guaranteed to work in all cases. + See CASSANDRA-12150 for more details. + - Snapshots that are automatically taken before a table is dropped or truncated + will have a "dropped" or "truncated" prefix on their snapshot tag name. + - Metrics are exposed for successful and failed authentication attempts. + These can be located using the object names org.apache.cassandra.metrics:type=Client,name=AuthSuccess + and org.apache.cassandra.metrics:type=Client,name=AuthFailure respectively. + - Add support to "unset" JSON fields in prepared statements by specifying DEFAULT UNSET. + See CASSANDRA-11424 for details + - Allow TTL with null value on insert and update. It will be treated as equivalent to inserting a 0. + - Removed outboundBindAny configuration property. See CASSANDRA-12673 for details. + +Upgrading +--------- + - Support for alter types of already defined tables and of UDTs fields has been disabled. + If it is necessary to return a different type, please use casting instead. See + CASSANDRA-12443 for more details. + - Specifying the default_time_to_live option when creating or altering a + materialized view was erroneously accepted (and ignored). It is now + properly rejected. + - Only Java and JavaScript are now supported UDF languages. + The sandbox in 3.0 already prevented the use of script languages except Java + and JavaScript. + - Compaction now correctly drops sstables out of CompactionTask when there + isn't enough disk space to perform the full compaction. This should reduce + pending compaction tasks on systems with little remaining disk space. + - Request timeouts in cassandra.yaml (read_request_timeout_in_ms, etc) now apply to the + "full" request time on the coordinator. Previously, they only covered the time from + when the coordinator sent a message to a replica until the time that the replica + responded. 
Additionally, the previous behavior was to reset the timeout when performing + a read repair, making a second read to fix a short read, and when subranges were read + as part of a range scan or secondary index query. In 3.10 and higher, the timeout + is no longer reset for these "subqueries". The entire request must complete within + the specified timeout. As a consequence, your timeouts may need to be adjusted + to account for this. See CASSANDRA-12256 for more details. + - Logs written to stdout are now consistent with logs written to files. + Time is now local (it was UTC on the console and local in files). Date, thread, file + and line info were added to stdout. (see CASSANDRA-12004) + - The 'clientutil' jar, which has been somewhat broken on the 3.x branch, is no longer provided. + The features provided by that jar are provided by any good java driver and we advise relying on drivers rather than on + that jar, but if you need that jar for backward compatibility until you do so, you should use the version provided + on previous Cassandra branch, like the 3.0 branch (by design, the functionality provided by that jar are stable + across versions so using the 3.0 jar for a client connecting to 3.x should work without issues). + - (Tools development) DatabaseDescriptor no longer implicitly startups components/services like + commit log replay. This may break existing 3rd party tools and clients. In order to startup + a standalone tool or client application, use the DatabaseDescriptor.toolInitialization() or + DatabaseDescriptor.clientInitialization() methods. Tool initialization sets up partitioner, + snitch, encryption context. Client initialization just applies the configuration but does not + setup anything. Instead of using Config.setClientMode() or Config.isClientMode(), which are + deprecated now, use one of the appropriate new methods in DatabaseDescriptor. 
+ - Application layer keep-alives were added to the streaming protocol to prevent idle incoming connections from + timing out and failing the stream session (CASSANDRA-11839). This effectively deprecates the streaming_socket_timeout_in_ms + property in favor of streaming_keep_alive_period_in_secs. See cassandra.yaml for more details about this property. + - Duration literals support the ISO 8601 format. By consequence, identifiers matching that format + (e.g P2Y or P1MT6H) will not be supported anymore (CASSANDRA-11873). + +3.8 +=== -3.0.9 -===== +New features +------------ + - Shared pool threads are now named according to the stage they are executing + tasks for. Thread names mentioned in traced queries change accordingly. + - A new option has been added to cassandra-stress "-rate fixed={number}/s" + that forces a scheduled rate of operations/sec over time. Using this, stress can + accurately account for coordinated omission from the stress process. + - The cassandra-stress "-rate limit=" option has been renamed to "-rate throttle=" + - hdr histograms have been added to stress runs, its output can be saved to disk using: + "-log hdrfile=" option. This histogram includes response/service/wait times when used with the + fixed or throttle rate options. The histogram file can be plotted on + http://hdrhistogram.github.io/HdrHistogram/plotFiles.html + - TimeWindowCompactionStrategy has been added. This has proven to be a better approach + to time series compaction and new tables should use this instead of DTCS. See + CASSANDRA-9666 for details. + - Change-Data-Capture is now available. See cassandra.yaml for cdc-specific flags and + a brief explanation of on-disk locations for archived data in CommitLog form. This can + be enabled via ALTER TABLE ... WITH cdc=true. 
+ Upon flush, CommitLogSegments containing data for CDC-enabled tables are moved to + the data/cdc_raw directory until removed by the user and writes to CDC-enabled tables + will be rejected with a WriteTimeoutException once cdc_total_space_in_mb is reached + between unflushed CommitLogSegments and cdc_raw. + NOTE: CDC is disabled by default in the .yaml file. Do not enable CDC on a mixed-version + cluster as it will lead to exceptions which can interrupt traffic. Once all nodes + have been upgraded to 3.8 it is safe to enable this feature and restart the cluster. Upgrading --------- http://git-wip-us.apache.org/repos/asf/cassandra/blob/5c9db9af/doc/source/operating/metrics.rst ---------------------------------------------------------------------- diff --cc doc/source/operating/metrics.rst index 546d9c2,0000000..04abb48 mode 100644,000000..100644 --- a/doc/source/operating/metrics.rst +++ b/doc/source/operating/metrics.rst @@@ -1,706 -1,0 +1,706 @@@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at +.. +.. http://www.apache.org/licenses/LICENSE-2.0 +.. +.. Unless required by applicable law or agreed to in writing, software +.. distributed under the License is distributed on an "AS IS" BASIS, +.. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. See the License for the specific language governing permissions and +.. limitations under the License. + +.. highlight:: none + +Monitoring +---------- + +Metrics in Cassandra are managed using the `Dropwizard Metrics <http://metrics.dropwizard.io>`__ library. 
These metrics +can be queried via JMX or pushed to external monitoring systems using a number of `built in +<http://metrics.dropwizard.io/3.1.0/getting-started/#other-reporting>`__ and `third party +<http://metrics.dropwizard.io/3.1.0/manual/third-party/>`__ reporter plugins. + +Metrics are collected for a single node. It's up to the operator to use an external monitoring system to aggregate them. + +Metric Types +^^^^^^^^^^^^ +All metrics reported by cassandra fit into one of the following types. + +``Gauge`` + An instantaneous measurement of a value. + +``Counter`` + A gauge for an ``AtomicLong`` instance. Typically this is consumed by monitoring the change since the last call to + see if there is a large increase compared to the norm. + +``Histogram`` + Measures the statistical distribution of values in a stream of data. + + In addition to minimum, maximum, mean, etc., it also measures median, 75th, 90th, 95th, 98th, 99th, and 99.9th + percentiles. + +``Timer`` + Measures both the rate that a particular piece of code is called and the histogram of its duration. + +``Latency`` + Special type that tracks latency (in microseconds) with a ``Timer`` plus a ``Counter`` that tracks the total latency + accrued since starting. The former is useful if you track the change in total latency since the last check. Each + metric name of this type will have 'Latency' and 'TotalLatency' appended to it. + +``Meter`` + A meter metric which measures mean throughput and one-, five-, and fifteen-minute exponentially-weighted moving + average throughputs. + +Table Metrics +^^^^^^^^^^^^^ + +Each table in Cassandra has metrics responsible for tracking its state and performance. + +The metric names are all appended with the specific ``Keyspace`` and ``Table`` name. 
+ +Reported name format: + +**Metric Name** + ``org.apache.cassandra.metrics.Table.<MetricName>.<Keyspace>.<Table>`` + +**JMX MBean** + ``org.apache.cassandra.metrics:type=Table keyspace=<Keyspace> scope=<Table> name=<MetricName>`` + +.. NOTE:: + There is a special table called '``all``' without a keyspace. This represents the aggregation of metrics across + **all** tables and keyspaces on the node. + + +======================================= ============== =========== +Name Type Description +======================================= ============== =========== +MemtableOnHeapSize Gauge<Long> Total amount of data stored in the memtable that resides **on**-heap, including column related overhead and partitions overwritten. +MemtableOffHeapSize Gauge<Long> Total amount of data stored in the memtable that resides **off**-heap, including column related overhead and partitions overwritten. +MemtableLiveDataSize Gauge<Long> Total amount of live data stored in the memtable, excluding any data structure overhead. +AllMemtablesOnHeapSize Gauge<Long> Total amount of data stored in the memtables (2i and pending flush memtables included) that resides **on**-heap. +AllMemtablesOffHeapSize Gauge<Long> Total amount of data stored in the memtables (2i and pending flush memtables included) that resides **off**-heap. +AllMemtablesLiveDataSize Gauge<Long> Total amount of live data stored in the memtables (2i and pending flush memtables included) that resides off-heap, excluding any data structure overhead. +MemtableColumnsCount Gauge<Long> Total number of columns present in the memtable. +MemtableSwitchCount Counter Number of times flush has resulted in the memtable being switched out. +CompressionRatio Gauge<Double> Current compression ratio for all SSTables. +EstimatedPartitionSizeHistogram Gauge<long[]> Histogram of estimated partition size (in bytes). +EstimatedPartitionCount Gauge<Long> Approximate number of keys in table. 
+EstimatedColumnCountHistogram Gauge<long[]> Histogram of estimated number of columns. - SSTablesPerReadHistogram Histogram Histogram of the number of sstable data files accessed per read. ++SSTablesPerReadHistogram Histogram Histogram of the number of sstable data files accessed per single partition read. SSTables skipped due to Bloom Filters, min-max key or partition index lookup are not taken into account. +ReadLatency Latency Local read latency for this table. +RangeLatency Latency Local range scan latency for this table. +WriteLatency Latency Local write latency for this table. +CoordinatorReadLatency Timer Coordinator read latency for this table. +CoordinatorScanLatency Timer Coordinator range scan latency for this table. +PendingFlushes Counter Estimated number of flush tasks pending for this table. +BytesFlushed Counter Total number of bytes flushed since server [re]start. +CompactionBytesWritten Counter Total number of bytes written by compaction since server [re]start. +PendingCompactions Gauge<Integer> Estimate of number of pending compactions for this table. +LiveSSTableCount Gauge<Integer> Number of SSTables on disk for this table. +LiveDiskSpaceUsed Counter Disk space used by SSTables belonging to this table (in bytes). +TotalDiskSpaceUsed Counter Total disk space used by SSTables belonging to this table, including obsolete ones waiting to be GC'd. +MinPartitionSize Gauge<Long> Size of the smallest compacted partition (in bytes). +MaxPartitionSize Gauge<Long> Size of the largest compacted partition (in bytes). +MeanPartitionSize Gauge<Long> Size of the average compacted partition (in bytes). +BloomFilterFalsePositives Gauge<Long> Number of false positives on table's bloom filter. +BloomFilterFalseRatio Gauge<Double> False positive ratio of table's bloom filter. +BloomFilterDiskSpaceUsed Gauge<Long> Disk space used by bloom filter (in bytes). +BloomFilterOffHeapMemoryUsed Gauge<Long> Off-heap memory used by bloom filter. 
+IndexSummaryOffHeapMemoryUsed Gauge<Long> Off-heap memory used by index summary. +CompressionMetadataOffHeapMemoryUsed Gauge<Long> Off-heap memory used by compression meta data. +KeyCacheHitRate Gauge<Double> Key cache hit rate for this table. +TombstoneScannedHistogram Histogram Histogram of tombstones scanned in queries on this table. +LiveScannedHistogram Histogram Histogram of live cells scanned in queries on this table. +ColUpdateTimeDeltaHistogram Histogram Histogram of column update time delta on this table. +ViewLockAcquireTime Timer Time taken acquiring a partition lock for materialized view updates on this table. +ViewReadTime Timer Time taken during the local read of a materialized view update. +TrueSnapshotsSize Gauge<Long> Disk space used by snapshots of this table including all SSTable components. +RowCacheHitOutOfRange Counter Number of table row cache hits that do not satisfy the query filter, thus went to disk. +RowCacheHit Counter Number of table row cache hits. +RowCacheMiss Counter Number of table row cache misses. +CasPrepare Latency Latency of paxos prepare round. +CasPropose Latency Latency of paxos propose round. +CasCommit Latency Latency of paxos commit round. +PercentRepaired Gauge<Double> Percent of table data that is repaired on disk. +SpeculativeRetries Counter Number of times speculative retries were sent for this table. +WaitingOnFreeMemtableSpace Histogram Histogram of time spent waiting for free memtable space, either on- or off-heap. +DroppedMutations Counter Number of dropped mutations on this table. +======================================= ============== =========== + +Keyspace Metrics +^^^^^^^^^^^^^^^^ +Each keyspace in Cassandra has metrics responsible for tracking its state and performance. + +These metrics are the same as the ``Table Metrics`` above, only they are aggregated at the Keyspace level. 
+ +Reported name format: + +**Metric Name** + ``org.apache.cassandra.metrics.keyspace.<MetricName>.<Keyspace>`` + +**JMX MBean** + ``org.apache.cassandra.metrics:type=Keyspace scope=<Keyspace> name=<MetricName>`` + +ThreadPool Metrics +^^^^^^^^^^^^^^^^^^ + +Cassandra splits work of a particular type into its own thread pool. This provides back-pressure and asynchrony for +requests on a node. It's important to monitor the state of these thread pools since they can tell you how saturated a +node is. + +The metric names are all appended with the specific ``ThreadPool`` name. The thread pools are also categorized under a +specific type. + +Reported name format: + +**Metric Name** + ``org.apache.cassandra.metrics.ThreadPools.<MetricName>.<Path>.<ThreadPoolName>`` + +**JMX MBean** + ``org.apache.cassandra.metrics:type=ThreadPools scope=<ThreadPoolName> type=<Type> name=<MetricName>`` + +===================== ============== =========== +Name Type Description +===================== ============== =========== +ActiveTasks Gauge<Integer> Number of tasks being actively worked on by this pool. +PendingTasks Gauge<Integer> Number of queued tasks queued up on this pool. +CompletedTasks Counter Number of tasks completed. +TotalBlockedTasks Counter Number of tasks that were blocked due to queue saturation. +CurrentlyBlockedTask Counter Number of tasks that are currently blocked due to queue saturation but on retry will become unblocked. +MaxPoolSize Gauge<Integer> The maximum number of threads in this pool. +===================== ============== =========== + +The following thread pools can be monitored. 
+ +============================ ============== =========== +Name Type Description +============================ ============== =========== +Native-Transport-Requests transport Handles client CQL requests +CounterMutationStage request Responsible for counter writes +ViewMutationStage request Responsible for materialized view writes +MutationStage request Responsible for all other writes +ReadRepairStage request ReadRepair happens on this thread pool +ReadStage request Local reads run on this thread pool +RequestResponseStage request Coordinator requests to the cluster run on this thread pool +AntiEntropyStage internal Builds merkle tree for repairs +CacheCleanupExecutor internal Cache maintenance performed on this thread pool +CompactionExecutor internal Compactions are run on these threads +GossipStage internal Handles gossip requests +HintsDispatcher internal Performs hinted handoff +InternalResponseStage internal Responsible for intra-cluster callbacks +MemtableFlushWriter internal Writes memtables to disk +MemtablePostFlush internal Cleans up commit log after memtable is written to disk +MemtableReclaimMemory internal Memtable recycling +MigrationStage internal Runs schema migrations +MiscStage internal Miscellaneous tasks run here +PendingRangeCalculator internal Calculates token range +PerDiskMemtableFlushWriter_0 internal Responsible for writing a spec (there is one of these per disk 0-N) +Sampler internal Responsible for re-sampling the index summaries of SSTables +SecondaryIndexManagement internal Performs updates to secondary indexes +ValidationExecutor internal Performs validation compaction or scrubbing +============================ ============== =========== + +.. |nbsp| unicode:: 0xA0 .. nonbreaking space + +Client Request Metrics +^^^^^^^^^^^^^^^^^^^^^^ + +Client requests have their own set of metrics that encapsulate the work happening at coordinator level. + +Different types of client requests are broken down by ``RequestType``. 
+ +Reported name format: + +**Metric Name** + ``org.apache.cassandra.metrics.ClientRequest.<MetricName>.<RequestType>`` + +**JMX MBean** + ``org.apache.cassandra.metrics:type=ClientRequest scope=<RequestType> name=<MetricName>`` + + +:RequestType: CASRead +:Description: Metrics related to transactional read requests. +:Metrics: + ===================== ============== ============================================================= + Name Type Description + ===================== ============== ============================================================= + Timeouts Counter Number of timeouts encountered. + Failures Counter Number of transaction failures encountered. + |nbsp| Latency Transaction read latency. + Unavailables Counter Number of unavailable exceptions encountered. + UnfinishedCommit Counter Number of transactions that were committed on read. + ConditionNotMet Counter Number of transaction preconditions did not match current values. + ContentionHistogram Histogram How many contended reads were encountered + ===================== ============== ============================================================= + +:RequestType: CASWrite +:Description: Metrics related to transactional write requests. +:Metrics: + ===================== ============== ============================================================= + Name Type Description + ===================== ============== ============================================================= + Timeouts Counter Number of timeouts encountered. + Failures Counter Number of transaction failures encountered. + |nbsp| Latency Transaction write latency. + UnfinishedCommit Counter Number of transactions that were committed on write. + ConditionNotMet Counter Number of transaction preconditions did not match current values. 
+ ContentionHistogram Histogram How many contended writes were encountered + ===================== ============== ============================================================= + + +:RequestType: Read +:Description: Metrics related to standard read requests. +:Metrics: + ===================== ============== ============================================================= + Name Type Description + ===================== ============== ============================================================= + Timeouts Counter Number of timeouts encountered. + Failures Counter Number of read failures encountered. + |nbsp| Latency Read latency. + Unavailables Counter Number of unavailable exceptions encountered. + ===================== ============== ============================================================= + +:RequestType: RangeSlice +:Description: Metrics related to token range read requests. +:Metrics: + ===================== ============== ============================================================= + Name Type Description + ===================== ============== ============================================================= + Timeouts Counter Number of timeouts encountered. + Failures Counter Number of range query failures encountered. + |nbsp| Latency Range query latency. + Unavailables Counter Number of unavailable exceptions encountered. + ===================== ============== ============================================================= + +:RequestType: Write +:Description: Metrics related to regular write requests. +:Metrics: + ===================== ============== ============================================================= + Name Type Description + ===================== ============== ============================================================= + Timeouts Counter Number of timeouts encountered. + Failures Counter Number of write failures encountered. + |nbsp| Latency Write latency. + Unavailables Counter Number of unavailable exceptions encountered. 
+ ===================== ============== ============================================================= + + +:RequestType: ViewWrite +:Description: Metrics related to materialized view writes. +:Metrics: + ===================== ============== ============================================================= + Timeouts Counter Number of timeouts encountered. + Failures Counter Number of transaction failures encountered. + Unavailables Counter Number of unavailable exceptions encountered. + ViewReplicasAttempted Counter Total number of attempted view replica writes. + ViewReplicasSuccess Counter Total number of succeeded view replica writes. + ViewPendingMutations Gauge<Long> ViewReplicasAttempted - ViewReplicasSuccess. + ViewWriteLatency Timer Time between when mutation is applied to base table and when CL.ONE is achieved on view. + ===================== ============== ============================================================= + +Cache Metrics +^^^^^^^^^^^^^ + +Cassandra caches have metrics to track the effectiveness of the caches, though the ``Table Metrics`` might be more useful. + +Reported name format: + +**Metric Name** + ``org.apache.cassandra.metrics.Cache.<MetricName>.<CacheName>`` + +**JMX MBean** + ``org.apache.cassandra.metrics:type=Cache scope=<CacheName> name=<MetricName>`` + +========================== ============== =========== +Name Type Description +========================== ============== =========== +Capacity Gauge<Long> Cache capacity in bytes. +Entries Gauge<Integer> Total number of cache entries. +FifteenMinuteCacheHitRate Gauge<Double> 15m cache hit rate. +FiveMinuteCacheHitRate Gauge<Double> 5m cache hit rate. +OneMinuteCacheHitRate Gauge<Double> 1m cache hit rate. +HitRate Gauge<Double> All time cache hit rate. +Hits Meter Total number of cache hits. +Misses Meter Total number of cache misses. +MissLatency Timer Latency of misses. +Requests Gauge<Long> Total number of cache requests. +Size Gauge<Long> Total size of occupied cache, in bytes. 
+========================== ============== =========== + +The following caches are covered: + +============================ =========== +Name Description +============================ =========== +CounterCache Keeps hot counters in memory for performance. +ChunkCache In process uncompressed page cache. +KeyCache Cache for partition to sstable offsets. +RowCache Cache for rows kept in memory. +============================ =========== + +.. NOTE:: + Misses and MissLatency are only defined for the ChunkCache + +CQL Metrics +^^^^^^^^^^^ + +Metrics specific to CQL prepared statement caching. + +Reported name format: + +**Metric Name** + ``org.apache.cassandra.metrics.CQL.<MetricName>`` + +**JMX MBean** + ``org.apache.cassandra.metrics:type=CQL name=<MetricName>`` + +========================== ============== =========== +Name Type Description +========================== ============== =========== +PreparedStatementsCount Gauge<Integer> Number of cached prepared statements. +PreparedStatementsEvicted Counter Number of prepared statements evicted from the prepared statement cache +PreparedStatementsExecuted Counter Number of prepared statements executed. +RegularStatementsExecuted Counter Number of **non** prepared statements executed. +PreparedStatementsRatio Gauge<Double> Percentage of statements that are prepared vs unprepared. +========================== ============== =========== + + +DroppedMessage Metrics +^^^^^^^^^^^^^^^^^^^^^^ + +Metrics specific to tracking dropped messages for different types of requests. 
+Dropped writes are stored and retried by ``Hinted Handoff`` + +Reported name format: + +**Metric Name** + ``org.apache.cassandra.metrics.DroppedMessages.<MetricName>.<Type>`` + +**JMX MBean** + ``org.apache.cassandra.metrics:type=DroppedMetrics scope=<Type> name=<MetricName>`` + +========================== ============== =========== +Name Type Description +========================== ============== =========== +CrossNodeDroppedLatency Timer The dropped latency across nodes. +InternalDroppedLatency Timer The dropped latency within node. +Dropped Meter Number of dropped messages. +========================== ============== =========== + +The different types of messages tracked are: + +============================ =========== +Name Description +============================ =========== +BATCH_STORE Batchlog write +BATCH_REMOVE Batchlog cleanup (after successfully applied) +COUNTER_MUTATION Counter writes +HINT Hint replay +MUTATION Regular writes +READ Regular reads +READ_REPAIR Read repair +PAGED_SLICE Paged read +RANGE_SLICE Token range read +REQUEST_RESPONSE RPC Callbacks +_TRACE Tracing writes +============================ =========== + +Streaming Metrics +^^^^^^^^^^^^^^^^^ + +Metrics reported during ``Streaming`` operations, such as repair, bootstrap, rebuild. + +These metrics are specific to a peer endpoint, with the source node being the node you are pulling the metrics from. + +Reported name format: + +**Metric Name** + ``org.apache.cassandra.metrics.Streaming.<MetricName>.<PeerIP>`` + +**JMX MBean** + ``org.apache.cassandra.metrics:type=Streaming scope=<PeerIP> name=<MetricName>`` + +========================== ============== =========== +Name Type Description +========================== ============== =========== +IncomingBytes Counter Number of bytes streamed to this node from the peer. +OutgoingBytes Counter Number of bytes streamed to the peer endpoint from this node. 
+========================== ============== =========== + + +Compaction Metrics +^^^^^^^^^^^^^^^^^^ + +Metrics specific to ``Compaction`` work. + +Reported name format: + +**Metric Name** + ``org.apache.cassandra.metrics.Compaction.<MetricName>`` + +**JMX MBean** + ``org.apache.cassandra.metrics:type=Compaction name=<MetricName>`` + +========================== ======================================== =============================================== +Name Type Description +========================== ======================================== =============================================== +BytesCompacted Counter Total number of bytes compacted since server [re]start. +PendingTasks Gauge<Integer> Estimated number of compactions remaining to perform. +CompletedTasks Gauge<Long> Number of completed compactions since server [re]start. +TotalCompactionsCompleted Meter Throughput of completed compactions since server [re]start. +PendingTasksByTableName Gauge<Map<String, Map<String, Integer>>> Estimated number of compactions remaining to perform, grouped by keyspace and then table name. This info is also kept in ``Table Metrics``. +========================== ======================================== =============================================== + +CommitLog Metrics +^^^^^^^^^^^^^^^^^ + +Metrics specific to the ``CommitLog`` + +Reported name format: + +**Metric Name** + ``org.apache.cassandra.metrics.CommitLog.<MetricName>`` + +**JMX MBean** + ``org.apache.cassandra.metrics:type=CommitLog name=<MetricName>`` + +========================== ============== =========== +Name Type Description +========================== ============== =========== +CompletedTasks Gauge<Long> Total number of commit log messages written since [re]start. +PendingTasks Gauge<Long> Number of commit log messages written but yet to be fsync'd. +TotalCommitLogSize Gauge<Long> Current size, in bytes, used by all the commit log segments. 
+WaitingOnSegmentAllocation Timer Time spent waiting for a CommitLogSegment to be allocated - under normal conditions this should be zero. +WaitingOnCommit Timer The time spent waiting on CL fsync; for Periodic this only occurs when the sync is lagging its sync interval. +========================== ============== =========== + +Storage Metrics +^^^^^^^^^^^^^^^ + +Metrics specific to the storage engine. + +Reported name format: + +**Metric Name** + ``org.apache.cassandra.metrics.Storage.<MetricName>`` + +**JMX MBean** + ``org.apache.cassandra.metrics:type=Storage name=<MetricName>`` + +========================== ============== =========== +Name Type Description +========================== ============== =========== +Exceptions Counter Number of internal exceptions caught. Under normal conditions this should be zero. +Load Counter Size, in bytes, of the on disk data size this node manages. +TotalHints Counter Number of hint messages written to this node since [re]start. Includes one entry for each host to be hinted per hint. +TotalHintsInProgress Counter Number of hints attempting to be sent currently. +========================== ============== =========== + +HintedHandoff Metrics +^^^^^^^^^^^^^^^^^^^^^ + +Metrics specific to Hinted Handoff. There are also some metrics related to hints tracked in ``Storage Metrics`` + +These metrics include the peer endpoint **in the metric name** + +Reported name format: + +**Metric Name** + ``org.apache.cassandra.metrics.HintedHandOffManager.<MetricName>`` + +**JMX MBean** + ``org.apache.cassandra.metrics:type=HintedHandOffManager name=<MetricName>`` + +=========================== ============== =========== +Name Type Description +=========================== ============== =========== +Hints_created-<PeerIP> Counter Number of hints on disk for this peer. +Hints_not_stored-<PeerIP> Counter Number of hints not stored for this peer, due to being down past the configured hint window. 
+=========================== ============== =========== + +SSTable Index Metrics +^^^^^^^^^^^^^^^^^^^^^ + +Metrics specific to the SSTable index metadata. + +Reported name format: + +**Metric Name** + ``org.apache.cassandra.metrics.Index.<MetricName>.RowIndexEntry`` + +**JMX MBean** + ``org.apache.cassandra.metrics:type=Index scope=RowIndexEntry name=<MetricName>`` + +=========================== ============== =========== +Name Type Description +=========================== ============== =========== +IndexedEntrySize Histogram Histogram of the on-heap size, in bytes, of the index across all SSTables. +IndexInfoCount Histogram Histogram of the number of on-heap index entries managed across all SSTables. +IndexInfoGets Histogram Histogram of the number of index seeks performed per SSTable. +=========================== ============== =========== + +BufferPool Metrics +^^^^^^^^^^^^^^^^^^ + +Metrics specific to the internal recycled buffer pool Cassandra manages. This pool is meant to keep allocations and GC +lower by recycling on and off heap buffers. + +Reported name format: + +**Metric Name** + ``org.apache.cassandra.metrics.BufferPool.<MetricName>`` + +**JMX MBean** + ``org.apache.cassandra.metrics:type=BufferPool name=<MetricName>`` + +=========================== ============== =========== +Name Type Description +=========================== ============== =========== +Size Gauge<Long> Size, in bytes, of the managed buffer pool +Misses Meter The rate of misses in the pool. The higher this is the more allocations incurred. +=========================== ============== =========== + + +Client Metrics +^^^^^^^^^^^^^^ + +Metrics specific to client management. 
+ +Reported name format: + +**Metric Name** + ``org.apache.cassandra.metrics.Client.<MetricName>`` + +**JMX MBean** + ``org.apache.cassandra.metrics:type=Client name=<MetricName>`` + +=========================== ============== =========== +Name Type Description +=========================== ============== =========== +connectedNativeClients Counter Number of clients connected to this nodes native protocol server +connectedThriftClients Counter Number of clients connected to this nodes thrift protocol server +=========================== ============== =========== + +JVM Metrics +^^^^^^^^^^^ + +JVM metrics such as memory and garbage collection statistics can either be accessed by connecting to the JVM using JMX or can be exported using `Metric Reporters`_. + +BufferPool +++++++++++ + +**Metric Name** + ``jvm.buffers.<direct|mapped>.<MetricName>`` + +**JMX MBean** + ``java.nio:type=BufferPool name=<direct|mapped>`` + +========================== ============== =========== +Name Type Description +========================== ============== =========== +Capacity Gauge<Long> Estimated total capacity of the buffers in this pool +Count Gauge<Long> Estimated number of buffers in the pool +Used Gauge<Long> Estimated memory that the Java virtual machine is using for this buffer pool +========================== ============== =========== + +FileDescriptorRatio ++++++++++++++++++++ + +**Metric Name** + ``jvm.fd.<MetricName>`` + +**JMX MBean** + ``java.lang:type=OperatingSystem name=<OpenFileDescriptorCount|MaxFileDescriptorCount>`` + +========================== ============== =========== +Name Type Description +========================== ============== =========== +Usage Ratio Ratio of used to total file descriptors +========================== ============== =========== + +GarbageCollector +++++++++++++++++ + +**Metric Name** + ``jvm.gc.<gc_type>.<MetricName>`` + +**JMX MBean** + ``java.lang:type=GarbageCollector name=<gc_type>`` + +========================== ============== 
=========== +Name Type Description +========================== ============== =========== +Count Gauge<Long> Total number of collections that have occurred +Time Gauge<Long> Approximate accumulated collection elapsed time in milliseconds +========================== ============== =========== + +Memory +++++++ + +**Metric Name** + ``jvm.memory.<heap/non-heap/total>.<MetricName>`` + +**JMX MBean** + ``java.lang:type=Memory`` + +========================== ============== =========== +Committed Gauge<Long> Amount of memory in bytes that is committed for the JVM to use +Init Gauge<Long> Amount of memory in bytes that the JVM initially requests from the OS +Max Gauge<Long> Maximum amount of memory in bytes that can be used for memory management +Usage Ratio Ratio of used to maximum memory +Used Gauge<Long> Amount of used memory in bytes +========================== ============== =========== + +MemoryPool +++++++++++ + +**Metric Name** + ``jvm.memory.pools.<memory_pool>.<MetricName>`` + +**JMX MBean** + ``java.lang:type=MemoryPool name=<memory_pool>`` + +========================== ============== =========== +Committed Gauge<Long> Amount of memory in bytes that is committed for the JVM to use +Init Gauge<Long> Amount of memory in bytes that the JVM initially requests from the OS +Max Gauge<Long> Maximum amount of memory in bytes that can be used for memory management +Usage Ratio Ratio of used to maximum memory +Used Gauge<Long> Amount of used memory in bytes +========================== ============== =========== + +JMX +^^^ + +Any JMX based client can access metrics from cassandra. + +If you wish to access JMX metrics over http it's possible to download `Mx4jTool <http://mx4j.sourceforge.net/>`__ and +place ``mx4j-tools.jar`` into the classpath. 
On startup you will see in the log:: + + HttpAdaptor version 3.0.2 started on port 8081 + +To choose a different port (8081 is the default) or a different listen address (0.0.0.0 is not the default) edit +``conf/cassandra-env.sh`` and uncomment:: + + #MX4J_ADDRESS="-Dmx4jaddress=0.0.0.0" + + #MX4J_PORT="-Dmx4jport=8081" + + +Metric Reporters +^^^^^^^^^^^^^^^^ + +As mentioned at the top of this section on monitoring, the Cassandra metrics can be exported to a number of monitoring +systems using a number of `built in <http://metrics.dropwizard.io/3.1.0/getting-started/#other-reporting>`__ and `third party +<http://metrics.dropwizard.io/3.1.0/manual/third-party/>`__ reporter plugins. + +The configuration of these plugins is managed by the `metrics reporter config project +<https://github.com/addthis/metrics-reporter-config>`__. There is a sample configuration file located at +``conf/metrics-reporter-config-sample.yaml``. + +Once configured, you simply start cassandra with the flag +``-Dcassandra.metricsReporterConfigFile=metrics-reporter-config.yaml``. The specified .yaml file plus any 3rd party +reporter jars must all be in Cassandra's classpath. 
http://git-wip-us.apache.org/repos/asf/cassandra/blob/5c9db9af/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cassandra/blob/5c9db9af/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java ---------------------------------------------------------------------- diff --cc src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java index 1358f29,72b4465..47c426e --- a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java +++ b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java @@@ -39,9 -36,9 +39,10 @@@ import org.apache.cassandra.db.lifecycl import org.apache.cassandra.db.filter.*; import org.apache.cassandra.db.partitions.*; import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.db.transform.Transformation; import org.apache.cassandra.exceptions.RequestExecutionException; import org.apache.cassandra.io.sstable.format.SSTableReader; + import org.apache.cassandra.io.sstable.format.SSTableReadsListener; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.metrics.TableMetrics; @@@ -570,9 -553,11 +571,10 @@@ public class SinglePartitionReadComman * in one pass, and minimize the number of sstables for which we read a partition tombstone. 
*/ Collections.sort(view.sstables, SSTableReader.maxTimestampComparator); - List<SSTableReader> skippedSSTables = null; long mostRecentPartitionTombstone = Long.MIN_VALUE; - long minTimestamp = Long.MAX_VALUE; int nonIntersectingSSTables = 0; + List<SSTableReader> skippedSSTablesWithTombstones = null; + SSTableReadMetricsCollector metricsCollector = new SSTableReadMetricsCollector(); for (SSTableReader sstable : view.sstables) { @@@ -594,11 -580,12 +596,11 @@@ continue; } - @SuppressWarnings("resource") // 'iter' is added to iterators which is closed on exception, or through the closing of the final merged iterator - UnfilteredRowIterator iter = filter.filter(sstable.iterator(partitionKey(), - columnFilter(), - filter.isReversed(), - isForThrift(), - metricsCollector)); + minTimestamp = Math.min(minTimestamp, sstable.getMinTimestamp()); + + @SuppressWarnings("resource") // 'iter' is added to iterators which is closed on exception, + // or through the closing of the final merged iterator - UnfilteredRowIteratorWithLowerBound iter = makeIterator(cfs, sstable, true); ++ UnfilteredRowIteratorWithLowerBound iter = makeIterator(cfs, sstable, true, metricsCollector); if (!sstable.isRepaired()) oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, sstable.getMinLocalDeletionTime()); @@@ -616,14 -602,24 +618,14 @@@ if (sstable.getMaxTimestamp() <= minTimestamp) continue; - @SuppressWarnings("resource") // 'iter' is either closed right away, or added to iterators which is close on exception, or through the closing of the final merged iterator - UnfilteredRowIterator iter = filter.filter(sstable.iterator(partitionKey(), - columnFilter(), - filter.isReversed(), - isForThrift(), - metricsCollector)); + @SuppressWarnings("resource") // 'iter' is added to iterators which is close on exception, + // or through the closing of the final merged iterator - UnfilteredRowIteratorWithLowerBound iter = makeIterator(cfs, sstable, false); ++ UnfilteredRowIteratorWithLowerBound 
iter = makeIterator(cfs, sstable, false, metricsCollector); + if (!sstable.isRepaired()) + oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, sstable.getMinLocalDeletionTime()); - if (iter.partitionLevelDeletion().markedForDeleteAt() > minTimestamp) - { - iterators.add(iter); - if (!sstable.isRepaired()) - oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, sstable.getMinLocalDeletionTime()); - includedDueToTombstones++; - } - else - { - iter.close(); - } + iterators.add(iter); + includedDueToTombstones++; } } if (Tracing.isTracing()) @@@ -633,8 -631,17 +635,8 @@@ if (iterators.isEmpty()) return EmptyIterators.unfilteredRow(cfs.metadata, partitionKey(), filter.isReversed()); - Tracing.trace("Merging data from memtables and {} sstables", metricsCollector.getMergedSSTables()); - - @SuppressWarnings("resource") // Closed through the closing of the result of that method. - UnfilteredRowIterator merged = UnfilteredRowIterators.merge(iterators, nowInSec()); - if (!merged.isEmpty()) - { - DecoratedKey key = merged.partitionKey(); - cfs.metric.samplers.get(TableMetrics.Sampler.READS).addSample(key.getKey(), key.hashCode(), 1); - } - - return merged; + StorageHook.instance.reportRead(cfs.metadata.cfId, partitionKey()); - return withSSTablesIterated(iterators, cfs.metric); ++ return withSSTablesIterated(iterators, cfs.metric, metricsCollector); } catch (RuntimeException | Error e) { @@@ -661,52 -668,6 +663,53 @@@ return clusteringIndexFilter().shouldInclude(sstable); } - private UnfilteredRowIteratorWithLowerBound makeIterator(ColumnFamilyStore cfs, final SSTableReader sstable, boolean applyThriftTransformation) ++ private UnfilteredRowIteratorWithLowerBound makeIterator(ColumnFamilyStore cfs, ++ final SSTableReader sstable, ++ boolean applyThriftTransformation, ++ SSTableReadsListener listener) + { + return StorageHook.instance.makeRowIteratorWithLowerBound(cfs, + partitionKey(), + sstable, + clusteringIndexFilter(), + columnFilter(), + 
isForThrift(), + nowInSec(), - applyThriftTransformation); ++ applyThriftTransformation, ++ listener); + + } + + /** + * Return a wrapped iterator that when closed will update the sstables iterated and READ sample metrics. + * Note that we cannot use the Transformations framework because they greedily get the static row, which + * would cause all iterators to be initialized and hence all sstables to be accessed. + */ + private UnfilteredRowIterator withSSTablesIterated(List<UnfilteredRowIterator> iterators, - TableMetrics metrics) ++ TableMetrics metrics, ++ SSTableReadMetricsCollector metricsCollector) + { + @SuppressWarnings("resource") // Closed through the closing of the result of the caller method. + UnfilteredRowIterator merged = UnfilteredRowIterators.merge(iterators, nowInSec()); + + if (!merged.isEmpty()) + { + DecoratedKey key = merged.partitionKey(); + metrics.samplers.get(TableMetrics.Sampler.READS).addSample(key.getKey(), key.hashCode(), 1); + } + + class UpdateSstablesIterated extends Transformation + { + public void onPartitionClose() + { - int sstablesIterated = (int)iterators.stream() - .filter(it -> it instanceof LazilyInitializedUnfilteredRowIterator) - .filter(it -> ((LazilyInitializedUnfilteredRowIterator)it).initialized()) - .count(); - - metrics.updateSSTableIterated(sstablesIterated); - Tracing.trace("Merged data from memtables and {} sstables", sstablesIterated); ++ int mergedSSTablesIterated = metricsCollector.getMergedSSTables(); ++ metrics.updateSSTableIterated(mergedSSTablesIterated); ++ Tracing.trace("Merged data from memtables and {} sstables", mergedSSTablesIterated); + } + }; + return Transformation.apply(merged, new UpdateSstablesIterated()); + } + private boolean queriesMulticellType() { for (ColumnDefinition column : columnFilter().fetchedColumns()) @@@ -773,14 -737,16 +776,19 @@@ // however: if it is set, it impacts everything and must be included. 
Getting that top-level partition deletion costs us // some seek in general however (unless the partition is indexed and is in the key cache), so we first check if the sstable // has any tombstone at all as a shortcut. - if (!sstable.hasTombstones()) - continue; // Means no tombstone at all, we can skip that sstable + if (!sstable.mayHaveTombstones()) + continue; // no tombstone at all, we can skip that sstable // We need to get the partition deletion and include it if it's live. In any case though, we're done with that sstable. - sstable.incrementReadCount(); - try (UnfilteredRowIterator iter = StorageHook.instance.makeRowIterator(cfs, sstable, partitionKey(), filter.getSlices(metadata()), columnFilter(), filter.isReversed(), isForThrift())) - try (UnfilteredRowIterator iter = filter.filter(sstable.iterator(partitionKey(), - columnFilter(), - filter.isReversed(), - isForThrift(), - metricsCollector))) ++ try (UnfilteredRowIterator iter = StorageHook.instance.makeRowIterator(cfs, ++ sstable, ++ partitionKey(), ++ filter.getSlices(metadata()), ++ columnFilter(), ++ filter.isReversed(), ++ isForThrift(), ++ metricsCollector)) { - sstablesIterated++; if (!iter.partitionLevelDeletion().isLive()) result = add(UnfilteredRowIterators.noRowsIterator(iter.metadata(), iter.partitionKey(), Rows.EMPTY_STATIC_ROW, iter.partitionLevelDeletion(), filter.isReversed()), result, filter, sstable.isRepaired()); else @@@ -789,9 -755,12 +797,14 @@@ continue; } -- Tracing.trace("Merging data from sstable {}", sstable.descriptor.generation); - sstable.incrementReadCount(); - try (UnfilteredRowIterator iter = StorageHook.instance.makeRowIterator(cfs, sstable, partitionKey(), filter.getSlices(metadata()), columnFilter(), filter.isReversed(), isForThrift())) - try (UnfilteredRowIterator iter = filter.filter(sstable.iterator(partitionKey(), - columnFilter(), - filter.isReversed(), - isForThrift(), - metricsCollector))) ++ try (UnfilteredRowIterator iter = 
StorageHook.instance.makeRowIterator(cfs, ++ sstable, ++ partitionKey(), ++ filter.getSlices(metadata()), ++ columnFilter(), ++ filter.isReversed(), ++ isForThrift(), ++ metricsCollector)) { if (iter.isEmpty()) continue; @@@ -810,10 -778,9 +822,10 @@@ DecoratedKey key = result.partitionKey(); cfs.metric.samplers.get(TableMetrics.Sampler.READS).addSample(key.getKey(), key.hashCode(), 1); + StorageHook.instance.reportRead(cfs.metadata.cfId, partitionKey()); // "hoist up" the requested data into a more recent sstable - if (sstablesIterated > cfs.getMinimumCompactionThreshold() + if (metricsCollector.getMergedSSTables() > cfs.getMinimumCompactionThreshold() && onlyUnrepaired && !cfs.isAutoCompactionDisabled() && cfs.getCompactionStrategyManager().shouldDefragment()) http://git-wip-us.apache.org/repos/asf/cassandra/blob/5c9db9af/src/java/org/apache/cassandra/db/StorageHook.java ---------------------------------------------------------------------- diff --cc src/java/org/apache/cassandra/db/StorageHook.java index 0f27adb,0000000..48d7ede mode 100644,000000..100644 --- a/src/java/org/apache/cassandra/db/StorageHook.java +++ b/src/java/org/apache/cassandra/db/StorageHook.java @@@ -1,86 -1,0 +1,104 @@@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db; + +import java.util.UUID; + +import org.apache.cassandra.db.filter.ClusteringIndexFilter; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIteratorWithLowerBound; +import org.apache.cassandra.io.sstable.format.SSTableReader; ++import org.apache.cassandra.io.sstable.format.SSTableReadsListener; +import org.apache.cassandra.utils.FBUtilities; + +public interface StorageHook +{ + public static final StorageHook instance = createHook(); + + public void reportWrite(UUID cfid, PartitionUpdate partitionUpdate); + public void reportRead(UUID cfid, DecoratedKey key); + public UnfilteredRowIteratorWithLowerBound makeRowIteratorWithLowerBound(ColumnFamilyStore cfs, + DecoratedKey partitionKey, + SSTableReader sstable, + ClusteringIndexFilter filter, + ColumnFilter selectedColumns, + boolean isForThrift, + int nowInSec, - boolean applyThriftTransformation); ++ boolean applyThriftTransformation, ++ SSTableReadsListener listener); ++ + public UnfilteredRowIterator makeRowIterator(ColumnFamilyStore cfs, + SSTableReader sstable, + DecoratedKey key, + Slices slices, + ColumnFilter selectedColumns, + boolean reversed, - boolean isForThrift); ++ boolean isForThrift, ++ SSTableReadsListener listener); + + static StorageHook createHook() + { + String className = System.getProperty("cassandra.storage_hook"); + if (className != null) + { + return FBUtilities.construct(className, StorageHook.class.getSimpleName()); + } - else ++ ++ return new StorageHook() + { - return new StorageHook() - { - public void reportWrite(UUID cfid, PartitionUpdate partitionUpdate) {} ++ public void reportWrite(UUID cfid, PartitionUpdate partitionUpdate) {} + - public void reportRead(UUID cfid, 
DecoratedKey key) {} ++ public void reportRead(UUID cfid, DecoratedKey key) {} + - public UnfilteredRowIteratorWithLowerBound makeRowIteratorWithLowerBound(ColumnFamilyStore cfs, DecoratedKey partitionKey, SSTableReader sstable, ClusteringIndexFilter filter, ColumnFilter selectedColumns, boolean isForThrift, int nowInSec, boolean applyThriftTransformation) - { - return new UnfilteredRowIteratorWithLowerBound(partitionKey, - sstable, - filter, - selectedColumns, - isForThrift, - nowInSec, - applyThriftTransformation); - } ++ public UnfilteredRowIteratorWithLowerBound makeRowIteratorWithLowerBound(ColumnFamilyStore cfs, ++ DecoratedKey partitionKey, ++ SSTableReader sstable, ++ ClusteringIndexFilter filter, ++ ColumnFilter selectedColumns, ++ boolean isForThrift, ++ int nowInSec, ++ boolean applyThriftTransformation, ++ SSTableReadsListener listener) ++ { ++ return new UnfilteredRowIteratorWithLowerBound(partitionKey, ++ sstable, ++ filter, ++ selectedColumns, ++ isForThrift, ++ nowInSec, ++ applyThriftTransformation, ++ listener); ++ } + - public UnfilteredRowIterator makeRowIterator(ColumnFamilyStore cfs, SSTableReader sstable, DecoratedKey key, Slices slices, ColumnFilter selectedColumns, boolean reversed, boolean isForThrift) - { - return sstable.iterator(key, slices, selectedColumns, reversed, isForThrift); - } - }; - } ++ public UnfilteredRowIterator makeRowIterator(ColumnFamilyStore cfs, ++ SSTableReader sstable, ++ DecoratedKey key, ++ Slices slices, ++ ColumnFilter selectedColumns, ++ boolean reversed, ++ boolean isForThrift, ++ SSTableReadsListener listener) ++ { ++ return sstable.iterator(key, slices, selectedColumns, reversed, isForThrift, listener); ++ } ++ }; + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@cassandra.apache.org For additional commands, e-mail: commits-h...@cassandra.apache.org