[ https://issues.apache.org/jira/browse/CASSANDRA-14145?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16618325#comment-16618325 ]

ASF GitHub Bot commented on CASSANDRA-14145:
--------------------------------------------

Github user jrwest commented on a diff in the pull request:

    https://github.com/apache/cassandra-dtest/pull/37#discussion_r218264028
  
    --- Diff: repair_tests/incremental_repair_test.py ---
    @@ -918,3 +931,196 @@ def test_subrange(self):
             self.assertRepairedAndUnrepaired(node1, 'ks')
             self.assertRepairedAndUnrepaired(node2, 'ks')
             self.assertRepairedAndUnrepaired(node3, 'ks')
    +
    +    @since('4.0')
    +    def test_repaired_tracking_with_partition_deletes(self):
    +        """
    +        check that when an tracking repaired data status following a 
digest mismatch,
    +        repaired data mismatches are marked as unconfirmed as we may skip 
sstables
    +        after the partition delete are encountered.
    +        @jira_ticket CASSANDRA-14145
    +        """
    +        session, node1, node2 = self.setup_for_repaired_data_tracking()
    +        stmt = SimpleStatement("INSERT INTO ks.tbl (k, c, v) VALUES (%s, 
%s, %s)")
    +        stmt.consistency_level = ConsistencyLevel.ALL
    +        for i in range(10):
    +            session.execute(stmt, (i, i, i))
    +
    +        for node in self.cluster.nodelist():
    +            node.flush()
    +            self.assertNoRepairedSSTables(node, 'ks')
    +
    +        node1.repair(options=['ks'])
    +        node2.stop(wait_other_notice=True)
    +
    +        session.execute("delete from ks.tbl where k = 5")
    +
    +        node1.flush()
    +        node2.start(wait_other_notice=True)
    +
    +        # expect unconfirmed inconsistencies as the partition deletes cause some sstables to be skipped
    +        with JolokiaAgent(node1) as jmx:
    +            self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl WHERE k = 5",
    +                                                     expect_unconfirmed_inconsistencies=True)
    +            self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl WHERE k = 5 AND c = 5",
    +                                                     expect_unconfirmed_inconsistencies=True)
    +            # no digest reads for range queries so blocking read repair metric isn't incremented
    +            # *all* sstables are read for partition ranges too, and as the repaired set is still in sync there should
    +            # be no inconsistencies
    +            self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl", expect_read_repair=False)
    +
    +    @since('4.0')
    +    def test_repaired_tracking_with_varying_sstable_sets(self):
    +        """
    +        verify that repaired data digests are computed over the merged 
data for each replica
    +        and that the particular number of sstables on each doesn't affect 
the comparisons
    +        both replicas start with the same repaired set, comprising 2 
sstables. node1's is
    +        then compacted and additional unrepaired data added (which 
overwrites some in the
    +        repaired set). We expect the repaired digests to still match as 
the tracking will
    +        force all sstables containing the partitions to be read
    +        there are two variants of this, for single partition slice & names 
reads and range reads
    +        @jira_ticket CASSANDRA-14145
    +        """
    +        session, node1, node2 = self.setup_for_repaired_data_tracking()
    +        stmt = SimpleStatement("INSERT INTO ks.tbl (k, c, v) VALUES (%s, 
%s, %s)")
    +        stmt.consistency_level = ConsistencyLevel.ALL
    +        for i in range(10):
    +            session.execute(stmt, (i, i, i))
    +
    +        for node in self.cluster.nodelist():
    +            node.flush()
    +
    +        for i in range(10,20):
    +            session.execute(stmt, (i, i, i))
    +
    +        for node in self.cluster.nodelist():
    +            node.flush()
    +            self.assertNoRepairedSSTables(node, 'ks')
    +
    +        node1.repair(options=['ks'])
    +        node2.stop(wait_other_notice=True)
    +
    +        session.execute("insert into ks.tbl (k, c, v) values (5, 5, 55)")
    +        session.execute("insert into ks.tbl (k, c, v) values (15, 15, 
155)")
    +        node1.flush()
    +        node1.compact()
    +        node1.compact()
    +        node2.start(wait_other_notice=True)
    +
    +        # we don't expect any inconsistencies as all repaired data is read on both replicas
    +        with JolokiaAgent(node1) as jmx:
    +            self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl WHERE k = 5")
    +            self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl WHERE k = 5 AND c = 5")
    +            # no digest reads for range queries so read repair metric isn't incremented
    +            self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl", expect_read_repair=False)
    +
    +    @since('4.0')
    +    def test_repaired_tracking_with_mismatching_replicas(self):
    +        """
    +        there are two variants of this, for single partition slice & names 
reads and range reads
    +        @jira_ticket CASSANDRA-14145
    +        """
    +        session, node1, node2 = self.setup_for_repaired_data_tracking()
    +        stmt = SimpleStatement("INSERT INTO ks.tbl (k, c, v) VALUES (%s, 
%s, %s)")
    +        stmt.consistency_level = ConsistencyLevel.ALL
    +        for i in range(10):
    +            session.execute(stmt, (i, i, i))
    +
    +        for node in self.cluster.nodelist():
    +            node.flush()
    +
    +        for i in range(10,20):
    +            session.execute(stmt, (i, i, i))
    +
    +        for node in self.cluster.nodelist():
    +            node.flush()
    +            self.assertNoRepairedSSTables(node, 'ks')
    +
    +        # stop node 2 and mark its sstables repaired
    +        node2.stop(wait_other_notice=True)
    +        node2.run_sstablerepairedset(keyspace='ks')
    +        # before restarting node2 overwrite some data on node1 to trigger digest mismatches
    +        session.execute("insert into ks.tbl (k, c, v) values (5, 5, 55)")
    +        node2.start(wait_for_binary_proto=True)
    +
    +        out1 = node1.run_sstablemetadata(keyspace='ks').stdout
    +        out2 = node2.run_sstablemetadata(keyspace='ks').stdout
    +
    +        # verify the repaired at times for the sstables on node1/node2
    +        assert all(t == 0 for t in [int(x) for x in [y.split(' ')[0] for y in findall('(?<=Repaired at: ).*', out1)]])
    +        assert all(t > 0 for t in [int(x) for x in [y.split(' ')[0] for y in findall('(?<=Repaired at: ).*', out2)]])
    +
    +        # we expect inconsistencies due to sstables being marked repaired on one replica only
    +        # these are marked confirmed because no sessions are pending & all sstables are
    +        # skipped due to partition deletes
    +        with JolokiaAgent(node1) as jmx:
    +            self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl WHERE k = 5",
    +                                                     expect_confirmed_inconsistencies=True)
    +            self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl WHERE k = 5 AND c = 5",
    +                                                     expect_confirmed_inconsistencies=True)
    +            # no digest reads for range queries so read repair metric isn't incremented
    +            self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl", expect_read_repair=False)
    +
    +    def setup_for_repaired_data_tracking(self):
    +        self.fixture_dtest_setup.setup_overrides.cluster_options = ImmutableMapping({'hinted_handoff_enabled': 'false',
    +                                                                                     'num_tokens': 1,
    +                                                                                     'commitlog_sync_period_in_ms': 500})
    +        self.fixture_dtest_setup.init_default_config()
    +        self.cluster.populate(2)
    +        node1, node2 = self.cluster.nodelist()
    +        remove_perf_disable_shared_mem(node1)  # necessary for jmx
    +        self.cluster.start()
    +
    +        session = self.patient_exclusive_cql_connection(node1)
    +        session.execute("CREATE KEYSPACE ks WITH 
REPLICATION={'class':'SimpleStrategy', 'replication_factor': 2}")
    +        session.execute("CREATE TABLE ks.tbl (k INT, c INT, v INT, PRIMARY 
KEY (k,c)) with read_repair='NONE'")
    +        return session, node1, node2
    +
    +    def query_and_check_repaired_mismatches(self, jmx, session, query,
    +                                            expect_read_repair=True,
    +                                            expect_unconfirmed_inconsistencies=False,
    +                                            expect_confirmed_inconsistencies=False):
    +
    +        rr_count = make_mbean('metrics', type='ReadRepair', name='ReconcileRead')
    +        unconfirmed_count = make_mbean('metrics', type='Table,keyspace=ks', name='RepairedDataInconsistenciesUnconfirmed,scope=tbl')
    +        confirmed_count = make_mbean('metrics', type='Table,keyspace=ks', name='RepairedDataInconsistenciesConfirmed,scope=tbl')
    +
    +        rr_before = self.get_attribute_count(jmx, rr_count)
    +        uc_before = self.get_attribute_count(jmx, unconfirmed_count)
    +        cc_before = self.get_attribute_count(jmx, confirmed_count)
    +
    +        stmt = SimpleStatement(query)
    +        stmt.consistency_level = ConsistencyLevel.ALL
    +        session.execute(stmt)
    +
    +        rr_after = self.get_attribute_count(jmx, rr_count)
    +        uc_after = self.get_attribute_count(jmx, unconfirmed_count)
    +        cc_after = self.get_attribute_count(jmx, confirmed_count)
    +
    +        logger.debug("RR: {before}, {after}".format(before=rr_before, 
after=rr_after))
    +        logger.debug("UI: {before}, {after}".format(before=uc_before, 
after=uc_after))
    --- End diff ---
    
    I find "UI" and "CI" a bit confusing here given the other variable names. 
Consider changing it to complete words. 
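    
    e.g. something along these lines (just a sketch using the existing uc_*/cc_* variables; the exact labels are up to you):
    
        logger.debug("Unconfirmed inconsistencies: {before}, {after}".format(before=uc_before, after=uc_after))
        logger.debug("Confirmed inconsistencies: {before}, {after}".format(before=cc_before, after=cc_after))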


>  Detecting data resurrection during read
> ----------------------------------------
>
>                 Key: CASSANDRA-14145
>                 URL: https://issues.apache.org/jira/browse/CASSANDRA-14145
>             Project: Cassandra
>          Issue Type: Improvement
>            Reporter: sankalp kohli
>            Assignee: Sam Tunnicliffe
>            Priority: Minor
>              Labels: pull-request-available
>             Fix For: 4.0
>
>
> We have seen several bugs in which deleted data gets resurrected. We should 
> try to see if we can detect this on the read path and possibly fix it. Here 
> are a few examples which brought back data:
> A replica lost an sstable on startup, which caused it to lose the tombstone 
> but not the data. The tombstone was past gc grace, which means the data 
> could be resurrected. We can detect such invalid states by looking at other 
> replicas. 
> If we are running incremental repair, Cassandra keeps repaired and 
> non-repaired data separate. Every time incremental repair runs, it moves 
> data from the non-repaired set to the repaired set. Repaired data across all 
> replicas should be 100% consistent. 
> Here is an example of how we can detect and mitigate the issue in most cases. 
> Say we have 3 machines, A, B and C. All these machines will have data split 
> between repaired and non-repaired. 
> 1. Machine A, due to some bug, brings back data D. This data D is in the 
> repaired dataset. All other replicas will have data D and tombstone T. 
> 2. A read for data D comes from the application and involves replicas A and 
> B. The data being read is in the repaired state. A will respond to the 
> coordinator with data D and B will send nothing, as the tombstone is past gc 
> grace. This will cause a digest mismatch. 
> 3. This patch only kicks in when there is a digest mismatch. The coordinator 
> will ask both replicas to send back all data like we do today, but with this 
> patch each replica will also indicate whether the data it returns comes from 
> the repaired or non-repaired set. If the data coming from the repaired set 
> does not match, we know something is wrong. At this point the coordinator 
> cannot determine whether replica A has resurrected some data or replica B 
> has lost some data, but we can still log an error saying we hit an invalid 
> state.
> 4. Besides logging, we can take this further and even correct the response 
> to the query. After logging the invalid state, we can ask replicas A and B 
> (and also C if alive) to send back all data for this read, including gcable 
> tombstones. If any machine returns a tombstone which is newer than this 
> data, we know we cannot return this data. This way we can avoid returning 
> data which has been deleted. 
> Some challenges with this: 
> 1. When data is moved from non-repaired to repaired there could be a race. 
> We can look at which incremental repairs have promoted data on which replica 
> to avoid false positives.  
> 2. If the third replica is down and the live replica does not have any 
> tombstone, we won't be able to break the tie in deciding whether data was 
> actually deleted or resurrected. 
> 3. If the read is for the latest data only, we won't be able to detect it as 
> the read will be served from non-repaired data. 
> 4. If the replica where we lose a tombstone is the last replica to compact 
> the tombstone, we won't be able to decide whether data came back or the rest 
> of the replicas lost that data. But we will still detect that something is 
> wrong. 
> 5. We won't affect 99.9% of read queries as we only do extra work on a 
> digest mismatch.
> 6. CL.ONE reads will not be able to detect this. 
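
A rough sketch of the kind of coordinator-side check step 3 describes (hypothetical names and shapes only, not the actual Cassandra read path; the real feature surfaces through the RepairedDataInconsistenciesConfirmed/Unconfirmed metrics the dtests above assert on):

    from collections import namedtuple

    # Hypothetical response shape: which replica answered and the digest it
    # computed over the repaired portion of the data it read (None if the
    # replica could not track repaired data for this read).
    ReplicaResponse = namedtuple('ReplicaResponse', ['replica', 'repaired_digest'])

    def classify_repaired_data_mismatch(responses, pending_sessions, skipped_repaired_sstables):
        """Return None if the repaired digests agree, else 'confirmed' or 'unconfirmed'."""
        digests = [r.repaired_digest for r in responses if r.repaired_digest is not None]
        if len(set(digests)) <= 1:
            return None  # repaired sets agree, or tracking wasn't possible
        # a mismatch is only 'confirmed' when nothing could legitimately have made
        # the repaired sets diverge at read time (no pending repair sessions, no
        # repaired sstables skipped, e.g. after a partition delete)
        if pending_sessions or skipped_repaired_sstables:
            return 'unconfirmed'
        return 'confirmed'

    # e.g. the scenario from steps 1-3: replica A returns resurrected data D
    print(classify_repaired_data_mismatch([ReplicaResponse('A', 0xABCD), ReplicaResponse('B', 0x1234)],
                                          pending_sessions=False,
                                          skipped_repaired_sstables=False))  # -> 'confirmed'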


