[ https://issues.apache.org/jira/browse/CASSANDRA-14145?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16618668#comment-16618668 ]

ASF GitHub Bot commented on CASSANDRA-14145:
--------------------------------------------

Github user beobal commented on a diff in the pull request:

    https://github.com/apache/cassandra-dtest/pull/37#discussion_r218336047
  
    --- Diff: repair_tests/incremental_repair_test.py ---
    @@ -918,3 +931,196 @@ def test_subrange(self):
             self.assertRepairedAndUnrepaired(node1, 'ks')
             self.assertRepairedAndUnrepaired(node2, 'ks')
             self.assertRepairedAndUnrepaired(node3, 'ks')
    +
    +    @since('4.0')
    +    def test_repaired_tracking_with_partition_deletes(self):
    +        """
    +        check that when tracking repaired data status following a digest mismatch,
    +        repaired data mismatches are marked as unconfirmed, as we may skip sstables
    +        after a partition delete is encountered.
    +        @jira_ticket CASSANDRA-14145
    +        """
    +        session, node1, node2 = self.setup_for_repaired_data_tracking()
    +        stmt = SimpleStatement("INSERT INTO ks.tbl (k, c, v) VALUES (%s, %s, %s)")
    +        stmt.consistency_level = ConsistencyLevel.ALL
    +        for i in range(10):
    +            session.execute(stmt, (i, i, i))
    +
    +        for node in self.cluster.nodelist():
    +            node.flush()
    +            self.assertNoRepairedSSTables(node, 'ks')
    +
    +        node1.repair(options=['ks'])
    +        node2.stop(wait_other_notice=True)
    +
    +        session.execute("delete from ks.tbl where k = 5")
    +
    +        node1.flush()
    +        node2.start(wait_other_notice=True)
    +
    +        # expect unconfirmed inconsistencies as the partition deletes cause some sstables to be skipped
    +        with JolokiaAgent(node1) as jmx:
    +            self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl WHERE k = 5",
    +                                                     expect_unconfirmed_inconsistencies=True)
    +            self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl WHERE k = 5 AND c = 5",
    +                                                     expect_unconfirmed_inconsistencies=True)
    +            # no digest reads for range queries so blocking read repair metric isn't incremented
    +            # *all* sstables are read for partition ranges too, and as the repaired set is still in sync there should
    +            # be no inconsistencies
    +            self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl", expect_read_repair=False)
    +
    +    @since('4.0')
    +    def test_repaired_tracking_with_varying_sstable_sets(self):
    +        """
    +        verify that repaired data digests are computed over the merged data for each replica
    +        and that the particular number of sstables on each doesn't affect the comparisons.
    +        both replicas start with the same repaired set, comprising 2 sstables. node1's is
    +        then compacted and additional unrepaired data added (which overwrites some in the
    +        repaired set). We expect the repaired digests to still match as the tracking will
    +        force all sstables containing the partitions to be read.
    +        there are two variants of this, for single partition slice & names reads and range reads.
    +        @jira_ticket CASSANDRA-14145
    +        """
    +        session, node1, node2 = self.setup_for_repaired_data_tracking()
    +        stmt = SimpleStatement("INSERT INTO ks.tbl (k, c, v) VALUES (%s, %s, %s)")
    +        stmt.consistency_level = ConsistencyLevel.ALL
    +        for i in range(10):
    +            session.execute(stmt, (i, i, i))
    +
    +        for node in self.cluster.nodelist():
    +            node.flush()
    +
    +        for i in range(10,20):
    +            session.execute(stmt, (i, i, i))
    +
    +        for node in self.cluster.nodelist():
    +            node.flush()
    +            self.assertNoRepairedSSTables(node, 'ks')
    +
    +        node1.repair(options=['ks'])
    +        node2.stop(wait_other_notice=True)
    +
    +        session.execute("insert into ks.tbl (k, c, v) values (5, 5, 55)")
    +        session.execute("insert into ks.tbl (k, c, v) values (15, 15, 
155)")
    +        node1.flush()
    +        node1.compact()
    +        node1.compact()
    +        node2.start(wait_other_notice=True)
    +
    +        # we don't expect any inconsistencies as all repaired data is read on both replicas
    +        with JolokiaAgent(node1) as jmx:
    +            self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl WHERE k = 5")
    +            self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl WHERE k = 5 AND c = 5")
    +            # no digest reads for range queries so read repair metric isn't incremented
    +            self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl", expect_read_repair=False)
    +
    +    @since('4.0')
    +    def test_repaired_tracking_with_mismatching_replicas(self):
    +        """
    +        there are two variants of this, for single partition slice & names reads and range reads
    --- End diff --
    
    Sorry, that was an oversight, added a description.
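
The assertions in the diff above go through JolokiaAgent and a
query_and_check_repaired_mismatches helper defined elsewhere in
incremental_repair_test.py. Purely as a hedged illustration of the mechanism
being tested (the mbean and metric names below are assumptions, not taken from
the patch), such a check could read the per-table repaired-data inconsistency
counters over JMX along these lines:

    # Illustrative sketch only: the real helper lives elsewhere in the dtest
    # module, and the metric names used here are assumptions.
    from tools.jmxutils import JolokiaAgent, make_mbean

    def read_repaired_mismatch_counts(jmx, keyspace, table):
        """Return (confirmed, unconfirmed) repaired-data inconsistency counts."""
        confirmed = make_mbean('metrics', type='Table', keyspace=keyspace, scope=table,
                               name='RepairedDataInconsistenciesConfirmed')      # assumed name
        unconfirmed = make_mbean('metrics', type='Table', keyspace=keyspace, scope=table,
                                 name='RepairedDataInconsistenciesUnconfirmed')  # assumed name
        return (jmx.read_attribute(confirmed, 'Count'),
                jmx.read_attribute(unconfirmed, 'Count'))

A check like the ones exercised above would read such counters before and after
issuing the query and assert on the delta, alongside the blocking read repair
metric when a digest mismatch is expected.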


>  Detecting data resurrection during read
> ----------------------------------------
>
>                 Key: CASSANDRA-14145
>                 URL: https://issues.apache.org/jira/browse/CASSANDRA-14145
>             Project: Cassandra
>          Issue Type: Improvement
>            Reporter: sankalp kohli
>            Assignee: Sam Tunnicliffe
>            Priority: Minor
>              Labels: pull-request-available
>             Fix For: 4.0
>
>
> We have seen several bugs in which deleted data gets resurrected. We should 
> try to see if we can detect this on the read path and possibly fix it. Here 
> is one example which brought back data: a replica lost an sstable on startup, 
> which caused it to lose the tombstone but not the data. The tombstone was 
> past gc grace, which means the data could be resurrected. We can detect such 
> invalid states by looking at other replicas. 
> If we are running incremental repair, Cassandra keeps repaired and 
> non-repaired data separate. Every time incremental repair runs, it moves 
> data from non-repaired to repaired. Repaired data across all replicas should 
> be 100% consistent. 
> Here is an example of how we can detect and mitigate the issue in most cases. 
> Say we have 3 machines: A, B and C. Each machine has its data split between 
> repaired and non-repaired. 
> 1. Machine A, due to some bug, brings back data D. This data D is in the 
> repaired dataset. All other replicas have data D and tombstone T. 
> 2. A read for data D comes from the application, involving replicas A and B. 
> The data being read is in the repaired set. A responds to the coordinator 
> with data D, and B sends nothing as the tombstone is past gc grace. This 
> causes a digest mismatch. 
> 3. This patch only kicks in when there is a digest mismatch. The coordinator 
> asks both replicas to send back all data, as we do today, but with this 
> patch each replica also indicates which of the data it returns comes from 
> the repaired set and which from the non-repaired set. If the repaired data 
> does not match, we know something is wrong. At this point the coordinator 
> cannot determine whether replica A has resurrected some data or replica B 
> has lost some data, but we can still log an error saying we hit an invalid 
> state (see the sketch after this description). 
> 4. Besides logging, we can take this further and even correct the response 
> to the query. After logging the invalid state, we can ask replicas A and B 
> (and also C, if alive) to send back all data for this read, including gcable 
> tombstones. If any machine returns a tombstone which is newer than this 
> data, we know we cannot return the data. This way we can avoid returning 
> data which has been deleted. 
> Some challenges with this: 
> 1. When data is moved from non-repaired to repaired, there could be a race 
> here. We can look at which incremental repairs have promoted data on which 
> replica to avoid false positives.  
> 2. If the third replica is down and the live replica does not have any 
> tombstone, we won't be able to break the tie when deciding whether data was 
> actually deleted or resurrected. 
> 3. If the read is for the latest data only, we won't be able to detect it, 
> as the read will be served from non-repaired data. 
> 4. If the replica where we lose a tombstone is the last replica to compact 
> the tombstone, we won't be able to decide whether data is coming back or the 
> rest of the replicas have lost that data. But we will still detect that 
> something is wrong. 
> 5. We won't affect 99.9% of read queries, as we only do extra work on a 
> digest mismatch.
> 6. CL.ONE reads will not be able to detect this. 
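
The coordinator-side comparison described in steps 1-4 above can be sketched
with a small, self-contained example. This is not the patch's implementation;
the names and data shapes are invented for illustration. The idea: on a digest
mismatch, each replica also reports a digest over the repaired portion of what
it read, plus whether that read was conclusive (e.g. no repaired sstables were
skipped after a partition delete), and the coordinator classifies any
repaired-digest divergence as confirmed or unconfirmed.

    # Illustrative sketch only (not Cassandra's implementation); names and
    # structures are invented to mirror steps 1-4 above.
    from dataclasses import dataclass

    @dataclass
    class ReplicaResponse:
        replica: str
        repaired_digest: bytes  # digest over the repaired portion of the data read
        is_conclusive: bool     # False if e.g. repaired sstables were skipped

    def check_repaired_data(responses):
        """Compare repaired-data digests after a digest mismatch."""
        digests = {r.repaired_digest for r in responses}
        if len(digests) <= 1:
            return 'match'
        # If any replica could not read its full repaired set, the divergence
        # may be an artefact of skipped sstables rather than resurrection/loss.
        if all(r.is_conclusive for r in responses):
            return 'confirmed_mismatch'  # log: repaired sets diverge, invalid state
        return 'unconfirmed_mismatch'

    # Replica A resurrected data D in its repaired set, so its digest differs:
    responses = [ReplicaResponse('A', b'\x01', True),
                 ReplicaResponse('B', b'\x02', True)]
    print(check_repaired_data(responses))  # -> confirmed_mismatch

This mirrors what the dtests in the diff assert: a partition delete makes the
read inconclusive, so the mismatch is only recorded as unconfirmed, while
compaction or differing sstable counts alone should produce no mismatch at all,
because the digest is computed over the merged repaired data.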


