This is an automated email from the ASF dual-hosted git repository.

rnewson pushed a commit to branch epoch-validation
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 7a79b36efd696132223826ebedbb5299f286d886
Author: Robert Newson <rnew...@apache.org>
AuthorDate: Mon Jul 15 23:25:16 2024 +0100

    Validate epoch transitions
    
    We've learned that `node()` can return `nonode@nohost` in a distributed
    node while it is crashing (if net_kernel dies, say, but perhaps other causes).
    
    We've assumed this can never happen, and have seen cases where `nonode@nohost`
    has been recorded in the epoch list.
    
    Tighten the code so we get the node name from the init arguments instead as these
    are immutable.
    
    Also validate that the associated update sequence never goes down (it can stay the
    same, which happens if the file is copied from one machine to another).
---
 src/couch/src/couch_bt_engine_header.erl | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/couch/src/couch_bt_engine_header.erl b/src/couch/src/couch_bt_engine_header.erl
index e28f07723..ac59dc8b5 100644
--- a/src/couch/src/couch_bt_engine_header.erl
+++ b/src/couch/src/couch_bt_engine_header.erl
@@ -261,6 +261,7 @@ upgrade_uuid(#db_header{} = Header) ->
     end.
 
 upgrade_epochs(#db_header{} = Header) ->
+    Node = init_node_name(),
     NewEpochs =
         case Header#db_header.epochs of
             undefined ->
@@ -269,14 +270,14 @@ upgrade_epochs(#db_header{} = Header) ->
                 % was always an implicit assumption that a file was
                 % owned since eternity by the node it was on. This
                 % just codifies that assumption.
-                [{node(), 0}];
-            [{Node, _} | _] = Epochs0 when Node == node() ->
+                [{Node, 0}];
+            [{Node, S} | _] = Epochs0 when Header#db_header.update_seq >= S ->
                 % Current node is the current owner of this db
                 Epochs0;
-            Epochs1 ->
+            [{_OtherNode, S} | _] = Epochs1 when Header#db_header.update_seq >= S ->
                 % This node is taking over ownership of this db
                 % and marking the update sequence where it happened.
-                [{node(), Header#db_header.update_seq} | Epochs1]
+                [{Node, Header#db_header.update_seq} | Epochs1]
         end,
     % Its possible for a node to open a db and claim
     % ownership but never make a write to the db. This
@@ -285,6 +286,20 @@ upgrade_epochs(#db_header{} = Header) ->
     DedupedEpochs = remove_dup_epochs(NewEpochs),
     Header#db_header{epochs = DedupedEpochs}.
 
+%% get the node name from the start up configuration as node()
+%% can return nonode@nohost if net_kernel crashes.
+init_node_name() ->
+    init_node_name(init:get_arguments()).
+
+init_node_name([]) ->
+    nonode@nohost;
+init_node_name([{name, [Name]} | _Rest]) ->
+    list_to_existing_atom(Name);
+init_node_name([{sname, [Name]} | _Rest]) ->
+    list_to_existing_atom(Name);
+init_node_name([_ | Rest]) ->
+    init_node_name(Rest).
+
 % This is slightly relying on the udpate_seq's being sorted
 % in epochs due to how we only ever push things onto the
 % front. Although if we ever had a case where the update_seq

Reply via email to