This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch avoid-creating-invalid-view-purge-checkpoints
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit a2223b57bb800c3feff9fe9c386311d0a3b5dbb9
Author: Nick Vatamaniuc <[email protected]>
AuthorDate: Sat Mar 7 00:21:00 2026 -0500

    Avoid creating purge checkpoints for invalid views
    
    Previously, if we successfully parsed a design document into an #mrst{} 
record,
    we would create a purge checkpoint even if the view group didn't have any
    views, or used an invalid language. Normally, that would just be untidy, 
having
    extra unused checkpoints around, however, when we actually start purging, 
these
    checkpoints won't be advancing. So, after a while they will stall the 
compactor
    from compacting the purge sequences and advancing the minimum purge sequence
    forward.
    
    In other words we need to differentiate between parsable #mrst{} view groups
    and buildable/viable ones. Ken (our background index builder) ken was 
already
    performing the same check, so simply move it to the proc manager and the the
    indexing utility modules.
    
    In the proc manager, noticed that the repeated OS environment fetching and
    parsing was kind of time consuming (100s of microseonds) so used a persitent
    term.
---
 src/couch/src/couch_proc_manager.erl        | 32 +++++++++-
 src/couch_index/src/couch_index_util.erl    |  1 +
 src/couch_mrview/src/couch_mrview_index.erl | 17 +++--
 src/couch_mrview/src/couch_mrview_util.erl  | 35 +++++++++--
 src/fabric/test/eunit/fabric_tests.erl      | 96 ++++++++++++++++++++++++++++-
 src/ken/src/ken_server.erl                  | 23 +------
 6 files changed, 173 insertions(+), 31 deletions(-)

diff --git a/src/couch/src/couch_proc_manager.erl 
b/src/couch/src/couch_proc_manager.erl
index aa538e23e..e8813f93a 100644
--- a/src/couch/src/couch_proc_manager.erl
+++ b/src/couch/src/couch_proc_manager.erl
@@ -25,8 +25,7 @@
     new_proc/1,
     reload/0,
     terminate_stale_procs/0,
-    get_servers_from_env/1,
-    native_query_server_enabled/0
+    allowed_languages/0
 ]).
 
 -export([
@@ -503,6 +502,18 @@ split_by_char(String, Char) ->
     {Key, Value}.
 
 get_servers_from_env(Spec) ->
+    case persistent_term:get({?MODULE, Spec}, undefined) of
+        undefined ->
+            % Save some time since os:getenv() can potentially be very large
+            % and we don't want to reparse it all every time.
+            Val = get_servers_from_env_raw(Spec),
+            persistent_term:put({?MODULE, Spec}, Val),
+            Val;
+        Val ->
+            Val
+    end.
+
+get_servers_from_env_raw(Spec) ->
     SpecLen = length(Spec),
     % loop over os:getenv(), match SPEC_
     lists:filtermap(
@@ -517,6 +528,23 @@ get_servers_from_env(Spec) ->
         os:getenv()
     ).
 
+% Get the list of all the languages we know how handle. Some are built-in
+% and some are configurable via config settings or env vars
+%
+allowed_languages() ->
+    % These are always available
+    BuiltIn = [<<"javascript">>, <<"javascript_quickjs">>, <<"query">>],
+    Config =
+        get_servers_from_env("COUCHDB_QUERY_SERVER_") ++
+            get_servers_from_env("COUCHDB_NATIVE_QUERY_SERVER_"),
+    Allowed0 = [list_to_binary(string:to_lower(Lang)) || {Lang, _Cmd} <- 
Config],
+    Allowed =
+        case native_query_server_enabled() of
+            true -> [<<"erlang">> | Allowed0];
+            _Else -> Allowed0
+        end,
+    lists:usort(BuiltIn ++ Allowed).
+
 configure_language_servers() ->
     ets:delete_all_objects(?SERVERS),
     % NOTE: process environment variables cannot be altered except by
diff --git a/src/couch_index/src/couch_index_util.erl 
b/src/couch_index/src/couch_index_util.erl
index 9a16d06d6..5812926af 100644
--- a/src/couch_index/src/couch_index_util.erl
+++ b/src/couch_index/src/couch_index_util.erl
@@ -15,6 +15,7 @@
 -export([root_dir/0, index_dir/2, index_file/3]).
 -export([load_doc/3, sort_lib/1, hexsig/1]).
 -export([get_purge_checkpoints/2, cleanup_purges/3]).
+-export([delete_checkpoint/2]).
 
 -include_lib("couch/include/couch_db.hrl").
 
diff --git a/src/couch_mrview/src/couch_mrview_index.erl 
b/src/couch_mrview/src/couch_mrview_index.erl
index 164227b49..f0a649b77 100644
--- a/src/couch_mrview/src/couch_mrview_index.erl
+++ b/src/couch_mrview/src/couch_mrview_index.erl
@@ -292,13 +292,22 @@ ensure_local_purge_docs(DbName, DDocs) ->
     end).
 
 ensure_local_purge_doc(Db, #mrst{} = State) ->
+    IsValid = couch_mrview_util:mrst_has_valid_views(State),
     Sig = couch_index_util:hexsig(get(signature, State)),
     DocId = couch_mrview_util:get_local_purge_doc_id(Sig),
-    case couch_db:open_doc(Db, DocId, []) of
-        {not_found, _} ->
+    case {IsValid, couch_db:open_doc(Db, DocId, [])} of
+        {true, {not_found, _}} ->
+            % Is valid and doc doesn't exist => create it
             create_local_purge_doc(Db, State);
-        {ok, _} ->
-            ok
+        {false, {not_found, _}} ->
+            % Invalid and doc doesn't exist => do nothing
+            ok;
+        {true, {ok, _}} ->
+            % Is valid and doc exists already => do nothing
+            ok;
+        {false, {ok, _}} ->
+            % Invalid and doc exists => cleanup
+            couch_index_util:delete_checkpoint(Db, DocId)
     end.
 
 create_local_purge_doc(Db, State) ->
diff --git a/src/couch_mrview/src/couch_mrview_util.erl 
b/src/couch_mrview/src/couch_mrview_util.erl
index 494f5b3ec..48138bf76 100644
--- a/src/couch_mrview/src/couch_mrview_util.erl
+++ b/src/couch_mrview/src/couch_mrview_util.erl
@@ -18,6 +18,7 @@
 -export([get_signatures/1, get_purge_checkpoints/1, get_index_files/1]).
 -export([get_signatures_from_ddocs/2]).
 -export([ddoc_to_mrst/2, init_state/4, reset_index/3]).
+-export([mrst_has_valid_views/1]).
 -export([make_header/1]).
 -export([index_file/2, compaction_file/2, open_file/1]).
 -export([delete_files/2, delete_index_file/2, delete_compaction_file/2]).
@@ -112,12 +113,23 @@ get_signatures(Db) ->
     get_signatures_from_ddocs(DbName, DDocs1).
 
 % From a list of design #doc{} records returns signatures map: #{Sig => true}
-%
+% This will be valid signatures of views we expect to run and build on this
+% node.
 get_signatures_from_ddocs(DbName, DDocs) when is_list(DDocs) ->
     FoldFun = fun(#doc{} = Doc, Acc) ->
-        {ok, Mrst} = ddoc_to_mrst(DbName, Doc),
-        Sig = couch_util:to_hex_bin(Mrst#mrst.sig),
-        Acc#{Sig => true}
+        try ddoc_to_mrst(DbName, Doc) of
+            {ok, Mrst} ->
+                case couch_mrview_util:mrst_has_valid_views(Mrst) of
+                    true ->
+                        Sig = couch_util:to_hex_bin(Mrst#mrst.sig),
+                        Acc#{Sig => true};
+                    false ->
+                        Acc
+                end
+        catch
+            _:_ ->
+                Acc
+        end
     end,
     lists:foldl(FoldFun, #{}, DDocs).
 
@@ -346,6 +358,21 @@ view_sig_term(BaseSig, UpdateSeq, PurgeSeq) ->
 view_sig_term(BaseSig, UpdateSeq, PurgeSeq, Args) ->
     {BaseSig, UpdateSeq, PurgeSeq, Args}.
 
+% We can construct an MRSt from design document which may have languages this
+% server doesn't support, or may have no views. That view group won't build on
+% this server, and we can use this utility function to filter it out. Ken and
+% purge checkpoint cleanups use this function.
+%
+mrst_has_valid_views(MRSt) ->
+    case couch_mrview_index:get(views, MRSt) of
+        [_ | _] ->
+            Language = couch_mrview_index:get(language, MRSt),
+            AllowedLanguages = couch_proc_manager:allowed_languages(),
+            lists:member(Language, AllowedLanguages);
+        [] ->
+            false
+    end.
+
 init_state(Db, Fd, #mrst{views = Views} = State, nil) ->
     PurgeSeq = couch_db:get_purge_seq(Db),
     Header = #mrheader{
diff --git a/src/fabric/test/eunit/fabric_tests.erl 
b/src/fabric/test/eunit/fabric_tests.erl
index c092f4473..daf018231 100644
--- a/src/fabric/test/eunit/fabric_tests.erl
+++ b/src/fabric/test/eunit/fabric_tests.erl
@@ -27,7 +27,9 @@ cleanup_index_files_test_() ->
             ?TDEF_FE(t_cleanup_index_files_with_view_data),
             ?TDEF_FE(t_cleanup_index_files_with_deleted_db),
             ?TDEF_FE(t_cleanup_index_file_after_ddoc_update),
-            ?TDEF_FE(t_cleanup_index_file_after_ddoc_delete)
+            ?TDEF_FE(t_cleanup_index_file_after_ddoc_delete),
+            ?TDEF_FE(t_cleanup_empty_view_checkpoints),
+            ?TDEF_FE(t_cleanup_disallowed_language_checkpoints)
         ]
     }.
 
@@ -151,6 +153,76 @@ t_cleanup_index_file_after_ddoc_delete({_, DbName}) ->
     ?assertEqual([], indices(DbName)),
     ?assertEqual([], purges(DbName)).
 
+t_cleanup_empty_view_checkpoints({_, DbName}) ->
+    ?assertEqual(
+        [
+            "4bcdf852098ff6b0578ddf472c320e9c.view",
+            "da817c3d3f7413c1a610f25635a0c521.view"
+        ],
+        indices(DbName)
+    ),
+    ?assertEqual(
+        [
+            <<"_local/purge-mrview-4bcdf852098ff6b0578ddf472c320e9c">>,
+            <<"_local/purge-mrview-da817c3d3f7413c1a610f25635a0c521">>
+        ],
+        purges(DbName)
+    ),
+
+    update_empty_ddoc(DbName, <<"_design/foo">>),
+    {ok, _} = fabric:get_view_group_info(DbName, <<"foo">>),
+    ok = fabric:cleanup_index_files_all_nodes(DbName),
+
+    % One 4bc stays, da8 should gone. If it weren't for the check to
+    % empty views it would have used signature 
"3e823c2a4383ac0c18d4e574135a5b08"
+    ?assertEqual(
+        [
+            "4bcdf852098ff6b0578ddf472c320e9c.view"
+        ],
+        indices(DbName)
+    ),
+    ?assertEqual(
+        [
+            <<"_local/purge-mrview-4bcdf852098ff6b0578ddf472c320e9c">>
+        ],
+        purges(DbName)
+    ).
+
+t_cleanup_disallowed_language_checkpoints({_, DbName}) ->
+    ?assertEqual(
+        [
+            "4bcdf852098ff6b0578ddf472c320e9c.view",
+            "da817c3d3f7413c1a610f25635a0c521.view"
+        ],
+        indices(DbName)
+    ),
+    ?assertEqual(
+        [
+            <<"_local/purge-mrview-4bcdf852098ff6b0578ddf472c320e9c">>,
+            <<"_local/purge-mrview-da817c3d3f7413c1a610f25635a0c521">>
+        ],
+        purges(DbName)
+    ),
+
+    update_invalid_language(DbName, <<"_design/foo">>, <<"bar">>),
+    {ok, _} = fabric:get_view_group_info(DbName, <<"foo">>),
+    ok = fabric:cleanup_index_files_all_nodes(DbName),
+
+    % One 4bc stays, the new view uses an invalid language which can't index 
with
+    % so we don't create a checkpoint for it
+    ?assertEqual(
+        [
+            "4bcdf852098ff6b0578ddf472c320e9c.view"
+        ],
+        indices(DbName)
+    ),
+    ?assertEqual(
+        [
+            <<"_local/purge-mrview-4bcdf852098ff6b0578ddf472c320e9c">>
+        ],
+        purges(DbName)
+    ).
+
 shard_names(DbName) ->
     [mem3:name(S) || S <- mem3:local_shards(DbName)].
 
@@ -239,6 +311,28 @@ update_ddoc(DbName, DDocId, ViewName) ->
     },
     fabric:update_doc(DbName, DDoc, [?ADMIN_CTX]).
 
+update_empty_ddoc(DbName, DDocId) ->
+    {ok, DDoc0} = fabric:open_doc(DbName, DDocId, [?ADMIN_CTX]),
+    DDoc = DDoc0#doc{body = {[]}},
+    fabric:update_doc(DbName, DDoc, [?ADMIN_CTX]).
+
+update_invalid_language(DbName, DDocId, ViewName) ->
+    {ok, DDoc0} = fabric:open_doc(DbName, DDocId, [?ADMIN_CTX]),
+    DDoc = DDoc0#doc{
+        body =
+            {[
+                {<<"language">>, <<"cobol">>},
+                {<<"views">>,
+                    {[
+                        {ViewName,
+                            {[
+                                {<<"map">>, <<"MAIN-PROCEDURE. PERFORM MAP">>}
+                            ]}}
+                    ]}}
+            ]}
+    },
+    fabric:update_doc(DbName, DDoc, [?ADMIN_CTX]).
+
 delete_ddoc(DbName, DDocId) ->
     {ok, DDoc0} = fabric:open_doc(DbName, DDocId, [?ADMIN_CTX]),
     DDoc = DDoc0#doc{deleted = true, body = {[]}},
diff --git a/src/ken/src/ken_server.erl b/src/ken/src/ken_server.erl
index c51ae4304..26c611eae 100644
--- a/src/ken/src/ken_server.erl
+++ b/src/ken/src/ken_server.erl
@@ -361,15 +361,12 @@ should_update(#doc{body = {Props}}, IndexType) ->
     end.
 
 update_ddoc_views(Name, MRSt, Seq, State) ->
-    Language = couch_mrview_index:get(language, MRSt),
-    Allowed = lists:member(Language, allowed_languages()),
-    Views = couch_mrview_index:get(views, MRSt),
-    if
-        Allowed andalso Views =/= [] ->
+    case couch_mrview_util:mrst_has_valid_views(MRSt) of
+        true ->
             {ok, Pid} = couch_index_server:get_index(couch_mrview_index, MRSt),
             GroupName = couch_mrview_index:get(idx_name, MRSt),
             maybe_start_job({Name, GroupName}, Pid, Seq, State);
-        true ->
+        false ->
             ok
     end.
 
@@ -531,20 +528,6 @@ prune_worker_table(State) ->
     ets:select_delete(ken_workers, [{MatchHead, [Guard], [true]}]),
     State#state{pruned_last = erlang:monotonic_time()}.
 
-allowed_languages() ->
-    % These are always available
-    BuiltIn = [<<"javascript">>, <<"javascript_quickjs">>, <<"query">>],
-    Config =
-        couch_proc_manager:get_servers_from_env("COUCHDB_QUERY_SERVER_") ++
-            
couch_proc_manager:get_servers_from_env("COUCHDB_NATIVE_QUERY_SERVER_"),
-    Allowed0 = [list_to_binary(string:to_lower(Lang)) || {Lang, _Cmd} <- 
Config],
-    Allowed =
-        case couch_proc_manager:native_query_server_enabled() of
-            true -> [<<"erlang">> | Allowed0];
-            _Else -> Allowed0
-        end,
-    lists:usort(BuiltIn ++ Allowed).
-
 config(Key, Default) ->
     config:get("ken", Key, Default).
 

Reply via email to