(doris) branch branch-4.1 updated: branch-4.1: [feature](statistics) Skip collecting stats for long string columns #62686 (#63303)

yiguolei Wed, 20 May 2026 00:17:07 -0700

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/branch-4.1 by this push:
     new 44c6c26186b branch-4.1: [feature](statistics) Skip collecting stats 
for long string columns #62686 (#63303)
44c6c26186b is described below

commit 44c6c26186be7c2372502a23d56577676a05bb4e
Author: yujun <[email protected]>
AuthorDate: Wed May 20 15:16:45 2026 +0800

    branch-4.1: [feature](statistics) Skip collecting stats for long string 
columns #62686 (#63303)
    
    cherry-pick: #62686
    
    Co-authored-by: Copilot <[email protected]>
---
 .../main/java/org/apache/doris/common/Config.java  |  17 ++
 .../doris/catalog/BuiltinScalarFunctions.java      |   2 +
 .../org/apache/doris/statistics/AnalysisJob.java   |  15 +-
 .../apache/doris/statistics/AnalysisManager.java   |  27 +-
 .../doris/statistics/AnalyzeSkipException.java     |  39 +++
 .../apache/doris/statistics/BaseAnalysisTask.java  | 117 ++++++++-
 .../doris/statistics/ExternalAnalysisTask.java     |   1 +
 .../apache/doris/statistics/OlapAnalysisTask.java  |   5 +
 .../doris/statistics/AnalysisManagerTest.java      |  96 +++++++
 .../doris/statistics/HMSAnalysisTaskTest.java      |   2 +-
 .../doris/statistics/OlapAnalysisTaskTest.java     | 122 +++++++++
 .../hive/test_hive_analyze_long_string.groovy      | 125 +++++++++
 .../suites/statistics/analyze_stats.groovy         |   2 +-
 .../statistics/test_analyze_long_string.groovy     | 288 +++++++++++++++++++++
 14 files changed, 846 insertions(+), 12 deletions(-)

diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java 
b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
index 3ab0ce448fd..f7be84dcbcc 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
@@ -2767,6 +2767,23 @@ public class Config extends ConfigBase {
     @ConfField
     public static int auto_analyze_simultaneously_running_task_num = 1;
 
+    @ConfField(mutable = true, masterOnly = true, description = {
+            "统计信息收集时 string 列允许的最大字节长度。若列中存在长度超过该值的行，"
+                    + "该列的统计信息将被跳过收集（task 仍标记为 FINISHED，在 SHOW ANALYZE 
中显示跳过原因）。"
+                    + "≤ 0 表示关闭此保护。默认 1024 (1KB)。"
+                    + "注意：此保护只覆盖 FULL / LINEAR / DUJ1 统计收集路径（即 analyze 全表和 
sample 的主 SQL）。"
+                    + "当 enable_partition_analyze=true 时的 per-partition 
路径（PARTITION_ANALYZE_TEMPLATE）"
+                    + "出于正确性考虑不启用该保护，详见 BaseAnalysisTask 中的 NOTE。",
+            "Max byte length allowed for a string column when collecting 
statistics. "
+                    + "If any row in a string column is longer than this 
value, the column's stats "
+                    + "collection is skipped (the task is still marked 
FINISHED, with the skip reason "
+                    + "shown in SHOW ANALYZE). A value <= 0 disables this 
protection. Default: 1024 (1KB). "
+                    + "Note: this protection applies to the FULL / LINEAR / 
DUJ1 collection paths "
+                    + "(i.e. the main SQL used by full-table and sample 
analyze). The per-partition path "
+                    + "(PARTITION_ANALYZE_TEMPLATE, used when 
enable_partition_analyze=true) is intentionally "
+                    + "not guarded for correctness reasons; see the NOTE in 
BaseAnalysisTask."})
+    public static long statistics_max_string_column_length = 1024;
+
     @Deprecated
     @ConfField
     public static final int period_analyze_simultaneously_running_task_num = 1;
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
index 04ccbd2043e..77f854933a3 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
@@ -89,6 +89,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.ArraysOverlap
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Ascii;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Asin;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Asinh;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.AssertTrue;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Atan;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Atan2;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Atanh;
@@ -656,6 +657,7 @@ public class BuiltinScalarFunctions implements 
FunctionHelper {
             scalar(Ascii.class, "ascii"),
             scalar(Asin.class, "asin"),
             scalar(Asinh.class, "asinh"),
+            scalar(AssertTrue.class, "assert_true"),
             scalar(Atan.class, "atan"),
             scalar(Atanh.class, "atanh"),
             scalar(Atan2.class, "atan2"),
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisJob.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisJob.java
index bc43f80cbc2..8d0f3b892b9 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisJob.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisJob.java
@@ -108,7 +108,20 @@ public class AnalysisJob {
                 killed = true;
             case FINISHED:
                 for (BaseAnalysisTask task : queryFinished) {
-                    analysisManager.updateTaskStatus(task.info, state, msg, 
time);
+                    // When flushBuffer passes an empty msg, fall back to the
+                    // task's own info.message. This propagates a skip reason
+                    // previously stashed by BaseAnalysisTask.handleSkip into
+                    // the single FINISHED update for this task, so job.message
+                    // accumulation in AnalysisManager sees it and SHOW ANALYZE
+                    // surfaces the skip reason at the job level.
+                    String taskMsg = msg;
+                    if ((taskMsg == null || taskMsg.isEmpty()) && task.info != 
null) {
+                        taskMsg = task.info.message;
+                    }
+                    if (taskMsg == null) {
+                        taskMsg = "";
+                    }
+                    analysisManager.updateTaskStatus(task.info, state, 
taskMsg, time);
                 }
             default:
                 // DO NOTHING
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java
index b174261d2ac..f6ac836a42d 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java
@@ -235,6 +235,18 @@ public class AnalysisManager implements Writable {
             syncExecute(analysisTaskInfos.values());
             jobInfo.state = AnalysisState.FINISHED;
             updateTableStats(jobInfo);
+            // Sync analyze never populates analysisJobIdToTaskMap, so 
updateTaskStatus
+            // skip-message accumulation does not fire for it. Surface any 
per-task skip
+            // reasons (e.g. long-string column skip) as an OK-packet info 
message so
+            // the user still sees why a column was dropped from collection.
+            List<String> skipMessages = analysisTaskInfos.values().stream()
+                    .map(t -> t.info == null ? null : t.info.message)
+                    .filter(m -> m != null && !m.isEmpty())
+                    .collect(Collectors.toList());
+            if (!skipMessages.isEmpty() && ConnectContext.get() != null) {
+                ConnectContext.get().getState().setOk(0, skipMessages.size(),
+                        String.join(" ", skipMessages));
+            }
             return null;
         }
         recordAnalysisJob(jobInfo);
@@ -471,7 +483,12 @@ public class AnalysisManager implements Writable {
             return;
         }
         info.state = taskState;
-        info.message = message;
+        // Preserve the existing info.message when flushBuffer calls 
updateTaskState(FINISHED, "")
+        // for already-finished tasks, so that a previously-set skip message 
(from
+        // BaseAnalysisTask.handleSkip) is not wiped by the subsequent batch 
FINISHED update.
+        if (!(taskState.equals(AnalysisState.FINISHED) && 
StringUtils.isEmpty(message))) {
+            info.message = message;
+        }
         // Update the task cost time when task finished or failed. And only 
log the final state.
         if (taskState.equals(AnalysisState.FINISHED) || 
taskState.equals(AnalysisState.FAILED)) {
             info.timeCostInMs = time - info.lastExecTimeInMs;
@@ -496,6 +513,14 @@ public class AnalysisManager implements Writable {
                 String errMessage = String.format("%s:[%s] ", info.colName, 
message);
                 job.message = job.message == null ? errMessage : job.message + 
errMessage;
             }
+            // Accumulate a non-empty FINISHED message (e.g. long-string skip 
reason) into
+            // job.message so it is visible in SHOW ANALYZE at job level. 
Guard on the
+            // incoming message being non-empty to avoid double-counting when 
flushBuffer
+            // later calls updateTaskState(FINISHED, "") for the same 
already-skipped task.
+            if (taskState.equals(AnalysisState.FINISHED) && 
!StringUtils.isEmpty(message)) {
+                String skipMessage = String.format("%s:[%s] ", info.colName, 
message);
+                job.message = job.message == null ? skipMessage : job.message 
+ skipMessage;
+            }
             // Set the job state to RUNNING when its first task becomes 
RUNNING.
             if (info.state.equals(AnalysisState.RUNNING) && 
job.state.equals(AnalysisState.PENDING)) {
                 job.state = AnalysisState.RUNNING;
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalyzeSkipException.java
 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalyzeSkipException.java
new file mode 100644
index 00000000000..25f088e5aa4
--- /dev/null
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalyzeSkipException.java
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.statistics;
+
+/**
+ * Control-flow signal thrown by an analysis task when it decides to skip
+ * statistics collection for a specific column (e.g. a string column contains
+ * at least one row whose byte length exceeds
+ * {@code Config.statistics_max_string_column_length}).
+ *
+ * This is NOT an error. The task that catches this exception should mark
+ * itself as FINISHED (not FAILED) and surface the skip reason via
+ * {@code info.message} / {@code SHOW ANALYZE}.
+ */
+public class AnalyzeSkipException extends RuntimeException {
+
+    public AnalyzeSkipException(String message) {
+        super(message);
+    }
+
+    public AnalyzeSkipException(String message, Throwable cause) {
+        super(message, cause);
+    }
+}
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
index 71bd7feeea6..8ace3876b5f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -65,6 +65,14 @@ public abstract class BaseAnalysisTask {
     public static final long LIMIT_SIZE = 1024 * 1024 * 1024; // 1GB
     public static final double LIMIT_FACTOR = 1.2;
 
+    /**
+     * Marker string embedded in {@code assert_true} inside statistics 
collection SQL.
+     * When any row's string column length exceeds the configured limit, BE 
throws an
+     * error whose message contains this marker; FE detects it and converts 
the task
+     * result to a skip signal ({@link AnalyzeSkipException}) rather than a 
failure.
+     */
+    public static final String ANALYZE_SKIP_LONG_STRING_COLUMN_MARKER = 
"ANALYZE_SKIP_LONG_STRING_COLUMN";
+
     protected static final String FULL_ANALYZE_TEMPLATE =
             "SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS `id`, "
             +     "${catalogId} AS `catalog_id`, "
@@ -81,10 +89,11 @@ public abstract class BaseAnalysisTask {
             +     "${dataSizeFunction} AS `data_size`, "
             +     "NOW() AS `update_time`, "
             +     "null as `hot_value` "
-            + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index}";
+            + "FROM (SELECT `${colName}`${lengthAssert} "
+            +     "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index}) 
__lc_t";
 
     protected static final String LINEAR_ANALYZE_TEMPLATE = "WITH cte1 AS ("
-            +     "SELECT `${colName}` "
+            +     "SELECT `${colName}`${lengthAssert} "
             +     "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} 
${sampleHints} ${limit} ${preAggHint}), "
             + "cte2 AS ("
             +     "SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS 
`id`, "
@@ -120,7 +129,7 @@ public abstract class BaseAnalysisTask {
             +     "(SELECT "
             +     "${subStringColName} AS `hash_value`, "
             +     "`${colName}` AS `col_value`, "
-            +     "LENGTH(`${colName}`) as `len` "
+            +     "LENGTH(`${colName}`) as `len`${lengthAssert} "
             +     "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} 
${sampleHints} ${limit}) as `t0` "
             +     "${preAggHint} GROUP BY `t0`.`hash_value`), "
             + "cte2 AS ( "
@@ -166,6 +175,14 @@ public abstract class BaseAnalysisTask {
             + "${data_size} AS `data_size`, "
             + "NOW() ";
 
+    // NOTE: PARTITION_ANALYZE_TEMPLATE intentionally does NOT apply the 
long-string
+    // skip guard (statistics_max_string_column_length). Partition-granularity 
analyze
+    // commits per-batch INSERTs into __partition_stats incrementally. 
Aborting mid-loop
+    // via assert_true would leave a mix of fresh and stale rows in 
__partition_stats
+    // that is non-trivial to roll back. Since partition-level statistics are 
seldom
+    // relied upon today, we accept that long-string columns are NOT protected 
on this
+    // path. Only the full / sample OLAP paths and the external-table path 
enforce the
+    // per-row byte-length ceiling.
     protected static final String PARTITION_ANALYZE_TEMPLATE = " SELECT "
             + "${catalogId} AS `catalog_id`, "
             + "${dbId} AS `db_id`, "
@@ -181,7 +198,7 @@ public abstract class BaseAnalysisTask {
             + "SUBSTRING(CAST(MAX(`${colName}`) AS STRING), 1, 1024) AS `max`, 
"
             + "${dataSizeFunction} AS `data_size`, "
             + "NOW() AS `update_time` "
-            + " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} 
${partitionInfo}";
+            + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} 
${partitionInfo}";
 
     protected static final String MERGE_PARTITION_TEMPLATE =
             "SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS `id`, "
@@ -254,8 +271,66 @@ public abstract class BaseAnalysisTask {
 
     public void execute() throws Exception {
         prepareExecution();
-        doExecute();
-        afterExecution();
+        try {
+            doExecute();
+        } catch (AnalyzeSkipException e) {
+            handleSkip(e);
+        } catch (Exception e) {
+            if (containsSkipMarker(e)) {
+                handleSkip(new AnalyzeSkipException(buildSkipMessage(), e));
+                return;
+            }
+            throw e;
+        }
+    }
+
+    /**
+     * Walk the cause chain and inspect every Throwable's message for the
+     * long-string skip marker. More robust than only checking the root cause,
+     * since some execution paths wrap the BE error in an outer exception that
+     * reformats the message without preserving the original cause.
+     */
+    protected static boolean containsSkipMarker(Throwable e) {
+        Throwable cur = e;
+        while (cur != null) {
+            String m = cur.getMessage();
+            if (m != null && 
m.contains(ANALYZE_SKIP_LONG_STRING_COLUMN_MARKER)) {
+                return true;
+            }
+            cur = cur.getCause();
+        }
+        return false;
+    }
+
+    /**
+     * Mark this task as FINISHED with a skip message. Called when the task
+     * detects that the column should be skipped (e.g. long string column).
+     */
+    private void handleSkip(AnalyzeSkipException e) {
+        String skipMsg = e.getMessage();
+        LOG.info("Analyze task skip column [{}] in table [{}]. Reason: {}",
+                info.colName, tbl == null ? "?" : tbl.getName(), skipMsg);
+        // Stash the skip message on info.message. The job-level flushBuffer 
path
+        // (AnalysisJob.updateTaskState -> AnalysisManager.updateTaskStatus) 
will
+        // pick it up as the single FINISHED transition for this task, so we
+        // avoid a redundant updateTaskStatus call here. Doing the state update
+        // twice used to overwrite AnalysisInfo.timeCostInMs with the near-zero
+        // delta between the two FINISHED calls.
+        info.message = skipMsg;
+        // Route through taskDoneWithoutData: adds this task to queryFinished
+        // and triggers flushBuffer, which will call updateTaskState(FINISHED,
+        // "") exactly once per task. AnalysisJob.updateTaskState substitutes
+        // the task's own info.message when the outer msg is empty, so skipMsg
+        // reaches job.message for SHOW ANALYZE visibility.
+        job.taskDoneWithoutData(this);
+    }
+
+    private String buildSkipMessage() {
+        return String.format(
+                "Column [%s] has row(s) whose byte length exceeds %d 
(Config.statistics_max_string_column_length), "
+                        + "skip collecting statistics for this column.",
+                info == null ? "?" : info.colName,
+                
org.apache.doris.common.Config.statistics_max_string_column_length);
     }
 
     protected void prepareExecution() {
@@ -266,8 +341,6 @@ public abstract class BaseAnalysisTask {
 
     protected abstract void doSample() throws Exception;
 
-    protected void afterExecution() {}
-
     protected void setTaskStateToRunning() {
         Env.getCurrentEnv().getAnalysisManager()
                 .updateTaskStatus(info, AnalysisState.RUNNING, "", 
System.currentTimeMillis());
@@ -498,6 +571,31 @@ public abstract class BaseAnalysisTask {
         return Maps.newHashMap();
     }
 
+    /**
+     * Populate the {@code ${lengthAssert}} placeholder into SQL params map.
+     * For string columns with config > 0, the placeholder expands into a 
per-row
+     * {@code , assert_true(col IS NULL OR LENGTH(col) <= N, 'marker') AS __lc}
+     * clause that gets inserted into the inner-most SELECT list of statistics
+     * collection SQL. For non-string columns or when config <= 0, the 
placeholder
+     * is an empty string so the SQL stays unchanged.
+     *
+     * Note: the {@code IS NULL OR} guard is required because Doris's
+     * {@code assert_true} BE function throws on NULL inputs.
+     */
+    protected void addLengthAssertParam(Map<String, String> params) {
+        long maxLen = 
org.apache.doris.common.Config.statistics_max_string_column_length;
+        if (col != null && col.getType().isStringType() && maxLen > 0) {
+            String escapedColName = 
StatisticsUtil.escapeColumnName(String.valueOf(info.colName));
+            // The StringSubstitutor used by callers already has ${colName} 
populated,
+            // so we inline the escaped column name directly here.
+            params.put("lengthAssert",
+                    ", assert_true(`" + escapedColName + "` IS NULL OR 
LENGTH(`" + escapedColName + "`) <= "
+                            + maxLen + ", '" + 
ANALYZE_SKIP_LONG_STRING_COLUMN_MARKER + "') AS `__lc`");
+        } else {
+            params.put("lengthAssert", "");
+        }
+    }
+
     protected String castToNumeric(String colName) {
         Type type = col.getType();
         if (type.isNumericType()) {
@@ -535,6 +633,9 @@ public abstract class BaseAnalysisTask {
             job.appendBuf(this, Collections.singletonList(colStatsData));
         } catch (Exception e) {
             LOG.warn("Failed to execute sql {}", sql);
+            if (containsSkipMarker(e)) {
+                throw new AnalyzeSkipException(buildSkipMessage(), e);
+            }
             throw e;
         } finally {
             if (LOG.isDebugEnabled()) {
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
index 72beb343956..b9a2525ac6f 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
@@ -94,6 +94,7 @@ public class ExternalAnalysisTask extends BaseAnalysisTask {
             params.put("type", col.getType().toString());
         }
         params.put("lastAnalyzeTimeInMs", 
String.valueOf(System.currentTimeMillis()));
+        addLengthAssertParam(params);
         return params;
     }
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
index 187eff6d40a..bd0c5d94f12 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
@@ -32,6 +32,7 @@ import org.apache.doris.common.Config;
 import org.apache.doris.common.DdlException;
 import org.apache.doris.common.FeConstants;
 import org.apache.doris.common.Pair;
+import org.apache.doris.common.util.DebugPointUtil;
 import org.apache.doris.common.util.DebugUtil;
 import org.apache.doris.qe.AutoCloseConnectContext;
 import org.apache.doris.qe.SessionVariable;
@@ -398,6 +399,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
         params.put("tblName", String.valueOf(tbl.getName()));
         params.put("index", getIndex());
         params.put("preAggHint", "");
+        addLengthAssertParam(params);
         return params;
     }
 
@@ -524,6 +526,9 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
      * @return True for single unique key column and single distribution 
column.
      */
     protected boolean useLinearAnalyzeTemplate() {
+        if (DebugPointUtil.isEnable("OlapAnalysisTask.useDUJ1Template")) {
+            return false;
+        }
         if (partitionColumnSampleTooManyRows || scanFullTable) {
             return true;
         }
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java
index 07a50ade181..11db1e5861d 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java
@@ -42,6 +42,7 @@ import mockit.MockUp;
 import mockit.Mocked;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
 
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -108,6 +109,101 @@ public class AnalysisManagerTest {
         Assertions.assertEquals(job.state, AnalysisState.FINISHED);
     }
 
+    @Test
+    public void testUpdateTaskStatusPreservesSkipMessage() {
+        // Verify that a subsequent updateTaskStatus(FINISHED, "") call (e.g. 
from
+        // flushBuffer) does NOT wipe a previously-set skip message on 
info.message,
+        // and that job.message only accumulates the skip reason once.
+        BaseAnalysisTask task1 = Mockito.mock(BaseAnalysisTask.class);
+
+        AnalysisManager manager = Mockito.spy(new AnalysisManager());
+        Mockito.doNothing().when(manager).logCreateAnalysisTask(Mockito.any());
+        Mockito.doNothing().when(manager).logCreateAnalysisJob(Mockito.any());
+        Mockito.doNothing().when(manager).updateTableStats(Mockito.any());
+
+        AnalysisInfo job = new AnalysisInfoBuilder().setJobId(10)
+                
.setState(AnalysisState.PENDING).setAnalysisType(AnalysisType.FUNDAMENTALS)
+                .setJobType(AnalysisInfo.JobType.MANUAL).build();
+        AnalysisInfo taskInfo = new AnalysisInfoBuilder().setJobId(10)
+                
.setTaskId(11).setJobType(JobType.MANUAL).setAnalysisType(AnalysisType.FUNDAMENTALS)
+                .setColName("big_str").setState(AnalysisState.PENDING).build();
+        manager.replayCreateAnalysisJob(job);
+        manager.replayCreateAnalysisTask(taskInfo);
+
+        task1.info = taskInfo;
+        Map<Long, BaseAnalysisTask> tasks = new HashMap<>();
+        tasks.put(11L, task1);
+        manager.addToJobIdTasksMap(10, tasks);
+
+        String skipMsg = "Column [big_str] has row(s) whose byte length 
exceeds 1024"
+                + " (Config.statistics_max_string_column_length), skip 
collecting statistics for this column.";
+        manager.updateTaskStatus(taskInfo, AnalysisState.FINISHED, skipMsg, 0);
+        Assertions.assertEquals(skipMsg, taskInfo.message);
+        Assertions.assertTrue(job.message != null && 
job.message.contains(skipMsg),
+                "expected skip msg in job.message, got: " + job.message);
+        String firstJobMessage = job.message;
+
+        // Simulate flushBuffer replay: subsequent FINISHED with empty message 
should
+        // NOT wipe info.message NOR re-append skip reason.
+        manager.updateTaskStatus(taskInfo, AnalysisState.FINISHED, "", 0);
+        Assertions.assertEquals(skipMsg, taskInfo.message);
+        Assertions.assertEquals(firstJobMessage, job.message,
+                "job.message should not accumulate again on empty-message 
update");
+    }
+
+    @Test
+    public void testUpdateTaskStatusAccumulatesMultipleSkipMessages() {
+        // Two string columns get skipped -> job.message must contain both 
entries keyed
+        // by their respective colName, and repeated flushBuffer (FINISHED,"") 
replays
+        // must NOT duplicate them.
+        BaseAnalysisTask task1 = Mockito.mock(BaseAnalysisTask.class);
+        BaseAnalysisTask task2 = Mockito.mock(BaseAnalysisTask.class);
+
+        AnalysisManager manager = Mockito.spy(new AnalysisManager());
+        Mockito.doNothing().when(manager).logCreateAnalysisTask(Mockito.any());
+        Mockito.doNothing().when(manager).logCreateAnalysisJob(Mockito.any());
+        Mockito.doNothing().when(manager).updateTableStats(Mockito.any());
+
+        AnalysisInfo job = new AnalysisInfoBuilder().setJobId(20)
+                
.setState(AnalysisState.PENDING).setAnalysisType(AnalysisType.FUNDAMENTALS)
+                .setJobType(AnalysisInfo.JobType.MANUAL).build();
+        AnalysisInfo ti1 = new AnalysisInfoBuilder().setJobId(20).setTaskId(21)
+                .setColName("s1").setJobType(JobType.MANUAL)
+                
.setAnalysisType(AnalysisType.FUNDAMENTALS).setState(AnalysisState.PENDING).build();
+        AnalysisInfo ti2 = new AnalysisInfoBuilder().setJobId(20).setTaskId(22)
+                .setColName("s2").setJobType(JobType.MANUAL)
+                
.setAnalysisType(AnalysisType.FUNDAMENTALS).setState(AnalysisState.PENDING).build();
+        manager.replayCreateAnalysisJob(job);
+        manager.replayCreateAnalysisTask(ti1);
+        manager.replayCreateAnalysisTask(ti2);
+        task1.info = ti1;
+        task2.info = ti2;
+        Map<Long, BaseAnalysisTask> tasks = new HashMap<>();
+        tasks.put(21L, task1);
+        tasks.put(22L, task2);
+        manager.addToJobIdTasksMap(20, tasks);
+
+        String skip1 = "Column [s1] has row(s) whose byte length exceeds 1024 
...";
+        String skip2 = "Column [s2] has row(s) whose byte length exceeds 1024 
...";
+        manager.updateTaskStatus(ti1, AnalysisState.FINISHED, skip1, 0);
+        manager.updateTaskStatus(ti2, AnalysisState.FINISHED, skip2, 0);
+        Assertions.assertNotNull(job.message);
+        Assertions.assertTrue(job.message.contains("s1:[" + skip1 + "]"),
+                "expected s1 skip in job.message, got: " + job.message);
+        Assertions.assertTrue(job.message.contains("s2:[" + skip2 + "]"),
+                "expected s2 skip in job.message, got: " + job.message);
+        String afterFirstRound = job.message;
+
+        // Simulate flushBuffer replay with empty message for both tasks. 
Neither entry
+        // should be duplicated.
+        manager.updateTaskStatus(ti1, AnalysisState.FINISHED, "", 0);
+        manager.updateTaskStatus(ti2, AnalysisState.FINISHED, "", 0);
+        Assertions.assertEquals(afterFirstRound, job.message,
+                "job.message must remain stable across flushBuffer replays");
+        Assertions.assertEquals(skip1, ti1.message);
+        Assertions.assertEquals(skip2, ti2.message);
+    }
+
     @Test
     public void testRecordLimit1() {
         Config.analyze_record_limit = 2;
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
index e870b0b3bfd..f629f36d07c 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
@@ -209,7 +209,7 @@ public class HMSAnalysisTaskTest {
                         + "SUBSTRING(CAST(MIN(`hour`) AS STRING), 1, 1024) AS 
`min`, "
                         + "SUBSTRING(CAST(MAX(`hour`) AS STRING), 1, 1024) AS 
`max`, "
                         + "COUNT(1) * 4 AS `data_size`, NOW() AS 
`update_time`, "
-                        + "null as `hot_value` FROM `hms`.`default`.`test` ", 
sql);
+                        + "null as `hot_value` FROM (SELECT `hour` FROM 
`hms`.`default`.`test` ) __lc_t", sql);
             }
         };
 
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
index 89a0a67d810..740f8613437 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
@@ -754,4 +754,126 @@ public class OlapAnalysisTaskTest {
                 + "WHERE `catalog_id` = 0  AND `db_id` = 1  AND `tbl_id` = 2  
AND `idx_id` = 3  AND `col_id` = 'col1'",
                 sql);
     }
+
+    @Test
+    public void testAddLengthAssertParamForStringColumn() throws Exception {
+        // String column with positive config -> emits per-row assert_true 
guard
+        Column strCol = new Column("s", PrimitiveType.VARCHAR);
+        long savedLen = 
org.apache.doris.common.Config.statistics_max_string_column_length;
+        org.apache.doris.common.Config.statistics_max_string_column_length = 
1024;
+        try {
+            OlapAnalysisTask task = new OlapAnalysisTask();
+            AnalysisInfo info = new AnalysisInfoBuilder()
+                    .setJobId(1L)
+                    .setTaskId(2L)
+                    .setColName("s")
+
+                    .build();
+            task.info = info;
+            task.col = strCol;
+            Map<String, String> params = Maps.newHashMap();
+            task.addLengthAssertParam(params);
+            String lengthAssert = params.get("lengthAssert");
+            Assertions.assertNotNull(lengthAssert);
+            Assertions.assertTrue(lengthAssert.contains("assert_true"),
+                    "expected assert_true in placeholder, got: " + 
lengthAssert);
+            Assertions.assertTrue(lengthAssert.contains("IS NULL OR LENGTH"),
+                    "expected NULL guard, got: " + lengthAssert);
+            Assertions.assertTrue(lengthAssert.contains("1024"),
+                    "expected max length value, got: " + lengthAssert);
+            Assertions.assertTrue(
+                    
lengthAssert.contains(BaseAnalysisTask.ANALYZE_SKIP_LONG_STRING_COLUMN_MARKER),
+                    "expected marker in placeholder, got: " + lengthAssert);
+        } finally {
+            org.apache.doris.common.Config.statistics_max_string_column_length 
= savedLen;
+        }
+    }
+
+    @Test
+    public void testAddLengthAssertParamForNonStringColumn() {
+        // Non-string columns must emit an empty placeholder so SQL stays 
unchanged
+        Column intCol = new Column("id", PrimitiveType.INT);
+        OlapAnalysisTask task = new OlapAnalysisTask();
+        AnalysisInfo info = new AnalysisInfoBuilder()
+                .setJobId(1L).setTaskId(2L).setColName("id")
+                .build();
+        task.info = info;
+        task.col = intCol;
+        Map<String, String> params = Maps.newHashMap();
+        task.addLengthAssertParam(params);
+        Assertions.assertEquals("", params.get("lengthAssert"));
+    }
+
+    @Test
+    public void testAddLengthAssertParamConfigDisabled() {
+        Column strCol = new Column("s", PrimitiveType.VARCHAR);
+        long savedLen = 
org.apache.doris.common.Config.statistics_max_string_column_length;
+        org.apache.doris.common.Config.statistics_max_string_column_length = 0;
+        try {
+            OlapAnalysisTask task = new OlapAnalysisTask();
+            AnalysisInfo info = new AnalysisInfoBuilder()
+                    .setJobId(1L).setTaskId(2L).setColName("s")
+                    .build();
+            task.info = info;
+            task.col = strCol;
+            Map<String, String> params = Maps.newHashMap();
+            task.addLengthAssertParam(params);
+            Assertions.assertEquals("", params.get("lengthAssert"));
+        } finally {
+            org.apache.doris.common.Config.statistics_max_string_column_length 
= savedLen;
+        }
+    }
+
+    @Test
+    public void testFullAnalyzeTemplateRendersLengthAssert() {
+        // Confirm the rendered FULL_ANALYZE_TEMPLATE wraps the base table in 
a subquery
+        // and carries the assert_true clause for a string column.
+        Map<String, String> params = new HashMap<>();
+        params.put("internalDB", FeConstants.INTERNAL_DB_NAME);
+        params.put("columnStatTbl", 
StatisticConstants.TABLE_STATISTIC_TBL_NAME);
+        params.put("catalogId", "0");
+        params.put("dbId", "1");
+        params.put("tblId", "2");
+        params.put("idxId", "3");
+        params.put("colId", "s");
+        params.put("dataSizeFunction", "100");
+        params.put("catalogName", "internal");
+        params.put("dbName", "db1");
+        params.put("colName", "s");
+        params.put("tblName", "tbl1");
+        params.put("index", "");
+        params.put("lengthAssert",
+                ", assert_true(`s` IS NULL OR LENGTH(`s`) <= 1024, '"
+                        + 
BaseAnalysisTask.ANALYZE_SKIP_LONG_STRING_COLUMN_MARKER + "') AS `__lc`");
+        StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
+        String sql = 
stringSubstitutor.replace(BaseAnalysisTask.FULL_ANALYZE_TEMPLATE);
+        Assertions.assertTrue(sql.contains("FROM (SELECT `s`, assert_true("), 
sql);
+        Assertions.assertTrue(sql.contains("IS NULL OR LENGTH(`s`) <= 1024"), 
sql);
+        Assertions.assertTrue(sql.endsWith(") __lc_t"), sql);
+    }
+
+    @Test
+    public void testFullAnalyzeTemplateRendersWithoutLengthAssert() {
+        // Non-string columns yield a plain inline view that Nereids can 
collapse.
+        Map<String, String> params = new HashMap<>();
+        params.put("internalDB", FeConstants.INTERNAL_DB_NAME);
+        params.put("columnStatTbl", 
StatisticConstants.TABLE_STATISTIC_TBL_NAME);
+        params.put("catalogId", "0");
+        params.put("dbId", "1");
+        params.put("tblId", "2");
+        params.put("idxId", "3");
+        params.put("colId", "id");
+        params.put("dataSizeFunction", "100");
+        params.put("catalogName", "internal");
+        params.put("dbName", "db1");
+        params.put("colName", "id");
+        params.put("tblName", "tbl1");
+        params.put("index", "");
+        params.put("lengthAssert", "");
+        StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
+        String sql = 
stringSubstitutor.replace(BaseAnalysisTask.FULL_ANALYZE_TEMPLATE);
+        Assertions.assertFalse(sql.contains("assert_true"), sql);
+        Assertions.assertTrue(sql.contains("FROM (SELECT `id` FROM 
`internal`.`db1`.`tbl1`"), sql);
+        Assertions.assertTrue(sql.endsWith(") __lc_t"), sql);
+    }
 }
diff --git 
a/regression-test/suites/external_table_p0/hive/test_hive_analyze_long_string.groovy
 
b/regression-test/suites/external_table_p0/hive/test_hive_analyze_long_string.groovy
new file mode 100644
index 00000000000..faf8dd539ab
--- /dev/null
+++ 
b/regression-test/suites/external_table_p0/hive/test_hive_analyze_long_string.groovy
@@ -0,0 +1,125 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.awaitility.Awaitility
+import java.util.concurrent.TimeUnit
+
+suite("test_hive_analyze_long_string", 
"p0,external,hive,external_docker,external_docker_hive") {
+    String enabled = context.config.otherConfigs.get("enableHiveTest")
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+        logger.info("disable Hive test.")
+        return
+    }
+
+    // Feature only triggers when values exceed 1024 bytes; this test assumes 
the FE default.
+    def cfgRows = sql "admin show frontend config like 
'statistics_max_string_column_length'"
+    assertEquals(1, cfgRows.size())
+    assertEquals("1024", cfgRows[0][1].toString())
+
+    String longVal = "x" * 2048
+
+    for (String hivePrefix : ["hive3"]) {
+        setHivePrefix(hivePrefix)
+        String extHiveHmsHost = 
context.config.otherConfigs.get("externalEnvIp")
+        String extHiveHmsPort = context.config.otherConfigs.get(hivePrefix + 
"HmsPort")
+        String hdfsPort = context.config.otherConfigs.get(hivePrefix + 
"HdfsPort")
+        String catalogName = hivePrefix + "_test_analyze_long_string"
+        String dbName = "test_analyze_long_string_db"
+        String tblName = "t1"
+
+        // Seed the hive side via hive_docker so this suite is self-contained 
and
+        // does not require any pre-install hql.
+        hive_docker """drop table if exists ${dbName}.${tblName}"""
+        hive_docker """drop database if exists ${dbName} cascade"""
+        hive_docker """create database ${dbName}"""
+        hive_docker """create table ${dbName}.${tblName} (id int, s string) 
stored as parquet"""
+        hive_docker """insert into ${dbName}.${tblName} values (1, 'short'), 
(2, 'another'), (3, '${longVal}')"""
+
+        sql "drop catalog if exists ${catalogName}"
+        sql """
+            create catalog if not exists ${catalogName} properties (
+                'type'='hms',
+                'hadoop.username' = 'hadoop',
+                'hive.metastore.uris' = 
'thrift://${extHiveHmsHost}:${extHiveHmsPort}',
+                'fs.defaultFS' = 'hdfs://${extHiveHmsHost}:${hdfsPort}'
+            )
+        """
+        sql "refresh catalog ${catalogName}"
+
+        // Analyze column `s` which contains a >1024-byte row: task should be 
marked
+        // FINISHED with a skip message, and no column stats should be 
persisted.
+        def analyzeRes = sql "analyze table 
${catalogName}.${dbName}.${tblName}(s)"
+        assertEquals(1, analyzeRes.size())
+        long jobId = Long.parseLong(analyzeRes[0][0].toString())
+        logger.info("hive analyze long string jobId=${jobId}")
+
+        Awaitility.await()
+                .atMost(120, TimeUnit.SECONDS)
+                .pollInterval(2, TimeUnit.SECONDS)
+                .until {
+                    def rows = sql "show analyze ${jobId}"
+                    if (rows.isEmpty()) {
+                        return false
+                    }
+                    String state = rows[0][9].toString()
+                    return state == "FINISHED" || state == "FAILED"
+                }
+
+        def jobRows = sql "show analyze ${jobId}"
+        assertEquals("FINISHED", jobRows[0][9].toString())
+        String jobMessage = jobRows[0][7].toString()
+        assertTrue(jobMessage.contains("statistics_max_string_column_length")
+                   || jobMessage.contains("exceeds"),
+                   "expected skip reason in job message but got: 
${jobMessage}")
+
+        def taskRows = sql "show analyze task status ${jobId}"
+        def strTaskRow = taskRows.find { it[1].toString() == "s" }
+        assertNotNull(strTaskRow, "missing analyze task row for column s")
+        assertEquals("FINISHED", strTaskRow[6].toString())
+        String taskMessage = strTaskRow[3].toString()
+        assertTrue(taskMessage.contains("statistics_max_string_column_length")
+                   || taskMessage.contains("exceeds"),
+                   "expected skip reason in task message but got: 
${taskMessage}")
+
+        // The skipped column must NOT have any stored column statistics row.
+        def colStats = sql "show column stats 
${catalogName}.${dbName}.${tblName}(s)"
+        assertTrue(colStats.isEmpty(), "expected no column stats for skipped 
column s")
+
+        // A non-string column on the same table should still analyze 
successfully.
+        def idAnalyze = sql "analyze table 
${catalogName}.${dbName}.${tblName}(id)"
+        long idJobId = Long.parseLong(idAnalyze[0][0].toString())
+        Awaitility.await()
+                .atMost(120, TimeUnit.SECONDS)
+                .pollInterval(2, TimeUnit.SECONDS)
+                .until {
+                    def rows = sql "show analyze ${idJobId}"
+                    if (rows.isEmpty()) {
+                        return false
+                    }
+                    String state = rows[0][9].toString()
+                    return state == "FINISHED" || state == "FAILED"
+                }
+        def idJobRows = sql "show analyze ${idJobId}"
+        assertEquals("FINISHED", idJobRows[0][9].toString())
+        def idColStats = sql "show column stats 
${catalogName}.${dbName}.${tblName}(id)"
+        assertEquals(1, idColStats.size())
+
+        sql "drop catalog if exists ${catalogName}"
+        hive_docker """drop table if exists ${dbName}.${tblName}"""
+        hive_docker """drop database if exists ${dbName} cascade"""
+    }
+}
diff --git a/regression-test/suites/statistics/analyze_stats.groovy 
b/regression-test/suites/statistics/analyze_stats.groovy
index 7de06f23c6d..46aa9adb529 100644
--- a/regression-test/suites/statistics/analyze_stats.groovy
+++ b/regression-test/suites/statistics/analyze_stats.groovy
@@ -1378,7 +1378,7 @@ PARTITION `p599` VALUES IN (599)
     sql """
         INSERT INTO `${tbl}` VALUES 
(-2103297891,1,101,15248,4761818404925265645,939926.283,
         'UTmCFKMbprf0zSVOIlBJRNOl3JcNBdOsnCDt','2022-09-28','2022-10-28 
01:56:56','tVvGDSrN6kyn',
-        
-954349107.187117,-40.46286,'11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
 [...]
+        
-954349107.187117,-40.46286,'11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
 [...]
         '-1559301292834325905', NULL, NULL, NULL, NULL)
     """
 
diff --git a/regression-test/suites/statistics/test_analyze_long_string.groovy 
b/regression-test/suites/statistics/test_analyze_long_string.groovy
new file mode 100644
index 00000000000..a234f44bad1
--- /dev/null
+++ b/regression-test/suites/statistics/test_analyze_long_string.groovy
@@ -0,0 +1,288 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.awaitility.Awaitility
+
+import static java.util.concurrent.TimeUnit.SECONDS
+
+suite("test_analyze_long_string", "nonConcurrent") {
+    // `analyze ... with sync` does not persist the job in the in-memory map,
+    // so we use async analyze and poll SHOW ANALYZE / SHOW ANALYZE TASK 
STATUS.
+
+    def findJobId = { String ctl, String db, String tbl ->
+        def rows = sql """show analyze"""
+        def match = -1L
+        for (row in rows) {
+            // columns: job_id, catalog_name, db_name, tbl_name, ...
+            if (row[1].toString() == ctl
+                    && row[2].toString() == db
+                    && row[3].toString() == tbl) {
+                long id = Long.parseLong(row[0].toString())
+                if (id > match) {
+                    match = id
+                }
+            }
+        }
+        return match == -1L ? null : match
+    }
+
+    def waitJobFinished = { long jobId ->
+        Awaitility.await().atMost(120, SECONDS).pollInterval(2, SECONDS).until 
{
+            def rows = sql """show analyze ${jobId}"""
+            if (rows.isEmpty()) {
+                return false
+            }
+            def state = rows[0][9].toString()
+            logger.info("job ${jobId} state=${state}")
+            return state == "FINISHED" || state == "FAILED"
+        }
+    }
+
+    def collectTaskStatuses = { long jobId ->
+        def rows = sql """show analyze task status ${jobId}"""
+        def result = [:]
+        for (row in rows) {
+            // columns: task_id, col_name, index_name, message,
+            //          last_state_change_time, time_cost_in_ms, state
+            def colName = row[1].toString()
+            def msg = row[3] == null ? "" : row[3].toString()
+            def state = row[6].toString()
+            result[colName] = [state: state, message: msg]
+        }
+        return result
+    }
+
+    def assertTaskSkipped = { long jobId, List<String> expectedSkipped, 
List<String> expectedOk ->
+        def statuses = collectTaskStatuses(jobId)
+        logger.info("job ${jobId} task statuses=${statuses}")
+        for (c in expectedSkipped) {
+            assertTrue(statuses.containsKey(c), "missing task for skipped col 
${c}")
+            assertEquals("FINISHED", statuses[c].state,
+                    "expected FINISHED for skipped col ${c}, got 
${statuses[c]}")
+            
assertTrue(statuses[c].message.contains("statistics_max_string_column_length")
+                    || statuses[c].message.contains("exceeds"),
+                    "expected skip reason visible for col ${c}, got 
msg=${statuses[c].message}")
+        }
+        for (c in expectedOk) {
+            assertTrue(statuses.containsKey(c), "missing task for col ${c}")
+            assertEquals("FINISHED", statuses[c].state,
+                    "expected FINISHED for col ${c}, got ${statuses[c]}")
+            assertEquals("", statuses[c].message,
+                    "expected empty message for col ${c}, got 
${statuses[c].message}")
+        }
+    }
+
+    def collectedColumns = { String tbl ->
+        def rows = sql """show column stats ${tbl}"""
+        def set = [] as Set
+        for (r in rows) {
+            set.add(r[0].toString())
+        }
+        return set
+    }
+
+    sql """drop database if exists test_analyze_long_string"""
+    sql """create database test_analyze_long_string"""
+    sql """use test_analyze_long_string"""
+    sql """set global enable_auto_analyze=false"""
+
+    // ---------- Case 1: full analyze on a non-partitioned table ----------
+    sql """drop table if exists test_analyze_long_string_full"""
+    sql """
+        CREATE TABLE test_analyze_long_string_full (
+          `id` bigint,
+          `name` varchar(100),
+          `small_str` varchar(100),
+          `fixed` char(16),
+          `big_str` string
+        ) ENGINE=OLAP
+        DISTRIBUTED BY HASH(`id`) BUCKETS 4
+        PROPERTIES ("replication_allocation" = "tag.location.default: 1");
+    """
+    sql """insert into test_analyze_long_string_full values(1, 'alice',   
'aaa', 'abc',   NULL)"""
+    sql """insert into test_analyze_long_string_full values(2, NULL,      
'bbb', 'defg',  repeat('y', 100))"""
+    sql """insert into test_analyze_long_string_full values(3, 'charlie', 
'ccc', 'hij',   repeat('z', 2048))"""
+
+    setFeConfigTemporary([statistics_max_string_column_length: 1024]) {
+        sql """analyze table test_analyze_long_string_full"""
+        def jobId = findJobId("internal", "test_analyze_long_string", 
"test_analyze_long_string_full")
+        assertNotNull(jobId, "must find analyze job for 
test_analyze_long_string_full")
+        waitJobFinished(jobId)
+        assertTaskSkipped(jobId, ["big_str"], ["id", "name", "small_str", 
"fixed"])
+        def collected = collectedColumns("test_analyze_long_string_full")
+        logger.info("test_analyze_long_string_full collected=${collected}")
+        assertFalse(collected.contains("big_str"),
+                "big_str should be skipped, got ${collected}")
+        assertTrue(collected.contains("id"))
+        assertTrue(collected.contains("name"))
+        assertTrue(collected.contains("small_str"))
+        assertTrue(collected.contains("fixed"))
+        def bigRows = sql """show column stats test_analyze_long_string_full 
(big_str)"""
+        assertEquals(0, bigRows.size(),
+                "expected no stats row for skipped column big_str, got 
${bigRows}")
+    }
+
+    // ---------- Case 2: disabled (limit <= 0) must collect everything 
----------
+    setFeConfigTemporary([statistics_max_string_column_length: 0]) {
+        sql """analyze table test_analyze_long_string_full with sync"""
+        def bigRows = sql """show column stats test_analyze_long_string_full 
(big_str)"""
+        assertEquals(1, bigRows.size(),
+                "protection disabled: big_str should be collected, got 
${bigRows}")
+        assertEquals("3.0", bigRows[0][2].toString())
+    }
+
+    // ---------- Case 3: sample analyze (LINEAR / DUJ1 paths) ----------
+    sql """drop table if exists test_analyze_long_string_sample"""
+    sql """
+        CREATE TABLE test_analyze_long_string_sample (
+          `id` bigint,
+          `small_str` varchar(100),
+          `big_str` string
+        ) ENGINE=OLAP
+        DISTRIBUTED BY HASH(`id`) BUCKETS 2
+        PROPERTIES ("replication_allocation" = "tag.location.default: 1");
+    """
+    sql """insert into test_analyze_long_string_sample values(1, 'aa', 
repeat('z', 2048))"""
+    sql """insert into test_analyze_long_string_sample values(2, 'bb', 
'short1')"""
+    sql """insert into test_analyze_long_string_sample values(3, 'cc', 
'short2')"""
+    sql """insert into test_analyze_long_string_sample values(4, 'dd', 
'short3')"""
+    sql """insert into test_analyze_long_string_sample values(5, 'ee', 
'short4')"""
+
+    setFeConfigTemporary([statistics_max_string_column_length: 1024]) {
+        sql """analyze table test_analyze_long_string_sample with sample 
percent 100"""
+        def jobId = findJobId("internal", "test_analyze_long_string", 
"test_analyze_long_string_sample")
+        assertNotNull(jobId, "must find sample analyze job for 
test_analyze_long_string_sample")
+        waitJobFinished(jobId)
+        assertTaskSkipped(jobId, ["big_str"], ["id", "small_str"])
+        def collected = collectedColumns("test_analyze_long_string_sample")
+        assertFalse(collected.contains("big_str"), "sample analyze: big_str 
should be skipped")
+        assertTrue(collected.contains("small_str"))
+    }
+
+    // ---------- Case 4: partitioned table — long-string guard does NOT apply 
----------
+    // Partition-granularity analyze uses PARTITION_ANALYZE_TEMPLATE and 
commits per-batch
+    // inserts into __partition_stats incrementally. To avoid leaving a mix of 
fresh and
+    // stale partition rows on mid-loop abort, the long-string skip guard is 
intentionally
+    // NOT wired into this path. Analyze should FINISH normally and produce 
statistics
+    // for big_str even when rows exceed statistics_max_string_column_length.
+    sql """drop table if exists test_analyze_long_string_part"""
+    sql """
+        CREATE TABLE test_analyze_long_string_part (
+          `d` date NOT NULL,
+          `id` bigint,
+          `big_str` string
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`d`)
+        PARTITION BY RANGE(`d`) (
+          PARTITION p1 VALUES LESS THAN ('2024-01-02'),
+          PARTITION p2 VALUES LESS THAN ('2024-01-03')
+        )
+        DISTRIBUTED BY HASH(`id`) BUCKETS 2
+        PROPERTIES ("replication_allocation" = "tag.location.default: 1");
+    """
+    sql """insert into test_analyze_long_string_part values('2024-01-01', 1, 
'short_a'),
+                                    ('2024-01-01', 2, repeat('x', 2048)),
+                                    ('2024-01-02', 3, 'short_b')"""
+
+    setFeConfigTemporary([statistics_max_string_column_length: 1024]) {
+        sql """set global enable_partition_analyze = true"""
+        try {
+            sql """analyze table test_analyze_long_string_part"""
+            def jobId = findJobId("internal", "test_analyze_long_string", 
"test_analyze_long_string_part")
+            assertNotNull(jobId, "must find analyze job for 
test_analyze_long_string_part")
+            waitJobFinished(jobId)
+            // No task should be skipped: partition path bypasses the 
long-string guard.
+            assertTaskSkipped(jobId, [], ["d", "id", "big_str"])
+            def collected = collectedColumns("test_analyze_long_string_part")
+            assertTrue(collected.contains("big_str"),
+                    "partition analyze should still collect big_str (guard 
does not apply)")
+            assertTrue(collected.contains("d"))
+            assertTrue(collected.contains("id"))
+        } finally {
+            sql """set global enable_partition_analyze = false"""
+        }
+    }
+
+    // ---------- Case 5: sample analyze forced onto DUJ1 template ----------
+    // useLinearAnalyzeTemplate() normally returns true on small tables 
(scanFullTable),
+    // so LINEAR covers the sample path. Use a FE debug point to force DUJ1 so 
its
+    // ${lengthAssert} injection is exercised end-to-end.
+    sql """drop table if exists test_analyze_long_string_duj1"""
+    sql """
+        CREATE TABLE test_analyze_long_string_duj1 (
+          `id` bigint,
+          `small_str` varchar(100),
+          `big_str` string
+        ) ENGINE=OLAP
+        DISTRIBUTED BY HASH(`id`) BUCKETS 2
+        PROPERTIES ("replication_allocation" = "tag.location.default: 1");
+    """
+    sql """insert into test_analyze_long_string_duj1 values(1, 'aa', 
repeat('z', 2048))"""
+    sql """insert into test_analyze_long_string_duj1 values(2, 'bb', 
'short1')"""
+    sql """insert into test_analyze_long_string_duj1 values(3, 'cc', 
'short2')"""
+    sql """insert into test_analyze_long_string_duj1 values(4, 'dd', 
'short3')"""
+    sql """insert into test_analyze_long_string_duj1 values(5, 'ee', 
'short4')"""
+
+    setFeConfigTemporary([statistics_max_string_column_length: 1024]) {
+        
GetDebugPoint().enableDebugPointForAllFEs('OlapAnalysisTask.useDUJ1Template')
+        try {
+            sql """analyze table test_analyze_long_string_duj1 with sample 
rows 3"""
+            def jobId = findJobId("internal", "test_analyze_long_string", 
"test_analyze_long_string_duj1")
+            assertNotNull(jobId, "must find analyze job for 
test_analyze_long_string_duj1")
+            waitJobFinished(jobId)
+            assertTaskSkipped(jobId, ["big_str"], ["id", "small_str"])
+            def collected = collectedColumns("test_analyze_long_string_duj1")
+            assertFalse(collected.contains("big_str"),
+                    "DUJ1 analyze: big_str should be skipped")
+            assertTrue(collected.contains("small_str"))
+        } finally {
+            
GetDebugPoint().disableDebugPointForAllFEs('OlapAnalysisTask.useDUJ1Template')
+        }
+    }
+
+    // ---------- Case 6: WITH SYNC — skipped column must have no stats 
----------
+    // Sync analyze never populates analysisJobIdToTaskMap, so SHOW ANALYZE 
cannot see
+    // the skip reason. AnalysisManager.buildAndAssignJob surfaces skip 
messages via
+    // ConnectContext OK-packet info for interactive visibility. We cannot 
easily read
+    // that info string from JDBC in regression, so we verify the functional 
outcome:
+    // the skipped column produces no row in column_statistics, while other 
columns do.
+    sql """drop table if exists test_analyze_long_string_full"""
+    sql """
+        CREATE TABLE test_analyze_long_string_full (
+          `id` bigint,
+          `name` varchar(100),
+          `big_str` string
+        ) ENGINE=OLAP
+        DISTRIBUTED BY HASH(`id`) BUCKETS 4
+        PROPERTIES ("replication_allocation" = "tag.location.default: 1");
+    """
+    sql """insert into test_analyze_long_string_full values(1, 'alice', 
NULL)"""
+    sql """insert into test_analyze_long_string_full values(2, 'bob',   
repeat('y', 100))"""
+    sql """insert into test_analyze_long_string_full values(3, 'chris', 
repeat('z', 2048))"""
+    setFeConfigTemporary([statistics_max_string_column_length: 1024]) {
+        sql """analyze table test_analyze_long_string_full with sync"""
+        def bigRows = sql """show column stats test_analyze_long_string_full 
(big_str)"""
+        assertEquals(0, bigRows.size(),
+                "WITH SYNC: big_str should be skipped, got ${bigRows}")
+        def nameRows = sql """show column stats test_analyze_long_string_full 
(name)"""
+        assertEquals(1, nameRows.size(),
+                "WITH SYNC: name should be collected, got ${nameRows}")
+    }
+
+    // ---------- Cleanup ----------
+    sql """drop database if exists test_analyze_long_string"""
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) branch branch-4.1 updated: branch-4.1: [feature](statistics) Skip collecting stats for long string columns #62686 (#63303)

Reply via email to