alexandrusoare commented on code in PR #37200:
URL: https://github.com/apache/superset/pull/37200#discussion_r2758191985
##########
superset/mcp_service/utils/token_utils.py:
##########
@@ -0,0 +1,399 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Token counting and response size utilities for MCP service.
+
+This module provides utilities to estimate token counts and generate smart
+suggestions when responses exceed configured limits. This prevents large
+responses from overwhelming LLM clients like Claude Desktop.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Dict, List
+
+logger = logging.getLogger(__name__)
+
+# Approximate characters per token for estimation
+# Claude tokenizer averages ~4 chars per token for English text
+# JSON tends to be more verbose, so we use a slightly lower ratio
+CHARS_PER_TOKEN = 3.5
+
+
+def estimate_token_count(text: str | bytes) -> int:
+    """
+    Estimate the token count for a given text.
+
+    Uses a character-based heuristic since we don't have direct access to
+    the actual tokenizer. This is conservative to avoid underestimating.
+
+    Args:
+        text: The text to estimate tokens for (string or bytes)
+
+    Returns:
+        Estimated number of tokens
+    """
+    if isinstance(text, bytes):
+        text = text.decode("utf-8", errors="replace")
+
+    # Simple heuristic: ~3.5 characters per token for JSON/code
+    return int(len(text) / CHARS_PER_TOKEN)
+
+
+def estimate_response_tokens(response: Any) -> int:

Review Comment:
   Is there a better type that we can use instead of Any?
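For context on the question above, one narrower candidate is a recursive JSON-style union, since these payloads are ultimately serialized before the token estimate matters. The following is a minimal sketch, assuming the responses are plain JSON-serializable structures; `JSONValue` is a hypothetical alias (not defined anywhere in the PR) and the function body is purely illustrative, since the hunk shown here stops at the signature:

```python
from __future__ import annotations

import json
from typing import TypeAlias

# Hypothetical recursive alias for JSON-serializable MCP payloads
# (not part of the PR; shown only to illustrate one alternative to Any).
JSONValue: TypeAlias = (
    str | int | float | bool | None | dict[str, "JSONValue"] | list["JSONValue"]
)


def estimate_response_tokens(response: JSONValue) -> int:
    """Estimate tokens for a JSON-serializable response payload."""
    # Illustrative body only: serialize the payload the same way it would be
    # sent, then reuse the module's estimate_token_count() heuristic defined
    # earlier in this diff.
    return estimate_token_count(json.dumps(response, default=str))
```

A `TypedDict` or `Protocol` would be stricter still, but only pays off if the response shapes are fixed; the union keeps the function usable for arbitrary tool output.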
