betodealmeida commented on code in PR #36805:
URL: https://github.com/apache/superset/pull/36805#discussion_r2692112658


##########
docs/scripts/generate-database-docs.mjs:
##########
@@ -0,0 +1,706 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * This script generates database documentation data from the Python lib.py script.
+ * It outputs a JSON file that can be imported by React components for rendering.
+ *
+ * Usage: node scripts/generate-database-docs.mjs
+ *
+ * The script can run in two modes:
+ * 1. With Flask app (full diagnostics) - requires superset to be installed
+ * 2. Fallback mode (documentation only) - uses just the DATABASE_DOCS from lib.py
+ */
+
+import { spawnSync } from 'child_process';
+import fs from 'fs';
+import path from 'path';
+import { fileURLToPath } from 'url';
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+const ROOT_DIR = path.resolve(__dirname, '../..');
+const DOCS_DIR = path.resolve(__dirname, '..');
+const DATA_OUTPUT_DIR = path.join(DOCS_DIR, 'src/data');
+const DATA_OUTPUT_FILE = path.join(DATA_OUTPUT_DIR, 'databases.json');
+const MDX_OUTPUT_DIR = path.join(DOCS_DIR, 'docs/databases');
+
+/**
+ * Try to run the full lib.py script with Flask context
+ */
+function tryRunFullScript() {
+  try {
+    console.log('Attempting to run lib.py with Flask context...');
+    const pythonCode = `
+import sys
+import json
+sys.path.insert(0, '.')
+from superset.app import create_app
+from superset.db_engine_specs.lib import generate_yaml_docs
+app = create_app()
+with app.app_context():
+    docs = generate_yaml_docs()
+    print(json.dumps(docs, default=str))
+`;
+    const result = spawnSync('python3', ['-c', pythonCode], {
+      cwd: ROOT_DIR,
+      encoding: 'utf-8',
+      timeout: 60000,
+      maxBuffer: 10 * 1024 * 1024,
+      env: { ...process.env, SUPERSET_SECRET_KEY: 'docs-build-key' },
+    });
+
+    if (result.error) {
+      throw result.error;
+    }
+    if (result.status !== 0) {
+      throw new Error(result.stderr || 'Python script failed');
+    }
+    return JSON.parse(result.stdout);
+  } catch (error) {
+    console.log('Full script execution failed, using fallback mode...');
+    console.log('  Reason:', error.message?.split('\n')[0] || 'Unknown error');
+    return null;
+  }
+}
+
+/**
+ * Extract DATABASE_DOCS from lib.py using pure AST parsing
+ * No superset imports required - works in CI without dependencies
+ */
+function extractDatabaseDocs() {
+  console.log('Extracting DATABASE_DOCS directly from lib.py...');
+
+  try {
+    const pythonCode = `
+import sys
+import json
+import ast
+import re
+
+# Read the lib.py file
+with open('superset/db_engine_specs/lib.py', 'r') as f:
+    content = f.read()
+
+# Find the DATABASE_DOCS dictionary using regex to extract the block
+# This avoids needing to execute any superset code
+# Handle both plain assignment and type-annotated assignment
+match = re.search(r'^DATABASE_DOCS[^=]*=\\s*({.*?^})', content, re.MULTILINE | re.DOTALL)
+if not match:
+    print(json.dumps({}))
+    sys.exit(0)
+
+docs_str = match.group(1)
+
+# Parse the extracted dictionary as a Python literal
+tree = ast.parse(docs_str, mode='eval')
+
+# Safely evaluate the AST - only allow literals
+def eval_node(node):
+    if isinstance(node, ast.Expression):
+        return eval_node(node.body)
+    elif isinstance(node, ast.BinOp) and isinstance(node.op, ast.Add):
+        # Handle string concatenation
+        return eval_node(node.left) + eval_node(node.right)
+    elif isinstance(node, ast.Dict):
+        return {eval_node(k): eval_node(v) for k, v in zip(node.keys, node.values)}
+    elif isinstance(node, ast.List):
+        return [eval_node(e) for e in node.elts]
+    elif isinstance(node, ast.Set):
+        return {eval_node(e) for e in node.elts}
+    elif isinstance(node, ast.Tuple):
+        return tuple(eval_node(e) for e in node.elts)
+    elif isinstance(node, ast.Constant):
+        return node.value
+    # ast.Str / ast.Num / ast.NameConstant were only needed for Python <= 3.7
+    # and have been removed in recent Python versions; ast.Constant above
+    # covers all literal constants on 3.8+.
+    elif isinstance(node, ast.Name):
+        # Handle True, False, None
+        if node.id == 'True':
+            return True
+        elif node.id == 'False':
+            return False
+        elif node.id == 'None':
+            return None
+        else:
+            return node.id  # Return name as string for unknown refs
+    elif isinstance(node, ast.Call):
+        # Handle set() calls
+        if isinstance(node.func, ast.Name) and node.func.id == 'set':
+            if node.args:
+                return set(eval_node(node.args[0]))
+            return set()
+        return None
+    else:
+        return None
+
+result = eval_node(tree)
+print(json.dumps(result if result else {}, default=str))
+`;
+    const result = spawnSync('python3', ['-c', pythonCode], {
+      cwd: ROOT_DIR,
+      encoding: 'utf-8',
+      timeout: 30000,
+      maxBuffer: 10 * 1024 * 1024,
+    });
+
+    if (result.error) {
+      throw result.error;
+    }
+    if (result.status !== 0) {
+      throw new Error(result.stderr || 'Python script failed');
+    }
+    const databaseDocs = JSON.parse(result.stdout);
+    if (Object.keys(databaseDocs).length === 0) {
+      throw new Error('No DATABASE_DOCS found');
+    }
+
+    // Build the expected format: wrap documentation in full entry structure
+    const databases = {};
+    for (const [name, docs] of Object.entries(databaseDocs)) {
+      databases[name] = {
+        engine: name.toLowerCase().replace(/\s+/g, '_'),
+        engine_name: name,
+        documentation: docs,
+        time_grains: {},
+        score: 0,
+        max_score: 0,
+        joins: true,
+        subqueries: true,
+        supports_dynamic_schema: false,
+        supports_catalog: false,
+        supports_dynamic_catalog: false,
+        ssh_tunneling: false,
+        query_cancelation: false,
+        supports_file_upload: false,
+        user_impersonation: false,
+        query_cost_estimation: false,
+        sql_validation: false,
+      };
+    }
+    console.log(`Extracted ${Object.keys(databases).length} databases from lib.py`);
+    return databases;
+  } catch (err) {
+    console.error('AST extraction failed:', err.message);
+    // Fall back to simple extraction
+    return extractDatabaseDocsSimple();
+  }
+}
+
+/**
+ * Simple extraction fallback - returns null so the caller keeps existing data
+ * No superset imports required - works in CI without dependencies
+ */
+function extractDatabaseDocsSimple() {
+  console.log('Using simple extraction method (no diagnostics)...');
+  // This is a final fallback - just return null and let the caller use existing data
+  return null;
+}
+
+/**
+ * Build statistics from the database data
+ */
+function buildStatistics(databases) {
+  const stats = {
+    totalDatabases: Object.keys(databases).length,
+    withDocumentation: 0,
+    withConnectionString: 0,
+    withDrivers: 0,
+    withAuthMethods: 0,
+    supportsJoins: 0,
+    supportsSubqueries: 0,
+    supportsDynamicSchema: 0,
+    supportsCatalog: 0,
+    averageScore: 0,
+    maxScore: 0,
+    byCategory: {},
+  };
+
+  let totalScore = 0;
+
+  for (const [name, db] of Object.entries(databases)) {
+    const docs = db.documentation || {};
+
+    if (Object.keys(docs).length > 0) stats.withDocumentation++;
+    if (docs.connection_string || docs.drivers?.length > 0)
+      stats.withConnectionString++;
+    if (docs.drivers?.length > 0) stats.withDrivers++;
+    if (docs.authentication_methods?.length > 0) stats.withAuthMethods++;
+    if (db.joins) stats.supportsJoins++;
+    if (db.subqueries) stats.supportsSubqueries++;
+    if (db.supports_dynamic_schema) stats.supportsDynamicSchema++;
+    if (db.supports_catalog) stats.supportsCatalog++;
+
+    totalScore += db.score || 0;
+    if (db.max_score > stats.maxScore) stats.maxScore = db.max_score;
+
+    // Categorize databases
+    const category = categorizeDatabase(name);
+    if (!stats.byCategory[category]) {
+      stats.byCategory[category] = [];
+    }
+    stats.byCategory[category].push(name);
+  }
+
+  stats.averageScore = Math.round(totalScore / stats.totalDatabases);
+
+  return stats;
+}
+
+/**
+ * Categorize a database by its type
+ */
+function categorizeDatabase(name) {

Review Comment:
   This PR is super cool, but I do have a concern about maintainability (one that applies to this function and a few other places):
   
   **If we add a new database to Superset, how many files do we need to touch in order to keep the documentation up-to-date?**
   
   Right now it seems that it would require adding a new file to `docs/docs/`, adding the logo, and updating two functions. (By the way, why do we have both `categorizeDatabase` and `getCategory`? They seem to do the same thing.)
   
   Ideally we'd want this information centralized in the DB engine spec, so that no additional work is necessary. The current approach is to use `lib.py` to figure out attributes, but we could define a schema for the metadata:
   
   ```python
   class MyDBEngineSpec(BaseEngineSpec):
       metadata: DBEngineSpecMetadata = {
           "name": "MyDB",
           "description": "Not to be confused with MySQL",
           "logo": "https://mydb.example.com/logo.png";,  # not sure if Apache 
would be OK with this
           "supported_features": {"joins", "query-cost-estimation"},
           "supported_time_grains": {...},
           ...,
       }
   ```
   
   Then we could use the AST parser (I think?) to extract the `metadata` dict and trivially convert to JSON.
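   
   Something like this minimal sketch, perhaps (the `mydb.py` path is hypothetical, and `metadata` is just the attribute name from the schema above):
   
   ```python
   import ast
   import json
   
   def extract_metadata(source: str) -> dict:
       """Collect class-level `metadata` dicts without importing superset."""
       specs = {}
       for node in ast.walk(ast.parse(source)):
           if not isinstance(node, ast.ClassDef):
               continue
           for stmt in node.body:
               # Match both `metadata = {...}` and `metadata: DBEngineSpecMetadata = {...}`
               target, value = None, None
               if isinstance(stmt, ast.Assign) and len(stmt.targets) == 1:
                   target, value = stmt.targets[0], stmt.value
               elif isinstance(stmt, ast.AnnAssign):
                   target, value = stmt.target, stmt.value
               if isinstance(target, ast.Name) and target.id == "metadata" and value:
                   try:
                       # literal_eval only accepts literals, so nothing gets executed
                       specs[node.name] = ast.literal_eval(value)
                   except ValueError:
                       pass  # metadata isn't a pure literal; skip it
       return specs
   
   with open("superset/db_engine_specs/mydb.py") as f:
       # sets (e.g. supported_features) serialize as lists
       print(json.dumps(extract_metadata(f.read()), default=list))
   ```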
   
   It adds more work, since we're moving away from the auto-detection approach, and the metadata could fall out of sync (for example, if someone adds a time grain but forgets to update `supported_time_grains` in the metadata). But we could have a CI action that tests that `supported_time_grains` is correct.
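   
   That check could be a small unit test along these lines (hypothetical names again, and I'm assuming the spec exposes its grains via something like `get_time_grain_expressions`):
   
   ```python
   from superset.db_engine_specs.mydb import MyDBEngineSpec  # hypothetical spec
   
   def test_metadata_time_grains_in_sync() -> None:
       declared = set(MyDBEngineSpec.metadata["supported_time_grains"])
       # The None grain just means "no truncation", so it isn't declared
       actual = {
           grain
           for grain in MyDBEngineSpec.get_time_grain_expressions()
           if grain is not None
       }
       assert declared == actual, "metadata is out of sync with the engine spec"
   ```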
   
   I'm not saying we should do this in this PR, just proposing some ideas. 😄 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

