Copilot commented on code in PR #4486: URL: https://github.com/apache/texera/pull/4486#discussion_r3149874888
########## frontend/src/app/workspace/service/jupyter-panel/jupyter-panel.service.ts: ########## @@ -0,0 +1,263 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Injectable } from "@angular/core"; +import { BehaviorSubject, catchError, map, of } from "rxjs"; +import { WorkflowActionService } from "../workflow-graph/model/workflow-action.service"; +import { OperatorLink } from "../../types/workflow-common.interface"; +import { HttpClient, HttpHeaders } from "@angular/common/http"; +import { UntilDestroy } from "@ngneat/until-destroy"; +import { NotificationService } from "src/app/common/service/notification/notification.service"; +import { distinctUntilChanged, switchMap } from "rxjs/operators"; +import { AppSettings } from "../../../common/app-setting"; +import { NotebookMigrationService } from "../notebook-migration/notebook-migration.service"; + +@UntilDestroy() +@Injectable({ + providedIn: "root", +}) +export class JupyterPanelService { + private jupyterNotebookPanelVisible = new BehaviorSubject<boolean>(false); + public jupyterNotebookPanelVisible$ = this.jupyterNotebookPanelVisible.asObservable(); + + private iframeRef: HTMLIFrameElement | null = null; // Store reference to iframe element 
+ private cellContent: string[] = []; // Store the content of the cells + private highlightedCell: number | null = null; // Track the highlighted cell + + // Precomputed dictionary for cell to highlight mapping + private cellToHighlightMapping: Record<string, { components: string[]; edges: string[] }> = {}; + + constructor( + private workflowActionService: WorkflowActionService, + private http: HttpClient, + private notificationService: NotificationService, + private notebookMigrationService: NotebookMigrationService + ) { + window.addEventListener("message", this.handleNotebookMessage); + } Review Comment: The service registers a global `window.addEventListener('message', ...)` handler but never removes it. Even if this is a singleton, it can cause leaks / duplicate handlers in tests or if the provider scope changes; implement `OnDestroy` and call `removeEventListener` with the same handler. ########## notebook-migration-service/build.sbt: ########## @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import scala.collection.Seq + +name := "notebook-migration-service" + + +enablePlugins(JavaAppPackaging) + +// Enable semanticdb for Scalafix +ThisBuild / semanticdbEnabled := true +ThisBuild / semanticdbVersion := scalafixSemanticdb.revision + +// Manage dependency conflicts by always using the latest revision +ThisBuild / conflictManager := ConflictManager.latestRevision + +// Restrict parallel execution of tests to avoid conflicts +Global / concurrentRestrictions += Tags.limit(Tags.Test, 1) + +///////////////////////////////////////////////////////////////////////////// +// Compiler Options +///////////////////////////////////////////////////////////////////////////// + +// Scala compiler options +Compile / scalacOptions ++= Seq( + "-Xelide-below", "WARNING", // Turn on optimizations with "WARNING" as the threshold + "-feature", // Check feature warnings + "-deprecation", // Check deprecation warnings + "-Ywarn-unused:imports" // Check for unused imports +) + +///////////////////////////////////////////////////////////////////////////// +// Version Variables +///////////////////////////////////////////////////////////////////////////// + +val dropwizardVersion = "4.0.7" +val mockitoVersion = "5.4.0" +val assertjVersion = "3.24.2" + +///////////////////////////////////////////////////////////////////////////// +// Test-related Dependencies +///////////////////////////////////////////////////////////////////////////// + +libraryDependencies ++= Seq( + "org.scalamock" %% "scalamock" % "5.2.0" % Test, // ScalaMock + "org.scalatest" %% "scalatest" % "3.2.17" % Test, // ScalaTest + "io.dropwizard" % "dropwizard-testing" % dropwizardVersion % Test, // Dropwizard Testing + "org.mockito" % "mockito-core" % mockitoVersion % Test, // Mockito for mocking + "org.assertj" % "assertj-core" % assertjVersion % Test, // AssertJ for assertions + "com.novocode" % "junit-interface" % "0.11" % Test // SBT interface for JUnit +) + 
+///////////////////////////////////////////////////////////////////////////// +// Dependencies +///////////////////////////////////////////////////////////////////////////// + +// Core Dependencies +libraryDependencies ++= Seq( + "io.dropwizard" % "dropwizard-core" % dropwizardVersion, + "io.dropwizard" % "dropwizard-auth" % dropwizardVersion, // Dropwizard Authentication module + "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.15.2" Review Comment: `notebook-migration-service` declares `jackson-module-scala` as `2.15.2`, but the rest of the repo standardizes on newer Jackson (currently `2.18.6`) and the root build already adds overrides. Please align this module’s Jackson dependency with the repo-wide version to avoid runtime classpath conflicts. ```suggestion "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.18.6" ``` ########## frontend/src/app/workspace/component/workflow-editor/workflow-editor.component.ts: ########## @@ -618,12 +620,14 @@ export class WorkflowEditorComponent implements OnInit, AfterViewInit, OnDestroy const elementID = event[0].model.id.toString(); const highlightedOperatorIDs = this.wrapper.getCurrentHighlightedOperatorIDs(); const highlightedCommentBoxIDs = this.wrapper.getCurrentHighlightedCommentBoxIDs(); + this.jupyterPanelService.onWorkflowComponentClick(elementID); // Highlight corresponding Jupyter notebook cell if (event[1].shiftKey) { // if in multiselect toggle highlights on click if (highlightedOperatorIDs.includes(elementID)) { this.workflowActionService.unhighlightOperators(elementID); } else if (this.workflowActionService.getTexeraGraph().hasOperator(elementID)) { this.workflowActionService.highlightOperators(<boolean>event[1].shiftKey, elementID); + this.jupyterPanelService.onWorkflowComponentClick(elementID); // Highlight corresponding Jupyter notebook cell Review Comment: `onWorkflowComponentClick` is invoked twice for the same click path (once unconditionally above, and again inside the `shiftKey` 
branch). This can cause duplicate postMessage/highlight work; please keep only one call per click. ```suggestion ``` ########## notebook-migration-service/src/main/resources/docker-compose.yml: ########## @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: texera-jupyter +services: + + jupyter: + build: + context: . + dockerfile: Dockerfile Review Comment: `docker-compose.yml` specifies `dockerfile: Dockerfile`, but the repository file is named `dockerfile` (lowercase). On case-sensitive filesystems this will fail to build; rename the file to `Dockerfile` or update the compose reference to match the actual filename. ```suggestion dockerfile: dockerfile ``` ########## frontend/src/app/workspace/service/jupyter-panel/jupyter-panel.service.ts: ########## @@ -0,0 +1,263 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Injectable } from "@angular/core"; +import { BehaviorSubject, catchError, map, of } from "rxjs"; +import { WorkflowActionService } from "../workflow-graph/model/workflow-action.service"; +import { OperatorLink } from "../../types/workflow-common.interface"; +import { HttpClient, HttpHeaders } from "@angular/common/http"; +import { UntilDestroy } from "@ngneat/until-destroy"; +import { NotificationService } from "src/app/common/service/notification/notification.service"; +import { distinctUntilChanged, switchMap } from "rxjs/operators"; +import { AppSettings } from "../../../common/app-setting"; +import { NotebookMigrationService } from "../notebook-migration/notebook-migration.service"; + +@UntilDestroy() +@Injectable({ + providedIn: "root", +}) +export class JupyterPanelService { + private jupyterNotebookPanelVisible = new BehaviorSubject<boolean>(false); + public jupyterNotebookPanelVisible$ = this.jupyterNotebookPanelVisible.asObservable(); + + private iframeRef: HTMLIFrameElement | null = null; // Store reference to iframe element + private cellContent: string[] = []; // Store the content of the cells + private highlightedCell: number | null = null; // Track the highlighted cell + + // Precomputed dictionary for cell to highlight mapping + private cellToHighlightMapping: Record<string, { components: string[]; edges: string[] }> = {}; + + constructor( + private workflowActionService: WorkflowActionService, + private http: HttpClient, + private notificationService: NotificationService, + private notebookMigrationService: 
NotebookMigrationService + ) { + window.addEventListener("message", this.handleNotebookMessage); + } + + public init(): void { + this.workflowActionService + .workflowMetaDataChanged() + .pipe( + map(meta => meta.wid), + distinctUntilChanged() + ) + .subscribe(wid => { + this.closeJupyterNotebookPanel(); + if (wid != 0) { + this.fetchNotebookAndMapping(wid).subscribe(result => { + if (result == 1) { + this.precomputeHighlightMapping(); + this.openJupyterNotebookPanel(); + } + }); + } + }); + } + + private fetchNotebookAndMapping( + workflowID: number | undefined = this.workflowActionService.getWorkflow().wid, + vId: number = 1 + ) { + // Fetch mapping and notebook from migration database if exists for wid + const dbAPIUrl = `${AppSettings.getApiEndpoint()}/notebook-migration/fetch-notebook-and-mapping`; + const headers = new HttpHeaders({ "Content-Type": "application/json" }); + const payload = { + wid: workflowID, + vid: vId, // Future work: add dynamic fetching of current workflow vId + }; + + return this.http.post(dbAPIUrl, payload, { headers }).pipe( + switchMap(async (response: any) => { + // Only load mapping and workflow if they exist + if (response.exists) { + this.notebookMigrationService.setMapping("mapping_wid_" + workflowID, response.mapping); + + if ((await this.notebookMigrationService.sendNotebookToJupyter(response.notebook)) == 1) { + return 1; + } else { + return 0; + } + } else { + return 0; + } + }), + catchError((error: unknown) => { + console.error("Network response was not ok when fetching notebook and mapping:", error); + return of(0); + }) + ); + } + + // Precompute the dictionary for O(1) highlighting + private precomputeHighlightMapping(): void { + const wid = this.workflowActionService.getWorkflow().wid; + + if (wid === undefined) { + console.warn("Workflow ID is undefined. 
Cannot compute highlight mapping."); + return; + } + const mappingKey = "mapping_wid_" + wid; + const mapping = this.notebookMigrationService.getMapping(mappingKey); + + if (mapping == undefined) { + console.warn(`Mapping key '${mappingKey}' not found. Cannot compute highlight mapping.`); + return; + } + const cellToOperator = mapping.cell_to_operator; + + const allLinks: OperatorLink[] = this.workflowActionService.getTexeraGraph().getAllLinks(); + if (allLinks.length === 0) { + console.warn("No links found in the graph during precompute."); + return; + } + + for (const cellUUID in cellToOperator) { + const components = cellToOperator[cellUUID] || []; + const componentSet = new Set(components); + const edges: string[] = []; + + allLinks.forEach(link => { + const sourceOperatorID = link.source.operatorID; + const targetOperatorID = link.target.operatorID; + + if ( + componentSet.has(sourceOperatorID) && + componentSet.has(targetOperatorID) && + sourceOperatorID !== targetOperatorID + ) { + edges.push(link.linkID); + } + }); + + this.cellToHighlightMapping[cellUUID] = { components, edges }; + } + } + + // Set the iframe reference (from the component's ViewChild) + setIframeRef(iframe: HTMLIFrameElement) { + this.iframeRef = iframe; + } + + // Open the Jupyter Notebook panel + openPanel(panelName: string): void { + if (panelName === "JupyterNotebookPanel") { + this.jupyterNotebookPanelVisible.next(true); + } + } + + // Close the Jupyter Notebook panel + closeJupyterNotebookPanel(): void { + this.jupyterNotebookPanelVisible.next(false); + const wid = this.workflowActionService.getWorkflow().wid; + if (wid != undefined) { + this.notebookMigrationService.deleteMapping("mapping_wid_" + wid); + } + } + + // Minimize the Jupyter Notebook panel + public minimizeJupyterNotebookPanel(): void { + this.jupyterNotebookPanelVisible.next(false); + } + + // Expand the Jupyter Notebook panel + public openJupyterNotebookPanel(): void { + const wid = 
this.workflowActionService.getWorkflow().wid; + const mappingKey = "mapping_wid_" + wid; + // Check if there is corresponding mapping data + if (wid === undefined || !this.notebookMigrationService.hasMapping(mappingKey)) { + this.notificationService.warning("No Jupyter notebook associated with this workflow."); + return; + } + + // Expand only if the mapping exists + this.jupyterNotebookPanelVisible.next(true); + } + + // Handle messages from the Jupyter notebook iframe + private handleNotebookMessage = async (event: MessageEvent) => { + const allowedOrigins = [window.location.origin, await this.notebookMigrationService.getJupyterURL()]; + if (!allowedOrigins.includes(event.origin)) { + return; + } + Review Comment: `handleNotebookMessage` calls `await this.notebookMigrationService.getJupyterURL()` for every posted message, which triggers a network fetch each time a cell is clicked. Cache the Jupyter URL (or compute it once when opening the panel) so message handling stays O(1) and doesn’t depend on the network. ########## frontend/src/app/workspace/component/workspace.component.ts: ########## @@ -280,11 +296,48 @@ export class WorkspaceComponent implements AfterViewInit, OnInit, OnDestroy { .pipe(untilDestroyed(this)) .subscribe(); } + public triggerCenter(): void { this.workflowActionService.getTexeraGraph().triggerCenterEvent(); } public get copilotEnabled(): boolean { return this.config.env.copilotEnabled; } + + onWaitingForLLMChanged(isWaiting: boolean) { + this.isWaitingForLLM = isWaiting; + + if (isWaiting) { + this.startTimer(); + } else { + this.stopTimer(); + } + } + + startTimer() { + this.startTime = Date.now(); + this.updateElapsedTime(); + this.timerInterval = setInterval(() => { + this.updateElapsedTime(); + }, 1000); + } + + stopTimer() { + clearInterval(this.timerInterval); + this.timerInterval = null; + this.startTime = null; + } Review Comment: The timer interval is only cleared when `onWaitingForLLMChanged(false)` is called. 
If the component is destroyed while waiting (route change/tab close), the interval can continue running; ensure `ngOnDestroy` also calls `stopTimer()` (or otherwise clears `timerInterval`). ########## frontend/src/app/workspace/service/notebook-migration/migration-prompts.ts: ########## @@ -0,0 +1,410 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// TEXERA DOCUMENTATION + +// https://github.com/Texera/texera/wiki/Guide-to-Use-a-Python-UDF +export const TEXERA_OVERVIEW = ` +You are a robust compiler that takes python code and translates it to our personal workflow environment Texera that uses python. + + Texera is a data analytics tool that uses workflows to do machine learning and data analytics computation. User's are able to drag and drop operators and connect their inputs and outputs in a workflow graphical user interface, which the code we are going to create. + +Texera is able to use Python user defined functions. Documentation of a Python UDF in Texera follows: + Process Data APIs + +There are three APIs to process the data in different units. + + Tuple API. 
+ + class ProcessTupleOperator(UDFOperatorV2): + +def process_tuple(self, tuple_: Tuple, port: int) -> Iterator[Optional[TupleLike]]: +yield tuple_ + +Tuple API takes one input tuple from a port at a time. It returns an iterator of optional TupleLike instances. A TupleLike is any data structure that supports key-value pairs, such as pytexera.Tuple, dict, defaultdict, NamedTuple, etc. + + Tuple API is useful for implementing functional operations which are applied to tuples one by one, such as map, reduce, and filter. + + Table API. + + class ProcessTableOperator(UDFTableOperator): + +def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]: +yield table + +Table API consumes a Table at a time, which consists of all the tuples from a port. It returns an iterator of optional TableLike instances. A TableLike is a collection of TupleLike, and currently, we support pytexera.Table and pandas.DataFrame as a TableLike instance. More flexible types will be supported down the road. + + Table API is useful for implementing blocking operations that will consume all the data from one port, such as join, sort, and machine learning training. + + Batch API. + + class ProcessBatchOperator(UDFBatchOperator): + +BATCH_SIZE = 10 + +def process_batch(self, batch: Batch, port: int) -> Iterator[Optional[BatchLike]]: +yield batch + +Batch API consumes a batch of tuples at a time. Similar to Table, a Batch is also a collection of Tuples; however, its size is defined by the BATCH_SIZE, and one port can have multiple batches. It returns an iterator of optional BatchLike instances. A BatchLike is a collection of TupleLike, and currently, we support pytexera.Batch and pandas.DataFrame as a BatchLike instance. More flexible types will be supported down the road. + + The Batch API serves as a hybrid API combining the features of both the Tuple and Table APIs. 
It is particularly valuable for striking a balance between time and space considerations, offering a trade-off that optimizes efficiency. + + All three APIs can return an empty iterator by yield None. + + The template code for a Python UDF follows: MAKE SURE TO USE THE CLASS NAMES AND FUNCTIONS DEFINED, THIS IS A MUST FOR THE PROGRAM TO WORK. SELECT 1 OUT OF THE 3 PROCESSING OPERATOR FUNCTIONS TO BUILD DEPENDINGO ON THE CONTEXT OF CODE TRANSLATION. +# Choose from the following templates: + # +# from pytexera import * +# +# class ProcessTupleOperator(UDFOperatorV2): +# +# @overrides +# def process_tuple(self, tuple_: Tuple, port: int) -> Iterator[Optional[TupleLike]]: +# yield tuple_ +# +# class ProcessBatchOperator(UDFBatchOperator): +# BATCH_SIZE = 10 # must be a positive integer +# +# @overrides +# def process_batch(self, batch: Batch, port: int) -> Iterator[Optional[BatchLike]]: +# yield batch +# +# class ProcessTableOperator(UDFTableOperator): +# +# @overrides +# def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]: +# yield table +`; + +// https://github.com/Texera/texera/blob/1fa249a9d55d4dcad36d93e093c2faed5c4434f0/core/amber/src/main/python/core/models/tuple.py +export const TUPLE_DOCUMENTATION = ` +### **<code>Tuple</code> Class Overview** + +The \`Tuple\` class is a **lazy-evaluated** data structure designed for efficient field storage and access. It provides: + + 1. **Support for Multiple Data Sources**: +* Can be initialized from a \`TupleLike\` object, such as a \`pandas.Series\`, \`OrderedDict\`, or another \`Tuple\` instance. +* Works with \`ArrowTableTupleProvider\` to access \`pyarrow.Table\` data. +2. **Lazy Field Evaluation**: +* Field values can be either **directly stored values** or **lazy accessors** (\`field_accessor\`). +* If a field is accessed and is an accessor, it is evaluated and cached. +3. 
**Schema (<code>Schema</code>) Enforcement**: + * A \`Tuple\` can be created without a schema but can be **finalized** with one using \`finalize(schema)\`, which: +* **Casts field values** (e.g., \`NaN → None\`, \`Object → Bytes\`). +* **Validates field completeness**, ensuring all fields match the \`Schema\`. +4. **Pythonic Access Patterns**: +* **Index-based access**: \`tuple["field_name"]\` or \`tuple[index]\` retrieves field values. +* **Dictionary-like operations**: \`tuple.as_dict()\` returns an \`OrderedDict\`, and \`tuple.as_series()\` converts to a \`pandas.Series\`. +* **Iterable support**: \`for field in tuple\` iterates over field values. +5. **Hashing and Comparisons**: +* Implements \`__hash__\` using a Java-like hashing algorithm, allowing usage as dictionary keys. +* Implements \`__eq__\`, supporting equality checks based on field contents. +6. **Partial Data Extraction**: +* \`tuple.get_partial_tuple(attribute_names)\` returns a new \`Tuple\` instance containing only the specified fields. +`; + +// https://github.com/Texera/texera/blob/1fa249a9d55d4dcad36d93e093c2faed5c4434f0/core/amber/src/main/python/core/models/table.py +export const TABLE_DOCUMENTATION = `### **<code>Table</code> Class Overview** + +The \`Table\` class extends \`pandas.DataFrame\`, providing **structured Tuple-based data management**. It is designed to integrate seamlessly with \`Tuple\` objects. + +#### **Key Features:** + +1. **Flexible Construction:** +* Can be initialized from various sources: +* Another \`Table\` (\`from_table(table)\`) +* A \`pandas.DataFrame\` (\`from_data_frame(df)\`) +* A list/iterator of \`TupleLike\` objects (\`from_tuple_likes(tuple_likes)\`) +* Ensures all \`Tuple\` objects in a \`Table\` have **consistent field names**. +2. **Tuple Conversion:** +* \`as_tuples()\`: Converts the table rows into an **iterator of <code>Tuple</code> instances**, preserving the row order. +3. 
**Equality Comparison (<code>__eq__</code>):** +* Supports **row-wise equality checks** by comparing the underlying \`Tuple\` objects. +4. **Universal Tuple Output (<code>all_output_to_tuple</code>):** +* A helper function to convert **various data types** into \`Tuple\` iterators, supporting: +* \`None\` → \`[None]\` +* \`Table\` → \`as_tuples()\` +* \`pandas.DataFrame\` → Converted into a \`Table\`, then to Tuples +* \`List[TupleLike]\` → Converted to \`Tuple\` instances +* A single \`TupleLike\` or \`Tuple\` → Wrapped in an iterator + +#### **Relation to <code>Tuple</code>:** + +* \`Table\` **stores multiple <code>Tuple</code> objects** and ensures schema consistency across rows. +* Provides an **efficient bridge** between \`Tuple\`-based data and \`pandas.DataFrame\`, enabling compatibility with Python's data analysis tools. +`; + +// https://github.com/Texera/texera/blob/42d803310c180978a9f02992f0e05556796b293c/core/amber/src/main/python/core/models/operator.py +export const OPERATOR_DOCUMENTATION = `### **Operator Class Overview** + +The \`Operator\` class is an **abstract base class (ABC)** for all operators, defining the fundamental structure for processing \`Tuple\`, \`Batch\`, and \`Table\` data in a workflow. + +#### **Key Features & Hierarchy** + +1. **Base <code>Operator</code> Class**: +* Defines lifecycle methods: \`open()\` and \`close()\`. +* Supports a **source flag (<code>is_source</code>)** to distinguish source operators from others. +2. **Tuple-Based Processing (<code>TupleOperatorV2</code>)**: +* Processes individual \`Tuple\` objects through \`process_tuple(tuple_, port)\`. +* Calls \`on_finish(port)\` when an input port is exhausted. +3. **Types of Operators**: +* **SourceOperator**: +* Produces data via \`produce()\`, yielding \`TupleLike\` or \`TableLike\` objects. +* Overrides \`on_finish(port)\` to output produced data. 
+* **BatchOperator**: +* Collects tuples into batches (\`BATCH_SIZE\`) before processing via \`process_batch(batch, port)\`. +* Converts processed batches (typically \`pandas.DataFrame\`) into \`Tuple\` output. +* **TableOperator**: +* Collects tuples into a \`Table\` before processing via \`process_table(table, port)\`. +* Converts processed \`Table\` output back into tuples. +4. **Data Flow & Processing**: +* Operators receive data **tuple-by-tuple**, **batch-by-batch**, or **table-by-table** depending on the type. +* Results are **iterators** of transformed data (\`TupleLike\`, \`BatchLike\`, or \`TableLike\`). +5. **Deprecated <code>TupleOperator</code>**: +* The older version of \`TupleOperator\` is deprecated in favor of \`TupleOperatorV2\`. + +#### Relation to <code>Tuple</code> and <code>Table</code> + +* Operators **consume and transform** \`Tuple\` and \`Table\` data within a workflow. +* **Tuple-based operators** process row-wise, while **Table operators** handle structured table transformations. +* **Source operators** initiate the data flow by generating tuples or tables.`; + +export const UDF_INPUT_PORT_DOCUMENTATION = ` +Python UDF operators support multiple input and output ports, allowing a single operator to receive different types of data from various upstream operators. In the process_tuple(self, tuple_: Tuple, port: int) function in ProcessTupleOperator and the process_table(self, table: Table, port: int) function in ProcessTableOperator, the port parameter indicates the input port. The port numbers are assigned in order, starting from 0 to N, from top to bottom. When input data have different schemas, it is necessary to assign them to different input ports. However, if all input data share the same schema, additional ports are not required. In both ProcessTupleOperator and ProcessTableOperator, there is an on_finish(self, port: int) function that is executed only after all the tuples from the specified port are processed. 
+ +Using this knowledge, for situations where multiple upstream UDFs act as input to a single UDF, we can introduce an intermediary UDF that collects all of the input data and reformats it into a single table, which is then passed as input to the original next downstream UDF. When it is necessary for this to occur in your translation from notebook to UDFs, include the intermediary UDF and make sure that it and the next operator that uses its output is formatted correctly and handles the data transfer properly. +`; + +export const EXAMPLE_OF_GOOD_CONVERSION = ` +Here is an example of python code translated into a compatible Texera UDF that gives output that abides the output schema compatible with the Texera workflow operators for tuples. Other operators do not always follow this strict format, but the yielding output structure is important. + +Python Code (high level idea): We have a python code that given some data, we limit the number of data. + +Texera Operator code: +from pytexera import * + +class ProcessTupleOperator(UDFOperatorV2): +def __init__(self): +self.limit = 10 +self.count = 0 +@overrides +def process_tuple(self, tuple_: Tuple, port: int) -> Iterator[Optional[TupleLike]]: +if(self.count < self.limit): +self.count += 1 +yield tuple_ + +`; + +export const VISUALIZER_DOCUMENTATION = ` +Texera requires a unique way of generating visualizations from ML libraries: +1. Ensures one yield per operator (per Texera’s UDF constraints). +2. Uses Plotly for visualization and outputs results as embeddable HTML. +3. Error handling is built-in to notify users when data is missing. +`; + +export const EXAMPLE_OF_MULTIPLE_UDF_CONVERSION = ` +Here is an example of breaking up python code into multiple Texera UDFs. Format your response structure exactly like the given example. The "code" key contains a dictionary of the UDF ID's with their respective code. The "edges" key contains a list of pairs that contains the connections between UDFs. 
The "outputs" key contains a dictionary of the UDF ID's with a list of variable names that they yield in the UDF code. The UDFs can branch and merge, it does not have to be a linear chain depending on your implementation. + +Original Code: +\`\`\`python +# START CELL1 +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import accuracy_score +from sklearn.preprocessing import StandardScaler +import matplotlib.pyplot as plt +# END CELL1 + +# START CELL2 +# Load the dataset +file_path = 'diabetes.csv' +data = pd.read_csv(file_path) +# END CELL2 + +# START CELL3 +# Remove duplicate rows +data = data.drop_duplicates() + +# Remove rows with null values +data = data.dropna() +# END CELL3 + +# START CELL4 +# Print the minimum, maximum, and mean for all fields +print("Minimum values:\n", data.min()) +print("\nMaximum values:\n", data.max()) +print("\nMean values:\n", data.mean()) +# END CELL 4 + +# START CELL5 +# Create a boxplot for the 'Pregnancies' field +plt.figure(figsize=(8, 6)) +plt.boxplot(data['Pregnancies'], vert=False, patch_artist=True) +plt.title('Boxplot of Pregnancies') +plt.xlabel('Number of Pregnancies') +plt.show() +# END CELL5 + +# START CELL6 +# Separate features and target variable +X = data.drop('Outcome', axis=1) +y = data['Outcome'] +# END CELL6 + +# START CELL7 +# Split data into training and testing sets (80% train, 20% test) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + +scaler = StandardScaler() +X_train = scaler.fit_transform(X_train) +X_test = scaler.transform(X_test) +# END CELL7 + +# START CELL8 +# Train Random Forest model +rf_model = RandomForestClassifier(random_state=42) +rf_model.fit(X_train, y_train) +rf_pred = rf_model.predict(X_test) +rf_accuracy = 
accuracy_score(y_test, rf_pred) +print(f"Random Forest Accuracy: {rf_accuracy:.2%}") +# END CELL8 + +# START CELL9 +# Train SVM model +svm_model = SVC(random_state=42) +svm_model.fit(X_train, y_train) +svm_pred = svm_model.predict(X_test) +svm_accuracy = accuracy_score(y_test, svm_pred) +print(f"SVM Accuracy: {svm_accuracy:.2%}") +# END CELL9 +\`\`\` + +Texera UDF conversion: +\`\`\`json +{ + "code": { + "UDF1": "# UDF1\nfrom pytexera import *\nimport pandas as pd\nfrom typing import Iterator, Optional\n\nclass ProcessTableOperator(UDFTableOperator):\n\n @overrides\n def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]:\n # Remove duplicate rows\n data = table.drop_duplicates()\n\n # Remove rows with null values\n data = data.dropna()\n\n # Calculate statistics\n min_values = data.min()\n max_values = data.max()\n mean_values = data.mean()\n\n # Create a DataFrame to yield\n result_table = pd.DataFrame({\n 'min_values': [min_values],\n 'max_values': [max_values],\n 'mean_values': [mean_values],\n 'data': [data]\n })\n\n yield Table(result_table)", + "UDF2": "# UDF2\nfrom pytexera import *\nimport pandas as pd\nimport plotly.express as px\nimport plotly.io\nfrom typing import Iterator, Optional\n\nclass ProcessTableOperator(UDFTableOperator):\n def render_error(self, error_msg):\n return '''<h1>Boxplot is not available.</h1>\n <p>Reason is: {} </p>\n '''.format(error_msg)\n\n @overrides\n def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]:\n data = table['data'].iloc[0]\n\n if data.empty:\n yield {'html-content': self.render_error('input table is empty.')}\n return\n\n # Create a boxplot for the 'Pregnancies' field\n fig = px.box(data, x='Pregnancies')\n fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))\n\n # Convert fig to HTML content\n html = plotly.io.to_html(fig, include_plotlyjs='cdn', auto_play=False)\n yield {'html-content': html}", + "UDF3": "# UDF3\nfrom pytexera import *\nimport pandas as 
pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom typing import Iterator, Optional\n\nclass ProcessTableOperator(UDFTableOperator):\n\n @overrides\n def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]:\n data = table['data'].iloc[0]\n\n # Separate features and target variable\n X = data.drop('Outcome', axis=1)\n y = data['Outcome']\n\n # Split data into training and testing sets (80% train, 20% test)\n X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n\n scaler = StandardScaler()\n X_train = scaler.fit_transform(X_train)\n X_test = scaler.transform(X_test)\n\n # Create a DataFrame to yield\n result_table = pd.DataFrame({\n 'X_train': [X_train], 'X_test': [X_test],\n 'y_train': [y_train], 'y_test': [y_test]\n })\n\n yield Table(result_table)", + "UDF4": "# UDF4\nfrom pytexera import *\nimport pandas as pd\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score\nfrom typing import Iterator, Optional\n\nclass ProcessTableOperator(UDFTableOperator):\n\n @overrides\n def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]:\n X_train = table['X_train'].iloc[0]\n y_train = table['y_train'].iloc[0]\n X_test = table['X_test'].iloc[0]\n y_test = table['y_test'].iloc[0]\n\n # Train Random Forest model\n rf_model = RandomForestClassifier(random_state=42)\n rf_model.fit(X_train, y_train)\n rf_pred = rf_model.predict(X_test)\n rf_accuracy = accuracy_score(y_test, rf_pred)\n\n # Create a DataFrame to yield\n result_table = pd.DataFrame({\n 'rf_model': [rf_model],\n 'rf_accuracy': [rf_accuracy],\n 'X_test': [X_test],\n 'y_test': [y_test]\n })\n\n yield Table(result_table)", + "UDF5": "# UDF5\nfrom pytexera import *\nimport pandas as pd\nfrom sklearn.svm import SVC\nfrom sklearn.metrics import accuracy_score\nfrom typing import Iterator, Optional\n\nclass 
ProcessTableOperator(UDFTableOperator):\n\n @overrides\n def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]:\n X_train = table['X_train'].iloc[0]\n y_train = table['y_train'].iloc[0]\n X_test = table['X_test'].iloc[0]\n y_test = table['y_test'].iloc[0]\n\n # Train SVM model\n svm_model = SVC(random_state=42)\n svm_model.fit(X_train, y_train)\n svm_pred = svm_model.predict(X_test)\n svm_accuracy = accuracy_score(y_test, svm_pred)\n\n # Create a DataFrame to yield\n result_table = pd.DataFrame({\n 'svm_model': [svm_model],\n 'svm_accuracy': [svm_accuracy],\n 'X_test': [X_test],\n 'y_test': [y_test]\n })\n\n yield Table(result_table)" + }, + "edges": [ + ["UDF1", "UDF2"], + ["UDF1", "UDF3"], + ["UDF3", "UDF4"], + ["UDF3", "UDF5"] + ], + "outputs": { + "UDF1": ["min_values", "max_values", "mean_values", "data"], + "UDF2": ["html-content"], + "UDF3": ["X_train", "X_test", "y_train", "y_test"], + "UDF4": ["rf_model", "rf_accuracy", "X_test", "y_test"], + "UDF5": ["svm_model", "svm_accuracy", "X_test", "y_test"] + } +} +\`\`\` +`; + +export const WORKFLOW_PROMPT = `You are an expert in Python coding and workflow systems. +Many users of Texera system are non-technical, but the notebooks they provide are written by technical people. +They want to convert their notebooks to Texera workflows. +Your goal is to help convert these notebooks into a Texera workflow that non-technical users can use directly. +So do not remove or modify any classes or functions, preserve their names and structure as they are. +Ensure that all essential logic remains intact. +Create multiple Texera UDF codes using the provided Python code. +Number each UDF, starting at 1 and incrementing, by starting with a comment that states that UDF number. + +Use the class and function names as shown in ProcessTupleOperator, ProcessTableOperator, and ProcessBatchOperator. +Do not change the class names, function names, or input parameters. 
+Use the ones that make sense and split the code meaningfully as instructed. + +Use the starter code provided for Python UDFs. + +Use the documentation of Table, Tuple, or Batch to work with parameters within Texera UDF. +Do not import other libraries to define these types. + +There is no need for an __init__ function. Assume all inputs are valid pandas DataFrames, +so do not use .to_pandas(), .to_dataframe(), etc. Do not load data from a file in the first UDF, assume +that the data is already given to you in the table parameter. +Ensure proper data flow between functions. Separate operators as if they will run in different files. + +Current UDF operators can only have one output. Build a dataframe to yield all necessary variables +and data. Ensure proper data flow for each UDF and all information is yielded (including training +and testing data) if subsequent UDFs need them. + +Ensure all necessary imports are included in each UDF code block. + +Each UDF operator should be in its own Python code block. Do not combine them into a single block. +Ensure import statements cover all used functions and separate them as necessary. + +It is VERY important that all of the original code in the Jupyter notebook is represented in the generated workflow. +Make sure that nothing in the original is removed and that the semantic meaning of what the original code was doing is retained. +If there are user-defined Python classes, include the entire class definition in the appropriate UDF(s) that use that class. +Always include the code that defines the class inside of every distinct UDF that uses that constructs an object of that class. +Python classes are allowed in Texera UDFs and follow the same semantics as standard Python. +They can be defined outside of ProcessTableOperator, ProcessTupleOperator, and ProcessBatchOperator. + +Return only the JSON formatted response, do not give any explanation. 
+Make sure the response is a valid JSON structure, including closing all braces and not including commas after the last element. +Follow this JSON format (don't reuse the values, this is just the format). 'code', 'edges', and 'outputs' are all their own key's, do not nest any of these in another one and make sure to close their braces: +{ +"code": { +"UDF1": "code for UDF1 goes here", +"UDF2": "code for UDF2 goes here" +}, +"edges": [ +["UDF1", "UDF2"] +], +"outputs": { +"UDF1": ["min_values", "max_values", "mean_values", "data"], +"UDF2": ["html-content"] +} +} +Make sure only the keys in the code section appear in the edges and outputs sections. Do not include any extraneous fields. +Do not include any extraneous UDF's in the code field that include empty strings. +Give ALL of the code, do not omit anything or use placeholders for code. Make sure ALL code in the original is translated over. +Use only unescaped single quotes inside of the code values for the UDF's, do not use escaped double quotes. +Convert following the instructions and examples given. Here is the code: +`; + +export const MAPPING_PROMPT = ` +Here is an example of a mapping generated between the given example Python code and the Texera UDFs using their CELL and UDF IDs. Cell IDs are designated by the UUID following '# START'. The format should be kept the same. +{ +"UDF1": [ +"CEll3", +"CELL4" +], Review Comment: The example mapping JSON in the prompt contains `"CEll3"` (typo / inconsistent casing). Since the mapping response is later parsed with `JSON.parse`, small inconsistencies in the example increase the chance the LLM returns unusable output; please correct the example to be consistent. ########## notebook-migration-service/src/main/scala/org/apache/texera/service/resource/NotebookMigrationResource.scala: ########## @@ -0,0 +1,358 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.texera.service.resource + +import com.fasterxml.jackson.databind.ObjectMapper +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import com.typesafe.scalalogging.LazyLogging +import jakarta.ws.rs._ +import jakarta.ws.rs.core._ +import org.apache.texera.dao.SqlServer +import org.jooq.JSONB +import org.apache.texera.dao.jooq.generated.tables.Notebook +import org.apache.texera.dao.jooq.generated.tables.WorkflowNotebookMapping +import java.net.{HttpURLConnection, URL} +import java.nio.charset.StandardCharsets +import scala.util.control.NonFatal +import org.apache.texera.amber.config.StorageConfig + +object NotebookMigrationResource extends LazyLogging { + + private val mapper: ObjectMapper = new ObjectMapper().registerModule(DefaultScalaModule) + private val jupyterUrl = StorageConfig.jupyterURL + private var jupyterIframeURL = s"$jupyterUrl/notebooks/work/notebook.ipynb" + Review Comment: `jupyterIframeURL` is a mutable global var. It’s updated per request and shared across all users/requests, which is not thread-safe and can cause cross-request leakage. Prefer computing the iframe URL per request (and/or store per-workflow/per-user state in DB) instead of keeping mutable singleton state. 
########## sql/updates/23.sql: ########## @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +-- ============================================ +-- 1. Connect to the texera_db database +-- ============================================ +\c texera_db + +SET search_path TO texera_db; + +-- ============================================ +-- 2. Delete tables if they already exist +-- ============================================ + +BEGIN; + +DROP TABLE IF EXISTS notebook CASCADE; +DROP TABLE IF EXISTS workflow_notebook_mapping CASCADE; + Review Comment: This migration drops the notebook/mapping tables unconditionally. Even if the update is intended to be run once, dropping tables can destroy existing data if the script is re-applied (e.g., in dev/test resets). Prefer creating the new tables without dropping, or at least avoid `CASCADE` and document/guard the destructive behavior. 
########## frontend/src/app/workspace/component/menu/menu.component.ts: ########## @@ -175,6 +201,13 @@ export class MenuComponent implements OnInit, OnDestroy { // Subscribe to computing unit this.subscribeToComputingUnitSelection(); this.subscribeToComputingUnitStatus(); + + this.importForm = this.fb.group({ + description: [""], + file: [null, Validators.required], + model: [""], Review Comment: `importForm` only requires `file`, so the modal can be submitted with an empty `model` value. That will pass an empty string into the LLM client and likely fail at runtime; make the model selection required (or set a safe default model when opening the modal). ```suggestion model: ["", Validators.required], ``` ########## frontend/src/app/workspace/service/notebook-migration/migration-prompts.ts: ########## @@ -0,0 +1,410 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// TEXERA DOCUMENTATION + +// https://github.com/Texera/texera/wiki/Guide-to-Use-a-Python-UDF +export const TEXERA_OVERVIEW = ` +You are a robust compiler that takes python code and translates it to our personal workflow environment Texera that uses python. 
+ + Texera is a data analytics tool that uses workflows to do machine learning and data analytics computation. User's are able to drag and drop operators and connect their inputs and outputs in a workflow graphical user interface, which the code we are going to create. + +Texera is able to use Python user defined functions. Documentation of a Python UDF in Texera follows: + Process Data APIs + +There are three APIs to process the data in different units. + + Tuple API. + + class ProcessTupleOperator(UDFOperatorV2): + +def process_tuple(self, tuple_: Tuple, port: int) -> Iterator[Optional[TupleLike]]: +yield tuple_ + +Tuple API takes one input tuple from a port at a time. It returns an iterator of optional TupleLike instances. A TupleLike is any data structure that supports key-value pairs, such as pytexera.Tuple, dict, defaultdict, NamedTuple, etc. + + Tuple API is useful for implementing functional operations which are applied to tuples one by one, such as map, reduce, and filter. + + Table API. + + class ProcessTableOperator(UDFTableOperator): + +def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]: +yield table + +Table API consumes a Table at a time, which consists of all the tuples from a port. It returns an iterator of optional TableLike instances. A TableLike is a collection of TupleLike, and currently, we support pytexera.Table and pandas.DataFrame as a TableLike instance. More flexible types will be supported down the road. + + Table API is useful for implementing blocking operations that will consume all the data from one port, such as join, sort, and machine learning training. + + Batch API. + + class ProcessBatchOperator(UDFBatchOperator): + +BATCH_SIZE = 10 + +def process_batch(self, batch: Batch, port: int) -> Iterator[Optional[BatchLike]]: +yield batch + +Batch API consumes a batch of tuples at a time. 
Similar to Table, a Batch is also a collection of Tuples; however, its size is defined by the BATCH_SIZE, and one port can have multiple batches. It returns an iterator of optional BatchLike instances. A BatchLike is a collection of TupleLike, and currently, we support pytexera.Batch and pandas.DataFrame as a BatchLike instance. More flexible types will be supported down the road. + + The Batch API serves as a hybrid API combining the features of both the Tuple and Table APIs. It is particularly valuable for striking a balance between time and space considerations, offering a trade-off that optimizes efficiency. + + All three APIs can return an empty iterator by yield None. + + The template code for a Python UDF follows: MAKE SURE TO USE THE CLASS NAMES AND FUNCTIONS DEFINED, THIS IS A MUST FOR THE PROGRAM TO WORK. SELECT 1 OUT OF THE 3 PROCESSING OPERATOR FUNCTIONS TO BUILD DEPENDINGO ON THE CONTEXT OF CODE TRANSLATION. +# Choose from the following templates: + # +# from pytexera import * +# +# class ProcessTupleOperator(UDFOperatorV2): +# +# @overrides +# def process_tuple(self, tuple_: Tuple, port: int) -> Iterator[Optional[TupleLike]]: +# yield tuple_ +# +# class ProcessBatchOperator(UDFBatchOperator): +# BATCH_SIZE = 10 # must be a positive integer +# +# @overrides +# def process_batch(self, batch: Batch, port: int) -> Iterator[Optional[BatchLike]]: +# yield batch +# +# class ProcessTableOperator(UDFTableOperator): +# +# @overrides +# def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]: +# yield table +`; + +// https://github.com/Texera/texera/blob/1fa249a9d55d4dcad36d93e093c2faed5c4434f0/core/amber/src/main/python/core/models/tuple.py +export const TUPLE_DOCUMENTATION = ` +### **<code>Tuple</code> Class Overview** + +The \`Tuple\` class is a **lazy-evaluated** data structure designed for efficient field storage and access. It provides: + + 1. 
**Support for Multiple Data Sources**: +* Can be initialized from a \`TupleLike\` object, such as a \`pandas.Series\`, \`OrderedDict\`, or another \`Tuple\` instance. +* Works with \`ArrowTableTupleProvider\` to access \`pyarrow.Table\` data. +2. **Lazy Field Evaluation**: +* Field values can be either **directly stored values** or **lazy accessors** (\`field_accessor\`). +* If a field is accessed and is an accessor, it is evaluated and cached. +3. **Schema (<code>Schema</code>) Enforcement**: + * A \`Tuple\` can be created without a schema but can be **finalized** with one using \`finalize(schema)\`, which: +* **Casts field values** (e.g., \`NaN → None\`, \`Object → Bytes\`). +* **Validates field completeness**, ensuring all fields match the \`Schema\`. +4. **Pythonic Access Patterns**: +* **Index-based access**: \`tuple["field_name"]\` or \`tuple[index]\` retrieves field values. +* **Dictionary-like operations**: \`tuple.as_dict()\` returns an \`OrderedDict\`, and \`tuple.as_series()\` converts to a \`pandas.Series\`. +* **Iterable support**: \`for field in tuple\` iterates over field values. +5. **Hashing and Comparisons**: +* Implements \`__hash__\` using a Java-like hashing algorithm, allowing usage as dictionary keys. +* Implements \`__eq__\`, supporting equality checks based on field contents. +6. **Partial Data Extraction**: +* \`tuple.get_partial_tuple(attribute_names)\` returns a new \`Tuple\` instance containing only the specified fields. +`; + +// https://github.com/Texera/texera/blob/1fa249a9d55d4dcad36d93e093c2faed5c4434f0/core/amber/src/main/python/core/models/table.py +export const TABLE_DOCUMENTATION = `### **<code>Table</code> Class Overview** + +The \`Table\` class extends \`pandas.DataFrame\`, providing **structured Tuple-based data management**. It is designed to integrate seamlessly with \`Tuple\` objects. + +#### **Key Features:** + +1. 
**Flexible Construction:** +* Can be initialized from various sources: +* Another \`Table\` (\`from_table(table)\`) +* A \`pandas.DataFrame\` (\`from_data_frame(df)\`) +* A list/iterator of \`TupleLike\` objects (\`from_tuple_likes(tuple_likes)\`) +* Ensures all \`Tuple\` objects in a \`Table\` have **consistent field names**. +2. **Tuple Conversion:** +* \`as_tuples()\`: Converts the table rows into an **iterator of <code>Tuple</code> instances**, preserving the row order. +3. **Equality Comparison (<code>__eq__</code>):** +* Supports **row-wise equality checks** by comparing the underlying \`Tuple\` objects. +4. **Universal Tuple Output (<code>all_output_to_tuple</code>):** +* A helper function to convert **various data types** into \`Tuple\` iterators, supporting: +* \`None\` → \`[None]\` +* \`Table\` → \`as_tuples()\` +* \`pandas.DataFrame\` → Converted into a \`Table\`, then to Tuples +* \`List[TupleLike]\` → Converted to \`Tuple\` instances +* A single \`TupleLike\` or \`Tuple\` → Wrapped in an iterator + +#### **Relation to <code>Tuple</code>:** + +* \`Table\` **stores multiple <code>Tuple</code> objects** and ensures schema consistency across rows. +* Provides an **efficient bridge** between \`Tuple\`-based data and \`pandas.DataFrame\`, enabling compatibility with Python's data analysis tools. +`; + +// https://github.com/Texera/texera/blob/42d803310c180978a9f02992f0e05556796b293c/core/amber/src/main/python/core/models/operator.py +export const OPERATOR_DOCUMENTATION = `### **Operator Class Overview** + +The \`Operator\` class is an **abstract base class (ABC)** for all operators, defining the fundamental structure for processing \`Tuple\`, \`Batch\`, and \`Table\` data in a workflow. + +#### **Key Features & Hierarchy** + +1. **Base <code>Operator</code> Class**: +* Defines lifecycle methods: \`open()\` and \`close()\`. +* Supports a **source flag (<code>is_source</code>)** to distinguish source operators from others. +2. 
**Tuple-Based Processing (<code>TupleOperatorV2</code>)**: +* Processes individual \`Tuple\` objects through \`process_tuple(tuple_, port)\`. +* Calls \`on_finish(port)\` when an input port is exhausted. +3. **Types of Operators**: +* **SourceOperator**: +* Produces data via \`produce()\`, yielding \`TupleLike\` or \`TableLike\` objects. +* Overrides \`on_finish(port)\` to output produced data. +* **BatchOperator**: +* Collects tuples into batches (\`BATCH_SIZE\`) before processing via \`process_batch(batch, port)\`. +* Converts processed batches (typically \`pandas.DataFrame\`) into \`Tuple\` output. +* **TableOperator**: +* Collects tuples into a \`Table\` before processing via \`process_table(table, port)\`. +* Converts processed \`Table\` output back into tuples. +4. **Data Flow & Processing**: +* Operators receive data **tuple-by-tuple**, **batch-by-batch**, or **table-by-table** depending on the type. +* Results are **iterators** of transformed data (\`TupleLike\`, \`BatchLike\`, or \`TableLike\`). +5. **Deprecated <code>TupleOperator</code>**: +* The older version of \`TupleOperator\` is deprecated in favor of \`TupleOperatorV2\`. + +#### Relation to <code>Tuple</code> and <code>Table</code> + +* Operators **consume and transform** \`Tuple\` and \`Table\` data within a workflow. +* **Tuple-based operators** process row-wise, while **Table operators** handle structured table transformations. +* **Source operators** initiate the data flow by generating tuples or tables.`; + +export const UDF_INPUT_PORT_DOCUMENTATION = ` +Python UDF operators support multiple input and output ports, allowing a single operator to receive different types of data from various upstream operators. In the process_tuple(self, tuple_: Tuple, port: int) function in ProcessTupleOperator and the process_table(self, table: Table, port: int) function in ProcessTableOperator, the port parameter indicates the input port. 
The port numbers are assigned in order, starting from 0 to N, from top to bottom. When input data have different schemas, it is necessary to assign them to different input ports. However, if all input data share the same schema, additional ports are not required. In both ProcessTupleOperator and ProcessTableOperator, there is an on_finish(self, port: int) function that is executed only after all the tuples from the specified port are processed. + +Using this knowledge, for situations where multiple upstream UDFs act as input to a single UDF, we can introduce an intermediary UDF that collects all of the input data and reformats it into a single table, which is then passed as input to the original next downstream UDF. When it is necessary for this to occur in your translation from notebook to UDFs, include the intermediary UDF and make sure that it and the next operator that uses its output is formatted correctly and handles the data transfer properly. +`; + +export const EXAMPLE_OF_GOOD_CONVERSION = ` +Here is an example of python code translated into a compatible Texera UDF that gives output that abides the output schema compatible with the Texera workflow operators for tuples. Other operators do not always follow this strict format, but the yielding output structure is important. + +Python Code (high level idea): We have a python code that given some data, we limit the number of data. + +Texera Operator code: +from pytexera import * + +class ProcessTupleOperator(UDFOperatorV2): +def __init__(self): +self.limit = 10 +self.count = 0 +@overrides +def process_tuple(self, tuple_: Tuple, port: int) -> Iterator[Optional[TupleLike]]: +if(self.count < self.limit): +self.count += 1 +yield tuple_ + +`; + +export const VISUALIZER_DOCUMENTATION = ` +Texera requires a unique way of generating visualizations from ML libraries: +1. Ensures one yield per operator (per Texera’s UDF constraints). +2. Uses Plotly for visualization and outputs results as embeddable HTML. +3. 
Error handling is built-in to notify users when data is missing. +`; + +export const EXAMPLE_OF_MULTIPLE_UDF_CONVERSION = ` +Here is an example of breaking up python code into multiple Texera UDFs. Format your response structure exactly like the given example. The "code" key contains a dictionary of the UDF ID's with their respective code. The "edges" key contains a list of pairs that contains the connections between UDFs. The "outputs" key contains a dictionary of the UDF ID's with a list of variable names that they yield in the UDF code. The UDFs can branch and merge, it does not have to be a linear chain depending on your implementation. + +Original Code: +\`\`\`python +# START CELL1 +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import accuracy_score +from sklearn.preprocessing import StandardScaler +import matplotlib.pyplot as plt +# END CELL1 + +# START CELL2 +# Load the dataset +file_path = 'diabetes.csv' +data = pd.read_csv(file_path) +# END CELL2 + +# START CELL3 +# Remove duplicate rows +data = data.drop_duplicates() + +# Remove rows with null values +data = data.dropna() +# END CELL3 + +# START CELL4 +# Print the minimum, maximum, and mean for all fields +print("Minimum values:\n", data.min()) +print("\nMaximum values:\n", data.max()) +print("\nMean values:\n", data.mean()) +# END CELL 4 + +# START CELL5 +# Create a boxplot for the 'Pregnancies' field +plt.figure(figsize=(8, 6)) +plt.boxplot(data['Pregnancies'], vert=False, patch_artist=True) +plt.title('Boxplot of Pregnancies') +plt.xlabel('Number of Pregnancies') +plt.show() +# END CELL5 + +# START CELL6 +# Separate features and target variable +X = data.drop('Outcome', axis=1) +y = data['Outcome'] +# END CELL6 + +# START CELL7 +# Split data into training and testing sets 
(80% train, 20% test) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + +scaler = StandardScaler() +X_train = scaler.fit_transform(X_train) +X_test = scaler.transform(X_test) +# END CELL7 + +# START CELL8 +# Train Random Forest model +rf_model = RandomForestClassifier(random_state=42) +rf_model.fit(X_train, y_train) +rf_pred = rf_model.predict(X_test) +rf_accuracy = accuracy_score(y_test, rf_pred) +print(f"Random Forest Accuracy: {rf_accuracy:.2%}") +# END CELL8 + +# START CELL9 +# Train SVM model +svm_model = SVC(random_state=42) +svm_model.fit(X_train, y_train) +svm_pred = svm_model.predict(X_test) +svm_accuracy = accuracy_score(y_test, svm_pred) +print(f"SVM Accuracy: {svm_accuracy:.2%}") +# END CELL9 +\`\`\` + +Texera UDF conversion: +\`\`\`json +{ + "code": { + "UDF1": "# UDF1\nfrom pytexera import *\nimport pandas as pd\nfrom typing import Iterator, Optional\n\nclass ProcessTableOperator(UDFTableOperator):\n\n @overrides\n def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]:\n # Remove duplicate rows\n data = table.drop_duplicates()\n\n # Remove rows with null values\n data = data.dropna()\n\n # Calculate statistics\n min_values = data.min()\n max_values = data.max()\n mean_values = data.mean()\n\n # Create a DataFrame to yield\n result_table = pd.DataFrame({\n 'min_values': [min_values],\n 'max_values': [max_values],\n 'mean_values': [mean_values],\n 'data': [data]\n })\n\n yield Table(result_table)", + "UDF2": "# UDF2\nfrom pytexera import *\nimport pandas as pd\nimport plotly.express as px\nimport plotly.io\nfrom typing import Iterator, Optional\n\nclass ProcessTableOperator(UDFTableOperator):\n def render_error(self, error_msg):\n return '''<h1>Boxplot is not available.</h1>\n <p>Reason is: {} </p>\n '''.format(error_msg)\n\n @overrides\n def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]:\n data = table['data'].iloc[0]\n\n if data.empty:\n yield 
{'html-content': self.render_error('input table is empty.')}\n return\n\n # Create a boxplot for the 'Pregnancies' field\n fig = px.box(data, x='Pregnancies')\n fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))\n\n # Convert fig to HTML content\n html = plotly.io.to_html(fig, include_plotlyjs='cdn', auto_play=False)\n yield {'html-content': html}", + "UDF3": "# UDF3\nfrom pytexera import *\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom typing import Iterator, Optional\n\nclass ProcessTableOperator(UDFTableOperator):\n\n @overrides\n def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]:\n data = table['data'].iloc[0]\n\n # Separate features and target variable\n X = data.drop('Outcome', axis=1)\n y = data['Outcome']\n\n # Split data into training and testing sets (80% train, 20% test)\n X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n\n scaler = StandardScaler()\n X_train = scaler.fit_transform(X_train)\n X_test = scaler.transform(X_test)\n\n # Create a DataFrame to yield\n result_table = pd.DataFrame({\n 'X_train': [X_train], 'X_test': [X_test],\n 'y_train': [y_train], 'y_test': [y_test]\n })\n\n yield Table(result_table)", + "UDF4": "# UDF4\nfrom pytexera import *\nimport pandas as pd\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score\nfrom typing import Iterator, Optional\n\nclass ProcessTableOperator(UDFTableOperator):\n\n @overrides\n def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]:\n X_train = table['X_train'].iloc[0]\n y_train = table['y_train'].iloc[0]\n X_test = table['X_test'].iloc[0]\n y_test = table['y_test'].iloc[0]\n\n # Train Random Forest model\n rf_model = RandomForestClassifier(random_state=42)\n rf_model.fit(X_train, y_train)\n rf_pred = rf_model.predict(X_test)\n rf_accuracy = accuracy_score(y_test, rf_pred)\n\n 
# Create a DataFrame to yield\n result_table = pd.DataFrame({\n 'rf_model': [rf_model],\n 'rf_accuracy': [rf_accuracy],\n 'X_test': [X_test],\n 'y_test': [y_test]\n })\n\n yield Table(result_table)", + "UDF5": "# UDF5\nfrom pytexera import *\nimport pandas as pd\nfrom sklearn.svm import SVC\nfrom sklearn.metrics import accuracy_score\nfrom typing import Iterator, Optional\n\nclass ProcessTableOperator(UDFTableOperator):\n\n @overrides\n def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]:\n X_train = table['X_train'].iloc[0]\n y_train = table['y_train'].iloc[0]\n X_test = table['X_test'].iloc[0]\n y_test = table['y_test'].iloc[0]\n\n # Train SVM model\n svm_model = SVC(random_state=42)\n svm_model.fit(X_train, y_train)\n svm_pred = svm_model.predict(X_test)\n svm_accuracy = accuracy_score(y_test, svm_pred)\n\n # Create a DataFrame to yield\n result_table = pd.DataFrame({\n 'svm_model': [svm_model],\n 'svm_accuracy': [svm_accuracy],\n 'X_test': [X_test],\n 'y_test': [y_test]\n })\n\n yield Table(result_table)" + }, + "edges": [ + ["UDF1", "UDF2"], + ["UDF1", "UDF3"], + ["UDF3", "UDF4"], + ["UDF3", "UDF5"] + ], + "outputs": { + "UDF1": ["min_values", "max_values", "mean_values", "data"], + "UDF2": ["html-content"], + "UDF3": ["X_train", "X_test", "y_train", "y_test"], + "UDF4": ["rf_model", "rf_accuracy", "X_test", "y_test"], + "UDF5": ["svm_model", "svm_accuracy", "X_test", "y_test"] + } +} +\`\`\` +`; + +export const WORKFLOW_PROMPT = `You are an expert in Python coding and workflow systems. +Many users of Texera system are non-technical, but the notebooks they provide are written by technical people. +They want to convert their notebooks to Texera workflows. +Your goal is to help convert these notebooks into a Texera workflow that non-technical users can use directly. +So do not remove or modify any classes or functions, preserve their names and structure as they are. +Ensure that all essential logic remains intact. 
+Create multiple Texera UDF codes using the provided Python code. +Number each UDF, starting at 1 and incrementing, by starting with a comment that states that UDF number. + +Use the class and function names as shown in ProcessTupleOperator, ProcessTableOperator, and ProcessBatchOperator. +Do not change the class names, function names, or input parameters. +Use the ones that make sense and split the code meaningfully as instructed. + +Use the starter code provided for Python UDFs. + +Use the documentation of Table, Tuple, or Batch to work with parameters within Texera UDF. +Do not import other libraries to define these types. + +There is no need for an __init__ function. Assume all inputs are valid pandas DataFrames, +so do not use .to_pandas(), .to_dataframe(), etc. Do not load data from a file in the first UDF, assume +that the data is already given to you in the table parameter. +Ensure proper data flow between functions. Separate operators as if they will run in different files. + +Current UDF operators can only have one output. Build a dataframe to yield all necessary variables +and data. Ensure proper data flow for each UDF and all information is yielded (including training +and testing data) if subsequent UDFs need them. + +Ensure all necessary imports are included in each UDF code block. + +Each UDF operator should be in its own Python code block. Do not combine them into a single block. +Ensure import statements cover all used functions and separate them as necessary. + +It is VERY important that all of the original code in the Jupyter notebook is represented in the generated workflow. +Make sure that nothing in the original is removed and that the semantic meaning of what the original code was doing is retained. +If there are user-defined Python classes, include the entire class definition in the appropriate UDF(s) that use that class. 
+Always include the code that defines the class inside of every distinct UDF that uses that constructs an object of that class. +Python classes are allowed in Texera UDFs and follow the same semantics as standard Python. +They can be defined outside of ProcessTableOperator, ProcessTupleOperator, and ProcessBatchOperator. + +Return only the JSON formatted response, do not give any explanation. +Make sure the response is a valid JSON structure, including closing all braces and not including commas after the last element. +Follow this JSON format (don't reuse the values, this is just the format). 'code', 'edges', and 'outputs' are all their own key's, do not nest any of these in another one and make sure to close their braces: +{ +"code": { +"UDF1": "code for UDF1 goes here", +"UDF2": "code for UDF2 goes here" +}, +"edges": [ +["UDF1", "UDF2"] +], +"outputs": { +"UDF1": ["min_values", "max_values", "mean_values", "data"], +"UDF2": ["html-content"] +} +} +Make sure only the keys in the code section appear in the edges and outputs sections. Do not include any extraneous fields. +Do not include any extraneous UDF's in the code field that include empty strings. +Give ALL of the code, do not omit anything or use placeholders for code. Make sure ALL code in the original is translated over. +Use only unescaped single quotes inside of the code values for the UDF's, do not use escaped double quotes. +Convert following the instructions and examples given. Here is the code: +`; + +export const MAPPING_PROMPT = ` +Here is an example of a mapping generated between the given example Python code and the Texera UDFs using their CELL and UDF IDs. Cell IDs are designated by the UUID following '# START'. The format should be kept the same. +{ +"UDF1": [ +"CEll3", +"CELL4" +], +"UDF2": [ +"CELL5" +], +"UDF3": [ +"CELL6", +"CELL7" +] Review Comment: The example JSON in `MAPPING_PROMPT` is invalid (missing a comma between the `UDF3` array and the `UDF4` key). 
Because `convertNotebookToWorkflow` does `JSON.parse` on the LLM output, this kind of invalid example can cause the model to mimic invalid JSON and break the flow; please fix the example to be valid JSON. ```suggestion ], ``` ########## frontend/src/app/workspace/service/notebook-migration/migration-llm.ts: ########## @@ -0,0 +1,299 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { Injectable } from "@angular/core"; +import { firstValueFrom, from, Observable, of } from "rxjs"; +import { map } from "rxjs/operators"; +import { createOpenAI } from "@ai-sdk/openai"; +import { generateText, type ModelMessage } from "ai"; +import { AppSettings } from "../../../common/app-setting"; +import { v4 as uuidv4 } from "uuid"; +import { + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + WORKFLOW_PROMPT, + MAPPING_PROMPT, +} from "./migration-prompts"; + +interface Cell { + cell_type: string; + metadata: { [key: string]: any }; + source: string; +} + +export interface Notebook { + cells: Cell[]; +} + +interface WorkflowJSON { + operators: any[]; + operatorPositions: Record<string, { x: number; y: number }>; + links: any[]; + commentBoxes: any[]; + settings: { + dataTransferBatchSize: number; + }; +} + +interface CombinedMapping { + operator_to_cell: Record<string, string[]>; + cell_to_operator: Record<string, string[]>; +} + +@Injectable() +export class NotebookMigrationLLM { + private model: any; + private messages: ModelMessage[] = []; + private initialized = false; + + private static readonly DOCUMENTATION: string[] = [ + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + ]; + + /** + * Initialize a new LLM session with Texera documentation + */ + public initialize(modelType: string = "gpt-5-mini", apiKey: string = "dummy"): void { + this.model = createOpenAI({ + baseURL: new URL(`${AppSettings.getApiEndpoint()}`, document.baseURI).toString(), + // apiKey is required by the library for creating the OpenAI compatible client; + // For security reason, we store the apiKey at the backend, thus the value is 
dummy here. + apiKey: apiKey, Review Comment: `initialize` claims the API key is stored at the backend and should be a dummy value, but it actually passes the user-provided `apiKey` into the OpenAI client, which will typically be sent in an `Authorization` header on requests. If the intention is to keep keys server-side (or rely on the server’s master key), this should not forward the user’s key from the browser; otherwise, the backend/proxy needs an explicit, secure design for handling user-provided keys. ```suggestion void apiKey; this.model = createOpenAI({ baseURL: new URL(`${AppSettings.getApiEndpoint()}`, document.baseURI).toString(), // apiKey is required by the library for creating the OpenAI compatible client; // For security reason, we store the apiKey at the backend, thus the value is dummy here. apiKey: "dummy", ``` ########## notebook-migration-service/src/main/scala/org/apache/texera/service/resource/NotebookMigrationResource.scala: ########## @@ -0,0 +1,358 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.texera.service.resource + +import com.fasterxml.jackson.databind.ObjectMapper +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import com.typesafe.scalalogging.LazyLogging +import jakarta.ws.rs._ +import jakarta.ws.rs.core._ +import org.apache.texera.dao.SqlServer +import org.jooq.JSONB +import org.apache.texera.dao.jooq.generated.tables.Notebook +import org.apache.texera.dao.jooq.generated.tables.WorkflowNotebookMapping +import java.net.{HttpURLConnection, URL} +import java.nio.charset.StandardCharsets +import scala.util.control.NonFatal +import org.apache.texera.amber.config.StorageConfig + +object NotebookMigrationResource extends LazyLogging { + + private val mapper: ObjectMapper = new ObjectMapper().registerModule(DefaultScalaModule) + private val jupyterUrl = StorageConfig.jupyterURL + private var jupyterIframeURL = s"$jupyterUrl/notebooks/work/notebook.ipynb" + + private def isJupyterAvailable(jupyterUrl: String): Boolean = { + try { + val conn = new java.net.URL(s"$jupyterUrl/api") + .openConnection() + .asInstanceOf[java.net.HttpURLConnection] + + conn.setRequestMethod("GET") + conn.setConnectTimeout(2000) + conn.setReadTimeout(2000) + + val status = conn.getResponseCode + + status == 200 || status == 403 + } catch { + case _: Exception => false + } + } + + // Returns the Jupyter iframe reference URL + def getJupyterIframeURL(): Response = { + if (!isJupyterAvailable(jupyterUrl)) { + return Response + .status(500) + .entity( + """ + { + "success": false, + "message": "Cannot connect to Jupyter server" + } + """ + ) + .build() + } + + Response + .ok( + s""" + { + "success": true, + "url": "$jupyterIframeURL" + } + """ + ) + .build() + } + + // Returns the URL of Jupyter + def getJupyterURL(): Response = { + if (!isJupyterAvailable(jupyterUrl)) { + return Response + .status(500) + .entity( + """ + { + "success": false, + "message": "Cannot connect to Jupyter server" + } + """ + ) + .build() + } + + Response + .ok( + s""" + 
{ + "success": true, + "url": "$jupyterUrl" + } + """ + ) + .build() + } + + // Set the notebook in Jupyter + def setNotebook(body: String): Response = { + if (!isJupyterAvailable(jupyterUrl)) { + return Response + .status(500) + .entity( + """ + { + "success": false, + "message": "Cannot connect to Jupyter server" + } + """ + ) + .build() + } + + try { + val json = mapper.readTree(body) + + val notebookName = json.get("notebookName").asText() + val notebookData = json.get("notebookData") + Review Comment: `setNotebook` accepts an arbitrary `notebookName`, but later the iframe URL is always set to `.../notebook.ipynb` (hard-coded). This makes the API internally inconsistent if a different name is provided; either enforce a fixed name in the request contract or derive the iframe URL from the uploaded `notebookName`. ########## frontend/src/app/workspace/component/menu/menu.component.html: ########## @@ -456,3 +465,113 @@ </div> </div> </div> + +<ng-template + #importNotebookModal + let-data> + <form + [formGroup]="importForm" + nz-form> + <div style="text-align: center; margin-bottom: 12px"> + <img + ngSrc="assets/notebook_migration_tool/tool_popup_diagram.png" + alt="Notebook to Workflow" + width="1646" + height="479" + style="max-width: 100%; height: auto; border-radius: 8px" /> + </div> + + <nz-form-item> + <p style="margin: 0; font-size: 16px; font-weight: 500; user-select: text"> + This tool converts a Python Jupyter Notebook into a Texera workflow using LLM capabilities. After you submit a + notebook, the LLM service will generate a corresponding Texera workflow. The conversion time depends on the + notebook’s complexity and can take 1–5 minutes. 
Once the process is complete, the workflow workspace will reload + with: + </p> + <ol + style=" + margin-top: 4px; + margin-bottom: 0; + padding-left: 20px; + font-size: 16px; + font-weight: 500; + user-select: text; + "> + <li> + The generated workflow ready to use (Note: you will still need to upload the dataset and connect it to the + workflow). + </li> + <li>A floating Jupyter window containing the uploaded notebook for reference.</li> + </ol> + <p style="margin: 0; font-size: 16px; font-weight: 500; user-select: text"> + Feel free to navigate away from this tab while you wait for the workflow to generate. Please do not close the + window. + </p> + </nz-form-item> + + <nz-form-item> + <nz-form-label [nzNoColon]="true"> + <span style="font-weight: 1000; font-size: 16px"> Upload Python Jupyter Notebook </span> + </nz-form-label> + <nz-form-control> + <div style="display: inline-flex; align-items: center; gap: 8px"> + <nz-upload + [nzBeforeUpload]="beforeUpload" + [nzShowUploadList]="false"> + <button + nz-button + style="white-space: normal"> + <i + nz-icon + nzType="upload"></i> + </button> + </nz-upload> + + <span *ngIf="importForm.get('file')?.value?.name"> + Selected file: {{ importForm.get('file')?.value?.name }} + </span> + </div> + </nz-form-control> + </nz-form-item> + + <nz-form-item> + <nz-form-label [nzNoColon]="true"> + <span style="font-weight: 1000; font-size: 16px"> Select Model Type </span> + </nz-form-label> + + <nz-form-control> + <ng-container *ngIf="data.models$ | async as models; else loadingTpl"> + <nz-select + formControlName="model" + nzPlaceHolder="Select a model" + style="width: 50%"> + <nz-option + *ngFor="let model of $any(models)" + [nzValue]="model.name" + [nzLabel]="model.name"></nz-option> + </nz-select> + </ng-container> + + <ng-template #loadingTpl> + <nz-select + nzPlaceHolder="Loading models..." 
+ [nzLoading]="true" + [nzDisabled]="true" + style="width: 50%"></nz-select> + </ng-template> + </nz-form-control> + </nz-form-item> + + <nz-form-item> + <nz-form-label [nzNoColon]="true"> + <span style="font-weight: 1000; font-size: 16px"> API Key </span> + </nz-form-label> + <nz-form-control> + <input + nz-input Review Comment: The API key input is rendered as a plain text field. Since this value is sensitive, it should be a password-type input (and ideally disable autocomplete) to reduce accidental disclosure (screen sharing, browser autofill history, etc.). ```suggestion nz-input type="password" autocomplete="off" ``` ########## frontend/src/app/workspace/component/menu/menu.component.ts: ########## @@ -550,6 +583,168 @@ export class MenuComponent implements OnInit, OnDestroy { this.workflowActionService.deleteOperatorsAndLinks(allOperatorIDs); } + openImportNotebookModal(): void { + const models$ = this.notebookMigrationService.getAvailableModels().pipe( + tap({ + error: () => this.notificationService.error("Failed to fetch models"), + }) + ); + + const modalRef = this.modal.create({ + nzTitle: "AI Generate Workflow from Python Notebook", + nzContent: this.importModalTpl, + nzViewContainerRef: this.viewContainerRef, + nzWidth: 700, + nzData: { + models$: models$, + }, + nzFooter: [ + { + label: "Cancel", + onClick: () => { + modalRef.close(); + }, + }, + { + label: "Submit", + type: "primary", + disabled: () => !this.importForm.valid, + onClick: () => { + const file: NzUploadFile = this.importForm.get("file")?.value; + const model: string = this.importForm.get("model")?.value; + const apiKey: string = this.importForm.get("apiKey")?.value; + this.onClickImportNotebook(file, model, apiKey); + modalRef.close(); // close after submit too + }, + }, + ], + }); + } + + public beforeUpload = (file: NzUploadFile) => { + this.importForm.patchValue({ file }); + this.importForm.get("file")?.markAsDirty(); + this.importForm.get("file")?.updateValueAndValidity(); + return 
false; // prevent auto upload + }; + + public onClickImportNotebook = (file: NzUploadFile, model: string, apiKey: string): boolean => { + const reader = new FileReader(); + + // Check if the file is a Jupyter notebook based on its extension + const fileExtension = file.name.split(".").pop()?.toLowerCase(); + if (fileExtension !== "ipynb") { + this.notificationService.error("Please upload a valid Jupyter Notebook (.ipynb) file."); + return false; + } + + this.setWaitingForLLM.emit(true); // start loading + + // Read the notebook file as text + reader.readAsText(file as any); + reader.onload = async () => { + try { + const result = reader.result; + if (typeof result !== "string") { + throw new Error("File content is not a valid string."); + } + + // Parse the content of the .ipynb file (it's in JSON format) + const notebookContent = JSON.parse(result) as Notebook; + + // Validate the notebook structure + if (!notebookContent || !Array.isArray(notebookContent.cells)) { + throw new Error("Invalid notebook structure."); + } + + // Add UUID's to each cell in the notebook + for (const cell of notebookContent.cells) { + if (!cell.metadata) { + cell.metadata = {}; + } + cell.metadata.uuid = uuidv4(); + } + + // Send Notebook JSON to pod to open in jupyterlab + await this.notebookMigrationService.sendNotebookToJupyter(notebookContent); + + // Get workflow and mapping from LLM + await this.notebookMigrationService + .sendToAIGenerateWorkflow(notebookContent, model, apiKey) + .then(result => { + if (result) { + const { workflowContent, mappingContent } = result; + + const fileExtensionIndex = file.name.lastIndexOf("."); + var workflowName: string; + if (fileExtensionIndex === -1) { + workflowName = file.name; + } else { + workflowName = file.name.substring(0, fileExtensionIndex); + } + if (workflowName.trim() === "") { + workflowName = DEFAULT_WORKFLOW_NAME; + } + + const workflow: Workflow = { + content: workflowContent, + name: `${workflowName}_GENERATED_BY_LLM`, + 
isPublished: 0, + description: undefined, + wid: undefined, + creationTime: undefined, + lastModifiedTime: undefined, + readonly: false, + }; + + this.workflowPersistService + .persistWorkflow(workflow) + .pipe( + switchMap((updatedWorkflow: Workflow) => { + const mappingID = "mapping_wid_" + updatedWorkflow.wid; + + this.notebookMigrationService.setMapping(mappingID, mappingContent); + + return this.notebookMigrationService + .storeNotebookAndMapping(updatedWorkflow.wid, 1, mappingContent, notebookContent) + .pipe(map(() => updatedWorkflow)); + }), + untilDestroyed(this) + ) + .subscribe({ + next: updatedWorkflow => { + this.workflowActionService.reloadWorkflow(updatedWorkflow, true); + this.jupyterPanelService.openPanel("JupyterNotebookPanel"); + this.notificationService.success("Successfully generated workflow and mapping from notebook."); + }, + error: (err: unknown) => { + this.notificationService.error("Failed to import notebook, check console for detailed error"); + console.error("Import notebook failed:", err); + }, + complete: () => { + this.setWaitingForLLM.emit(false); + }, + }); + } else { + console.error("Result is undefined"); + } + }) + .catch(error => { + this.notificationService.error("Error while communicating with LLM, check console for details"); + console.error("Error while fetching data from LLM: ", error); + }) + .finally(() => { + this.setWaitingForLLM.emit(false); // stop loading + }); + } catch (error) { + this.notificationService.error("Failed to import the notebook."); + console.error(error); Review Comment: If an error happens before the `sendToAIGenerateWorkflow(...).finally(...)` block is reached (e.g., JSON parse failure, invalid notebook structure, failure sending notebook to Jupyter), `setWaitingForLLM` will remain `true` and the spinner can get stuck. Add an outer `finally` (or emit `false` in the outer `catch`) so the loading state is always cleared. 
```suggestion }); } catch (error) { this.notificationService.error("Failed to import the notebook."); console.error(error); } finally { this.setWaitingForLLM.emit(false); // stop loading ``` ########## notebook-migration-service/src/main/resources/custom.js: ########## @@ -0,0 +1,95 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +// Use Jupyter's event system to ensure the notebook is fully loaded +require(["base/js/events"], function (events) { + events.on("kernel_ready.Kernel", function () { + + // Attach click event listener to cells + $("#notebook-container").on("click", ".cell", function (event) { + const cell = $(this); + const index = $(".cell").index(cell); + const cellContent = cell.find(".input_area").text(); + + // Get the UUID from the cell's metadata, or use "N/A" if it doesn't exist + const cellUUID = Jupyter.notebook.get_cell(index).metadata.uuid || 'N/A'; + + // Send a message to the parent window (Texera app) + window.parent.postMessage( + { action: "cellClicked", cellIndex: index, cellContent: cellContent, cellUUID: cellUUID }, + "http://localhost:4200" + ); Review Comment: `postMessage` is hard-coded to target origin `http://localhost:4200`, and the receiver side also hard-codes that same origin. This will break in non-local deployments (different host/port) and is easy to misconfigure; please make the allowed/target origin configurable (or derive it safely from runtime config) rather than hard-coding localhost. ########## notebook-migration-service/src/main/scala/org/apache/texera/service/resource/NotebookMigrationResource.scala: ########## @@ -0,0 +1,358 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.texera.service.resource + +import com.fasterxml.jackson.databind.ObjectMapper +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import com.typesafe.scalalogging.LazyLogging +import jakarta.ws.rs._ +import jakarta.ws.rs.core._ +import org.apache.texera.dao.SqlServer +import org.jooq.JSONB +import org.apache.texera.dao.jooq.generated.tables.Notebook +import org.apache.texera.dao.jooq.generated.tables.WorkflowNotebookMapping +import java.net.{HttpURLConnection, URL} +import java.nio.charset.StandardCharsets +import scala.util.control.NonFatal +import org.apache.texera.amber.config.StorageConfig + +object NotebookMigrationResource extends LazyLogging { + + private val mapper: ObjectMapper = new ObjectMapper().registerModule(DefaultScalaModule) + private val jupyterUrl = StorageConfig.jupyterURL + private var jupyterIframeURL = s"$jupyterUrl/notebooks/work/notebook.ipynb" + + private def isJupyterAvailable(jupyterUrl: String): Boolean = { + try { + val conn = new java.net.URL(s"$jupyterUrl/api") + .openConnection() + .asInstanceOf[java.net.HttpURLConnection] + + conn.setRequestMethod("GET") + conn.setConnectTimeout(2000) + conn.setReadTimeout(2000) + + val status = conn.getResponseCode + + status == 200 || status == 403 + } catch { + case _: Exception => false + } + } + + // Returns the Jupyter iframe reference URL + def getJupyterIframeURL(): Response = { + if (!isJupyterAvailable(jupyterUrl)) { + return Response + .status(500) + .entity( + """ + { + "success": false, + "message": "Cannot connect to Jupyter server" + } + """ + ) + .build() + } + + Response + .ok( + s""" + { + "success": true, + "url": "$jupyterIframeURL" + } + """ + ) + .build() + } + + // Returns the URL of Jupyter + def getJupyterURL(): Response = { + if (!isJupyterAvailable(jupyterUrl)) { + return Response + .status(500) + .entity( + """ + { + "success": 
false, + "message": "Cannot connect to Jupyter server" + } + """ + ) + .build() + } + + Response + .ok( + s""" + { + "success": true, + "url": "$jupyterUrl" + } + """ + ) + .build() + } + + // Set the notebook in Jupyter + def setNotebook(body: String): Response = { + if (!isJupyterAvailable(jupyterUrl)) { + return Response + .status(500) + .entity( + """ + { + "success": false, + "message": "Cannot connect to Jupyter server" + } + """ + ) + .build() + } + + try { + val json = mapper.readTree(body) + + val notebookName = json.get("notebookName").asText() + val notebookData = json.get("notebookData") + + // Construct Jupyter API URL + val apiUrl = s"$jupyterUrl/api/contents/work/$notebookName" + + val url = new URL(apiUrl) + val conn = url.openConnection().asInstanceOf[HttpURLConnection] + + conn.setRequestMethod("PUT") + conn.setDoOutput(true) + conn.setRequestProperty("Content-Type", "application/json") + + val requestBody = + s""" + { + "type": "notebook", + "content": $notebookData + } + """ + + val os = conn.getOutputStream + os.write(requestBody.getBytes(StandardCharsets.UTF_8)) + os.flush() + os.close() + + val status = conn.getResponseCode + + if (status != 200 && status != 201) { + return Response + .status(500) + .entity( + s""" + { + "success": false, + "message": "Failed to upload notebook to Jupyter (status $status)" + } + """ + ) + .build() + } + + jupyterIframeURL = s"$jupyterUrl/notebooks/work/notebook.ipynb" + + Response + .ok( + s""" + { + "success": true, + "message": "Notebook successfully sent to Jupyter." 
+ } + """ + ) + .build() + + } catch { + case NonFatal(e) => + logger.error("Error sending notebook to Jupyter", e) + Response + .status(Response.Status.INTERNAL_SERVER_ERROR) + .entity(s"""{"error":"${e.getMessage}"}""") + .build() + } + } + + // Store notebook + mapping in database + def storeNotebookAndMapping(body: String): Response = { + try { + val json = mapper.readTree(body) + + val wid: java.lang.Integer = json.get("wid").asInt() + val vid: java.lang.Integer = json.get("vid").asInt() + val mappingNode = json.get("mapping") + val notebookNode = json.get("notebook") + + val dsl = SqlServer.getInstance().createDSLContext() + + val nid: java.lang.Integer = SqlServer.withTransaction(dsl) { ctx => + // Insert notebook + val notebookRecord = ctx + .insertInto(Notebook.NOTEBOOK) + .set(Notebook.NOTEBOOK.WID, wid) + .set(Notebook.NOTEBOOK.NOTEBOOK_, JSONB.valueOf(notebookNode.toString)) + .returning(Notebook.NOTEBOOK.NID) + .fetchOne() + + val nidInside: java.lang.Integer = notebookRecord.getValue(Notebook.NOTEBOOK.NID) + + // Insert workflow-notebook mapping + ctx + .insertInto(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING) + .set(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.WID, wid) + .set(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.VID, vid) + .set(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.NID, nidInside) + .set( + WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.MAPPING, + JSONB.valueOf(mappingNode.toString) + ) + .execute() + + nidInside + } + + Response + .ok( + s""" + { + "success": true, + "message": "Notebook and mapping successfully stored. 
wid: $wid, vid: $vid, nid: $nid" + } + """ + ) + .build() + + } catch { + case NonFatal(e) => + logger.error("Error storing mapping and workflow", e) + Response + .status(Response.Status.INTERNAL_SERVER_ERROR) + .entity(s"""{"error":"${e.getMessage}"}""") + .build() + } + } + + // Fetch notebook + mapping + def fetchNotebookAndMapping(body: String): Response = { + try { + val json = mapper.readTree(body) + + val wid: java.lang.Integer = json.get("wid").asInt() + val vid: java.lang.Integer = json.get("vid").asInt() + + val dsl = SqlServer.getInstance().createDSLContext() + + // Fetch the most recent notebook (highest nid) for this workflow version + val result = dsl + .select( + Notebook.NOTEBOOK.NID, + Notebook.NOTEBOOK.NOTEBOOK_, + WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.MAPPING + ) + .from(Notebook.NOTEBOOK) + .join(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING) + .on(Notebook.NOTEBOOK.WID.eq(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.WID)) + .and(Notebook.NOTEBOOK.NID.eq(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.NID)) + .where(Notebook.NOTEBOOK.WID.eq(wid)) + .and(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.VID.eq(vid)) + .orderBy(Notebook.NOTEBOOK.NID.desc()) // most recent nid first + .limit(1) // only take the latest + .fetchOne() + + if (result == null) { + Response.ok("""{"exists": false}""").build() + } else { + val nid: Int = result.getValue(Notebook.NOTEBOOK.NID) + val notebookJson: String = + result.get(Notebook.NOTEBOOK.NOTEBOOK_).asInstanceOf[JSONB].data() + val mappingJson: String = result + .get(WorkflowNotebookMapping.WORKFLOW_NOTEBOOK_MAPPING.MAPPING) + .asInstanceOf[JSONB] + .data() + + Response + .ok( + s""" + { + "exists": true, + "notebook": $notebookJson, + "mapping": $mappingJson + } + """ + ) + .build() + } + + } catch { + case NonFatal(e) => + logger.error("Database error retrieving mapping", e) + Response + .status(Response.Status.INTERNAL_SERVER_ERROR) + .entity(s"""{"error":"${e.getMessage}"}""") + 
.build() + } + } +} + +@Path("/notebook-migration") +@Produces(Array(MediaType.APPLICATION_JSON)) +@Consumes(Array(MediaType.APPLICATION_JSON)) +class NotebookMigrationResource extends LazyLogging { + + @GET + @Path("/get-jupyter-iframe-url") + def getJupyterIframeURL: Response = { + logger.info("Getting Jupyter iframe URL") + NotebookMigrationResource.getJupyterIframeURL() + } + + @GET + @Path("/get-jupyter-url") + def getJupyterURL: Response = { + logger.info("Getting Jupyter API URL") + NotebookMigrationResource.getJupyterURL() + } + + @POST + @Path("/set-notebook") + def setNotebook(body: String): Response = { + logger.info("Setting notebook, request body: " + body) + NotebookMigrationResource.setNotebook(body) Review Comment: These endpoints log the full raw request body at INFO level. For notebooks/mappings this can be very large and may contain sensitive user data; please log only minimal metadata (e.g., wid/vid/nid, sizes) and avoid emitting the full JSON payload. ########## notebook-migration-service/src/main/resources/docker-compose.yml: ########## @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: texera-jupyter +services: + + jupyter: + build: + context: . 
+ dockerfile: Dockerfile + container_name: texera-jupyter + ports: + - "9100:8888" + command: > + start-notebook.sh + --NotebookApp.token='' + --NotebookApp.password='' + --NotebookApp.disable_check_xsrf=True + --NotebookApp.tornado_settings="{'headers': {'Content-Security-Policy': 'frame-ancestors http://localhost:*'}}" + --NotebookApp.default_url=/tree Review Comment: The Jupyter container is started with token/password disabled and XSRF checks disabled. This makes the notebook server unauthenticated and vulnerable if it’s ever exposed beyond localhost; please keep secure defaults and make the insecure settings opt-in via env vars / a separate dev-only compose file. ```suggestion /bin/sh -c "start-notebook.sh --NotebookApp.tornado_settings=\"{'headers': {'Content-Security-Policy': 'frame-ancestors http://localhost:*'}}\" --NotebookApp.default_url=/tree $${JUPYTER_DEV_FLAGS:-}" ``` ########## frontend/src/app/workspace/component/jupyter-notebook-panel/jupyter-notebook-panel.component.ts: ########## @@ -0,0 +1,97 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { Component, ElementRef, OnDestroy, OnInit, ViewChild, AfterViewInit } from "@angular/core"; +import { JupyterPanelService } from "../../service/jupyter-panel/jupyter-panel.service"; +import { from, of, Subject } from "rxjs"; +import { switchMap, takeUntil } from "rxjs/operators"; +import { DomSanitizer, SafeResourceUrl } from "@angular/platform-browser"; +import { NotebookMigrationService } from "../../service/notebook-migration/notebook-migration.service"; + +@Component({ + selector: "texera-jupyter-notebook-panel", + templateUrl: "./jupyter-notebook-panel.component.html", + styleUrls: ["./jupyter-notebook-panel.component.scss"], +}) +export class JupyterNotebookPanelComponent implements OnInit, AfterViewInit, OnDestroy { + @ViewChild("iframeRef", { static: false }) iframeRef!: ElementRef<HTMLIFrameElement>; // Use static: false + + isVisible: boolean = false; // Initialize to false, meaning the panel is hidden by default + jupyterUrl: SafeResourceUrl = ""; // Store the notebook URL dynamically + private destroy$ = new Subject<void>(); + + constructor( + private jupyterPanelService: JupyterPanelService, + private sanitizer: DomSanitizer, + private notebookMigrationService: NotebookMigrationService + ) {} + + ngOnInit(): void { + this.jupyterPanelService.jupyterNotebookPanelVisible$ + .pipe( + switchMap((visible: boolean) => { + this.isVisible = visible; + + if (!visible) { + return of(null); + } + + return from(this.notebookMigrationService.getJupyterIframeURL()); + }), + takeUntil(this.destroy$) + ) + .subscribe(url => { + if (url) { + this.jupyterUrl = this.sanitizer.bypassSecurityTrustResourceUrl(url); + this.checkIframeRef(); + } + }); + } + + ngAfterViewInit(): void { + // Ensure iframe is handled after it's available in the DOM + this.checkIframeRef(); + } + + checkIframeRef(): void { + setTimeout(() => { + if (this.isVisible && this.iframeRef?.nativeElement) { + this.jupyterPanelService.setIframeRef(this.iframeRef.nativeElement); + } else { 
+ console.error("Jupyter Iframe reference not found."); + } Review Comment: `checkIframeRef` logs `console.error` whenever the panel is not visible or the iframe has not been rendered yet, but it is called from `ngAfterViewInit` even while the panel is hidden. This produces noisy errors on normal startup; log the error only when the panel is expected to be visible and the iframe ref is still missing. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
