This is an automated email from the ASF dual-hosted git repository. github-merge-queue[bot] pushed a commit to branch gh-readonly-queue/main/pr-5171-f89c07cffe88eff6f88fcce89f6a4ac41ede239e in repository https://gitbox.apache.org/repos/asf/texera.git
commit cbf4942785f5218b415a6edfd35eefec4046aadd Author: Xuan Gu <[email protected]> AuthorDate: Mon May 25 19:33:56 2026 -0700 fix: filter out datasets with inconsistent database and LakeFS records (#5171) <!-- Thanks for sending a pull request (PR)! Here are some tips for you: 1. If this is your first time, please read our contributor guidelines: [Contributing to Texera](https://github.com/apache/texera/blob/main/CONTRIBUTING.md) 2. Ensure you have added or run the appropriate tests for your PR 3. If the PR is work in progress, mark it as a draft on GitHub. 4. Please write your PR title to summarize what this PR proposes, we are following Conventional Commits style for PR titles as well. 5. Be sure to keep the PR description updated to reflect all changes. --> ### What changes were proposed in this PR? <!-- Please clarify what changes you are proposing. The purpose of this section is to outline the changes. Here are some tips for you: 1. If you propose a new API, clarify the use case for a new API. 2. If you fix a bug, you can clarify why it is a bug. 3. If it is a refactoring, clarify what has been changed. 4. It would be helpful to include a before-and-after comparison using screenshots or GIFs. 5. Please consider writing useful notes for better and faster reviews. --> This PR fixes an issue where dataset listings fail when dataset records in the database and LakeFS repositories are inconsistent (for example, a dataset row in the database whose LakeFS repository has been deleted). This breaks the workflow dataset picker and can also affect Hub dataset listings. The fix wraps the per-row retrieveRepositorySize call in a try/catch for ApiException, logs the orphan, and drops it from the response. 
Demo: | Before | After | |--------|-------| | <img width="525" height="194" alt="Before: dataset listing error" src="https://github.com/user-attachments/assets/ae72e08f-64e0-4bce-866c-a9570b4c4591" /> | <img width="655" height="395" alt="After: dataset picker loads valid datasets" src="https://github.com/user-attachments/assets/cf3e4679-bc09-4cff-853a-2080c92cc35b" /> | ### Any related issues, documentation, discussions? <!-- Please use this section to link other resources if not mentioned already. 1. If this PR fixes an issue, please include `Fixes #1234`, `Resolves #1234` or `Closes #1234`. If it is only related, simply mention the issue number. 2. If there is design documentation, please add the link. 3. If there is a discussion in the mailing list, please add the link. --> Closes #5106 ### How was this PR tested? <!-- If tests were added, say they were added here. Or simply mention that if the PR is tested with existing test cases. Make sure to include/update test cases that check the changes thoroughly including negative and positive cases if possible. If it was tested in a way different from regular unit tests, please clarify how you tested step by step, ideally copy and paste-able, so that other reviewers can test and check, and descendants can verify in the future. If tests were not added, please describe why they were not added and/or why it was difficult to add. --> Added two tests to DatasetResourceSpec: one checks that listDatasets includes a dataset whose LakeFS repo exists, and one checks that it excludes a dataset whose LakeFS repo has been deleted (orphan DB row). ### Was this PR authored or co-authored using generative AI tooling? <!-- If generative AI tooling has been used in the process of authoring this PR, please include the phrase: 'Generated-by: ' followed by the name of the tool and its version. If no, write 'No'. Please refer to the [ASF Generative Tooling Guidance](https://www.apache.org/legal/generative-tooling.html) for details. 
--> Generated-by: Claude Opus 4.7 --------- Co-authored-by: Meng Wang <[email protected]> --- .../web/resource/dashboard/hub/HubResource.scala | 31 +++++++++++---- .../texera/service/resource/DatasetResource.scala | 45 ++++++++++++---------- .../service/resource/DatasetResourceSpec.scala | 36 +++++++++++++++++ 3 files changed, 84 insertions(+), 28 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/hub/HubResource.scala b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/hub/HubResource.scala index c4cb9ee3cb..33ba16c6e1 100644 --- a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/hub/HubResource.scala +++ b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/hub/HubResource.scala @@ -19,6 +19,7 @@ package org.apache.texera.web.resource.dashboard.hub +import com.typesafe.scalalogging.Logger import io.dropwizard.auth.Auth import org.apache.texera.amber.core.storage.util.LakeFSStorageClient import org.apache.texera.auth.SessionUser @@ -41,6 +42,7 @@ import org.apache.texera.web.resource.dashboard.user.workflow.WorkflowResource.{ } import org.jooq.Table import org.jooq.impl.DSL +import org.slf4j.LoggerFactory import java.util.regex.Pattern import javax.servlet.http.HttpServletRequest @@ -51,6 +53,8 @@ import scala.jdk.CollectionConverters._ import scala.language.existentials object HubResource { + private lazy val logger: Logger = Logger(LoggerFactory.getLogger(getClass.getName)) + // Represents an entity reference for general-purpose batch APIs. 
// Used by: isLikedHelper, recordLikeAction, getCounts, userAccess case class UserRequest(entityId: Integer, entityType: EntityType) @@ -306,17 +310,28 @@ object HubResource { .fetch() records.asScala - .map { record => + .flatMap { record => val dataset = record.into(DATASET).into(classOf[Dataset]) val datasetAccess = record.into(DATASET_USER_ACCESS).into(classOf[DatasetUserAccess]) val ownerEmail = record.into(USER).getEmail - DashboardDataset( - isOwner = if (uid == null) false else dataset.getOwnerUid == uid, - dataset = dataset, - accessPrivilege = datasetAccess.getPrivilege, - ownerEmail = ownerEmail, - size = LakeFSStorageClient.retrieveRepositorySize(dataset.getRepositoryName) - ) + try { + Some( + DashboardDataset( + isOwner = if (uid == null) false else dataset.getOwnerUid == uid, + dataset = dataset, + accessPrivilege = datasetAccess.getPrivilege, + ownerEmail = ownerEmail, + size = LakeFSStorageClient.retrieveRepositorySize(dataset.getRepositoryName) + ) + ) + } catch { + case e: io.lakefs.clients.sdk.ApiException => + logger.error( + s"LakeFS ApiException for dataset repository '${dataset.getRepositoryName}': ${e.getMessage}", + e + ) + None + } } .toList .distinctBy(_.dataset.getDid) diff --git a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala index 46457c9454..bc2d637a00 100644 --- a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala +++ b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala @@ -19,6 +19,7 @@ package org.apache.texera.service.resource +import com.typesafe.scalalogging.LazyLogging import io.dropwizard.auth.Auth import jakarta.annotation.security.RolesAllowed import jakarta.ws.rs._ @@ -215,7 +216,7 @@ object DatasetResource { @Produces(Array(MediaType.APPLICATION_JSON, "image/jpeg", "application/pdf")) @Path("/dataset") -class DatasetResource { 
+class DatasetResource extends LazyLogging { private val ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE = "User has no access to this dataset" private val ERR_DATASET_VERSION_NOT_FOUND_MESSAGE = "The version of the dataset not found" private val EXPIRATION_MINUTES = 5 @@ -1105,28 +1106,32 @@ class DatasetResource { ) .where(DATASET.IS_PUBLIC.eq(true)) .fetch() - .map(record => { + .asScala + .flatMap { record => val dataset = record.into(DATASET).into(classOf[Dataset]) val ownerEmail = record.into(USER).getEmail - DashboardDataset( - isOwner = false, - dataset = dataset, - accessPrivilege = PrivilegeEnum.READ, - ownerEmail = ownerEmail, - size = LakeFSStorageClient.retrieveRepositorySize(dataset.getRepositoryName) - ) - }) - publicDatasets.forEach { publicDataset => + try { + Some( + DashboardDataset( + isOwner = false, + dataset = dataset, + accessPrivilege = PrivilegeEnum.READ, + ownerEmail = ownerEmail, + size = LakeFSStorageClient.retrieveRepositorySize(dataset.getRepositoryName) + ) + ) + } catch { + case e: io.lakefs.clients.sdk.ApiException => + logger.error( + s"LakeFS ApiException for dataset repository '${dataset.getRepositoryName}': ${e.getMessage}", + e + ) + None + } + } + publicDatasets.foreach { publicDataset => if (!accessibleDatasets.exists(_.dataset.getDid == publicDataset.dataset.getDid)) { - val dashboardDataset = DashboardDataset( - isOwner = false, - dataset = publicDataset.dataset, - ownerEmail = publicDataset.ownerEmail, - accessPrivilege = PrivilegeEnum.READ, - size = - LakeFSStorageClient.retrieveRepositorySize(publicDataset.dataset.getRepositoryName) - ) - accessibleDatasets = accessibleDatasets :+ dashboardDataset + accessibleDatasets = accessibleDatasets :+ publicDataset } } accessibleDatasets.toList diff --git a/file-service/src/test/scala/org/apache/texera/service/resource/DatasetResourceSpec.scala b/file-service/src/test/scala/org/apache/texera/service/resource/DatasetResourceSpec.scala index e4f1e74f6c..4c73ccd2e0 100644 --- 
a/file-service/src/test/scala/org/apache/texera/service/resource/DatasetResourceSpec.scala +++ b/file-service/src/test/scala/org/apache/texera/service/resource/DatasetResourceSpec.scala @@ -370,6 +370,42 @@ class DatasetResourceSpec datasetDao.fetchOneByDid(dataset.getDid) should not be null } + "listDatasets" should "include a dataset whose LakeFS repo exists" in { + val repoName = s"list-ok-${System.nanoTime()}" + val dataset = new Dataset + dataset.setName(repoName) + dataset.setRepositoryName(repoName) + dataset.setDescription("list endpoint - healthy dataset") + dataset.setOwnerUid(ownerUser.getUid) + dataset.setIsPublic(true) + dataset.setIsDownloadable(true) + datasetDao.insert(dataset) + LakeFSStorageClient.initRepo(repoName) + + val result = datasetResource.listDatasets(sessionUser) + + result.map(_.dataset.getDid) should contain(dataset.getDid) + } + + it should "exclude a dataset whose LakeFS repo has been deleted (orphan DB row)" in { + val repoName = s"list-orphan-${System.nanoTime()}" + val dataset = new Dataset + dataset.setName(repoName) + dataset.setRepositoryName(repoName) + dataset.setDescription("list endpoint - orphan DB row") + dataset.setOwnerUid(ownerUser.getUid) + dataset.setIsPublic(true) + dataset.setIsDownloadable(true) + datasetDao.insert(dataset) + LakeFSStorageClient.initRepo(repoName) + // Simulate the DB/LakeFS mismatch: delete the repo directly, leaving the DB row. + LakeFSStorageClient.deleteRepo(repoName) + + val result = datasetResource.listDatasets(sessionUser) + + result.map(_.dataset.getDid) should not contain dataset.getDid + } + "updateDatasetName" should "rename dataset successfully if user has write access" in { val dataset = new Dataset dataset.setName("rename-before")
