keith-turner commented on code in PR #3853: URL: https://github.com/apache/accumulo/pull/3853#discussion_r1364502249
########## server/manager/src/main/java/org/apache/accumulo/manager/tableOps/split/PreSplit.java: ########## @@ -76,79 +64,21 @@ public long isReady(long tid, Manager manager) throws Exception { // that have not completed their first step. Once splits starts running, would like it to move // through as quickly as possible. - var tabletMetadata = manager.getContext().getAmple().readTablet(splitInfo.getOriginal(), - PREV_ROW, LOCATION, OPID); - - log.trace("Attempting tablet split {} {} {}", FateTxId.formatTid(tid), splitInfo.getOriginal(), - tabletMetadata == null ? null : tabletMetadata.getLocation()); - - if (tabletMetadata == null || (tabletMetadata.getOperationId() != null - && !opid.equals(tabletMetadata.getOperationId()))) { - // tablet no longer exists or is reserved by another operation + if (canStart(tid, manager)) { return 0; - } else if (opid.equals(tabletMetadata.getOperationId())) { - if (tabletMetadata.getLocation() == null) { - // the operation id is set and there is no location, so can proceed to split - return 0; - } else { - // the operation id was set, but a location is also set wait for it be unset - return 1000; - } - } else { - try (var tabletsMutator = manager.getContext().getAmple().conditionallyMutateTablets()) { - - tabletsMutator.mutateTablet(splitInfo.getOriginal()).requireAbsentOperation() - .requireSame(tabletMetadata, LOCATION, PREV_ROW).putOperation(opid) - .submit(tmeta -> opid.equals(tmeta.getOperationId())); - - Map<KeyExtent,Ample.ConditionalResult> results = tabletsMutator.process(); - if (results.get(splitInfo.getOriginal()).getStatus() == Status.ACCEPTED) { - log.trace("Successfully set operation id for split {}", FateTxId.formatTid(tid)); - if (tabletMetadata.getLocation() == null) { - // the operation id was set and there is no location, so can move on - return 0; - } else { - // now that the operation id set, generate an event to unload the tablet - manager.getEventCoordinator().event(splitInfo.getOriginal(), - "Set operation id %s on tablet for split", FateTxId.formatTid(tid)); - // the operation id was set, but a location is also set wait for it be unset - return 1000; - } - } else { - log.trace("Failed to set operation id for split {}", FateTxId.formatTid(tid)); - // something changed with the tablet, so setting the operation id failed. Try again later - return 1000; - } - } } + return 1000; } @Override public Repo<Manager> call(long tid, Manager manager) throws Exception { - manager.getSplitter().removeSplitStarting(splitInfo.getOriginal()); - - TabletMetadata tabletMetadata = manager.getContext().getAmple() - .readTablet(splitInfo.getOriginal(), PREV_ROW, LOCATION, OPID); - - var opid = TabletOperationId.from(TabletOperationType.SPLITTING, tid); - - if (tabletMetadata == null || !opid.equals(tabletMetadata.getOperationId())) { - // the tablet no longer exists or we could not set the operation id, maybe another operation - // was running, lets not proceed with the split. - var optMeta = Optional.ofNullable(tabletMetadata); - log.trace("{} Not proceeding with split. extent:{} location:{} opid:{}", - FateTxId.formatTid(tid), splitInfo.getOriginal(), - optMeta.map(TabletMetadata::getLocation).orElse(null), - optMeta.map(TabletMetadata::getOperationId).orElse(null)); - return null; + if (!canContinue(tid, manager)) { + throw new IllegalStateException( + "Tablet is in an unexpected condition: " + splitInfo.getOriginal()); Review Comment: This is changing the behavior, its throwing exceptions under different conditions. ########## server/manager/src/main/java/org/apache/accumulo/manager/tableOps/split/UpdateTablets.java: ########## @@ -37,58 +37,55 @@ import org.apache.accumulo.core.metadata.schema.TabletOperationId; import org.apache.accumulo.core.metadata.schema.TabletOperationType; import org.apache.accumulo.manager.Manager; -import org.apache.accumulo.manager.tableOps.ManagerRepo; +import org.apache.accumulo.manager.tableOps.DestructiveTabletManagerRepo; import org.apache.accumulo.server.util.FileUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.Function; import com.google.common.base.Preconditions; -public class UpdateTablets extends ManagerRepo { +public class UpdateTablets extends DestructiveTabletManagerRepo { private static final Logger log = LoggerFactory.getLogger(UpdateTablets.class); private static final long serialVersionUID = 1L; private final SplitInfo splitInfo; private final List<String> dirNames; public UpdateTablets(SplitInfo splitInfo, List<String> dirNames) { + super(TabletOperationType.SPLITTING, splitInfo.getOriginal()); this.splitInfo = splitInfo; this.dirNames = dirNames; } @Override public Repo<Manager> call(long tid, Manager manager) throws Exception { + TabletMetadata tabletMetadata = manager.getContext().getAmple().readTablet(splitInfo.getOriginal()); var opid = TabletOperationId.from(TabletOperationType.SPLITTING, tid); - if (tabletMetadata == null) { - // check to see if this operation has already succeeded. - TabletMetadata newTabletMetadata = - manager.getContext().getAmple().readTablet(splitInfo.getTablets().last()); - - if (newTabletMetadata != null && opid.equals(newTabletMetadata.getOperationId())) { - // have already created the new tablet and failed before we could return the next step, so - // lets go ahead and return the next step. - log.trace( - "{} creating new tablet was rejected because it existed, operation probably failed before.", - FateTxId.formatTid(tid)); - return new DeleteOperationIds(splitInfo); - } else { - throw new IllegalStateException("Tablet is in an unexpected condition " - + splitInfo.getOriginal() + " " + (newTabletMetadata == null) + " " - + (newTabletMetadata == null ? null : newTabletMetadata.getOperationId())); - } - } + if (!canContinue(tid, manager)) { Review Comment: I don't think this method should be called here. The split code creates new tablets and eventually changes the original tablets prevrow which makes could make it look like it does not exists. In process failure conditions where this code might be rerunning it checking these conditions to see where its at and what is left to be done. However its expected that the code will see either the old tablet or new tablet with an operation id set, if sees neither its an indication of a bug. So I think it should always continue to check if the new tablet exists when the old one does not. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: notifications-unsubscr...@accumulo.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org