[ https://issues.apache.org/jira/browse/TINKERPOP-2958?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17731756#comment-17731756 ]
ASF GitHub Bot commented on TINKERPOP-2958: ------------------------------------------- lyndonbauto commented on PR #2090: URL: https://github.com/apache/tinkerpop/pull/2090#issuecomment-1587954864 @kenhuuu > Hi Lyndon, is it possible for you to share what kind of testing you were running that showed this issue? Hey Ken, yeah sure, this isn't pretty code but here it is: ``` package com.aerospike.firefly.server; import com.aerospike.firefly.Server; import org.apache.tinkerpop.gremlin.driver.Cluster; import org.apache.tinkerpop.gremlin.driver.remote.DriverRemoteConnection; import org.apache.tinkerpop.gremlin.process.traversal.dsl.graph.GraphTraversalSource; import org.apache.tinkerpop.gremlin.structure.Property; import org.junit.Test; import java.util.ArrayList; import java.util.List; import java.util.Random; import java.util.Timer; import java.util.TimerTask; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.stream.Collectors; import static org.apache.tinkerpop.gremlin.driver.Tokens.ARGS_EVAL_TIMEOUT; import static org.apache.tinkerpop.gremlin.process.traversal.AnonymousTraversalSource.traversal; public class TestServer { private static final long SECOND = 1000; private static final long MINUTE = 60 * SECOND; private static final long TEST_DURATION = 4 * MINUTE; private long totalExecuted = 0; Server server; @Test public void testServer() throws Exception { server = Server.main(new String[] {"../conf/firefly-gremlin-server-local.yaml"}); HEAP_REPORTING_TIMER.schedule(new HeapReport(), 0, 5000); exposeLeak(); } private static final Timer HEAP_REPORTING_TIMER = new Timer(true); private class HeapReport extends TimerTask { @Override public void run() { System.gc(); System.gc(); System.gc(); System.out.printf("Heap size: %d MB%n", Runtime.getRuntime().totalMemory() / 1024 / 1024); System.out.printf("Executed %d queries%n", totalExecuted); } } public void exposeLeak() throws Exception 
{ final long startTime = System.currentTimeMillis(); Cluster cluster = Cluster.build().addContactPoint("localhost").port(8182).create(); DriverRemoteConnection drc = DriverRemoteConnection.using(cluster, "g"); // final DriverRemoteConnection drc = DriverRemoteConnection.using("localhost", 8182, "g"); final GraphTraversalSource g = traversal().withRemote(drc); final ExecutorService executorService = Executors.newFixedThreadPool(16); g.with(ARGS_EVAL_TIMEOUT, 1L).V().drop().iterate(); for (int i = 0; i < 25000; i++) { g.addV("randomVertex"). property("last_seen", String.format("%d", System.currentTimeMillis())). property("something", String.format("%d", i)). property("otherThing", String.format("%d", i)). property("anotherThing", String.format("%d", i)). property("yetAnotherThing", String.format("%d", i)). property("andAnotherThing", String.format("%d", i)). property("andYetAnotherThing", String.format("%d", i)). iterate(); } final List<Object> ids = g.V().id().toList(); final List<Future<List<? extends Property>>> futures = new ArrayList<>(); while (System.currentTimeMillis() - startTime <= TEST_DURATION) { while (futures.size() > 500) { Thread.sleep(100); final List<Future<List<? 
extends Property>>> completedFutures = futures.stream().filter(Future::isDone).collect(Collectors.toList()); completedFutures.forEach(f -> { try { f.get(); } catch (Exception e) { e.printStackTrace(); } }); totalExecuted += completedFutures.size(); futures.removeAll(completedFutures); } final Random rand = new Random(); final List<Object> idz = new ArrayList<>(); for (int i = 0; i < 9; i++) { idz.add(ids.get(rand.nextInt(ids.size()))); } futures.add(executorService.submit(() -> g.V(idz).properties("last_seen").toList())); } futures.forEach(f-> { try { f.get(); } catch (Exception e) { e.printStackTrace(); } }); drc.close(); Thread.sleep(30000); } } ``` The server code is: ``` package com.aerospike.firefly; import org.apache.tinkerpop.gremlin.server.GremlinServer; import org.apache.tinkerpop.gremlin.server.Settings; import org.apache.tinkerpop.gremlin.server.util.ServerGremlinExecutor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.concurrent.CompletableFuture; public class Server { private static final Logger logger = LoggerFactory.getLogger(Server.class); private GremlinServer gremlinServer = null; private final String confPath; private CompletableFuture<Void> serverStarted = null; private CompletableFuture<Void> serverStopped = null; public Server(final String file) { confPath = file; } public static Server main(final String[] args) { if (args.length != 1) { System.err.println("Usage: Server <conf file>"); System.exit(1); } String file = args[0]; Server fireflyServer = new Server(file); fireflyServer.start().exceptionally(t -> { logger.error("Firefly Server was unable to start and will now begin shutdown", t); fireflyServer.stop().join(); return null; }).join(); return fireflyServer; } public synchronized CompletableFuture<Void> start() { if (serverStarted != null) { return serverStarted; } serverStarted = new CompletableFuture<>(); try { Settings settings = Settings.read(confPath); gremlinServer = new GremlinServer(settings); 
serverStarted = CompletableFuture.allOf(gremlinServer.start()); } catch (Exception ex) { serverStarted.completeExceptionally(ex); } return serverStarted; } public synchronized CompletableFuture<Void> stop() { if (gremlinServer == null) { return CompletableFuture.completedFuture(null); } if (serverStopped != null) { return serverStopped; } serverStopped = gremlinServer.stop(); return serverStopped; } } ``` which I adapted from the JanusGraphServer implementation specifically to test this, since I didn't have any Java server bootstrapping code. Basically you just run it and watch the memory: for Aerospike Graph, after a few minutes we see about 1 GB total if the bug is not present; if the bug is present, I see ~1 GB per minute of growth and ~3-4 GB at the end of the test. This is kind of obvious, but make sure the evaluationTimeout in the server yaml is large (mine was 20 minutes). > ScheduledExecutorService for timeouts are never cancelled > --------------------------------------------------------- > > Key: TINKERPOP-2958 > URL: https://issues.apache.org/jira/browse/TINKERPOP-2958 > Project: TinkerPop > Issue Type: Bug > Components: server > Affects Versions: 3.6.3, 3.6.4 > Reporter: Lyndon Bauto > Assignee: Lyndon Bauto > Priority: Major > > A scheduled task was added to timeout a request in the future. > > This task is not cancelled when the request is completed before the timeout. > With a long timeout, this can lead to extreme memory usage (~1 GB growth per > minute on my laptop with my graph implementation). -- This message was sent by Atlassian Jira (v8.20.10#820010)