This is an automated email from the ASF dual-hosted git repository. cgivre pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push: new 9551372cd5 DRILL-8320: Prevent Infinite Pagination for Index Paginator (#2660) 9551372cd5 is described below commit 9551372cd5678ff4ab7d40425b51ae485e202881 Author: Charles S. Givre <cgi...@apache.org> AuthorDate: Wed Sep 28 14:20:02 2022 -0400 DRILL-8320: Prevent Infinite Pagination for Index Paginator (#2660) * DRILL-8320: Prevent Infinite Pagination for Index Paginator --- contrib/storage-http/.gitignore | 1 + .../drill/exec/store/http/HttpBatchReader.java | 28 ++++++++++- .../drill/exec/store/http/HttpPaginatorConfig.java | 7 +-- .../exec/store/http/HttpScanBatchCreator.java | 6 +-- .../exec/store/http/paginator/IndexPaginator.java | 22 +++++++-- .../drill/exec/store/http/TestPagination.java | 54 +++++++++++++++++++++- .../resources/data/nested_pagination_fields.json | 17 +++++++ .../resources/data/nested_pagination_fields2.json | 14 ++++++ .../easy/json/parser/JsonStructureParser.java | 1 - 9 files changed, 131 insertions(+), 19 deletions(-) diff --git a/contrib/storage-http/.gitignore b/contrib/storage-http/.gitignore new file mode 100644 index 0000000000..710368b187 --- /dev/null +++ b/contrib/storage-http/.gitignore @@ -0,0 +1 @@ +./src/test/resources/logback-test.xml diff --git a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpBatchReader.java b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpBatchReader.java index e2bfee9114..6f312b0e9b 100644 --- a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpBatchReader.java +++ b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpBatchReader.java @@ -111,6 +111,8 @@ public class HttpBatchReader implements ManagedReader<SchemaNegotiator> { }; negotiator.setErrorContext(errorContext); + logger.debug("Executing request with url: {}", url); + // Http client setup SimpleHttp http = SimpleHttp.builder() .scanDefn(subScan) @@ -359,13 +361,32 @@ public class HttpBatchReader implements ManagedReader<SchemaNegotiator> { if (StringUtils.isNotEmpty(indexPaginator.getNextPageParam())) { indexPaginator.setNextPageValue(paginationFields.get(indexPaginator.getNextPageParam()).toString()); } + } else { + // This covers the case of keyset/index pagination where there isn't a boolean parameter to indicate whether there are more results. + // In this case, we will interpret the absence of the pagination field, receiving the same value, or a null value as the marker to stop pagination. + if ( (!paginationFields.containsKey(indexPaginator.getIndexParam())) || + paginationFields.get(indexPaginator.getIndexParam()) == null + ) { + // End pagination + paginator.notifyPartialPage(); + } else { + // Otherwise, check to see if the field is present but empty, or contains the value from the last page. + // This will prevent runaway pagination calls. + String indexParameter = paginationFields.get(indexPaginator.getIndexParam()).toString(); + // Empty value or the last value is the same as the current one. + if (StringUtils.isEmpty(indexParameter) || (StringUtils.equals(indexParameter, indexPaginator.getLastIndexValue()))) { + paginator.notifyPartialPage(); + } else { + // Whew! We made it... get the next page. + indexPaginator.setIndexValue(indexParameter); + } + } } } @Override public boolean next() { boolean result = jsonLoader.readBatch(); - // This code implements the index/keyset pagination. This pagination method // uses a value returned in the current result set as the starting point for the // next page. Some APIs will have a boolean parameter to indicate that there are @@ -373,9 +394,12 @@ public class HttpBatchReader implements ManagedReader<SchemaNegotiator> { // it is necessary to grab these values from the returned data. if (paginator != null && paginator.getMode() == PaginatorMethod.INDEX) { // First check to see if the limit has been reached. If so, mark the end of pagination. - if (maxRecords > 0 && (resultSetLoader.totalRowCount() > maxRecords)) { + long totalRowCount = resultSetLoader.totalRowCount(); + if (maxRecords > 0 && totalRowCount >= maxRecords) { // End Pagination paginator.notifyPartialPage(); + // Returning false here because if there is a partial page, we want the reader to stop and no further batches to be created. + return false; } else { populateIndexPaginator(); } diff --git a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpPaginatorConfig.java b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpPaginatorConfig.java index ad474f9bac..e6b226f8c6 100644 --- a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpPaginatorConfig.java +++ b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpPaginatorConfig.java @@ -120,12 +120,7 @@ public class HttpPaginatorConfig { break; case INDEX: // Either the nextPageParam OR the indexParam must be populated - if (StringUtils.isEmpty(this.hasMoreParam)) { - throw UserException - .validationError() - .message("Invalid paginator configuration. For INDEX pagination, the hasMoreParam must be defined.") - .build(logger); - } else if ((StringUtils.isEmpty(this.nextPageParam) && StringUtils.isNotEmpty(this.indexParam)) && + if ((StringUtils.isEmpty(this.nextPageParam) && StringUtils.isNotEmpty(this.indexParam)) && (StringUtils.isNotEmpty(this.nextPageParam) && StringUtils.isEmpty(this.indexParam))) { throw UserException .validationError() diff --git a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpScanBatchCreator.java b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpScanBatchCreator.java index edc7dee1f9..3de783f7da 100644 --- a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpScanBatchCreator.java +++ b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpScanBatchCreator.java @@ -86,9 +86,6 @@ public class HttpScanBatchCreator implements BatchCreator<HttpSubScan> { ReaderFactory readerFactory = new HttpReaderFactory(subScan); builder.setReaderFactory(readerFactory); builder.nullType(Types.optional(MinorType.VARCHAR)); - - // TODO Add page size limit here to ScanFramework Builder - return builder; } @@ -139,8 +136,9 @@ public class HttpScanBatchCreator implements BatchCreator<HttpSubScan> { paginatorConfig.pageSizeParam()); } else if (paginatorConfig.getMethodType() == PaginatorMethod.INDEX) { paginator = new IndexPaginator(urlBuilder, + 0, // Page size not used for Index/Keyset pagination subScan.maxRecords(), - 0, paginatorConfig.hasMoreParam(), + paginatorConfig.hasMoreParam(), paginatorConfig.indexParam(), paginatorConfig.nextPageParam()); } diff --git a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/paginator/IndexPaginator.java b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/paginator/IndexPaginator.java index 22aa531f54..67bc89c4a6 100644 --- a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/paginator/IndexPaginator.java +++ b/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/paginator/IndexPaginator.java @@ -20,19 +20,25 @@ package org.apache.drill.exec.store.http.paginator; import okhttp3.HttpUrl.Builder; import org.apache.commons.lang3.StringUtils; +import org.apache.drill.common.exceptions.UserException; import org.apache.drill.exec.store.http.HttpPaginatorConfig.PaginatorMethod; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; import java.util.NoSuchElementException; public class IndexPaginator extends Paginator { + private static final Logger logger = LoggerFactory.getLogger(IndexPaginator.class); private final String hasMoreParam; private final String indexParam; private final String nextPageParam; private String indexValue; + private String lastIndexValue; private Boolean hasMoreValue; private String nextPageValue; private int pageCount; @@ -75,9 +81,14 @@ public class IndexPaginator extends Paginator { } public void setNextPageValue(String nextPageValue) { + this.lastIndexValue = this.nextPageValue; this.nextPageValue = nextPageValue; } + public String getLastIndexValue() { + return lastIndexValue; + } + public boolean isFirstPage() { return pageCount < 1; } @@ -94,13 +105,14 @@ public class IndexPaginator extends Paginator { throw new NoSuchElementException(); } - if (StringUtils.isNotEmpty(nextPageValue)) { - // TODO figure this out... - } else if (StringUtils.isNotEmpty(indexValue)) { + if (StringUtils.isNotEmpty(indexValue)) { try { - indexValue = URLEncoder.encode(indexValue, "UTF-8"); + indexValue = URLEncoder.encode(indexValue, StandardCharsets.UTF_8.name()); } catch (UnsupportedEncodingException e) { - // Do nothing. + // Should never happen + throw UserException.internalError() + .message(e.getMessage()) + .build(logger); } builder.removeAllEncodedQueryParameters(indexParam); builder.addQueryParameter(indexParam, indexValue); diff --git a/contrib/storage-http/src/test/java/org/apache/drill/exec/store/http/TestPagination.java b/contrib/storage-http/src/test/java/org/apache/drill/exec/store/http/TestPagination.java index 328a447cb4..d04f230f3d 100644 --- a/contrib/storage-http/src/test/java/org/apache/drill/exec/store/http/TestPagination.java +++ b/contrib/storage-http/src/test/java/org/apache/drill/exec/store/http/TestPagination.java @@ -65,7 +65,8 @@ public class TestPagination extends ClusterTest { private static String TEST_JSON_INDEX_PAGE2; private static String TEST_JSON_INDEX_PAGE3; private static String TEST_JSON_INDEX_PAGE4; - + private static String TEST_JSON_NESTED_INDEX; + private static String TEST_JSON_NESTED_INDEX2; private static String TEST_XML_PAGE1; private static String TEST_XML_PAGE2; private static String TEST_XML_PAGE3; @@ -89,6 +90,9 @@ public class TestPagination extends ClusterTest { TEST_JSON_INDEX_PAGE3 = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/index_response3.json"), Charsets.UTF_8).read(); TEST_JSON_INDEX_PAGE4 = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/index_response4.json"), Charsets.UTF_8).read(); + TEST_JSON_NESTED_INDEX = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/nested_pagination_fields.json"), Charsets.UTF_8).read(); + TEST_JSON_NESTED_INDEX2 = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/nested_pagination_fields2.json"), Charsets.UTF_8).read(); + TEST_XML_PAGE1 = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/response_1.xml"), Charsets.UTF_8).read(); TEST_XML_PAGE2 = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/response_2.xml"), Charsets.UTF_8).read(); TEST_XML_PAGE3 = Files.asCharSource(DrillFileUtils.getResourceAsFile("/data/response_3.xml"), Charsets.UTF_8).read(); @@ -167,6 +171,31 @@ public class TestPagination extends ClusterTest { .inputType("json") .build(); + HttpPaginatorConfig nestedIndexPaginator = HttpPaginatorConfig.builder() + .indexParam("after") + .method("index") + .build(); + + HttpApiConfig mockJsonConfigWitNestedKeyset = HttpApiConfig.builder() + .url("http://localhost:8092/json") + .method("get") + .headers(headers) + .requireTail(false) + .paginator(nestedIndexPaginator) + .inputType("json") + .build(); + + HttpApiConfig mockJsonConfigWitNestedKeysetAndDataPath = HttpApiConfig.builder() + .url("http://localhost:8092/json") + .method("get") + .headers(headers) + .dataPath("results") + .requireTail(false) + .paginator(nestedIndexPaginator) + .inputType("json") + .build(); + + HttpApiConfig mockJsonConfigWithKeysetAndDataPath = HttpApiConfig.builder() .url("http://localhost:8092/json") .method("get") @@ -232,6 +261,8 @@ public class TestPagination extends ClusterTest { configs.put("csv_paginator", mockCsvConfigWithPaginator); configs.put("json_index", mockJsonConfigWithKeyset); configs.put("json_index_datapath", mockJsonConfigWithKeysetAndDataPath); + configs.put("nested_keyset", mockJsonConfigWitNestedKeyset); + configs.put("nested_keyset_and_datapath", mockJsonConfigWitNestedKeysetAndDataPath); configs.put("json_paginator", mockJsonConfigWithPaginator); configs.put("xml_paginator", mockXmlConfigWithPaginator); configs.put("xml_paginator_url_params", mockXmlConfigWithPaginatorAndUrlParams); @@ -348,6 +379,27 @@ public class TestPagination extends ClusterTest { assertEquals(4, count); } } + @Test + public void jsonQueryWithoutHasMore() throws Exception { + String sql = "SELECT * FROM `local`.`nested_keyset` LIMIT 4"; + try (MockWebServer server = startServer()) { + + server.enqueue(new MockResponse().setResponseCode(200).setBody(TEST_JSON_NESTED_INDEX)); + server.enqueue(new MockResponse().setResponseCode(200).setBody(TEST_JSON_NESTED_INDEX2)); + + List<QueryDataBatch> results = client.queryBuilder() + .sql(sql) + .results(); + + int count = 0; + for(QueryDataBatch b : results){ + count += b.getHeader().getRowCount(); + b.release(); + } + assertEquals(2, results.size()); + assertEquals(2, count); + } + } @Test public void simpleJSONPaginatorQueryWithoutLimit() throws Exception { diff --git a/contrib/storage-http/src/test/resources/data/nested_pagination_fields.json b/contrib/storage-http/src/test/resources/data/nested_pagination_fields.json new file mode 100644 index 0000000000..3ad3845cd4 --- /dev/null +++ b/contrib/storage-http/src/test/resources/data/nested_pagination_fields.json @@ -0,0 +1,17 @@ +{ + "results": [ + { + "id": "1", + "properties": { + "company": "HubSpot" + }, + "archived": false + } + ], + "paging": { + "next": { + "after": "2", + "link": "https://api.hubapi.com/crm/v3/objects/contacts?includeAssociations=true&properties=record_id&properties=company&properties=createdate&properties=email&properties=firstname&properties=lastmodifieddate&properties=lastname&properties=phone&properties=website&limit=1&after=2" + } + } +} diff --git a/contrib/storage-http/src/test/resources/data/nested_pagination_fields2.json b/contrib/storage-http/src/test/resources/data/nested_pagination_fields2.json new file mode 100644 index 0000000000..68f68d2f73 --- /dev/null +++ b/contrib/storage-http/src/test/resources/data/nested_pagination_fields2.json @@ -0,0 +1,14 @@ +{ + "results": [ + { + "id": "2", + "properties": { + "company": "HubSpot2" + }, + "archived": false + } + ], + "paging": { + + } +} diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/parser/JsonStructureParser.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/parser/JsonStructureParser.java index 03fa51b695..4d33403070 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/parser/JsonStructureParser.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/parser/JsonStructureParser.java @@ -264,7 +264,6 @@ public class JsonStructureParser { } while (true) { try { - // System.out.println(tokenizer.stringValue()); return rootState.parseRoot(tokenizer); } catch (RecoverableJsonException e) { if (! recover()) {