This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new de77326a1 TIKA-3882 -- need to add doc_as_upsert to opensearch emitter
for embedded files...ugh :(
de77326a1 is described below
commit de77326a10ea13f380c5f966d02b7e20b9f142db
Author: tballison <[email protected]>
AuthorDate: Mon Oct 17 11:03:10 2022 -0400
TIKA-3882 -- need to add doc_as_upsert to opensearch emitter for embedded
files...ugh :(
---
.../pipes/xsearch/tests/TikaPipesXSearchBase.java | 63 ++++++++++++++++++++++
.../pipes/emitter/opensearch/OpenSearchClient.java | 1 +
2 files changed, 64 insertions(+)
diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/xsearch/tests/TikaPipesXSearchBase.java b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/xsearch/tests/TikaPipesXSearchBase.java
index 75b091943..3da8704d3 100644
--- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/xsearch/tests/TikaPipesXSearchBase.java
+++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/xsearch/tests/TikaPipesXSearchBase.java
@@ -258,6 +258,69 @@ public abstract class TikaPipesXSearchBase {
assertEquals(400, results.getStatus());
}
+ @Test
+ public void testUpsertSeparateDocsFSToOpenSearch() throws Exception {
+ //now test that this works with upsert
+ int numHtmlDocs = 42;
+ createTestHtmlFiles("Happiness", numHtmlDocs);
+ String endpoint = OPEN_SEARCH_ENDPOINT_BASE + TEST_INDEX;
+ sendMappings(endpoint, TEST_INDEX, "opensearch-mappings.json");
+
+ runPipes(OpenSearchEmitter.AttachmentStrategy.SEPARATE_DOCUMENTS,
+ OpenSearchEmitter.UpdateStrategy.UPSERT,
+ HandlerConfig.PARSE_MODE.RMETA, endpoint);
+
+ String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " +
+ "\"query\": \"happiness\" } } } }";
+
+ JsonResponse results = CLIENT.postJson(endpoint + "/_search", query);
+ assertEquals(200, results.getStatus());
+ assertEquals(numHtmlDocs + 1,
+ results.getJson().get("hits").get("total").get("value").asInt());
+
+ //now try match all
+ query = "{ " +
+ //"\"from\":0, \"size\":1000," +
+ "\"track_total_hits\": true, \"query\": { " +
+ "\"match_all\": {} } }";
+ results = CLIENT.postJson(endpoint + "/_search", query);
+ assertEquals(200, results.getStatus());
+ assertEquals(numHtmlDocs + 3 + 12, //3 for the mock docs,
+ // and the .docx file has 11 embedded files, plus itself
+ results.getJson().get("hits").get("total").get("value").asInt());
+
+ //now check out one of the embedded files
+ query = "{ \"track_total_hits\": true, \"query\": { \"query_string\": { " +
+ "\"default_field\": \"content\", " +
+ "\"query\": \"embed4 zip\" , \"minimum_should_match\":2 } } } ";
+ results = CLIENT.postJson(endpoint + "/_search", query);
+ assertEquals(200, results.getStatus());
+ assertEquals(1,
+ results.getJson().get("hits").get("total").get("value").asInt());
+ JsonNode source = results.getJson().get("hits").get("hits").get(0).get("_source");
+
+ Matcher m = Pattern.compile("\\Atest_recursive_embedded" +
+ ".docx-[0-9a-f]{8}-[0-9a-f]{4}-" +
+ "[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\\Z").matcher(
+ results.getJson().get("hits").get("hits").get(0).get("_id").asText()
+ );
+ assertTrue(m.find(), "test_recursive_embedded.docx-$guid");
+
+ assertNull(results.getJson().get("hits").get("hits").get(0).get("_routing"),
+ "test_recursive_embedded.docx");
+ assertNull(source.get("relation_type"), "test_recursive_embedded.docx");
+
+ assertEquals("application/zip", source.get("mime").asText());
+
+ //now make sure there are no children; this query should
+ //cause an exception because there are no relationships in the schema
+ query = "{ \"track_total_hits\": true, \"query\": { \"parent_id\": { " +
+ "\"type\": \"embedded\", " +
+ "\"id\": \"test_recursive_embedded.docx\" } } } ";
+ results = CLIENT.postJson(endpoint + "/_search", query);
+ assertEquals(400, results.getStatus());
+ }
+
@Test
public void testUpsert() throws Exception {
String endpoint = OPEN_SEARCH_ENDPOINT_BASE + TEST_INDEX;
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
index 3b8061a95..c757115a1 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
@@ -292,6 +292,7 @@ public class OpenSearchClient {
}
//end the "doc"
jsonGenerator.writeEndObject();
+ jsonGenerator.writeBooleanField("doc_as_upsert", true);
//end the metadata object
jsonGenerator.writeEndObject();
}