This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new de77326a1 TIKA-3882 -- need to add doc_as_upsert to opensearch emitter for embedded files...ugh :(
de77326a1 is described below

commit de77326a10ea13f380c5f966d02b7e20b9f142db
Author: tballison <[email protected]>
AuthorDate: Mon Oct 17 11:03:10 2022 -0400

    TIKA-3882 -- need to add doc_as_upsert to opensearch emitter for embedded files...ugh :(
---
 .../pipes/xsearch/tests/TikaPipesXSearchBase.java  | 63 ++++++++++++++++++++++
 .../pipes/emitter/opensearch/OpenSearchClient.java |  1 +
 2 files changed, 64 insertions(+)

diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/xsearch/tests/TikaPipesXSearchBase.java b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/xsearch/tests/TikaPipesXSearchBase.java
index 75b091943..3da8704d3 100644
--- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/xsearch/tests/TikaPipesXSearchBase.java
+++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/xsearch/tests/TikaPipesXSearchBase.java
@@ -258,6 +258,69 @@ public abstract class TikaPipesXSearchBase {
         assertEquals(400, results.getStatus());
     }
 
+    @Test
+    public void testUpsertSeparateDocsFSToOpenSearch() throws Exception {
+        //now test that this works with upsert
+        int numHtmlDocs = 42;
+        createTestHtmlFiles("Happiness", numHtmlDocs);
+        String endpoint = OPEN_SEARCH_ENDPOINT_BASE + TEST_INDEX;
+        sendMappings(endpoint, TEST_INDEX, "opensearch-mappings.json");
+
+        runPipes(OpenSearchEmitter.AttachmentStrategy.SEPARATE_DOCUMENTS,
+                OpenSearchEmitter.UpdateStrategy.UPSERT,
+                HandlerConfig.PARSE_MODE.RMETA, endpoint);
+
+        String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " +
+                "\"query\": \"happiness\" } } } }";
+
+        JsonResponse results = CLIENT.postJson(endpoint + "/_search", query);
+        assertEquals(200, results.getStatus());
+        assertEquals(numHtmlDocs + 1,
+                results.getJson().get("hits").get("total").get("value").asInt());
+
+        //now try match all
+        query = "{ " +
+                //"\"from\":0, \"size\":1000," +
+                "\"track_total_hits\": true, \"query\": { " +
+                "\"match_all\": {} } }";
+        results = CLIENT.postJson(endpoint + "/_search", query);
+        assertEquals(200, results.getStatus());
+        assertEquals(numHtmlDocs + 3 + 12, //3 for the mock docs,
+                // and the .docx file has 11 embedded files, plus itself
+                results.getJson().get("hits").get("total").get("value").asInt());
+
+        //now check out one of the embedded files
+        query = "{ \"track_total_hits\": true, \"query\": { \"query_string\": { " +
+                "\"default_field\": \"content\",  " +
+                "\"query\": \"embed4 zip\" , \"minimum_should_match\":2 } } } ";
+        results = CLIENT.postJson(endpoint + "/_search", query);
+        assertEquals(200, results.getStatus());
+        assertEquals(1,
+                results.getJson().get("hits").get("total").get("value").asInt());
+        JsonNode source = results.getJson().get("hits").get("hits").get(0).get("_source");
+
+        Matcher m = Pattern.compile("\\Atest_recursive_embedded" +
+                ".docx-[0-9a-f]{8}-[0-9a-f]{4}-" +
+                "[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\\Z").matcher(
+                results.getJson().get("hits").get("hits").get(0).get("_id").asText()
+        );
+        assertTrue(m.find(), "test_recursive_embedded.docx-$guid");
+
+        assertNull(results.getJson().get("hits").get("hits").get(0).get("_routing"),
+                "test_recursive_embedded.docx");
+        assertNull(source.get("relation_type"), "test_recursive_embedded.docx");
+
+        assertEquals("application/zip", source.get("mime").asText());
+
+        //now make sure there are no children; this query should
+        //cause an exception because there are no relationships in the schema
+        query = "{ \"track_total_hits\": true, \"query\": { \"parent_id\": { " +
+                "\"type\": \"embedded\",  " +
+                "\"id\": \"test_recursive_embedded.docx\" } } } ";
+        results = CLIENT.postJson(endpoint + "/_search", query);
+        assertEquals(400, results.getStatus());
+    }
+
     @Test
     public void testUpsert() throws Exception {
         String endpoint = OPEN_SEARCH_ENDPOINT_BASE + TEST_INDEX;
diff --git a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
index 3b8061a95..c757115a1 100644
--- a/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
+++ b/tika-pipes/tika-emitters/tika-emitter-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchClient.java
@@ -292,6 +292,7 @@ public class OpenSearchClient {
                 }
                 //end the "doc"
                 jsonGenerator.writeEndObject();
+                jsonGenerator.writeBooleanField("doc_as_upsert", true);
                 //end the metadata object
                 jsonGenerator.writeEndObject();
             }

Reply via email to