[ 
https://issues.apache.org/jira/browse/TIKA-4229?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17850708#comment-17850708
 ] 

ASF GitHub Bot commented on TIKA-4229:
--------------------------------------

bartek commented on code in PR #1698:
URL: https://github.com/apache/tika/pull/1698#discussion_r1620663162


##########
tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcher.java:
##########
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetchers.microsoftgraph;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Map;
+
+import com.azure.identity.ClientCertificateCredentialBuilder;
+import com.azure.identity.ClientSecretCredentialBuilder;
+import com.microsoft.graph.serviceclient.GraphServiceClient;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.pipes.fetcher.AbstractFetcher;
+import 
org.apache.tika.pipes.fetchers.microsoftgraph.config.ClientCertificateCredentialsConfig;
+import 
org.apache.tika.pipes.fetchers.microsoftgraph.config.ClientSecretCredentialsConfig;
+import 
org.apache.tika.pipes.fetchers.microsoftgraph.config.MsGraphFetcherConfig;
+
+/**
+ * Fetches files from Microsoft Graph API.
+ * Fetch keys are ${siteDriveId},${driveItemId}
+ */
+public class MicrosoftGraphFetcher extends AbstractFetcher implements 
Initializable {
+    private static final Logger LOGGER = 
LoggerFactory.getLogger(MicrosoftGraphFetcher.class);
+    private GraphServiceClient graphClient;
+    private MsGraphFetcherConfig msGraphFetcherConfig;
+    private long[] throttleSeconds;
+
+    public MicrosoftGraphFetcher() {
+
+    }
+
+    public MicrosoftGraphFetcher(MsGraphFetcherConfig msGraphFetcherConfig) {
+        this.msGraphFetcherConfig = msGraphFetcherConfig;
+    }
+
+    /**
+     * Set seconds to throttle retries as a comma-delimited list, e.g.: 
30,60,120,600
+     *
+     * @param commaDelimitedLongs
+     * @throws TikaConfigException
+     */
+    @Field
+    public void setThrottleSeconds(String commaDelimitedLongs) throws 
TikaConfigException {
+        String[] longStrings = commaDelimitedLongs.split(",");
+        long[] seconds = new long[longStrings.length];
+        for (int i = 0; i < longStrings.length; i++) {
+            try {
+                seconds[i] = Long.parseLong(longStrings[i]);
+            } catch (NumberFormatException e) {
+                throw new TikaConfigException(e.getMessage());
+            }
+        }
+        setThrottleSeconds(seconds);
+    }
+
+    public void setThrottleSeconds(long[] throttleSeconds) {
+        this.throttleSeconds = throttleSeconds;
+    }
+
+    @Override
+    public void initialize(Map<String, Param> map) {
+        String[] scopes = msGraphFetcherConfig.getScopes().toArray(new 
String[0]);
+        if (msGraphFetcherConfig.getCredentials() instanceof 
ClientCertificateCredentialsConfig) {
+            ClientCertificateCredentialsConfig credentials =
+                    (ClientCertificateCredentialsConfig) 
msGraphFetcherConfig.getCredentials();
+            graphClient = new GraphServiceClient(
+                    new 
ClientCertificateCredentialBuilder().clientId(credentials.getClientId())
+                            
.tenantId(credentials.getTenantId()).pfxCertificate(
+                                    new 
ByteArrayInputStream(credentials.getCertificateBytes()))
+                            
.clientCertificatePassword(credentials.getCertificatePassword())
+                            .build(), scopes);
+        } else if (msGraphFetcherConfig.getCredentials() instanceof 
ClientSecretCredentialsConfig) {
+            ClientSecretCredentialsConfig credentials =
+                    (ClientSecretCredentialsConfig) 
msGraphFetcherConfig.getCredentials();
+            graphClient = new GraphServiceClient(
+                    new 
ClientSecretCredentialBuilder().tenantId(credentials.getTenantId())
+                            .clientId(credentials.getClientId())
+                            
.clientSecret(credentials.getClientSecret()).build(), scopes);
+        }
+    }
+
+    @Override
+    public void checkInitialization(InitializableProblemHandler 
initializableProblemHandler)
+            throws TikaConfigException {
+    }
+
+    @Override
+    public InputStream fetch(String fetchKey, Metadata metadata) throws 
TikaException, IOException {
+        int tries = 0;
+        Exception ex;
+        do {
+            try {
+                long start = System.currentTimeMillis();
+                String[] fetchKeySplit = fetchKey.split(",");

Review Comment:
   I'm stepping back on this comment. After reviewing the responses from the MS 
Graph API, there is no simple way (without creating additional resources in the 
API) to get a long-lived download URL. The URLs it provides are short-lived 
(typically one hour) and carry a temporary auth token. This is not ideal for Tika.
   
   The drive item ID approach here I believe is the right one. I'll be testing 
it soon.





> add microsoft graph fetcher
> ---------------------------
>
>                 Key: TIKA-4229
>                 URL: https://issues.apache.org/jira/browse/TIKA-4229
>             Project: Tika
>          Issue Type: New Feature
>          Components: tika-pipes
>            Reporter: Nicholas DiPiazza
>            Priority: Major
>
> add a tika pipes fetcher capable of fetching files from MS graph api



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to