tballison commented on code in PR #558:
URL: https://github.com/apache/tika/pull/558#discussion_r871458924


##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java:
##########
@@ -0,0 +1,208 @@
+package org.apache.tika.parser.dwg;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+import java.util.function.Consumer;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.commons.io.FileUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.core.json.JsonReadFeature;
+
+
+public class DWGReadParser extends AbstractDWGParser {
+    private static final Logger LOG = LoggerFactory.getLogger(DWGParser.class);
+    /**
+     * 
+     */
+    private static final long serialVersionUID = 7983127145030096837L;
+    private static MediaType TYPE = MediaType.image("vnd.dwg");
+
+    public Set < MediaType > getSupportedTypes(ParseContext context) {
+        return Collections.singleton(TYPE);
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext context)
+    throws IOException, SAXException, TikaException {
+
+        configure(context);
+        DWGParserConfig dwgc = context.get(DWGParserConfig.class);
+        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
+
+        xhtml.startDocument();
+        UUID uuid = UUID.randomUUID();
+        File tmpFileOut = File.createTempFile(uuid + "dwgreadout", ".json");
+        File tmpFileOutCleaned = File.createTempFile(uuid + "dwgreadoutclean", 
".json");
+        File tmpFileIn = File.createTempFile(uuid + "dwgreadin", ".dwg");
+        try {
+
+            FileUtils.copyInputStreamToFile(stream, tmpFileIn);
+
+            List < String > command = 
Arrays.asList(dwgc.getDwgReadExecutable(), "-O", "JSON", "-o",

Review Comment:
   Consider using 
https://github.com/apache/tika/blob/main/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java#L165
 instead of hand-rolling ProcessBuilder?
   
   We definitely need a timeout; the parser should never waitFor() forever.



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java:
##########
@@ -0,0 +1,208 @@
+package org.apache.tika.parser.dwg;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+import java.util.function.Consumer;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.commons.io.FileUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.core.json.JsonReadFeature;
+
+
+public class DWGReadParser extends AbstractDWGParser {
+    private static final Logger LOG = LoggerFactory.getLogger(DWGParser.class);
+    /**
+     * 
+     */
+    private static final long serialVersionUID = 7983127145030096837L;
+    private static MediaType TYPE = MediaType.image("vnd.dwg");
+
+    public Set < MediaType > getSupportedTypes(ParseContext context) {
+        return Collections.singleton(TYPE);
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext context)
+    throws IOException, SAXException, TikaException {
+
+        configure(context);
+        DWGParserConfig dwgc = context.get(DWGParserConfig.class);
+        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
+
+        xhtml.startDocument();
+        UUID uuid = UUID.randomUUID();
+        File tmpFileOut = File.createTempFile(uuid + "dwgreadout", ".json");
+        File tmpFileOutCleaned = File.createTempFile(uuid + "dwgreadoutclean", 
".json");
+        File tmpFileIn = File.createTempFile(uuid + "dwgreadin", ".dwg");
+        try {
+
+            FileUtils.copyInputStreamToFile(stream, tmpFileIn);
+
+            List < String > command = 
Arrays.asList(dwgc.getDwgReadExecutable(), "-O", "JSON", "-o",
+                tmpFileOut.getCanonicalPath(), tmpFileIn.getCanonicalPath());
+            Process p = new ProcessBuilder(command).start();
+
+            try {
+                int exitCode = p.waitFor();
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+
+            if (dwgc.isCleanDwgReadOutput()) {
+                // dwgread sometimes creates strings with invalid utf-8 
sequences or invalid json (nan instead of NaN). replace them

Review Comment:
   Oh, my head hurts.  Thank you for this.



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/AbstractDWGParser.java:
##########
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dwg;
+
+
+import org.apache.tika.config.Field;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+
+
+
+
+public abstract class AbstractDWGParser extends AbstractParser {
+
+
+       /**
+        * 
+        */
+       private static final long serialVersionUID = 6261810259683381984L;
+       private final DWGParserConfig defaultDwgParserConfig = new 
DWGParserConfig();
+
+    public void configure(ParseContext parseContext) {
+       DWGParserConfig dwgParserConfig =  
parseContext.get(DWGParserConfig.class, defaultDwgParserConfig);
+        parseContext.set(DWGParserConfig.class, dwgParserConfig);
+    }
+
+
+    String getDwgReadExecutable() {
+        return defaultDwgParserConfig.getDwgReadExecutable();
+    }
+    
+    @Field
+    public void setDwgReadExecutable(String dwgReadExecutable) {
+       defaultDwgParserConfig.setDwgReadExecutable(dwgReadExecutable);
+    }
+    
+    boolean isCleanDwgReadOutput() {
+        return defaultDwgParserConfig.isCleanDwgReadOutput();
+    }
+    
+    @Field
+    public void setCleanDwgReadOutput(boolean cleanDwgReadOutput) {
+       defaultDwgParserConfig.setCleanDwgReadOutput(cleanDwgReadOutput);
+    }
+    
+    int getCleanDwgReadOutputBatchSize() {
+        return defaultDwgParserConfig.getCleanDwgReadOutputBatchSize();
+    }
+    
+    @Field
+    public void setCleanDwgReadOutputBatchSize(int 
cleanDwgReadOutputBatchSize) {
+       
defaultDwgParserConfig.setCleanDwgReadOutputBatchSize(cleanDwgReadOutputBatchSize);
+    }
+    String getCleanDwgReadRegexToReplace() {
+        return defaultDwgParserConfig.getCleanDwgReadRegexToReplace();
+    }
+    
+    @Field
+    public void setCleanDwgReadRegexToReplace(String 
cleanDwgReadRegexToReplace) {
+       
defaultDwgParserConfig.setCleanDwgReadRegexToReplace(cleanDwgReadRegexToReplace);
+    }
+    String getCleanDwgReadReplaceWith() {
+        return defaultDwgParserConfig.getCleanDwgReadReplaceWith();
+    }
+    
+    @Field
+    public void setCleanDwgReadReplaceWith(String cleanDwgReadReplaceWith) {
+       
defaultDwgParserConfig.setCleanDwgReadReplaceWith(cleanDwgReadReplaceWith);
+    }
+    

Review Comment:
   You may want to implement Initializable and check whether the dwg executable 
is actually working.  Please add lots of logging if a user specified a dwg 
executable but it isn't working.
   
   Something along the lines of: 
https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java#L529



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java:
##########
@@ -94,6 +93,14 @@ public Set<MediaType> getSupportedTypes(ParseContext 
context) {
 
     public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata,
                       ParseContext context) throws IOException, TikaException, 
SAXException {
+       
+       configure(context);
+       DWGParserConfig dwgc = context.get(DWGParserConfig.class);
+
+if(!dwgc.getDwgReadExecutable().isEmpty()) {

Review Comment:
   Do I remember correctly that the dwg commandline handles different versions 
of files than our existing parser?  If so, we'll want to branch based on 
detected mime type, not whether the executable is available.
   
   If this is as intended, please ignore this comment.



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java:
##########
@@ -128,7 +135,7 @@ public void parse(InputStream stream, ContentHandler 
handler, Metadata metadata,
                 throw new TikaException("Unsupported AutoCAD drawing version: 
" + version);
         }
 
-        xhtml.endDocument();
+        xhtml.endDocument();}

Review Comment:
   Formatting: the closing brace is on the same line as xhtml.endDocument().  Did checkstyle let this one through?



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java:
##########
@@ -29,166 +29,179 @@
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.sax.BodyContentHandler;
 
 public class DWGParserTest extends TikaTest {
 
+//    @Test
+//    public void testDWG2000Parser() throws Exception {
+//        InputStream input =
+//                
DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2000.dwg");
+//        testParserAlt(input);
+//    }
+//
+//    @Test
+//    public void testDWG2004Parser() throws Exception {
+//        InputStream input =
+//                
DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2004.dwg");
+//        testParser(input);
+//    }
+//
+//    @Test
+//    public void testDWG2004ParserNoHeaderAddress() throws Exception {
+//        InputStream input = DWGParserTest.class
+//                
.getResourceAsStream("/test-documents/testDWG2004_no_header.dwg");
+//        testParserNoHeader(input);
+//    }
+//
+//    @Test
+//    public void testDWG2007Parser() throws Exception {
+//        InputStream input =
+//                
DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2007.dwg");
+//        testParser(input);
+//    }
+//
+//    @Test
+//    public void testDWG2010Parser() throws Exception {
+//        InputStream input =
+//                
DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2010.dwg");
+//        testParser(input);
+//    }
+//
+////    @Test
+////    public void testDWG2010CustomPropertiesParser() throws Exception {
+////        // Check that standard parsing works
+////        InputStream testInput = DWGParserTest.class
+////                
.getResourceAsStream("/test-documents/testDWG2010_custom_props.dwg");
+////        testParser(testInput);
+////
+////        // Check that custom properties with alternate padding work
+////        try (InputStream input = DWGParserTest.class
+////                
.getResourceAsStream("/test-documents/testDWG2010_custom_props.dwg")) {
+////            Metadata metadata = new Metadata();
+////            ContentHandler handler = new BodyContentHandler();
+////            new DWGParser().parse(input, handler, metadata, null);
+////
+////            assertEquals("valueforcustomprop1",
+////                    metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + 
"customprop1"));
+////            assertEquals("valueforcustomprop2",
+////                    metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + 
"customprop2"));
+////        }
+////    }
+//
+//    @Test
+//    public void testDWGMechParser() throws Exception {
+//        String[] types =
+//                new String[]{"6", "2004", "2004DX", "2005", "2006", "2007", 
"2008", "2009", "2010",
+//                        "2011"};
+//        for (String type : types) {
+//            InputStream input = DWGParserTest.class
+//                    .getResourceAsStream("/test-documents/testDWGmech" + 
type + ".dwg");
+//            testParserAlt(input);
+//        }
+//    }
+//
+//    @SuppressWarnings("deprecation")
+//    private void testParser(InputStream input) throws Exception {
+//        try {
+//            Metadata metadata = new Metadata();
+//            ContentHandler handler = new BodyContentHandler();
+//            new DWGParser().parse(input, handler, metadata);
+//
+//            assertEquals("image/vnd.dwg", 
metadata.get(Metadata.CONTENT_TYPE));
+//
+//            assertEquals("The quick brown fox jumps over the lazy dog",
+//                    metadata.get(TikaCoreProperties.TITLE));
+//            assertEquals("Gym class featuring a brown fox and lazy dog",
+//                    metadata.get(TikaCoreProperties.DESCRIPTION));
+//            assertEquals("Nevin Nollop", 
metadata.get(TikaCoreProperties.CREATOR));
+//            assertContains("Pangram, fox, dog",
+//                    
Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT)));
+//            assertEquals("Lorem ipsum", 
metadata.get(TikaCoreProperties.COMMENTS).substring(0, 11));
+//            assertEquals("http://www.alfresco.com";, 
metadata.get(TikaCoreProperties.RELATION));
+//
+//            String content = handler.toString();
+//            assertContains("The quick brown fox jumps over the lazy dog", 
content);
+//            assertContains("Gym class", content);
+//            assertContains("www.alfresco.com", content);
+//        } finally {
+//            input.close();
+//        }
+//    }
+//
+//    @SuppressWarnings("deprecation")
+//    private void testParserNoHeader(InputStream input) throws Exception {
+//        try {
+//            Metadata metadata = new Metadata();
+//            ContentHandler handler = new BodyContentHandler();
+//            new DWGParser().parse(input, handler, metadata);
+//
+//            assertEquals("image/vnd.dwg", 
metadata.get(Metadata.CONTENT_TYPE));
+//
+//            assertNull(metadata.get(TikaCoreProperties.TITLE));
+//            assertNull(metadata.get(TikaCoreProperties.DESCRIPTION));
+//            assertNull(metadata.get(TikaCoreProperties.CREATOR));
+//            assertNull(metadata.get(TikaCoreProperties.SUBJECT));
+//            assertNull(metadata.get(TikaCoreProperties.COMMENTS));
+//            assertNull(metadata.get(TikaCoreProperties.RELATION));
+//
+//            String content = handler.toString();
+//            assertEquals("", content);
+//        } finally {
+//            input.close();
+//        }
+//    }
+//
+//    @SuppressWarnings("deprecation")
+//    private void testParserAlt(InputStream input) throws Exception {
+//        try {
+//            Metadata metadata = new Metadata();
+//            ContentHandler handler = new BodyContentHandler();
+//            new DWGParser().parse(input, handler, metadata);
+//
+//            assertEquals("image/vnd.dwg", 
metadata.get(Metadata.CONTENT_TYPE));
+//
+//            assertEquals("Test Title", 
metadata.get(TikaCoreProperties.TITLE));
+//            assertEquals("Test Subject", 
metadata.get(TikaCoreProperties.DESCRIPTION));
+//            assertEquals("My Author", 
metadata.get(TikaCoreProperties.CREATOR));
+//            assertEquals("My keyword1, MyKeyword2", 
metadata.get(TikaCoreProperties.SUBJECT));
+//            assertEquals("This is a comment", 
metadata.get(TikaCoreProperties.COMMENTS));
+//            assertEquals("bejanpol", 
metadata.get(TikaCoreProperties.MODIFIER));
+//            assertEquals("http://mycompany/drawings";, 
metadata.get(TikaCoreProperties.RELATION));
+//            assertEquals("MyCustomPropertyValue",
+//                    metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + 
"MyCustomProperty"));
+//
+//            String content = handler.toString();
+//            assertContains("This is a comment", content);
+//            assertContains("mycompany", content);
+//        } finally {
+//            input.close();
+//        }
+//    }
+//
+//    @Test
+//    public void testAC1027() throws Exception {
+//        Metadata metadata = getXML("testDWG-AC1027.dwg").metadata;
+//        assertEquals("hlu", metadata.get(TikaCoreProperties.MODIFIER));
+//    }
+//
+//    @Test
+//    public void testAC1032() throws Exception {
+//        Metadata metadata = getXML("testDWG-AC1032.dwg").metadata;
+//        assertEquals("jlakshvi", metadata.get(TikaCoreProperties.MODIFIER));
+//        assertEquals("CUSTOMER'S ADDRESS", 
metadata.get("dwg-custom:CUSTOMER'S ADDRESS"));
+//    }
+//    
     @Test
-    public void testDWG2000Parser() throws Exception {
-        InputStream input =
-                
DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2000.dwg");
-        testParserAlt(input);
-    }
-
-    @Test
-    public void testDWG2004Parser() throws Exception {
-        InputStream input =
-                
DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2004.dwg");
-        testParser(input);
-    }
-
-    @Test
-    public void testDWG2004ParserNoHeaderAddress() throws Exception {
-        InputStream input = DWGParserTest.class
-                
.getResourceAsStream("/test-documents/testDWG2004_no_header.dwg");
-        testParserNoHeader(input);
-    }
-
-    @Test
-    public void testDWG2007Parser() throws Exception {
-        InputStream input =
-                
DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2007.dwg");
-        testParser(input);
-    }
-
-    @Test
-    public void testDWG2010Parser() throws Exception {
-        InputStream input =
-                
DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2010.dwg");
-        testParser(input);
-    }
-
-    @Test
-    public void testDWG2010CustomPropertiesParser() throws Exception {
-        // Check that standard parsing works
-        InputStream testInput = DWGParserTest.class
-                
.getResourceAsStream("/test-documents/testDWG2010_custom_props.dwg");
-        testParser(testInput);
-
-        // Check that custom properties with alternate padding work
-        try (InputStream input = DWGParserTest.class
-                
.getResourceAsStream("/test-documents/testDWG2010_custom_props.dwg")) {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            new DWGParser().parse(input, handler, metadata, null);
-
-            assertEquals("valueforcustomprop1",
-                    metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + 
"customprop1"));
-            assertEquals("valueforcustomprop2",
-                    metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + 
"customprop2"));
-        }
-    }
-
-    @Test
-    public void testDWGMechParser() throws Exception {
-        String[] types =
-                new String[]{"6", "2004", "2004DX", "2005", "2006", "2007", 
"2008", "2009", "2010",
-                        "2011"};
-        for (String type : types) {
-            InputStream input = DWGParserTest.class
-                    .getResourceAsStream("/test-documents/testDWGmech" + type 
+ ".dwg");
-            testParserAlt(input);
-        }
-    }
-
-    @SuppressWarnings("deprecation")
-    private void testParser(InputStream input) throws Exception {
-        try {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            new DWGParser().parse(input, handler, metadata);
-
-            assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
-
-            assertEquals("The quick brown fox jumps over the lazy dog",
-                    metadata.get(TikaCoreProperties.TITLE));
-            assertEquals("Gym class featuring a brown fox and lazy dog",
-                    metadata.get(TikaCoreProperties.DESCRIPTION));
-            assertEquals("Nevin Nollop", 
metadata.get(TikaCoreProperties.CREATOR));
-            assertContains("Pangram, fox, dog",
-                    
Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT)));
-            assertEquals("Lorem ipsum", 
metadata.get(TikaCoreProperties.COMMENTS).substring(0, 11));
-            assertEquals("http://www.alfresco.com";, 
metadata.get(TikaCoreProperties.RELATION));
-
-            String content = handler.toString();
-            assertContains("The quick brown fox jumps over the lazy dog", 
content);
-            assertContains("Gym class", content);
-            assertContains("www.alfresco.com", content);
-        } finally {
-            input.close();
-        }
-    }
-
-    @SuppressWarnings("deprecation")
-    private void testParserNoHeader(InputStream input) throws Exception {
-        try {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            new DWGParser().parse(input, handler, metadata);
-
-            assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
-
-            assertNull(metadata.get(TikaCoreProperties.TITLE));
-            assertNull(metadata.get(TikaCoreProperties.DESCRIPTION));
-            assertNull(metadata.get(TikaCoreProperties.CREATOR));
-            assertNull(metadata.get(TikaCoreProperties.SUBJECT));
-            assertNull(metadata.get(TikaCoreProperties.COMMENTS));
-            assertNull(metadata.get(TikaCoreProperties.RELATION));
-
-            String content = handler.toString();
-            assertEquals("", content);
-        } finally {
-            input.close();
-        }
-    }
-
-    @SuppressWarnings("deprecation")
-    private void testParserAlt(InputStream input) throws Exception {
-        try {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            new DWGParser().parse(input, handler, metadata);
-
-            assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
-
-            assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
-            assertEquals("Test Subject", 
metadata.get(TikaCoreProperties.DESCRIPTION));
-            assertEquals("My Author", 
metadata.get(TikaCoreProperties.CREATOR));
-            assertEquals("My keyword1, MyKeyword2", 
metadata.get(TikaCoreProperties.SUBJECT));
-            assertEquals("This is a comment", 
metadata.get(TikaCoreProperties.COMMENTS));
-            assertEquals("bejanpol", 
metadata.get(TikaCoreProperties.MODIFIER));
-            assertEquals("http://mycompany/drawings";, 
metadata.get(TikaCoreProperties.RELATION));
-            assertEquals("MyCustomPropertyValue",
-                    metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + 
"MyCustomProperty"));
-
-            String content = handler.toString();
-            assertContains("This is a comment", content);
-            assertContains("mycompany", content);
-        } finally {
-            input.close();
-        }
-    }
-
-    @Test
-    public void testAC1027() throws Exception {
-        Metadata metadata = getXML("testDWG-AC1027.dwg").metadata;
-        assertEquals("hlu", metadata.get(TikaCoreProperties.MODIFIER));
-    }
-
-    @Test
-    public void testAC1032() throws Exception {
-        Metadata metadata = getXML("testDWG-AC1032.dwg").metadata;
-        assertEquals("jlakshvi", metadata.get(TikaCoreProperties.MODIFIER));
-        assertEquals("CUSTOMER'S ADDRESS", metadata.get("dwg-custom:CUSTOMER'S 
ADDRESS"));
+    public void testDWGRead() throws Exception {

Review Comment:
   If the executable is required, we should "assume" that it exists at the 
start of the test, along the lines of: 
https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java#L66



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java:
##########
@@ -0,0 +1,208 @@
+package org.apache.tika.parser.dwg;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+import java.util.function.Consumer;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.commons.io.FileUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.core.json.JsonReadFeature;
+
+
+public class DWGReadParser extends AbstractDWGParser {
+    private static final Logger LOG = LoggerFactory.getLogger(DWGParser.class);
+    /**
+     * 
+     */
+    private static final long serialVersionUID = 7983127145030096837L;
+    private static MediaType TYPE = MediaType.image("vnd.dwg");
+
+    public Set < MediaType > getSupportedTypes(ParseContext context) {
+        return Collections.singleton(TYPE);
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext context)
+    throws IOException, SAXException, TikaException {
+
+        configure(context);
+        DWGParserConfig dwgc = context.get(DWGParserConfig.class);
+        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
+
+        xhtml.startDocument();
+        UUID uuid = UUID.randomUUID();
+        File tmpFileOut = File.createTempFile(uuid + "dwgreadout", ".json");
+        File tmpFileOutCleaned = File.createTempFile(uuid + "dwgreadoutclean", 
".json");
+        File tmpFileIn = File.createTempFile(uuid + "dwgreadin", ".dwg");
+        try {
+
+            FileUtils.copyInputStreamToFile(stream, tmpFileIn);
+
+            List < String > command = 
Arrays.asList(dwgc.getDwgReadExecutable(), "-O", "JSON", "-o",
+                tmpFileOut.getCanonicalPath(), tmpFileIn.getCanonicalPath());
+            Process p = new ProcessBuilder(command).start();
+
+            try {
+                int exitCode = p.waitFor();
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+
+            if (dwgc.isCleanDwgReadOutput()) {
+                // dwgread sometimes creates strings with invalid utf-8 
sequences or invalid json (nan instead of NaN). replace them
+                // with empty string.
+
+                try (FileInputStream fis = new FileInputStream(tmpFileOut); 
FileOutputStream fos = new FileOutputStream(tmpFileOutCleaned)) {
+                    byte[] bytes = new 
byte[dwgc.getCleanDwgReadOutputBatchSize()];
+                    while (fis.read(bytes) != -1) {
+                        byte[] fixedBytes = new String(bytes, 
StandardCharsets.UTF_8)
+                            .replaceAll(dwgc.getCleanDwgReadRegexToReplace(), 
dwgc.getCleanDwgReadReplaceWith())
+                            .getBytes(StandardCharsets.UTF_8);
+                        fos.write(fixedBytes, 0, fixedBytes.length);
+                    }
+                } finally {
+                    FileUtils.deleteQuietly(tmpFileOut);
+                    tmpFileOut = tmpFileOutCleaned;
+                }
+
+            }
+            
+            // we can't guarantee the json output is correct so we try to 
ignore as many errors as we can
+            JsonFactory jfactory = 
JsonFactory.builder().enable(JsonReadFeature.ALLOW_MISSING_VALUES,JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS,JsonReadFeature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER).build();
+            JsonParser jParser = jfactory.createParser(tmpFileOut);
+            JsonToken nextToken = jParser.nextToken();
+            while ((nextToken = jParser.nextToken()) != JsonToken.END_OBJECT) {
+                if (nextToken == JsonToken.FIELD_NAME) {
+                    String nextFieldName = jParser.currentName();
+                    nextToken = jParser.nextToken();
+                    if (nextToken.isStructStart()) {
+
+                        if ("OBJECTS".equals(nextFieldName)) {
+                            // Start array
+                            jParser.nextToken();
+                            while (jParser.nextToken() != JsonToken.END_ARRAY) 
{
+                                parseDwgObject(jParser, (nextTextValue) -> {
+
+                                    try {
+                                                                               
xhtml.characters(cleanupDwgString(nextTextValue));
+                                                                               
xhtml.newline();
+                                                                       } catch 
(SAXException e) {
+                                                                               
LOG.error("Could not write next text value {} to xhtml stream", nextTextValue);
+                                                                       }
+                                });
+                            }
+                        }  else if ("FILEHEADER".equals(nextFieldName)) {
+                            parseHeader(jParser,metadata);
+                        } else {
+                            jParser.skipChildren();
+                        }
+                    }
+                }
+            }
+            jParser.close();
+        } finally {
+            FileUtils.deleteQuietly(tmpFileOut);
+            FileUtils.deleteQuietly(tmpFileIn);
+        }
+        
+        
+        xhtml.endDocument();
+    }
+    private void parseDwgObject(JsonParser jsonParser, Consumer<String> 
textConsumer) throws IOException {
+        JsonToken nextToken;
+        while ((nextToken = jsonParser.nextToken()) != JsonToken.END_OBJECT) {
+            if (nextToken == JsonToken.FIELD_NAME) {
+                String nextFieldName = jsonParser.currentName();
+                nextToken = jsonParser.nextToken();
+                if (nextToken.isStructStart()) {
+                    jsonParser.skipChildren();
+                } else if (nextToken.isScalarValue()) {
+                    if ("text".equals(nextFieldName)) {
+                        String textVal = jsonParser.getText();
+                        if (StringUtils.isNotBlank(textVal)) {
+
+                            textConsumer.accept(textVal);
+                        }
+                    }
+                    else    if ("text_value".equals(nextFieldName)) {
+                        String textVal = jsonParser.getText();
+                        if (StringUtils.isNotBlank(textVal)) {
+
+                            textConsumer.accept(textVal);
+                            
+                        }
+                    }
+                }
+            }
+        }
+    }
+    private void parseHeader(JsonParser jsonParser, Metadata metadata) throws 
IOException {
+        JsonToken nextToken;
+        while ((nextToken = jsonParser.nextToken()) != JsonToken.END_OBJECT) {
+            if (nextToken == JsonToken.FIELD_NAME) {
+                String nextFieldName = jsonParser.currentName();
+                nextToken = jsonParser.nextToken();
+                if (nextToken.isStructStart()) {
+                    jsonParser.skipChildren();
+                } else if (nextToken.isScalarValue()) {
+                    metadata.set(nextFieldName, jsonParser.getText());
+                }
+            }
+        }
+    }
+       private String cleanupDwgString(String dwgString) {
+               //Cleaning chars have been found from the following websites:
+               
//https://www.cadforum.cz/en/text-formatting-codes-in-mtext-objects-tip8640
+               
//https://adndevblog.typepad.com/autocad/2017/09/dissecting-mtext-format-codes.html
+               String cleanString;

Review Comment:
   This also hurts.  Thank you.



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java:
##########
@@ -29,166 +29,179 @@
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.sax.BodyContentHandler;
 
 public class DWGParserTest extends TikaTest {
 
+//    @Test

Review Comment:
   Why are these tests turned off?



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java:
##########
@@ -0,0 +1,208 @@
+package org.apache.tika.parser.dwg;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+import java.util.function.Consumer;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.commons.io.FileUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.core.json.JsonReadFeature;
+
+
+public class DWGReadParser extends AbstractDWGParser {
+    private static final Logger LOG = LoggerFactory.getLogger(DWGParser.class);
+    /**
+     * 
+     */
+    private static final long serialVersionUID = 7983127145030096837L;
+    private static MediaType TYPE = MediaType.image("vnd.dwg");
+
+    public Set < MediaType > getSupportedTypes(ParseContext context) {
+        return Collections.singleton(TYPE);
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext context)
+    throws IOException, SAXException, TikaException {
+
+        configure(context);
+        DWGParserConfig dwgc = context.get(DWGParserConfig.class);
+        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
+
+        xhtml.startDocument();
+        UUID uuid = UUID.randomUUID();
+        File tmpFileOut = File.createTempFile(uuid + "dwgreadout", ".json");
+        File tmpFileOutCleaned = File.createTempFile(uuid + "dwgreadoutclean", 
".json");
+        File tmpFileIn = File.createTempFile(uuid + "dwgreadin", ".dwg");
+        try {
+
+            FileUtils.copyInputStreamToFile(stream, tmpFileIn);
+
+            List < String > command = 
Arrays.asList(dwgc.getDwgReadExecutable(), "-O", "JSON", "-o",
+                tmpFileOut.getCanonicalPath(), tmpFileIn.getCanonicalPath());
+            Process p = new ProcessBuilder(command).start();
+
+            try {
+                int exitCode = p.waitFor();
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+
+            if (dwgc.isCleanDwgReadOutput()) {
+                // dwgread sometimes creates strings with invalid utf-8 
sequences or invalid json (nan instead of NaN). replace them
+                // with empty string.
+
+                try (FileInputStream fis = new FileInputStream(tmpFileOut); 
FileOutputStream fos = new FileOutputStream(tmpFileOutCleaned)) {
+                    byte[] bytes = new 
byte[dwgc.getCleanDwgReadOutputBatchSize()];
+                    while (fis.read(bytes) != -1) {
+                        byte[] fixedBytes = new String(bytes, 
StandardCharsets.UTF_8)
+                            .replaceAll(dwgc.getCleanDwgReadRegexToReplace(), 
dwgc.getCleanDwgReadReplaceWith())
+                            .getBytes(StandardCharsets.UTF_8);
+                        fos.write(fixedBytes, 0, fixedBytes.length);
+                    }
+                } finally {
+                    FileUtils.deleteQuietly(tmpFileOut);
+                    tmpFileOut = tmpFileOutCleaned;
+                }
+
+            }
+            
+            // we can't guarantee the json output is correct so we try to 
ignore as many errors as we can
+            JsonFactory jfactory = 
JsonFactory.builder().enable(JsonReadFeature.ALLOW_MISSING_VALUES,JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS,JsonReadFeature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER).build();
+            JsonParser jParser = jfactory.createParser(tmpFileOut);
+            JsonToken nextToken = jParser.nextToken();
+            while ((nextToken = jParser.nextToken()) != JsonToken.END_OBJECT) {
+                if (nextToken == JsonToken.FIELD_NAME) {
+                    String nextFieldName = jParser.currentName();
+                    nextToken = jParser.nextToken();
+                    if (nextToken.isStructStart()) {
+
+                        if ("OBJECTS".equals(nextFieldName)) {
+                            // Start array
+                            jParser.nextToken();
+                            while (jParser.nextToken() != JsonToken.END_ARRAY) 
{
+                                parseDwgObject(jParser, (nextTextValue) -> {
+
+                                    try {
+                                                                               
xhtml.characters(cleanupDwgString(nextTextValue));
+                                                                               
xhtml.newline();
+                                                                       } catch 
(SAXException e) {
+                                                                               
LOG.error("Could not write next text value {} to xhtml stream", nextTextValue);
+                                                                       }
+                                });
+                            }
+                        }  else if ("FILEHEADER".equals(nextFieldName)) {
+                            parseHeader(jParser,metadata);
+                        } else {
+                            jParser.skipChildren();
+                        }
+                    }
+                }
+            }
+            jParser.close();
+        } finally {
+            FileUtils.deleteQuietly(tmpFileOut);

Review Comment:
   Should this also delete tmpFileOutCleaned in case there's an exception?



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/pom.xml:
##########
@@ -36,6 +36,17 @@
       <artifactId>tika-parser-microsoft-module</artifactId>
       <version>${project.version}</version>
     </dependency>
+
+    <dependency>
+       <groupId>com.fasterxml.jackson.core</groupId>
+       <artifactId>jackson-core</artifactId>
+           <version>${jackson.version}</version><!--$NO-MVN-MAN-VER$-->

Review Comment:
   You shouldn't need versions for jackson-core or jackson-databind because 
those are already in dependency management in tika-parent.  I'm still in the 
process of cleaning these up, and there are quite a few cases where I've done 
exactly this myself.  This isn't your fault.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: dev-unsubscr...@tika.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to