Author: nick
Date: Mon Jul 14 20:05:13 2014
New Revision: 1610506

URL: http://svn.apache.org/r1610506
Log:
Patch from Tyler Palsulich for TIKA-1327 - More enhancements to the Matlab
parser

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/test_mat_text.mat 
  (with props)
Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mat/MatParserTest.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java?rev=1610506&r1=1610505&r2=1610506&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java 
(original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java 
Mon Jul 14 20:05:13 2014
@@ -48,101 +48,91 @@ import com.jmatio.common.MatDataTypes;
 
 
 public class MatParser extends AbstractParser {
-    
-    public static final String MATLAB_MIME_TYPE = 
-    "application/x-matlab-data";  
+
+    public static final String MATLAB_MIME_TYPE =
+            "application/x-matlab-data";
 
     private final Set<MediaType> SUPPORTED_TYPES =
-        Collections.singleton(MediaType.application("x-matlab-data"));
+            Collections.singleton(MediaType.application("x-matlab-data"));
 
     public Set<MediaType> getSupportedTypes(ParseContext context){
         return SUPPORTED_TYPES;
-        }
+    }
+
+    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
 
-    public void parse(InputStream stream, ContentHandler handler,
-        Metadata metadata, ParseContext context) throws IOException,
-        SAXException, TikaException {
-                               
-        //Set MIME type as metadata
+        //Set MIME type as Matlab
         metadata.set(Metadata.CONTENT_TYPE, MATLAB_MIME_TYPE);
-               
+
         try {
-               
-            //Input file stream
+            // Use TIS so we can spool a temp file for parsing.
             TikaInputStream tis = TikaInputStream.get(stream);
-                                                               
+
             //Extract information from header file
             MatFileReader mfr = new MatFileReader(tis.getFile()); //input .mat 
file
             MatFileHeader hdr = mfr.getMatFileHeader(); //.mat header 
information
 
-            String stringToSplit = hdr.getDescription(); //break header 
information into its parts
-            String[] parts = stringToSplit.split(",");
-                               
-            // Ex .mat header "MATLAB 5.0 MAT-file, Platform: MACI64, Created 
on: Sun Mar  2 23:41:57 2014"
+            // Example header: "MATLAB 5.0 MAT-file, Platform: MACI64, Created 
on: Sun Mar  2 23:41:57 2014"
+            String[] parts = hdr.getDescription().split(","); // Break header 
information into its parts
+
             if (parts[2].contains("Created")) {
                 int lastIndex1 = parts[2].lastIndexOf("Created on:");
-                String dateCreated = parts[2].substring(lastIndex1 + 
11).trim();
+                String dateCreated = parts[2].substring(lastIndex1 + "Created 
on:".length()).trim();
                 metadata.set("createdOn", dateCreated);
-                }
-                       
+            }
+
             if (parts[1].contains("Platform")) {
                 int lastIndex2 = parts[1].lastIndexOf("Platform:");
-                String platform = parts[1].substring(lastIndex2 + 9).trim();
+                String platform = parts[1].substring(lastIndex2 + 
"Platform:".length()).trim();
                 metadata.set("platform" , platform);
-                }
-                               
-            if (parts[0].contains("MATLAB")) { 
+            }
+
+            if (parts[0].contains("MATLAB")) {
                 metadata.set("fileType", parts[0]);
-                }
-                       
-            //Get endian indicator from header file
-            String endianBytes = new String(hdr.getEndianIndicator()); 
//retrieve endian bytes and convert to string
-            String endianCode = String.valueOf(endianBytes.toCharArray()); 
//convert bytes to characters to string
+            }
+
+            // Get endian indicator from header file
+            String endianBytes = new String(hdr.getEndianIndicator()); // 
Retrieve endian bytes and convert to string
+            String endianCode = String.valueOf(endianBytes.toCharArray()); // 
Convert bytes to characters to string
             metadata.set("endian", endianCode);
-       
+
             //Text output      
-            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);                    
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
             xhtml.startDocument();
-                       
-            //Get array names, size, and data types
-            Map<String, MLArray> data = mfr.getContent();
-            Set<String> vars = data.keySet();
-                       
-            //Loop through each variable 
-            for (Iterator<String> var = vars.iterator(); var.hasNext();) {
-                String varName = var.next();
-                MLArray varData = data.get(varName);
-                   
-                xhtml.characters(varName);
-                xhtml.characters(":");
-                xhtml.characters(String.valueOf(varData));
-                xhtml.newline();
-                   
-                //if the variable is a structure, extract variable info from 
structure
+            xhtml.newline();
+            //Loop through each variable
+            for (Map.Entry<String, MLArray> entry : 
mfr.getContent().entrySet()) {
+                String varName = entry.getKey();
+                MLArray varData = entry.getValue();
+
+                xhtml.element("p", varName + ":" + String.valueOf(varData));
+
+                // If the variable is a structure, extract variable info from 
structure
                 if (varData.isStruct()){
-                    MLStructure mlStructure = (MLStructure) 
mfr.getMLArray(varName);   
-                    Collection<MLArray> list = mlStructure.getAllFields();
-                                  
-                    for (MLArray element : list){
-                        xhtml.characters("  ");
+                    MLStructure mlStructure = (MLStructure) 
mfr.getMLArray(varName);
+                    xhtml.startElement("ul");
+                    xhtml.newline();
+                    for (MLArray element : mlStructure.getAllFields()){
+                        xhtml.startElement("li");
                         xhtml.characters(String.valueOf(element));
-                        xhtml.newline();
-                                               
-                        //if there is an imbedded structure, extract variable 
info.                                            
-                        if (element.isStruct()){                               
                                                                                
                                 
-                            String nest = element.contentToString();
-                            xhtml.characters("      ");
-                            xhtml.characters(nest);   
-                            xhtml.newline();
-                            }
+
+                        // If there is an embedded structure, extract variable 
info.
+                        if (element.isStruct()){
+                            xhtml.startElement("ul");
+                            // Should this actually be a recursive call?
+                            xhtml.element("li", element.contentToString());
+                            xhtml.endElement("ul");
                         }
-                    }          
-            } 
 
-            xhtml.endDocument();               
-                               
-            } catch (IOException e) {
-                throw new TikaException("matparser error", e);
-                } 
-        }        
+                        xhtml.endElement("li");
+                    }
+                    xhtml.endElement("ul");
+                }
+            }
+            xhtml.endDocument();
+        } catch (IOException e) {
+            throw new TikaException("Error parsing Matlab file with 
MatParser", e);
+        }
+    }
 }
\ No newline at end of file

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mat/MatParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mat/MatParserTest.java?rev=1610506&r1=1610505&r2=1610506&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mat/MatParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mat/MatParserTest.java
 Mon Jul 14 20:05:13 2014
@@ -31,47 +31,70 @@ import org.apache.tika.metadata.TikaCore
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
 /**
  * Test cases to exercise the {@link MatParser}.
- * 
+ *
  */
 public class MatParserTest {
 
     @Test
     public void testParser() throws Exception {
-        
+
         Parser parser = new MatParser();
-        ContentHandler handler = new BodyContentHandler();
+        ToXMLContentHandler handler = new ToXMLContentHandler();
         Metadata metadata = new Metadata();
         String path = 
"/test-documents/breidamerkurjokull_radar_profiles_2009.mat";
 
-        InputStream stream = MatParser.class.getResourceAsStream(path); 
-                
+        InputStream stream = MatParser.class.getResourceAsStream(path);
+
         try {
             parser.parse(stream, handler, metadata, new ParseContext());
-               } finally {
-               stream.close();
-               }
-        
-               //Check Metadata
-               assertEquals("PCWIN64", metadata.get("platform"));
-               assertEquals("MATLAB 5.0 MAT-file", metadata.get("fileType"));
-               assertEquals("IM", metadata.get("endian"));
-               assertEquals("Thu Feb 21 15:52:49 2013", 
metadata.get("createdOn"));
-               
-                       //Check Content
-                       String content = handler.toString();
-        
-                       assertTrue(content.contains("a1:[1x1  struct array]"));
-                       assertTrue(content.contains("[1024x1261  double 
array]"));
-                       assertTrue(content.contains("a2:[1x1  struct array]"));
-                       assertTrue(content.contains("[1024x1283  double 
array]"));
-                       assertTrue(content.contains("b1:[1x1  struct array]"));
-                       assertTrue(content.contains("[1024x1311  double 
array]"));
-                       assertTrue(content.contains("c1:[1x1  struct array]"));
-                       assertTrue(content.contains("[1024x909  double 
array]"));       
-                       }
+        } finally {
+            stream.close();
+        }
+
+        //Check Metadata
+        assertEquals("PCWIN64", metadata.get("platform"));
+        assertEquals("MATLAB 5.0 MAT-file", metadata.get("fileType"));
+        assertEquals("IM", metadata.get("endian"));
+        assertEquals("Thu Feb 21 15:52:49 2013", metadata.get("createdOn"));
+
+        //Check Content
+        String content = handler.toString();
+
+        assertTrue(content.contains("<li>[1x909  double array]</li>"));
+        assertTrue(content.contains("<p>c1:[1x1  struct array]</p>"));
+        assertTrue(content.contains("<li>[1024x1  double array]</li>"));
+        assertTrue(content.contains("<p>b1:[1x1  struct array]</p>"));
+        assertTrue(content.contains("<p>a1:[1x1  struct array]</p>"));
+        assertTrue(content.contains("<li>[1024x1261  double array]</li>"));
+        assertTrue(content.contains("<li>[1x1  double array]</li>"));
+        assertTrue(content.contains("</body></html>"));
+    }
+
+    @Test
+    public void testParserForText() throws Exception {
+
+        Parser parser = new MatParser();
+        ToXMLContentHandler handler = new ToXMLContentHandler();
+        Metadata metadata = new Metadata();
+        String path = "/test-documents/test_mat_text.mat";
+
+        InputStream stream = MatParser.class.getResourceAsStream(path);
+
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        } finally {
+            stream.close();
+        }
+
+        //Check Content
+        String content = handler.toString();
+        assertTrue(content.contains("<p>double:[2x2  double array]</p>"));
+        System.err.println(content);
+    }
 
 }

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/test_mat_text.mat
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/test_mat_text.mat?rev=1610506&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/test_mat_text.mat
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to