This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4451 in repository https://gitbox.apache.org/repos/asf/tika.git
commit e4263756fab834c6424cc779ea13e6d1b262ab43 Author: tallison <[email protected]> AuthorDate: Tue Jul 8 17:37:23 2025 -0400 TIKA-4451 -- remove XML logger updater --- .../apache/tika/eval/app/XMLErrorLogUpdater.java | 211 --------------------- .../tika/eval/app/batch/DBConsumersManager.java | 12 -- .../apache/tika/eval/app/io/XMLLogMsgHandler.java | 27 --- .../org/apache/tika/eval/app/io/XMLLogReader.java | 116 ----------- .../tika/eval/app/io/FatalExceptionReaderTest.java | 35 ---- 5 files changed, 401 deletions(-) diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/XMLErrorLogUpdater.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/XMLErrorLogUpdater.java deleted file mode 100644 index 81ae5f2be..000000000 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/XMLErrorLogUpdater.java +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.eval.app; - - -import java.io.IOException; -import java.io.InputStream; -import java.io.StringReader; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.sql.Connection; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.sql.Statement; -import javax.xml.stream.XMLInputFactory; -import javax.xml.stream.XMLStreamConstants; -import javax.xml.stream.XMLStreamException; -import javax.xml.stream.XMLStreamReader; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.slf4j.event.Level; - -import org.apache.tika.eval.app.db.Cols; -import org.apache.tika.eval.app.db.H2Util; -import org.apache.tika.eval.app.db.JDBCUtil; -import org.apache.tika.eval.app.db.TableInfo; -import org.apache.tika.eval.app.io.XMLLogMsgHandler; -import org.apache.tika.eval.app.io.XMLLogReader; -import org.apache.tika.eval.app.reports.ResultsReporter; - -/** - * This is a very task specific class that reads a log file and updates - * the "comparisons" table. It should not be run in a multithreaded environment. - */ -public class XMLErrorLogUpdater { - private static final Logger LOG = LoggerFactory.getLogger(ResultsReporter.class); - - private Statement statement; - - public static void main(String[] args) throws Exception { - XMLErrorLogUpdater writer = new XMLErrorLogUpdater(); - Path xmlLogFileA = Paths.get(args[0]); - Path xmlLogFileB = Paths.get(args[1]); - Path db = Paths.get(args[2]); - JDBCUtil dbUtil = new H2Util(db); - Connection connection = dbUtil.getConnection(); - writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_A, xmlLogFileA); - writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_B, xmlLogFileB); - connection.commit(); - } - - public void update(Connection connection, TableInfo tableInfo, Path xmlLogFile) throws Exception { - statement = connection.createStatement(); - XMLLogReader reader = new XMLLogReader(); - try (InputStream is = Files.newInputStream(xmlLogFile)) { - reader.read(is, new ErrorMsgUpdater(tableInfo.getName())); - } catch (IOException e) { - throw new RuntimeException("Problem reading: " + xmlLogFile - .toAbsolutePath() - .toString()); - } finally { - try { - connection.commit(); - statement.close(); - } catch (SQLException e) { - throw new RuntimeException("Failed to close db connection!", e); - } - } - } - - private class ErrorMsgUpdater implements XMLLogMsgHandler { - private final String errorTablename; - - private ErrorMsgUpdater(String errorTablename) { - this.errorTablename = errorTablename; - } - - @Override - public void handleMsg(Level level, String xml) throws SQLException, IOException { - if (!level.equals(Level.ERROR)) { - return; - } - XMLStreamReader reader = null; - try { - reader = XMLInputFactory - .newInstance() - .createXMLStreamReader(new StringReader(xml)); - } catch (XMLStreamException e) { - throw new IOException(e); - } - String type = null; - String resourceId = null; - try { - while (reader.hasNext() && type == null && resourceId == null) { - reader.next(); - switch (reader.getEventType()) { - case XMLStreamConstants.START_ELEMENT: - if ("timed_out".equals(reader.getLocalName())) { - resourceId = reader.getAttributeValue("", "resourceId"); - update(errorTablename, resourceId, AbstractProfiler.PARSE_ERROR_TYPE.TIMEOUT); - - } else if ("oom".equals(reader.getLocalName())) { - resourceId = reader.getAttributeValue("", "resourceId"); - update(errorTablename, resourceId, AbstractProfiler.PARSE_ERROR_TYPE.OOM); - } - break; - } - } - reader.close(); - } catch (XMLStreamException e) { - throw new IOException(e); - } - } - - private void update(String errorTableName, String filePath, AbstractProfiler.PARSE_ERROR_TYPE type) throws SQLException { - int containerId = getContainerId(filePath); - String sql = "SELECT count(1) from " + errorTableName + " where " + Cols.CONTAINER_ID + " = " + containerId + " or " + Cols.FILE_PATH + "='" + filePath + "'"; - int hitCount; - try (ResultSet rs = statement.executeQuery(sql)) { - //now try to figure out if that file already exists - //in parse errors - hitCount = 0; - while (rs.next()) { - hitCount = rs.getInt(1); - } - } - - //if it does, update all records matching that path or container id - if (hitCount > 0) { - sql = "UPDATE " + errorTableName + " SET " + Cols.PARSE_ERROR_ID + " = " + type.ordinal() + "," + Cols.FILE_PATH + "='" + filePath + "'" + " where " + - Cols.CONTAINER_ID + "=" + containerId + " or " + Cols.FILE_PATH + "='" + filePath + "'"; - - } else { - //if not and container id > -1 - //insert full record - if (containerId > -1) { - sql = "INSERT INTO " + errorTableName + " (" + Cols.CONTAINER_ID + "," + Cols.FILE_PATH + "," + Cols.PARSE_ERROR_ID + ")" + " values (" + containerId + ", '" + - filePath + "'," + type.ordinal() + ");"; - } else { - //if container id == -1, insert only file path and parse error type id - sql = "INSERT INTO " + errorTableName + " (" + Cols.FILE_PATH.name() + "," + Cols.PARSE_ERROR_ID + ")" + "values ('" + filePath + "'," + type.ordinal() + ");"; - } - - } - int updated = statement.executeUpdate(sql); - if (updated == 0) { - //TODO: log - LOG.warn("made no updates in xmlerrorlogupdater!"); - } else if (updated > 1) { - LOG.warn("made too many updates"); - } - } - - private int getContainerId(String resourceId) throws SQLException { - int containerId = -1; - String sql = "SELECT " + Cols.CONTAINER_ID.name() + " from " + ExtractProfiler.CONTAINER_TABLE.getName() + " where " + Cols.FILE_PATH + " ='" + resourceId + "'"; - int resultCount; - try (ResultSet rs = statement.executeQuery(sql)) { - resultCount = 0; - while (rs.next()) { - containerId = rs.getInt(1); - resultCount++; - } - } - - if (resultCount == 0) { - LOG.warn("Should have found a container for: {}", resourceId); - } else if (resultCount > 1) { - LOG.error("Records ids should be unique: {}", resourceId); - } -/* - if (containerId < 0) { - System.err.println("CONTAINER ID < 0!!!"); - sql = "SELECT MAX("+ Cols.CONTAINER_ID.name() + - ") from "+ExtractProfiler.CONTAINER_TABLE.getName(); - rs = statement.executeQuery(sql); - while (rs.next()) { - containerId = rs.getInt(1); - } - rs.close(); - if (containerId < 0) { - //log and abort - //return -1? - } else { - containerId++; - } - - }*/ - return containerId; - } - - - } - -} diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/DBConsumersManager.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/DBConsumersManager.java index aba35416f..41db03520 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/DBConsumersManager.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/DBConsumersManager.java @@ -26,7 +26,6 @@ import java.util.List; import org.apache.tika.batch.ConsumersManager; import org.apache.tika.batch.FileResourceConsumer; import org.apache.tika.eval.app.AbstractProfiler; -import org.apache.tika.eval.app.XMLErrorLogUpdater; import org.apache.tika.eval.app.db.JDBCUtil; import org.apache.tika.eval.app.db.MimeBuffer; import org.apache.tika.eval.app.db.TableInfo; @@ -64,17 +63,6 @@ public class DBConsumersManager extends ConsumersManager { throw new RuntimeException(e); } - //MUST HAPPEN AFTER consumers have closed and - //committed container information!!! - XMLErrorLogUpdater up = new XMLErrorLogUpdater(); - for (LogTablePair p : errorLogs) { - try { - up.update(conn, p.tableInfo, p.log); - } catch (Exception e) { - throw new RuntimeException(e); - } - } - try { conn.commit(); diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogMsgHandler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogMsgHandler.java deleted file mode 100644 index 0221849ba..000000000 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogMsgHandler.java +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.eval.app.io; - -import java.io.IOException; -import java.sql.SQLException; - -import org.slf4j.event.Level; - - -public interface XMLLogMsgHandler { - public void handleMsg(Level level, String xml) throws IOException, SQLException; -} diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogReader.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogReader.java deleted file mode 100644 index 2b8350894..000000000 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogReader.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.eval.app.io; - -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.nio.charset.StandardCharsets; -import java.sql.SQLException; -import javax.xml.stream.XMLInputFactory; -import javax.xml.stream.XMLStreamConstants; -import javax.xml.stream.XMLStreamException; -import javax.xml.stream.XMLStreamReader; - -import org.apache.commons.io.IOUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.slf4j.event.Level; - -import org.apache.tika.utils.XMLReaderUtils; - - -public class XMLLogReader { - private static final Logger LOG = LoggerFactory.getLogger(XMLLogReader.class); - //class that wraps a logger's xml output - //into a single xml parseable input stream. - - public void read(InputStream xmlLogFileIs, XMLLogMsgHandler handler) throws XMLStreamException { - InputStream is = new LogXMLWrappingInputStream(xmlLogFileIs); - XMLInputFactory factory = XMLReaderUtils.getXMLInputFactory(); - XMLStreamReader reader = factory.createXMLStreamReader(is); - - Level level = null; - while (reader.hasNext()) { - reader.next(); - switch (reader.getEventType()) { - case XMLStreamConstants.START_ELEMENT: - if ("event".equals(reader.getLocalName())) { - String levelString = reader.getAttributeValue("", "level"); - if (levelString != null) { - level = Level.valueOf(levelString); - } else { - level = Level.DEBUG; - } - } else if ("message".equals(reader.getLocalName())) { - try { - handler.handleMsg(level, reader.getElementText()); - } catch (IOException e) { - LOG.warn("Error parsing: {}", reader.getElementText()); - } catch (SQLException e) { - LOG.warn("SQLException: {}", e.getMessage()); - } - } - break; - case XMLStreamConstants.END_ELEMENT: - if ("event".equals(reader.getLocalName())) { - level = null; - } else if ("message".equals(reader.getLocalName())) { - //do we care any more? - } - break; - } - } - } - - - static class LogXMLWrappingInputStream extends InputStream { - //plagiarized from log4j's chainsaw - private final static String HEADER = - "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + "<log4j:eventSet version=\"1.2\" " + "xmlns:log4j=\"http://jakarta.apache.org/log4j/\">"; - private static final String FOOTER = "</log4j:eventSet>"; - int currentStreamIndex = 0; - private InputStream[] streams; - - private LogXMLWrappingInputStream(InputStream xmlLogFileIs) { - streams = new InputStream[3]; - streams[0] = new ByteArrayInputStream(HEADER.getBytes(StandardCharsets.UTF_8)); - streams[1] = xmlLogFileIs; - streams[2] = new ByteArrayInputStream(FOOTER.getBytes(StandardCharsets.UTF_8)); - - } - - @Override - public int read() throws IOException { - int c = streams[currentStreamIndex].read(); - if (c < 0) { - IOUtils.closeQuietly(streams[currentStreamIndex]); - while (currentStreamIndex < streams.length - 1) { - currentStreamIndex++; - int tmpC = streams[currentStreamIndex].read(); - if (tmpC < 0) { - IOUtils.closeQuietly(streams[currentStreamIndex]); - } else { - return tmpC; - } - } - return -1; - } - return c; - } - } -} diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/io/FatalExceptionReaderTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/io/FatalExceptionReaderTest.java deleted file mode 100644 index 2cd0d1363..000000000 --- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/io/FatalExceptionReaderTest.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tika.eval.app.io; - - -import java.io.InputStream; - -import org.junit.jupiter.api.Test; - -public class FatalExceptionReaderTest { - @Test - public void testSimpleRead() throws Exception { - try (InputStream is = this - .getClass() - .getResourceAsStream("/test-dirs/batch-logs/batch-process-fatal.xml")) { - XMLLogReader reader = new XMLLogReader(); - //reader.read(is); - } - } -}
