All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.eval.XMLErrorLogUpdater Maven / Gradle / Ivy

There is a newer version: 3.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval;


import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

import org.apache.log4j.Level;
import org.apache.tika.eval.db.Cols;
import org.apache.tika.eval.db.H2Util;
import org.apache.tika.eval.db.JDBCUtil;
import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.XMLLogMsgHandler;
import org.apache.tika.eval.io.XMLLogReader;
import org.apache.tika.eval.reports.ResultsReporter;
import org.apache.tika.io.IOExceptionWithCause;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This is a very task specific class that reads a log file and updates
 * the "comparisons" table.  It should not be run in a multithreaded environment.
 */
public class XMLErrorLogUpdater {
    private static final Logger LOG = LoggerFactory.getLogger(ResultsReporter.class);

    private Statement statement;

    public static void main(String[] args) throws Exception {
        XMLErrorLogUpdater writer = new XMLErrorLogUpdater();
        Path xmlLogFileA = Paths.get(args[0]);
        Path xmlLogFileB = Paths.get(args[1]);
        Path db = Paths.get(args[2]);
        JDBCUtil dbUtil = new H2Util(db);
        Connection connection = dbUtil.getConnection();
        writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_A, xmlLogFileA);
        writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_B, xmlLogFileB);
        connection.commit();
    }

    public void update(Connection connection, TableInfo tableInfo, Path xmlLogFile) throws Exception {
        statement = connection.createStatement();
        XMLLogReader reader = new XMLLogReader();
        try (InputStream is = Files.newInputStream(xmlLogFile)) {
            reader.read(is, new ErrorMsgUpdater(tableInfo.getName()));
        } catch (IOException e) {
            throw new RuntimeException("Problem reading: "+xmlLogFile.toAbsolutePath().toString());
        } finally {
            try {
                connection.commit();
                statement.close();
            } catch (SQLException e) {
                throw new RuntimeException("Failed to close db connection!", e);
            }
        }
    }

    private class ErrorMsgUpdater implements XMLLogMsgHandler {
        private final String errorTablename;

        private ErrorMsgUpdater(String errorTablename) {
            this.errorTablename = errorTablename;
        }

        @Override
        public void handleMsg(Level level, String xml) throws SQLException, IOException {
            if (! level.equals(Level.ERROR)) {
                return;
            }
            XMLStreamReader reader = null;
            try {
                reader = XMLInputFactory.newInstance().createXMLStreamReader(new StringReader(xml));
            } catch (XMLStreamException e) {
                throw new IOExceptionWithCause(e);
            }
            String type = null;
            String resourceId = null;
            try {
                while (reader.hasNext() && type == null && resourceId == null) {
                    reader.next();
                    switch (reader.getEventType()) {
                        case XMLStreamConstants.START_ELEMENT:
                            if ("timed_out".equals(reader.getLocalName())) {
                                resourceId = reader.getAttributeValue("", "resourceId");
                                update(errorTablename, resourceId,
                                        AbstractProfiler.PARSE_ERROR_TYPE.TIMEOUT);

                            } else if ("oom".equals(reader.getLocalName())) {
                                resourceId = reader.getAttributeValue("", "resourceId");
                                update(errorTablename, resourceId, AbstractProfiler.PARSE_ERROR_TYPE.OOM);
                            }
                            break;
                    }
                }
                reader.close();
            } catch (XMLStreamException e) {
                throw new IOExceptionWithCause(e);
            }
        }

        private void update(String errorTableName,
                            String filePath, AbstractProfiler.PARSE_ERROR_TYPE type) throws SQLException {
            int containerId = getContainerId(filePath);
            String sql = "SELECT count(1) from "+errorTableName +
                    " where "+Cols.CONTAINER_ID +
                    " = "+containerId + " or "+
                    Cols.FILE_PATH + "='"+filePath+"'";
            ResultSet rs = statement.executeQuery(sql);

            //now try to figure out if that file already exists
            //in parse errors
            int hitCount = 0;
            while (rs.next()) {
                hitCount = rs.getInt(1);
            }

            //if it does, update all records matching that path or container id
            if (hitCount > 0) {
                sql = "UPDATE " + errorTableName +
                        " SET " + Cols.PARSE_ERROR_ID +
                        " = " + type.ordinal() + ","+
                        Cols.FILE_PATH + "='" +filePath+"'"+
                        " where "+Cols.CONTAINER_ID +
                        "="+containerId + " or "+
                        Cols.FILE_PATH + "='"+filePath+"'";;

            } else {
                //if not and container id > -1
                //insert full record
                if (containerId > -1) {
                    sql = "INSERT INTO " + errorTableName +
                            " ("+Cols.CONTAINER_ID+","+Cols.FILE_PATH +","+Cols.PARSE_ERROR_ID +")"+
                            " values (" + containerId + ", '" + filePath + "'," +
                            type.ordinal() + ");";
                } else {
                    //if container id == -1, insert only file path and parse error type id
                    sql = "INSERT INTO " + errorTableName +
                            " ("+Cols.FILE_PATH.name()+","+Cols.PARSE_ERROR_ID +")"+
                            "values ('" + filePath + "'," +
                            type.ordinal() + ");";
                }

            }
            int updated = statement.executeUpdate(sql);
            if (updated == 0) {
                //TODO: log
                LOG.warn("made no updates in xmlerrorlogupdater!");
            } else if (updated > 1) {
                LOG.warn("made too many updates");
            }
        }

        private int getContainerId(String resourceId) throws SQLException {
            int containerId = -1;
            String sql = "SELECT " + Cols.CONTAINER_ID.name() +
                    " from " + ExtractProfiler.CONTAINER_TABLE.getName()+
                    " where " + Cols.FILE_PATH +
                    " ='"+resourceId+"'";
            ResultSet rs = statement.executeQuery(sql);
            int resultCount = 0;
            while (rs.next()) {
                containerId = rs.getInt(1);
                resultCount++;
            }
            rs.close();

            if (resultCount == 0) {
                LOG.warn("Should have found a container for: {}", resourceId);
            } else if (resultCount > 1) {
                LOG.error("Records ids should be unique: {}", resourceId);
            }
/*
            if (containerId < 0) {
                System.err.println("CONTAINER ID < 0!!!");
                sql = "SELECT MAX("+ Cols.CONTAINER_ID.name() +
                        ") from "+ExtractProfiler.CONTAINER_TABLE.getName();
                rs = statement.executeQuery(sql);
                while (rs.next()) {
                    containerId = rs.getInt(1);
                }
                rs.close();
                if (containerId < 0) {
                    //log and abort
                    //return -1?
                } else {
                    containerId++;
                }

            }*/
            return containerId;
        }


    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy