/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.eval.batch;

import java.io.IOException;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.FileResourceConsumer;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.eval.AbstractProfiler;
import org.apache.tika.eval.db.Cols;
import org.apache.tika.eval.db.JDBCUtil;
import org.apache.tika.eval.db.MimeBuffer;
import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.DBWriter;
import org.apache.tika.eval.io.ExtractReader;
import org.apache.tika.eval.io.ExtractReaderException;
import org.apache.tika.eval.io.IDBWriter;
import org.apache.tika.util.PropsUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

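/**
 * Base class for builders of the database-backed consumers used in
 * tika-eval batch runs. Concrete subclasses define the table layout and
 * construct the actual {@link FileResourceConsumer}; this class handles
 * the shared initialization: applying table-name prefixes, creating the
 * tables, opening the {@link MimeBuffer} and populating the reference tables.
 * <p>
 * A rough usage sketch (the subclass name is illustrative only):
 * <pre>{@code
 * EvalConsumerBuilder builder = new MyProfilerConsumerBuilder(); //hypothetical subclass
 * MimeBuffer mimeBuffer = builder.init(queue, localAttrs, dbUtil, forceDrop);
 * FileResourceConsumer consumer = builder.build();
 * }</pre>
 */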
public abstract class EvalConsumerBuilder {

    private static final Logger LOG = LoggerFactory.getLogger(EvalConsumerBuilder.class);

    protected ArrayBlockingQueue<FileResource> queue;
    Map<String, String> localAttrs;
    JDBCUtil dbUtil;
    private MimeBuffer mimeBuffer;
    AtomicInteger initialized = new AtomicInteger(0);

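    /**
     * One-shot initialization; a second call throws a {@link RuntimeException}.
     * The steps run in a fixed order: table-name prefixes are applied, the
     * tables are created (with {@code forceDrop}, existing tables are dropped;
     * otherwise non-ref tables throw and ref tables are skipped), the mime
     * buffer is opened, and the reference tables are populated.
     *
     * @return the {@link MimeBuffer} backed by this builder's database connection
     */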
    public MimeBuffer init(ArrayBlockingQueue<FileResource> queue, Map<String, String> localAttrs,
                     JDBCUtil dbUtil, boolean forceDrop) throws IOException, SQLException {
        if (initialized.getAndIncrement() > 0) {
            throw new RuntimeException("Can only init a consumer builder once!");
        }
        this.queue = queue;
        this.localAttrs = localAttrs;
        this.dbUtil = dbUtil;
        //the order of the following is critical
        //step 1. update the table names with prefixes
        updateTableInfosWithPrefixes(localAttrs);

        JDBCUtil.CREATE_TABLE createRegularTable = (forceDrop) ? JDBCUtil.CREATE_TABLE.DROP_IF_EXISTS :
                JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS;

        JDBCUtil.CREATE_TABLE createRefTable = (forceDrop) ? JDBCUtil.CREATE_TABLE.DROP_IF_EXISTS :
                JDBCUtil.CREATE_TABLE.SKIP_IF_EXISTS;

        //step 2. create the tables
        dbUtil.createTables(getNonRefTableInfos(), createRegularTable);
        dbUtil.createTables(getRefTableInfos(), createRefTable);

        //step 3. create mime buffer
        this.mimeBuffer = new MimeBuffer(dbUtil.getConnection(), getMimeTable(),
                TikaConfig.getDefaultConfig());

        //step 4. populate the reference tables
        populateRefTables();

        return mimeBuffer;
    }

    public abstract FileResourceConsumer build() throws IOException, SQLException;

    protected abstract void updateTableInfosWithPrefixes(Map<String, String> attrs);

    /**
     * @return the reference tables only
     */
    protected abstract List<TableInfo> getRefTableInfos();

    /**
     * @return the main tables, not including the reference tables
     */
    protected abstract List<TableInfo> getNonRefTableInfos();

    protected abstract TableInfo getMimeTable();

    protected abstract void addErrorLogTablePairs(DBConsumersManager manager);

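    /**
     * Writes one row per enum constant into each reference table (parse error
     * types, parse exception types and extract exception types), using the
     * enum's ordinal as the id and its name as the description. If every ref
     * table already contains at least one row, this is a no-op.
     */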
    public void populateRefTables() throws IOException, SQLException {
        boolean refTablesPopulated = true;
        try {
            Connection connection = dbUtil.getConnection();
            for (TableInfo tableInfo : getRefTableInfos()) {
                int rows = 0;
                //use try-with-resources so the statement is closed, not just the result set
                try (Statement st = connection.createStatement();
                        ResultSet rs = st.executeQuery("select * from " + tableInfo.getName())) {
                    while (rs.next()) {
                        rows++;
                    }
                }
                if (rows == 0) {
                    refTablesPopulated = false;
                    break;
                }
            }
        } catch (SQLException e) {
            //swallow; if the existence check fails, leave the flag as-is
        }
        if (refTablesPopulated) {
            LOG.info("ref tables are already populated");
            return;
        }

        IDBWriter writer = getDBWriter(getRefTableInfos());
        Map<Cols, String> m = new HashMap<>();
        for (AbstractProfiler.PARSE_ERROR_TYPE t : AbstractProfiler.PARSE_ERROR_TYPE.values()) {
            m.clear();
            m.put(Cols.PARSE_ERROR_ID, Integer.toString(t.ordinal()));
            m.put(Cols.PARSE_ERROR_DESCRIPTION, t.name());
            writer.writeRow(AbstractProfiler.REF_PARSE_ERROR_TYPES, m);
        }

        for (AbstractProfiler.EXCEPTION_TYPE t : AbstractProfiler.EXCEPTION_TYPE.values()) {
            m.clear();
            m.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(t.ordinal()));
            m.put(Cols.PARSE_EXCEPTION_DESCRIPTION, t.name());
            writer.writeRow(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES, m);
        }

        for (ExtractReaderException.TYPE t :
                ExtractReaderException.TYPE.values()) {
            m.clear();
            m.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(t.ordinal()));
            m.put(Cols.EXTRACT_EXCEPTION_DESCRIPTION, t.name());
            writer.writeRow(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES, m);
        }
        writer.close();
    }

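    /**
     * @return a {@link DBWriter} for the given tables that shares this
     * builder's database connection and mime buffer
     */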
    protected IDBWriter getDBWriter(List<TableInfo> tableInfos) throws IOException, SQLException {
        Connection conn = dbUtil.getConnection();
        return new DBWriter(conn, tableInfos, dbUtil, mimeBuffer);
    }

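    /**
     * Maps the {@code alterExtract} attribute (case-insensitive {@code as_is},
     * {@code first_only} or {@code concatenate_content}; defaults to
     * {@code as_is} when unset) to the corresponding
     * {@link ExtractReader.ALTER_METADATA_LIST} value.
     */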
    ExtractReader.ALTER_METADATA_LIST getAlterMetadata(Map<String, String> localAttrs) {

        String alterExtractString = localAttrs.get("alterExtract");
        ExtractReader.ALTER_METADATA_LIST alterExtractList;
        if (alterExtractString == null || alterExtractString.equalsIgnoreCase("as_is")) {
            alterExtractList = ExtractReader.ALTER_METADATA_LIST.AS_IS;
        } else if (alterExtractString.equalsIgnoreCase("first_only")) {
            alterExtractList = ExtractReader.ALTER_METADATA_LIST.FIRST_ONLY;
        } else if (alterExtractString.equalsIgnoreCase("concatenate_content")) {
            alterExtractList = ExtractReader.ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST;
        } else {
            throw new RuntimeException("options for alterExtract: as_is, first_only, concatenate_content." +
                    " I don't understand: " + alterExtractString);
        }
        return alterExtractList;
    }

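    /**
     * Builds an {@link ExtractReader} from the {@code minExtractLength} and
     * {@code maxExtractLength} attributes; both fall back to -1 when unset.
     */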
    protected ExtractReader buildExtractReader(Map<String, String> localAttrs) {
        long minExtractLength = PropsUtil.getLong(localAttrs.get("minExtractLength"), -1L);
        long maxExtractLength = PropsUtil.getLong(localAttrs.get("maxExtractLength"), -1L);

        ExtractReader.ALTER_METADATA_LIST alterExtractList = getAlterMetadata(localAttrs);
        return new ExtractReader(alterExtractList, minExtractLength, maxExtractLength);
    }

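    /**
     * Applies the optional {@code maxContentLength}, {@code maxContentLengthForLangId}
     * and {@code maxTokens} attributes to the profiler. The sentinel -2 marks an
     * unset attribute; in that case the profiler's own default is left untouched.
     */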
    FileResourceConsumer parameterizeProfiler(AbstractProfiler abstractProfiler) {

        int maxContentLength = PropsUtil.getInt(localAttrs.get("maxContentLength"), -2);
        if (maxContentLength > -2) {
            abstractProfiler.setMaxContentLength(maxContentLength);
        }

        int maxContentLengthForLangId = PropsUtil.getInt(localAttrs.get("maxContentLengthForLangId"), -2);
        if (maxContentLengthForLangId > -2) {
            abstractProfiler.setMaxContentLengthForLangId(maxContentLengthForLangId);
        }

        int maxTokens = PropsUtil.getInt(localAttrs.get("maxTokens"), -2);
        if (maxTokens > -2) {
            abstractProfiler.setMaxTokens(maxTokens);
        }

        return abstractProfiler;
    }

/*
    public abstract Map getIndexInfo();

    class ValueComparator implements Comparator<String> {

        Map map;

        public ValueComparator(Map base) {
            this.map = base;
        }

        public int compare(String a, String b) {
            Integer aVal = map.get(a).getDBColOffset();
            Integer bVal = map.get(b).getDBColOffset();
            if (aVal == null || bVal == null) {
                throw new IllegalArgumentException("Column offset must be specified!");
            }
            if (aVal.equals(bVal) && !map.get(a).equals(map.get(b))) {
                throw new IllegalArgumentException("Column offsets must be unique: " + a + " and " + b + " both have: " + aVal);
            }
            if (aVal < bVal) {
                return -1;
            } else {
                return 1;
            }
        }
    }
*/
}