All downloads are free. Search and download functionality uses the official Maven repository.

fr.pilato.elasticsearch.crawler.fs.tika.TikaDocParser Maven / Gradle / Ivy

There is a newer version: 2.5
Show newest version
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package fr.pilato.elasticsearch.crawler.fs.tika;

import fr.pilato.elasticsearch.crawler.fs.meta.doc.Doc;
import fr.pilato.elasticsearch.crawler.fs.meta.settings.FsSettings;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.tika.metadata.Metadata;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.util.ArrayList;
import java.util.Base64;
import java.util.List;

import static fr.pilato.elasticsearch.crawler.fs.tika.TikaInstance.tika;

/**
 * Parses a binary document with Tika and copies the extracted text and
 * metadata into a FSCrawler {@link Doc}.
 * <p>
 * The class only exposes static methods and holds no state besides its logger.
 */
public class TikaDocParser {

    private static final Logger logger = LogManager.getLogger(TikaDocParser.class);

    /** Number of characters extracted when no indexed chars setting is defined. */
    private static final int DEFAULT_INDEXED_CHARS = 100000;

    /**
     * Extracts text and metadata from {@code data} and stores them in {@code doc}.
     * <p>
     * Extraction is best effort: if Tika fails, the failure is logged at debug
     * level, the content is left {@code null} and whatever metadata was collected
     * before the failure is still copied to the document.
     *
     * @param fsSettings    settings driving the extraction (indexed chars, file size,
     *                      raw metadata and source storage options)
     * @param data          raw bytes of the document to parse
     * @param filename      name of the document, only used in log messages
     * @param doc           document to populate; its file and meta parts are updated in place
     * @param messageDigest digest used to compute the checksum of {@code data}, or
     *                      {@code null} when no checksum is wanted
     * @throws UnsupportedEncodingException never thrown by the current implementation;
     *                      kept in the signature so that existing callers still compile
     */
    public static void generate(FsSettings fsSettings, byte[] data, String filename, Doc doc, MessageDigest messageDigest) throws UnsupportedEncodingException {
        // Extracting content with Tika
        // See #38: https://github.com/dadoonet/fscrawler/issues/38
        int indexedChars = resolveIndexedChars(fsSettings, data.length);
        Metadata metadata = new Metadata();

        String parsedContent = null;
        InputStream dataStream = new ByteArrayInputStream(data);

        try {
            // Set the maximum length of strings returned by the parseToString method, -1 sets no limit
            parsedContent = tika().parseToString(dataStream, metadata, indexedChars);
        } catch (Throwable e) {
            // Deliberately broad: a document we cannot parse must not stop the caller,
            // we simply index it without content.
            logger.debug("Failed to extract [{}] characters of text for [{}]", indexedChars, filename, e);
        }

        // Adding what we found to the document we want to index

        // File
        doc.getFile().setContentType(metadata.get(Metadata.CONTENT_TYPE));

        // We only add `indexed_chars` if we have other value than default or -1
        if (fsSettings.getFs().getIndexedChars() != null && fsSettings.getFs().getIndexedChars().value() != -1) {
            doc.getFile().setIndexedChars(indexedChars);
        }

        if (fsSettings.getFs().isAddFilesize()) {
            // The file size is only set here when Tika reported a content length;
            // otherwise the value already present in the document is left untouched.
            String contentLength = metadata.get(Metadata.CONTENT_LENGTH);
            if (contentLength != null) {
                doc.getFile().setFilesize(Long.parseLong(contentLength));
            }
        }

        if (messageDigest != null) {
            logger.trace("Generating hash with [{}]", messageDigest.getAlgorithm());
            // The checksum is computed on the whole byte array instead of on the stream
            // handed to the parser: the parser is not guaranteed to consume the stream
            // entirely (extraction limit reached, parsing failure), which would produce
            // a checksum covering only a part of the document.
            doc.getFile().setChecksum(toHex(messageDigest.digest(data)));
        }
        // File

        // Meta
        doc.getMeta().setAuthor(metadata.get(Metadata.AUTHOR));
        doc.getMeta().setTitle(metadata.get(Metadata.TITLE));
        // TODO Fix that as the date we get from Tika might be not parseable as a Date
        // doc.getMeta().setDate(metadata.get(Metadata.DATE));
        doc.getMeta().setKeywords(commaDelimitedListToStringArray(metadata.get(Metadata.KEYWORDS)));

        if (fsSettings.getFs().isRawMetadata()) {
            logger.trace("Listing all available metadata:");
            for (String metadataName : metadata.names()) {
                String value = metadata.get(metadataName);
                // This is a logger trick which helps to generate our unit tests
                // You need to change test/resources/log4j2.xml fr.pilato.elasticsearch.crawler.fs.tika level to trace
                logger.trace("  assertThat(raw, hasEntry(\"{}\", \"{}\"));", metadataName, value);
                doc.getMeta().addRaw(metadataName, value);
            }
        }
        // Meta

        // Doc content
        doc.setContent(parsedContent);

        // Doc as binary attachment
        if (fsSettings.getFs().isStoreSource()) {
            // Base64 output is plain ASCII, so no charset needs to be named here
            doc.setAttachment(Base64.getEncoder().encodeToString(data));
        }
        // End of our document
    }

    /**
     * Computes the maximum number of characters Tika should extract.
     *
     * @param fsSettings settings holding the optional indexed chars value
     * @param dataLength size in bytes of the document, used when the setting is a percentage
     * @return the default limit when the setting is absent, the rounded share of
     *         {@code dataLength} when the setting is a percentage, the configured
     *         value otherwise ({@code -1} meaning no limit)
     */
    private static int resolveIndexedChars(FsSettings fsSettings, int dataLength) {
        if (fsSettings.getFs().getIndexedChars() == null) {
            return DEFAULT_INDEXED_CHARS;
        }

        int indexedChars;
        if (fsSettings.getFs().getIndexedChars().percentage()) {
            indexedChars = (int) Math.round(dataLength * fsSettings.getFs().getIndexedChars().asDouble());
            logger.trace("using percentage [{}] to define indexed chars: [{}]",
                    fsSettings.getFs().getIndexedChars(), indexedChars);
        } else {
            indexedChars = (int) fsSettings.getFs().getIndexedChars().value();
            logger.trace("indexed chars [{}]",
                    indexedChars == -1 ? "has been disabled. All text will be extracted" : indexedChars);
        }
        return indexedChars;
    }

    /**
     * Converts bytes to their lowercase hexadecimal representation, two digits per byte.
     *
     * @param bytes bytes to convert
     * @return the hexadecimal string, empty when {@code bytes} is empty
     */
    private static String toHex(byte[] bytes) {
        StringBuilder hex = new StringBuilder(bytes.length * 2);
        for (byte b : bytes) {
            hex.append(Character.forDigit((b >> 4) & 0xf, 16));
            hex.append(Character.forDigit(b & 0xf, 16));
        }
        return hex.toString();
    }

    /**
     * Splits a comma delimited string into its elements.
     * <p>
     * Elements are returned as they are: they are not trimmed and empty elements
     * are kept, so {@code "a,,b"} gives {@code ["a", "", "b"]} and {@code "a,"}
     * gives {@code ["a", ""]}.
     *
     * @param str the string to split, may be {@code null}
     * @return a new mutable list of the elements, empty when {@code str} is
     *         {@code null} or empty
     */
    public static List<String> commaDelimitedListToStringArray(String str) {
        List<String> result = new ArrayList<>();
        if (str == null || str.isEmpty()) {
            return result;
        }
        int pos = 0;
        int delPos;
        while ((delPos = str.indexOf(',', pos)) != -1) {
            result.add(str.substring(pos, delPos));
            pos = delPos + 1;
        }
        // Add rest of String (possibly empty when the input ends with a comma)
        result.add(str.substring(pos));
        return result;
    }


}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy