All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.parser.utils.CommonsDigester Maven / Gradle / Ivy

package org.apache.tika.parser.utils;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.io.IOExceptionWithCause;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.ParseContext;

/**
 * Implementation of {@link org.apache.tika.parser.DigestingParser.Digester}
 * that relies on commons.codec.digest.DigestUtils to calculate digest hashes.
 * 

* This digester tries to use the regular mark/reset protocol on the InputStream. * However, this wraps an internal BoundedInputStream, and if the InputStream * is not fully read, then this will reset the stream and * spool the InputStream to disk (via TikaInputStream) and then digest the file. *

* If a TikaInputStream is passed in and it has an underlying file that is longer * than the {@link #markLimit}, then this digester digests the file directly. * */ public class CommonsDigester implements DigestingParser.Digester { public enum DigestAlgorithm { //those currently available in commons.digest MD2, MD5, SHA1, SHA256, SHA384, SHA512; String getMetadataKey() { return TikaCoreProperties.TIKA_META_PREFIX+ "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER+this.toString(); } } private final List algorithms = new ArrayList(); private final int markLimit; public CommonsDigester(int markLimit, DigestAlgorithm... algorithms) { Collections.addAll(this.algorithms, algorithms); if (markLimit < 0) { throw new IllegalArgumentException("markLimit must be >= 0"); } this.markLimit = markLimit; } @Override public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException { //if this is already a TikaInputStream, rely on the caller to close //the stream and free the tmp file. TikaInputStream tis = TikaInputStream.cast(is); TemporaryResources tmp = null; if (tis == null) { //if this isn't a TikaInputStream, create a new TempResources //and make sure to release it!!! tmp = new TemporaryResources(); tis = TikaInputStream.get(new CloseShieldInputStream(is), tmp); } try { long sz = -1; if (tis.hasFile()) { sz = tis.getLength(); } //if the file is definitely a file, //and its size is greater than its mark limit, //just digest the underlying file. if (sz > markLimit) { digestFile(tis.getFile(), m); return; } //try the usual mark/reset stuff. 
//however, if you actually hit the bound, //then stop and spool to file via TikaInputStream SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, tis); boolean finishedStream = false; for (DigestAlgorithm algorithm : algorithms) { bis.mark(markLimit + 1); finishedStream = digestEach(algorithm, bis, m); bis.reset(); if (!finishedStream) { break; } } if (!finishedStream) { digestFile(tis.getFile(), m); } } finally { try { if (tmp != null) { tmp.dispose(); } } catch (TikaException e) { throw new IOExceptionWithCause(e); } } } private void digestFile(File f, Metadata m) throws IOException { for (DigestAlgorithm algorithm : algorithms) { InputStream is = new FileInputStream(f); try { digestEach(algorithm, is, m); } finally { IOUtils.closeQuietly(is); } } } /** * * @param algorithm algo to use * @param is input stream to read from * @param metadata metadata for reporting the digest * @return whether or not this finished the input stream * @throws IOException */ private boolean digestEach(DigestAlgorithm algorithm, InputStream is, Metadata metadata) throws IOException { String digest = null; try { switch (algorithm) { case MD2: digest = DigestUtils.md2Hex(is); break; case MD5: digest = DigestUtils.md5Hex(is); break; case SHA1: digest = DigestUtils.sha1Hex(is); break; case SHA256: digest = DigestUtils.sha256Hex(is); break; case SHA384: digest = DigestUtils.sha384Hex(is); break; case SHA512: digest = DigestUtils.sha512Hex(is); break; default: throw new IllegalArgumentException("Sorry, not aware of algorithm: " + algorithm.toString()); } } catch (IOException e) { e.printStackTrace(); //swallow, or should we throw this? 
} if (is instanceof SimpleBoundedInputStream) { if (((SimpleBoundedInputStream)is).hasHitBound()) { return false; } } metadata.set(algorithm.getMetadataKey(), digest); return true; } /** * * @param s comma-delimited (no space) list of algorithms to use: md5,sha256 * @return */ public static DigestAlgorithm[] parse(String s) { assert(s != null); List ret = new ArrayList<>(); for (String algoString : s.split(",")) { String uc = algoString.toUpperCase(Locale.ROOT); if (uc.equals(DigestAlgorithm.MD2.toString())) { ret.add(DigestAlgorithm.MD2); } else if (uc.equals(DigestAlgorithm.MD5.toString())) { ret.add(DigestAlgorithm.MD5); } else if (uc.equals(DigestAlgorithm.SHA1.toString())) { ret.add(DigestAlgorithm.SHA1); } else if (uc.equals(DigestAlgorithm.SHA256.toString())) { ret.add(DigestAlgorithm.SHA256); } else if (uc.equals(DigestAlgorithm.SHA384.toString())) { ret.add(DigestAlgorithm.SHA384); } else if (uc.equals(DigestAlgorithm.SHA512.toString())) { ret.add(DigestAlgorithm.SHA512); } else { StringBuilder sb = new StringBuilder(); int i = 0; for (DigestAlgorithm algo : DigestAlgorithm.values()) { if (i++ > 0) { sb.append(", "); } sb.append(algo.toString()); } throw new IllegalArgumentException("Couldn't match " + s + " with any of: " + sb.toString()); } } return ret.toArray(new DigestAlgorithm[ret.size()]); } /** * Very slight modification of Commons' BoundedInputStream * so that we can figure out if this hit the bound or not. */ private class SimpleBoundedInputStream extends InputStream { private final static int EOF = -1; private final long max; private final InputStream in; private long pos; private SimpleBoundedInputStream(long max, InputStream in) { this.max = max; this.in = in; } @Override public int read() throws IOException { if (max >= 0 && pos >= max) { return EOF; } final int result = in.read(); pos++; return result; } /** * Invokes the delegate's read(byte[]) method. 
* @param b the buffer to read the bytes into * @return the number of bytes read or -1 if the end of stream or * the limit has been reached. * @throws IOException if an I/O error occurs */ @Override public int read(final byte[] b) throws IOException { return this.read(b, 0, b.length); } /** * Invokes the delegate's read(byte[], int, int) method. * @param b the buffer to read the bytes into * @param off The start offset * @param len The number of bytes to read * @return the number of bytes read or -1 if the end of stream or * the limit has been reached. * @throws IOException if an I/O error occurs */ @Override public int read(final byte[] b, final int off, final int len) throws IOException { if (max>=0 && pos>=max) { return EOF; } final long maxRead = max>=0 ? Math.min(len, max-pos) : len; final int bytesRead = in.read(b, off, (int)maxRead); if (bytesRead==EOF) { return EOF; } pos+=bytesRead; return bytesRead; } /** * Invokes the delegate's skip(long) method. * @param n the number of bytes to skip * @return the actual number of bytes skipped * @throws IOException if an I/O error occurs */ @Override public long skip(final long n) throws IOException { final long toSkip = max>=0 ? Math.min(n, max-pos) : n; final long skippedBytes = in.skip(toSkip); pos+=skippedBytes; return skippedBytes; } @Override public void reset() throws IOException { in.reset(); } @Override public void mark(int readLimit) { in.mark(readLimit); } public boolean hasHitBound() { return pos >= max; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy