All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.io.TikaInputStream Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.io;

import static java.nio.file.StandardCopyOption.REPLACE_EXISTING;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Blob;
import java.sql.SQLException;

import org.apache.tika.metadata.Metadata;

/**
 * Input stream with extended capabilities. The purpose of this class is
 * to allow files and other resources and information to be associated with
 * the {@link InputStream} instance passed through the
 * {@link org.apache.tika.parser.Parser} interface and other similar APIs.
 * 

* TikaInputStream instances can be created using the various static * get() factory methods. Most of these methods take an optional * {@link Metadata} argument that is then filled with the available input * metadata from the given resource. The created TikaInputStream instance * keeps track of the original resource used to create it, while behaving * otherwise just like a normal, buffered {@link InputStream}. * A TikaInputStream instance is also guaranteed to support the * {@link #mark(int)} feature. *

* Code that wants to access the underlying file or other resources * associated with a TikaInputStream should first use the * {@link #get(InputStream)} factory method to cast or wrap a given * {@link InputStream} into a TikaInputStream instance. *

* TikaInputStream includes a few safety features to protect against parsers * that may fail to check for an EOF or may incorrectly rely on the unreliable * value returned from {@link FileInputStream#skip}. These parser failures * can lead to infinite loops. We strongly encourage the use of * TikaInputStream. * * @since Apache Tika 0.8 */ public class TikaInputStream extends TaggedInputStream { private static final int MAX_CONSECUTIVE_EOFS = 1000; /** * Checks whether the given stream is a TikaInputStream instance. * The given stream can be null, in which case the return * value is false. * * @param stream input stream, possibly null * @return true if the stream is a TikaInputStream instance, * false otherwise */ public static boolean isTikaInputStream(InputStream stream) { return stream instanceof TikaInputStream; } /** * Casts or wraps the given stream to a TikaInputStream instance. * This method can be used to access the functionality of this class * even when given just a normal input stream instance. *

* The given temporary file provider is used for any temporary files, * and should be disposed when the returned stream is no longer used. *

* Use this method instead of the {@link #get(InputStream)} alternative * when you don't explicitly close the returned stream. The * recommended access pattern is: *

     * try (TemporaryResources tmp = new TemporaryResources()) {
     *     TikaInputStream stream = TikaInputStream.get(..., tmp);
     *     // process stream but don't close it
     * }
     * 
*

* The given stream instance will not be closed when the * {@link TemporaryResources#close()} method is called by the * try-with-resources statement. The caller is expected to explicitly * close the original stream when it's no longer used. * * @since Apache Tika 0.10 * @param stream normal input stream * @return a TikaInputStream instance */ public static TikaInputStream get( InputStream stream, TemporaryResources tmp) { if (stream == null) { throw new NullPointerException("The Stream must not be null"); } if (stream instanceof TikaInputStream) { return (TikaInputStream) stream; } else { // Make sure that the stream is buffered and that it // (properly) supports the mark feature if (!(stream.markSupported())) { stream = new BufferedInputStream(stream); } return new TikaInputStream(stream, tmp, -1); } } /** * Casts or wraps the given stream to a TikaInputStream instance. * This method can be used to access the functionality of this class * even when given just a normal input stream instance. *

* Use this method instead of the * {@link #get(InputStream, TemporaryResources)} alternative when you * do explicitly close the returned stream. The recommended * access pattern is: *

     * try (TikaInputStream stream = TikaInputStream.get(...)) {
     *     // process stream
     * }
     * 
*

* The given stream instance will be closed along with any other resources * associated with the returned TikaInputStream instance when the * {@link #close()} method is called by the try-with-resources statement. * * @param stream normal input stream * @return a TikaInputStream instance */ public static TikaInputStream get(InputStream stream) { return get(stream, new TemporaryResources()); } /** * Returns the given stream casts to a TikaInputStream, or * null if the stream is not a TikaInputStream. * * @since Apache Tika 0.10 * @param stream normal input stream * @return a TikaInputStream instance */ public static TikaInputStream cast(InputStream stream) { if (stream instanceof TikaInputStream) { return (TikaInputStream) stream; } else { return null; } } /** * Creates a TikaInputStream from the given array of bytes. *

* Note that you must always explicitly close the returned stream as in * some cases it may end up writing the given data to a temporary file. * * @param data input data * @return a TikaInputStream instance */ public static TikaInputStream get(byte[] data) { return get(data, new Metadata()); } /** * Creates a TikaInputStream from the given array of bytes. The length of * the array is stored as input metadata in the given metadata instance. *

* Note that you must always explicitly close the returned stream as in * some cases it may end up writing the given data to a temporary file. * * @param data input data * @param metadata metadata instance * @return a TikaInputStream instance * @throws IOException */ public static TikaInputStream get(byte[] data, Metadata metadata) { metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(data.length)); return new TikaInputStream( new ByteArrayInputStream(data), new TemporaryResources(), data.length); } /** * Creates a TikaInputStream from the file at the given path. *

* Note that you must always explicitly close the returned stream to * prevent leaking open file handles. * * @param path input file * @return a TikaInputStream instance * @throws IOException if an I/O error occurs */ public static TikaInputStream get(Path path) throws IOException { return get(path, new Metadata()); } /** * Creates a TikaInputStream from the file at the given path. The file name * and length are stored as input metadata in the given metadata instance. *

* Note that you must always explicitly close the returned stream to * prevent leaking open file handles. * * @param path input file * @param metadata metadata instance * @return a TikaInputStream instance * @throws IOException if an I/O error occurs */ public static TikaInputStream get(Path path, Metadata metadata) throws IOException { metadata.set(Metadata.RESOURCE_NAME_KEY, path.getFileName().toString()); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(Files.size(path))); return new TikaInputStream(path); } /** * Creates a TikaInputStream from the given file. *

* Note that you must always explicitly close the returned stream to * prevent leaking open file handles. * * @param file input file * @return a TikaInputStream instance * @throws FileNotFoundException if the file does not exist * @deprecated use {@link #get(Path)}. In Tika 2.0, this will be removed * or modified to throw an IOException. */ @Deprecated public static TikaInputStream get(File file) throws FileNotFoundException { return get(file, new Metadata()); } /** * Creates a TikaInputStream from the given file. The file name and * length are stored as input metadata in the given metadata instance. *

* Note that you must always explicitly close the returned stream to * prevent leaking open file handles. * * @param file input file * @param metadata metadata instance * @return a TikaInputStream instance * @throws FileNotFoundException if the file does not exist * or cannot be opened for reading * @deprecated use {@link #get(Path, Metadata)}. In Tika 2.0, * this will be removed or modified to throw an IOException. */ @Deprecated public static TikaInputStream get(File file, Metadata metadata) throws FileNotFoundException { metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName()); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.length())); return new TikaInputStream(file); } /** * Creates a TikaInputStream from the given database BLOB. *

* Note that the result set containing the BLOB may need to be kept open * until the returned TikaInputStream has been processed and closed. * You must also always explicitly close the returned stream as in * some cases it may end up writing the blob data to a temporary file. * * @param blob database BLOB * @return a TikaInputStream instance * @throws SQLException if BLOB data can not be accessed */ public static TikaInputStream get(Blob blob) throws SQLException { return get(blob, new Metadata()); } /** * Blob size threshold that limits the largest BLOB size to be * buffered fully in memory by the {@link #get(Blob, Metadata)} * method. */ private static final int BLOB_SIZE_THRESHOLD = 1024 * 1024; /** * Creates a TikaInputStream from the given database BLOB. The BLOB * length (if available) is stored as input metadata in the given * metadata instance. *

* Note that the result set containing the BLOB may need to be kept open * until the returned TikaInputStream has been processed and closed. * You must also always explicitly close the returned stream as in * some cases it may end up writing the blob data to a temporary file. * * @param blob database BLOB * @param metadata metadata instance * @return a TikaInputStream instance * @throws SQLException if BLOB data can not be accessed */ public static TikaInputStream get(Blob blob, Metadata metadata) throws SQLException { long length = -1; try { length = blob.length(); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length)); } catch (SQLException ignore) { } // Prefer an in-memory buffer for reasonably sized blobs to reduce // the likelihood of problems caused by long-lived database accesses if (0 <= length && length <= BLOB_SIZE_THRESHOLD) { // the offset in Blob.getBytes() starts at 1 return get(blob.getBytes(1, (int) length), metadata); } else { return new TikaInputStream( new BufferedInputStream(blob.getBinaryStream()), new TemporaryResources(), length); } } /** * Creates a TikaInputStream from the resource at the given URI. *

* Note that you must always explicitly close the returned stream as in * some cases it may end up writing the resource to a temporary file. * * @param uri resource URI * @return a TikaInputStream instance * @throws IOException if the resource can not be accessed */ public static TikaInputStream get(URI uri) throws IOException { return get(uri, new Metadata()); } /** * Creates a TikaInputStream from the resource at the given URI. The * available input metadata is stored in the given metadata instance. *

* Note that you must always explicitly close the returned stream as in * some cases it may end up writing the resource to a temporary file. * * @param uri resource URI * @param metadata metadata instance * @return a TikaInputStream instance * @throws IOException if the resource can not be accessed */ public static TikaInputStream get(URI uri, Metadata metadata) throws IOException { // Special handling for file:// URIs if ("file".equalsIgnoreCase(uri.getScheme())) { Path path = Paths.get(uri); if (Files.isRegularFile(path)) { return get(path, metadata); } } return get(uri.toURL(), metadata); } /** * Creates a TikaInputStream from the resource at the given URL. *

* Note that you must always explicitly close the returned stream as in * some cases it may end up writing the resource to a temporary file. * * @param url resource URL * @return a TikaInputStream instance * @throws IOException if the resource can not be accessed */ public static TikaInputStream get(URL url) throws IOException { return get(url, new Metadata()); } /** * Creates a TikaInputStream from the resource at the given URL. The * available input metadata is stored in the given metadata instance. *

* Note that you must always explicitly close the returned stream as in * some cases it may end up writing the resource to a temporary file. * * @param url resource URL * @param metadata metadata instance * @return a TikaInputStream instance * @throws IOException if the resource can not be accessed */ public static TikaInputStream get(URL url, Metadata metadata) throws IOException { // Special handling for file:// URLs if ("file".equalsIgnoreCase(url.getProtocol())) { try { Path path = Paths.get(url.toURI()); if (Files.isRegularFile(path)) { return get(path, metadata); } } catch (URISyntaxException e) { // fall through } } URLConnection connection = url.openConnection(); String path = url.getPath(); int slash = path.lastIndexOf('/'); if (slash + 1 < path.length()) { // works even with -1! metadata.set(Metadata.RESOURCE_NAME_KEY, path.substring(slash + 1)); } String type = connection.getContentType(); if (type != null) { metadata.set(Metadata.CONTENT_TYPE, type); } String encoding = connection.getContentEncoding(); if (encoding != null) { metadata.set(Metadata.CONTENT_ENCODING, encoding); } int length = connection.getContentLength(); if (length >= 0) { metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length)); } return new TikaInputStream( new BufferedInputStream(connection.getInputStream()), new TemporaryResources(), length); } /** * The path to the file that contains the contents of this stream. * This is either the original file passed to the * {@link #TikaInputStream(Path)} constructor or a temporary file created * by a call to the {@link #getPath()} method. If neither has been called, * then the value is null. */ private Path path; /** * Tracker of temporary resources. */ private final TemporaryResources tmp; /** * Total length of the stream, or -1 if unknown. */ private long length; /** * Current read position within this stream. */ private long position = 0; /** * Marked position, or -1 if there is no current mark. */ private long mark = -1; /** * A opened container, such as a POIFS FileSystem * for an OLE2 document, or a Zip file for a * zip based (eg ooxml, odf) document. */ private Object openContainer; private int consecutiveEOFs = 0; private byte[] skipBuffer; /** * Creates a TikaInputStream instance. This private constructor is used * by the static factory methods based on the available information. * * @param path the path to the file that contains the stream * @throws IOException if an I/O error occurs */ private TikaInputStream(Path path) throws IOException { super(new BufferedInputStream(Files.newInputStream(path))); this.path = path; this.tmp = new TemporaryResources(); this.length = Files.size(path); } /** * Creates a TikaInputStream instance. This private constructor is used * by the static factory methods based on the available information. * * @param file the file that contains the stream * @throws FileNotFoundException if the file does not exist * @deprecated use {@link #TikaInputStream(Path)} */ @Deprecated private TikaInputStream(File file) throws FileNotFoundException { super(new BufferedInputStream(new FileInputStream(file))); this.path = file.toPath(); this.tmp = new TemporaryResources(); this.length = file.length(); } /** * Creates a TikaInputStream instance. This private constructor is used * by the static factory methods based on the available information. *

* The given stream needs to be included in the given temporary resource * collection if the caller wants it also to get closed when the * {@link #close()} method is invoked. * * @param stream buffered stream (must support the mark feature) * @param tmp tracker for temporary resources associated with this stream * @param length total length of the stream, or -1 if unknown */ private TikaInputStream( InputStream stream, TemporaryResources tmp, long length) { super(stream); this.path = null; this.tmp = tmp; this.length = length; } /** * Fills the given buffer with upcoming bytes from this stream without * advancing the current stream position. The buffer is filled up unless * the end of stream is encountered before that. This method will block * if not enough bytes are immediately available. * * @param buffer byte buffer * @return number of bytes written to the buffer * @throws IOException if the stream can not be read */ public int peek(byte[] buffer) throws IOException { int n = 0; mark(buffer.length); int m = read(buffer); while (m != -1) { n += m; if (n < buffer.length) { m = read(buffer, n, buffer.length - n); } else { m = -1; } } reset(); return n; } /** * Returns the open container object, such as a * POIFS FileSystem in the event of an OLE2 * document being detected and processed by * the OLE2 detector. */ public Object getOpenContainer() { return openContainer; } /** * Stores the open container object against * the stream, eg after a Zip contents * detector has loaded the file to decide * what it contains. */ public void setOpenContainer(Object container) { openContainer = container; if (container instanceof Closeable) { tmp.addResource((Closeable) container); } } public boolean hasFile() { return path != null; } /** * If the user created this TikaInputStream with a file, * the original file will be returned. If not, the entire stream * will be spooled to a temporary file which will be deleted * upon the close of this TikaInputStream * @return * @throws IOException */ public Path getPath() throws IOException { return getPath(-1); } /** * * @param maxBytes if this is less than 0 and if an underlying file doesn't already exist, * the full file will be spooled to disk * @return the original path used in the initialization of this TikaInputStream, * a temporary file if the stream was shorter than maxBytes, or null * if the underlying stream was longer than maxBytes. * @throws IOException */ public Path getPath(int maxBytes) throws IOException { if (path == null) { if (position > 0) { throw new IOException("Stream is already being read"); } else { Path tmpFile = tmp.createTempFile(); if (maxBytes > -1) { try (InputStream lookAhead = new LookaheadInputStream(in, maxBytes)) { Files.copy(lookAhead, tmpFile, REPLACE_EXISTING); if (Files.size(tmpFile) >= maxBytes) { //tmpFile will be cleaned up when this TikaInputStream is closed return null; } } } else { // Spool the entire stream into a temporary file Files.copy(in, tmpFile, REPLACE_EXISTING); } //successful so far, set tis' path to tmpFile path = tmpFile; // Create a new input stream and make sure it'll get closed InputStream newStream = Files.newInputStream(path); tmp.addResource(newStream); // Replace the spooled stream with the new stream in a way // that still ends up closing the old stream if or when the // close() method is called. The closing of the new stream // is already being handled as noted above. final InputStream oldStream = in; in = new BufferedInputStream(newStream) { @Override public void close() throws IOException { oldStream.close(); } }; length = Files.size(path); } } return path; } /** * @see #getPath() */ public File getFile() throws IOException { return getPath().toFile(); } public FileChannel getFileChannel() throws IOException { FileChannel channel = FileChannel.open(getPath()); tmp.addResource(channel); return channel; } public boolean hasLength() { return length != -1; } /** * Returns the length (in bytes) of this stream. Note that if the length * was not available when this stream was instantiated, then this method * will use the {@link #getPath()} method to buffer the entire stream to * a temporary file in order to calculate the stream length. This case * will only work if the stream has not yet been consumed. * * @return stream length * @throws IOException if the length can not be determined */ public long getLength() throws IOException { if (length == -1) { getPath(); // updates length internally } return length; } /** * Returns the current position within the stream. * * @return stream position */ public long getPosition() { return position; } /** * This relies on {@link IOUtils#skip(InputStream, long)} to ensure * that the alleged bytes skipped were actually skipped. * * @param ln the number of bytes to skip * @return the number of bytes skipped * @throws IOException if the number of bytes requested to be skipped does not match the number of bytes skipped * or if there's an IOException during the read. */ @Override public long skip(long ln) throws IOException { //On TIKA-3092, we found that using the static byte array buffer //caused problems with multithreading with the FlateInputStream //from a POIFS document stream if (skipBuffer == null) { skipBuffer = new byte[4096]; } long n = IOUtils.skip(super.in, ln, skipBuffer); position += n; return n; } @Override public void mark(int readlimit) { super.mark(readlimit); mark = position; } @Override public boolean markSupported() { return true; } @Override public void reset() throws IOException { super.reset(); position = mark; mark = -1; consecutiveEOFs = 0; } @Override public void close() throws IOException { path = null; mark = -1; // The close method was explicitly called, so we indeed // are expected to close the input stream. Handle that // by adding that stream as a resource to be tracked before // closing all of them. This way also possible exceptions from // the close() calls get managed properly. tmp.addResource(in); tmp.close(); } @Override protected void afterRead(int n) throws IOException { if (n != -1) { position += n; } else { consecutiveEOFs++; if (consecutiveEOFs > MAX_CONSECUTIVE_EOFS) { throw new IOException("Read too many -1 (EOFs); there could be an infinite loop." + "If you think your file is not corrupt, please open an issue on Tika's JIRA"); } } } public String toString() { String str = "TikaInputStream of "; if (hasFile()) { str += path.toString(); } else { str += in.toString(); } if (openContainer != null) { str += " (in " + openContainer + ")"; } return str; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy