// org.apache.tika.embedder.Embedder - Maven / Gradle / Ivy
// Show all versions of aem-sdk-api - Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.embedder;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.util.Set;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
/**
 * Tika embedder interface.
 * <p>
 * An embedder takes the metadata held in a {@link Metadata} object and
 * embeds it into a document, writing the result to an output stream.
 *
 * @since Apache Tika 1.3
 */
public interface Embedder extends Serializable {

    /**
     * Returns the set of media types supported by this embedder when used with
     * the given parse context.
     * <p>
     * The name differs from the precedent of {@link Parser#getSupportedTypes(ParseContext)}
     * so that parser implementations may also choose to implement this interface.
     *
     * @param context parse context
     * @return immutable set of media types
     */
    Set<MediaType> getSupportedEmbedTypes(ParseContext context);

    /**
     * Embeds related document metadata from the given metadata object into the
     * given output stream.
     * <p>
     * The given document stream is consumed but not closed by this method. The
     * responsibility to close the stream remains on the caller.
     * <p>
     * Information about the parsing context can be passed in the context
     * parameter. See the parser implementations for the kinds of context
     * information they expect.
     * <p>
     * In general implementations should favor preserving the source file's metadata
     * unless an update to a field is explicitly defined in the Metadata object.
     * More specifically:
     * <ul>
     *  <li>Embedder implementations should only attempt to update metadata fields
     *  present in the given Metadata object. Other fields should be left untouched.</li>
     *  <li>Embedder implementations should set properties as empty when the
     *  corresponding field in the Metadata object is an empty string, i.e. ""</li>
     *  <li>Embedder implementations should nullify or delete properties
     *  corresponding to fields with a null value in the given Metadata object.</li>
     *  <li>Embedder implementations should set the property
     *  corresponding to a particular field in the given Metadata object in all
     *  metadata containers whenever possible and appropriate for the file format at the time.
     *  If a particular metadata container falls out of use and/or is superseded by another
     *  (such as IIM vs XMP for IPTC) it is up to the implementation to decide if and when
     *  to cease embedding in the alternate container.</li>
     *  <li>Embedder implementations should attempt to embed as much of the metadata
     *  as accurately as possible. An implementation may choose a strict approach
     *  and throw an exception if a value to be embedded exceeds the length allowed
     *  or may choose to truncate the value.</li>
     * </ul>
     *
     * @param metadata document metadata (input and output)
     * @param originalStream the document stream (input)
     * @param outputStream the output stream to write the metadata embedded data to
     * @param context parse context
     * @throws IOException if the document stream could not be read
     * @throws TikaException if the document could not be parsed
     */
    void embed(Metadata metadata, InputStream originalStream,
            OutputStream outputStream, ParseContext context)
            throws IOException, TikaException;
}