All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.rtf.RTFEmbObjHandler Maven / Gradle / Ivy

There is a newer version: 3.0.0-BETA2
Show newest version
package org.apache.tika.parser.rtf; 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.io.FilenameUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.RTFMetadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * This class buffers data from embedded objects and pictures.
 * 

*

*

* When the parser has finished an object or picture and called * {@link #handleCompletedObject()}, this will write the object * to the {@link #handler}. *

*

*

* This (in combination with TextExtractor) expects basically a flat parse. It will pull out * all pict whether they are tied to objdata or are intended * to be standalone. *

*

* This tries to pull metadata around a pict that is encoded * with {sp {sn} {sv}} types of data. This information * sometimes contains the name and even full file path of the original file. */ class RTFEmbObjHandler { private static final String EMPTY_STRING = ""; private final ContentHandler handler; private final ParseContext context; private final ByteArrayOutputStream os; //high hex cached for writing hexpair chars (data) private int hi = -1; private int thumbCount = 0; //don't need atomic, do need mutable private AtomicInteger unknownFilenameCount = new AtomicInteger(); private boolean inObject = false; private String sv = EMPTY_STRING; private String sn = EMPTY_STRING; private StringBuilder sb = new StringBuilder(); private Metadata metadata; private EMB_STATE state = EMB_STATE.NADA; protected RTFEmbObjHandler(ContentHandler handler, Metadata metadata, ParseContext context) { this.handler = handler; this.context = context; os = new ByteArrayOutputStream(); } protected void startPict() { state = EMB_STATE.PICT; metadata = new Metadata(); } protected void startObjData() { state = EMB_STATE.OBJDATA; metadata = new Metadata(); } protected void startSN() { sb.setLength(0); sb.append(RTFMetadata.RTF_PICT_META_PREFIX); } protected void endSN() { sn = sb.toString(); } protected void startSV() { sb.setLength(0); } protected void endSV() { sv = sb.toString(); } //end metadata pair protected void endSP() { metadata.add(sn, sv); } protected boolean getInObject() { return inObject; } protected void setInObject(boolean v) { inObject = v; } protected void writeMetadataChar(char c) { sb.append(c); } protected void writeHexChar(int b) throws IOException, TikaException { //if not hexchar, ignore //white space is common if (TextExtractor.isHexChar(b)) { if (hi == -1) { hi = 16 * TextExtractor.hexValue(b); } else { long sum = hi + TextExtractor.hexValue(b); if (sum > Integer.MAX_VALUE || sum < 0) { throw new IOException("hex char to byte overflow"); } os.write((int) sum); hi = -1; } return; } if (b == -1) { throw new TikaException("hit end of stream before finishing byte pair"); } } protected void writeBytes(InputStream is, int len) throws IOException, TikaException { if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) { throw new IOException("length of bytes to read out of bounds: " + len); } byte[] bytes = new byte[len]; int bytesRead = is.read(bytes); if (bytesRead < len) { throw new TikaException("unexpected end of file: need " + len + " bytes of binary data, found " + (len - bytesRead)); } os.write(bytes); } /** * Call this when the objdata/pict has completed * * @throws IOException * @throws SAXException * @throws TikaException */ protected void handleCompletedObject() throws IOException, SAXException, TikaException { EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class); if (embeddedExtractor == null) { embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); } byte[] bytes = os.toByteArray(); if (state == EMB_STATE.OBJDATA) { RTFObjDataParser objParser = new RTFObjDataParser(); try { byte[] objBytes = objParser.parse(bytes, metadata, unknownFilenameCount); extractObj(objBytes, handler, embeddedExtractor, metadata); } catch (IOException e) { //swallow. If anything goes wrong, ignore. } } else if (state == EMB_STATE.PICT) { String filePath = metadata.get(RTFMetadata.RTF_PICT_META_PREFIX + "wzDescription"); if (filePath != null && filePath.length() > 0) { metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filePath); metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(filePath)); } metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject)); extractObj(bytes, handler, embeddedExtractor, metadata); } else if (state == EMB_STATE.NADA) { //swallow...no start for pict or embed?! } reset(); } private void extractObj(byte[] bytes, ContentHandler handler, EmbeddedDocumentExtractor embeddedExtractor, Metadata metadata) throws SAXException, IOException, TikaException { if (bytes == null) { return; } metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(bytes.length)); if (embeddedExtractor.shouldParseEmbedded(metadata)) { TikaInputStream stream = TikaInputStream.get(bytes); if (metadata.get(Metadata.RESOURCE_NAME_KEY) == null) { String extension = getExtension(stream, metadata); stream.reset(); if (inObject && state == EMB_STATE.PICT) { metadata.set(Metadata.RESOURCE_NAME_KEY, "thumbnail_" + thumbCount++ + extension); metadata.set(RTFMetadata.THUMBNAIL, "true"); } else { metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + extension); } } try { embeddedExtractor.parseEmbedded( stream, new EmbeddedContentHandler(handler), metadata, false); } finally { stream.close(); } } } private String getExtension(TikaInputStream is, Metadata metadata) { String cType = metadata.get(Metadata.CONTENT_TYPE); TikaConfig config = getConfig(); if (cType == null) { Detector detector = config.getDetector(); try { MediaType mediaType = detector.detect(is, metadata); MimeTypes types = config.getMimeRepository(); MimeType mime = types.forName(mediaType.toString()); metadata.set(Metadata.CONTENT_TYPE, mime.toString()); return mime.getExtension(); } catch (IOException e) { //swallow } catch (MimeTypeException e) { } } return ".bin"; } private TikaConfig getConfig() { TikaConfig config = context.get(TikaConfig.class); if (config == null) { config = TikaConfig.getDefaultConfig(); } return config; } /** * reset state after each object. * Do not reset unknown file number. */ protected void reset() { state = EMB_STATE.NADA; os.reset(); metadata = new Metadata(); hi = -1; sv = EMPTY_STRING; sn = EMPTY_STRING; sb.setLength(0); } private enum EMB_STATE { PICT, //recording pict data OBJDATA, //recording objdata NADA } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy