All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.rtf.RTFEmbObjHandler Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
package org.apache.tika.parser.rtf; 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.RTFMetadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * This class buffers data from embedded objects and pictures.
 * 

*

*

* When the parser has finished an object or picture and called * {@link #handleCompletedObject()}, this will write the object * to the {@link #handler}. *

*

*

* This (in combination with TextExtractor) expects basically a flat parse. It will pull out * all pict whether they are tied to objdata or are intended * to be standalone. *

*

* This tries to pull metadata around a pict that is encoded * with {sp {sn} {sv}} types of data. This information * sometimes contains the name and even full file path of the original file. */ class RTFEmbObjHandler { private static final String EMPTY_STRING = ""; private final ContentHandler handler; private final EmbeddedDocumentUtil embeddedDocumentUtil; private final ByteArrayOutputStream os; //high hex cached for writing hexpair chars (data) private int hi = -1; private int thumbCount = 0; //don't need atomic, do need mutable private AtomicInteger unknownFilenameCount = new AtomicInteger(); private boolean inObject = false; private String sv = EMPTY_STRING; private String sn = EMPTY_STRING; private StringBuilder sb = new StringBuilder(); private Metadata metadata; private EMB_STATE state = EMB_STATE.NADA; private final int memoryLimitInKb; protected RTFEmbObjHandler(ContentHandler handler, Metadata metadata, ParseContext context, int memoryLimitInKb) { this.handler = handler; this.embeddedDocumentUtil = new EmbeddedDocumentUtil(context); os = new ByteArrayOutputStream(); this.memoryLimitInKb = memoryLimitInKb; } protected void startPict() { state = EMB_STATE.PICT; metadata = new Metadata(); } protected void startObjData() { state = EMB_STATE.OBJDATA; metadata = new Metadata(); } protected void startSN() { sb.setLength(0); sb.append(RTFMetadata.RTF_PICT_META_PREFIX); } protected void endSN() { sn = sb.toString(); } protected void startSV() { sb.setLength(0); } protected void endSV() { sv = sb.toString(); } //end metadata pair protected void endSP() { metadata.add(sn, sv); } protected boolean getInObject() { return inObject; } protected void setInObject(boolean v) { inObject = v; } protected void writeMetadataChar(char c) { sb.append(c); } protected void writeHexChar(int b) throws IOException, TikaException { //if not hexchar, ignore //white space is common if (TextExtractor.isHexChar(b)) { if (hi == -1) { hi = 16 * TextExtractor.hexValue(b); } else { long sum = hi + TextExtractor.hexValue(b); if (sum > Integer.MAX_VALUE || sum < 0) { throw new IOException("hex char to byte overflow"); } os.write((int) sum); hi = -1; } return; } if (b == -1) { throw new TikaException("hit end of stream before finishing byte pair"); } } protected void writeBytes(InputStream is, int len) throws IOException, TikaException { if (len < 0) { throw new TikaException("Requesting I read < 0 bytes ?!"); } if (len > memoryLimitInKb*1024) { throw new TikaMemoryLimitException(len, (memoryLimitInKb*1024)); } byte[] bytes = new byte[len]; IOUtils.readFully(is, bytes); os.write(bytes); } /** * Call this when the objdata/pict has completed * * @throws IOException * @throws SAXException * @throws TikaException */ protected void handleCompletedObject() throws IOException, SAXException, TikaException { byte[] bytes = os.toByteArray(); if (state == EMB_STATE.OBJDATA) { RTFObjDataParser objParser = new RTFObjDataParser(memoryLimitInKb); try { byte[] objBytes = objParser.parse(bytes, metadata, unknownFilenameCount); extractObj(objBytes, handler, metadata); } catch (IOException e) { EmbeddedDocumentUtil.recordException(e, metadata); } } else if (state == EMB_STATE.PICT) { String filePath = metadata.get(RTFMetadata.RTF_PICT_META_PREFIX + "wzDescription"); if (filePath != null && filePath.length() > 0) { metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filePath); metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(filePath)); metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, filePath); } metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject)); extractObj(bytes, handler, metadata); } else if (state == EMB_STATE.NADA) { //swallow...no start for pict or embed?! } reset(); } private void extractObj(byte[] bytes, ContentHandler handler, Metadata metadata) throws SAXException, IOException, TikaException { if (bytes == null) { return; } metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(bytes.length)); if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) { TikaInputStream stream = TikaInputStream.get(bytes); if (metadata.get(Metadata.RESOURCE_NAME_KEY) == null) { String extension = embeddedDocumentUtil.getExtension(stream, metadata); if (inObject && state == EMB_STATE.PICT) { metadata.set(Metadata.RESOURCE_NAME_KEY, "thumbnail_" + thumbCount++ + extension); metadata.set(RTFMetadata.THUMBNAIL, "true"); } else { metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + extension); } } try { embeddedDocumentUtil.parseEmbedded( stream, new EmbeddedContentHandler(handler), metadata, false); } catch (IOException e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); } finally { stream.close(); } } } /** * reset state after each object. * Do not reset unknown file number. */ protected void reset() { state = EMB_STATE.NADA; os.reset(); metadata = new Metadata(); hi = -1; sv = EMPTY_STRING; sn = EMPTY_STRING; sb.setLength(0); } private enum EMB_STATE { PICT, //recording pict data OBJDATA, //recording objdata NADA } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy