org.apache.tika.parser.microsoft.EMFParser Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.Set;
import org.apache.poi.hemf.extractor.HemfExtractor;
import org.apache.poi.hemf.record.AbstractHemfComment;
import org.apache.poi.hemf.record.HemfCommentPublic;
import org.apache.poi.hemf.record.HemfCommentRecord;
import org.apache.poi.hemf.record.HemfRecord;
import org.apache.poi.hemf.record.HemfRecordType;
import org.apache.poi.hemf.record.HemfText;
import org.apache.poi.util.RecordFormatException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* Extracts files embedded in EMF and offers a
* very rough capability to extract text if there
* is text stored in the EMF.
*
* To improve text extraction, we'd have to implement
* quite a bit more at the POI level. We'd want to track changes
* in font and use that information for identifying character sets,
* inserting spaces and new lines.
*/
public class EMFParser extends AbstractParser {
private static final MediaType MEDIA_TYPE = MediaType.image("emf");
private static final MediaType WMF_MEDIA_TYPE = MediaType.image("wmf");
private static final Set SUPPORTED_TYPES =
Collections.singleton(MEDIA_TYPE);
@Override
public Set getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
EmbeddedDocumentExtractor embeddedDocumentExtractor = null;
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
try {
HemfExtractor ex = new HemfExtractor(stream);
long lastY = -1;
long lastX = -1;
long fudgeFactorX = 1000;//derive this from the font or frame/bounds information
StringBuilder buffer = new StringBuilder();
for (HemfRecord record : ex) {
if (record.getRecordType() == HemfRecordType.comment) {
AbstractHemfComment comment = ((HemfCommentRecord) record).getComment();
if (comment instanceof HemfCommentPublic.MultiFormats) {
if (embeddedDocumentExtractor == null) {
embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
}
handleMultiFormats((HemfCommentPublic.MultiFormats)comment, xhtml, embeddedDocumentExtractor);
} else if (comment instanceof HemfCommentPublic.WindowsMetafile) {
if (embeddedDocumentExtractor == null) {
embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
}
handleWMF((HemfCommentPublic.WindowsMetafile)comment, xhtml, embeddedDocumentExtractor);
}
} else if (record.getRecordType().equals(HemfRecordType.exttextoutw)) {
HemfText.ExtTextOutW extTextOutW = (HemfText.ExtTextOutW) record;
if (lastY > -1 && lastY != extTextOutW.getY()) {
xhtml.startElement("p");
xhtml.characters(buffer.toString());
xhtml.endElement("p");
buffer.setLength(0);
lastX = -1;
}
if (lastX > -1 && extTextOutW.getX() - lastX > fudgeFactorX) {
buffer.append(" ");
}
String txt = extTextOutW.getText();
buffer.append(txt);
lastY = extTextOutW.getY();
lastX = extTextOutW.getX();
}
}
if (buffer.length() > 0) {
xhtml.startElement("p");
xhtml.characters(buffer.toString());
xhtml.endElement("p");
}
} catch (RecordFormatException e) { //POI's hemfparser can throw these for "parse exceptions"
throw new TikaException(e.getMessage(), e);
} catch (RuntimeException e) { //convert Runtime to RecordFormatExceptions
throw new TikaException(e.getMessage(), e);
}
xhtml.endDocument();
}
private void handleWMF(HemfCommentPublic.WindowsMetafile comment, ContentHandler contentHandler,
EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, SAXException, TikaException {
Metadata embeddedMetadata = new Metadata();
embeddedMetadata.set(Metadata.CONTENT_TYPE, WMF_MEDIA_TYPE.toString());
if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
try (InputStream is = TikaInputStream.get(comment.getWmfInputStream())) {
embeddedDocumentExtractor.parseEmbedded(is,
new EmbeddedContentHandler(contentHandler), embeddedMetadata, false);
}
}
}
private void handleMultiFormats(HemfCommentPublic.MultiFormats comment, ContentHandler handler,
EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, TikaException, SAXException {
for (HemfCommentPublic.HemfMultiFormatsData data :
((HemfCommentPublic.MultiFormats) comment).getData()) {
handleEmbedded(data.getData(), embeddedDocumentExtractor, handler);
}
}
private static void handleEmbedded(byte[] data,
EmbeddedDocumentExtractor embeddedDocumentExtractor,
ContentHandler handler) throws TikaException, SAXException {
try (InputStream is = TikaInputStream.get(data)) {
Metadata embeddedMetadata = new Metadata();
if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
embeddedDocumentExtractor.parseEmbedded(is,
new EmbeddedContentHandler(handler), embeddedMetadata, false);
}
} catch (IOException e) {
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy