All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.microsoft.EMFParser Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 

* http://www.apache.org/licenses/LICENSE-2.0 *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.parser.microsoft; import java.io.IOException; import java.io.InputStream; import java.util.Collections; import java.util.Set; import org.apache.poi.hemf.record.emf.HemfComment; import org.apache.poi.hemf.record.emf.HemfRecord; import org.apache.poi.hemf.record.emf.HemfRecordType; import org.apache.poi.hemf.record.emf.HemfText; import org.apache.poi.hemf.usermodel.HemfPicture; import org.apache.poi.util.RecordFormatException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; /** * Extracts files embedded in EMF and offers a * very rough capability to extract text if there * is text stored in the EMF. *

* To improve text extraction, we'd have to implement * quite a bit more at the POI level. We'd want to track changes * in font and use that information for identifying character sets, * inserting spaces and new lines. */ public class EMFParser extends AbstractParser { private static final MediaType MEDIA_TYPE = MediaType.image("emf"); private static final MediaType WMF_MEDIA_TYPE = MediaType.image("wmf"); private static final Set SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE); @Override public Set getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { EmbeddedDocumentExtractor embeddedDocumentExtractor = null; XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); try { HemfPicture ex = new HemfPicture(stream); double lastY = -1; double lastX = -1; long fudgeFactorX = 1000;//derive this from the font or frame/bounds information StringBuilder buffer = new StringBuilder(); for (HemfRecord record : ex) { if (record.getEmfRecordType() == HemfRecordType.comment) { HemfComment.EmfCommentData commentData = ((HemfComment.EmfComment) record).getCommentData(); if (commentData instanceof HemfComment.EmfCommentDataMultiformats) { if (embeddedDocumentExtractor == null) { embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); } handleMultiFormats( (HemfComment.EmfCommentDataMultiformats)commentData, xhtml, embeddedDocumentExtractor); } else if (commentData instanceof HemfComment.EmfCommentDataWMF) { if (embeddedDocumentExtractor == null) { embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); } handleWMF(((HemfComment.EmfCommentDataWMF) commentData).getWMFData(), xhtml, embeddedDocumentExtractor); } } else if (record.getEmfRecordType().equals(HemfRecordType.extTextOutW)) { HemfText.EmfExtTextOutW extTextOutW = (HemfText.EmfExtTextOutW) record; //change equality to delta diff; if (lastY > -1 && lastY != extTextOutW.getReference().getY()) { xhtml.startElement("p"); xhtml.characters(buffer.toString()); xhtml.endElement("p"); buffer.setLength(0); lastX = -1; } if (lastX > -1 && extTextOutW.getReference().getX() - lastX > fudgeFactorX) { buffer.append(" "); } String txt = extTextOutW.getText(); buffer.append(txt); lastY = extTextOutW.getReference().getY(); lastX = extTextOutW.getReference().getX(); } } if (buffer.length() > 0) { xhtml.startElement("p"); xhtml.characters(buffer.toString()); xhtml.endElement("p"); } } catch (RecordFormatException e) { //POI's hemfparser can throw these for "parse exceptions" throw new TikaException(e.getMessage(), e); } catch (RuntimeException e) { //convert Runtime to RecordFormatExceptions throw new TikaException(e.getMessage(), e); } xhtml.endDocument(); } private void handleWMF(byte[] bytes, ContentHandler contentHandler, EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, SAXException, TikaException { Metadata embeddedMetadata = new Metadata(); embeddedMetadata.set(Metadata.CONTENT_TYPE, WMF_MEDIA_TYPE.toString()); if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { try (InputStream is = TikaInputStream.get(bytes)) { embeddedDocumentExtractor.parseEmbedded(is, new EmbeddedContentHandler(contentHandler), embeddedMetadata, false); } } } private void handleMultiFormats(HemfComment.EmfCommentDataMultiformats commentData, ContentHandler handler, EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, TikaException, SAXException { for (HemfComment.EmfCommentDataFormat dataFormat : commentData.getFormats()) { //is this right?! handleEmbedded(dataFormat.getRawData(), embeddedDocumentExtractor, handler); } } private static void handleEmbedded(byte[] data, EmbeddedDocumentExtractor embeddedDocumentExtractor, ContentHandler handler) throws TikaException, SAXException { try (InputStream is = TikaInputStream.get(data)) { Metadata embeddedMetadata = new Metadata(); if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { embeddedDocumentExtractor.parseEmbedded(is, new EmbeddedContentHandler(handler), embeddedMetadata, false); } } catch (IOException e) { } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy