All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.ooxml;

import static org.apache.tika.sax.XHTMLContentHandler.XHTML;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.openxml4j.opc.internal.FileHelper;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.XMLReaderUtils;
import org.apache.xmlbeans.XmlException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/**
 * Base class for all Tika OOXML extractors.
 * 

* Tika extractors decorate POI extractors so that the parsed content of * documents is returned as a sequence of XHTML SAX events. Subclasses must * implement the buildXHTML method {@link #buildXHTML(XHTMLContentHandler)} that * populates the {@link XHTMLContentHandler} object received as parameter. */ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { static final String RELATION_AUDIO = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/audio"; static final String RELATION_MEDIA = "http://schemas.microsoft.com/office/2007/relationships/media"; static final String RELATION_VIDEO = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/video"; static final String RELATION_DIAGRAM_DATA = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/diagramData"; private static final String TYPE_OLE_OBJECT = "application/vnd.openxmlformats-officedocument.oleObject"; protected final static String[] EMBEDDED_RELATIONSHIPS = new String[]{ RELATION_AUDIO, PackageRelationshipTypes.IMAGE_PART, POIXMLDocument.PACK_OBJECT_REL_TYPE, PackageRelationshipTypes.CORE_DOCUMENT, RELATION_DIAGRAM_DATA }; private final EmbeddedDocumentExtractor embeddedExtractor; private final ParseContext context; protected OfficeParserConfig config; protected POIXMLTextExtractor extractor; public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) { this.context = context; this.extractor = extractor; embeddedExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); // This has already been set by OOXMLParser's call to configure() // We can rely on this being non-null. this.config = context.get(OfficeParserConfig.class); } /** * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getDocument() */ public POIXMLDocument getDocument() { return (POIXMLDocument)extractor.getDocument(); } /** * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor() */ public MetadataExtractor getMetadataExtractor() { return new MetadataExtractor(extractor); } /** * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(ContentHandler, Metadata, ParseContext) */ public void getXHTML( ContentHandler handler, Metadata metadata, ParseContext context) throws SAXException, XmlException, IOException, TikaException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); buildXHTML(xhtml); // Now do any embedded parts handleEmbeddedParts(handler, metadata); // thumbnail handleThumbnail(handler); xhtml.endDocument(); } protected String getJustFileName(String desc) { int idx = desc.lastIndexOf('/'); if (idx != -1) { desc = desc.substring(idx + 1); } idx = desc.lastIndexOf('.'); if (idx != -1) { desc = desc.substring(0, idx); } return desc; } private void handleThumbnail(ContentHandler handler) throws SAXException { try { OPCPackage opcPackage = extractor.getPackage(); for (PackageRelationship rel : opcPackage.getRelationshipsByType(PackageRelationshipTypes.THUMBNAIL)) { PackagePart tPart = opcPackage.getPart(rel); InputStream tStream = tPart.getInputStream(); Metadata thumbnailMetadata = new Metadata(); String thumbName = tPart.getPartName().getName(); thumbnailMetadata.set(Metadata.RESOURCE_NAME_KEY, thumbName); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute(XHTML, "class", "class", "CDATA", "embedded"); attributes.addAttribute(XHTML, "id", "id", "CDATA", thumbName); handler.startElement(XHTML, "div", "div", attributes); handler.endElement(XHTML, "div", "div"); thumbnailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, thumbName); thumbnailMetadata.set(Metadata.CONTENT_TYPE, tPart.getContentType()); thumbnailMetadata.set(TikaCoreProperties.TITLE, tPart.getPartName().getName()); thumbnailMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString()); if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) { embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream), new EmbeddedContentHandler(handler), thumbnailMetadata, false); } tStream.close(); } } catch (SecurityException e) { throw e; } catch (Exception ex) { //swallow unless write limit reached WriteLimitReachedException.throwIfWriteLimitReached(ex); } } private void handleEmbeddedParts(ContentHandler handler, Metadata metadata) throws TikaException, IOException, SAXException { //keep track of media items that have been handled //there can be multiple relationships pointing to the //same underlying media item. We only want to process //the underlying media item once. Set handledTarget = new HashSet<>(); try { for (PackagePart source : getMainDocumentParts()) { if (source == null) { //parts can go missing; silently ignore -- TIKA-2134 continue; } for (PackageRelationship rel : source.getRelationships()) { try { handleEmbeddedPart(source, rel, handler, metadata, handledTarget); } catch (SecurityException e) { throw e; } catch (Exception e) { WriteLimitReachedException.throwIfWriteLimitReached(e); EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); } } } } catch (InvalidFormatException e) { throw new TikaException("Broken OOXML file", e); } } private void handleEmbeddedPart(PackagePart source, PackageRelationship rel, ContentHandler handler, Metadata parentMetadata, Set handledTarget) throws IOException, SAXException, TikaException, InvalidFormatException { URI targetURI = rel.getTargetURI(); if (targetURI != null) { if (handledTarget.contains(targetURI.toString())) { return; } } URI sourceURI = rel.getSourceURI(); String sourceDesc; if (sourceURI != null) { sourceDesc = getJustFileName(sourceURI.getPath()); if (sourceDesc.startsWith("slide")) { sourceDesc += "_"; } else { sourceDesc = ""; } } else { sourceDesc = ""; } if (rel.getTargetMode() != TargetMode.INTERNAL) { return; } PackagePart target; try { target = source.getRelatedPart(rel); } catch (IllegalArgumentException ex) { return; } String type = rel.getRelationshipType(); if (POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type) && TYPE_OLE_OBJECT.equals(target.getContentType())) { handleEmbeddedOLE(target, handler, sourceDesc + rel.getId(), parentMetadata); handledTarget.add(targetURI.toString()); } else if ( RELATION_MEDIA.equals(type) || RELATION_VIDEO.equals(type) || RELATION_AUDIO.equals(type) || PackageRelationshipTypes.IMAGE_PART.equals(type) || POIXMLDocument.PACK_OBJECT_REL_TYPE.equals(type) || POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)) { handleEmbeddedFile(target, handler, sourceDesc + rel.getId()); handledTarget.add(targetURI.toString()); } else if (XSSFRelation.VBA_MACROS.getRelation().equals(type)) { handleMacros(target, handler); handledTarget.add(targetURI.toString()); } } /** * Handles an embedded OLE object in the document */ private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel, Metadata parentMetadata) throws IOException, SAXException { // A POIFSFileSystem needs to be at least 3 blocks big to be valid if (part.getSize() >= 0 && part.getSize() < 512 * 3) { // Too small, skip return; } InputStream is = part.getInputStream(); // Open the POIFS (OLE2) structure and process POIFSFileSystem fs = null; try { fs = new POIFSFileSystem(part.getInputStream()); } catch (Exception e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata); return; } TikaInputStream stream = null; try { Metadata metadata = new Metadata(); metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel); DirectoryNode root = fs.getRoot(); POIFSDocumentType type = POIFSDocumentType.detectType(root); if (root.hasEntry("\u0001Ole") && root.hasEntry("\u0001CompObj") && ( root.hasEntry("CONTENTS") || root.hasEntry("Package") )) { // TIKA-704: OLE 2.0 embedded non-Office document? //TODO: figure out if the equivalent of OLE 1.0's //getCommand() and getFileName() exist for OLE 2.0 to populate //TikaCoreProperties.ORIGINAL_RESOURCE_NAME if (root.hasEntry("CONTENTS")) { stream = TikaInputStream.get( fs.createDocumentInputStream("CONTENTS")); } else if (root.hasEntry("Package")) { //TIKA-2588 stream = TikaInputStream.get( fs.createDocumentInputStream("Package")); } else { throw new IllegalStateException("Shouldn't ever arrive here; please open a ticket on our jira"); } if (embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded( stream, new EmbeddedContentHandler(handler), metadata, false); } } else if (POIFSDocumentType.OLE10_NATIVE == type) { // TIKA-704: OLE 1.0 embedded document Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs); if (ole.getLabel() != null) { metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel()); } if (ole.getCommand() != null) { metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand()); } if (ole.getFileName() != null) { metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName()); } byte[] data = ole.getDataBuffer(); if (data != null) { stream = TikaInputStream.get(data); } if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded( stream, new EmbeddedContentHandler(handler), metadata, false); } } else { handleEmbeddedFile(part, handler, rel); } } catch (FileNotFoundException e) { // There was no CONTENTS entry, so skip this part } catch (Ole10NativeException e) { // Could not process an OLE 1.0 entry, so skip this part } catch (IOException e ) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata); } finally { if (fs != null) { fs.close(); } if (stream != null) { stream.close(); } } } /** * Handles an embedded file in the document */ protected void handleEmbeddedFile(PackagePart part, ContentHandler handler, String rel) throws SAXException, IOException { Metadata metadata = new Metadata(); metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel); // Get the name String name = part.getPartName().getName(); metadata.set( Metadata.RESOURCE_NAME_KEY, name.substring(name.lastIndexOf('/') + 1)); // Get the content type metadata.set( Metadata.CONTENT_TYPE, part.getContentType()); // Call the recursing handler if (embeddedExtractor.shouldParseEmbedded(metadata)) { try(TikaInputStream tis = TikaInputStream.get(part.getInputStream())) { embeddedExtractor.parseEmbedded( tis, new EmbeddedContentHandler(handler), metadata, false); } } } /** * Populates the {@link XHTMLContentHandler} object received as parameter. */ protected abstract void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, XmlException, IOException; /** * Return a list of the main parts of the document, used * when searching for embedded resources. * This should be all the parts of the document that end * up with things embedded into them. */ protected abstract List getMainDocumentParts() throws TikaException; void handleMacros(PackagePart macroPart, ContentHandler handler) throws TikaException, SAXException { OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class); if (officeParserConfig.getExtractMacros()) { try (InputStream is = macroPart.getInputStream()) { try (POIFSFileSystem poifs = new POIFSFileSystem(is)) { //Macro reading exceptions are already swallowed here OfficeParser.extractMacros(poifs, handler, embeddedExtractor); } } catch (IOException e) { throw new TikaException("Broken OOXML file", e); } } } /** * This is used by the SAX docx and pptx decorators to load hyperlinks and * other linked objects * * @param bodyPart * @return */ protected Map loadLinkedRelationships(PackagePart bodyPart, boolean includeInternal, Metadata metadata) { Map linkedRelationships = new HashMap<>(); try { PackageRelationshipCollection prc = bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation()); for (int i = 0; i < prc.size(); i++) { PackageRelationship pr = prc.getRelationship(i); if (pr == null) { continue; } if (! includeInternal && TargetMode.INTERNAL.equals(pr.getTargetMode())) { continue; } String id = pr.getId(); String url = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString(); if (id != null && url != null) { linkedRelationships.put(id, url); } } for (String rel : EMBEDDED_RELATIONSHIPS) { prc = bodyPart.getRelationshipsByType(rel); for (int i = 0; i < prc.size(); i++) { PackageRelationship pr = prc.getRelationship(i); if (pr == null) { continue; } String id = pr.getId(); String uriString = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString(); String fileName = uriString; if (pr.getTargetURI() != null) { try { fileName = FileHelper.getFilename(new File(fileName)); } catch (Exception e) { fileName = uriString; } } if (id != null) { fileName = (fileName == null) ? "" : fileName; linkedRelationships.put(id, fileName); } } } } catch (InvalidFormatException e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); } return linkedRelationships; } /** * This should handle the comments, master, notes, with the streaming "general docx/pptx handler" * * @param contentType * @param xhtmlClassLabel * @param parentPart * @param contentHandler */ void handleGeneralTextContainingPart(String contentType, String xhtmlClassLabel, PackagePart parentPart, Metadata parentMetadata, ContentHandler contentHandler) throws SAXException { PackageRelationshipCollection relatedPartPRC = null; try { relatedPartPRC = parentPart.getRelationshipsByType(contentType); } catch (InvalidFormatException e) { parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); } if (relatedPartPRC != null && relatedPartPRC.size() > 0) { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", xhtmlClassLabel); contentHandler.startElement("", "div", "div", attributes); for (int i = 0; i < relatedPartPRC.size(); i++) { PackageRelationship relatedPartPackageRelationship = relatedPartPRC.getRelationship(i); try { PackagePart relatedPartPart = parentPart.getRelatedPart(relatedPartPackageRelationship); try (InputStream stream = relatedPartPart.getInputStream()) { XMLReaderUtils.parseSAX(stream, new OfflineContentHandler(new EmbeddedContentHandler(contentHandler)), context); } catch (IOException|TikaException e) { parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); } } catch (InvalidFormatException e) { parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); } } contentHandler.endElement("", "div", "div"); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy