// All downloads are free. Search and download functionality uses the official Maven repository.
//
// org.apache.tika.parser.microsoft.OfficeParser (Maven / Gradle / Ivy)

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.security.GeneralSecurityException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.apache.poi.poifs.crypt.Decryptor;
import org.apache.poi.poifs.crypt.EncryptionInfo;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.macros.VBAMacroReader;
import org.apache.poi.util.LocaleUtil;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.detect.microsoft.POIFSContainerDetector;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;

/**
 * Defines a Microsoft document content extractor.
 */
public class OfficeParser extends AbstractOfficeParser {

    /**
     * Serial version UID
     */
    private static final long serialVersionUID = 7393462244028653479L;

    private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet(
            new HashSet<>(Arrays.asList(POIFSDocumentType.WORKBOOK.type,
                    POIFSDocumentType.OLE10_NATIVE.type, POIFSDocumentType.WORDDOCUMENT.type,
                    POIFSDocumentType.UNKNOWN.type, POIFSDocumentType.ENCRYPTED.type,
                    POIFSDocumentType.DRMENCRYPTED.type,
                    POIFSDocumentType.POWERPOINT.type, POIFSDocumentType.PUBLISHER.type,
                    POIFSDocumentType.PROJECT.type, POIFSDocumentType.VISIO.type,
                    // Works isn't supported
                    POIFSDocumentType.XLR.type, // but Works 7.0 Spreadsheet is
                    POIFSDocumentType.OUTLOOK.type, POIFSDocumentType.SOLIDWORKS_PART.type,
                    POIFSDocumentType.SOLIDWORKS_ASSEMBLY.type,
                    POIFSDocumentType.SOLIDWORKS_DRAWING.type)));

    /**
     * Helper to extract macros from an NPOIFS/vbaProject.bin
     * 

* As of POI-3.15-final, there are still some bugs in VBAMacroReader. * For now, we are swallowing NPE and other runtime exceptions * * @param fs NPOIFS to extract from * @param xhtml SAX writer * @param embeddedDocumentExtractor extractor for embedded documents * @throws IOException on IOException if it occurs during the extraction of the embedded doc * @throws SAXException on SAXException for writing to xhtml */ public static void extractMacros(POIFSFileSystem fs, ContentHandler xhtml, EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, SAXException { VBAMacroReader reader = null; Map macros = null; try { reader = new VBAMacroReader(fs); macros = reader.readMacros(); } catch (SecurityException e) { throw e; } catch (Exception e) { Metadata m = new Metadata(); m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); m.set(Metadata.CONTENT_TYPE, "text/x-vbasic"); EmbeddedDocumentUtil.recordException(e, m); if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { embeddedDocumentExtractor.parseEmbedded( //pass in space character so that we don't trigger a zero-byte exception UnsynchronizedByteArrayInputStream.builder().setByteArray(new byte[]{'\u0020'}).get(), xhtml, m, true); } return; } for (Map.Entry e : macros.entrySet()) { Metadata m = new Metadata(); m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); m.set(Metadata.CONTENT_TYPE, "text/x-vbasic"); if (!StringUtils.isBlank(e.getKey())) { m.set(TikaCoreProperties.RESOURCE_NAME_KEY, e.getKey()); } if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { embeddedDocumentExtractor.parseEmbedded( UnsynchronizedByteArrayInputStream.builder().setByteArray(e.getValue().getBytes(StandardCharsets.UTF_8)).get(), xhtml, m, true); } } } public Set getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } /** * Extracts properties and text from an MS Document input stream */ public void 
parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { configure(context); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); final DirectoryNode root; TikaInputStream tstream = TikaInputStream.cast(stream); POIFSFileSystem mustCloseFs = null; boolean isDirectoryNode = false; try { if (tstream == null) { mustCloseFs = new POIFSFileSystem(CloseShieldInputStream.wrap(stream)); root = mustCloseFs.getRoot(); } else { final Object container = tstream.getOpenContainer(); if (container instanceof POIFSFileSystem) { root = ((POIFSFileSystem) container).getRoot(); } else if (container instanceof DirectoryNode) { root = (DirectoryNode) container; isDirectoryNode = true; } else { POIFSFileSystem fs = null; if (tstream.hasFile()) { fs = new POIFSFileSystem(tstream.getFile(), true); } else { fs = new POIFSFileSystem(CloseShieldInputStream.wrap(tstream)); } //tstream will close the fs, no need to close this below tstream.setOpenContainer(fs); root = fs.getRoot(); } } parse(root, context, metadata, xhtml); OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class); if (officeParserConfig.isExtractMacros()) { //now try to get macros. //Note that macros are handled separately for ppt in HSLFExtractor. //We might consider not bothering to check for macros in root, //if we know we're processing ppt based on content-type identified in metadata if (! 
isDirectoryNode) { // if the "root" is a directory node, we assume that the macros have already // been extracted from the parent's fileSystem -- TIKA-4116 extractMacros(root.getFileSystem(), xhtml, EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context)); } } } finally { IOUtils.closeQuietly(mustCloseFs); } xhtml.endDocument(); } protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { // Parse summary entries first, to make metadata available early new SummaryExtractor(metadata).parseSummaries(root); // Parse remaining document entries POIFSDocumentType type = POIFSDocumentType.detectType(root); if (type != POIFSDocumentType.UNKNOWN) { setType(metadata, type.getType()); } switch (type) { case SOLIDWORKS_PART: case SOLIDWORKS_ASSEMBLY: case SOLIDWORKS_DRAWING: break; case PUBLISHER: PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(root); xhtml.element("p", publisherTextExtractor.getText()); break; case WORDDOCUMENT: new WordExtractor(context, metadata).parse(root, xhtml); break; case POWERPOINT: new HSLFExtractor(context, metadata).parse(root, xhtml); break; case WORKBOOK: case XLR: Locale locale = context.get(Locale.class, LocaleUtil.getUserLocale()); new ExcelExtractor(context, metadata).parse(root, xhtml, locale); break; case PROJECT: // We currently can't do anything beyond the metadata break; case VISIO: VisioTextExtractor visioTextExtractor = new VisioTextExtractor(root); for (String text : visioTextExtractor.getAllText()) { xhtml.element("p", text); } break; case OUTLOOK: OutlookExtractor extractor = new OutlookExtractor(root, metadata, context); extractor.parse(xhtml); break; case ENCRYPTED: try { EncryptionInfo info = new EncryptionInfo(root); Decryptor d = Decryptor.getInstance(info); // By default, use the default Office Password String password = Decryptor.DEFAULT_PASSWORD; // If they supplied a Password Provider, ask that 
for the password, // and use the provider given one if available (stick with default if not) PasswordProvider passwordProvider = context.get(PasswordProvider.class); if (passwordProvider != null) { String suppliedPassword = passwordProvider.getPassword(metadata); if (suppliedPassword != null) { password = suppliedPassword; } } // Check if we've the right password or not if (!d.verifyPassword(password)) { throw new EncryptedDocumentException(); } // Decrypt the OLE2 stream, and delegate the resulting OOXML // file to the regular OOXML parser for normal handling OOXMLParser parser = new OOXMLParser(); try (TikaInputStream tis = TikaInputStream.get(d.getDataStream(root))) { parser.parse(tis, new EmbeddedContentHandler(new BodyContentHandler(xhtml)), metadata, context); } } catch (GeneralSecurityException ex) { throw new EncryptedDocumentException(ex); } catch (FileNotFoundException ex) { //this can happen because POI may not support case-insensitive ole2 object //lookups throw new EncryptedDocumentException(ex); } break; case DRMENCRYPTED: throw new EncryptedDocumentException("DRM encrypted document is not yet supported" + " by Apache POI"); default: if (root.hasEntry("EncryptedPackage")) { throw new EncryptedDocumentException("OLE2 file with an unrecognized " + "EncryptedPackage entry"); } // For unsupported / unhandled types, just the metadata // is extracted, which happened above break; } } private void setType(Metadata metadata, MediaType type) { metadata.set(Metadata.CONTENT_TYPE, type.toString()); } public enum POIFSDocumentType { WORKBOOK("xls", MediaType.application("vnd.ms-excel")), OLE10_NATIVE("ole", POIFSContainerDetector.OLE10_NATIVE), COMP_OBJ("ole", POIFSContainerDetector.COMP_OBJ), WORDDOCUMENT("doc", MediaType.application("msword")), UNKNOWN("unknown", MediaType.application("x-tika-msoffice")), DRMENCRYPTED("ole", MediaType.application("x-tika-ole-drm-encrypted")), ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")), POWERPOINT("ppt", 
MediaType.application("vnd.ms-powerpoint")), PUBLISHER("pub", MediaType.application("x-mspublisher")), PROJECT("mpp", MediaType.application("vnd.ms-project")), VISIO("vsd", MediaType.application("vnd.visio")), WORKS("wps", MediaType.application("vnd.ms-works")), XLR("xlr", MediaType.application("x-tika-msworks-spreadsheet")), OUTLOOK("msg", MediaType.application("vnd.ms-outlook")), SOLIDWORKS_PART("sldprt", MediaType.application("sldworks")), SOLIDWORKS_ASSEMBLY("sldasm", MediaType.application("sldworks")), SOLIDWORKS_DRAWING("slddrw", MediaType.application("sldworks")), GRAPH("", MediaType.application("vnd.ms-graph")); static Map TYPE_MAP = new HashMap<>(); static { for (POIFSDocumentType t : values()) { TYPE_MAP.put(t.type, t); } } private final String extension; private final MediaType type; POIFSDocumentType(String extension, MediaType type) { this.extension = extension; this.type = type; } public static POIFSDocumentType detectType(POIFSFileSystem fs) { return detectType(fs.getRoot()); } public static POIFSDocumentType detectType(DirectoryEntry node) { Set names = new HashSet<>(); for (Entry entry : node) { names.add(entry.getName()); } MediaType type = POIFSContainerDetector.detect(names, node); if (TYPE_MAP.containsKey(type)) { return TYPE_MAP.get(type); } return UNKNOWN; } public String getExtension() { return extension; } public MediaType getType() { return type; } } /** * Looks for entry within root (non-recursive) that has an upper-cased * name that equals ucTarget * @param root * @param ucTarget * @return */ public static Entry getUCEntry(DirectoryEntry root, String ucTarget) { Iterator it = root.getEntries(); while (it.hasNext()) { Entry e = it.next(); if (e.getName().toUpperCase(Locale.US).equals(ucTarget)) { return e; } } return null; } }





// © 2015 - 2025 Weber Informatics LLC | Privacy Policy