All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.microsoft.AbstractPOIFSExtractor Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.apache.poi.hpsf.ClassID;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.StringUtil;
import org.xml.sax.SAXException;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.zip.DefaultZipContainerDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;

abstract class AbstractPOIFSExtractor {

    /** Name of the stream that {@code extractOCXName} looks up in an embedded storage. */
    private static final String OCX_NAME = "\u0003OCXNAME";
    /** Metadata of the parent/container document; {@code null} when none was supplied. */
    protected final Metadata parentMetadata;
    /** Office parser settings taken from the parse context, or a fresh default instance. */
    protected final OfficeParserConfig officeParserConfig;
    /** Parse context this extractor was created with. */
    protected final ParseContext context;
    /** Helper that decides whether embedded documents are parsed and performs the parsing. */
    private final EmbeddedDocumentUtil embeddedDocumentUtil;
    /** Password source taken from the parse context; {@code null} when none is registered. */
    private final PasswordProvider passwordProvider;

    protected AbstractPOIFSExtractor(ParseContext context) {
        this(context, null);
    }

    protected AbstractPOIFSExtractor(ParseContext context, Metadata parentMetadata) {
        embeddedDocumentUtil = new EmbeddedDocumentUtil(context);

        this.passwordProvider = context.get(PasswordProvider.class);
        this.officeParserConfig = context.get(OfficeParserConfig.class, new OfficeParserConfig());
        this.parentMetadata = parentMetadata;
        this.context = context;
    }

    /**
     * Returns the {@link TikaConfig} provided by the embedded document helper.
     * <p>
     * Note - this accessor and {@code getDetector()} cache, but avoid creating the default
     * TikaConfig if not needed.
     */
    protected TikaConfig getTikaConfig() {
        final TikaConfig config = embeddedDocumentUtil.getTikaConfig();
        return config;
    }

    /**
     * Returns the {@link Detector} provided by the embedded document helper.
     */
    protected Detector getDetector() {
        final Detector detector = embeddedDocumentUtil.getDetector();
        return detector;
    }

    /**
     * Looks up the password for this file.
     *
     * @return the password reported by the configured provider for the parent metadata,
     *         or {@code null} if no provider is configured (no / default password)
     */
    protected String getPassword() {
        return (passwordProvider == null) ? null : passwordProvider.getPassword(parentMetadata);
    }

    /**
     * Handles an embedded resource for which no storage class ID is known; delegates to the
     * overload that takes a {@link ClassID}, passing {@code null} for it.
     */
    protected void handleEmbeddedResource(TikaInputStream resource, String filename,
                                          String relationshipID, String mediaType,
                                          XHTMLContentHandler xhtml, boolean outputHtml)
            throws IOException, SAXException, TikaException {
        final ClassID noStorageClassId = null;
        handleEmbeddedResource(resource, filename, relationshipID, noStorageClassId, mediaType,
                xhtml, outputHtml);
    }

    /**
     * Handles an embedded resource using a freshly created, empty {@link Metadata} object
     * for the embedded document.
     */
    protected void handleEmbeddedResource(TikaInputStream resource, String filename,
                                          String relationshipID, ClassID storageClassID,
                                          String mediaType, XHTMLContentHandler xhtml,
                                          boolean outputHtml)
            throws IOException, SAXException, TikaException {
        Metadata embeddedMetadata = new Metadata();
        handleEmbeddedResource(resource, embeddedMetadata, filename, relationshipID,
                storageClassID, mediaType, xhtml, outputHtml);
    }

    /**
     * Records what is known about an embedded resource in {@code embeddedMetadata} and, if the
     * embedded document helper agrees that it should be parsed, parses it. The resource is
     * closed before this method returns, whether or not it was parsed.
     *
     * @param resource         stream of the embedded resource; always closed by this method
     * @param embeddedMetadata metadata object to fill in and pass to the embedded parser
     * @param filename         stored as the resource name, skipped when {@code null}
     * @param relationshipID   stored as the embedded relationship id, skipped when {@code null}
     * @param storageClassID   stored (as a string) as the storage class id, skipped when
     *                         {@code null}
     * @param mediaType        stored as the content type, skipped when {@code null}
     * @param xhtml            handler that receives the embedded content
     * @param outputHtml       passed through unchanged to the embedded document helper
     */
    protected void handleEmbeddedResource(TikaInputStream resource, Metadata embeddedMetadata,
                                          String filename, String relationshipID,
                                          ClassID storageClassID, String mediaType,
                                          XHTMLContentHandler xhtml, boolean outputHtml)
            throws IOException, SAXException, TikaException {
        try {
            // Only values that are actually known are written to the metadata.
            if (null != filename) {
                embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename);
            }
            if (null != relationshipID) {
                embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, relationshipID);
            }
            if (null != storageClassID) {
                String classIdText = storageClassID.toString();
                embeddedMetadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID, classIdText);
            }
            if (null != mediaType) {
                embeddedMetadata.set(Metadata.CONTENT_TYPE, mediaType);
            }

            boolean wanted = embeddedDocumentUtil.shouldParseEmbedded(embeddedMetadata);
            if (wanted) {
                embeddedDocumentUtil.parseEmbedded(resource, xhtml, embeddedMetadata, outputHtml);
            }
        } finally {
            resource.close();
        }
    }

    /**
     * Handle an office document that's embedded at the POIFS level, without supplying an
     * explicit resource name ({@code null} is passed on for it).
     */
    protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, XHTMLContentHandler xhtml,
                                           boolean outputHtml)
            throws IOException, SAXException, TikaException {
        final String noResourceName = null;
        handleEmbeddedOfficeDoc(dir, noResourceName, xhtml, outputHtml);
    }

    /**
     * Handle an office document that's embedded at the POIFS level.
     * <p>
     * A storage that contains a "Package" (or "package") stream is treated as a wrapped OOXML
     * document: its type is detected with a zip container detector and the stream is handed to
     * {@code handleEmbeddedResource}. Anything else is treated as regular OLE2 and dispatched
     * on the detected {@link POIFSDocumentType}.
     *
     * @param dir          storage that holds the embedded document
     * @param resourceName name to report for the embedded document; when {@code null} the name
     *                     of {@code dir} is used. NOTE(review): on the OOXML path no resource
     *                     name is set at all - confirm that this is intended
     * @param xhtml        handler that receives the embedded content
     * @param outputHtml   passed through unchanged to the embedded document helper
     */
    protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, String resourceName,
                                           XHTMLContentHandler xhtml, boolean outputHtml)
            throws IOException, SAXException, TikaException {

        // Is it an embedded OLE2 document, or an embedded OOXML document?
        //first try for ooxml
        Entry ooxml = dir.hasEntry("Package") ? dir.getEntry("Package") :
                (dir.hasEntry("package") ? dir.getEntry("package") : null);

        // Only a stream can carry the zip package. A storage that merely happens to be named
        // "Package" previously failed the DocumentEntry cast with a ClassCastException; it is
        // now handled by the regular OLE2 path below.
        if (ooxml instanceof DocumentEntry) {
            // It's OOXML (has a ZipFile):
            DocumentEntry packageEntry = (DocumentEntry) ooxml;
            Metadata metadata = new Metadata();
            metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(packageEntry.getSize()));
            try (TikaInputStream stream = TikaInputStream
                    .get(new DocumentInputStream(packageEntry))) {

                Detector detector = new DefaultZipContainerDetector();
                MediaType type = null;
                try {
                    type = detector.detect(stream, metadata);
                } catch (SecurityException e) {
                    throw e;
                } catch (Exception e) {
                    //if there's a stream error while detecting, give up
                    EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                    return;
                }
                // No filename is passed; the storage name is used as the relationship id.
                handleEmbeddedResource(stream, metadata, null, dir.getName(),
                        dir.getStorageClsid(), type.toString(), xhtml, outputHtml);
                return;
            }
        }

        // It's regular OLE2:

        // What kind of document is it?
        Metadata metadata = new Metadata();
        metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, dir.getName());
        if (dir.getStorageClsid() != null) {
            metadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID,
                    dir.getStorageClsid().toString());
        }
        POIFSDocumentType type = POIFSDocumentType.detectType(dir);
        String rName = (resourceName == null) ? dir.getName() : resourceName;
        extractOCXName(dir, metadata);
        if (type == POIFSDocumentType.OLE10_NATIVE) {
            handleOLENative(dir, type, rName, metadata, xhtml, outputHtml);
        } else if (type == POIFSDocumentType.COMP_OBJ) {
            handleCompObj(dir, type, rName, metadata, xhtml, outputHtml);
        } else {
            metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            //for Outlook try to use the title first so that we don't wind up with
            //__substg1.0_37...; if that doesn't exist, backoff to rName
            String baseName =
                    (type == POIFSDocumentType.OUTLOOK) ? tryToGetMsgTitle(dir, rName) : rName;
            setEmbeddedResourceName(metadata, baseName, type);
            parseEmbedded(dir, xhtml, metadata, outputHtml);
        }
    }

    /**
     * Stores {@code baseName}, followed by '.' and the type's extension when the type has one,
     * as the resource name; does nothing when {@code baseName} is blank.
     */
    private static void setEmbeddedResourceName(Metadata metadata, String baseName,
                                                POIFSDocumentType type) {
        if (StringUtils.isBlank(baseName)) {
            return;
        }
        String extension = type.getExtension();
        String resourceName =
                StringUtils.isBlank(extension) ? baseName : baseName + '.' + extension;
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, resourceName);
    }

    /**
     * Copies the name stored in the OCX name stream of {@code dir}, if there is one, into
     * {@code metadata} under {@link Office#OCX_NAME}.
     * <p>
     * The stream is read completely; everything except its last four bytes is decoded as
     * 0-terminated little-endian Unicode. Streams shorter than four bytes, or with an odd
     * number of remaining bytes, are ignored, as are I/O errors.
     */
    private void extractOCXName(DirectoryEntry dir, Metadata metadata) {
        boolean present = dir.hasEntry(OCX_NAME);
        if (!present) {
            return;
        }
        try {
            Entry ocxEntry = dir.getEntry(OCX_NAME);
            if (!ocxEntry.isDocumentEntry()) {
                return;
            }
            UnsynchronizedByteArrayOutputStream buffer =
                    UnsynchronizedByteArrayOutputStream.builder().get();
            try (DocumentInputStream in = new DocumentInputStream((DocumentEntry) ocxEntry)) {
                IOUtils.copy(in, buffer);
            }
            byte[] raw = buffer.toByteArray();
            // bytes that remain for the name once the final four bytes are left out
            int nameBytes = raw.length - 4;
            if (nameBytes < 0 || nameBytes % 2 != 0) {
                return;
            }
            // two bytes per character
            int nameChars = nameBytes / 2;
            metadata.set(Office.OCX_NAME,
                    StringUtil.getFromUnicodeLE0Terminated(raw, 0, nameChars));
        } catch (IOException e) {
            //log this?
        }
    }

    /**
     * Handles an embedded CompObj storage: locates its "contents" stream, detects what the
     * stream holds and passes it to the embedded document helper.
     * <p>
     * Nothing is parsed when the storage has no contents stream or when the stream cannot be
     * opened; failures to open are recorded on the parent metadata.
     *
     * @param dir        storage of the embedded object
     * @param type       detected POIFS type, used for the fallback file extension
     * @param rName      base resource name; the extension is appended to it
     * @param metadata   metadata of the embedded document, updated with content type,
     *                   resource name and content length
     * @param xhtml      handler that receives the embedded content
     * @param outputHtml passed through unchanged to the embedded document helper
     */
    private void handleCompObj(DirectoryEntry dir, POIFSDocumentType type, String rName,
                               Metadata metadata, XHTMLContentHandler xhtml, boolean outputHtml)
            throws IOException, SAXException {
        //TODO: figure out if the equivalent of OLE 1.0's
        //getCommand() and getFileName() exist for OLE 2.0 to populate
        //TikaCoreProperties.ORIGINAL_RESOURCE_NAME

        String contentsEntryName = getContentsEntryName(dir);
        if (contentsEntryName == null) {
            //log or record exception?
            return;
        }
        // Grab the contents and process
        Entry entry;
        try {
            entry = dir.getEntry(contentsEntryName);
        } catch (FileNotFoundException fnfe) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(fnfe, parentMetadata);
            return;
        }
        if (!(entry instanceof DocumentEntry)) {
            // A storage named "contents" cannot be read as a stream; casting it blindly
            // would abort the parse of the whole parent with a ClassCastException.
            return;
        }
        DocumentEntry contentsEntry = (DocumentEntry) entry;

        int length = contentsEntry.getSize();
        DocumentInputStream inp;
        try {
            inp = new DocumentInputStream(contentsEntry);
        } catch (SecurityException e) {
            throw e;
        } catch (Exception e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
            return;
        }
        try (TikaInputStream tis = TikaInputStream.get(inp)) {
            // Try to work out what it is
            MediaType mediaType = getDetector().detect(tis, metadata);

            // Fallback extension from the POIFS type. That extension carries no leading dot
            // (other code in this class joins it with '.'), so add the separator here too.
            String typeExtension = type.getExtension();
            String extension = StringUtils.isBlank(typeExtension) ? "" : "." + typeExtension;
            try {
                MimeType mimeType =
                        embeddedDocumentUtil.getMimeTypes().forName(mediaType.toString());
                // used as-is: assumed to already include the leading dot - see MimeType docs
                extension = mimeType.getExtension();
            } catch (MimeTypeException ignored) {
                // No details on this type are known; keep the fallback extension
            }

            // Record what we can do about it.
            // MediaType.getType() is only the top-level type (e.g. "application"); the
            // content type needs the full "type/subtype" string.
            metadata.set(Metadata.CONTENT_TYPE, mediaType.toString());
            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName + extension);
            metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
            parseEmbedded(dir, tis, xhtml, metadata, outputHtml);
        } finally {
            inp.close();
        }
    }

    /**
     * Finds the name of the "contents" entry of {@code dir}, whatever its capitalisation.
     * The two usual spellings are checked directly first; after that every entry name is
     * compared ignoring case. A "CorelDRAW" entry is not looked for.
     *
     * @return the entry name exactly as stored, or {@code null} if there is no such entry
     */
    private String getContentsEntryName(DirectoryEntry dir) {
        //TODO: modify getEntry to case insensitive when available in POI
        for (String usualName : new String[] {"CONTENTS", "Contents"}) {
            if (dir.hasEntry(usualName)) {
                return usualName;
            }
        }
        for (String entryName : dir.getEntryNames()) {
            if ("contents".equalsIgnoreCase(entryName)) {
                return entryName;
            }
        }
        return null;
    }


    /**
     * Handles an embedded OLE 1.0 native object: unwraps the Ole10Native record, records its
     * label, command and file name in {@code metadata} and parses the wrapped bytes.
     * <p>
     * If the record turns out not to be a valid Ole10Native record, the storage itself is
     * handed to the embedded document helper as an open container instead. Any other failure
     * while unwrapping is recorded on the parent metadata and the object is skipped.
     *
     * @param dir        storage of the embedded object
     * @param type       detected POIFS type (not used by this method)
     * @param rName      base resource name
     * @param metadata   metadata of the embedded document
     * @param xhtml      handler that receives the embedded content
     * @param outputHtml passed through unchanged to the embedded document helper
     */
    private void handleOLENative(DirectoryEntry dir, POIFSDocumentType type, String rName,
                                 Metadata metadata, XHTMLContentHandler xhtml, boolean outputHtml)
            throws IOException, SAXException {
        byte[] data = null;
        try {
            // Try to un-wrap the OLE10Native record:
            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
            if (ole.getLabel() != null) {
                metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
            } else {
                metadata.add(TikaCoreProperties.RESOURCE_NAME_KEY, rName);
            }
            if (ole.getCommand() != null) {
                metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
            }
            if (ole.getFileName() != null) {
                metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
            }
            data = ole.getDataBuffer();
            metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(data.length));
        } catch (Ole10NativeException ex) {
            // Not a valid OLE10Native record, skip the un-wrapping; data stays null
        } catch (SecurityException e) {
            throw e;
        } catch (Exception e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
            return;
        }
        if (data == null) {
            // There is no unwrapped payload. Passing the null buffer on to
            // TikaInputStream.get(byte[]) would fail with a NullPointerException, so offer
            // the storage itself to the embedded parser as an open container.
            parseEmbedded(dir, xhtml, metadata, outputHtml);
            return;
        }
        try (TikaInputStream tis = TikaInputStream.get(data)) {
            parseEmbedded(dir, tis, xhtml, metadata, outputHtml);
        }
    }

    /**
     * Parses an embedded document from the given stream, provided the embedded document helper
     * wants it parsed. The storage class id of {@code dir}, when present, is recorded in
     * {@code metadata} first.
     */
    private void parseEmbedded(DirectoryEntry dir, TikaInputStream tis, XHTMLContentHandler xhtml,
                               Metadata metadata, boolean outputHtml) throws IOException,
            SAXException {
        if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
            ClassID storageClsid = dir.getStorageClsid();
            if (storageClsid != null) {
                metadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID,
                        storageClsid.toString());
            }
            embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, outputHtml);
        }
    }

    /**
     * Parses an embedded document that is available only as a storage, provided the embedded
     * document helper wants it parsed. The storage is attached as the open container of an
     * otherwise empty stream, and its class id, when present, is recorded in {@code metadata}.
     */
    private void parseEmbedded(DirectoryEntry dir, XHTMLContentHandler xhtml, Metadata metadata,
                               boolean outputHtml)
            throws IOException, SAXException {
        boolean wanted = embeddedDocumentUtil.shouldParseEmbedded(metadata);
        if (!wanted) {
            return;
        }
        // The stream has no bytes of its own; it only carries the directory along.
        final byte[] noBytes = new byte[0];
        try (TikaInputStream containerOnly = TikaInputStream.get(noBytes)) {
            containerOnly.setOpenContainer(dir);
            ClassID storageClsid = dir.getStorageClsid();
            if (storageClsid != null) {
                metadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID,
                        storageClsid.toString());
            }
            embeddedDocumentUtil.parseEmbedded(containerOnly, xhtml, metadata, outputHtml);
        }
    }


    /**
     * Tries to read a title for an embedded message from a fixed list of candidate streams
     * of {@code node}, in order. At most 1000 bytes of a stream are read and decoded as
     * UTF-16LE.
     * <p>
     * Candidates that are missing, are not streams, cannot be read, or decode to a blank
     * string are skipped, so that a blank title no longer masks a later candidate or the
     * default value.
     *
     * @param node       storage of the embedded message
     * @param defaultVal value to return when no candidate yields a non-blank title
     * @return the first non-blank title found, otherwise {@code defaultVal}
     */
    public static String tryToGetMsgTitle(DirectoryEntry node, String defaultVal) {
        String[] candidates = {
                "__substg1.0_0037001F", "__substg1.0_0E1D001F", "__substg1.0_0070001F"};

        for (String entryName : candidates) {
            // Check first rather than relying on FileNotFoundException for the common case.
            if (!node.hasEntry(entryName)) {
                continue;
            }
            try {
                Entry entry = node.getEntry(entryName);
                if (!(entry instanceof DocumentEntry)) {
                    continue;
                }
                try (InputStream is = new BoundedInputStream(1000,
                        new DocumentInputStream((DocumentEntry) entry))) {
                    String title =
                            org.apache.commons.io.IOUtils.toString(is, StandardCharsets.UTF_16LE);
                    if (!StringUtils.isBlank(title)) {
                        return title;
                    }
                }
            } catch (IOException ignored) {
                // best effort: an unreadable candidate just means we try the next one
            }
        }
        return defaultVal;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy