
org.apache.tika.parser.pkg.PackageParser

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.pkg;


import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.ZipEntry;

import org.apache.commons.compress.PasswordRequiredException;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.StreamingNotSupportedException;
import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
import org.apache.commons.compress.archivers.dump.DumpArchiveInputStream;
import org.apache.commons.compress.archivers.jar.JarArchiveInputStream;
import org.apache.commons.compress.archivers.sevenz.SevenZFile;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.Feature;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.io.input.CloseShieldInputStream;

import org.apache.tika.config.Field;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/**
 * Parser for various packaging formats. Package entries will be written to
 * the XHTML event stream as &lt;div class="package-entry"&gt; elements that
 * contain the (optional) entry name as a &lt;h1&gt; element and the full
 * structured body content of the parsed entry.
 * <p>
 * User must have JCE Unlimited Strength jars installed for encryption to
 * work with 7Z files (see: COMPRESS-299 and TIKA-1521). If the jars
 * are not installed, an IOException will be thrown, and potentially
 * wrapped in a TikaException.
 */
public class PackageParser extends AbstractEncodingDetectorParser {

    /** Serial version UID */
    private static final long serialVersionUID = -5331043266963888708L;

    private static final MediaType ZIP = MediaType.APPLICATION_ZIP;
    private static final MediaType JAR = MediaType.application("java-archive");
    private static final MediaType AR = MediaType.application("x-archive");
    private static final MediaType ARJ = MediaType.application("x-arj");
    private static final MediaType CPIO = MediaType.application("x-cpio");
    private static final MediaType DUMP = MediaType.application("x-tika-unix-dump");
    private static final MediaType TAR = MediaType.application("x-tar");
    private static final MediaType SEVENZ = MediaType.application("x-7z-compressed");
    private static final MediaType TIKA_OOXML = MediaType.application("x-tika-ooxml");
    private static final MediaType GTAR = MediaType.application("x-gtar");
    private static final MediaType KMZ = MediaType.application("vnd.google-earth.kmz");

    private static final Set<MediaType> SUPPORTED_TYPES =
            MediaType.set(ZIP, JAR, AR, ARJ, CPIO, DUMP, TAR, SEVENZ);

    // We used to avoid overwriting file types if the file type
    // was a specialization of zip/tar. We determined specialization of zip
    // via TikaConfig at parse time.
    // However, TIKA-2483 showed that TikaConfig is not serializable
    // and this causes an exception in the ForkParser.
    // The following is an inelegant hack, but until we can serialize TikaConfig,
    // or dramatically rework the ForkParser to avoid serialization
    // of parsers, this is what we have.
    // There is at least a test in PackageParserTest that makes sure that we
    // keep this list updated.
    static final Set<MediaType> PACKAGE_SPECIALIZATIONS = loadPackageSpecializations();

    // the mark limit used for stream
    private static final int MARK_LIMIT = 100 * 1024 * 1024; // 100M

    static final Set<MediaType> loadPackageSpecializations() {
        Set<MediaType> zipSpecializations = new HashSet<>();
        for (String mediaTypeString : new String[]{
                // specializations of ZIP
                "application/bizagi-modeler",
                "application/epub+zip",
                "application/java-archive",
                "application/vnd.adobe.air-application-installer-package+zip",
                "application/vnd.android.package-archive",
                "application/vnd.apple.iwork",
                "application/vnd.apple.keynote",
                "application/vnd.apple.numbers",
                "application/vnd.apple.pages",
                "application/vnd.etsi.asic-e+zip",
                "application/vnd.etsi.asic-s+zip",
                "application/vnd.google-earth.kmz",
                "application/vnd.mindjet.mindmanager",
                "application/vnd.ms-excel.addin.macroenabled.12",
                "application/vnd.ms-excel.sheet.binary.macroenabled.12",
                "application/vnd.ms-excel.sheet.macroenabled.12",
                "application/vnd.ms-excel.template.macroenabled.12",
                "application/vnd.ms-powerpoint.addin.macroenabled.12",
                "application/vnd.ms-powerpoint.presentation.macroenabled.12",
                "application/vnd.ms-powerpoint.slide.macroenabled.12",
                "application/vnd.ms-powerpoint.slideshow.macroenabled.12",
                "application/vnd.ms-powerpoint.template.macroenabled.12",
                "application/vnd.ms-visio.drawing",
                "application/vnd.ms-visio.drawing.macroenabled.12",
                "application/vnd.ms-visio.stencil",
                "application/vnd.ms-visio.stencil.macroenabled.12",
                "application/vnd.ms-visio.template",
                "application/vnd.ms-visio.template.macroenabled.12",
                "application/vnd.ms-word.document.macroenabled.12",
                "application/vnd.ms-word.template.macroenabled.12",
                "application/vnd.ms-xpsdocument",
                "application/vnd.oasis.opendocument.formula",
                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                "application/vnd.openxmlformats-officedocument.presentationml.slide",
                "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
                "application/vnd.openxmlformats-officedocument.presentationml.template",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.template",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
                "application/x-ibooks+zip",
                "application/x-itunes-ipa",
                "application/x-tika-iworks-protected",
                "application/x-tika-java-enterprise-archive",
                "application/x-tika-java-web-archive",
                "application/x-tika-ooxml",
                "application/x-tika-ooxml-protected",
                "application/x-tika-visio-ooxml",
                "application/x-xliff+zip",
                "application/x-xmind",
                "model/vnd.dwfx+xps",
                "application/vnd.sun.xml.calc",
                "application/vnd.sun.xml.writer",
                "application/vnd.sun.xml.writer.template",
                "application/vnd.sun.xml.draw",
                "application/vnd.sun.xml.impress",
                "application/vnd.openofficeorg.autotext",
                "application/vnd.adobe.indesign-idml-package",
                "application/x-gtar" // specialization of tar
        }) {
            zipSpecializations.add(MediaType.parse(mediaTypeString));
        }
        return Collections.unmodifiableSet(zipSpecializations);
    }

    @Deprecated
    static MediaType getMediaType(ArchiveInputStream stream) {
        if (stream instanceof JarArchiveInputStream) {
            return JAR;
        } else if (stream instanceof ZipArchiveInputStream) {
            return ZIP;
        } else if (stream instanceof ArArchiveInputStream) {
            return AR;
        } else if (stream instanceof CpioArchiveInputStream) {
            return CPIO;
        } else if (stream instanceof DumpArchiveInputStream) {
            return DUMP;
        } else if (stream instanceof TarArchiveInputStream) {
            return TAR;
        } else if (stream instanceof SevenZWrapper) {
            return SEVENZ;
        } else {
            return MediaType.OCTET_STREAM;
        }
    }

    static MediaType getMediaType(String name) {
        if (ArchiveStreamFactory.JAR.equals(name)) {
            return JAR;
        } else if (ArchiveStreamFactory.ZIP.equals(name)) {
            return ZIP;
        } else if (ArchiveStreamFactory.AR.equals(name)) {
            return AR;
        } else if (ArchiveStreamFactory.ARJ.equals(name)) {
            return ARJ;
        } else if (ArchiveStreamFactory.CPIO.equals(name)) {
            return CPIO;
        } else if (ArchiveStreamFactory.DUMP.equals(name)) {
            return DUMP;
        } else if (ArchiveStreamFactory.TAR.equals(name)) {
            return TAR;
        } else if (ArchiveStreamFactory.SEVEN_Z.equals(name)) {
            return SEVENZ;
        } else {
            return MediaType.OCTET_STREAM;
        }
    }

    static boolean isZipArchive(MediaType type) {
        return type.equals(ZIP) || type.equals(JAR);
    }

    private boolean detectCharsetsInEntryNames = true;

    public PackageParser() {
        super();
    }

    public PackageParser(EncodingDetector encodingDetector) {
        super(encodingDetector);
    }

    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {

        // Ensure that the stream supports the mark feature
        if (!stream.markSupported()) {
            stream = new BufferedInputStream(stream);
        }

        TemporaryResources tmp = new TemporaryResources();
        ArchiveInputStream ais = null;
        String encoding = null;
        try {
            ArchiveStreamFactory factory =
                    context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
            encoding = factory.getEntryEncoding();
            // At the end we want to close the archive stream to release
            // any associated resources, but the underlying document stream
            // should not be closed
            ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
        } catch (StreamingNotSupportedException sne) {
            // Most archive formats work on streams, but a few need files
            if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
                // Rework as a file, and wrap
                stream.reset();
                TikaInputStream tstream = TikaInputStream.get(stream, tmp);

                // Seven Zip supports passwords, was one given?
                String password = null;
                PasswordProvider provider = context.get(PasswordProvider.class);
                if (provider != null) {
                    password = provider.getPassword(metadata);
                }

                SevenZFile sevenz;
                try {
                    if (password == null) {
                        sevenz = new SevenZFile(tstream.getFile());
                    } else {
                        sevenz = new SevenZFile(tstream.getFile(),
                                password.getBytes("UnicodeLittleUnmarked"));
                    }
                } catch (PasswordRequiredException e) {
                    throw new EncryptedDocumentException(e);
                }

                // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
                ais = new SevenZWrapper(sevenz);
            } else {
                tmp.close();
                throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
            }
        } catch (ArchiveException e) {
            tmp.close();
            throw new TikaException("Unable to unpack document stream", e);
        }

        updateMediaType(ais, metadata);

        // Use the delegate parser to parse the contained document
        EmbeddedDocumentExtractor extractor =
                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        // mark before we start parsing entries for potential reset
        stream.mark(MARK_LIMIT);

        // require mutable int...this is not being used for thread safety
        AtomicInteger entryCnt = new AtomicInteger(0);
        try {
            parseEntries(false, ais, metadata, extractor, xhtml, entryCnt);
        } catch (UnsupportedZipFeatureException zfe) {
            // If this is a zip archive which requires a data descriptor, parse it again
            if (zfe.getFeature() == Feature.DATA_DESCRIPTOR) {
                // Close archive input stream and create a new one that could handle data descriptor
                ais.close();
                // An exception would be thrown if MARK_LIMIT is not big enough
                stream.reset();
                ais = new ZipArchiveInputStream(new CloseShieldInputStream(stream),
                        encoding, true, true);
                parseEntries(true, ais, metadata, extractor, xhtml, entryCnt);
            }
        } finally {
            ais.close();
            tmp.close();
        }

        xhtml.endDocument();
    }

    /**
     * Parse the entries of the zip archive
     *
     * @param shouldUseDataDescriptor indicates if a data descriptor is required or not
     * @param ais archive input stream
     * @param metadata document metadata (input and output)
     * @param extractor the delegate parser
     * @param xhtml the xhtml handler
     * @param entryCnt number of entries already parsed on a previous pass
     * @throws TikaException if the document could not be parsed
     * @throws IOException if an UnsupportedZipFeatureException is met
     * @throws SAXException if the SAX events could not be processed
     */
    private void parseEntries(boolean shouldUseDataDescriptor, ArchiveInputStream ais,
                              Metadata metadata, EmbeddedDocumentExtractor extractor,
                              XHTMLContentHandler xhtml, AtomicInteger entryCnt)
            throws TikaException, IOException, SAXException {
        try {
            ArchiveEntry entry = ais.getNextEntry();
            while (entry != null) {
                if (shouldUseDataDescriptor && entryCnt.get() > 0) {
                    // With shouldUseDataDescriptor being true, we are reading
                    // the zip once again. The first entryCnt entries have
                    // already been parsed on the previous pass, so we can just
                    // skip these entries.
                    entryCnt.decrementAndGet();
                    entry = ais.getNextEntry();
                    continue;
                }

                if (!entry.isDirectory()) {
                    parseEntry(ais, entry, extractor, metadata, xhtml);
                }

                if (!shouldUseDataDescriptor) {
                    // Record the number of entries we have read; this is used
                    // for zip archives using a Data Descriptor, so that we can
                    // skip the entries we have already read
                    entryCnt.incrementAndGet();
                }

                entry = ais.getNextEntry();
            }
        } catch (UnsupportedZipFeatureException zfe) {
            // If it's an encrypted document of unknown password, report as such
            if (zfe.getFeature() == Feature.ENCRYPTION) {
                throw new EncryptedDocumentException(zfe);
            }
            if (zfe.getFeature() == Feature.DATA_DESCRIPTOR) {
                throw zfe;
            }
            // Otherwise throw the exception
            throw new TikaException("UnsupportedZipFeature", zfe);
        } catch (PasswordRequiredException pre) {
            throw new EncryptedDocumentException(pre);
        }
    }

    private void updateMediaType(ArchiveInputStream ais, Metadata metadata) {
        MediaType type = getMediaType(ais);
        if (type.equals(MediaType.OCTET_STREAM)) {
            return;
        }

        // now see if the user or an earlier step has passed in a content type
        String incomingContentTypeString = metadata.get(Metadata.CONTENT_TYPE);
        if (incomingContentTypeString == null) {
            metadata.set(Metadata.CONTENT_TYPE, type.toString());
            return;
        }

        MediaType incomingMediaType = MediaType.parse(incomingContentTypeString);
        if (incomingMediaType == null) {
            metadata.set(Metadata.CONTENT_TYPE, type.toString());
            return;
        }

        if (!PACKAGE_SPECIALIZATIONS.contains(incomingMediaType)) {
            metadata.set(Metadata.CONTENT_TYPE, type.toString());
        }
    }

    private void parseEntry(
            ArchiveInputStream archive, ArchiveEntry entry,
            EmbeddedDocumentExtractor extractor, Metadata parentMetadata,
            XHTMLContentHandler xhtml)
            throws SAXException, IOException, TikaException {
        String name = entry.getName();

        // Try to detect the charset of the archive entry in case a non-unicode filename is used
        if (detectCharsetsInEntryNames && entry instanceof ZipArchiveEntry) {
            Charset candidate = getEncodingDetector().detect(
                    new ByteArrayInputStream(((ZipArchiveEntry) entry).getRawName()),
                    parentMetadata);
            if (candidate != null) {
                name = new String(((ZipArchiveEntry) entry).getRawName(), candidate);
            }
        }

        if (archive.canReadEntryData(entry)) {
            // Fetch the metadata on the entry contained in the archive
            Metadata entrydata = handleEntryMetadata(name, null,
                    entry.getLastModifiedDate(), entry.getSize(), xhtml);

            // Recurse into the entry if desired
            if (extractor.shouldParseEmbedded(entrydata)) {
                // For detectors to work, we need a mark/reset supporting
                // InputStream, which ArchiveInputStream isn't, so wrap
                TemporaryResources tmp = new TemporaryResources();
                try {
                    TikaInputStream tis = TikaInputStream.get(archive, tmp);
                    extractor.parseEmbedded(tis, xhtml, entrydata, true);
                } finally {
                    tmp.dispose();
                }
            }
        } else {
            name = (name == null) ? "" : name;
            if (entry instanceof ZipArchiveEntry) {
                ZipArchiveEntry zipArchiveEntry = (ZipArchiveEntry) entry;
                boolean usesEncryption = zipArchiveEntry.getGeneralPurposeBit().usesEncryption();
                if (usesEncryption) {
                    EmbeddedDocumentUtil.recordEmbeddedStreamException(
                            new EncryptedDocumentException("stream (" + name + ") is encrypted"),
                            parentMetadata);
                }
                // do not write to the handler if UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR
                // is met; we will catch this exception and read the zip archive once again
                boolean usesDataDescriptor =
                        zipArchiveEntry.getGeneralPurposeBit().usesDataDescriptor();
                if (usesDataDescriptor && zipArchiveEntry.getMethod() == ZipEntry.STORED) {
                    throw new UnsupportedZipFeatureException(
                            UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR,
                            zipArchiveEntry);
                }
            } else {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(
                        new TikaException("Can't read archive stream (" + name + ")"),
                        parentMetadata);
            }
            if (name.length() > 0) {
                xhtml.element("p", name);
            }
        }
    }

    protected static Metadata handleEntryMetadata(
            String name, Date createAt, Date modifiedAt, Long size,
            XHTMLContentHandler xhtml)
            throws SAXException, IOException, TikaException {
        Metadata entrydata = new Metadata();
        if (createAt != null) {
            entrydata.set(TikaCoreProperties.CREATED, createAt);
        }
        if (modifiedAt != null) {
            entrydata.set(TikaCoreProperties.MODIFIED, modifiedAt);
        }
        if (size != null) {
            entrydata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
        }
        if (name != null && name.length() > 0) {
            name = name.replace("\\", "/");
            entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
            AttributesImpl attributes = new AttributesImpl();
            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
            attributes.addAttribute("", "id", "id", "CDATA", name);
            xhtml.startElement("div", attributes);
            xhtml.endElement("div");

            entrydata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name);
        }
        return entrydata;
    }

    // Pending a fix for COMPRESS-269, we have to wrap ourselves
    private static class SevenZWrapper extends ArchiveInputStream {
        private SevenZFile file;

        private SevenZWrapper(SevenZFile file) {
            this.file = file;
        }

        @Override
        public int read() throws IOException {
            return file.read();
        }

        @Override
        public int read(byte[] b) throws IOException {
            return file.read(b);
        }

        @Override
        public int read(byte[] b, int off, int len) throws IOException {
            return file.read(b, off, len);
        }

        @Override
        public ArchiveEntry getNextEntry() throws IOException {
            return file.getNextEntry();
        }

        @Override
        public void close() throws IOException {
            file.close();
        }
    }

    /**
     * Whether or not to run the default charset detector against entry
     * names in ZipFiles. The default is true.
     *
     * @param detectCharsetsInEntryNames
     */
    @Field
    public void setDetectCharsetsInEntryNames(boolean detectCharsetsInEntryNames) {
        this.detectCharsetsInEntryNames = detectCharsetsInEntryNames;
    }
}
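For context, below is a minimal usage sketch, not part of the Tika source above, showing how this parser might be invoked directly. The archive path and password are placeholders; registering an AutoDetectParser in the ParseContext is an assumed setup step so that embedded entries are recursively parsed rather than only listed, and supplying a PasswordProvider covers the encrypted 7z case described in the class Javadoc.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.pkg.PackageParser;
import org.apache.tika.sax.BodyContentHandler;

public class PackageParserExample {

    public static void main(String[] args) throws Exception {
        PackageParser parser = new PackageParser();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();

        // Register a delegate parser so the embedded document extractor
        // recurses into the entries; without one, little more than entry
        // names is written to the XHTML stream.
        context.set(Parser.class, new AutoDetectParser());

        // Optional: provide a password for encrypted 7z archives. On older
        // JREs this also requires the JCE Unlimited Strength policy jars,
        // as noted in the class Javadoc.
        context.set(PasswordProvider.class, new PasswordProvider() {
            @Override
            public String getPassword(Metadata m) {
                return "secret"; // placeholder password
            }
        });

        // "archive.zip" is a placeholder path
        try (InputStream stream = Files.newInputStream(Paths.get("archive.zip"))) {
            // -1 disables the default write limit on extracted text
            BodyContentHandler handler = new BodyContentHandler(-1);
            parser.parse(stream, handler, metadata, context);
            System.out.println(handler.toString());
        }
    }
}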




