All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.elasticsearch.mapper.attachments.TikaImpl Maven / Gradle / Ivy

Go to download

The mapper attachments plugin adds the attachment type to Elasticsearch using Apache Tika.

There is a newer version: 5.0.0-alpha5
Show newest version
package org.elasticsearch.mapper.attachments;

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import java.io.ByteArrayInputStream;
import java.io.FilePermission;
import java.io.IOException;
import java.lang.reflect.ReflectPermission;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLClassLoader;
import java.nio.file.Path;
import java.security.AccessControlContext;
import java.security.AccessController;
import java.security.PermissionCollection;
import java.security.Permissions;
import java.security.PrivilegedActionException;
import java.security.PrivilegedExceptionAction;
import java.security.ProtectionDomain;
import java.security.SecurityPermission;
import java.util.PropertyPermission;

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.elasticsearch.SpecialPermission;
import org.elasticsearch.bootstrap.JarHell;
import org.elasticsearch.common.SuppressForbidden;
import org.elasticsearch.common.io.PathUtils;

/**
 * Runs tika with limited parsers and limited permissions.
 * 

* Do NOT make public */ final class TikaImpl { /** subset of parsers for types we support */ private static final Parser PARSERS[] = new Parser[] { // documents new org.apache.tika.parser.html.HtmlParser(), new org.apache.tika.parser.rtf.RTFParser(), new org.apache.tika.parser.pdf.PDFParser(), new org.apache.tika.parser.txt.TXTParser(), new org.apache.tika.parser.microsoft.OfficeParser(), new org.apache.tika.parser.microsoft.OldExcelParser(), new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), new org.apache.tika.parser.odf.OpenDocumentParser(), new org.apache.tika.parser.iwork.IWorkPackageParser(), new org.apache.tika.parser.xml.DcXMLParser(), new org.apache.tika.parser.epub.EpubParser(), }; /** autodetector based on this subset */ private static final AutoDetectParser PARSER_INSTANCE = new AutoDetectParser(PARSERS); /** singleton tika instance */ private static final Tika TIKA_INSTANCE = new Tika(PARSER_INSTANCE.getDetector(), PARSER_INSTANCE); /** * parses with tika, throwing any exception hit while parsing the document */ // only package private for testing! static String parse(final byte content[], final Metadata metadata, final int limit) throws TikaException, IOException { // check that its not unprivileged code like a script SecurityManager sm = System.getSecurityManager(); if (sm != null) { sm.checkPermission(new SpecialPermission()); } try { return AccessController.doPrivileged(new PrivilegedExceptionAction() { @Override public String run() throws TikaException, IOException { return TIKA_INSTANCE.parseToString(new ByteArrayInputStream(content), metadata, limit); } }, RESTRICTED_CONTEXT); } catch (PrivilegedActionException e) { // checked exception from tika: unbox it Throwable cause = e.getCause(); if (cause instanceof TikaException) { throw (TikaException) cause; } else if (cause instanceof IOException) { throw (IOException) cause; } else { throw new AssertionError(cause); } } } // apply additional containment for parsers, this is intersected with the current permissions // its hairy, but worth it so we don't have some XML flaw reading random crap from the FS private static final AccessControlContext RESTRICTED_CONTEXT = new AccessControlContext( new ProtectionDomain[] { new ProtectionDomain(null, getRestrictedPermissions()) } ); // compute some minimal permissions for parsers. they only get r/w access to the java temp directory, // the ability to load some resources from JARs, and read sysprops static PermissionCollection getRestrictedPermissions() { Permissions perms = new Permissions(); // property/env access needed for parsing perms.add(new PropertyPermission("*", "read")); perms.add(new RuntimePermission("getenv.TIKA_CONFIG")); // add permissions for resource access: // classpath addReadPermissions(perms, JarHell.parseClassPath()); // plugin jars if (TikaImpl.class.getClassLoader() instanceof URLClassLoader) { addReadPermissions(perms, ((URLClassLoader)TikaImpl.class.getClassLoader()).getURLs()); } // jvm's java.io.tmpdir (needs read/write) perms.add(new FilePermission(System.getProperty("java.io.tmpdir") + System.getProperty("file.separator") + "-", "read,readlink,write,delete")); // current hacks needed for POI/PDFbox issues: perms.add(new SecurityPermission("putProviderProperty.BC")); perms.add(new SecurityPermission("insertProvider")); perms.add(new SecurityPermission("insertProvider.BC")); perms.add(new ReflectPermission("suppressAccessChecks")); // xmlbeans, use by POI, needs to get the context classloader perms.add(new RuntimePermission("getClassLoader")); perms.setReadOnly(); return perms; } // add resources to (what is typically) a jar, but might not be (e.g. in tests/IDE) @SuppressForbidden(reason = "adds access to jar resources") static void addReadPermissions(Permissions perms, URL resources[]) { try { for (URL url : resources) { Path path = PathUtils.get(url.toURI()); // resource itself perms.add(new FilePermission(path.toString(), "read,readlink")); // classes underneath perms.add(new FilePermission(path.toString() + System.getProperty("file.separator") + "-", "read,readlink")); } } catch (URISyntaxException bogus) { throw new RuntimeException(bogus); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy