All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.microsoft.JackcessParser Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.parser.microsoft;


import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.Locale;
import java.util.Set;

import com.healthmarketscience.jackcess.Database;
import com.healthmarketscience.jackcess.DatabaseBuilder;
import com.healthmarketscience.jackcess.crypt.CryptCodecProvider;
import com.healthmarketscience.jackcess.util.LinkResolver;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.UnsupportedFormatException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.sax.XHTMLContentHandler;

/**
 * Parser that handles Microsoft Access files via
 * Jackcess
 * 

* Many, many thanks to LexisNexis®/Health Market Science (HMS), Brian O'Neill, * and James Ahlborn for relicensing Jackcess to Apache v2.0! */ public class JackcessParser implements Parser { public static final String SUMMARY_PROPERTY_PREFIX = "MDB_SUMMARY_PROP" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; private final static LinkResolver IGNORE_LINK_RESOLVER = new IgnoreLinkResolver(); private static final long serialVersionUID = -752276948656079347L; private static final MediaType MEDIA_TYPE = MediaType.application("x-msaccess"); private static final Set SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE); //TODO: figure out how to get this info // public static Property LINKED_DATABASES = Property.externalTextBag("LinkedDatabases"); public static String MDB_PROPERTY_PREFIX = "MDB_PROP" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; public static String USER_DEFINED_PROPERTY_PREFIX = "MDB_USER_PROP" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; public static Property MDB_PW = Property.externalText("Password"); private Locale locale = Locale.ROOT; @Override public Set getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { TikaInputStream tis = TikaInputStream.get(stream); Database db = null; XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); String password = null; PasswordProvider passwordProvider = context.get(PasswordProvider.class); if (passwordProvider != null) { password = passwordProvider.getPassword(metadata); } try { if (password == null) { //do this to ensure encryption/wrong password exception vs. more generic //"need right codec" error message. db = new DatabaseBuilder(tis.getFile()).setCodecProvider(new CryptCodecProvider()) .setReadOnly(true).open(); } else { db = new DatabaseBuilder(tis.getFile()) .setCodecProvider(new CryptCodecProvider(password)).setReadOnly(true) .open(); } db.setLinkResolver(IGNORE_LINK_RESOLVER);//just in case JackcessExtractor ex = new JackcessExtractor(metadata, context, locale); ex.parse(db, xhtml); } catch (IOException e) { //TIKA-3849 if (e.getMessage() != null && e.getMessage().contains("Unrecognized map type: 75")) { throw new UnsupportedFormatException( "Jackcess doesn't process mdb versions before v97"); } throw e; } catch (IllegalStateException e) { if (e.getMessage() != null && e.getMessage().contains("Incorrect password")) { throw new EncryptedDocumentException(e); } throw e; } finally { if (db != null) { try { db.close(); } catch (IOException e) { //swallow = silent close } } } xhtml.endDocument(); } private static final class IgnoreLinkResolver implements LinkResolver { //If links are resolved, Jackcess might try to open and process //any file on the current system that is specified as a linked db. //This could be a nasty security issue. @Override public Database resolveLinkedDatabase(Database database, String s) throws IOException { throw new AssertionError("DO NOT ALLOW RESOLVING OF LINKS!!!"); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy