All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.detect.zip.FrictionlessPackageDetector Maven / Gradle / Ivy

There is a newer version: 3.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.detect.zip;

import java.io.IOException;
import java.io.InputStream;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipFile;

import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MediaType;

/**
 * Zip container detector that recognises two package flavours by looking only
 * at entry names:
 * <ul>
 *   <li>{@code application/x-vnd.datapackage+zip} when a root-level
 *       {@code datapackage.json} is the only recognised part, and</li>
 *   <li>{@code application/x-wacz} when {@code datapackage.json} is accompanied
 *       by an {@code archive/} entry, or by both {@code indexes/} and
 *       {@code pages/} entries.</li>
 * </ul>
 * Entry contents are never read; detection is purely name based.
 */
public class FrictionlessPackageDetector implements ZipContainerDetector {

    private static final MediaType WACZ = MediaType.application("x-wacz");
    private static final MediaType DATA_PACKAGE = MediaType.application("x-vnd.datapackage+zip");

    /** The kinds of entries this detector keeps track of. */
    private enum PARTS {
        /** A root-level {@code datapackage.json}. */
        PACKAGE,
        /** Any entry whose name starts with {@code archive/}. */
        WACZ_ARCHIVE,
        /** Any entry whose name starts with {@code indexes/}. */
        WACZ_INDEXES,
        /** Any entry whose name starts with {@code pages/}. */
        WACZ_PAGES
    }

    /**
     * Detects the media type from a fully available zip file by walking its entries.
     *
     * @param zip the zip file whose entry names are inspected
     * @param tis the underlying stream; not used by this detector
     * @return the WACZ type as soon as enough parts have been seen, otherwise whatever
     *         {@link #getMediaType(Counter)} yields after all entries were visited
     *         (possibly {@code null})
     * @throws IOException declared by the interface; not thrown by this implementation itself
     */
    @Override
    public MediaType detect(ZipFile zip, TikaInputStream tis) throws IOException {

        Enumeration<ZipArchiveEntry> entries = zip.getEntries();
        Counter counter = new Counter();
        while (entries.hasMoreElements()) {
            ZipArchiveEntry zae = entries.nextElement();
            updateCounter(zae, counter);
            // WACZ is the most specific answer this detector can give, so stop early.
            // A plain data package cannot be confirmed until every entry has been seen.
            if (WACZ.equals(getMediaType(counter))) {
                return WACZ;
            }
        }
        return getMediaType(counter);
    }

    /**
     * Records one entry during streaming detection. The running state is kept in
     * the supplied context under {@code Counter.class}.
     *
     * @param zae           the entry that was just encountered
     * @param zis           the entry's stream; not read by this detector
     * @param detectContext per-detection storage shared across calls
     * @return the WACZ type once the parts seen so far are sufficient, otherwise
     *         {@code null} to signal that no decision has been made yet
     */
    @Override
    public MediaType streamingDetectUpdate(ZipArchiveEntry zae, InputStream zis,
                                           StreamingDetectContext detectContext) {
        Counter counter = detectContext.get(Counter.class);
        if (counter == null) {
            counter = new Counter();
            detectContext.set(Counter.class, counter);
        }
        updateCounter(zae, counter);
        // Only WACZ may be reported mid-stream; DATA_PACKAGE could still turn into
        // WACZ when later entries arrive, so it is deferred to streamingDetectFinal.
        if (WACZ.equals(getMediaType(counter))) {
            return WACZ;
        }
        return null;
    }

    /**
     * Classifies a single entry by name and records the matching part, if any.
     * Entries that match none of the known names are ignored.
     */
    private void updateCounter(ZipArchiveEntry zae, Counter counter) {
        String name = zae.getName();
        if (name.startsWith("archive/")) {
            counter.update(PARTS.WACZ_ARCHIVE);
        } else if (name.startsWith("indexes/")) {
            counter.update(PARTS.WACZ_INDEXES);
        } else if (name.startsWith("pages/")) {
            counter.update(PARTS.WACZ_PAGES);
        } else if ("datapackage.json".equals(name)) {
            counter.update(PARTS.PACKAGE);
        }
    }

    /**
     * Maps the parts seen so far to a media type.
     *
     * @param counter the accumulated parts; may be {@code null}
     * @return {@code DATA_PACKAGE} when {@code datapackage.json} is the only recognised
     *         part; {@code WACZ} when it is joined by an {@code archive/} entry or by
     *         both {@code indexes/} and {@code pages/} entries; {@code null} otherwise
     *         (including when {@code counter} is {@code null})
     */
    MediaType getMediaType(Counter counter) {
        if (counter == null) {
            return null;
        }
        if (counter.parts.contains(PARTS.PACKAGE)) {
            if (counter.parts.size() == 1) {
                return DATA_PACKAGE;
            }
            // This is heuristic: all of the WACZ parts are probably required,
            // but it is unclear what will actually be seen in practice.
            if (counter.parts.contains(PARTS.WACZ_ARCHIVE)) {
                return WACZ;
            } else if (counter.parts.contains(PARTS.WACZ_INDEXES) &&
                    counter.parts.contains(PARTS.WACZ_PAGES)) {
                return WACZ;
            }
        }
        return null;
    }

    /**
     * Produces the final streaming verdict from the state stored in the context.
     *
     * @param detectContext per-detection storage populated by
     *                      {@link #streamingDetectUpdate}
     * @return the detected media type, or {@code null} if nothing was recognised
     *         or no entry was ever recorded
     */
    @Override
    public MediaType streamingDetectFinal(StreamingDetectContext detectContext) {
        return getMediaType(detectContext.get(Counter.class));
    }

    /** Accumulates the distinct {@link PARTS} encountered during one detection run. */
    private static class Counter {
        private final Set<PARTS> parts = new HashSet<>();

        /** Marks {@code val} as seen; repeated calls with the same value have no further effect. */
        void update(PARTS val) {
            parts.add(val);
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy