org.apache.tika.mime.ProbabilisticMimeDetectionSelector Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aem-sdk-api Show documentation
Show all versions of aem-sdk-api Show documentation
The Adobe Experience Manager SDK
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.mime;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import org.apache.tika.detect.Detector;
import org.apache.tika.metadata.Metadata;
/**
 * Selector for combining different mime detection results based on
 * probability.
 *
 * <p>Three sources of evidence are consulted in {@link #detect}: the magic
 * bytes at the start of the stream, the resource name (extension) hint and the
 * content-type metadata hint. For every type suggested by the magic bytes a
 * posterior probability is computed for the magic, metadata and extension
 * candidates; the candidate with the highest posterior wins. If that posterior
 * is below the configured threshold, {@code application/octet-stream} is
 * returned instead.
 */
public class ProbabilisticMimeDetectionSelector implements Detector {
    private static final long serialVersionUID = 224589862960269260L;

    /** probability parameters default value */
    private static final float DEFAULT_MAGIC_TRUST = 0.9f;
    private static final float DEFAULT_META_TRUST = 0.8f;
    private static final float DEFAULT_EXTENSION_TRUST = 0.8f;

    private final MimeTypes mimeTypes;
    private final MediaType rootMediaType;

    /* prior probability of the type identified by each method */
    private float priorMagicFileType, priorExtensionFileType,
            priorMetaFileType;
    /* conditional probability of each method being right (positive condition) */
    private float magic_trust, extension_trust, meta_trust;
    /* conditional probability of each method under the negative condition */
    private float magic_neg, extension_neg, meta_neg;

    /*
     * any posterior probability lower than the threshold will be considered
     * as an octet-stream type; the default value is 0.5001
     */
    private float threshold;

    /*
     * this change rate is used when there are multiple types predicted by
     * magic-bytes. the first predicted type has the highest probability, and
     * the probability for the next type predicted by magic-bytes will decay
     * with this change rate. The idea is to have the first one to take
     * precedence among the multiple possible types predicted by MAGIC-bytes.
     */
    private float changeRate;

    /**
     * Creates a selector backed by the default {@link MimeTypes} registry and
     * the default probability parameters.
     */
    public ProbabilisticMimeDetectionSelector() {
        this(MimeTypes.getDefaultMimeTypes(), null);
    }

    /**
     * Creates a selector backed by the default {@link MimeTypes} registry.
     *
     * @param builder probability parameters; may be {@code null} to keep the
     *                defaults
     */
    public ProbabilisticMimeDetectionSelector(final Builder builder) {
        this(MimeTypes.getDefaultMimeTypes(), builder);
    }

    /**
     * Creates a selector with the default probability parameters.
     *
     * @param mimeTypes registry used for magic, name and type-name lookups
     */
    public ProbabilisticMimeDetectionSelector(final MimeTypes mimeTypes) {
        this(mimeTypes, null);
    }

    /**
     * Creates a selector.
     *
     * @param mimeTypes registry used for magic, name and type-name lookups
     * @param builder   probability parameters; may be {@code null}. Any
     *                  builder value equal to {@code 0} is treated as "not
     *                  set" and the corresponding default is kept.
     */
    public ProbabilisticMimeDetectionSelector(final MimeTypes mimeTypes,
            final Builder builder) {
        this.mimeTypes = mimeTypes;
        rootMediaType = MediaType.OCTET_STREAM;
        this.initializeDefaultProbabilityParameters();
        this.changeRate = 0.1f;
        if (builder != null) {
            priorMagicFileType = orDefault(builder.priorMagicFileType, priorMagicFileType);
            priorExtensionFileType =
                    orDefault(builder.priorExtensionFileType, priorExtensionFileType);
            priorMetaFileType = orDefault(builder.priorMetaFileType, priorMetaFileType);

            // Each parameter is overridden from its own builder field.
            magic_trust = orDefault(builder.magic_trust, magic_trust);
            extension_trust = orDefault(builder.extension_trust, extension_trust);
            meta_trust = orDefault(builder.meta_trust, meta_trust);

            magic_neg = orDefault(builder.magic_neg, magic_neg);
            extension_neg = orDefault(builder.extension_neg, extension_neg);
            meta_neg = orDefault(builder.meta_neg, meta_neg);

            threshold = orDefault(builder.threshold, threshold);
        }
    }

    /**
     * Returns {@code configured} unless it is {@code 0}, which marks a builder
     * value that was never set; in that case {@code fallback} is returned.
     */
    private static float orDefault(final float configured, final float fallback) {
        return configured == 0f ? fallback : configured;
    }

    /**
     * Initialize probability parameters with default values.
     */
    private void initializeDefaultProbabilityParameters() {
        priorMagicFileType = 0.5f;
        priorExtensionFileType = 0.5f;
        priorMetaFileType = 0.5f;
        magic_trust = DEFAULT_MAGIC_TRUST;
        extension_trust = DEFAULT_EXTENSION_TRUST;
        meta_trust = DEFAULT_META_TRUST;

        // probability of the type detected by magic test given that the type is
        // not the detected type. The default is taken by 1 - the magic trust
        magic_neg = 1 - DEFAULT_MAGIC_TRUST;
        // probability of the type detected by extension test given that the
        // type is not the type detected by extension test
        extension_neg = 1 - DEFAULT_EXTENSION_TRUST;
        // same as above; but it could be customized to suffice different use.
        meta_neg = 1 - DEFAULT_META_TRUST;
        threshold = 0.5001f;
    }

    /**
     * Detects the media type of the given document.
     *
     * <p>Candidates are gathered from the magic header of {@code input} (if a
     * stream is given), from the {@link Metadata#RESOURCE_NAME_KEY} entry and
     * from the {@link Metadata#CONTENT_TYPE} entry, then combined
     * probabilistically.
     *
     * @param input    document stream, or {@code null}; the stream is marked
     *                 before and reset after the magic header is read
     * @param metadata metadata carrying the optional resource name and
     *                 content type hints
     * @return the selected media type, or {@code application/octet-stream}
     *         when no candidate reaches the threshold
     * @throws IOException if the stream cannot be read or reset
     */
    public MediaType detect(InputStream input, Metadata metadata)
            throws IOException {
        List<MimeType> possibleTypes = new ArrayList<>();

        // Get type based on magic prefix
        if (input != null) {
            input.mark(mimeTypes.getMinLength());
            try {
                byte[] prefix = mimeTypes.readMagicHeader(input);
                // defensive copy: the list is modified during selection
                possibleTypes.addAll(mimeTypes.getMimeType(prefix));
            } finally {
                input.reset();
            }
        }

        // Get type based on resourceName hint (if available)
        MimeType extHint = null;
        String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (resourceName != null) {
            String name = null;

            // Deal with a URI or a path name in as the resource name
            try {
                URI uri = new URI(resourceName);
                String path = uri.getPath();
                if (path != null) {
                    int slash = path.lastIndexOf('/');
                    if (slash + 1 < path.length()) {
                        name = path.substring(slash + 1);
                    }
                }
            } catch (URISyntaxException e) {
                // Not a valid URI: use the raw resource name as the file name
                name = resourceName;
            }

            if (name != null) {
                extHint = mimeTypes.getMimeType(name);
            }
        }

        // Get type based on metadata hint (if available)
        MimeType metaHint = null;
        String typeName = metadata.get(Metadata.CONTENT_TYPE);
        if (typeName != null) {
            try {
                metaHint = mimeTypes.forName(typeName);
            } catch (MimeTypeException e) {
                // Malformed type name, ignore
            }
        }

        // Combine the three sources of evidence.
        return applyProbilities(possibleTypes, extHint, metaHint);
    }

    /**
     * Chooses the most probable media type among the magic, metadata and
     * extension candidates.
     *
     * @param possibleTypes    types predicted by the magic bytes, most likely
     *                         first; entries may be replaced in place by the
     *                         extension or metadata type when that type equals
     *                         or specialises the magic type
     * @param extMimeType      type predicted from the resource name, or
     *                         {@code null}
     * @param metadataMimeType type predicted from the content-type metadata,
     *                         or {@code null}
     * @return the candidate with the highest posterior, or the root
     *         (octet-stream) type when that posterior is below the threshold
     *         or no magic candidate exists
     */
    private MediaType applyProbilities(final List<MimeType> possibleTypes,
            final MimeType extMimeType, final MimeType metadataMimeType) {

        /* initialize some probability variables */
        MediaType extensionMediaType = extMimeType == null ? null : extMimeType.getType();
        MediaType metaMediaType = metadataMimeType == null ? null : metadataMimeType.getType();

        float magTrust = magic_trust;
        float magNeg = magic_neg;
        float extTrust = extension_trust;
        float extNeg = extension_neg;
        float metTrust = meta_trust;
        float metNeg = meta_neg;

        /* pre-process some probability variables */
        if (extensionMediaType == null || extensionMediaType.compareTo(rootMediaType) == 0) {
            /*
             * this is a root type, that means the extension method fails to
             * identify any type. A trust of exactly 1 marks the source as
             * carrying no evidence.
             */
            extTrust = 1;
            extNeg = 1;
        }
        if (metaMediaType == null || metaMediaType.compareTo(rootMediaType) == 0) {
            metTrust = 1;
            metNeg = 1;
        }

        float maxProb = -1f;
        MediaType bestEstimate = rootMediaType;

        if (possibleTypes != null && !possibleTypes.isEmpty()) {
            final MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
            final int n = possibleTypes.size();

            for (int i = 0; i < n; i++) {
                MediaType magicType = possibleTypes.get(i).getType();
                if (magicType != null && magicType.equals(rootMediaType)) {
                    magTrust = 1;
                    magNeg = 1;
                } else {
                    // check if each identified type belongs to the same class;
                    if (extensionMediaType != null) {
                        if (extensionMediaType.equals(magicType)
                                || registry.isSpecializationOf(
                                        extensionMediaType, magicType)) {
                            // Use just this type
                            possibleTypes.set(i, extMimeType);
                        } else if (registry.isSpecializationOf(magicType,
                                extensionMediaType)) {
                            extensionMediaType = magicType;
                        }
                    }
                    if (metaMediaType != null) {
                        if (metaMediaType.equals(magicType)
                                || registry.isSpecializationOf(metaMediaType,
                                        magicType)) {
                            // Use just this type
                            possibleTypes.set(i, metadataMimeType);
                        } else if (registry.isSpecializationOf(magicType,
                                metaMediaType)) {
                            metaMediaType = magicType;
                        }
                    }
                }

                /*
                 * prepare the conditional probability for file type
                 * prediction. results[k] stays 0 while a posterior still has
                 * to be computed; 0.1f is stored when the source is unusable.
                 */
                float[] results = new float[3];

                // re-read: the entry may have been replaced above
                magicType = possibleTypes.get(i).getType();

                if (i > 0) {
                    /*
                     * decay as our trust goes down with next type predicted
                     * by magic
                     */
                    magTrust = magTrust * (1 - changeRate);
                    /*
                     * grow as our trust goes down
                     */
                    magNeg = magNeg * (1 + changeRate);
                }

                /* evidence for the hypothesis "the magic type is correct" */
                float[] trust1 = new float[3];
                float[] negtrust1 = new float[3];
                if (magicType != null && magTrust != 1) {
                    trust1[0] = magTrust;
                    negtrust1[0] = magNeg;
                    if (metaMediaType != null && metTrust != 1) {
                        if (magicType.equals(metaMediaType)) {
                            trust1[1] = metTrust;
                            negtrust1[1] = metNeg;
                        } else {
                            trust1[1] = 1 - metTrust;
                            negtrust1[1] = 1 - metNeg;
                        }
                    } else {
                        trust1[1] = 1;
                        negtrust1[1] = 1;
                    }
                    if (extensionMediaType != null && extTrust != 1) {
                        if (magicType.equals(extensionMediaType)) {
                            trust1[2] = extTrust;
                            negtrust1[2] = extNeg;
                        } else {
                            trust1[2] = 1 - extTrust;
                            negtrust1[2] = 1 - extNeg;
                        }
                    } else {
                        trust1[2] = 1;
                        negtrust1[2] = 1;
                    }
                } else {
                    results[0] = 0.1f;
                }

                /* evidence for the hypothesis "the metadata type is correct" */
                float[] trust2 = new float[3];
                float[] negtrust2 = new float[3];
                if (metadataMimeType != null && metTrust != 1) {
                    trust2[1] = metTrust;
                    negtrust2[1] = metNeg;
                    if (magicType != null && magTrust != 1) {
                        if (metaMediaType.equals(magicType)) {
                            trust2[0] = magTrust;
                            negtrust2[0] = magNeg;
                        } else {
                            trust2[0] = 1 - magTrust;
                            negtrust2[0] = 1 - magNeg;
                        }
                    } else {
                        trust2[0] = 1f;
                        negtrust2[0] = 1f;
                    }
                    if (extensionMediaType != null && extTrust != 1) {
                        if (metaMediaType.equals(extensionMediaType)) {
                            trust2[2] = extTrust;
                            negtrust2[2] = extNeg;
                        } else {
                            trust2[2] = 1 - extTrust;
                            negtrust2[2] = 1 - extNeg;
                        }
                    } else {
                        trust2[2] = 1f;
                        negtrust2[2] = 1f;
                    }
                } else {
                    results[1] = 0.1f;
                }

                /* evidence for the hypothesis "the extension type is correct" */
                float[] trust3 = new float[3];
                float[] negtrust3 = new float[3];
                if (extensionMediaType != null && extTrust != 1) {
                    trust3[2] = extTrust;
                    negtrust3[2] = extNeg;
                    if (magicType != null && magTrust != 1) {
                        if (magicType.equals(extensionMediaType)) {
                            trust3[0] = magTrust;
                            negtrust3[0] = magNeg;
                        } else {
                            trust3[0] = 1 - magTrust;
                            negtrust3[0] = 1 - magNeg;
                        }
                    } else {
                        trust3[0] = 1f;
                        negtrust3[0] = 1f;
                    }
                    if (metaMediaType != null && metTrust != 1) {
                        if (metaMediaType.equals(extensionMediaType)) {
                            trust3[1] = metTrust;
                            negtrust3[1] = metNeg;
                        } else {
                            trust3[1] = 1 - metTrust;
                            negtrust3[1] = 1 - metNeg;
                        }
                    } else {
                        trust3[1] = 1f;
                        negtrust3[1] = 1f;
                    }
                } else {
                    results[2] = 0.1f;
                }

                /*
                 * compute the posterior probability for each predicted file
                 * type and keep the best candidate seen so far.
                 */
                if (results[0] == 0) {
                    results[0] = posterior(priorMagicFileType, trust1, negtrust1);
                }
                if (maxProb < results[0]) {
                    maxProb = results[0];
                    bestEstimate = magicType;
                }

                if (results[1] == 0) {
                    results[1] = posterior(priorMetaFileType, trust2, negtrust2);
                }
                if (maxProb < results[1]) {
                    maxProb = results[1];
                    bestEstimate = metaMediaType;
                }

                if (results[2] == 0) {
                    results[2] = posterior(priorExtensionFileType, trust3, negtrust3);
                }
                if (maxProb < results[2]) {
                    maxProb = results[2];
                    bestEstimate = extensionMediaType;
                }
            }
        }
        return maxProb < threshold ? this.rootMediaType : bestEstimate;
    }

    /**
     * Computes {@code p / (p + q)} where {@code p} is the prior multiplied by
     * every entry of {@code trust} and {@code q} is {@code 1 - prior}
     * multiplied by the entries of {@code negTrust} whose matching trust is
     * not exactly 1 (a trust of 1 marks a source without evidence).
     *
     * @param prior    prior probability of the hypothesis
     * @param trust    per-source probabilities under the hypothesis
     * @param negTrust per-source probabilities under the negated hypothesis;
     *                 same length as {@code trust}
     * @return the posterior probability of the hypothesis
     */
    private static float posterior(final float prior, final float[] trust,
            final float[] negTrust) {
        float numerator = prior;
        float complement = 1 - prior;
        for (int j = 0; j < trust.length; j++) {
            numerator *= trust[j];
            if (trust[j] != 1) {
                complement *= negTrust[j];
            }
        }
        return numerator / (numerator + complement);
    }

    /**
     * @return the media type registry of the underlying {@link MimeTypes}
     */
    public MediaTypeRegistry getMediaTypeRegistry() {
        return this.mimeTypes.getMediaTypeRegistry();
    }

    /**
     * Builder for the probability parameters of a
     * {@link ProbabilisticMimeDetectionSelector}.
     *
     * <p>Every parameter starts at {@code 0}, which the selector's constructor
     * interprets as "not set, keep the default"; consequently a value of
     * exactly {@code 0} cannot be configured through this builder.
     */
    public static class Builder {
        /*
         * the following are the prior probabilities for the file type
         * identified by each method.
         */
        private float priorMagicFileType, priorExtensionFileType,
                priorMetaFileType;
        /*
         * the following are the conditional probability for each method with
         * positive conditions
         */
        private float magic_trust, extension_trust, meta_trust;
        /*
         * the following *_neg are the conditional probabilities with negative
         * conditions
         */
        private float magic_neg, extension_neg, meta_neg;

        /* posteriors below this value fall back to the root type */
        private float threshold;

        /** Sets the prior probability of the type predicted by magic bytes. */
        public synchronized Builder priorMagicFileType(final float prior) {
            this.priorMagicFileType = prior;
            return this;
        }

        /** Sets the prior probability of the type predicted by the extension. */
        public synchronized Builder priorExtensionFileType(final float prior) {
            this.priorExtensionFileType = prior;
            return this;
        }

        /** Sets the prior probability of the type predicted by the metadata. */
        public synchronized Builder priorMetaFileType(final float prior) {
            this.priorMetaFileType = prior;
            return this;
        }

        /** Sets the positive conditional probability of the magic test. */
        public synchronized Builder magic_trust(final float trust) {
            this.magic_trust = trust;
            return this;
        }

        /** Sets the positive conditional probability of the extension test. */
        public synchronized Builder extension_trust(final float trust) {
            this.extension_trust = trust;
            return this;
        }

        /** Sets the positive conditional probability of the metadata test. */
        public synchronized Builder meta_trust(final float trust) {
            this.meta_trust = trust;
            return this;
        }

        /** Sets the negative conditional probability of the magic test. */
        public synchronized Builder magic_neg(final float trust) {
            this.magic_neg = trust;
            return this;
        }

        /** Sets the negative conditional probability of the extension test. */
        public synchronized Builder extension_neg(final float trust) {
            this.extension_neg = trust;
            return this;
        }

        /** Sets the negative conditional probability of the metadata test. */
        public synchronized Builder meta_neg(final float trust) {
            this.meta_neg = trust;
            return this;
        }

        /** Sets the minimum posterior a candidate needs to be returned. */
        public synchronized Builder threshold(final float threshold) {
            this.threshold = threshold;
            return this;
        }

        /**
         * Creates a selector configured from this builder, backed by the
         * default {@link MimeTypes} registry.
         */
        public ProbabilisticMimeDetectionSelector build2() {
            return new ProbabilisticMimeDetectionSelector(this);
        }
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy