// ![JAR search and dependency download from the Maven repository](/logo.png)
// org.apache.tika.detect.NameDetector Maven / Gradle / Ivy
// Show all versions of aem-sdk-api Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.detect;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import static java.nio.charset.StandardCharsets.UTF_8;
/**
* Content type detection based on the resource name. An instance of this
* class contains a set of regular expression patterns that are matched
* against the resource name potentially given as a part of the input metadata.
*
* If a pattern matches the given name, then the media type associated with
* that pattern is returned as the likely content type of the input document.
* Otherwise the returned type is <code>application/octet-stream</code>.
*
* See the {@link #detect(InputStream, Metadata)} method for more details
* of the matching algorithm.
*
* @since Apache Tika 0.3
*/
public class NameDetector implements Detector {

    /**
     * The regular expression patterns used for type detection, each mapped
     * to the media type that is reported when the pattern matches.
     */
    private final Map<Pattern, MediaType> patterns;

    /**
     * Creates a new content type detector based on the given name patterns.
     * The given pattern map is not copied, so the caller may update the
     * mappings even after this detector instance has been created. However,
     * the map must not be concurrently modified while this instance
     * is used for type detection.
     *
     * @param patterns map from name patterns to corresponding media types
     */
    public NameDetector(Map<Pattern, MediaType> patterns) {
        this.patterns = patterns;
    }

    /**
     * Detects the content type of an input document based on the document
     * name given in the input metadata. The RESOURCE_NAME_KEY attribute of
     * the given input metadata is expected to contain the name (normally
     * a file name or a URL) of the input document.
     * <p>
     * If a resource name is given, then it is first processed as follows.
     * <ol>
     *   <li>
     *     A potential URL query (everything from the first <code>?</code>)
     *     is removed from the end of the resource name.
     *   </li>
     *   <li>
     *     Potential leading path elements (up to the last slash, and then
     *     up to the last backslash) are removed from the beginning of the
     *     resource name.
     *   </li>
     *   <li>
     *     A potential fragment identifier (everything from the last
     *     <code>#</code>) is removed, but only when the name contains no
     *     dot or the <code>#</code> comes after the first dot.
     *   </li>
     *   <li>
     *     If the name contains a <code>%</code>, potential URL encodings
     *     (%nn, in UTF-8) are decoded. A name whose escapes are malformed
     *     is left undecoded.
     *   </li>
     *   <li>
     *     Any leading and trailing whitespace is removed.
     *   </li>
     * </ol>
     * <p>
     * The resulting name string (if not empty) is then matched in sequence
     * against all the configured name patterns, in the iteration order of
     * the pattern map. A pattern must match the entire name. If a match is
     * found, then the (first) matching media type is returned.
     *
     * @param input ignored
     * @param metadata input metadata, possibly with a RESOURCE_NAME_KEY value
     * @return detected media type, or <code>application/octet-stream</code>
     *         when no name is available or no pattern matches
     */
    public MediaType detect(InputStream input, Metadata metadata) {
        // Look for a resource name in the input metadata
        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (name != null) {
            // If the name is a URL, skip the trailing query
            int question = name.indexOf('?');
            if (question != -1) {
                name = name.substring(0, question);
            }

            // If the name is a URL or a path, skip all but the last component
            int slash = name.lastIndexOf('/');
            if (slash != -1) {
                name = name.substring(slash + 1);
            }
            int backslash = name.lastIndexOf('\\');
            if (backslash != -1) {
                name = name.substring(backslash + 1);
            }

            // Strip any fragments from the end, but only ones after the
            // extension, so that a '#' inside the base name is preserved
            int hash = name.lastIndexOf('#');
            int dot = name.indexOf('.');
            if (hash != -1 && (dot == -1 || hash > dot)) {
                name = name.substring(0, hash);
            }

            // Decode any potential URL encoding
            if (name.indexOf('%') != -1) {
                try {
                    name = URLDecoder.decode(name, UTF_8.name());
                } catch (UnsupportedEncodingException e) {
                    throw new IllegalStateException("UTF-8 not supported", e);
                } catch (IllegalArgumentException ignored) {
                    // The '%' was not part of a valid escape sequence (for
                    // example a plain file name such as "100%.txt"), so the
                    // name is not URL-encoded; keep it as it is rather than
                    // failing the whole detection.
                }
            }

            // Skip any leading or trailing whitespace
            name = name.trim();
            if (name.length() > 0) {
                // Match the name against the registered patterns
                for (Map.Entry<Pattern, MediaType> entry : patterns.entrySet()) {
                    if (entry.getKey().matcher(name).matches()) {
                        return entry.getValue();
                    }
                }
            }
        }

        return MediaType.OCTET_STREAM;
    }

}