// opennlp.tools.models.simple.SimpleClassPathModelFinder Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.models.simple;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.net.JarURLConnection;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.List;
import java.util.Locale;
import java.util.jar.JarEntry;
import java.util.jar.JarFile;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import opennlp.tools.models.AbstractClassPathModelFinder;
import opennlp.tools.models.ClassPathModelFinder;
/**
 * Enables the detection of OpenNLP models in the classpath via JDK classes.
 * By default, this class will search for JAR files starting with "opennlp-models-*".
 * This wildcard pattern can be adjusted by using the alternative constructor of this class.
 * <p>
 * It is a rather simple implementation of scanning the classpath by trying to obtain {@link URL urls}
 * from the actual classpath via a chain of possible options. It might not work for every use case
 * since it relies on JDK internals only and doesn't account for classloader hierarchies or edge cases.
 * <p>
 * It will
 * <ol>
 * <li>Try to see if we have a {@link URLClassLoader} available in the current thread.</li>
 * <li>Try to obtain URLs from the built-in classloader via reflection
 * (requires {@code --add-opens java.base/jdk.internal.loader=ALL-UNNAMED} as JVM argument).</li>
 * <li>Try to use the classpath given by the {@code java.class.path} system property.</li>
 * </ol>
 * <p>
 * If you need a more sophisticated solution,
 * use {@link opennlp.tools.models.classgraph.ClassgraphModelFinder}.
 */
public class SimpleClassPathModelFinder extends AbstractClassPathModelFinder implements ClassPathModelFinder {
private static final Logger logger = LoggerFactory.getLogger(SimpleClassPathModelFinder.class);
private static final String FILE_PREFIX = "file";
private static final Pattern CLASSPATH_SEPARATOR_PATTERN_WINDOWS = Pattern.compile(";");
private static final Pattern CLASSPATH_SEPARATOR_PATTERN_UNIX = Pattern.compile(":");
// ; for Windows, : for Linux/OSX
/**
* By default, it scans for "opennlp-models-*.jar".
*/
public SimpleClassPathModelFinder() {
this(OPENNLP_MODEL_JAR_PREFIX);
}
/**
* @param modelJarPrefix The leafnames of the jars that should be canned (e.g. "opennlp.jar").
* May contain a wildcard glob ("opennlp-*.jar"). It must not be {@code null}.
*/
public SimpleClassPathModelFinder(String modelJarPrefix) {
super(modelJarPrefix);
}
/**
* @return always {@code NULL} as it is not needed for the simple case.
*/
@Override
protected Object getContext() {
return null; //not needed for the simple case. Just return NULL.
}
/**
* @param wildcardPattern the pattern. Must not be {@code null}.
* @param context an object holding context information.
* It is unused within this implementation.
* @return a list of matching classpath uris.
*/
@Override
protected List getMatchingURIs(String wildcardPattern, Object context) {
if (wildcardPattern == null) {
return Collections.emptyList();
}
final boolean isWindows = isWindows();
final List cp = getClassPathElements();
final List cpu = new ArrayList<>();
final Pattern jarPattern = Pattern.compile(asRegex("*" + getJarModelPrefix()));
final Pattern filePattern = Pattern.compile(asRegex("*" + wildcardPattern));
for (URL url : cp) {
if (matchesPattern(url, jarPattern)) {
try {
for (URI u : getURIsFromJar(url, isWindows)) {
if (matchesPattern(u.toURL(), filePattern)) {
cpu.add(u);
}
}
} catch (IOException e) {
logger.warn("Cannot read content of {}.", url, e);
}
}
}
return cpu;
}
/**
* Escapes a wildcard expressions for usage as a Java regular expression.
*
* @param wildcard must not be {@code null}.
* @return the escaped regex.
*/
private String asRegex(String wildcard) {
return wildcard
.replace(".", "\\.")
.replace("*", ".*")
.replace("?", ".");
}
private boolean matchesPattern(URL url, Pattern pattern) {
return pattern.matcher(url.getFile()).matches();
}
private List getURIsFromJar(URL fileUrl, boolean isWindows) throws IOException {
final List uris = new ArrayList<>();
final URL jarUrl = new URL(JAR + ":" +
(isWindows
? fileUrl.toString().replace("\\", "/")
: fileUrl.toString())
+ "!/");
final JarURLConnection jarConnection = (JarURLConnection) jarUrl.openConnection();
try (JarFile jarFile = jarConnection.getJarFile()) {
final Enumeration entries = jarFile.entries();
while (entries.hasMoreElements()) {
final JarEntry entry = entries.nextElement();
if (!entry.isDirectory()) {
final URL entryUrl = new URL(jarUrl + entry.getName());
try {
uris.add(entryUrl.toURI());
} catch (URISyntaxException ignored) {
//if we cannot convert to URI here, we ignore that entry.
}
}
}
}
return uris;
}
private boolean isWindows() {
return System.getProperty("os.name", "unknown").toLowerCase(Locale.ROOT).contains("win");
}
/**
* Try to obtain URLs from the classpath in the following order:
*
* (1) Try to see if we have a {@link URLClassLoader}.
* (2) Try to obtain URLs via the build in classloader via reflections (requires an add opens JVM argument)
* (3) Try to use the bootstrap classpath via {@code java.class.path}.
*
* @return a list of URLs within the classpath.
*/
private List getClassPathElements() {
final ClassLoader cl = Thread.currentThread().getContextClassLoader();
if (cl instanceof URLClassLoader ucl) {
return Arrays.asList(ucl.getURLs());
} else {
final URL[] fromUcp = getURLs(cl);
if (fromUcp != null && fromUcp.length > 0) {
return Arrays.asList(fromUcp);
} else {
return getClassPathUrlsFromSystemProperty();
}
}
}
private List getClassPathUrlsFromSystemProperty() {
final String cp = System.getProperty("java.class.path", "");
final String[] matches = isWindows()
? CLASSPATH_SEPARATOR_PATTERN_WINDOWS.split(cp)
: CLASSPATH_SEPARATOR_PATTERN_UNIX.split(cp);
final List jarUrls = new ArrayList<>();
for (String classPath: matches) {
try {
jarUrls.add(new URL(FILE_PREFIX, "", classPath));
} catch (MalformedURLException ignored) {
//if we cannot parse a URL from the system property, just ignore it...
//we couldn't load it anyway
}
}
return jarUrls;
}
/*
* Java 9+ Bridge to obtain URLs from classpath.
* This requires "--add-opens java.base/jdk.internal.loader=ALL-UNNAMED" as JVM argument
*/
private URL[] getURLs(ClassLoader classLoader) {
try {
final Class> builtinClazzLoader = Class.forName("jdk.internal.loader.BuiltinClassLoader");
final Field ucpField = builtinClazzLoader.getDeclaredField("ucp");
ucpField.setAccessible(true);
final Object ucpObject = ucpField.get(classLoader);
final Class> clazz = Class.forName("jdk.internal.loader.URLClassPath");
if (ucpObject != null) {
final Method getURLs = clazz.getMethod("getURLs");
return (URL[]) getURLs.invoke(ucpObject);
}
} catch (Exception ignored) {
//ok here because we still have a fallback and this is just one step in the chain of possible
//options to obtain URLs from the classpath
}
return new URL[0];
}
}