org.apache.tika.language.translate.CachedTranslator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tika-translate Show documentation
Show all versions of tika-translate Show documentation
This is the translate module of the Apache Tika™ toolkit. Translator implementations may depend on web services.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.language.translate;
import java.io.IOException;
import java.util.HashMap;
import org.apache.tika.exception.TikaException;
import org.apache.tika.language.detect.LanguageResult;
import com.fasterxml.jackson.databind.util.LRUMap;
/**
* CachedTranslator. Saves a map of previous translations in order to prevent repetitive translation requests.
*/
public class CachedTranslator extends AbstractTranslator {
private static final int INITIAL_ENTRIES = 100;
private static final int MAX_ENTRIES = 1000;
private Translator translator;
// The cache is a map from sourceLang:targetLang to an LRUMap of previously translated pairs.
// Old entries are removed from the cache when it reaches its limit.
// For example, {en:fr -> {hello -> salut}}.
private HashMap> cache;
/**
* Create a new CachedTranslator (must set the {@link Translator} with {@link #setTranslator(Translator)} before use!)
*/
public CachedTranslator(){
this(null);
}
/**
* Create a new CachedTranslator.
*
* @param translator The translator that should be used for the underlying translation service. The properties
* for that service must be set properly!
*/
public CachedTranslator(Translator translator) {
this.translator = translator;
this.cache = new HashMap<>();
}
/**
* @return the translator
*/
public Translator getTranslator() {
return translator;
}
/**
* @param translator the translator to set
*/
public void setTranslator(Translator translator) {
this.translator = translator;
}
@Override
public String translate(String text, String sourceLanguage, String targetLanguage) throws TikaException, IOException {
if (translator == null) {
return text;
}
LRUMap translationCache = getTranslationCache(sourceLanguage, targetLanguage);
String translatedText = translationCache.get(text);
if (translatedText == null) {
translatedText = translator.translate(text, sourceLanguage, targetLanguage);
translationCache.put(text, translatedText);
}
return translatedText;
}
@Override
public String translate(String text, String targetLanguage) throws TikaException, IOException {
LanguageResult language = detectLanguage(text);
String sourceLanguage = language.getLanguage();
return translate(text, sourceLanguage, targetLanguage);
}
@Override
public boolean isAvailable() {
return translator != null && translator.isAvailable();
}
/**
* Get the number of different source/target translation pairs this CachedTranslator
* currently has in its cache.
*
* @return Number of translation source/target pairs in this CachedTranslator's cache.
* @since Tika 1.6
*/
public int getNumTranslationPairs() {
return cache.size();
}
/**
* Get the number of different translations from the source language to the target language
* this CachedTranslator has in its cache.
*
* @param sourceLanguage The source language of translation.
* @param targetLanguage The target language of translation.
* @return The number of translations between source and target.
* @since Tika 1.6
*/
public int getNumTranslationsFor(String sourceLanguage, String targetLanguage) {
LRUMap translationCache = cache.get(buildCacheKeyString(sourceLanguage, targetLanguage));
if (translationCache == null) {
return 0;
} else {
return translationCache.size();
}
}
/**
* Check whether this CachedTranslator's cache contains a translation of the text from the
* source language to the target language.
*
* @param text What string to check for.
* @param sourceLanguage The source language of translation.
* @param targetLanguage The target language of translation.
* @return true if the cache contains a translation of the text, false otherwise.
*/
public boolean contains(String text, String sourceLanguage, String targetLanguage) {
LRUMap translationCache = getTranslationCache(sourceLanguage, targetLanguage);
return translationCache.get(text) != null;
}
/**
* Check whether this CachedTranslator's cache contains a translation of the text to the target language,
* attempting to auto-detect the source language.
*
* @param text What string to check for.
* @param targetLanguage The target language of translation.
* @return true if the cache contains a translation of the text, false otherwise.
*/
public boolean contains(String text, String targetLanguage) {
try {
LanguageResult language = detectLanguage(text);
String sourceLanguage = language.getLanguage();
return contains(text, sourceLanguage, targetLanguage);
} catch (IOException e) {
// TODO what to do if we get an error?
return false;
}
}
/**
* Build the String to be used as the key into this CachedTranslator's cache.
*
* @param sourceLanguage The source language of translation.
* @param targetLanguage The target language of translation.
* @return The string to be used as the key into this CachedTranslator's cache.
*/
private String buildCacheKeyString(String sourceLanguage, String targetLanguage) {
return sourceLanguage + ":" + targetLanguage;
}
/**
* Get the cache of translations from the given source language to target language.
*
* @param sourceLanguage The source language of translation.
* @param targetLanguage The target language of translation.
* @return The LRUMap representing the translation cache.
*/
private LRUMap getTranslationCache(String sourceLanguage, String targetLanguage) {
LRUMap translationCache = cache.get(buildCacheKeyString(sourceLanguage, targetLanguage));
if (translationCache == null) {
translationCache = new LRUMap<>(INITIAL_ENTRIES, MAX_ENTRIES);
cache.put(buildCacheKeyString(sourceLanguage, targetLanguage), translationCache);
}
return translationCache;
}
}