org.apache.tika.language.translate.JoshuaNetworkTranslator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tika-translate Show documentation
Show all versions of tika-translate Show documentation
This is the translate module of the Apache Tika™ toolkit. Translator implementations may depend on web services.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.language.translate;
import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.tika.exception.TikaException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.jaxrs.json.JacksonJsonProvider;
/**
* This translator is designed to work with a Joshua translation
* server that is reachable over TCP/IP, specifically the
* REST-based Joshua server.
*
* If you were to interact with the server via curl a request
* would look as follows
*
*
* <pre>{@code
* curl http://localhost:5000/joshua/translate/english \
* -i -H "Content-Type: application/json" \
* -X POST -d '{"inputLanguage": "Spanish", "inputText": "vuelo"}' -v
* }</pre>
*
*
* Joshua requires input to be pre-formatted into sentences, one per line,
* so this translation implementation takes care of that.
*/
public class JoshuaNetworkTranslator extends AbstractTranslator {
// Logger used by this class; the constructor reports properties-file read errors through it.
private static final Logger LOG = LoggerFactory.getLogger(JoshuaNetworkTranslator.class);
// Name of the properties resource the constructor looks up via Class.getResourceAsStream.
private static final String PROPERTIES_FILE = "translator.joshua.properties";
// Property key whose value is the base URL of the Joshua server.
private static final String JOSHUA_SERVER = "joshua.server.url";
// Value of the joshua.server.url property; remains null when the properties
// resource is missing or does not define the key.
private String networkServer;
// Request URI built in translate() by joining networkServer and the target
// language with exactly one "/" between them; reassigned on every call.
private String networkURI;
/**
* Default constructor which first checks for the presence of
* the translator.joshua.properties
file.
* We check if the remote server is available on each
* translation process. This check is not a remote call, but instead
* a check for null value within of a local variable represetning the
* value for joshua.server.url
, which should be populated
* within the translator.joshua.properties
file.
*/
public JoshuaNetworkTranslator() {
Properties props = new Properties();
InputStream stream;
stream = JoshuaNetworkTranslator.class.getResourceAsStream(PROPERTIES_FILE);
try {
if(stream != null) {
props.load(stream);
networkServer = props.getProperty(JOSHUA_SERVER);
}
} catch (IOException e) {
LOG.error("An error occured whilst reading translator.joshua.properties file", e);
}
}
/**
* First checks whether the source language has been provided.
* If no source language (or a null value) has been provided then
* we make an attempt to guess the source using Tika's
* {@link org.apache.tika.langdetect.OptimaizeLangDetector}. If we
* are still unable to guess the language then we return the source
* text.
*
* We then process the input text into a new string consisting of
* sentences, one per line e.g. insert \n between the presence of '.'
*
* @see org.apache.tika.language.translate.Translator#translate
* (java.lang.String, java.lang.String, java.lang.String)
*/
@Override
public String translate(String text, String sourceLanguage,
String targetLanguage) throws TikaException, IOException {
//create networkURI
if (!networkServer.endsWith("/")) {
networkURI = networkServer + "/" + targetLanguage;
} else {
networkURI = networkServer + targetLanguage;
}
if (!this.isAvailable())
return text;
//make an attempt to guess language if one is not provided.
if (sourceLanguage == null)
sourceLanguage = detectLanguage(text).getLanguage();
//process input text into sentences, one per line
// e.g. insert \n between the presence of '.'
StringBuilder sb = new StringBuilder(text);
int i = 0;
while ((i = sb.indexOf(".", i + 1)) != -1) {
sb.replace(i, i + 1, "\n");
}
String inputText = sb.toString();
WebClient client;
final List