All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.language.translate.JoshuaNetworkTranslator Maven / Gradle / Ivy

Go to download

This is the translate Apache Tika™ toolkit. Translator implementations may depend on web services.

There is a newer version: 1.0.18
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.language.translate;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;

import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.tika.exception.TikaException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.jaxrs.json.JacksonJsonProvider;

/**
 * 

This translator is designed to work with a TCP-IP available * Joshua translation server, specifically the * * REST-based Joshua server.

* *

If you were to interact with the server via curl a request * would look as follows

* *
 * {code
 * curl http://localhost:5000/joshua/translate/english \
 *   -i -H "Content-Type: application/json" \
 *   -X POST -d '{"inputLanguage": "Spanish", "inputText": "vuelo"}' -v
 * }
 * 
* * Joshua requires input to be pre-formatted into sentences, one per line, * so this translation implementation takes care of that. */ public class JoshuaNetworkTranslator extends AbstractTranslator { private static final Logger LOG = LoggerFactory.getLogger(JoshuaNetworkTranslator.class); private static final String PROPERTIES_FILE = "translator.joshua.properties"; private static final String JOSHUA_SERVER = "joshua.server.url"; private String networkServer; private String networkURI; /** * Default constructor which first checks for the presence of * the translator.joshua.properties file. * We check if the remote server is available on each * translation process. This check is not a remote call, but instead * a check for null value within of a local variable represetning the * value for joshua.server.url, which should be populated * within the translator.joshua.properties file. */ public JoshuaNetworkTranslator() { Properties props = new Properties(); InputStream stream; stream = JoshuaNetworkTranslator.class.getResourceAsStream(PROPERTIES_FILE); try { if(stream != null) { props.load(stream); networkServer = props.getProperty(JOSHUA_SERVER); } } catch (IOException e) { LOG.error("An error occured whilst reading translator.joshua.properties file", e); } } /** *

Initially then check if the source language has been provided. * If no source language (or a null value) has been provided then * we make an attempt to guess the source using Tika's * {@link org.apache.tika.langdetect.OptimaizeLangDetector}. If we * are still unable to guess the language then we return the source * text.

* *

We then process the input text into a new string consisting of * sentences, one per line e.g. insert \n between the presence of '.'

* * @see org.apache.tika.language.translate.Translator#translate * (java.lang.String, java.lang.String, java.lang.String) */ @Override public String translate(String text, String sourceLanguage, String targetLanguage) throws TikaException, IOException { //create networkURI if (!networkServer.endsWith("/")) { networkURI = networkServer + "/" + targetLanguage; } else { networkURI = networkServer + targetLanguage; } if (!this.isAvailable()) return text; //make an attempt to guess language if one is not provided. if (sourceLanguage == null) sourceLanguage = detectLanguage(text).getLanguage(); //process input text into sentences, one per line // e.g. insert \n between the presence of '.' StringBuilder sb = new StringBuilder(text); int i = 0; while ((i = sb.indexOf(".", i + 1)) != -1) { sb.replace(i, i + 1, "\n"); } String inputText = sb.toString(); WebClient client; final List providers = new ArrayList<>(); JacksonJsonProvider jacksonJsonProvider = new JacksonJsonProvider(); providers.add(jacksonJsonProvider); client = WebClient.create(networkURI, providers); ObjectMapper requestMapper = new ObjectMapper(); ObjectNode jsonNode = requestMapper.createObjectNode(); jsonNode.put("inputLanguage", sourceLanguage); jsonNode.put("inputText", inputText); //make the reuest Response response = client.accept(MediaType.APPLICATION_JSON).type(MediaType.APPLICATION_JSON).post(jsonNode); BufferedReader reader = new BufferedReader(new InputStreamReader( (InputStream) response.getEntity(), UTF_8)); String line; StringBuilder responseText = new StringBuilder(); while ((line = reader.readLine()) != null) { responseText.append(line); } try { ObjectMapper responseMapper = new ObjectMapper(); JsonNode jsonResp = responseMapper.readTree(responseText.toString()); if (jsonResp.findValuesAsText("outputText") != null) { return jsonResp.findValuesAsText("outputText").get(0); } else { throw new TikaException(jsonResp.findValue("message").get(0).asText()); } } catch (JsonParseException e) { throw new TikaException("Error requesting translation from '" + sourceLanguage + "' to '" + targetLanguage + "', JSON response " + "from Joshua REST Server is not well formatted: " + responseText.toString()); } } /** * Make an attempt to guess the source language via * {@link org.apache.tika.language.translate.AbstractTranslator#detectLanguage(String)} * before making the call to * {@link org.apache.tika.language.translate.JoshuaNetworkTranslator#translate(String, String, String)} * @see org.apache.tika.language.translate.Translator#translate(java.lang.String, java.lang.String) */ @Override public String translate(String text, String targetLanguage) throws TikaException, IOException { if (isAvailable()) return text; String sourceLanguage = detectLanguage(text).getLanguage(); return translate(text, sourceLanguage, targetLanguage); } /** * @see org.apache.tika.language.translate.Translator#isAvailable() */ @Override public boolean isAvailable() { if (this.networkServer!=null) { URL url = null; try { url = new URL(networkURI); } catch (MalformedURLException mue) { LOG.error("Error reading {} property from {}. {}", JOSHUA_SERVER, PROPERTIES_FILE, mue); } HttpURLConnection connection = null; try { if (url!=null) { connection = (HttpURLConnection) url.openConnection(); connection.setRequestProperty("Connection", "close"); connection.setConnectTimeout(2000); // Timeout 2 seconds connection.connect(); return tryResponseCode(connection); } } catch (IOException ioe) { LOG.error("Error whilst checking availability of {}. {}", JOSHUA_SERVER, ioe); } } return false; } private boolean tryResponseCode(HttpURLConnection connection) { // If the web service is available try { if (connection.getResponseCode() == 200) return true; } catch (IOException ioe) { LOG.error("Error retreiving response code from Joshua Network Translator. {}", ioe); } return false; } }