All downloads are free. The search and download functionality uses the official Maven repository.

net.sf.okapi.connectors.microsoft.MicrosoftMTConnector Maven / Gradle / Ivy

There is a newer version: 1.47.0
Show newest version
/*===========================================================================
  Copyright (C) 2010-2019 by the Okapi Framework contributors
-----------------------------------------------------------------------------
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
===========================================================================*/

package net.sf.okapi.connectors.microsoft;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;

import net.sf.okapi.common.IParameters;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.UsingParameters;
import net.sf.okapi.common.exceptions.OkapiException;
import net.sf.okapi.common.query.QueryResult;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.TextFragment;
import net.sf.okapi.lib.translation.BaseConnector;
import net.sf.okapi.lib.translation.ITMQuery;
import net.sf.okapi.lib.translation.QueryUtil;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@UsingParameters(Parameters.class)
public class MicrosoftMTConnector extends BaseConnector implements ITMQuery {

	private final Logger logger = LoggerFactory.getLogger(getClass());

	private static final int RETRIES = 5; // number of times to try to get a response from Microsoft before failing
	private static final int SLEEPPAUSE = 300; // DWH 5-3-2012 how long to wait before trying
	static final int CONNECTION_TIMEOUT = 20 * 1000;
	
	private QueryUtil util = new QueryUtil();
	private Parameters params = new Parameters();
	private int maximumHits = 1;
	private int threshold = -10; // it was returning 0 for good translations
	private Iterator results = Collections.emptyIterator();
	private MicrosoftMTAPI api;
	private CloseableHttpClient httpClient;

	public MicrosoftMTConnector () {
		// Use a 20s timeout; this is conservative (Microsoft recommends 13s).
		this(HttpClientBuilder.create()
				.setDefaultRequestConfig(RequestConfig.custom()
					.setConnectTimeout(CONNECTION_TIMEOUT)
					.setSocketTimeout(CONNECTION_TIMEOUT)
					.build())
				.build());
	}

	public MicrosoftMTConnector(CloseableHttpClient httpClient) {
		this.httpClient = httpClient;
	}
	
	@Override
	public void close () {
		// Nothing to do
	}

	@Override
	public String getName () {
		return "Microsoft-Translator";
	}

	@Override
	public String getSettingsDisplay () {
		return "Service: " + params.getBaseURL();
	}

	@Override
	public void open () {
		open(null);
	}

	// A MicrosoftMTAPI object is passed only from the unit tests. Normally, a null is passed.
	void open(MicrosoftMTAPI api) {
		this.results = Collections.emptyIterator();
		this.api = api != null ? api :
			new MicrosoftMTAPIv3Impl(getParameters(), httpClient);
	}

	protected  int _query(String queryText, T originalText, QueryResultBuilder resultBuilder) {
		open(api);
		if (queryText.trim().isEmpty()) return 0;
		List queryResults = new ArrayList<>();
		try {
			for (int tries = 0; tries < RETRIES; tries++) {
				List responses = api.getTranslations(queryText, srcCode, trgCode, maximumHits, threshold);
				if (responses != null) {
					queryResults = resultBuilder.convertResponses(responses, originalText);
					break;
				}
				// TODO handle this in the API?
				try {
					Thread.sleep(SLEEPPAUSE);
				} catch (InterruptedException e) { // the app closed
					throw new OkapiException("Interrupted while trying to contact Microsoft MT");
				}
			}
		}
		catch ( Throwable e) {
			throw new OkapiException("Error querying the MT server.\n" + e.getMessage(), e);
		}
		if (queryResults.size() > 0) {
		    results = queryResults.iterator();
		    return queryResults.size();
		}
	    throw new OkapiException("Could not retrieve results from Microsoft MT after " + RETRIES + " attempts.");
	}

	@Override
	public int query (String plainText) {
		return _query(plainText, plainText, new TextQueryResultBuilder(params, getWeight()));
	}

	@Override
	public int query (TextFragment frag) {
		return _query(util.toCodedHTML(frag), frag, new FragmentQueryResultBuilder(params, getWeight()));
	}

	// There is a limitation on the number of characters the API can handle in one prequest
	// and the number of the pieces of text it can take.
	// We have to collect the pieces of text to the limit and make an API call, 
	// and repeat the process until we process all the pieces.
	protected  List> _batchQuery(List texts, List originalText,
													  QueryResultBuilder qrBuilder) {
		open(api);
		int batchCharCount = 0;
		List> results = new ArrayList<>(texts.size());
		List textsToQuery = new ArrayList<>();
		List originalTextsToQuery = new ArrayList<>();
		for (int end = 0; end < texts.size(); end++) {
			int charCount = texts.get(end).trim().length();
			if (charCount >= api.maxCharCount()) {
				// Too big, skip this one
				logger.warn("Segment {} starting with '{}' is too long to query.", end,
							texts.get(end).substring(0, 20));
				continue;
			} else if (charCount < 1) {
				// MS Translator doesn't like trying to translate empty or whitespace only strings
				// Note: Not sure if this is correct for API v3. 
				logger.warn("Segment {} is empty or contains only whitespace.", end);
				continue;
			}
			boolean processBatch = false;
			if (batchCharCount + charCount >= api.maxCharCount()) {
				processBatch = true;
			}
			if (textsToQuery.size() >= api.maxTextCount()) {
				processBatch = true;
			}
			if (processBatch) {
				results.addAll(_subBatchQuery(textsToQuery, originalTextsToQuery, qrBuilder));
				batchCharCount = 0;
				textsToQuery.clear();
				originalTextsToQuery.clear();
				end--; // Do this one again
			}
			else {
				batchCharCount += charCount;
				textsToQuery.add(texts.get(end).trim());
				originalTextsToQuery.add(originalText.get(end));
			}
		}
		if (textsToQuery.size() > 0) {
			results.addAll(_subBatchQuery(textsToQuery, originalTextsToQuery, qrBuilder));
		}
		return results;
	}

	// Process a single batch that is small enough to be handled in a single API call.
	protected  List> _subBatchQuery(List texts, List originalText,
								QueryResultBuilder qrBuilder) {
		GetTranslationsArrayRequest request = new GetTranslationsArrayRequest(texts, srcCode, trgCode,
											maximumHits, params.getCategory());
		List> queryResults = new ArrayList<>();
		// XXX how does this handle skipped entries?
		for (int tries = 0; tries < RETRIES; tries++) {
			List> responses = api.getTranslationsArray(request, srcCode, trgCode,
												maximumHits, threshold);
			if (responses != null) {
				for (int i = 0; i < responses.size(); i++) {
					queryResults.add(qrBuilder.convertResponses(responses.get(i), originalText.get(i)));
				}
				break;
			}
			// TODO handle this in the API?
			try {
				Thread.sleep(SLEEPPAUSE);
			} catch (InterruptedException e) { // the app closed
				throw new OkapiException("Interrupted while trying to contact Microsoft MT");
			}
		}
		if (queryResults.size() > 0) {
		    return queryResults;
		}
		logger.error("Failed to batch translate with Microsoft Translator ({} tries). Skipping query for {}",
			     RETRIES, request.toString());
		throw new OkapiException("Could not retrieve results from Microsoft MT after " + RETRIES + " attempts.");
	}

	@Override
	public void leverage (ITextUnit tu) {
		leverageUsingBatchQuery(tu);
	}
	
	@Override	
	public void batchLeverage(List tuList) {
		batchLeverageUsingBatchQuery(tuList);
	}

	@Override
	public List> batchQueryText(List plainTexts) {
		return _batchQuery(plainTexts, plainTexts, new TextQueryResultBuilder(params, getWeight()));
	}

	@Override
	public List> batchQuery (List fragments) {
		return _batchQuery(util.toCodedHTML(fragments), fragments,
						   new FragmentQueryResultBuilder(params, getWeight()));
	}

	@Override
	protected String toInternalCode (LocaleId locale) {
		String code = locale.toBCP47().toLowerCase();
		if ( code.equals("zh-tw") || code.startsWith("zh-hant") || code.equals("zh-cht") || code.equals("zh-hk")) {
			code = "zh-Hant";
			// code = "zh-CHT"; // v2 API
		}
		else if ( code.startsWith("zh") ) { // zh-cn, zh-hans, zh-chs..
			code = "zh-Hans";
			// code = "zh-CHS";
		}
		else if ( code.startsWith("sr-cyrl") ) {
			// Preserve script for Serbian in Cyrillic
			// Latin script is the default
			code = "sr-Cyrl";
		}
		else if ( code.startsWith("pt") || code.equals("es-419") ) {
			; // Preserve region variants for some locales
		}
		else if (locale.getLanguage().equals("in")) {
			// Java uses 'in' to represent the Indonesian locale, while
			// Microsoft expects 'id'. ICU and Locale#toLanguageTag
			// both produce 'id', but our LocaleIds aren't always
			// coming from those sources.
			return "id";
		}
		else { // Use just the language otherwise
			code = locale.getLanguage(); 
		}
		return code;
	}

	@Override
	public Parameters getParameters () {
		return params;
	}

	@Override
	public void setParameters (IParameters params) {
		this.params = (Parameters)params;
	}

	@Override
	public boolean hasNext () {
		return results.hasNext();
	}

	@Override
	public QueryResult next () {
		return results.hasNext() ? results.next() : null;
	}

	@Override
	public int getMaximumHits () {
		return maximumHits;
	}

	@Override
	public void setMaximumHits (int maximumHits) {
		this.maximumHits = maximumHits;
	}

	@Override
	public int getThreshold () {
		return threshold;
	}

	@Override
	public void setThreshold (int threshold) {
		this.threshold = threshold;
		this.threshold = -10; // Microsoft is returning confidence of 0
	}

	@Override
	public void setLanguages (LocaleId sourceLocale,
		LocaleId targetLocale)
	{
		super.setLanguages(sourceLocale, targetLocale);
		// srcCode and trgCode are set properly in setLanguage()
		// they must not be reset here
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy