All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.langdetect.OptimaizeLangDetector Maven / Gradle / Ivy

There is a newer version: 1.0.18
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect;

import java.io.CharArrayWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.tika.language.detect.LanguageConfidence;
import org.apache.tika.language.detect.LanguageDetector;
import org.apache.tika.language.detect.LanguageNames;
import org.apache.tika.language.detect.LanguageResult;

import com.optimaize.langdetect.DetectedLanguage;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.BuiltInLanguages;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;

/**
 * Implementation of the LanguageDetector API that uses
 * https://github.com/optimaize/language-detector
 */
public class OptimaizeLangDetector extends LanguageDetector {

	// Upper bounds on how much text is buffered for detection; text beyond
	// this limit is ignored (see addText/hasEnoughText).
	private static final int MAX_CHARS_FOR_DETECTION = 20000;
	private static final int MAX_CHARS_FOR_SHORT_DETECTION = 200;

	// The underlying Optimaize detector; null until loadModels() is called.
	private com.optimaize.langdetect.LanguageDetector detector;
	// Accumulates the text handed to addText() until detection is run.
	private CharArrayWriter writer;
	// Normalized names of the languages the loaded models can detect.
	private Set<String> languages;
	// Optional prior probabilities per language name, set via setPriors().
	private Map<String, Float> languageProbabilities;

	public OptimaizeLangDetector() {
		super();

		writer = new CharArrayWriter(MAX_CHARS_FOR_DETECTION);
	}

	/**
	 * Loads every built-in language profile shipped with the
	 * "language-detector" library.
	 *
	 * @return this detector, to allow call chaining
	 * @throws IOException if a built-in profile cannot be read
	 */
	@Override
	public LanguageDetector loadModels() throws IOException {
		List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn();

		// FUTURE when the "language-detector" project supports short profiles, check if
		// isShortText() returns true and switch to those.

		languages = new HashSet<>();
		for (LanguageProfile profile : languageProfiles) {
			languages.add(makeLanguageName(profile.getLocale()));
		}

		detector = createDetector(languageProfiles);

		return this;
	}

	// Converts an Optimaize locale into Tika's normalized language name
	// (language + optional script + optional region).
	private String makeLanguageName(LdLocale locale) {
		return LanguageNames.makeName(locale.getLanguage(), locale.getScript().orNull(), locale.getRegion().orNull());
	}

	/**
	 * Loads models for the requested subset of languages only.
	 *
	 * @param languages language names, normalized via {@link LanguageNames#normalizeName(String)}
	 * @return this detector, to allow call chaining
	 * @throws IOException if a built-in profile cannot be read
	 */
	@Override
	public LanguageDetector loadModels(Set<String> languages) throws IOException {

		// Normalize languages.
		this.languages = new HashSet<>(languages.size());
		for (String language : languages) {
			this.languages.add(LanguageNames.normalizeName(language));
		}

		// TODO what happens if you request a language that has no profile?
		// Currently such a language is silently skipped below.
		Set<LdLocale> locales = new HashSet<>();
		for (LdLocale locale : BuiltInLanguages.getLanguages()) {
			String languageName = makeLanguageName(locale);
			if (this.languages.contains(languageName)) {
				locales.add(locale);
			}
		}

		detector = createDetector(new LanguageProfileReader().readBuiltIn(locales));

		return this;
	}

	// Builds the Optimaize detector from the given profiles, applying any
	// priors previously registered via setPriors().
	private com.optimaize.langdetect.LanguageDetector createDetector(List<LanguageProfile> languageProfiles) {
		// FUTURE currently the short text algorithm doesn't normalize probabilities until the end, which
		// means you can often get 0 probabilities. So we pick a very short length for this limit.
		LanguageDetectorBuilder builder = LanguageDetectorBuilder.create(NgramExtractors.standard())
				.shortTextAlgorithm(30)
		        .withProfiles(languageProfiles);

		if (languageProbabilities != null) {
			Map<LdLocale, Double> languageWeights = new HashMap<>(languageProbabilities.size());
			// Iterate entries (not keySet + get) to avoid a second lookup per language.
			for (Map.Entry<String, Float> entry : languageProbabilities.entrySet()) {
				languageWeights.put(LdLocale.fromString(entry.getKey()), entry.getValue().doubleValue());
			}

			builder.languagePriorities(languageWeights);
		}

		return builder.build();
	}

	/**
	 * Returns whether a model for the given (normalized) language name has
	 * been loaded.
	 *
	 * <p>NOTE(review): throws NullPointerException if called before one of
	 * the loadModels() variants — confirm whether callers guarantee that.
	 */
	@Override
	public boolean hasModel(String language) {
		return languages.contains(language);
	}

	/**
	 * Registers prior language probabilities and (re)loads models for
	 * exactly those languages.
	 *
	 * @param languageProbabilities map from language name to prior probability
	 * @return this detector, to allow call chaining
	 * @throws IOException if a built-in profile cannot be read
	 */
	@Override
	public LanguageDetector setPriors(Map<String, Float> languageProbabilities) throws IOException {
		this.languageProbabilities = languageProbabilities;

		loadModels(languageProbabilities.keySet());

		return this;
	}

	@Override
	public void reset() {
		writer.reset();
	}

	@Override
	public void addText(char[] cbuf, int off, int len) {
		if (hasEnoughText()) {
			return; // do nothing if we've already got enough text.
		}

		writer.write(cbuf, off, len);

		// FUTURE - use support to get padding char from NGramExtractors.standard().
		// We'd like to get the textPadding character from the NGramExtractor, but
		// that's not exposed. NGramExtractors.standard() returns extractor with ' '
		// as padding, so that's what we'll use here.
		writer.write(' ');
	}

	/**
	 * Runs detection on the buffered text and returns all candidate
	 * languages, best first. Returns a single {@link LanguageResult#NULL}
	 * entry when nothing could be detected.
	 */
	@Override
	public List<LanguageResult> detectAll() {
		// TODO throw exception if models haven't been loaded, or auto-load all?

		List<LanguageResult> result = new ArrayList<>();

		List<DetectedLanguage> rawResults = detector.getProbabilities(writer.toString());
		for (DetectedLanguage rawResult : rawResults) {
			// TODO figure out right level for confidence brackets.
			LanguageConfidence confidence = rawResult.getProbability() > 0.9 ? LanguageConfidence.HIGH : LanguageConfidence.MEDIUM;
			result.add(new LanguageResult(makeLanguageName(rawResult.getLocale()), confidence, (float)rawResult.getProbability()));
		}

		if (result.isEmpty()) {
			result.add(LanguageResult.NULL);
		}

		return result;
	}

	@Override
	public boolean hasEnoughText() {
		return writer.size() >= getTextLimit();
	}

	// Computes the buffering limit, taking the shortText and mixedLanguages
	// flags (inherited from LanguageDetector) into account.
	private int getTextLimit() {
		int limit = (shortText ? MAX_CHARS_FOR_SHORT_DETECTION : MAX_CHARS_FOR_DETECTION);

		// We want more text if we're processing documents that have a mixture of languages.
		// FUTURE - figure out right amount to bump up the limit.
		if (mixedLanguages) {
			limit *= 2;
		}

		return limit;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy