// org.apache.tika.language.detect.LanguageDetector (Maven / Gradle / Ivy)

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.language.detect;

import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.tika.config.ServiceLoader;

// We should use the IANA registry for primary language names...see
// http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
// There must be a package that uses this dataset to support knowledge of
// the default script, etc. And how to map from <lang>-<country> (e.g. 'zh-CN')
// to <sublang> ('cmn'), or <lang>-<sublang> to <sublang> ('zh-cmn' => 'cmn')
// We'd also want to know the default sublang for a macro language ('zh' => 'zh-cmn')
// There's also mapping 'zh-CN' to 'cmn-Hans' (simplified chinese script)

// TODO decide how deep to go into supporting extended language tags, see
// http://www.w3.org/International/articles/language-tags/. For example,
// what should you expect from calling hasModel("en-GB") if there's only
// a model for "en"?

// This is mostly an issue for interpreting language tags in (X)HTML docs,
// and maybe XML if we really care. In those cases you could get something
// like "ast" (three letter language code), or even zh-cmn-Hant-SG
// (Chinese, Mandarin, Traditional script, in Singapore) plus additional:
// language-extlang-script-region-variant-extension-privateuse

// The full spec is at http://www.rfc-editor.org/rfc/bcp/bcp47.txt

/**
 * Base class for language detectors. A detector is first configured
 * (mixed-language / short-text hints, models, priors), then fed text via
 * the {@code addText} methods, and finally queried with {@link #detect()}
 * or {@link #detectAll()}.
 */
public abstract class LanguageDetector {

	private static final ServiceLoader DEFAULT_SERVICE_LOADER = new ServiceLoader();

	// True if text is expected to be a mix of languages, and thus higher-resolution
	// detection must be done to avoid under-sampling the text.
	protected boolean mixedLanguages = false;

	// True if the text is expected to be 'short' (typically less than 100 chars), and
	// thus a different algorithm and/or set of profiles should be used.
	protected boolean shortText = false;

	/**
	 * Return the preferred language detector, i.e. the first entry of
	 * {@link #getLanguageDetectors()}.
	 *
	 * @return the first available detector
	 * @throws IllegalStateException if no language detector is available
	 */
	public static LanguageDetector getDefaultLanguageDetector() {
		List<LanguageDetector> detectors = getLanguageDetectors();
		if (detectors.isEmpty()) {
			throw new IllegalStateException("No language detectors available");
		} else {
			return detectors.get(0);
		}
	}

	/**
	 * Return all language detectors found via the default service loader.
	 *
	 * @return detectors, ordered as described in
	 *         {@link #getLanguageDetectors(ServiceLoader)}
	 */
	public static List<LanguageDetector> getLanguageDetectors() {
		return getLanguageDetectors(DEFAULT_SERVICE_LOADER);
	}

	/**
	 * Return all language detectors found via the given service loader.
	 * Detectors whose class name starts with {@code org.apache.tika.} come
	 * first; within each group detectors are ordered by class name.
	 *
	 * @param loader service loader used to look up the detectors
	 * @return the sorted list of detectors (the list returned by the loader,
	 *         sorted in place)
	 */
	public static List<LanguageDetector> getLanguageDetectors(ServiceLoader loader) {
		List<LanguageDetector> detectors = loader.loadStaticServiceProviders(LanguageDetector.class);
		Collections.sort(detectors, new Comparator<LanguageDetector>() {
			@Override
			public int compare(LanguageDetector d1, LanguageDetector d2) {
				String n1 = d1.getClass().getName();
				String n2 = d2.getClass().getName();
				boolean tika1 = n1.startsWith("org.apache.tika.");
				boolean tika2 = n2.startsWith("org.apache.tika.");
				if (tika1 == tika2) {
					// Same group: fall back to alphabetical order of class names.
					return n1.compareTo(n2);
				} else if (tika1) {
					return -1;
				} else {
					return 1;
				}
			}
		});

		return detectors;
	}

	/**
	 * @return true if the text is expected to contain a mix of languages
	 */
	public boolean isMixedLanguages() {
		return mixedLanguages;
	}

	/**
	 * Set whether the text is expected to contain a mix of languages.
	 *
	 * @param mixedLanguages true if mixed-language text is expected
	 * @return this
	 */
	public LanguageDetector setMixedLanguages(boolean mixedLanguages) {
		this.mixedLanguages = mixedLanguages;
		return this;
	}

	/**
	 * @return true if the text is expected to be short
	 */
	public boolean isShortText() {
		return shortText;
	}

	/**
	 * Set whether the text is expected to be short.
	 *
	 * @param shortText true if short text is expected
	 * @return this
	 */
	public LanguageDetector setShortText(boolean shortText) {
		this.shortText = shortText;
		return this;
	}

	/**
	 * Load (or re-load) all available language models. This must
	 * be called after any settings that would impact the models
	 * being loaded (e.g. mixed language/short text), but
	 * before any of the document processing routines (below)
	 * are called. Note that it only needs to be called once.
	 *
	 * @return this
	 * @throws IOException if the models cannot be loaded
	 */
	public abstract LanguageDetector loadModels() throws IOException;

	/**
	 * Load (or re-load) the models specified in {@code languages}. These use the
	 * ISO 639-1 names, with an optional "-&lt;country code&gt;" for more
	 * specific specification (e.g. "zh-CN" for Chinese in China).
	 *
	 * @param languages list of target languages.
	 * @return this
	 * @throws IOException if the models cannot be loaded
	 */
	public abstract LanguageDetector loadModels(Set<String> languages) throws IOException;

	/**
	 * Provide information about whether a model exists for a specific
	 * language.
	 *
	 * @param language ISO 639-1 name for language
	 * @return true if a model for this language exists.
	 */
	public abstract boolean hasModel(String language);

	/**
	 * Set the a-priori probabilities for these languages. The provided map uses the language
	 * as the key, and the probability ({@code 0.0 < probability < 1.0}) of text being in that
	 * language as the value. Note that if the probabilities don't sum to 1.0, these values
	 * will be normalized.
	 *
	 * If hasModel() returns false for any of the languages, an IllegalArgumentException is thrown.
	 *
	 * Use of these probabilities is detector-specific, and thus might not impact the results at all.
	 * As such, these should be viewed as a hint.
	 *
	 * @param languageProbabilities Map from language to probability
	 * @return this
	 * @throws IOException declared for implementations that need to (re)load data
	 */
	public abstract LanguageDetector setPriors(Map<String, Float> languageProbabilities) throws IOException;

	// ============================================================
	// The routines below are called when processing a document
	// ============================================================

	/**
	 * Reset statistics about the current document being processed
	 */
	public abstract void reset();

	/**
	 * Add statistics about this text for the current document. Note
	 * that we assume an implicit word break exists before/after
	 * each of these runs of text.
	 *
	 * @param cbuf Character buffer
	 * @param off Offset into cbuf to first character in the run of text
	 * @param len Number of characters in the run of text.
	 */
	public abstract void addText(char[] cbuf, int off, int len);

	/**
	 * Add {@code text} to the statistics being accumulated for the current
	 * document. Note that this is a default implementation for adding
	 * a string (not optimized): the text is copied into a char array and
	 * passed to {@link #addText(char[], int, int)}.
	 *
	 * @param text Characters to add to current statistics.
	 */
	public void addText(CharSequence text) {
		char[] chars = text.toString().toCharArray();
		addText(chars, 0, chars.length);
	}

	/**
	 * Tell the caller whether enough text has been collected for the current
	 * document for the language to be reliably detected.
	 *
	 * Implementations can override this to do early termination of stats
	 * collection, which can improve performance with longer documents.
	 * This default implementation always returns false.
	 *
	 * Note that detect() can be called even when this returns false
	 *
	 * @return true if we have enough text for reliable detection.
	 */
	public boolean hasEnoughText() {
		return false;
	}

	/**
	 * Detect languages based on previously submitted text (via addText calls).
	 *
	 * @return	list of all possible languages with at least medium confidence,
	 * 			sorted by confidence from highest to lowest. There will always
	 * 			be at least one result, which might have a confidence of NONE.
	 */
	public abstract List<LanguageResult> detectAll();

	/**
	 * Detect the most likely language based on previously submitted text.
	 *
	 * @return the first entry of {@link #detectAll()}
	 */
	public LanguageResult detect() {
		List<LanguageResult> results = detectAll();
		return results.get(0);
	}

	/**
	 * Utility wrapper that detects the language of a given chunk of text.
	 * Any previously accumulated statistics are reset first.
	 *
	 * @param text String to add to current statistics.
	 * @return list of all possible languages with at least medium confidence,
	 * 			sorted by confidence from highest to lowest.
	 */
	public List<LanguageResult> detectAll(String text) {
		reset();
		addText(text);
		return detectAll();
	}

	/**
	 * Utility wrapper that detects the most likely language of a given chunk
	 * of text. Any previously accumulated statistics are reset first.
	 *
	 * @param text Characters to add to current statistics.
	 * @return the first entry of {@link #detectAll()} for this text
	 */
	public LanguageResult detect(CharSequence text) {
		reset();
		addText(text);
		return detect();
	}

}