Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.language.detect;
import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.tika.config.ServiceLoader;
// We should use the IANA registry for primary language names...see
// http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
// There must be a package that uses this dataset to support knowledge of
// the default script, etc. And how to map from <lang>-<country> (e.g. 'zh-CN')
// to <sublang> ('cmn'), or <lang>-<sublang> to <sublang> ('zh-cmn' => 'cmn')
// We'd also want to know the default sublang for a macro language ('zh' => 'zh-cmn')
// There's also mapping 'zh-CN' to 'cmn-Hans' (simplified chinese script)
// TODO decide how deep to go into supporting extended language tags, see
// http://www.w3.org/International/articles/language-tags/. For example,
// what should you expect from calling hasModel("en-GB") if there's only
// a model for "en"?
// This is mostly an issue for interpreting language tags in (X)HTML docs,
// and maybe XML if we really care. In those cases you could get something
// like "ast" (three letter language code), or even zh-cmn-Hant-SG
// (Chinese, Mandarin, Traditional script, in Singapore) plus additional:
// language-extlang-script-region-variant-extension-privateuse
// The full spec is at http://www.rfc-editor.org/rfc/bcp/bcp47.txt
/**
 * Base class for language detectors. A detector is configured (mixed
 * languages / short text / priors), has its models loaded, and is then fed
 * text via the {@code addText} methods before {@link #detect()} or
 * {@link #detectAll()} is called.
 */
public abstract class LanguageDetector {

    private static final ServiceLoader DEFAULT_SERVICE_LOADER = new ServiceLoader();

    // True if text is expected to be a mix of languages, and thus higher-resolution
    // detection must be done to avoid under-sampling the text.
    protected boolean mixedLanguages = false;

    // True if the text is expected to be 'short' (typically less than 100 chars), and
    // thus a different algorithm and/or set of profiles should be used.
    protected boolean shortText = false;

    /**
     * Return the preferred language detector, i.e. the first entry of
     * {@link #getLanguageDetectors()}.
     *
     * @return the first available detector
     * @throws IllegalStateException if no language detector is available
     */
    public static LanguageDetector getDefaultLanguageDetector() {
        List<LanguageDetector> detectors = getLanguageDetectors();
        if (detectors.isEmpty()) {
            throw new IllegalStateException("No language detectors available");
        } else {
            return detectors.get(0);
        }
    }

    /**
     * Return all language detectors found via the default service loader.
     *
     * @return sorted list of detectors, see {@link #getLanguageDetectors(ServiceLoader)}
     */
    public static List<LanguageDetector> getLanguageDetectors() {
        return getLanguageDetectors(DEFAULT_SERVICE_LOADER);
    }

    /**
     * Return all language detectors found via the given service loader.
     * Detectors whose class name starts with {@code org.apache.tika.} are
     * ordered before all others; within each group the detectors are ordered
     * by class name.
     *
     * @param loader service loader used to look up detector implementations
     * @return sorted list of detectors
     */
    public static List<LanguageDetector> getLanguageDetectors(ServiceLoader loader) {
        List<LanguageDetector> detectors =
                loader.loadStaticServiceProviders(LanguageDetector.class);
        Collections.sort(detectors, new Comparator<LanguageDetector>() {
            @Override
            public int compare(LanguageDetector d1, LanguageDetector d2) {
                String n1 = d1.getClass().getName();
                String n2 = d2.getClass().getName();
                boolean tika1 = n1.startsWith("org.apache.tika.");
                boolean tika2 = n2.startsWith("org.apache.tika.");
                if (tika1 == tika2) {
                    // Same group: fall back to class name order.
                    return n1.compareTo(n2);
                } else if (tika1) {
                    return -1;
                } else {
                    return 1;
                }
            }
        });
        return detectors;
    }

    /**
     * @return true if the text is expected to be a mix of languages
     */
    public boolean isMixedLanguages() {
        return mixedLanguages;
    }

    /**
     * Set whether the text is expected to be a mix of languages.
     *
     * @param mixedLanguages true if mixed-language text is expected
     * @return this
     */
    public LanguageDetector setMixedLanguages(boolean mixedLanguages) {
        this.mixedLanguages = mixedLanguages;
        return this;
    }

    /**
     * @return true if the text is expected to be short
     */
    public boolean isShortText() {
        return shortText;
    }

    /**
     * Set whether the text is expected to be short.
     *
     * @param shortText true if short text is expected
     * @return this
     */
    public LanguageDetector setShortText(boolean shortText) {
        this.shortText = shortText;
        return this;
    }

    /**
     * Load (or re-load) all available language models. This must
     * be called after any settings that would impact the models
     * being loaded (e.g. mixed language/short text), but
     * before any of the document processing routines (below)
     * are called. Note that it only needs to be called once.
     *
     * @return this
     * @throws IOException if the models cannot be loaded
     */
    public abstract LanguageDetector loadModels() throws IOException;

    /**
     * Load (or re-load) the models specified in {@code languages}. These use the
     * ISO 639-1 names, with an optional {@code -<country>} suffix for more
     * specific specification (e.g. "zh-CN" for Chinese in China).
     *
     * @param languages list of target languages.
     * @return this
     * @throws IOException if the models cannot be loaded
     */
    public abstract LanguageDetector loadModels(Set<String> languages) throws IOException;

    /**
     * Provide information about whether a model exists for a specific
     * language.
     *
     * @param language ISO 639-1 name for language
     * @return true if a model for this language exists.
     */
    public abstract boolean hasModel(String language);

    /**
     * Set the a-priori probabilities for these languages. The provided map uses the language
     * as the key, and the probability (0.0 &lt; probability &lt; 1.0) of text being in that
     * language as the value.
     * Note that if the probabilities don't sum to 1.0, these values will be normalized.
     *
     * If hasModel() returns false for any of the languages, an IllegalArgumentException is thrown.
     *
     * Use of these probabilities is detector-specific, and thus might not impact the results at all.
     * As such, these should be viewed as a hint.
     *
     * @param languageProbabilities Map from language to probability
     * @return this
     * @throws IOException declared for implementations that need to (re-)load data
     */
    public abstract LanguageDetector setPriors(Map<String, Float> languageProbabilities)
            throws IOException;

    // ============================================================
    // The routines below are called when processing a document
    // ============================================================

    /**
     * Reset statistics about the current document being processed
     */
    public abstract void reset();

    /**
     * Add statistics about this text for the current document. Note
     * that we assume an implicit word break exists before/after
     * each of these runs of text.
     *
     * @param cbuf Character buffer
     * @param off Offset into cbuf to first character in the run of text
     * @param len Number of characters in the run of text.
     */
    public abstract void addText(char[] cbuf, int off, int len);

    /**
     * Add to the statistics being accumulated for the current
     * document. Note that this is a default implementation for adding
     * a string (not optimized): the text is copied into a char array and
     * passed to {@link #addText(char[], int, int)}.
     *
     * @param text Characters to add to current statistics.
     */
    public void addText(CharSequence text) {
        char[] chars = text.toString().toCharArray();
        addText(chars, 0, chars.length);
    }

    /**
     * Tell the caller whether enough text has been added for the current
     * document for the language to be reliably detected.
     *
     * Implementations can override this to do early termination of stats
     * collection, which can improve performance with longer documents.
     * This default implementation always returns false.
     *
     * Note that detect() can be called even when this returns false
     *
     * @return true if we have enough text for reliable detection.
     */
    public boolean hasEnoughText() {
        return false;
    }

    /**
     * Detect languages based on previously submitted text (via addText calls).
     *
     * @return list of all possible languages with at least medium confidence,
     *         sorted by confidence from highest to lowest. There will always
     *         be at least one result, which might have a confidence of NONE.
     */
    public abstract List<LanguageResult> detectAll();

    /**
     * Detect the most likely language based on previously submitted text
     * (via addText calls).
     *
     * @return the first (highest confidence) entry of {@link #detectAll()}
     */
    public LanguageResult detect() {
        List<LanguageResult> results = detectAll();
        return results.get(0);
    }

    /**
     * Utility wrapper that detects the language of a given chunk of text.
     * Any previously accumulated statistics are reset first.
     *
     * @param text String to add to current statistics.
     * @return list of all possible languages with at least medium confidence,
     *         sorted by confidence from highest to lowest.
     */
    public List<LanguageResult> detectAll(String text) {
        reset();
        addText(text);
        return detectAll();
    }

    /**
     * Utility wrapper that detects the most likely language of a given chunk
     * of text. Any previously accumulated statistics are reset first.
     *
     * @param text Characters to add to current statistics.
     * @return the first (highest confidence) detection result
     */
    public LanguageResult detect(CharSequence text) {
        reset();
        addText(text);
        return detect();
    }
}