All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.language.LanguageIdentifier Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.language;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

/**
 * Identifier of the language that best matches a given content profile.
 * The content profile is compared to generic language profiles based on
 * material from various sources.
 * <p>
 * The set of known profiles is held in static, mutable state shared by all
 * instances. This class performs no synchronization on that state, so callers
 * that re-initialize profiles while other threads are identifying content
 * must coordinate externally.
 *
 * @since Apache Tika 0.5
 * @see "Europarl: A Parallel Corpus for Statistical Machine Translation"
 * @see "ISO 639 Language Codes"
 * @deprecated  use a concrete class of {@link org.apache.tika.language.detect.LanguageDetector}
 */
@Deprecated
public class LanguageIdentifier {

    /**
     * The available language profiles, keyed by language code.
     */
    private static final Map<String, LanguageProfile> PROFILES = new HashMap<>();

    /** File name suffix of the n-gram profile resources. */
    private static final String PROFILE_SUFFIX = ".ngp";

    /** Properties most recently loaded by {@link #initProfiles()}. */
    private static Properties props = new Properties();

    /** Newline-separated error messages collected by the last {@link #initProfiles()} run. */
    private static String errors = "";

    private static final String PROPERTIES_OVERRIDE_FILE = "tika.language.override.properties";
    private static final String PROPERTIES_FILE = "tika.language.properties";
    private static final String LANGUAGES_KEY = "languages";

    /** Distance below which an identification is considered trustworthy. */
    private static final double CERTAINTY_LIMIT = 0.022;

    /** Code of the closest known language, or "unknown" if none was close enough. */
    private final String language;

    /** Distance between the content profile and the closest language profile. */
    private final double distance;

    /*
     * Always attempt initializing language profiles when class is loaded first time.
     * Must stay below the static field declarations above: initProfiles() uses them.
     */
    static {
        initProfiles();
    }

    /*
     * Add one language profile based on config in property file.
     * The profile is read from the classpath resource "<language>.ngp", resolved
     * relative to this class. Each non-empty line that does not start with '#'
     * is expected to be "<ngram> <count>".
     *
     * Throws Exception (with the underlying failure as its cause) if the resource
     * is missing, unreadable or malformed.
     */
    private static void addProfile(String language) throws Exception {
        String resource = language + PROFILE_SUFFIX;
        try {
            InputStream stream = LanguageIdentifier.class.getResourceAsStream(resource);
            if (stream == null) {
                // getResourceAsStream signals "not found" with null, not an exception.
                throw new IOException("Profile resource not found: " + resource);
            }

            LanguageProfile profile = new LanguageProfile();
            // Closing the reader also closes the underlying resource stream.
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(stream, UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.isEmpty() || line.startsWith("#")) {
                        continue; // blank line or comment
                    }
                    int space = line.indexOf(' ');
                    if (space < 0) {
                        throw new IOException(
                                "Malformed line in " + resource + ": \"" + line + "\"");
                    }
                    profile.add(
                            line.substring(0, space),
                            Long.parseLong(line.substring(space + 1)));
                }
            }

            addProfile(language, profile);
        } catch (Exception e) {
            // Keep the original message format, but chain the cause for diagnostics.
            throw new Exception("Failed trying to load language profile for language \""+language+"\". Error: "+e.getMessage(), e);
        }
    }

    /**
     * Adds a single language profile, replacing any profile already
     * registered under the same language code.
     * @param language an ISO 639 code representing language
     * @param profile the language profile
     */
    public static void addProfile(String language, LanguageProfile profile) {
        PROFILES.put(language, profile);
    }

    /**
     * Constructs a language identifier based on a LanguageProfile.
     * The given profile is compared against every registered language profile
     * and the one with the smallest distance wins. If no registered profile has
     * a distance below 1.0, the language is "unknown" and the distance is 1.0.
     * @param profile the language profile
     */
    public LanguageIdentifier(LanguageProfile profile) {
        String minLanguage = "unknown";
        double minDistance = 1.0;
        for (Map.Entry<String, LanguageProfile> entry : PROFILES.entrySet()) {
            double candidate = profile.distance(entry.getValue());
            if (candidate < minDistance) {
                minDistance = candidate;
                minLanguage = entry.getKey();
            }
        }

        this.language = minLanguage;
        this.distance = minDistance;
    }

    /**
     * Constructs a language identifier based on a String of text content
     * @param content the text
     */
    public LanguageIdentifier(String content) {
        this(new LanguageProfile(content));
    }

    /**
     * Gets the identified language
     * @return an ISO 639 code representing the detected language,
     *         or "unknown" if no registered profile matched
     */
    public String getLanguage() {
        return language;
    }

    /**
     * Tries to judge whether the identification is certain enough
     * to be trusted.
     * WARNING: Will never return true for small amount of input texts.
     * @return true if the distance is smaller than {@value LanguageIdentifier#CERTAINTY_LIMIT}, false otherwise
     */
    public boolean isReasonablyCertain() {
        return distance < CERTAINTY_LIMIT;
    }

    /**
     * Builds the language profiles.
     * The list of languages are fetched from a property file named "tika.language.properties"
     * If a file called "tika.language.override.properties" is found on classpath, this is used instead
     * The property file contains a key "languages" with values being comma-separated language codes
     * <p>
     * Existing profiles and previously recorded errors are discarded first.
     * Failures are not thrown; they are collected and can be inspected through
     * {@link #hasErrors()} and {@link #getErrors()}.
     */
    public static void initProfiles() {
        clearProfiles();

        errors = "";
        InputStream stream =
                LanguageIdentifier.class.getResourceAsStream(PROPERTIES_OVERRIDE_FILE);
        if (stream == null) {
            stream = LanguageIdentifier.class.getResourceAsStream(PROPERTIES_FILE);
        }

        if (stream != null) {
            // Properties.load leaves the stream open, so close it here.
            try (InputStream in = stream) {
                props = new Properties();
                props.load(in);
            } catch (IOException e) {
                errors += "IOException while trying to load property file. Message: " + e.getMessage() + "\n";
            }
        }

        String languageList = props.getProperty(LANGUAGES_KEY);
        if (languageList == null) {
            // No property file, or no "languages" key in it. Record the problem rather
            // than failing with a NullPointerException, which would make this class
            // unloadable when triggered from the static initializer.
            errors += "No \"" + LANGUAGES_KEY + "\" property found. No language profiles initialized.\n";
            return;
        }

        for (String code : languageList.split(",")) {
            String language = code.trim();
            String name = props.getProperty("name."+language, "Unknown");
            try {
                addProfile(language);
            } catch (Exception e) {
                errors += "Language " + language + " (" + name + ") not initialized. Message: " + e.getMessage() + "\n";
            }
        }
    }

    /**
     * Initializes the language profiles from a user supplied initialized Map.
     * This overrides the default set of profiles initialized at startup,
     * and provides an alternative to configuring profiles through property file
     *
     * @param profilesMap map of language profiles, keyed by language code
     */
    public static void initProfiles(Map<String, LanguageProfile> profilesMap) {
        clearProfiles();
        for (Map.Entry<String, LanguageProfile> entry : profilesMap.entrySet()) {
            addProfile(entry.getKey(), entry.getValue());
        }
    }

    /**
     * Clears the current map of language profiles
     */
    public static void clearProfiles() {
        PROFILES.clear();
    }

    /**
     * Tests whether there were errors initializing language config
     * @return true if there are errors. Use getErrors() to retrieve.
     */
    public static boolean hasErrors() {
        // Compare by content; reference comparison of strings is not reliable.
        return !errors.isEmpty();
    }

    /**
     * Returns a string of error messages related to initializing language profiles
     * @return the String containing the error messages, one per line;
     *         empty if there were none
     */
    public static String getErrors() {
        return errors;
    }

    /**
     * Returns what languages are supported for language identification.
     * The returned set is a live view of the registered profiles' keys.
     * @return A set of Strings being the ISO 639 language codes
     */
    public static Set<String> getSupportedLanguages() {
        return PROFILES.keySet();
    }

    @Override
    public String toString() {
        return language + " (" + distance + ")";
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy