All downloads are free. Search and download functionality uses the official Maven repository.

com.optimaize.langdetect.cybozu.CommandLineInterface Maven / Gradle / Ivy

/*
 * Copyright 2011 Nakatani Shuyo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.optimaize.langdetect.cybozu;

import com.optimaize.langdetect.frma.LangProfileWriter;
import com.optimaize.langdetect.cybozu.util.LangProfile;
import com.google.common.base.Optional;
import com.optimaize.langdetect.DetectedLanguage;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObject;
import com.optimaize.langdetect.text.TextObjectFactory;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;

import java.io.*;
import java.util.*;

/**
 * LangDetect Command Line Interface.
 *
 * 

This is a command line interface of Language Detection Library "LangDetect".

* *

Renamed: this class was previously known as "Command".

* *

TODO after my recent changes switching to the new Detector this code is untested. -Fabian

* * @author Nakatani Shuyo * @author Francois ROLAND * @author Fabian Kessler */ public class CommandLineInterface { /** smoothing default parameter (ELE) */ private static final double DEFAULT_ALPHA = 0.5; /** for Command line easy parser */ private final Map opt_with_value = new HashMap<>(); private final Map values = new HashMap<>(); private final Set opt_without_value = new HashSet<>(); private final List arglist = new ArrayList<>(); /** * Command Line Interface * @param args command line arguments */ public static void main(String[] args) throws IOException { CommandLineInterface cli = new CommandLineInterface(); cli.addOpt("-d", "directory", "./"); cli.addOpt("-a", "alpha", "" + DEFAULT_ALPHA); cli.addOpt("-s", "seed", null); cli.parse(args); if (cli.hasParam("--genprofile")) { cli.generateProfile(); } else if (cli.hasParam("--detectlang")) { cli.detectLang(); } else if (cli.hasParam("--batchtest")) { cli.batchTest(); } } /** * Command line easy parser * @param args command line arguments */ private void parse(String[] args) { for (int i=0; i>>"+value+"<<<", e); } } /** */ @Nullable private Long getParamLongOrNull(String key) { String value = values.get(key); if (value == null || value.isEmpty()) { return null; } try { return Long.valueOf(value); } catch (NumberFormatException e) { throw new RuntimeException("Invalid long value: >>>"+value+"<<<", e); } } private boolean hasParam(String opt) { return opt_without_value.contains(opt); } /** * File search (easy glob) * @param directory directory path * @param pattern searching file pattern with regular representation * @return matched file */ private File searchFile(File directory, String pattern) { if (!directory.isDirectory()) { throw new IllegalArgumentException("Not a directly: "+directory); } File[] files = directory.listFiles(); assert files != null; //checked for directly above. 
for (File file : files) { if (file.getName().matches(pattern)) return file; } return null; } /** * Generate Language Profile from a text file. * *
     * usage: --genprofile [text file] [language name]
     * 
* */ public void generateProfile() { File directory = new File(arglist.get(0)); String lang = arglist.get(1); File file = searchFile(directory, lang + "wiki-.*-abstract\\.xml.*"); if (file == null) { System.err.println("Not Found text file : lang = " + lang); return; } try(FileOutputStream outputStream = new FileOutputStream(new File(lang))) { LangProfile profile = GenProfile.load(lang, file); profile.omitLessFreq(); new LangProfileWriter().write(profile, outputStream); } catch (IOException e) { e.printStackTrace(); } } /** * Language detection test for each file (--detectlang option) * *
     * usage: --detectlang -d [profile directory] -a [alpha] -s [seed] [test file(s)]
     * 
* */ public void detectLang() throws IOException { LanguageDetector languageDetector = makeDetector(); TextObjectFactory textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); for (String filename: arglist) { try (BufferedReader is = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "utf-8"))) { TextObject textObject = textObjectFactory.create().append(is); List probabilities = languageDetector.getProbabilities(textObject); System.out.println(filename + ":" + probabilities); } } } /** * Batch Test of Language Detection (--batchtest option) * *
     * usage: --batchtest -d [profile directory] -a [alpha] -s [seed] [test data(s)]
     * 
* * The format of test data(s): *
     *   [correct language name]\t[text body for test]\n
     * 
* */ public void batchTest() throws IOException { LanguageDetector languageDetector = makeDetector(); TextObjectFactory textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); Map> result = new HashMap<>(); for (String filename : arglist) { try (BufferedReader is = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "utf-8"))) { while (is.ready()) { String line = is.readLine(); int idx = line.indexOf('\t'); if (idx <= 0) continue; String correctLang = line.substring(0, idx); String text = line.substring(idx + 1); TextObject textObject = textObjectFactory.forText(text); Optional lang = languageDetector.detect(textObject); if (!result.containsKey(correctLang)) result.put(correctLang, new ArrayList()); if (lang.isPresent()) { result.get(correctLang).add(lang.toString()); } else { result.get(correctLang).add("unknown"); } if (hasParam("--debug")) System.out.println(correctLang + "," + lang + "," + (text.length() > 100 ? text.substring(0, 100) : text)); } } List langList = new ArrayList<>(result.keySet()); Collections.sort(langList); int totalCount = 0, totalCorrect = 0; for (String lang : langList) { Map resultCount = new HashMap<>(); int count = 0; List list = result.get(lang); for (String detectedLang: list) { ++count; if (resultCount.containsKey(detectedLang)) { resultCount.put(detectedLang, resultCount.get(detectedLang) + 1); } else { resultCount.put(detectedLang, 1); } } int correct = resultCount.containsKey(lang)?resultCount.get(lang):0; double rate = correct / (double)count; System.out.println(String.format("%s (%d/%d=%.2f): %s", lang, correct, count, rate, resultCount)); totalCorrect += correct; totalCount += count; } System.out.println(String.format("total: %d/%d = %.3f", totalCorrect, totalCount, totalCorrect / (double) totalCount)); } } /** * Using all language profiles from the given directory. 
*/ private LanguageDetector makeDetector() throws IOException { double alpha = getParamDouble("alpha", DEFAULT_ALPHA); String profileDirectory = requireParamString("directory") + "/"; Optional seed = Optional.fromNullable(getParamLongOrNull("seed")); List languageProfiles = new LanguageProfileReader().readAll(new File(profileDirectory)); return LanguageDetectorBuilder.create(NgramExtractors.standard()) .alpha(alpha) .seed(seed) .shortTextAlgorithm(50) .withProfiles(languageProfiles) .build(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy