// com.cybozu.labs.langdetect.GenProfile Maven / Gradle / Ivy
package com.cybozu.labs.langdetect;
import com.cybozu.labs.langdetect.util.LangProfile;
import com.cybozu.labs.langdetect.util.TagExtractor;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.*;
import java.util.zip.GZIPInputStream;
/**
 * Builds {@link LangProfile} instances from training corpora.
 * <p>
 * Two corpus formats are supported: a Wikipedia abstract XML database
 * (optionally gzip-compressed) and a plain UTF-8 text file. Both loaders feed
 * the extracted text to {@link LangProfile#update} and print a
 * {@code <lang>:<count>} progress line to standard output.
 *
 * @author Nakatani Shuyo
 */
public class GenProfile implements Serializable {

    /**
     * Loads a Wikipedia abstract database file and generates its language profile.
     * <p>
     * The file is decoded as UTF-8 and parsed with a StAX reader. Files whose name
     * ends with {@code .gz} are transparently gunzipped. Character data is passed to
     * a {@link TagExtractor} configured for the {@code abstract} tag; every non-null
     * text returned when an element closes is added to the profile.
     *
     * @param lang target language name
     * @param file target database file path
     * @return Language profile instance
     * @throws LangDetectException with {@code ErrorCode.TrainDataFormatError} if the file is not
     *         well-formed XML, or with {@code ErrorCode.CantOpenTrainData} if the file cannot be
     *         opened or read
     */
    public static LangProfile loadFromWikipediaAbstract(String lang, File file) throws LangDetectException {
        LangProfile profile = new LangProfile(lang);
        // Track the raw file stream separately so it can still be closed if wrapping it
        // (e.g. in a GZIPInputStream over a non-gzip file) fails before 'br' is assigned.
        InputStream raw = null;
        BufferedReader br = null;
        try {
            raw = new FileInputStream(file);
            InputStream is = raw;
            if (file.getName().endsWith(".gz")) {
                is = new GZIPInputStream(raw);
            }
            br = new BufferedReader(new InputStreamReader(is, "utf-8"));

            // NOTE(review): the second argument is presumably a minimum text length
            // threshold - confirm against TagExtractor.
            TagExtractor tagextractor = new TagExtractor("abstract", 100);

            XMLStreamReader reader = null;
            try {
                XMLInputFactory factory = XMLInputFactory.newInstance();
                // Training dumps are typically downloaded; never resolve external entities (XXE).
                factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
                reader = factory.createXMLStreamReader(br);
                while (reader.hasNext()) {
                    switch (reader.next()) {
                        case XMLStreamReader.START_ELEMENT:
                            tagextractor.setTag(reader.getName().toString());
                            break;
                        case XMLStreamReader.CHARACTERS:
                            tagextractor.add(reader.getText());
                            break;
                        case XMLStreamReader.END_ELEMENT:
                            // closeTag() yields null when there is nothing to learn from this element.
                            String text = tagextractor.closeTag();
                            if (text != null) {
                                profile.update(text);
                            }
                            break;
                        default:
                            // Other event types (comments, processing instructions, ...) are ignored.
                            break;
                    }
                }
            } catch (XMLStreamException e) {
                // NOTE(review): the cause is dropped because no LangDetectException constructor
                // accepting a cause is visible from this file.
                throw new LangDetectException(ErrorCode.TrainDataFormatError, "Training database file '" + file.getName() + "' is an invalid XML.");
            } finally {
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (XMLStreamException ignored) {
                        // Best effort: a failure while closing must not mask the primary outcome.
                    }
                }
            }
            System.out.println(lang + ":" + tagextractor.count());
        } catch (IOException e) {
            // This handler covers both open failures and read failures of the underlying stream.
            throw new LangDetectException(ErrorCode.CantOpenTrainData, "Can't open training database file '" + file.getName() + "'");
        } finally {
            // Closing the reader closes the whole stream chain; fall back to the raw stream
            // when the reader was never constructed.
            if (br != null) {
                closeQuietly(br);
            } else {
                closeQuietly(raw);
            }
        }
        return profile;
    }

    /**
     * Loads a UTF-8 text file and generates its language profile.
     * <p>
     * Every line of the file (including empty lines) is passed to
     * {@link LangProfile#update}; the number of lines read is printed to standard output.
     *
     * @param lang target language name
     * @param file target file path
     * @return Language profile instance
     * @throws LangDetectException with {@code ErrorCode.CantOpenTrainData} if the file cannot be
     *         opened or read
     */
    public static LangProfile loadFromText(String lang, File file) throws LangDetectException {
        LangProfile profile = new LangProfile(lang);
        InputStream raw = null;
        BufferedReader is = null;
        try {
            raw = new FileInputStream(file);
            is = new BufferedReader(new InputStreamReader(raw, "utf-8"));
            int count = 0;
            String line;
            // readLine() returning null is the only reliable end-of-stream signal; ready() merely
            // reports whether a read would block and must not be used to detect EOF.
            while ((line = is.readLine()) != null) {
                profile.update(line);
                ++count;
            }
            System.out.println(lang + ":" + count);
        } catch (IOException e) {
            // This handler covers both open failures and read failures of the underlying stream.
            throw new LangDetectException(ErrorCode.CantOpenTrainData, "Can't open training database file '" + file.getName() + "'");
        } finally {
            if (is != null) {
                closeQuietly(is);
            } else {
                closeQuietly(raw);
            }
        }
        return profile;
    }

    /**
     * Closes the given resource, ignoring {@code null} and any {@link IOException} raised by
     * {@code close()}, so that cleanup never masks the result or exception of the main operation.
     */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Best effort: nothing useful can be done if closing fails.
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy