package de.datexis.preprocess;
import com.google.common.base.Optional;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObject;
import com.optimaize.langdetect.text.TextObjectFactory;
import de.datexis.common.Resource;
import de.datexis.common.WordHelpers;
import de.datexis.model.Document;
import de.datexis.model.Sentence;
import de.datexis.model.Token;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.*;
import static de.datexis.common.WordHelpers.skipSpaceAfter;
import static de.datexis.common.WordHelpers.skipSpaceBefore;
/**
* Creates a fully tokenized Document from raw text.
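*
* <p>Minimal usage sketch (sentence boundaries depend on the bundled openNLP models):</p>
* <pre>{@code
* Document doc = DocumentFactory.fromText("First sentence. Second sentence.");
* }</pre>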
* @author sarnold, rschneider, fgrimme
*/
public class DocumentFactory {
protected static final Logger log = LoggerFactory.getLogger(DocumentFactory.class);
protected static DocumentFactory instance = new DocumentFactory();
public static DocumentFactory getInstance() {
return instance;
}
public enum Newlines {
KEEP, // keep all newlines in the text and use them as sentence breaks
//KEEP_DOUBLE, // keep only double newlines in the text, but use all of them as sentence breaks
DISCARD // discard all newlines in the text but still use them as sentence breaks
}
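// Sketch of the two modes: with "line one\nline two" as input, both modes break the
// Sentence at the newline; KEEP additionally emits "\n" as a Token, DISCARD drops it from the text.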
private static final String LANG_EN = "en";
private static final String LANG_DE = "de";
TreeMap<String, SentenceDetectorME> sentenceSplitter;
TreeMap<String, TokenizerME> newlineTokenizer;
TextObjectFactory textObjectFactory;
LanguageDetector languageDetector;
/**
* Create a new DocumentFactory instance. Use this only if you need multiple instances!
* Otherwise, getInstance() will return a singleton object that you can use.
*/
public DocumentFactory() {
sentenceSplitter = new TreeMap<>();
newlineTokenizer = new TreeMap<>();
loadSentenceSplitter(LANG_EN, Resource.fromJAR("openNLP/en-sent.bin"));
loadTokenizer(LANG_EN, Resource.fromJAR("openNLP/en-token.bin"));
loadSentenceSplitter(LANG_DE, Resource.fromJAR("openNLP/de-sent.bin"));
loadTokenizer(LANG_DE, Resource.fromJAR("openNLP/de-token.bin"));
try {
//load all languages:
List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn();
//build language detector:
languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
.withProfiles(languageProfiles)
.build();
//create a text object factory
textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
} catch (IOException ex) {
log.error("Could not load language profiles", ex);
}
}
private void loadSentenceSplitter(String language, Resource modelPath) {
try {
SentenceModel sentenceModel = new SentenceModel(modelPath.getInputStream());
sentenceSplitter.put(language, new SentenceDetectorMENL(sentenceModel));
} catch (IOException ex) {
throw new IllegalStateException("cannot load openNLP model '" + modelPath.toString() + "'", ex);
}
}
private void loadTokenizer(String language, Resource modelPath) {
try {
TokenizerModel tokenModel = new TokenizerModel(modelPath.getInputStream());
newlineTokenizer.put(language, new TokenizerMENL(tokenModel));
} catch (IOException ex) {
throw new IllegalStateException("cannot load openNLP model '" + modelPath.toString() + "'", ex);
}
}
/**
* Creates a Document with Sentences and Tokens from a String.
* Newlines in the text will lead to new sentences, but will not be contained in the document.
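* <p>Example (a sketch; the exact sentence boundaries depend on the openNLP model):</p>
* <pre>{@code
* Document doc = DocumentFactory.fromText("Hello world.\nA newline starts a new sentence.");
* }</pre>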
* @param text The text to process
*/
public static Document fromText(String text) {
return instance.createFromText(text);
}
/**
* Creates a Document with Sentences and Tokens from a String.
* Newlines in the text will always lead to new sentences, and you can choose to keep or discard them.
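* <p>Example (sketch):</p>
* <pre>{@code
* // keep the "\n" Tokens in the Document, e.g. to preserve paragraph breaks
* Document kept = DocumentFactory.fromText("line one\nline two", Newlines.KEEP);
* // drop the "\n" Tokens, but still split a Sentence at the newline
* Document dropped = DocumentFactory.fromText("line one\nline two", Newlines.DISCARD);
* }</pre>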
* @param text The text to process
* @param newlines Keep or discard newlines
*/
public static Document fromText(String text, Newlines newlines) {
return instance.createFromText(text, newlines);
}
/**
* Creates a Document with Sentences and Tokens from a String.
* Newlines in the text will always lead to new sentences, and you can choose to keep or discard them.
* @param text The text to process
* @param newlines Keep or discard newlines
* @param lang use a given language for tokenization and sentence splitting
*/
public static Document fromText(String text, Newlines newlines, WordHelpers.Language lang) {
return instance.createFromText(text, newlines, lang);
}
/**
* Creates a Document with Sentences and Tokens from a whitespace-tokenized String.
* The Tokens in the resulting Document will follow the tokenization from the input, and Sentences are split automatically.
* Whitespace between the tokens will be guessed according to some simple rules.
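* <p>Example (sketch; the input is already tokenized, whitespace offsets are guessed):</p>
* <pre>{@code
* Document doc = DocumentFactory.fromTokenizedText("This is n't plain text .");
* }</pre>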
* @param text The tokenized text to process
*/
public static Document fromTokenizedText(String text) {
final List<Token> tokens = instance.createTokensFromTokenizedText(text, 0);
return instance.createFromTokens(tokens);
}
/**
* Creates a Document from existing Tokens, processing Span positions and Sentence splitting.
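* <p>Example (sketch; begin/end offsets must match the underlying text):</p>
* <pre>{@code
* List<Token> tokens = Arrays.asList(
*     new Token("Hello", 0, 5),
*     new Token("world", 6, 11),
*     new Token(".", 11, 12));
* Document doc = DocumentFactory.fromTokens(tokens);
* }</pre>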
* @param tokens The list of Tokens to process in their natural order. WARNING: Token offsets must be set correctly!
*/
public static Document fromTokens(List<Token> tokens) {
return instance.createFromTokens(tokens);
}
/**
* Create Tokens from raw text, without sentence splitting.
* If you don't need exact tokenization of punctuation and Token offsets, consider using WordHelpers.splitSpaces() instead.
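* <p>Example (sketch; the exact splits depend on the openNLP token model):</p>
* <pre>{@code
* List<Token> tokens = DocumentFactory.createTokensFromText("I didn't know.");
* // e.g. "I", "did", "n't", "know", "." with character offsets attached
* }</pre>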
* @param text The text to process
*/
public static List<Token> createTokensFromText(String text) {
return instance.createTokensFromText(text, 0);
}
/**
* Create Tokens from tokenized text, without sentence splitting.
* Whitespace between the tokens will be guessed according to some simple rules.
* @param text The tokenized text to process
*/
public static List<Token> createTokensFromTokenizedText(String text) {
return instance.createTokensFromTokenizedText(text, 0);
}
/**
* Create a single Sentence from given Tokens, omitting Sentence splitting.
*/
public static Sentence createSentenceFromTokens(List<Token> tokens) {
Sentence s = new Sentence();
tokens.stream()
.filter(t -> !t.isEmpty())
.forEach(t -> s.addToken(t));
return s;
}
public static Sentence createSentenceFromString(String text, String language) {
return createSentenceFromTokens(instance.createTokensFromText(text, 0, language));
}
public static Sentence createSentenceFromTokenizedString(String text) {
return createSentenceFromTokens(instance.createTokensFromTokenizedText(text, 0));
}
/**
* Detects the language of a text
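* <p>Example (sketch; detection reliability grows with text length):</p>
* <pre>{@code
* DocumentFactory.getLanguage("The quick brown fox jumps over the lazy dog."); // "en"
* }</pre>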
* @return language code, e.g. "en" or "de"
*/
public static String getLanguage(String text) {
return instance.detectLanguage(text);
}
public Document createFromText(String text) {
Document doc = new Document();
addToDocumentFromText(text, doc, Newlines.DISCARD);
return doc;
}
public Document createFromText(String text, Newlines newlines) {
Document doc = new Document();
addToDocumentFromText(text, doc, newlines);
return doc;
}
public Document createFromText(String text, Newlines newlines, WordHelpers.Language lang) {
Document doc = new Document();
addToDocumentFromText(text, doc, newlines, lang.toString().toLowerCase());
return doc;
}
public void addToDocumentFromText(String text, Document doc, Newlines newlines) {
String lang = doc.getLanguage();
if(lang == null) {
lang = detectLanguage(text);
if(!lang.isEmpty()) doc.setLanguage(lang);
}
addToDocumentFromText(text, doc, newlines, lang);
}
public void addToDocumentFromText(String text, Document doc, Newlines newlines, String language) {
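// append new sentences after the existing text, assuming one separator character in between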
int docOffset = doc.getEnd();
if(docOffset > 0) docOffset++;
// find best Tokenizer and Splitter for text
TokenizerME tokenizer = newlineTokenizer.getOrDefault(language, newlineTokenizer.get(LANG_EN));
SentenceDetectorME ssplit = sentenceSplitter.getOrDefault(language, sentenceSplitter.get(LANG_EN));
opennlp.tools.util.Span[] sentences = ssplit.sentPosDetect(text);
// Tokenize sentences
int countNewlines = 0;
int nlOffset = 0; // number of skipped newlines
for(opennlp.tools.util.Span sentence : sentences) {
if(sentence == null) continue;
String sentenceText = text.substring(sentence.getStart(), sentence.getEnd());
opennlp.tools.util.Span[] tokens = tokenizer.tokenizePos(sentenceText);
List<Token> tokenList = new LinkedList<>();
for(opennlp.tools.util.Span token : tokens) {
String tokenText = sentenceText.substring(token.getStart(), token.getEnd());
if(tokenText.equals("\n")) { // newline
countNewlines++;
if(newlines == Newlines.KEEP) { // newline is a paragraph
tokenList.add(new Token(tokenText, docOffset - nlOffset + sentence.getStart() + token.getStart(), docOffset - nlOffset + sentence.getStart() + token.getEnd()));
//} else if(newlines == Newlines.KEEP_DOUBLE && countNewlines == 2) { // two newlines are a new paragraph, skip next though
// tokenList.add(new Token(tokenText, docOffset - nlOffset + sentence.getStart() + token.getStart(), docOffset - nlOffset + sentence.getStart() + token.getEnd()));
} else if(newlines == Newlines.DISCARD) { // skip newlines, but keep one whitespace
if(countNewlines > 1) nlOffset++;
} else {
nlOffset++;
}
} else {
tokenList.add(new Token(tokenText, docOffset - nlOffset + sentence.getStart() + token.getStart(), docOffset - nlOffset + sentence.getStart() + token.getEnd()));
countNewlines = 0;
}
}
doc.addSentence(new Sentence(tokenList), false);
}
}
public synchronized String detectLanguage(String text) {
try {
TextObject textObject = textObjectFactory.forText(text);
Optional<LdLocale> locale = languageDetector.detect(textObject);
if(locale.isPresent()) return locale.get().getLanguage();
} catch(Exception e) {
// detection failed, fall through and return an empty language code
}
return "";
}
public Document createFromTokens(List<Token> tokens) {
String text = WordHelpers.tokensToText(tokens, 0);
String lang = detectLanguage(text);
Document doc = new Document();
doc.setLanguage(lang);
createSentencesFromTokens(tokens, lang).forEach(sentence -> {
doc.addSentence(sentence, false);
});
return doc;
}
public List<Sentence> createSentencesFromTokens(List<Token> tokens) {
String text = WordHelpers.tokensToText(tokens, 0);
String lang = detectLanguage(text);
return createSentencesFromTokens(tokens, lang);
}
public List<Sentence> createSentencesFromTokens(List<Token> tokens, String language) {
List<Sentence> result = new ArrayList<>();
String text = WordHelpers.tokensToText(tokens, 0);
// find best Tokenizer and Splitter for text
SentenceDetectorME ssplit = sentenceSplitter.getOrDefault(language, sentenceSplitter.get(LANG_EN));
opennlp.tools.util.Span[] sentences = ssplit.sentPosDetect(text);
// Tokenize sentences
Iterator<Token> tokenIt = tokens.iterator();
if(!tokenIt.hasNext()) return result;
Token currentToken = tokenIt.next();
for(opennlp.tools.util.Span sentence : sentences) {
if(sentence == null) continue;
List<Token> tokenList = new ArrayList<>();
while(currentToken.getBegin() < sentence.getEnd()) {
if(!currentToken.getText().equals("\n")) {
tokenList.add(currentToken);
}
if(!tokenIt.hasNext()) break;
currentToken = tokenIt.next();
}
result.add(new Sentence(tokenList));
}
return result;
}
/**
* Creates a list of Tokens from raw text (ignores sentences)
*/
public List<Token> createTokensFromText(String text, int offset) {
String language = detectLanguage(text);
return createTokensFromText(text, offset, language);
}
/**
* Creates a list of Tokens from raw text (ignores sentences)
*/
public List<Token> createTokensFromText(String text, int offset, String language) {
TokenizerME tokenizer = newlineTokenizer.getOrDefault(language, newlineTokenizer.get(LANG_EN));
opennlp.tools.util.Span[] tokens = tokenizer.tokenizePos(text);
List<Token> tokenList = new LinkedList<>();
for(opennlp.tools.util.Span token : tokens) {
String tokenText = text.substring(token.getStart(), token.getEnd());
Token t = new Token(tokenText, offset + token.getStart(), offset + token.getEnd());
tokenList.add(t);
}
return tokenList;
}
/**
* Creates a list of Tokens from tokenized text, keeping the original tokenization.
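* <p>Example (sketch; a single space is assumed between Tokens unless punctuation rules skip it):</p>
* <pre>{@code
* List<Token> tokens = DocumentFactory.getInstance().createTokensFromTokenizedText("Hello , world !", 0);
* }</pre>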
*/
public List<Token> createTokensFromTokenizedText(String text, int offset) {
List<Token> tokens = new ArrayList<>();
String last = "";
for(String token : WordHelpers.splitSpaces(text)) {
int length = token.length();
Token t = new Token(token, offset, offset + length);
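// shift the Token right by one character for the assumed space before it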
if(!skipSpaceAfter.contains(last) && !skipSpaceBefore.contains(token)) {
t.setBegin(t.getBegin() + 1);
t.setEnd(t.getEnd() + 1);
}
offset = t.getEnd();
tokens.add(t);
last = token;
}
return tokens;
}
/**
* Recreates the document with automatic tokenization. Offsets are kept.
*/
public void retokenize(Document doc) {
doc.setText(doc.getText()); // setText is expected to re-run tokenization over the raw text
}
}