package de.datexis.preprocess;
import com.google.common.base.Optional;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObject;
import com.optimaize.langdetect.text.TextObjectFactory;
import de.datexis.common.Resource;
import de.datexis.common.WordHelpers;
import de.datexis.model.Document;
import de.datexis.model.Sentence;
import de.datexis.model.Token;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.*;
import static de.datexis.common.WordHelpers.skipSpaceAfter;
import static de.datexis.common.WordHelpers.skipSpaceBefore;
/**
* Creates a fully tokenized Document from raw text.
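*
* <p>Minimal usage sketch (sentence boundaries depend on the bundled openNLP models):</p>
* <pre>{@code
* Document doc = DocumentFactory.fromText("First sentence. Second sentence.");
* }</pre>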
* @author sarnold, rschneider, fgrimme
*/
public class DocumentFactory {
protected static final Logger log = LoggerFactory.getLogger(DocumentFactory.class);
protected static DocumentFactory instance = new DocumentFactory();
public static DocumentFactory getInstance() {
return instance;
}
public enum Newlines {
KEEP, // keep all newlines in the text and use them as sentence breaks
//KEEP_DOUBLE, // keep only double newlines in the text, but use all of them as sentence breaks
DISCARD // discard all newlines in the text but still use them as sentence breaks
}
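// Sketch of the two modes: with "line one\nline two" as input, both modes break the
// Sentence at the newline; KEEP additionally emits "\n" as a Token, DISCARD drops it from the text.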
private static final String LANG_EN = "en";
private static final String LANG_DE = "de";
TreeMap<String, SentenceDetectorME> sentenceSplitter;
TreeMap<String, TokenizerME> newlineTokenizer;
TextObjectFactory textObjectFactory;
LanguageDetector languageDetector;
/**
* Create a new DocumentFactory instance. Use this only if you need multiple instances!
* Otherwise, getInstance() will return a singleton object that you can use.
*/
public DocumentFactory() {
sentenceSplitter = new TreeMap<>();
newlineTokenizer = new TreeMap<>();
loadSentenceSplitter(LANG_EN, Resource.fromJAR("openNLP/en-sent.bin"));
loadTokenizer(LANG_EN, Resource.fromJAR("openNLP/en-token.bin"));
loadSentenceSplitter(LANG_DE, Resource.fromJAR("openNLP/de-sent.bin"));
loadTokenizer(LANG_DE, Resource.fromJAR("openNLP/de-token.bin"));
try {
//load all languages:
List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn();
//build language detector:
languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
.withProfiles(languageProfiles)
.build();
//create a text object factory
textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
} catch (IOException ex) {
log.error("Could not load language profiles", ex);
}
}
private void loadSentenceSplitter(String language, Resource modelPath) {
try {
SentenceModel sentenceModel = new SentenceModel(modelPath.getInputStream());
sentenceSplitter.put(language, new SentenceDetectorMENL(sentenceModel));
} catch (IOException ex) {
throw new IllegalStateException("cannot load openNLP model '" + modelPath.toString() + "'", ex);
}
}
private void loadTokenizer(String language, Resource modelPath) {
try {
TokenizerModel tokenModel = new TokenizerModel(modelPath.getInputStream());
newlineTokenizer.put(language, new TokenizerMENL(tokenModel));
} catch (IOException ex) {
throw new IllegalStateException("cannot load openNLP model '" + modelPath.toString() + "'", ex);
}
}
/**
* Creates a Document with Sentences and Tokens from a String.
* Newlines in the text will lead to new sentences, but will not be contained in the document.
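* <p>Example (a sketch; the exact sentence boundaries depend on the openNLP model):</p>
* <pre>{@code
* Document doc = DocumentFactory.fromText("Hello world.\nA newline starts a new sentence.");
* }</pre>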
* @param text The text to process
*/
public static Document fromText(String text) {
return instance.createFromText(text);
}
/**
* Creates a Document with Sentences and Tokens from a String.
* Newlines in the text will always lead to new sentences, and you can choose to keep or discard them.
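* <p>Example (sketch):</p>
* <pre>{@code
* // keep the "\n" Tokens in the Document, e.g. to preserve paragraph breaks
* Document kept = DocumentFactory.fromText("line one\nline two", Newlines.KEEP);
* // drop the "\n" Tokens, but still split a Sentence at the newline
* Document dropped = DocumentFactory.fromText("line one\nline two", Newlines.DISCARD);
* }</pre>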
* @param text The text to process
* @param newlines Keep or discard newlines
*/
public static Document fromText(String text, Newlines newlines) {
return instance.createFromText(text, newlines);
}
/**
* Creates a Document with Sentences and Tokens from a String.
* Newlines in the text will always lead to new sentences, and you can choose to keep or discard them.
* @param text The text to process
* @param newlines Keep or discard newlines
* @param lang use a given language for tokenization and sentence splitting
*/
public static Document fromText(String text, Newlines newlines, WordHelpers.Language lang) {
return instance.createFromText(text, newlines, lang);
}
/**
* Creates a Document with Sentences and Tokens from a whitespace-tokenized String.
* The Tokens in the resulting Document will follow the tokenization from the input, and Sentences are split automatically.
* Whitespace between the tokens will be guessed according to some simple rules.
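* <p>Example (sketch; the input is already tokenized, whitespace offsets are guessed):</p>
* <pre>{@code
* Document doc = DocumentFactory.fromTokenizedText("This is n't plain text .");
* }</pre>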
* @param text The tokenized text to process
*/
public static Document fromTokenizedText(String text) {
final List<Token> tokens = instance.createTokensFromTokenizedText(text, 0);
return instance.createFromTokens(tokens);
}
/**
* Creates a Document from existing Tokens, processing Span positions and Sentence splitting.
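* <p>Example (sketch; begin/end offsets must match the underlying text):</p>
* <pre>{@code
* List<Token> tokens = Arrays.asList(
*     new Token("Hello", 0, 5),
*     new Token("world", 6, 11),
*     new Token(".", 11, 12));
* Document doc = DocumentFactory.fromTokens(tokens);
* }</pre>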
* @param tokens The list of Tokens to process in their natural order. WARNING: Token offsets must be set correctly!
*/
public static Document fromTokens(List<Token> tokens) {
return instance.createFromTokens(tokens);
}
/**
* Create Tokens from raw text, without sentence splitting.
* If you don't need exact tokenization of punctuation and Token offsets, consider using WordHelpers.splitSpaces() instead.
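* <p>Example (sketch; the exact splits depend on the openNLP token model):</p>
* <pre>{@code
* List<Token> tokens = DocumentFactory.createTokensFromText("I didn't know.");
* // e.g. "I", "did", "n't", "know", "." with character offsets attached
* }</pre>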
* @param text The text to process
*/
public static List<Token> createTokensFromText(String text) {
return instance.createTokensFromText(text, 0);
}
/**
* Create Tokens from tokenized text, without sentence splitting.
* Whitespace between the tokens will be guessed according to some simple rules.
* @param text The tokenized text to process
*/
public static List<Token> createTokensFromTokenizedText(String text) {
return instance.createTokensFromTokenizedText(text, 0);
}
/**
* Create a single Sentence from given Tokens, omitting Sentence splitting.
*/
public static Sentence createSentenceFromTokens(List<Token> tokens) {
Sentence s = new Sentence();
tokens.stream()
.filter(t -> !t.isEmpty())
.forEach(t -> s.addToken(t));
return s;
}
public static Sentence createSentenceFromString(String text, String language) {
return createSentenceFromTokens(instance.createTokensFromText(text, 0, language));
}
public static Sentence createSentenceFromTokenizedString(String text) {
return createSentenceFromTokens(instance.createTokensFromTokenizedText(text, 0));
}
/**
* Detects the language of a text
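* <p>Example (sketch; detection reliability grows with text length):</p>
* <pre>{@code
* DocumentFactory.getLanguage("The quick brown fox jumps over the lazy dog."); // "en"
* }</pre>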
* @return language code, e.g. "en" or "de"
*/
public static String getLanguage(String text) {
return instance.detectLanguage(text);
}
public Document createFromText(String text) {
Document doc = new Document();
addToDocumentFromText(text, doc, Newlines.DISCARD);
return doc;
}
public Document createFromText(String text, Newlines newlines) {
Document doc = new Document();
addToDocumentFromText(text, doc, newlines);
return doc;
}
public Document createFromText(String text, Newlines newlines, WordHelpers.Language lang) {
Document doc = new Document();
addToDocumentFromText(text, doc, newlines, lang.toString().toLowerCase());
return doc;
}
public void addToDocumentFromText(String text, Document doc, Newlines newlines) {
String lang = doc.getLanguage();
if(lang == null) {
lang = detectLanguage(text);
if(!lang.isEmpty()) doc.setLanguage(lang);
}
addToDocumentFromText(text, doc, newlines, lang);
}
public void addToDocumentFromText(String text, Document doc, Newlines newlines, String language) {
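// append new sentences after the existing text, assuming one separator character in between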
int docOffset = doc.getEnd();
if(docOffset > 0) docOffset++;
// find best Tokenizer and Splitter for text
TokenizerME tokenizer = newlineTokenizer.getOrDefault(language, newlineTokenizer.get(LANG_EN));
SentenceDetectorME ssplit = sentenceSplitter.getOrDefault(language, sentenceSplitter.get(LANG_EN));
opennlp.tools.util.Span[] sentences = ssplit.sentPosDetect(text);
// Tokenize sentences
int countNewlines = 0;
int nlOffset = 0; // number of skipped newlines
for(opennlp.tools.util.Span sentence : sentences) {
if(sentence == null) continue;
String sentenceText = text.substring(sentence.getStart(), sentence.getEnd());
opennlp.tools.util.Span[] tokens = tokenizer.tokenizePos(sentenceText);
List<Token> tokenList = new LinkedList<>();
for(opennlp.tools.util.Span token : tokens) {
String tokenText = sentenceText.substring(token.getStart(), token.getEnd());
if(tokenText.equals("\n")) { // newline
countNewlines++;
if(newlines == Newlines.KEEP) { // newline is a paragraph
tokenList.add(new Token(tokenText, docOffset - nlOffset + sentence.getStart() + token.getStart(), docOffset - nlOffset + sentence.getStart() + token.getEnd()));
//} else if(newlines == Newlines.KEEP_DOUBLE && countNewlines == 2) { // two newlines are a new paragraph, skip next though
// tokenList.add(new Token(tokenText, docOffset - nlOffset + sentence.getStart() + token.getStart(), docOffset - nlOffset + sentence.getStart() + token.getEnd()));
} else if(newlines == Newlines.DISCARD) { // skip newlines, but keep one whitespace
if(countNewlines > 1) nlOffset++;
} else {
nlOffset++;
}
} else {
tokenList.add(new Token(tokenText, docOffset - nlOffset + sentence.getStart() + token.getStart(), docOffset - nlOffset + sentence.getStart() + token.getEnd()));
countNewlines = 0;
}
}
doc.addSentence(new Sentence(tokenList), false);
}
}
public synchronized String detectLanguage(String text) {
try {
TextObject textObject = textObjectFactory.forText(text);
Optional<LdLocale> locale = languageDetector.detect(textObject);
if(locale.isPresent()) return locale.get().getLanguage();
} catch(Exception e) {
// detection failed, fall through and return an empty language code
}
return "";
}
public Document createFromTokens(List<Token> tokens) {
String text = WordHelpers.tokensToText(tokens, 0);
String lang = detectLanguage(text);
Document doc = new Document();
doc.setLanguage(lang);
createSentencesFromTokens(tokens, lang).forEach(sentence -> {
doc.addSentence(sentence, false);
});
return doc;
}
public List<Sentence> createSentencesFromTokens(List<Token> tokens) {
String text = WordHelpers.tokensToText(tokens, 0);
String lang = detectLanguage(text);
return createSentencesFromTokens(tokens, lang);
}
public List<Sentence> createSentencesFromTokens(List<Token> tokens, String language) {
List<Sentence> result = new ArrayList<>();
String text = WordHelpers.tokensToText(tokens, 0);
// find best Tokenizer and Splitter for text
SentenceDetectorME ssplit = sentenceSplitter.getOrDefault(language, sentenceSplitter.get(LANG_EN));
opennlp.tools.util.Span[] sentences = ssplit.sentPosDetect(text);
// Tokenize sentences
Iterator<Token> tokenIt = tokens.iterator();
if(!tokenIt.hasNext()) return result;
Token currentToken = tokenIt.next();
for(opennlp.tools.util.Span sentence : sentences) {
if(sentence == null) continue;
List<Token> tokenList = new ArrayList<>();
while(currentToken.getBegin() < sentence.getEnd()) {
if(!currentToken.getText().equals("\n")) {
tokenList.add(currentToken);
}
if(!tokenIt.hasNext()) break;
currentToken = tokenIt.next();
}
result.add(new Sentence(tokenList));
}
return result;
}
/**
* Creates a list of Tokens from raw text (ignores sentences)
*/
public List<Token> createTokensFromText(String text, int offset) {
String language = detectLanguage(text);
return createTokensFromText(text, offset, language);
}
/**
* Creates a list of Tokens from raw text (ignores sentences)
*/
public List<Token> createTokensFromText(String text, int offset, String language) {
TokenizerME tokenizer = newlineTokenizer.getOrDefault(language, newlineTokenizer.get(LANG_EN));
opennlp.tools.util.Span[] tokens = tokenizer.tokenizePos(text);
List<Token> tokenList = new LinkedList<>();
for(opennlp.tools.util.Span token : tokens) {
String tokenText = text.substring(token.getStart(), token.getEnd());
Token t = new Token(tokenText, offset + token.getStart(), offset + token.getEnd());
tokenList.add(t);
}
return tokenList;
}
/**
* Creates a list of Tokens from tokenized text, keeping the original tokenization.
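* <p>Example (sketch; a single space is assumed between Tokens unless punctuation rules skip it):</p>
* <pre>{@code
* List<Token> tokens = DocumentFactory.getInstance().createTokensFromTokenizedText("Hello , world !", 0);
* }</pre>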
*/
public List<Token> createTokensFromTokenizedText(String text, int offset) {
List<Token> tokens = new ArrayList<>();
String last = "";
for(String token : WordHelpers.splitSpaces(text)) {
int length = token.length();
Token t = new Token(token, offset, offset + length);
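// shift the Token right by one character for the assumed space before it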
if(!skipSpaceAfter.contains(last) && !skipSpaceBefore.contains(token)) {
t.setBegin(t.getBegin() + 1);
t.setEnd(t.getEnd() + 1);
}
offset = t.getEnd();
tokens.add(t);
last = token;
}
return tokens;
}
/**
* Recreates the document with automatic tokenization. Offsets are kept.
*/
public void retokenize(Document doc) {
doc.setText(doc.getText()); // setText is expected to re-run tokenization over the raw text
}
}