All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.datexis.preprocess.MinimalLowercaseNewlinePreprocessor Maven / Gradle / Ivy

package de.datexis.preprocess;

import de.datexis.common.WordHelpers;
import java.util.Locale;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;

/**
 * Preprocessor used for single tokens without positional information.
 * <p>
 * A token consisting of exactly one newline character is mapped to the
 * marker {@code *NL*}. Every other token is passed through the
 * {@link WordHelpers} normalization chain (umlauts, accents, punctuation
 * replaced by the empty string, numbers replaced using {@code "#"}, spaces
 * replaced using {@code "_"}) and is finally converted to lowercase.
 * The exact replacement rules are defined by {@link WordHelpers}.
 * @author Sebastian Arnold 
 */
public class MinimalLowercaseNewlinePreprocessor implements TokenPreProcess {

  /** Marker emitted in place of a bare newline token. */
  private static final String NEWLINE_MARKER = "*NL*";

  /**
   * Normalizes a single token.
   *
   * @param token the raw token, may be {@code null}
   * @return {@code null} if {@code token} is {@code null}; {@code "*NL*"} if
   *         the token is exactly {@code "\n"}; otherwise the normalized,
   *         lowercased token
   */
  @Override
  public String preProcess(String token) {
    if(token == null) return null;
    // The newline check must come first: the marker contains punctuation
    // and uppercase letters that the steps below would otherwise alter.
    if(token.equals("\n")) return NEWLINE_MARKER;
    token = WordHelpers.replaceUmlauts(token);
    token = WordHelpers.replaceAccents(token);
    token = WordHelpers.replacePunctuation(token, "");
    token = WordHelpers.replaceNumbers(token, "#");
    token = WordHelpers.replaceSpaces(token, "_");
    // Locale.ROOT keeps the result independent of the JVM default locale
    // (e.g. under a Turkish locale "I" would otherwise become dotless "ı").
    return token.toLowerCase(Locale.ROOT);
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy