/*
 * Please wait. This can take a few minutes ...
 * Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance.
 * Project price: only $1
 * You can buy this project and download/modify it as often as you want.
 * de.datexis.common.WordHelpers Maven / Gradle / Ivy
 */
package de.datexis.common;
import com.google.common.collect.Lists;
import de.datexis.model.Span;
import de.datexis.model.Token;
import org.apache.commons.io.IOUtils;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.ops.transforms.Transforms;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.text.Normalizer;
import java.util.*;
import java.util.regex.Pattern;
/**
 * Utility class with static helpers for Strings, Token sequences and vectors.
 * <p>
 * Almost everything here is static. An instance is only required for the
 * language-specific stop word lookup ({@link #isStopWord(String)}), because the
 * stop word list is loaded once per instance in the constructor.
 *
 * @author Sebastian Arnold
 */
public class WordHelpers {

  protected final static Logger log = LoggerFactory.getLogger(WordHelpers.class);

  /** Token texts that are never preceded by a space in {@link #wordsToText(Iterable)}. */
  public static HashSet<String> skipSpaceBefore = new HashSet<>(Arrays.asList(",", ".", ":", ";", "?", "!", ")", "]", "'m", "'s", "'re", "'ve", "'d", "'ll", "n't"));

  /**
   * Token texts that are never followed by a space in {@link #wordsToText(Iterable)}.
   * The empty String is the initial "previous token", so no space is put before the first token.
   */
  public static HashSet<String> skipSpaceAfter = new HashSet<>(Arrays.asList("(", "[", "", "\n"));

  /** Pairs of {literal, replacement} applied in order by {@link #replaceUmlauts(String)}. */
  private static final String[][] umlautReplacements = { {"Ä","Ae"}, {"Ü","Ue"}, {"Ö","Oe"}, {"ä","ae"}, {"ü","ue"}, {"ö","oe"}, {"ß","ss"}, {"–","-"} };

  /** Pairs of {literal, replacement} for tokenizer-style quotes; not referenced inside this class. */
  private static final String[][] tokenizationReplacements = { {"``","\""}, {"''","\""} };

  /** One or more dash-like characters: hyphen, underscore or slash. */
  public static final Pattern dashPattern = Pattern.compile("[\\-_\\/]+");

  /** One or more characters that are neither word characters, whitespace, hyphen nor underscore. */
  public static final Pattern punctPattern = Pattern.compile("[^\\w\\s\\-_]+");

  /** One or more whitespace characters. */
  public static final Pattern spacePattern = Pattern.compile("[\\s]+");

  /** One or more digits. */
  public static final Pattern numericPattern = Pattern.compile("[\\d]+");

  /** A single round bracket, square bracket or double quote. */
  public static final Pattern bracketsPattern = Pattern.compile("[\\(\\)\\[\\]\"]");

  // TODO: umlauts are missing here!

  /**
   * English abbreviations that end with a period.
   * Lists taken from: http://www.statmt.org/europarl/ tools
   * <p>
   * The entry "Surg" came without its trailing period; it is kept for backward
   * compatibility and the dotted form "Surg." (as used in {@link #abbreviationsDE}) is added.
   */
  public static HashSet<String> abbreviationsEN = new HashSet<>(Arrays.asList("Adj.", "Adm.", "Adv.", "Asst.", "Bart.", "Bldg.", "Brig.", "Bros.", "Capt.", "Cmdr.", "Col.",
      "Comdr.", "Con.", "Corp.", "Cpl.", "DR.", "Dr.", "Drs.", "Ens.", "Gen.", "Gov.", "Hon.", "Hr.", "Hosp.", "Insp.", "Lt.", "MM.", "MR.", "MRS.", "MS.", "Maj.", "Messrs.",
      "Mlle.", "Mme.", "Mr.", "Mrs.", "Ms.", "Msgr.", "Op.", "Ord.", "Pfc.", "Ph.", "Prof.", "Pvt.", "Rep.", "Reps.", "Res.", "Rev.", "Rt.", "Sen.", "Sens.", "Sfc.", "Sgt.",
      "Sr.", "St.", "Supt.", "Surg", "Surg.", "v.", "vs.", "i.e.", "rev.", "e.g.", "No.", "Nr.", "pp."));

  /**
   * German abbreviations (including roman numerals) that end with a period.
   * Lists taken from: http://www.statmt.org/europarl/ tools
   * <p>
   * The entry "Ca" came without its trailing period; it is kept for backward
   * compatibility and the dotted form "Ca." is added.
   */
  public static HashSet<String> abbreviationsDE = new HashSet<>(Arrays.asList("I.", "II.", "III.", "IV.", "V.", "VI.", "VII.", "VIII.", "IX.", "X.", "XI.", "XII.", "XIII.",
      "XIV.", "XV.", "XVI.", "XVII.", "XVIII.", "XIX.", "XX.", "i.", "ii.", "iii.", "iv.", "v.", "vi.", "vii.", "viii.", "ix.", "x.", "xi.", "xii.", "xiii.", "xiv.", "xv.",
      "xvi.", "xvii.", "xviii.", "xix.", "xx.", "Adj.", "Adm.", "Adv.", "Asst.", "Bart.", "Bldg.", "Brig.", "Bros.", "Capt.", "Cmdr.", "Col.", "Comdr.", "Con.", "Corp.",
      "Cpl.", "DR.", "Dr.", "Ens.", "Gen.", "Gov.", "Hon.", "Hosp.", "Insp.", "Lt.", "MM.", "MR.", "MRS.", "MS.", "Maj.", "Messrs.", "Mlle.", "Mme.", "Mr.", "Mrs.", "Ms.",
      "Msgr.", "Op.", "Ord.", "Pfc.", "Ph.", "Prof.", "Pvt.", "Rep.", "Reps.", "Res.", "Rev.", "Rt.", "Sen.", "Sens.", "Sfc.", "Sgt.", "Sr.", "St.", "Supt.", "Surg.",
      "Mio.", "Mrd.", "bzw.", "v.", "vs.", "usw.", "d.h.", "z.B.", "u.a.", "etc.", "Mrd.", "MwSt.", "ggf.", "d.J.", "D.h.", "m.E.", "vgl.", "I.F.", "z.T.", "sogen.", "ff.",
      "u.E.", "g.U.", "g.g.A.", "c.-à-d.", "Buchst.", "u.s.w.", "sog.", "u.ä.", "Std.", "evtl.", "Zt.", "Chr.", "u.U.", "o.ä.", "Ltd.", "b.A.", "z.Zt.", "spp.", "sen.",
      "SA.", "k.o.", "jun.", "i.H.v.", "dgl.", "dergl.", "Co.", "zzt.", "usf.", "s.p.a.", "Dkr.", "Corp.", "bzgl.", "BSE.", "No.", "Nos.", "Art.", "Nr.", "pp.", "ca.", "Ca", "Ca."));

  /** Languages for which a stop word list can be requested. */
  public enum Language { EN, DE }

  /** Stop words of the language this instance was created for, as read from the resource file. */
  private final Set<String> stopWords;

  /**
   * Creates a helper that knows the stop words of the given language.
   * @param lang language whose stop word list is loaded
   */
  public WordHelpers(Language lang) {
    stopWords = new TreeSet<>(readStopWords(lang));
  }

  /**
   * Parses a language name, ignoring case and surrounding whitespace.
   * @param language name of the language, e.g. "en" or "DE"; may be null
   * @return the matching {@link Language}, or {@link Language#EN} if the name is null or unknown
   */
  public static Language getLanguage(String language) {
    if(language == null) {
      return Language.EN;
    }
    try {
      // Locale.ROOT keeps the upper-casing independent of the JVM's default locale
      return Language.valueOf(language.trim().toUpperCase(Locale.ROOT));
    } catch(IllegalArgumentException e) {
      return Language.EN;
    }
  }

  /**
   * Reads the stop word file "stopwords/stopwords_&lt;lang&gt;.csv" from the JAR, one entry per line.
   * @param lang language whose list is read
   * @return the lines of the file, or an empty list if reading failed (the error is logged)
   */
  private static List<String> readStopWords(Language lang) {
    final Resource stop = Resource.fromJAR("stopwords/stopwords_" + lang.toString().toLowerCase(Locale.ROOT) + ".csv");
    // IOUtils.readLines does not close the stream, so it is closed here
    try(InputStream in = stop.getInputStream()) {
      return IOUtils.readLines(in, "UTF-8");
    } catch(IOException ex) {
      log.error("Could not read stop words for language {}", lang, ex);
      return new ArrayList<>();
    }
  }

  /**
   * @return a new, modifiable list containing all stop words of this instance
   */
  public List<String> getStopWords() {
    return new ArrayList<>(stopWords);
  }

  /**
   * Checks a word against the stop word list after applying a preprocessor.
   * @param word the raw word
   * @param pre preprocessor applied to the word before the lookup
   * @return true if the preprocessed word is a stop word
   */
  public boolean isStopWord(String word, TokenPreProcess pre) {
    return isStopWord(pre.preProcess(word));
  }

  /**
   * Checks a word against the stop word list. The word is lower-cased before the lookup.
   * @param word the word to check
   * @return true if the lower-cased word is contained in the stop word list
   */
  public boolean isStopWord(String word) {
    // Locale.ROOT keeps the lower-casing independent of the JVM's default locale
    return stopWords.contains(word.toLowerCase(Locale.ROOT));
  }

  /**
   * Builds a String from given words, with rule-based spacing according to characters.
   * A space is inserted between two tokens unless the previous text is in
   * {@link #skipSpaceAfter} or the current text is in {@link #skipSpaceBefore}.
   * @param tokens the tokens to join, in order
   * @return the joined text, trimmed
   */
  public static String wordsToText(Iterable<Token> tokens) {
    StringBuilder res = new StringBuilder();
    String last = "";
    for(Token t : tokens) {
      final String text = t.getText();
      if(!skipSpaceAfter.contains(last) && !skipSpaceBefore.contains(text)) {
        res.append(" ");
      }
      res.append(text);
      last = text;
    }
    return res.toString().trim();
  }

  /**
   * Builds a String from given Tokens with their original spacing. If offsets are not assigned correctly, will return a space-separated String.
   * @param tokens list of Tokens
   * @param beginOffset the start offset for this String. If 0, words will be padded until their original position.
   * @return the original Text, if possible
   */
  public static String tokensToText(Iterable<Token> tokens, int beginOffset) {
    StringBuilder res = new StringBuilder();
    int cursor = beginOffset;
    for(Token t : tokens) {
      if(t.isEmpty()) {
        continue;
      }
      if(cursor > t.getBegin()) {
        // token starts before the cursor: offsets are wrong, so separate by one space and resynchronize
        res.append(" ");
        cursor = t.getBegin();
      }
      // append whitespace until begin is reached
      for(; cursor < t.getBegin(); cursor++) {
        res.append(" ");
      }
      // Append exactly getLength() characters for this token.
      // This is important, because during tokenization "etc." could be converted into [etc.] [.]
      final String word = t.getText();
      final int length = t.getLength();
      if(length < word.length()) {
        res.append(word, 0, length); // truncate word
      } else {
        res.append(word);
        for(int pad = word.length(); pad < length; pad++) {
          res.append(' '); // add spaces
        }
      }
      cursor = t.getEnd();
    }
    return res.toString();
  }

  /**
   * Cosine similarity between two vectors.
   * @param arr1 first vector, may be null
   * @param arr2 second vector, may be null
   * @return 1 for high similarity, 0 for orthogonal vectors, -1 for vectors pointing in the opposite direction.
   * If one of both vectors is a Null-Vector or null, 0 is returned
   */
  public static double cosineSim(INDArray arr1, INDArray arr2) {
    if(arr1 == null || arr2 == null) {
      return 0;
    }
    // A vector is a Null-Vector iff its largest ABSOLUTE component is 0.
    // (The plain maximum is also 0 for non-zero vectors such as [-1, 0].)
    if(arr1.amaxNumber().doubleValue() == 0 || arr2.amaxNumber().doubleValue() == 0) {
      return 0;
    }
    return Transforms.cosineSim(arr1, arr2);
  }

  /**
   * Serializes a vector into its elements, separated by single spaces.
   * @param vec the vector to serialize
   * @return the elements (as doubles) joined by " ", without a trailing space
   */
  public static String vecToString(INDArray vec) {
    StringBuilder sb = new StringBuilder();
    for(int j = 0; j < vec.length(); j++) {
      if(j > 0) {
        sb.append(" ");
      }
      sb.append(vec.getDouble(j));
    }
    return sb.toString();
  }

  /**
   * Parses a vector from float values separated by single spaces.
   * @param str the serialized vector
   * @return a new array holding the parsed values
   * @throws NumberFormatException if one of the parts is not a valid float
   */
  // TODO: this takes a lot of time?
  public static INDArray stringToVec(String str) {
    final String[] parts = str.split(" ");
    final float[] vector = new float[parts.length];
    for(int i = 0; i < parts.length; i++) {
      vector[i] = Float.parseFloat(parts[i]);
    }
    return Nd4j.create(vector);
  }

  /**
   * Normalizes the String to Unicode NFD form.
   * <p>
   * NOTE(review): despite its name, this method does not remove anything from the
   * String; it only applies the normalization. Removing the combining marks afterwards was
   * disabled in the original code and is left disabled here so callers see unchanged results.
   * @param str the text to normalize
   * @return the NFD-normalized text
   */
  public static String replaceAccents(String str) {
    return Normalizer.normalize(str, Normalizer.Form.NFD);
  }

  /**
   * Replaces German umlauts, sharp s and the en dash by their ASCII transcriptions
   * (e.g. "ä" becomes "ae", "ß" becomes "ss", "–" becomes "-").
   * @param str the text to transcribe
   * @return the text with all listed characters replaced
   */
  public static String replaceUmlauts(String str) {
    for(String[] rep : umlautReplacements) {
      // literal replacement; no regular expression is needed for these fixed Strings
      str = str.replace(rep[0], rep[1]);
    }
    return str;
  }

  /**
   * Replaces every run matched by {@link #dashPattern} with the given replacement.
   * @param str the text to process
   * @param rep the replacement, interpreted as a regex replacement String
   * @return the processed text
   */
  public static String replaceDashes(String str, String rep) {
    return dashPattern.matcher(str).replaceAll(rep);
  }

  /**
   * Replaces every run matched by {@link #punctPattern} with the given replacement.
   * @param str the text to process
   * @param rep the replacement, interpreted as a regex replacement String
   * @return the processed text
   */
  public static String replacePunctuation(String str, String rep) {
    return punctPattern.matcher(str).replaceAll(rep);
  }

  /**
   * Replaces every run matched by {@link #numericPattern} with the given replacement.
   * @param str the text to process
   * @param rep the replacement, interpreted as a regex replacement String
   * @return the processed text
   */
  public static String replaceNumbers(String str, String rep) {
    return numericPattern.matcher(str).replaceAll(rep);
  }

  /**
   * Replaces every run matched by {@link #spacePattern} with the given replacement.
   * @param str the text to process
   * @param rep the replacement, interpreted as a regex replacement String
   * @return the processed text
   */
  public static String replaceSpaces(String str, String rep) {
    return spacePattern.matcher(str).replaceAll(rep);
  }

  /**
   * Splits the text at every run of whitespace.
   * @param str the text to split
   * @return the parts between whitespace runs; trailing empty Strings are dropped
   */
  public static String[] splitSpaces(String str) {
    // use the precompiled pattern instead of recompiling it on every call
    return spacePattern.split(str);
  }

  /**
   * Calculates how far two Spans overlap.
   * @param a first Span
   * @param b second Span
   * @return the distance between the larger begin and the smaller end, or 0 if the Spans do not overlap
   */
  public static int getSpanOverlapLength(Span a, Span b) {
    final int begin = Math.max(a.getBegin(), b.getBegin());
    final int end = Math.min(a.getEnd(), b.getEnd());
    return begin < end ? end - begin : 0;
  }

}