// annis.CommonHelper (Maven / Gradle / Ivy)
/*
* Copyright 2011 SFB 632.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package annis;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.UUID;
import com.google.common.base.Joiner;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang3.StringUtils;
import org.corpus_tools.salt.SALT_TYPE;
import org.corpus_tools.salt.SaltFactory;
import org.corpus_tools.salt.common.SCorpus;
import org.corpus_tools.salt.common.SCorpusGraph;
import org.corpus_tools.salt.common.SDocument;
import org.corpus_tools.salt.common.SDocumentGraph;
import org.corpus_tools.salt.common.SOrderRelation;
import org.corpus_tools.salt.common.STextualDS;
import org.corpus_tools.salt.common.STextualRelation;
import org.corpus_tools.salt.common.SToken;
import org.corpus_tools.salt.common.SaltProject;
import org.corpus_tools.salt.core.GraphTraverseHandler;
import org.corpus_tools.salt.core.SAnnotation;
import org.corpus_tools.salt.core.SFeature;
import org.corpus_tools.salt.core.SGraph;
import org.corpus_tools.salt.core.SGraph.GRAPH_TRAVERSE_TYPE;
import org.corpus_tools.salt.core.SLayer;
import org.corpus_tools.salt.core.SNode;
import org.corpus_tools.salt.core.SRelation;
import org.corpus_tools.salt.graph.Label;
import org.corpus_tools.salt.util.DataSourceSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import annis.model.AnnisConstants;
import annis.service.objects.Match;
/**
* Utility class for non-GUI operations.
*
* @author Thomas Krause
* @author Benjamin Weißenfels
*/
public class CommonHelper {
// Logger obtained for CommonHelper; modifiers in the canonical "static final" order.
private static final Logger log = LoggerFactory.getLogger(CommonHelper.class);
/**
 * Detects right-to-left (Hebrew or Arabic) characters in a string.
 *
 * <p>
 * Each UTF-16 code unit of the string is tested against three inclusive
 * ranges:
 * </p>
 * <ul>
 * <li>[1425, 1785] (U+0591 to U+06F9): Hebrew and Arabic, basic and extended</li>
 * <li>[64286, 65019] (U+FB1E to U+FDFB): alphabetic presentation forms (Hebrew)
 * up to Arabic presentation forms A</li>
 * <li>[65136, 65276] (U+FE70 to U+FEFC): Arabic presentation forms B</li>
 * </ul>
 *
 * @param str
 *            The string to be checked, may be {@code null}.
 * @return {@code true} if at least one character lies in one of the ranges
 *         above; {@code false} otherwise, including for {@code null} or empty
 *         input.
 */
public static boolean containsRTLText(String str) {
    if (str == null) {
        return false;
    }
    for (char candidate : str.toCharArray()) {
        // Hebrew extended and basic, Arabic basic and extended
        boolean inHebrewOrArabic = candidate >= 1425 && candidate <= 1785;
        // alphabetic presentation forms (Hebrew) up to Arabic presentation forms A
        boolean inPresentationFormsA = candidate >= 64286 && candidate <= 65019;
        // Arabic presentation forms B
        boolean inPresentationFormsB = candidate >= 65136 && candidate <= 65276;
        if (inHebrewOrArabic || inPresentationFormsA || inPresentationFormsB) {
            return true;
        }
    }
    return false;
}
/**
* Calculates a {@link SOrderRelation} node chain of a {@link SDocumentGraph}.
*
*
* If no segmentation name is set, a list of sorted {@link SToken} will be
* returned.
*
*
* @param segName
* The segmentation name, for which the chain is computed.
* @param graph
* The salt document graph, which is traversed for the
* segmentation.
*
* @return Returns a List of {@link SNode}, which is sorted by the
* {@link SOrderRelation}.
*/
public static List getSortedSegmentationNodes(String segName, SDocumentGraph graph) {
List token = new ArrayList();
if (segName == null) {
// if no segmentation is given just return the sorted token list
List unsortedToken = graph.getSortedTokenByText();
if (unsortedToken != null) {
token.addAll(unsortedToken);
}
} else {
// get the very first node of the order relation chain
Set startNodes = new LinkedHashSet();
if (graph != null) {
List orderRoots = graph.getRootsByRelation(SALT_TYPE.SORDER_RELATION);
if (orderRoots != null) {
// collect the start nodes of a segmentation chain of length 1
for (SNode n : orderRoots) {
for (SRelation, ?> rel : n.getOutRelations()) {
if (rel instanceof SOrderRelation) {
// the type is the name of the relation
if (segName.equals(rel.getType())) {
startNodes.add(n);
break;
}
}
}
}
}
}
Set alreadyAdded = new HashSet();
// add all nodes on the order relation chain beginning from the start node
for (SNode s : startNodes) {
SNode current = s;
while (current != null) {
token.add(current);
List> out = graph.getOutRelations(current.getId());
current = null;
if (out != null) {
for (SRelation extends SNode, ? extends SNode> e : out) {
if (e instanceof SOrderRelation) {
current = ((SOrderRelation) e).getTarget();
if (alreadyAdded.contains(current.getId())) {
// abort if cycle detected
current = null;
} else {
alreadyAdded.add(current.getId());
}
break;
}
}
}
}
}
}
return token;
}
/**
 * Collects the qualified names of all annotations that are attached to the
 * tokens of a document graph.
 *
 * @param graph
 *            The document graph whose tokens are inspected, may be
 *            {@code null}.
 * @return A set of qualified annotation names in natural (sorted) order;
 *         empty if {@code graph} is {@code null}.
 */
public static Set<String> getTokenAnnotationLevelSet(SDocumentGraph graph) {
    // TreeSet keeps the names sorted and free of duplicates
    Set<String> result = new TreeSet<String>();
    if (graph == null) {
        return result;
    }
    for (SToken tok : graph.getTokens()) {
        for (SAnnotation anno : tok.getAnnotations()) {
            result.add(anno.getQName());
        }
    }
    return result;
}
/**
 * Collects the qualified names of all token annotations found in any
 * document of any corpus graph of a salt project.
 *
 * @param p
 *            The salt project to inspect, may be {@code null}.
 * @return A set of qualified annotation names in natural (sorted) order;
 *         empty if {@code p} is {@code null}.
 */
public static Set<String> getTokenAnnotationLevelSet(SaltProject p) {
    Set<String> result = new TreeSet<String>();
    if (p == null) {
        // mirror the null tolerance of the per-document-graph overload
        return result;
    }
    for (SCorpusGraph corpusGraph : p.getCorpusGraphs()) {
        for (SDocument doc : corpusGraph.getDocuments()) {
            // the per-graph overload returns an empty set for a null graph,
            // so no extra check is needed here
            result.addAll(getTokenAnnotationLevelSet(doc.getDocumentGraph()));
        }
    }
    return result;
}
/**
* Gets the spannend/covered text for a token. This will get all
* {@link STextualRelation} edges for a {@link SToken} from the
* {@link SDocumentGraph} and calculates the appropiate substring from the
* {@link STextualDS}.
*
* @param tok
* The {@link SToken} which is overlapping the text sequence.
* @return An empty {@link String} object, if there is no
* {@link STextualRelation}
*/
public static String getSpannedText(SToken tok) {
SGraph graph = tok.getGraph();
List> edges = graph.getOutRelations(tok.getId());
for (SRelation extends SNode, ? extends SNode> e : edges) {
if (e instanceof STextualRelation) {
STextualRelation textRel = (STextualRelation) e;
return textRel.getTarget().getText().substring(textRel.getStart(), textRel.getEnd());
}
}
return "";
}
/**
* Checks a {@link SNode} if it is member of a specific {@link SLayer}.
*
* @param layerName
* Specifies the layername to check.
* @param node
* Specifies the node to check.
* @return true - it is true when the name of layername corresponds to the name
* of any label of the SNode.
*/
public static boolean checkSLayer(String layerName, SNode node) {
// robustness
if (layerName == null || node == null) {
return false;
}
Set sLayers = node.getLayers();
if (sLayers != null) {
for (SLayer l : sLayers) {
Collection