// semRewrite.substitutor.CoreLabelSequence Maven / Gradle / Ivy
// Go to download
// Show more of this group. Show more artifacts with this name.
// Show all versions of sigma-nlp. Show documentation.
// Natural language processing toolbox using the Sigma knowledge engineering system.
/*
* Copyright 2014-2015 IPsoft
*
* Author: Andrei Holub [email protected]
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program ; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package semRewrite.substitutor;
import com.articulate.sigma.StringUtil;
import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableList;
import edu.stanford.nlp.ling.CoreLabel;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.regex.Pattern;
/** ***************************************************************
 * An ordered sequence of CoreLabel tokens, held in an immutable
 * list. The list itself cannot be changed after construction, but
 * the CoreLabel elements are shared with the caller and are not
 * copied on construction.
 */
public class CoreLabelSequence {

    /** The shared empty sequence. It is constructed directly rather
     * than through from(): from() returns this very field for an
     * empty argument list, so routing the initializer through it
     * would read the field before it is assigned and store null. */
    public static final CoreLabelSequence EMPTY_SEQUENCE =
            new CoreLabelSequence(ImmutableList.<CoreLabel>of());

    /** Sentinel sentence index that makes
     * containsLabel(int, String, int) skip the sentence comparison. */
    public static final int IGNORE_SENTENCE = Integer.MIN_VALUE;

    /** Character class of the punctuation marks handled by
     * removePunctuation(); compiled once instead of on every call. */
    private static final Pattern PUNCTUATION = Pattern.compile(
            "[\\.\\,\\;\\:\\[\\]\\{\\}\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\-\\=\\_\\+\\`\\~\\<\\>\\/\\?]");

    final List<CoreLabel> labels;

    /** ***************************************************************
     * Creates a sequence holding an immutable copy of the given list.
     * @param labels the tokens of the sequence, in order
     */
    public CoreLabelSequence(List<CoreLabel> labels) {

        this.labels = ImmutableList.copyOf(labels);
    }

    /** ***************************************************************
     * Creates a sequence holding an immutable copy of the given tokens.
     * @param labels the tokens of the sequence, in order
     */
    public CoreLabelSequence(CoreLabel... labels) {

        this.labels = ImmutableList.copyOf(labels);
    }

    /** ***************************************************************
     * Factory method.
     * @param labels the tokens of the sequence, in order
     * @return EMPTY_SEQUENCE when no labels are given, otherwise a
     * new sequence over the given labels
     */
    public static CoreLabelSequence from(CoreLabel... labels) {

        return labels.length > 0
                ? new CoreLabelSequence(labels)
                : EMPTY_SEQUENCE;
    }

    /** ***************************************************************
     * @return the toString() of the underlying list of labels
     */
    public String toListString() {

        return labels.toString();
    }

    /** ***************************************************************
     * Generate a String where the CoreLabel values are separated by
     * spaces and do not have a token number suffix
     */
    @Override
    public String toString() {

        StringBuilder sb = new StringBuilder();
        for (CoreLabel cl : labels) {
            // A separator is only added once something has been written,
            // so leading empty values do not produce leading spaces.
            if (sb.length() > 0)
                sb.append(" ");
            sb.append(cl.value());
        }
        return sb.toString();
    }

    /** ***************************************************************
     * @return the immutable list of labels backing this sequence
     */
    public List<CoreLabel> getLabels() {

        return labels;
    }

    /** ***************************************************************
     * @param label the label to look for
     * @return true if the list of labels contains the given label
     */
    public boolean containsLabel(CoreLabel label) {

        return labels.contains(label);
    }

    /** ***************************************************************
     * @return true if the sequence has no labels
     */
    public boolean isEmpty() {

        return labels.isEmpty();
    }

    /** *************************************************************
     * Checks if label is part of current sequence
     * @param sentIndex the sentence index the label must have; can be
     *                  CoreLabelSequence.IGNORE_SENTENCE to match any
     * @param text the label to be checked for in the sequence; it is
     *             compared against each label's value()
     * @param index the token index. FIXME: currently ignored - the
     *              comparison against label.index() is disabled, so a
     *              match is decided by sentence and text alone
     * @return true if some label matches
     */
    public boolean containsLabel(int sentIndex, String text, int index) {

        for (CoreLabel label : labels) {
            boolean sentenceMatches =
                    sentIndex == label.sentIndex() || sentIndex == IGNORE_SENTENCE;
            if (sentenceMatches && text.equals(label.value()))
                return true;
        }
        return false;
    }

    /** *************************************************************
     * Converts the sequence to text representation using "_" as a separator
     */
    public String toText() {

        return Joiner.on("_").join(labels.stream().map(CoreLabel::value).toArray());
    }

    /** *************************************************************
     * Returns the sequence in the String format like "United_States-3",
     * where the numeric suffix is the index() of the first label.
     * @return the label string, or an empty Optional for an empty sequence
     */
    public Optional<String> toLabelString() {

        return labels.isEmpty()
                ? Optional.empty()
                : Optional.of(toStringWithNumToken());
    }

    /** *************************************************************
     * Returns the sequence in the String format like "United_States-3",
     * where the numeric suffix is the index() of the first label.
     * @return the label string, or "" for an empty sequence
     */
    public String toStringWithNumToken() {

        if (labels.isEmpty())
            return "";
        return toText() + "-" + labels.get(0).index();
    }

    /** *************************************************************
     * @return the number of labels in the sequence
     */
    public int size() {

        return labels.size();
    }

    /** *************************************************************
     * Change the value() of each CoreLabel to be all caps. The result
     * is built from fresh CoreLabels that carry over only the
     * upper-cased value and the index of each original label.
     * @return a new sequence; this sequence is left unchanged
     */
    public CoreLabelSequence toUpperCase() {

        List<CoreLabel> upper = new ArrayList<>(labels.size());
        for (CoreLabel cl : labels) {
            CoreLabel newcl = new CoreLabel();
            newcl.setValue(cl.value().toUpperCase());
            newcl.setIndex(cl.index());
            upper.add(newcl);
        }
        return new CoreLabelSequence(upper);
    }

    /** *************************************************************
     * Blanks out every label whose value consists of exactly one
     * punctuation character. Such a label is replaced in the result
     * by a copy whose value is the empty string; all other labels are
     * shared with this sequence as-is.
     *
     * The previous implementation used String.replace(), which treats
     * its first argument as literal text rather than as a regular
     * expression, so it never changed any value.
     *
     * NOTE(review): only whole-token matches are handled, as in the
     * original guard. Punctuation inside a longer token (e.g. "U.S.")
     * is left untouched - confirm that this is the intent.
     *
     * @return a new sequence; this sequence and its labels are left
     * unchanged
     */
    public CoreLabelSequence removePunctuation() {

        List<CoreLabel> cleaned = new ArrayList<>(labels.size());
        for (CoreLabel cl : labels) {
            String value = cl.value();
            if (PUNCTUATION.matcher(value).matches()) {
                // Copy before modifying: the CoreLabel instances are
                // shared with this sequence and must not change under it.
                CoreLabel blanked = new CoreLabel(cl);
                blanked.setValue(PUNCTUATION.matcher(value).replaceAll(""));
                cleaned.add(blanked);
            }
            else {
                cleaned.add(cl);
            }
        }
        return new CoreLabelSequence(cleaned);
    }

    /** *************************************************************
     * Joins the label values with "_", also replacing any space inside
     * a value with "_". Unlike toText(), no separator is written while
     * the result is still empty.
     */
    public String toWordNetID() {

        StringBuilder sb = new StringBuilder();
        for (CoreLabel cl : labels) {
            if (sb.length() > 0)
                sb.append("_");
            sb.append(cl.value().replace(" ","_"));
        }
        return sb.toString();
    }
}