![JAR search and dependency download from the Maven repository](/logo.png)
eu.project.ttc.utils.TermOccurrenceUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of termsuite-core Show documentation
Show all versions of termsuite-core Show documentation
A Java UIMA-based toolbox for multilingual and efficient terminology extraction and multilingual term alignment
/*******************************************************************************
* Copyright 2015 - CNRS (Centre National de Recherche Scientifique)
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*******************************************************************************/
package eu.project.ttc.utils;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.project.ttc.models.Document;
import eu.project.ttc.models.TermOccurrence;
import eu.project.ttc.models.index.TermMeasure;
/**
* A utililty class for {@link TermOccurrence} objects and collections.
*
* @author Damien Cram
*
*/
public class TermOccurrenceUtils {
public static Comparator uimaNaturalOrder = new Comparator() {
@Override
public int compare(TermOccurrence o1, TermOccurrence o2) {
return ComparisonChain.start()
.compare(o1.getSourceDocument().getUrl(), o2.getSourceDocument().getUrl())
.compare(o1.getBegin(), o2.getBegin())
.compare(o2.getEnd(), o1.getEnd())
.result();
}
};
/**
* Given a strategy, detects all primary occurrences in a collection
* of {@link TermOccurrence}.
*
* What defines an occurrence's primary/secondary status is the fact
* that in a {@link Document}, two primary occurrences cannot overlap.
*
* E.g. in text "offshore wind energy", the sequence or term occurrences "offshore"
* and "wind energy" is a set of primary sequence, but the set of term occurrences
* "offshore wind" and "wind energy" is not a primary sequence, because occurrences
* overlap.
*
*
* @see TermOccurrenceUtils#markPrimaryOccurrence(Collection, TermMeasure)
* @see TermOccurrence#isPrimaryOccurrence()
* @param occs
* the occurrence collection
* @param measure
* the measure for detecting primary occurrences
*
*/
public static void markPrimaryOccurrence(
Collection occs, TermMeasure measure) {
for(Iterator> it = occurrenceChunkIterator(occs);it.hasNext();) {
List chunk = it.next();
Set primaryOccs = Sets.newHashSet();
Collections.sort(chunk, measure.getOccurrenceComparator(true));
for(TermOccurrence o:chunk) {
o.setPrimaryOccurrence(!hasOverlappingOffsets(o, primaryOccs));
if(o.isPrimaryOccurrence())
primaryOccs.add(o);
}
}
}
/**
* Returns a virtual iterator on chunks of an occurrence collection.
*
* A occurrence collection's chunk is a list of overlapping {@link TermOccurrence}. Every time
* there is a gap between two occurrences (i.e. there do not overlap),
* a new chunk is created.
*
* @param occurrences
* @return
*/
public static Iterator> occurrenceChunkIterator(Collection occurrences) {
List asList = Lists.newArrayList(occurrences);
Collections.sort(asList, TermOccurrenceUtils.uimaNaturalOrder);
final Iterator it = asList.iterator();
return new AbstractIterator>() {
private List currentChunk = Lists.newArrayList();
@Override
protected List computeNext() {
while(it.hasNext()) {
TermOccurrence next = it.next();
if(currentChunk.isEmpty() || hasOverlappingOffsets(next, currentChunk))
currentChunk.add(next);
else {
List ret = copyAndReinit();
currentChunk.add(next);
return ret;
}
}
if(!currentChunk.isEmpty()) {
return copyAndReinit();
} else
return endOfData();
}
private List copyAndReinit() {
List copy = Lists.newArrayList(currentChunk);
currentChunk = Lists.newArrayList();
return copy;
}
};
}
/**
* Removes from an occurrence set all occurrences that overlap
* at least one occurrence in a reference occurrence set.
*
* @param referenceSet
* the reference set, not modified by this method
* @param occurrenceSet
* the occurrence set to analyze, will be modified by this method
*/
public static void removeOverlaps(Collection referenceSet, Collection occurrenceSet) {
Iterator it = occurrenceSet.iterator();
while(it.hasNext()) {
TermOccurrence occ = it.next();
for(TermOccurrence refOcc:referenceSet) {
if(occ.getSourceDocument().equals(refOcc.getSourceDocument())
&& areOffsetsOverlapping(occ, refOcc)) {
it.remove();
break;
}
}
}
}
/**
* True if an occurrence set contains any element overlapping
* with the param occurrence.
*
* @param theOcc
* @param theOccCollection
* @return
*/
public static boolean hasOverlappingOffsets(TermOccurrence theOcc, Collection theOccCollection) {
for(TermOccurrence o:theOccCollection)
if(areOffsetsOverlapping(theOcc, o))
return true;
return false;
}
/**
* True if two {@link TermOccurrence} offsets overlap strictly. Sharing exactly
* one offset (e.g. a.end == b.begin
) is not considered as overlap.
*
* @param a
* @param b
* @return
*/
public static boolean areOffsetsOverlapping(TermOccurrence a, TermOccurrence b) {
if(a.getBegin() <= b.getBegin())
return !(a.getBegin() <= b.getEnd() && a.getEnd() <= b.getBegin());
else
return !(b.getBegin() <= a.getEnd() && b.getEnd() <= a.getBegin());
}
/**
* Returns true if two occurrences are in the same
* document and their offsets overlap.
*
* @param a
* @param b
* @return
*/
public static boolean areOverlapping(TermOccurrence a, TermOccurrence b) {
return a.getSourceDocument().equals(b.getSourceDocument()) && areOffsetsOverlapping(a, b);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy