
gr.demokritos.iit.jinsect.documentModel.representations.DocumentNGramGraph
The JINSECT toolkit is a Java-based toolkit and library that supports and demonstrates the use of n-gram graphs within Natural Language Processing applications, ranging from summarization and summary evaluation to text classification and indexing.
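Before the source, a minimal usage sketch (not part of the library source; the wrapper class name NGramGraphDemo is arbitrary) showing how this class is typically driven: two character trigram graphs are built with setDataString, then intersected and merged. Only the constructor and methods defined in the source below are used.

    import gr.demokritos.iit.jinsect.documentModel.representations.DocumentNGramGraph;

    public class NGramGraphDemo {
        public static void main(String[] args) {
            // Build the character trigram graphs of two short texts
            DocumentNGramGraph dgFirst = new DocumentNGramGraph(3, 3, 3);
            dgFirst.setDataString("A sample document text.");
            DocumentNGramGraph dgSecond = new DocumentNGramGraph(3, 3, 3);
            dgSecond.setDataString("Another sample document text.");
            // Keep only the edges common to both graphs, with averaged weights
            DocumentNGramGraph dgCommon = dgFirst.intersectGraph(dgSecond);
            System.out.println("Common edges: " + dgCommon.length());
            // Move the first graph halfway towards the second (e.g. updating a class model)
            dgFirst.mergeGraph(dgSecond, 0.5);
            System.out.println("Merged graph edges: " + dgFirst.length());
        }
    }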
/*
* DocumentNGramGraph.java
*
* Created on 24 [month illegible in source] 2006, 10:33
*
*/
package gr.demokritos.iit.jinsect.documentModel.representations;
import gr.demokritos.iit.jinsect.structs.IMergeable;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.HashSet;
import java.util.Map;
import java.util.Vector;
import gr.demokritos.iit.jinsect.events.NormalizerListener;
import gr.demokritos.iit.jinsect.events.WordEvaluatorListener;
import gr.demokritos.iit.jinsect.structs.EdgeCachedLocator;
import gr.demokritos.iit.jinsect.structs.UniqueVertexGraph;
import gr.demokritos.iit.jinsect.utils;
import gr.demokritos.iit.jinsect.events.TextPreprocessorListener;
import java.util.Arrays;
import java.util.List;
import salvo.jesus.graph.*;
/** Represents the n-gram graph of a document: the n-grams of the document are the vertices,
* and each edge weight is the number of co-occurrences of its two n-grams within a given window.
*
* @author PCKid
*/
public class DocumentNGramGraph implements Serializable, Cloneable, IMergeable {
/** The minimum and maximum n-gram size, and the co-occurrence window.
* Default values are 3, 3 and 3, respectively.
*/
protected int MinSize = 3, MaxSize = 3, CorrelationWindow = 3;
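/** The data string from which the n-gram graphs are built. */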
protected String DataString = "";
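/** Maps degraded edges to the number of times they have been degraded. */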
protected HashMap DegradedEdges;
public NormalizerListener Normalizer = null;
public WordEvaluatorListener WordEvaluator = null;
public TextPreprocessorListener TextPreprocessor = null;
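/** One graph per n-gram level, from MinSize (index 0) up to MaxSize. */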
protected UniqueVertexGraph[] NGramGraphArray;
protected EdgeCachedLocator eclLocator = null;
/** Creates a new instance of DocumentNGramGraph with the default n-gram sizes and window. */
public DocumentNGramGraph() {
InitGraphs();
}
/***
* Creates a new instance of DocumentNGramGraph with the given parameters.
* @param iMinSize The minimum n-gram size
* @param iMaxSize The maximum n-gram size
* @param iCorrelationWindow The maximum distance of terms to be considered
* as correlated.
*/
public DocumentNGramGraph(int iMinSize, int iMaxSize, int iCorrelationWindow) {
MinSize = iMinSize;
MaxSize = iMaxSize;
CorrelationWindow = iCorrelationWindow;
InitGraphs();
}
protected void InitGraphs() {
// Create array of graphs
NGramGraphArray = new UniqueVertexGraph[MaxSize - MinSize + 1];
// Init array
for (int iCnt=MinSize; iCnt <= MaxSize; iCnt++)
NGramGraphArray[iCnt - MinSize] = new UniqueVertexGraph();
// Create degraded edge list
DegradedEdges = new HashMap();
}
/** Measures an indication of the size of a document n-gram graph based on
* the edge count of its contained graphs.
*
* @return The sum of the count of the edges of the various level graphs in
* the document n-gram graph.
*/
public int length() {
java.util.Iterator iIter = java.util.Arrays.asList(NGramGraphArray).iterator();
int iCnt = 0;
while (iIter.hasNext())
iCnt += ((UniqueVertexGraph)iIter.next()).getEdgesCount();
return iCnt;
}
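/** Indicates whether the graph is empty, based on the edge count of its first (MinSize) level.
* @return True if the first level contains no edges, otherwise false.
*/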
public boolean isEmpty() {
return NGramGraphArray[0].getEdgesCount()== 0;
}
/** Creates the graph based on a data string loaded from a given file.
*@param sFilename The filename of the file containing the data string.
*/
public void loadDataStringFromFile(String sFilename) throws java.io.IOException,
java.io.FileNotFoundException{
String sDataString = utils.loadFileToStringWithNewlines(sFilename);
setDataString(sDataString); // Actually update
}
/***
*Returns the graph at a given (zero-based) level index.
*@param iIndex The index of the graph. Zero (0) corresponds to the graph for
* level MinSize n-grams.
*@return The {@link UniqueVertexGraph} of the corresponding level.
***/
public UniqueVertexGraph getGraphLevel(int iIndex) {
return NGramGraphArray[iIndex];
}
/***
*Returns the graph corresponding to a given n-gram size.
*@param iNGramSize The n-gram size of the graph.
*@return The {@link UniqueVertexGraph} of the corresponding level, or null if the size is out of bounds.
***/
public UniqueVertexGraph getGraphLevelByNGramSize(int iNGramSize) {
// Check bounds
if ((iNGramSize < MinSize) || (iNGramSize > MaxSize))
return null;
return NGramGraphArray[iNGramSize - MinSize];
}
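/** Collects the vertices of all level graphs into a single set.
* @return A HashSet containing every vertex of every level.
*/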
public HashSet getAllNodes() {
HashSet hRes = new HashSet(length() / (MaxSize - MinSize + 1)); // Init set (avoid division by zero when MinSize == MaxSize)
for (int iCurLvl = MinSize; iCurLvl <= MaxSize; iCurLvl++)
{
// Collect the vertices (not the edges) of the current level
java.util.Iterator iIter = NGramGraphArray[iCurLvl -
MinSize].getVerticesIterator();
while (iIter.hasNext())
hRes.add(iIter.next());
}
return hRes;
}
/**
* Set a locator to optimize the edge lookup.
* @param eNewLocator The locator to use.
*/
public void setLocator(EdgeCachedLocator eNewLocator) {
eclLocator = eNewLocator;
}
/***
* Creates an edge in [gGraph] connecting [sStartNode] to each node in the
*[lOtherNodes] list of nodes. If an edge already exists, its weight is increased by
*the increase weight, otherwise its weight is set to the start weight.
*@param gGraph The graph to use
*@param sStartNode The node from which all edges begin
*@param lOtherNodes The list of nodes to which sStartNode is connected
*@param hAppearenceHistogram The histogram of appearances of the terms
***/
public void createEdgesConnecting(UniqueVertexGraph gGraph, String sStartNode, List lOtherNodes,
HashMap hAppearenceHistogram) {
double dStartWeight = 0;
double dIncreaseWeight = 0;
// If there are no neighbours, only add the solitary start node
if ((lOtherNodes == null) || (lOtherNodes.size() == 0))
{
// Attempt to add solitary node [sStartNode]
VertexImpl v = new VertexImpl();
v.setLabel(sStartNode);
try {
gGraph.add(v);
}
catch (Exception e) {
// Probably exists already
e.printStackTrace();
}
return;
}
// Otherwise for every neighbour add edge
java.util.Iterator iIter = lOtherNodes.iterator();
// Locate source node
Vertex vOldA = gGraph.locateVertex(new VertexImpl(sStartNode));
// DEPRECATED: Vertex vOldA = utils.locateVertexInGraph(gGraph, sStartNode);
Vertex vA;
if (vOldA != null)
vA = vOldA;
else {
// else create it
vA = new VertexImpl();
vA.setLabel(sStartNode);
// Add to graph
try {
gGraph.add(vA);
}
catch (Exception e) {
// Not added. Ignore.
}
}
EdgeCachedLocator ecl;
if (eclLocator == null)
ecl = new EdgeCachedLocator(100);
else
ecl = eclLocator;
// For every edge
while (iIter.hasNext())
{
VertexImpl vB = new VertexImpl();
vB.setLabel((String)iIter.next());
double dOldWeight = 0;
double dNewWeight = 0;
//dStartWeight = 2.0 / (((Double)hAppearenceHistogram.get(vA.getLabel())).doubleValue() +
//((Double)hAppearenceHistogram.get(vB.getLabel())).doubleValue());
dStartWeight = 1.0;
dIncreaseWeight = dStartWeight;
WeightedEdge weCorrectEdge = (WeightedEdge)ecl.locateDirectedEdgeInGraph(
gGraph, vA, vB);
if (weCorrectEdge == null)
// Not found. Using Start weight
dNewWeight = dStartWeight;
else {
dOldWeight = weCorrectEdge.getWeight();
dNewWeight = dOldWeight + dIncreaseWeight; // Increase as required
}
try
{
if (weCorrectEdge == null) {
ecl.addedEdge(gGraph.addEdge(vA, vB, dNewWeight));
}
else
weCorrectEdge.setWeight(dNewWeight);
}
catch (Exception e)
{
// Unknown error
e.printStackTrace();
}
}
}
/***
* Creates an edge in [gGraph] connecting [sStartNode] to each node in the
*[lOtherNodes] list of nodes. If an edge already exists, its weight is moved towards
*[dNewWeight] according to [dDataImportance], otherwise its weight is set to [dStartWeight].
*@param gGraph The graph to use
*@param sStartNode The node from which all edges begin
*@param lOtherNodes The list of nodes to which sStartNode is connected
*@param dStartWeight The initial weight for newly created edges
*@param dNewWeight The target weight for existing edges
*@param dDataImportance The tendency towards the new value. 0.0 means no change
*to the current value, 1.0 means the old value is completely replaced by the
*new one, and 0.5 means the final value is the average of the old and the new.
***/
public void createWeightedEdgesConnecting(UniqueVertexGraph gGraph,
String sStartNode, List lOtherNodes,
double dStartWeight, double dNewWeight, double dDataImportance) {
// If there are no neighbours, only add the solitary start node
if ((lOtherNodes == null) || (lOtherNodes.size() == 0))
{
// Attempt to add solitary node [sStartNode]
VertexImpl v = new VertexImpl();
v.setLabel(sStartNode);
try {
gGraph.add(v);
}
catch (Exception e) {
// Probably exists already
e.printStackTrace();
}
return;
}
// Locate or create source node
Vertex vA = gGraph.locateVertex(sStartNode);
if (vA == null) {
vA = new VertexImpl();
vA.setLabel(sStartNode);
try {
gGraph.add(vA);
}
catch (Exception e) {
// Add failed. Ignore
}
}
EdgeCachedLocator ecl;
if (eclLocator == null)
ecl = new EdgeCachedLocator(100);
else
ecl = eclLocator;
// Otherwise for every neighbour add edge
java.util.Iterator iIter = lOtherNodes.iterator();
// For every edge
while (iIter.hasNext())
{
VertexImpl vB = new VertexImpl();
vB.setLabel(new String((String)iIter.next()));
double dOldWeight = 0;
double dFinalWeight = 0;
WeightedEdge weCorrectEdge = null;
// Get old weight
WeightedEdge weEdge = null;
// Look for SAME ORIENTATION OF EDGE
boolean bFound = (weEdge =
(WeightedEdge)ecl.locateDirectedEdgeInGraph(gGraph, vA, vB))
!= null;
if (bFound)
{
dOldWeight = weEdge.getWeight();
// Edge found. Update its weight in place, towards the new value.
weCorrectEdge = weEdge;
dFinalWeight = dOldWeight + (dNewWeight - dOldWeight)
* dDataImportance; // Increase as required
weCorrectEdge.setWeight(dFinalWeight);
}
else
{
// Not found. New edge.
dFinalWeight = dStartWeight;
try {
gGraph.addEdge(vA, vB, dFinalWeight);
ecl.resetCache();
}
catch (Exception e) {
// Insert failed. Ignoring...
// TODO: Check if it needs to be removed
e.printStackTrace();
}
}
// DEBUG LINES
// if (dFinalWeight < 0)
// System.err.println("Negative weight.");
//////////////
}
}
/***
* Creates the n-gram graphs for all levels in the [MinSize, MaxSize] range, based on the current data string.
***/
public void createGraphs() {
String sUsableString = new StringBuilder().append(DataString).toString();
// Use preprocessor if available
if (TextPreprocessor != null)
sUsableString = TextPreprocessor.preprocess(sUsableString);
// else
// sUsableString = new String(sUsableString);
// Use the length of the (possibly preprocessed) string
int iLen = sUsableString.length();
// Create token histogram.
HashMap hTokenAppearence = new HashMap();
// 1st pass. Populate histogram.
///////////////////////////////
// For all sizes create corresponding levels
for (int iNGramSize = MinSize; iNGramSize <= MaxSize; iNGramSize++)
{
// If n-gram bigger than text
if (iLen < iNGramSize)
// then Ignore
continue;
// The String has a size of at least [iNGramSize]
String sCurNGram = null;
for (int iCurStart = 0; iCurStart < iLen; iCurStart++)
{
// If reached end
if (iLen < iCurStart + iNGramSize)
// then break
break;
// Get n-gram
sCurNGram = sUsableString.substring(iCurStart, iCurStart + iNGramSize);
// Evaluate word
if (WordEvaluator != null)
if (!WordEvaluator.evaluateWord(sCurNGram))
// and ignore if it does not evaluate
continue;
// Update Histogram
if (hTokenAppearence.containsKey(sCurNGram))
hTokenAppearence.put(sCurNGram, ((Double)hTokenAppearence.get(sCurNGram)).doubleValue() + 1.0);
else
hTokenAppearence.put(sCurNGram, 1.0);
}
}
// 2nd pass. Create graph.
///////////////////////////////
// For all sizes create corresponding levels
for (int iNGramSize = MinSize; iNGramSize <= MaxSize; iNGramSize++)
{
// If n-gram bigger than text
if (iLen < iNGramSize)
// then Ignore
continue;
Vector PrecedingNeighbours = new Vector();
UniqueVertexGraph gGraph = getGraphLevelByNGramSize(iNGramSize);
// The String has a size of at least [iNGramSize]
String sCurNGram = "";
for (int iCurStart = 0; iCurStart < iLen; iCurStart++)
{
// If reached end
if (iLen < iCurStart + iNGramSize)
// then break
break;
// Get n-gram
sCurNGram = sUsableString.substring(iCurStart, iCurStart + iNGramSize);
// Evaluate word
if (WordEvaluator != null)
if (!WordEvaluator.evaluateWord(sCurNGram))
// and ignore if it does not evaluate
continue;
String[] aFinalNeighbours;
// Normalize
if (Normalizer != null)
aFinalNeighbours = (String[])Normalizer.normalize(null, PrecedingNeighbours.toArray());
else
{
aFinalNeighbours = new String[PrecedingNeighbours.size()];
PrecedingNeighbours.toArray(aFinalNeighbours);
}
createEdgesConnecting(gGraph, sCurNGram, java.util.Arrays.asList(aFinalNeighbours),
hTokenAppearence);
PrecedingNeighbours.add(sCurNGram);
if (PrecedingNeighbours.size() > CorrelationWindow)
PrecedingNeighbours.removeElementAt(0);// Remove first element
}
}
}
/***
*Merges the data of [dgOtherGraph] into this graph, by adding all edges of the
*other graph and moving the weights of edges existing in both graphs towards the
*other graph's values, based on a tendency modifier.
*The convergence tendency towards the old or the new value is determined
*by [fWeightPercent].
*@param dgOtherGraph The second graph used for the merging
*@param fWeightPercent The convergence tendency parameter. A value of 0.0
* means no change to the existing value, 1.0 means the new value equals
* that of the other graph, and 0.5 means the new value is exactly between
* the old and the new value (their average).
***/
public void mergeGraph(DocumentNGramGraph dgOtherGraph, double fWeightPercent) {
// If both graphs are the same, ignore merging.
if (dgOtherGraph == this)
return;
for (int iCurLvl = MinSize; iCurLvl <= MaxSize; iCurLvl++) {
UniqueVertexGraph gGraph = getGraphLevelByNGramSize(iCurLvl);
UniqueVertexGraph gOtherGraph = dgOtherGraph.getGraphLevelByNGramSize(iCurLvl);
// Check if other graph has corresponding level
if (gOtherGraph == null)
// If not, ignore level
continue;
// For every edge on other graph
java.util.Iterator iIter = gOtherGraph.getEdgeSet().iterator();
ArrayList lOtherNodes = new ArrayList();
while (iIter.hasNext())
{
WeightedEdge weCurItem = (WeightedEdge)iIter.next();
String sHead = weCurItem.getVertexA().getLabel();
String sTail = weCurItem.getVertexB().getLabel();
double dWeight = weCurItem.getWeight();
lOtherNodes.clear();
lOtherNodes.add(sTail);
// TODO: Check this
createWeightedEdgesConnecting(gGraph, sHead,
lOtherNodes, dWeight, dWeight, fWeightPercent);
}
// DONE: Remove multi-threading
// // Multi-threading
// ThreadQueue tq = new ThreadQueue();
// // For every edge on other graph
// java.util.Iterator iIter = gOtherGraph.getEdgeSet().iterator();
// while (iIter.hasNext())
// {
// WeightedEdge weCurItem = (WeightedEdge)iIter.next();
// final String sHead = weCurItem.getVertexA().getLabel();
// final String sTail = weCurItem.getVertexB().getLabel();
// final double dWeight = weCurItem.getWeight();
// final String[] lOtherNodes = new String[1];
// lOtherNodes[0] = sTail;
// final UniqueVertexGraph graphArg = gGraph;
// final double dWeightPercentArg = fWeightPercent;
//
// while (!tq.addThreadFor(new Runnable() {
// @Override
// public void run() {
// synchronized (graphArg) {
// createWeightedEdgesConnecting(graphArg, sHead,
// java.util.Arrays.asList(lOtherNodes), 1.0, dWeight,
// dWeightPercentArg);
// }
// }
// }))
// Thread.yield();
// }
//
// try {
// tq.waitUntilCompletion();
// }
// catch (InterruptedException ie) {
// // Do nothing
// }
}
}
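/** Returns the intersection of this graph and a given graph: only edges existing in both
* graphs are kept, with their weight set to the average of the two original weights.
*@param dgOtherGraph The graph to intersect with.
*@return A DocumentNGramGraph containing the common edges.
*/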
public DocumentNGramGraph intersectGraph(DocumentNGramGraph dgOtherGraph) {
// Init res graph
DocumentNGramGraph gRes = new DocumentNGramGraph(MinSize, MaxSize, CorrelationWindow);
// Use cached edge locator
EdgeCachedLocator ecl = new EdgeCachedLocator(1000);
for (int iCurLvl = MinSize; iCurLvl <= MaxSize; iCurLvl++) {
UniqueVertexGraph gGraph = getGraphLevelByNGramSize(iCurLvl);
UniqueVertexGraph gOtherGraph = dgOtherGraph.getGraphLevelByNGramSize(iCurLvl);
UniqueVertexGraph gNewGraph = gRes.getGraphLevelByNGramSize(iCurLvl);
// Check if other graph has corresponding level
if (gOtherGraph == null)
// If not, ignore level
continue;
// For every edge on other graph
java.util.Iterator iIter = gOtherGraph.getEdgeSet().iterator();
while (iIter.hasNext())
{
WeightedEdge weCurItem = (WeightedEdge)iIter.next();
String sHead = weCurItem.getVertexA().getLabel();
String sTail = weCurItem.getVertexB().getLabel();
// TODO: Check if should be directed or not
//WeightedEdge eEdge = (WeightedEdge)gr.demokritos.iit.jinsect.utils.locateEdgeInGraph(gGraph, sHead, sTail);
WeightedEdge eEdge = (WeightedEdge)ecl.locateEdgeInGraph(gGraph,
weCurItem.getVertexA(),weCurItem.getVertexB());
if (eEdge != null)
try
{
List l = new ArrayList();
l.add(sTail);
double dTargetWeight = 0.5 * (eEdge.getWeight() + weCurItem.getWeight());
// Initialize with mean weight
createWeightedEdgesConnecting(gNewGraph, sHead, l, dTargetWeight, dTargetWeight, 1.0);
// Used to be
//createWeightedEdgesConnecting(gNewGraph, sHead, l,1, eEdge.getWeight(), 1.0);
}
catch (Exception e)
{
// Non fatal error occured. Continue.
e.printStackTrace();
}
}
}
return gRes;
}
/** Returns the difference (inverse of the intersection) graph between the current graph
* and a given graph.
*@param dgOtherGraph The graph to compare to.
*@return A DocumentNGramGraph that is the difference between the current graph and the given graph.
*/
public DocumentNGramGraph inverseIntersectGraph(DocumentNGramGraph dgOtherGraph) {
// Get the union (merged) graph
DocumentNGramGraph dgUnion = (DocumentNGramGraph)clone();
dgUnion.mergeGraph(dgOtherGraph, 0);
// Get the intersection graph
DocumentNGramGraph dgIntersection = intersectGraph(dgOtherGraph);
// For every level
for (int iCurLvl = MinSize; iCurLvl <= MaxSize; iCurLvl++) {
UniqueVertexGraph gUnion = dgUnion.getGraphLevelByNGramSize(iCurLvl);
UniqueVertexGraph gIntersection = dgIntersection.getGraphLevelByNGramSize(iCurLvl);
// TODO: Order by edge count for optimization
EdgeCachedLocator eclLocator = new EdgeCachedLocator(10);
// Check if other graph has corresponding level
if (gIntersection == null)
// If not, ignore level
continue;
// For every edge of intersection
java.util.Iterator iIter = gIntersection.getEdgeSet().iterator();
while (iIter.hasNext())
{
WeightedEdge weCurItem = (WeightedEdge)iIter.next();
// If the edge is contained in the merged graph
Edge eEdge = eclLocator.locateDirectedEdgeInGraph(gUnion, weCurItem.getVertexA(),
weCurItem.getVertexB());
if (eEdge != null)
try {
gUnion.removeEdge(eEdge);
} catch (Exception ex) {
// Non-lethal exception. Continue.
ex.printStackTrace();
}
}
}
return dgUnion;
}
/** Returns both the intersection and the difference (inverse of the intersection)
* graph between the current graph and a given graph.
*@param dgOtherGraph The graph to use for intersection and difference.
*@return A DocumentNGramGraph array of two elements. The first is the intersection between
* the current graph and the given graph and the second is the difference of the graphs.
* The edge distributions are kept from the original graphs.
*/
public DocumentNGramGraph[] intersectAndDeltaGraph(DocumentNGramGraph dgOtherGraph) {
DocumentNGramGraph dgUnion = null;
// Initialize union using the biggest graph
// and get the union (merged) graph
if (dgOtherGraph.length() > length()) {
dgUnion = (DocumentNGramGraph)dgOtherGraph.clone();
dgUnion.merge(this, 0);
}
else {
dgUnion = (DocumentNGramGraph)clone();
dgUnion.merge(dgOtherGraph, 0);
}
DocumentNGramGraph[] res = new DocumentNGramGraph[2];
// Get the intersection graph
DocumentNGramGraph dgIntersection = intersectGraph(dgOtherGraph);
res[0] = dgIntersection;
// For every level
for (int iCurLvl = MinSize; iCurLvl <= MaxSize; iCurLvl++) {
UniqueVertexGraph gUnion = dgUnion.getGraphLevelByNGramSize(iCurLvl);
UniqueVertexGraph gIntersection =
dgIntersection.getGraphLevelByNGramSize(iCurLvl);
// TODO: Order by edge count for optimization
EdgeCachedLocator eclLocator = new EdgeCachedLocator(100);
// Check if other graph has corresponding level
if (gIntersection == null)
// If not, ignore level
continue;
// For every edge of intersection
java.util.Iterator iIter = gIntersection.getEdgeSet().iterator();
while (iIter.hasNext())
{
WeightedEdge weCurItem = (WeightedEdge)iIter.next();
// If the edge is contained in the merged graph
Edge eEdge = eclLocator.locateDirectedEdgeInGraph(gUnion,
weCurItem.getVertexA(), weCurItem.getVertexB());
if (eEdge != null)
try {
gUnion.removeEdge(eEdge);
} catch (Exception ex) {
// Non-lethal exception. Continue.
ex.printStackTrace();
}
}
}
res[1] = dgUnion;
return res;
}
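/** @return The minimum n-gram size. */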
public int getMinSize() {
return MinSize;
}
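/** @return The maximum n-gram size. */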
public int getMaxSize() {
return MaxSize;
}
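/** @return The co-occurrence (correlation) window size. */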
public int getWindowSize() {
return CorrelationWindow;
}
/***
* Returns a function of the maximum weight of the edges containing [sNode]
* and of the number of its neighbours in the graph.
*@param sNode The label of the node whose coexistence importance is calculated
*@return The coexistence importance of the node.
***/
public double calcCoexistenceImportance(String sNode) {
VertexImpl v = new VertexImpl();
v.setLabel(sNode);
return calcCoexistenceImportance(v);
}
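/** Calculates the coexistence importance of a given vertex over all levels, as
* log10((2 * maxEdgeWeight)^2.5 / max(1, (neighbours / 2)^2)), where maxEdgeWeight is the
* maximum weight of the edges containing the vertex and neighbours is its maximum number of
* neighbours at any level. Returns a very low value (-200000.0) if no weighted edge is found.
*@param vNode The vertex whose coexistence importance is calculated.
*@return The coexistence importance of the vertex.
*/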
public double calcCoexistenceImportance(Vertex vNode) {
double dRes = 0.0;
int iNoOfNeighbours = 0;
double dMaxEdgeWeight = 0;
// Search all levels
for (int iNGramSize=MinSize; iNGramSize <= MaxSize; iNGramSize++) {
UniqueVertexGraph gCurLevel = getGraphLevelByNGramSize(iNGramSize);
if (gCurLevel.containsVertex(vNode))
{
// Keep max neighbours number
List lEdges = gCurLevel.getEdges(vNode);
int iTempNeighbours = lEdges.size();
iNoOfNeighbours = (iTempNeighbours > iNoOfNeighbours) ? iTempNeighbours : iNoOfNeighbours;
java.util.Iterator iIter = lEdges.iterator();
while (iIter.hasNext())
{
// Keep max edge weight
WeightedEdge weEdge = (WeightedEdge)iIter.next();
dMaxEdgeWeight = (weEdge.getWeight() > dMaxEdgeWeight) ? weEdge.getWeight() : dMaxEdgeWeight;
}
}
}
// Final calculation
dRes = -200000.0; // Very low value
if (dMaxEdgeWeight > 0) {
if (iNoOfNeighbours > 0)
dRes = Math.log10(Math.pow(2 * dMaxEdgeWeight, 2.5) / Math.max(1.0, Math.pow(iNoOfNeighbours / 2, 2)));
else
dRes = Math.log10(Math.pow(2 * dMaxEdgeWeight, 2.5));
}
return dRes;
}
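/** Removes from every level all vertices whose coexistence importance is below a given threshold.
*@param dMinCoexistenceImportance The minimum coexistence importance a vertex needs in order to be kept.
*/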
public void prune(double dMinCoexistenceImportance) {
for (int iNGramSize=MinSize; iNGramSize <= MaxSize; iNGramSize++) {
UniqueVertexGraph gCurLevel = getGraphLevelByNGramSize(iNGramSize);
Vector vToRemove = new Vector();
Iterator iIter = gCurLevel.getVerticesIterator();
while (iIter.hasNext()) {
Vertex vCur = (Vertex)iIter.next();
if (calcCoexistenceImportance(vCur) < dMinCoexistenceImportance) {
vToRemove.add(vCur);
}
}
// Actually remove
iIter = vToRemove.iterator();
while (iIter.hasNext())
try {
gCurLevel.remove((Vertex)iIter.next());
}
catch (Exception e) {
// Ignore
}
}
}
/***
*Removes an item (node) from all graphs.
*@param sItem The item to remove.
***/
public void deleteItem(String sItem) {
// From all levels
for (int iNGramSize=MinSize; iNGramSize <= MaxSize; iNGramSize++) {
UniqueVertexGraph gCurLevel = getGraphLevelByNGramSize(iNGramSize);
Vertex v = utils.locateVertexInGraph(gCurLevel, sItem);
if (v == null)
continue; // Not present at this level; check the remaining levels
try {
gCurLevel.remove(v);
}
catch (Exception e) {
e.printStackTrace(); // Probably node did not exist
}
}
}
/***
*Sets all weights in all graphs to zero
***/
public void nullify() {
// From all levels
for (int iNGramSize=MinSize; iNGramSize <= MaxSize; iNGramSize++) {
UniqueVertexGraph gCurLevel = getGraphLevelByNGramSize(iNGramSize);
// Get all edges
java.util.Iterator iIter = gCurLevel.getEdgeSet().iterator();
while (iIter.hasNext())
{
WeightedEdge weEdge = (WeightedEdge)iIter.next();
// Set weight to zero
weEdge.setWeight(0.0);
}
}
}
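/** Sets the data string of the document, clearing and rebuilding all n-gram graph levels.
*@param sDataString The new data string.
*/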
public void setDataString(String sDataString) {
DataString = new StringBuilder().append(sDataString).toString();
InitGraphs(); // Clear graphs
createGraphs(); // Update graphs
}
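/** @return The data string the graphs were built from. */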
public String getDataString() {
return DataString;
}
// Serialization
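/** Custom serialization: writes the n-gram sizes, the correlation window, the data string,
* each level graph and, finally, the degraded edge map. The readObject method reads them back
* in the same order. */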
private void writeObject(java.io.ObjectOutputStream out)
throws IOException {
// Write Fields
out.writeInt(MinSize);
out.writeInt(MaxSize);
out.writeInt(CorrelationWindow);
out.writeObject(DataString);
// Save all graphs
// For each graph
for (int iCnt=MinSize; iCnt <= MaxSize; iCnt++) {
UniqueVertexGraph g = getGraphLevelByNGramSize(iCnt);
// Serialize
out.writeObject(g);
}
// Save degraded edges
out.writeObject(DegradedEdges);
}
private void readObject(java.io.ObjectInputStream in)
throws IOException, ClassNotFoundException {
// Read Fields
try {
MinSize = in.readInt();
MaxSize = in.readInt();
CorrelationWindow = in.readInt();
DataString = (String)in.readObject();
// DataString = "";
//// // DEBUG LINES
// if (utils.Sum == 0) {
// UniqueVertexGraph uT = new UniqueVertexGraph();
// Runtime.getRuntime().gc();
// utils.Sum = Runtime.getRuntime().freeMemory();
// System.out.println("Starting free (MB):" +
// Runtime.getRuntime().freeMemory() / (1024*1024));
// }
// //////////////
// Create array of graphs
NGramGraphArray = new UniqueVertexGraph[MaxSize - MinSize + 1];
// For each graph
for (int iCnt=MinSize; iCnt <= MaxSize; iCnt++) {
// TODO: Restore
UniqueVertexGraph g = (UniqueVertexGraph)in.readObject();
// in.readObject();
// UniqueVertexGraph g = null;
this.NGramGraphArray[iCnt - MinSize] = g;
//// // DEBUG LINES
// if (++utils.Count % 200 == 0) {
// System.out.println("So far " + utils.Count);
// Runtime.getRuntime().runFinalization();
// Runtime.getRuntime().gc();
// System.out.println("Current free (MB):" +
// Runtime.getRuntime().freeMemory() / (1024*1024));
// System.out.println("Average size (Kb): " + (double)
// (utils.Sum - Runtime.getRuntime().freeMemory())
// / (utils.Count * 1024));
// }
// //////////////
}
// Load degraded edges
DegradedEdges = (HashMap)in.readObject();
// if (DegradedEdges.size() > 500)
// System.out.println(DegradedEdges.size() + " degraded");
} catch (Exception e) {
throw new IOException(e.getMessage());
}
}
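/** Increases the degradation counter of every edge of this graph that also exists in a given graph.
*@param dgOtherGraph The graph whose common edges are degraded in this graph.
*/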
public void degrade(DocumentNGramGraph dgOtherGraph) {
for (int iCurLvl = MinSize; iCurLvl <= MaxSize; iCurLvl++) {
UniqueVertexGraph gGraph = getGraphLevelByNGramSize(iCurLvl);
UniqueVertexGraph gOtherGraph = dgOtherGraph.getGraphLevelByNGramSize(iCurLvl);
// Check if other graph has corresponding level
if (gOtherGraph == null)
// If not, ignore level
continue;
// For every edge on other graph
java.util.Iterator iIter = gOtherGraph.getEdgeSet().iterator();
while (iIter.hasNext())
{
WeightedEdge weCurItem = (WeightedEdge)iIter.next();
String sHead = weCurItem.getVertexA().getLabel();
String sTail = weCurItem.getVertexB().getLabel();
Edge eEdge = gr.demokritos.iit.jinsect.utils.locateEdgeInGraph(gGraph, sHead, sTail);
if (eEdge != null)
try
{
if (DegradedEdges.containsKey(eEdge))
DegradedEdges.put(eEdge,
((Double)DegradedEdges.get(eEdge)).doubleValue() + 1);
else
DegradedEdges.put(eEdge, (double)1.0);
}
catch (Exception e)
{
// Non fatal error occured. Continue.
e.printStackTrace();
}
}
}
}
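/** Returns the degradation degree of a given edge.
*@param e The edge to look up.
*@return The number of times the edge has been degraded, or zero if it never has been.
*/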
public double degredationDegree(Edge e) {
if (DegradedEdges.containsKey(e))
return ((Double)DegradedEdges.get(e)).doubleValue();
else
return 0;
}
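/** Renders the graph as a pseudo-text of co-occurrence IDs: each edge is mapped to a numeric ID
* (reusing or extending the given map), and every ID is appended as many times as the integer part
* of the edge weight.
*@param mCooccurenceMap The map from edge descriptions to IDs, updated in place.
*@return The space-separated string of co-occurrence IDs.
*/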
public String toCooccurenceText(Map mCooccurenceMap) {
StringBuffer sb = new StringBuffer();
// For every graph level
for (int iCnt=MinSize; iCnt <= MaxSize; iCnt++) {
UniqueVertexGraph g = getGraphLevelByNGramSize(iCnt);
// For all edges
Iterator iIter = g.getEdgeSet().iterator();
while (iIter.hasNext()) {
// Get edge
WeightedEdge eCur = (WeightedEdge)iIter.next();
String sCooccurenceID;
// If the edge is already in the map
if (mCooccurenceMap.containsKey(eCur.toString()))
// Get its ID
sCooccurenceID = (String)mCooccurenceMap.get(((Edge)eCur).toString());
else {
// else create a new ID based on current time and put it in the map.
sCooccurenceID = String.valueOf(mCooccurenceMap.size() + 1);
mCooccurenceMap.put(((Edge)eCur).toString(), sCooccurenceID);
}
// Add the ID as many times as the co-occurrences
for (int iTimes=0; iTimes < (int)eCur.getWeight(); iTimes++) {
sb.append(sCooccurenceID + " ");
}
}
}
return sb.toString();
}
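/** A small demonstration: builds the trigram graph of the string "abcdef" with a correlation
* window of 2 and prints its first level in DOT format. */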
public static void main(String args[]) {
DocumentNGramGraph ngs = new DocumentNGramGraph(3,3,2);
ngs.setDataString("abcdef");
//ngs.setDataString("This is");
System.out.println(gr.demokritos.iit.jinsect.utils.graphToDot(ngs.getGraphLevel(0), true));
}
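/** Creates a copy of this graph, cloning each level graph and copying the degraded edge entries,
* while sharing the listener references (Normalizer, TextPreprocessor, WordEvaluator). */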
@Override
public Object clone() {
DocumentNGramGraph gRes = new DocumentNGramGraph(MinSize, MaxSize, CorrelationWindow);
gRes.DataString = DataString;
gRes.DegradedEdges.putAll((HashMap)this.DegradedEdges.clone());
gRes.NGramGraphArray = new UniqueVertexGraph[this.NGramGraphArray.length];
int iCnt=0;
for (UniqueVertexGraph uCur : this.NGramGraphArray)
gRes.NGramGraphArray[iCnt++] = (UniqueVertexGraph)uCur.clone();
gRes.Normalizer = this.Normalizer;
gRes.TextPreprocessor = this.TextPreprocessor;
gRes.WordEvaluator = this.WordEvaluator;
return gRes;
}
/** See the mergeGraph member for details. Implements the merge interface. */
public void merge(DocumentNGramGraph dgOtherObject, double fWeightPercent) {
mergeGraph(dgOtherObject, fWeightPercent);
}
/** Returns all edges not existing in another graph.
*@param dgOtherGraph The graph to compare against.
*@return A DocumentNGramGraph containing all edges of this graph that do not exist in the
* given graph. The edge weights are kept from this graph.
*/
public DocumentNGramGraph allNotIn(DocumentNGramGraph dgOtherGraph) {
// TODO: Order by edge count for optimization
EdgeCachedLocator eclLocator = new EdgeCachedLocator(Math.max(length(),
dgOtherGraph.length()));
// Clone this graph
DocumentNGramGraph dgClone = (DocumentNGramGraph)clone();
for (int iCurLvl = MinSize; iCurLvl <= MaxSize; iCurLvl++) {
UniqueVertexGraph gCloneLevel = dgClone.getGraphLevelByNGramSize(iCurLvl);
UniqueVertexGraph gOtherGraphLevel = dgOtherGraph.getGraphLevelByNGramSize(iCurLvl);
// If this level does not exist in other graph, then keep it and continue.
if (gOtherGraphLevel == null)
continue;
// For every edge of the cloned graph (using a new list of edges)
java.util.Iterator iIter = Arrays.asList(gCloneLevel.getEdgeSet().toArray()).iterator();
while (iIter.hasNext())
{
WeightedEdge weCurItem = (WeightedEdge)iIter.next();
// If the edge is contained in the merged graph
Edge eEdge = eclLocator.locateDirectedEdgeInGraph(gOtherGraphLevel, weCurItem.getVertexA(),
weCurItem.getVertexB());
if (eEdge != null)
try {
gCloneLevel.removeEdge(weCurItem);
eclLocator.resetCache();
// Refresh edge iterator
// iIter = gCloneLevel.getEdgeSet().iterator();
} catch (Exception ex) {
// Non-lethal exception. Continue.
ex.printStackTrace();
}
}
}
// DEBUG LINES
//System.err.println(String.format("(%s) Cache success: %4.3f",
// this.getClass().getName(), eclLocator.getSuccessRatio()));
//////////////
return dgClone;
}
}