/*
 * edu.stanford.nlp.StanfordKBLoader (from the stanford-adept artifact).
 * The project file to build and run Stanford applications using Maven.
 * (Maven-repository page boilerplate converted to a comment so the file compiles.)
 */
package edu.stanford.nlp;
import adept.common.*;
import adept.kbapi.*;
import adept.utilities.DocumentMaker;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.time.SUTimeSimpleParser;
import edu.stanford.nlp.time.SUTime;
import java.io.*;
import java.util.InvalidPropertiesFormatException;
import java.util.*;
public class StanfordKBLoader {
KBParameters kbParameters;
KB kb;
public StanfordKBLoader() {
try {
kbParameters = new KBParameters("adept/kbapi/KBParameters.xml");
kb = new KB(kbParameters);
System.out.println("done connecting to KB!");
} catch (Exception e) {
System.out.println("Issue with establishing KB connection!");
e.printStackTrace();
}
}
/** load everything discovered in this document into the KB **/
public void loadDocumentContentsIntoKB(HltContentContainer document) throws
IOException, KBQueryException, KBUpdateException {
// load document text
System.out.println("loading document text for: "+document.getDocumentId());
kb.saveDocumentText(document.getDocumentId(), "stanford-kb-docs",
document.getDocument().getValue());
System.out.println("doc text: "+document.getDocument().getValue());
System.out.println("done loading text for: "+document.getDocumentId());
// process entities first
List entityList = document.getCoreferences().get(0).getEntities();
List documentRelationsList = document.getDocumentRelations();
List documentEventsList = document.getDocumentEvents();
HashMap- entityToKBEntity = new HashMap
- ();
HashMap
entityNameToEntity = new HashMap();
System.out.println("---");
System.out.println("Entities in this HltContentContainer: "+document.getDocumentId());
// get all the entity names in this doc
for (Entity potentialEntity : entityList) {
System.out.println("\t"+potentialEntity.getCanonicalMention().getValue());
entityNameToEntity.put(potentialEntity.getCanonicalMention().getValue(), potentialEntity);
}
// load all entities
for (Entity entityToAdd : entityList) {
EntityMention canonicalMentionForEntity = entityToAdd.getCanonicalMention();
String entityName = canonicalMentionForEntity.getValue();
// if the entity mention maps to a Wikipedia entity, link to that
// first set up the KBID corresponding to this wikipedia entity
KBID wikipediaKBID = null;
if (!canonicalMentionForEntity.getAttribute("wikipediaEntity").equals("*NO-WIKI-ENTITY*")) {
wikipediaKBID = new KBID(canonicalMentionForEntity.getAttribute("wikipediaEntity"), "wikipedia");
}
// get the entity type
IType entityType = entityToAdd.getEntityType();
if (entityType.getType().equals("UNKNOWN") || entityType.getType().equals("DATE")
|| entityType.getType().equals("OTHER") || entityType.getType().equals("NUMBER")
|| entityType.getType().equals("TIME")) {
System.out.println("skipping entity: "+entityName);
System.out.println("entity has bad type: "+entityType.getType());
continue;
}
com.google.common.base.Optional entityOntType =
KBOntologyMap.getTACOntologyMap().getKBTypeForType(entityType);
// look for this entity in the KB
List wikipediaMatchingEntities = new ArrayList();
List stringMatchingEntities;
// look for matching wikipedia entities
if (wikipediaKBID != null) {
com.google.common.base.Optional
kbObject = kb.getKBObjectByExternalID(wikipediaKBID);
// if there is a matching object in the KB, put it in the list
if (kbObject.isPresent()) {
KBID adeptKBID = kbObject.get().getKBID();
KBEntity adeptEntity = kb.getEntityById(adeptKBID);
wikipediaMatchingEntities.add(adeptEntity);
System.out.println("Found matching wikipedia entity in KB: "+wikipediaKBID.getObjectID());
}
}
// get the string matching entities
stringMatchingEntities = kb.getEntitiesByStringReference(entityName);
if (stringMatchingEntities == null) {
stringMatchingEntities = new ArrayList();
}
// check if there already is an entity in the KB matching this
boolean foundMatchingKBEntity = false;
// check for wikipedia match
for (KBEntity kbMatchingEntity : wikipediaMatchingEntities) {
for (OntType possibleType : kbMatchingEntity.getTypes().keySet()) {
if (possibleType.equals(entityOntType.get())) {
entityToKBEntity.put(entityToAdd, kbMatchingEntity);
System.out.println("Found entity with matching wikipedia ID: " +
entityToAdd.getValue());
foundMatchingKBEntity = true;
break;
}
if (foundMatchingKBEntity)
break;
}
/*if (foundType != null && foundType.equals(entityOntType.get())) {
entityToKBEntity.put(entityToAdd, kbMatchingEntity);
System.out.println("Found entity with matching wikipedia ID: "+entityToAdd.getValue());
foundMatchingKBEntity = true;
break;
}*/
}
// if there was no wikipedia match check for string match
if (!foundMatchingKBEntity) {
for (KBEntity kbMatchingEntity : stringMatchingEntities) {
OntType foundType = null;
for (OntType possibleType : kbMatchingEntity.getTypes().keySet()) {
foundType = possibleType;
break;
}
if (foundType != null && foundType.equals(entityOntType.get())) {
entityToKBEntity.put(entityToAdd, kbMatchingEntity);
System.out.println("Found entity with matching canonical string: " + entityToAdd.getValue());
foundMatchingKBEntity = true;
break;
}
}
}
// if there was a matching entity found in the KB, continue and don't bother with KB load step
if (foundMatchingKBEntity) {
System.out.println("Found KB match for: "+entityName);
// if there was a matching entity found in KB, don't go through with KB load step for this entity
continue;
}
// at this point, no entity was found in KB, so attempt to load a new entity into the KB
// note there may be an entity with same wikipedia ID or same canonical string, but of different type
// an issue to resolve right now is that Wikidict does not provide a type
// so London (person) might map to London-wikipedia and London (city) might map to London-wikipedia
// the issue is wikipedia entities right now do not have a type
// hopefully this is a rare corner case
System.out.println("Attempting to load: "+entityToAdd.getCanonicalMention().getValue()+" into KB");
System.out.println("attempted entity has type: "+entityType.getType());
System.out.println("id for entity attempting to load: "+entityToAdd.getEntityId());
System.out.println("id distribution for canonical mention:");
for (long attemptedEntityID : entityToAdd.getCanonicalMention().getEntityIdDistribution().keySet()) {
System.out.println("\t"+attemptedEntityID+":"
+entityToAdd.getCanonicalMention().getEntityIdDistribution().get(attemptedEntityID));
}
ArrayList entityMentions = new ArrayList();
entityMentions.add(entityToAdd.getCanonicalMention());
KBEntity.InsertionBuilder insertionBuilder =
KBEntity.entityInsertionBuilder(entityToAdd, entityMentions, KBOntologyMap.getTACOntologyMap());
// if there was a wikipedia entity link, add it into the KB
if (wikipediaKBID != null && wikipediaKBID.getObjectID() != "O") {
System.out.println("this entity matches wikipedia entity: "+wikipediaKBID.getObjectID());
HashSet wikipediaIDSet = new HashSet();
wikipediaIDSet.add(wikipediaKBID);
insertionBuilder.addExternalKBIds(wikipediaIDSet);
}
System.out.println("inserting into KB: " + entityToAdd.getCanonicalMention().getValue());
KBEntity insertedKBEntity = insertionBuilder.insert(kb);
System.out.println("successfully inserted entity into KB: "+entityToAdd.getCanonicalMention().getValue());
entityToKBEntity.put(entityToAdd, insertedKBEntity);
}
// add all GenericThings for the relations and events found in this document
ArrayList genericThings = new ArrayList();
for (DocumentRelation dr : documentRelationsList) {
for (DocumentRelationArgument da : dr.getArguments()) {
if (da.getFiller().asGenericThing().isPresent()) {
genericThings.add(da.getFiller().asGenericThing().get());
}
}
for (DocumentEvent de : documentEventsList) {
for (DocumentEventArgument da : de.getArguments()) {
if (da.getFiller().asGenericThing().isPresent()) {
genericThings.add(da.getFiller().asGenericThing().get());
}
}
}
}
for (GenericThing gt : genericThings) {
String genericThingValue = gt.getValue();
com.google.common.base.Optional optionalGenericThingType =
KBOntologyMap.getTACOntologyMap().getKBTypeForType(gt.getType());
if (optionalGenericThingType.isPresent()) {
OntType genericThingType = optionalGenericThingType.get();
com.google.common.base.Optional optionalMatchingGenericThing =
kb.getGenericThingByTypeAndValue(genericThingType, genericThingValue);
if (optionalMatchingGenericThing.isPresent()) {
entityToKBEntity.put(gt, optionalMatchingGenericThing.get());
} else {
System.out.println("Attempting to load GenericThing into KB...");
KBGenericThing.InsertionBuilder gtBuilder =
KBGenericThing.genericThingInsertionBuilder(genericThingType, genericThingValue);
KBGenericThing kbGenericThing = gtBuilder.insert(kb);
System.out.println("Successfully loaded generic thing into KB: "+gt.getValue());
entityToKBEntity.put(gt, kbGenericThing);
}
}
}
// add all TimexValues for the events found in this document
for (DocumentEvent de : documentEventsList) {
for (DocumentEventArgument da : de.getArguments()) {
if (da.getFiller().asTemporalValue().isPresent()) {
// Stanford system works with Timex values, so this cast should be ok
TimexValue tv = (TimexValue) da.getFiller().asTemporalValue().get();
String timexString = tv.asString();
com.google.common.base.Optional optionalMatchingDate =
kb.getDateByTimex2Value(timexString);
if (optionalMatchingDate.isPresent()) {
System.out.println("Found date in KB already: "+timexString);
entityToKBEntity.put(da.getFiller().asItem().get(), optionalMatchingDate.get());
} else {
System.out.println("Attempting to load TimexValue into KB...");
KBDate.InsertionBuilder tvBuilder =
KBDate.timexInsertionBuilder(timexString);
KBDate kbDate = tvBuilder.insert(kb);
System.out.println("Successfully loaded date into KB: "+timexString);
entityToKBEntity.put(da.getFiller().asItem().get(), kbDate);
}
}
}
}
// add all NumericValues for the numeric values found in this document
ArrayList numericValues = new ArrayList();
for (DocumentRelation dr : documentRelationsList) {
for (DocumentRelationArgument da : dr.getArguments()) {
if (da.getFiller().asNumericValue().isPresent()) {
numericValues.add(da.getFiller().asNumericValue().get());
}
}
}
for (NumericValue nv : numericValues) {
Number nvAsNumber = nv.asNumber();
com.google.common.base.Optional optionalMatchingNumber =
kb.getNumberByValue(nvAsNumber);
if (optionalMatchingNumber.isPresent()) {
entityToKBEntity.put(nv,optionalMatchingNumber.get());
} else {
System.out.println("Attempting to load NumberValue into KB...");
KBNumber.InsertionBuilder nvBuilder =
KBNumber.numberInsertionBuilder(nvAsNumber);
KBNumber kbNumber = nvBuilder.insert(kb);
System.out.println("Successfully loaded number into KB: "+nvAsNumber);
entityToKBEntity.put(nv, kbNumber);
}
}
// now add the document relations
for (DocumentRelation dr : documentRelationsList) {
System.out.println("relation type for DocumentRelation: "+dr.getRelationType().getType());
KBRelation.InsertionBuilder insertionBuilder =
KBRelation.relationInsertionBuilder(dr, entityToKBEntity, KBOntologyMap.getTACOntologyMap());
System.out.println("insertion builder provenances: "+insertionBuilder.getProvenances());
KBRelation kbRelation = insertionBuilder.insert(kb);
System.out.println("successfully inserted relation into KB");
}
// now add the document events
for (DocumentEvent de : documentEventsList) {
System.out.println("event type for DocumentEvent: "+de.getEventType());
KBEvent.InsertionBuilder insertionBuilder = KBEvent.eventInsertionBuilder(de,entityToKBEntity,
KBOntologyMap.getTACOntologyMap());
System.out.println("insertion builder provenances: "+insertionBuilder.getProvenances());
KBEvent kbEvent = insertionBuilder.insert(kb);
System.out.println("successfully inserted event into KB");
}
}
public static void main(String[] args) {
// test files to load
List inputFiles = IOUtils.linesFromFile(args[0]);
// get processor config
String configFilePath = "edu/stanford/nlp/StanfordCoreNlpTesterConfig.xml";
Properties kbLoaderConfig = new Properties();
try {
DataInputStream dis = new DataInputStream(IOUtils.
getInputStreamFromURLOrClasspathOrFileSystem(configFilePath));
kbLoaderConfig.loadFromXML(dis);
} catch (InvalidPropertiesFormatException e1) {
e1.printStackTrace();
} catch (IOException e1) {
e1.printStackTrace();
}
// build processor
String stanfordCoreNlpProcessorConfig = kbLoaderConfig.getProperty("stanfordCoreNlpProcessorConfig");
// build date finding StanfordCoreNLP
Properties dateFindingProps = new Properties();
dateFindingProps.setProperty("annotators", "tokenize, cleanxml");
dateFindingProps.setProperty("clean.datetags", "DATETIME|DATE|DATELINE");
StanfordCoreNLP dateFindingPipeline = new StanfordCoreNLP(dateFindingProps);
try {
// build the processor
StanfordCoreNlpProcessor stanfordprocessor = new StanfordCoreNlpProcessor();
stanfordprocessor.activate(stanfordCoreNlpProcessorConfig);
// build the kb loader
StanfordKBLoader kbLoader = new StanfordKBLoader();
// process each document
int docCount = 0;
for (String inputFile : inputFiles) {
try {
System.out.println("Processing doc: " + inputFile + " at " + (new Date()));
// find the doc date
String rawFileContents = IOUtils.stringFromFile(inputFile);
Annotation rawFileAnnotation = new Annotation(rawFileContents);
dateFindingPipeline.annotate(rawFileAnnotation);
String docDateString = rawFileAnnotation.get(CoreAnnotations.DocDateAnnotation.class);
if (docDateString == null) {
docDateString = "";
}
SUTime.Temporal potentialDate = SUTimeSimpleParser.parse(docDateString);
System.out.println("FOUND DATE: "+potentialDate);
System.out.println("TO STRING VERSION: "+potentialDate.toString());
// build the adept doc
HltContentContainer hltContentContainer = new HltContentContainer();
Document document = DocumentMaker.getInstance().createDocument(inputFile, hltContentContainer);
// use the StanfordCoreNLP found date for the doc date
document.setCaptureDate(potentialDate.toString());
hltContentContainer = stanfordprocessor.process(document,
hltContentContainer);
kbLoader.loadDocumentContentsIntoKB(hltContentContainer);
System.out.println("done with doc: " + inputFile + " at " + (new Date()));
docCount++;
System.out.println("docs processed: " + docCount);
} catch (Exception e) {
System.out.println("---");
System.out.println("Problem with this doc: "+inputFile);
System.out.println(e);
for (StackTraceElement ste : e.getStackTrace()) {
System.out.println(ste.toString());
}
e.printStackTrace();
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy