All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.StanfordKBLoader Maven / Gradle / Ivy

The newest version!
package edu.stanford.nlp;

import adept.common.*;
import adept.kbapi.*;
import adept.utilities.DocumentMaker;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.time.SUTimeSimpleParser;
import edu.stanford.nlp.time.SUTime;

import java.io.*;
import java.util.InvalidPropertiesFormatException;
import java.util.*;

/**
 * Loads the contents of a processed {@link HltContentContainer} — entities,
 * generic things, dates, numbers, relations and events — into an ADEPT KB.
 * Also provides a {@link #main} driver that runs the Stanford CoreNLP ADEPT
 * processor over a list of input documents and loads each result.
 */
public class StanfordKBLoader {

    // connection settings loaded from the on-classpath KBParameters.xml
    KBParameters kbParameters;
    // live handle to the ADEPT knowledge base; stays null if the connection failed
    KB kb;

    /**
     * Connects to the ADEPT KB described by {@code adept/kbapi/KBParameters.xml}.
     * On failure the exception is logged and {@link #kb} remains null, so later
     * load calls will fail; callers should watch the log for connection errors.
     */
    public StanfordKBLoader() {
        try {
            kbParameters = new KBParameters("adept/kbapi/KBParameters.xml");
            kb = new KB(kbParameters);
            System.out.println("done connecting to KB!");
        } catch (Exception e) {
            System.out.println("Issue with establishing KB connection!");
            e.printStackTrace();
        }
    }

    /** load everything discovered in this document into the KB **/
    public void loadDocumentContentsIntoKB(HltContentContainer document) throws
            IOException, KBQueryException, KBUpdateException {
        // load document text
        System.out.println("loading document text for: "+document.getDocumentId());
        kb.saveDocumentText(document.getDocumentId(), "stanford-kb-docs",
                document.getDocument().getValue());
        System.out.println("doc text: "+document.getDocument().getValue());
        System.out.println("done loading text for: "+document.getDocumentId());
        // process entities first; uses the first (and, for this pipeline,
        // only) coreference partition in the container
        List<Entity> entityList = document.getCoreferences().get(0).getEntities();
        List<DocumentRelation> documentRelationsList = document.getDocumentRelations();
        List<DocumentEvent> documentEventsList = document.getDocumentEvents();
        // maps every document-level item (entity, generic thing, date, number)
        // to the KB object it was resolved to or inserted as; consumed by the
        // relation and event insertion builders
        Map<Item, KBPredicateArgument> entityToKBEntity = new HashMap<>();
        loadEntities(document, entityList, entityToKBEntity);
        loadGenericThings(documentRelationsList, documentEventsList, entityToKBEntity);
        loadTimexValues(documentEventsList, entityToKBEntity);
        loadNumericValues(documentRelationsList, entityToKBEntity);
        loadRelations(documentRelationsList, entityToKBEntity);
        loadEvents(documentEventsList, entityToKBEntity);
    }

    /**
     * Resolves each coreference entity against the KB (first by Wikipedia ID,
     * then by canonical-string match) and inserts a new KB entity when no
     * type-compatible match exists. Resolved/inserted entities are recorded in
     * {@code entityToKBEntity}.
     */
    private void loadEntities(HltContentContainer document, List<Entity> entityList,
            Map<Item, KBPredicateArgument> entityToKBEntity)
            throws KBQueryException, KBUpdateException {
        System.out.println("---");
        System.out.println("Entities in this HltContentContainer: "+document.getDocumentId());
        // log all the entity names in this doc
        for (Entity potentialEntity : entityList) {
            System.out.println("\t"+potentialEntity.getCanonicalMention().getValue());
        }
        // load all entities
        for (Entity entityToAdd : entityList) {
            EntityMention canonicalMentionForEntity = entityToAdd.getCanonicalMention();
            String entityName = canonicalMentionForEntity.getValue();
            // if the entity mention maps to a Wikipedia entity, link to that;
            // null-guard first: a missing attribute means "no wiki link"
            // (the original NPE'd on entity mentions without the attribute)
            String wikipediaAttribute = canonicalMentionForEntity.getAttribute("wikipediaEntity");
            KBID wikipediaKBID = null;
            if (wikipediaAttribute != null && !wikipediaAttribute.equals("*NO-WIKI-ENTITY*")) {
                wikipediaKBID = new KBID(wikipediaAttribute, "wikipedia");
            }
            // get the entity type; skip types the TAC ontology cannot represent
            IType entityType = entityToAdd.getEntityType();
            if (entityType.getType().equals("UNKNOWN") || entityType.getType().equals("DATE")
                    || entityType.getType().equals("OTHER") || entityType.getType().equals("NUMBER")
                    || entityType.getType().equals("TIME")) {
                System.out.println("skipping entity: "+entityName);
                System.out.println("entity has bad type: "+entityType.getType());
                continue;
            }
            com.google.common.base.Optional<OntType> entityOntType =
                    KBOntologyMap.getTACOntologyMap().getKBTypeForType(entityType);
            // BUGFIX: the original called entityOntType.get() unguarded below,
            // throwing IllegalStateException for unmapped types; skip instead
            if (!entityOntType.isPresent()) {
                System.out.println("skipping entity: "+entityName);
                System.out.println("no TAC ontology mapping for type: "+entityType.getType());
                continue;
            }
            // look for matching wikipedia entities in the KB
            List<KBEntity> wikipediaMatchingEntities = new ArrayList<>();
            if (wikipediaKBID != null) {
                com.google.common.base.Optional<KBPredicateArgument> kbObject =
                        kb.getKBObjectByExternalID(wikipediaKBID);
                // if there is a matching object in the KB, put it in the list
                if (kbObject.isPresent()) {
                    KBID adeptKBID = kbObject.get().getKBID();
                    KBEntity adeptEntity = kb.getEntityById(adeptKBID);
                    wikipediaMatchingEntities.add(adeptEntity);
                    System.out.println("Found matching wikipedia entity in KB: "+wikipediaKBID.getObjectID());
                }
            }
            // get the string matching entities
            List<KBEntity> stringMatchingEntities = kb.getEntitiesByStringReference(entityName);
            if (stringMatchingEntities == null) {
                stringMatchingEntities = new ArrayList<>();
            }
            // check if there already is a type-compatible entity in the KB
            boolean foundMatchingKBEntity = false;
            // check for wikipedia match; labeled break replaces the original's
            // dead "if (foundMatchingKBEntity) break;" after an unconditional break
            wikipediaMatch:
            for (KBEntity kbMatchingEntity : wikipediaMatchingEntities) {
                for (OntType possibleType : kbMatchingEntity.getTypes().keySet()) {
                    if (possibleType.equals(entityOntType.get())) {
                        entityToKBEntity.put(entityToAdd, kbMatchingEntity);
                        System.out.println("Found entity with matching wikipedia ID: " +
                                entityToAdd.getValue());
                        foundMatchingKBEntity = true;
                        break wikipediaMatch;
                    }
                }
            }
            // if there was no wikipedia match check for string match
            if (!foundMatchingKBEntity) {
                for (KBEntity kbMatchingEntity : stringMatchingEntities) {
                    // first listed type stands in for the entity's primary type
                    OntType foundType = null;
                    for (OntType possibleType : kbMatchingEntity.getTypes().keySet()) {
                        foundType = possibleType;
                        break;
                    }
                    if (foundType != null && foundType.equals(entityOntType.get())) {
                        entityToKBEntity.put(entityToAdd, kbMatchingEntity);
                        System.out.println("Found entity with matching canonical string: " + entityToAdd.getValue());
                        foundMatchingKBEntity = true;
                        break;
                    }
                }
            }
            // if there was a matching entity found in the KB, skip the load step
            if (foundMatchingKBEntity) {
                System.out.println("Found KB match for: "+entityName);
                continue;
            }
            // at this point, no entity was found in KB, so attempt to load a new entity.
            // NOTE(review): an entity with the same wikipedia ID or canonical string but a
            // different type may still exist — Wikidict provides no type, so e.g.
            // London (person) and London (city) can both map to London-wikipedia;
            // hopefully this is a rare corner case
            System.out.println("Attempting to load: "+entityToAdd.getCanonicalMention().getValue()+" into KB");
            System.out.println("attempted entity has type: "+entityType.getType());
            System.out.println("id for entity attempting to load: "+entityToAdd.getEntityId());
            System.out.println("id distribution for canonical mention:");
            Map<Long, Float> idDistribution = entityToAdd.getCanonicalMention().getEntityIdDistribution();
            for (long attemptedEntityID : idDistribution.keySet()) {
                System.out.println("\t"+attemptedEntityID+":"
                        +idDistribution.get(attemptedEntityID));
            }
            List<EntityMention> entityMentions = new ArrayList<>();
            entityMentions.add(entityToAdd.getCanonicalMention());
            KBEntity.InsertionBuilder insertionBuilder =
                    KBEntity.entityInsertionBuilder(entityToAdd, entityMentions, KBOntologyMap.getTACOntologyMap());
            // if there was a wikipedia entity link, add it into the KB.
            // BUGFIX: the original compared Strings with != ("O" sentinel check),
            // which compares references and was effectively always true
            if (wikipediaKBID != null && !"O".equals(wikipediaKBID.getObjectID())) {
                System.out.println("this entity matches wikipedia entity: "+wikipediaKBID.getObjectID());
                Set<KBID> wikipediaIDSet = new HashSet<>();
                wikipediaIDSet.add(wikipediaKBID);
                insertionBuilder.addExternalKBIds(wikipediaIDSet);
            }
            System.out.println("inserting into KB: " + entityToAdd.getCanonicalMention().getValue());
            KBEntity insertedKBEntity = insertionBuilder.insert(kb);
            System.out.println("successfully inserted entity into KB: "+entityToAdd.getCanonicalMention().getValue());
            entityToKBEntity.put(entityToAdd, insertedKBEntity);
        }
    }

    /**
     * Collects the GenericThing arguments of all relations AND all events, then
     * resolves each against the KB (by type and value), inserting when absent.
     */
    private void loadGenericThings(List<DocumentRelation> documentRelationsList,
            List<DocumentEvent> documentEventsList,
            Map<Item, KBPredicateArgument> entityToKBEntity)
            throws KBQueryException, KBUpdateException {
        List<GenericThing> genericThings = new ArrayList<>();
        for (DocumentRelation dr : documentRelationsList) {
            for (DocumentRelationArgument da : dr.getArguments()) {
                if (da.getFiller().asGenericThing().isPresent()) {
                    genericThings.add(da.getFiller().asGenericThing().get());
                }
            }
        }
        // BUGFIX: this event loop was nested inside the relation loop above, so
        // event generic things were collected once per relation (duplicates) and
        // never collected at all when the document had no relations
        for (DocumentEvent de : documentEventsList) {
            for (DocumentEventArgument da : de.getArguments()) {
                if (da.getFiller().asGenericThing().isPresent()) {
                    genericThings.add(da.getFiller().asGenericThing().get());
                }
            }
        }
        for (GenericThing gt : genericThings) {
            String genericThingValue = gt.getValue();
            com.google.common.base.Optional<OntType> optionalGenericThingType =
                    KBOntologyMap.getTACOntologyMap().getKBTypeForType(gt.getType());
            // generic things without an ontology mapping are silently skipped
            if (optionalGenericThingType.isPresent()) {
                OntType genericThingType = optionalGenericThingType.get();
                com.google.common.base.Optional<KBGenericThing> optionalMatchingGenericThing =
                        kb.getGenericThingByTypeAndValue(genericThingType, genericThingValue);
                if (optionalMatchingGenericThing.isPresent()) {
                    entityToKBEntity.put(gt, optionalMatchingGenericThing.get());
                } else {
                    System.out.println("Attempting to load GenericThing into KB...");
                    KBGenericThing.InsertionBuilder gtBuilder =
                            KBGenericThing.genericThingInsertionBuilder(genericThingType, genericThingValue);
                    KBGenericThing kbGenericThing = gtBuilder.insert(kb);
                    System.out.println("Successfully loaded generic thing into KB: "+gt.getValue());
                    entityToKBEntity.put(gt, kbGenericThing);
                }
            }
        }
    }

    /**
     * Resolves the temporal (Timex) arguments of all events against the KB by
     * Timex2 value, inserting new KB dates when absent.
     */
    private void loadTimexValues(List<DocumentEvent> documentEventsList,
            Map<Item, KBPredicateArgument> entityToKBEntity)
            throws KBQueryException, KBUpdateException {
        for (DocumentEvent de : documentEventsList) {
            for (DocumentEventArgument da : de.getArguments()) {
                if (da.getFiller().asTemporalValue().isPresent()) {
                    // Stanford system works with Timex values, so this cast should be ok
                    TimexValue tv = (TimexValue) da.getFiller().asTemporalValue().get();
                    String timexString = tv.asString();
                    com.google.common.base.Optional<KBDate> optionalMatchingDate =
                            kb.getDateByTimex2Value(timexString);
                    if (optionalMatchingDate.isPresent()) {
                        System.out.println("Found date in KB already: "+timexString);
                        entityToKBEntity.put(da.getFiller().asItem().get(), optionalMatchingDate.get());
                    } else {
                        System.out.println("Attempting to load TimexValue into KB...");
                        KBDate.InsertionBuilder tvBuilder =
                                KBDate.timexInsertionBuilder(timexString);
                        KBDate kbDate = tvBuilder.insert(kb);
                        System.out.println("Successfully loaded date into KB: "+timexString);
                        entityToKBEntity.put(da.getFiller().asItem().get(), kbDate);
                    }
                }
            }
        }
    }

    /**
     * Resolves the numeric arguments of all relations against the KB by value,
     * inserting new KB numbers when absent.
     */
    private void loadNumericValues(List<DocumentRelation> documentRelationsList,
            Map<Item, KBPredicateArgument> entityToKBEntity)
            throws KBQueryException, KBUpdateException {
        List<NumericValue> numericValues = new ArrayList<>();
        for (DocumentRelation dr : documentRelationsList) {
            for (DocumentRelationArgument da : dr.getArguments()) {
                if (da.getFiller().asNumericValue().isPresent()) {
                    numericValues.add(da.getFiller().asNumericValue().get());
                }
            }
        }
        for (NumericValue nv : numericValues) {
            Number nvAsNumber = nv.asNumber();
            com.google.common.base.Optional<KBNumber> optionalMatchingNumber =
                    kb.getNumberByValue(nvAsNumber);
            if (optionalMatchingNumber.isPresent()) {
                entityToKBEntity.put(nv, optionalMatchingNumber.get());
            } else {
                System.out.println("Attempting to load NumberValue into KB...");
                KBNumber.InsertionBuilder nvBuilder =
                        KBNumber.numberInsertionBuilder(nvAsNumber);
                KBNumber kbNumber = nvBuilder.insert(kb);
                System.out.println("Successfully loaded number into KB: "+nvAsNumber);
                entityToKBEntity.put(nv, kbNumber);
            }
        }
    }

    /**
     * Inserts all document relations, wiring their arguments through the
     * previously-built item-to-KB-object map.
     */
    private void loadRelations(List<DocumentRelation> documentRelationsList,
            Map<Item, KBPredicateArgument> entityToKBEntity)
            throws KBQueryException, KBUpdateException {
        for (DocumentRelation dr : documentRelationsList) {
            System.out.println("relation type for DocumentRelation: "+dr.getRelationType().getType());
            KBRelation.InsertionBuilder insertionBuilder =
                    KBRelation.relationInsertionBuilder(dr, entityToKBEntity, KBOntologyMap.getTACOntologyMap());
            System.out.println("insertion builder provenances: "+insertionBuilder.getProvenances());
            insertionBuilder.insert(kb);
            System.out.println("successfully inserted relation into KB");
        }
    }

    /**
     * Inserts all document events, wiring their arguments through the
     * previously-built item-to-KB-object map.
     */
    private void loadEvents(List<DocumentEvent> documentEventsList,
            Map<Item, KBPredicateArgument> entityToKBEntity)
            throws KBQueryException, KBUpdateException {
        for (DocumentEvent de : documentEventsList) {
            System.out.println("event type for DocumentEvent: "+de.getEventType());
            KBEvent.InsertionBuilder insertionBuilder = KBEvent.eventInsertionBuilder(de, entityToKBEntity,
                    KBOntologyMap.getTACOntologyMap());
            System.out.println("insertion builder provenances: "+insertionBuilder.getProvenances());
            insertionBuilder.insert(kb);
            System.out.println("successfully inserted event into KB");
        }
    }

    /**
     * Driver: reads a list of document paths from {@code args[0]}, runs the
     * Stanford CoreNLP ADEPT processor on each document, and loads the results
     * into the KB. Per-document failures are logged and processing continues.
     */
    public static void main(String[] args) {

        // test files to load
        List<String> inputFiles = IOUtils.linesFromFile(args[0]);

        // get processor config; try-with-resources closes the stream (the
        // original leaked a pointless DataInputStream wrapper), and a single
        // IOException catch covers InvalidPropertiesFormatException (a subclass)
        String configFilePath = "edu/stanford/nlp/StanfordCoreNlpTesterConfig.xml";
        Properties kbLoaderConfig = new Properties();
        try (InputStream configStream =
                IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(configFilePath)) {
            kbLoaderConfig.loadFromXML(configStream);
        } catch (IOException e1) {
            e1.printStackTrace();
        }

        // build processor
        String stanfordCoreNlpProcessorConfig = kbLoaderConfig.getProperty("stanfordCoreNlpProcessorConfig");
        // build date finding StanfordCoreNLP: tokenize + cleanxml only, so the
        // document date can be read from the DATETIME/DATE/DATELINE tags
        Properties dateFindingProps = new Properties();
        dateFindingProps.setProperty("annotators", "tokenize, cleanxml");
        dateFindingProps.setProperty("clean.datetags", "DATETIME|DATE|DATELINE");
        StanfordCoreNLP dateFindingPipeline = new StanfordCoreNLP(dateFindingProps);
        try {
            // build the processor
            StanfordCoreNlpProcessor stanfordprocessor = new StanfordCoreNlpProcessor();
            stanfordprocessor.activate(stanfordCoreNlpProcessorConfig);
            // build the kb loader
            StanfordKBLoader kbLoader = new StanfordKBLoader();
            // process each document
            int docCount = 0;
            for (String inputFile : inputFiles) {
                try {
                    System.out.println("Processing doc: " + inputFile + " at " + (new Date()));
                    // find the doc date
                    String rawFileContents = IOUtils.stringFromFile(inputFile);
                    Annotation rawFileAnnotation = new Annotation(rawFileContents);
                    dateFindingPipeline.annotate(rawFileAnnotation);
                    String docDateString = rawFileAnnotation.get(CoreAnnotations.DocDateAnnotation.class);
                    if (docDateString == null) {
                        docDateString = "";
                    }
                    SUTime.Temporal potentialDate = SUTimeSimpleParser.parse(docDateString);
                    System.out.println("FOUND DATE: "+potentialDate);
                    System.out.println("TO STRING VERSION: "+potentialDate.toString());
                    // build the adept doc
                    HltContentContainer hltContentContainer = new HltContentContainer();
                    Document document = DocumentMaker.getInstance().createDocument(inputFile, hltContentContainer);
                    // use the StanfordCoreNLP found date for the doc date
                    document.setCaptureDate(potentialDate.toString());
                    hltContentContainer = stanfordprocessor.process(document,
                            hltContentContainer);
                    kbLoader.loadDocumentContentsIntoKB(hltContentContainer);
                    System.out.println("done with doc: " + inputFile + " at " + (new Date()));
                    docCount++;
                    System.out.println("docs processed: " + docCount);
                } catch (Exception e) {
                    // keep processing the remaining docs even if one fails;
                    // printStackTrace alone replaces the original's redundant
                    // manual stack-trace loop
                    System.out.println("---");
                    System.out.println("Problem with this doc: "+inputFile);
                    e.printStackTrace();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy