All downloads are free. Search and download functionality uses the official Maven repository.

semRewrite.substitutor.ExternalSubstitutor Maven / Gradle / Ivy

Go to download

Natural language processing toolbox using Sigma knowledge engineering system.

There is a newer version: 1.1
Show newest version
/*
Copyright 2014-2015 IPsoft

Author: Andrei Holub [email protected]

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program ; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston,
MA  02111-1307 USA 
*/
package semRewrite.substitutor;

import com.articulate.sigma.*;
import edu.stanford.nlp.ling.CoreLabel;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;

/** *************************************************************
 * Substitutor whose groups come from an external named-entity
 * recognition (NER) service.  For a sentence given as a list of tokens it
 * obtains a segmentation (from a local cache file or from the remote
 * service addressed by the NERURL environment variable), tells the SUMO
 * KB about the recognized entities, and registers every multi-token
 * segment as a group via addGroups().
 */
public class ExternalSubstitutor extends SimpleSubstitutorStorage {

    /** index given to the next token created by Segmentation.addStringSegment() */
    public int tokenCounter = 1; // number of token being added

    /** *************************************************************
     * One NER result: the token sequences found in a sentence and,
     * in a parallel list, the type reported for each sequence.
     */
    public class Segmentation {
        public ArrayList<semRewrite.substitutor.CoreLabelSequence> segments = new ArrayList<>();
        public ArrayList<String> types = new ArrayList<>();

        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append("segments: ").append(segments.toString());
            sb.append("types: ").append(types.toString());
            return sb.toString();
        }

        /** *********************************************************
         * Add a segment given as a string without token numbers.  The
         * string has its punctuation removed, is split on single spaces,
         * and each piece becomes a CoreLabel numbered from the enclosing
         * instance's tokenCounter, which is advanced for every piece.
         */
        public void addStringSegment(String s) {

            List<CoreLabel> tokens = new ArrayList<>();
            String[] words = StringUtil.removePunctuation(s).split(" ");
            for (String word : words) {
                CoreLabel cl = new CoreLabel();
                cl.setOriginalText(word);
                cl.setValue(word);
                cl.setIndex(tokenCounter);
                tokenCounter++;
                tokens.add(cl);
            }
            segments.add(new semRewrite.substitutor.CoreLabelSequence(tokens));
        }
    }

    // cache of remote system responses, keyed by the "+"-joined sentence
    public static Map<String, Segmentation> cache = new HashMap<>();

    /** *************************************************************
     * @param labels the tokens of a single sentence; a trailing token
     *               whose string form starts with ".-" is removed from
     *               this list
     */
    public ExternalSubstitutor(List<CoreLabel> labels) {

        initialize(labels);
    }

    /** *************************************************************
     * @return the path of the persistent cache file,
     *         $SIGMA_HOME/KBs/nerCache.json
     */
    private static String cachePath() {

        return System.getenv("SIGMA_HOME") + File.separator + "KBs" +
                File.separator + "nerCache.json";
    }

    /** *************************************************************
     * @return o as a JSONArray, or null if it is absent or of another type
     */
    private static JSONArray asArray(Object o) {

        return (o instanceof JSONArray) ? (JSONArray) o : null;
    }

    /** *************************************************************
     * Report a structurally unusable response and create the exception
     * that parseNERJson() uses to signal it.
     */
    private static ParseException badJson(String json) {

        System.out.println("ExternalSubstitutor.parseNERJson(): bad JSON: " + json);
        return new ParseException(0);
    }

    /** **************************************************************
     * Parse a response of the remote NER service.  The key really is
     * spelled "annotatations"; only the first annotation is used.
     * {
     * "query": "blah blah blah",
     *      "annotatations": [
     *          {
     *              "types": [
     *                  "...",
     *                  ...
     *              ],
     *              "segmentation": [
     *                  "...",
     *                  ...
     *              ],
     *          }
     *      ],
     * }
     * Resets tokenCounter to 1 before numbering the tokens.
     * @return the segmentation, or null if the text is not valid JSON or
     *         lacks the structure shown above
     */
    private Segmentation parseNERJson(String json) {

        try {
            Object obj = new JSONParser().parse(json);
            if (!(obj instanceof JSONObject))
                throw badJson(json);
            JSONObject jsonObject = (JSONObject) obj;
            System.out.println("ExternaSubstitutor.parseNERJson(): Reading JSON file: " + jsonObject);

            JSONArray anno = asArray(jsonObject.get("annotatations"));
            System.out.println("ExternalSubstitutor.parseNERJson(): annotations: " + anno);
            if (anno == null || anno.isEmpty() || !(anno.get(0) instanceof JSONObject))
                throw badJson(json);
            JSONObject first = (JSONObject) anno.get(0);
            JSONArray typeArray = asArray(first.get("types"));
            JSONArray segArray = asArray(first.get("segmentation"));
            if (typeArray == null || segArray == null)
                throw badJson(json);

            Segmentation result = new Segmentation();
            for (Object type : typeArray)
                result.types.add((String) type);
            tokenCounter = 1;
            for (Object segment : segArray)
                result.addStringSegment((String) segment);
            return result;
        }
        catch (ParseException pe) {
            System.out.println("Error in ExternalSubstitutor.parseNERJson(): parseException: " + json);
            pe.printStackTrace();
            return null;
        }
    }

    /** *************************************************************
     * Load a file consisting of previous responses from the remote NER
     * server.  Does nothing if the in-memory cache already has entries or
     * the file does not exist.  Any failure is reported and otherwise
     * ignored, leaving whatever entries were read so far.
     */
    private void loadCache() {

        if (!cache.isEmpty())
            return;  // don't load the cache multiple times
        String resourcePath = cachePath();
        File f = new File(resourcePath);
        if (!f.exists())
            return;
        try (FileReader fr = new FileReader(f)) {
            System.out.println("ExternaSubstitutor.loadCache(): Reading JSON file: " + resourcePath);
            JSONObject jsonObject = (JSONObject) new JSONParser().parse(fr);
            JSONArray entries = (JSONArray) jsonObject.get("cache");

            for (Object entry : entries) {
                JSONObject oneQuery = (JSONObject) entry;
                String query = (String) oneQuery.get("query");
                JSONArray types = (JSONArray) oneQuery.get("types");
                JSONArray segmentation = (JSONArray) oneQuery.get("segmentation");
                Segmentation seg = new Segmentation();
                tokenCounter = 1;  // token numbering restarts for every cached sentence
                for (Object segment : segmentation)
                    seg.addStringSegment((String) segment);
                for (Object type : types)
                    seg.types.add((String) type);
                cache.put(query, seg);
            }
        }
        catch (Exception e) {
            System.out.println("Parse exception reading: " + resourcePath);
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
    }

    /** *************************************************************
     * Write the whole in-memory cache to the cache file, replacing its
     * previous content.  The JSON text is built before the file is opened
     * so that a failure while serializing cannot truncate the file.
     */
    @SuppressWarnings("unchecked") // json-simple's JSONObject and JSONArray are raw collections
    private void saveCache() {

        String resourcePath = cachePath();
        try {
            JSONArray cacheJson = new JSONArray();
            for (Map.Entry<String, Segmentation> entry : cache.entrySet()) {
                Segmentation seg = entry.getValue();
                JSONArray seglist = new JSONArray();
                for (semRewrite.substitutor.CoreLabelSequence segment : seg.segments) {
                    StringBuilder oneseg = new StringBuilder();
                    for (CoreLabel cl : segment.getLabels()) {
                        if (!StringUtil.emptyString(oneseg.toString()))
                            oneseg.append(" ");
                        oneseg.append(cl.value());
                    }
                    seglist.add(oneseg.toString());
                }
                JSONArray typelist = new JSONArray();
                typelist.addAll(seg.types);

                JSONObject queryAndNER = new JSONObject();
                queryAndNER.put("query", entry.getKey());
                queryAndNER.put("segmentation", seglist);
                queryAndNER.put("types", typelist);
                cacheJson.add(queryAndNER);
            }
            JSONObject jsonObject = new JSONObject();
            jsonObject.put("cache", cacheJson);
            String text = jsonObject.toJSONString();

            try (FileWriter fw = new FileWriter(new File(resourcePath))) {
                System.out.println("Writing JSON file: " + resourcePath);
                fw.write(text);
                fw.flush();
            }
        }
        catch (Exception e) {
            System.out.println("Exception writing: " + resourcePath);
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
    }

    /** *************************************************************
     * Join the values of the labels with "+".  The result is the key
     * under which a sentence is stored in the cache.
     */
    private static String toURLSentence(List<CoreLabel> labels) {

        StringBuilder sb = new StringBuilder();
        for (CoreLabel cl : labels) {
            if (sb.length() > 0)
                sb.append("+");
            sb.append(cl.value());
        }
        return sb.toString();
    }

    /** *************************************************************
     * Like toURLSentence() but with every token value URL-encoded, so
     * that characters such as '&', '#' or '%' inside a token cannot alter
     * the structure of the request URL.
     */
    private static String toEncodedQuery(List<CoreLabel> labels) throws UnsupportedEncodingException {

        StringBuilder sb = new StringBuilder();
        for (CoreLabel cl : labels) {
            if (sb.length() > 0)
                sb.append("+");
            sb.append(java.net.URLEncoder.encode(String.valueOf(cl.value()), "UTF-8"));
        }
        return sb.toString();
    }

    /** *************************************************************
     * convert a string consisting of space-delimited words into a
     * list of CoreLabel, one per word, with only the value set
     */
    private static List<CoreLabel> stringToCoreLabelList(String s) {

        List<CoreLabel> result = new ArrayList<>();
        for (String word : s.split(" ")) {
            CoreLabel cl = new CoreLabel();
            cl.setValue(word);
            result.add(cl);
        }
        return result;
    }

    /** *************************************************************
     * convert a string consisting of space-delimited words into a
     * CoreLabelSequence
     */
    private static semRewrite.substitutor.CoreLabelSequence stringToCoreLabelSeq(String s) {

        return new semRewrite.substitutor.CoreLabelSequence(stringToCoreLabelList(s));
    }

    /** *************************************************************
     * @return a map from the upper-cased form of each segment with more
     *         than one token to the segment itself
     */
    private Map<semRewrite.substitutor.CoreLabelSequence, semRewrite.substitutor.CoreLabelSequence>
            segToCoreLabel(Segmentation seg) {

        Map<semRewrite.substitutor.CoreLabelSequence, semRewrite.substitutor.CoreLabelSequence> result =
                new HashMap<>();
        for (semRewrite.substitutor.CoreLabelSequence cls : seg.segments) {
            if (cls.size() > 1)
                result.put(cls.toUpperCase(), cls);
        }
        return result;
    }

    /** *************************************************************
     * For every segment whose type neither contains "UNKNOWN" nor starts
     * with "ACTIONS": register it as a WordNet multi-word if it has more
     * than one token, and tell the SUMO KB that it is an instance of its
     * type.  The KB caches are rebuilt afterwards.
     * FIXME: A hack to use previous code for adding term formats to
     * WordNet
     * FIXME: Only gets used for multi words or certain types?
     */
    private void addNERtoWN(Segmentation seg) {

        KB kb = KBmanager.getMgr().getKB("SUMO");
        // segments and types are parallel lists; never index past the shorter one
        int count = Math.min(seg.segments.size(), seg.types.size());
        if (count < seg.segments.size())
            System.out.println("Error in ExternalSubstitutor.addNERtoWN(): " + seg.segments.size() +
                    " segments but only " + seg.types.size() + " types");
        for (int i = 0; i < count; i++) {
            semRewrite.substitutor.CoreLabelSequence noPunc = seg.segments.get(i).removePunctuation();
            System.out.println("Info in ExternalSubstitutor.addNERtoWN(): noPunc: " + noPunc);
            String type = seg.types.get(i);
            if (type == null || type.contains("UNKNOWN") || type.startsWith("ACTIONS"))
                continue;
            String wnWord = noPunc.toWordNetID(); // replace space with underscore
            System.out.println("Info in ExternalSubstitutor.addNERtoWN(): wnWord: " + wnWord);
            if (wnWord == null || wnWord.isEmpty())
                continue;  // nothing left to name a term after
            String SUMOterm = Character.toUpperCase(wnWord.charAt(0)) + wnWord.substring(1);
            System.out.println("Info in ExternalSubstitutor.addNERtoWN(): SUMOterm: " + SUMOterm);
            if (noPunc.size() > 1)
                WordNet.wn.multiWords.addMultiWord(wnWord);
            String f = "(instance " + SUMOterm + " " + type + ")";
            System.out.println("Info in ExternalSubstitutor.addNERtoWN(): formula: " + f);
            kb.tell(f);
        }
        kb.kbCache.buildCaches();
    }

    /** *************************************************************
     * Given a list of tokens from a single sentence, collects information from a remote
     * service about continuous noun sequences like "Garry Bloom",
     * "Tim Buk Tu".  A cached response is used when there is one; a fresh
     * response is added to the cache and the cache file is rewritten.
     * Returns without adding any groups if the list is empty, NERURL is
     * not set, the service cannot be reached or its response is unusable.
     * @param labels the sentence tokens; a trailing token whose string
     *               form starts with ".-" is removed from this list
     */
    private void initialize(List<CoreLabel> labels) {

        if (labels == null || labels.isEmpty()) {
            System.out.println("Error in ExternalSubstitutor.initialize(): empty label list");
            return;
        }
        if (labels.get(labels.size()-1).toString().startsWith(".-"))
            labels.remove(labels.size()-1);
        System.out.println("Info in ExternalSubstitutor.initialize(): labels: " + labels);
        Segmentation seg = null;
        loadCache();
        String nerurl = "";
        try {
            String sentence = toURLSentence(labels);
            if (cache.containsKey(sentence)) {
                seg = cache.get(sentence);
                System.out.println("Info in ExternaSubstitutor.initialize(): Seg: " + seg);
            }
            else {
                System.out.println("Info in ExternalSubstitutor.initialize(): sentence: " + sentence);
                String base = System.getenv("NERURL");
                if (base == null) {
                    System.out.println("Error in ExternalSubstitutor.initialize(): NERURL environment variable is not set");
                    return;
                }
                nerurl = base + "query=" + toEncodedQuery(labels) + "&limit=1";
                System.out.println("Info in ExternalSubstitutor.initialize(): URL: " + nerurl);
                URLConnection yc = new URL(nerurl).openConnection();

                StringBuilder response = new StringBuilder();
                try (BufferedReader in = new BufferedReader(new InputStreamReader(
                        yc.getInputStream(), "UTF-8"))) {
                    String inputLine;
                    while ((inputLine = in.readLine()) != null) {
                        System.out.println("Info in ExternalSubstitutor.initialize(): inputLine: " + inputLine);
                        response.append(inputLine);
                    }
                }
                String JSONString = response.toString();
                System.out.println("Info in ExternalSubstitutor.initialize(): JSON: " + JSONString);
                seg = parseNERJson(JSONString);
                if (seg == null)
                    return;
                System.out.println("Info in ExternaSubstitutor.initialize(): " + seg);
                cache.put(sentence,seg);
                saveCache();
            }
            addNERtoWN(seg);
        }
        catch (MalformedURLException e) {
            System.out.println("Error in ExternaSubstitutor.initialize(): new URL() failed: " + nerurl);
            e.printStackTrace();
            return;
        }
        catch (IOException e) {
            System.out.println("Error in ExternaSubstitutor.initialize(): IOException: " + nerurl);
            System.out.println(e.getMessage());
            return;
        }
        Map<semRewrite.substitutor.CoreLabelSequence, semRewrite.substitutor.CoreLabelSequence> groupsFull =
                segToCoreLabel(seg);
        // create structures like {[UPTOWN-5, FUNK-6]=[Uptown-5, Funk-6], [APPLE-8, MUSIC-9]=[Apple-8, Music-9]}
        addGroups(groupsFull);
        System.out.println("Info in ExternaSubstitutor.initialize(): result: " + groupsFull);
    }

    /** **************************************************************
     * Run the substitutor on a fixed example sentence.
     */
    public static void main(String[] args) throws IOException {

        new ExternalSubstitutor(stringToCoreLabelList("i want to watch the game of thrones"));
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy