All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.ner.nltk.NLTKNERecogniser Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.ner.nltk;

import org.apache.tika.parser.ner.NERecogniser;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Set;
import java.util.HashSet;
import java.util.Collection;
import java.util.Map;
import java.util.HashMap;
import java.util.Properties;
import java.util.Iterator;
import java.util.Locale;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;

import org.apache.cxf.jaxrs.client.WebClient;

/**
 *  This class offers an implementation of {@link NERecogniser} based on
 *  ne_chunk() module of NLTK. This NER requires additional setup,
 *  due to Http requests to an endpoint server that runs NLTK.
 *  See 
 *
 */
public class NLTKNERecogniser implements NERecogniser {

    private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
    private static boolean available = false;
    private static final String NLTK_REST_HOST = "http://localhost:8881";
    private String restHostUrlStr;
     /**
     * some common entities identified by NLTK
     */
    public static final Set ENTITY_TYPES = new HashSet(){{
        add("NAMES");
    }};


    public NLTKNERecogniser(){
        try {

            String restHostUrlStr="";
            try {
                restHostUrlStr = readRestUrl();
            } catch (IOException e) {
                LOG.warn("Can't read rest url", e);
            }

            if (restHostUrlStr == null || restHostUrlStr.equals("")) {
                this.restHostUrlStr = NLTK_REST_HOST;
            } else {
                this.restHostUrlStr = restHostUrlStr;
            }

            Response response = WebClient.create(restHostUrlStr).accept(MediaType.TEXT_HTML).get();
            int responseCode = response.getStatus();
            if(responseCode == 200){
                available = true;
            }
            else{
                LOG.info("NLTKRest Server is not running");
            }

        } catch (Exception e) {
            LOG.warn(e.getMessage(), e);
        }
    }

    private static String readRestUrl() throws IOException {
        Properties nltkProperties = new Properties();
        nltkProperties.load(NLTKNERecogniser.class
                .getResourceAsStream("NLTKServer.properties"));

        return nltkProperties.getProperty("nltk.server.url");
    }

    /**
     * @return {@code true} if server endpoint is available.
     * returns {@code false} if server endpoint is not avaliable for service.
     */
    public boolean isAvailable() {
        return available;
    }

    /**
     * Gets set of entity types recognised by this recogniser
     * @return set of entity classes/types
     */
    public Set getEntityTypes() {
        return ENTITY_TYPES;
    }

    /**
     * recognises names of entities in the text
     * @param text text which possibly contains names
     * @return map of entity type -> set of names
     */
    public Map> recognise(String text) {
        Map> entities = new HashMap<>();
        try {
            String url = restHostUrlStr + "/nltk";
            Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text);
            int responseCode = response.getStatus();
            if (responseCode == 200) {
                String result = response.readEntity(String.class);
                JSONParser parser = new JSONParser();
                JSONObject j = (JSONObject) parser.parse(result);
                Iterator keys = j.keySet().iterator();
                while( keys.hasNext() ) {
                    String key = (String)keys.next();
                    if ( !key.equals("result") ) {
                        ENTITY_TYPES.add(key);
                        entities.put(key.toUpperCase(Locale.ENGLISH), new HashSet((Collection) j.get(key)));
                    }
                }
            }
        }
        catch (Exception e) {
            LOG.debug(e.getMessage(), e);
        }

        return entities;
    }


}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy