edu.stanford.nlp.pipeline.StanfordCoreNLPClient Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.io.FileSequentialCollection;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.util.logging.StanfordRedwoodConfiguration;

import java.io.*;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import static edu.stanford.nlp.util.logging.Redwood.Util.*;

/**
 * An annotation pipeline in spirit identical to {@link StanfordCoreNLP}, but
 * with the backend supported by a web server.
 *
 * @author Gabor Angeli
 */
@SuppressWarnings("FieldCanBeLocal")
public class StanfordCoreNLPClient extends AnnotationPipeline  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(StanfordCoreNLPClient.class);

  /**
   * A simple URL spec, for parsing backend URLs.
   * Group 1 is the optional protocol ({@code http} or {@code https}), group 2 is the
   * host, and group 3 is the optional port number. The port may be left out together
   * with its leading colon, so a bare {@code host} or {@code http://host} spec matches;
   * a trailing colon without digits is still accepted and leaves group 3 unset.
   */
  private static final Pattern URL_PATTERN = Pattern.compile("(?:(https?)://)?([^:]+)(?::([0-9]+)?)?");

  /**
   * Describes how to reach one backend server.
   * The semantics of one of these objects is as follows:
   *
   * <ul>
   *   <li>It names a protocol, a hostname and a port to connect to.</li>
   *   <li>It represents ONE thread on the remote server, and the client should
   *       treat it as such.</li>
   *   <li>Two backends that are .equals() point to the same endpoint, but there can be
   *       multiple of them if we want to run multiple threads on that endpoint.</li>
   * </ul>
   */
  private static class Backend {
    /** The protocol used to talk to the server. */
    public final String protocol;
    /** The hostname of the server running the CoreNLP annotators. */
    public final String host;
    /** The port the CoreNLP server listens on. */
    public final int port;

    /** Create a backend description from its three components. */
    public Backend(String protocol, String host, int port) {
      this.protocol = protocol;
      this.host = host;
      this.port = port;
    }

    /** Two backends are equal when port, protocol and host all agree. */
    @Override
    public boolean equals(Object o) {
      if (o == this) {
        return true;
      }
      if (o instanceof Backend) {
        Backend that = (Backend) o;
        return this.port == that.port
            && this.protocol.equals(that.protocol)
            && this.host.equals(that.host);
      }
      return false;
    }

    /**
     * Always throws: equal backends are intentionally kept as separate entries
     * (one per remote thread), so they must never be placed in hash-based collections.
     */
    @Override
    public int hashCode() {
      throw new IllegalStateException("Hashing backends is dangerous!");
    }

    /** Render this backend as {@code protocol://host:port}. */
    @Override
    public String toString() {
      return new StringBuilder()
          .append(protocol)
          .append("://")
          .append(host)
          .append(":")
          .append(port)
          .toString();
    }
  }

  /**
   * A special type of {@link Thread}, which is responsible for scheduling jobs
   * on the backend.
   */
  private static class BackendScheduler extends Thread {
    /**
     * The list of backends that we can schedule on.
     * This should not generally be called directly from anywhere
     */
    public final List backends;

    /**
     * The queue on requests for the scheduler to handle.
     * Each element of this queue is a function: calling the function signals
     * that this backend is available to perform a task on the passed backend.
     * It is then obligated to call the passed Consumer to signal that it has
     * released control of the backend, and it can be used for other things.
     * Remember to lock access to this object with {@link BackendScheduler#stateLock}.
     */
    private final Queue>> queue;
    /**
     * The lock on access to {@link BackendScheduler#queue}.
     */
    private final Lock stateLock = new ReentrantLock();
    /**
     * Represents the event that an item has been added to the work queue.
     * Linked to {@link BackendScheduler#stateLock}.
     */
    private final Condition enqueued = stateLock.newCondition();
    /**
     * Represents the event that the queue has become empty, and this schedule is no
     * longer needed.
     */
    public final Condition shouldShutdown = stateLock.newCondition();

    /**
     * The queue of annotators (backends) that are free to be run on.
     * Remember to lock access to this object with {@link BackendScheduler#stateLock}.
     */
    private final Queue freeAnnotators;
    /**
     * Represents the event that an annotator has freed up and is available for
     * work on the {@link BackendScheduler#freeAnnotators} queue.
     * Linked to {@link BackendScheduler#stateLock}.
     */
    private final Condition newlyFree = stateLock.newCondition();

    /**
     * While this is true, continue running the scheduler.
     */
    private boolean doRun = true;

    /**
     * Create a new scheduler from a list of backends.
     * These can contain duplicates -- in that case, that many concurrent
     * calls can be made to that backend.
     */
    public BackendScheduler(List backends) {
      super();
      setDaemon(true);
      this.backends = backends;
      this.freeAnnotators = new LinkedList<>(backends);
      this.queue = new LinkedList<>();
    }

    /** {@inheritDoc} */
    @Override
    public void run() {
      try {
        while (doRun) {
          // Wait for a request
          BiConsumer> request;
          Backend annotator;
          stateLock.lock();
          try {
            while (queue.isEmpty()) {
              enqueued.await();
              if (!doRun) {
                return;
              }
            }
            // Get the actual request
            request = queue.poll();
            // We have a request

            // Find a free annotator
            while (freeAnnotators.isEmpty()) {
              newlyFree.await();
            }
            annotator = freeAnnotators.poll();
          } finally {
            stateLock.unlock();
          }
          // We have an annotator

          // Run the annotation
          request.accept(annotator, freedAnnotator -> {
            // ASYNC: we've freed this annotator
            // add it back to the queue and register it as available
            stateLock.lock();
            try {
              freeAnnotators.add(freedAnnotator);

              // If the queue is empty, and all the annotators have returned, we're done
              if (queue.isEmpty() && freeAnnotators.size() == backends.size()) {
                log.info("All annotations completed. Signaling for shutdown");
                shouldShutdown.signalAll();
              }

              newlyFree.signal();
            } finally {
              stateLock.unlock();
            }
          });
          // Annotator is running (in parallel, most likely)
        }
      } catch (InterruptedException e) {
        throw new RuntimeException(e);
      }
    }

    /**
     * Schedule a new job on the backend
     * @param annotate A callback, which will be called when a backend is free
     *                 to do some processing. The implementation of this callback
     *                 MUST CALL the second argument when it is done processing,
     *                 to register the backend as free for further work.
     */
    public void schedule(BiConsumer> annotate) {
      stateLock.lock();
      try {
        queue.add(annotate);
        enqueued.signal();
      } finally {
        stateLock.unlock();
      }
    }
  } // end static class BackEndScheduler

  /**
   * The path on the server to connect to.
   * It is prepended to the query string when the request URL is built; it is
   * currently the empty string.
   */
  private final String path = "";
  /** The Properties to annotate with, exactly as passed to the constructor. */
  private final Properties properties;

  /**
   * The Properties to send to the server, serialized as a JSON object.
   * Built once in the constructor and sent URL-encoded as the {@code properties}
   * query parameter of each request.
   */
  private final String propsAsJSON;

  /** The API key to authenticate with, or null. Used as the Basic-auth user name. */
  private final String apiKey;
  /** The API secret to authenticate with, or null. Used as the Basic-auth password. */
  private final String apiSecret;

  /** The scheduler to use when running on multiple backends at a time */
  private final BackendScheduler scheduler;

  /**
   * The annotation serializer responsible for translating between the wire format
   * (protocol buffers) and the {@link Annotation} classes.
   */
  private final ProtobufAnnotationSerializer serializer = new ProtobufAnnotationSerializer(true);

  /**
   * The main constructor. Create a client from a properties file and a list of backends.
   * Note that this creates at least one Daemon thread.
   *
   * The given properties are copied, the input/output format and serializer
   * entries are overridden so that the server speaks protocol buffers, and the
   * result is rendered once as a JSON object for use in every request.
   *
   * @param properties The properties file, as would be passed to {@link StanfordCoreNLP}.
   * @param backends The backends to run on. The list is copied and the copy is
   *                 shuffled, so the caller's list is left untouched.
   * @param apiKey The key to authenticate with as a username
   * @param apiSecret The key to authenticate with as a password
   */
  private StanfordCoreNLPClient(Properties properties, List<Backend> backends,
                                String apiKey, String apiSecret) {
    // Save the constructor variables
    this.properties = properties;
    Properties serverProperties = new Properties();
    for (String key : properties.stringPropertyNames()) {
      serverProperties.setProperty(key, properties.getProperty(key));
    }
    // Shuffle a private copy: the argument may be shared or unmodifiable
    List<Backend> shuffledBackends = new ArrayList<>(backends);
    Collections.shuffle(shuffledBackends, new Random(System.currentTimeMillis()));
    this.scheduler = new BackendScheduler(shuffledBackends);
    this.apiKey = apiKey;
    this.apiSecret = apiSecret;

    // Set required serverProperties
    serverProperties.setProperty("inputFormat", "serialized");
    serverProperties.setProperty("outputFormat", "serialized");
    serverProperties.setProperty("inputSerializer", ProtobufAnnotationSerializer.class.getName());
    serverProperties.setProperty("outputSerializer", ProtobufAnnotationSerializer.class.getName());

    // Render every property as a JSON map element and wrap them in a JSON object
    this.propsAsJSON = serverProperties.stringPropertyNames().stream()
        .map(key -> "\"" + JSONOutputter.cleanJSON(key) + "\": \""
            + JSONOutputter.cleanJSON(serverProperties.getProperty(key)) + "\"")
        .collect(Collectors.joining(", ", "{ ", " }"));

    // Start 'er up
    this.scheduler.start();
  }


  /**
   * The main constructor without credentials.
   * Delegates to the four-argument constructor with a null API key and secret.
   *
   * @param properties The properties file, as would be passed to {@link StanfordCoreNLP}.
   * @param backends The backends to run on.
   * @see StanfordCoreNLPClient#StanfordCoreNLPClient(Properties, List, String, String)
   */
  private StanfordCoreNLPClient(Properties properties, List<Backend> backends) {
    this(properties, backends, null, null);
  }


  /**
   * Run the client, pulling the connection details from the environment.
   * The following environment variables are read:
   *
   * <ul>
   *   <li>CORENLP_HOST -- required; the port is 80 when the value starts with
   *       {@code http://} and 443 otherwise.</li>
   *   <li>CORENLP_KEY -- optional; passed on as the API key, null when unset.</li>
   *   <li>CORENLP_SECRET -- optional; passed on as the API secret, null when unset.</li>
   * </ul>
   *
   * A single thread is used on the backend.
   *
   * @param properties The properties file, as would be passed to {@link StanfordCoreNLP}.
   * @throws IllegalStateException Thrown if CORENLP_HOST is not set.
   */
  @SuppressWarnings("unused")
  public StanfordCoreNLPClient(Properties properties) throws IllegalStateException {
    this(properties,
        Optional.ofNullable(System.getenv("CORENLP_HOST"))
            .orElseThrow(() -> new IllegalStateException("Environment variable CORENLP_HOST not specified")),
        Optional.ofNullable(System.getenv("CORENLP_HOST"))
            .filter(hostSpec -> hostSpec.startsWith("http://"))
            .map(hostSpec -> 80)
            .orElse(443),
        1,
        System.getenv("CORENLP_KEY"),
        System.getenv("CORENLP_SECRET"));
  }


  /**
   * Run on a single backend, with one thread and without authentication.
   *
   * @param properties The properties file, as would be passed to {@link StanfordCoreNLP}.
   * @param host The server host, optionally prefixed with {@code http://} or {@code https://}.
   * @param port The server port.
   * @see #StanfordCoreNLPClient(Properties, String, int, int)
   */
  @SuppressWarnings("unused")
  public StanfordCoreNLPClient(Properties properties, String host, int port) {
    this(properties, host, port, 1);
  }

  /**
   * Run on a single backend with one thread, with authentication.
   *
   * @param properties The properties file, as would be passed to {@link StanfordCoreNLP}.
   * @param host The server host, optionally prefixed with {@code http://} or {@code https://}.
   * @param port The server port.
   * @param apiKey The key to authenticate with as a username, or null.
   * @param apiSecret The key to authenticate with as a password, or null.
   * @see #StanfordCoreNLPClient(Properties, String, int, int, String, String)
   */
  @SuppressWarnings("unused")
  public StanfordCoreNLPClient(Properties properties, String host, int port,
                               String apiKey, String apiSecret) {
    this(properties, host, port, 1, apiKey, apiSecret);
  }


  /**
   * Run on a single backend with one thread, with authentication, inferring the
   * port from the host: 80 when the host starts with {@code http://}, 443 otherwise.
   *
   * @param properties The properties file, as would be passed to {@link StanfordCoreNLP}.
   * @param host The server host, optionally prefixed with {@code http://} or {@code https://}.
   *             Must not be null.
   * @param apiKey The key to authenticate with as a username, or null.
   * @param apiSecret The key to authenticate with as a password, or null.
   * @see #StanfordCoreNLPClient(Properties, String, int, int, String, String)
   */
  @SuppressWarnings("unused")
  public StanfordCoreNLPClient(Properties properties, String host,
                               String apiKey, String apiSecret) {
    this(properties, host, host.startsWith("http://") ? 80 : 443, 1, apiKey, apiSecret);
  }

  /**
   * Run on a single backend, but with k threads on that backend, without authentication.
   *
   * @param properties The properties file, as would be passed to {@link StanfordCoreNLP}.
   * @param host The server host, optionally prefixed with {@code http://} or {@code https://}.
   * @param port The server port.
   * @param threads The number of concurrent requests to allow against the backend.
   * @see #StanfordCoreNLPClient(Properties, String, int, int, String, String)
   */
  @SuppressWarnings("unused")
  public StanfordCoreNLPClient(Properties properties, String host, int port, int threads) {
    this(properties, host, port, threads, null, null);
  }


  /**
   * Run on a single backend, but with k threads on each backend, and with authentication
   *
   * @see StanfordCoreNLPClient (Properties, List)
   */
  public StanfordCoreNLPClient(Properties properties, String host, int port, int threads,
                               String apiKey, String apiSecret) {
    this(properties, new ArrayList() {{
      for (int i = 0; i < threads; ++i) {
        add(new Backend(host.startsWith("http://") ? "http" : "https",
            host.startsWith("http://") ? host.substring("http://".length()) : (host.startsWith("https://") ? host.substring("https://".length()) : host),
            port));
      }
    }},
    apiKey, apiSecret);
  }

  /**
   * {@inheritDoc}
   *
   * This method creates an async call to the server, and blocks until the server
   * has finished annotating the object. If the waiting thread is interrupted, the
   * method logs the fact, restores the interrupt flag and returns early.
   */
  @Override
  public void annotate(Annotation annotation) {
    final Lock lock = new ReentrantLock();
    final Condition annotationDone = lock.newCondition();
    // Guarded by lock. Records completion so that a callback which fires before
    // we start waiting is not lost, and so that spurious wakeups are ignored.
    final boolean[] finished = {false};
    annotate(Collections.singleton(annotation), 1, ignored -> {
      lock.lock();
      try {
        finished[0] = true;
        annotationDone.signal();
      } finally {
        lock.unlock();
      }
    });
    lock.lock();
    try {
      // Only wait for one callback to complete; only annotating one document
      while (!finished[0]) {
        annotationDone.await();
      }
    } catch (InterruptedException e) {
      log.info("Interrupt while waiting for annotation to return");
      Thread.currentThread().interrupt();
    } finally {
      lock.unlock();
    }
  }

  /**
   * This method fires off one request to the server per annotation. As each
   * request finishes, the provided callback is called with that annotation.
   * The method itself returns as soon as all requests have been scheduled.
   *
   * @param annotations The input annotations to process
   * @param numThreads The number of threads to run on. IGNORED in this class.
   * @param callback A function to be called when an annotation finishes.
   */
  @Override
  public void annotate(final Iterable<Annotation> annotations, int numThreads, final Consumer<Annotation> callback) {
    for (Annotation annotation : annotations) {
      annotate(annotation, callback);
    }
  }


  /**
   * The canonical entry point of the client annotator.
   * Schedules a job which, once a backend is free, starts a new thread that
   * serializes the annotation, sends it to the server and copies the response back.
   * If anything goes wrong, the annotation is run through a local
   * {@link StanfordCoreNLP} pipeline built from the same properties instead.
   *
   * @param annotation The annotation to annotate.
   * @param callback Called when the server has returned an annotated document.
   *                 The input to this callback is the same as the passed Annotation object.
   */
  public void annotate(final Annotation annotation, final Consumer<Annotation> callback) {
    scheduler.schedule((backend, isFinishedCallback) -> new Thread(() -> {
      try {
        // 1. Create the input
        // 1.1 Create a protocol buffer
        final byte[] message;
        try (ByteArrayOutputStream os = new ByteArrayOutputStream()) {
          serializer.write(annotation, os);
          message = os.toByteArray();
        }
        // 1.2 Create the query params
        String queryParams = String.format(
            "properties=%s",
            URLEncoder.encode(StanfordCoreNLPClient.this.propsAsJSON, "utf-8"));

        // 2. Create a connection
        URL serverURL = new URL(backend.protocol, backend.host,
            backend.port,
            StanfordCoreNLPClient.this.path + '?' + queryParams);

        // 3. Do the annotation (retries internally, throws once it gives up)
        doAnnotation(annotation, backend, serverURL, message, 0);
      } catch (Throwable t) {
        log.warn("Could not annotate via server! Trying to annotate locally...", t);
        StanfordCoreNLP corenlp = new StanfordCoreNLP(properties);
        corenlp.annotate(annotation);
      } finally {
        try {
          callback.accept(annotation);
        } finally {
          // Always hand the backend back, even if the user callback threw;
          // otherwise the scheduler would lose this backend for good.
          isFinishedCallback.accept(backend);
        }
      }
    }).start());
  }


  /**
   * Actually try to perform the annotation on the server side.
   * A failed attempt is logged and retried; once the attempt counter has reached 3
   * the failure is rethrown wrapped in a {@link RuntimeException}. Starting from
   * {@code tries == 0} this gives one initial attempt plus up to 3 retries.
   *
   * @param annotation The annotation we need to fill.
   * @param backend The backend we are querying against.
   * @param serverURL The URL of the server we are hitting.
   * @param message The message we are sending the server (don't need to recompute each retry).
   * @param tries The number of times we've tried already.
   * @throws RuntimeException If the last permitted attempt fails; the cause is the original error.
   */
  @SuppressWarnings("unchecked")
  private void doAnnotation(Annotation annotation, Backend backend, URL serverURL, byte[] message, int tries) {
    for (int attempt = tries; ; ++attempt) {
      try {
        // 1. Set up the connection
        URLConnection connection = serverURL.openConnection();
        // 1.1 Set authentication
        if (apiKey != null && apiSecret != null) {
          String userpass = apiKey + ":" + apiSecret;
          // Encode with a fixed charset so the header does not depend on the platform default
          String basicAuth = "Basic "
              + Base64.getEncoder().encodeToString(userpass.getBytes(StandardCharsets.UTF_8));
          connection.setRequestProperty("Authorization", basicAuth);
        }
        // 1.2 Set some protocol-independent properties
        connection.setDoOutput(true);
        connection.setRequestProperty("Content-Type", "application/x-protobuf");
        connection.setRequestProperty("Content-Length", Integer.toString(message.length));
        connection.setRequestProperty("Accept-Charset", "utf-8");
        connection.setRequestProperty("User-Agent", StanfordCoreNLPClient.class.getName());
        // 1.3 Set some protocol-dependent properties
        switch (backend.protocol) {
          case "https":
          case "http":
            ((HttpURLConnection) connection).setRequestMethod("POST");
            break;
          default:
            throw new IllegalStateException("Haven't implemented protocol: " + backend.protocol);
        }

        // 2. Annotate
        // 2.1. Fire off the request
        connection.connect();
        try (OutputStream out = connection.getOutputStream()) {
          out.write(message);
          out.flush();
        }
        // 2.2 Await a response
        // -- It might be possible to send more than one message, but we are not going to do that.
        Annotation response;
        try (InputStream in = connection.getInputStream()) {
          response = serializer.read(in).first;
        }
        // 2.3. Copy response over to original annotation
        for (Class key : response.keySet()) {
          annotation.set(key, response.get(key));
        }
        return;
      } catch (Throwable t) {
        // 3. We encountered an error -- retry, unless we have run out of attempts
        if (attempt < 3) {
          log.warn(t);
        } else {
          throw new RuntimeException(t);
        }
      }
    }
  }


  /**
   * Wraps the given text in a fresh {@link Annotation}, runs it through the
   * pipeline (blocking until done), and hands the annotation back.
   *
   * @param text The text to process
   * @return An Annotation object containing the output of all annotators
   */
  public Annotation process(String text) {
    final Annotation document = new Annotation(text);
    this.annotate(document);
    return document;
  }


  /**
   * Runs an interactive shell where input text is processed with the given pipeline.
   * Every non-empty line read from the console is annotated and printed to
   * standard output in the format named by the {@code outputFormat} property
   * (default {@code text}). Serialized output is printed as text instead.
   *
   * @param pipeline The pipeline to be used
   * @throws IOException If IO problem with stdin
   */
  private static void shell(StanfordCoreNLPClient pipeline) throws IOException {
    log.info("Entering interactive shell. Type q RETURN or EOF to quit.");
    // Upper-case with a fixed locale so enum lookup does not depend on the default locale
    final StanfordCoreNLP.OutputFormat outputFormat = StanfordCoreNLP.OutputFormat.valueOf(
        pipeline.properties.getProperty("outputFormat", "text").toUpperCase(Locale.ROOT));
    IOUtils.console("NLP> ", line -> {
      if ( ! line.isEmpty()) {
        Annotation anno = pipeline.process(line);
        try {
          switch (outputFormat) {
            case XML:
              new XMLOutputter().print(anno, System.out);
              break;
            case JSON:
              new JSONOutputter().print(anno, System.out);
              System.out.println();
              break;
            case CONLL:
              new CoNLLOutputter().print(anno, System.out);
              System.out.println();
              break;
            case TEXT:
              new TextOutputter().print(anno, System.out);
              break;
            case SERIALIZED:
              warn("You probably cannot read the serialized output, so printing in text instead");
              new TextOutputter().print(anno, System.out);
              break;
            default:
              throw new IllegalArgumentException("Cannot output in format " + outputFormat + " from the interactive shell");
          }
        } catch (IOException e) {
          throw new RuntimeIOException(e);
        }
      }
    });
  }

  /**
   * The implementation of what to run on a command-line call of StanfordCoreNLPClient.
   * Depending on the properties, this processes:
   *
   * <ul>
   *   <li>{@code file} / {@code textFile}: a single file or a directory of files;</li>
   *   <li>{@code filelist}: every file named in the list, where an entry that is a
   *       directory is expanded to the files inside it;</li>
   *   <li>otherwise: lines typed into the interactive shell.</li>
   * </ul>
   *
   * @throws IOException If any IO problem
   */
  public void run() throws IOException {
    StanfordRedwoodConfiguration.minimalSetup();
    // Upper-case with a fixed locale so enum lookup does not depend on the default locale
    StanfordCoreNLP.OutputFormat outputFormat = StanfordCoreNLP.OutputFormat.valueOf(
        properties.getProperty("outputFormat", "text").toUpperCase(Locale.ROOT));

    //
    // Process one file or a directory of files
    //
    if (properties.containsKey("file") || properties.containsKey("textFile")) {
      String fileName = properties.getProperty("file");
      if (fileName == null) {
        fileName = properties.getProperty("textFile");
      }
      Collection<File> files = new FileSequentialCollection(new File(fileName), properties.getProperty("extension"), true);
      StanfordCoreNLP.processFiles(null, files, 1, properties, this::annotate,
          StanfordCoreNLP.createOutputter(properties, new AnnotationOutputter.Options()), outputFormat);
    }

    //
    // Process a list of files
    //
    else if (properties.containsKey("filelist")){
      String fileName = properties.getProperty("filelist");
      Collection<File> inputFiles = StanfordCoreNLP.readFileList(fileName);
      Collection<File> files = new ArrayList<>(inputFiles.size());
      for (File file : inputFiles) {
        if (file.isDirectory()) {
          // Expand the listed directory itself, not the file that contains the list
          files.addAll(new FileSequentialCollection(file, properties.getProperty("extension"), true));
        } else {
          files.add(file);
        }
      }
      StanfordCoreNLP.processFiles(null, files, 1, properties, this::annotate,
          StanfordCoreNLP.createOutputter(properties, new AnnotationOutputter.Options()), outputFormat);
    }

    //
    // Run the interactive shell
    //
    else {
      shell(this);
    }
  }

  /**
   * <p>
   *   Good practice to call after you are done with this object.
   *   Blocks until the request queue is empty and every backend has been handed
   *   back, then tells the scheduler thread to stop.
   * </p>
   *
   * <p>
   *   NOTE(review): the scheduler runs as a daemon thread, so if this is not called,
   *   jobs which have been scheduled but not yet run are presumably dropped when
   *   the JVM exits.
   * </p>
   *
   * @throws InterruptedException If the calling thread is interrupted while waiting.
   */
  public void shutdown() throws InterruptedException {
    final BackendScheduler sched = this.scheduler;
    sched.stateLock.lock();
    try {
      for (;;) {
        boolean idle = sched.queue.isEmpty()
            && sched.freeAnnotators.size() == sched.backends.size();
        if (idle) {
          break;
        }
        // Re-check at least every five seconds, even without a signal
        sched.shouldShutdown.await(5, TimeUnit.SECONDS);
      }
      sched.doRun = false;
      // In case the thread's waiting on this condition
      sched.enqueued.signalAll();
    } finally {
      sched.stateLock.unlock();
    }
  }


  /**
   * This can be used just for testing or for command-line text processing.
   * It builds a client from the command-line properties, runs it (see
   * {@link #run()}) and finally waits for pending work before shutting down.
   * <p>
   * Example usage:
   * <pre>
   * java -mx6g edu.stanford.nlp.pipeline.StanfordCoreNLPClient -props properties -backends site1:port1,site2:port2
   * </pre>
   *    or just -host name -port number
   * <p>
   * When neither {@code -backends} nor {@code -host} is given, the backend defaults
   * to {@code http://localhost:9000}. A backend without an explicit protocol uses
   * http; a backend without an explicit port uses 443 for https and 80 otherwise.
   *
   * @param args List of required properties
   * @throws java.io.IOException If IO problem
   * @throws ClassNotFoundException If class loading problem
   * @throws IllegalArgumentException If no backend specification could be parsed
   */
  public static void main(String[] args) throws IOException, ClassNotFoundException {
    //
    // process the arguments
    //
    // extract all the properties from the command line
    Properties props = StringUtils.argsToProperties(args);
    boolean hasH = props.containsKey("h");
    boolean hasHelp = props.containsKey("help");
    if (hasH || hasHelp) {
      String helpValue = hasH ? props.getProperty("h") : props.getProperty("help");
      StanfordCoreNLP.printHelp(System.err, helpValue);
      return;
    }

    // Work out the backend specification to parse
    String defaultBack = "http://localhost:9000";
    if (props.getProperty("backends") == null) {
      String host = props.getProperty("host");
      String port = props.getProperty("port");
      if (host != null) {
        defaultBack = (port != null) ? host + ':' + port : host;
      }
    }
    String backendSpecs = props.getProperty("backends", defaultBack);

    // Create the backends
    List<Backend> backends = new ArrayList<>();
    for (String spec : backendSpecs.split(",")) {
      Matcher matcher = URL_PATTERN.matcher(spec.trim());
      if (!matcher.matches()) {
        // Tell the user instead of silently dropping the backend
        log.warn("Ignoring backend specification that could not be parsed: " + spec);
        continue;
      }
      String protocol = matcher.group(1);
      if (protocol == null) {
        protocol = "http";
      }
      String host = matcher.group(2);
      String portStr = matcher.group(3);
      final int port;
      if (portStr != null) {
        port = Integer.parseInt(portStr);
      } else {
        // Same defaults as the public constructors: 443 for https, 80 for http
        port = "https".equals(protocol) ? 443 : 80;
      }
      backends.add(new Backend(protocol, host, port));
    }
    if (backends.isEmpty()) {
      // Without a backend every annotation request would wait forever
      throw new IllegalArgumentException("No usable backend found in specification: " + backendSpecs);
    }
    log.info("Using backends: " + backends);

    // Run the pipeline
    StanfordCoreNLPClient client = new StanfordCoreNLPClient(props, backends);
    client.run();
    try {
      client.shutdown();  // In case anything is pending on the server
    } catch (InterruptedException e) {
      // We are exiting anyway; just keep the interrupt status visible
      Thread.currentThread().interrupt();
    }
  } // end main()

}