/*
 * edu.washington.cs.knowitall.nlp.ChunkedSentencePattern
 * (from the reverb-core artifact; Maven / Gradle / Ivy listing)
 *
 * reverb-core: a regular-expression based Open IE relation extractor.
 * Note: there is a newer version of this artifact: 1.4.3
 */
package edu.washington.cs.knowitall.nlp;

import java.io.IOException;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.washington.cs.knowitall.logic.ArgFactory;
import edu.washington.cs.knowitall.logic.LogicExpression;
import edu.washington.cs.knowitall.logic.Expression.Arg;
import edu.washington.cs.knowitall.regex.Expression;
import edu.washington.cs.knowitall.regex.ExpressionFactory;
import edu.washington.cs.knowitall.regex.Match;
import edu.washington.cs.knowitall.regex.RegularExpression;

public class ChunkedSentencePattern {
    /***
     * This class compiles regular expressions over the ChunkedSentenceTokens in
     * a sentence into an NFA. There is a lot of redundancy in their
     * expressiveness. This is largely because it supports pattern matching on
     * the fields This is not necessary but is an optimization and a shorthand
     * (i.e. {@code  is equivalent to "}
     * and {@code (?: | )}.
     * 
     * Here are some equivalent examples:
     * 

     *  {@code * +}
     * 
 {@code * +}
     * 
 {@code * +}
     * 
 {@code * (?: | )+}
     * 
     * Note that (3) and (4) are not preferred for efficiency reasons. Regex OR
     * (in example (4)) should only be used on multi-ChunkedSentenceToken
     * sequences.
     * 
     * The Regular Expressions support named groups (: ... ), unnamed
     * groups (?: ... ), and capturing groups ( ... ). The operators allowed are
     * +, ?, *, and |. The Logic Expressions (that describe each
     * ChunkedSentenceToken) allow grouping "( ... )", not '!', or '|', and and
     * '&'.
     *
     * @param regex
     * @return
     */
    public static RegularExpression compile(String regex) {
        return RegularExpression.compile(regex,
                new ExpressionFactory() {

                    @Override
                    public Expression.BaseExpression create(
                            final String expression) {
                        final Pattern valuePattern = Pattern
                                .compile("([\"'])(.*)\\1");
                        return new Expression.BaseExpression(
                                expression) {
                            private final LogicExpression logic;

                            {
                                this.logic = LogicExpression.compile(
                                        expression,
                                        new ArgFactory() {
                                            @Override
                                            public Arg create(
                                                    final String argument) {
                                                return new Arg() {
                                                    private final ChunkedSentenceToken.Expression expression;

                                                    {
                                                        String[] parts = argument
                                                                .split("=");

                                                        String base = parts[0];

                                                        Matcher matcher = valuePattern
                                                                .matcher(parts[1]);
                                                        if (!matcher.matches()) {
                                                            throw new IllegalArgumentException(
                                                                    "Value not enclosed in quotes (\") or ('): "
                                                                            + argument);
                                                        }
                                                        String string = matcher
                                                                .group(2);

                                                        if (base.equalsIgnoreCase("stringCS")) {
                                                            this.expression = new ChunkedSentenceToken.StringExpression(
                                                                    string, 0);
                                                        } else if (base
                                                                .equalsIgnoreCase("string")) {
                                                            this.expression = new ChunkedSentenceToken.StringExpression(
                                                                    string);
                                                        } else if (base
                                                                .equalsIgnoreCase("pos")) {
                                                            this.expression = new ChunkedSentenceToken.PosTagExpression(
                                                                    string);
                                                        } else if (base
                                                                .equalsIgnoreCase("chunk")) {
                                                            this.expression = new ChunkedSentenceToken.ChunkTagExpression(
                                                                    string);
                                                        } else {
                                                            throw new IllegalStateException(
                                                                    "unknown argument specified: "
                                                                            + base);
                                                        }
                                                    }

                                                    @Override
                                                    public boolean apply(
                                                            ChunkedSentenceToken entity) {
                                                        return this.expression
                                                                .apply(entity);
                                                    }
                                                };
                                            }
                                        });
                            }

                            @Override
                            public boolean apply(ChunkedSentenceToken entity) {
                                return logic.apply(entity);
                            }
                        };
                    }
                });
    }

    public static void main(String[] args) throws ChunkerException, IOException {
        System.out.println("Compiling the expression... ");
        RegularExpression expression = ChunkedSentencePattern
                .compile(args[0]);
        System.out.println(expression);
        OpenNlpSentenceChunker chunker = new OpenNlpSentenceChunker();

        System.out
                .println("Please enter a sentence to match with the above expression.");
        Scanner scan = new Scanner(System.in);
        while (scan.hasNextLine()) {
            String line = scan.nextLine();
            ChunkedSentence chunked = chunker.chunkSentence(line);
            Match match = expression
                    .match(ChunkedSentenceToken.tokenize(chunked));
            if (match != null) {
                System.out.println(match.groups().get(0));
            }
        }
    }
}