gate.creole.annic.lucene.SubQueryParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gate-core Show documentation
Show all versions of gate-core Show documentation
GATE - general achitecture for text engineering - is open source
software capable of solving almost any text processing problem. This
artifact enables you to embed the core GATE Embedded with its essential
dependencies. You will able to use the GATE Embedded API and load and
store GATE XML documents. This artifact is the perfect dependency for
CREOLE plugins or for applications that need to customize the GATE
dependencies due to confict with their own dependencies or for lower
footprint.
The newest version!
/*
* SubQueryParser.java
*
* Niraj Aswani, 19/March/07
*
* $Id: SubQueryParser.html,v 1.0 2007/03/19 16:22:01 niraj Exp $
*/
package gate.creole.annic.lucene;
import java.io.*;
import java.util.*;
import gate.creole.ir.SearchException;
/**
* This class behaves as a helper class to the QueryParser and provides
* various methods which are called from various methods of QueryParser.
*
* @author niraj
*/
public class SubQueryParser {
public static void main(String[] args) {
try {
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
while(true) {
System.out.print("Query: ");
String line = in.readLine();
if(line.length() == -1) break;
List queries = parseQuery(line);
for(int i = 0; i < queries.size(); i++) {
System.out.println("=>" + queries.get(i));
}
}
}
catch(Exception e) {
e.printStackTrace();
}
}
/**
* Method retrieves wild card characters after the closing bracket.
*/
private static String findWildCardString(int brClPos, String query) {
String wcs = "";
if(brClPos + 1 < query.length()) {
if(query.charAt(brClPos + 1) == '*' || query.charAt(brClPos + 1) == '+' || query.charAt(brClPos + 1) == '?') {
wcs = query.charAt(brClPos + 1) + "";
// ok so lets fetch the number
for(int i = brClPos + 2; i < query.length(); i++) {
if(Character.isDigit(query.charAt(i))) {
wcs += query.charAt(i);
}
else {
break;
}
}
}
}
return wcs;
}
/**
* This method, interprets the wild cards and convert query
* accordingly. For example: (A)+3 is converted into ((A) | ((A)(A)) |
* ((A)(A)(A)))
*/
private static String extractWildcards(String query) throws SearchException {
outer: while(true) {
char ch = ' ', pre = ' ';
for(int i = 0; i < query.length(); i++) {
pre = ch;
ch = query.charAt(i);
// check if it is an open bracket
// it is if it doesn't follow the '\' escape sequence
if(isOpenBracket(ch, pre)) {
// so find out where it gets closed
int brClPos = findBracketClosingPosition(i + 1, query);
if(brClPos == -1) {
throw new SearchException("unbalanced brackets",
"a closing bracket ()) is missing for this opening bracket", query, i);
}
String wildCardString = findWildCardString(brClPos, query);
int wcsLen = 0;
boolean atLeastOne = false;
// at least once
int repeatClause = 1;
if(wildCardString.length() != 0) {
if(wildCardString.length() == 1) {
// if there is only wildcard char sign
// we consider it as 1
wcsLen = 1;
}
else {
atLeastOne = (wildCardString.charAt(0) == '*' || wildCardString.charAt(0) == '?') ? false : true;
// now find out the number of Times we need to
// duplicate the bracketClause
repeatClause = Integer.parseInt(wildCardString.substring(1,
wildCardString.length()));
wcsLen = wildCardString.length();
}
String previous = query.substring(0, i);
String after = query
.substring(brClPos + wcsLen + 1, query.length());
String sToRepeat = query.substring(i, brClPos + 1);
String newString = "(";
for(int k = 1; k <= repeatClause; k++) {
newString += "(";
for(int subK = 0; subK < k; subK++) {
newString += sToRepeat;
}
newString += ")";
if(k + 1 <= repeatClause) {
newString += " | ";
}
}
if(!atLeastOne) {
newString += "| {__o__}";
}
newString += ")";
query = previous + newString + after;
continue outer;
}
}
}
// if we are here
// that means no whildcard left
return query;
}
}
/**
* this method parses the query and returns the different queries
* converted into the OR normalized form
* for e.g. ({A}|{B}){C}
* this will be converted into ({A}{C}) | ({B}{C})
* and the arrayList consists of
* 1. {A}{C}
* 2. {B}{C}
*/
public static List parseQuery(String q1) throws SearchException {
// arraylist to return - will contain all the OR normalized queries
List queries = new ArrayList();
// remove all extra spaces from the query
q1 = q1.trim();
// we add opening and closing brackets explicitly
q1 = "( " + q1 + " )";
q1 = extractWildcards(q1);
// add the main Query in the arraylist
queries.add(q1);
for(int index = 0; index < queries.size(); index++) {
// get the query to be parsed
String query = queries.get(index);
// current character and the previous character
char ch = ' ', pre = ' ';
// if query is ORed
// we need duplication
// for example: {A}({B}|{C})
// the normalized form will be
// {A}{B}
// {A}{C}
// here we need {A} to be duplicated two times
boolean duplicated = false;
int dupliSize = 0;
String data = "";
// we need to look into one query at a time and parse it
for(int i = 0; i < query.length(); i++) {
pre = ch;
ch = query.charAt(i);
// check if it is an open bracket
// it is if it doesn't follow the '\' escape sequence
if(isOpenBracket(ch, pre)) {
// so find out where it gets closed
int brClPos = findBracketClosingPosition(i + 1, query);
if(brClPos == -1) {
throw new SearchException("unbalanced brackets",
"a closing bracket ()) is missing for this opening bracket", query, i);
}
// see if there are any OR operators in it
ArrayList orTokens = findOrTokens(query.substring(i + 1, brClPos));
// orTokens will have
// for eg. {A} | ({B}{C})
// then {A}
// and ({B}{C})
// so basically findOrTokens find out all the tokens around
// | operator
if(orTokens.size() > 1) {
String text = "";
// data contains all the buffered character before the
// current positions
// for example "ABC" ({B} | {C})
// here "ABC" will be in data
// and {B} and {C} in orTokens
if(!duplicated && data.length() > 0) {
text = data;
data = "";
}
else {
if(index == queries.size() - 1) {
// this is the case where we would select the
// text as ""
text = "";
}
else {
text = queries.get(queries.size() - 1);
}
}
// so we need to duplicate the text orTokens.size()
// times
// for example "ABC" ({B} | {C})
// text = "ABC"
// orTokens {B} {C}
// so two queries will be added
// 1. "ABC"
// 2. "ABC"
queries = duplicate(queries, text, dupliSize, orTokens.size());
// and tokens will be added
// 1. "ABC" {B}
// 2. "ABC" {C}
queries = writeTokens(orTokens, queries, dupliSize);
// text is duplicated so make it true
duplicated = true;
// and how many times it was duplicated
if(dupliSize == 0) dupliSize = 1;
dupliSize *= orTokens.size();
}
else {
// what if the there is only one element between ( and )
// it is not an 'OR' query
// check how many times we have duplicated the text
if(dupliSize == 0) {
// if zero and the text buffered is ""
// we simply add "" as a separate Query
// otherwise add the buffered data as a separate
// Query
if(data.length() == 0)
queries.add("");
else queries.add(data);
// because we simply needs to add it only once
// but still we have copied it as a separate query
// so say duplicated = true
duplicated = true;
data = "";
// and ofcourse the size of the duplication will be
// only 1
dupliSize = 1;
}
// and we need to add all the contents between two
// brackets in the last duplicated
// queries
queries = writeStringInAll(query.substring(i + 1, brClPos),
dupliSize, queries);
}
i = brClPos;
}
else if(isClosingBracket(ch, pre)) {
throw new SearchException("unbalanced brackets",
"a opening bracket (() is missing for this closing bracket", query, i);
}
else {
if(duplicated) {
queries = writeCharInAll(ch, dupliSize, queries);
}
else {
data += "" + ch;
}
}
}
boolean scan = scanQueryForOrOrBracket(query);
if(scan) {
queries.remove(index);
index--;
}
}
ArrayList queriesToReturn = new ArrayList();
for(int i = 0; i < queries.size(); i++) {
String q = queries.get(i);
if(q.trim().length() == 0) {
continue;
}
else if(queriesToReturn.contains(q.trim())) {
continue;
}
else {
queriesToReturn.add(q.trim());
}
}
return queriesToReturn;
}
/**
* This method checks if query has either | or ( in it.
*/
public static boolean scanQueryForOrOrBracket(String query) {
int index = 0;
int index1 = 0;
do {
index = query.indexOf('|', index);
if(index == 0) {
return true;
}
else if(index > 0) {
// we have found it but we need to check if it is an escape
// sequence
if(query.charAt(index - 1) == '\\') {
// yes it is an escape sequence
// lets search for the next one
}
else {
return true;
}
}
// if we are here that means it was not found
index1 = query.indexOf('(', index1);
if(index1 == 0) {
return true;
}
else if(index1 > 0) {
// we have found it
if(query.charAt(index1 - 1) == '\\') {
// yes it is an escape sequence
continue;
}
else {
return true;
}
}
} while(index >= 0 && index1 >= 0);
return false;
}
/**
* This is a helper method that helps in duplicating the provided tokens.
*/
private static List writeTokens(List tokens, List queries,
int dupliSize) {
if(dupliSize == 0) dupliSize = 1;
ArrayList qToRemove = new ArrayList();
for(int j = 0; j < dupliSize; j++) {
for(int i = 1; i <= tokens.size(); i++) {
String token = tokens.get(i - 1);
if(token.trim().equals("{__o__}")) {
token = " ";
}
String s = queries
.get(queries.size() - (j * tokens.size() + i));
qToRemove.add(s);
s += token;
queries.set(queries.size() - (j * tokens.size() + i), s);
}
}
// and now remove
for(int i = 0; i < qToRemove.size(); i++) {
queries.remove(qToRemove.get(i));
}
return queries;
}
/**
* This is a helper method that helps in duplicating the provided tokens.
*/
private static List duplicate(List queries, String s, int dupliSize,
int no) {
if(s == null) s = "";
List strings = new ArrayList();
if(dupliSize == 0) {
strings.add(s);
}
else {
for(int i = 0; i < dupliSize; i++) {
strings.add(queries.get(queries.size() - (i + 1)));
}
}
for(int i = 0; i < strings.size(); i++) {
for(int j = 0; j < no; j++) {
queries.add(strings.get(i));
}
}
return queries;
}
/**
* This method given a query identifies the OR Tokens
* for eg. {A} | ({B}{C})
* then {A}
* and ({B}{C})
* so basically findOrTokens find out all the tokens around
* | operator
*/
public static ArrayList findOrTokens(String query) {
int balance = 0;
char pre = ' ';
char ch = ' ';
ArrayList ors = new ArrayList();
String s = "";
for(int i = 0; i < query.length(); i++) {
pre = ch;
ch = query.charAt(i);
if(isOpenBracket(ch, pre)) {
balance++;
s += "" + ch;
continue;
}
if(isClosingBracket(ch, pre) && balance > 0) {
balance--;
s += "" + ch;
continue;
}
if(isOrSym(ch, pre)) {
if(balance > 0) {
s += "" + ch;
continue;
}
else {
ors.add(s);
s = "";
continue;
}
}
s += "" + ch;
}
if(s.length() > 0) ors.add(s);
return ors;
}
/**
* Returns the position of a closing bracket.
*/
private static int findBracketClosingPosition(int startFrom, String query) {
int balance = 0;
char pre = ' ';
char ch = ' ';
for(int i = startFrom; i < query.length(); i++) {
pre = ch;
ch = query.charAt(i);
if(isOpenBracket(ch, pre)) {
balance++;
continue;
}
if(isClosingBracket(ch, pre)) {
if(balance > 0) {
balance--;
}
else {
return i;
}
}
}
return -1;
}
/**
* Helps in duplicating a character in the provided queries
*/
private static List writeCharInAll(char c, int no, List queries) {
for(int i = 0; i < no; i++) {
String s = queries.get(queries.size() - (i + 1));
s += "" + c;
queries.set(queries.size() - (i + 1), s);
}
return queries;
}
/**
* Helps in duplicating a string in the provided queries
*/
private static List writeStringInAll(String c, int no, List queries) {
for(int i = 0; i < no; i++) {
String s = queries.get(queries.size() - (i + 1));
s += "" + c;
queries.set(queries.size() - (i + 1), s);
}
return queries;
}
/**
* Returns if the character is bracket used to mark boundary of a token or an escape character.
*/
private static boolean isOpenBracket(char ch, char pre) {
if(ch == '(' && pre != '\\')
return true;
else return false;
}
/**
* Returns if the character is bracket used to mark boundary of a token or an escape character.
*/
private static boolean isClosingBracket(char ch, char pre) {
if(ch == ')' && pre != '\\')
return true;
else return false;
}
/**
* Returns if the character is an OR symbol used as a logical operator or an escape character.
*/
private static boolean isOrSym(char ch, char pre) {
if(ch == '|' && pre != '\\')
return true;
else return false;
}
}