com.jaeksoft.searchlib.request.NamedEntityExtractionRequest Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensearchserver Show documentation
Show all versions of opensearchserver Show documentation
OpenSearchServer is a powerful, enterprise-class, search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTFul API you will be able to integrate quickly and easily advanced full-text search capabilities in your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.
The newest version!
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2013-2015 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.request;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import javax.xml.xpath.XPathExpressionException;
import org.apache.lucene.search.Query;
import org.w3c.dom.DOMException;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.Analyzer;
import com.jaeksoft.searchlib.analysis.FilterFactory;
import com.jaeksoft.searchlib.analysis.filter.DeduplicateTokenPositionsFilter;
import com.jaeksoft.searchlib.analysis.filter.IndexLookupFilter;
import com.jaeksoft.searchlib.analysis.filter.RemoveIncludedTermFilter;
import com.jaeksoft.searchlib.analysis.filter.ShingleFilter;
import com.jaeksoft.searchlib.analysis.filter.StopFilter;
import com.jaeksoft.searchlib.analysis.tokenizer.TokenizerEnum;
import com.jaeksoft.searchlib.analysis.tokenizer.TokenizerFactory;
import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.index.ReaderInterface;
import com.jaeksoft.searchlib.query.ParseException;
import com.jaeksoft.searchlib.result.AbstractResult;
import com.jaeksoft.searchlib.result.ResultNamedEntityExtraction;
import com.jaeksoft.searchlib.util.DomUtils;
import com.jaeksoft.searchlib.util.StringUtils;
import com.jaeksoft.searchlib.util.XPathParser;
import com.jaeksoft.searchlib.util.XmlWriter;
import com.jaeksoft.searchlib.web.ServletTransaction;
/**
 * Request that runs a text through a tokenizer and a fixed chain of analysis
 * filters (shingles, position de-duplication, optional stop word lists, index
 * lookup, removal of included terms) and collects the outcome in a
 * {@link ResultNamedEntityExtraction}.
 * <p>
 * The request is configured by the name of a search request, the name of the
 * field holding the named entities, an optional set of returned fields, an
 * optional set of stop word lists, the maximum shingle size and the tokenizer
 * name.
 */
public class NamedEntityExtractionRequest extends AbstractRequest {

	private static final String ATTR_SEARCH_REQUEST = "searchRequest";
	private static final String ATTR_NAMED_ENTITY_FIELD = "namedEntityField";
	private static final String NODE_NAME_STOPWORDS_LIST = "stopWords";
	private static final String ATTR_MAX_NUMBER_OF_WORDS = "maxNumberOfWords";
	private static final String ATTR_TOKENIZER = "tokenizer";
	private static final String ATTR_STOPWORDS_LISTNAME = "listName";
	private static final String ATTR_STOPWORDS_CASESENSITIVE = "caseSensitive";
	private static final String NODE_TEXT = "text";
	private static final String NODE_RETURNED_FIELD = "returnedField";
	private static final String ATTR_NAME_FIELD = "name";

	/** Default upper bound passed to the shingle filter. */
	private static final int DEFAULT_MAX_NUMBER_OF_WORDS = 5;

	private String text;
	private String tokenizer;
	private String searchRequest;
	private String namedEntityField;
	/** Lazily created; {@code null} until a first field is added. */
	private Set<String> returnedFields;
	/**
	 * Lazily created; {@code null} when no stop word list is registered. Maps a
	 * list name to the boolean flag handed to the stop filter.
	 */
	private Map<String, Boolean> stopWordsMap;
	private int maxNumberOfWords;

	/** Creates a request that is not bound to any configuration. */
	public NamedEntityExtractionRequest() {
		super(null, RequestTypeEnum.NamedEntityExtractionRequest);
	}

	/**
	 * Creates a request bound to the given configuration.
	 *
	 * @param config
	 *            the configuration used to create tokenizers and filters
	 */
	public NamedEntityExtractionRequest(Config config) {
		super(config, RequestTypeEnum.NamedEntityExtractionRequest);
	}

	/**
	 * Resets every field of this request to its default: no text, no search
	 * request, no named entity field, no returned fields, no stop words, a
	 * maximum of {@value #DEFAULT_MAX_NUMBER_OF_WORDS} words and the
	 * letter-or-digit tokenizer.
	 */
	@Override
	protected void setDefaultValues() {
		super.setDefaultValues();
		this.text = null;
		this.searchRequest = null;
		this.namedEntityField = null;
		this.returnedFields = null;
		this.stopWordsMap = null;
		this.maxNumberOfWords = DEFAULT_MAX_NUMBER_OF_WORDS;
		this.tokenizer = TokenizerEnum.LetterOrDigitTokenizerFactory.name();
	}

	/**
	 * Copies the state of another request into this one. The collections are
	 * copied, so later changes to {@code request} do not leak into this
	 * instance.
	 *
	 * @param request
	 *            the source request; must be a
	 *            {@link NamedEntityExtractionRequest}
	 */
	@Override
	public void copyFrom(AbstractRequest request) {
		super.copyFrom(request);
		NamedEntityExtractionRequest neeRequest = (NamedEntityExtractionRequest) request;
		this.text = neeRequest.text;
		this.searchRequest = neeRequest.searchRequest;
		this.namedEntityField = neeRequest.namedEntityField;
		this.returnedFields = neeRequest.returnedFields == null ? null
				: new TreeSet<String>(neeRequest.returnedFields);
		this.stopWordsMap = neeRequest.stopWordsMap == null ? null
				: new TreeMap<String, Boolean>(neeRequest.stopWordsMap);
		this.maxNumberOfWords = neeRequest.maxNumberOfWords;
		this.tokenizer = neeRequest.tokenizer;
	}

	/**
	 * Adds a field to the set of returned fields. Empty names are ignored.
	 *
	 * @param returnedField
	 *            the name of the field to add
	 */
	public void addReturnedField(String returnedField) {
		if (StringUtils.isEmpty(returnedField)) {
			return;
		}
		if (returnedFields == null) {
			returnedFields = new TreeSet<String>();
		}
		returnedFields.add(returnedField);
	}

	/**
	 * Removes a field from the set of returned fields. Empty names and an
	 * absent set are ignored.
	 *
	 * @param returnedField
	 *            the name of the field to remove
	 */
	public void removeReturnedField(String returnedField) {
		if (StringUtils.isEmpty(returnedField) || returnedFields == null) {
			return;
		}
		returnedFields.remove(returnedField);
	}

	/**
	 * Registers a stop word list. A {@code null} list name is ignored.
	 * <p>
	 * NOTE(review): the flag is named {@code ignoreCase} here but is persisted
	 * under the XML attribute {@code caseSensitive}; the value is stored and
	 * forwarded to the stop filter unchanged. Confirm which meaning is intended
	 * before renaming either side.
	 *
	 * @param listName
	 *            the name of the stop word list
	 * @param ignoreCase
	 *            the flag handed to the stop filter for this list
	 */
	public void addStopWords(String listName, boolean ignoreCase) {
		if (listName == null) {
			return;
		}
		if (stopWordsMap == null) {
			stopWordsMap = new TreeMap<String, Boolean>();
		}
		stopWordsMap.put(listName, ignoreCase);
	}

	/**
	 * Unregisters a stop word list. Does nothing when the name is {@code null}
	 * or when no list has been registered. The map is dropped once it becomes
	 * empty.
	 *
	 * @param listName
	 *            the name of the stop word list
	 */
	public void removeStopWords(String listName) {
		// The map is null by default, so it must be checked before use.
		if (listName == null || stopWordsMap == null) {
			return;
		}
		stopWordsMap.remove(listName);
		if (stopWordsMap.isEmpty()) {
			stopWordsMap = null;
		}
	}

	/**
	 * @return the live set of returned fields, or {@code null} when none has
	 *         been added
	 */
	public Collection<String> getReturnedFields() {
		return returnedFields;
	}

	/**
	 * Replaces the returned fields with the given names. Empty names are
	 * skipped; a {@code null} collection only clears the current fields.
	 *
	 * @param returnedFields
	 *            the new field names, may be {@code null}
	 */
	public void setReturnedFields(Collection<String> returnedFields) {
		// Copy before clearing: the argument may be the live set handed out by
		// getReturnedFields(), which clearing would empty before it is read.
		List<String> snapshot = returnedFields == null ? null : new ArrayList<String>(returnedFields);
		clearReturnedFields();
		if (snapshot == null) {
			return;
		}
		for (String returnedField : snapshot) {
			addReturnedField(returnedField);
		}
	}

	/**
	 * Replaces the returned fields with the given names. Empty names are
	 * skipped; a {@code null} array only clears the current fields.
	 *
	 * @param returnedFields
	 *            the new field names, may be {@code null}
	 */
	public void setReturnedFields(String[] returnedFields) {
		clearReturnedFields();
		if (returnedFields == null) {
			return;
		}
		for (String returnedField : returnedFields) {
			addReturnedField(returnedField);
		}
	}

	/** Empties the returned field set when it exists. */
	private void clearReturnedFields() {
		if (returnedFields != null) {
			returnedFields.clear();
		}
	}

	/**
	 * This request type does not expose a Lucene query.
	 *
	 * @return always {@code null}
	 */
	@Override
	public final Query getQuery() throws SearchLibException, IOException {
		return null;
	}

	/**
	 * Loads this request from its XML configuration node. The text is read
	 * from the {@code text} child node when present, otherwise from the request
	 * node itself.
	 *
	 * @param config
	 *            the configuration passed to the parent implementation
	 * @param xpp
	 *            the XPath parser passed to the parent implementation
	 * @param requestNode
	 *            the XML node describing the request
	 */
	@Override
	public void fromXmlConfigNoLock(Config config, XPathParser xpp, Node requestNode) throws XPathExpressionException,
			DOMException, ParseException, InstantiationException, IllegalAccessException, ClassNotFoundException {
		super.fromXmlConfigNoLock(config, xpp, requestNode);
		searchRequest = DomUtils.getAttributeText(requestNode, ATTR_SEARCH_REQUEST);
		namedEntityField = DomUtils.getAttributeText(requestNode, ATTR_NAMED_ENTITY_FIELD);
		maxNumberOfWords = DomUtils.getAttributeInteger(requestNode, ATTR_MAX_NUMBER_OF_WORDS,
				DEFAULT_MAX_NUMBER_OF_WORDS);
		tokenizer = DomUtils.getAttributeText(requestNode, ATTR_TOKENIZER,
				TokenizerEnum.LetterOrDigitTokenizerFactory.name());

		Node textNode = DomUtils.getFirstNode(requestNode, NODE_TEXT);
		text = DomUtils.getText(textNode == null ? requestNode : textNode);

		List<Node> returnedNodes = DomUtils.getNodes(requestNode, NODE_RETURNED_FIELD);
		if (returnedNodes != null) {
			for (Node returnedNode : returnedNodes) {
				addReturnedField(DomUtils.getAttributeText(returnedNode, ATTR_NAME_FIELD));
			}
		}

		List<Node> stopwordsNodes = DomUtils.getNodes(requestNode, NODE_NAME_STOPWORDS_LIST);
		if (stopwordsNodes != null) {
			for (Node stopwordsNode : stopwordsNodes) {
				addStopWords(DomUtils.getAttributeText(stopwordsNode, ATTR_STOPWORDS_LISTNAME),
						DomUtils.getAttributeBoolean(stopwordsNode, ATTR_STOPWORDS_CASESENSITIVE, true));
			}
		}
	}

	/**
	 * Writes this request as an XML element: the scalar settings as
	 * attributes, then one child element per returned field and per stop word
	 * list, then the text when it is not empty. Runs under the read lock.
	 *
	 * @param xmlWriter
	 *            the writer receiving the elements
	 */
	@Override
	public void writeXmlConfig(XmlWriter xmlWriter) throws SAXException {
		rwl.r.lock();
		try {
			xmlWriter.startElement(XML_NODE_REQUEST, XML_ATTR_NAME, getRequestName(), XML_ATTR_TYPE, getType().name(),
					ATTR_SEARCH_REQUEST, searchRequest, ATTR_NAMED_ENTITY_FIELD, namedEntityField,
					ATTR_MAX_NUMBER_OF_WORDS, Integer.toString(maxNumberOfWords), ATTR_TOKENIZER, tokenizer);
			if (returnedFields != null) {
				for (String returnedField : returnedFields) {
					xmlWriter.startElement(NODE_RETURNED_FIELD, ATTR_NAME_FIELD, returnedField);
					xmlWriter.endElement();
				}
			}
			if (stopWordsMap != null) {
				for (Map.Entry<String, Boolean> entry : stopWordsMap.entrySet()) {
					xmlWriter.startElement(NODE_NAME_STOPWORDS_LIST, ATTR_STOPWORDS_LISTNAME, entry.getKey(),
							ATTR_STOPWORDS_CASESENSITIVE, entry.getValue().toString());
					xmlWriter.endElement();
				}
			}
			if (!StringUtils.isEmpty(text)) {
				xmlWriter.startElement(NODE_TEXT);
				xmlWriter.textNode(text);
				xmlWriter.endElement();
			}
			xmlWriter.endElement();
		} finally {
			rwl.r.unlock();
		}
	}

	/**
	 * Overrides the settings of this request with the parameters found in the
	 * servlet transaction. A parameter that is absent leaves the matching
	 * setting untouched.
	 *
	 * @param transaction
	 *            the transaction providing the parameters
	 * @param prefix
	 *            the prefix prepended to every parameter name
	 */
	@Override
	public final void setFromServletNoLock(final ServletTransaction transaction, final String prefix) {
		final String textParam = transaction.getParameterString(StringUtils.fastConcat(prefix, "text"));
		if (textParam != null) {
			text = textParam;
		}
		final String searchRequestParam = transaction
				.getParameterString(StringUtils.fastConcat(prefix, "searchRequest"));
		if (searchRequestParam != null) {
			searchRequest = searchRequestParam;
		}
		final String namedEntityFieldParam = transaction
				.getParameterString(StringUtils.fastConcat(prefix, "namedEntityField"));
		if (namedEntityFieldParam != null) {
			namedEntityField = namedEntityFieldParam;
		}
		final String stopWordListParam = transaction
				.getParameterString(StringUtils.fastConcat(prefix, "stopWordList"));
		if (stopWordListParam != null) {
			// Go through addStopWords: the map is null until a first list is added.
			addStopWords(stopWordListParam, true);
		}
		final Integer maxNumberOfWordsParam = transaction
				.getParameterInteger(StringUtils.fastConcat(prefix, "maxNumberOfWords"));
		if (maxNumberOfWordsParam != null) {
			maxNumberOfWords = maxNumberOfWordsParam;
		}
	}

	/** Nothing to reset for this request type. */
	@Override
	protected void resetNoLock() {
	}

	/**
	 * Builds the filter chain applied after tokenization, in this order:
	 * shingle filter (1 to {@code maxNumberOfWords} words, space separated),
	 * token position de-duplication, one stop filter per registered stop word
	 * list, index lookup, and removal of included terms.
	 * <p>
	 * Side effect: the named entity field is added to the returned fields of
	 * this request.
	 *
	 * @param dtpf
	 *            the de-duplication filter to insert in the chain, or
	 *            {@code null} to have a new one created
	 * @return the ordered list of filters
	 * @throws SearchLibException
	 *             if a filter cannot be created
	 */
	public List<FilterFactory> getFilterList(DeduplicateTokenPositionsFilter dtpf) throws SearchLibException {
		List<FilterFactory> filterList = new ArrayList<FilterFactory>(10);

		ShingleFilter shingleFilter = FilterFactory.create(config, ShingleFilter.class);
		shingleFilter.setProperties(" ", 1, maxNumberOfWords);
		filterList.add(shingleFilter);

		if (dtpf == null) {
			dtpf = FilterFactory.create(config, DeduplicateTokenPositionsFilter.class);
		}
		filterList.add(dtpf);

		if (stopWordsMap != null) {
			for (Map.Entry<String, Boolean> entry : stopWordsMap.entrySet()) {
				StopFilter stopFilter = FilterFactory.create(config, StopFilter.class);
				stopFilter.setProperties(entry.getKey(), entry.getValue());
				filterList.add(stopFilter);
			}
		}

		IndexLookupFilter ilf = FilterFactory.create(config, IndexLookupFilter.class);
		// The lookup must return the named entity field itself.
		addReturnedField(namedEntityField);
		ilf.setProperties(config.getIndexName(), searchRequest, namedEntityField,
				StringUtils.join(returnedFields, '|'));
		filterList.add(ilf);

		RemoveIncludedTermFilter ritf = FilterFactory.create(config, RemoveIncludedTermFilter.class);
		ritf.setProperties(namedEntityField, true);
		filterList.add(ritf);

		return filterList;
	}

	/**
	 * Executes the extraction: checks that the referenced search request
	 * exists, builds an analyzer from the configured tokenizer and the filter
	 * chain of {@link #getFilterList(DeduplicateTokenPositionsFilter)}, feeds it
	 * the text, then resolves the token positions in the result.
	 *
	 * @param reader
	 *            not used by this implementation
	 * @return the populated {@link ResultNamedEntityExtraction}
	 * @throws SearchLibException
	 *             if the search request is not found, or wrapping any
	 *             {@link IOException} or {@link ClassNotFoundException}
	 */
	@Override
	public AbstractResult execute(ReaderInterface reader) throws SearchLibException {
		try {
			AbstractSearchRequest abstractSearchRequest = (AbstractSearchRequest) config.getNewRequest(searchRequest);
			if (abstractSearchRequest == null) {
				throw new SearchLibException("Request not found: " + searchRequest);
			}
			// NOTE(review): fieldNameSet is filled but never read afterwards in
			// this method. Kept as is because the side effects of populate()
			// cannot be checked from here - confirm, then remove.
			LinkedHashSet<String> fieldNameSet = new LinkedHashSet<String>();
			abstractSearchRequest.getReturnFieldList().populate(fieldNameSet);

			ResultNamedEntityExtraction result = new ResultNamedEntityExtraction(this);
			DeduplicateTokenPositionsFilter dtpf = FilterFactory.create(config, DeduplicateTokenPositionsFilter.class);
			Analyzer analyzer = new Analyzer(config);
			analyzer.setIndexTokenizer(TokenizerFactory.create(config, tokenizer));
			analyzer.setQueryTokenizer(TokenizerFactory.create(config, tokenizer));
			analyzer.add(getFilterList(dtpf));
			analyzer.getQueryAnalyzer().populate(text, result);
			result.resolvePositions(namedEntityField, dtpf.getLastTokenMap(), text);
			return result;
		} catch (IOException e) {
			throw new SearchLibException(e);
		} catch (ClassNotFoundException e) {
			throw new SearchLibException(e);
		}
	}

	/**
	 * Builds a one-line summary of this request: search request name, named
	 * entity field and number of stop word lists. Runs under the read lock.
	 *
	 * @return the summary
	 */
	@Override
	public String getInfo() {
		rwl.r.lock();
		try {
			StringBuilder sb = new StringBuilder();
			sb.append("SearchRequest:");
			if (searchRequest != null) {
				sb.append(searchRequest);
			}
			sb.append(" - NamedEntityField:");
			if (namedEntityField != null) {
				sb.append(namedEntityField);
			}
			sb.append(" - StopWordsList:");
			if (stopWordsMap != null) {
				sb.append(stopWordsMap.size());
			}
			return sb.toString();
		} finally {
			rwl.r.unlock();
		}
	}

	/**
	 * @return the text
	 */
	public String getText() {
		return text;
	}

	/**
	 * @param text
	 *            the text to set
	 */
	public void setText(String text) {
		this.text = text;
	}

	/**
	 * @return the searchRequest
	 */
	public String getSearchRequest() {
		return searchRequest;
	}

	/**
	 * @param searchRequest
	 *            the searchRequest to set
	 */
	public void setSearchRequest(String searchRequest) {
		this.searchRequest = searchRequest;
	}

	/**
	 * @return the namedEntityField
	 */
	public String getNamedEntityField() {
		return namedEntityField;
	}

	/**
	 * Sets the named entity field and also adds it to the returned fields.
	 *
	 * @param namedEntityField
	 *            the namedEntityField to set
	 */
	public void setNamedEntityField(String namedEntityField) {
		this.namedEntityField = namedEntityField;
		addReturnedField(namedEntityField);
	}

	/**
	 * @return the live stopWordsMap, or {@code null} when no stop word list is
	 *         registered
	 */
	public Map<String, Boolean> getStopWordsMap() {
		return stopWordsMap;
	}

	/**
	 * @return the maxNumberOfWords
	 */
	public int getMaxNumberOfWords() {
		return maxNumberOfWords;
	}

	/**
	 * @param maxNumberOfWords
	 *            the maxNumberOfWords to set
	 */
	public void setMaxNumberOfWords(int maxNumberOfWords) {
		this.maxNumberOfWords = maxNumberOfWords;
	}

	/**
	 * @return the tokenizer
	 */
	public String getTokenizer() {
		return tokenizer;
	}

	/**
	 * @param tokenizer
	 *            the tokenizer to set
	 */
	public void setTokenizer(String tokenizer) {
		this.tokenizer = tokenizer;
	}
}