// org.pageseeder.flint.lucene.query.SearchResults (Maven / Gradle / Ivy)
/*
* Copyright 2015 Allette Systems (Australia)
* http://www.allette.com.au
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.pageseeder.flint.lucene.query;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopFieldDocs;
import org.pageseeder.flint.IndexException;
import org.pageseeder.flint.indexing.FlintDocument;
import org.pageseeder.flint.indexing.FlintField;
import org.pageseeder.flint.indexing.FlintField.NumericType;
import org.pageseeder.flint.lucene.LuceneIndexIO;
import org.pageseeder.flint.lucene.search.Fields;
import org.pageseeder.flint.lucene.util.Dates;
import org.pageseeder.flint.lucene.util.Highlighter;
import org.pageseeder.xmlwriter.XML.NamespaceAware;
import org.pageseeder.xmlwriter.XMLStringWriter;
import org.pageseeder.xmlwriter.XMLWritable;
import org.pageseeder.xmlwriter.XMLWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.text.ParseException;
import java.util.*;
/**
 * A container for search results.
 *
 * <p>Use this class to serialise Lucene Search results as XML.
 *
 * <p>Note: the current implementation is a "throw away" object, once the <code>toXML</code> method has been
 * called, this instance is useless.
 *
 * <p>This class is not synchronized.
 *
 * @author Christophe Lauret (Weborganic)
 * @author Jean-Baptiste Reure (Weborganic)
 * @author William Liem (Allette Systems)
 *
 * @version 10 February 2012
 */
public final class SearchResults implements XMLWritable {
/**
 * Logger for this class.
 */
private static final Logger LOGGER = LoggerFactory.getLogger(SearchResults.class);
/**
 * Types of values formatted in the result; drives which attribute is written on a field element.
 */
private enum ValueType {STRING, DATE, DATETIME, LONG, DOUBLE, INT, FLOAT}
/**
 * The maximum length for a field to expand: values of this length or longer are left out of the XML.
 */
private static final int MAX_FIELD_VALUE_LENGTH = 1000;
/**
 * One minute in milliseconds.
 */
private static final int ONE_MINUTE_IN_MS = 60000;
/**
 * One hour in milliseconds.
 */
private static final int ONE_HOUR_IN_MS = 3600000;
/**
 * The actual search results from Lucene.
 */
private final ScoreDoc[] _scoredocs;
/**
 * Fields used for sorting, may be <code>null</code> when the results were not built from sorted hits.
 */
private final SortField[] _sortfields;
/**
 * Indicates the paging information (never <code>null</code>, a default paging is used if none is supplied).
 */
private final SearchPaging _paging;
/**
 * The query used to produce these results, may be <code>null</code>.
 */
private final SearchQuery _query;
/**
 * The analyzer used to compute extracts, may be <code>null</code> in which case no extract is produced.
 */
private final Analyzer _analyzer;
/**
 * The index searcher used to load the documents.
 */
private final IndexSearcher _searcher;
/**
 * The holder of the index I/O or readers that are released when these results are terminated.
 */
private final SearchReaders readers;
/**
 * List of names of the fields to get the extract from (an empty list means any field).
 */
private final List<String> extractFields = new ArrayList<>();
/**
 * The total number of results.
 */
private final int totalNbOfResults;
// State variables
// ---------------------------------------------------------------------------------------------
/**
 * A state variable to indicate whether the search results instance can still be accessed.
 */
private boolean _terminated = false;
/**
 * The timezone offset (in milliseconds) used to adjust the correct date and time.
 */
private int timezoneOffset;
// Constructors
// ---------------------------------------------------------------------------------------------
/**
 * Creates a new SearchResults from sorted Lucene hits and a set of readers.
 *
 * <p>The sort fields and the total number of hits are taken from the <code>TopFieldDocs</code>;
 * the total is narrowed to an <code>int</code>. No analyzer is set, so no extract is produced.
 *
 * @param query    The search query that was used to produce these results.
 * @param docs     The actual search results from Lucene in TopFieldDocs.
 * @param paging   The paging configuration.
 * @param readers  The readers to be released at the end, each mapped to the index I/O used to release it.
 * @param searcher The Lucene searcher.
 */
public SearchResults(SearchQuery query, TopFieldDocs docs, SearchPaging paging, Map<LuceneIndexIO, IndexReader> readers, IndexSearcher searcher) {
  this(query, null, docs.scoreDocs, docs.fields, (int) docs.totalHits.value, paging, new SearchReaders(readers), searcher);
}
/**
 * Builds search results out of sorted Lucene hits coming from a single index.
 *
 * <p>Score documents, sort fields and total hit count (narrowed to <code>int</code>) are read
 * from the <code>TopFieldDocs</code>. No analyzer is supplied to the main constructor.
 *
 * @param query    the query that produced these results
 * @param docs     the sorted hits returned by Lucene
 * @param paging   the paging configuration
 * @param io       the index I/O, used to release the searcher when these results are terminated
 * @param searcher the Lucene searcher
 */
public SearchResults(SearchQuery query, TopFieldDocs docs, SearchPaging paging, LuceneIndexIO io, IndexSearcher searcher) {
  this(
      query,
      null,
      docs.scoreDocs,
      docs.fields,
      (int) docs.totalHits.value,
      paging,
      new SearchReaders(io),
      searcher);
}
/**
 * Builds search results out of plain (unsorted) Lucene hits coming from a single index.
 *
 * <p>Neither an analyzer nor sort fields are supplied to the main constructor.
 *
 * @param query     the query that produced these results
 * @param docs      the hits returned by Lucene
 * @param totalHits the total number of hits for the query
 * @param paging    the paging configuration
 * @param io        the index I/O, used to release the searcher when these results are terminated
 * @param searcher  the Lucene searcher
 */
public SearchResults(SearchQuery query, ScoreDoc[] docs, int totalHits, SearchPaging paging, LuceneIndexIO io, IndexSearcher searcher) {
  this(
      query,
      null,
      docs,
      null,
      totalHits,
      paging,
      new SearchReaders(io),
      searcher);
}
/**
 * Creates a new SearchResults from plain (unsorted) Lucene hits and a set of readers.
 *
 * <p>Neither an analyzer nor sort fields are set, so no extract and no sort metadata is produced.
 *
 * @param query     The search query that was used to produce these results.
 * @param docs      The actual search results from Lucene in ScoreDoc.
 * @param totalHits The total number of hits for the query.
 * @param paging    The paging configuration.
 * @param readers   The readers to be released at the end, each mapped to the index I/O used to release it.
 * @param searcher  The Lucene searcher.
 */
public SearchResults(SearchQuery query, ScoreDoc[] docs, int totalHits, SearchPaging paging, Map<LuceneIndexIO, IndexReader> readers, IndexSearcher searcher) {
  this(query, null, docs, null, totalHits, paging, new SearchReaders(readers), searcher);
}
/**
 * Creates a new SearchResults.
 *
 * <p>The timezone offset used when formatting date-times defaults to the current offset of the
 * server's default time zone; it can be overridden with {@link #setTimeZone(int)}.
 *
 * @param query        The original query, may be <code>null</code>.
 * @param analyzer     The analyzer used to compute extracts, may be <code>null</code> (no extracts then).
 * @param hits         The actual search results from Lucene in ScoreDoc.
 * @param sortf        The fields used to sort the results, may be <code>null</code>.
 * @param totalResults The total number of results.
 * @param paging       The paging configuration, a default one is used if <code>null</code>.
 * @param readers      The holder of the index I/O or readers released when these results are terminated.
 * @param searcher     The Lucene searcher.
 */
public SearchResults(SearchQuery query, Analyzer analyzer, ScoreDoc[] hits,
                     SortField[] sortf, int totalResults,
                     SearchPaging paging, SearchReaders readers, IndexSearcher searcher) {
  this._query = query;
  this._analyzer = analyzer;
  this._scoredocs = hits;
  this._sortfields = sortf;
  this._paging = paging != null ? paging : new SearchPaging();
  this._searcher = searcher;
  this.readers = readers;
  this.totalNbOfResults = totalResults;
  // Default timezone is the server's. getOffset() returns the raw offset plus whatever daylight
  // saving adjustment applies right now, so zones whose DST shift is not exactly one hour are
  // handled correctly (adding a fixed hour to the raw offset is not).
  this.timezoneOffset = TimeZone.getDefault().getOffset(System.currentTimeMillis());
}
// Basic public methods
// ---------------------------------------------------------------------------------------------
/**
 * Add field names to get extracts from.
 * The order matters here as the first extract found is the one included in the results.
 *
 * @param fields list of field names (must not be <code>null</code>)
 */
public void addExtractFields(List<String> fields) {
  this.extractFields.addAll(fields);
}
/**
 * Registers one more field name as a candidate source for extracts.
 *
 * <p>The name is appended to the names already registered.
 *
 * @param field the name of the field to add
 */
public void addExtractField(String field) {
  final boolean ignored = this.extractFields.add(field);
}
/**
 * Gives the total number of results these search results were created with.
 *
 * @return the total number of results
 */
public int getTotalNbOfResults() {
  final int total = this.totalNbOfResults;
  return total;
}
/**
 * Tells whether these search results hold no result at all.
 *
 * @return <code>true</code> if the total number of results is zero;
 *         <code>false</code> otherwise.
 */
public boolean isEmpty() {
  final boolean noResult = this.totalNbOfResults == 0;
  return noResult;
}
/**
 * Overrides the time zone offset applied when date-times are formatted in the XML.
 *
 * <p>The offset is stored in milliseconds.
 *
 * @param timezoneInMinutes the timezone offset in minutes (difference with GMT)
 */
public void setTimeZone(int timezoneInMinutes) {
  final int offsetInMs = ONE_MINUTE_IN_MS * timezoneInMinutes;
  this.timezoneOffset = offsetInMs;
}
/**
 * Gives access to the index searcher behind these results, for example to compute facets.
 *
 * @return the index searcher
 *
 * @throws IndexException if these results have already been terminated
 */
public IndexSearcher searcher() throws IndexException {
  if (!this._terminated) {
    return this._searcher;
  }
  throw new IndexException("Cannot retrieve searcher after termination", new IllegalStateException());
}
/**
 * Gives the query these results were created with.
 *
 * @return the original query (whatever was passed at construction)
 */
public SearchQuery query() {
  final SearchQuery original = this._query;
  return original;
}
/**
 * Serialises the search results as XML.
 *
 * <p>Writes a <code>search-results</code> element made of the query (if any), the search
 * metadata and one <code>document</code> element per hit of the current page.
 *
 * <p>These results are always terminated when this method returns, whether it completes
 * normally or fails, so that the searcher and readers are released in both cases.
 *
 * @param xml The XML writer.
 *
 * @throws IOException Should there be any I/O exception while writing the XML.
 */
@Override
public void toXML(XMLWriter xml) throws IOException {
  try {
    xml.openElement("search-results", true);
    int firsthit = this._paging.getFirstHit();
    int lasthit = this._paging.getLastHit(this.totalNbOfResults);
    // Include query
    if (this._query != null) {
      xml.openElement("query", true);
      xml.attribute("lucene", this._query.toQuery().toString());
      this._query.toXML(xml);
      xml.closeElement();
    }
    // Display some metadata on the search
    toMetadataXML(xml);
    // Returned documents
    xml.openElement("documents", true);
    // Never read outside of the hits actually held: the total number of results may be larger
    // than the number of score docs (the document iterator applies the same upper bound)
    int available = this._scoredocs == null ? 0 : this._scoredocs.length;
    int from = Math.max(0, firsthit - 1);
    int to = Math.min(lasthit, available);
    for (int i = from; i < to; i++) {
      ScoreDoc hit = this._scoredocs[i];
      Document doc = this._searcher.storedFields().document(hit.doc);
      String extractXML = toExtractXML(doc);
      // document as XML
      documentToXML(doc, extractXML, Float.toString(hit.score), this.timezoneOffset, xml);
    }
    // close 'documents'
    xml.closeElement();
    // close 'results'
    xml.closeElement();
  } finally {
    // close everything, even when writing the XML failed
    terminate();
  }
}
/**
 * Computes the extract for a document as an XML fragment.
 *
 * <p>The first candidate field producing an extract wins; candidates are all the fields of the
 * document when no extract field was specified, the specified ones otherwise.
 *
 * @param doc the document to extract from
 *
 * @return the <code>extract</code> element as a string, or <code>null</code> if there is no
 *         query, no analyzer or no field producing an extract
 *
 * @throws IOException Should an error occur while building the extract
 */
private String toExtractXML(Document doc) throws IOException {
  if (this._query == null || this._analyzer == null) {
    return null;
  }
  Highlighter highlighter = new Highlighter(this._query.toQuery(), this._searcher.getIndexReader(), this._analyzer);
  for (IndexableField f : doc.getFields()) {
    if (!this.extractFields.isEmpty() && !this.extractFields.contains(f.name())) {
      continue;
    }
    String text = f.stringValue();
    // a field without a string value has no text to extract from
    if (text == null) {
      continue;
    }
    String extract = highlighter.highlight(f.name(), text, 200);
    if (extract != null) {
      XMLStringWriter xsw = new XMLStringWriter(NamespaceAware.No);
      xsw.openElement("extract");
      xsw.attribute("from", f.name());
      xsw.writeXML(extract);
      xsw.closeElement();
      return xsw.toString();
    }
  }
  return null;
}
/**
 * Writes a Lucene document as XML, with neither score nor extract.
 *
 * @param doc            the document to serialise
 * @param timezoneOffset the timezone offset handed to the date-time formatting
 * @param xml            the XML writer
 *
 * @throws IOException Should an error occur while writing the XML
 */
public static void documentToXML(Document doc, int timezoneOffset, XMLWriter xml) throws IOException {
  final String noExtract = null;
  final String noScore = null;
  documentToXML(doc, noExtract, noScore, timezoneOffset, xml);
}
/**
 * Writes a Lucene document as a <code>document</code> element.
 *
 * <p>The score (when given) is written as a <code>score</code> child element and the extract
 * (when given) is copied as raw XML. Each field whose string value is not <code>null</code> and
 * is shorter than the maximum field length becomes a <code>field</code> element, flagged with a
 * <code>numeric-type</code>, <code>date</code> or <code>datetime</code> attribute when relevant.
 *
 * @param doc            the document to serialise
 * @param extract        the extract as an XML fragment, may be <code>null</code>
 * @param score          the score to display, may be <code>null</code>
 * @param timezoneOffset the timezone offset handed to the date-time formatting
 * @param xml            the XML writer
 *
 * @throws IOException Should an error occur while writing the XML
 */
private static void documentToXML(Document doc, String extract, String score, int timezoneOffset, XMLWriter xml) throws IOException {
  xml.openElement("document", true);
  if (score != null) {
    xml.element("score", score);
  }
  if (extract != null) {
    xml.writeXML(extract);
  }
  for (IndexableField field : doc.getFields()) {
    String text = Fields.toString(field);
    ValueType kind = ValueType.STRING;
    Number numeric = field.numericValue();
    if (numeric instanceof Long) {
      kind = ValueType.LONG;
    } else if (numeric instanceof Double) {
      kind = ValueType.DOUBLE;
    } else if (numeric instanceof Integer) {
      kind = ValueType.INT;
    } else if (numeric instanceof Float) {
      kind = ValueType.FLOAT;
    } else if (numeric == null && text != null && !text.isEmpty()
        && field.name().contains("date") && Dates.isLuceneDate(text)) {
      // not a number: format as ISO 8601 when the field looks like a Lucene date
      try {
        if (text.length() <= 8) {
          text = Dates.toISODate(text);
          if (text.length() == 10) {
            kind = ValueType.DATE;
          }
        } else {
          text = Dates.toISODateTime(text, timezoneOffset);
          kind = ValueType.DATETIME;
        }
      } catch (ParseException ex) {
        LOGGER.warn("Unparseable date found {}", text);
      }
    }
    // values that are missing or too long are not returned
    if (text == null || text.length() >= MAX_FIELD_VALUE_LENGTH) {
      continue;
    }
    xml.openElement("field");
    xml.attribute("name", field.name());
    // tell the consumer how the value can be interpreted
    switch (kind) {
      case DATE:
        xml.attribute("date", text);
        break;
      case DATETIME:
        xml.attribute("datetime", text);
        break;
      case LONG:
        xml.attribute("numeric-type", "long");
        break;
      case DOUBLE:
        xml.attribute("numeric-type", "double");
        break;
      case FLOAT:
        xml.attribute("numeric-type", "float");
        break;
      case INT:
        xml.attribute("numeric-type", "int");
        break;
      default:
        break;
    }
    if (field.binaryValue() != null) {
      xml.attribute("compressed", "true");
    }
    xml.writeText(text);
    xml.closeElement();
  }
  // end of 'document'
  xml.closeElement();
}
/**
 * Writes a Flint document as XML, with neither score nor extract.
 *
 * @param doc            the document to serialise
 * @param timezoneOffset the timezone offset handed to the date-time formatting
 * @param xml            the XML writer
 *
 * @throws IOException Should an error occur while writing the XML
 */
public static void flintDocumentToXML(FlintDocument doc, int timezoneOffset, XMLWriter xml) throws IOException {
  final String noExtract = null;
  final String noScore = null;
  flintDocumentToXML(doc, noExtract, noScore, timezoneOffset, xml);
}
/**
 * Writes a Flint document as a <code>result</code> element.
 *
 * <p>The score (when given) is written as a <code>score</code> attribute and the extract (when
 * given) is copied as raw XML. Each field whose value is not <code>null</code> and is shorter
 * than the maximum field length becomes a <code>field</code> element, flagged with a
 * <code>numeric-type</code>, <code>date</code> or <code>datetime</code> attribute when relevant.
 *
 * @param doc            the document to serialise
 * @param extract        the extract as an XML fragment, may be <code>null</code>
 * @param score          the score to display, may be <code>null</code>
 * @param timezoneOffset the timezone offset handed to the date-time formatting
 * @param xml            the XML writer
 *
 * @throws IOException Should an error occur while writing the XML
 */
public static void flintDocumentToXML(FlintDocument doc, String extract, String score, int timezoneOffset, XMLWriter xml) throws IOException {
  xml.openElement("result", true);
  if (score != null) {
    xml.attribute("score", score);
  }
  if (extract != null) {
    xml.writeXML(extract);
  }
  for (FlintField field : doc.fields()) {
    String text = null;
    if (field.value() != null) {
      text = field.value().toString();
    }
    ValueType kind = ValueType.STRING;
    NumericType numeric = field.numeric();
    if (numeric == NumericType.LONG) {
      kind = ValueType.LONG;
    } else if (numeric == NumericType.DOUBLE) {
      kind = ValueType.DOUBLE;
    } else if (numeric == NumericType.INT) {
      kind = ValueType.INT;
    } else if (numeric == NumericType.FLOAT) {
      kind = ValueType.FLOAT;
    } else if (numeric == null && text != null && !text.isEmpty()
        && field.name().contains("date") && Dates.isLuceneDate(text)) {
      // not numeric: format as ISO 8601 when the field looks like a Lucene date
      try {
        if (text.length() <= 8) {
          text = Dates.toISODate(text);
          if (text.length() == 10) {
            kind = ValueType.DATE;
          }
        } else {
          text = Dates.toISODateTime(text, timezoneOffset);
          kind = ValueType.DATETIME;
        }
      } catch (ParseException ex) {
        LOGGER.warn("Unparseable date found {}", text);
      }
    }
    // values that are missing or too long are not returned
    if (text == null || text.length() >= MAX_FIELD_VALUE_LENGTH) {
      continue;
    }
    xml.openElement("field");
    xml.attribute("name", field.name());
    // tell the consumer how the value can be interpreted
    switch (kind) {
      case DATE:
        xml.attribute("date", text);
        break;
      case DATETIME:
        xml.attribute("datetime", text);
        break;
      case LONG:
        xml.attribute("numeric-type", "long");
        break;
      case DOUBLE:
        xml.attribute("numeric-type", "double");
        break;
      case FLOAT:
        xml.attribute("numeric-type", "float");
        break;
      case INT:
        xml.attribute("numeric-type", "int");
        break;
      default:
        break;
    }
    xml.writeText(text);
    xml.closeElement();
  }
  // end of 'result'
  xml.closeElement();
}
/**
 * Gives the first hit of the current page, as reported by the paging configuration.
 *
 * @return the first hit of the current page
 */
public int getFirstHit() {
  final SearchPaging paging = this._paging;
  return paging.getFirstHit();
}
/**
 * Gives the last hit of the current page, as computed by the paging configuration from the
 * total number of results.
 *
 * @return the last hit of the current page
 */
public int getLastHit() {
  final int total = this.totalNbOfResults;
  return this._paging.getLastHit(total);
}
/**
 * Gives direct access to the score documents held by these results.
 *
 * @return the score documents, exactly as supplied at construction.
 *
 * @throws IndexException If the search results have already been terminated.
 */
public ScoreDoc[] getScoreDoc() throws IndexException {
  if (!this._terminated) {
    return this._scoredocs;
  }
  throw new IndexException("Cannot retrieve documents after termination", new IllegalStateException());
}
/**
 * Loads a document from the index through the searcher behind these results.
 *
 * <p>Read failures are logged before being rethrown wrapped in an <code>IndexException</code>.
 *
 * @param id the id of the document, handed as-is to the searcher's stored fields
 * @return the document loaded from the index
 *
 * @throws IndexException if these results have been terminated, if the index is corrupted
 *                        or if an I/O problem occurs while reading it
 */
public Document getDocument(int id) throws IndexException {
  if (this._terminated) {
    throw new IndexException("Cannot retrieve documents after termination", new IllegalStateException());
  }
  try {
    return this._searcher.storedFields().document(id);
  } catch (CorruptIndexException corrupted) {
    final String message = "Failed to retrieve a document because of a corrupted Index";
    LOGGER.error(message, corrupted);
    throw new IndexException(message, corrupted);
  } catch (IOException failure) {
    final String message = "Failed to retrieve a document because of an I/O problem";
    LOGGER.error(message, failure);
    throw new IndexException(message, failure);
  }
}
/**
 * Releases the searcher through the readers holder and flags these results as terminated.
 *
 * <p>Calling this method again once the results are terminated has no effect.
 */
public void terminate() {
  if (!this._terminated) {
    this.readers.release(this._searcher);
    this._terminated = true;
  }
}
/**
 * Provides an iterable class over the Lucene documents of the current page.
 *
 * <p>This allows Lucene documents from these results to be iterated over in a for each loop:
 * <pre>
 *   for (Document doc : results.documents()) {
 *     ...
 *   }
 * </pre>
 *
 * @return an iterable class over the Lucene documents.
 *
 * @throws IllegalStateException If these results have been closed (terminated already).
 */
public Iterable<Document> documents() {
  if (this._terminated) {
    throw new IllegalStateException("Cannot iterate over documents after termination");
  }
  // the first hit is used as a 1-based position, the iterator works on 0-based indexes
  return new DocIterable(this._paging.getFirstHit() - 1, getLastHit());
}
// Private helpers
// ----------------------------------------------------------------------------------------------
/**
 * Writes the <code>metadata</code> element describing the search: hit counts, paging
 * information and, when there are sort fields, the name of each of them.
 *
 * @param xml The XML writer
 *
 * @throws IOException Should an error occur while writing the XML
 */
private void toMetadataXML(XMLWriter xml) throws IOException {
  final SearchPaging paging = this._paging;
  final int hitCount = this.totalNbOfResults;
  xml.openElement("metadata", true);

  // number of hits
  xml.openElement("hits", true);
  xml.element("per-page", Integer.toString(paging.getHitsPerPage()));
  xml.element("total", Integer.toString(hitCount));
  xml.closeElement();

  // current page
  xml.openElement("page", true);
  xml.element("first-hit", Integer.toString(paging.getFirstHit()));
  xml.element("last-hit", Integer.toString(paging.getLastHit(hitCount)));
  xml.element("current", Integer.toString(paging.getPage()));
  xml.element("last", Integer.toString(paging.getPageCount(hitCount)));
  xml.closeElement();

  // sorting, only when known
  final SortField[] sorting = this._sortfields;
  if (sorting != null) {
    xml.openElement("sort-fields", true);
    for (int i = 0; i < sorting.length; i++) {
      xml.element("field", sorting[i].getField());
    }
    xml.closeElement();
  }

  // end of 'metadata'
  xml.closeElement();
}
// Private classes
// ----------------------------------------------------------------------------------------------
/**
 * An iterable class over the documents in these results.
 *
 * <p>It only holds the range of hits to iterate over; every call to {@link #iterator()}
 * starts a new iteration over that range.
 *
 * @author christophe Lauret
 * @version 6 October 2011
 */
private final class DocIterable implements Iterable<Document> {
  /** Index of the first hit to return (inclusive). */
  private final int _start;
  /** Index at which the iteration stops (exclusive). */
  private final int _end;
  /**
   * @param start index of the first hit to return (inclusive)
   * @param end   index at which the iteration stops (exclusive)
   */
  public DocIterable(int start, int end) {
    this._start = start;
    this._end = end;
  }
  /**
   * Provides a new iterator over the Lucene documents in the range of this iterable.
   *
   * @return an iterator over the Lucene documents.
   */
  @Override
  public Iterator<Document> iterator() {
    return new DocIterator(this._start, this._end);
  }
}
/**
 * An iterator over the documents in these results.
 *
 * <p>Documents are loaded from the index one at a time, when {@link #next()} is called.
 *
 * @author Christophe Lauret
 * @author Jean-Baptiste Reure
 * @version 16 August 2013
 */
private final class DocIterator implements Iterator<Document> {
  /**
   * The index searcher used.
   */
  private final IndexSearcher searcher = SearchResults.this._searcher;
  /**
   * The actual search results from Lucene.
   */
  private final ScoreDoc[] scoredocs = SearchResults.this._scoredocs;
  /**
   * The current index for this iterator (position of the next hit to return).
   */
  private int index;
  /**
   * The index at which this iterator stops (exclusive).
   */
  private final int endIndex;
  /**
   * @param start index of the first hit to return (inclusive)
   * @param end   index at which the iteration stops (exclusive)
   */
  public DocIterator(int start, int end) {
    this.index = start;
    this.endIndex = end;
  }
  /**
   * @return <code>true</code> while the current index is below both the end index and the
   *         number of score documents held.
   */
  @Override
  public boolean hasNext() {
    return this.index < this.scoredocs.length && this.index < this.endIndex;
  }
  /**
   * Loads the document for the current hit and moves to the next hit.
   *
   * @return the document loaded from the index
   *
   * @throws NoSuchElementException if there is no more hit to return
   * @throws IllegalStateException  if the document could not be read from the index
   */
  @Override
  public Document next() {
    if (!hasNext()) throw new NoSuchElementException();
    try {
      return this.searcher.storedFields().document(this.scoredocs[this.index++].doc);
    } catch (IOException ex) {
      throw new IllegalStateException("Error retrieving document", ex);
    }
  }
  /**
   * @throws UnsupportedOperationException as it's not possible
   */
  @Override
  public void remove() {
    throw new UnsupportedOperationException("Cannot remove documents from search results");
  }
}
/**
 * Holds what must be released once search results are terminated: either a single index I/O
 * (whose searcher is released) or a set of readers, each mapped to the index I/O releasing it.
 */
private static class SearchReaders {
  /** The single index I/O, <code>null</code> when created from a map of readers. */
  private final LuceneIndexIO _single;
  /** The readers to release, by index I/O; empty when created from a single index I/O. */
  private final Map<LuceneIndexIO, IndexReader> _readers = new HashMap<>();
  /**
   * @param readers the readers to release, by index I/O (copied, must not be <code>null</code>)
   */
  public SearchReaders(Map<LuceneIndexIO, IndexReader> readers) {
    this._readers.putAll(readers);
    this._single = null;
  }
  /**
   * @param io the index I/O used to release the searcher
   */
  public SearchReaders(LuceneIndexIO io) {
    this._single = io;
  }
  /**
   * Releases the searcher on the single index I/O (if any), then every reader on its index I/O.
   *
   * @param searcher the searcher to release
   */
  public void release(IndexSearcher searcher) {
    if (this._single != null) {
      this._single.releaseSearcher(searcher);
    }
    for (Map.Entry<LuceneIndexIO, IndexReader> io : this._readers.entrySet()) {
      io.getKey().releaseReader(io.getValue());
    }
  }
}
}