// org.pageseeder.flint.lucene.util.Documents (Maven / Gradle / Ivy)
/*
* Copyright 2015 Allette Systems (Australia)
* http://www.allette.com.au
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.pageseeder.flint.lucene.util;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.pageseeder.flint.lucene.search.DocumentCounter;
import org.pageseeder.flint.lucene.search.Fields;
import org.pageseeder.xmlwriter.XMLWriter;
import org.pageseeder.xmlwriter.esc.XMLEscapeUTF8;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A collection of utility methods to manipulate documents.
 *
 * <p>All methods are static; the class cannot be instantiated.
 *
 * @author Christophe Lauret
 * @version 5 July 2011
 */
public final class Documents {

  /**
   * Field values whose length is equal to or greater than this are left out of
   * the XML produced by {@link #toXML(XMLWriter, Document)}.
   */
  private static final int MAX_FIELD_VALUE_LENGTH = 100;

  /**
   * Utility classes need no constructor.
   */
  private Documents() {
  }

  /**
   * Counts the number of documents matching the specified query.
   *
   * <p>The query is run against the searcher with a fresh {@link DocumentCounter}
   * and the counter's total is returned.
   *
   * @param searcher the index searcher to use.
   * @param query    the query.
   *
   * @return the number of documents matching the specified query.
   *
   * @throws IOException if thrown by the searcher.
   */
  public static int count(IndexSearcher searcher, Query query) throws IOException {
    DocumentCounter counter = new DocumentCounter();
    searcher.search(query, counter);
    return counter.getCount();
  }

  /**
   * Writes a Lucene document as XML.
   *
   * <p>The output is a <code>document</code> element containing one <code>field</code>
   * element per field, with the field name in the <code>name</code> attribute and the
   * field value as text. Fields whose value is <code>null</code> or is
   * {@value #MAX_FIELD_VALUE_LENGTH} characters or longer are skipped entirely
   * (they are not truncated).
   *
   * @param xml The XML writer.
   * @param doc Lucene document to serialise as XML.
   *
   * @throws IOException Any I/O error thrown by the XML writer.
   */
  @Beta
  public static void toXML(XMLWriter xml, Document doc) throws IOException {
    xml.openElement("document", true);
    // display the value of each field
    for (IndexableField f : doc.getFields()) {
      String value = Fields.toString(f);
      // TODO: date formatting
      // Unnecessary to return the full value of long fields: they are omitted
      if (value != null && value.length() < MAX_FIELD_VALUE_LENGTH) {
        xml.openElement("field");
        xml.attribute("name", f.name());
        xml.writeText(value);
        xml.closeElement();
      }
    }
    // close 'document'
    xml.closeElement();
  }

  /**
   * Returns the extract from the text for the given term and with the maximum specified length.
   *
   * <p>The term is matched case-insensitively and literally (it is never interpreted as a
   * regular expression). It must be preceded by a non-word character or the start of the text,
   * and followed by a non-word character or the end of the text. Only the first match is used.
   *
   * <p>This method will include "..." whenever the text was cut (at the beginning or the end).
   * The length budget is applied to the raw text; the ellipses and the XML escaping of each
   * piece are added on top of it.
   *
   * @param text   the text to search
   * @param term   the term to find
   * @param length The length of the extract
   *
   * @return the extract or <code>null</code> if the text is <code>null</code> or
   *         the term could not be found.
   *
   * @throws IllegalArgumentException If the length of the term is larger than the length of the extract.
   * @throws NullPointerException     If the term is <code>null</code> while the text is not.
   */
  @Beta
  public static String extract(String text, String term, int length) throws IllegalArgumentException {
    if (text == null) {
      return null;
    }
    if (term.length() > length) {
      throw new IllegalArgumentException("Term length ("+term.length()+") is larger than extract length ("+length+")");
    }
    // number of characters of context available around the term
    final int len = length - term.length();
    // Pattern.quote copes with terms that contain "\E", which a hand-written \Q...\E does not
    Pattern p = Pattern.compile("(?:\\W|^)(" + Pattern.quote(term) + ")(?:\\W|$)", Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(text);
    if (!m.find()) {
      return null;
    }

    final int start = m.start(1);
    final int end = m.end(1);
    final int half = len / 2;
    // number of characters following the matched term
    final int tail = text.length() - end;
    final String found = m.group(1);
    StringBuilder extract = new StringBuilder();

    if (length >= text.length()) {
      // the entire string can be used (including when it is exactly as long as the extract)
      extract.append(asXML(text.substring(0, start)));
      appendTerm(extract, found);
      extract.append(asXML(text.substring(end)));

    } else if (start < half) {
      // term is close to the beginning: keep the whole prefix, spend what is left on the suffix
      extract.append(asXML(text.substring(0, start)));
      appendTerm(extract, found);
      if (tail < len - start) {
        extract.append(asXML(text.substring(end)));
      } else {
        extract.append(asXML(text.substring(end, end + len - start - 1))).append("...");
      }

    } else if (tail < half) {
      // term is close to the end: keep the whole suffix.
      // Here start >= half > tail, so the prefix is always cut.
      // NOTE(review): the prefix is limited to as many characters as follow the term,
      // which leaves part of the budget unused - confirm whether this is intended.
      extract.append("...").append(asXML(text.substring(start - tail, start)));
      appendTerm(extract, found);
      extract.append(asXML(text.substring(end)));

    } else {
      // term is in the middle: half of the budget on each side
      extract.append("...").append(asXML(text.substring(start - half, start)));
      appendTerm(extract, found);
      extract.append(asXML(text.substring(end, end + half))).append("...");
    }
    return extract.toString();
  }

  /**
   * Appends the matched term, escaped for XML, to the extract being built.
   *
   * <p>NOTE(review): the empty prefix and the single-space suffix look like placeholders
   * where highlighting markup may once have been - confirm before changing them.
   *
   * @param extract the extract being built
   * @param found   the term exactly as it was matched in the text
   */
  private static void appendTerm(StringBuilder extract, String found) {
    extract.append("").append(asXML(found)).append(" ");
  }

  /**
   * Returns the text as a safe XML text.
   *
   * @param text The text to escape for XML.
   * @return the text as a safe XML text.
   */
  private static String asXML(String text) {
    return XMLEscapeUTF8.UTF8_ESCAPE.toElementText(text);
  }

}