// net.sf.jasperreports.search.LuceneUtil Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of jasperreports Show documentation
// Show all versions of jasperreports Show documentation
// Free Java Reporting Library
/*
* JasperReports - Free Java Reporting Library.
* Copyright (C) 2001 - 2019 TIBCO Software Inc. All rights reserved.
* http://www.jaspersoft.com
*
* Unless you have purchased a commercial license agreement from Jaspersoft,
* the following license terms apply:
*
* This program is part of JasperReports.
*
* JasperReports is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* JasperReports is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with JasperReports. If not, see <http://www.gnu.org/licenses/>.
*/
package net.sf.jasperreports.search;
import net.sf.jasperreports.engine.*;
import net.sf.jasperreports.engine.util.JRStyledText;
import net.sf.jasperreports.engine.util.JRStyledTextUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.*;
import org.apache.lucene.search.spans.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.io.StringReader;
import java.util.*;
import java.util.Map.Entry;
/**
* @author Narcis Marcu ([email protected])
*/
public class LuceneUtil {
private static final Log log = LogFactory.getLog(LuceneUtil.class);
private static final String CONTENT_FIELD = "content";
private JRStyledTextAttributeSelector noneSelector;
private JRStyledTextUtil styledTextUtil;
private Analyzer analyzer;
private IndexWriter writer;
private FieldType fieldType;
private boolean isCaseSensitive;
private boolean isWholeWordsOnly;
private boolean removeAccents;
public LuceneUtil(JasperReportsContext jasperReportsContext, boolean isCaseSensitive, boolean isWholeWordsOnly, boolean removeAccents) {
this.isCaseSensitive = isCaseSensitive;
this.isWholeWordsOnly = isWholeWordsOnly;
this.removeAccents = removeAccents;
this.noneSelector = JRStyledTextAttributeSelector.getNoneSelector(jasperReportsContext);
this.styledTextUtil = JRStyledTextUtil.getInstance(jasperReportsContext);
fieldType = new FieldType();
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
fieldType.setTokenized(true);
fieldType.setStored(true);
fieldType.setStoreTermVectors(true);
fieldType.setStoreTermVectorPositions(true);
fieldType.setStoreTermVectorOffsets(true);
fieldType.freeze();
}
public SpansInfo getSpansInfo(JasperPrint jasperPrint, String queryString) throws IOException, JRException {
Long start = System.currentTimeMillis();
Directory dir = createLuceneDirectory(jasperPrint);
if (log.isDebugEnabled()) {
log.debug("original query: [" + queryString + "]");
}
IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = new IndexSearcher(reader);
List queryTerms = getQueryTerms(queryString);
SpanQuery query = buildQuery(queryTerms);
if (log.isDebugEnabled()) {
log.debug("lucene query: [" + query.toString() + "]");
}
TopDocs results = searcher.search(query, Integer.MAX_VALUE);
ScoreDoc[] hits = results.scoreDocs;
Map> hitTermsMap = new LinkedHashMap>();
for (int i = 0; i < hits.length; i++) {
getHitTerms(query, searcher, hits[i].doc, hitTermsMap);
}
Map termContexts = new HashMap();
// get the info for each matched term from the document's termVector
Map> hitTermsInfoMap = new HashMap>();
for (Entry> entry: hitTermsMap.entrySet()) {
List terms = entry.getValue();
Terms termVector = reader.getTermVector(entry.getKey(), CONTENT_FIELD);
PostingsEnum docsAndPositions;
for (Term term: terms) {
termContexts.put(term, TermContext.build(reader.getContext(), term));
TermsEnum iterator = termVector.iterator();
BytesRef termBytesRef = new BytesRef(term.text());
if (iterator.seekExact(termBytesRef)) {
docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
docsAndPositions.nextDoc();
for (int i = 0, freq = docsAndPositions.freq(); i < freq; ++i) {
if (hitTermsInfoMap.get(entry.getKey()) == null) {
hitTermsInfoMap.put(entry.getKey(), new ArrayList());
}
hitTermsInfoMap.get(entry.getKey()).add(new HitTermInfo(docsAndPositions.nextPosition(), docsAndPositions.startOffset(), docsAndPositions.endOffset(), termBytesRef.utf8ToString()));
}
}
}
}
// get the spans for the matched terms
SpanQuery rewrittenQuery = (SpanQuery)query.rewrite(reader);
LuceneSpansInfo spansInfo = new LuceneSpansInfo(queryTerms.size());
for (LeafReaderContext context : reader.leaves())
{
SpanWeight spanWeight = rewrittenQuery.createWeight(searcher, false, 1);
Spans spans = spanWeight.getSpans(context, SpanWeight.Postings.POSITIONS);
if (spans != null) {
int nextDoc;
while ((nextDoc = spans.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
int docIndex = nextDoc + context.docBase;
Document doc = searcher.doc(docIndex);
String uid = doc.get("uid");
List hitTermsInfo = hitTermsInfoMap.get(docIndex);
for (int i = spans.nextStartPosition(); i < spans.endPosition(); i++) {
for (HitTermInfo ti : hitTermsInfo) {
if (ti.getPosition() == i) {
if (log.isDebugEnabled()) {
log.debug(String.format("term: %s@%d [%d, %d] - uid: %s, pageNo: %s", ti.getValue(), ti.getPosition(), ti.getStart(), ti.getEnd(), uid, doc.get("pageNo")));
}
ti.setPageNo(doc.get("pageNo"));
spansInfo.addTermInfo(uid, ti);
}
}
}
}
}
}
reader.close();
if (log.isDebugEnabled()) {
log.debug("search took: " + (System.currentTimeMillis() - start) + " ms");
}
return spansInfo;
}
protected Directory createLuceneDirectory(JasperPrint jasperPrint) throws IOException, JRException {
Long start = System.currentTimeMillis();
Directory dir = new RAMDirectory();
Analyzer analyzer = getConfiguredAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
iwc.setOpenMode(OpenMode.CREATE);
writer = new IndexWriter(dir, iwc);
List pages = jasperPrint.getPages();
if (pages != null && pages.size() > 0) {
if (log.isDebugEnabled()) {
log.debug("there are " + pages.size() + " pages to be indexed");
}
for (int i = 0, ps = pages.size(); i < ps; i++) {
if (log.isDebugEnabled()) {
log.debug("indexing page: " + i);
}
indexPage(pages.get(i), i);
}
}
writer.close();
if (log.isDebugEnabled()) {
log.debug("index creation took: " + (System.currentTimeMillis() - start) + " ms");
}
return dir;
}
protected void indexPage(JRPrintPage page, int pageNo) throws IOException {
List elements = page.getElements();
if (page.getElements().size() > 0) {
indexElements(pageNo, elements);
}
}
protected void indexElements(int pageNo, List elements) throws IOException {
for (int i = 0; i < elements.size(); i++) {
Object element = elements.get(i);
if (element instanceof JRPrintText) {
addContentField(pageNo, (JRPrintText) element);
} else if (element instanceof JRPrintFrame) {
JRPrintFrame frame = (JRPrintFrame) element;
indexElements(pageNo, frame.getElements());
}
}
}
protected void addContentField(int pageNo, JRPrintText element) throws IOException {
String allText;
JRStyledText styledText = getStyledText(element);
if (styledText == null) {
allText = "";
} else {
allText = styledText.getText();
}
if (allText != null && allText.length() > 0) {
Field tf = new Field(CONTENT_FIELD, allText, fieldType);
Document doc = new Document();
doc.add(new StoredField("pageNo", pageNo));
PrintElementId peid = PrintElementId.forElement(element);
doc.add(new StringField("uid", peid.toString(), Field.Store.YES));
if (log.isDebugEnabled()) {
log.debug(displayTokens(allText, peid.toString()));
}
doc.add(tf);
writer.addDocument(doc);
}
}
protected Analyzer getConfiguredAnalyzer() {
if (analyzer == null) {
analyzer = new LuceneSimpleAnalyzer(isCaseSensitive, removeAccents);
}
return analyzer;
}
protected JRStyledText getStyledText(JRPrintText textElement)
{
return styledTextUtil.getStyledText(textElement, noneSelector);
}
protected SpanQuery buildQuery(List queryTerms) {
List clauses = new ArrayList();
for (int i = 0, ln = queryTerms.size(); i < ln; i++) {
String term = queryTerms.get(i);
if (isWholeWordsOnly) {
clauses.add(new SpanTermQuery(new Term(CONTENT_FIELD, term)));
} else {
if (i == 0) {
term = "*" + term;
}
if (i == ln - 1) {
term = term + "*";
}
clauses.add(new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term(CONTENT_FIELD, term))));
}
}
if (clauses.size() > 1) {
// create a spanQuery with no distance between terms; the terms' order matters
// SpanNearQuery requires at least 2 clauses
return new SpanNearQuery(clauses.toArray(new SpanQuery[]{}), 0, true);
} else if (clauses.size() == 1) {
return clauses.get(0);
}
return null;
}
protected List getQueryTerms(String queryString) throws IOException {
List queryTerms = new ArrayList();
Analyzer analyzer = getConfiguredAnalyzer();
TokenStream tokenStream = analyzer.tokenStream(null, queryString);
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
queryTerms.add(charTermAttribute.toString());
}
return queryTerms;
}
protected String displayTokens(String text, String elementId) throws IOException {
Analyzer analyzer = new LuceneSimpleAnalyzer(isCaseSensitive, removeAccents);;
StringBuilder sb = new StringBuilder();
sb.append(elementId).append(": ").append(text).append(": ");
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
int startOffset = offsetAttribute.startOffset();
int endOffset = offsetAttribute.endOffset();
String term = charTermAttribute.toString();
sb.append("[" + term + "](" + startOffset + "," + endOffset + ") ");
}
return sb.toString();
}
@SuppressWarnings("rawtypes")
protected void getHitTerms(Query query, IndexSearcher searcher, int docId, Map> hitTerms) throws IOException {
if (query instanceof SpanTermQuery) {
if (searcher.explain(query, docId).isMatch() == true) {
if (!hitTerms.containsKey(docId)) {
hitTerms.put(docId, new ArrayList());
}
hitTerms.get(docId).add(((SpanTermQuery) query).getTerm());
}
return;
}
if (query instanceof MultiTermQuery) {
if (!(query instanceof FuzzyQuery)) { // FuzzQuery doesn't support SetRewriteMethod
((MultiTermQuery) query).setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_REWRITE);
}
getHitTerms(query.rewrite(searcher.getIndexReader()), searcher, docId, hitTerms);
return;
}
if (query instanceof SpanNearQuery) {
for (SpanQuery bc : ((SpanNearQuery) query).getClauses()) {
getHitTerms(bc, searcher, docId, hitTerms);
}
return;
}
if (query instanceof SpanOrQuery) {
for (SpanQuery bc : ((SpanOrQuery) query).getClauses()) {
getHitTerms(bc, searcher, docId, hitTerms);
}
return;
}
if (query instanceof SpanMultiTermQueryWrapper) {
((SpanMultiTermQueryWrapper) query).setRewriteMethod(SpanMultiTermQueryWrapper.SCORING_SPAN_QUERY_REWRITE);
getHitTerms(query.rewrite(searcher.getIndexReader()), searcher, docId, hitTerms);
return;
}
}
}