
org.apache.jackrabbit.oak.query.fulltext.SimpleExcerptProvider Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jackrabbit.oak.query.fulltext;
import java.util.BitSet;
import java.util.HashSet;
import java.util.Set;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableSet;
import org.apache.jackrabbit.oak.api.PropertyState;
import org.apache.jackrabbit.oak.api.PropertyValue;
import org.apache.jackrabbit.oak.api.Tree;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.commons.PathUtils;
import org.apache.jackrabbit.oak.query.Query;
import org.apache.jackrabbit.oak.query.QueryImpl;
import org.apache.jackrabbit.oak.query.ast.AndImpl;
import org.apache.jackrabbit.oak.query.ast.ConstraintImpl;
import org.apache.jackrabbit.oak.query.ast.FullTextSearchImpl;
import org.apache.jackrabbit.oak.query.ast.LiteralImpl;
import org.apache.jackrabbit.oak.query.ast.OrImpl;
import org.apache.jackrabbit.oak.spi.query.PropertyValues;
import static org.apache.jackrabbit.util.Text.encodeIllegalXMLCharacters;
/**
* This class can extract excerpts from node.
*/
public class SimpleExcerptProvider {
public static final String REP_EXCERPT_FN = "rep:excerpt(.)";
public static final String EXCERPT_END = "
";
private static int maxFragmentSize = 150;
public static String getExcerpt(String path, String columnName,
Query query, boolean highlight) {
if (path == null) {
return null;
}
Tree t = query.getTree(path);
if (t == null || !t.exists()) {
return null;
}
columnName = extractExcerptProperty(columnName);
if (columnName != null && columnName.contains("/")) {
for (String p : PathUtils.elements(PathUtils
.getParentPath(columnName))) {
if (t.hasChild(p)) {
t = t.getChild(p);
} else {
return null;
}
}
columnName = PathUtils.getName(columnName);
}
StringBuilder text = new StringBuilder();
String separator = "";
for (PropertyState p : t.getProperties()) {
if (p.getType().tag() == Type.STRING.tag()
&& (columnName == null || columnName.equalsIgnoreCase(p
.getName()))) {
text.append(separator);
separator = " ";
for (String v : p.getValue(Type.STRINGS)) {
text.append(v);
}
}
}
Set searchToken = extractFulltext(query);
if (highlight && searchToken != null) {
return highlight(text, searchToken);
}
return noHighlight(text);
}
private static String extractExcerptProperty(String column) {
// most frequent case first
if (REP_EXCERPT_FN.equalsIgnoreCase(column)) {
return null;
}
return column.substring(column.indexOf("(") + 1, column.indexOf(")"));
}
private static Set extractFulltext(Query q) {
// TODO instanceof should not be used
if (q instanceof QueryImpl) {
return extractFulltext(((QueryImpl) q).getConstraint());
}
return ImmutableSet.of();
}
private static Set extractFulltext(ConstraintImpl c) {
Set tokens = new HashSet();
// TODO instanceof should not be used,
// as it will break without us noticing if we extend the AST
if (c instanceof FullTextSearchImpl) {
FullTextSearchImpl f = (FullTextSearchImpl) c;
if (f.getFullTextSearchExpression() instanceof LiteralImpl) {
LiteralImpl l = (LiteralImpl) f.getFullTextSearchExpression();
tokens.add(l.getLiteralValue().getValue(Type.STRING));
}
}
if (c instanceof AndImpl) {
for (ConstraintImpl constraint : ((AndImpl) c).getConstraints()) {
tokens.addAll(extractFulltext(constraint));
}
}
if (c instanceof OrImpl) {
for (ConstraintImpl constraint : ((OrImpl) c).getConstraints()) {
tokens.addAll(extractFulltext(constraint));
}
}
return tokens;
}
private static Set tokenize(Set in) {
Set tokens = new HashSet();
for (String s : in) {
tokens.addAll(tokenize(s));
}
return tokens;
}
private static Set tokenize(String in) {
Set out = new HashSet();
StringBuilder token = new StringBuilder();
boolean quote = false;
for (int i = 0; i < in.length(); ) {
final int c = in.codePointAt(i);
int length = Character.charCount(c);
switch (c) {
case ' ':
if (quote) {
token.append(' ');
} else if (token.length() > 0) {
out.add(token.toString());
token = new StringBuilder();
}
break;
case '"':
case '\'':
if (quote) {
quote = false;
if (token.length() > 0) {
out.add(token.toString());
token = new StringBuilder();
}
} else {
quote = true;
}
break;
default:
token.append(new String(Character.toChars(c)));
}
i += length;
}
if (token.length() > 0) {
out.add(token.toString());
}
return out;
}
private static String noHighlight(StringBuilder text) {
if (text.length() > maxFragmentSize) {
int lastSpace = text.lastIndexOf(" ", maxFragmentSize);
if (lastSpace != -1) {
text.setLength(lastSpace);
} else {
text.setLength(maxFragmentSize);
}
text.append(" ...");
}
StringBuilder excerpt = new StringBuilder(" searchToken) {
Set tokens = tokenize(searchToken);
String escaped = encodeIllegalXMLCharacters(text.toString());
BitSet highlight = new BitSet();
for (String token : tokens) {
highlight(escaped, highlight, token);
}
StringBuilder excerpt = new StringBuilder(EXCERPT_BEGIN);
boolean strong = false;
for (int i = 0; i < escaped.length(); i++) {
if (highlight.get(i) && !strong) {
strong = true;
excerpt.append("");
} else if (!highlight.get(i) && strong) {
strong = false;
excerpt.append("");
}
excerpt.append(escaped.charAt(i));
}
if (strong) {
excerpt.append("");
}
excerpt.append(EXCERPT_END);
return excerpt.toString();
}
private static void highlight(String text, BitSet highlightBits, String token) {
boolean isLike = false;
if (token.endsWith("*")) {
if (token.length() == 1) {
// don't highlight the '*' character itself
return;
}
token = token.substring(0, token.length() - 1);
isLike = true;
}
int index = 0;
while (index < text.length()) {
index = text.indexOf(token, index);
if (index < 0) {
break;
}
int endIndex = index + token.length();
if (isLike) {
int nextSpace = text.indexOf(" ", endIndex);
if (nextSpace != -1) {
endIndex = nextSpace;
} else {
endIndex = text.length();
}
}
while (index < endIndex) {
highlightBits.set(index++);
}
}
}
public static PropertyValue getExcerpt(PropertyValue value) {
Splitter listSplitter = Splitter.on(',').trimResults().omitEmptyStrings();
StringBuilder excerpt = new StringBuilder(EXCERPT_BEGIN);
for (String v : listSplitter.splitToList(value.toString())) {
excerpt.append(v);
}
excerpt.append(EXCERPT_END);
return PropertyValues.newString(excerpt.toString());
}
}
");
excerpt.append(encodeIllegalXMLCharacters(text.toString()));
excerpt.append("
");
return excerpt.toString();
}
static String highlight(StringBuilder text, Set© 2015 - 2025 Weber Informatics LLC | Privacy Policy