
nl.inl.util.XmlHighlighter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of blacklab-util Show documentation
Show all versions of blacklab-util Show documentation
Several utility functions used by BlackLab.
The newest version!
package nl.inl.util;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
/**
* Performs highlighting of the contents of XML elements that we found hits in.
*
* NOTE: this class is not threadsafe. Use a separate instance per thread.
*/
public class XmlHighlighter {
/**
* How to deal with non-well-formed snippets: by e.g. adding an open tag at the
* beginning for an unmatched closing tag, or by removing the unmatched closing
* tag.
*/
public enum UnbalancedTagsStrategy {
/**
* Balance the snippet by inserting a matching tag: an open tag at the very start
* of the content for an unmatched close tag, or a close tag at the very end of
* the content for an unmatched open tag.
*/
ADD_TAG,
/** Leave the unmatched tag out of the output instead of balancing it. */
REMOVE_TAG
}
/**
* The kinds of entries that can occur in the tag list that drives highlighting:
* tags already present in the content, highlight tags to insert, and entries
* that repair well-formedness (tags to insert or existing tags to drop).
*/
enum TagType {
EXISTING_TAG, // a tag that is already present in the original content; copied to the output
HIGHLIGHT_START, // position where a start-highlight tag should be inserted
HIGHLIGHT_END, // position where an end-highlight tag should be inserted
FIX_START, // insert start tag here to fix well-formedness
FIX_END, // insert end tag here to fix well-formedness
REMOVE_EXISTING_TAG // remove an unbalanced tag to fix well-formedness
}
/**
 * Helper class for highlighting: stores a span in the original content, be it a
 * place to insert a highlight tag, or an existing tag in the original XML.
 *
 * The natural ordering sorts by start position, then by end position, and
 * finally by {@link #objectNum}, so that sorting a tag list is deterministic.
 */
static class TagLocation implements Comparable<TagLocation> {
    /** Counter for assigning unique id to objectNum */
    private static long n = 0;

    /**
     * Hand out the next unique id.
     *
     * @return an id that has not been returned before
     */
    static synchronized long getNextUniqueId() {
        return n++;
    }

    /**
     * Whether this is an existing tag from the original content, a start highlight
     * tag to be added, or an end highlight tag to be added.
     */
    TagType type;

    /** Start position of tag in original content */
    final int start;

    /**
     * End position of tag in original content. NOTE: this only differs from start
     * if type == EXISTING_TAG. Highlight tags are not in the original content, so
     * there start always equals end.
     */
    final int end;

    /**
     * Start position of matching tag (the close to this open tag, or vice versa) in
     * original content. A negative value indicates that this tag was unmatched
     * (which might happen if we're highlighting snippets of a document).
     */
    int matchingTagStart;

    /**
     * Unique id for each tag; used as a tie-breaker so sorting is always the same,
     * and end tags always follow their start tags
     */
    public long objectNum;

    /**
     * For FIX_START/END tags, indicate the tag name to use when insert. For other
     * types, not used.
     */
    String name;

    /**
     * Create a location that is not (yet) linked to a matching tag.
     *
     * @param type what kind of entry this is
     * @param start start position in the original content
     * @param end end position in the original content
     */
    public TagLocation(TagType type, int start, int end) {
        this.type = type;
        this.start = start;
        this.end = end;
        matchingTagStart = -1; // unmatched tag (until we find its match)
        objectNum = getNextUniqueId();
    }

    /**
     * Order by start, then end, then objectNum.
     *
     * The comparisons use {@code Integer.compare}/{@code Long.compare} rather than
     * subtraction: the difference of two long ids does not necessarily fit in an
     * int, and a truncated difference could report the wrong sign.
     */
    @Override
    public int compareTo(TagLocation o) {
        if (this == o)
            return 0;
        int byStart = Integer.compare(start, o.start);
        if (byStart != 0)
            return byStart;
        int byEnd = Integer.compare(end, o.end);
        if (byEnd != 0)
            return byEnd;
        // use the objectNum as a tie breaker so sort is always the same,
        // and end tags always follow their start tags
        return Long.compare(objectNum, o.objectNum);
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + end;
        result = prime * result + matchingTagStart;
        result = prime * result + ((name == null) ? 0 : name.hashCode());
        result = prime * result + Long.hashCode(objectNum);
        result = prime * result + start;
        result = prime * result + ((type == null) ? 0 : type.hashCode());
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        TagLocation other = (TagLocation) obj;
        if (end != other.end)
            return false;
        if (matchingTagStart != other.matchingTagStart)
            return false;
        if (name == null) {
            if (other.name != null)
                return false;
        } else if (!name.equals(other.name))
            return false;
        if (objectNum != other.objectNum)
            return false;
        if (start != other.start)
            return false;
        return type == other.type;
    }

    /** Short debug representation: the type followed by the start position. */
    @Override
    public String toString() {
        return type + "@" + start;
    }
}
/**
 * The XML tag to add to the content to signal where highlighting should start.
 */
private static final String startHighlightTag = "<hl>";

/**
 * The XML tag to add to the content to signal where highlighting should end.
 */
private static final String endHighlightTag = "</hl>";

/** How deep are we inside highlighting tags? */
private int inHighlightTag;

/**
 * Where the highlighted content is built - therefore, this class is not
 * threadsafe!
 */
StringBuilder b;

/** Remove empty tags after highlighting? */
private boolean removeEmptyHlTags = true;

/**
 * How to fix well-formedness problems: either add extra tags at the start or
 * end of the content to rebalance it (ADD_TAG, the default), or remove the
 * unbalanced tags (REMOVE_TAG).
 */
private UnbalancedTagsStrategy unbalancedTagsStrategy = UnbalancedTagsStrategy.ADD_TAG;

/**
 * The outer (usually, only) highlight tag we're inside of, or null if we're not
 * highlighting.
 */
private TagLocation outerHighlightTag = null;
/**
 * Given XML content and a sorted list of existing tags and highlight tags to be
 * added, add the tags to the content so the well-formedness of the XML is not
 * affected.
 *
 * Also offers the option of cutting the text content to a number of characters.
 * Tags are never counted towards this limit and are always processed, also
 * after the cut. The limit applies to all text, including the text that follows
 * the last tag. If any text was dropped, "..." is appended to the result.
 *
 * @param xmlContent the XML content to highlight
 * @param tags the existing tags and highlight tags to add. This list must be
 *            sorted!
 * @param stopAfterChars after how many characters of text content to cut this
 *            fragment. -1 = no cutting.
 * @return the highlighted XML content, trimmed of leading/trailing whitespace.
 */
private String highlightInternal(String xmlContent, List<TagLocation> tags, int stopAfterChars) {
    if (stopAfterChars < 0)
        stopAfterChars = xmlContent.length(); // text can never exceed this, so nothing is cut
    int positionInContent = 0;
    b = new StringBuilder();
    inHighlightTag = 0;
    int visibleCharsAdded = 0;
    boolean addVisibleChars = true; // keep adding text content until we reach the preferred length
    boolean wasCut = false;
    for (TagLocation tag : tags) {
        if (tag.start < positionInContent) {
            System.out.println("ERROR IN HIGHLIGHTING");
            // NOTE: before, this used to happen very occasionally. Probably fixed now,
            // but just in case it's not, let's avoid a nasty exception.
            continue; // skip tag
        }
        if (addVisibleChars) {
            // Text between the previous tag and this one
            String visibleChars = xmlContent.substring(positionInContent, tag.start);
            if (visibleCharsAdded + visibleChars.length() >= stopAfterChars) {
                // This chunk reaches the limit: shorten it and stop adding text from here on.
                visibleChars = StringUtils.abbreviate(visibleChars, "", stopAfterChars - visibleCharsAdded);
                if (visibleChars.length() < tag.start - positionInContent)
                    wasCut = true;
                addVisibleChars = false;
            }
            b.append(visibleChars);
            visibleCharsAdded += visibleChars.length();
        } else if (positionInContent < tag.start) {
            // We're past the limit and are dropping the text before this tag.
            wasCut = true;
        }
        processTag(xmlContent, tag);
        positionInContent = tag.end;
    }

    // Text after the last tag (or all of the content if there were no tags). This is
    // subject to the same limit as the text between tags.
    String trailingText = xmlContent.substring(positionInContent);
    if (addVisibleChars) {
        if (visibleCharsAdded + trailingText.length() >= stopAfterChars) {
            String kept = StringUtils.abbreviate(trailingText, "", stopAfterChars - visibleCharsAdded);
            if (kept.length() < trailingText.length())
                wasCut = true;
            trailingText = kept;
        }
        b.append(trailingText);
    } else if (!trailingText.trim().isEmpty()) {
        // Limit already reached: drop the remaining text. Pure whitespace is not
        // reported as a cut, because the result is trimmed anyway.
        wasCut = true;
    }

    final String optionalEllipsis = wasCut ? "..." : "";
    return b.toString().trim() + optionalEllipsis;
}
/**
 * Decide what to do based on the tag type.
 *
 * Highlight entries open or close a highlight span; existing tags are copied
 * from the content; FIX_START/FIX_END entries emit a synthetic open/close tag
 * built from the entry's name; REMOVE_EXISTING_TAG entries emit nothing.
 *
 * @param xmlContent the content we're highlighting
 * @param tag the existing tag or highlight tag to add
 */
private void processTag(String xmlContent, TagLocation tag) {
    switch (tag.type) {
    case HIGHLIGHT_START:
        startHighlight(tag);
        break;
    case EXISTING_TAG:
        existingTag(tag, xmlContent.substring(tag.start, tag.end));
        break;
    case HIGHLIGHT_END:
        endHighlight();
        break;
    case FIX_START:
        existingTag(tag, "<" + tag.name + ">");
        break;
    case FIX_END:
        // Must be a close tag ("</name>") to balance the unclosed open tag
        existingTag(tag, "</" + tag.name + ">");
        break;
    case REMOVE_EXISTING_TAG:
        // Simply don't add the tag
        break;
    }
}
/**
 * Enter a highlight span. Only the outermost span writes a start tag to the
 * output and is remembered as the current outer highlight; nested spans just
 * increase the nesting depth.
 *
 * @param tag where the tag occurs
 */
private void startHighlight(TagLocation tag) {
    final boolean enteringOutermostSpan = (inHighlightTag == 0);
    inHighlightTag += 1;
    if (!enteringOutermostSpan) {
        return;
    }
    b.append(startHighlightTag);
    outerHighlightTag = tag;
}
/**
 * Leave a highlight span. When the nesting depth drops back to 0, the end tag
 * is written to the output and the outer highlight is forgotten.
 */
private void endHighlight() {
    inHighlightTag -= 1;
    if (inHighlightTag != 0) {
        return;
    }
    b.append(endHighlightTag);
    outerHighlightTag = null;
}
/**
 * Write a tag to the output. If we're inside a highlight span and the tag's
 * matching tag lies outside that span, the highlight is closed before the tag
 * and reopened right after it, so highlight tags and content tags never
 * overlap.
 *
 * @param tag where the tag occurs
 * @param str the text of the tag to write.
 */
private void existingTag(TagLocation tag, String str) {
    // The partner of this tag is outside the current (outer) highlight span if it
    // lies before the span's start, or at/after the span's end.
    final boolean partnerOutsideSpan = inHighlightTag > 0
            && (tag.matchingTagStart < outerHighlightTag.start
                    || tag.matchingTagStart >= outerHighlightTag.matchingTagStart);

    if (partnerOutsideSpan) {
        // Suspend highlighting around the tag to maintain well-formedness.
        b.append(endHighlightTag).append(str).append(startHighlightTag);
    } else {
        b.append(str);
    }
}
/**
 * Immutable pair of character positions describing where a hit starts and ends
 * in the content; used to decide where highlight tags are inserted.
 */
public static class HitCharSpan {
    private final int startChar;
    private final int endChar;

    /**
     * @param startChar character position where the hit starts
     * @param endChar character position where the hit ends
     */
    public HitCharSpan(int startChar, int endChar) {
        this.endChar = endChar;
        this.startChar = startChar;
    }

    /** @return character position where the hit ends */
    public int getEndChar() {
        return this.endChar;
    }

    /** @return character position where the hit starts */
    public int getStartChar() {
        return this.startChar;
    }
}
/**
 * Add a HIGHLIGHT_START and a HIGHLIGHT_END entry to the tag list for every hit
 * that lies within the content. The two entries are linked to each other
 * through matchingTagStart. Hits that start before the content or end after it
 * are skipped.
 *
 * @param tags the tag list to add the highlight entries to
 * @param hitSpans the hits, as character positions in the whole document
 * @param offset position of the content's first character in the document;
 *            subtracted from the hit positions
 * @param length length of the content, in characters
 */
private static void addHitPositionsToTagList(List<TagLocation> tags, List<HitCharSpan> hitSpans, int offset,
        int length) {
    for (HitCharSpan hit : hitSpans) {
        final int startPos = hit.getStartChar() - offset;
        if (startPos < 0)
            continue; // outside highlighting range, or non-highlighting element (e.g. searching for example date range)
        final int endPos = hit.getEndChar() - offset;
        if (endPos > length)
            continue; // outside highlighting range
        TagLocation start = new TagLocation(TagType.HIGHLIGHT_START, startPos, startPos);
        start.matchingTagStart = endPos;
        tags.add(start);
        TagLocation end = new TagLocation(TagType.HIGHLIGHT_END, endPos, endPos);
        end.matchingTagStart = startPos;
        tags.add(end);
    }
}
/**
 * Regex for finding all XML tags, except those starting with {@code <!} or
 * {@code <?}. Group 1 indicates if this is an open or close tag (it is "/" for
 * a close tag and empty otherwise); group 2 is the tag name.
 */
private static final Pattern XML_TAG_PATTERN = Pattern
        .compile("<(?![!?])\\s*(/?)\\s*([^>\\s]+)(\\s+[^>]*)?>");

/**
 * Given XML content, make a list of tag locations in this content.
 *
 * Note that the XML content is assumed to be (part of) a well-formed XML
 * document. This way we can highlight a whole document or part of a document.
 * It's therefore okay if we encounter close tags at the start that we haven't
 * seen an open tag for, or open tags at the end that we'll never see a close
 * tag for, but if there are other tag errors (e.g. hierarchy errors such as
 * {@code <i><b></i></b>}) the behaviour of the highlighter is undefined.
 *
 * Unbalanced tags are handled according to the current unbalanced tags
 * strategy: either the tag is marked REMOVE_EXISTING_TAG, or a FIX_START (at
 * position 0) or FIX_END (at the end of the content) entry is added for it.
 *
 * @param elementContent the XML content
 * @return the list of tag locations: an entry for every tag found, plus any
 *         FIX_START/FIX_END entries added. The list is not sorted yet.
 */
private List<TagLocation> makeTagList(String elementContent) {
    List<TagLocation> tags = new ArrayList<>();
    Matcher m = XML_TAG_PATTERN.matcher(elementContent);
    List<TagLocation> openTagStack = new ArrayList<>(); // keep track of open tags
    int fixStartTagObjectNum = -1; // when adding start tags to fix well-formedness, number backwards (for correct sorting)
    while (m.find()) {
        TagLocation tagLocation = new TagLocation(TagType.EXISTING_TAG, m.start(), m.end());

        // Keep track of open tags, so we know if the tags are matched
        boolean isOpenTag = m.group(1).length() == 0;
        boolean isSelfClosing = isOpenTag && isSelfClosing(m.group());
        if (isOpenTag) {
            if (!isSelfClosing) {
                // Open tag. Add to the stack.
                openTagStack.add(tagLocation);
                tagLocation.name = m.group(2); // remember in case there's no close tag
            } else {
                // Self-closing tag. Don't add to stack, link to self
                tagLocation.matchingTagStart = tagLocation.start;
            }
        } else {
            // Close tag. Did we encounter a matching open tag?
            TagLocation openTag = null;
            if (!openTagStack.isEmpty()) {
                // Yes, this tag is matched. Find matching tag and link them.
                openTag = openTagStack.remove(openTagStack.size() - 1);
                openTag.name = null; // no longer necessary to remember tag name
            } else {
                // Unmatched closing tag.
                if (unbalancedTagsStrategy == UnbalancedTagsStrategy.REMOVE_TAG) {
                    // Remove it.
                    tagLocation.type = TagType.REMOVE_EXISTING_TAG;
                } else {
                    // Insert a dummy open tag at the start
                    // of the content to maintain well-formedness
                    openTag = new TagLocation(TagType.FIX_START, 0, 0);
                    openTag.name = m.group(2); // we need to know what tag to insert
                    openTag.objectNum = fixStartTagObjectNum; // to fix sorting
                    fixStartTagObjectNum--;
                    tags.add(openTag);
                }
            }
            if (openTag != null) {
                // Link the matching tags together
                openTag.matchingTagStart = tagLocation.start;
                tagLocation.matchingTagStart = openTag.start;
            }
        }

        // Add tag to the tag list
        tags.add(tagLocation);
    }

    // Close any tags still open, in the correct order (for well-formedness):
    // the most recently opened tag is handled first.
    for (int i = openTagStack.size() - 1; i >= 0; i--) {
        TagLocation unclosedTag = openTagStack.get(i);
        if (unbalancedTagsStrategy == UnbalancedTagsStrategy.REMOVE_TAG) {
            // Remove the unbalanced tag
            unclosedTag.type = TagType.REMOVE_EXISTING_TAG;
        } else {
            // Add a close tag at the end to fix the unbalanced tag
            TagLocation tagLocation = new TagLocation(TagType.FIX_END, elementContent.length(),
                    elementContent.length());
            tagLocation.name = unclosedTag.name; // we remembered this for this case
            tags.add(tagLocation);
        }
    }
    return tags;
}
/**
 * Tells whether the text of a tag ends in "/>", possibly with spaces, tabs or
 * line breaks between the slash and the closing '>'.
 *
 * @param tag the tag
 * @return true iff it is self-closing
 */
private static boolean isSelfClosing(String tag) {
    // Walk backwards from the character just before the final '>'.
    int pos = tag.length() - 2;
    while (pos >= 0) {
        final char c = tag.charAt(pos);
        if (c == '/') {
            // Slash right before the '>' (ignoring whitespace): self-closing
            return true;
        }
        final boolean isWhitespace = c == ' ' || c == '\t' || c == '\n' || c == '\r';
        if (!isWhitespace) {
            // Part of an attribute or of the tag name comes first: not self-closing
            return false;
        }
        pos--;
    }
    return false;
}
/**
 * Highlight a string containing XML tags. The result is still well-formed XML.
 *
 * Equivalent to calling {@link #highlight(String, List, int)} with an offset of 0.
 *
 * @param elementContent the string to highlight
 * @param hits where the highlighting tags should go, or null for no highlighting
 * @return the highlighted string
 */
public String highlight(String elementContent, List<HitCharSpan> hits) {
    return highlight(elementContent, hits, 0);
}
/**
 * Matches a start highlight tag directly followed by an end highlight tag, with
 * at most whitespace in between (captured in group 1). The tags are quoted so
 * they are always matched literally.
 */
private static final Pattern EMPTY_HIGHLIGHT_PATTERN = Pattern
        .compile(Pattern.quote(startHighlightTag) + "(\\s*)" + Pattern.quote(endHighlightTag));

/**
 * Highlight part of an XML document.
 *
 * You cut the XML yourself and supply the part you wish to highlight, along
 * with the offset of where you cut (so we know where the highlight tags should
 * go).
 *
 * Missing tags at the beginning or end of the part will be corrected. As long
 * as you cut at tag boundaries (i.e. not within a tag), the result of this
 * method will still be well-formed XML.
 *
 * @param partialContent the (partial) XML to cut and highlight.
 * @param hits the hits to use for highlighting, or null for no highlighting
 * @param offset position of the first character in the string (i.e. what to
 *            subtract from Hit positions to highlight)
 * @return the highlighted (part of the) XML string
 */
public String highlight(String partialContent, List<HitCharSpan> hits, int offset) {
    // Find all tags in the content and put their positions in a list
    List<TagLocation> tags = makeTagList(partialContent);

    // Put the positions of our hits in the same list and sort it
    if (hits != null)
        addHitPositionsToTagList(tags, hits, offset, partialContent.length());
    tags.sort(Comparator.naturalOrder());

    // Add all the highlight tags in the list into the content,
    // taking care to maintain well-formedness around existing tags
    String highlighted = highlightInternal(partialContent, tags, -1);

    if (removeEmptyHlTags) {
        // Because of the way the highlighting (and maintaining of well-formedness) occurs,
        // empty highlight tags may have arisen. Remove these, keeping the whitespace
        // that was in between.
        highlighted = EMPTY_HIGHLIGHT_PATTERN.matcher(highlighted).replaceAll("$1");
    }
    return highlighted;
}
/**
 * Cut a string after a specified number of non-tag characters, keeping all
 * tags after the cut intact. The result is still well-formed XML.
 *
 * You might use this to show the first few lines of an XML document on the
 * results page.
 *
 * @param elementContent the string to cut
 * @param stopAfterChars after how many non-tag characters we should stop (-1
 *            for no limit)
 * @return the cut string
 */
public String cutAroundTags(String elementContent, int stopAfterChars) {
    // Find all tags in the content and put their positions in a list
    List<TagLocation> tags = makeTagList(elementContent);
    tags.sort(Comparator.naturalOrder());

    // Write out the content with all its tags, dropping text beyond the limit
    // while maintaining well-formedness. No highlight tags are involved here.
    return highlightInternal(elementContent, tags, stopAfterChars);
}
/**
 * Small demonstration: highlights two hits in a sample XML fragment and prints
 * the result to standard output.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    XmlHighlighter h = new XmlHighlighter();
    String xml = "<s><w>The</w> <w>quick</w> " +
            "<w>brown</w> <w>fox</w></s>";
    List<HitCharSpan> hitSpans = new ArrayList<>();
    // Hit positions must lie within the content, or the hit is skipped.
    hitSpans.add(new HitCharSpan(17, 22)); // the word "quick", inside a single element
    hitSpans.add(new HitCharSpan(30, 46)); // "brown" up to and including "fox": crosses element boundaries
    String result = h.highlight(xml, hitSpans, 0);
    System.out.println(result);
}
/**
 * Choose whether highlight tags that enclose nothing but whitespace are
 * stripped from the result of highlighting. Such tags can be a by-product of
 * the highlighting process.
 *
 * @param c true iff empty hl tags should be removed
 */
public void setRemoveEmptyHlTags(boolean c) {
    this.removeEmptyHlTags = c;
}
/**
 * Make a cut XML fragment well-formed.
 *
 * The only requirement is that tags are intact (i.e. xmlFragment doesn't start
 * with "able cellpadding='3'>" or end with "&lt;/bod").
 *
 * Unbalanced tags are fixed according to the current unbalanced tags strategy.
 * It is therefore not a generic way of making any non-well-formed document
 * well-formed, it just works for cutting out part of a well-formed document.
 *
 * @param xmlFragment the fragment that was cut out of a document
 * @return a well-formed fragment
 */
public String makeWellFormed(String xmlFragment) {
    // Highlighting without any hits only repairs the tag balance.
    final String wellFormed = highlight(xmlFragment, null, 0);
    return wellFormed;
}
/**
 * Tell how unbalanced tags are currently dealt with.
 *
 * @return the strategy we're using now.
 */
public UnbalancedTagsStrategy getUnbalancedTagsStrategy() {
    return this.unbalancedTagsStrategy;
}
/**
 * Choose how unbalanced tags are dealt with from now on.
 *
 * @param strategy what to do when encountering unbalanced tags.
 */
public void setUnbalancedTagsStrategy(UnbalancedTagsStrategy strategy) {
    unbalancedTagsStrategy = strategy;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy