All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.xwiki.xml.ExtractHandler Maven / Gradle / Ivy

/*
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package org.xwiki.xml;

import java.util.Stack;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Extracts a well-formed XML fragment by listening to SAX events. The result has the following semantic:
* {@code xmlInput.dropAllTags().substring(start, length).unDropAssociatedTags()} *

* So basically we would create an instance like {@code new ExtractHandler(0, 400)} in order to obtain an XML * fragment with its inner text length of at most 400 characters, starting at position (character) 0 in the source * (input) XML's inner text. The ExtractHandler is used in feed plug-in to obtain a preview of an XML (HTML, to be more * specific). Another use case could be to paginate an XML source (keeping pages well-formed). *

* As an example, the result of applying an {@code ExtractHandler(3, 13)} to: *

{@code
 * 

click here to view the result

* }
* is *
{@code
 * 

ck here to

* }
* * @version $Id: b00e780e1f82791ed4c29b805e13282902d16727 $ * @since 1.6M2 */ public class ExtractHandler extends DefaultHandler { /** * A simple utility bean for representing an XML tag. */ private static class XMLTag { /** * Tag's qualified name. */ private String qName; /** * Tag's attributes. */ private Attributes atts; /** * Constructs a new XML tag with the given qualified name and attributes. * * @param qName Tag's qualified name. * @param atts Tag's attributes. */ XMLTag(String qName, Attributes atts) { this.qName = qName; this.atts = atts; } /** * @return Tag's qualified name. */ public String getQName() { return this.qName; } /** * @return Tag's attributes. */ public Attributes getAtts() { return this.atts; } } /** * The number of characters, in text nodes, that have to be read before starting the extraction. */ private int lowerBound; /** * The maximum number of characters that may be read during the parsing process. */ private int upperBound; /** * The number of characters read so far. */ private int counter; /** * The stack of open tags; when the lower bound is reached all the tags in the stack must be opened; when the upper * bound is reached all the tags in the stack must be closed. */ private Stack openedTags = new Stack<>(); /** * The fragment that is extracted during the parsing process. */ private StringBuilder result = new StringBuilder(); /** * true if the extraction was successful. The parsing process throws an exception when the upper bound * is reached; this flag is useful to distinguish between this exception and the others. */ private boolean finished; /** * Creates a new instance. * * @param start The character index from where to start the extraction. * @param length The number of plain text characters to extract. * @throws SAXException if start is less than zero or length is less than or equal to zero. */ public ExtractHandler(int start, int length) throws SAXException { super(); if (start < 0) { throw new SAXException("Start must be greater than or equal to 0"); } if (length <= 0) { throw new SAXException("Length must be greater than 0"); } this.lowerBound = start; this.upperBound = this.lowerBound + length; } /** * @return The extracted text. */ public String getResult() { return this.result.toString(); } /** * @return true if the extraction process has succeeded; false if an exception occurred during the process. */ public boolean isFinished() { return this.finished; } /** * Append an open tag with the given specification to the result buffer. * * @param qName Tag's qualified name. * @param atts Tag's attributes. */ private void openTag(String qName, Attributes atts) { this.result.append('<').append(qName); for (int i = 0; i < atts.getLength(); i++) { this.result.append(' ').append(atts.getQName(i)).append("=\"").append(atts.getValue(i)).append('\"'); } this.result.append('>'); } /** * Open all pending tags. * * @see #openTag(String, Attributes) */ private void openTags() { for (XMLTag tag : this.openedTags) { openTag(tag.getQName(), tag.getAtts()); } } /** * Close all pending tags. * * @see #closeTag(String) */ private void closeTags() { while (!this.openedTags.isEmpty()) { closeTag(this.openedTags.pop().getQName()); } } /** * Append a closed tag with the given qualified name to the result buffer. * * @param qName Tag's qualified name. */ private void closeTag(String qName) { this.result.append("'); } /** * @return true if the start point has been passed but the length limit hasn't been reached. */ private boolean isExtracting() { return this.lowerBound <= this.counter && this.counter <= this.upperBound; } @Override public void startDocument() throws SAXException { super.startDocument(); this.counter = 0; this.openedTags.clear(); this.result.setLength(0); this.finished = false; } @Override public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { this.openedTags.push(new XMLTag(qName, atts)); if (isExtracting()) { openTag(qName, atts); } } @Override public void characters(char[] ch, int start, int length) throws SAXException { int offset = this.lowerBound - this.counter; if (offset > 0) { if (offset > length) { this.counter += length; return; } else { this.counter = this.lowerBound; openTags(); characters(ch, start + offset, length - offset); return; } } int remainingLength = this.upperBound - this.counter; if (remainingLength <= length) { String content = String.valueOf(ch, start, remainingLength); int spaceIndex = remainingLength; // If we're in the middle of a word, try to cut before it, so that we don't output half-words if (length > remainingLength && ch[start + remainingLength] != ' ') { spaceIndex = content.lastIndexOf(' '); } if (spaceIndex >= 0) { this.counter += spaceIndex; this.result.append(content.substring(0, spaceIndex)); } else { this.counter = this.upperBound; this.result.append(content); } endDocument(); throw new SAXException("Length limit reached"); } else { this.counter += length; this.result.append(ch, start, length); } } @Override public void endElement(String namespaceURI, String localName, String qName) throws SAXException { // We assume the XML fragment is well defined, and thus we shouldn't have a closed tag // without its pair open tag. So we don't test for empty stack or tag match. this.openedTags.pop(); if (isExtracting()) { closeTag(qName); } } @Override public void endDocument() throws SAXException { super.endDocument(); // Close open tags if (isExtracting()) { closeTags(); } // set finished flag to distinguish between "length limit reached" and other exceptions this.finished = true; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy