gate.html.NekoHtmlDocumentHandler Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gate-core Show documentation
Show all versions of gate-core Show documentation
GATE - general architecture for text engineering - is open source
software capable of solving almost any text processing problem. This
artifact enables you to embed the core GATE Embedded with its essential
dependencies. You will be able to use the GATE Embedded API and load and
store GATE XML documents. This artifact is the perfect dependency for
CREOLE plugins or for applications that need to customize the GATE
dependencies due to conflict with their own dependencies or for lower
footprint.
The newest version!
/*
* NekoHtmlDocumentHandler.java
*
* Copyright (c) 2006, The University of Sheffield.
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Ian Roberts, 17/Dec/2006
*
* $Id: NekoHtmlDocumentHandler.java 17597 2014-03-08 15:19:43Z markagreenwood $
*/
package gate.html;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.GateConstants;
import gate.corpora.DocumentContentImpl;
import gate.corpora.RepositioningInfo;
import gate.event.StatusListener;
import gate.util.Err;
import gate.util.InvalidOffsetException;
import gate.util.Out;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.NamespaceContext;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLResourceIdentifier;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLDocumentSource;
import org.apache.xerces.xni.parser.XMLParseException;
import org.cyberneko.html.HTMLEventInfo;
/**
* The XNI document handler used with NekoHTML to parse HTML documents.
* We use XNI rather than SAX as XNI can distinguish between empty
* elements (<element/>) and elements with an empty span
* (<element></element>), whereas SAX just treats both cases
* the same.
*/
public class NekoHtmlDocumentHandler
implements
org.apache.xerces.xni.XMLDocumentHandler,
org.apache.xerces.xni.parser.XMLErrorHandler {
// Master debug switch; all the category-specific flags below follow it.
private static final boolean DEBUG = false;
// Per-category debug flags, kept separate so individual areas could be
// toggled independently if required.
private static final boolean DEBUG_GENERAL = DEBUG;
private static final boolean DEBUG_ELEMENTS = DEBUG;
private static final boolean DEBUG_CHARACTERS = DEBUG;
private static final boolean DEBUG_UNUSED = DEBUG;
// NekoHTML feature URI under which the parser exposes per-event
// augmentations (begin/end line and column numbers) — used by
// characters() to compute repositioning offsets.
public static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
/**
* Constructor initialises all the private memeber data
*
* @param aDocument The gate document that will be processed
* @param anAnnotationSet The annotation set that will contain
* annotations resulted from the processing of the gate
* document
* @param ignorableTags HTML tag names (lower case) whose text content
* should be ignored by this handler.
*/
public NekoHtmlDocumentHandler(gate.Document aDocument,
gate.AnnotationSet anAnnotationSet, Set ignorableTags) {
if(ignorableTags == null) {
ignorableTags = new HashSet();
}
if(DEBUG_GENERAL) {
Out.println("Created NekoHtmlDocumentHandler. ignorableTags = "
+ ignorableTags);
}
// init stack
stack = new java.util.Stack();
// this string contains the plain text (the text without markup)
tmpDocContent = new StringBuilder(aDocument.getContent().size().intValue());
// colector is used later to transform all custom objects into
// annotation objects
colector = new LinkedList();
// the Gate document
doc = aDocument;
// init an annotation set for this gate document
basicAS = anAnnotationSet;
// first annotation ID to use
customObjectsId = 0;
this.ignorableTags = ignorableTags;
if ( Gate.getUserConfig().get(
GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME)!= null) {
addSpaceOnUnpack =
Gate.getUserConfig().getBoolean(
GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME
).booleanValue();
}
}// HtmlDocumentHandler
/**
 * Set the array of line offsets. This array holds the starting
 * character offset in the document of the beginning of each line of
 * text, to allow us to convert the NekoHTML location information
 * (line and column number) into offsets from the beginning of the
 * document for repositioning info.
 *
 * @param lineOffsets offset of the first character of each line,
 *          indexed by zero-based line number. The array is stored
 *          without copying, so the caller should not modify it after
 *          parsing starts.
 */
public void setLineOffsets(int[] lineOffsets) {
  this.lineOffsets = lineOffsets;
}
/**
 * Called when the parser encounters the start of an HTML element.
 * Empty elements also trigger this method, followed immediately by an
 * {@link #endElement}.
 *
 * @param element the element that is starting
 * @param attributes the element's attributes, copied into the
 *          annotation's feature map
 * @param augs parser augmentations (not used here)
 */
@Override
public void startElement(QName element, XMLAttributes attributes,
        Augmentations augs) throws XNIException {
  // deal with any outstanding character content first
  charactersAction();
  if(DEBUG_ELEMENTS) {
    Out.println("startElement: " + element.localpart);
  }
  // fire the status listener every ELEMENTS_RATE processed elements
  if(0 == (++elements % ELEMENTS_RATE))
    fireStatusChangedEvent("Processed elements : " + elements);
  // entering an ignorable tag: track nesting depth so charactersAction
  // knows to skip text until the outermost one closes again
  if(ignorableTags.contains(element.localpart)) {
    ignorableTagLevels++;
    if(DEBUG_ELEMENTS) {
      Out.println(" ignorable tag: levels = " + ignorableTagLevels);
    }
  }
  // copy all the element's attributes into a feature map
  FeatureMap fm = Factory.newFeatureMap();
  for(int i = 0; i < attributes.getLength(); i++) {
    if(DEBUG_ELEMENTS) {
      Out.println(" attribute: " + attributes.getLocalName(i) + " = "
          + attributes.getValue(i));
    }
    fm.put(attributes.getLocalName(i), attributes.getValue(i));
  }
  // add newlines/spaces appropriate to this tag so the extracted plain
  // text remains readable (helper defined elsewhere in this class)
  customizeAppearanceOfDocumentWithStartTag(element.localpart);
  // record the start offset of the annotation; the end offset starts
  // out equal and is pushed forward as character content arrives.
  // Long.valueOf replaces the deprecated Long(long) constructor and
  // benefits from the boxed-value cache.
  Long startIndex = Long.valueOf(tmpDocContent.length());
  CustomObject obj = new CustomObject(element.localpart, fm, startIndex,
      startIndex);
  // push onto the stack of currently-open elements
  stack.push(obj);
}
/**
 * Called when the parser encounters character or CDATA content.
 * Characters may be reported in more than one chunk, so we gather all
 * contiguous chunks together and process them in one block (in
 * {@link #charactersAction}). Runs of whitespace are collapsed to a
 * single space as the chunk is buffered, mirroring the normal
 * HtmlDocumentFormat behaviour.
 *
 * @param text the chunk of character data
 * @param augs parser augmentations carrying NekoHTML line/column info
 *          for this chunk, if available
 */
@Override
public void characters(XMLString text, Augmentations augs)
        throws XNIException {
  // first chunk of a contiguous run: work out where it starts in the
  // original file (for repositioning info) and reset the buffer
  if(!readCharacterStatus) {
    if(reposInfo != null) {
      HTMLEventInfo evInfo = (augs == null) ? null : (HTMLEventInfo)augs
          .getItem(AUGMENTATIONS);
      if(evInfo == null) {
        // without augmentations we cannot locate this chunk; warn but
        // carry on parsing ("incorrect" typo in the message fixed)
        Err.println("Warning: could not determine proper repositioning "
            + "info for character chunk \""
            + new String(text.ch, text.offset, text.length)
            + "\" near offset " + charactersStartOffset
            + ". Save preserving format may give incorrect results.");
      }
      else {
        // NekoHTML numbers lines and columns from 1, not 0
        int line = evInfo.getBeginLineNumber() - 1;
        int col = evInfo.getBeginColumnNumber() - 1;
        charactersStartOffset = lineOffsets[line] + col;
        if(DEBUG_CHARACTERS) {
          Out.println("characters: line = " + line + " (offset " +
              lineOffsets[line] + "), col = " + col + " : file offset = " +
              charactersStartOffset);
        }
      }
    }
    contentBuffer = new StringBuilder();
  }
  readCharacterStatus = true;
  // only append a space if the buffer does not already end with
  // whitespace — this collapses whitespace runs to a single space
  boolean canAppendWS = (contentBuffer.length() == 0 || !Character
      .isWhitespace(contentBuffer.charAt(contentBuffer.length() - 1)));
  for(int i = text.offset; i < text.offset + text.length; ++i) {
    if(!Character.isWhitespace(text.ch[i])) {
      contentBuffer.append(text.ch[i]);
      canAppendWS = true;
    }
    else {
      if(canAppendWS) {
        contentBuffer.append(' ');
        canAppendWS = false;
      }
    }
  }
}
/**
 * Called when all text between two tags has been processed. The
 * whitespace-collapsed content accumulated by {@link #characters} is
 * trimmed, appended to the plain-text document content, repositioning
 * info is recorded, and the end offsets of all currently-open elements
 * are advanced to the new document length.
 */
public void charactersAction() throws XNIException {
  // check whether there are actually any characters to process
  if(!readCharacterStatus) {
    return;
  }
  readCharacterStatus = false;
  if(DEBUG_CHARACTERS) {
    Out.println("charactersAction: offset = " + charactersStartOffset);
  }
  if(contentBuffer.length() == 0) return;
  // skip content that lies inside an ignorable tag
  if(ignorableTagLevels > 0) {
    if(DEBUG_CHARACTERS) {
      Out.println(" inside ignorable tag, skipping");
    }
    return;
  }
  // whether whitespace was trimmed off the front of this chunk
  boolean thisChunkStartsWithWS =
      Character.isWhitespace(contentBuffer.charAt(0));
  // trim leading whitespace (at most one char — characters() already
  // collapsed whitespace runs to a single space)
  if(thisChunkStartsWithWS) {
    contentBuffer.deleteCharAt(0);
  }
  if(contentBuffer.length() == 0) {
    if(DEBUG_CHARACTERS) {
      Out.println(" whitespace only: ignoring");
    }
    // if this chunk starts with whitespace and is whitespace only, then
    // it ended with whitespace too
    previousChunkEndedWithWS = thisChunkStartsWithWS;
    return;
  }
  // trim trailing whitespace, remembering that it was there
  boolean trailingWhitespace =
      Character.isWhitespace(contentBuffer.charAt(contentBuffer.length() - 1));
  if(trailingWhitespace) {
    contentBuffer.setLength(contentBuffer.length() - 1);
  }
  if(DEBUG_CHARACTERS) {
    Out.println(" content = \"" + contentBuffer + "\"");
  }
  int tmpDocContentSize = tmpDocContent.length();
  boolean incrementStartIndex = false;
  // Correct for whitespace. Since this method never leaves
  // tmpDocContent with a trailing whitespace character, we may need to
  // add a space before appending the current chunk, to prevent the two
  // chunks either side of a tag from running into one. We do this when
  // the original content had whitespace on either side of the tag (the
  // previous chunk ended with space or this one started with space),
  // or unconditionally when the user's "add space on markup unpack"
  // option is set, so text separated only by markup does not run
  // together.
  if(tmpDocContentSize != 0
      && !Character.isWhitespace(tmpDocContent
          .charAt(tmpDocContentSize - 1))
      && (previousChunkEndedWithWS || thisChunkStartsWithWS || addSpaceOnUnpack)) {
    if(DEBUG_CHARACTERS) {
      Out
          .println(String
              .format(
                  " non-whitespace character %1$x (%1$c) found at end of content, adding space",
                  (int)tmpDocContent
                      .charAt(tmpDocContentSize - 1)));
    }
    tmpDocContent.append(' ');
    incrementStartIndex = true;
  }
  // update the document content
  tmpDocContent.append(contentBuffer);
  // record repositioning information mapping this extracted chunk back
  // to its offset in the original file
  if(reposInfo != null) {
    long actualStartOffset = charactersStartOffset;
    if(thisChunkStartsWithWS) {
      actualStartOffset = fixStartOffsetForWhitespace(actualStartOffset);
    }
    int extractedPos = tmpDocContentSize;
    if(incrementStartIndex) extractedPos++;
    addRepositioningInfo(contentBuffer.length(), (int)actualStartOffset,
        extractedPos);
  }
  // advance the end offset of every currently-open element to the new
  // document length. Long.valueOf replaces the deprecated constructor.
  Long end = Long.valueOf(tmpDocContent.length());
  CustomObject obj = null;
  java.util.Iterator anIterator = stack.iterator();
  while(anIterator.hasNext()) {
    // explicit cast: the stack holds CustomObject instances (the raw
    // next() would not compile without it)
    obj = (CustomObject)anIterator.next();
    // an element opened exactly at the synthetic space inserted above
    // really starts at the first real character after it
    if(incrementStartIndex && obj.getStart().equals(obj.getEnd())) {
      obj.setStart(Long.valueOf(obj.getStart().longValue() + 1));
    }
    obj.setEnd(end);
  }
  // remember whether this chunk ended with whitespace for next time
  previousChunkEndedWithWS = trailingWhitespace;
}
/**
 * Called when the parser encounters the end of an element with an
 * explicit closing tag. Delegates to the three-argument form with
 * wasEmptyElement == false, so empty-span elements can be flagged as
 * "isEmptyAndSpan".
 */
@Override
public void endElement(QName element, Augmentations augs) throws XNIException {
  endElement(element, augs, false);
}
/**
 * Called to signal an empty element. This simply synthesizes a
 * startElement followed immediately by an endElement event, passing
 * wasEmptyElement == true so the endElement logic knows not to mark
 * the resulting annotation as "isEmptyAndSpan".
 */
@Override
public void emptyElement(QName element, XMLAttributes attributes,
        Augmentations augs) throws XNIException {
  this.startElement(element, attributes, augs);
  this.endElement(element, augs, true);
}
/**
 * Called when the parser encounters the end of an HTML element.
 *
 * @param element the element that is ending
 * @param augs parser augmentations (not used here)
 * @param wasEmptyElement true when this call was synthesized by
 *          {@link #emptyElement} rather than triggered by a real end
 *          tag in the input
 */
public void endElement(QName element, Augmentations augs,
        boolean wasEmptyElement) throws XNIException {
  // deal with any outstanding character content belonging to this
  // element before closing it
  charactersAction();
  if(DEBUG_ELEMENTS) {
    Out.println("endElement: " + element.localpart + " (was "
        + (wasEmptyElement ? "" : "not ") + "empty)");
  }
  // obj is for internal use
  CustomObject obj = null;
  // leaving an ignorable tag: decrement the nesting level
  if(ignorableTags.contains(element.localpart)) {
    ignorableTagLevels--;
    if(DEBUG_ELEMENTS) {
      Out.println(" end of ignorable tag. levels = " + ignorableTagLevels);
    }
  }
  // pop the matching start-tag record and queue it for annotation
  // creation at endDocument
  if(!stack.isEmpty()) {
    // explicit cast: the stack holds CustomObject instances
    obj = (CustomObject)stack.pop();
    // An element with an explicit end tag whose start equals its end is
    // an "emptyAndSpan" one (see CustomObject's isEmptyAndSpan field).
    // endElement calls synthesized from a genuinely empty HTML element
    // are deliberately excluded.
    if(obj.getStart().equals(obj.getEnd()) && !wasEmptyElement) {
      obj.getFM().put("isEmptyAndSpan", "true");
    }
    // add it to the colector for endDocument to process
    colector.add(obj);
  }
  // if the element had text content, customize the plain-text
  // appearance for this end tag (helper defined elsewhere in this
  // class)
  if(obj != null && obj.getStart().longValue() != obj.getEnd().longValue())
    customizeAppearanceOfDocumentWithEndTag(element.localpart);
}
/**
 * Called when the parser reaches the end of the document. Here we
 * store the new (plain-text) content in the GATE document and turn the
 * collected element records into Original markups annotations.
 */
@Override
public void endDocument(Augmentations augs) throws XNIException {
  if(DEBUG_GENERAL) {
    Out.println("endDocument");
  }
  CustomObject obj = null;
  // replace the old content with the new one
  doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));
  // if no annotation set was supplied, fall back to the document's
  // default Original markups set
  if(basicAS == null)
    basicAS = doc
        .getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
  // sort colector ascending on its id so annotations are created in
  // the order the elements were encountered
  Collections.sort(colector);
  // drain the colector, constructing an annotation from each record
  while(!colector.isEmpty()) {
    // removeFirst() fetches and removes the head in one step (the
    // original getFirst() + remove(obj) pair did the same work twice);
    // the cast is needed because the list holds CustomObject instances
    obj = (CustomObject)colector.removeFirst();
    try {
      basicAS.add(obj.getStart(), obj.getEnd(), obj.getElemName(), obj
          .getFM());
    }
    catch(InvalidOffsetException e) {
      // report and discard annotations with invalid offsets rather
      // than aborting the whole document
      Err.prln("Error creating an annot :" + obj + " Discarded...");
    }
  }
  // notify the listener about the total number of elements processed
  fireStatusChangedEvent("Total elements : " + elements);
}
/**
 * Non-fatal parse error: print the stack trace to the GATE error
 * stream but continue processing.
 */
@Override
public void error(String domain, String key, XMLParseException e) {
  e.printStackTrace(Err.getPrintWriter());
}
/**
 * Fatal parse error: processing cannot continue, so rethrow the
 * exception to the caller.
 */
@Override
public void fatalError(String domain, String key, XMLParseException e)
    throws XNIException {
  throw e;
}
// We don't do anything with processing instructions, comments or CDATA
// markers, but if we encounter them they interrupt the flow of text.
// Thus we must call charactersAction so the repositioning info is
// correctly generated.
/** Flushes pending character content when a processing instruction interrupts text. */
@Override
public void processingInstruction(String target, XMLString data,
    Augmentations augs) throws XNIException {
  charactersAction();
}
/** Flushes pending character content when a comment interrupts text. */
@Override
public void comment(XMLString content,
    Augmentations augs) throws XNIException {
  charactersAction();
}
/** Flushes pending character content at the start of a CDATA section. */
@Override
public void startCDATA(Augmentations augs) throws XNIException {
  charactersAction();
}
/** Flushes pending character content at the end of a CDATA section. */
@Override
public void endCDATA(Augmentations augs) throws XNIException {
  charactersAction();
}
/**
* A comparator that compares two RepositioningInfo.PositionInfo
* records by their originalPosition values. It also supports either
* or both argument being a Long, in which case the Long value is used
* directly. This allows you to binarySearch for an offset rather than
* having to construct a PositionInfo record with the target value.
*/
private static final Comparator