
gate.html.HtmlDocumentHandler Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gate-core Show documentation
Show all versions of gate-core Show documentation
GATE - general architecture for text engineering - is
open source software capable of solving almost any text processing problem.
This artifact enables you to embed the core GATE Embedded with its essential dependencies.
You will be able to use the GATE Embedded API and load and store GATE XML documents. This
artifact is the perfect dependency for CREOLE plugins or for applications that need to customize
the GATE dependencies due to conflicts with their own dependencies or for a lower footprint.
The newest version!
/*
* HtmlDocumentHandler.java
*
* Copyright (c) 1995-2012, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Cristian URSU, 12/June/2000
*
* $Id: HtmlDocumentHandler.java 17638 2014-03-12 09:36:47Z markagreenwood $
*/
package gate.html;
import gate.Factory;
import gate.FeatureMap;
import gate.GateConstants;
import gate.corpora.DocumentContentImpl;
import gate.corpora.RepositioningInfo;
import gate.event.StatusListener;
import gate.util.Err;
import gate.util.InvalidOffsetException;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import javax.swing.text.BadLocationException;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
/** Implements the behaviour of the HTML reader.
* Methods of an object of this class are called by the HTML parser when
* parsing events occur.
* The idea is to parse the HTML document and construct Gate annotation
* objects.
* This class will also replace the content of the Gate document with a
* new one containing only text from the HTML document.
*/
public class HtmlDocumentHandler extends ParserCallback {
/**
 * Creates a handler without an explicit annotation set.
 * Delegates to the three-argument constructor, passing {@code null} as the
 * annotation set; according to the original notes this means the default
 * annotation set taken from the gate document is used.
 *
 * @param aDocument The gate document that will be processed
 * @param aMarkupElementsMap The map containing the elements that will
 * be transformed into annotations
 */
public HtmlDocumentHandler(gate.Document aDocument, Map aMarkupElementsMap) {
  this(aDocument, aMarkupElementsMap, null);
}
/** Constructor initialises all the private memeber data
* @param aDocument The gate document that will be processed
* @param aMarkupElementsMap The map containing the elements that will
* transform into annotations
* @param anAnnotationSet The annotation set that will contain annotations
* resulted from the processing of the gate document
*/
public HtmlDocumentHandler(gate.Document aDocument,
Map aMarkupElementsMap,
gate.AnnotationSet anAnnotationSet) {
// init stack
stack = new Stack();
// this string contains the plain text (the text without markup)
tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());
// colector is used later to transform all custom objects into
// annotation objects
colector = new LinkedList();
// the Gate document
doc = aDocument;
// this map contains the elements name that we want to create
// if it's null all the elements from the XML documents will be transformed
// into Gate annotation objects
markupElementsMap = aMarkupElementsMap;
// init an annotation set for this gate document
basicAS = anAnnotationSet;
customObjectsId = 0;
}//HtmlDocumentHandler
/** Reference to the repositioning information structure; assigned through
 * setRepositioningInfo() and returned by getRepositioningInfo(). Starts out
 * as null. */
private RepositioningInfo reposInfo = null;
/** Reference to the repositioning information structure for ampersand
 * coding; assigned through setAmpCodingInfo() and returned by
 * getAmpCodingInfo(). Starts out as null. */
private RepositioningInfo ampCodingInfo = null;
/**
 * Stores the reference to the repositioning information structure.
 * If the reference is set to null, repositioning information will not be
 * collected.
 *
 * @param info the structure to keep a reference to, or null
 */
public void setRepositioningInfo(RepositioningInfo info) {
  this.reposInfo = info;
} // setRepositioningInfo
/**
 * Gives access to the RepositioningInfo object currently held.
 *
 * @return the reference last stored with setRepositioningInfo(), or null
 * if none was stored
 */
public RepositioningInfo getRepositioningInfo() {
  return this.reposInfo;
} // getRepositioningInfo
/**
 * Stores the reference to the repositioning information structure used for
 * ampersand coding. If the reference is set to null, this information will
 * not be used.
 *
 * @param info the structure to keep a reference to, or null
 */
public void setAmpCodingInfo(RepositioningInfo info) {
  this.ampCodingInfo = info;
} // setAmpCodingInfo
/**
 * Gives access to the RepositioningInfo object currently held for
 * ampersand coding.
 *
 * @return the reference last stored with setAmpCodingInfo(), or null if
 * none was stored
 */
public RepositioningInfo getAmpCodingInfo() {
  return this.ampCodingInfo;
} // getAmpCodingInfo
/** The text inside the STYLE tag is processed with handleText().
 * We should skip inserting this text in the document.
 * The flag is set to true when a STYLE start tag is handled and back to
 * false when the STYLE end tag is handled. */
private boolean isInsideStyleTag = false;
/**
 * Called by the HTML parser when it encounters the beginning of a tag that
 * is paired with an end tag (i.e. it is not an empty one).
 * <p>
 * The tag's attributes are copied into a feature map, the plain-text buffer
 * is adjusted so that the text of separate tags is not glued together, and
 * a CustomObject whose start and end both equal the current length of the
 * plain text is pushed onto the stack.
 *
 * @param t the tag that has been opened
 * @param a the attributes of the tag; each one is stored in the feature map
 * under the string form of its name, with the string form of its value
 * @param pos the position reported by the parser; not used by this method
 */
@Override
public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
  // Fire the status listener each time the number of processed elements
  // reaches a multiple of the rate
  if (0 == (++elements % ELEMENTS_RATE)) {
    fireStatusChangedEvent("Processed elements : " + elements);
  }
  // Start of STYLE tag
  if (HTML.Tag.STYLE.equals(t)) {
    isInsideStyleTag = true;
  }
  // Construct a feature map from the attributes list
  FeatureMap fm = Factory.newFeatureMap();
  // Take all the attributes and put them into the feature map
  if (0 != a.getAttributeCount()) {
    // getAttributeNames() yields an Enumeration<?>; the wildcard had been
    // lost here, leaving a declaration that did not compile
    Enumeration<?> attributeNames = a.getAttributeNames();
    while (attributeNames.hasMoreElements()) {
      Object attribute = attributeNames.nextElement();
      fm.put(attribute.toString(), a.getAttribute(attribute).toString());
    }// while
  }// if
  // Just analyse the tag t and add some \n chars and spaces to the
  // tmpDocContent. The reason behind this is that we need to have a readable
  // form for the final document.
  customizeAppearanceOfDocumentWithStartTag(t);
  // If until here the "tmpDocContent" ends with a NON whitespace char,
  // then we add a space char before calculating the START index of this
  // tag.
  // This is done in order not to concatenate the content of two separate tags
  // and obtain a different NEW word.
  int tmpDocContentSize = tmpDocContent.length();
  if (tmpDocContentSize != 0
      && !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))) {
    tmpDocContent.append(" ");
  }
  // create the start index of the annotation; Long.valueOf replaces the
  // deprecated Long constructor
  Long startIndex = Long.valueOf(tmpDocContent.length());
  // initially the start index is equal to the end index
  CustomObject obj = new CustomObject(t.toString(), fm, startIndex, startIndex);
  // put it into the stack
  stack.push(obj);
}//handleStartTag
/** This method is called when the HTML parser encounts the end of a tag
* that means that the tag is paired by a beginning tag
*/
@Override
public void handleEndTag(HTML.Tag t, int pos){
// obj is for internal use
CustomObject obj = null;
// end of STYLE tag
if(HTML.Tag.STYLE.equals(t)) {
isInsideStyleTag = false;
} // if
// If the stack is not empty then we get the object from the stack
if (!stack.isEmpty()){
obj = stack.pop();
// Before adding it to the colector, we need to check if is an
// emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
if (obj.getStart().equals(obj.getEnd())){
// The element had an end tag and its start was equal to its end. Hence
// it is anEmptyAndSpan one.
obj.getFM().put("isEmptyAndSpan","true");
}// End iff
// we add it to the colector
colector.add(obj);
}// End if
// If element has text between, then customize its apearance
if ( obj != null &&
obj.getStart().longValue() != obj.getEnd().longValue()
)
// Customize the appearance of the document
customizeAppearanceOfDocumentWithEndTag(t);
// if t is the