gate.corpora.XmlDocumentFormat Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gate-core Show documentation
Show all versions of gate-core Show documentation
GATE - general achitecture for text engineering - is open source
software capable of solving almost any text processing problem. This
artifact enables you to embed the core GATE Embedded with its essential
dependencies. You will able to use the GATE Embedded API and load and
store GATE XML documents. This artifact is the perfect dependency for
CREOLE plugins or for applications that need to customize the GATE
dependencies due to confict with their own dependencies or for lower
footprint.
The newest version!
/*
* XmlDocumentFormat.java
*
* Copyright (c) 1995-2012, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Cristian URSU, 26/May/2000
*
* $Id: XmlDocumentFormat.java 19663 2016-10-10 08:44:57Z markagreenwood $
*/
package gate.corpora;
// import com.sun.xml.parser.* ;
import gate.Document;
import gate.GateConstants;
import gate.Resource;
import gate.TextualDocument;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.AutoInstance;
import gate.creole.metadata.CreoleResource;
import gate.event.StatusListener;
import gate.util.DocumentFormatException;
import gate.util.Out;
import gate.xml.XmlDocumentHandler;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
// import org.w3c.www.mime.*;
/**
* The format of Documents. Subclasses of DocumentFormat know about
* particular MIME types and how to unpack the information in any markup
* or formatting they contain into GATE annotations. Each MIME type has
* its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
* RtfDocumentFormat, MpegDocumentFormat. These classes register
* themselves with a static index residing here when they are
* constructed. Static getDocumentFormat methods can then be used to get
* the appropriate format class for a particular document.
*/
@CreoleResource(name = "GATE XML Document Format", isPrivate = true,
autoinstances = {@AutoInstance(hidden = true)})
public class XmlDocumentFormat extends TextualDocumentFormat {
private static final long serialVersionUID = 3205973554326782116L;
/**
* InputFactory for the StAX parser used for GATE format XML.
*/
private static XMLInputFactory staxFactory;
/** Default construction */
public XmlDocumentFormat() {
super();
}
/** We could collect repositioning information during XML parsing */
@Override
public Boolean supportsRepositioning() {
return Boolean.TRUE;
} // supportsRepositioning
/** Old style of unpackMarkup (without collecting of RepositioningInfo) */
@Override
public void unpackMarkup(Document doc) throws DocumentFormatException {
unpackMarkup(doc, (RepositioningInfo)null, (RepositioningInfo)null);
} // unpackMarkup
/**
* Unpack the markup in the document. This converts markup from the
* native format (e.g. XML) into annotations in GATE format. Uses the
* markupElementsMap to determine which elements to convert, and what
* annotation type names to use. If the document was created from a
* String, then is recomandable to set the doc's sourceUrl to null.
* So, if the document has a valid URL, then the parser will try to
* parse the XML document pointed by the URL.If the URL is not valid,
* or is null, then the doc's content will be parsed. If the doc's
* content is not a valid XML then the parser might crash.
*
* @param doc The gate document you want to parse. If
* doc.getSourceUrl()
returns null
* then the content of doc will be parsed. Using a URL is
* recomended because the parser will report errors corectlly
* if the XML document is not well formed.
*/
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo,
RepositioningInfo ampCodingInfo) throws DocumentFormatException {
if((doc == null)
|| (doc.getSourceUrl() == null && doc.getContent() == null)) {
throw new DocumentFormatException(
"GATE document is null or no content found. Nothing to parse!");
}// End if
// Create a status listener
StatusListener statusListener = new StatusListener() {
@Override
public void statusChanged(String text) {
// This is implemented in DocumentFormat.java and inherited here
fireStatusChanged(text);
}
};
// determine whether we have a GATE format XML document or another
// kind
String content = doc.getContent().toString();
if(content.length() > 2048) {
content = content.substring(0, 2048);
}
boolean gateFormat = isGateXmlFormat(content);
if(gateFormat) {
unpackGateFormatMarkup(doc, statusListener);
}
else {
unpackGeneralXmlMarkup(doc, repInfo, ampCodingInfo, statusListener);
}
}
/**
* Unpacks markup in the GATE-specific standoff XML markup format.
*
* @param doc the document to process
* @param statusListener optional status listener to receive status
* messages
* @throws DocumentFormatException if a fatal error occurs during
* parsing
*/
private void unpackGateFormatMarkup(Document doc,
StatusListener statusListener) throws DocumentFormatException {
boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
try {
Reader inputReader = null;
InputStream inputStream = null;
XMLStreamReader xsr = null;
if(docHasContentButNoValidURL) {
inputReader = new StringReader(doc.getContent().toString());
xsr = getInputFactory().createXMLStreamReader(inputReader);
}
else if(doc instanceof TextualDocument) {
String encoding = ((TextualDocument)doc).getEncoding();
// Don't strip BOM on XML.
inputReader = new InputStreamReader(doc.getSourceUrl().openStream(),
encoding);
// create stream reader with the URL as system ID, to support
// relative URLs to e.g. DTD or external entities
xsr = getInputFactory().createXMLStreamReader(
doc.getSourceUrl().toExternalForm(), inputReader);
}
else {
// not a TextualDocument, so let parser determine encoding
inputStream = doc.getSourceUrl().openStream();
xsr = getInputFactory().createXMLStreamReader(
doc.getSourceUrl().toExternalForm(), inputStream);
}
// find the opening GateDocument tag
xsr.nextTag();
// parse the document
try {
DocumentStaxUtils.readGateXmlDocument(xsr, doc, statusListener);
}
finally {
xsr.close();
if(inputStream != null) {
inputStream.close();
}
if(inputReader != null) {
inputReader.close();
}
}
}
catch(XMLStreamException e) {
doc.getFeatures().put("parsingError", Boolean.TRUE);
Boolean bThrow = (Boolean)doc.getFeatures().get(
GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
if(bThrow != null && bThrow.booleanValue()) {
// the next line is commented to avoid Document creation fail on
// error
throw new DocumentFormatException(e);
}
else {
Out.println("Warning: Document remains unparsed. \n"
+ "\n Stack Dump: ");
e.printStackTrace(Out.getPrintWriter());
} // if
}
catch(IOException ioe) {
throw new DocumentFormatException("I/O exception for "
+ doc.getSourceUrl().toString(), ioe);
}
}
/**
* Returns the StAX input factory, creating one if it is currently
* null.
*
* @return staxFactory
* @throws XMLStreamException
*/
private static XMLInputFactory getInputFactory() throws XMLStreamException {
if(staxFactory == null) {
staxFactory = XMLInputFactory.newInstance();
staxFactory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
staxFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
staxFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES,
Boolean.TRUE);
staxFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES,
Boolean.TRUE);
}
return staxFactory;
}
/**
* Unpack markup from any XML format. The XML elements are translated
* to annotations on the Original markups annotation set.
*
* @param doc the document to process
* @throws DocumentFormatException
*/
private void unpackGeneralXmlMarkup(Document doc, RepositioningInfo repInfo,
RepositioningInfo ampCodingInfo, StatusListener statusListener)
throws DocumentFormatException {
boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
XmlDocumentHandler xmlDocHandler = null;
try {
// use Xerces XML parser with JAXP
// System.setProperty("javax.xml.parsers.SAXParserFactory",
// "org.apache.xerces.jaxp.SAXParserFactoryImpl");
// Get a parser factory.
SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
// Set up the factory to create the appropriate type of parser
// non validating one
saxParserFactory.setValidating(false);
// non namesapace aware one
saxParserFactory.setNamespaceAware(true);
// create it
SAXParser xmlParser = saxParserFactory.newSAXParser();
// Create a new Xml document handler
xmlDocHandler = new XmlDocumentHandler(doc, this.markupElementsMap,
this.element2StringMap);
// Register a status listener with it
xmlDocHandler.addStatusListener(statusListener);
// set repositioning object
xmlDocHandler.setRepositioningInfo(repInfo);
// set the object with ampersand coding positions
xmlDocHandler.setAmpCodingInfo(ampCodingInfo);
org.xml.sax.XMLReader newxmlParser = xmlParser.getXMLReader();
// Set up the factory to create the appropriate type of parser
// non validating one
// http://xml.org/sax/features/validation set to false
newxmlParser.setFeature("http://xml.org/sax/features/validation", false);
// namesapace aware one
// http://xml.org/sax/features/namespaces set to true
newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true);
newxmlParser.setFeature("http://xml.org/sax/features/namespace-prefixes",
true);
newxmlParser.setContentHandler(xmlDocHandler);
newxmlParser.setErrorHandler(xmlDocHandler);
newxmlParser.setDTDHandler(xmlDocHandler);
newxmlParser.setEntityResolver(xmlDocHandler);
// Parse the XML Document with the appropriate encoding
Reader docReader = null;
try{
InputSource is;
if(docHasContentButNoValidURL) {
// no URL, so parse from string
is = new InputSource(new StringReader(doc.getContent().toString()));
}
else if(doc instanceof TextualDocument) {
// textual document - load with user specified encoding
String docEncoding = ((TextualDocument)doc).getEncoding();
// don't strip BOM on XML.
docReader = new InputStreamReader(doc.getSourceUrl()
.openStream(), docEncoding);
is = new InputSource(docReader);
// must set system ID to allow relative URLs (e.g. to a DTD) to
// work
is.setSystemId(doc.getSourceUrl().toString());
}
else {
// let the parser decide the encoding
is = new InputSource(doc.getSourceUrl().toString());
}
newxmlParser.parse(is);
}finally{
//make sure the open streams are closed
if(docReader != null) docReader.close();
}
// Angel - end
((DocumentImpl)doc).setNextAnnotationId(xmlDocHandler
.getCustomObjectsId());
}
catch(ParserConfigurationException e) {
throw new DocumentFormatException("XML parser configuration exception ",
e);
}
catch(SAXException e) {
doc.getFeatures().put("parsingError", Boolean.TRUE);
Boolean bThrow = (Boolean)doc.getFeatures().get(
GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
if(bThrow != null && bThrow.booleanValue()) {
// the next line is commented to avoid Document creation fail on
// error
throw new DocumentFormatException(e);
}
else {
Out.println("Warning: Document remains unparsed. \n"
+ "\n Stack Dump: ");
e.printStackTrace(Out.getPrintWriter());
} // if
}
catch(IOException e) {
throw new DocumentFormatException("I/O exception for "
+ doc.getSourceUrl(), e);
}
finally {
if(xmlDocHandler != null)
xmlDocHandler.removeStatusListener(statusListener);
}// End if else try
}// unpackMarkup
/**
* Determine whether the given document content string represents a
* GATE custom format XML document.
*/
protected static boolean isGateXmlFormat(String content) {
return (content.indexOf("