edu.stanford.nlp.util.XMLUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
package edu.stanford.nlp.util;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.validation.Schema;
import javax.xml.validation.SchemaFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import edu.stanford.nlp.io.IOUtils;
/**
* Provides some utilities for dealing with XML files, both by properly
* parsing them and by using the methods of a desperate Perl hacker.
*
* @author Teg Grenager
* @author Grace Muzny
*/
public class XMLUtils {
/** Private constructor: this class only exposes static helpers and is never instantiated. */
private XMLUtils() {} // only static methods
/**
 * Returns the text content of all nodes in the given file with the given tag.
 * This is the lenient variant of
 * {@link #getTextContentFromTagsFromFileSAXException(File, String)}: when that
 * method throws a {@link SAXException}, the exception is printed to stderr
 * and an empty list is returned instead.
 *
 * @param f The XML file to read
 * @param tag The element name whose text contents are collected
 * @return List of String text contents of tags; empty if a SAXException occurred
 */
public static List<String> getTextContentFromTagsFromFile(File f, String tag) {
  List<String> sents = new ArrayList<>();
  try {
    sents = getTextContentFromTagsFromFileSAXException(f, tag);
  } catch (SAXException e) {
    // Deliberately best-effort: report the problem and return the empty list.
    System.err.println(e);
  }
  return sents;
}
/**
 * Returns the text content of all nodes in the given file with the given tag.
 * If the text contents contains embedded tags, strips the embedded tags out
 * of the returned text. E.g.,
 * {@code <s>This is a <s>sentence</s> with embedded tags</s>}
 * would return the list containing ["This is a sentence with embedded
 * tags", "sentence"].
 * <p>
 * I/O errors and parser-configuration errors are printed to stderr and
 * result in whatever has been collected so far (normally an empty list).
 *
 * @param f The XML file to read
 * @param tag The element name whose text contents are collected
 * @throws SAXException if the parser reports an error while parsing the file.
 *     A tag that simply does not occur yields an empty list, not an exception.
 * @return List of String text contents of tags, in document order.
 */
public static List<String> getTextContentFromTagsFromFileSAXException(
    File f, String tag) throws SAXException {
  List<String> sents = new ArrayList<>();
  try {
    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    DocumentBuilder db = dbf.newDocumentBuilder();
    Document doc = db.parse(f);
    doc.getDocumentElement().normalize();
    NodeList nodeList = doc.getElementsByTagName(tag);
    for (int i = 0; i < nodeList.getLength(); i++) {
      Element element = (Element) nodeList.item(i);
      String raw = element.getTextContent();
      // The text content may still contain literal "<...>" runs (e.g. from
      // CDATA sections); drop everything between '<' and '>' inclusive.
      StringBuilder builtUp = new StringBuilder(raw.length());
      boolean inTag = false;
      for (int j = 0; j < raw.length(); j++) {
        char c = raw.charAt(j);
        if (c == '<') {
          inTag = true;
        }
        if (!inTag) {
          builtUp.append(c);
        }
        if (c == '>') {
          inTag = false;
        }
      }
      sents.add(builtUp.toString());
    }
  } catch (IOException | ParserConfigurationException e) {
    System.err.println(e);
  }
  return sents;
}
/**
 * Builds a non-validating XML parser: validation is switched off on the
 * factory and the two Xerces features that load DTD grammars and external
 * DTDs are disabled. Fatal parse errors are reported through this class's
 * SAXErrorHandler.
 *
 * @return An XML parser in the form of a DocumentBuilder, or null if the
 *     parser could not be created (the failure is printed to stderr)
 */
public static DocumentBuilder getXmlParser() {
  // Disable DTD loading and validation.
  // See http://stackoverflow.com/questions/155101/make-documentbuilder-parse-ignore-dtd-references
  final String[] dtdFeatures = {
      "http://apache.org/xml/features/nonvalidating/load-dtd-grammar",
      "http://apache.org/xml/features/nonvalidating/load-external-dtd",
  };
  DocumentBuilder parser = null;
  try {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    factory.setValidating(false);
    for (String feature : dtdFeatures) {
      factory.setFeature(feature, false);
    }
    parser = factory.newDocumentBuilder();
    parser.setErrorHandler(new SAXErrorHandler());
  } catch (ParserConfigurationException configProblem) {
    System.err.printf("%s: Unable to create XML parser\n", XMLUtils.class.getName());
    configProblem.printStackTrace();
  } catch (UnsupportedOperationException apiProblem) {
    System.err.printf("%s: API error while setting up XML parser. Check your JAXP version\n", XMLUtils.class.getName());
    apiProblem.printStackTrace();
  }
  return parser;
}
/**
 * Builds an XML parser that validates against a W3C XML Schema (an XSD,
 * not a DTD). Fatal parse errors are reported through this class's
 * SAXErrorHandler.
 *
 * @param schemaFile The XSD file the schema is loaded from
 * @return An XML parser in the form of a DocumentBuilder, or null if the
 *     schema or the parser could not be set up (the failure is printed to
 *     stderr)
 */
public static DocumentBuilder getValidatingXmlParser(File schemaFile) {
  DocumentBuilder parser = null;
  try {
    DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
    SchemaFactory schemaFactory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI);
    builderFactory.setSchema(schemaFactory.newSchema(schemaFile));
    parser = builderFactory.newDocumentBuilder();
    parser.setErrorHandler(new SAXErrorHandler());
  } catch (ParserConfigurationException configProblem) {
    System.err.printf("%s: Unable to create XML parser\n", XMLUtils.class.getName());
    configProblem.printStackTrace();
  } catch (SAXException schemaProblem) {
    System.err.printf("%s: XML parsing exception while loading schema %s\n", XMLUtils.class.getName(),schemaFile.getPath());
    schemaProblem.printStackTrace();
  } catch (UnsupportedOperationException apiProblem) {
    System.err.printf("%s: API error while setting up XML parser. Check your JAXP version\n", XMLUtils.class.getName());
    apiProblem.printStackTrace();
  }
  return parser;
}
/**
 * Names of the HTML tags that are treated as line-breaking: when
 * {@code stripTags} is asked to mark line breaks, each tag whose name is in
 * this set is replaced by a newline in the stripped text.
 */
public static final Set<String> breakingTags = Generics.newHashSet(Arrays.asList(
    "blockquote", "br", "div", "h1", "h2", "h3", "h4", "h5", "h6", "hr",
    "li", "ol", "p", "pre", "ul", "tr", "td"));
/**
 * Removes all tags from the XML/HTML read from the given reader and returns
 * the remaining text.
 *
 * @param r the reader to read the XML/HTML from
 * @param mapBack a List of Integers mapping the positions in the result buffer
 *     to positions in the original Reader, will be cleared on receipt. For a
 *     newline inserted in place of a breaking tag, the entry is the negated
 *     position of that tag. May be null, in which case no mapping is recorded.
 * @param markLineBreaks if true, every tag listed in {@code breakingTags} is
 *     replaced by a newline in the result
 * @return the String containing the resulting text. If an IOException occurs
 *     it is reported on stderr and the text read so far is returned.
 */
public static String stripTags(Reader r, List<Integer> mapBack, boolean markLineBreaks) {
  if (mapBack != null) {
    mapBack.clear(); // just in case it has something in it!
  }
  StringBuilder result = new StringBuilder();
  int position = 0;
  try {
    while (true) {
      String text = XMLUtils.readUntilTag(r); // will do nothing if the next thing is a tag
      result.append(text);
      if (mapBack != null) {
        for (int i = 0; i < text.length(); i++) {
          mapBack.add(Integer.valueOf(position + i));
        }
      }
      position += text.length();

      String tag = XMLUtils.readTag(r);
      if (tag == null) {
        break;
      }
      // When the input ends in the middle of a tag, readTag returns a fragment
      // without the closing '>' and parseTag returns null for it. Such a
      // fragment must not be handed to isBreaking, which dereferences the tag.
      if (markLineBreaks && tag.endsWith(">") && XMLUtils.isBreaking(parseTag(tag))) {
        result.append("\n");
        if (mapBack != null) {
          mapBack.add(Integer.valueOf(-position));
        }
      }
      // readUntilTag consumed the '<' without returning it; readTag puts it
      // back at the front of the tag, so tag.length() accounts for it here.
      position += tag.length();
    }
  } catch (IOException e) {
    System.err.println("Error reading string");
    e.printStackTrace();
  }
  return result.toString();
}
/**
 * Tells whether the given tag name is one of the line-breaking tags.
 *
 * @param tag The bare tag name, e.g. "p" or "br"
 * @return true if {@code breakingTags} contains the name
 */
public static boolean isBreaking(String tag) {
  final boolean breaking = breakingTags.contains(tag);
  return breaking;
}
/**
 * Tells whether the given parsed tag is one of the line-breaking tags.
 *
 * @param tag The parsed tag; may be null (e.g. the result of a failed
 *     {@code parseTag}), in which case the answer is false
 * @return true if the tag is non-null and {@code breakingTags} contains its name
 */
public static boolean isBreaking(XMLTag tag) {
  return tag != null && breakingTags.contains(tag.name);
}
/**
 * Consumes characters from the reader up to and including the next
 * {@code '<'} (or up to end of input) and returns everything that came
 * before that {@code '<'}.
 *
 * @param r The reader to read from
 * @return the text read, without the {@code '<'}; empty if the reader is not
 *     ready or the very next character starts a tag
 * @throws IOException if the reader fails
 */
public static String readUntilTag(Reader r) throws IOException {
  StringBuilder text = new StringBuilder();
  if (r.ready()) {
    for (int ch = r.read(); ch >= 0 && ch != '<'; ch = r.read()) {
      text.append((char) ch);
    }
  }
  return text.toString();
}
/**
 * Reads the next tag from the reader via {@code readTag} and turns it into
 * an XMLTag. A tag that cannot be parsed is reported on stderr.
 *
 * @param r The reader to read from
 * @return the new XMLTag object, or null if no tag could be read or the
 *     tag text couldn't be turned into an XMLTag
 * @throws IOException if the reader fails
 */
public static XMLTag readAndParseTag(Reader r) throws IOException {
  final String rawTag = readTag(r);
  if (rawTag == null) {
    return null;
  }
  try {
    return new XMLTag(rawTag);
  } catch (Exception e) {
    System.err.println("Failed to handle |" + rawTag + "|");
    return null;
  }
}
// Pattern is reentrant, going by the statement
// "many matchers can share the same pattern"
// on the Pattern javadoc. Therefore, this should be
// safe as a static final variable.
static final Pattern xmlEscapingPattern = Pattern.compile("\\&.+?;");
public static String unescapeStringForXML(String s) {
StringBuilder result = new StringBuilder();
Matcher m = xmlEscapingPattern.matcher(s);
int end = 0;
while (m.find()) {
int start = m.start();
result.append(s.substring(end, start));
end = m.end();
result.append(translate(s.substring(start, end)));
}
result.append(s.substring(end, s.length()));
return result.toString();
}
private static char translate(String s) {
switch (s) {
case "&":
return '&';
case "<":
case "≪":
return '<';
case ">":
case "≫":
return '>';
case """:
return '\"';
case "'":
return '\'';
case "*":
case "♯":
return '-';
case "=":
return '=';
case " ":
return (char) 0xA0;
case "¡":
return (char) 0xA1;
case "¢":
case "&shilling;":
return (char) 0xA2;
case "£":
return (char) 0xA3;
case "¤":
return (char) 0xA4;
case "¥":
return (char) 0xA5;
case "¦":
return (char) 0xA6;
case "§":
return (char) 0xA7;
case "¨":
return (char) 0xA8;
case "©":
return (char) 0xA9;
case "ª":
return (char) 0xAA;
case "« ":
return (char) 0xAB;
case "¬":
return (char) 0xAC;
case " ":
return (char) 0xAD;
case "®":
return (char) 0xAE;
case "¯":
return (char) 0xAF;
case "°":
return (char) 0xB0;
case "±":
return (char) 0xB1;
case "²":
return (char) 0xB2;
case "³":
return (char) 0xB3;
case "´":
return (char) 0xB4;
case "µ":
return (char) 0xB5;
case "·":
return (char) 0xB7;
case "¸":
return (char) 0xB8;
case "¹":
return (char) 0xB9;
case "º":
return (char) 0xBA;
case "»":
return (char) 0xBB;
case "¼ ":
return (char) 0xBC;
case "½":
return (char) 0xBD;
case "¾ ":
return (char) 0xBE;
case "¿":
return (char) 0xBF;
case "À":
return (char) 0xC0;
case "Á":
return (char) 0xC1;
case "Â":
return (char) 0xC2;
case "Ã":
return (char) 0xC3;
case "Ä":
return (char) 0xC4;
case "Å":
return (char) 0xC5;
case "Æ":
return (char) 0xC6;
case "Ç":
return (char) 0xC7;
case "È":
return (char) 0xC8;
case "É":
return (char) 0xC9;
case "Ê":
return (char) 0xCA;
case "Ë":
return (char) 0xCB;
case "Ì":
return (char) 0xCC;
case "Í":
return (char) 0xCD;
case "Î":
return (char) 0xCE;
case "Ï":
return (char) 0xCF;
case "Ð":
return (char) 0xD0;
case "Ñ":
return (char) 0xD1;
case "Ò":
return (char) 0xD2;
case "Ó":
return (char) 0xD3;
case "Ô":
return (char) 0xD4;
case "Õ":
return (char) 0xD5;
case "Ö":
return (char) 0xD6;
case "×":
return (char) 0xD7;
case "Ø":
return (char) 0xD8;
case "Ù":
return (char) 0xD9;
case "Ú":
return (char) 0xDA;
case "Û":
return (char) 0xDB;
case "Ü":
return (char) 0xDC;
case "Ý":
return (char) 0xDD;
case "Þ":
return (char) 0xDE;
case "ß":
return (char) 0xDF;
case "à":
return (char) 0xE0;
case "á":
return (char) 0xE1;
case "â":
return (char) 0xE2;
case "ã":
return (char) 0xE3;
case "ä":
return (char) 0xE4;
case "å":
return (char) 0xE5;
case "æ":
return (char) 0xE6;
case "ç":
return (char) 0xE7;
case "è":
return (char) 0xE8;
case "é":
return (char) 0xE9;
case "ê":
return (char) 0xEA;
case "ë ":
return (char) 0xEB;
case "ì":
return (char) 0xEC;
case "í":
return (char) 0xED;
case "î":
return (char) 0xEE;
case "ï":
return 0xEF;
case "ð":
return (char) 0xF0;
case "ñ":
return (char) 0xF1;
case "ò":
return (char) 0xF2;
case "ó":
return (char) 0xF3;
case "ô":
return (char) 0xF4;
case "õ":
return (char) 0xF5;
case "ö":
return (char) 0xF6;
case "÷":
return (char) 0xF7;
case "ø":
return (char) 0xF8;
case "ù":
return (char) 0xF9;
case "ú":
return (char) 0xFA;
case "û":
return (char) 0xFB;
case "ü":
return (char) 0xFC;
case "ý":
return (char) 0xFD;
case "þ":
return (char) 0xFE;
case "ÿ":
return (char) 0xFF;
case "Œ":
return (char) 0x152;
case "œ":
return (char) 0x153;
case "Š":
return (char) 0x160;
case "š":
return (char) 0x161;
case "Ÿ":
return (char) 0x178;
case "ˆ":
return (char) 0x2C6;
case "˜":
return (char) 0x2DC;
case "":
return (char) 0x200E;
case "":
return (char) 0x200F;
case "–":
return (char) 0x2013;
case "—":
return (char) 0x2014;
case "‘":
return (char) 0x2018;
case "’":
return (char) 0x2019;
case "‚":
return (char) 0x201A;
case "“":
case "&bquo;":
case "&bq;":
return (char) 0x201C;
case "”":
case "&equo;":
return (char) 0X201D;
case "„":
return (char) 0x201E;
case "∼":
return (char) 0x223C;
case "√":
return (char) 0x221A;
case "≤":
return (char) 0x2264;
case "≥":
return (char) 0x2265;
case "←":
return (char) 0x2190;
case "↓":
return (char) 0x2193;
case "→":
return (char) 0x2192;
case "…":
return (char) 0x2026;
case "′":
return (char) 0x2032;
case "″":
case "&ins;":
return (char) 0x2033;
case "™":
return (char) 0x2122;
case "Α":
case "&Agr;":
return (char) 0x391;
case "Β":
case "&Bgr;":
return (char) 0x392;
case "Γ":
case "&Ggr;":
return (char) 0x393;
case "Δ":
case "&Dgr;":
return (char) 0x394;
case "Ε":
case "&Egr;":
return (char) 0x395;
case "Ζ":
case "&Zgr;":
return (char) 0x396;
case "Η":
return (char) 0x397;
case "Θ":
case "&THgr;":
return (char) 0x398;
case "Ι":
case "&Igr;":
return (char) 0x399;
case "Κ":
case "&Kgr;":
return (char) 0x39A;
case "Λ":
case "&Lgr;":
return (char) 0x39B;
case "Μ":
case "&Mgr;":
return (char) 0x39C;
case "Ν":
case "&Ngr;":
return (char) 0x39D;
case "Ξ":
case "&Xgr;":
return (char) 0x39E;
case "Ο":
case "&Ogr;":
return (char) 0x39F;
case "Π":
case "&Pgr;":
return (char) 0x3A0;
case "Ρ":
case "&Rgr;":
return (char) 0x3A1;
case "Σ":
case "&Sgr;":
return (char) 0x3A3;
case "Τ":
case "&Tgr;":
return (char) 0x3A4;
case "Υ":
case "&Ugr;":
return (char) 0x3A5;
case "Φ":
case "&PHgr;":
return (char) 0x3A6;
case "Χ":
case "&KHgr;":
return (char) 0x3A7;
case "Ψ":
case "&PSgr;":
return (char) 0x3A8;
case "Ω":
case "&OHgr;":
return (char) 0x3A9;
case "α":
case "&agr;":
return (char) 0x3B1;
case "β":
case "&bgr;":
return (char) 0x3B2;
case "γ":
case "&ggr;":
return (char) 0x3B3;
case "δ":
case "&dgr;":
return (char) 0x3B4;
case "ε":
case "&egr;":
return (char) 0x3B5;
case "ζ":
case "&zgr;":
return (char) 0x3B6;
case "η":
case "&eegr;":
return (char) 0x3B7;
case "θ":
case "&thgr;":
return (char) 0x3B8;
case "ι":
case "&igr;":
return (char) 0x3B9;
case "κ":
case "&kgr;":
return (char) 0x3BA;
case "λ":
case "&lgr;":
return (char) 0x3BB;
case "μ":
case "&mgr;":
return (char) 0x3BC;
case "ν":
case "&ngr;":
return (char) 0x3BD;
case "ξ":
case "&xgr;":
return (char) 0x3BE;
case "ο":
case "&ogr;":
return (char) 0x3BF;
case "π":
case "&pgr;":
return (char) 0x3C0;
case "ρ":
case "&rgr;":
return (char) 0x3C1;
case "σ":
case "&sgr;":
return (char) 0x3C3;
case "τ":
case "&tgr;":
return (char) 0x3C4;
case "υ":
case "&ugr;":
return (char) 0x3C5;
case "φ":
case "&phgr;":
return (char) 0x3C6;
case "χ":
case "&khgr;":
return (char) 0x3C7;
case "ψ":
case "&psgr;":
return (char) 0x3C8;
case "ω":
case "&ohgr;":
return (char) 0x3C9;
case "•":
return (char) 0x2022;
case "%":
return '%';
case "+":
return '+';
case "‐":
return '-';
case "ă":
case "ā":
case "≊":
case "ą":
return 'a';
case "Ā":
return 'A';
case "ć":
case "č":
case "ĉ":
return 'c';
case "Č":
return 'C';
case "ď":
return 'd';
case "ě":
case "ē":
case "ę":
return 'e';
case "Ē":
case "Ě":
return 'E';
case "ĺ":
return 'l';
case "Ĺ":
return 'L';
case "ń":
case "ň":
case "ņ":
return 'n';
case "ř":
case "ŕ":
return 'r';
case "Ř":
return 'R';
case "ō":
return 'o';
case "ī":
return 'i';
case "ś":
case "ş":
case "ŝ":
return 's';
case "&Sacute":
case "Ş":
return 'S';
case "ť":
case "ţ":
return 't';
case "ū":
case "ů":
return 'u';
case "ŵ":
return 'w';
case "Ŷ":
return 'Y';
case "ŷ":
return 'y';
case "ž":
case "ź":
return 'z';
case "Ž":
return 'Z';
case "♥":
return (char) 0x2665;
case "∞":
return (char) 0x221E;
case "$":
return '$';
case "⊂":
case "{":
return (char) 0x2282;
case "⊃":
case "}":
return (char) 0x2283;
case "[":
return '[';
case "]":
return ']';
default:
return ' ';
}
}
/** Returns a String in which all the XML special characters have been
* escaped. The resulting String is valid to print in an XML file as an
* attribute or element value in all circumstances. (Note that it may
* escape characters that didn't need to be escaped.)
*
* @param in The String to escape
* @return The escaped String
*/
public static String escapeXML(String in) {
int leng = in.length();
StringBuilder sb = new StringBuilder(leng);
for (int i = 0; i < leng; i++) {
char c = in.charAt(i);
if (c == '&') {
sb.append("&");
} else if (c == '<') {
sb.append("<");
} else if (c == '>') {
sb.append(">");
} else if (c == '"') {
sb.append(""");
} else if (c == '\'') {
sb.append("'");
} else {
sb.append(c);
}
}
return sb.toString();
}
/** Returns a String in which some of the XML special characters have been
 *  escaped: just the ones that need escaping in element content, namely
 *  {@code &}, {@code <} and {@code >}. Quote characters are left alone.
 *
 *  @param in The String to escape (must not be null)
 *  @return The escaped String
 */
public static String escapeElementXML(String in) {
  int leng = in.length();
  StringBuilder sb = new StringBuilder(leng);
  for (int i = 0; i < leng; i++) {
    char c = in.charAt(i);
    switch (c) {
      case '&':
        sb.append("&amp;");
        break;
      case '<':
        sb.append("&lt;");
        break;
      case '>':
        sb.append("&gt;");
        break;
      default:
        sb.append(c);
        break;
    }
  }
  return sb.toString();
}
/** Returns a String in which some XML special characters have been
* escaped. This just escapes attribute value ones, assuming that
* you're going to quote with double quotes.
* That is, only " and & are escaped.
*
* @param in The String to escape
* @return The escaped String
*/
public static String escapeAttributeXML(String in) {
int leng = in.length();
StringBuilder sb = new StringBuilder(leng);
for (int i = 0; i < leng; i++) {
char c = in.charAt(i);
if (c == '&') {
sb.append("&");
} else if (c == '"') {
sb.append(""");
} else {
sb.append(c);
}
}
return sb.toString();
}
/**
 * Escapes the text between the tags of the given String with
 * {@code escapeXML} while copying the tags themselves through unchanged.
 * Processing stops at the first tag that cannot be read or parsed; an
 * IOException is reported on stderr and ends processing as well.
 *
 * @param s The String of text interleaved with XML tags
 * @return The String with the text portions escaped
 */
public static String escapeTextAroundXMLTags(String s) {
  StringBuilder escaped = new StringBuilder();
  Reader source = new StringReader(s);
  try {
    for (;;) {
      String plainText = readUntilTag(source);
      escaped.append(escapeXML(plainText));
      XMLTag nextTag = readAndParseTag(source);
      if (nextTag == null) {
        break;
      }
      escaped.append(nextTag.toString());
    }
  } catch (IOException e) {
    System.err.println("Error reading string");
    e.printStackTrace();
  }
  return escaped.toString();
}
/**
 * Finds the first space or non-breaking space (U+00A0) at or after the
 * given index, whichever comes first.
 *
 * @param haystack The String to search
 * @param begin The index to start searching from
 * @return the index of the first space or nbsp, or -1 if there is neither
 */
public static int findSpace(String haystack, int begin) {
  final int plainSpace = haystack.indexOf(' ', begin);
  final int noBreakSpace = haystack.indexOf('\u00A0', begin);
  if (plainSpace < 0) {
    // Either only an nbsp was found, or nothing at all (then this is -1 too).
    return noBreakSpace;
  }
  if (noBreakSpace < 0) {
    return plainSpace;
  }
  return plainSpace < noBreakSpace ? plainSpace : noBreakSpace;
}
/**
 * A simple parsed representation of one XML/HTML tag: its name, its
 * attributes, and whether it is an end tag and/or a self-closing tag.
 */
public static class XMLTag {

  /** The complete original text of the tag, including the angle brackets. */
  public String text;

  /** The tag name: the text after the opening bracket (and slash), up to the first space or nbsp. */
  public String name;

  /** Attribute names mapped to their values; a name without {@code =} maps to the empty String. */
  public Map<String, String> attributes;

  /** True if the tag starts with {@code </}. */
  public boolean isEndTag;

  /** True if the tag ends with {@code />}. */
  public boolean isSingleTag;

  /**
   * Assumes that String contains an XML tag.
   *
   * @param tag String to turn into an XMLTag object
   * @throws NullPointerException if the String is null or empty
   * @throws IllegalArgumentException if the String does not start with
   *     {@code <} or does not end with {@code >}
   */
  public XMLTag(String tag) {
    if (tag == null || tag.length() == 0) {
      throw new NullPointerException("Attempted to parse empty/null tag");
    }
    if (tag.charAt(0) != '<') {
      throw new IllegalArgumentException("Tag did not start with <");
    }
    if (tag.charAt(tag.length() - 1) != '>') {
      throw new IllegalArgumentException("Tag did not end with >");
    }
    text = tag;
    int begin = 1;
    if (tag.charAt(1) == '/') {
      begin = 2;
      isEndTag = true;
    } else {
      isEndTag = false;
    }
    int end = tag.length() - 1;
    // Only a slash that lies after the start of the name marks a single tag.
    // Without the "end > begin" test the degenerate tag "</>" would count its
    // one slash twice and make the substring call below fail.
    if (end > begin && tag.charAt(end - 1) == '/') {
      end = end - 1;
      isSingleTag = true;
    } else {
      isSingleTag = false;
    }
    // From here on, tag is just the inside: name plus attributes.
    tag = tag.substring(begin, end);
    attributes = Generics.newHashMap();
    begin = 0;
    end = findSpace(tag, 0);

    if (end < 0) {
      name = tag;
    } else {
      name = tag.substring(begin, end);
      do {
        begin = end + 1;
        while (begin < tag.length() && tag.charAt(begin) < 0x21) {
          begin++; // get rid of leading whitespace
        }
        if (begin == tag.length()) {
          break;
        }
        end = tag.indexOf('=', begin);
        if (end < 0) {
          // No more '=': the rest is treated as one valueless attribute.
          String att = tag.substring(begin);
          attributes.put(att, "");
          break;
        }
        String att = tag.substring(begin, end).trim();
        begin = end + 1;
        String value = null;
        if (tag.length() > begin) {
          while (begin < tag.length() && tag.charAt(begin) < 0x21) {
            begin++;
          }
          if (begin < tag.length() && tag.charAt(begin) == '\"') {
            // get quoted expression
            begin++;
            end = tag.indexOf('\"', begin);
            if (end < 0) {
              break; // this is a problem: unterminated quote, give up
            }
            value = tag.substring(begin, end);
            end++;
          } else {
            // get unquoted expression
            end = findSpace(tag, begin);
            if (end < 0) {
              end = tag.length();
            }
            value = tag.substring(begin, end);
          }
        }
        attributes.put(att, value);
      } while (end < tag.length() - 3);
    }
  }

  /** Returns the original text of the tag. */
  @Override
  public String toString() {
    return text;
  }

} // end static class XMLTag
/**
 * Reads the remainder of an XML tag and returns the whole tag as a String.
 * Assumes that the opening {@code '<'} character has already been read, so
 * one is put at the front of the result. Reading stops after the first
 * {@code '>'} or at end of input.
 *
 * @param r The reader to read from
 * @return The String representing the tag, or null if one couldn't be read
 *     (the reader is not ready or is at EOF). The returned item is a
 *     complete tag including angle brackets, such as {@code <TXT>}, unless
 *     the input ended before the closing bracket.
 * @throws IOException if the reader fails
 */
public static String readTag(Reader r) throws IOException {
  if (!r.ready()) {
    return null;
  }
  StringBuilder tagText = new StringBuilder("<");
  for (int ch = r.read(); ch >= 0; ch = r.read()) {
    tagText.append((char) ch);
    if (ch == '>') {
      break;
    }
  }
  // Still only the "<" we started with: nothing was left to read.
  return tagText.length() == 1 ? null : tagText.toString();
}
/**
 * Turns the text of a tag into an XMLTag, checking first that it looks like
 * a tag at all.
 *
 * @param tagString The text of the tag, including the angle brackets
 * @return The parsed tag, or null if the String is null, empty, or does not
 *     both start with {@code <} and end with {@code >}
 */
public static XMLTag parseTag(String tagString) {
  final boolean looksLikeTag = tagString != null
      && !tagString.isEmpty()
      && tagString.charAt(0) == '<'
      && tagString.charAt(tagString.length() - 1) == '>';
  return looksLikeTag ? new XMLTag(tagString) : null;
}
/**
 * Parses the named file into a DOM Document using a parser that is not
 * namespace aware. Fatal parse errors are reported through this class's
 * SAXErrorHandler.
 *
 * @param filename The path of the XML file to read
 * @return The parsed Document
 * @throws Exception if the file cannot be opened or read, the parser cannot
 *     be created, or the content cannot be parsed
 */
public static Document readDocumentFromFile(String filename)
    throws Exception
{
  // NOTE(review): FileReader decodes with the platform default charset, and
  // the parser receives a character stream rather than the raw bytes.
  // try-with-resources makes sure the file is closed even when parsing fails.
  try (Reader reader = new FileReader(filename)) {
    InputSource in = new InputSource(reader);
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    factory.setNamespaceAware(false);
    DocumentBuilder db = factory.newDocumentBuilder();
    db.setErrorHandler(new SAXErrorHandler());
    return db.parse(in);
  }
}
/**
 * Error handler for the parsers created by this class. Warnings and errors
 * are printed to stderr; fatal errors are rethrown with a message that
 * includes the position in the document.
 */
private static class SAXErrorHandler implements ErrorHandler {

  /**
   * Builds a message of the form
   * "msg: problem at document line L, column C in entity from systemID X."
   *
   * @param msg The severity label put at the front, e.g. "Warning"
   * @param ex The parse exception supplying the problem text and position
   * @return The combined message
   */
  public static String makeBetterErrorString(String msg,
                                             SAXParseException ex) {
    StringBuilder sb = new StringBuilder(msg);
    sb.append(": ");
    String str = ex.getMessage();
    // The message may be missing or empty; in both cases there is simply
    // nothing to add. A trailing period is dropped because the sentence
    // continues with the position below.
    if (str != null) {
      if (str.endsWith(".")) {
        str = str.substring(0, str.length() - 1);
      }
      sb.append(str);
    }
    sb.append(" at document line ").append(ex.getLineNumber());
    sb.append(", column ").append(ex.getColumnNumber());
    if (ex.getSystemId() != null) {
      sb.append(" in entity from systemID ").append(ex.getSystemId());
    } else if (ex.getPublicId() != null) {
      sb.append(" in entity from publicID ").append(ex.getPublicId());
    }
    sb.append(".");
    return sb.toString();
  }

  /** Prints the warning to stderr and lets parsing continue. */
  @Override
  public void warning(SAXParseException exception) {
    System.err.println(makeBetterErrorString("Warning", exception));
  }

  /** Prints the error to stderr and lets parsing continue. */
  @Override
  public void error(SAXParseException exception) {
    System.err.println(makeBetterErrorString("Error", exception));
  }

  /**
   * Rethrows the fatal error with the improved message and the original
   * public ID, system ID, line and column.
   */
  @Override
  public void fatalError(SAXParseException ex) throws SAXParseException {
    throw new SAXParseException(makeBetterErrorString("Fatal Error", ex),
        ex.getPublicId(), ex.getSystemId(), ex.getLineNumber(), ex.getColumnNumber());
  }

} // end class SAXErrorHandler
/**
 * Parses the given String as XML into a DOM Document using a parser that is
 * not namespace aware.
 *
 * @param s The XML text to parse
 * @return The parsed Document
 * @throws Exception if the parser cannot be created or the text cannot be parsed
 */
public static Document readDocumentFromString(String s) throws Exception {
  StringReader xmlText = new StringReader(s);
  DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
  builderFactory.setNamespaceAware(false);
  DocumentBuilder builder = builderFactory.newDocumentBuilder();
  return builder.parse(new InputSource(xmlText));
}
/** Tests a few methods.
 *  If the first arg is -readDoc then this method tests
 *  readDocumentFromFile on the file named by the second arg.
 *  Otherwise, it tests readTag/readUntilTag and slurpFile on the file named
 *  by the first arg, printing every tag after the first one.
 *
 *  @param args Either {@code -readDoc filename} or just {@code filename}
 *  @throws Exception if the file cannot be read or parsed
 */
public static void main(String[] args) throws Exception {
  final String usage = "Usage: java edu.stanford.nlp.util.XMLUtils [-readDoc] filename";
  if (args.length == 0) {
    System.err.println(usage);
    return;
  }
  if (args[0].equals("-readDoc")) {
    if (args.length < 2) {
      System.err.println(usage);
      return;
    }
    Document doc = readDocumentFromFile(args[1]);
    System.out.println(doc);
  } else {
    String s = IOUtils.slurpFile(args[0]);
    Reader r = new StringReader(s);
    String tag = readTag(r);
    // readTag signals end of input with null, not with an empty String, so
    // both have to be checked to end the loop cleanly.
    while (tag != null && tag.length() > 0) {
      readUntilTag(r);
      tag = readTag(r);
      if (tag == null || tag.length() == 0) {
        break;
      }
      System.out.println("got tag=" + new XMLTag(tag));
    }
  }
}
}