// Source listing of edu.stanford.nlp.util.XMLUtils (Maven / Gradle / Ivy artifact page).
//
// Go to download
// Show more of this group. Show more artifacts with this name.
// Show all versions of stanford-parser. Show documentation.
// Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
// The newest version!
package edu.stanford.nlp.util;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.validation.Schema;
import javax.xml.validation.SchemaFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.util.logging.Redwood;


/**
 * Provides some utilities for dealing with XML files, both by properly
 * parsing them and by using the methods of a desperate Perl hacker.
 *
 * @author Teg Grenager
 * @author Grace Muzny
 */
public class XMLUtils  {

  /** Redwood logging channels for this class; used to report warnings instead of throwing. */
  private static final Redwood.RedwoodChannels log = Redwood.channels(XMLUtils.class);

  /** Private constructor: this utility class is not meant to be instantiated. */
  private XMLUtils() {} // only static methods

  /**
   * Returns the text content of all nodes in the given file with the given tag.
   * A {@link SAXException} raised while reading the file is logged as a warning
   * rather than propagated; in that case an empty list is returned.
   *
   * @param f The XML file to read
   * @param tag The element name to look for
   * @return List of String text contents of tags (empty if the file could not
   *         be parsed).
   */
  public static List<String> getTextContentFromTagsFromFile(File f, String tag) {
    try {
      return getTextContentFromTagsFromFileSAXException(f, tag);
    } catch (SAXException e) {
      log.warn(e);
      return new ArrayList<>();
    }
  }

  /**
   * Returns the text content of all nodes in the given file with the given tag.
   * Anything in the text content that still looks like a tag (a span running
   * from {@code <} to the next {@code >}) is stripped out of the returned text.
   * E.g., for tag {@code s},
   * {@code <s>This is a <s>sentence</s> with embedded tags</s>}
   * would return the list containing ["This is a sentence with embedded
   * tags", "sentence"].
   * <p>
   * I/O and parser-configuration problems are logged as warnings and yield the
   * elements collected so far (normally an empty list).
   *
   * @param f The XML file to read
   * @param tag The element name to look for
   * @throws SAXException if the file cannot be parsed as XML.
   * @return List of String text contents of tags.
   */
  private static List<String> getTextContentFromTagsFromFileSAXException(
          File f, String tag) throws SAXException {
    List<String> sents = new ArrayList<>();
    try {
      DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
      DocumentBuilder db = dbf.newDocumentBuilder();
      Document doc = db.parse(f);
      doc.getDocumentElement().normalize();

      NodeList nodeList = doc.getElementsByTagName(tag);
      for (int i = 0; i < nodeList.getLength(); i++) {
        String raw = nodeList.item(i).getTextContent();
        StringBuilder builtUp = new StringBuilder(raw.length());
        boolean inTag = false;
        for (int j = 0; j < raw.length(); j++) {
          char c = raw.charAt(j);
          if (c == '<') {
            inTag = true;
          }
          if (!inTag) {
            builtUp.append(c);
          }
          // checked after the append so that the closing '>' itself is dropped
          if (c == '>') {
            inTag = false;
          }
        }
        sents.add(builtUp.toString());
      }
    } catch (IOException | ParserConfigurationException e) {
      log.warn(e);
    }
    return sents;
  }


  /**
   * Returns the DOM elements in the given file that have the given tag.
   * A {@link SAXException} raised while reading the file is logged as a warning
   * rather than propagated; in that case an empty list is returned.
   *
   * @param f The XML file to read
   * @param tag The element name to look for
   * @return List of matching elements (empty if the file could not be parsed).
   */
  public static List<Element> getTagElementsFromFile(File f, String tag) {
    try {
      return getTagElementsFromFileSAXException(f, tag);
    } catch (SAXException e) {
      log.warn(e);
      return new ArrayList<>();
    }
  }

  /**
   * Returns the DOM elements in the given file that have the given tag, in the
   * order produced by {@code Document.getElementsByTagName}.
   * <p>
   * I/O and parser-configuration problems are logged as warnings and yield the
   * elements collected so far (normally an empty list).
   *
   * @param f The XML file to read
   * @param tag The element name to look for
   * @throws SAXException if the file cannot be parsed as XML.
   * @return List of matching elements.
   */
  private static List<Element> getTagElementsFromFileSAXException(
          File f, String tag) throws SAXException {
    List<Element> sents = new ArrayList<>();
    try {
      DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
      DocumentBuilder db = dbf.newDocumentBuilder();
      Document doc = db.parse(f);
      doc.getDocumentElement().normalize();

      NodeList nodeList = doc.getElementsByTagName(tag);
      for (int i = 0; i < nodeList.getLength(); i++) {
        sents.add((Element) nodeList.item(i));
      }
    } catch (IOException | ParserConfigurationException e) {
      log.warn(e);
    }
    return sents;
  }

  /**
   * Returns the elements in the given file with the given tag associated with
   * the text content of the two previous siblings and two next siblings.
   *
   * @return List of {@code Triple} Targeted elements surrounded
   * by the text content of the two previous siblings and two next siblings.
   */
  public static List> getTagElementTriplesFromFile(File f, String tag) {
    List> sents = Generics.newArrayList();
    try {
      sents = getTagElementTriplesFromFileSAXException(f, tag);
    } catch (SAXException e) {
      log.warn(e);
    }
    return sents;
  }

  /**
   * Returns the elements in the given file with the given tag associated with
   * the text content of the previous and next siblings up to max numIncludedSiblings.
   *
   * @return List of {@code Triple} Targeted elements surrounded
   * by the text content of the two previous siblings and two next siblings.
   */
  public static List> getTagElementTriplesFromFileNumBounded(File f,
                                                                                             String tag,
                                                                                             int num) {
    List> sents = Generics.newArrayList();
    try {
      sents = getTagElementTriplesFromFileNumBoundedSAXException(f, tag, num);
    } catch (SAXException e) {
      log.warn(e);
    }
    return sents;
  }

  /**
   * Returns the elements in the given file with the given tag associated with
   * the text content of the two previous siblings and two next siblings.
   *
   * @throws SAXException if tag doesn't exist in the file.
   * @return List of {@code Triple} Targeted elements surrounded
   * by the text content of the two previous siblings and two next siblings.
   */
  public static List> getTagElementTriplesFromFileSAXException(
      File f, String tag) throws SAXException {
    return  getTagElementTriplesFromFileNumBoundedSAXException(f, tag, 2);
  }

  /**
   * Returns the elements in the given file with the given tag associated with
   * the text content of the previous and next siblings up to max numIncludedSiblings.
   *
   * @throws SAXException if tag doesn't exist in the file.
   * @return List of {@code Triple} Targeted elements surrounded
   * by the text content of the two previous siblings and two next siblings.
   */
  public static List> getTagElementTriplesFromFileNumBoundedSAXException(
      File f, String tag, int numIncludedSiblings) throws SAXException {
    List> sents = Generics.newArrayList();
    try {
      DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
      DocumentBuilder db = dbf.newDocumentBuilder();
      Document doc = db.parse(f);
      doc.getDocumentElement().normalize();

      NodeList nodeList=doc.getElementsByTagName(tag);
      for (int i = 0; i < nodeList.getLength(); i++) {
        // Get element
        Node prevNode = nodeList.item(i).getPreviousSibling();
        String prev = "";
        int count = 0;
        while (prevNode != null && count <= numIncludedSiblings) {
          prev = prevNode.getTextContent() + prev;
          prevNode = prevNode.getPreviousSibling();
          count++;
        }

        Node nextNode = nodeList.item(i).getNextSibling();
        String next = "";
        count = 0;
        while (nextNode != null && count <= numIncludedSiblings) {
          next = next + nextNode.getTextContent();
          nextNode = nextNode.getNextSibling();
          count++;
        }
        Element element = (Element)nodeList.item(i);
        Triple t = new Triple<>(prev, element, next);
        sents.add(t);
      }
    } catch (IOException | ParserConfigurationException e) {
      log.warn(e);
    }
    return sents;
  }


  /**
   * Builds a non-validating XML parser that loads neither DTD grammars nor
   * external DTDs. Configuration failures are logged as warnings.
   *
   * @return An XML parser in the form of a DocumentBuilder, or null if the
   *         parser could not be created
   */
  public static DocumentBuilder getXmlParser() {
    DocumentBuilder parser = null;
    try {
      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
      factory.setValidating(false);

      // Switch off DTD loading and validation, see
      // http://stackoverflow.com/questions/155101/make-documentbuilder-parse-ignore-dtd-references
      String[] dtdFeatures = {
          "http://apache.org/xml/features/nonvalidating/load-dtd-grammar",
          "http://apache.org/xml/features/nonvalidating/load-external-dtd",
      };
      for (String feature : dtdFeatures) {
        factory.setFeature(feature, false);
      }

      parser = factory.newDocumentBuilder();
      parser.setErrorHandler(new SAXErrorHandler());
    } catch (ParserConfigurationException e) {
      log.warnf("%s: Unable to create XML parser\n", XMLUtils.class.getName());
      log.warn(e);
    } catch (UnsupportedOperationException e) {
      log.warnf("%s: API error while setting up XML parser. Check your JAXP version\n", XMLUtils.class.getName());
      log.warn(e);
    }
    return parser;
  }

  /**
   * Builds a validating XML parser from an XSD schema file (not a DTD!).
   * Failures to load the schema or configure the parser are logged as
   * warnings.
   *
   * @param schemaFile File with the XML schema
   * @return An XML parser in the form of a DocumentBuilder, or null if the
   *         parser could not be created
   */
  public static DocumentBuilder getValidatingXmlParser(File schemaFile) {
    DocumentBuilder parser = null;
    try {
      DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();

      SchemaFactory schemaFactory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI);
      Schema xsd = schemaFactory.newSchema(schemaFile);
      builderFactory.setSchema(xsd);

      parser = builderFactory.newDocumentBuilder();
      parser.setErrorHandler(new SAXErrorHandler());
    } catch (ParserConfigurationException e) {
      log.warnf("%s: Unable to create XML parser\n", XMLUtils.class.getName());
      log.warn(e);
    } catch (SAXException e) {
      log.warnf("%s: XML parsing exception while loading schema %s\n", XMLUtils.class.getName(),schemaFile.getPath());
      log.warn(e);
    } catch (UnsupportedOperationException e) {
      log.warnf("%s: API error while setting up XML parser. Check your JAXP version\n", XMLUtils.class.getName());
      log.warn(e);
    }
    return parser;
  }

  /**
   * Block-level HTML tags that are rendered with surrounding line breaks.
   * Names are stored in lower case and looked up with an exact match.
   */
  private static final Set<String> breakingTags = Generics.newHashSet(Arrays.asList(
      "blockquote", "br", "div", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "li", "ol", "p", "pre", "ul", "tr", "td"));

  /**
   * Reads XML/HTML from a Reader and returns its text with all tags removed.
   * An I/O error is logged as a warning and the text gathered so far is
   * returned.
   *
   * @param r       the reader to read the XML/HTML from
   * @param mapBack a List of Integers mapping the positions in the result buffer
   *                to positions in the original Reader, will be cleared on receipt.
   *                May be null, in which case no mapping is recorded. An
   *                inserted line break is recorded as the negated position of
   *                the tag that caused it.
   * @param markLineBreaks whether to emit a newline in place of each tag for
   *                which {@link #isBreaking(XMLTag)} is true
   * @return the String containing the resulting text
   */
  public static String stripTags(Reader r, List<Integer> mapBack, boolean markLineBreaks) {
    if (mapBack != null) {
      mapBack.clear(); // just in case it has something in it!
    }
    StringBuilder result = new StringBuilder();
    try {
      int position = 0;
      while (true) {
        String text = XMLUtils.readUntilTag(r);
        result.append(text);
        if (mapBack != null) {
          // add offsets to the map back
          for (int i = 0; i < text.length(); i++) {
            mapBack.add(Integer.valueOf(position + i));
          }
        }
        position += text.length();

        String tag = XMLUtils.readTag(r);
        if (tag == null) {
          break;
        }
        if (markLineBreaks) {
          // parseTag returns null for a tag cut off by EOF (no closing '>');
          // such a tag cannot be a breaking tag.
          XMLTag parsed = parseTag(tag);
          if (parsed != null && XMLUtils.isBreaking(parsed)) {
            result.append("\n");
            if (mapBack != null) {
              mapBack.add(Integer.valueOf(-position));
            }
          }
        }
        // tag includes its leading '<', which readUntilTag consumed
        position += tag.length();
      }
    } catch (IOException e) {
      log.warn("Error reading string");
      log.warn(e);
    }
    return result.toString();
  }

  /**
   * Tells whether the given tag name is one of the block-level tags in
   * {@code breakingTags}. The lookup is an exact string match.
   *
   * @param tag The tag name, without angle brackets
   * @return true if the name is contained in {@code breakingTags}
   */
  public static boolean isBreaking(String tag) {
    return breakingTags.contains(tag);
  }

  /**
   * Tells whether the given parsed tag is one of the block-level tags in
   * {@code breakingTags}, judged by its element name.
   *
   * @param tag The parsed tag; may be null (e.g. the result of a failed
   *            {@link #parseTag(String)}), which is treated as non-breaking
   * @return true if the tag is non-null and its name is in {@code breakingTags}
   */
  public static boolean isBreaking(XMLTag tag) {
    return tag != null && breakingTags.contains(tag.name);
  }

  /**
   * Consumes characters from the reader up to and including the next
   * {@code '<'} (or end of input) and returns everything before that
   * {@code '<'}. If the reader reports that it is not ready, nothing is read.
   *
   * @param r The reader to read from
   * @return the String of the text read, which may be empty.
   * @throws IOException if the reader fails
   */
  public static String readUntilTag(Reader r) throws IOException {
    if (!r.ready()) {
      return "";
    }
    StringBuilder text = new StringBuilder();
    for (int ch = r.read(); ch >= 0 && ch != '<'; ch = r.read()) {
      text.append((char) ch);
    }
    return text.toString();
  }

  /**
   * Reads one tag with {@link #readTag(Reader)} and wraps it in an XMLTag.
   * A failure while constructing the XMLTag is logged as a warning.
   *
   * @param r The reader to read from
   * @return the new XMLTag object, or null if couldn't be created
   * @throws IOException if the reader fails
   */
  public static XMLTag readAndParseTag(Reader r) throws IOException {
    String raw = readTag(r);
    if (raw == null) {
      return null;
    }
    try {
      return new XMLTag(raw);
    } catch (Exception e) {
      log.warn("Failed to handle |" + raw + "|");
      return null;
    }
  }

  // Matches an entity-like span: an '&', then the shortest run of one or more
  // characters (not line terminators), then ';'.
  // Pattern is reentrant, going by the statement "many matchers can share the same pattern"
  // on the Pattern javadoc.  Therefore, this should be safe as a static final variable.
  private static final Pattern xmlEscapingPattern = Pattern.compile("&.+?;");

  /**
   * Replaces every span matched by {@code xmlEscapingPattern} (an
   * {@code &...;} sequence) in the input with the single character that
   * {@code translate} returns for it; all other text is copied unchanged.
   *
   * @param s The String to unescape
   * @return The unescaped String
   */
  public static String unescapeStringForXML(String s) {
    StringBuilder out = new StringBuilder();
    Matcher entities = xmlEscapingPattern.matcher(s);
    int copiedUpTo = 0;
    while (entities.find()) {
      out.append(s, copiedUpTo, entities.start());
      out.append(translate(entities.group()));
      copiedUpTo = entities.end();
    }
    out.append(s, copiedUpTo, s.length());
    return out.toString();
  }

  private static char translate(String s) {
    switch (s) {
      case "&":
        return '&';
      case "<":
      case "≪":
        return '<';
      case ">":
      case "≫":
        return '>';
      case """:
        return '\"';
      case "'":
        return '\'';
      case "*":
      case "♯":
        return '-';
      case "=":
        return '=';
      case " ":
        return (char) 0xA0;
      case "¡":
        return (char) 0xA1;
      case "¢":
      case "&shilling;":
        return (char) 0xA2;
      case "£":
        return (char) 0xA3;
      case "¤":
        return (char) 0xA4;
      case "¥":
        return (char) 0xA5;
      case "¦":
        return (char) 0xA6;
      case "§":
        return (char) 0xA7;
      case "¨":
        return (char) 0xA8;
      case "©":
        return (char) 0xA9;
      case "ª":
        return (char) 0xAA;
      case "« ":
        return (char) 0xAB;
      case "¬":
        return (char) 0xAC;
      case " ":
        return (char) 0xAD;
      case "®":
        return (char) 0xAE;
      case "¯":
        return (char) 0xAF;
      case "°":
        return (char) 0xB0;
      case "±":
        return (char) 0xB1;
      case "²":
        return (char) 0xB2;
      case "³":
        return (char) 0xB3;
      case "´":
        return (char) 0xB4;
      case "µ":
        return (char) 0xB5;
      case "·":
        return (char) 0xB7;
      case "¸":
        return (char) 0xB8;
      case "¹":
        return (char) 0xB9;
      case "º":
        return (char) 0xBA;
      case "»":
        return (char) 0xBB;
      case "¼ ":
        return (char) 0xBC;
      case "½":
        return (char) 0xBD;
      case "¾ ":
        return (char) 0xBE;
      case "¿":
        return (char) 0xBF;
      case "À":
        return (char) 0xC0;
      case "Á":
        return (char) 0xC1;
      case "Â":
        return (char) 0xC2;
      case "Ã":
        return (char) 0xC3;
      case "Ä":
        return (char) 0xC4;
      case "Å":
        return (char) 0xC5;
      case "Æ":
        return (char) 0xC6;
      case "Ç":
        return (char) 0xC7;
      case "È":
        return (char) 0xC8;
      case "É":
        return (char) 0xC9;
      case "Ê":
        return (char) 0xCA;
      case "Ë":
        return (char) 0xCB;
      case "Ì":
        return (char) 0xCC;
      case "Í":
        return (char) 0xCD;
      case "Î":
        return (char) 0xCE;
      case "Ï":
        return (char) 0xCF;
      case "Ð":
        return (char) 0xD0;
      case "Ñ":
        return (char) 0xD1;
      case "Ò":
        return (char) 0xD2;
      case "Ó":
        return (char) 0xD3;
      case "Ô":
        return (char) 0xD4;
      case "Õ":
        return (char) 0xD5;
      case "Ö":
        return (char) 0xD6;
      case "×":
        return (char) 0xD7;
      case "Ø":
        return (char) 0xD8;
      case "Ù":
        return (char) 0xD9;
      case "Ú":
        return (char) 0xDA;
      case "Û":
        return (char) 0xDB;
      case "Ü":
        return (char) 0xDC;
      case "Ý":
        return (char) 0xDD;
      case "Þ":
        return (char) 0xDE;
      case "ß":
        return (char) 0xDF;
      case "à":
        return (char) 0xE0;
      case "á":
        return (char) 0xE1;
      case "â":
        return (char) 0xE2;
      case "ã":
        return (char) 0xE3;
      case "ä":
        return (char) 0xE4;
      case "å":
        return (char) 0xE5;
      case "æ":
        return (char) 0xE6;
      case "ç":
        return (char) 0xE7;
      case "è":
        return (char) 0xE8;
      case "é":
        return (char) 0xE9;
      case "ê":
        return (char) 0xEA;
      case "ë ":
        return (char) 0xEB;
      case "ì":
        return (char) 0xEC;
      case "í":
        return (char) 0xED;
      case "î":
        return (char) 0xEE;
      case "ï":
        return 0xEF;
      case "ð":
        return (char) 0xF0;
      case "ñ":
        return (char) 0xF1;
      case "ò":
        return (char) 0xF2;
      case "ó":
        return (char) 0xF3;
      case "ô":
        return (char) 0xF4;
      case "õ":
        return (char) 0xF5;
      case "ö":
        return (char) 0xF6;
      case "÷":
        return (char) 0xF7;
      case "ø":
        return (char) 0xF8;
      case "ù":
        return (char) 0xF9;
      case "ú":
        return (char) 0xFA;
      case "û":
        return (char) 0xFB;
      case "ü":
        return (char) 0xFC;
      case "ý":
        return (char) 0xFD;
      case "þ":
        return (char) 0xFE;
      case "ÿ":
        return (char) 0xFF;
      case "Œ":
        return (char) 0x152;
      case "œ":
        return (char) 0x153;
      case "Š":
        return (char) 0x160;
      case "š":
        return (char) 0x161;
      case "Ÿ":
        return (char) 0x178;
      case "ˆ":
        return (char) 0x2C6;
      case "˜":
        return (char) 0x2DC;
      case "‎":
        return (char) 0x200E;
      case "‏":
        return (char) 0x200F;
      case "–":
        return (char) 0x2013;
      case "—":
        return (char) 0x2014;
      case "‘":
        return (char) 0x2018;
      case "’":
        return (char) 0x2019;
      case "‚":
        return (char) 0x201A;
      case "“":
      case "&bquo;":
      case "&bq;":
        return (char) 0x201C;
      case "”":
      case "&equo;":
        return (char) 0X201D;
      case "„":
        return (char) 0x201E;
      case "∼":
        return (char) 0x223C;
      case "√":
        return (char) 0x221A;
      case "≤":
        return (char) 0x2264;
      case "≥":
        return (char) 0x2265;
      case "←":
        return (char) 0x2190;
      case "↓":
        return (char) 0x2193;
      case "→":
        return (char) 0x2192;
      case "…":
        return (char) 0x2026;
      case "′":
        return (char) 0x2032;
      case "″":
      case "&ins;":
        return (char) 0x2033;
      case "™":
        return (char) 0x2122;
      case "Α":
      case "&Agr;":
        return (char) 0x391;
      case "Β":
      case "&Bgr;":
        return (char) 0x392;
      case "Γ":
      case "&Ggr;":
        return (char) 0x393;
      case "Δ":
      case "&Dgr;":
        return (char) 0x394;
      case "Ε":
      case "&Egr;":
        return (char) 0x395;
      case "Ζ":
      case "&Zgr;":
        return (char) 0x396;
      case "Η":
        return (char) 0x397;
      case "Θ":
      case "&THgr;":
        return (char) 0x398;
      case "Ι":
      case "&Igr;":
        return (char) 0x399;
      case "Κ":
      case "&Kgr;":
        return (char) 0x39A;
      case "Λ":
      case "&Lgr;":
        return (char) 0x39B;
      case "Μ":
      case "&Mgr;":
        return (char) 0x39C;
      case "Ν":
      case "&Ngr;":
        return (char) 0x39D;
      case "Ξ":
      case "&Xgr;":
        return (char) 0x39E;
      case "Ο":
      case "&Ogr;":
        return (char) 0x39F;
      case "Π":
      case "&Pgr;":
        return (char) 0x3A0;
      case "Ρ":
      case "&Rgr;":
        return (char) 0x3A1;
      case "Σ":
      case "&Sgr;":
        return (char) 0x3A3;
      case "Τ":
      case "&Tgr;":
        return (char) 0x3A4;
      case "Υ":
      case "&Ugr;":
        return (char) 0x3A5;
      case "Φ":
      case "&PHgr;":
        return (char) 0x3A6;
      case "Χ":
      case "&KHgr;":
        return (char) 0x3A7;
      case "Ψ":
      case "&PSgr;":
        return (char) 0x3A8;
      case "Ω":
      case "&OHgr;":
        return (char) 0x3A9;
      case "α":
      case "&agr;":
        return (char) 0x3B1;
      case "β":
      case "&bgr;":
        return (char) 0x3B2;
      case "γ":
      case "&ggr;":
        return (char) 0x3B3;
      case "δ":
      case "&dgr;":
        return (char) 0x3B4;
      case "ε":
      case "&egr;":
        return (char) 0x3B5;
      case "ζ":
      case "&zgr;":
        return (char) 0x3B6;
      case "η":
      case "&eegr;":
        return (char) 0x3B7;
      case "θ":
      case "&thgr;":
        return (char) 0x3B8;
      case "ι":
      case "&igr;":
        return (char) 0x3B9;
      case "κ":
      case "&kgr;":
        return (char) 0x3BA;
      case "λ":
      case "&lgr;":
        return (char) 0x3BB;
      case "μ":
      case "&mgr;":
        return (char) 0x3BC;
      case "ν":
      case "&ngr;":
        return (char) 0x3BD;
      case "ξ":
      case "&xgr;":
        return (char) 0x3BE;
      case "ο":
      case "&ogr;":
        return (char) 0x3BF;
      case "π":
      case "&pgr;":
        return (char) 0x3C0;
      case "ρ":
      case "&rgr;":
        return (char) 0x3C1;
      case "σ":
      case "&sgr;":
        return (char) 0x3C3;
      case "τ":
      case "&tgr;":
        return (char) 0x3C4;
      case "υ":
      case "&ugr;":
        return (char) 0x3C5;
      case "φ":
      case "&phgr;":
        return (char) 0x3C6;
      case "χ":
      case "&khgr;":
        return (char) 0x3C7;
      case "ψ":
      case "&psgr;":
        return (char) 0x3C8;
      case "ω":
      case "&ohgr;":
        return (char) 0x3C9;
      case "•":
        return (char) 0x2022;
      case "%":
        return '%';
      case "+":
        return '+';
      case "‐":
        return '-';
      case "ă":
      case "ā":
      case "≊":
      case "ą":
        return 'a';
      case "Ā":
        return 'A';
      case "ć":
      case "č":
      case "ĉ":
        return 'c';
      case "Č":
        return 'C';
      case "ď":
        return 'd';
      case "ě":
      case "ē":
      case "ę":
        return 'e';
      case "Ē":
      case "Ě":
        return 'E';
      case "ĺ":
        return 'l';
      case "Ĺ":
        return 'L';
      case "ń":
      case "ň":
      case "ņ":
        return 'n';
      case "ř":
      case "ŕ":
        return 'r';
      case "Ř":
        return 'R';
      case "ō":
        return 'o';
      case "ī":
        return 'i';
      case "ś":
      case "ş":
      case "ŝ":
        return 's';
      case "&Sacute":
      case "Ş":
        return 'S';
      case "ť":
      case "ţ":
        return 't';
      case "ū":
      case "ů":
        return 'u';
      case "ŵ":
        return 'w';
      case "Ŷ":
        return 'Y';
      case "ŷ":
        return 'y';
      case "ž":
      case "ź":
        return 'z';
      case "Ž":
        return 'Z';
      case "♥":
        return (char) 0x2665;
      case "∞":
        return (char) 0x221E;
      case "$":
        return '$';
      case "⊂":
      case "{":
        return (char) 0x2282;
      case "⊃":
      case "}":
        return (char) 0x2283;
      case "[":
        return '[';
      case "]":
        return ']';
      default:
        return ' ';
    }
  }


  /** Returns a String in which all the XML special characters have been
   *  escaped. The resulting String is valid to print in an XML file as an
   *  attribute or element value in all circumstances.  (Note that it may
   *  escape characters that didn't need to be escaped.)
   *
   *  @param in The String to escape
   *  @return The escaped String
   */
  public static String escapeXML(String in) {
    int leng = in.length();
    StringBuilder sb = new StringBuilder(leng);
    for (int i = 0; i < leng; i++) {
      char c = in.charAt(i);
      if (c == '&') {
        sb.append("&");
      } else if (c == '<') {
        sb.append("<");
      } else if (c == '>') {
        sb.append(">");
      } else if (c == '"') {
        sb.append(""");
      } else if (c == '\'') {
        sb.append("'");
      } else {
        sb.append(c);
      }
    }
    return sb.toString();
  }


  /** Returns a String in which some the XML special characters have been
   *  escaped: just the ones that need escaping in an element content.
   *  That is, only {@code & < >} are escaped (to the entities {@code amp},
   *  {@code lt} and {@code gt}); quotes are left alone.
   *
   *  @param in The String to escape
   *  @return The escaped String
   */
  public static String escapeElementXML(String in) {
    int leng = in.length();
    StringBuilder sb = new StringBuilder(leng);
    for (int i = 0; i < leng; i++) {
      char c = in.charAt(i);
      switch (c) {
        case '&':
          sb.append("&amp;");
          break;
        case '<':
          sb.append("&lt;");
          break;
        case '>':
          sb.append("&gt;");
          break;
        default:
          sb.append(c);
          break;
      }
    }
    return sb.toString();
  }


  /** Returns a String in which some XML special characters have been
   *  escaped. This just escapes attribute value ones, assuming that
   *  you're going to quote with double quotes.
   *  That is, only " and & are escaped.
   *
   *  @param in The String to escape
   *  @return The escaped String
   */
  public static String escapeAttributeXML(String in) {
    int leng = in.length();
    StringBuilder sb = new StringBuilder(leng);
    for (int i = 0; i < leng; i++) {
      char c = in.charAt(i);
      if (c == '&') {
        sb.append("&");
      } else if (c == '"') {
        sb.append(""");
      } else {
        sb.append(c);
      }
    }
    return sb.toString();
  }


  /**
   * Escapes the text between tags with {@link #escapeXML(String)} while
   * copying the tags themselves through unchanged. Processing stops at the
   * first tag that {@link #readAndParseTag(Reader)} cannot return (end of
   * input or a tag it fails to parse).
   *
   * @param s The String, possibly containing XML tags, to escape
   * @return The String with the text around the tags escaped
   */
  public static String escapeTextAroundXMLTags(String s) {
    StringBuilder escaped = new StringBuilder();
    Reader source = new StringReader(s);
    try {
      while (true) {
        escaped.append(escapeXML(readUntilTag(source)));
        XMLTag tag = readAndParseTag(source);
        if (tag == null) {
          break;
        }
        escaped.append(tag);
      }
    } catch (IOException e) {
      log.warn("Error reading string");
      log.warn(e);
    }
    return escaped.toString();
  }

  /**
   * Finds the first space or non-breaking space (U+00A0), whichever comes
   * first, at or after the given index.
   *
   * @param haystack The String to search
   * @param begin The index to start searching from
   * @return The index of the first space or nbsp, or -1 if there is neither
   */
  public static int findSpace(String haystack, int begin) {
    int spaceIdx = haystack.indexOf(' ', begin);
    int nbspIdx = haystack.indexOf('\u00A0', begin);
    if (spaceIdx < 0) {
      return nbspIdx; // also covers "neither found": nbspIdx is then -1
    }
    if (nbspIdx < 0) {
      return spaceIdx;
    }
    return Math.min(spaceIdx, nbspIdx);
  }

  public static class XMLTag {

    /** Stores the complete string passed in as the tag on construction. */
    public String text;

    /** Stores the elememnt name, such as "doc". */
    public String name;

    /** Stores attributes as a Map from keys to values. */
    public Map attributes;

    /** Whether this is an ending tag or not. */
    public boolean isEndTag;

    /** Whether this is an empty element expressed as a single empty element tag like {@code }. */
    public boolean isSingleTag;

    /**
     * Assumes that String contains an XML tag.
     *
     * @param tag String to turn into an XMLTag object
     */
    public XMLTag(String tag) {
      if (tag == null || tag.isEmpty()) {
        throw new NullPointerException("Attempted to parse empty/null tag");
      }
      if (tag.charAt(0) != '<') {
        throw new IllegalArgumentException("Tag did not start with <");
      }
      if (tag.charAt(tag.length() - 1) != '>') {
        throw new IllegalArgumentException("Tag did not end with >");
      }
      text = tag;
      int begin = 1;
      if (tag.charAt(1) == '/') {
        begin = 2;
        isEndTag = true;
      } else {
        isEndTag = false;
      }
      int end = tag.length() - 1;
      if (tag.charAt(tag.length() - 2) == '/') {
        end = tag.length() - 2;
        isSingleTag = true;
      } else {
        isSingleTag = false;
      }
      tag = tag.substring(begin, end);
      attributes = Generics.newHashMap();
      begin = 0;
      end = findSpace(tag, 0);

      if (end < 0) {
        name = tag;
      } else {
        name = tag.substring(begin, end);
        do {
          begin = end + 1;
          while (begin < tag.length() && tag.charAt(begin) < 0x21) {
            begin++; // get rid of leading whitespace
          }
          if (begin == tag.length()) {
            break;
          }
          end = tag.indexOf('=', begin);
          if (end < 0) {
            String att = tag.substring(begin);
            attributes.put(att, "");
            break;
          }
          String att = tag.substring(begin, end).trim();
          begin = end + 1;
          String value = null;
          if (tag.length() > begin) {
            while (begin < tag.length() && tag.charAt(begin) < 0x21) {
              begin++;
            }
            if (begin < tag.length() && tag.charAt(begin) == '\"') {
              // get quoted expression
              begin++;
              end = tag.indexOf('\"', begin);
              if (end < 0) {
                break; // this is a problem
              }
              value = tag.substring(begin, end);
              end++;
            } else {
              // get unquoted expression
              end = findSpace(tag, begin);
              if (end < 0) {
                end = tag.length();
              }
//              System.err.println(begin + " " + end);
              value = tag.substring(begin, end);
            }
          }
          attributes.put(att, value);
        } while (end < tag.length() - 3);
      }
    }

    /**
     * Returns the text of this tag exactly as it was handed to the
     * constructor, including the surrounding angle brackets.
     *
     * @return The original, unparsed tag text
     */
    @Override
    public String toString() {
      return text;
    }

    /**
     * Looks up each of the given attribute names, in order, and returns the
     * value of the first one for which this tag holds a non-null value.
     *
     * @param attributesList Attribute names to try, in priority order
     * @return The value of the first listed attribute whose value is non-null,
     *         or null if none of the listed attributes has one
     */
    public String getFirstNonNullAttributeFromList(List<String> attributesList) {
      for (String attribute : attributesList) {
        // look the attribute up once and reuse the result
        String value = attributes.get(attribute);
        if (value != null) {
          return value;
        }
      }
      return null;
    }
  } // end static class XMLTag


  /**
   * Collects the remainder of an XML tag from the reader, up to and
   * including the closing '>'.  The opening '<' is expected to have been
   * consumed by the caller already; it is put back at the front of the result.
   *
   * @param r The reader to read from
   * @return The tag text starting with '<', such as {@code <doc id="1">},
   *         or null if the reader reports that it is not ready or no further
   *         character could be read (i.e., EOF).  If the input ends before a
   *         '>' is seen, the characters read so far are returned without a
   *         closing bracket.
   * @throws IOException If reading from r fails
   */
  public static String readTag(Reader r) throws IOException {
    if (r.ready()) {
      StringBuilder tagText = new StringBuilder("<");
      for (int ch = r.read(); ch >= 0; ch = r.read()) {
        tagText.append((char) ch);
        if (ch == '>') {
          break; // the tag is complete; leave the rest of the input unread
        }
      }
      // a buffer holding only the seeded '<' means nothing at all was read
      return tagText.length() > 1 ? tagText.toString() : null;
    }
    return null;
  }

  /**
   * Builds an XMLTag from a string, returning null instead of constructing
   * one when the string is not delimited by angle brackets.
   *
   * @param tagString Candidate tag text; may be null
   * @return A new XMLTag for tagString, or null if tagString is null, empty,
   *         does not begin with '<', or does not end with '>'
   */
  public static XMLTag parseTag(String tagString) {
    // startsWith/endsWith are both false for the empty string, so it is rejected too
    boolean bracketed = tagString != null
        && tagString.startsWith("<")
        && tagString.endsWith(">");
    return bracketed ? new XMLTag(tagString) : null;
  }

  /**
   * Parses the XML file with the given name into a DOM Document using a
   * parser that is not namespace aware.  Parse problems are routed through
   * SAXErrorHandler: warnings and recoverable errors are logged, fatal errors
   * are thrown.
   *
   * @param filename Path of the XML file to read
   * @return The parsed document
   * @throws Exception If the file cannot be opened or read, the parser cannot
   *         be configured, or the parser reports a fatal error
   */
  public static Document readDocumentFromFile(String filename) throws Exception {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    factory.setNamespaceAware(false);
    DocumentBuilder db = factory.newDocumentBuilder();
    db.setErrorHandler(new SAXErrorHandler());
    // Open the file only once the parser is ready, and close it on every exit
    // path so a failed parse cannot leak the file handle.
    // NOTE(review): FileReader decodes with the platform default charset, so an
    // encoding named in the XML declaration is not honored - confirm this is intended.
    try (Reader reader = new FileReader(filename)) {
      return db.parse(new InputSource(reader));
    }
  }

  /**
   * Error handler that reports parser warnings and recoverable errors through
   * the class logger and turns fatal errors into exceptions whose message
   * includes the position in the document.
   */
  private static class SAXErrorHandler implements ErrorHandler {

    /**
     * Builds a single-sentence description of a parse problem of the form
     * "msg: detail at document line L, column C in entity from systemID S."
     * The detail part is left out when the exception carries no message, and
     * the entity part is left out when neither a system nor a public id is known.
     *
     * @param msg Prefix naming the severity, e.g. "Warning"
     * @param ex The parse exception to describe
     * @return The formatted description, ending in a period
     */
    public static String makeBetterErrorString(String msg,
                                               SAXParseException ex) {
      StringBuilder sb = new StringBuilder(msg);
      String detail = ex.getMessage();
      // Drop one trailing period so the sentence can continue after the detail.
      // endsWith is safe for the empty string; the null check covers
      // exceptions that were created without a message.
      if (detail != null && detail.endsWith(".")) {
        detail = detail.substring(0, detail.length() - 1);
      }
      if (detail != null && ! detail.isEmpty()) {
        sb.append(": ").append(detail);
      }
      sb.append(" at document line ").append(ex.getLineNumber());
      sb.append(", column ").append(ex.getColumnNumber());
      if (ex.getSystemId() != null) {
        sb.append(" in entity from systemID ").append(ex.getSystemId());
      } else if (ex.getPublicId() != null) {
        sb.append(" in entity from publicID ").append(ex.getPublicId());
      }
      sb.append('.');
      return sb.toString();
    }

    /** Logs the warning at warn level; parsing continues. */
    @Override
    public void warning(SAXParseException exception) {
      log.warn(makeBetterErrorString("Warning", exception));
    }

    /** Logs the recoverable error at error level; nothing is thrown. */
    @Override
    public void error(SAXParseException exception) {
      log.error(makeBetterErrorString("Error", exception));
    }

    /**
     * Rethrows the fatal error with a more descriptive message.
     *
     * @param ex The fatal error reported by the parser
     * @throws SAXParseException Always; it carries the same ids and position
     *         as ex and keeps ex as its cause
     */
    @Override
    public void fatalError(SAXParseException ex) throws SAXParseException {
      throw new SAXParseException(makeBetterErrorString("Fatal Error", ex),
              ex.getPublicId(), ex.getSystemId(), ex.getLineNumber(), ex.getColumnNumber(), ex);
    }

  } // end class SAXErrorHandler

  /**
   * Parses XML held in a String into a DOM Document using a parser that is
   * not namespace aware.  No custom error handler is installed.
   *
   * @param s The XML text to parse
   * @return The parsed document
   * @throws Exception If the parser cannot be configured or parsing fails
   */
  public static Document readDocumentFromString(String s) throws Exception {
    InputSource source = new InputSource(new StringReader(s));
    DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
    builderFactory.setNamespaceAware(false);
    DocumentBuilder builder = builderFactory.newDocumentBuilder();
    return builder.parse(source);
  }

  /** Tests a few methods.
   *  If the first arg is -readDoc then this method tests
   *  readDocumentFromFile on the file named by the second arg.
   *  Otherwise, it tests readTag/readUntilTag and slurpFile on the file
   *  named by the first arg, printing each tag read after the first one.
   *  If the required arguments are missing, a usage message is printed to
   *  stderr and nothing else is done.
   *
   *  @param args Either {@code -readDoc file} or {@code file}
   *  @throws Exception If the file cannot be read or parsed
   */
  public static void main(String[] args) throws Exception {
    // Fail with a readable message rather than an ArrayIndexOutOfBoundsException
    if (args.length == 0 || (args[0].equals("-readDoc") && args.length < 2)) {
      System.err.println("Usage: java edu.stanford.nlp.util.XMLUtils [-readDoc] file");
      return;
    }
    if (args[0].equals("-readDoc")) {
      Document doc = readDocumentFromFile(args[1]);
      System.out.println(doc);
    } else {
      String s = IOUtils.slurpFile(args[0]);
      Reader r = new StringReader(s);
      String tag = readTag(r);
      while (tag != null && ! tag.isEmpty()) {
        // skip ahead to the next tag, then read and print it
        readUntilTag(r);
        tag = readTag(r);
        if (tag == null || tag.isEmpty()) {
          break;
        }
        System.out.println("got tag=" + new XMLTag(tag));
      }
    }
  }

}