All downloads are free. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.process.TransformXML Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.process; 
import edu.stanford.nlp.util.logging.Redwood;


import java.io.*;
import java.util.*;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.util.function.Function;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.util.XMLUtils;


/**
 * Reads XML from an input file or stream and writes XML to an output
 * file or stream, while transforming text appearing inside specified
 * XML tags by applying a specified {@link Function
 * Function}.  See TransformXMLApplications for examples.
 * Implementation note: This is done using SAX2.
 *
 * @param  The type of the output of the Function (from String to T)
 * @author Bill MacCartney
 * @author Anna Rafferty (refactoring, making SAXInterface easy to extend elsewhere)
 */
public class TransformXML  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(TransformXML.class);

  private final SAXParser saxParser;

  public SAXInterface buildSaxInterface() { return new SAXInterface<>(); }

  public static class SAXInterface extends DefaultHandler {

    protected List elementsToBeTransformed;
    protected StringBuffer textToBeTransformed;
    protected PrintWriter outWriter = new PrintWriter(System.out, true);
    protected Function function;

    /**
     * How far down we are in the nested tags.  For example, if we've
     * seen <foo> <bar> and "foo" and "bar" are both tags
     * we care about, then depth = 2.
     */
    protected int depth = 0;

    public SAXInterface() {
      elementsToBeTransformed = new ArrayList<>();
      depth = 0;
      openingTag = null;
      textToBeTransformed = new StringBuffer();
    }

    /**
     * The first tag from {@link elementsToBeTransformed}
     * that we saw the last time {@link depth} was
     * 0.
     * 
* You would expect incoming XML to be well-formatted, but just in * case it isn't, we keep track of this so we can output the * correct closing tag. */ String openingTag; private void outputTextAndTag(String qName, Attributes attributes, boolean close) { // If we're not already in an element to be transformed, first // echo the previous text... outWriter.print(XMLUtils.escapeXML(textToBeTransformed.toString())); textToBeTransformed = new StringBuffer(); // ... then echo the new tag to outStream outWriter.print('<'); if (close) { outWriter.print('/'); } outWriter.print(qName); if (attributes != null) { for (int i = 0; i < attributes.getLength(); i++) { outWriter.print(' '); outWriter.print(attributes.getQName(i)); outWriter.print("=\""); outWriter.print(XMLUtils.escapeXML(attributes.getValue(i))); outWriter.print('"'); } } outWriter.print(">\n"); } @Override public void endDocument() { // Theoretically, there shouldn't be anything in the buffer after // the last closing tag, but if there is, it's probably better to // echo it than ignore it outWriter.print(XMLUtils.escapeXML(textToBeTransformed.toString())); // we need to flush because there are no other ways we // explicitely flush outWriter.flush(); } // Called at the beginning of each element. If the tag is on the // designated list, set flag to remember that we're in an element // to be transformed. In either case, echo tag. @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { //log.info("start element " + qName); if (depth == 0) { outputTextAndTag(qName, attributes, false); } if (elementsToBeTransformed.contains(qName)) { if (depth == 0) { openingTag = qName; } ++depth; } } // Called at the end of each element. If the tag is on the // designated list, apply the designated {@link Function // Function} to the accumulated text and echo the the // result. In either case, echo the closing tag. 
@Override public void endElement(String uri, String localName, String qName) throws SAXException { //log.info("end element " + qName + "; function is " + function.getClass()); //log.info("elementsToBeTransformed is " + elementsToBeTransformed); //log.info("textToBeTransformed is " + textToBeTransformed); if (depth == 0) { outputTextAndTag(qName, null, true); } else { if (elementsToBeTransformed.contains(qName)) { --depth; if (depth == 0) { String text = textToBeTransformed.toString().trim(); // factored out so subclasses can handle the text differently processText(text); textToBeTransformed = new StringBuffer(); outWriter.print("\n"); } } // when we're inside a block to be transformed, we ignore // elements that don't end the block. } } public void processText(String text) { if (text.length() > 0) { text = function.apply(text).toString(); outWriter.print(XMLUtils.escapeXML(text)); outWriter.print('\n'); } } // Accumulate characters in buffer of text to be transformed // (SAX may call this after each line break) @Override public void characters(char[] buf, int offset, int len) throws SAXException { // log.info("characters |" + new String(buf, offset, len) + "|"); textToBeTransformed.append(buf, offset, len); } } // end static class SAXInterface /** * This version of the SAXInterface doesn't escape the text produced * by the function. This is useful in the case where the function * already produces well-formed XML. One example of this is the * Tagger, which already escapes the inner text and produces xml * tags around the words. 
*/ public static class NoEscapingSAXInterface extends SAXInterface { @Override public void processText(String text) { if (text.length() > 0) { text = function.apply(text).toString(); outWriter.print(text); outWriter.print('\n'); } } } public TransformXML() { try { saxParser = SAXParserFactory.newInstance().newSAXParser(); } catch (Exception e) { log.info("Error configuring XML parser: " + e); throw new RuntimeException(e); } } /** * Read XML from the specified file and write XML to stdout, * while transforming text appearing inside the specified XML * tags by applying the specified {@link Function * Function}. Note that the Function * you supply must be prepared to accept Strings as * input; if your Function doesn't handle * Strings, you need to write a wrapper for it that * does. * * @param tags an array of Strings, each an XML tag * within which the transformation should be applied * @param fn the {@link Function Function} to apply * @param in the File to read from */ public void transformXML(String[] tags, Function fn, File in) { InputStream ins = null; try { ins = new BufferedInputStream(new FileInputStream(in)); transformXML(tags, fn, ins, System.out); } catch (Exception e) { log.info("Error reading file " + in + ": " + e); e.printStackTrace(); } finally { IOUtils.closeIgnoringExceptions(ins); } } /** * Read XML from the specified file and write XML to specified file, * while transforming text appearing inside the specified XML tags * by applying the specified {@link Function Function}. * Note that the Function you supply must be * prepared to accept Strings as input; if your * Function doesn't handle Strings, you * need to write a wrapper for it that does. 
* * @param tags an array of Strings, each an XML tag * within which the transformation should be applied * @param fn the {@link Function Function} to apply * @param in the File to read from * @param out the File to write to */ public void transformXML(String[] tags, Function fn, File in, File out) { InputStream ins = null; OutputStream outs = null; try { ins = new BufferedInputStream(new FileInputStream(in)); outs = new BufferedOutputStream(new FileOutputStream(out)); transformXML(tags, fn, ins, outs); } catch (Exception e) { log.info("Error reading file " + in + " or writing file " + out + ": " + e); e.printStackTrace(); } finally { IOUtils.closeIgnoringExceptions(ins); IOUtils.closeIgnoringExceptions(outs); } } /** * Read XML from input stream and write XML to stdout, while * transforming text appearing inside the specified XML tags by * applying the specified {@link Function Function}. * Note that the Function you supply must be * prepared to accept Strings as input; if your * Function doesn't handle Strings, you * need to write a wrapper for it that does. * * @param tags an array of Strings, each an XML tag * within which the transformation should be applied * @param fn the {@link Function Function} to apply * @param in the InputStream to read from */ public void transformXML(String[] tags, Function fn, InputStream in) { transformXML(tags, fn, in, System.out); } /** * Read XML from input stream and write XML to output stream, * while transforming text appearing inside the specified XML tags * by applying the specified {@link Function Function}. * Note that the Function you supply must be * prepared to accept Strings as input; if your * Function doesn't handle Strings, you * need to write a wrapper for it that does. 
* * @param tags an array of Strings, each an XML tag * within which the transformation should be applied * @param fn the {@link Function Function} to apply * @param in the InputStream to read from * @param out the OutputStream to write to */ public void transformXML(String[] tags, Function fn, InputStream in, OutputStream out) { transformXML(tags, fn, in, new OutputStreamWriter(out), buildSaxInterface()); } /** * Read XML from input stream and write XML to output stream, * while transforming text appearing inside the specified XML tags * by applying the specified {@link Function Function}. * Note that the Function you supply must be * prepared to accept Strings as input; if your * Function doesn't handle Strings, you * need to write a wrapper for it that does. *

Implementation notes: The InputStream is assumed to already * be buffered if useful, and we need a stream, so that the XML decoder * can determine the correct character encoding of the XML file. The output * is to a Writer, and the provided Writer should again be buffered if * desirable. Internally, this Writer is wrapped as a PrintWriter. * * @param tags an array of Strings, each an XML entity * within which the transformation should be applied * @param fn the {@link Function Function} to apply * @param in the InputStream to read from * @param w the Writer to write to */ public void transformXML(String[] tags, Function fn, InputStream in, Writer w) { transformXML(tags, fn, in, w, buildSaxInterface()); } /** * Calls the fully specified transformXML with an InputSource * constructed from in. */ public void transformXML(String[] tags, Function fn, InputStream in, Writer w, SAXInterface handler) { transformXML(tags, fn, new InputSource(in), w, handler); } /** * Calls the fully specified transformXML with an InputSource * constructed from in. */ public void transformXML(String[] tags, Function fn, Reader in, Writer w, SAXInterface handler) { transformXML(tags, fn, new InputSource(in), w, handler); } /** * Read XML from input source and write XML to output writer, * while transforming text appearing inside the specified XML tags * by applying the specified {@link Function Function}. * Note that the Function you supply must be * prepared to accept Strings as input; if your * Function doesn't handle Strings, you * need to write a wrapper for it that does. *
*

Implementation notes: The InputSource is assumed to already * be buffered if useful, and we need a stream, so that the XML decoder * can determine the correct character encoding of the XML file. * TODO: does that mean there's a bug if you send it a Reader * instead of an InputStream? It seems to work with a Reader... *
* The output is to a Writer, and the provided Writer should again * be buffered if desirable. Internally, this Writer is wrapped as * a PrintWriter. * * @param tags an array of Strings, each an XML entity * within which the transformation should be applied * @param fn the {@link Function Function} to apply * @param in the InputStream to read from * @param w the Writer to write to * @param saxInterface the sax handler you would like to use (default is SaxInterface, defined in this class, but you may define your own handler) */ public void transformXML(String[] tags, Function fn, InputSource in, Writer w, SAXInterface saxInterface) { saxInterface.outWriter = new PrintWriter(w, true); saxInterface.function = fn; saxInterface.elementsToBeTransformed = new ArrayList<>(); saxInterface.elementsToBeTransformed.addAll(Arrays.asList(tags)); try { saxParser.parse(in, saxInterface); } catch (Exception e) { throw new RuntimeException(e); } } } // end class TransformXML





© 2015 - 2024 Weber Informatics LLC | Privacy Policy