org.biojavax.utils.XMLTools Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of core Show documentation
BioJava core module
There is a newer version: 1.9.7
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojavax.utils;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.regex.Pattern;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Utility class for reading chunks of XML files and feeding them to SAX.
 * @author Richard Holland
 * @since 1.5
 */
public class XMLTools {
    
    /**
     * Read-ahead limit, in chars, handed to {@link BufferedReader#mark(int)}
     * before peeking for the start of the next chunk.
     */
    private static final int LOOKAHEAD_LIMIT = 10000;
    
    // Static methods so should never be instantiated.
    private XMLTools() {}
    
    /**
     * Reads one chunk of an XML document from the reader and passes it to a SAX parser.
     * <p>
     * Lines are read and trimmed one at a time. Everything before the first line
     * containing {@code <chunkToken} is skipped; from that line on, every line is
     * buffered up to and including the first line containing {@code </chunkToken>}.
     * The buffered text is then parsed as a stand-alone document by a validating
     * SAX parser, with {@code m_handler} receiving the events. Only one chunk is
     * held in memory at a time, so each chunk must fit into available memory.
     * <p>
     * After the chunk has been buffered, the reader is marked with a read-ahead
     * limit of 10000 chars and scanned for another line containing
     * {@code <chunkToken}; it is then reset to the mark so that the next call
     * continues straight after the chunk just read. If that scan has to read up to
     * or beyond the read-ahead limit, the reset may fail with an
     * {@code IOException}.
     * <p>
     * Known limitations: self-closing {@code <chunkToken/>} elements are not
     * handled, nor are lines carrying more than one of the searched tags, nor tags
     * with extra spaces between the angle brackets, slash and tag name. The token
     * is inserted into regular expressions as-is, so it should not contain regex
     * metacharacters.
     *
     * @param reader the reader to read the XML from; it must support mark/reset
     * @param m_handler the SAX handler to feed the chunk's parse events to
     * @param chunkToken the name of the element that delimits a chunk
     * @return true if there is another chunk left to read after this one, false if not.
     * @throws ParserConfigurationException if there was a problem setting up the SAX parser.
     * @throws SAXException if there was a problem parsing the XML, or if the input
     * ended before a complete chunk was found.
     * @throws IOException if there was a problem reading the XML from the reader.
     */
    public static boolean readXMLChunk(BufferedReader reader, DefaultHandler m_handler, String chunkToken) throws ParserConfigurationException, SAXException, IOException {
        // read next chunk from <chunkToken> to </chunkToken> inclusive into buffer
        // process buffer through XML parser
        StringBuffer buffer = new StringBuffer();

        Pattern start = Pattern.compile(".*<" + chunkToken + ".*");
        // Must match the closing tag only; a catch-all pattern here would end
        // the chunk on its very first line.
        Pattern end = Pattern.compile(".*</" + chunkToken + ">.*");
        
        boolean begunChunk = false;
        boolean filledBuffer = false;
        String line = null;
        while (!filledBuffer && (line = reader.readLine()) != null) {
            line = line.trim();
            if (!begunChunk) {
                if (!start.matcher(line).matches()) {
                    // Still in front of the chunk: discard this line.
                    continue;
                }
                begunChunk = true;
            }
            buffer.append(line).append('\n');
            if (end.matcher(line).matches()) {
                filledBuffer = true;
            }
        }
        if (!filledBuffer) {
            throw new SAXException("Unexpectedly reached end of file");
        }
        
        // Peek ahead for the opening tag of a following chunk, then rewind so
        // the next call starts reading immediately after this chunk.
        reader.mark(LOOKAHEAD_LIMIT);
        boolean hasAnotherChunk = false;
        while (!hasAnotherChunk && (line = reader.readLine()) != null) {
            line = line.trim();
            if (start.matcher(line).matches()) {
                hasAnotherChunk = true;
            }
        }
        reader.reset();
        
        // Parse the buffered chunk as a document of its own.
        SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setValidating(true);
        SAXParser parser = factory.newSAXParser();
        
        InputSource source = new InputSource(new StringReader(buffer.toString()));
        parser.parse(source, m_handler);
        
        // return true if there are more in our buffer
        return hasAnotherChunk;
    }
}