groovyx.net.http.ParserRegistry Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of http-builder Show documentation
A builder-style HTTP client API, including authentication, and extensible handling of common content-types such as JSON and XML. It is built on top of Apache's HttpClient.
There is a newer version: 0.7.1
Show newest version
/*
 * Copyright 2008-2011 Thomas Nichols.  http://blog.thomnichols.org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * You are receiving this code free of charge, which represents many hours of
 * effort from other individuals and corporations.  As a responsible member
 * of the community, you are encouraged (but not required) to donate any
 * enhancements or improvements back to the community under a similar open
 * source license.  Thank you. -TMN
 */
package groovyx.net.http;

import groovy.json.JsonSlurper;
import groovy.lang.Closure;
import groovy.util.XmlSlurper;
import groovy.util.slurpersupport.GPathResult;
import groovyx.net.http.HTTPBuilder.RequestConfigDelegate;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.entity.HttpEntityWrapper;
import org.apache.http.message.BasicHeader;
import org.apache.xml.resolver.Catalog;
import org.apache.xml.resolver.CatalogManager;
import org.apache.xml.resolver.tools.CatalogResolver;
import org.codehaus.groovy.runtime.MethodClosure;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;


/**
 * Keeps track of response parsers for each content type.  Each parser
 * should be a closure that accepts an {@link HttpResponse} instance,
 * and returns whatever handler is appropriate for reading the response
 * data for that content-type.  For example, a plain-text response should
 * probably be parsed with a Reader, while an XML response
 * might be parsed by an XmlSlurper, which would then be passed to the
 * response closure. 
 *
 * Note that all methods in this class assume {@link HttpResponse#getEntity()}
 * return a non-null value.  It is the job of the HTTPBuilder instance to ensure
 * a NullPointerException is not thrown by passing a response that contains no
 * entity.
 *
 * You can see the list of content-type parsers that are built-in to the
 * ParserRegistry class in {@link #buildDefaultParserMap()}.
 *
 * @see ContentType
 * @author Tom Nichols
 */
public class ParserRegistry {

    /**
     * The default parser used for unregistered content-types.  This is a copy
     * of {@link #parseStream(HttpResponse)}, which is like a no-op that just
     * returns the unaltered response stream.
     */
    protected final Closure DEFAULT_PARSER = new MethodClosure( this, "parseStream" );
    /**
     * The default charset to use when no charset is given in the Content-Type
     * header of a response.  This can be modifid via {@link #setDefaultCharset(String)}.
     */
    public static final String DEFAULT_CHARSET = "UTF-8";

    private Closure defaultParser = DEFAULT_PARSER;
    private Map registeredParsers = buildDefaultParserMap();
    private static String defaultCharset = DEFAULT_CHARSET;

    protected static final Log log = LogFactory.getLog( ParserRegistry.class );

    /**
     * This CatalogResolver is static to avoid the overhead of re-parsing
     * the catalog definition file every time.  Unfortunately, there's no
     * way to share a single Catalog instance between resolvers.  The
     * {@link Catalog} class is technically not thread-safe, but as long as you
     * do not parse catalog files while using the resolver, it should be fine.
     */
    protected static CatalogResolver catalogResolver;

    static {
        CatalogManager catalogManager = new CatalogManager();
        catalogManager.setIgnoreMissingProperties( true );
        catalogManager.setUseStaticCatalog( false );
        catalogManager.setRelativeCatalogs( true );
        try {
            catalogResolver = new CatalogResolver( catalogManager );
            catalogResolver.getCatalog().parseCatalog(
                    ParserRegistry.class.getResource( "/catalog/html.xml" ) );
        } catch ( IOException ex ) {
            LogFactory.getLog( ParserRegistry.class )
                .warn( "Could not resolve default XML catalog", ex );
        }
    }

    /**
     * Set the charset to use for parsing character streams when no charset
     * is given in the Content-Type header.
     * @param charset the charset to use, or null to use
     *     {@link #DEFAULT_CHARSET}
     */
    public static void setDefaultCharset( String charset ) {
        defaultCharset = charset == null ? DEFAULT_CHARSET : charset;
    }

    /**
     * Helper method to get the charset from the response.  This should be done
     * when manually parsing any text response to ensure it is decoded using the
     * correct charset. For instance:     * Reader reader = new InputStreamReader( resp.getEntity().getContent(),
     *   ParserRegistry.getCharset( resp ) );
     * @param resp
     */
    public static String getCharset( HttpResponse resp ) {
        try {
            NameValuePair charset = resp.getEntity().getContentType()
                .getElements()[0].getParameterByName("charset");

            if ( charset == null || charset.getValue().trim().equals("") ) {
                log.debug( "Could not find charset in response; using " + defaultCharset );
                return defaultCharset;
            }

            return charset.getValue();
        }
        catch ( RuntimeException ex ) { // NPE or OOB Exceptions
            log.warn( "Could not parse charset from content-type header in response" );
            return Charset.defaultCharset().name();
        }
    }

    /**
     * Helper method to get the content-type string from the response
     * (no charset).
     * @param resp
     */
    public static String getContentType( HttpResponse resp ) {
        if ( resp.getEntity() == null )
            throw new IllegalArgumentException( "Response does not contain data" );
        if ( resp.getEntity().getContentType() == null )
            throw new IllegalArgumentException( "Response does not have a content-type header" );
        try {
            return resp.getEntity().getContentType().getElements()[0].getName();
        }
        catch ( RuntimeException ex ) {  // NPE or OOB Exceptions
            throw new IllegalArgumentException( "Could not parse content-type from response" );
        }
    }

    /**
     * Default parser used for binary data.  This simply returns the underlying
     * response InputStream.
     * @see ContentType#BINARY
     * @see HttpEntity#getContent()
     * @param resp
     * @return an InputStream the binary response stream
     * @throws IllegalStateException
     * @throws IOException
     */
    public InputStream parseStream( HttpResponse resp ) throws IOException {
        return resp.getEntity().getContent();
    }

    /**
     * Default parser used to handle plain text data.  The response text
     * is decoded using the charset passed in the response content-type
     * header.
     * @see ContentType#TEXT
     * @param resp
     * @return
     * @throws UnsupportedEncodingException
     * @throws IllegalStateException
     * @throws IOException
     */
    public Reader parseText( HttpResponse resp ) throws IOException {
        return new InputStreamReader( resp.getEntity().getContent(),
                ParserRegistry.getCharset( resp ) );
    }

    /**
     * Default parser used to decode a URL-encoded response.
     * @see ContentType#URLENC
     * @param resp
     * @return
     * @throws IOException
     */
    public Map parseForm( final HttpResponse resp ) throws IOException {
        HttpEntity entity = resp.getEntity();
        /* URLEncodedUtils won't parse the content unless the content-type is
           application/x-www-form-urlencoded.  Since we want to be able to force
           parsing regardless of what the content-type header says, we need to
           'spoof' the content-type if it's not already acceptable. */
        if ( ! ContentType.URLENC.toString().equals( ParserRegistry.getContentType( resp ) ) ) {
            entity = new HttpEntityWrapper( entity ) {
                @Override public org.apache.http.Header getContentType() {
                    String value = ContentType.URLENC.toString();
                    String charset = ParserRegistry.getCharset( resp );
                    if ( charset != null ) value += "; charset=" + charset;
                    return new BasicHeader( "Content-Type", value );
                };
            };
        }
        List params = URLEncodedUtils.parse( entity );
        Map paramMap = new HashMap(params.size());
        for ( NameValuePair param : params )
            paramMap.put( param.getName(), param.getValue() );
        return paramMap;
    }

    /**
     * Parse an HTML document by passing it through the NekoHTML parser.
     * @see ContentType#HTML
     * @see org.cyberneko.html.parsers.SAXParser
     * @see XmlSlurper#parse(Reader)
     * @param resp HTTP response from which to parse content
     * @return the {@link GPathResult} from calling {@link XmlSlurper#parse(Reader)}
     * @throws IOException
     * @throws SAXException
     */
    public GPathResult parseHTML( HttpResponse resp ) throws IOException, SAXException {
        XMLReader p = new org.cyberneko.html.parsers.SAXParser();
        p.setEntityResolver( catalogResolver );
        return new XmlSlurper( p ).parse( parseText( resp ) );
    }

    /**
     * Default parser used to decode an XML response.
     * @see ContentType#XML
     * @see XmlSlurper#parse(Reader)
     * @param resp HTTP response from which to parse content
     * @return the {@link GPathResult} from calling {@link XmlSlurper#parse(Reader)}
     * @throws IOException
     * @throws SAXException
     * @throws ParserConfigurationException
     */
    public GPathResult parseXML( HttpResponse resp ) throws IOException, SAXException, ParserConfigurationException {
        XmlSlurper xml = new XmlSlurper();
        xml.setEntityResolver( catalogResolver );
        return xml.parse( parseText( resp ) );
    }

    /**
     * Default parser used to decode a JSON response.
     * @see ContentType#JSON
     * @param resp
     * @return
     * @throws IOException
     */
    public Object parseJSON( HttpResponse resp ) throws IOException {
        // there is a bug in the JsonSlurper.parse method...
        //String jsonTxt = DefaultGroovyMethods.getText( parseText( resp ) );
        return new JsonSlurper().parse( parseText( resp ) );
    }

    /**
     * Returns a map of default parsers.  Override this method to change
     * what parsers are registered by default.  A 'parser' is really just a
     * closure that acceipts an {@link HttpResponse} instance and returns
     * some parsed data.  You can of course call
     * super.buildDefaultParserMap() and then add or remove
     * from that result as well.
     *
     * Default registered parsers are:
     * 

     * {@link ContentType#BINARY} :  {@link #parseStream(HttpResponse) parseStream()}
     * {@link ContentType#TEXT} :  {@link #parseText(HttpResponse) parseText()}
     * {@link ContentType#URLENC} :  {@link #parseForm(HttpResponse) parseForm()}
     * {@link ContentType#XML} :  {@link #parseXML(HttpResponse) parseXML()}
     * {@link ContentType#JSON} :  {@link #parseJSON(HttpResponse) parseJSON()}
     * 
     */
    protected Map buildDefaultParserMap() {
        Map parsers = new HashMap();

        parsers.put( ContentType.BINARY.toString(), new MethodClosure( this, "parseStream" ) );
        parsers.put( ContentType.TEXT.toString(), new MethodClosure(this,"parseText") );
        parsers.put( ContentType.URLENC.toString(), new MethodClosure(this,"parseForm") );
        parsers.put( ContentType.HTML.toString(), new MethodClosure(this,"parseHTML") );

        Closure pClosure = new MethodClosure(this,"parseXML");
        for ( String ct : ContentType.XML.getContentTypeStrings() )
            parsers.put( ct, pClosure );

        pClosure = new MethodClosure(this,"parseJSON");
        for ( String ct : ContentType.JSON.getContentTypeStrings() )
            parsers.put( ct, pClosure );

        return parsers;
    }

    /**
     * Add a new XML catalog definiton to the static XML resolver catalog.
     * See the 
     * HTTPBuilder source catalog for an example.
     *
     * @param catalogLocation URL of a catalog definition file
     * @throws IOException if the given URL cannot be parsed or accessed for whatever reason.
     */
    public static void addCatalog( URL catalogLocation ) throws IOException {
        catalogResolver.getCatalog().parseCatalog( catalogLocation );
    }

    /**
     * Access the default catalog used by all HTTPBuilder instances.
     * @return the static {@link CatalogResolver} instance
     */
    public static CatalogResolver getCatalogResolver() {
        return catalogResolver;
    }

    /**
     * Get the default parser used for unregistered content-types.
     * @return
     */
    public Closure getDefaultParser() {
        return this.defaultParser;
    }

    /**
     * Set the default parser used for unregistered content-types.
     * @param defaultParser if
     */
    public void setDefaultParser( Closure defaultParser ) {
        if ( defaultParser == null ) this.defaultParser = DEFAULT_PARSER;
        this.defaultParser = defaultParser;
    }

    /**
     * Retrieve a parser for the given response content-type string.  This
     * is called by HTTPBuildre to retrieve the correct parser for a given
     * content-type.  The parser is then used to decode the response data prior
     * to passing it to a response handler.
     * @param contentType
     * @return parser that can interpret the given response content type,
     *   or the default parser if no parser is registered for the given
     *   content-type.  It should NOT return a null value.
     */
    public Closure getAt( Object contentType ) {
        String ct = contentType.toString();
        int idx = ct.indexOf( ';' );
        if ( idx > 0 ) ct = ct.substring( 0, idx );

        Closure parser = registeredParsers.get(ct);
        if ( parser != null ) return parser;

        log.warn( "Cannot find parser for content-type: " + ct
                    + " -- using default parser.");
        return defaultParser;
    }

    /**
     * Register a new parser for the given content-type.  The parser closure
     * should accept an {@link HttpResponse} argument and return a type suitable
     * to be passed as the 'parsed data' argument of a
     * {@link RequestConfigDelegate#getResponse() response handler} closure.
     * @param contentType  content-type string
     * @param value code that will parse the HttpResponse and return parsed
     *   data to the response handler.
     */
    public void putAt( Object contentType, Closure value ) {
        if ( contentType instanceof ContentType ) {
            for ( String ct : ((ContentType)contentType).getContentTypeStrings() )
                this.registeredParsers.put( ct, value );
        }
        else this.registeredParsers.put( contentType.toString(), value );
    }

    /**
     * Alias for {@link #getAt(Object)} to allow property-style access.
     * @param key content-type string
     * @return
     */
    public Closure propertyMissing( Object key ) {
        return this.getAt( key );
    }

    /**
     * Alias for {@link #putAt(Object, Closure)} to allow property-style access.
     * @param key content-type string
     * @param value parser closure
     */
    public void propertyMissing( Object key, Closure value ) {
        this.putAt( key, value );
    }

    /**
     * Iterate over the entire parser map
     * @return
     */
    public Iterator> iterator() {
        return this.registeredParsers.entrySet().iterator();
    }
}