All downloads are free. Search and download functionality uses the official Maven repository.

jlibs.xml.sax.crawl.XMLCrawler Maven / Gradle / Ivy

There is a newer version: 3.0.1
Show newest version
/**
 * Copyright 2015 Santhosh Kumar Tekuri
 *
 * The JLibs authors license this file to you under the Apache License,
 * version 2.0 (the "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at:
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */

package jlibs.xml.sax.crawl;

import jlibs.core.io.ByteArrayOutputStream2;
import jlibs.core.io.FileNavigator;
import jlibs.core.io.FileUtil;
import jlibs.core.lang.ByteSequence;
import jlibs.core.lang.OS;
import jlibs.core.net.URLUtil;
import jlibs.xml.sax.SAXUtil;
import jlibs.xml.xsl.TransformerUtil;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.XMLFilterImpl;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.sax.SAXSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;

/**
 * Crawls an XML document together with the documents it links to, saving each
 * one to a local file and rewriting the link attributes so that they point at
 * the saved copies.
 * <p>
 * The {@link CrawlingRules} given at construction describe which elements carry
 * links and which attributes hold the link location and namespace. The
 * {@link CrawlerListener} passed to {@code crawl} chooses the file for each
 * document and decides whether a linked document is followed.
 * <p>
 * State of the crawl in progress is kept in instance fields, so one instance
 * must not run overlapping crawls. Linked documents are crawled by fresh
 * instances that share the rules, the resolver and the map of already crawled
 * documents.
 *
 * @author Santhosh Kumar T
 */
public class XMLCrawler{
    private CrawlingRules rules;

    /** Rule element matching the XML element currently being processed. */
    private Element current;
    /** Number of open XML elements below the last one that matched a rule; 0 while still on a matched path. */
    private int depth;

    /** Creates a crawler that uses {@link CrawlingRules#defaultRules()}. */
    public XMLCrawler(){
        rules = CrawlingRules.defaultRules();
        crawled = new HashMap<URL, File>();
    }
    
    /**
     * Creates a crawler that uses the given rules.
     *
     * @param rules rules describing the elements and attributes that carry links
     */
    public XMLCrawler(CrawlingRules rules){
        this.rules = rules;
        crawled = new HashMap<URL, File>();
    }

    /**
     * Creates a crawler for a linked document. It shares the rules, the resolver
     * and the crawled-documents map of the given crawler, so a document reached
     * through several links is fetched only once.
     */
    private XMLCrawler(XMLCrawler crawler){
        rules = crawler.rules;
        crawled = crawler.crawled;
        resolver = crawler.resolver;
    }

    /**
     * SAX filter that tracks the position inside the rule tree while the document
     * streams through, and replaces link attributes with local references.
     */
    private XMLFilterImpl xmlFilter = new XMLFilterImpl(){
        @Override
        public void startDocument() throws SAXException{
            current = rules.doc;
            // a previous parse that was aborted midway may have left a stale depth behind
            depth = 0;
            super.startDocument();
        }

        @Override
        public void startElement(String namespaceURI, String localName, String qualifiedName, Attributes atts) throws SAXException{
            if(depth==0){
                Element elem = current.findChild(namespaceURI, localName);
                if(elem==null)
                    depth = 1; // left the rule tree; from here on only nesting is counted
                else{
                    current = elem;
                    try{
                        // the first matched element that declares an extension decides the output file
                        if(file==null && current.extension!=null)
                            setFile(listener.toFile(url, current.extension));
                        atts = localizeLink(atts);
                    }catch(IOException ex){
                        throw new SAXException(ex);
                    }
                }
            }else
                depth++;

            super.startElement(namespaceURI, localName, qualifiedName, atts);
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException{
            super.endElement(uri, localName, qName);
            if(depth==0)
                current = current.parent;
            else
                depth--;
        }
    };

    /**
     * Rewrites the link attribute of the element matched by {@link #current}.
     * <p>
     * The link is resolved against the document being crawled. If the target has
     * not been crawled yet and the listener agrees, it is crawled now by a child
     * crawler. When a local file exists for the target, the attribute value becomes
     * the path of that file relative to the directory of the current output file;
     * otherwise it becomes the resolved location.
     *
     * @param atts attributes of the matched element
     * @return {@code atts} itself when the rule has no location attribute or the
     *         element does not carry it, else a copy with the link replaced
     * @throws IOException if crawling the linked document fails
     */
    private Attributes localizeLink(Attributes atts) throws IOException{
        if(current.locationAttribute==null)
            return atts;

        String locationNS = current.locationAttribute.getNamespaceURI();
        String locationName = current.locationAttribute.getLocalPart();
        String location = atts.getValue(locationNS, locationName);
        if(location==null)
            return atts;

        String linkNamespace = null;
        if(current.namespaceAttribute!=null)
            linkNamespace = atts.getValue(current.namespaceAttribute.getNamespaceURI(), current.namespaceAttribute.getLocalPart());

        location = resolveLink(linkNamespace, location);
        URL targetURL = URLUtil.toURL(location);
        File targetFile = null;
        if(crawled!=null)
            targetFile = crawled.get(targetURL);
        if(targetFile==null && listener.doCrawl(targetURL))
            targetFile = new XMLCrawler(this).crawl(new InputSource(location), listener, null);
        String href = targetFile==null ? location : FileNavigator.INSTANCE.getRelativePath(file.getParentFile(), targetFile);

        AttributesImpl newAtts = new AttributesImpl(atts);
        newAtts.setValue(atts.getIndex(locationNS, locationName), href);
        return newAtts;
    }

    /*-------------------------------------------------[ Resource Resolver ]---------------------------------------------------*/

    private Resolver resolver;

    /**
     * Sets the resolver consulted before links are resolved against the URL of
     * the document that contains them.
     *
     * @param resolver the resolver to use, or {@code null} for none
     */
    public void setResolver(Resolver resolver){
        this.resolver = resolver;
    }

    /**
     * Resolves a link found in the document being crawled. The configured
     * resolver, if any, is asked first; when it is absent or returns {@code null}
     * the location is resolved against the URL of the current document.
     */
    private String resolveLink(String namespace, String location){
        String resolvedLocation = null;
        if(resolver!=null)
            resolvedLocation = resolver.resolve(namespace, url.toExternalForm(), location);
        if(resolvedLocation==null)
            resolvedLocation = URLUtil.toURI(url).resolve(location).toString();
        return resolvedLocation;
    }

    /** Hook for mapping a link to the location it should be fetched from. */
    public static interface Resolver{
        /**
         * @param namespace value of the rule's namespace attribute on the linking element, or {@code null} if unavailable
         * @param base      external form of the URL of the document containing the link
         * @param location  link location exactly as written in the document
         * @return the resolved location, or {@code null} to fall back to resolving {@code location} against {@code base}
         */
        public String resolve(String namespace, String base, String location);
    }

    /*-------------------------------------------------[ Crawling ]---------------------------------------------------*/

    private CrawlerListener listener;

    /** URL of the document being crawled. */
    private URL url;
    /** File the current document is written to; {@code null} until one has been chosen. */
    private File file;
    private DelegatingOutputStream out;

    /** Documents crawled so far, keyed by their URL; shared with child crawlers. */
    private Map<URL, File> crawled;

    /**
     * Makes {@code file} the output of the current document: the output buffered
     * so far is written to it, further output goes to it directly, and the
     * document is recorded as crawled.
     */
    private void setFile(File file) throws IOException{
        this.file = file;
        out.setDelegate(new FileOutputStream(file));
        crawled.put(url, file);
    }

    /**
     * Crawls the given document and, where the listener allows it, the documents
     * it links to.
     *
     * @param document the document to crawl; its system id must be set
     * @param listener chooses output files and decides which links are followed
     * @param file     file to write the document to, or {@code null} to let the
     *                 listener choose one once a rule element with an extension is seen
     * @return the file the document was written to, or {@code null} if no file was chosen
     * @throws IllegalArgumentException if {@code document} has no system id
     * @throws IOException              if reading, parsing, transforming or writing fails
     */
    public File crawl(InputSource document, CrawlerListener listener, File file) throws IOException{
        if(document.getSystemId()==null)
            throw new IllegalArgumentException("InputSource without systemID can't be crawled");
        this.listener = listener;
        // forget the output of an earlier crawl, otherwise a reused instance would never pick a file for this document
        this.file = null;
        url = URLUtil.toURL(document.getSystemId());
        out = new DelegatingOutputStream();

        boolean completed = false;
        try{
            xmlFilter.setParent(SAXUtil.newSAXParser(true, false, false).getXMLReader());
            SAXSource source = new SAXSource(xmlFilter, document);
            if(file!=null)
                setFile(file);

            // NOTE(review): presumably an identity transformer that indents its output - confirm against TransformerUtil
            TransformerUtil.newTransformer(null, false, 4, null)
                    .transform(source, new StreamResult(out));
            completed = true;
        }catch(SAXException ex){
            throw new IOException(ex);
        } catch(ParserConfigurationException ex){
            throw new IOException(ex);
        } catch(TransformerException ex){
            throw new IOException(ex);
        }finally{
            // the transformer does not close the stream it is handed, so the output file is released here
            try{
                out.close();
            }catch(IOException ex){
                // report a close failure only when it would not hide the failure that got us here
                if(completed)
                    throw ex;
            }
        }
        return this.file;
    }

    /**
     * Crawls the given document, with a {@link DefaultCrawlerListener} created
     * for {@code dir} choosing the output files.
     *
     * @param document the document to crawl; its system id must be set
     * @param dir      directory handed to the {@link DefaultCrawlerListener}
     * @return the file the document was written to, or {@code null} if no file was chosen
     * @throws IOException if crawling fails
     */
    public File crawlInto(InputSource document, File dir) throws IOException{
        return crawl(document, new DefaultCrawlerListener(dir), null);
    }

    /**
     * Crawls the given document into {@code file}, with a
     * {@link DefaultCrawlerListener} created for the parent directory of
     * {@code file} choosing the files of linked documents.
     *
     * @param document the document to crawl; its system id must be set
     * @param file     file to write the document to
     * @throws IOException if crawling fails
     */
    public void crawl(InputSource document, File file) throws IOException{
        crawl(document, new DefaultCrawlerListener(file.getParentFile()), file);
    }

    /**
     * Output stream that buffers in memory until the real destination is known.
     * The output file is only chosen while the document is already being
     * serialized, so whatever was written before that point has to be kept and
     * replayed into the file.
     */
    private static class DelegatingOutputStream extends FilterOutputStream{
        public DelegatingOutputStream(){
            super(new ByteArrayOutputStream2());
        }

        /**
         * Forwards the whole range in one call. The inherited implementation
         * writes a single byte at a time, which is needlessly slow for files.
         */
        @Override
        public void write(byte[] b, int off, int len) throws IOException{
            out.write(b, off, len);
        }

        /**
         * Replays the buffered bytes into {@code out} and sends all further
         * output there. May be called only once per instance, because it expects
         * the current target to still be the in-memory buffer.
         */
        public void setDelegate(OutputStream out) throws IOException{
            ByteSequence seq = ((ByteArrayOutputStream2)this.out).toByteSequence();
            out.write(seq.buffer(), seq.offset(), seq.length());
            this.out = out;
        }
    }

    /**
     * Command line entry point: crawls the document at {@code args[0]} into the
     * directory {@code args[1]}, creating the directory if needed.
     */
    public static void main(String[] args) throws Exception{
        if(args.length!=2){
            System.out.println("usage: crawl-xml."+(OS.get().isWindows()?"bat":"sh")+" <xml-location> <output-dir>");
            System.exit(1);
        }

        File dir = new File(args[1]);
        FileUtil.mkdirs(dir);
        new XMLCrawler().crawlInto(new InputSource(args[0]), dir);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy