// jlibs.xml.sax.crawl.XMLCrawler -- part of the jlibs-xml-crawler artifact (Maven / Gradle / Ivy).
// Artifact description: Crawl XML (wsdl, xsd, xsl) documents.
/**
* Copyright 2015 Santhosh Kumar Tekuri
*
* The JLibs authors license this file to you under the Apache License,
* version 2.0 (the "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
package jlibs.xml.sax.crawl;
import jlibs.core.io.ByteArrayOutputStream2;
import jlibs.core.io.FileNavigator;
import jlibs.core.io.FileUtil;
import jlibs.core.lang.ByteSequence;
import jlibs.core.lang.OS;
import jlibs.core.net.URLUtil;
import jlibs.xml.sax.SAXUtil;
import jlibs.xml.xsl.TransformerUtil;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.XMLFilterImpl;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.sax.SAXSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
/**
 * Copies an XML document to a local file and follows the links it contains.
 * <p>
 * While the document is streamed through a SAX filter, each element is matched
 * against the configured {@link CrawlingRules}. When a matched element declares
 * a location attribute, the referenced document is crawled by a nested crawler
 * and the attribute value is rewritten to a path relative to the file being written.
 * <p>
 * All crawlers spawned from one root crawler share the same map of already
 * crawled URLs, so a document is fetched at most once even if it is referenced
 * several times (or cyclically).
 * <p>
 * Instances keep per-crawl state in fields and are therefore not safe for
 * concurrent use.
 *
 * @author Santhosh Kumar T
 */
public class XMLCrawler{
    private final CrawlingRules rules;

    /** rule element matching the element currently being parsed */
    private Element current;

    /**
     * Number of currently open elements that are NOT covered by the rules.
     * Zero means the parser is still on a path described by {@link #rules}.
     */
    private int depth;

    /** Creates a crawler that uses {@code CrawlingRules.defaultRules()}. */
    public XMLCrawler(){
        rules = CrawlingRules.defaultRules();
        crawled = new HashMap<URL, File>();
    }

    /**
     * Creates a crawler that uses the given rules.
     *
     * @param rules rules describing which elements carry links
     */
    public XMLCrawler(CrawlingRules rules){
        this.rules = rules;
        crawled = new HashMap<URL, File>();
    }

    /**
     * Creates a nested crawler that shares rules, resolver and the map of
     * crawled documents with the given crawler.
     */
    private XMLCrawler(XMLCrawler crawler){
        rules = crawler.rules;
        crawled = crawler.crawled;
        resolver = crawler.resolver;
    }

    private final XMLFilterImpl xmlFilter = new XMLFilterImpl(){
        @Override
        public void startDocument() throws SAXException{
            current = rules.doc;
            super.startDocument();
        }

        @Override
        public void startElement(String namespaceURI, String localName, String qualifiedName, Attributes atts) throws SAXException{
            if(depth==0){
                Element elem = current.findChild(namespaceURI, localName);
                if(elem==null)
                    depth = 1; // left the rule tree; just count nesting from here on
                else{
                    current = elem;
                    try{
                        // the first matched element that declares an extension decides the target file
                        if(file==null && current.extension!=null)
                            setFile(listener.toFile(url, current.extension));
                        if(current.locationAttribute!=null)
                            atts = relinkLocation(atts);
                    }catch(IOException ex){
                        throw new SAXException(ex);
                    }
                }
            }else
                depth++;
            super.startElement(namespaceURI, localName, qualifiedName, atts);
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException{
            super.endElement(uri, localName, qName);
            if(depth==0)
                current = current.parent;
            else
                depth--;
        }
    };

    /**
     * Crawls the document referenced by the location attribute of the current
     * rule element (unless it is already crawled or the listener declines) and
     * returns attributes in which that location points to the crawled file.
     *
     * @param atts attributes of the element being started
     * @return {@code atts} itself if the location attribute is absent, otherwise
     *         a copy with the location attribute rewritten
     */
    private Attributes relinkLocation(Attributes atts) throws IOException, SAXException{
        String locationNS = current.locationAttribute.getNamespaceURI();
        String locationName = current.locationAttribute.getLocalPart();
        String location = atts.getValue(locationNS, locationName);
        if(location==null)
            return atts;

        String linkNamespace = null;
        if(current.namespaceAttribute!=null)
            linkNamespace = atts.getValue(current.namespaceAttribute.getNamespaceURI(), current.namespaceAttribute.getLocalPart());

        location = resolveLink(linkNamespace, location);
        URL targetURL = URLUtil.toURL(location);
        File targetFile = crawled.get(targetURL);
        if(targetFile==null && listener.doCrawl(targetURL))
            targetFile = new XMLCrawler(this).crawl(new InputSource(location), listener, null);

        // A relative path can only be computed once the file of this document is known;
        // otherwise keep the resolved location instead of failing with a NullPointerException.
        String href = targetFile==null || file==null
                ? location
                : FileNavigator.INSTANCE.getRelativePath(file.getParentFile(), targetFile);

        AttributesImpl newAtts = new AttributesImpl(atts);
        newAtts.setValue(atts.getIndex(locationNS, locationName), href);
        return newAtts;
    }

    /*-------------------------------------------------[ Resource Resolver ]---------------------------------------------------*/

    private Resolver resolver;

    /**
     * Sets the resolver consulted before a link is resolved against the URL
     * of the document containing it. May be {@code null}.
     */
    public void setResolver(Resolver resolver){
        this.resolver = resolver;
    }

    /**
     * Resolves a link found in the current document.
     * The custom resolver, if any, is asked first; when it is absent or returns
     * {@code null}, the location is resolved against the current document URL.
     */
    private String resolveLink(String namespace, String location){
        String resolvedLocation = null;
        if(resolver!=null)
            resolvedLocation = resolver.resolve(namespace, url.toExternalForm(), location);
        if(resolvedLocation==null)
            resolvedLocation = URLUtil.toURI(url).resolve(location).toString();
        return resolvedLocation;
    }

    /** Strategy for mapping a link to the location that should actually be crawled. */
    public static interface Resolver{
        /**
         * @param namespace value of the rule's namespace attribute on the linking element, or {@code null}
         * @param base      external form of the URL of the document containing the link
         * @param location  link value as written in the document
         * @return the resolved location, or {@code null} to fall back to resolving
         *         {@code location} against {@code base}
         */
        public String resolve(String namespace, String base, String location);
    }

    /*-------------------------------------------------[ Crawling ]---------------------------------------------------*/

    private CrawlerListener listener;
    private URL url;
    private File file;
    private DelegatingOutputStream out;

    /** documents crawled so far, keyed by their URL; shared with nested crawlers */
    private final Map<URL, File> crawled;

    /**
     * Binds the current document to the given file: output buffered so far is
     * written to it, subsequent output goes to it directly, and the document
     * is registered as crawled.
     */
    private void setFile(File file) throws IOException{
        this.file = file;
        out.setDelegate(new FileOutputStream(file));
        crawled.put(url, file);
    }

    /**
     * Crawls the given document and the documents it links to.
     *
     * @param document document to crawl; its system id must be set
     * @param listener consulted for the target file of a document and for
     *                 whether a linked document should be crawled
     * @param file     file to write the document to, or {@code null} to let
     *                 the listener choose one while parsing
     * @return the file the document was written to, or {@code null} if no file
     *         was given and none was chosen during parsing
     * @throws IllegalArgumentException if {@code document} has no system id
     * @throws IOException if reading, parsing or writing fails
     */
    public File crawl(InputSource document, CrawlerListener listener, File file) throws IOException{
        if(document.getSystemId()==null)
            throw new IllegalArgumentException("InputSource without systemID can't be crawled");
        this.listener = listener;
        url = URLUtil.toURL(document.getSystemId());

        // forget the state of any earlier crawl so that this instance can be reused
        this.file = null;
        depth = 0;
        out = new DelegatingOutputStream();

        boolean completed = false;
        try{
            xmlFilter.setParent(SAXUtil.newSAXParser(true, false, false).getXMLReader());
            SAXSource source = new SAXSource(xmlFilter, document);
            if(file!=null)
                setFile(file);
            TransformerUtil.newTransformer(null, false, 4, null)
                    .transform(source, new StreamResult(out));
            completed = true;
        }catch(SAXException ex){
            throw new IOException(ex);
        }catch(ParserConfigurationException ex){
            throw new IOException(ex);
        }catch(TransformerException ex){
            throw new IOException(ex);
        }finally{
            // the transformer does not close the stream it writes to, so release the file handle here
            try{
                out.close();
            }catch(IOException ex){
                // do not let a close failure hide the exception that aborted the crawl
                if(completed)
                    throw ex;
            }
        }
        return this.file;
    }

    /**
     * Crawls the given document into the given directory using a
     * {@link DefaultCrawlerListener} created for that directory.
     *
     * @return the file the root document was written to, see {@link #crawl(InputSource, CrawlerListener, File)}
     */
    public File crawlInto(InputSource document, File dir) throws IOException{
        return crawl(document, new DefaultCrawlerListener(dir), null);
    }

    /**
     * Crawls the given document into the given file; a {@link DefaultCrawlerListener}
     * created for the parent directory of {@code file} handles linked documents.
     */
    public void crawl(InputSource document, File file) throws IOException{
        crawl(document, new DefaultCrawlerListener(file.getParentFile()), file);
    }

    /**
     * Output stream that buffers everything in memory until a real target is
     * supplied through {@link #setDelegate(OutputStream)}.
     */
    private static class DelegatingOutputStream extends FilterOutputStream{
        public DelegatingOutputStream(){
            super(new ByteArrayOutputStream2());
        }

        /**
         * Forwards the whole range in one call; the inherited implementation
         * would write to the delegate one byte at a time.
         */
        @Override
        public void write(byte[] b, int off, int len) throws IOException{
            out.write(b, off, len);
        }

        /**
         * Writes the bytes buffered so far to {@code out} and sends all further
         * output there. Must be called at most once per instance, while the
         * stream is still buffering in memory.
         */
        public void setDelegate(OutputStream out) throws IOException{
            ByteSequence seq = ((ByteArrayOutputStream2)this.out).toByteSequence();
            out.write(seq.buffer(), seq.offset(), seq.length());
            this.out = out;
        }
    }

    /**
     * Command line entry point.
     *
     * @param args {@code args[0]} is the location of the document to crawl,
     *             {@code args[1]} the directory to crawl into (created if missing)
     */
    public static void main(String[] args) throws Exception{
        if(args.length!=2){
            System.out.println("usage: crawl-xml."+(OS.get().isWindows()?"bat":"sh")+" <xml-location> <target-dir>");
            System.exit(1);
        }
        File dir = new File(args[1]);
        FileUtil.mkdirs(dir);
        new XMLCrawler().crawlInto(new InputSource(args[0]), dir);
    }
}