/*
* Copyright 1998-2009 University Corporation for Atmospheric Research/Unidata
*
* Portions of this software were developed by the Unidata Program at the
* University Corporation for Atmospheric Research.
*
* Access and use of this software shall impose the following obligations
* and understandings on the user. The user is granted the right, without
* any fee or cost, to use, copy, modify, alter, enhance and distribute
* this software, and any derivative works thereof, and its supporting
* documentation for any purpose whatsoever, provided that this entire
* notice appears in all copies of the software, derivative works and
* supporting documentation. Further, UCAR requests that the user credit
* UCAR/Unidata in any publications that result from the use of this
* software or in any product that includes this software. The names UCAR
* and/or Unidata, however, may not be used in any advertising or publicity
* to endorse or promote any products or commercial entity unless specific
* written permission is obtained from UCAR/Unidata. The user also
* understands that UCAR/Unidata is not obligated to provide the user with
* any support, consulting, training or assistance of any kind with regard
* to the use, operation and performance of this software nor to provide
* the user with any updates, revisions, new versions or "bug fixes."
*
* THIS SOFTWARE IS PROVIDED BY UCAR/UNIDATA "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL UCAR/UNIDATA BE LIABLE FOR ANY SPECIAL,
* INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
* FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
* NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
* WITH THE ACCESS, USE OR PERFORMANCE OF THIS SOFTWARE.
*/
package thredds.catalog.crawl;
import ucar.nc2.util.CancelTask;
import java.io.*;
import java.util.List;
import java.util.ArrayList;
import java.util.Random;
import thredds.catalog.InvCatalogFactory;
import thredds.catalog.InvCatalogImpl;
import thredds.catalog.InvDataset;
import thredds.catalog.InvCatalogRef;
/**
* This crawls a catalog tree for its datasets, which are sent to a listener.
* You can get all or some of the datasets.
* A "direct" dataset is one for which hasAccess() is true, meaning it has one or more access elements.
*
* Example use:
*
* <pre>
* CatalogCrawler.Listener listener = new CatalogCrawler.Listener() {
*   public void getDataset(InvDataset dd, Object context) {
*     if (dd.isHarvest())
*       doHarvest(dd);
*   }
*   public boolean getCatalogRef(InvCatalogRef dd, Object context) {
*     return true;
*   }
* };
* CatalogCrawler crawler = new CatalogCrawler( CatalogCrawler.USE_ALL_DIRECT, false, listener);
* </pre>
*
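*
* A crawl might then be started like this (a sketch; the catalog URL is only a placeholder):
* <pre>
* int count = crawler.crawl("http://example.org/thredds/catalog.xml", null, System.out, null);
* </pre>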
*
* @author John Caron
*/
public class CatalogCrawler {
/**
* return all datasets
*/
static public final int USE_ALL = 0;
/**
* return all direct datasets, i.e. those that have an access URL
*/
static public final int USE_ALL_DIRECT = 1;
/**
* return the first dataset in each collection of direct datasets.
*/
static public final int USE_FIRST_DIRECT = 2;
/**
* return one random dataset in each collection of direct datasets.
*/
static public final int USE_RANDOM_DIRECT = 3;
/**
* return one random dataset, excluding the first and last, in each collection of direct datasets.
*/
static public final int USE_RANDOM_DIRECT_NOT_FIRST_OR_LAST = 4;
private boolean skipDatasetScan = false;
private int type = USE_ALL;
private Listener listen;
private Random random;
private int countCatrefs;
/**
* Constructor.
*
* @param type CatalogCrawler.USE_XXX constant: When you get to a dataset containing leaf datasets,
* do all, only the first, or a randomly chosen one.
* @param skipDatasetScan if true, don't recurse into DatasetScan elements. This is
* useful if you are looking only for collection-level metadata.
* @param listen this is called for each dataset.
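*
* For example, to gather only collection-level metadata while skipping DatasetScan children, one
* might construct (a sketch, reusing the listener from the class example above):
* <pre>
* CatalogCrawler crawler = new CatalogCrawler( CatalogCrawler.USE_ALL, true, listener);
* </pre>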
*/
public CatalogCrawler(int type, boolean skipDatasetScan, Listener listen) {
this.type = type;
this.skipDatasetScan = skipDatasetScan;
this.listen = listen;
if (type == USE_RANDOM_DIRECT || type == USE_RANDOM_DIRECT_NOT_FIRST_OR_LAST )
this.random = new Random(System.currentTimeMillis());
}
/**
* Open a catalog and crawl (depth first) all the datasets in it.
* Close catalogs and release their resources as you go.
*
* @param catUrl url of catalog to open
* @param task user can cancel the task (may be null)
* @param out send status messages to here (may be null)
* @param context caller can pass this object in (used for thread safety)
* @return number of catalog references opened and crawled
*/
public int crawl(String catUrl, CancelTask task, PrintStream out, Object context) {
InvCatalogFactory catFactory = InvCatalogFactory.getDefaultFactory(true);
InvCatalogImpl cat = catFactory.readXML(catUrl);
StringBuilder buff = new StringBuilder();
boolean isValid = cat.check(buff, false);
if (out != null) {
out.println("catalog <" + cat.getName() + "> " + (isValid ? "is" : "is not") + " valid");
out.println(" validation output=\n" + buff);
}
if (isValid)
return crawl(cat, task, out, context);
return 0;
}
/**
* Crawl a catalog that's already been opened.
* When you get to a dataset containing leaf datasets, do all, only the first, or a randomly chosen one.
*
* @param cat the catalog
* @param task user can cancel the task (may be null)
* @param out send status messages to here (may be null)
* @param context caller can pass this object in (used for thread safety)
* @return number of catalog references opened and crawled
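*
* A sketch of crawling a catalog the caller has read itself (assuming a CatalogCrawler named
* crawler; the factory calls mirror those in crawl(String, ...), and the URL is only a placeholder):
* <pre>
* InvCatalogFactory factory = InvCatalogFactory.getDefaultFactory(true);
* InvCatalogImpl cat = factory.readXML("http://example.org/thredds/catalog.xml");
* int count = crawler.crawl(cat, null, System.out, null);
* </pre>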
*/
public int crawl(InvCatalogImpl cat, CancelTask task, PrintStream out, Object context) {
if (out != null)
out.println("***CATALOG " + cat.getCreateFrom());
countCatrefs = 0;
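// USE_ALL visits every nested dataset; the other modes report only direct (leaf) datasets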
for (InvDataset ds : cat.getDatasets()) {
if (type == USE_ALL)
crawlDataset(ds, task, out, context, true);
else
crawlDirectDatasets(ds, task, out, context, true);
if ((task != null) && task.isCancel()) break;
}
return 1 + countCatrefs;
}
/**
* Crawl this dataset recursively, sending all datasets to the listener.
*
* @param ds the dataset
* @param task user can cancel the task (may be null)
* @param out send status messages to here (may be null)
* @param context caller can pass this object in (used for thread safety)
* @param release if true, release each catalogRef's resources after it has been crawled
*/
public void crawlDataset(InvDataset ds, CancelTask task, PrintStream out, Object context, boolean release) {
boolean isCatRef = (ds instanceof InvCatalogRef);
boolean isDataScan = ds.findProperty("DatasetScan") != null;
boolean skipScanChildren = skipDatasetScan && (ds instanceof InvCatalogRef) && isDataScan;
if (isCatRef) {
InvCatalogRef catref = (InvCatalogRef) ds;
if (out != null)
out.println(" **CATREF " + catref.getURI() + " (" + ds.getName() + ") ");
countCatrefs++;
if (!listen.getCatalogRef( catref, context)) {
if (release) catref.release();
return;
}
}
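// send this dataset to the listener: always for plain datasets, but for catalogRefs only when they represent a DatasetScan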
if (!isCatRef || skipScanChildren || isDataScan)
listen.getDataset(ds, context);
// recurse - depth first
if (!skipScanChildren) {
List<InvDataset> dlist = ds.getDatasets();
if (isCatRef) {
InvCatalogRef catref = (InvCatalogRef) ds;
if (!isDataScan) {
listen.getDataset(catref.getProxyDataset(), context); // wait till a catref is read, so all metadata is there !
}
}
for (InvDataset dds : dlist) {
crawlDataset(dds, task, out, context, release);
if ((task != null) && task.isCancel())
break;
}
}
if (isCatRef && release) {
InvCatalogRef catref = (InvCatalogRef) ds;
catref.release();
}
}
/**
* Crawl this dataset recursively, sending only direct datasets (those with data access) to the listener.
*
* @param ds the dataset
* @param task user can cancel the task (may be null)
* @param out send status messages to here (may be null)
* @param context caller can pass this object in (used for thread safety)
* @param release if true, release each catalogRef's resources after it has been crawled
*/
public void crawlDirectDatasets(InvDataset ds, CancelTask task, PrintStream out, Object context, boolean release) {
boolean isCatRef = (ds instanceof InvCatalogRef);
boolean skipScanChildren = skipDatasetScan && (ds instanceof InvCatalogRef) && (ds.findProperty("DatasetScan") != null);
if (isCatRef) {
InvCatalogRef catref = (InvCatalogRef) ds;
if (out != null)
out.println(" **CATREF " + catref.getURI() + " (" + ds.getName() + ") ");
countCatrefs++;
if (!listen.getCatalogRef( catref, context)) {
if (release) catref.release();
return;
}
}
// get datasets with data access ("leaves")
List<InvDataset> dlist = ds.getDatasets();
List<InvDataset> leaves = new ArrayList<InvDataset>();
for (InvDataset dds : dlist) {
if (dds.hasAccess())
leaves.add(dds);
}
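// hand the leaves to the listener: the first one, one chosen at random, or all of them, depending on the crawl type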
if (leaves.size() > 0) {
if (type == USE_FIRST_DIRECT) {
InvDataset dds = leaves.get(0);
listen.getDataset(dds, context);
} else if (type == USE_RANDOM_DIRECT) {
listen.getDataset(chooseRandom(leaves), context);
} else if (type == USE_RANDOM_DIRECT_NOT_FIRST_OR_LAST) {
listen.getDataset(chooseRandomNotFirstOrLast(leaves), context);
} else { // do all of them
for (InvDataset dds : leaves) {
listen.getDataset(dds, context);
if ((task != null) && task.isCancel()) break;
}
}
}
// recurse
if (!skipScanChildren) {
for (InvDataset dds : dlist) {
if (dds.hasNestedDatasets())
crawlDirectDatasets(dds, task, out, context, release);
if ((task != null) && task.isCancel())
break;
}
}
/* if (out != null) {
int took = (int) (System.currentTimeMillis() - start);
out.println(" ** " + ds.getName() + " took " + took + " msecs\n");
} */
if (ds instanceof InvCatalogRef && release) {
InvCatalogRef catref = (InvCatalogRef) ds;
catref.release();
}
}
private InvDataset chooseRandom(List<InvDataset> datasets) {
int index = random.nextInt(datasets.size());
return datasets.get(index);
}
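// pick a random leaf; if it lands on the first or last element and there is more than one, shift it inward by one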
private InvDataset chooseRandomNotFirstOrLast(List<InvDataset> datasets) {
int index = random.nextInt(datasets.size());
if (index == 0 && datasets.size() > 1)
index++;
else if (index == datasets.size() - 1 && datasets.size() > 1)
index--;
return datasets.get(index);
}
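/**
* Receives the datasets and catalogRefs found during a crawl.
* The class javadoc above shows a minimal anonymous implementation.
*/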
static public interface Listener {
/**
* Gets called for each dataset found.
* @param dd the dataset
* @param context caller can pass this object in (used for thread safety)
*/
public void getDataset(InvDataset dd, Object context);
/**
* Gets called for each catalogRef found
* @param dd the catalogRef
* @param context caller can pass this object in (used for thread safety)
* @return true to process, false to skip
*/
public boolean getCatalogRef(InvCatalogRef dd, Object context);
}
}