// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
//
// thredds.catalog.crawl.CatalogCrawler Maven / Gradle / Ivy
//
// Go to download
//
// The NetCDF-Java Library is a Java interface to NetCDF files, as well as to many other types of scientific data formats.
//
// The newest version!
/*
 * Copyright 1998-2009 University Corporation for Atmospheric Research/Unidata
 *
 * Portions of this software were developed by the Unidata Program at the
 * University Corporation for Atmospheric Research.
 *
 * Access and use of this software shall impose the following obligations
 * and understandings on the user. The user is granted the right, without
 * any fee or cost, to use, copy, modify, alter, enhance and distribute
 * this software, and any derivative works thereof, and its supporting
 * documentation for any purpose whatsoever, provided that this entire
 * notice appears in all copies of the software, derivative works and
 * supporting documentation.  Further, UCAR requests that the user credit
 * UCAR/Unidata in any publications that result from the use of this
 * software or in any product that includes this software. The names UCAR
 * and/or Unidata, however, may not be used in any advertising or publicity
 * to endorse or promote any products or commercial entity unless specific
 * written permission is obtained from UCAR/Unidata. The user also
 * understands that UCAR/Unidata is not obligated to provide the user with
 * any support, consulting, training or assistance of any kind with regard
 * to the use, operation and performance of this software nor to provide
 * the user with any updates, revisions, new versions or "bug fixes."
 *
 * THIS SOFTWARE IS PROVIDED BY UCAR/UNIDATA "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL UCAR/UNIDATA BE LIABLE FOR ANY SPECIAL,
 * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
 * FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
 * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
 * WITH THE ACCESS, USE OR PERFORMANCE OF THIS SOFTWARE.
 */

package thredds.catalog.crawl;

import ucar.nc2.util.CancelTask;

import java.io.*;
import java.util.List;
import java.util.ArrayList;
import java.util.Random;

import thredds.catalog.InvCatalogFactory;
import thredds.catalog.InvCatalogImpl;
import thredds.catalog.InvDataset;
import thredds.catalog.InvCatalogRef;


/**
 * This crawls a catalog tree for its datasets, which are sent to a listener.
 * You can get all or some of the datasets.
 * A "direct" dataset is one which hasAccess() is true, meaning it has one or more access elements.
 * 

* Example use: *

 * CatalogCrawler.Listener listener = new CatalogCrawler.Listener() {
 *   public void getDataset(InvDataset dd) {
 *     if (dd.isHarvest())
 *       doHarvest(dd);
 *   }
 * };
 * CatalogCrawler crawler = new CatalogCrawler( CatalogCrawler.USE_ALL_DIRECT, false, listener);
 * 
* * @author John Caron */ public class CatalogCrawler { public enum Type { all, // return all datasets all_direct, // return all direct datasets, ie that have an access URL first_direct, // return first dataset in each collection of direct datasets random_direct, // return one random dataset in each collection of direct datasets random_direct_middle // return one random dataset in each collection of direct datasets } // old static public final int USE_ALL = 0; static public final int USE_ALL_DIRECT = 1; static public final int USE_FIRST_DIRECT = 2; static public final int USE_RANDOM_DIRECT = 3; static public final int USE_RANDOM_DIRECT_NOT_FIRST_OR_LAST = 4; private Filter filter = null; private Type type = Type.all; private Listener listen; private Random random; private int countCatrefs; /** * Constructor. * * @param type CatalogCrawler.USE_XXX constant: When you get to a dataset containing leaf datasets, * do all, only the first, or a randomly chosen one. * @param skipDatasetScan if true, dont recurse into DatasetScan elements. This is * useful if you are looking only for collection level metadata. * @param listen this is called for each dataset. */ public CatalogCrawler(int type, boolean skipDatasetScan, Listener listen) { this (Type.values()[type], new FilterDatasetScan(skipDatasetScan), listen); } /** * Constructor. * * @param type CatalogCrawler.Type * @param filter dont process this dataset or its descendants. may be null * @param listen each dataset gets passed to the listener */ public CatalogCrawler(Type type, Filter filter, Listener listen) { this.type = type; this.filter = filter; this.listen = listen; if (type == Type.random_direct || type == Type.random_direct_middle ) this.random = new Random(System.currentTimeMillis()); } /** * Open a catalog and crawl (depth first) all the datasets in it. * Close catalogs and release their resources as you. 
* * @param catUrl url of catalog to open * @param task user can cancel the task (may be null) * @param out send status messages to here (may be null) * @param context caller can pass this object in (used for thread safety) * @return number of catalog references opened and crawled */ public int crawl(String catUrl, CancelTask task, PrintStream out, Object context) { InvCatalogFactory catFactory = InvCatalogFactory.getDefaultFactory(true); InvCatalogImpl cat = catFactory.readXML(catUrl); StringBuilder buff = new StringBuilder(); boolean isValid = cat.check(buff, false); if (out != null) { out.println("catalog <" + cat.getName() + "> " + (isValid ? "is" : "is not") + " valid"); out.println(" validation output=\n" + buff); } if (isValid) return crawl(cat, task, out, context); return 0; } /** * Crawl a catalog thats already been opened. * When you get to a dataset containing leaf datasets, do all, only the first, or a randomly chosen one. * * @param cat the catalog * @param task user can cancel the task (may be null) * @param out send status messages to here (may be null) * @param context caller can pass this object in (used for thread safety) * @return number of catalog references opened and crawled */ public int crawl(InvCatalogImpl cat, CancelTask task, PrintStream out, Object context) { if (out != null) out.println("***CATALOG " + cat.getCreateFrom()); countCatrefs = 0; for (InvDataset ds : cat.getDatasets()) { if (type == Type.all) crawlDataset(ds, task, out, context, true); else crawlDirectDatasets(ds, task, out, context, true); if ((task != null) && task.isCancel()) break; } return 1 + countCatrefs; } /** * Crawl this dataset recursively, return all datasets * * @param ds the dataset * @param task user can cancel the task (may be null) * @param out send status messages to here (may be null) * @param context caller can pass this object in (used for thread safety) */ public void crawlDataset(InvDataset ds, CancelTask task, PrintStream out, Object context, boolean 
release) { boolean isCatRef = (ds instanceof InvCatalogRef); if (filter != null && filter.skipAll(ds)) { if (isCatRef && release) ((InvCatalogRef) ds).release(); return; } boolean isDataScan = ds.findProperty("DatasetScan") != null; if (isCatRef) { InvCatalogRef catref = (InvCatalogRef) ds; if (out != null) out.println(" **CATREF " + catref.getURI() + " (" + ds.getName() + ") "); countCatrefs++; if (!listen.getCatalogRef( catref, context)) { if (release) catref.release(); return; } } if (!isCatRef || isDataScan) listen.getDataset(ds, context); // recurse - depth first List dlist = ds.getDatasets(); if (isCatRef) { InvCatalogRef catref = (InvCatalogRef) ds; if (!isDataScan) { listen.getDataset(catref.getProxyDataset(), context); // wait till a catref is read, so all metadata is there ! } } for (InvDataset dds : dlist) { crawlDataset(dds, task, out, context, release); if ((task != null) && task.isCancel()) break; } if (isCatRef && release) { InvCatalogRef catref = (InvCatalogRef) ds; catref.release(); } } /** * Crawl this dataset recursively. 
Only send back direct datasets * * @param ds the dataset * @param task user can cancel the task (may be null) * @param out send status messages to here (may be null) * @param context caller can pass this object in (used for thread safety) */ public void crawlDirectDatasets(InvDataset ds, CancelTask task, PrintStream out, Object context, boolean release) { boolean isCatRef = (ds instanceof InvCatalogRef); if (filter != null && filter.skipAll(ds)) { if (isCatRef && release) ((InvCatalogRef) ds).release(); return; } if (isCatRef) { InvCatalogRef catref = (InvCatalogRef) ds; if (out != null) out.println(" **CATREF " + catref.getURI() + " (" + ds.getName() + ") "); countCatrefs++; if (!listen.getCatalogRef( catref, context)) { if (release) catref.release(); return; } } // get datasets with data access ("leaves") List dlist = ds.getDatasets(); List leaves = new ArrayList(); for (InvDataset dds : dlist) { if (dds.hasAccess()) leaves.add(dds); } if (leaves.size() > 0) { if (type == Type.first_direct) { InvDataset dds = leaves.get(0); listen.getDataset(dds, context); } else if (type == Type.random_direct) { listen.getDataset(chooseRandom(leaves), context); } else if (type == Type.random_direct_middle) { listen.getDataset(chooseRandomNotFirstOrLast(leaves), context); } else { // do all of them for (InvDataset dds : leaves) { listen.getDataset(dds, context); if ((task != null) && task.isCancel()) break; } } } // recurse for (InvDataset dds : dlist) { if (dds.hasNestedDatasets()) crawlDirectDatasets(dds, task, out, context, release); if ((task != null) && task.isCancel()) break; } /* if (out != null) { int took = (int) (System.currentTimeMillis() - start); out.println(" ** " + ds.getName() + " took " + took + " msecs\n"); } */ if (ds instanceof InvCatalogRef && release) { InvCatalogRef catref = (InvCatalogRef) ds; catref.release(); } } private InvDataset chooseRandom(List datasets) { int index = random.nextInt(datasets.size()); return (InvDataset) datasets.get(index); } 
private InvDataset chooseRandomNotFirstOrLast(List datasets) { int index = random.nextInt(datasets.size()); if ( index == 0 && datasets.size() > 1) index++; else if ( index == datasets.size() - 1 && datasets.size() > 1) index--; return (InvDataset) datasets.get(index); } static public interface Listener { /** * Gets called for each dataset found. * @param dd the dataset * @param context caller can pass this object in (used for thread safety) */ public void getDataset(InvDataset dd, Object context); /** * Gets called for each catalogRef found * @param dd the dataset * @return true to process, false to skip * @param context caller can pass this object in (used for thread safety) */ public boolean getCatalogRef(InvCatalogRef dd, Object context); } static public interface Filter { public boolean skipAll(InvDataset ds); } private static class FilterDatasetScan implements Filter { boolean skipDatasetScan; private FilterDatasetScan(boolean skipDatasetScan) { this.skipDatasetScan = skipDatasetScan; } @Override public boolean skipAll(InvDataset ds) { return skipDatasetScan && (ds instanceof InvCatalogRef) && (ds.findProperty("DatasetScan") != null); } } }




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy