All Downloads are FREE. Search and download functionalities are using the official Maven repository.

thredds.client.catalog.tools.CatalogCrawler Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (c) 1998-2018 University Corporation for Atmospheric Research/Unidata
 * See LICENSE for license information.
 */
package thredds.client.catalog.tools;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterDescription;
import com.beust.jcommander.ParameterException;
import com.google.common.base.MoreObjects;
import thredds.client.catalog.*;
import thredds.client.catalog.builder.CatalogBuilder;
import ucar.ma2.Array;
import ucar.ma2.InvalidRangeException;
import ucar.nc2.Variable;
import ucar.nc2.dataset.NetcdfDataset;
import ucar.nc2.ft2.coverage.*;
import ucar.nc2.util.CancelTask;
import ucar.nc2.util.Indent;
import ucar.nc2.util.Misc;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;
import ucar.nc2.write.Ncdump;

/**
 * Crawl client catalogs
 *
 * @author caron
 * @since 1/11/2015
 */
public class CatalogCrawler {

  public interface Listener {
    /**
     * Gets called for each dataset found.
     *
     * @param dd the dataset
     * @param context object passed into crawl() by the caller (eg used for thread safety); may be null
     */
    void getDataset(Dataset dd, Object context);
  }

  public interface Filter {
    // true means skip this dataset and any children; called for every dataset encountered
    boolean skipAll(Dataset ds);

    // true means skip this catref and any children; level is the catref nesting depth (top = 1)
    boolean skipCatref(CatalogRef ds, int level);
  }

  public enum Type {
    all, // return all datasets
    all_direct, // return all direct datasets, ie that have an access URL
    first_direct, // return first dataset in each collection of direct datasets
    random_direct, // return one random dataset in each collection of direct datasets
    random_direct_middle, // return one random dataset, avoiding the first and last, in each collection of direct datasets
    random_direct_max // return max random datasets in entire catalog
  }

  private final Type type; // crawl strategy; never null (constructor defaults to Type.all)
  private final int max; // only used by random_direct_max; <= 0 means unlimited
  private final Filter filter; // may be null
  private final Listener listen; // NOTE(review): documented as nullable, but crawlDataset() calls it unchecked — confirm callers always supply one
  private final CancelTask task; // may be null
  private final PrintWriter out; // status messages; may be null
  private final Object context; // opaque object handed back to the Listener on each callback

  private Random random; // initialized only for the random_* crawl types
  private int countCatrefs; // catalog references followed during the current crawl

  private int numReadFailures; // catalog references that failed to read

  /**
   * Constructor.
   *
   * @param type CatalogCrawler.Type
   * @param max if > 0, only process max datasets, then exit (random_direct_max only)
   * @param filter dont process this dataset or its descendants. may be null
   * @param listen each dataset gets passed to the listener. if null, send the dataset name to standard out
   * @param task user can cancel the task (may be null)
   * @param out send status messages to here (may be null)
   * @param context caller can pass this object to Listener (eg used for thread safety)
   */
  public CatalogCrawler(Type type, int max, Filter filter, Listener listen, CancelTask task, PrintWriter out,
      Object context) {
    this.type = type == null ? Type.all : type;
    this.max = max;
    this.filter = filter;
    this.listen = listen;
    this.task = task;
    this.out = out;
    this.context = context;

    // checks the parameter (not this.type); a null type can never match, so random stays null for Type.all
    if (type == Type.random_direct || type == Type.random_direct_middle || type == Type.random_direct_max)
      this.random = new Random(System.currentTimeMillis());
  }

  /**
   * Open a catalog and crawl (depth first) all the datasets in it.
   * Any dataset that passes the filter is sent to the Listener.
   * Catalogs are closed and their resources released as the crawl proceeds.
   *
   * @param catUrl url of catalog to open (xml, not html)
   * @return number of catalogs (this + catrefs) opened and crawled; 0 if the top catalog is invalid
   */
  public int crawl(String catUrl) throws IOException {
    CatalogBuilder builder = new CatalogBuilder();
    Catalog topCatalog = builder.buildFromLocation(catUrl, null);
    boolean ok = !builder.hasFatalError();

    if (out != null) {
      out.println("Catalog <" + catUrl + "> " + (ok ? "read ok" : "is not valid"));
      if (!ok)
        out.println(" validation output=\n" + builder.getErrorMessage());
    }

    this.countCatrefs = 0;
    if (!ok) {
      System.err.printf("%s%n", builder.getErrorMessage());
      return 0;
    }
    return crawl(topCatalog);
  }

  /**
   * Crawl a catalog thats already been opened.
   *
   * @param cat the catalog
   * @return number of catalogs crawled: this catalog plus every catalog reference followed
   */
  public int crawl(Catalog cat) throws IOException {
    this.countCatrefs = 0;
    crawl(cat, 0, new Indent(2));
    return 1 + countCatrefs;
  }

  // Crawl every top-level dataset of the given catalog, honoring cancellation between datasets.
  private int crawl(Catalog cat, int level, Indent indent) throws IOException {
    for (Dataset topDataset : cat.getDatasetsLocal()) {
      crawlDataset(topDataset, level, indent);
      boolean cancelled = (task != null) && task.isCancel();
      if (cancelled)
        break;
    }
    return 1 + countCatrefs;
  }

  /**
   * Crawl this dataset recursively.
   *
   * <p>Fixes: restored the generic type on the leaves list (the raw {@code List} did not compile
   * cleanly), and removed a second redundant {@code filter.skipAll(ds)} call that both repeated
   * work and double-counted filter invocations for non-catref datasets.
   *
   * @param ds the dataset
   * @param level current catref nesting depth (0 = top catalog)
   * @param indent print indentation
   */
  private void crawlDataset(Dataset ds, int level, Indent indent) throws IOException {
    if (filter != null && filter.skipAll(ds))
      return;

    if (ds instanceof CatalogRef) {
      CatalogRef catref = (CatalogRef) ds;

      if (filter != null && filter.skipCatref(catref, level + 1))
        return;

      if (out != null)
        out.printf("%n%sCatalogRef %s (%s)%n", indent, catref.getURI(), ds.getName());
      countCatrefs++;

      // read the referenced catalog; on failure record it and prune this subtree
      Catalog cat = readCatref(catref, out, indent);
      if (cat == null) {
        numReadFailures++;
        return;
      }

      crawl(cat, level + 1, indent.incr());
      indent.decr();
      return;
    }

    // top-level dataset: report it directly (for non-"all" types only if it has an access URL)
    if (level == 0) {
      if (type == Type.all || ds.hasAccess())
        listen.getDataset(ds, context);
    }

    if (type == Type.all) {
      for (Dataset dds : ds.getDatasetsLocal()) {
        if (!(dds instanceof CatalogRef))
          listen.getDataset(dds, context);
        crawlDataset(dds, level, indent.incr());
        indent.decr();
        if ((task != null) && task.isCancel())
          break;
      }

    } else {

      // get datasets with data access ("leaves")
      List<Dataset> leaves = new ArrayList<>();
      for (Dataset dds : ds.getDatasetsLocal()) {
        if (dds.hasAccess())
          leaves.add(dds);
      }

      if (!leaves.isEmpty()) {
        if (type == Type.first_direct) {
          listen.getDataset(leaves.get(0), context);

        } else if (type == Type.random_direct) {
          listen.getDataset(chooseRandom(leaves), context);

        } else if (type == Type.random_direct_middle) {
          listen.getDataset(chooseRandomNotFirstOrLast(leaves), context);

        } else { // all_direct / random_direct_max: do all of them
          for (Dataset dds : leaves) {
            listen.getDataset(dds, context);
            if ((task != null) && task.isCancel())
              break;
          }
        }
      }

      // recurse into containers and catalog references
      for (Dataset dds : ds.getDatasetsLocal()) {
        if (dds.hasNestedDatasets() || (dds instanceof CatalogRef)) {
          crawlDataset(dds, level, indent.incr());
          indent.decr();
          if ((task != null) && task.isCancel())
            break;
        }
      }
    }
  }

  // Resolve and build the catalog a CatalogRef points to.
  // Returns null on failure, after logging the builder's error message to out (if non-null).
  private Catalog readCatref(CatalogRef catref, PrintWriter out, Indent indent) {
    CatalogBuilder builder = new CatalogBuilder();
    Catalog result = builder.buildFromCatref(catref);
    boolean failed = builder.hasFatalError() || (result == null);
    if (!failed)
      return result;
    if (out != null)
      out.printf("%sError reading catref %s err=%s%n", indent, catref.getName(), builder.getErrorMessage());
    return null;
  }

  /**
   * Pick a uniformly random dataset.
   * Fixed: the parameter was a raw {@code List} requiring an unchecked cast.
   *
   * @param datasets candidates; must be non-empty
   * @return one element chosen with this crawler's {@link #random}
   */
  private Dataset chooseRandom(List<Dataset> datasets) {
    int index = random.nextInt(datasets.size());
    return datasets.get(index);
  }

  /**
   * Pick a random dataset, nudging the choice off the first/last element when possible
   * (lists of size 1 or 2 cannot fully avoid the ends).
   * Fixed: the parameter was a raw {@code List} requiring an unchecked cast.
   *
   * @param datasets candidates; must be non-empty
   * @return one element chosen with this crawler's {@link #random}
   */
  private Dataset chooseRandomNotFirstOrLast(List<Dataset> datasets) {
    int index = random.nextInt(datasets.size());
    if (index == 0 && datasets.size() > 1)
      index++;
    else if (index == datasets.size() - 1 && datasets.size() > 1)
      index--;
    return datasets.get(index);
  }

  /** Debug string showing the crawl configuration and the running catref count. */
  @Override
  public String toString() {
    return MoreObjects.toStringHelper(this).add("filter", filter).add("max", max).add("type", type)
        .add("listen", listen).add("random", random).add("countCatrefs", countCatrefs).toString();
  }

  /** @return number of catalog references that could not be read during the last crawl */
  public int getNumReadFailures() {
    return numReadFailures;
  }


  /*
   * public int crawlAllInDirectory(Path directory, boolean recurse, CancelTask task, PrintWriter out, Object context)
   * throws IOException {
   * int count = 0;
   * try (DirectoryStream ds = Files.newDirectoryStream(directory)) {
   * for (Path p : ds) {
   * if (Files.isDirectory(p)) {
   * if (recurse)
   * crawlAllInDirectory(p, recurse, task, out, context);
   * } else {
   * count += crawl("file:" + p.toString(), null, null, null);
   * }
   * if ((task != null) && task.isCancel()) break;
   * }
   * }
   * return count;
   * }
   */

  //////////////////////////////////////////////////////////////////////////////

  private static class FilterDatasetScan implements Filter {
    PrintWriter out;
    boolean skipDatasetScan;
    int catrefLevel;

    int count;
    int countSkip;
    int countCatrefs;

    private FilterDatasetScan(PrintWriter pw, boolean skipDatasetScan, int catrefLevel) {
      this.out = pw;
      this.skipDatasetScan = skipDatasetScan;
      this.catrefLevel = catrefLevel;
    }

    @Override
    public boolean skipAll(Dataset ds) {
      boolean skip = skipDatasetScan && (ds instanceof CatalogRef) && (ds.findProperty("DatasetScan") != null);
      if (skip) {
        countSkip++;
        out.printf("  skip DatasetScan %s%n", ds.getName());
      }
      count++;
      return skip;
    }


    @Override
    public boolean skipCatref(CatalogRef dd, int level) {
      countCatrefs++;
      if (catrefLevel <= 0)
        return false;
      return level > catrefLevel;
    }
  }

  /** JCommander-annotated command-line options for {@link #main}. */
  private static class CommandLine {
    @Parameter(names = {"-cat", "--catalog"}, description = "Top catalog URL", required = true)
    public String topCatalog;

    @Parameter(names = {"-t", "--type"}, description = "type of crawl. Allowed values="
        + "[all, all_direct, first_direct, random_direct, random_direct_middle, random_direct_max]")
    public Type type = Type.all;

    @Parameter(names = {"-sh", "--showNames"}, description = "show dataset names ")
    public boolean showNames;

    @Parameter(names = {"-o", "--openDataset"}, description = "try to open the dataset ")
    public boolean openDataset;

    @Parameter(names = {"-r", "--readRandom"}, description = "read some random data")
    public boolean readRandom;

    @Parameter(names = {"-skipScans", "--skipScans"}, description = "skip DatasetScans ")
    public boolean skipDatasetScan = true;

    @Parameter(names = {"-catrefLevel", "--catrefLevel"}, description = "skip Catalog References > nested level")
    public int catrefLevel;

    @Parameter(names = {"-h", "--help"}, description = "Display this help and exit", help = true)
    public boolean help;

    /**
     * Orders parameters in the usage message.
     * Fixed: was declared with raw {@code Comparator}/{@code List} types, which does not compile
     * ({@code @Override compare(ParameterDescription, ParameterDescription)} would not override
     * the raw interface's {@code compare(Object, Object)}).
     */
    private static class ParameterDescriptionComparator implements Comparator<ParameterDescription> {
      // Display parameters in this order in the usage information.
      private final List<String> orderedParamNames = Arrays.asList("--catalog", "--type", "--openDataset",
          "--skipScans", "--readRandom", "--catrefLevel", "--showNames", "--help");

      @Override
      public int compare(ParameterDescription p0, ParameterDescription p1) {
        int index0 = orderedParamNames.indexOf(p0.getLongestName());
        int index1 = orderedParamNames.indexOf(p1.getLongestName());
        assert index0 >= 0 : "Unexpected parameter name: " + p0.getLongestName();
        assert index1 >= 0 : "Unexpected parameter name: " + p1.getLongestName();

        return Integer.compare(index0, index1);
      }
    }

    //////////////////////////////////////////////////////////////////////////////////////////////////////////////

    private final JCommander jc;

    CommandLine(String progName, String[] args) throws ParameterException {
      this.jc = new JCommander(this, args); // Parses args and uses them to initialize *this*.
      jc.setProgramName(progName); // Displayed in the usage information.

      // Set the ordering of parameters in the usage information.
      jc.setParameterDescriptionComparator(new ParameterDescriptionComparator());
    }

    void printUsage() {
      jc.usage();
    }

    @Override
    public String toString() {
      return "topCatalog='" + topCatalog + '\'' + "\n   type=" + type + ", showNames=" + showNames
          + ", skipDatasetScan=" + skipDatasetScan + ", catrefLevel=" + catrefLevel + ", openDataset=" + openDataset;
    }
  }

  /** Mutable tally of crawl results, shared with the Listener lambda in main(). */
  private static class Counter {
    int datasets; // datasets passed to the listener
    int openFc; // feature datasets opened ok
    int failFc; // feature datasets with a fatal open error
    int failException; // opens that threw an exception
    int openOdap; // OPENDAP accesses read ok
    int failOdap; // OPENDAP accesses that failed
    int openCdmr; // CdmRemote accesses read ok
    int failCdmr; // CdmRemote accesses that failed
  }

  /**
   * Command-line driver: crawl a catalog, optionally opening each dataset and reading random data,
   * then print summary counts. Run with --help for usage.
   *
   * <p>Fix: the InvalidRangeException handler printed its stack trace to stderr while the
   * IOException handler used the shared PrintWriter; both now go to pw.
   */
  public static void main(String[] args) throws Exception {
    String progName = CatalogCrawler.class.getName();
    long start = System.currentTimeMillis();
    Counter c = new Counter();

    try {
      CommandLine cmdLine = new CommandLine(progName, args);
      if (cmdLine.help) {
        cmdLine.printUsage();
        return;
      }
      System.out.printf("%s %n   %s%n", progName, cmdLine);

      PrintWriter pw = new PrintWriter(System.out, true);
      FilterDatasetScan filter = new FilterDatasetScan(pw, cmdLine.skipDatasetScan, cmdLine.catrefLevel);
      CancelTask task = null;

      CatalogCrawler crawler = new CatalogCrawler(cmdLine.type, -1, filter, (dd, context) -> {
        c.datasets++;

        if (cmdLine.showNames) {
          Service s = dd.getServiceDefault();
          String sname = (s == null) ? "none" : s.getName();
          pw.format("  Dataset '%s' service=%s%n", dd.getName(), sname);
        }

        if (cmdLine.openDataset && dd.hasAccess()) {
          Service s = dd.getServiceDefault();
          if (s == null || s.getServiceTypeName().equalsIgnoreCase(ServiceType.HTTPServer.name())) // skip files
            return;

          DataFactory fac = new DataFactory();
          try (DataFactory.Result result = fac.openFeatureDataset(dd, task)) {
            if (result.fatalError) {
              pw.format("  Dataset fatalError=%s%n", result.errLog);
              c.failFc++;
            } else {
              pw.format("  Dataset '%s' opened as type=%s%n", dd.getName(), result.featureDataset.getFeatureType());
              c.openFc++;
              if (cmdLine.readRandom && result.featureDataset instanceof FeatureDatasetCoverage) {
                readRandom((FeatureDatasetCoverage) result.featureDataset, pw);
              }
            }
          } catch (IOException e) {
            e.printStackTrace(pw);
            c.failException++;

          } catch (InvalidRangeException e) {
            e.printStackTrace(pw); // was printStackTrace() to stderr; keep all output on pw
            c.failException++;
          }

          // opendap
          Access opendap = dd.getAccess(ServiceType.OPENDAP);
          if (opendap != null) {
            if (readAccess(opendap, fac, pw))
              c.openOdap++;
            else
              c.failOdap++;
          }

          // cdmremote
          Access cdmremote = dd.getAccess(ServiceType.CdmRemote);
          if (cdmremote != null) {
            if (readAccess(cdmremote, fac, pw))
              c.openCdmr++;
            else
              c.failCdmr++;
          }

        }
      }, task, pw, null);

      int count = crawler.crawl(cmdLine.topCatalog);
      pw.flush();

      long took = System.currentTimeMillis() - start;
      System.out.printf("%nthat took %d msecs%n", took);
      System.out.printf("count catalogs = %d%n", count);
      System.out.printf("count catrefs  = %d%n", filter.countCatrefs);
      System.out.printf("count skipped  = %d%n", filter.countSkip);
      System.out.printf("count filterCalls = %d%n%n", filter.count);

      System.out.printf("             count datasets = %d%n", c.datasets);
      System.out.printf("count open featureCollection = %d%n", c.openFc);
      System.out.printf("count fail featureCollection = %d%n", c.failFc);
      System.out.printf("         count failException = %d%n", c.failException);
      System.out.printf("          count open Opendap = %d%n", c.openOdap);
      System.out.printf("          count fail Opendap = %d%n", c.failOdap);
      System.out.printf("             count open Cdmr = %d%n", c.openCdmr);
      System.out.printf("             count fail Cdmr = %d%n", c.failCdmr);

    } catch (ParameterException e) {
      System.err.println(e.getMessage());
      System.err.printf("Try \"%s --help\" for more information.%n", progName);
    }
  }

  /**
   * Read the time-present slice of one randomly chosen coverage from the first coverage collection.
   *
   * <p>Fixes: the success message was printed to System.out instead of the supplied PrintWriter,
   * and the failure message lacked a newline.
   *
   * @param covDataset dataset to sample; assumes at least one coverage collection — TODO confirm
   * @param pw destination for status messages
   * @return true if a read succeeded
   */
  private static boolean readRandom(FeatureDatasetCoverage covDataset, PrintWriter pw)
      throws IOException, InvalidRangeException {
    CoverageCollection cc = covDataset.getCoverageCollections().get(0);
    int ncov = cc.getCoverageCount();
    Random r = new Random(System.currentTimeMillis());
    int randomIdx = r.nextInt(ncov);

    // walk the iterable until the randomly chosen coverage is reached
    int count = 0;
    Coverage randomCov = null;
    for (Coverage c : cc.getCoverages()) {
      if (count == randomIdx) {
        randomCov = c;
        break;
      }
      count++;
    }
    if (randomCov == null) {
      pw.format("Bad random coverage%n");
      return false;
    }

    SubsetParams subset = new SubsetParams().setTimePresent();
    GeoReferencedArray geo = randomCov.readData(subset);
    Array data = geo.getData();
    pw.format(" read data from %s shape = %s%n", randomCov.getName(), Arrays.toString(data.getShape()));
    return true;
  }

  // Open one Access with the DataFactory and try to read a random value from the resulting dataset.
  // Returns true on success; all failures are logged to pw and reported as false.
  private static boolean readAccess(Access access, DataFactory fac, PrintWriter pw) {
    Formatter errlog = new Formatter();
    try (NetcdfDataset ncd = fac.openDataset(access, false, null, errlog)) {
      if (ncd == null) {
        pw.format("  Dataset opendap fatalError=%s%n", errlog);
        return false;
      }
      pw.format("  Dataset '%s' opened as %s%n", access.getDataset().getName(), access.getService());
      return readRandom(ncd, pw);
    } catch (InvalidRangeException | IOException e) {
      e.printStackTrace(pw);
      return false;
    }
  }

  /**
   * Read a single element (at a random origin) from a randomly chosen variable of the dataset.
   *
   * <p>Fix: {@code Random.nextInt(0)} throws IllegalArgumentException, so a dataset with no
   * variables or a variable with a zero-length dimension crashed the crawl; both now return false.
   *
   * @param ncd open dataset to sample
   * @param pw destination for status messages
   * @return true if a read succeeded
   */
  private static boolean readRandom(NetcdfDataset ncd, PrintWriter pw) throws IOException, InvalidRangeException {
    List<Variable> vars = ncd.getVariables();
    if (vars.isEmpty()) {
      pw.format(" no variables to read%n");
      return false;
    }
    Random r = new Random(System.currentTimeMillis());
    Variable randomVariable = vars.get(r.nextInt(vars.size()));

    int[] shape = randomVariable.getShape();
    int[] origin = new int[shape.length];
    int[] size = new int[shape.length];
    for (int i = 0; i < shape.length; i++) {
      if (shape[i] == 0) { // zero-length dimension: nothing to read
        pw.format(" variable %s has a zero-length dimension%n", randomVariable.getNameAndDimensions());
        return false;
      }
      origin[i] = r.nextInt(shape[i]);
      size[i] = 1;
    }

    Array data = randomVariable.read(origin, size);
    pw.format(" read data from %s origin = %s return = %s%n", randomVariable.getNameAndDimensions(),
        Arrays.toString(origin), Ncdump.printArray(data));
    return true;
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy