All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ucar.nc2.dataset.DatasetUrl Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (c) 1998-2018 John Caron and University Corporation for Atmospheric Research/Unidata
 * See LICENSE for license information.
 */
package ucar.nc2.dataset;

import static java.net.HttpURLConnection.HTTP_FORBIDDEN;
import static java.net.HttpURLConnection.HTTP_NOT_ACCEPTABLE;
import static java.net.HttpURLConnection.HTTP_OK;
import static java.net.HttpURLConnection.HTTP_UNAUTHORIZED;
import com.google.common.annotations.VisibleForTesting;
import javax.annotation.Nullable;

import com.google.common.collect.Multimap;
import thredds.client.catalog.ServiceType;
import thredds.inventory.MFile;
import thredds.inventory.MFiles;
import ucar.httpservices.HTTPFactory;
import ucar.httpservices.HTTPMethod;
import ucar.nc2.util.EscapeStrings;
import ucar.unidata.util.StringUtil2;
import ucar.unidata.util.Urlencoded;
import java.io.*;
import java.util.*;

/**
 * Detection of the protocol from a location string.
 * TODO: Review and refactor as needed. Perhaps BiMap?
 *
 * @author caron
 * @since 10/20/2015.
 */
public class DatasetUrl {
  // ASCII letters; used to recognise a single-letter Windows drive prefix and to check the character after ':'.
  private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
  // Characters accepted immediately after the ':' that ends a protocol: backslash, '/', or an ASCII letter.
  private static final String slashalpha = "\\/" + alpha;

  // Protocol names that searchFragment accepts as bare fragment keys and that searchPath (currently disabled)
  // looks for as "/thredds/<name>/" path segments.
  private static final String[] FRAGPROTOCOLS = {"dap4", "dap2", "dods", "cdmremote", "thredds", "ncml"};
  // Service type for the entry at the same index in FRAGPROTOCOLS; the two arrays must stay index-aligned.
  // "cdmremote" maps to CdmRemote, matching what searchFragment and decodeLeadProtocol return for that name.
  private static final ServiceType[] FRAGPROTOSVCTYPE = {ServiceType.DAP4, ServiceType.OPENDAP, ServiceType.OPENDAP,
      ServiceType.CdmRemote, ServiceType.THREDDS, ServiceType.NCML};

  /**
   * Return the set of leading protocols for a url; may be more than one.
   * Watch out for Windows paths starting with a drive letter => protocol
   * names must all have a length > 1.
   * Watch out for '::'
   * Each captured protocol is saved without trailing ':'
   * Assume: the protocols MUST be terminated by the occurrence of '/'.
   * <p>
   * Special case: a url that starts with "file:" or "cdms3:" and has no slash directly after that prefix
   * (including a url that consists of nothing but the prefix) yields exactly that one protocol.
   *
   * @param url the url whose protocols to return
   * @return list of leading protocols without the trailing :
   */
  @VisibleForTesting
  public static List<String> getProtocols(String url) {
    List<String> allprotocols = new ArrayList<>(); // all leading protocols up to path or host

    // Note, we cannot use split because of the context sensitivity
    // This code is quite ugly because of all the confounding cases
    // (e.g. windows path, embedded colons, etc.).
    // Specifically, the 'file:' protocol is a problem because
    // it has so many non-standard forms such as file:x/y file://x/y file:///x/y.

    // Check special cases of file: or cdms3: with no slashes after:
    if (hasProtocolWithoutSlash(url, "file:")) {
      allprotocols.add("file");
    } else if (hasProtocolWithoutSlash(url, "cdms3:")) {
      allprotocols.add("cdms3");
    } else {
      // If there are any leading protocols, then they must stop at the first '/'.
      int slashpos = url.indexOf('/');
      if (slashpos >= 0) {
        // Keep everything up to and including the first slash; validateProtocol needs the char after each ':'
        StringBuilder buf = new StringBuilder(url.substring(0, slashpos + 1));
        int index = buf.indexOf(":");
        while (index > 0) {
          // Validate protocol
          if (!validateProtocol(buf, 0, index))
            break;
          String protocol = buf.substring(0, index); // not including trailing ':'
          allprotocols.add(protocol);
          buf.delete(0, index + 1); // remove the leading protocol
          index = buf.indexOf(":");
        }
      }
    }
    return allprotocols;
  }

  // True if url starts with prefix (e.g. "file:") and the prefix is either the whole url or is not
  // immediately followed by '/' or '\'. The length check avoids indexing past the end of a bare prefix.
  private static boolean hasProtocolWithoutSlash(String url, String prefix) {
    if (!url.startsWith(prefix))
      return false;
    return url.length() == prefix.length() || "/\\".indexOf(url.charAt(prefix.length())) < 0;
  }

  /**
   * Decide whether the characters in buf from startpos up to (not including) endpos, where endpos is the
   * position of a ':', look like a url protocol.
   * Eliminates windows drive letters: a single letter followed by ":/" or ":\" is rejected.
   * "protocol:" must be followed by alpha or "/" (a backslash is accepted as well).
   *
   * @param buf the text being scanned
   * @param startpos index of the first character of the candidate protocol
   * @param endpos index of the ':' that ends the candidate protocol
   * @return true if the candidate should be treated as a protocol
   */
  private static boolean validateProtocol(StringBuilder buf, int startpos, int endpos) {
    int len = endpos - startpos;
    if (len == 0)
      return false;
    int afterColon = endpos + 1;
    if (afterColon >= buf.length())
      return false; // nothing follows the ':', so the "followed by alpha or /" rule cannot hold
    char cs = buf.charAt(startpos);
    char ce1 = buf.charAt(afterColon);
    if (len == 1 && alpha.indexOf(cs) >= 0 && (ce1 == '/' || ce1 == '\\'))
      return false; // looks like windows drive letter
    // If trailing colon is not followed by alpha or /, then assume not url
    return slashalpha.indexOf(ce1) >= 0;
  }

  /////////////////////////////////////////////////////////////////////////////////////

  /**
   * Work out which service type should be used to open a location and build the matching DatasetUrl.
   * <p>
   * The location is trimmed and its backslashes are replaced by '/'. Any fragment ('#...') and, except for
   * cdms3 locations, any query ('?...') are set aside while the service type is determined and are appended
   * again (query first, then fragment) to the url that is handed to DatasetUrl.create.
   *
   * @param orgLocation the location as supplied by the caller
   * @return the result of DatasetUrl.create(serviceType, trueUrl); serviceType is null when nothing matched
   * @throws IOException if inspecting a local file or contacting a server fails
   */
  public static DatasetUrl findDatasetUrl(String orgLocation) throws IOException {
    ServiceType serviceType = null;

    // Canonicalize the location
    String location = StringUtil2.replace(orgLocation.trim(), '\\', "/");
    List<String> allProtocols = getProtocols(location);

    String trueUrl = location;
    // The location has no leading protocols => assume file:
    String leadProtocol = allProtocols.isEmpty() ? "file" : allProtocols.get(0);

    // Priority in deciding
    // the service type is as follows.
    // 1. "protocol" tag in fragment
    // 2. specific protocol in fragment
    // 3. leading protocol
    // 4. path extension
    // 5. contact the server (if defined)

    // temporarily remove any trailing query or fragment
    String fragment = null;
    int pos = trueUrl.lastIndexOf('#');
    if (pos >= 0) {
      fragment = trueUrl.substring(pos + 1);
      trueUrl = trueUrl.substring(0, pos);
    }
    // Search the fragment-free url: a '?' inside the fragment is not a query separator, and its index
    // would lie beyond the end of trueUrl.
    pos = trueUrl.lastIndexOf('?');
    String query = null;
    if (pos >= 0 && !leadProtocol.equals("cdms3")) {
      query = trueUrl.substring(pos + 1);
      trueUrl = trueUrl.substring(0, pos);
    }
    if (fragment != null)
      serviceType = searchFragment(fragment);

    if (serviceType == null) // See if leading protocol tells us how to interpret
      serviceType = decodeLeadProtocol(leadProtocol);

    if (serviceType == null) // See if path tells us how to interpret
      serviceType = searchPath(trueUrl);

    if (serviceType == null) {
      // There are several possibilities at this point; all of which
      // require further info to disambiguate
      // - we have file:// or file:; we need to see if
      // the extension can help, otherwise, start defaulting.
      // - we have a simple url: e.g. http://... ; contact the server
      if (leadProtocol.equals("file") || leadProtocol.equals("cdms3")) {
        serviceType = decodePathExtension(trueUrl); // look at the path extension
        // If it's a S3 file, it is expensive to peek inside to check if it's ncml, so we will only check extension
        if (serviceType == null && !leadProtocol.equals("cdms3") && checkIfNcml(new File(location))) {
          serviceType = ServiceType.NCML;
        }
      } else {
        serviceType = disambiguateHttp(trueUrl);
        // special cases
        if ((serviceType == null || serviceType == ServiceType.HTTPServer)) {
          // ncml file being served over http?
          if (checkIfRemoteNcml(trueUrl)) {
            serviceType = ServiceType.NCML;
          }
        }
      }
    }
    if (serviceType == ServiceType.NCML) { // ??
      // If lead protocol was null, then pretend it was a file
      // Note that technically, this should be 'file://'
      trueUrl = (allProtocols.isEmpty() ? "file:" + trueUrl : trueUrl);
    }

    // Add back the query and fragment (if any)
    if (query != null || fragment != null) {
      StringBuilder buf = new StringBuilder(trueUrl);
      if (query != null) {
        buf.append('?');
        buf.append(query);
      }
      if (fragment != null) {
        buf.append('#');
        buf.append(fragment);
      }
      trueUrl = buf.toString();
    }
    return DatasetUrl.create(serviceType, trueUrl);
  }

  /**
   * Given the fragment of a location, find markers indicating which protocol to use.
   * A "protocol=name" pair takes precedence; otherwise the first name from FRAGPROTOCOLS that appears as a
   * key in the fragment is used. Names are matched case-insensitively.
   * TODO: what use case is this handling ?
   *
   * @param fragment the fragment to be examined (must not be null)
   * @return The discovered ServiceType, or null
   */
  private static ServiceType searchFragment(String fragment) {
    if (fragment.isEmpty())
      return null;
    Map<String, String> map = parseFragment(fragment);
    if (map == null)
      return null; // fragment is not a list of name=value pairs
    String protocol = map.get("protocol");

    if (protocol == null) {
      for (String p : FRAGPROTOCOLS) {
        if (map.get(p) != null) {
          protocol = p;
          break;
        }
      }
    }
    if (protocol != null) {
      // "dap2" is one of the FRAGPROTOCOLS names, so it has to be accepted here alongside "dap" and "dods"
      if (protocol.equalsIgnoreCase("dap") || protocol.equalsIgnoreCase("dap2") || protocol.equalsIgnoreCase("dods"))
        return ServiceType.OPENDAP;
      if (protocol.equalsIgnoreCase("dap4"))
        return ServiceType.DAP4;
      if (protocol.equalsIgnoreCase("cdmremote"))
        return ServiceType.CdmRemote;
      if (protocol.equalsIgnoreCase("thredds"))
        return ServiceType.THREDDS;
      if (protocol.equalsIgnoreCase("ncml"))
        return ServiceType.NCML;
    }
    return null;
  }

  /**
   * Given the fragment part of a url, see if it
   * parses as name=value pairs separated by '&'
   * (same as query part).
   * A single leading '#' is ignored. Names and values are url-unescaped and lower-cased; a name without a
   * value is stored with the value "true".
   *
   * @param fragment the fragment part of a url; may be null or empty
   * @return a map of the name value pairs (possibly empty),
   *         or null if the fragment does not parse.
   */
  private static Map<String, String> parseFragment(String fragment) {
    Map<String, String> map = new HashMap<>();
    if (fragment == null || fragment.isEmpty())
      return map; // nothing to parse; also keeps charAt(0) below in range

    if (fragment.charAt(0) == '#')
      fragment = fragment.substring(1);
    String[] pairs = fragment.split("[ \t]*[&][ \t]*");
    for (String pair : pairs) {
      String[] pieces = pair.split("[ \t]*[=][ \t]*");
      switch (pieces.length) {
        case 1:
          map.put(EscapeStrings.unescapeURL(pieces[0]).toLowerCase(), "true");
          break;
        case 2:
          map.put(EscapeStrings.unescapeURL(pieces[0]).toLowerCase(),
              EscapeStrings.unescapeURL(pieces[1]).toLowerCase());
          break;
        default:
          return null; // does not parse
      }
    }
    return map;
  }

  /**
   * Look through the path of a url for a "/thredds/NAME/" segment, where NAME is one of FRAGPROTOCOLS,
   * and report the service type stored at the same index of FRAGPROTOSVCTYPE.
   * The search is switched off at present, so the method always answers null.
   *
   * @param url the url to be examined
   * @return The discovered ServiceType, or null
   */
  private static ServiceType searchPath(String url) {
    final boolean pathSearchEnabled = false; // Disable for now
    if (pathSearchEnabled) {
      if (url != null && !url.isEmpty()) {
        String lowered = url.toLowerCase(); // for matching purposes
        int slot = 0;
        for (String name : FRAGPROTOCOLS) {
          String segment = "/thredds/" + name.toLowerCase() + "/";
          if (lowered.contains(segment)) {
            return FRAGPROTOSVCTYPE[slot];
          }
          slot++;
        }
      }
    }
    return null;
  }

  /**
   * Infer a service type from the way a path ends. The path is expected to carry no query or fragment.
   *
   * @param path the path whose ending is inspected
   * @return ServiceType implied by the ending, or null when the ending is not recognised
   */
  private static ServiceType decodePathExtension(String path) {
    // Endings that map to OPENDAP
    for (String suffix : new String[] {".dds", ".das", ".dods"}) {
      if (path.endsWith(suffix)) {
        return ServiceType.OPENDAP;
      }
    }

    // .dmr/.dap/.dsr, optionally followed by .xml or .html, map to DAP4
    boolean dap4Ending = path.matches("^.*[.](dmr|dap|dsr)([.](xml|html))?$");
    if (dap4Ending) {
      return ServiceType.DAP4;
    }

    // Tested last on purpose: an ending such as ".dmr.xml" must be reported as DAP4, not NCML.
    boolean ncmlEnding = path.endsWith(".xml") || path.endsWith(".ncml");
    return ncmlEnding ? ServiceType.NCML : null;
  }


  // NOTE(review): from here to the end of the visible text the code has been collapsed onto a few very long
  // physical lines. As written, each "//" line comment therefore also swallows whatever code follows it on the
  // same physical line, so this region has to be re-split into separate lines before it can be compiled.
  // Two pieces cannot be recovered from what is shown here:
  // (1) after "here is an example 128 byte response:" the text jumps straight to "= 300) {", so the declaration
  // of NUM_BYTES_TO_DETERMINE_NCML and the head of the method that findDatasetUrl calls as checkIfRemoteNcml
  // are absent;
  // (2) the final statement stops in the middle of a string literal.
  // Restore both from the canonical source rather than guessing at the missing text.
  /*
   * Attempt to map a leading url protocol url to a service type (see thredds.catalog.ServiceType).
   * Possible service types should include at least the following.
   * 
    *
  1. OPENDAP (DAP2 protocol) *
  2. DAP4 (DAP4 protocol) *
  3. CdmRemote (remote ncstream) *
* * @param protocol The leading protocol * * @return ServiceType indicating how to handle the url, or null. */ @Urlencoded private static ServiceType decodeLeadProtocol(String protocol) { switch (protocol) { case "dods": return ServiceType.OPENDAP; case "dap4": return ServiceType.DAP4; case "httpserver": case "nodods": return ServiceType.HTTPServer; case "cdmremote": return ServiceType.CdmRemote; case "thredds": return ServiceType.THREDDS; } return null; } ////////////////////////////////////////////////////////////////// /** * If the URL alone is not sufficient to disambiguate the location, * then this method will attempt to do a specific kind of request on * the server, typically a HEAD call using the URL. * It finds the header "Content-Description" * and uses it value (e.g. "ncstream" or "dods", etc) * in order to disambiguate. * * @param location the url to disambiguate * @return ServiceType indicating how to handle the url */ @Urlencoded private static ServiceType disambiguateHttp(String location) throws IOException { boolean checkDap2 = false; boolean checkDap4 = false; boolean checkCdmr = false; if (!location.startsWith("http")) { return null; } // some TDS specific tests if (location.contains("cdmremote")) { ServiceType result = checkIfCdmr(location); if (result != null) return result; checkCdmr = true; } if (location.contains("dodsC")) { ServiceType result = checkIfDods(location); if (result != null) return result; checkDap2 = true; } if (location.contains("dap4") || location.contains("d4ts")) { ServiceType result = checkIfDap4(location); if (result != null) return result; checkDap4 = true; } if (!checkDap2) { ServiceType result = checkIfDods(location); if (result != null) return result; } if (!checkDap4) { ServiceType result = checkIfDap4(location); if (result != null) return result; } if (!checkCdmr) { ServiceType result = checkIfCdmr(location); return result; } return null; } // cdmremote private static ServiceType checkIfCdmr(String location) throws 
IOException { try (HTTPMethod method = HTTPFactory.Head(location + "?req=header")) { int statusCode = method.execute(); if (statusCode >= 300) { if (statusCode == HTTP_UNAUTHORIZED || statusCode == HTTP_FORBIDDEN) throw new IOException("Unauthorized to open dataset " + location); else throw new IOException(location + " is not a valid URL, return status=" + statusCode); } Optional value = method.getResponseHeaderValue("Content-Description"); return value.map(v -> v.equalsIgnoreCase("ncstream") ? ServiceType.CdmRemote : null).orElse(null); } } // not sure what other opendap servers do, so fall back on check for dds private static ServiceType checkIfDods(String location) throws IOException { int len = location.length(); // Strip off any trailing .dds, .das, or .dods if (location.endsWith(".dds")) location = location.substring(0, len - ".dds".length()); if (location.endsWith(".das")) location = location.substring(0, len - ".das".length()); if (location.endsWith(".dods")) location = location.substring(0, len - ".dods".length()); // Opendap assumes that the caller has properly escaped the url try ( // For some reason, the head method is not using credentials // method = session.newMethodHead(location + ".dds"); HTTPMethod method = HTTPFactory.Get(location + ".dds")) { int status = method.execute(); if (status == HTTP_OK) { Optional value = method.getResponseHeaderValue("Content-Description"); if (value.isPresent()) { String v = value.get(); if (v.equalsIgnoreCase("dods-dds") || v.equalsIgnoreCase("dods_dds")) return ServiceType.OPENDAP; else throw new IOException("OPeNDAP Server Error= " + method.getResponseAsString()); } } if (status == HTTP_UNAUTHORIZED || status == HTTP_FORBIDDEN) throw new IOException("Unauthorized to open dataset " + location); // not dods return null; } } // check for dmr private static ServiceType checkIfDap4(String location) throws IOException { if (location.matches("^.*[.](dmr|dap|dsr)([.](xml|html))?$")) { // Strip off any trailing DAP4 
suffix(es) if (location.endsWith(".xml")) location = location.substring(0, location.length() - ".xml".length()); else if (location.endsWith(".html")) location = location.substring(0, location.length() - ".html".length()); // location must end with dap, dmr, or dsr; strip it off location = location.substring(0, location.length() - ".dxx".length()); } location = location + ".dsr.xml"; // get known dap4 response try (HTTPMethod method = HTTPFactory.Get(location)) { int status = method.execute(); if (status == HTTP_OK) { Optional value = method.getResponseHeaderValue("Content-Description"); if (value.isPresent()) { if (value.get().contains("application/vnd.opendap.dap4")) return ServiceType.DAP4; } } // not dap4 return null; } } // The first 128 bytes should contain enough info to tell if this looks like an actual ncml file or not. // For example, here is an example 128 byte response: // \n= 300) { if (statusCode == HTTP_UNAUTHORIZED) { throw new IOException("Unauthorized to open dataset " + location); } else if (statusCode == HTTP_NOT_ACCEPTABLE) { String msg = location + " - this server does not support returning content without any encoding."; msg = msg + " Please download the file locally. Return status=" + statusCode; throw new IOException(msg); } else { throw new IOException(location + " is not a valid URL, return status=" + statusCode); } } return checkIfNcml(method.getResponseAsString()); } } return false; } private static boolean checkIfNcml(File file) throws IOException { if (!file.exists() || file.isDirectory()) { return false; } try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(file), NUM_BYTES_TO_DETERMINE_NCML)) { byte[] bytes = new byte[NUM_BYTES_TO_DETERMINE_NCML]; int bytesRead = in.read(bytes); if (bytesRead <= 0) { return false; } else { return checkIfNcml(new String(bytes, 0, bytesRead)); } } } private static boolean checkIfNcml(String string) { // Look for the ncml element as well as a reference to the ncml namespace URI. 
return string.contains("




© 2015 - 2025 Weber Informatics LLC | Privacy Policy