// thredds.crawlabledataset.CrawlableDatasetDods Maven / Gradle / Ivy
//
// Go to download
/*
 * Copyright 1998-2009 University Corporation for Atmospheric Research/Unidata
 *
 * Portions of this software were developed by the Unidata Program at the
 * University Corporation for Atmospheric Research.
 *
 * Access and use of this software shall impose the following obligations
 * and understandings on the user. The user is granted the right, without
 * any fee or cost, to use, copy, modify, alter, enhance and distribute
 * this software, and any derivative works thereof, and its supporting
 * documentation for any purpose whatsoever, provided that this entire
 * notice appears in all copies of the software, derivative works and
 * supporting documentation.  Further, UCAR requests that the user credit
 * UCAR/Unidata in any publications that result from the use of this
 * software or in any product that includes this software. The names UCAR
 * and/or Unidata, however, may not be used in any advertising or publicity
 * to endorse or promote any products or commercial entity unless specific
 * written permission is obtained from UCAR/Unidata. The user also
 * understands that UCAR/Unidata is not obligated to provide the user with
 * any support, consulting, training or assistance of any kind with regard
 * to the use, operation and performance of this software nor to provide
 * the user with any updates, revisions, new versions or "bug fixes."
 *
 * THIS SOFTWARE IS PROVIDED BY UCAR/UNIDATA "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL UCAR/UNIDATA BE LIABLE FOR ANY SPECIAL,
 * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
 * FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
 * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
 * WITH THE ACCESS, USE OR PERFORMANCE OF THIS SOFTWARE.
 */
package thredds.crawlabledataset;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import thredds.util.DodsURLExtractor;

/**
 * A {@link CrawlableDataset} whose path is an "http:" URL and whose children
 * are discovered by extracting the links found at that URL (see
 * {@link #listDatasets()}).
 *
 * <p>A path is treated as a collection unless it ends in one of the known
 * OPeNDAP response extensions (".html", ".das", ...) or one of the known
 * data file extensions (".nc", ".hdf", ...).
 *
 * <p>Child listings are cached in a static in-memory map keyed by path and
 * are never invalidated. The static state is not synchronized.
 *
 * @author Ethan Davis
 * @author Bas Retsios
 * @since Jun 8, 2005 15:34:04 -0600
 */
public class CrawlableDatasetDods implements CrawlableDataset {
  private static final org.slf4j.Logger log = org.slf4j.LoggerFactory
          .getLogger(CrawlableDatasetDods.class);

  /** Shared link extractor, created on first use. */
  private static DodsURLExtractor urlExtractor = null;

  /**
   * In-memory cache of child listings, keyed by collection path (kept for
   * performance reasons). TODO: add a version-check so stale entries can be
   * refreshed.
   */
  private static final Map listDatasetsMap = new HashMap();

  /**
   * File extensions that mark a path as a plain file rather than a collection.
   * A trailing slash cannot be relied on to recognise collections (it is
   * removed by CrawlableDatasetFactory), so everything is assumed to be a
   * collection unless one of these extensions (or a DODS extension) matches.
   */
  private static final String[] knownFileExtensions = {".hdf", ".xml", ".nc", ".bz2", ".cdp", ".jpg"};

  /** Extensions that OPeNDAP servers append to a dataset URL. */
  private static final String[] dodsExtensions = {".html", ".htm", ".das", ".dds", ".info"};

  private String path;

  /** Connection to {@link #path}; stored once opened, for performance reasons. */
  private URLConnection pathUrlConnection = null;

  private String name;

  private Object configObj = null;

  protected CrawlableDatasetDods() {
  }

  /**
   * Create a dataset for the given URL.
   *
   * @param path the dataset URL; must start with "http:" and be a syntactically valid URI.
   * @param configObj optional configuration object; it is only stored and
   *        returned by {@link #getConfigObject()}, this class does not interpret it.
   * @throws IllegalArgumentException if the path is null, does not start with
   *         "http:", or is not a valid URI.
   */
  protected CrawlableDatasetDods(String path, Object configObj)
  {
    if (urlExtractor == null)
      urlExtractor = new DodsURLExtractor();

    if (configObj != null) {
      log.debug("CrawlableDatasetDods(): config object not null, it will be ignored <"
              + configObj.toString() + ">.");
      this.configObj = configObj;
    }

    if (path == null || !path.startsWith("http:")) {
      String tmpMsg = "Invalid url <" + path + ">.";
      log.debug("CrawlableDatasetDods(): " + tmpMsg);
      throw new IllegalArgumentException(tmpMsg);
    }

    this.path = path;
    try {
      new URI(path); // check syntax .. URISyntaxException if it's not good
    } catch (URISyntaxException e) {
      String tmpMsg = "Bad URI syntax for path <" + path + ">: " + e.getMessage();
      log.debug("CrawlableDatasetDods(): " + tmpMsg);
      throw new IllegalArgumentException(tmpMsg, e);
    }
    this.name = getName(path);

    // NOTE: a check that "<path>/version" returns OPeNDAP version info
    // (content mentioning "DODS", "OPeNDAP" or "DAP") was deliberately disabled
    // here. It takes far too long when expanding a directory: a
    // CrawlableDataset is created for every link as soon as its parent is
    // expanded, so every link would be tested.
  }

  /**
   * Create the dataset located at childPath below the given parent.
   * A single leading "/" on childPath is dropped before it is appended.
   */
  private CrawlableDatasetDods(CrawlableDatasetDods parent, String childPath)
  {
    String normalChildPath = childPath.startsWith("/") ? childPath.substring(1) : childPath;
    String parentPath = parent.getPath();
    this.path = parentPath.endsWith("/")
            ? parentPath + normalChildPath
            : parentPath + "/" + normalChildPath;
    this.name = getName(this.path);
    this.configObj = null;
  }

  /**
   * Return the last segment of the given path, ignoring one trailing "/".
   * The path "/" is returned unchanged.
   */
  private static String getName(String path) {
    if (path.equals("/"))
      return path;

    String tmpName = path.endsWith("/") ? path.substring(0, path.length() - 1) : path;
    int index = tmpName.lastIndexOf("/");
    return (index != -1) ? tmpName.substring(index + 1) : tmpName;
  }

  /**
   * Provide access to the java.net.URI that this CrawlableDataset represents.
   *
   * @return the java.net.URI that this CrawlableDataset represents, or null
   *         if the path is not a valid URI.
   */
  public URI getUri()
  {
    try
    {
      return new URI( this.path);
    }
    catch ( URISyntaxException e )
    {
      return null;
    }
  }

  public Object getConfigObject() {
    return configObj;
  }

  public String getPath() {
    return this.path;
  }

  public String getName() {
    return this.name;
  }

  public boolean isCollection() {
    return isCollection(path);
  }

  /**
   * Return the dataset at the given path relative to this dataset.
   *
   * @param relativePath the relative path; must not start with "/".
   * @return the descendant dataset (no check is made that it exists).
   * @throws IllegalArgumentException if relativePath starts with "/".
   */
  public CrawlableDataset getDescendant( String relativePath )
  {
    if ( relativePath.startsWith( "/"))
      throw new IllegalArgumentException( "Path must be relative <" + relativePath + ">.");
    return new CrawlableDatasetDods(this, relativePath);
  }

  /**
   * Decide whether a URL denotes a collection: it does unless it ends
   * (case-insensitively) in a DODS extension or a known file extension.
   */
  private static boolean isCollection(String path)
  {
    // Locale-independent lower-casing, otherwise our matches may fail.
    String testPath = path.toLowerCase(java.util.Locale.ENGLISH);
    if (isDodsDataset(testPath))
      return false;

    for (int i = 0; i < knownFileExtensions.length; i++) {
      if (testPath.endsWith(knownFileExtensions[i]))
        return false; // a known file ==> no collection
    }
    return true;
  }

  /**
   * Return the DODS extension (lower case, including the dot) that the path
   * ends with, compared case-insensitively, or "" if there is none.
   */
  private static String getDodsExtension(String path)
  {
    // Locale-independent lower-casing, otherwise our matches may fail.
    String testPath = path.toLowerCase(java.util.Locale.ENGLISH);
    for (int i = 0; i < dodsExtensions.length; i++) {
      if (testPath.endsWith(dodsExtensions[i]))
        return dodsExtensions[i];
    }
    return "";
  }

  private static boolean isDodsDataset(String path)
  {
    return getDodsExtension(path).length() > 0;
  }

  /** Strip a trailing DODS extension from the path, if it has one. */
  private static String removeDodsExtension(String path)
  {
    String dodsExtension = getDodsExtension(path);
    if (dodsExtension.length() > 0)
      return path.substring(0, path.length() - dodsExtension.length());
    return path;
  }

  // This function shouldn't be here !!!
  // It is a workaround for many OPeNDAP servers that crop part of their urls (the /opendap-bin/nph-dods/ part)
  // e.g. of server with problem (2-Nov-2006): http://acdisc.sci.gsfc.nasa.gov/opendap-bin/nph-dods/OPENDAP/Giovanni/
  // Currently unused: listDatasets() skips non-child URLs instead of rewriting them.
  private String forceChild(String url)
  {
    if (url.length() == 0)
      return url; // nothing to re-root

    String prefix = path;
    if (prefix.endsWith("/"))
      prefix = path.substring(0, path.length() - 1); // because the url also contains a '/' that we will use
    // The last char is ignored on purpose: if it is a '/', we're interested in the previous one.
    int j = url.substring(0, url.length() - 1).lastIndexOf('/');
    if (j >= 0)
      return prefix + url.substring(j);
    return url; // relative paths .. leave intact
  }

  /** Return the shared link extractor, creating it on first use. */
  private static DodsURLExtractor getUrlExtractor()
  {
    if (urlExtractor == null)
      urlExtractor = new DodsURLExtractor();
    return urlExtractor;
  }

  /**
   * List the child datasets of this collection.
   *
   * <p>The links extracted from this dataset's URL are filtered: only links
   * that look like an OPeNDAP dataset or a collection, that start with this
   * dataset's path, and that are valid URIs are kept. DODS extensions are
   * stripped and duplicates removed. A successful listing is cached per path.
   *
   * @return a new list of CrawlableDataset children; empty if the links could
   *         not be extracted.
   * @throws IllegalStateException if this dataset is not a collection.
   */
  public List listDatasets() throws IOException {

    if (!this.isCollection()) {
      String tmpMsg = "This dataset <" + this.getPath()
              + "> is not a collection dataset.";
      log.error("listDatasets(): " + tmpMsg);
      throw new IllegalStateException(tmpMsg);
    }

    // Shortcut .. for performance. Hand out a copy so that callers cannot
    // modify the cached listing.
    List cached = (List) listDatasetsMap.get(path);
    if (cached != null)
      return new ArrayList(cached);

    List list = new ArrayList();

    // Get list of possible datasets from current URL.
    List possibleDsList;
    try {
      // Without the trailing slash, relative URLs are resolved against the
      // parent directory instead of this one (new URL(baseURL, value)).
      String openPath = path.endsWith("/") ? path : path + "/";
      possibleDsList = getUrlExtractor().extract(openPath);
    } catch (java.io.IOException e) {
      log.warn("listDatasets(): IOException while extracting dataset info from given OPeNDAP directory <"
              + path + ">, return empty list: " + e.getMessage());
      return list;
    }

    // Only for detecting duplicates (after removing the extension, we
    // sometimes end up with duplicates).
    java.util.Set seenPaths = new java.util.HashSet();

    // Handle each link in the current access path.
    for (Iterator it = possibleDsList.iterator(); it.hasNext(); ) {
      String curDsUrlString = (String) it.next();

      // Skip datasets that aren't OPeNDAP datasets (".html") or
      // collection datasets.
      if (!isDodsDataset(curDsUrlString) && !isCollection(curDsUrlString)) {
        log.warn("listDatasets(): Dataset isn't an OPeNDAP dataset or collection dataset, skip <"
                + curDsUrlString + ">.");
        continue;
      }

      curDsUrlString = removeDodsExtension(curDsUrlString);

      // forceChild() is intentionally not applied here: it goes a bit too far
      // trying to recover from servers that drop part of the URL path, and
      // also turns URLs that point to external servers into subdirectories
      // of this CrDs.

      // Skip any URLs that aren't children of this CrDs (this also avoids
      // links back to the parent directory).
      if ( !curDsUrlString.startsWith( path ) )
      {
        log.debug( "listDatasets(): skipping URL <" + curDsUrlString + ">, not child of this CrDs <" + path + ">." );
        continue;
      }

      if (!seenPaths.add(curDsUrlString))
        continue; // duplicate

      try {
        new URI(curDsUrlString); // syntax check
      } catch (URISyntaxException e) {
        log.error("listDatasets(): Skipping dataset  <"
                + curDsUrlString + "> due to URISyntaxException: "
                + e.getMessage());
        continue;
      }

      log.debug("listDatasets(): handle dataset (" + curDsUrlString
              + ")");

      // So far so good .. curDsUrlString passed all tests, thus add it to the list
      try {
        list.add(CrawlableDatasetFactory.createCrawlableDataset(
                curDsUrlString, this.getClass().getName(), null));
      } catch (ClassNotFoundException e) {
        logChildFailure(curDsUrlString, e);
      } catch (NoSuchMethodException e) {
        logChildFailure(curDsUrlString, e);
      } catch (IllegalAccessException e) {
        logChildFailure(curDsUrlString, e);
      } catch (InvocationTargetException e) {
        // The wrapper's own message is usually null; the cause carries the
        // actual reason the child could not be constructed.
        logChildFailure(curDsUrlString, e.getCause() != null ? e.getCause() : e);
      } catch (InstantiationException e) {
        logChildFailure(curDsUrlString, e);
      }
    }

    listDatasetsMap.put(path, list); // remember it next time, for performance

    return new ArrayList(list);
  }

  /** Log that no CrawlableDataset could be created for the given child URL. */
  private static void logChildFailure(String childUrl, Throwable reason)
  {
    log.warn("listDatasets(): Can't make CrawlableDataset for child url <"
            + childUrl + ">: " + reason.getMessage());
  }

  /**
   * List the child datasets accepted by the given filter.
   *
   * @param filter the filter to apply; if null, all children are returned.
   * @return the accepted children.
   * @throws IllegalStateException if this dataset is not a collection.
   */
  public List listDatasets(CrawlableDatasetFilter filter) throws IOException {
    List list = this.listDatasets();
    if (filter == null)
      return list;

    List retList = new ArrayList();
    for (Iterator it = list.iterator(); it.hasNext();) {
      CrawlableDataset curDs = (CrawlableDataset) it.next();
      if (filter.accept(curDs))
        retList.add(curDs);
    }
    return retList;
  }

  /**
   * Return the dataset one level up: this path with its last segment removed
   * (the trailing "/" is kept).
   *
   * @return the parent dataset, or null if this path has no parent segment
   *         (e.g. "http://host" or "http://host/", where removing the last
   *         segment would leave only the scheme).
   */
  public CrawlableDataset getParentDataset() {
    if (path.equals("/"))
      return null;

    // Ignore a trailing "/" when looking for the separator before the last segment.
    int searchFrom = path.endsWith( "/") ? path.length() - 2 : path.length() - 1;
    int index = path.lastIndexOf( "/", searchFrom );
    if ( index == -1 )
      return null; // no separator at all: nothing above this path

    String parentPath = path.substring( 0, index + 1 );
    if ( parentPath.endsWith( "://" ) )
      return null; // only the scheme is left: this dataset is the server root

    return new CrawlableDatasetDods( parentPath, null);
  }

  /**
   * Return the connection to this dataset's URL, opening and storing it on
   * first use.
   *
   * @return the connection, or null if it could not be opened.
   */
  private URLConnection getConnection()
  {
    if (pathUrlConnection == null) {
      try {
        pathUrlConnection = new URL(path).openConnection();
      } catch (MalformedURLException e) {
        log.debug("getConnection(): malformed URL <" + path + ">: " + e.getMessage());
      } catch (IOException e) {
        log.debug("getConnection(): could not open connection to <" + path + ">: " + e.getMessage());
      }
    }
    return pathUrlConnection;
  }

  /**
   * Check whether the URL of this dataset answers with a successful (2xx)
   * HTTP response code.
   *
   * @return true on a 2xx response; false otherwise, including when the
   *         connection cannot be opened or is not an HTTP connection.
   */
  public boolean exists() {
    URLConnection connection = getConnection();
    if (!(connection instanceof HttpURLConnection))
      return false;

    try {
      int responseCode = ((HttpURLConnection) connection).getResponseCode();
      return responseCode >= 200 && responseCode < 300; // Successful
    } catch (IOException e) {
      log.debug("exists(): no response from <" + path + ">: " + e.getMessage());
      return false;
    }
  }

  /**
   * Return the content length reported for this dataset's URL.
   *
   * @return 0 for a collection; otherwise the content length reported by the
   *         connection, or -1 if the connection could not be opened.
   */
  public long length() {
    if (this.isCollection())
      return 0;

    URLConnection connection = getConnection();
    return (connection != null) ? connection.getContentLength() : -1;
  }

  /**
   * Return the last-modified date reported for this dataset's URL.
   *
   * @return the date, or null if the connection could not be opened or
   *         reports a last-modified value of 0.
   */
  public Date lastModified() {
    URLConnection connection = getConnection();
    if (connection == null)
      return null;

    long lastModified = connection.getLastModified();
    return (lastModified != 0) ? new Date(lastModified) : null;
  }

  public String toString()
  {
    return this.path;
  }
}