decodes.datasource.WebDirectoryDataSource Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opendcs Show documentation
A collection of software for aggregating and processing environmental data such as from NOAA GOES satellites.
The newest version!
package decodes.datasource;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.LinkedList;
import java.util.Properties;
import java.util.TimeZone;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import decodes.db.DataSource;
import decodes.db.Database;
import decodes.db.DatabaseException;
import decodes.db.InvalidDatabaseException;
import decodes.db.NetworkList;
import decodes.db.Platform;
import decodes.util.PropertySpec;
import ilex.util.EnvExpander;
import ilex.util.IDateFormat;
import ilex.util.Logger;
import ilex.util.PropertiesUtil;
import ilex.var.Variable;


/**
 * Designed for MB Hydro use of the dd.weatheroffice.gc.ca web site.
 * Provide a directory URL with embedded times in it. Build the URL and list the files
 * contained in the directory. Parse the file names for mediumIDs I'm interested in.
 * File names also contain the message time stamp.
 * 
 * https://dd.weather.gc.ca/bulletins/alphanumeric/20240326/CA/CWAO/00/ was used to verify the behavior
 * of this source as well as possible. We suspect the agency has changed the directory structure of
 * the data, and we do not currently know what layout to expect. Please contact the OpenDCS team if you
 * need this working and can describe the current layout, whether for the above link
 * or for another source of data following a similar design.
 */
public class WebDirectoryDataSource extends DataSourceExec
{
	/** Module name used as the prefix of log and exception messages. */
	public final String module = "WebDirectory";
	
	/** Directory URL template; expanded by EnvExpander with the directory time before each read. */
	private String directoryUrl = null;
	/** Delimiter between fields of a file name. Passed to String.split, so it is treated as a regex. */
	private String urlFieldDelimiter = "_";
	/** Time zone applied to the calendars and date formats used by this source. */
	private TimeZone urlTimeZone = TimeZone.getTimeZone("UTC");
	/** SimpleDateFormat pattern of the time field within a file name. */
	private String urlTimeFormat = "ddHHmm";
	/**
	 * Matches one line of a directory listing that contains an anchor tag and captures
	 * the href value in the named group "link".
	 */
	private static final Pattern HTML_LINK_PATTERN = 
		Pattern.compile(".*<a href=\"(?<link>.*?)\".*/?>.*");
	
	/** Position of the time within the file name (1=1st position). Files are skipped if this is below 1. */
	private int urlTimePos = 3;
	/** Position of the station ID within the file name (1=1st position). Files are skipped if this is below 1. */
	private int urlIdPos = 5;
	
	private static PropertySpec[] myPropSpecs =
	{
		new PropertySpec("directoryUrl", PropertySpec.STRING,
			"(required) URL of the directory that lists the file names containing messages"),
		new PropertySpec("urlFieldDelimiter", PropertySpec.STRING,
			"(default = underscore) Delimiter for fields within the filenames."),
		new PropertySpec("urlTimePos", PropertySpec.INT,
			"(default=3) Position of time within the delimited file name (1=first pos)"),
		new PropertySpec("urlIdPos", PropertySpec.INT,
			"(default=5) Position of the transport medium ID within the file name (1=first pos)"),
		new PropertySpec("urlTimeFormat", PropertySpec.STRING,
			"(default = ddHHmm) SimpleDateFormat format string for time in the filename"),
		new PropertySpec("urlTimeZone", PropertySpec.STRING,
			"(default = UTC) Time Zone for the time within the directory and file names")
	};
	
	/** Parser for the time field of a file name; rebuilt in init if urlTimeFormat is overridden. */
	private SimpleDateFormat fnSdf = new SimpleDateFormat(urlTimeFormat);
	/** Bounds of the time range; directories are read hour by hour from dSince until dUntil. */
	private Date dSince = null, dUntil = null;
	/** Network lists consulted to decide whether a file's ID is of interest. */
	private ArrayList<NetworkList> rsNetlists = new ArrayList<NetworkList>();
	private static final long MS_PER_DAY = 3600L * 24L * 1000L;
	/** Expanded URL of the directory most recently read by readNextDirectory. */
	private String currentDirUrl = null;
	/** Links harvested from directory listings that have not yet been handed out by getNextFile. */
	private LinkedList<String> fileList = new LinkedList<String>();
	/** Hour of the directory most recently read; readNextDirectory advances it before each read. */
	private Calendar nextDirectoryCal = Calendar.getInstance();
	/** Scratch calendar used to combine the file name time with the directory year and month. */
	private Calendar fileTimeCal = Calendar.getInstance();
	private SimpleDateFormat debugSdf = new SimpleDateFormat("yyyy/MMM/dd-HH:mm:ss");

	/**
	 * @see decodes.datasource.DataSourceExec#DataSourceExec(DataSource, Database) DataSourceExec Constructor
	 *
	 * @param source the data source record, passed to the superclass constructor
	 * @param db the database, passed to the superclass constructor
	 */
	public WebDirectoryDataSource(DataSource source, Database db) {
		super(source, db);
	}


	/** Does nothing; all configuration is read in {@link #init}. */
	@Override
	public void processDataSource() 
		throws InvalidDatabaseException
	{
	}

	/**
	 * Reads the configuration properties, establishes the since/until time range and
	 * positions the directory calendar one hour before the (hour-truncated) since time.
	 *
	 * @param rsProps properties searched (ignoring case) for the settings listed by getSupportedProps
	 * @param since start of the time range, or null for 24 hours before now
	 * @param until end of the time range, or null for now
	 * @param networkLists lists of IDs to accept, may be null
	 * @throws DataSourceException if directoryUrl is missing or a property or time cannot be parsed
	 */
	@Override
	public void init(Properties rsProps, String since, String until, Vector<NetworkList> networkLists)
		throws DataSourceException
	{
		directoryUrl = PropertiesUtil.getIgnoreCase(rsProps, "directoryUrl");
		if (directoryUrl == null)
		{
			// Fall back to the arguments of the data source record.
			directoryUrl = PropertiesUtil.getIgnoreCase(dbDataSource.arguments, "directoryUrl");
			if (directoryUrl == null)
				throw new DataSourceException(module + ": missing required 'directoryUrl' property.");
		}
		String s;
		if ((s = PropertiesUtil.getIgnoreCase(rsProps, "urlFieldDelimiter")) != null)
		{
			urlFieldDelimiter = s;
		}
		urlTimePos = getIntProperty(rsProps, "urlTimePos", urlTimePos);
		urlIdPos = getIntProperty(rsProps, "urlIdPos", urlIdPos);
		if ((s = PropertiesUtil.getIgnoreCase(rsProps, "urlTimeFormat")) != null)
		{
			try
			{
				fnSdf = new SimpleDateFormat(s);
				urlTimeFormat = s;
			}
			catch(IllegalArgumentException ex)
			{
				// SimpleDateFormat rejects invalid patterns with an unchecked exception;
				// report it as a configuration error instead.
				throw new DataSourceException(module + ": bad urlTimeFormat property '" + s 
					+ "': " + ex);
			}
		}
		if ((s = PropertiesUtil.getIgnoreCase(rsProps, "urlTimeZone")) != null)
		{
			urlTimeZone = TimeZone.getTimeZone(s);
		}
		
		if (since != null)
		{
			try { dSince = IDateFormat.parse(since); }
			catch(Exception ex)
			{
				throw new DataSourceException(module + ": bad since time '" + since + "': " + ex);
			}
		}
		else // default to last 24 hours
		{
			dSince = new Date(System.currentTimeMillis() - MS_PER_DAY);
		}
		if (until != null)
		{
			try { dUntil = IDateFormat.parse(until); }
			catch(Exception ex)
			{
				throw new DataSourceException(module + ": bad until time '" + until + "': " + ex);
			}
		}
		else // default to 'now'
			dUntil = new Date();
		
		nextDirectoryCal.setTimeZone(urlTimeZone);
		fileTimeCal.setTimeZone(urlTimeZone);
		fnSdf.setTimeZone(urlTimeZone);
		debugSdf.setTimeZone(urlTimeZone);
		
		nextDirectoryCal.setTimeInMillis(dSince.getTime());
		// Truncate to hour.
		nextDirectoryCal.set(Calendar.MINUTE, 0);
		nextDirectoryCal.set(Calendar.SECOND, 0);
		nextDirectoryCal.set(Calendar.MILLISECOND, 0);
		// Subtract an hour because readNextDirectory increments before reading.
		nextDirectoryCal.add(Calendar.HOUR_OF_DAY, -1);
		
		Logger.instance().debug1(module + " since=" + debugSdf.format(dSince) + ", next=" 
			+ debugSdf.format(nextDirectoryCal.getTime()));
		
		if (networkLists != null)
			rsNetlists.addAll(networkLists);
	}

	/**
	 * Looks up an integer property, ignoring case of the name.
	 *
	 * @param props the properties to search
	 * @param name the property name
	 * @param defaultValue value returned when the property is absent
	 * @return the parsed value, or defaultValue if the property is not set
	 * @throws DataSourceException if the property is set but is not an integer
	 */
	private int getIntProperty(Properties props, String name, int defaultValue)
		throws DataSourceException
	{
		String s = PropertiesUtil.getIgnoreCase(props, name);
		if (s == null)
			return defaultValue;
		try { return Integer.parseInt(s.trim()); }
		catch(NumberFormatException ex)
		{
			throw new DataSourceException(module + ": bad " + name + " property '" + s 
				+ "' (must be integer)");
		}
	}

	@Override
	public void close()
	{
		// No resources left open.
	}

	/**
	 * Returns the next message whose file name carries an ID found in one of the network lists.
	 * Files with malformed names, unknown IDs, empty content or read errors are logged and skipped.
	 *
	 * @return the message built from the file content, stamped with the time derived from the file name
	 * @throws DataSourceEndException when no more files are available
	 * @throws UnknownPlatformException if no platform matches the ID and null platforms are not allowed
	 * @throws DataSourceException declared by the overridden method
	 */
	@Override
	public RawMessage getRawMessage() 
		throws DataSourceException
	{
		String filename;
		while ((filename = getNextFile()) != null)
		{
			Logger.instance().debug1(module + " filename '" + filename + "'");
			
			// Parse the ID and date/time from the file name.
			// Positions are 1-based; a position below 1 can never be satisfied, so such
			// files are skipped rather than indexing the array at -1.
			String fields[] = filename.split(urlFieldDelimiter);
			if (urlIdPos < 1 || fields.length < urlIdPos)
			{
				Logger.instance().warning(module + " bad filename in directory '" + filename
					+ "' -- no id field in position " + urlIdPos);
				continue;
			}
			if (urlTimePos < 1 || fields.length < urlTimePos)
			{
				Logger.instance().warning(module + " bad filename in directory '" + filename
					+ "' -- no time field in position " + urlTimePos);
				continue;
			}
			String id = fields[urlIdPos - 1];
			Date fileTime = null;
			try
			{
				fileTime = fnSdf.parse(fields[urlTimePos - 1]);
			}
			catch (ParseException e)
			{
				Logger.instance().warning(module + " filename '" + filename 
					+ "' has unparsable time in position " + urlTimePos);
				continue;
			}
			
			// File contains day hour minute. Construct full time from directory year/month.
			fileTimeCal.setTime(fileTime);
			fileTimeCal.set(Calendar.YEAR, nextDirectoryCal.get(Calendar.YEAR));
			fileTimeCal.set(Calendar.MONTH, nextDirectoryCal.get(Calendar.MONTH));
			// A file might arrive late, just after midnight. If file hour is 23 and directory hour is 00,
			// then subtract a day to the previous day.
			if (fileTimeCal.get(Calendar.HOUR_OF_DAY) > nextDirectoryCal.get(Calendar.HOUR_OF_DAY))
				fileTimeCal.add(Calendar.DAY_OF_YEAR, -1);
			Logger.instance().debug1("\tparsed fileTime=" + debugSdf.format(fileTime) 
				+ ", corrected fileTime=" + debugSdf.format(fileTimeCal.getTime()));
			fileTime = fileTimeCal.getTime();
			
			// Check to see if this ID is in one of my network lists; if not, continue.
			NetworkList matchingList = null;
			for(NetworkList netlist : rsNetlists)
				if (netlist.getEntry(id) != null)
				{
					matchingList = netlist;
					break;
				}
			if (matchingList == null)
			{
				Logger.instance().debug1(module + " filename '" + filename 
					+ "' skipped because ID '" + id + "' is not in network lists.");
				continue;
			}
			String mediumType = matchingList.transportMediumType;
			
			// Read the file into memory and build RawMessage
			String fileUrl = currentDirUrl 
				+ (currentDirUrl.endsWith("/") ? "" : "/")
				+ filename;
			try
			{
				byte data[] = readUrl(fileUrl);
				if (data.length == 0)
				{
					Logger.instance().debug1(module + " url '" + fileUrl 
						+ "' resulted in an empty file -- skipped.");
					continue;
				}
				RawMessage ret = new RawMessage(data, data.length);
				
				// Set the Performance Measurements
				ret.setPM(GoesPMParser.MESSAGE_TIME, new Variable(fileTime));
				ret.setPM(GoesPMParser.DCP_ADDRESS, new Variable(id));
				ret.setTimeStamp(fileTime);
				ret.setMediumId(id);

				Platform platform = 
					Database.getDb().platformList.getPlatform(mediumType, id, fileTime);
				if (platform != null)
				{
					ret.setPlatform(platform);
					ret.setTransportMedium(platform.getTransportMedium(mediumType));
				}
				else if (!getAllowNullPlatform())
				{
					throw new UnknownPlatformException(module + " " + mediumType + ":" + id);
				}
				
				return ret;
			}
			catch (MalformedURLException ex)
			{
				Logger.instance().warning(module + " bad URL '" + fileUrl + "': " + ex);
				continue;
			}
			catch (IOException ex)
			{
				Logger.instance().warning(module + " Error reading URL '" + fileUrl + "': " + ex);
				continue;
			}
			catch (DatabaseException ex)
			{
				Logger.instance().warning(module + " Error looking up platform for TM " 
						+ mediumType + "':" + id + ": " + ex);
				continue;
			}
			catch (UnknownPlatformException ex)
			{
				// Must reach the caller; keep it out of the generic handler below.
				throw ex;
			}
			catch (Exception ex)
			{
				Logger.instance().warning(module + " Unexpected exception reading URL '" 
						+ fileUrl + "':" + ex);
				continue;
			}
		}
		throw new DataSourceEndException("No more files to read.");
	}
	
	/**
	 * Reads the complete content behind a URL into memory.
	 *
	 * @param fileUrl the URL to read
	 * @return the bytes read, possibly of length zero
	 * @throws IOException if the URL is malformed or cannot be opened, read or closed
	 */
	private static byte[] readUrl(String fileUrl)
		throws IOException
	{
		URL url = new URL(fileUrl);
		try (InputStream in = new BufferedInputStream(url.openStream()))
		{
			ByteArrayOutputStream baos = new ByteArrayOutputStream();
			byte buf[] = new byte[4096];
			int len;
			while ((len = in.read(buf)) != -1)
				baos.write(buf, 0, len);
			return baos.toByteArray();
		}
	}
	
	@Override
	public PropertySpec[] getSupportedProps()
	{
		return myPropSpecs;
	}

	/**
	 * Cycle through the list in memory. If empty read the next directory URL.
	 * Account for directory URLs that may contain no files (skip them).
	 * @return the next link taken from a directory listing, or null when
	 *         readNextDirectory reports that no further directory can be read
	 */
	private String getNextFile()
	{
		while (fileList.isEmpty())
			if (!readNextDirectory())
				return null;
		return fileList.removeFirst();
	}
	
	/**
	 * Increment next Directory Time and build directory URL. Read it into memory
	 * and parse the anchor tags that contain file names. Place these links at the
	 * end of the in-memory file list.
	 * @return false if the until time is passed or the directory could not be read
	 *         (a missing directory before the until time is tolerated and yields true),
	 *         true if a directory was read.
	 */
	private boolean readNextDirectory()
	{
		nextDirectoryCal.add(Calendar.HOUR_OF_DAY, 1);
		Date nextDirectoryTime = nextDirectoryCal.getTime();
		if (nextDirectoryTime.after(dUntil))
			return false;
		
		Logger.instance().debug1(module + " reading directory for time " + debugSdf.format(nextDirectoryTime));
		
		Properties urlProps = new Properties();
		urlProps.setProperty("TZ", urlTimeZone.getID());
		currentDirUrl = EnvExpander.expand(directoryUrl, urlProps, nextDirectoryTime);
		try
		{
			Logger.instance().debug1(module + " reading URL '" + currentDirUrl + "'");
			URL dirUrl = new URL(currentDirUrl);
			
			try (InputStream input = dirUrl.openStream();
				BufferedReader reader = new BufferedReader(new InputStreamReader(input)))
			{
				String line = null;
				while((line = reader.readLine()) != null)
				{
					// Only lines that contain an anchor tag contribute a link.
					Matcher m = HTML_LINK_PATTERN.matcher(line);
					if (m.matches())
					{
						fileList.add(m.group("link"));
					}
				}
			}

			return true;
		}
		catch(MalformedURLException ex)
		{
			Logger.instance().warning(module + " Bad URL '" + currentDirUrl + "': " + ex);
			return false;
		}
		catch(FileNotFoundException ex)
		{
			Logger.instance().warning(module + " FileNotFound reading URL '" + currentDirUrl + "': " + ex);
			// Sometimes the depot skips an hour. Keep going unless we're past the until time.
			return nextDirectoryTime.before(dUntil);
		}
		catch(IOException ex)
		{
			Logger.instance().warning(module + " Error reading URL '" + currentDirUrl + "': " + ex);
		}
		catch(Exception ex)
		{
			String msg = module + " Unexpected exception reading URL '" + currentDirUrl + "': " + ex;
			Logger.instance().warning(msg);
			PrintStream ps = Logger.instance().getLogOutput();
			if (ps != null)
				ex.printStackTrace(ps);
			
		}
		
		return false;
	}
	
	@Override
	public boolean supportsTimeRanges()
	{
		return true;
	}


}