All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.univocity.api.entity.html.FetchOptions Maven / Gradle / Ivy

There is a newer version: 2.2.1
Show newest version
package com.univocity.api.entity.html;

import com.univocity.api.io.*;

import java.io.*;

/**
 * Configuration class for use in the {@link HtmlElement#fetchResources} methods
 * Setters return `this` instance to enable method chaining during initialization.
 *
 * @author Univocity Software Pty Ltd - [email protected]
 */
public class FetchOptions implements Cloneable {

	private boolean overwriteSharedResources = false;
	private FileProvider sharedResourceDir;
	private boolean flattenDirectoryStructure;
	private DownloadHandler downloadHandler;
	private String baseUri;
	private boolean downloadBlacklistingEnabled = true;
	private long remoteInterval = 5L;

	/**
	 * Default constructor for FetchOptions
	 * Defaults to not flattening directory and accepting any String
	 */
	public FetchOptions() {
		flattenDirectoryStructure = false;
	}

	/**
	 * The current base URI associated with the document whose resources are being fetched. Used to "build" the full
	 * URL used to download a given resource. For example, if a link such as ``
	 * is being processed, and the base URI is set to `http://www.univocity.com`, the download URL will be
	 * `http://www.univocity.com/Images/Icons/garage.svg`
	 *
	 * @return the base URI if available, or an empty {@code String}
	 */
	public String getBaseUri() {
		return baseUri;
	}

	/**
	 * Modifies the current base URI associated with the document whose resources are being fetched. Used to "build" the full
	 * URL used to download a given resource. For example, if a link such as ``
	 * is being processed, and the base URI is set to `http://www.univocity.com`, the download URL will be
	 * `http://www.univocity.com/Images/Icons/garage.svg`
	 *
	 * @param baseUri base URI to use for generating absolute download URL paths.
	 */
	public void setBaseUri(String baseUri) {
		this.baseUri = baseUri;
	}

	/**
	 * Option to flatten the path section of a fetched resource into the new filename.
	 *
	 * A file with the relative path such as `./path/to/resource/image.png`
	 * would normally be saved as a file named `image.png` in the `./path/to/resource/` directory.
	 *
	 * When flattened it will instead be saved as `path_to_resource_image.png` in the `.` directory.
	 *
	 * @param flatten whether to flatten the path of a resource into the saved name.
	 *
	 */
	public void flattenDirectories(boolean flatten) {
		this.flattenDirectoryStructure = flatten;
	}

	/**
	 * Whether or not the resource filenames should be 'flattened'. That is to say have the directories condensed into
	 * the filename so all resource files are in the same directory but all uniquely named.
	 * e.g.
	 *
	 * A file with the relative path such as `./path/to/resource/image.png`
	 * would normally be saved as a file named `image.png` in the `./path/to/resource/` directory.
	 *
	 * When flattened it will instead be saved as `path_to_resource_image.png` in the `.` directory.
	 *
	 * @return whether or not the directory structure in filenames will be flattened when saving resources.
	 */
	public boolean flattenDirectoryStructure() {
		return flattenDirectoryStructure;
	}

	/**
	 * Returns the {@link DownloadHandler} callback to be used by the fetch resources operation.
	 * @return the current download handler
	 */
	public DownloadHandler getDownloadHandler() {
		return downloadHandler;
	}

	/**
	 * Defines a {@link DownloadHandler} to manipulate the downloads performed by the fetch resources operation.
	 * @param downloadHandler the download handler to use
	 */
	public void setDownloadHandler(DownloadHandler downloadHandler) {
		this.downloadHandler = downloadHandler;
	}

	/**
	 * Returns a flag indicating whether resources that have been downloaded and are shared among multiple pages should
	 * be overwritten during a new fetch resources operation.
	 *
	 * @return whether local files that already exist should be overwritten
	 */
	public boolean isOverwriteSharedResources() {
		return overwriteSharedResources;
	}

	/**
	 * Defines whether resources that have been downloaded and are shared among multiple pages should
	 * be overwritten during a new fetch resources operation.
	 *
	 * @param overwriteSharedResources flag indicating that local files that already exist should be overwritten
	 */
	public void setOverwriteSharedResources(boolean overwriteSharedResources) {
		this.overwriteSharedResources = overwriteSharedResources;
	}

	/**
	 * Returns the shared resource directory used to store files referenced by one or more HTML pages and CSS files.
	 * Use it to prevent downloading the same images and CSS files over and over again for each HTML page you want to
	 * store.
	 *
	 * If unspecified (i.e. `null`) a directory named after the HTML file concatenated with the `_files` the suffix will
	 * be created, and all resources used by that HTML will be stored in this directory - which emulates what most browsers
	 * do when their "File -> Save Page As..." action is executed.
	 *
	 * @return the current resource directory, if any.
	 */
	public FileProvider getSharedResourceDir() {
		return sharedResourceDir;
	}


	/**
	 * Defines the shared resource directory used to store files referenced by one or more HTML pages and CSS files.
	 * Use it to prevent downloading the same images and CSS files over and over again for each HTML page you want to
	 * store.
	 *
	 * If unspecified (i.e. `null`) a directory named after the HTML file concatenated with the `_files` the suffix will
	 * be created, and all resources used by that HTML will be stored in this directory - which emulates what most browsers
	 * do when their "File -> Save Page As..." action is executed.
	 *
	 * @param sharedResourceDir the path to a shared resource directory to use.  It can contain system variables enclosed
	 * within { and } (e.g. {@code {user.home}/Downloads"}). Subdirectories that don't exist will be created if required.
	 */
	public void setSharedResourceDir(String sharedResourceDir) {
		if (sharedResourceDir == null) {
			this.sharedResourceDir = null;
		} else {
			this.sharedResourceDir = new FileProvider(sharedResourceDir);
		}
	}

	/**
	 * Defines the shared resource directory used to store files referenced by one or more HTML pages and CSS files.
	 * Use it to prevent downloading the same images and CSS files over and over again for each HTML page you want to
	 * store.
	 *
	 * If unspecified (i.e. `null`) a directory named after the HTML file concatenated with the `_files` the suffix will
	 * be created, and all resources used by that HTML will be stored in this directory - which emulates what most browsers
	 * do when their "File -> Save Page As..." action is executed.
	 *
	 * @param sharedResourceDir the path to a shared resource directory to use.  Subdirectories that don't exist will
	 * be created if required.
	 */
	public void setSharedResourceDir(File sharedResourceDir) {
		if (sharedResourceDir == null) {
			this.sharedResourceDir = null;
		} else {
			this.sharedResourceDir = new FileProvider(sharedResourceDir);
		}
	}

	/**
	 * Indicates whether URLs of resources that resulted in a download failure (such as a 404) should be blacklisted
	 * while the parser is running, so no further attempts to access the same URL will be made. Enabled by default to improve
	 * speed when fetching resources of multiple pages, especially when link following is used.
	 *
	 * @return flag indicating whether bad URLs should be blacklisted
	 */
	public boolean isDownloadBlacklistingEnabled() {
		return downloadBlacklistingEnabled;
	}

	/**
	 * Configures whether URLs of resources that resulted in a download failure (such as a 404) should be blacklisted
	 * while the parser is running, so no further attempts to access the same URL will be made. Enabled by default to improve
	 * speed when fetching resources of multiple pages, especially when link following is used.
	 *
	 * @param downloadBlacklistingEnabled flag indicating whether bad URLs should be blacklisted
	 */
	public void setDownloadBlacklistingEnabled(boolean downloadBlacklistingEnabled) {
		this.downloadBlacklistingEnabled = downloadBlacklistingEnabled;
	}

	/**
	 * Returns the minimum interval of time to wait between each download request. This is required to prevent
	 * submitting multiple requests to the same server at the same time.
	 *
	 * Defaults to 5 ms
	 *
	 * @return the minimum time (in milliseconds) to wait between download requests.
	 *         Values {@code <= 0} mean the internal {@link RateLimiter} is disabled.
	 */
	public final long getRemoteInterval() {
		return remoteInterval;
	}

	/**
	 * Defines the minimum interval of time to wait between each download request. This is required to prevent submitting
	 * multiple requests to the same server at the same time.
	 *
	 * Defaults to 5 ms
	 *
	 * @param remoteInterval minimum time (in milliseconds) to wait between download requests.
	 *                       Any value {@code <= 0} will disable the internal {@link RateLimiter}.
	 */
	public final void setRemoteInterval(long remoteInterval) {
		if (remoteInterval < 0L) {
			remoteInterval = 0L;
		}
		this.remoteInterval = remoteInterval;
	}

	@Override
	protected FetchOptions clone() {
		try {
			return (FetchOptions) super.clone();
		} catch (CloneNotSupportedException e) {
			throw new IllegalStateException(e);
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy