package burlap.behavior.singleagent.planning.stochastic.rtdp;
import burlap.behavior.policy.GreedyQPolicy;
import burlap.behavior.policy.Policy;
import burlap.behavior.policy.PolicyUtils;
import burlap.behavior.singleagent.Episode;
import burlap.behavior.singleagent.planning.Planner;
import burlap.behavior.singleagent.planning.stochastic.DynamicProgramming;
import burlap.behavior.valuefunction.ConstantValueFunction;
import burlap.behavior.valuefunction.ValueFunction;
import burlap.debugtools.DPrint;
import burlap.mdp.core.action.Action;
import burlap.mdp.core.state.State;
import burlap.mdp.singleagent.SADomain;
import burlap.statehashing.HashableState;
import burlap.statehashing.HashableStateFactory;
import java.util.LinkedList;
import java.util.List;
/**
* Implementation of Real-time dynamic programming [1]. The planning algorithm uses a Q-value derived policy to sample rollouts in the domain. During
* each step of a rollout, the current state's value is updated with the Bellman operator and the action for the current state
* is selected with a greedy Q policy in which ties are broken randomly. Alternatively, this algorithm may be set to batch mode. In batch mode,
* all Bellman updates are stalled until the rollout
* is complete, after which a Bellman update is performed on each visited state
* in reverse order.
*
* To ensure optimality, an optimistic value function initialization should be used. RTDP is particularly effective when a good value function initialization
* (e.g., a tight admissible heuristic) can be provided.
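* <p>
* A minimal usage sketch (a hypothetical illustration; the domain, initial state, and hashing factory are
* assumed to be constructed elsewhere for the specific problem being solved):
* <pre>{@code
* SADomain domain = ...;            //the problem domain
* State s0 = ...;                   //the initial state from which to plan
* HashableStateFactory hf = ...;    //state hashing used for tabular value function storage
*
* //discount 0.99; optimistic vInit of 100; at most 500 rollouts; 0.001 max delta; rollout depth of 100
* RTDP planner = new RTDP(domain, 0.99, hf, 100., 500, 0.001, 100);
* //planner.toggleBatchMode(true);  //optionally delay Bellman updates until each rollout completes
* GreedyQPolicy p = planner.planFromState(s0);
* }</pre>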
*
* 1. Barto, Andrew G., Steven J. Bradtke, and Satinder P. Singh. "Learning to act using real-time dynamic programming." Artificial Intelligence 72.1 (1995): 81-138.
*
* @author James MacGlashan
*
*/
public class RTDP extends DynamicProgramming implements Planner{
/**
* The policy to use for episode rollouts
*/
protected Policy rollOutPolicy;
/**
* The number of rollouts to perform when planning is started, unless the value function change becomes small enough to terminate early.
*/
protected int numRollouts;
/**
* When the maximum change in the value function from a rollout is smaller than this value (for enough consecutive rollouts), planning will terminate.
*/
protected double maxDelta;
/**
* The maximum depth/length of a rollout before it is terminated and Bellman updates are performed.
*/
protected int maxDepth;
/**
* RTDP will be declared "converged" if there are this many consecutive policy rollouts in which the value function change is smaller than the maxDelta value.
* The default value is 10.
*/
protected int minNumRolloutsWithSmallValueChange = 10;
/**
* If set to use batch mode, Bellman updates will be stalled until a rollout is complete and then run in reverse order.
*/
protected boolean useBatch = false;
/**
* Stores the number of Bellman updates made across all planning.
*/
protected int numberOfBellmanUpdates = 0;
/**
* Initializes. The value function will be initialized to vInit by default everywhere and will use a greedy policy with random tie breaks
* for performing rollouts. Use the {@link #setValueFunctionInitialization(burlap.behavior.valuefunction.ValueFunction)} method
* to change the value function initialization and the {@link #setRollOutPolicy(Policy)} method to change the rollout policy to something else. vInit
* should be set to something optimistic like VMax to ensure convergence.
* @param domain the domain in which to plan
* @param gamma the discount factor
* @param hashingFactory the state hashing factory to use
* @param vInit the value to which the value function for all states will be initialized
* @param numRollouts the number of rollouts to perform when planning is started.
* @param maxDelta when the maximum change in the value function from a rollout is smaller than this value, planning will terminate.
* @param maxDepth the maximum depth/length of a rollout before it is terminated and Bellman updates are performed.
*/
public RTDP(SADomain domain, double gamma, HashableStateFactory hashingFactory, double vInit, int numRollouts, double maxDelta, int maxDepth){
this.DPPInit(domain, gamma, hashingFactory);
this.numRollouts = numRollouts;
this.maxDelta = maxDelta;
this.maxDepth = maxDepth;
this.rollOutPolicy = new GreedyQPolicy(this);
this.valueInitializer = new ConstantValueFunction(vInit);
}
/**
* Initializes. The value function will be initialized to vInit by default everywhere and will use a greedy policy with random tie breaks
* for performing rollouts. Use the {@link #setValueFunctionInitialization(burlap.behavior.valuefunction.ValueFunction)} method
* to change the value function initialization and the {@link #setRollOutPolicy(Policy)} method to change the rollout policy to something else. vInit
* should be set to something optimistic like VMax to ensure convergence.
* @param domain the domain in which to plan
* @param gamma the discount factor
* @param hashingFactory the state hashing factory to use
* @param vInit the object which defines how the value function will be initialized for each individual state.
* @param numRollouts the number of rollouts to perform when planning is started.
* @param maxDelta when the maximum change in the value function from a rollout is smaller than this value, planning will terminate.
* @param maxDepth the maximum depth/length of a rollout before it is terminated and Bellman updates are performed.
*/
public RTDP(SADomain domain, double gamma, HashableStateFactory hashingFactory, ValueFunction vInit, int numRollouts, double maxDelta, int maxDepth){
this.DPPInit(domain, gamma, hashingFactory);
this.numRollouts = numRollouts;
this.maxDelta = maxDelta;
this.maxDepth = maxDepth;
this.rollOutPolicy = new GreedyQPolicy(this);
this.valueInitializer = vInit;
}
/**
* Sets the number of rollouts to perform when planning is started (unless the value function delta is small enough).
* @param p the number of rollouts (passes)
*/
public void setNumPasses(int p){
this.numRollouts = p;
}
/**
* Sets the value function change threshold: when the maximum state value change in a rollout falls below this value (for enough consecutive rollouts), planning terminates.
* @param delta the value function change threshold
*/
public void setMaxDelta(double delta){
this.maxDelta = delta;
}
/**
* Sets the rollout policy to use.
* @param p the rollout policy to use
*/
public void setRollOutPolicy(Policy p){
this.rollOutPolicy = p;
}
/**
* Sets the maximum depth of a rollout before it is prematurely terminated and the value function is updated.
* @param d the maximum depth of a rollout.
*/
public void setMaxDynamicDepth(int d){
this.maxDepth = d;
}
/**
* Sets the minimum number of consecutive rollouts with a value function change less than the maxDelta value that will cause RTDP
* to stop.
* @param nRollouts the minimum number of consecutive rollouts required.
*/
public void setMinNumRolloutsWithSmallValueChange(int nRollouts){
this.minNumRolloutsWithSmallValueChange = nRollouts;
}
/**
* When batch mode is set, Bellman updates will be stalled until a rollout is complete and then run in reverse order.
* @param useBatch whether to use batch mode RTDP.
*/
public void toggleBatchMode(boolean useBatch){
this.useBatch = useBatch;
}
/**
* Returns the total number of Bellman updates across all planning
* @return the total number of Bellman updates across all planning
*/
public int getNumberOfBellmanUpdates(){
return this.numberOfBellmanUpdates;
}
/**
* Plans from the input state and then returns a {@link burlap.behavior.policy.GreedyQPolicy} that greedily
* selects the action with the highest Q-value and breaks ties uniformly randomly.
* @param initialState the initial state of the planning problem
* @return a {@link burlap.behavior.policy.GreedyQPolicy}.
*/
@Override
public GreedyQPolicy planFromState(State initialState) {
if(!useBatch){
this.normalRTDP(initialState);
}
else{
this.batchRTDP(initialState);
}
return new GreedyQPolicy(this);
}
/**
* Runs normal RTDP in which Bellman updates are performed after each action selection.
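* The update applied at each visited state s can be written (in standard MDP notation, where T, R, and A(s)
* denote the domain's transition probabilities, reward function, and applicable actions; these are notational
* symbols only, not members of this class) as:
* <pre>
* V(s)  &lt;-  max_{a in A(s)}  sum_{s'} T(s, a, s') * [ R(s, a, s') + gamma * V(s') ]
* </pre>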
* @param initialState the initial state from which to plan
*/
protected void normalRTDP(State initialState){
int totalStates = 0;
int consecutiveSmallDeltas = 0;
for(int i = 0; i < numRollouts; i++){
State curState = initialState;
int nSteps = 0;
double delta = 0;
while(!model.terminal(curState) && nSteps < this.maxDepth){
HashableState sh = this.hashingFactory.hashState(curState);
//select an action
Action ga = this.rollOutPolicy.action(curState);
//update this state's value
double curV = this.value(sh);
double nV = this.performBellmanUpdateOn(sh);
delta = Math.max(Math.abs(nV - curV), delta);
this.numberOfBellmanUpdates++;
//take the action
curState = model.sample(curState, ga).op;
nSteps++;
}
totalStates += nSteps;
DPrint.cl(debugCode, "Pass: " + i + "; Num states: " + nSteps + " (total: " + totalStates + ")");
if(delta < this.maxDelta){
consecutiveSmallDeltas++;
if(consecutiveSmallDeltas >= this.minNumRolloutsWithSmallValueChange){
break;
}
}
else{
consecutiveSmallDeltas = 0;
}
}
}
/**
* Performs Bellman updates only after a rollout is complete, applying them in reverse visitation order.
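* For example, if a rollout visits s_0, s_1, ..., s_n, the states are updated in the order s_n, ..., s_0,
* so value information from the end of the trajectory can propagate back toward the initial state within a
* single batch.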
* @param initialState the initial state from which to plan
*/
protected void batchRTDP(State initialState){
int totalStates = 0;
int consecutiveSmallDeltas = 0;
for(int i = 0; i < numRollouts; i++){
Episode ea = PolicyUtils.rollout(rollOutPolicy, initialState, model, maxDepth);
LinkedList<HashableState> orderedStates = new LinkedList<HashableState>();
for(State s : ea.stateSequence){
orderedStates.addFirst(this.stateHash(s));
}
double delta = this.performOrderedBellmanUpdates(orderedStates);
totalStates += orderedStates.size();
DPrint.cl(debugCode, "Pass: " + i + "; Num states: " + orderedStates.size() + " (total: " + totalStates + ")");
if(delta < this.maxDelta){
consecutiveSmallDeltas++;
if(consecutiveSmallDeltas >= this.minNumRolloutsWithSmallValueChange){
break;
}
}
else{
consecutiveSmallDeltas = 0;
}
}
}
/**
* Performs ordered Bellman updates on the list of (hashed) states provided to it.
* @param states the ordered list of states on which to perform Bellman updates.
* @return the maximum change in the value function for the given states
*/
protected double performOrderedBellmanUpdates(List<HashableState> states){
double delta = 0.;
for(HashableState sh : states){
double v = this.value(sh);
double maxQ = this.performBellmanUpdateOn(sh);
delta = Math.max(Math.abs(maxQ - v), delta);
this.numberOfBellmanUpdates++;
}
return delta;
}
}