aima.core.probability.mdp.search.PolicyIteration Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aima-core Show documentation
Show all versions of aima-core Show documentation
AIMA-Java Core Algorithms from the book Artificial Intelligence a Modern Approach 3rd Ed.
The newest version!
package aima.core.probability.mdp.search;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import aima.core.agent.Action;
import aima.core.probability.mdp.MarkovDecisionProcess;
import aima.core.probability.mdp.Policy;
import aima.core.probability.mdp.PolicyEvaluation;
import aima.core.probability.mdp.impl.LookupPolicy;
import aima.core.util.Util;
/**
* Artificial Intelligence A Modern Approach (3rd Edition): page 657.
*
*
*
* function POLICY-ITERATION(mdp) returns a policy
* inputs: mdp, an MDP with states S, actions A(s), transition model P(s' | s, a)
* local variables: U, a vector of utilities for states in S, initially zero
* π, a policy vector indexed by state, initially random
*
* repeat
* U <- POLICY-EVALUATION(π, U, mdp)
* unchanged? <- true
* for each state s in S do
* if maxa ∈ A(s) Σs'P(s'|s,a)U[s'] > Σs'P(s'|s,π[s])U[s'] then do
* π[s] <- argmaxa ∈ A(s) Σs'P(s'|s,a)U[s']
* unchanged? <- false
* until unchanged?
* return π
*
*
* Figure 17.7 The policy iteration algorithm for calculating an optimal policy.
*
 * @param <S>
 *            the state type.
 * @param <A>
 *            the action type.
*
* @author Ciaran O'Reilly
* @author Ravi Mohan
*
*/
public class PolicyIteration<S, A extends Action> {
	// The POLICY-EVALUATION function invoked inside the repeat loop.
	private PolicyEvaluation<S, A> policyEvaluation = null;

	/**
	 * Constructor.
	 *
	 * @param policyEvaluation
	 *            the policy evaluation function to use.
	 */
	public PolicyIteration(PolicyEvaluation<S, A> policyEvaluation) {
		this.policyEvaluation = policyEvaluation;
	}

	// function POLICY-ITERATION(mdp) returns a policy
	/**
	 * The policy iteration algorithm for calculating an optimal policy
	 * (AIMA3e, Figure 17.7, page 657).
	 *
	 * @param mdp
	 *            an MDP with states S, actions A(s), transition model P(s'|s,a)
	 * @return an optimal policy
	 */
	public Policy<S, A> policyIteration(MarkovDecisionProcess<S, A> mdp) {
		// local variables: U, a vector of utilities for states in S, initially
		// zero
		Map<S, Double> U = Util.create(mdp.states(), 0.0);
		// π, a policy vector indexed by state, initially random
		Map<S, A> pi = initialPolicyVector(mdp);
		boolean unchanged;
		// repeat
		do {
			// U <- POLICY-EVALUATION(π, U, mdp)
			U = policyEvaluation.evaluate(pi, U, mdp);
			// unchanged? <- true
			unchanged = true;
			// for each state s in S do
			for (S s : mdp.states()) {
				// calculate:
				// max_a ∈ A(s) Σ_s' P(s'|s,a)U[s']
				double aMax = Double.NEGATIVE_INFINITY, piVal = 0;
				A aArgmax = pi.get(s);
				for (A a : mdp.actions(s)) {
					double aSum = 0;
					for (S sDelta : mdp.states()) {
						aSum += mdp.transitionProbability(sDelta, s, a)
								* U.get(sDelta);
					}
					if (aSum > aMax) {
						aMax = aSum;
						aArgmax = a;
					}
					// track:
					// Σ_s' P(s'|s,π[s])U[s']
					if (a.equals(pi.get(s))) {
						piVal = aSum;
					}
				}
				// if max_a ∈ A(s) Σ_s' P(s'|s,a)U[s']
				// > Σ_s' P(s'|s,π[s])U[s'] then do
				if (aMax > piVal) {
					// π[s] <- argmax_a ∈ A(s) Σ_s' P(s'|s,a)U[s']
					pi.put(s, aArgmax);
					// unchanged? <- false
					unchanged = false;
				}
			}
			// until unchanged?
		} while (!unchanged);
		// return π
		return new LookupPolicy<S, A>(pi);
	}

	/**
	 * Create a policy vector indexed by state, initially random. States with
	 * no applicable actions (terminal states) receive no entry.
	 *
	 * @param mdp
	 *            an MDP with states S, actions A(s), transition model P(s'|s,a)
	 * @param <S>
	 *            the state type.
	 * @param <A>
	 *            the action type.
	 * @return a policy vector indexed by state, initially random.
	 */
	public static <S, A extends Action> Map<S, A> initialPolicyVector(
			MarkovDecisionProcess<S, A> mdp) {
		Map<S, A> pi = new LinkedHashMap<S, A>();
		List<A> actions = new ArrayList<A>();
		for (S s : mdp.states()) {
			actions.clear();
			actions.addAll(mdp.actions(s));
			// Handle terminal states (i.e. no actions).
			if (actions.size() > 0) {
				pi.put(s, Util.selectRandomlyFromList(actions));
			}
		}
		return pi;
	}
}