aima.core.probability.mdp.search.PolicyIteration Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aima-core Show documentation
Show all versions of aima-core Show documentation
AIMA-Java Core Algorithms from the book Artificial Intelligence a Modern Approach 3rd Ed.
The newest version!
package aima.core.probability.mdp.search;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import aima.core.agent.Action;
import aima.core.probability.mdp.MarkovDecisionProcess;
import aima.core.probability.mdp.Policy;
import aima.core.probability.mdp.PolicyEvaluation;
import aima.core.probability.mdp.impl.LookupPolicy;
import aima.core.util.Util;
/**
* Artificial Intelligence A Modern Approach (3rd Edition): page 657.
*
*
*
* function POLICY-ITERATION(mdp) returns a policy
* inputs: mdp, an MDP with states S, actions A(s), transition model P(s' | s, a)
* local variables: U, a vector of utilities for states in S, initially zero
* π, a policy vector indexed by state, initially random
*
* repeat
* U <- POLICY-EVALUATION(π, U, mdp)
* unchanged? <- true
* for each state s in S do
* if maxa ∈ A(s) Σs'P(s'|s,a)U[s'] > Σs'P(s'|s,π[s])U[s'] then do
* π[s] <- argmaxa ∈ A(s) Σs'P(s'|s,a)U[s']
* unchanged? <- false
* until unchanged?
* return π
*
*
* Figure 17.7 The policy iteration algorithm for calculating an optimal policy.
*
 * @param <S>
 *            the state type.
 * @param <A>
 *            the action type.
*
* @author Ciaran O'Reilly
* @author Ravi Mohan
*
*/
public class PolicyIteration<S, A extends Action> {
	// The POLICY-EVALUATION function invoked inside the repeat loop.
	private PolicyEvaluation<S, A> policyEvaluation = null;

	/**
	 * Constructor.
	 *
	 * @param policyEvaluation
	 *            the policy evaluation function to use.
	 */
	public PolicyIteration(PolicyEvaluation<S, A> policyEvaluation) {
		this.policyEvaluation = policyEvaluation;
	}

	// function POLICY-ITERATION(mdp) returns a policy
	/**
	 * The policy iteration algorithm for calculating an optimal policy
	 * (AIMA3e, Figure 17.7, page 657).
	 *
	 * @param mdp
	 *            an MDP with states S, actions A(s), transition model P(s'|s,a)
	 * @return an optimal policy
	 */
	public Policy<S, A> policyIteration(MarkovDecisionProcess<S, A> mdp) {
		// local variables: U, a vector of utilities for states in S, initially
		// zero
		Map<S, Double> U = Util.create(mdp.states(), 0.0);
		// π, a policy vector indexed by state, initially random
		Map<S, A> pi = initialPolicyVector(mdp);
		boolean unchanged;
		// repeat
		do {
			// U <- POLICY-EVALUATION(π, U, mdp)
			U = policyEvaluation.evaluate(pi, U, mdp);
			// unchanged? <- true
			unchanged = true;
			// for each state s in S do
			for (S s : mdp.states()) {
				// calculate:
				// max_a ∈ A(s) Σ_s' P(s'|s,a)U[s']
				double aMax = Double.NEGATIVE_INFINITY, piVal = 0;
				A aArgmax = pi.get(s);
				for (A a : mdp.actions(s)) {
					double aSum = 0;
					for (S sDelta : mdp.states()) {
						aSum += mdp.transitionProbability(sDelta, s, a)
								* U.get(sDelta);
					}
					if (aSum > aMax) {
						aMax = aSum;
						aArgmax = a;
					}
					// track:
					// Σ_s' P(s'|s,π[s])U[s']
					if (a.equals(pi.get(s))) {
						piVal = aSum;
					}
				}
				// if max_a ∈ A(s) Σ_s' P(s'|s,a)U[s']
				// > Σ_s' P(s'|s,π[s])U[s'] then do
				if (aMax > piVal) {
					// π[s] <- argmax_a ∈ A(s) Σ_s' P(s'|s,a)U[s']
					pi.put(s, aArgmax);
					// unchanged? <- false
					unchanged = false;
				}
			}
			// until unchanged?
		} while (!unchanged);
		// return π
		return new LookupPolicy<S, A>(pi);
	}

	/**
	 * Create a policy vector indexed by state, initially random. States with
	 * no applicable actions (terminal states) receive no entry.
	 *
	 * @param mdp
	 *            an MDP with states S, actions A(s), transition model P(s'|s,a)
	 * @param <S>
	 *            the state type.
	 * @param <A>
	 *            the action type.
	 * @return a policy vector indexed by state, initially random.
	 */
	public static <S, A extends Action> Map<S, A> initialPolicyVector(
			MarkovDecisionProcess<S, A> mdp) {
		Map<S, A> pi = new LinkedHashMap<S, A>();
		List<A> actions = new ArrayList<A>();
		for (S s : mdp.states()) {
			actions.clear();
			actions.addAll(mdp.actions(s));
			// Handle terminal states (i.e. no actions).
			if (actions.size() > 0) {
				pi.put(s, Util.selectRandomlyFromList(actions));
			}
		}
		return pi;
	}
}