// aima.core.probability.mdp.search.ValueIteration
package aima.core.probability.mdp.search;
import java.util.Map;
import java.util.Set;
import aima.core.agent.Action;
import aima.core.probability.mdp.MarkovDecisionProcess;
import aima.core.util.Util;
/**
* Artificial Intelligence A Modern Approach (3rd Edition): page 653.
*
*
*
* function VALUE-ITERATION(mdp, ε) returns a utility function
* inputs: mdp, an MDP with states S, actions A(s), transition model P(s' | s, a),
* rewards R(s), discount γ
* ε the maximum error allowed in the utility of any state
* local variables: U, U', vectors of utilities for states in S, initially zero
* δ the maximum change in the utility of any state in an iteration
*
* repeat
* U <- U'; δ <- 0
* for each state s in S do
 * U'[s] <- R(s) + γ max_{a ∈ A(s)} Σ_{s'} P(s' | s, a) U[s']
* if |U'[s] - U[s]| > δ then δ <- |U'[s] - U[s]|
* until δ < ε(1 - γ)/γ
* return U
*
*
* Figure 17.4 The value iteration algorithm for calculating utilities of
* states. The termination condition is from Equation (17.8):
*
*
 * if ||U_{i+1} - U_i|| < ε(1 - γ)/γ then ||U_{i+1} - U|| < ε
*
*
 * @param <S>
 *            the state type.
 * @param <A>
 *            the action type.
*
* @author Ciaran O'Reilly
* @author Ravi Mohan
*
*/
public class ValueIteration<S, A extends Action> {
	// discount γ to be used.
	private final double gamma;

	/**
	 * Constructor.
	 *
	 * @param gamma
	 *            discount γ to be used; must lie in the interval (0, 1].
	 * @throws IllegalArgumentException
	 *             if gamma is not in (0, 1].
	 */
	public ValueIteration(double gamma) {
		if (gamma > 1.0 || gamma <= 0.0) {
			throw new IllegalArgumentException("Gamma must be > 0 and <= 1.0");
		}
		this.gamma = gamma;
	}

	// function VALUE-ITERATION(mdp, ε) returns a utility function
	/**
	 * The value iteration algorithm for calculating the utility of states.
	 *
	 * @param mdp
	 *            an MDP with states S, actions A(s), transition model
	 *            P(s' | s, a), rewards R(s)
	 * @param epsilon
	 *            the maximum error allowed in the utility of any state
	 * @return a vector of utilities for states in S
	 */
	public Map<S, Double> valueIteration(MarkovDecisionProcess<S, A> mdp,
			double epsilon) {
		//
		// local variables: U, U', vectors of utilities for states in S,
		// initially zero
		Map<S, Double> U = Util.create(mdp.states(), 0.0);
		Map<S, Double> Udelta = Util.create(mdp.states(), 0.0);
		// δ the maximum change in the utility of any state in an
		// iteration
		double delta = 0;
		// Note: Just calculate this once for efficiency purposes:
		// ε(1 - γ)/γ
		double minDelta = epsilon * (1 - gamma) / gamma;
		// repeat
		do {
			// U <- U'; δ <- 0
			U.putAll(Udelta);
			delta = 0;
			// for each state s in S do
			for (S s : mdp.states()) {
				// max_{a ∈ A(s)}
				Set<A> actions = mdp.actions(s);
				// Handle terminal states (i.e. no actions): use 0 so that
				// U'[s] = R(s) for a terminal state.
				double aMax = 0;
				if (actions.size() > 0) {
					aMax = Double.NEGATIVE_INFINITY;
				}
				for (A a : actions) {
					// Σ_{s'} P(s' | s, a) U[s']
					double aSum = 0;
					for (S sDelta : mdp.states()) {
						aSum += mdp.transitionProbability(sDelta, s, a)
								* U.get(sDelta);
					}
					if (aSum > aMax) {
						aMax = aSum;
					}
				}
				// U'[s] <- R(s) + γ max_{a ∈ A(s)} Σ_{s'} P(s' | s, a) U[s']
				Udelta.put(s, mdp.reward(s) + gamma * aMax);
				// if |U'[s] - U[s]| > δ then δ <- |U'[s] - U[s]|
				double aDiff = Math.abs(Udelta.get(s) - U.get(s));
				if (aDiff > delta) {
					delta = aDiff;
				}
			}
			// until δ < ε(1 - γ)/γ
		} while (delta > minDelta);
		// return U
		return U;
	}
}