
aima.core.probability.mdp.search.PolicyIteration Maven / Gradle / Ivy


AIMA-Java core algorithms from the book Artificial Intelligence: A Modern Approach, 3rd Edition.

package aima.core.probability.mdp.search;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import aima.core.agent.Action;
import aima.core.probability.mdp.MarkovDecisionProcess;
import aima.core.probability.mdp.Policy;
import aima.core.probability.mdp.PolicyEvaluation;
import aima.core.probability.mdp.impl.LookupPolicy;
import aima.core.util.Util;

/**
 * Artificial Intelligence A Modern Approach (3rd Edition): page 657.
 *
 *
 * function POLICY-ITERATION(mdp) returns a policy
 *   inputs: mdp, an MDP with states S, actions A(s), transition model P(s' | s, a)
 *   local variables: U, a vector of utilities for states in S, initially zero
 *                    π, a policy vector indexed by state, initially random
 *                    
 *   repeat
 *      U <- POLICY-EVALUATION(π, U, mdp)
 *      unchanged? <- true
 *      for each state s in S do
 *          if max_{a ∈ A(s)} Σ_{s'} P(s'|s,a) U[s'] > Σ_{s'} P(s'|s,π[s]) U[s'] then do
 *             π[s] <- argmax_{a ∈ A(s)} Σ_{s'} P(s'|s,a) U[s']
 *             unchanged? <- false
 *   until unchanged?
 *   return π
 * 
 * Figure 17.7 The policy iteration algorithm for calculating an optimal
 * policy.
 *
 * @param <S>
 *            the state type.
 * @param <A>
 *            the action type.
 *
 * @author Ciaran O'Reilly
 * @author Ravi Mohan
 */
public class PolicyIteration<S, A extends Action> {
	private PolicyEvaluation<S, A> policyEvaluation = null;

	/**
	 * Constructor.
	 *
	 * @param policyEvaluation
	 *            the policy evaluation function to use.
	 */
	public PolicyIteration(PolicyEvaluation<S, A> policyEvaluation) {
		this.policyEvaluation = policyEvaluation;
	}

	// function POLICY-ITERATION(mdp) returns a policy
	/**
	 * The policy iteration algorithm for calculating an optimal policy.
	 *
	 * @param mdp
	 *            an MDP with states S, actions A(s), transition model P(s'|s,a)
	 * @return an optimal policy
	 */
	public Policy<S, A> policyIteration(MarkovDecisionProcess<S, A> mdp) {
		// local variables: U, a vector of utilities for states in S,
		// initially zero
		Map<S, Double> U = Util.create(mdp.states(), new Double(0));
		// π, a policy vector indexed by state, initially random
		Map<S, A> pi = initialPolicyVector(mdp);
		boolean unchanged;

		// repeat
		do {
			// U <- POLICY-EVALUATION(π, U, mdp)
			U = policyEvaluation.evaluate(pi, U, mdp);
			// unchanged? <- true
			unchanged = true;
			// for each state s in S do
			for (S s : mdp.states()) {
				// calculate: max_{a ∈ A(s)} Σ_{s'} P(s'|s,a) U[s']
				double aMax = Double.NEGATIVE_INFINITY, piVal = 0;
				A aArgmax = pi.get(s);
				for (A a : mdp.actions(s)) {
					double aSum = 0;
					for (S sDelta : mdp.states()) {
						aSum += mdp.transitionProbability(sDelta, s, a)
								* U.get(sDelta);
					}
					if (aSum > aMax) {
						aMax = aSum;
						aArgmax = a;
					}
					// track: Σ_{s'} P(s'|s,π[s]) U[s']
					if (a.equals(pi.get(s))) {
						piVal = aSum;
					}
				}
				// if max_{a ∈ A(s)} Σ_{s'} P(s'|s,a) U[s']
				//    > Σ_{s'} P(s'|s,π[s]) U[s'] then do
				if (aMax > piVal) {
					// π[s] <- argmax_{a ∈ A(s)} Σ_{s'} P(s'|s,a) U[s']
					pi.put(s, aArgmax);
					// unchanged? <- false
					unchanged = false;
				}
			}
			// until unchanged?
		} while (!unchanged);

		// return π
		return new LookupPolicy<S, A>(pi);
	}

	/**
	 * Create a policy vector indexed by state, initially random.
	 *
	 * @param mdp
	 *            an MDP with states S, actions A(s), transition model P(s'|s,a)
	 * @return a policy vector indexed by state, initially random.
	 */
	public static <S, A extends Action> Map<S, A> initialPolicyVector(
			MarkovDecisionProcess<S, A> mdp) {
		Map<S, A> pi = new LinkedHashMap<S, A>();
		List<A> actions = new ArrayList<A>();
		for (S s : mdp.states()) {
			actions.clear();
			actions.addAll(mdp.actions(s));
			// Handle terminal states (i.e. no actions).
			if (actions.size() > 0) {
				pi.put(s, Util.selectRandomlyFromList(actions));
			}
		}
		return pi;
	}
}
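
A minimal usage sketch for the class above is shown below. It assumes aima-core's ModifiedPolicyEvaluation (in aima.core.probability.mdp.impl) as the PolicyEvaluation implementation, and it leaves MDP construction to a hypothetical buildMdp() helper that is not part of the library; verify class names and constructor arguments against the aima-core version you depend on before reusing this.

package aima.example;

import aima.core.agent.Action;
import aima.core.probability.mdp.MarkovDecisionProcess;
import aima.core.probability.mdp.Policy;
import aima.core.probability.mdp.impl.ModifiedPolicyEvaluation;
import aima.core.probability.mdp.search.PolicyIteration;

public class PolicyIterationExample {

	/**
	 * Compute a policy for an arbitrary MDP using policy iteration.
	 */
	public static <S, A extends Action> Policy<S, A> solve(
			MarkovDecisionProcess<S, A> mdp) {
		// Assumption: ModifiedPolicyEvaluation(k, gamma) runs k simplified
		// Bellman updates per evaluation step with discount factor gamma.
		PolicyIteration<S, A> pi = new PolicyIteration<S, A>(
				new ModifiedPolicyEvaluation<S, A>(50, 1.0));
		return pi.policyIteration(mdp);
	}

	public static void main(String[] args) {
		// buildMdp() is a hypothetical placeholder for however your
		// application constructs its MDP (states, actions, transition
		// model, rewards):
		//
		// MarkovDecisionProcess<MyState, MyAction> mdp = buildMdp();
		// Policy<MyState, MyAction> optimal = solve(mdp);
		// System.out.println(optimal.action(someState));
	}
}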



