算法描述
import numpy as np
import pandas as pd
import time
# --- Environment -----------------------------------------------------------
# Number of states in the 2-dimensional world
# (presumably a 5 x 5 grid flattened to indices 0..24 -- confirm).
N_STATES = 25
# Actions available to the agent; also used as the table column labels.
ACTIONS = ['left', 'right', 'up', 'down']

# --- Learning hyper-parameters ---------------------------------------------
# Greedy-policy threshold: choose_action acts non-greedily whenever a
# uniform random draw exceeds this value.
EPSILON = 0.7
# Learning rate.
ALPHA = 0.8
# Discount factor.
GAMMA = 0.9

# --- Run control -----------------------------------------------------------
# Maximum number of episodes.
MAX_EPISODES = 1000
# Refresh time for one move (presumably seconds -- confirm where it is used).
FRESH_TIME = 1e-5
def build_q_table(n_states: int, actions) -> pd.DataFrame:
    """Create a zero-initialised Q-table.

    Args:
        n_states: Number of rows (one per state).
        actions: Sequence of action names; each becomes a column label.

    Returns:
        A DataFrame of shape ``(n_states, len(actions))`` filled with 0.0,
        with the default integer row index and ``actions`` as columns.
    """
    # Every state-action value starts at zero.
    initial_values = np.zeros((n_states, len(actions)))
    return pd.DataFrame(initial_values, columns=actions)
def build_e_table(n_states: int, actions) -> pd.DataFrame:
    """Create a zero-initialised table with one row per state.

    Presumably this is the eligibility-trace table that accompanies the
    Q-table (inferred from the name -- confirm against the training loop).

    Args:
        n_states: Number of rows (one per state).
        actions: Sequence of action names; each becomes a column label.

    Returns:
        A DataFrame of shape ``(n_states, len(actions))`` filled with 0.0,
        with the default integer row index and ``actions`` as columns.
    """
    # All entries start at zero.
    initial_values = np.zeros((n_states, len(actions)))
    return pd.DataFrame(initial_values, columns=actions)
def choose_action(state, q_table):
state_actions = q_table.iloc[state, :]
if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()): # act non-greedy or state-action have no value
if state==0:
action_name=np.random.choice(['right','down'])
elif state>0 and state20 and state0 and state20 and state
关注
打赏