import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

EPISODES = 1000


class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # replay buffer
        # The values below are common starting points for CartPole; tune as needed.
        self.gamma = 0.95           # discount rate
        self.epsilon = 1.0          # exploration rate (start fully exploratory)
        self.epsilon_min = 0.01     # minimum epsilon
        self.epsilon_decay = 0.995  # epsilon decay applied after each replay
        self.learning_rate = 0.001  # learning rate
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()
        self.target_model_update_count = 0

    def _build_model(self):
        # Neural net for the Deep Q-learning model.
        # A small two-hidden-layer MLP with an MSE loss is a common choice for
        # CartPole; adjust the architecture as the question requires.
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))  # one Q-value per action
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def update_target_model(self):
        # Copy weights from the online model to the target model.
        self.target_model.set_weights(self.model.get_weights())

    def act(self, state):
        # Epsilon-greedy action selection: explore with probability epsilon,
        # otherwise act greedily with respect to the predicted Q-values.
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        states, targets_f = [], []
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                # Bootstrap from the target network for stability.
                target = (reward + self.gamma *
                          np.amax(self.target_model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            states.append(state[0])
            targets_f.append(target_f[0])
        history = self.model.fit(np.array(states), np.array(targets_f),
                                 epochs=1, verbose=0)
        loss = history.history['loss'][0]
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        self.target_model_update_count += 1
        if (self.target_model_update_count % 1) == 0:
            self.update_target_model()  # try modifying this update rate
        return loss


if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    done = False
    batch_size = 32

    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for score in range(500):
            # env.render()  # uncomment to see the rendering
            # Pass the state to the agent and collect the action
            action = agent.act(state)
            # Pass the action to the environment and get the next state and reward
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10  # penalize early termination
            next_state = np.reshape(next_state, [1, state_size])
            # Store state, action, reward, next_state and done in memory
            agent.memory.append((state, action, reward, next_state, done))
            # Update the state with the next state
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, score, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                loss = agent.replay(batch_size)