Experiments on Overestimation
[Figure: normalized score]
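Before looking at the code, the bias that motivates Double DQN can be reproduced in a few lines: when Q-value estimates are noisy, taking the maximum over them is biased upward, while using one set of estimates to select the action and an independent set to evaluate it is not. The snippet below is a minimal illustrative sketch; the action count, true values, and noise level are assumptions chosen for illustration, not values from the experiment above.

# Illustration: overestimation of the max over noisy Q-estimates (assumed toy setup)
import numpy as np

rng = np.random.default_rng(0)
n_actions, noise_std, trials = 10, 1.0, 100000
true_q = np.zeros(n_actions)  # all actions equally good, true value 0

# Single estimator (vanilla Q-learning target): max over one set of noisy estimates
single = np.max(true_q + rng.normal(0, noise_std, (trials, n_actions)), axis=1)

# Double estimator (Double Q-learning): one set selects, an independent set evaluates
select = true_q + rng.normal(0, noise_std, (trials, n_actions))
evaluate = true_q + rng.normal(0, noise_std, (trials, n_actions))
double = evaluate[np.arange(trials), np.argmax(select, axis=1)]

print("true max value:        ", true_q.max())   # 0.0
print("single estimator mean: ", single.mean())  # well above 0: overestimation
print("double estimator mean: ", double.mean())  # close to 0: unbiased on average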
# Import required libraries
import os
import random
from collections import deque

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
# Q-network: a small MLP mapping a state to per-action Q-values
class QNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)
# Experience replay buffer
class ReplayBuffer:
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        samples = random.sample(self.buffer, batch_size)
        return map(list, zip(*samples))

    def __len__(self):
        return len(self.buffer)
# Double DQN agent
class DoubleDQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.network = QNetwork(state_size, action_size)
        self.target_network = QNetwork(state_size, action_size)
        # Start the target network with the same weights as the online network
        self.target_network.load_state_dict(self.network.state_dict())
        self.buffer = ReplayBuffer(10000)
        self.optimizer = optim.Adam(self.network.parameters())
        self.criterion = nn.MSELoss()

    def get_action(self, state, epsilon):
        # Epsilon-greedy exploration
        if np.random.random() < epsilon:
            return np.random.randint(self.action_size)
        else:
            state = torch.FloatTensor(state)
            q_values = self.network(state)
            return torch.argmax(q_values).item()
    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.buffer.sample(batch_size)
        states = torch.FloatTensor(np.array(states))
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(np.array(next_states))
        dones = torch.FloatTensor(dones)
        current_q = self.network(states).gather(1, actions.unsqueeze(1))
        # Double DQN target: the online network selects the next action and the
        # target network evaluates it, which reduces overestimation of Q-values
        with torch.no_grad():
            next_actions = self.network(next_states).argmax(1, keepdim=True)
            next_q = self.target_network(next_states).gather(1, next_actions).squeeze(1)
            target_q = rewards + (1 - dones) * 0.99 * next_q
        loss = self.criterion(current_q, target_q.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
    def update_target_network(self):
        self.target_network.load_state_dict(self.network.state_dict())

    def save_model(self, path):
        torch.save(self.network.state_dict(), path)

    def load_model(self, path):
        if os.path.exists(path):
            state_dict = torch.load(path)
            self.network.load_state_dict(state_dict)
            self.target_network.load_state_dict(state_dict)
# Training loop
def train(agent, env, episodes, batch_size, epsilon):
    for episode in range(episodes):
        state = env.reset()
        done = False
        episode_reward = 0
        while not done:
            action = agent.get_action(state, epsilon)
            next_state, reward, done, _ = env.step(action)
            agent.buffer.add(state, action, reward, next_state, done)
            state = next_state
            episode_reward += reward
            if len(agent.buffer) > batch_size:
                agent.update(batch_size)
        # Periodically sync the target network and checkpoint the model
        if episode % 10 == 0:
            agent.update_target_network()
            agent.save_model(f"model_episode_{episode}.pth")
        print(f"Episode: {episode}, Reward: {episode_reward}, Epsilon: {epsilon}")
# Entry point: train a Double DQN agent on CartPole-v1
def main():
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DoubleDQNAgent(state_size, action_size)
    # Resume from a pretrained checkpoint if one exists
    agent.load_model("pretrained_model.pth")
    train(agent, env, episodes=1000, batch_size=64, epsilon=0.1)

if __name__ == "__main__":
    main()
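For comparison, the only difference between a vanilla DQN update and the Double DQN update in update() above is how the bootstrap target is formed. The two helper functions below are an illustrative restatement and are not part of the agent; they assume the same batch tensors, online and target networks, and discount factor 0.99 used above.

# Side-by-side target rules (illustrative helpers, not used by DoubleDQNAgent)
def dqn_target(target_network, rewards, next_states, dones, gamma=0.99):
    # Vanilla DQN: the target network both selects and evaluates the next action,
    # so its estimation noise is maximized over, which causes overestimation
    next_q = target_network(next_states).max(1)[0]
    return rewards + (1 - dones) * gamma * next_q

def double_dqn_target(network, target_network, rewards, next_states, dones, gamma=0.99):
    # Double DQN: the online network selects the action, the target network
    # evaluates it, decoupling action selection from value estimation
    next_actions = network(next_states).argmax(1, keepdim=True)
    next_q = target_network(next_states).gather(1, next_actions).squeeze(1)
    return rewards + (1 - dones) * gamma * next_q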