diff --git a/main.py b/main.py
index 36c982d..3031e03 100644
--- a/main.py
+++ b/main.py
@@ -24,7 +24,8 @@
 BATCH_SIZE = 32
 POLICY_UPDATE = 4
 TARGET_UPDATE = 10_000
-WARM_STEPS = 50_000
+# WARM_STEPS = 50_000
+WARM_STEPS = 1
 MAX_STEPS = 50_000_000
 EVALUATE_FREQ = 100_000
diff --git a/utils_drl.py b/utils_drl.py
index 9c74ac7..d2da1d8 100644
--- a/utils_drl.py
+++ b/utils_drl.py
@@ -82,15 +82,13 @@ def learn(self, memory: ReplayMemory, batch_size: int) -> float:
         values = self.__policy(state_batch.float()).gather(1, action_batch)
         values_next = self.__target(next_batch.float()).max(1).values.detach()
         expected = (self.__gamma * values_next.unsqueeze(1)) * \
-            (1. - done_batch) + reward_batch
+            (1. - done_batch) + reward_batch  # target is r if done, otherwise r + gamma * max Q
         loss = F.smooth_l1_loss(values, expected)  # smooth L1 (Huber) loss
         self.__optimizer.zero_grad()  # reset the model's parameter gradients to zero
-        print(loss)
-        loss.backward()  # compute gradients
-        print(loss)
+        loss.backward()  # compute gradients, stored in .grad of the __policy parameters
         for param in self.__policy.parameters():
-            param.grad.data.clamp_(-1, 1)  # clamp all parameters to [-1, 1]
+            param.grad.data.clamp_(-1, 1)  # clamp all gradients to [-1, 1]
         self.__optimizer.step()  # take one optimization step
         return loss.item()
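
For context, the utils_drl.py hunk is the standard DQN temporal-difference update: Q(s, a) from the policy network is regressed toward r + gamma * max_a Q_target(s', a), with the bootstrap term dropped on terminal transitions, followed by gradient clamping. Below is a minimal self-contained sketch of that same update rule under assumed placeholder names (policy_net, target_net, the toy linear networks, and the random batch tensors are illustrative only, not identifiers from this repository):

import torch
import torch.nn as nn
import torch.nn.functional as F

# Illustrative shapes: batch of 32 states with 4 features, 2 discrete actions.
batch_size, state_dim, n_actions, gamma = 32, 4, 2, 0.99

policy_net = nn.Linear(state_dim, n_actions)   # stand-in for the policy network
target_net = nn.Linear(state_dim, n_actions)   # stand-in for the target network
optimizer = torch.optim.Adam(policy_net.parameters(), lr=1e-4)

# Fake replay-buffer sample (random data, purely for illustration).
state_batch = torch.randn(batch_size, state_dim)
next_batch = torch.randn(batch_size, state_dim)
action_batch = torch.randint(n_actions, (batch_size, 1))
reward_batch = torch.randn(batch_size, 1)
done_batch = torch.randint(2, (batch_size, 1)).float()

# Q(s, a) for the actions actually taken.
values = policy_net(state_batch).gather(1, action_batch)

# max_a Q_target(s', a), detached so no gradient flows into the target network.
values_next = target_net(next_batch).max(1).values.detach()

# TD target: r if the episode ended, otherwise r + gamma * max_a Q_target(s', a).
expected = gamma * values_next.unsqueeze(1) * (1.0 - done_batch) + reward_batch

loss = F.smooth_l1_loss(values, expected)      # Huber loss, as in the patch

optimizer.zero_grad()
loss.backward()
for param in policy_net.parameters():
    param.grad.data.clamp_(-1, 1)              # clamp gradients (not weights) to [-1, 1]
optimizer.step()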