Contents
- I. Discrete Actions
- II. Continuous Actions
- 1. Example 1
- 2. Example from Zhihu
- 3. Code from GitHub
Disclaimer: Some of the code below comes from the internet, some from ChatGPT, and some from my own understanding. If you have a different view, discussion is welcome!
I. Discrete Actions
Note: this article uses the PPO algorithm as the running example throughout.
```python
# time: 2023/11/22 21:04
# author: YanJP
import torch
import torch.nn as nn
from torch.distributions import Categorical

class MultiDimensionalActor(nn.Module):
    def __init__(self, input_dim, output_dims):
        super(MultiDimensionalActor, self).__init__()
        # Define a shared feature extraction network
        self.feature_extractor = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        # Define an individual output layer for each action dimension
        self.output_layers = nn.ModuleList([
            nn.Linear(64, num_actions) for num_actions in output_dims
        ])

    def forward(self, state):
        # Feature extraction
        features = self.feature_extractor(state)
        # Generate a Categorical object for each action dimension
        categorical_objects = [Categorical(logits=output_layer(features))
                               for output_layer in self.output_layers]
        return categorical_objects

def main():
    # Input state dimension and number of actions per action dimension
    input_dim = 10
    output_dims = [5, 8]  # two action dimensions with 5 and 8 possible actions
    # Create a MultiDimensionalActor instance
    actor_network = MultiDimensionalActor(input_dim, output_dims)
    # Generate an input state (random data as an example)
    state = torch.randn(1, input_dim)
    # Run the actor network
    categorical_objects = actor_network(state)
    # Print the sampled action and its log probability for each action dimension
    for i, categorical in enumerate(categorical_objects):
        sampled_action = categorical.sample()
        log_prob = categorical.log_prob(sampled_action)
        print(f"Sampled action for dimension {i+1}: {sampled_action.item()}, Log probability: {log_prob.item()}")

if __name__ == "__main__":
    main()

# Sampled action for dimension 1: 1, Log probability: -1.4930928945541382
# Sampled action for dimension 2: 3, Log probability: -2.1875085830688477
```
Note the difference between the two possible constructor arguments of `Categorical` used in the code above (see the reference link).
In short, `logits` are unnormalized scores to which a softmax is applied internally, while `probs` is used when you already have the probabilities and can pass them in directly.
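To make the difference concrete, here is a minimal sketch (the tensor values are made up) showing that the two constructions describe the same distribution:

```python
import torch
from torch.distributions import Categorical

logits = torch.tensor([1.0, 2.0, 0.5])         # unnormalized scores
probs = torch.softmax(logits, dim=-1)          # explicit softmax -> probabilities

dist_from_logits = Categorical(logits=logits)  # softmax is applied internally
dist_from_probs = Categorical(probs=probs)     # probabilities passed in directly

print(dist_from_logits.probs)                  # ~[0.23, 0.63, 0.14]
print(dist_from_probs.probs)                   # same values
```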
II. Continuous Actions
References: GitHub, Zhihu
Why take the log probability? See the linked answer.
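In brief (my own summary, not taken from the linked answer): for independent action dimensions the joint density is a product, so the joint log-density is a sum, and PPO's probability ratio can then be formed stably from a difference of log-probabilities. A minimal sketch:

```python
import torch
from torch.distributions import Normal

# Three independent action dimensions
dist = Normal(torch.zeros(3), torch.ones(3))
action = dist.sample()

# log prod_i p_i(a_i) = sum_i log p_i(a_i)
joint_log_prob = dist.log_prob(action).sum()

# PPO ratio pi_new(a|s) / pi_old(a|s) computed from log-probs,
# avoiding products of very small numbers
old_log_prob = joint_log_prob.detach()
ratio = torch.exp(joint_log_prob - old_log_prob)  # equals 1 here; illustrative only
print(joint_log_prob.item(), ratio.item())
```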
1. Example 1
First, look at the following code:
```python
# time: 2023/11/21 21:33
# author: YanJP
# Example for multi-dimensional continuous actions
# References: https://github.com/XinJingHao/PPO-Continuous-Pytorch/blob/main/utils.py
#             https://www.zhihu.com/question/417161289
import torch.nn as nn
import torch

class Policy(nn.Module):
    def __init__(self, in_dim, n_hidden_1, n_hidden_2, num_outputs):
        super(Policy, self).__init__()
        self.layer = nn.Sequential(
            nn.Linear(in_dim, n_hidden_1),
            nn.ReLU(True),
            nn.Linear(n_hidden_1, n_hidden_2),
            nn.ReLU(True),
            nn.Linear(n_hidden_2, num_outputs)
        )

class Normal(nn.Module):
    def __init__(self, num_outputs):
        super().__init__()
        self.stds = nn.Parameter(torch.zeros(num_outputs))  # a learnable parameter (one std per dimension)

    def forward(self, x):
        dist = torch.distributions.Normal(loc=x, scale=self.stds.exp())
        # I think this is the most important line: without the sample_shape argument,
        # each distribution is sampled only once by default!
        action = dist.sample((every_dimention_output,))
        return action

if __name__ == '__main__':
    policy = Policy(4, 20, 20, 5)
    normal = Normal(5)               # 5 action dimensions
    every_dimention_output = 10      # 10 samples per dimension
    observation = torch.randn(4)     # random observation as an example
    action = normal.forward(policy.layer(observation))
    print("action: ", action)
```
- `self.stds.exp()` takes the exponential of the learnable parameter, because the standard deviation of a normal distribution must be positive.
- `action = dist.sample((every_dimention_output,))` is the most important line here (see the sketch below)!
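To see what the `sample_shape` argument changes, here is a small shape check (a sketch with arbitrary numbers, independent of the code above):

```python
import torch
from torch.distributions import Normal

# A batch of 5 independent univariate normals, one per action dimension
dist = Normal(loc=torch.zeros(5), scale=torch.ones(5))

print(dist.sample().shape)       # torch.Size([5])     -> one value per distribution
print(dist.sample((10,)).shape)  # torch.Size([10, 5]) -> 10 values per distribution
```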
2. Example from Zhihu
```python
import numpy as np
import torch
import torch.nn as nn
from torch.distributions import Normal

# layer_init and self.critic are defined elsewhere in the full implementation
# this excerpt is taken from.
class Agent(nn.Module):
    def __init__(self, envs):
        super(Agent, self).__init__()
        self.actor_mean = nn.Sequential(
            layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, np.prod(envs.single_action_space.shape)), std=0.01),
        )
        self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(envs.single_action_space.shape)))

    def get_action_and_value(self, x, action=None):
        action_mean = self.actor_mean(x)
        action_logstd = self.actor_logstd.expand_as(action_mean)
        action_std = torch.exp(action_logstd)
        probs = Normal(action_mean, action_std)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action).sum(1), probs.entropy().sum(1), self.critic(x)
```
Here `np.prod(envs.single_action_space.shape)` multiplies the sizes of all action dimensions together; the actor network is initialized with that many means and standard deviations, and `probs.sample()` then draws that many values. (In effect the multi-dimensional action is still flattened into a single vector for the computation.)
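A small sketch of what this flattening plus `.sum(1)` amounts to (the shapes are invented for illustration; `envs` and `layer_init` from the excerpt above are not needed here):

```python
import numpy as np
import torch
from torch.distributions import Normal

batch_size = 4
action_shape = (2, 3)                     # e.g. an action space of shape (2, 3)
n_act = int(np.prod(action_shape))        # 6 flattened action components

action_mean = torch.zeros(batch_size, n_act)
action_std = torch.ones(batch_size, n_act)
probs = Normal(action_mean, action_std)

action = probs.sample()                   # shape (4, 6): one flat action vector per state
log_prob = probs.log_prob(action).sum(1)  # shape (4,): joint log-prob per state
print(action.shape, log_prob.shape)
```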
3. Code from GitHub
GitHub
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Beta, Normal

class GaussianActor_musigma(nn.Module):
    def __init__(self, state_dim, action_dim, net_width):
        super(GaussianActor_musigma, self).__init__()
        self.l1 = nn.Linear(state_dim, net_width)
        self.l2 = nn.Linear(net_width, net_width)
        self.mu_head = nn.Linear(net_width, action_dim)
        self.sigma_head = nn.Linear(net_width, action_dim)

    def forward(self, state):
        a = torch.tanh(self.l1(state))
        a = torch.tanh(self.l2(a))
        mu = torch.sigmoid(self.mu_head(a))
        sigma = F.softplus(self.sigma_head(a))
        return mu, sigma

    def get_dist(self, state):
        mu, sigma = self.forward(state)
        dist = Normal(mu, sigma)
        return dist

    def deterministic_act(self, state):
        mu, sigma = self.forward(state)
        return mu
```
The code above handles multi-dimensional actions simply by giving `mu_head` and `sigma_head` one output per action dimension (i.e. `action_dim` outputs each).
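As a usage sketch (the state and action dimensions below are made up, and the class is the `GaussianActor_musigma` defined above):

```python
import torch

# Hypothetical dimensions: 3-dimensional state, 2-dimensional action
actor = GaussianActor_musigma(state_dim=3, action_dim=2, net_width=64)

state = torch.randn(1, 3)
dist = actor.get_dist(state)              # Normal with batch shape (1, 2)
action = dist.sample()                    # one value per action dimension
log_prob = dist.log_prob(action).sum(-1)  # joint log-prob over the 2 dimensions

print(action.shape, log_prob.shape)       # torch.Size([1, 2]) torch.Size([1])
```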
```python
class GaussianActor_mu(nn.Module):
    def __init__(self, state_dim, action_dim, net_width, log_std=0):
        super(GaussianActor_mu, self).__init__()
        self.l1 = nn.Linear(state_dim, net_width)
        self.l2 = nn.Linear(net_width, net_width)
        self.mu_head = nn.Linear(net_width, action_dim)
        self.mu_head.weight.data.mul_(0.1)
        self.mu_head.bias.data.mul_(0.0)
        self.action_log_std = nn.Parameter(torch.ones(1, action_dim) * log_std)

    def forward(self, state):
        a = torch.relu(self.l1(state))
        a = torch.relu(self.l2(a))
        mu = torch.sigmoid(self.mu_head(a))
        return mu

    def get_dist(self, state):
        mu = self.forward(state)
        action_log_std = self.action_log_std.expand_as(mu)
        action_std = torch.exp(action_log_std)
        dist = Normal(mu, action_std)
        return dist

    def deterministic_act(self, state):
        return self.forward(state)
```
```python
class Critic(nn.Module):
    def __init__(self, state_dim, net_width):
        super(Critic, self).__init__()
        self.C1 = nn.Linear(state_dim, net_width)
        self.C2 = nn.Linear(net_width, net_width)
        self.C3 = nn.Linear(net_width, 1)

    def forward(self, state):
        v = torch.tanh(self.C1(state))
        v = torch.tanh(self.C2(v))
        v = self.C3(v)
        return v
```
In this version only the means are produced by the network (one per action dimension, matching `action_dim`), while the log standard deviation is a separate learnable parameter that does not depend on the state.
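For comparison, a sketch of how the state-independent log-std in `GaussianActor_mu` broadcasts over a batch (again with made-up dimensions):

```python
import torch

# Hypothetical dimensions: 3-dimensional state, 2-dimensional action
actor = GaussianActor_mu(state_dim=3, action_dim=2, net_width=64)

state = torch.randn(5, 3)                 # a batch of 5 states
dist = actor.get_dist(state)              # Normal with batch shape (5, 2)
action = dist.sample()
log_prob = dist.log_prob(action).sum(-1)  # shape (5,)

# action_log_std has shape (1, 2): every state shares the same per-dimension std,
# and it is trained together with the network parameters.
print(actor.action_log_std.shape, log_prob.shape)
```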