PyTorch Deep Learning in Practice (23): An Extension of Multi-Task Reinforcement Learning (Multi-Task RL)

The following code extends and applies the material from the earlier post, PyTorch Deep Learning in Practice (23): Multi-Task Reinforcement Learning (Multi-Task RL):

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.distributions import Normal
from torch.amp import autocast, GradScaler
from metaworld.envs import ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE
import time
from collections import deque

# ================== Configuration ==================
class MultiTaskPPOConfig:
    # Task configuration
    task_names = [
        'reach-v2-goal-observable',
        'push-v2-goal-observable',
        'pick-place-v2-goal-observable'
    ]
    num_tasks = 3
    # Network architecture
    shared_dim = 512
    task_specific_dim = 256
    meta_controller_dim = 128
    shared_layers = 2
    task_specific_layers = 1
    # Training parameters
    lr = 5e-5
    meta_lr = 1e-5
    gamma = 0.99
    gae_lambda = 0.97
    clip_epsilon = 0.15
    ppo_epochs = 5
    batch_size = 4096
    max_episodes = 10000
    max_steps = 200
    grad_clip = 0.5
    entropy_coef = 0.1
    # Exploration parameters
    initial_std = 1.5
    min_std = 0.2
    std_decay = 0.999
    # Curriculum learning schedule
    curriculum_schedule = {
        0: ['reach-v2-goal-observable'],
        1000: ['reach-v2-goal-observable', 'push-v2-goal-observable'],
        3000: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'],
        6000: ['reach-v2-goal-observable', 'push-v2-goal-observable', 'pick-place-v2-goal-observable']
    }
    # Monitoring configuration
    log_interval = 50
    eval_interval = 500
    eval_episodes = 10
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# ================== MetaController ==================
class MetaController(nn.Module):
    def __init__(self, num_tasks, state_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, MultiTaskPPOConfig.meta_controller_dim),
            nn.LayerNorm(MultiTaskPPOConfig.meta_controller_dim),
            nn.GELU(),
            nn.Linear(MultiTaskPPOConfig.meta_controller_dim, num_tasks)
        )
        # Initialize parameters
        for layer in self.net:
            if isinstance(layer, nn.Linear):
                nn.init.orthogonal_(layer.weight, gain=0.01)
                nn.init.constant_(layer.bias, 0.0)

    def forward(self, state):
        logits = self.net(state)
        return torch.softmax(logits, -1), logits


# ================== Shared policy network ==================
class SharedPolicy(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.action_dim = action_dim
        self.current_std = MultiTaskPPOConfig.initial_std
        # Shared trunk
        self.shared_net = nn.Sequential(
            nn.Linear(state_dim, MultiTaskPPOConfig.shared_dim),
            nn.LayerNorm(MultiTaskPPOConfig.shared_dim),
            nn.GELU(),
            nn.Linear(MultiTaskPPOConfig.shared_dim, MultiTaskPPOConfig.shared_dim),
            nn.GELU()
        )
        # Per-task heads
        self.task_heads = nn.ModuleList()
        self.value_heads = nn.ModuleList()
        for _ in range(MultiTaskPPOConfig.num_tasks):
            # Action head
            task_head = nn.Sequential(
                nn.Linear(MultiTaskPPOConfig.shared_dim, MultiTaskPPOConfig.task_specific_dim),
                nn.GELU(),
                nn.Linear(MultiTaskPPOConfig.task_specific_dim, action_dim)
            )
            self.task_heads.append(task_head)
            # Value head
            value_head = nn.Sequential(
                nn.Linear(MultiTaskPPOConfig.shared_dim, MultiTaskPPOConfig.task_specific_dim),
                nn.GELU(),
                nn.Linear(MultiTaskPPOConfig.task_specific_dim, 1)
            )
            self.value_heads.append(value_head)
        # Learnable log standard deviation
        self.log_std = nn.Parameter(torch.zeros(1, action_dim))
        # Initialize parameters
        self._init_weights()

    def _init_weights(self):
        for head in self.task_heads:
            for layer in head:
                if isinstance(layer, nn.Linear):
                    nn.init.orthogonal_(layer.weight, gain=0.01)
                    nn.init.constant_(layer.bias, 0.0)
        for head in self.value_heads:
            for layer in head:
                if isinstance(layer, nn.Linear):
                    nn.init.orthogonal_(layer.weight, gain=1.0)
                    nn.init.constant_(layer.bias, 0.0)

    def decay_action_std(self):
        """Decay the action standard deviation."""
        self.current_std = max(self.current_std * MultiTaskPPOConfig.std_decay,
                               MultiTaskPPOConfig.min_std)

    def forward(self, states, task_ids):
        # Ensure the input is float32
        states = states.float() if states.dtype != torch.float32 else states
        shared_features = self.shared_net(states)
        batch_size = states.size(0)
        # Pre-allocate output tensors
        action_means = torch.zeros(batch_size, self.action_dim,
                                   dtype=torch.float32,
                                   device=states.device)
        action_stds = torch.exp(self.log_std).expand(batch_size, -1) * self.current_std
        values = torch.zeros(batch_size, 1,
                             dtype=torch.float32,
                             device=states.device)
        unique_task_ids = torch.unique(task_ids)
        for task_id_tensor in unique_task_ids:
            task_id = task_id_tensor.item()
            mask = (task_ids == task_id_tensor)
            if not mask.any():
                continue
            selected_features = shared_features[mask]
            # Task-specific outputs (mixed precision disabled here)
            with autocast(device_type=states.device.type, enabled=False):
                task_action = self.task_heads[task_id](selected_features.float())
                task_value = self.value_heads[task_id](selected_features.float())
            action_means[mask] = task_action
            values[mask] = task_value
        return action_means, action_stds, values


# ================== Training system ==================
class EnhancedMultiTaskPPOTrainer:
    def __init__(self):
        # Initialize the multi-task environments
        self.envs = []
        self.state_dim = None
        self.action_dim = None
        # Validate the environments and read their dimensions
        for task_name in MultiTaskPPOConfig.task_names:
            env = ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE[task_name]()
            obs, _ = env.reset()
            if self.state_dim is None:
                self.state_dim = obs.shape[0]
                self.action_dim = env.action_space.shape[0]
            else:
                assert obs.shape[0] == self.state_dim, f"Inconsistent state dimension: {task_name}"
            self.envs.append(env)
        # Initialize the policy network
        self.policy = SharedPolicy(self.state_dim, self.action_dim).to(MultiTaskPPOConfig.device)
        self.optimizer = optim.AdamW(self.policy.parameters(), lr=MultiTaskPPOConfig.lr)
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer,
            T_max=MultiTaskPPOConfig.max_episodes,
            eta_min=1e-6
        )
        self.scaler = GradScaler(enabled=MultiTaskPPOConfig.device.type == 'cuda')
        # Initialize the MetaController
        self.meta_controller = MetaController(
            MultiTaskPPOConfig.num_tasks,
            self.state_dim
        ).to(MultiTaskPPOConfig.device)
        self.meta_optimizer = optim.Adam(
            self.meta_controller.parameters(),
            lr=MultiTaskPPOConfig.meta_lr
        )
        # Experience buffer
        self.buffer = deque(maxlen=MultiTaskPPOConfig.max_steps)
        # Curriculum learning state
        self.current_phase = 0
        self.phase_thresholds = sorted(MultiTaskPPOConfig.curriculum_schedule.keys())
        # Training statistics
        self.episode_rewards = {i: deque(maxlen=100) for i in range(MultiTaskPPOConfig.num_tasks)}
        self.episode_lengths = {i: deque(maxlen=100) for i in range(MultiTaskPPOConfig.num_tasks)}
        self.meta_data = {'states': [], 'chosen_tasks': [], 'rewards': []}
        # Evaluation statistics
        self.eval_rewards = {i: [] for i in range(MultiTaskPPOConfig.num_tasks)}
        self.eval_success = {i: [] for i in range(MultiTaskPPOConfig.num_tasks)}

    def get_current_tasks(self, episode):
        """Return the task list for the current curriculum phase."""
        if len(self.phase_thresholds) > 1 and self.current_phase < len(self.phase_thresholds) - 1:
            if episode >= self.phase_thresholds[self.current_phase + 1]:
                self.current_phase += 1
        task_names = MultiTaskPPOConfig.curriculum_schedule[self.phase_thresholds[self.current_phase]]
        return [MultiTaskPPOConfig.task_names.index(name) for name in task_names]

    def collect_experience(self, num_steps, episode):
        """Experience collection that integrates curriculum learning and the meta controller."""
        current_tasks = self.get_current_tasks(episode)
        for _ in range(num_steps):
            # Randomly pick a base task from the current curriculum
            base_task_id = np.random.choice(current_tasks)
            env = self.envs[base_task_id]
            if not hasattr(env, '_last_obs'):
                state, _ = env.reset()
            else:
                state = env._last_obs
            # MetaController adjustment
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(MultiTaskPPOConfig.device)
            with torch.no_grad():
                task_probs, _ = self.meta_controller(state_tensor)
                task_probs = task_probs.squeeze().cpu().numpy()
            # Mask the probability distribution to the current curriculum tasks
            mask = np.zeros_like(task_probs)
            mask[current_tasks] = 1
            filtered_probs = task_probs * mask
            filtered_probs = filtered_probs / (filtered_probs.sum() + 1e-6)
            # Task selection strategy
            if np.random.rand() < 0.7:
                task_id = np.random.choice(current_tasks, p=filtered_probs[current_tasks])
            else:
                task_id = np.random.choice(current_tasks)
            # Record the meta controller decision
            self.meta_data['states'].append(state_tensor)
            self.meta_data['chosen_tasks'].append(task_id)
            # Execute the selected task
            env = self.envs[task_id]
            with torch.no_grad():
                task_id_tensor = torch.tensor([task_id], dtype=torch.long, device=MultiTaskPPOConfig.device)
                action_mean, action_std, value = self.policy(state_tensor, task_id_tensor)
                dist = Normal(action_mean.float(), action_std.float())  # keep distribution parameters in float32
                action = dist.sample().squeeze(0)
                log_prob = dist.log_prob(action).sum(-1, keepdim=True)
            action_np = action.cpu().numpy()
            next_state, reward, done, trunc, info = env.step(action_np)
            # Store the transition
            self.buffer.append({
                'state': state,
                'action': action_np,
                'log_prob': log_prob.cpu(),
                'reward': float(reward),
                'done': bool(done),
                'task_id': task_id,
                'value': float(value.item()),
                'success': info.get('success', False)
            })
            # Feedback for the meta controller
            self.meta_data['rewards'].append(reward)
            state = next_state if not (done or trunc) else env.reset()[0]

    def compute_gae(self, values, rewards, dones):
        """Compute Generalized Advantage Estimation (GAE)."""
        advantages = []
        last_advantage = 0
        next_value = 0
        next_non_terminal = 1.0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + MultiTaskPPOConfig.gamma * next_value * next_non_terminal - values[t]
            last_advantage = delta + MultiTaskPPOConfig.gamma * MultiTaskPPOConfig.gae_lambda * next_non_terminal * last_advantage
            advantages.append(last_advantage)
            next_value = values[t]
            next_non_terminal = 1.0 - dones[t]
        advantages = torch.tensor(advantages[::-1], dtype=torch.float32).to(MultiTaskPPOConfig.device)
        returns = advantages + torch.tensor(values, dtype=torch.float32).to(MultiTaskPPOConfig.device)
        return (advantages - advantages.mean()) / (advantages.std() + 1e-8), returns

    def calculate_task_weights(self):
        """Compute task weights based on recent performance."""
        task_weights = torch.ones(MultiTaskPPOConfig.num_tasks, device=MultiTaskPPOConfig.device)
        for task_id in range(MultiTaskPPOConfig.num_tasks):
            if len(self.episode_rewards[task_id]) > 10:
                # Success rate over the 10 most recent episodes
                recent_rewards = list(self.episode_rewards[task_id])[-10:]
                success_rate = sum(1 for r in recent_rewards if r > 0) / len(recent_rewards)
                # Dynamically adjust the weights
                if success_rate < 0.3:
                    task_weights[task_id] = 2.0  # double the weight of hard tasks
                elif success_rate > 0.8:
                    task_weights[task_id] = 0.5  # halve the weight of easy tasks
        return task_weights / task_weights.sum()

    def update_meta_controller(self):
        """Update the task-selection policy."""
        if len(self.meta_data['states']) == 0:
            return
        states = torch.cat(self.meta_data['states'])
        chosen_tasks = torch.tensor(self.meta_data['chosen_tasks'], device=MultiTaskPPOConfig.device)
        rewards = torch.tensor(self.meta_data['rewards'], dtype=torch.float32, device=MultiTaskPPOConfig.device)
        # Clear the buffered meta data
        self.meta_data = {'states': [], 'chosen_tasks': [], 'rewards': []}
        # Normalize the rewards
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-6)
        # Update the MetaController
        task_probs, logits = self.meta_controller(states)
        selected_probs = task_probs.gather(1, chosen_tasks.unsqueeze(1))
        loss = -torch.log(selected_probs + 1e-6) * rewards.unsqueeze(1)
        loss = loss.mean()
        self.meta_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.meta_controller.parameters(), MultiTaskPPOConfig.grad_clip)
        self.meta_optimizer.step()

    def update_policy(self):
        """Policy update."""
        if not self.buffer:
            return 0, 0, 0
        # Pull the data out of the buffer
        batch = list(self.buffer)
        states = torch.tensor([x['state'] for x in batch], dtype=torch.float32, device=MultiTaskPPOConfig.device)
        actions = torch.FloatTensor(np.array([x['action'] for x in batch])).to(MultiTaskPPOConfig.device)
        old_log_probs = torch.cat([x['log_prob'] for x in batch]).to(MultiTaskPPOConfig.device)
        rewards = torch.FloatTensor([x['reward'] for x in batch]).to(MultiTaskPPOConfig.device)
        dones = torch.FloatTensor([x['done'] for x in batch]).to(MultiTaskPPOConfig.device)
        task_ids = torch.tensor([x['task_id'] for x in batch], dtype=torch.long, device=MultiTaskPPOConfig.device)
        values = torch.FloatTensor([x['value'] for x in batch]).to(MultiTaskPPOConfig.device)
        successes = torch.FloatTensor([x['success'] for x in batch]).to(MultiTaskPPOConfig.device)
        # GAE and returns
        advantages, returns = self.compute_gae(values.cpu().numpy(), rewards.cpu().numpy(), dones.cpu().numpy())
        # Task weights
        task_weights = self.calculate_task_weights()
        # Automatic mixed precision training
        total_policy_loss = 0
        total_value_loss = 0
        total_entropy = 0
        for _ in range(MultiTaskPPOConfig.ppo_epochs):
            # Shuffle the data
            perm = torch.randperm(len(batch))
            for i in range(0, len(batch), MultiTaskPPOConfig.batch_size):
                idx = perm[i:i + MultiTaskPPOConfig.batch_size]
                # Mini-batch slices
                batch_states = states[idx]
                batch_actions = actions[idx]
                batch_old_log_probs = old_log_probs[idx]
                batch_returns = returns[idx]
                batch_advantages = advantages[idx]
                batch_task_ids = task_ids[idx]
                with autocast(device_type=MultiTaskPPOConfig.device.type,
                              enabled=MultiTaskPPOConfig.device.type == 'cuda'):
                    # Forward pass
                    action_means, action_stds, new_values = self.policy(batch_states, batch_task_ids)
                    dist = Normal(action_means, action_stds)
                    new_log_probs = dist.log_prob(batch_actions).sum(-1, keepdim=True)
                    entropy = dist.entropy().mean()
                    # Importance sampling ratio
                    ratio = (new_log_probs - batch_old_log_probs).exp()
                    # Policy loss
                    surr1 = ratio * batch_advantages.unsqueeze(-1)
                    surr2 = torch.clamp(ratio, 1 - MultiTaskPPOConfig.clip_epsilon,
                                        1 + MultiTaskPPOConfig.clip_epsilon) * batch_advantages.unsqueeze(-1)
                    policy_loss_per_task = -torch.min(surr1, surr2)
                    # Apply the task weights
                    selected_weights = task_weights[batch_task_ids].unsqueeze(-1)
                    policy_loss = (policy_loss_per_task * selected_weights).mean()
                    policy_loss -= MultiTaskPPOConfig.entropy_coef * entropy
                    # Value loss (clipped)
                    value_pred_clipped = values[idx] + (new_values - values[idx]).clamp(
                        -MultiTaskPPOConfig.clip_epsilon,
                        MultiTaskPPOConfig.clip_epsilon)
                    value_loss1 = (new_values.squeeze() - batch_returns).pow(2)
                    value_loss2 = (value_pred_clipped.squeeze() - batch_returns).pow(2)
                    value_loss = 0.5 * torch.max(value_loss1, value_loss2).mean()
                    # Total loss
                    loss = policy_loss + value_loss
                # Backward pass
                self.scaler.scale(loss).backward()
                total_policy_loss += policy_loss.item()
                total_value_loss += value_loss.item()
                total_entropy += entropy.item()
            # Gradient clipping and parameter update
            self.scaler.unscale_(self.optimizer)
            torch.nn.utils.clip_grad_norm_(self.policy.shared_net.parameters(), 1.0)
            torch.nn.utils.clip_grad_norm_(
                list(self.policy.task_heads.parameters()) +
                list(self.policy.value_heads.parameters()),
                0.5)
            self.scaler.step(self.optimizer)
            self.scaler.update()
            self.optimizer.zero_grad()
        self.scheduler.step()
        # Decay the action noise
        self.policy.decay_action_std()
        return (total_policy_loss / MultiTaskPPOConfig.ppo_epochs,
                total_value_loss / MultiTaskPPOConfig.ppo_epochs,
                total_entropy / MultiTaskPPOConfig.ppo_epochs)

    def evaluate_policy(self):
        """Evaluate the current policy."""
        eval_results = {i: {'rewards': [], 'successes': []} for i in range(MultiTaskPPOConfig.num_tasks)}
        for task_id in range(MultiTaskPPOConfig.num_tasks):
            env = self.envs[task_id]
            for _ in range(MultiTaskPPOConfig.eval_episodes):
                state, _ = env.reset()
                episode_reward = 0
                done = False
                success = False
                for _ in range(MultiTaskPPOConfig.max_steps):
                    with torch.no_grad():
                        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(MultiTaskPPOConfig.device)
                        task_id_tensor = torch.tensor([task_id], dtype=torch.long, device=MultiTaskPPOConfig.device)
                        action_mean, _, _ = self.policy(state_tensor, task_id_tensor)
                        action = action_mean.squeeze(0).cpu().numpy()
                    state, reward, done, trunc, info = env.step(action)
                    episode_reward += reward
                    success = success or info.get('success', False)
                    if done or trunc:
                        break
                eval_results[task_id]['rewards'].append(episode_reward)
                eval_results[task_id]['successes'].append(success)
        # Record the evaluation results
        for task_id in range(MultiTaskPPOConfig.num_tasks):
            avg_reward = np.mean(eval_results[task_id]['rewards'])
            success_rate = np.mean(eval_results[task_id]['successes'])
            self.eval_rewards[task_id].append(avg_reward)
            self.eval_success[task_id].append(success_rate)
        return eval_results

    def train(self):
        print(f"Starting training on device: {MultiTaskPPOConfig.device}")
        print(f"Curriculum schedule: {MultiTaskPPOConfig.curriculum_schedule}")
        start_time = time.time()
        # Initial evaluation
        self.evaluate_policy()
        for episode in range(MultiTaskPPOConfig.max_episodes):
            # Experience collection phase
            self.collect_experience(MultiTaskPPOConfig.max_steps, episode)
            # Policy optimization phase
            policy_loss, value_loss, entropy = self.update_policy()
            # MetaController update
            self.update_meta_controller()
            # Record statistics
            for exp in self.buffer:
                task_id = exp['task_id']
                self.episode_rewards[task_id].append(exp['reward'])
                self.episode_lengths[task_id].append(1)
            # Periodic logging
            if (episode + 1) % MultiTaskPPOConfig.log_interval == 0:
                avg_rewards = {k: np.mean(v) if v else 0 for k, v in self.episode_rewards.items()}
                success_rates = {k: np.mean([1 if r > 0 else 0 for r in v]) if v else 0
                                 for k, v in self.episode_rewards.items()}
                time_cost = time.time() - start_time
                # Print the current curriculum phase
                current_task_names = MultiTaskPPOConfig.curriculum_schedule[self.phase_thresholds[self.current_phase]]
                print(f"\nEpisode {episode + 1:5d} | Time: {time_cost:6.1f}s")
                print(f"Current curriculum phase: {current_task_names} (Phase {self.current_phase})")
                print(f"Action std: {self.policy.current_std:.3f} | Learning rate: {self.scheduler.get_last_lr()[0]:.2e}")
                for task_id in range(MultiTaskPPOConfig.num_tasks):
                    task_name = MultiTaskPPOConfig.task_names[task_id]
                    print(f"  {task_name:25s} | Avg Reward: {avg_rewards[task_id]:7.2f} | Success Rate: {success_rates[task_id]:.2f}")
                print(f"  Policy Loss: {policy_loss:.4f} | Value Loss: {value_loss:.4f} | Entropy: {entropy:.4f}")
                start_time = time.time()
            # Periodic evaluation
            if (episode + 1) % MultiTaskPPOConfig.eval_interval == 0:
                eval_results = self.evaluate_policy()
                if (episode + 1) % 1000 == 0:
                    print("\nEvaluation results:")
                    for task_id in range(MultiTaskPPOConfig.num_tasks):
                        task_name = MultiTaskPPOConfig.task_names[task_id]
                        avg_reward = np.mean(eval_results[task_id]['rewards'])
                        success_rate = np.mean(eval_results[task_id]['successes'])
                        print(f"  {task_name:25s} | Avg Reward: {avg_reward:7.2f} | Success Rate: {success_rate:.2f}")
        # Save the model at the end of training
        torch.save({
            'policy_state_dict': self.policy.state_dict(),
            'meta_controller_state_dict': self.meta_controller.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict()
        }, "multitask_ppo_model.pth")


if __name__ == "__main__":
    trainer = EnhancedMultiTaskPPOTrainer()
    print(f"State dim: {trainer.state_dim}, Action dim: {trainer.action_dim}")
    trainer.train()
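For orientation, the quantities that compute_gae estimates are the standard GAE terms (this is the textbook form of the recursion, not a line-by-line transcription of the loop above):

\delta_t = r_t + \gamma (1 - d_t)\, V(s_{t+1}) - V(s_t)
\hat{A}_t = \delta_t + \gamma \lambda (1 - d_t)\, \hat{A}_{t+1}, \qquad R_t = \hat{A}_t + V(s_t)

with \gamma = 0.99 and \lambda = 0.97 taken from MultiTaskPPOConfig; the advantages are then normalized to zero mean and unit variance before the PPO update.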

Partial output:

Episode    50 | Time:  216.6s
Current curriculum phase: ['reach-v2-goal-observable'] (Phase 0)
Action std: 1.427 | Learning rate: 5.00e-05
  reach-v2-goal-observable  | Avg Reward:    1.42 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.00 | Success Rate: 0.00
  pick-place-v2-goal-observable | Avg Reward:    0.00 | Success Rate: 0.00
  Policy Loss: -0.1777 | Value Loss: 471.4303 | Entropy: 1.7773

Episode   100 | Time:  193.3s
Current curriculum phase: ['reach-v2-goal-observable'] (Phase 0)
Action std: 1.357 | Learning rate: 5.00e-05
  reach-v2-goal-observable  | Avg Reward:    1.42 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.00 | Success Rate: 0.00
  pick-place-v2-goal-observable | Avg Reward:    0.00 | Success Rate: 0.00
  Policy Loss: -0.1729 | Value Loss: 357.7264 | Entropy: 1.7293

......

Episode  2800 | Time:  198.6s
Current curriculum phase: ['reach-v2-goal-observable', 'push-v2-goal-observable'] (Phase 1)
Action std: 0.200 | Learning rate: 4.11e-05
  reach-v2-goal-observable  | Avg Reward:    1.44 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.00 | Success Rate: 0.00
  Policy Loss: 0.0092 | Value Loss: 191.3147 | Entropy: -0.0918

Episode  2850 | Time:  212.2s
Current curriculum phase: ['reach-v2-goal-observable', 'push-v2-goal-observable'] (Phase 1)
Action std: 0.200 | Learning rate: 4.08e-05
  reach-v2-goal-observable  | Avg Reward:    1.44 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.00 | Success Rate: 0.00
  Policy Loss: 0.0090 | Value Loss: 183.6324 | Entropy: -0.0902

Episode  2900 | Time:  210.4s
Current curriculum phase: ['reach-v2-goal-observable', 'push-v2-goal-observable'] (Phase 1)
Action std: 0.200 | Learning rate: 4.05e-05
  reach-v2-goal-observable  | Avg Reward:    1.44 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.00 | Success Rate: 0.00
  Policy Loss: 0.0089 | Value Loss: 188.5185 | Entropy: -0.0889

Episode  2950 | Time:  210.1s
Current curriculum phase: ['reach-v2-goal-observable', 'push-v2-goal-observable'] (Phase 1)
Action std: 0.200 | Learning rate: 4.02e-05
  reach-v2-goal-observable  | Avg Reward:    1.44 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.00 | Success Rate: 0.00
  Policy Loss: 0.0087 | Value Loss: 183.0386 | Entropy: -0.0874

Episode  3000 | Time:  212.0s
Current curriculum phase: ['reach-v2-goal-observable', 'push-v2-goal-observable'] (Phase 1)
Action std: 0.200 | Learning rate: 3.99e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.00 | Success Rate: 0.00
  Policy Loss: 0.0086 | Value Loss: 182.9761 | Entropy: -0.0858

Evaluation results:
  reach-v2-goal-observable  | Avg Reward:  106.66 | Success Rate: 0.00
  push-v2-goal-observable   | Avg Reward:    3.99 | Success Rate: 0.00
  pick-place-v2-goal-observable | Avg Reward:    4.49 | Success Rate: 0.00

Episode  3050 | Time:  234.3s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.96e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0084 | Value Loss: 28.1028 | Entropy: -0.0843

Episode  3100 | Time:  210.3s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.93e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0083 | Value Loss: 0.1660 | Entropy: -0.0829

Episode  3150 | Time:  209.8s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.90e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0082 | Value Loss: 0.1506 | Entropy: -0.0818

Episode  3200 | Time:  210.2s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.86e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0080 | Value Loss: 0.1429 | Entropy: -0.0801

Episode  3250 | Time:  210.3s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.83e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0079 | Value Loss: 0.1725 | Entropy: -0.0785

Episode  3300 | Time:  209.7s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.80e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0077 | Value Loss: 0.1990 | Entropy: -0.0771

Episode  3350 | Time:  209.5s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.76e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0076 | Value Loss: 0.2084 | Entropy: -0.0758

Episode  3400 | Time:  210.1s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.73e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0075 | Value Loss: 0.2057 | Entropy: -0.0745

Episode  3450 | Time:  210.9s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.70e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0073 | Value Loss: 0.2251 | Entropy: -0.0733

Episode  3500 | Time:  210.1s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.66e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0072 | Value Loss: 0.2199 | Entropy: -0.0723

......
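To sanity-check the checkpoint that train() writes at the end, a minimal inference sketch could look like the following. This is an assumption-laden sketch rather than part of the training script: it assumes the script above is saved as multitask_ppo.py so its classes can be imported, and that multitask_ppo_model.pth exists with the keys saved in train().

import torch
# Assumption: the training script above was saved as multitask_ppo.py
from multitask_ppo import MultiTaskPPOConfig, SharedPolicy
from metaworld.envs import ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE

task_id = 0  # index into MultiTaskPPOConfig.task_names ('reach-v2-goal-observable')
env = ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE[MultiTaskPPOConfig.task_names[task_id]]()
obs, _ = env.reset()

# Rebuild the policy with the same dimensions used during training and load the weights
policy = SharedPolicy(obs.shape[0], env.action_space.shape[0]).to(MultiTaskPPOConfig.device)
ckpt = torch.load("multitask_ppo_model.pth", map_location=MultiTaskPPOConfig.device)
policy.load_state_dict(ckpt['policy_state_dict'])
policy.eval()

episode_reward = 0.0
with torch.no_grad():
    for _ in range(MultiTaskPPOConfig.max_steps):
        state = torch.FloatTensor(obs).unsqueeze(0).to(MultiTaskPPOConfig.device)
        tid = torch.tensor([task_id], dtype=torch.long, device=MultiTaskPPOConfig.device)
        action_mean, _, _ = policy(state, tid)  # act greedily with the mean action
        obs, reward, done, trunc, info = env.step(action_mean.squeeze(0).cpu().numpy())
        episode_reward += float(reward)
        if done or trunc:
            break
print(f"episode reward: {episode_reward:.2f} | success: {info.get('success', False)}")

This is essentially the per-task loop inside evaluate_policy, so it is also a convenient place to verify that the saved state_dict and the environment dimensions still match.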
