bitrl_cuberl_docs/a2c_8h_source.html

#ifndef A2C_H

#define A2C_H


#include "cuberl/base/cubeai_config.h"


#ifdef USE_PYTORCH


#include "cuberl/base/cuberl_types.h"

#include "cuberl/utils/torch_adaptor.h"

#include "cuberl/rl/algorithms/rl_algorithm_base.h"

#include "cuberl/rl/algorithms/utils.h"

#include "cuberl/rl/episode_info.h"

#include "cuberl/rl/algorithms/pg/a2c_config.h"

#include "cuberl/rl/algorithms/pg/a2c_monitor.h"

#include "cuberl/data_structs/experience_buffer.h"


#include <torch/torch.h>


#ifdef CUBERL_DEBUG

#include <cassert>

#endif


#include <chrono>

#include <memory>

#include <tuple>


namespace cuberl{

namespace rl::algos::pg

{


    /**
     * @brief Actor-critic solver that trains a policy network and a critic
     * network from whole-episode batches.
     *
     * For every training episode the solver collects up to
     * A2CConfig::max_itrs_per_episode transitions, forms the advantage as
     * (discounted return - critic value) and then updates both networks once.
     *
     * @tparam EnvType    Environment type; must expose state_type and action_type.
     * @tparam PolicyType Handle to the policy network (used through operator->).
     * @tparam CriticType Handle to the critic network (used through operator->).
     */
    template<typename EnvType, typename PolicyType, typename CriticType>
    class A2CSolver final: public RLSolverBase<EnvType>
    {
    public:

        /// The environment the solver interacts with
        using env_type = EnvType;

        /// The type of the policy (actor) network
        using policy_type = PolicyType;

        /// The type of the critic network
        using critic_type = CriticType;

        /// State type as declared by the environment
        using state_type = typename env_type::state_type;

        /// Action type as declared by the environment
        using action_type = typename env_type::action_type;

        /// Buffer type that holds the transitions of one episode
        using experience_buffer_type =
            typename A2CMonitor<action_type, state_type>::experience_buffer_type;

        /**
         * @brief Build the solver.
         *
         * The policy and the critic are held by reference, hence they must
         * outlive the solver. Both optimizers are moved out of the given
         * pointers, so the caller's unique_ptr instances are empty afterwards.
         *
         * @param config           Configuration of the algorithm (copied).
         * @param policy           The policy network.
         * @param critic           The critic network.
         * @param policy_optimizer Optimizer for the policy; ownership is taken.
         * @param critic_optimizer Optimizer for the critic; ownership is taken.
         */
        A2CSolver(const A2CConfig& config,
                  policy_type& policy, critic_type& critic,
                  std::unique_ptr<torch::optim::Optimizer>& policy_optimizer,
                  std::unique_ptr<torch::optim::Optimizer>& critic_optimizer);

        /// Resets the monitor, reserves its storage and switches to train mode
        virtual void actions_before_training_begins(env_type&);

        /// Nothing to do when training finishes
        void actions_after_training_ends(env_type&) override final{}

        /// Nothing to do before an episode starts
        void actions_before_episode_begins(env_type&,
                                           uint_t /*episode_idx*/) override final{}

        /// Nothing to do after an episode finishes
        void actions_after_episode_ends(env_type&,
                                        uint_t /*episode_idx*/,
                                        const EpisodeInfo&) override final{}

        /**
         * @brief Run one training episode: collect a batch of transitions
         * from the environment and update both networks with it.
         * @return Summary (index, reward, iterations, time) of the episode.
         */
        virtual EpisodeInfo on_training_episode(env_type&, uint_t /*episode_idx*/);

        /// Calls train() on both the policy and the critic
        void set_train_mode()noexcept;

        /// Calls eval() on both the policy and the critic
        void set_evaluation_mode()noexcept;

        /// Read/write access to the quantities recorded during training
        A2CMonitor<action_type, state_type>& get_monitor(){return monitor_;}

    private:

        // NOTE: the declaration order below is also the initialization order

        /// Copy of the configuration given at construction
        A2CConfig config_;

        /// The policy network (not owned)
        policy_type& policy_;

        /// The critic network (not owned)
        critic_type& critic_;

        /// Per-episode losses, rewards and durations
        A2CMonitor<action_type, state_type> monitor_;

        /// Optimizer that updates the policy (owned)
        std::unique_ptr<torch::optim::Optimizer> policy_optimizer_;

        /// Optimizer that updates the critic (owned)
        std::unique_ptr<torch::optim::Optimizer> critic_optimizer_;

        /**
         * @brief Play one episode and append every transition to the buffer.
         * @return The iteration count reported for the episode.
         */
        uint_t create_episode_batch_(env_type&,
                                     uint_t /*episode_idx*/,
                                     experience_buffer_type& buffer);

        /**
         * @brief Update both networks using the transitions in the buffer.
         * @return Tuple of (undiscounted episode reward,
         *         policy loss + critic loss).
         */
        std::tuple<real_t, real_t>
        train_with_batch_(experience_buffer_type& buffer);

    };


    /**
     * @brief Construct the solver.
     *
     * The configuration is copied and the two networks are bound by
     * reference. Ownership of both optimizers is transferred to the solver:
     * the unique_ptr arguments are moved from although they are passed as
     * lvalue references.
     */
    template<typename EnvType, typename PolicyType, typename CriticType>
    A2CSolver<EnvType, PolicyType, CriticType>::A2CSolver(
        const A2CConfig& config,
        policy_type& policy, critic_type& critic,
        std::unique_ptr<torch::optim::Optimizer>& policy_optimizer,
        std::unique_ptr<torch::optim::Optimizer>& critic_optimizer)
        : config_(config)
        , policy_(policy)
        , critic_(critic)
        , monitor_()
        , policy_optimizer_(std::move(policy_optimizer))
        , critic_optimizer_(std::move(critic_optimizer))
    {}


    /// Switch the policy and then the critic to training mode
    /// by invoking train() on each of them.
    template<typename EnvType, typename PolicyType, typename CriticType>
    void A2CSolver<EnvType, PolicyType, CriticType>::set_train_mode() noexcept
    {
        policy_->train();
        critic_->train();
    }


    /// Switch the policy and then the critic to evaluation mode
    /// by invoking eval() on each of them.
    template<typename EnvType, typename PolicyType, typename CriticType>
    void A2CSolver<EnvType, PolicyType, CriticType>::set_evaluation_mode() noexcept
    {
        policy_->eval();
        critic_->eval();
    }


    /**
     * @brief Prepare the solver for a new training run.
     *
     * Clears whatever the monitor recorded previously, reserves room for
     * one entry per episode in each monitored series and puts both
     * networks in training mode. The environment is not touched.
     */
    template<typename EnvType, typename PolicyType, typename CriticType>
    void A2CSolver<EnvType, PolicyType, CriticType>::actions_before_training_begins(env_type& /*env*/)
    {
        monitor_.reset();

        // one slot per episode for every monitored quantity
        const auto n_episodes = config_.n_episodes;
        monitor_.policy_loss_values.reserve(n_episodes);
        monitor_.critic_loss_values.reserve(n_episodes);
        monitor_.rewards.reserve(n_episodes);
        monitor_.episode_duration.reserve(n_episodes);

        set_train_mode();
    }


    /**
     * @brief Execute a single training episode.
     *
     * Collects the transitions of one episode into a fresh buffer, trains
     * both networks on that buffer and reports what happened.
     *
     * @param env         The environment to interact with.
     * @param episode_idx Index of the episode being run.
     * @return EpisodeInfo holding the episode index, the episode reward,
     *         the number of iterations and the wall-clock time spent.
     */
    template<typename EnvType, typename PolicyType, typename CriticType>
    EpisodeInfo
    A2CSolver<EnvType, PolicyType, CriticType>::on_training_episode(env_type& env, uint_t episode_idx)
    {
        const auto tic = std::chrono::steady_clock::now();

        // storage for the transitions of this episode
        experience_buffer_type buffer(config_.max_itrs_per_episode);

        // roll the episode out and fill the buffer
        const auto n_itrs = create_episode_batch_(env, episode_idx, buffer);

        // train the networks on the collected transitions
        auto [reward, loss] = train_with_batch_(buffer);

        const auto toc = std::chrono::steady_clock::now();
        std::chrono::duration<real_t> elapsed_seconds = toc - tic;

        // NOTE(review): the value recorded here is the number of iterations,
        // not the elapsed time -- confirm that this is what
        // episode_duration is meant to hold
        monitor_.episode_duration.push_back(n_itrs);

        EpisodeInfo info;
        info.episode_index = episode_idx;
        info.episode_reward = reward;
        info.episode_iterations = n_itrs;
        info.total_time = elapsed_seconds;
        return info;
    }


    /**
     * @brief Roll out one episode and store every transition in the buffer.
     *
     * The environment is reset and then stepped with actions drawn from the
     * policy until it either reports done or config_.max_itrs_per_episode
     * steps have been taken. For each step the tuple
     * (observation, action, reward, done, log_prob, critic value)
     * is appended to the buffer.
     *
     * @param env    The environment to interact with.
     * @param buffer Receives one entry per environment step.
     * @return The number of environment steps actually taken, i.e. the
     *         number of entries appended to the buffer. This never exceeds
     *         config_.max_itrs_per_episode.
     */
    template<typename EnvType, typename PolicyType, typename CriticType>
    uint_t
    A2CSolver<EnvType, PolicyType, CriticType>::create_episode_batch_(env_type& env,
                                                                      uint_t /*episode_idx*/,
                                                                      experience_buffer_type& buffer){

        typedef typename A2CMonitor<action_type,
                                    state_type>::experience_tuple_type experience_tuple_type;

        // every episode starts from a freshly reset environment
        auto old_timestep = env.reset();

        // Count the steps explicitly. Deriving the count from the loop
        // index over-reports by one when the episode is cut off by the
        // iteration cap rather than by the environment.
        uint_t n_steps = 0;
        while(n_steps < config_.max_itrs_per_episode){

            auto [action, log_prob] = policy_ -> act(old_timestep.observation());
            auto values = critic_ -> evaluate(old_timestep.observation());

            // step into the environment
            auto next_time_step = env.step(action);
            auto reward = next_time_step.reward();

            experience_tuple_type exp = {old_timestep.observation(),
                action,
                reward,
                next_time_step.done(),
                log_prob,
                values};

            // put the transition into the buffer
            buffer.append(exp);
            ++n_steps;

            if (next_time_step.done()){
                break;
            }

            old_timestep = next_time_step;
        }

        return n_steps;
    }


    /**
     * @brief Perform one update of the policy and the critic from the
     * transitions of a single episode.
     *
     * Steps:
     *  - extract rewards, critic values and log-probabilities from the buffer
     *  - compute the per-step discounted returns using config_.gamma
     *  - advantage   = discounted returns - critic values
     *  - actor loss  = -mean(log_prob * advantage), advantage detached
     *  - critic loss = mean(advantage^2)
     *  - zero the gradients, back-propagate both losses, optionally clip
     *    the gradient norms, then step both optimizers
     *
     * The losses and the undiscounted episode reward are also appended to
     * the monitor.
     *
     * @param buffer The transitions collected for the episode.
     * @return Tuple of (undiscounted episode reward,
     *         policy loss + critic loss).
     */
    template<typename EnvType,typename PolicyType, typename CriticType>
    std::tuple<real_t, real_t>
    A2CSolver<EnvType, PolicyType, CriticType>::train_with_batch_(experience_buffer_type& buffer){

        using namespace cuberl::utils::pytorch;

        typedef typename A2CMonitor<action_type,
                                    state_type>::experience_tuple_type experience_tuple_type;
        typedef std::vector<experience_tuple_type> batch_type;

        // the batch for this episode; the indices select the position of
        // the reward, log-prob and critic value in the experience tuple
        auto batch = buffer.template get<batch_type>();
        auto rewards_batch  = monitor_.template get<real_t, 2>(batch);
        auto values_batch   = monitor_.template get<torch_tensor_t, 5>(batch);
        auto logprobs_batch = monitor_.template get<torch_tensor_t, 4>(batch);

        // compute the discounted returns for this batch
        auto discounted_returns = cuberl::rl::algos::calculate_step_discounted_return(rewards_batch,
                                                                                      config_.gamma);

        // the returns are plain targets
        // NOTE(review): the trailing `false` is presumably the
        // requires_grad flag -- confirm against TorchAdaptor::to_torch
        auto torch_rewards_batch = TorchAdaptor::to_torch(discounted_returns,
                                                          config_.device_type,
                                                          false);

        auto torch_values_batch = TorchAdaptor::stack(values_batch,
                                                      config_.device_type);

        auto torch_logprobs_batch = TorchAdaptor::stack(logprobs_batch,
                                                        config_.device_type);

        // form the advantage
        // NOTE(review): assumes the returns and the stacked values have
        // matching shapes; otherwise the subtraction broadcasts -- confirm
        auto advantage = torch_rewards_batch - torch_values_batch;

        // take the mean because we collect batches. The advantage is
        // detached in the actor loss so that the policy update does not
        // propagate into the critic
        auto actor_loss = -(torch_logprobs_batch * advantage.detach()).mean();
        auto critic_loss = advantage.pow(2).mean();

        // backward pass: start from clean gradients
        policy_optimizer_ -> zero_grad();
        critic_optimizer_ -> zero_grad();

        actor_loss.backward();
        critic_loss.backward();

        // Clipping has to happen after backward() and before step():
        // only then do the gradients of this batch exist, and only then
        // does the optimizer see the clipped values
        if(config_.clip_policy_grad){
            torch::nn::utils::clip_grad_norm_(policy_->parameters(),
                                              config_.max_grad_norm_policy);
        }

        if(config_.clip_critic_grad){
            torch::nn::utils::clip_grad_norm_(critic_->parameters(),
                                              config_.max_grad_norm_critic);
        }

        // optimize
        policy_optimizer_ -> step();
        critic_optimizer_ -> step();

        auto total_episode_policy_loss = actor_loss.item().template to<real_t>();
        auto total_episode_critic_loss = critic_loss.item().template to<real_t>();

        // the undiscounted sum of rewards is reported
        // as the reward of this episode
        auto R = cuberl::maths::sum(rewards_batch);

        monitor_.policy_loss_values.push_back(total_episode_policy_loss);
        monitor_.critic_loss_values.push_back(total_episode_critic_loss);
        monitor_.rewards.push_back(R);

        return std::make_tuple(R, total_episode_policy_loss + total_episode_critic_loss);
    }


}

}

#endif

#endif // A2C_H

a2c_config.h

a2c_monitor.h

cuberl_types.h

episode_info.h

bitrl::uint_t
std::size_t uint_t
uint_t
Definition bitrl_types.h:43

cubeai::rl::policies::PolicyType
PolicyType
Definition policy_type.h:8

cuberl::maths::mean
real_t mean(IteratorType begin, IteratorType end, bool parallel=true)
mean Compute the mean value of the values in the provided iterator range
Definition vector_math.h:126

cuberl::maths::sum
std::iterator_traits< IteratorType >::value_type sum(IteratorType begin, IteratorType end, bool parallel=true)
Definition vector_math.h:98

cuberl::rl::algos::calculate_step_discounted_return
std::vector< T > calculate_step_discounted_return(const std::vector< T > &rewards, T gamma)
Given an array of rewards, for each entry calculate the following: $$G_t = \sum_{k=t+1}^{T} \gamma^{k-t-1} R_k$$
Definition utils.h:161

cuberl
Various utilities used when working with RL problems.
Definition cuberl_types.h:16

example::state_type
std::pair< uint_t, uint_t > state_type
Definition example_15.cpp:28

extended_kalman_filter.R
int R
Definition extended_kalman_filter.py:54

play.action
dict action
Definition play.py:41

play.reward
reward
Definition play.py:44

play.info
info
Definition play.py:44

play.env
env
Definition play.py:30

play.policy
dict policy
Definition play.py:26

plot_losses.values
list values
Definition plot_losses.py:13

rl_example_10::env_type
bitrl::envs::gymnasium::CliffWorld env_type
Definition rl_example_10.cpp:32

rl_algorithm_base.h

torch_adaptor.h

utils.h