1#ifndef DOUBLE_Q_LEARNING_H
2#define DOUBLE_Q_LEARNING_H
9#include "cubeai/base/cubeai_types.h"
10#include "cubeai/rl/algorithms/td/td_algo_base.h"
11#include "cubeai/rl/rl_mixins.h"
12#include "cubeai/rl/worlds/envs_concepts.h"
13#include "cubeai/rl/episode_info.h"
14#include "cubeai/maths/matrix_utilities.h"
23namespace rl::algos::td
44 template<envs::discrete_world_concept EnvTp,
typename ActionSelector>
98 const EpisodeInfo& ){ action_selector_.adjust_on_episode(episode_idx);}
108 void save(std::string filename)
const;
129 template <envs::discrete_world_concept EnvTp,
typename ActionSelector>
135 action_selector_(selector)
139 template<envs::discrete_world_concept EnvTp,
typename ActionSelector>
145 template<envs::discrete_world_concept EnvTp,
typename ActionSelector>
149 auto start = std::chrono::steady_clock::now();
153 auto episode_score = 0.0;
155 auto state = env.reset().observation();
158 for(; itr < config_.max_num_iterations_per_episode; ++itr){
165 auto step_type_result = env.step(action);
167 auto next_state = step_type_result.observation();
168 auto reward = step_type_result.reward();
169 auto done = step_type_result.done();
172 episode_score += reward;
175 update_q_table_(action, state, next_state, reward);
183 auto end = std::chrono::steady_clock::now();
184 std::chrono::duration<real_t> elapsed_seconds = end-start;
186 info.episode_index = episode_idx;
187 info.episode_reward = episode_score;
188 info.episode_iterations = itr;
189 info.total_time = elapsed_seconds;
194 template <envs::discrete_world_concept EnvTp,
typename ActionSelector>
197 const state_type& next_state,
real_t reward){
201 std::mt19937 gen(config_.seed);
204 std::uniform_real_distribution<> real_dist_(0.0, 1.0);
207 if(real_dist_(gen) <= 0.5){
215 next_state, this->env_ref_().n_actions());
222 auto target = reward + (config_.gamma * Qsa_next);
225 auto new_value = q_current + (config_.eta * (target - q_current));
236 next_state, this->env_ref_().n_actions());
243 auto target = reward + (config_.gamma * Qsa_next);
246 auto new_value = q_current + (config_.eta * (target - q_current));
251 template <envs::discrete_world_concept EnvTp,
typename ActionSelector>
255 rlenvscpp::utils::io::CSVWriter file_writer(filename,
',',
true);
257 col_names[0] =
"state_index";
260 col_names[i + 1] =
"action_" + std::to_string(i);
263 file_writer.write_column_names(col_names);
268 file_writer.write_row(std::make_tuple(s, actions));
271 file_writer.write_row(std::make_tuple(s, actions));
The class DoubleQLearning. Simple tabular implementation of the double Q-learning algorithm.
Definition double_q_learning.h:48
DoubleQLearning(const DoubleQLearningConfig config, const ActionSelector &selector)
Constructor.
Definition double_q_learning.h:130
virtual void actions_after_training_ends(env_type &)
actions_after_training_ends. Actions to execute after the training iterations have finished
TDAlgoBase< EnvTp >::action_type action_type
action_t
Definition double_q_learning.h:60
virtual void actions_before_training_begins(env_type &)
actions_before_training_begins. Execute any actions the algorithm needs before starting the iteration...
Definition double_q_learning.h:141
ActionSelector action_selector_type
action_selector_t
Definition double_q_learning.h:70
TDAlgoBase< EnvTp >::env_type env_type
env_t
Definition double_q_learning.h:55
virtual void actions_before_episode_begins(env_type &, uint_t)
actions_before_training_episode
Definition double_q_learning.h:92
void save(std::string filename) const
Definition double_q_learning.h:253
virtual EpisodeInfo on_training_episode(env_type &, uint_t episode_idx)
on_episode Do one on_episode of the algorithm
Definition double_q_learning.h:147
TDAlgoBase< EnvTp >::state_type state_type
state_t
Definition double_q_learning.h:65
virtual void actions_after_episode_ends(env_type &, uint_t episode_idx, const EpisodeInfo &)
actions_after_training_episode
Definition double_q_learning.h:97
The TDAlgoBase class. Base class for deriving TD algorithms.
Definition td_algo_base.h:19
env_type::action_type action_type
action_t
Definition td_algo_base.h:30
env_type::state_type state_type
state_t
Definition td_algo_base.h:35
EnvType env_type
env_t
Definition td_algo_base.h:25
const std::string INVALID_STR
Invalid string.
Definition bitrl_consts.h:26
double real_t
real_t
Definition bitrl_types.h:23
std::size_t uint_t
uint_t
Definition bitrl_types.h:43
Eigen::MatrixX< T > DynMat
Dynamically sized matrix to use around the library.
Definition bitrl_types.h:49
DynVec< T > get_row(const DynMat< T > &matrix, uint_t row_idx)
Extract the row_idx-th row from the matrix.
Definition matrix_utilities.h:130
Various utilities used when working with RL problems.
Definition cuberl_types.h:16
The EpisodeInfo struct.
Definition episode_info.h:19
Definition double_q_learning.h:27
uint_t max_num_iterations_per_episode
Definition double_q_learning.h:33
real_t gamma
Definition double_q_learning.h:31
real_t tolerance
Definition double_q_learning.h:30
uint_t seed
Definition double_q_learning.h:35
uint_t n_episodes
Definition double_q_learning.h:34
real_t eta
Definition double_q_learning.h:32
std::string path
Definition double_q_learning.h:29
Definition rl_mixins.h:302
static uint_t max_action(const TableTp &q1_table, const TableTp &q2_table, const StateTp &state, uint_t n_actions)
Returns the max action by averaging the state values from the two tables.
Definition rl_mixins.h:322
Definition rl_mixins.h:138