bitrl & cuberl Documentation
Simulation engine for reinforcement learning agents
Loading...
Searching...
No Matches
first_visit_mc.h
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2024 <copyright holder> <email>
2// SPDX-License-Identifier: Apache-2.0
3
4#ifndef FIRST_VISIT_MC_H
5#define FIRST_VISIT_MC_H
6
#include "cuberl/base/cubeai_config.h"

// NOTE(review): the documentation extraction dropped the original includes
// (source lines 8-11) that provide EpisodeInfo, real_t/uint_t/DynVec and the
// cuberl::maths helpers (episode_info.h, bitrl_types.h, vector_math.h) —
// restore their exact paths from the repository.
#include "bitrl/bitrl_consts.h"

#ifdef CUBEAI_PRINT_DBG_MSGS
    #include <boost/log/trivial.hpp>
#endif

#include <algorithm>
#include <chrono>    // std::chrono::steady_clock used in on_training_episode
#include <iterator>  // std::distance
#include <string>
#include <vector>
23namespace cuberl{
24namespace rl::algos::mc
25{
26
38
42 template<typename EnvType, typename TrajectoryGenerator, typename DecayLRSchedule, typename DiscountGenerator>
44 {
45 public:
46
51 typedef EnvType env_type;
52
56 typedef TrajectoryGenerator trajectory_generator_type;
57
61 typedef DecayLRSchedule decay_lr_schedule_type;
62
66 typedef DiscountGenerator discount_generator_type;
67
68
73 typedef typename env_type::time_step_type time_step_type;
74
75
81 TrajectoryGenerator& trajectory_gen,
82 DecayLRSchedule& decay_lr_schedule,
83 discount_generator_type& discount_generator);
84
90
96
101
105 void actions_after_episode_ends(env_type&, uint_t /*episode_idx*/, const EpisodeInfo& /*einfo*/){}
106
111
116 void save(const std::string& filename)const;
117
118 private:
119
120
126
132
133
138 TrajectoryGenerator trajectory_gen_;
139
144 DecayLRSchedule decay_lr_schedule_;
145
150 discount_generator_type discount_generator_;
151
152 };
153
154 template<typename EnvType,
155 typename TrajectoryGenerator, typename DecayLRSchedule, typename DiscountGenerator>
156 FirstVisitMCSolver<EnvType, TrajectoryGenerator,
157 DecayLRSchedule, DiscountGenerator>::FirstVisitMCSolver(FirstVisitMCSolverConfig solver_config,
158 TrajectoryGenerator& trajectory_gen,
159 DecayLRSchedule& decay_lr_schedule,
160 discount_generator_type& discount_generator)
161 :
162 v_(),
163 config_(solver_config),
164 trajectory_gen_(trajectory_gen),
165 decay_lr_schedule_(decay_lr_schedule),
166 discount_generator_(discount_generator)
167 {}
168
169
170 template<typename EnvType,
171 typename TrajectoryGenerator, typename DecayLRSchedule, typename DiscountGenerator>
172 void
173 FirstVisitMCSolver<EnvType,
174 TrajectoryGenerator, DecayLRSchedule, DiscountGenerator>::actions_before_training_begins(env_type& env){
175
176 v_.resize(env.n_states());
177 std::for_each(v_.begin(), v_.end(),
178 [](auto& item){item = 0.0;});
179 }
180
181 template<typename EnvType,
182 typename TrajectoryGenerator, typename DecayLRSchedule, typename DiscountGenerator>
184 FirstVisitMCSolver<EnvType,
185 TrajectoryGenerator, DecayLRSchedule, DiscountGenerator>::on_training_episode(env_type& env,
186 uint_t episode_idx){
187
188 // start timing the training on this episode
189 auto start = std::chrono::steady_clock::now();
190
191 // generate the trajectory for the environment
192 // for this episode
193 auto trajectory = trajectory_gen_(env, config_.max_steps);
194
195 const auto trajectory_size = std::distance(trajectory.begin(), trajectory.end());
196
197#ifdef CUBEAI_PRINT_DBG_MSGS
198 if(trajectory_size == 0){
199 BOOST_LOG_TRIVIAL(warning)<<"Trajectory size="<<trajectory_size<<std::endl;
200 }
201#endif
202
203 // accummulate the rewards in an array
204 // we need this in order to take the dot product
205 // with the discounts
206 std::vector<real_t> rewards;
207 rewards.reserve(trajectory_size);
208
209 auto time_step_itr = trajectory.begin();
210 for(; time_step_itr != trajectory.end(); ++time_step_itr){
211 auto time_step = *time_step_itr;
212 rewards.push_back(time_step.reward());
213 }
214
215 // compute the discounts for the generated trajectory
216 auto discounts = discount_generator_(trajectory, config_.max_steps);
217
218 // calculate learning rate
219 auto alpha = decay_lr_schedule_(config_.init_alpha, episode_idx);
220
221 std::vector<bool> visited(env.n_states(), false);
222 time_step_itr = trajectory.begin();
223 for(uint_t count=0; time_step_itr != trajectory.end(); ++time_step_itr, ++count){
224
225 auto time_step = *time_step_itr;
226
227 if(visited[time_step.observation()])
228 continue;
229
230 visited[time_step.observation()] = true;
231
232 // find the steps from the current time_step to the end
233 // of the trajectory
234 auto n_steps = std::distance(time_step_itr, trajectory.end());
235
236 // calculate the return. First extract up to n_steps
237 // from the discounts
238 auto trajectory_discounts = cuberl::maths::extract_subvector(discounts, n_steps);
239 auto trajectory_rewards = cuberl::maths::extract_subvector(rewards, count, false);
240 auto G = cuberl::maths::dot_product(trajectory_discounts, trajectory_rewards);
241 auto mc_error = G - v_[time_step.observation()];
242
243 // update the state value
244 v_[time_step.observation()] += alpha * mc_error;
245 }
246
247 auto end = std::chrono::steady_clock::now();
248 std::chrono::duration<real_t> elapsed_seconds = end-start;
249 auto episode_info = EpisodeInfo();
250 episode_info.episode_index = episode_idx;
251 episode_info.total_time = elapsed_seconds;
252 episode_info.episode_iterations = std::distance(trajectory.begin(), trajectory.end());
253 return episode_info;
254
255 }
256
257
258}
259}
260#endif // FIRST_VISIT_MC_H
Definition first_visit_mc.h:44
void actions_before_training_begins(env_type &env)
actions_before_training_begins. Execute any actions the algorithm needs before starting the iteration...
Definition first_visit_mc.h:174
void actions_after_training_ends(env_type &)
actions_after_training_ends. Actions to execute after the training iterations have finished
Definition first_visit_mc.h:95
DecayLRSchedule decay_lr_schedule_type
Definition first_visit_mc.h:61
EnvType env_type
The environment type.
Definition first_visit_mc.h:51
DiscountGenerator discount_generator_type
Definition first_visit_mc.h:66
void actions_after_episode_ends(env_type &, uint_t, const EpisodeInfo &)
actions_after_episode_ends. Actions to execute after a training episode ends
Definition first_visit_mc.h:105
TrajectoryGenerator trajectory_generator_type
Definition first_visit_mc.h:56
void save(const std::string &filename) const
save the results
void actions_before_episode_begins(env_type &, uint_t)
actions_before_episode_begins. Actions to execute before a training episode begins
Definition first_visit_mc.h:100
EpisodeInfo on_training_episode(env_type &env, uint_t episode_idx)
on_training_episode. Execute one training episode of the algorithm
Definition first_visit_mc.h:185
env_type::time_step_type time_step_type
The time step type used by the environment.
Definition first_visit_mc.h:73
const std::string INVALID_STR
Invalid string.
Definition bitrl_consts.h:26
double real_t
real_t
Definition bitrl_types.h:23
Eigen::RowVectorX< T > DynVec
Dynamically sized row vector.
Definition bitrl_types.h:74
std::size_t uint_t
uint_t
Definition bitrl_types.h:43
std::iterator_traits< IteratorType >::value_type dot_product(IteratorType bv1, IteratorType ev1, IteratorType bv2, IteratorType ev2)
Definition vector_math.h:610
std::vector< T > extract_subvector(const std::vector< T > &vec, uint_t end, bool up_to=true)
Definition vector_math.h:477
Various utilities used when working with RL problems.
Definition cuberl_types.h:16
The EpisodeInfo struct.
Definition episode_info.h:19
real_t gamma
Definition first_visit_mc.h:29
uint_t max_steps
Definition first_visit_mc.h:34
uint_t n_episodes
Definition first_visit_mc.h:35
real_t tolerance
Definition first_visit_mc.h:30
real_t alpha_decay_ratio
Definition first_visit_mc.h:33
std::string save_path
Definition first_visit_mc.h:36
real_t init_alpha
Definition first_visit_mc.h:31
real_t min_alpha
Definition first_visit_mc.h:32