Functions
DynMat< real_t >	create_transition_matrix ()

DynMat< real_t >	compute_matrix_power (const DynMat< real_t > &mat, uint_t power)

void	print_matrix (const DynMat< real_t > &mat)

real_t	get_reward (real_t prob, uint_t n=10)

void	update_record (std::vector< std::vector< real_t > > &records, uint_t action, real_t r)

uint_t	get_best_arm (const std::vector< std::vector< real_t > > &records)

std::vector< real_t >	get_probs (uint_t n)

DynVec< real_t >	extract_part (const std::vector< std::vector< real_t > > &values)

Variables
const uint_t	N = 10

const auto	N_EXPERIMENTS = 500

const auto	TAU = 0.7

const uint	SEED = 42

Detailed Description

Solve the multi-armed bandit problem using a soft-max policy. When using a soft-max policy, we get a probability distribution over the actions. We select the action with the highest probability. For this example we will solve a 10-armed bandit problem, so N=10.

This example is taken from the book Reinforcement Learning in Action, published by Manning Publications.

Function Documentation

◆ compute_matrix_power()

DynMat< real_t > exe::compute_matrix_power	(	const DynMat< real_t > &	mat,
		uint_t	power
	)

◆ create_transition_matrix()

DynMat< real_t > exe::create_transition_matrix ( )

◆ extract_part()

DynVec< real_t > exe::extract_part ( const std::vector< std::vector< real_t > > & values )

◆ get_best_arm()

uint_t exe::get_best_arm ( const std::vector< std::vector< real_t > > & records )

◆ get_probs()

std::vector< real_t > exe::get_probs ( uint_t n )

◆ get_reward()

real_t exe::get_reward	(	real_t	prob,
		uint_t	n = 10
	)

◆ print_matrix()

void exe::print_matrix ( const DynMat< real_t > & mat )

◆ update_record()

void exe::update_record	(	std::vector< std::vector< real_t > > &	records,
		uint_t	action,
		real_t	r
	)

Variable Documentation

◆ N

const uint_t exe::N = 10

◆ N_EXPERIMENTS

const auto exe::N_EXPERIMENTS = 500

◆ SEED

const uint exe::SEED = 42

◆ TAU

const auto exe::TAU = 0.7

Functions

Variables