############# Function definitions
## ReLU function
relu<-function(x){
ifelse(x>0,x,0)
}
## Neural Network FeedForward
nn.ff2<-function (nn, batch_x)
{
m <- nrow(batch_x)
if (nn$visible_dropout > 0) {
nn$dropout_mask[[1]] <- dropout.mask(ncol(batch_x), nn$visible_dropout)
batch_x <- t(t(batch_x) * nn$dropout_mask[[1]])
}
nn$post[[1]] <- batch_x
for (i in 2:(length(nn$size) - 1)) {
nn$pre[[i]] <- t(nn$W[[i - 1]] %*% t(nn$post[[(i - 1)]]) +
nn$B[[i - 1]])
if (nn$activationfun == "sigm") {
nn$post[[i]] <- sigm(nn$pre[[i]])
}
else if (nn$activationfun == "tanh") {
nn$post[[i]] <- tanh(nn$pre[[i]])
}
else if (nn$activationfun == "relu") {
nn$post[[i]] <- relu(nn$pre[[i]])
}
else if (nn$activationfun == "linear") {
nn$post[[i]] <- (nn$pre[[i]])
}
else {
stop("unsupport activation function!")
}
if (nn$hidden_dropout > 0) {
nn$dropout_mask[[i]] <- dropout.mask(ncol(nn$post[[i]]),
nn$hidden_dropout)
nn$post[[i]] <- t(t(nn$post[[i]]) * nn$dropout_mask[[i]])
}
}
i <- length(nn$size)
nn$pre[[i]] <- t(nn$W[[i - 1]] %*% t(nn$post[[(i - 1)]]) +
nn$B[[i - 1]])
if (nn$output == "sigm") {
nn$post[[i]] <- sigm(nn$pre[[i]])
} else if (nn$output == "linear") {
nn$post[[i]] <- nn$pre[[i]]
} else if (nn$output == "softmax") {
nn$post[[i]] <- exp(nn$pre[[i]])
if(sum(is.infinite(nn$post[[i]]))>0){
## overflow guard: if exp() overflowed, put all of the probability mass on the overflowed entries
infin<-is.infinite(nn$post[[i]])
nn$post[[i]][infin]<-1
nn$post[[i]][!infin]<-0
}else{
nn$post[[i]] <- nn$post[[i]]/rowSums(nn$post[[i]])
}
}
nn
}
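## Note: nn.ff2() references dropout.mask(), which is not defined in this post; the
## feed-forward/backprop code was adapted from the deepnet package. Both dropout rates
## are set to 0 below, so it is never actually called, but a minimal sketch (assuming a
## simple Bernoulli keep/drop mask) would look like this:
dropout.mask<-function(size,fraction){
mask<-runif(size,min=0,max=1)
mask[mask<=fraction]<-0 ## drop these units
mask[mask>fraction]<-1 ## keep these units
mask
}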
### Neural Network Backpropagation
nn.bp<-function (nn)
{
n <- length(nn$size)
d <- list()
if (nn$output == "sigm") {
d[[n]] <- -nn$e * (nn$post[[n]] * (1 - nn$post[[n]]))
}
else if (nn$output == "linear" || nn$output == "softmax") {
d[[n]] <- -nn$e
}
for (i in (n - 1):2) {
if (nn$activationfun == "sigm") {
d_act <- nn$post[[i]] * (1 - nn$post[[i]])
}
else if (nn$activationfun == "tanh") {
d_act <- 1.7159 * 2/3 * (1 - 1/(1.7159)^2 * nn$post[[i]]^2)
}
else if (nn$activationfun == "relu") {
d_act <- ifelse(nn$post[[i]]>0,1,0)
}
d[[i]] <- (d[[i + 1]] %*% nn$W[[i]]) * d_act
if (nn$hidden_dropout > 0) {
d[[i]] <- t(t(d[[i]]) * nn$dropout_mask[[i]])
}
}
for (i in 1:(n - 1)) {
dw <- t(d[[i + 1]]) %*% nn$post[[i]]/nrow(d[[i + 1]])
dw <- dw * nn$learningrate
if (nn$momentum > 0) {
nn$vW[[i]] <- nn$momentum * nn$vW[[i]] + dw
dw <- nn$vW[[i]]
}
nn$W[[i]] <- nn$W[[i]] - dw
db <- colMeans(d[[i + 1]])
db <- db * nn$learningrate
if (nn$momentum > 0) {
nn$vB[[i]] <- nn$momentum * nn$vB[[i]] + db
db <- nn$vB[[i]]
}
nn$B[[i]] <- nn$B[[i]] - db
}
nn
}
##sigmoid
sigm<-function (x)
{
1/(1 + exp(-x))
}
### convert a state vector to (row, col) coordinates
coord<-function(state){
re_index<-which(state==1)
xx<-ceiling(re_index/ 10) ## row
yy<-re_index %% 10 ## column
yy<-ifelse(yy ==0,10,yy)
c(xx,yy)
}
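## Quick illustration of coord(): a state vector with a 1 at index 23 maps to row 3, column 3
## (ex_state is introduced here only for this example).
ex_state<-rep(0,100)
ex_state[23]<-1
coord(ex_state) ## c(3, 3)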
### action
move<-function(x,action){
## moves that would leave the grid keep the position unchanged
if(action == "left"){
if(x[2]-1 >= 1) x[2]<-x[2]-1
}else if(action == "right"){
if(x[2]+1 <= ncol(stm)) x[2]<-x[2]+1
}else if(action == "up"){
if(x[1]-1 >= 1) x[1]<-x[1]-1
}else if(action == "down"){
if(x[1]+1 <= nrow(stm)) x[1]<-x[1]+1
}
x
}
### build the one-hot state vector for a grid index
next_where<-function(index){
zero<-rep(0,100)
zero[index]<-1
zero
}
#######state matrix
stm<-matrix(1:100,ncol=10,nrow=10,byrow=T)
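## Illustration (not part of the original flow): moving "down" from the start cell (1,1)
## lands on grid index stm[2,1] = 11, and next_where() turns that index back into a one-hot state.
ex_pos<-move(c(1,1),"down") ## c(2, 1)
stm[ex_pos[1],ex_pos[2]] ## 11
coord(next_where(11)) ## c(2, 1)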
##### reward function
return_reward<-function(state,current_state){
re_index<-which(state==1)
if( re_index==100){
reward<- 5# episode end
done<-T
}
else if(re_index==12 |re_index==42|re_index==44|re_index==45 |
re_index==68|re_index==72|re_index==80){ ## hole cells
reward<- -2
done<-F
}else{
reward <- -1
done<-F
}
if(re_index==which(current_state==1)){ ## the agent did not move (bumped into a wall): double the penalty
reward<-reward*2
}
xx<-ceiling(re_index/ 10) ## row
yy<-re_index %% 10 ## col
yy<-ifelse(yy ==0,10,yy)
reward_weight<-sqrt(162)-sqrt((yy-10)^2+(xx-10)^2) ## reward shaping: the bonus grows as the state gets closer to the goal at (10,10)
reward<-reward+reward_weight*0.05
c(reward,done)
}
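## Illustration of return_reward(): stepping onto hole cell 12 from cell 11 gives the -2 hole
## penalty plus a small distance-shaping bonus, with done = 0 since the episode continues.
return_reward(next_where(12),next_where(11)) ## roughly c(-1.93, 0)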
action<-c("left","right","down","up")
state_size <-ncol(stm)*nrow(stm)
## 10 x 10 frozen lake problem
# S : start, F : Frozen, H : Hole, G : Goal
# SFFFF|FFFFF
# FHFFF|FFFFF
# FFFFF|FFFFF
# FFFFF|FFFFF
# FHFHH|FFFFF
# FFFFF|FFFFF
# FFFFF|FFHFF
# FHFFF|FFFFH
# FFFFF|FFFFF
# FFFFF|FFFFG
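## Illustrative check that the hole indices used in return_reward() match the map above
## (the lake matrix below is only a display helper, not used by the training code).
holes<-c(12,42,44,45,68,72,80)
lake<-matrix("F",10,10)
lake[1,1]<-"S"; lake[10,10]<-"G"
for(h in holes) lake[ceiling(h/10),ifelse(h%%10==0,10,h%%10)]<-"H"
lake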
### initialize neural network
{
input_dim<-state_size
hidden<-c(30)
output_dim<-4
size <- c(input_dim, hidden, output_dim)
activationfun<-"relu"
output<-"linear"
batchsize<-30
momentum<-0
learningrate_scale<-1
hidden_dropout = 0
visible_dropout = 0
numepochs = 10
learningrate<-0.1
vW <- list()
vB <- list()
W <- list()
B <- list()
for (i in 2:length(size)) {
W[[i - 1]] <- matrix(runif(size[i] * size[i - 1],
min = -0.1, max = 0.1), nrow = size[i], ncol = size[i - 1])
B[[i - 1]] <- runif(size[i], min = -0.1, max = 0.1)
vW[[i - 1]] <- matrix(rep(0, size[i] * size[i - 1]),
nrow = size[i], ncol = size[i - 1])
vB[[i - 1]] <- rep(0, size[i])
}
qn1<- list(input_dim = input_dim, output_dim = output_dim,
hidden = hidden, size = size, activationfun = activationfun,
learningrate = learningrate, momentum = momentum, learningrate_scale = learningrate_scale,
hidden_dropout = hidden_dropout, visible_dropout = visible_dropout,
output = output, W = W, vW = vW, B = B, vB = vB)
vW <- list()
vB <- list()
W <- list()
B <- list()
#### target network
target_qn<-qn1
}
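## Quick sanity check (illustrative): a forward pass of the untrained network on the
## one-hot start state returns one Q-value per action (left, right, down, up).
nn.ff2(qn1,t(c(1,rep(0,state_size-1))))$post[[length(qn1$size)]] ## 1 x 4 matrix of initial Q-values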
epoch<-50
mini_batch<-20
init_data<-c(1,rep(0,state_size-1)) ## initial state (one-hot vector for the start cell)
dis_f<-0.99 ## discount factor
reward_list<-c()
final_action_list<-list()
step_list<-c()
q_table<-list()
replay_buffer<-list() ## replay buffer
r<-1
bi<-1
for(i in 1:20000){
total_r<-0 ## total reward
episode_done<-0
qn1<-nn.ff2(qn1,t(init_data)) ## feed-forward on the initial state
step<-1
action_list<-NULL
st<-c(1,1)
while(episode_done==0){
if(step >1){
qn1<-nn.ff2(qn1,t(next_state))
action_index<-which.max(qn1$post[[length(size)]]) ## greedy action: index of the max Q-value
current_state<-next_state
}else{
current_state<-init_data
action_index<-which.max(qn1$post[[length(size)]]) ## greedy action: index of the max Q-value
}
th<-1/(i/50+10) ## exploration probability, decaying as episodes progress
if(runif(1) < th){ ## epsilon-greedy action selection
next_action<- action[sample(1:4,1)]
}else{
next_action<-action[action_index]
}
####### while i < 10, choose actions purely at random to fill the replay buffer
if(i < 10){
next_action<- action[sample(1:4,1)]
}
action_list<-c(action_list,next_action)
st<-move(st,next_action) ## move to the next position
state_index<-stm[st[1],st[2]] ## grid index of the new position
next_state<-next_where(state_index) ## one-hot vector for the next state
re_ep<-return_reward(next_state,current_state) ## get the reward and the done flag
total_r<-total_r+re_ep[1] ## accumulate the total reward
episode_done<-re_ep[2] ## whether the episode has ended
step<-step+1
#########
#### store current state, action, reward, done, next_state in the replay buffer
replay_buffer[[bi]]<- c(which(current_state==1),next_action,re_ep,state_index)
bi<-bi+1
if(bi == 100000){ ## once the buffer index reaches 100,000, overwrite from the beginning
bi <- 1
}
if(step == 500){ ## terminate the episode after 500 steps
cat("\n",i," episode-",step)
step_list<-c(step_list,step)
final_action_list[[i]]<-action_list
reward_list<-c(reward_list,total_r)
cat("\n final location")
print(coord(next_state))
ts.plot(reward_list,main=paste0((reward_list)[length(reward_list)],"-",step,"-",min(step_list)))
if(i %% 10 ==0){
ad<-apply(nn.ff2(qn1,diag(state_size))$post[[length(qn1$size)]],1,which.max);ad
q_table[[i]]<-matrix(action[ad],ncol=sqrt(state_size),byrow=T)
print(matrix(action[ad],ncol=sqrt(state_size),byrow=T))
}
break;
}
if(episode_done==1){ ## the episode has ended (the goal was reached)
cat("\n",i," episode-",step)
cat("\n final location")
print(coord(next_state))
if(i %% 10 ==0){
ad<-apply(nn.ff2(qn1,diag(state_size))$post[[length(qn1$size)]],1,which.max);ad
q_table[[i]]<-matrix(action[ad],ncol=sqrt(state_size),byrow=T)
print(matrix(action[ad],ncol=sqrt(state_size),byrow=T))
}
step_list<-c(step_list,step)
final_action_list[[i]]<-action_list
reward_list<-c(reward_list,total_r)
ts.plot(reward_list,main=paste0((reward_list)[length(reward_list)],"-",step,"-",min(step_list)))
break;
}
}
if(i > 9){
## update the DQN once every 5 episodes
if(i %% 5==0){
### sample mini-batches from the replay buffer
for(u in 1:epoch){
sam<-sample(1:length(replay_buffer),mini_batch)
sam_1<-replay_buffer[sam]
x_stack<-NULL
y_stack<-NULL
for(q in 1:length(sam_1)){
re<-rep(0,state_size)
re[as.numeric(sam_1[[q]][1])]<-1
x_stack<- rbind(x_stack,re) ##x stack
qvalue<-nn.ff2(qn1,t(re))$post[[length(qn1$size)]]
######### state, action, reward, done, next_state
## sam_1[[q]][1] current_state
## sam_1[[q]][2] action
## sam_1[[q]][3] reward
## sam_1[[q]][4] episode done
## sam_1[[q]][5] next_state
if( sam_1[[q]][4]==1){ ## terminal transition: the target is just the reward
qvalue[action==sam_1[[q]][2]]<-as.numeric(sam_1[[q]][3])
y_stack<-rbind(y_stack,qvalue) ## y stack
}else{ ## non-terminal transition: target = reward + discounted max Q from the target network
re2<-rep(0,state_size)
re2[as.numeric(sam_1[[q]][5])]<-1
target_qn<-nn.ff2(target_qn,t(re2)) ## feed forward using the target network
true_y<-max(target_qn$post[[length(target_qn$size)]])
qvalue[action==sam_1[[q]][2]]<- as.numeric(sam_1[[q]][3])+dis_f*true_y
y_stack<-rbind(y_stack,qvalue) ## y stack
}
}
######## feed forward xstack
qn1<-nn.ff2(qn1,x_stack)
####### error
qn1$e<- (y_stack)-qn1$post[[length(qn1$size)]]
####### back propagation
qn1<-nn.bp(qn1)
r<-r+1
}
cat("\n","DQN update")
ad<-apply(nn.ff2(qn1,diag(state_size))$post[[length(qn1$size)]],1,which.max);ad
q_table[[r]]<-matrix(action[ad],ncol=sqrt(state_size),byrow=T)
print(matrix(action[ad],ncol=(sqrt(state_size)),byrow=T))
###### copy qnetwork to target network
target_qn<-qn1
}
}
}
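## Illustrative only (not part of the training loop above): greedy_rollout is a small helper,
## introduced here, that follows the trained network's greedy policy from the start state and
## returns the sequence of visited grid indices.
greedy_rollout<-function(nn,max_steps=100){
st<-c(1,1) ## start at the top-left corner
path<-stm[st[1],st[2]]
for(s in 1:max_steps){
state<-next_where(stm[st[1],st[2]]) ## one-hot encoding of the current cell
q<-nn.ff2(nn,t(state))$post[[length(nn$size)]]
st<-move(st,action[which.max(q)]) ## follow the greedy action
path<-c(path,stm[st[1],st[2]])
if(stm[st[1],st[2]]==100) break ## reached the goal cell
}
path
}
greedy_rollout(qn1)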