From 247f646f02728477627274a883348f1d4e734d09 Mon Sep 17 00:00:00 2001
From: maberet <maberet@ada.local.isima.fr>
Date: Wed, 29 Jun 2022 21:09:44 +0200
Subject: [PATCH] Fix the Q-learning update formulas

---
 .../jeu_appren_par_renfo/src/main.c           |  7 ++--
 .../jeu_appren_par_renfo/src/qlearn.c         | 33 +++++++++----------
 .../jeu_appren_par_renfo/src/qlearn.h         |  6 +++-
 3 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/travail_de_groupe/jeu_appren_par_renfo/src/main.c b/travail_de_groupe/jeu_appren_par_renfo/src/main.c
index 94fa800..bd35d29 100644
--- a/travail_de_groupe/jeu_appren_par_renfo/src/main.c
+++ b/travail_de_groupe/jeu_appren_par_renfo/src/main.c
@@ -6,12 +6,13 @@ int game_state;
 
 int main(){ 
     float ***** Q = allocateAndInitiateQ();
+    int i = 10000;
     
     srand ( time(NULL));
-
-    traningAgent(1000000, 15, Q);
+    while (i>0){
+    traningAgent(10000000, 40, Q);
+    i--;} 
     
-
      writeQ(Q);
     // running = 1;
     // game_state = GAME;
diff --git a/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.c b/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.c
index 38fce48..d5a9aa7 100644
--- a/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.c
+++ b/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.c
@@ -115,15 +115,15 @@ int argmax(float * arr){
     int i;
     float max = arr[0];
     int maxIndex = 0;
-    printf("argmax:  %f ", arr[0]);
+    //printf("argmax:  %f ", arr[0]);
     for(i = 1; i < NUMBER_ACTION; i++){
-        printf("%f ", arr[i]);
+        //printf("%f ", arr[i]);
         if (arr[i] > max){
             max = arr[i];
             maxIndex = i;
         }
     }
-    printf("\n");
+    //printf("\n");
     return maxIndex;
 }
 
@@ -361,7 +361,8 @@ void traningAgent ( int numberRun, int numberStep, float *****Q) {// pour avoir
         zoneAngleH=converterIntoAngleH(angleH);
         dropZone=convertIntoZone(dropPoint.x,dropPoint.y); 
         canonZone= convertIntoZoneCanon(canon.x,canon.y); 
-        reward=0;
+        reward=0; 
+        //printf("%d %d %d %d \n",dropZone, canonZone,zoneAngleH,zoneAngleF);
 
         for (i=0; i<numberStep-1;i++){ 
             action = takeAction(agent->x,agent->y,Q,canonZone,zoneAngleH,zoneAngleF,greedy); 
@@ -374,28 +375,25 @@ void traningAgent ( int numberRun, int numberStep, float *****Q) {// pour avoir
             line.reward=0; 
             actionStack(stack,line);
             moveAgent(agent, action);
-            printf("wtf%d \n ",i);
 
         }
-        action = takeAction(agent->x,agent->y,Q,canonZone,zoneAngleH,zoneAngleF,greedy); 
+        action = takeAction(agent->x, agent->y,Q,canonZone,zoneAngleH,zoneAngleF,greedy); 
         agentZone = convertIntoZone(agent->x, agent->y); 
         line.receiverZone=agentZone; 
         line.shooterZone =canonZone; 
         line.angleHZone= zoneAngleH; 
         line.angleFZone= zoneAngleF; 
         line.action= action;
-        line.reward=0; 
-        actionStack(stack,line);
-        moveAgent(agent, action);
+        
         if (agentZone==dropZone){ 
-                    reward=1; 
+                    line.reward=1; 
                 }
-                else{reward= 0;}
-
-        printf("wtf2\n ");
+                else{line.reward= 0;}
+        actionStack(stack,line);
+        moveAgent(agent, action);
 
         Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][line.action] +=  
-                    + greedy* ( reward- Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][line.action] );
+                    + LEARN_RATE* ( line.reward - Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][line.action] );
        
         while (!emptyStack(stack)){
             maxAction= argmax(Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone]);
@@ -403,12 +401,13 @@ void traningAgent ( int numberRun, int numberStep, float *****Q) {// pour avoir
             line=unStack(stack);
 
             Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][line.action] +=  
-                    + greedy* ( reward + LEARN_RATE *Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][maxAction]
+                    + LEARN_RATE* ( reward +  DISCOUNT*Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][maxAction]
                     - Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][line.action] );
         }  
-        printf("wtf1 \n ");
+        //printf("is empty : %d\n ", emptyStack(stack));
         numberRun--; 
         greedy=greedy-1/((float)numberRun);
-        printf("wtf1 \n ");
+
+        if ( numberRun%1000000==1){printf (" %d \n  ", numberRun);} 
     } 
 } 
\ No newline at end of file
diff --git a/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.h b/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.h
index 5d4fc87..5714908 100644
--- a/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.h
+++ b/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.h
@@ -10,7 +10,11 @@
 
 #define M_PI 3.14159265358979323846
 
-#define LEARN_RATE 0.9
+#define LEARN_RATE 0.8
+#define DISCOUNT 0.5
+
+
+
 
 #define NUMBER_ZONE_SHOOTER 4
 #define NUMBER_ZONE_RECEIVER 4
-- 
GitLab