From 247f646f02728477627274a883348f1d4e734d09 Mon Sep 17 00:00:00 2001 From: maberet <maberet@ada.local.isima.fr> Date: Wed, 29 Jun 2022 21:09:44 +0200 Subject: [PATCH] correction formules qlearn --- .../jeu_appren_par_renfo/src/main.c | 7 ++-- .../jeu_appren_par_renfo/src/qlearn.c | 33 +++++++++---------- .../jeu_appren_par_renfo/src/qlearn.h | 6 +++- 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/travail_de_groupe/jeu_appren_par_renfo/src/main.c b/travail_de_groupe/jeu_appren_par_renfo/src/main.c index 94fa800..bd35d29 100644 --- a/travail_de_groupe/jeu_appren_par_renfo/src/main.c +++ b/travail_de_groupe/jeu_appren_par_renfo/src/main.c @@ -6,12 +6,13 @@ int game_state; int main(){ float ***** Q = allocateAndInitiateQ(); + int i = 10000; srand ( time(NULL)); - - traningAgent(1000000, 15, Q); + while (i>0){ + traningAgent(10000000, 40, Q); + i--;} - writeQ(Q); // running = 1; // game_state = GAME; diff --git a/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.c b/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.c index 38fce48..d5a9aa7 100644 --- a/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.c +++ b/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.c @@ -115,15 +115,15 @@ int argmax(float * arr){ int i; float max = arr[0]; int maxIndex = 0; - printf("argmax: %f ", arr[0]); + //printf("argmax: %f ", arr[0]); for(i = 1; i < NUMBER_ACTION; i++){ - printf("%f ", arr[i]); + //printf("%f ", arr[i]); if (arr[i] > max){ max = arr[i]; maxIndex = i; } } - printf("\n"); + //printf("\n"); return maxIndex; } @@ -361,7 +361,8 @@ void traningAgent ( int numberRun, int numberStep, float *****Q) {// pour avoir zoneAngleH=converterIntoAngleH(angleH); dropZone=convertIntoZone(dropPoint.x,dropPoint.y); canonZone= convertIntoZoneCanon(canon.x,canon.y); - reward=0; + reward=0; + //printf("%d %d %d %d \n",dropZone, canonZone,zoneAngleH,zoneAngleF); for (i=0; i<numberStep-1;i++){ action = takeAction(agent->x,agent->y,Q,canonZone,zoneAngleH,zoneAngleF,greedy); @@ -374,28 +375,25 @@ void traningAgent ( int numberRun, int numberStep, float *****Q) {// pour avoir line.reward=0; actionStack(stack,line); moveAgent(agent, action); - printf("wtf%d \n ",i); } - action = takeAction(agent->x,agent->y,Q,canonZone,zoneAngleH,zoneAngleF,greedy); + action = takeAction(agent->x, agent->y,Q,canonZone,zoneAngleH,zoneAngleF,greedy); agentZone = convertIntoZone(agent->x, agent->y); line.receiverZone=agentZone; line.shooterZone =canonZone; line.angleHZone= zoneAngleH; line.angleFZone= zoneAngleF; line.action= action; - line.reward=0; - actionStack(stack,line); - moveAgent(agent, action); + if (agentZone==dropZone){ - reward=1; + line.reward=1; } - else{reward= 0;} - - printf("wtf2\n "); + else{line.reward= 0;} + actionStack(stack,line); + moveAgent(agent, action); Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][line.action] += - + greedy* ( reward- Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][line.action] ); + + LEARN_RATE* ( line.reward - Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][line.action] ); while (!emptyStack(stack)){ maxAction= argmax(Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone]); @@ -403,12 +401,13 @@ void traningAgent ( int numberRun, int numberStep, float *****Q) {// pour avoir line=unStack(stack); Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][line.action] += - + greedy* ( reward + LEARN_RATE *Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][maxAction] + + LEARN_RATE* ( reward + DISCOUNT*Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][maxAction] - Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][line.action] ); } - printf("wtf1 \n "); + //printf("is empty : %d\n ", emptyStack(stack)); numberRun--; greedy=greedy-1/((float)numberRun); - printf("wtf1 \n "); + + if ( numberRun%1000000==1){printf (" %d \n ", numberRun);} } } \ No newline at end of file diff --git a/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.h b/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.h index 5d4fc87..5714908 100644 --- a/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.h +++ b/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.h @@ -10,7 +10,11 @@ #define M_PI 3.14159265358979323846 -#define LEARN_RATE 0.9 +#define LEARN_RATE 0.8 +#define DISCOUNT 0.5 + + + #define NUMBER_ZONE_SHOOTER 4 #define NUMBER_ZONE_RECEIVER 4 -- GitLab