diff --git a/travail_de_groupe/jeu_appren_par_renfo/src/main.c b/travail_de_groupe/jeu_appren_par_renfo/src/main.c index 94fa8008c72fe413115db2e2bbe80174443cce69..bd35d299cd42f7033c93d79c3a548ae53dc68bdb 100644 --- a/travail_de_groupe/jeu_appren_par_renfo/src/main.c +++ b/travail_de_groupe/jeu_appren_par_renfo/src/main.c @@ -6,12 +6,13 @@ int game_state; int main(){ float ***** Q = allocateAndInitiateQ(); + int i = 10000; srand ( time(NULL)); - - traningAgent(1000000, 15, Q); + while (i>0){ + traningAgent(10000000, 40, Q); + i--;} - writeQ(Q); // running = 1; // game_state = GAME; diff --git a/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.c b/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.c index 38fce48ff63247084ab18bdcb4034cc993ec67c6..d5a9aa7a73c097e9a7e280cf6a0c263129d4f847 100644 --- a/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.c +++ b/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.c @@ -115,15 +115,15 @@ int argmax(float * arr){ int i; float max = arr[0]; int maxIndex = 0; - printf("argmax: %f ", arr[0]); + //printf("argmax: %f ", arr[0]); for(i = 1; i < NUMBER_ACTION; i++){ - printf("%f ", arr[i]); + //printf("%f ", arr[i]); if (arr[i] > max){ max = arr[i]; maxIndex = i; } } - printf("\n"); + //printf("\n"); return maxIndex; } @@ -361,7 +361,8 @@ void traningAgent ( int numberRun, int numberStep, float *****Q) {// pour avoir zoneAngleH=converterIntoAngleH(angleH); dropZone=convertIntoZone(dropPoint.x,dropPoint.y); canonZone= convertIntoZoneCanon(canon.x,canon.y); - reward=0; + reward=0; + //printf("%d %d %d %d \n",dropZone, canonZone,zoneAngleH,zoneAngleF); for (i=0; i<numberStep-1;i++){ action = takeAction(agent->x,agent->y,Q,canonZone,zoneAngleH,zoneAngleF,greedy); @@ -374,28 +375,25 @@ void traningAgent ( int numberRun, int numberStep, float *****Q) {// pour avoir line.reward=0; actionStack(stack,line); moveAgent(agent, action); - printf("wtf%d \n ",i); } - action = takeAction(agent->x,agent->y,Q,canonZone,zoneAngleH,zoneAngleF,greedy); + action = takeAction(agent->x, agent->y,Q,canonZone,zoneAngleH,zoneAngleF,greedy); agentZone = convertIntoZone(agent->x, agent->y); line.receiverZone=agentZone; line.shooterZone =canonZone; line.angleHZone= zoneAngleH; line.angleFZone= zoneAngleF; line.action= action; - line.reward=0; - actionStack(stack,line); - moveAgent(agent, action); + if (agentZone==dropZone){ - reward=1; + line.reward=1; } - else{reward= 0;} - - printf("wtf2\n "); + else{line.reward= 0;} + actionStack(stack,line); + moveAgent(agent, action); Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][line.action] += - + greedy* ( reward- Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][line.action] ); + + LEARN_RATE* ( line.reward - Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][line.action] ); while (!emptyStack(stack)){ maxAction= argmax(Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone]); @@ -403,12 +401,13 @@ void traningAgent ( int numberRun, int numberStep, float *****Q) {// pour avoir line=unStack(stack); Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][line.action] += - + greedy* ( reward + LEARN_RATE *Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][maxAction] + + LEARN_RATE* ( reward + DISCOUNT*Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][maxAction] - Q[line.receiverZone][line.shooterZone][line.angleHZone][line.angleFZone][line.action] ); } - printf("wtf1 \n "); + //printf("is empty : %d\n ", emptyStack(stack)); numberRun--; greedy=greedy-1/((float)numberRun); - printf("wtf1 \n "); + + if ( numberRun%1000000==1){printf (" %d \n ", numberRun);} } } \ No newline at end of file diff --git a/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.h b/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.h index 5d4fc877bd5cb18eb4850e341d3626e5362c55f6..5714908ce41888d61cf0fb6a280683bc3df6f8ff 100644 --- a/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.h +++ b/travail_de_groupe/jeu_appren_par_renfo/src/qlearn.h @@ -10,7 +10,11 @@ #define M_PI 3.14159265358979323846 -#define LEARN_RATE 0.9 +#define LEARN_RATE 0.8 +#define DISCOUNT 0.5 + + + #define NUMBER_ZONE_SHOOTER 4 #define NUMBER_ZONE_RECEIVER 4