\u5f3a\u5316\u5b66\u4e60\u57fa\u672c\u4ecb\u7ecd<\/h2>\n
\u5f3a\u5316\u5b66\u4e60\u662f\u4e00\u79cd\u4e0d\u540c\u4e8e\u76d1\u7763\u5b66\u4e60\u548c\u65e0\u76d1\u7763\u5b66\u4e60\u7684\u5728\u7ebf\u5b66\u4e60\u6280\u672f,\u57fa\u672c\u6a21\u578b\u56fe\u4e00\u6240\u793a\u3002\u5b83\u628a\u5b66\u4e60\u770b\u4f5c\u662f\u4e00\u4e2a\u201c\u8bd5\u63a2\u4e00\u8bc4\u4ef7\u201d\u7684\u8fc7\u7a0b,\u9996\u5148\u5b66\u4e60\u7cfb\u7edf\u79f0\u4e3a\u667a\u80fd\u4f53\u611f\u77e5\u73af\u5883\u72b6\u6001,\u91c7\u53d6\u67d0\u4e00\u4e2a\u52a8\u4f5c\u4f5c\u7528\u4e8e\u73af\u5883,\u73af\u5883\u63a5\u53d7\u8be5\u52a8\u4f5c\u540e\u72b6\u6001\u53d1\u751f\u53d8\u5316,\u540c\u65f6\u7ed9\u51fa\u4e00\u4e2a\u56de\u62a5\u5956\u52b1\u6216\u60e9\u7f5a\u53cd\u9988\u7ed9\u5f3a\u5316\u5b66\u4e60\u7cfb\u7edf,\u5f3a\u5316\u5b66\u7cfb\u7edf\u6839\u636e\u5f3a\u5316\u4fe1\u53f7\u548c\u73af\u5883\u7684\u5f53\u524d\u72b6\u6001\u518d\u9009\u62e9\u4e0b\u4e00\u4e2a\u52a8\u4f5c,\u9009\u62e9\u7684\u539f\u5219\u662f\u4f7f\u53d7\u5230\u518d\u52b1\u7684\u6982\u7387\u589e\u5927\u3002<\/span><\/p>\n<\/p>\n \u667a\u80fd\u4f53\u5728\u548c\u73af\u5883\u4ea4\u4e92\u65f6,\u5728\u6bcf\u4e00\u65f6\u523b\u4f1a\u53d1\u751f\u5982\u4e0b\u4e8b\u4ef6\u5e8f\u5217 1.\u8bc4\u4ef7\u51fd\u6570<\/span><\/p>\n \u667a\u80fd\u4f53\u7684\u5b66\u4e60\u76ee\u6807\u662f\u6700\u5927\u5316\u672a\u6765\u56de\u62a5\u7684\u7d2f\u79ef\u503c\u3002\u8bc4\u4ef7\u51fd\u6570,\u662f\u5bf9\u957f\u671f\u56de\u62a5\u7684\u4e00\u79cd\u91cf\u5ea6,\u6709\u4e09\u79cd\u8fd4\u56de\u8868\u8fbe\u5f0f\u3002 2\uff09\u6298\u6263\u56de\u62a5\u65e0\u9650\u8303\u56f4\u6a21\u578b\u5b83\u662f\u5728\u65e0\u9650\u7684\u9636\u6bb5\u5185\u5bf9\u56de\u62a5\u7684\u7d2f\u79ef\u3002<\/span><\/p>\n<\/p>\n 
\u03b3\u662f\u6298\u6263\u56e0\u5b50,\u901a\u5e380\u2264\u03b3\uff1c1\u3002\u901a\u8fc7\u8c03\u8282,\u53ef\u4ee5\u63a7\u5236\u5b66\u4e60\u7cfb\u7edf\u5bf9\u5b83\u81ea\u5df1\u884c\u52a8\u7684\u77ed\u671f\u548c\u957f\u671f\u7ed3\u679c\u8003\u8651\u7684\u7a0b\u5ea6\u3002\u5728\u6781\u7aef\u60c5\u51b5,\u5f53\u03b3=0\u65f6\u7cfb\u7edf\u662f\u77ed\u89c6\u7684,\u5b83\u53ea\u8003\u8651\u884c\u52a8\u7684\u5f53\u524d\u7ed3\u679c\u3002\u5f53\u03b3\u63a5\u8fd11\u65f6,\u672a\u6765\u7684\u56de\u62a5\u5728\u91c7\u53d6\u6700\u4f18\u884c\u52a8\u65f6\u53d8\u5f97\u66f4\u4e3a\u91cd\u8981.<\/span><\/p>\n 3\uff09\u5e73\u5747\u56de\u62a5\u6a21\u578b\u4e00\u91c7\u7528\u7b2c\u4e09\u79cd\u6807\u51c6,\u7d2f\u8ba1\u672a\u6765\u56de\u62a5\u7684\u5e73\u5747\u503c,\u6807\u51c6\u4e3a<\/span><\/p>\n<\/p>\n \u4e0a\u9762\u4e09\u79cd\u56de\u62a5\u8868\u8fbe\u5f0f,\u4f7f\u7528\u6700\u591a\u7684\u662f\u6298\u6263\u56de\u62a5\u6307\u6807\u3002<\/span><\/p>\n \u9a6c\u5c14\u53ef\u592b\u51b3\u7b56\u8fc7\u7a0b\uff08MDP\uff09<\/strong><\/span><\/p>\n \u5927\u5bb6\u5e94\u8be5\u8fd8\u8bb0\u5f97\u9a6c\u5c14\u79d1\u592b\u94fe(Markov Chain)\uff0c\u4e86\u89e3\u673a\u5668\u5b66\u4e60\u7684\u4e5f\u90fd\u77e5\u9053\u9690\u9a6c\u5c14\u53ef\u592b\u6a21\u578b(Hidden Markov Model\uff0cHMM)\u3002\u5b83\u4eec\u5177\u6709\u7684\u4e00\u4e2a\u5171\u540c\u6027\u8d28\u5c31\u662f\u9a6c\u5c14\u53ef\u592b\u6027(\u65e0\u540e\u6548\u6027)\uff0c\u4e5f\u5c31\u662f\u6307\u7cfb\u7edf\u7684\u4e0b\u4e2a\u72b6\u6001\u53ea\u4e0e\u5f53\u524d\u72b6\u6001\u4fe1\u606f\u6709\u5173\uff0c\u800c\u4e0e\u66f4\u65e9\u4e4b\u524d\u7684\u72b6\u6001\u65e0\u5173\u3002<\/span><\/p>\n \u9a6c\u5c14\u53ef\u592b\u51b3\u7b56\u8fc7\u7a0b(Markov Decision Process, 
MDP)\u4e5f\u5177\u6709\u9a6c\u5c14\u53ef\u592b\u6027\uff0c\u4e0e\u4e0a\u9762\u4e0d\u540c\u7684\u662fMDP\u8003\u8651\u4e86\u52a8\u4f5c\uff0c\u5373\u7cfb\u7edf\u4e0b\u4e2a\u72b6\u6001\u4e0d\u4ec5\u548c\u5f53\u524d\u7684\u72b6\u6001\u6709\u5173\uff0c\u4e5f\u548c\u5f53\u524d\u91c7\u53d6\u7684\u52a8\u4f5c\u6709\u5173\u3002\u8fd8\u662f\u4e3e\u4e0b\u68cb\u7684\u4f8b\u5b50\uff0c\u5f53\u6211\u4eec\u5728\u67d0\u4e2a\u5c40\u9762\uff08\u72b6\u6001s\uff09\u8d70\u4e86\u4e00\u6b65(\u52a8\u4f5ca)\uff0c\u8fd9\u65f6\u5bf9\u624b\u7684\u9009\u62e9\uff08\u5bfc\u81f4\u4e0b\u4e2a\u72b6\u6001s\u2019\uff09\u6211\u4eec\u662f\u4e0d\u80fd\u786e\u5b9a\u7684\uff0c\u4f46\u662f\u4ed6\u7684\u9009\u62e9\u53ea\u548cs\u548ca\u6709\u5173\uff0c\u800c\u4e0d\u7528\u8003\u8651\u66f4\u65e9\u4e4b\u524d\u7684\u72b6\u6001\u548c\u52a8\u4f5c\uff0c\u5373s\u2019\u662f\u6839\u636es\u548ca\u968f\u673a\u751f\u6210\u7684\u3002<\/span><\/p>\n \u6211\u4eec\u7528\u4e00\u4e2a\u4e8c\u7ef4\u8868\u683c\u8868\u793a\u4e00\u4e0b\uff0c\u5404\u79cd\u9a6c\u5c14\u53ef\u592b\u5b50\u6a21\u578b\u7684\u5173\u7cfb\u5c31\u5f88\u6e05\u695a\u4e86\uff1a<\/span><\/p>\n <\/p>\n \u4e00\u4e2a\u9a6c\u5c14\u53ef\u592b\u51b3\u7b56\u8fc7\u7a0b\u7531\u4e00\u4e2a\u56db\u5143\u7ec4\u6784\u6210M = (S, A, Psa<\/sub>, ) <\/span><\/p>\n MDP \u7684\u52a8\u6001\u8fc7\u7a0b\u5982\u4e0b\uff1a\u67d0\u4e2a\u667a\u80fd\u4f53(agent)\u7684\u521d\u59cb\u72b6\u6001\u4e3as0<\/sub>\uff0c\u7136\u540e\u4ece A \u4e2d\u6311\u9009\u4e00\u4e2a\u52a8\u4f5ca0<\/sub>\u6267\u884c\uff0c\u6267\u884c\u540e\uff0cagent \u6309Psa<\/sub>\u6982\u7387\u968f\u673a\u8f6c\u79fb\u5230\u4e86\u4e0b\u4e00\u4e2as1<\/sub>\u72b6\u6001\uff0cs1<\/sub>\u2208 Ps<\/sub>0a<\/sub>0<\/sub>\u3002\u7136\u540e\u518d\u6267\u884c\u4e00\u4e2a\u52a8\u4f5ca1<\/sub>\uff0c\u5c31\u8f6c\u79fb\u5230\u4e86s2<\/sub>\uff0c\u63a5\u4e0b\u6765\u518d\u6267\u884ca2<\/sub>\u2026\uff0c\u6211\u4eec\u53ef\u4ee5\u7528\u4e0b\u9762\u7684\u56fe\u8868\u793a\u72b6\u6001\u8f6c\u79fb\u7684\u8fc7\u7a0b\u3002<\/span><\/p>\n 
<\/span><\/p>\n \u5982\u679c\u56de\u62a5r\u662f\u6839\u636e\u72b6\u6001s\u548c\u52a8\u4f5ca\u5f97\u5230\u7684\uff0c\u5219MDP\u8fd8\u53ef\u4ee5\u8868\u793a\u6210\u4e0b\u56fe\uff1a<\/span><\/p>\n <\/span><\/p>\n \u9a6c\u5c14\u79d1\u592b\u51b3\u7b56\u95ee\u9898\u7684\u76ee\u7684\u662f\u5bfb\u6c42\u4e00\u4e2a\u6700\u4f18\u7b56\u7565,\u5373\u4f7f\u8bc4\u4ef7\u51fd\u6570\u6700\u5927\u5316\u7684\u4e00\u7cfb\u5217\u52a8\u4f5c\u3002\u5bf9\u4e8e\u6bcf\u4e00\u65f6\u523b\u7684\u72b6\u6001s(t),\u667a\u80fd\u4f53\u5747\u4f1a\u901a\u8fc7\u6700\u4f18\u7b56\u7565\u03c0\u9009\u53d6\u9002\u5f53\u7684\u52a8\u4f5c\u3002<\/span><\/p>\n \u800c\u589e\u5f3a\u5b66\u4e60\u7684\u76ee\u7684\u5c31\u662f\u6c42\u89e3\u9a6c\u5c14\u53ef\u592b\u51b3\u7b56\u8fc7\u7a0b(MDP)\u7684\u6700\u4f18\u7b56\u7565\u3002<\/span><\/p>\n \u90a3\u4e48\u5982\u4f55\u6c42\u89e3\u6700\u4f18\u7b56\u7565\u5462\uff1f\u57fa\u672c\u7684\u89e3\u6cd5\u6709\u4e09\u79cd\uff1a<\/span><\/p>\n 1.\u52a8\u6001\u89c4\u5212\u6cd5(dynamic programming methods)<\/span><\/p>\n 2.\u8499\u7279\u5361\u7f57\u65b9\u6cd5(Monte Carlo methods)<\/span><\/p>\n 3.\u65f6\u95f4\u5dee\u5206\u6cd5(temporal difference)\u3002<\/span><\/p>\n \u52a8\u6001\u89c4\u5212\u6cd5\u662f\u5176\u4e2d\u6700\u57fa\u672c\u7684\u7b97\u6cd5\u3002<\/span><\/p>\n 
Q\u4e00\u5b66\u4e60\u662f\u5f3a\u5316\u5b66\u4e60\u7684\u4e3b\u8981\u7b97\u6cd5\u4e4b\u4e00\uff0c\u662f\u4e00\u79cd\u65e0\u6a21\u578b\u7684\u5b66\u4e60\u65b9\u6cd5\uff0c\u5b83\u63d0\u4f9b\u667a\u80fd\u7cfb\u7edf\u5728\u9a6c\u5c14\u53ef\u592b\u73af\u5883\u4e2d\u5229\u7528\u7ecf\u5386\u7684\u52a8\u4f5c\u5e8f\u5217\u9009\u62e9\u6700\u4f18\u52a8\u4f5c\u7684\u4e00\u79cd\u5b66\u4e60\u80fd\u529b\u3002Q-\u5b66\u4e60\u57fa\u4e8e\u7684\u4e00\u4e2a\u5173\u952e\u5047\u8bbe\u662f\u667a\u80fd\u4f53\u548c\u73af\u5883\u7684\u4ea4\u4e92\u53ef\u770b\u4f5c\u4e3a\u4e00\u4e2aMarkov\u51b3\u7b56\u8fc7\u7a0b(MDP)\uff0c\u5373\u667a\u80fd\u4f53\u5f53\u524d\u6240\u5904\u7684\u72b6\u6001\u548c\u6240\u9009\u62e9\u7684\u52a8\u4f5c\uff0c\u51b3\u5b9a\u4e00\u4e2a\u56fa\u5b9a\u7684\u72b6\u6001\u8f6c\u79fb\u6982\u7387\u5206\u5e03\u3001\u4e0b\u4e00\u4e2a\u72b6\u6001\u3001\u5e76\u5f97\u5230\u4e00\u4e2a\u5373\u65f6\u56de\u62a5\u3002Q-\u5b66\u4e60\u7684\u76ee\u6807\u662f\u5bfb\u627e\u4e00\u4e2a\u7b56\u7565\u53ef\u4ee5\u6700\u5927\u5316\u5c06\u6765\u83b7\u5f97\u7684\u62a5\u916c\u3002<\/span><\/p>\n Q-Learning\u662f\u4e00\u9879\u65e0\u6a21\u578b\u7684\u589e\u5f3a\u5b66\u4e60\u6280\u672f\uff0c\u5b83\u53ef\u4ee5\u5728MDP\u95ee\u9898\u4e2d\u5bfb\u627e\u4e00\u4e2a\u6700\u4f18\u7684\u52a8\u4f5c\u9009\u62e9\u7b56\u7565\u3002\u5b83\u901a\u8fc7\u4e00\u4e2a\u52a8\u4f5c-\u4ef7\u503c\u51fd\u6570\u6765\u8fdb\u884c\u5b66\u4e60\uff0c\u5e76\u4e14\u6700\u7ec8\u80fd\u591f\u6839\u636e\u5f53\u524d\u72b6\u6001\u53ca\u6700\u4f18\u7b56\u7565\u7ed9\u51fa\u671f\u671b\u7684\u52a8\u4f5c\u3002\u5b83\u7684\u4e00\u4e2a\u4f18\u70b9\u5c31\u662f\u5b83\u4e0d\u9700\u8981\u77e5\u9053\u67d0\u4e2a\u73af\u5883\u7684\u6a21\u578b\u4e5f\u53ef\u4ee5\u5bf9\u52a8\u4f5c\u8fdb\u884c\u671f\u671b\u503c\u6bd4\u8f83\uff0c\u8fd9\u5c31\u662f\u4e3a\u4ec0\u4e48\u5b83\u88ab\u79f0\u4f5c\u65e0\u6a21\u578b\u7684\u3002<\/span><\/p>\n 
Q-learning\u4e2d\uff0c\u6bcf\u4e2aQ(s\uff0ca)\u5bf9\u5e94\u4e00\u4e2a\u76f8\u5e94\u7684Q\u503c\uff0c\u5728\u5b66\u4e60\u8fc7\u7a0b\u4e2d\u6839\u636eQ\u503c\uff0c\u9009\u62e9\u52a8\u4f5c\u3002Q\u503c\u7684\u5b9a\u4e49\u662f\u5982\u679c\u6267\u884c\u5f53\u524d\u76f8\u5173\u7684\u52a8\u4f5c\u5e76\u4e14\u6309\u7167\u67d0\u4e00\u4e2a\u7b56\u7565\u6267\u884c\u4e0b\u53bb\uff0c\u5c06\u5f97\u5230\u7684\u56de\u62a5\u7684\u603b\u548c\u3002\u6700\u4f18Q\u503c\u53ef\u8868\u793a\u4e3aQ*\uff0c\u5176\u5b9a\u4e49\u662f\u6267\u884c\u76f8\u5173\u7684\u52a8\u4f5c\u5e76\u6309\u7167\u6700\u4f18\u7b56\u7565\u6267\u884c\u4e0b\u53bb\uff0c\u5c06\u5f97\u5230\u7684\u56de\u62a5\u7684\u603b\u548c\uff0c\u5176\u5b9a\u4e49\u5982\u4e0b\uff1a<\/span><\/p>\n $$Q^*(s,a) = r(s,a) + \\gamma \\sum_{s' \\in S} T(s,a,s') \\max_{a'} Q^*(s',a')$$<\/p>\n \u5176\u4e2d\uff1aS\u8868\u793a\u72b6\u6001\u96c6\uff0cA\u8868\u793a\u52a8\u4f5c\u96c6\uff0cT(s\uff0ca\uff0cs\u2019)\u8868\u793a\u5728\u72b6\u6001s\u4e0b\u6267\u884c\u52a8\u4f5ca\uff0c\u8f6c\u6362\u5230\u72b6\u6001s\u2019\u7684\u6982\u7387\uff0cr(s,a)\u8868\u793a\u5728\u72b6\u6001s\u4e0b\u6267\u884c\u52a8\u4f5ca\u5c06\u5f97\u5230\u7684\u56de\u62a5\uff0c\u03b3\u8868\u793a\u6298\u6263\u56e0\u5b50\uff0c\u51b3\u5b9a\u65f6\u95f4\u7684\u8fdc\u8fd1\u5bf9\u56de\u62a5\u7684\u5f71\u54cd\u7a0b\u5ea6\u3002<\/span><\/p>\n 
\u667a\u80fd\u4f53\u7684\u6bcf\u4e00\u6b21\u5b66\u4e60\u8fc7\u7a0b\u53ef\u4ee5\u770b\u4f5c\u662f\u4ece\u4e00\u4e2a\u968f\u673a\u72b6\u6001\u5f00\u59cb\uff0c\u91c7\u7528\u4e00\u4e2a\u7b56\u7565\u6765\u9009\u62e9\u52a8\u4f5c\uff0c\u5982\u03b5\u8d2a\u5a6a\u7b56\u7565\u6216Boltzamann\u5206\u5e03\u7b56\u7565\u3002\u91c7\u7528\u968f\u673a\u7b56\u7565\u662f\u4e3a\u4e86\u4fdd\u8bc1\u667a\u80fd\u4f53\u80fd\u591f\u641c\u7d22\u6240\u6709\u53ef\u80fd\u7684\u52a8\u4f5c\uff0c\u5bf9\u6bcf\u4e2aQ(s\uff0ca)\u8fdb\u884c\u66f4\u65b0\u3002\u667a\u80fd\u4f53\u5728\u6267\u884c\u5b8c\u6240\u9009\u7684\u52a8\u4f5c\u540e\uff0c\u89c2\u5bdf\u65b0\u7684\u72b6\u6001\u548c\u56de\u62a5\uff0c\u7136\u540e\u6839\u636e\u65b0\u72b6\u6001\u7684\u6700\u5927Q\u503c\u548c\u56de\u62a5\u6765\u66f4\u65b0\u4e0a\u4e00\u4e2a\u72b6\u6001\u548c\u52a8\u4f5c\u7684Q\u503c\u3002\u667a\u80fd\u4f53\u5c06\u4e0d\u65ad\u6839\u636e\u65b0\u7684\u72b6\u6001\u9009\u62e9\u52a8\u4f5c\uff0c\u76f4\u81f3\u5230\u8fbe\u4e00\u4e2a\u7ec8\u6b62\u72b6\u6001\u3002\u4e0b\u9762\u7ed9\u51faQ\u2014learning\u7b97\u6cd5\u7684\u63cf\u8ff0\uff1a<\/span><\/p>\n <\/p>\n \u6bcf\u6b21\u66f4\u65b0\u6211\u4eec\u90fd\u7528\u5230\u4e86 Q \u73b0\u5b9e\u548c Q \u4f30\u8ba1, \u800c\u4e14 Q-learning \u7684\u8ff7\u4eba\u4e4b\u5904\u5c31\u662f \u5728 Q(s1, a2) \u73b0\u5b9e \u4e2d, \u4e5f\u5305\u542b\u4e86\u4e00\u4e2a Q(s2) \u7684\u6700\u5927\u4f30\u8ba1\u503c, \u5c06\u5bf9\u4e0b\u4e00\u6b65\u7684\u8870\u51cf\u7684\u6700\u5927\u4f30\u8ba1\u548c\u5f53\u524d\u6240\u5f97\u5230\u7684\u5956\u52b1\u5f53\u6210\u8fd9\u4e00\u6b65\u7684\u73b0\u5b9e, \u5f88\u5947\u5999\u5427. \u6700\u540e\u6211\u4eec\u6765\u8bf4\u8bf4\u8fd9\u5957\u7b97\u6cd5\u4e2d\u4e00\u4e9b\u53c2\u6570\u7684\u610f\u4e49. 
\u03b5 greedy \u662f\u7528\u5728\u51b3\u7b56\u4e0a\u7684\u4e00\u79cd\u7b56\u7565, \u6bd4\u5982 \u03b5= 0.9 \u65f6, \u5c31\u8bf4\u660e\u670990% \u7684\u60c5\u51b5\u6211\u4f1a\u6309\u7167 Q \u8868\u7684\u6700\u4f18\u503c\u9009\u62e9\u884c\u4e3a, 10% \u7684\u65f6\u95f4\u4f7f\u7528\u968f\u673a\u9009\u884c\u4e3a. \u03b1\u662f\u5b66\u4e60\u7387, \u6765\u51b3\u5b9a\u8fd9\u6b21\u7684\u8bef\u5dee\u6709\u591a\u5c11\u662f\u8981\u88ab\u5b66\u4e60\u7684, \u03b1\u662f\u4e00\u4e2a\u5c0f\u4e8e1 \u7684\u6570. \u03b3\u662f\u5bf9\u672a\u6765 reward \u7684\u8870\u51cf\u503c. \u6211\u4eec\u53ef\u4ee5\u8fd9\u6837\u60f3\u8c61.<\/span><\/p>\n Q-learning \u662f\u4e00\u4e2a off-policy \u7684\u7b97\u6cd5, \u56e0\u4e3a\u91cc\u9762\u7684 max action \u8ba9 Q table \u7684\u66f4\u65b0\u53ef\u4ee5\u4e0d\u57fa\u4e8e\u6b63\u5728\u7ecf\u5386\u7684\u7ecf\u9a8c(\u53ef\u4ee5\u662f\u73b0\u5728\u5b66\u4e60\u7740\u5f88\u4e45\u4ee5\u524d\u7684\u7ecf\u9a8c,\u751a\u81f3\u662f\u5b66\u4e60\u4ed6\u4eba\u7684\u7ecf\u9a8c).<\/span><\/p>\n On-policy \u4e0e off-policy \u672c\u8d28\u533a\u522b\u5728\u4e8e\uff1a\u66f4\u65b0Q\u503c\u65f6\u6240\u4f7f\u7528\u7684\u65b9\u6cd5\u662f\u6cbf\u7528\u65e2\u5b9a\u7684\u7b56\u7565\uff08on-policy\uff09\u8fd8\u662f\u4f7f\u7528\u65b0\u7b56\u7565\uff08off-policy\uff09<\/span><\/p>\n<\/p>\n \u4ee5\u540e\u8981\u8bb2\u7684Sarsa\u7b97\u6cd5\u662fon-policy\u7684\u3002\u4e5f\u5c31\u662f\u8bf4\uff0cSarsa\u7b97\u6cd5\u5728\u66f4\u65b0Q\u8868\u7684\u65f6\u5019\u6240\u9075\u5faa\u7684\u7b56\u7565\u4e0e\u5f53\u524d\u7b56\u7565\u4e00\u81f4\u3002<\/span><\/p>\n \u8ba9\u5c0f\u9e1f\u5b66\u4e60\u600e\u4e48\u98de\u662f\u4e00\u4e2a\u5f3a\u5316\u5b66\u4e60<\/strong>\uff08reinforcement 
learning\uff09\u7684\u8fc7\u7a0b\uff0c\u5f3a\u5316\u5b66\u4e60\u4e2d\u6709\u72b6\u6001<\/strong>(state)\u3001\u52a8\u4f5c<\/strong>(action)\u3001\u5956\u8d4f<\/strong>(reward)\u8fd9\u4e09\u4e2a\u8981\u7d20\u3002\u667a\u80fd\u4f53\uff08Agent\uff0c\u5728\u8fd9\u91cc\u5c31\u662f\u6307\u6211\u4eec\u806a\u660e\u7684\u5c0f\u9e1f\uff09\u9700\u8981\u6839\u636e\u5f53\u524d\u72b6\u6001\u6765\u91c7\u53d6\u52a8\u4f5c\uff0c\u83b7\u5f97\u76f8\u5e94\u7684\u5956\u8d4f\u4e4b\u540e\uff0c\u518d\u53bb\u6539\u8fdb\u8fd9\u4e9b\u52a8\u4f5c\uff0c\u4f7f\u5f97\u4e0b\u6b21\u518d\u5230\u76f8\u540c\u72b6\u6001\u65f6\uff0c\u667a\u80fd\u4f53\u80fd\u505a\u51fa\u66f4\u4f18\u7684\u52a8\u4f5c\u3002<\/span><\/p>\n \u72b6\u6001\u7684\u9009\u62e9<\/strong> \u5728\u8fd9\u4e2a\u95ee\u9898\u4e2d\uff0c\u72b6\u6001\u7684\u63d0\u53d6\u65b9\u5f0f\u53ef\u4ee5\u6709\u5f88\u591a\u79cd\uff1a\u6bd4\u5982\u8bf4\u53d6\u6574\u4e2a\u6e38\u620f\u753b\u9762\u505a\u56fe\u50cf\u5904\u7406\u554a\uff0c\u6216\u662f\u6839\u636e\u5c0f\u9e1f\u7684\u9ad8\u5ea6\u548c\u7ba1\u5b50\u7684\u8ddd\u79bb\u554a\u3002\u5728\u8fd9\u91cc\u9009\u7528\u7684\u662f\u8ddf<\/span>SarvagyaVaish\u9879\u76ee\u76f8\u540c\u7684\u72b6\u6001\u63d0\u53d6\u65b9\u5f0f\uff0c\u5373\u53d6\u5c0f\u9e1f\u5230\u4e0b\u4e00\u6839\u4e0b\u4fa7\u7ba1\u5b50\u7684\u6c34\u5e73\u8ddd\u79bb\u548c\u5782\u76f4\u8ddd\u79bb\u5dee\u4f5c\u4e3a\u5c0f\u9e1f\u7684\u72b6\u6001\uff1a<\/span><\/p>\n <\/span><\/p>\n \uff08\u56fe\u7247\u6765\u81ea<\/span>Flappy Bird RL by SarvagyaVaish\uff09<\/span><\/p>\n \u8bb0\u8fd9\u4e2a\u72b6\u6001\u4e3a\uff0c\u4e3a\u6c34\u5e73\u8ddd\u79bb\uff0c\u4e3a\u5782\u76f4\u8ddd\u79bb\u3002<\/span><\/p>\n \u52a8\u4f5c\u7684\u9009\u62e9<\/strong> \u5c0f\u9e1f\u53ea\u6709\u4e24\u79cd\u52a8\u4f5c\u53ef\u9009\uff1a1.\u5411\u4e0a\u98de\u4e00\u4e0b\uff0c2.\u4ec0\u4e48\u90fd\u4e0d\u505a\u3002<\/span><\/p>\n \u5956\u8d4f\u7684\u9009\u62e9<\/strong> 
\u8fd9\u91cc\u91c7\u7528\u7684\u65b9\u5f0f\u662f\uff1a\u5c0f\u9e1f\u6d3b\u7740\u65f6\uff0c\u6bcf\u4e00\u5e27\u7ed9\u4e881\u7684\u5956\u8d4f\uff1b\u82e5\u6b7b\u4ea1\uff0c\u5219\u7ed9\u4e88-1000\u7684\u5956\u8d4f\uff1b\u82e5\u6210\u529f\u7ecf\u8fc7\u4e00\u4e2a\u6c34\u7ba1\uff0c\u5219\u7ed9\u4e8850\u7684\u5956\u8d4f\u3002<\/span><\/p>\n \u63d0\u5230Q-learning\uff0c\u6211\u4eec\u9700\u8981\u5148\u4e86\u89e3Q\u7684\u542b\u4e49\u3002<\/span><\/p>\n Q<\/strong>\u4e3a\u52a8\u4f5c\u6548\u7528\u51fd\u6570<\/strong>\uff08action-utility function\uff09\uff0c\u7528\u4e8e\u8bc4\u4ef7\u5728\u7279\u5b9a\u72b6\u6001\u4e0b\u91c7\u53d6\u67d0\u4e2a\u52a8\u4f5c\u7684\u4f18\u52a3\uff0c\u53ef\u4ee5\u5c06\u4e4b\u7406\u89e3\u4e3a\u667a\u80fd\u4f53\uff08Agent\uff0c\u6211\u4eec\u806a\u660e\u7684\u5c0f\u9e1f\uff09\u7684\u5927\u8111\u3002\u6211\u4eec\u53ef\u4ee5\u628aQ\u5f53\u505a\u662f\u4e00\u5f20\u8868\u3002\u8868\u4e2d\u7684\u6bcf\u4e00\u884c\u662f\u4e00\u4e2a\u72b6\u6001\uff0c\u6bcf\u4e00\u5217\uff08\u8fd9\u4e2a\u95ee\u9898\u4e2d\u5171\u6709\u4e24\u5217\uff09\u8868\u793a\u4e00\u4e2a\u52a8\u4f5c\uff08\u98de\u4e0e\u4e0d\u98de\uff09\u3002<\/span><\/p>\n \u4f8b\u5982\uff1a<\/span><\/p>\n <\/span><\/p>\n \u8fd9\u5f20\u8868\u4e00\u5171 \u884c\uff0c\u8868\u793a \u4e2a\u72b6\u6001\uff0c\u6bcf\u4e2a\u72b6\u6001\u6240\u5bf9\u5e94\u7684\u52a8\u4f5c\u90fd\u6709\u4e00\u4e2a\u6548\u7528\u503c<\/strong>\u3002\u8bad\u7ec3\u4e4b\u540e\u7684\u5c0f\u9e1f\u5728\u67d0\u4e2a\u4f4d\u7f6e\u5904\u98de\u4e0e\u4e0d\u98de\u7684\u51b3\u7b56\u5c31\u662f\u901a\u8fc7\u8fd9\u5f20\u8868\u786e\u5b9a\u7684\u3002\u5c0f\u9e1f\u4f1a\u5148\u53bb\u6839\u636e\u5f53\u524d\u6240\u5728\u4f4d\u7f6e\u67e5\u627e\u5230\u5bf9\u5e94\u7684\u884c\uff0c\u7136\u540e\u518d\u6bd4\u8f83\u4e24\u5217\u7684\u503c\uff08\u98de\u4e0e\u4e0d\u98de\uff09\u7684\u5927\u5c0f\uff0c\u9009\u62e9\u503c\u8f83\u5927\u7684\u52a8\u4f5c\u4f5c\u4e3a\u5f53\u524d\u5e27\u7684\u52a8\u4f5c<\/strong>\u3002<\/span><\/p>\n 
\u90a3\u4e48\u8fd9\u4e2aQ\u662f\u600e\u4e48\u8bad\u7ec3\u5f97\u6765\u7684\u5462\uff0c\u8d34\u4e00\u6bb5\u4f2a\u4ee3\u7801\u3002<\/span><\/p>\n \u5176\u4e2d\u6709\u4e24\u4e2a\u503c\u5f97\u6ce8\u610f\u7684\u5730\u65b9<\/span><\/p>\n 1.\u201c\u6839\u636e\u5f53\u524dQ\u548c\u4f4d\u7f6eS\uff0c\u4f7f\u7528\u4e00\u79cd\u7b56\u7565\uff0c\u5f97\u5230\u52a8\u4f5cA\uff0c\u8fd9\u4e2a\u7b56\u7565\u53ef\u4ee5\u662f\u03b5-greedy\u7b49\u3002\u201d<\/span><\/p>\n \u8fd9\u91cc\u4fbf\u662f\u9898\u4e3b\u6240\u7591\u60d1\u7684\u95ee\u9898\uff0c\u5982\u4f55\u5728\u63a2\u7d22\u4e0e\u7ecf\u9a8c\u4e4b\u95f4\u5e73\u8861\uff1f\u5047\u5982\u6211\u4eec\u7684\u5c0f\u9e1f\u5728\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\uff0c\u6bcf\u6b21\u90fd\u91c7\u53d6\u5f53\u524d\u72b6\u6001\u6548\u7528\u503c\u6700\u5927\u7684\u52a8\u4f5c\uff0c\u90a3\u4f1a\u4e0d\u4f1a\u6709\u66f4\u597d\u7684\u9009\u62e9\u4e00\u76f4\u6ca1\u6709\u88ab\u63a2\u7d22\u5230\uff1f\u5c0f\u9e1f\u4e00\u76f4\u4f1a\u88ab\u684e\u688f\u5728\u4ee5\u5f80\u7684\u7ecf\u9a8c\u4e4b\u4e2d\u3002\u800c\u5047\u82e5\u5c0f\u9e1f\u5728\u8fd9\u91cc\u6bcf\u6b21\u968f\u673a\u9009\u53d6\u4e00\u4e2a\u52a8\u4f5c\uff0c\u4f1a\u4e0d\u4f1a\u56e0\u4e3a\u63a2\u7d22\u4e86\u592a\u591a\u65e0\u7528\u7684\u72b6\u6001\u800c\u5bfc\u81f4\u6536\u655b\u7f13\u6162\uff1f<\/span><\/p>\n 
\u4e8e\u662f\u5c31\u6709\u4eba\u63d0\u51fa\u4e86\u03b5-greedy<\/strong>\u65b9\u6cd5\uff0c\u5373\u6bcf\u4e2a\u72b6\u6001\u6709\u03b5\u7684\u6982\u7387\u8fdb\u884c\u63a2\u7d22<\/strong>\uff08\u5373\u968f\u673a\u9009\u53d6\u98de\u6216\u4e0d\u98de\uff09\uff0c\u800c\u5269\u4e0b\u76841-\u03b5\u7684\u6982\u7387\u5219\u8fdb\u884c\u5f00\u53d1<\/strong>\uff08\u9009\u53d6\u5f53\u524d\u72b6\u6001\u4e0b\u6548\u7528\u503c\u8f83\u5927\u7684\u90a3\u4e2a\u52a8\u4f5c\uff09\u3002\u03b5\u4e00\u822c\u53d6\u503c\u8f83\u5c0f\uff0c0.01\u5373\u53ef\u3002\u5f53\u7136\u9664\u4e86\u03b5-greedy\u65b9\u6cd5\u8fd8\u6709\u4e00\u4e9b\u6548\u679c\u66f4\u597d\u7684\u65b9\u6cd5\uff0c\u4e0d\u8fc7\u53ef\u80fd\u590d\u6742\u5f88\u591a\u3002<\/span><\/p>\n \u4ee5\u6b64\u4e5f\u53ef\u4ee5\u770b\u51fa\uff0cQ-learning\u5e76\u975e\u6bcf\u6b21\u8fed\u4ee3\u90fd\u6cbf\u5f53\u524dQ\u503c\u6700\u9ad8\u7684\u8def\u5f84\u524d\u8fdb\u3002<\/span><\/p>\n 2. <\/span><\/p>\n \u8fd9\u4e2a\u5c31\u662fQ-learning\u7684\u8bad\u7ec3\u516c\u5f0f\u4e86\u3002\u5176\u4e2d\u03b1<\/strong>\u4e3a\u5b66\u4e60\u901f\u7387<\/strong>\uff08learning rate\uff09\uff0c\u03b3<\/strong>\u4e3a\u6298\u6263\u56e0\u5b50<\/strong>\uff08discount factor\uff09\u3002\u6839\u636e\u516c\u5f0f\u53ef\u4ee5\u770b\u51fa\uff0c\u5b66\u4e60\u901f\u7387\u03b1\u8d8a\u5927\uff0c\u4fdd\u7559\u4e4b\u524d\u8bad\u7ec3\u7684\u6548\u679c\u5c31\u8d8a\u5c11<\/strong>\u3002\u6298\u6263\u56e0\u5b50\u03b3\u8d8a\u5927\uff0c\u6240\u8d77\u5230\u7684\u4f5c\u7528\u5c31\u8d8a\u5927\u3002\u4f46\u6307\u4ec0\u4e48\u5462\uff1f<\/span><\/p>\n \u5c0f\u9e1f\u5728\u5bf9\u72b6\u6001\u8fdb\u884c\u66f4\u65b0\u65f6\uff0c\u4f1a\u8003\u8651\u5230\u773c\u524d\u5229\u76ca<\/strong>\uff08R\uff09\uff0c\u548c\u8bb0\u5fc6\u4e2d\u7684\u5229\u76ca<\/strong>\uff08\uff09\u3002<\/span><\/p>\n 
\u6307\u7684\u4fbf\u662f\u8bb0\u5fc6\u4e2d\u7684\u5229\u76ca<\/strong>\u3002\u5b83\u662f\u6307\u5c0f\u9e1f\u8bb0\u5fc6\u91cc\u4e0b\u4e00\u4e2a\u72b6\u6001\u7684\u52a8\u4f5c\u4e2d\u6548\u7528\u503c\u7684\u6700\u5927\u503c\u3002\u5982\u679c\u5c0f\u9e1f\u4e4b\u524d\u5728\u4e0b\u4e00\u4e2a\u72b6\u6001\u7684\u67d0\u4e2a\u52a8\u4f5c\u4e0a\u5403\u8fc7\u751c\u5934\uff08\u9009\u62e9\u4e86\u67d0\u4e2a\u52a8\u4f5c\u4e4b\u540e\u83b7\u5f97\u4e8650\u7684\u5956\u8d4f\uff09\uff0c\u90a3\u4e48\u5b83\u5c31\u66f4\u5e0c\u671b\u63d0\u65e9\u5730\u5f97\u77e5\u8fd9\u4e2a\u6d88\u606f\uff0c\u4ee5\u4fbf\u4e0b\u56de\u5728\u72b6\u6001\u53ef\u4ee5\u901a\u8fc7\u9009\u62e9\u6b63\u786e\u7684\u52a8\u4f5c\u7ee7\u7eed\u8fdb\u5165\u8fd9\u4e2a\u5403\u751c\u5934\u7684\u72b6\u6001\u3002<\/span><\/p>\n \u53ef\u4ee5\u770b\u51fa\uff0c\u03b3\u8d8a\u5927\uff0c\u5c0f\u9e1f\u5c31\u4f1a\u8d8a\u91cd\u89c6\u4ee5\u5f80\u7ecf\u9a8c\uff0c\u8d8a\u5c0f\uff0c\u5c0f\u9e1f\u53ea\u91cd\u89c6\u773c\u524d\u5229\u76ca\uff08R\uff09\u3002<\/span><\/strong><\/p>\n \u6839\u636e\u4e0a\u9762\u7684\u4f2a\u4ee3\u7801\uff0c\u5c31\u53ef\u4ee5\u5199\u51faQ-learning\u7684\u4ee3\u7801\u4e86\u3002<\/span><\/p>\n <\/p>\n \u8bad\u7ec3\u540e\u7684\u5c0f\u9e1f\u4e00\u76f4\u6302\u5728\u90a3\u91cc\u53ef\u4ee5\u98de\u5230\u51e0\u5343\u5206~<\/span><\/p>\n \u53c2\u8003\u6587\u732e\uff1a<\/span><\/span><\/p>\n https:\/\/www.cnblogs.com\/jinxulin\/p\/3517377.html<\/p>\n Q-learning<\/p>\n https:\/\/zhuanlan.zhihu.com\/p\/<\/p>\n https:\/\/www.zhihu.com\/question\/\/answer\/<\/span><\/p>\n <\/p>\n <\/span><\/p>\n<\/div>\n 
\u8f6c\u8f7d\u4e8e:https:\/\/www.cnblogs.com\/yifdu25\/p\/8169226.html<\/p>\n","protected":false},"excerpt":{"rendered":"q-learning\u7b97\u6cd5_\u9752\u79d1\u5927quet\u767b\u5f55\u5f3a\u5316\u5b66\u4e60\u57fa\u672c\u4ecb\u7ecd\u5f3a\u5316\u5b66\u4e60\u662f\u4e00\u79cd\u4e0d\u540c\u4e8e\u76d1\u7763\u5b66\u4e60\u548c\u65e0\u76d1\u7763\u5b66\u4e60\u7684\u5728\u7ebf\u5b66\u4e60\u6280\u672f,\u57fa\u672c\u6a21\u578b\u56fe\u4e00\u6240\u793a","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[],"tags":[],"_links":{"self":[{"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/posts\/6170"}],"collection":[{"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/comments?post=6170"}],"version-history":[{"count":0,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/posts\/6170\/revisions"}],"wp:attachment":[{"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/media?parent=6170"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/categories?post=6170"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/tags?post=6170"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}
1)\u667a\u80fd\u4f53\u611f\u77e5t\u65f6\u523b\u7684\u73af\u5883\u72b6\u6001s(t)
2)\u9488\u5bf9\u5f53\u524d\u7684\u72b6\u6001\u548c\u5373\u65f6\u56de\u62a5r(t),\u667a\u80fd\u4f53\u9009\u62e9\u4e00\u4e2a\u6267\u884c\u52a8\u4f5ca(t)\u3002
3)\u5f53\u667a\u80fd\u4f53\u6240\u9009\u62e9\u7684\u52a8\u4f5c\u4f5c\u7528\u4e8e\u73af\u5883\u65f6,\u73af\u5883\u53d1\u751f\u53d8\u5316
\u73af\u5883\u72b6\u6001\u8f6c\u79fb\u81f3\u4e0b\u4e00\u65b0\u7684\u72b6\u6001s(t+1)
\u7ed9\u51fa\u5373\u65f6\u56de\u62a5r(t),\u53c8\u79f0\u4e3a\u5956\u8d4f\u56de\u62a5
4)\u5373\u65f6\u56de\u62a5r(t)\u53cd\u9988\u7ed9\u667a\u80fd\u4f53,t\u2190t+1\u3002
5)\u8f6c\u5411\u7b2c2\u6b65,\u5982\u679c\u65b0\u7684\u72b6\u6001\u4e3a\u7ed3\u675f\u72b6\u6001,\u5219\u505c\u6b62\u5faa\u73af\u3002
\u5176\u4e2d\u5373\u65f6\u56de\u62a5r(t),\u7531\u73af\u5883\u72b6\u6001s(t)\u4e0e\u667a\u80fd\u4f53\u7684\u8f93\u51faa(t)\u51b3\u5b9a\u3002a\u2208A,A\u4e3a\u4e00\u7ec4\u52a8\u4f5c\u96c6\u3002<\/span><\/p>\n\u57fa\u672c\u77e5\u8bc6<\/h3>\n
1\uff09\u6709\u9650\u8303\u56f4\u6a21\u578b\uff1a\u5b83\u662f\u5728\u6709\u9650\u7684\u9636\u6bb5\u5185\u5bf9\u56de\u62a5\u7684\u7d2f\u79ef\u3002t\u4e3a\u91c7\u6837\u65f6\u523b,h\u4e3a\u667a\u80fd\u4f53\u4ecet\u65f6\u523b\u8d77\u5230\u7ed3\u675f\u8fd0\u884c\u7684\u603b\u6b65\u6570,\u53ef\u4ee5\u4e0d\u9884\u5148\u786e\u5b9a\u3002<\/span><\/p>\n $$V(s(t)) = \\sum_{i=0}^{h} r(t+i)$$<\/p>\n\n
Q-learning\u57fa\u672c\u4ecb\u7ecd<\/h2>\n
\u5b9e\u4f8b\uff1a\uff08Flappy Bird Q-learning\uff09<\/span><\/h2>\n
\u95ee\u9898\u5206\u6790<\/strong><\/h3>\n
\u5173\u4e8eQ<\/strong><\/h3>\n
\u8bad\u7ec3<\/strong><\/h3>\n
Initialize Q arbitrarily \/\/\u968f\u673a\u521d\u59cb\u5316Q\u503c\nRepeat (for each episode): \/\/\u6bcf\u4e00\u6b21\u6e38\u620f\uff0c\u4ece\u5c0f\u9e1f\u51fa\u751f\u5230\u6b7b\u4ea1\u662f\u4e00\u4e2aepisode\n    Initialize S \/\/\u5c0f\u9e1f\u521a\u5f00\u59cb\u98de\uff0cS\u4e3a\u521d\u59cb\u4f4d\u7f6e\u7684\u72b6\u6001\n    Repeat (for each step of episode):\n        \u6839\u636e\u5f53\u524dQ\u548c\u4f4d\u7f6eS\uff0c\u4f7f\u7528\u4e00\u79cd\u7b56\u7565\uff0c\u5f97\u5230\u52a8\u4f5cA \/\/\u8fd9\u4e2a\u7b56\u7565\u53ef\u4ee5\u662f\u03b5-greedy\u7b49\n        \u505a\u4e86\u52a8\u4f5cA\uff0c\u5c0f\u9e1f\u5230\u8fbe\u65b0\u7684\u4f4d\u7f6eS'\uff0c\u5e76\u83b7\u5f97\u5956\u52b1R \/\/\u5956\u52b1\u53ef\u4ee5\u662f1\uff0c50\u6216\u8005-1000\n        Q(S,A) \u2190 (1-\u03b1)*Q(S,A) + \u03b1*[R + \u03b3*max_a Q(S',a)] \/\/\u5728Q\u4e2d\u66f4\u65b0S\n        S \u2190 S'\n    until S is terminal \/\/\u5373\u5230\u5c0f\u9e1f\u6b7b\u4ea1\u4e3a\u6b62\n<\/span><\/code><\/pre>\n
\u6210\u679c<\/strong><\/h3>\n
\n