{"id":6170,"date":"2024-08-17T08:01:01","date_gmt":"2024-08-17T00:01:01","guid":{"rendered":""},"modified":"2024-08-17T08:01:01","modified_gmt":"2024-08-17T00:01:01","slug":"q-learning\u7b97\u6cd5_\u9752\u79d1\u5927quet\u767b\u5f55","status":"publish","type":"post","link":"https:\/\/mushiming.com\/6170.html","title":{"rendered":"q-learning\u7b97\u6cd5_\u9752\u79d1\u5927quet\u767b\u5f55"},"content":{"rendered":"
\n

\u5f3a\u5316\u5b66\u4e60\u57fa\u672c\u4ecb\u7ecd<\/h2>\n

\u5f3a\u5316\u5b66\u4e60\u662f\u4e00\u79cd\u4e0d\u540c\u4e8e\u76d1\u7763\u5b66\u4e60\u548c\u65e0\u76d1\u7763\u5b66\u4e60\u7684\u5728\u7ebf\u5b66\u4e60\u6280\u672f,\u57fa\u672c\u6a21\u578b\u5982\u56fe\u4e00\u6240\u793a\u3002\u5b83\u628a\u5b66\u4e60\u770b\u4f5c\u662f\u4e00\u4e2a\u201c\u8bd5\u63a2\u2014\u8bc4\u4ef7\u201d\u7684\u8fc7\u7a0b,\u9996\u5148\u5b66\u4e60\u7cfb\u7edf\uff08\u79f0\u4e3a\u667a\u80fd\u4f53\uff09\u611f\u77e5\u73af\u5883\u72b6\u6001,\u91c7\u53d6\u67d0\u4e00\u4e2a\u52a8\u4f5c\u4f5c\u7528\u4e8e\u73af\u5883,\u73af\u5883\u63a5\u53d7\u8be5\u52a8\u4f5c\u540e\u72b6\u6001\u53d1\u751f\u53d8\u5316,\u540c\u65f6\u7ed9\u51fa\u4e00\u4e2a\u56de\u62a5\uff08\u5956\u52b1\u6216\u60e9\u7f5a\uff09\u53cd\u9988\u7ed9\u5f3a\u5316\u5b66\u4e60\u7cfb\u7edf,\u5f3a\u5316\u5b66\u4e60\u7cfb\u7edf\u6839\u636e\u5f3a\u5316\u4fe1\u53f7\u548c\u73af\u5883\u7684\u5f53\u524d\u72b6\u6001\u518d\u9009\u62e9\u4e0b\u4e00\u4e2a\u52a8\u4f5c,\u9009\u62e9\u7684\u539f\u5219\u662f\u4f7f\u53d7\u5230\u518d\u52b1\u7684\u6982\u7387\u589e\u5927\u3002<\/span><\/p>\n<\/p>\n

\u667a\u80fd\u4f53\u5728\u548c\u73af\u5883\u4ea4\u4e92\u65f6,\u5728\u6bcf\u4e00\u65f6\u523b\u4f1a\u53d1\u751f\u5982\u4e0b\u4e8b\u4ef6\u5e8f\u5217
1)\u667a\u80fd\u4f53\u611f\u77e5t\u65f6\u523b\u7684\u73af\u5883\u72b6\u6001s(t)
2)\u9488\u5bf9\u5f53\u524d\u7684\u72b6\u6001\u548c\u5373\u65f6\u56de\u62a5r(t),\u667a\u80fd\u4f53\u9009\u62e9\u4e00\u4e2a\u6267\u884c\u52a8\u4f5ca(t)\u3002
3)\u5f53\u667a\u80fd\u4f53\u6240\u9009\u62e9\u7684\u52a8\u4f5c\u4f5c\u7528\u4e8e\u73af\u5883\u65f6,\u73af\u5883\u53d1\u751f\u53d8\u5316
           \u73af\u5883\u72b6\u6001\u8f6c\u79fb\u81f3\u4e0b\u4e00\u65b0\u7684\u72b6\u6001s(t+1)
           \u7ed9\u51fa\u5373\u65f6\u56de\u62a5r(t),\u53c8\u79f0\u4e3a\u5956\u8d4f\u56de\u62a5
4)\u5373\u65f6\u56de\u62a5r(t)\u53cd\u9988\u7ed9\u667a\u80fd\u4f53,t\u2190t+1\u3002
5)\u8f6c\u5411\u7b2c2\u6b65,\u5982\u679c\u65b0\u7684\u72b6\u6001\u4e3a\u7ed3\u675f\u72b6\u6001,\u5219\u505c\u6b62\u5faa\u73af\u3002
\u5176\u4e2d\u5373\u65f6\u56de\u62a5r(t),\u7531\u73af\u5883\u72b6\u6001s(t)\u4e0e\u667a\u80fd\u4f53\u7684\u8f93\u51faa(t)\u51b3\u5b9a\u3002a\u2208A,A\u4e3a\u4e00\u7ec4\u52a8\u4f5c\u96c6\u3002<\/span><\/p>\n

\u57fa\u672c\u77e5\u8bc6<\/h3>\n

1.\u8bc4\u4ef7\u51fd\u6570<\/span><\/p>\n

\u667a\u80fd\u4f53\u7684\u5b66\u4e60\u76ee\u6807\u662f\u6700\u5927\u5316\u672a\u6765\u56de\u62a5\u7684\u7d2f\u79ef\u503c\u3002\u8bc4\u4ef7\u51fd\u6570,\u662f\u5bf9\u957f\u671f\u56de\u62a5\u7684\u4e00\u79cd\u91cf\u5ea6,\u6709\u4e09\u79cd\u56de\u62a5\u8868\u8fbe\u5f0f\u3002
1\uff09\u6709\u9650\u8303\u56f4\u6a21\u578b\uff1a\u5b83\u662f\u5728\u6709\u9650\u7684\u9636\u6bb5\u5185\u5bf9\u56de\u62a5\u7684\u7d2f\u79ef\uff0c\u5373 V = r(t) + r(t+1) + \u2026 + r(t+T)\u3002t\u4e3a\u91c7\u6837\u65f6\u523b,T\u4e3a\u667a\u80fd\u4f53\u4ece\u65f6\u523bt\u8d77\u5230\u7ed3\u675f\u8fd0\u884c\u7684\u603b\u6b65\u6570,\u53ef\u4ee5\u4e0d\u9884\u5148\u786e\u5b9a\u3002<\/span><\/p>\n<\/p>\n

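2\uff09\u6298\u6263\u56de\u62a5\uff08\u65e0\u9650\u8303\u56f4\uff09\u6a21\u578b\uff1a\u5b83\u662f\u5728\u65e0\u9650\u7684\u9636\u6bb5\u5185\u5bf9\u56de\u62a5\u7684\u7d2f\u79ef\uff0c\u5373 V = r(t) + \u03b3\u00b7r(t+1) + \u03b3<sup>2<\/sup>\u00b7r(t+2) + \u2026\u3002<\/span><\/p>\n<\/p>\n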

\u03b3\u662f\u6298\u6263\u56e0\u5b50,\u901a\u5e380\u2264\u03b3\uff1c1\u3002\u901a\u8fc7\u8c03\u8282\u03b3,\u53ef\u4ee5\u63a7\u5236\u5b66\u4e60\u7cfb\u7edf\u5bf9\u5b83\u81ea\u5df1\u884c\u52a8\u7684\u77ed\u671f\u548c\u957f\u671f\u7ed3\u679c\u8003\u8651\u7684\u7a0b\u5ea6\u3002\u5728\u6781\u7aef\u60c5\u51b5,\u5f53\u03b3=0\u65f6\u7cfb\u7edf\u662f\u77ed\u89c6\u7684,\u5b83\u53ea\u8003\u8651\u884c\u52a8\u7684\u5f53\u524d\u7ed3\u679c\u3002\u5f53\u03b3\u63a5\u8fd11\u65f6,\u672a\u6765\u7684\u56de\u62a5\u5728\u91c7\u53d6\u6700\u4f18\u884c\u52a8\u65f6\u53d8\u5f97\u66f4\u4e3a\u91cd\u8981\u3002<\/span><\/p>\n

3\uff09\u5e73\u5747\u56de\u62a5\u6a21\u578b\uff1a\u91c7\u7528\u7b2c\u4e09\u79cd\u6807\u51c6,\u7d2f\u8ba1\u672a\u6765\u56de\u62a5\u7684\u5e73\u5747\u503c,\u6807\u51c6\u4e3a V = lim<sub>h\u2192\u221e<\/sub> (1\/h)\u00b7[r(t) + r(t+1) + \u2026 + r(t+h-1)]\u3002<\/span><\/p>\n<\/p>\n

\u4e0a\u9762\u4e09\u79cd\u56de\u62a5\u8868\u8fbe\u5f0f,\u4f7f\u7528\u6700\u591a\u7684\u662f\u6298\u6263\u56de\u62a5\u6307\u6807\u3002<\/span><\/p>\n

\u9a6c\u5c14\u53ef\u592b\u51b3\u7b56\u8fc7\u7a0b\uff08MDP\uff09<\/strong><\/span><\/p>\n

\u5927\u5bb6\u5e94\u8be5\u8fd8\u8bb0\u5f97\u9a6c\u5c14\u53ef\u592b\u94fe(Markov Chain)\uff0c\u4e86\u89e3\u673a\u5668\u5b66\u4e60\u7684\u4e5f\u90fd\u77e5\u9053\u9690\u9a6c\u5c14\u53ef\u592b\u6a21\u578b(Hidden Markov Model\uff0cHMM)\u3002\u5b83\u4eec\u5177\u6709\u7684\u4e00\u4e2a\u5171\u540c\u6027\u8d28\u5c31\u662f\u9a6c\u5c14\u53ef\u592b\u6027(\u65e0\u540e\u6548\u6027)\uff0c\u4e5f\u5c31\u662f\u6307\u7cfb\u7edf\u7684\u4e0b\u4e2a\u72b6\u6001\u53ea\u4e0e\u5f53\u524d\u72b6\u6001\u4fe1\u606f\u6709\u5173\uff0c\u800c\u4e0e\u66f4\u65e9\u4e4b\u524d\u7684\u72b6\u6001\u65e0\u5173\u3002<\/span><\/p>\n

\u9a6c\u5c14\u53ef\u592b\u51b3\u7b56\u8fc7\u7a0b(Markov Decision Process, MDP)\u4e5f\u5177\u6709\u9a6c\u5c14\u53ef\u592b\u6027\uff0c\u4e0e\u4e0a\u9762\u4e0d\u540c\u7684\u662fMDP\u8003\u8651\u4e86\u52a8\u4f5c\uff0c\u5373\u7cfb\u7edf\u4e0b\u4e2a\u72b6\u6001\u4e0d\u4ec5\u548c\u5f53\u524d\u7684\u72b6\u6001\u6709\u5173\uff0c\u4e5f\u548c\u5f53\u524d\u91c7\u53d6\u7684\u52a8\u4f5c\u6709\u5173\u3002\u8fd8\u662f\u4e3e\u4e0b\u68cb\u7684\u4f8b\u5b50\uff0c\u5f53\u6211\u4eec\u5728\u67d0\u4e2a\u5c40\u9762\uff08\u72b6\u6001s\uff09\u8d70\u4e86\u4e00\u6b65(\u52a8\u4f5ca)\uff0c\u8fd9\u65f6\u5bf9\u624b\u7684\u9009\u62e9\uff08\u5bfc\u81f4\u4e0b\u4e2a\u72b6\u6001s\u2019\uff09\u6211\u4eec\u662f\u4e0d\u80fd\u786e\u5b9a\u7684\uff0c\u4f46\u662f\u4ed6\u7684\u9009\u62e9\u53ea\u548cs\u548ca\u6709\u5173\uff0c\u800c\u4e0d\u7528\u8003\u8651\u66f4\u65e9\u4e4b\u524d\u7684\u72b6\u6001\u548c\u52a8\u4f5c\uff0c\u5373s\u2019\u662f\u6839\u636es\u548ca\u968f\u673a\u751f\u6210\u7684\u3002<\/span><\/p>\n

\u6211\u4eec\u7528\u4e00\u4e2a\u4e8c\u7ef4\u8868\u683c\u8868\u793a\u4e00\u4e0b\uff0c\u5404\u79cd\u9a6c\u5c14\u53ef\u592b\u5b50\u6a21\u578b\u7684\u5173\u7cfb\u5c31\u5f88\u6e05\u695a\u4e86\uff1a<\/span><\/p>\n

<\/p>\n

\u4e00\u4e2a\u9a6c\u5c14\u53ef\u592b\u51b3\u7b56\u8fc7\u7a0b\u7531\u4e00\u4e2a\u56db\u5143\u7ec4\u6784\u6210M = (S, A, P<sub>sa<\/sub>, R) <\/span><\/p>\n