{"id":6853,"date":"2024-07-25T23:01:03","date_gmt":"2024-07-25T15:01:03","guid":{"rendered":""},"modified":"2024-07-25T23:01:03","modified_gmt":"2024-07-25T15:01:03","slug":"\u56fe\u7247\u8f6cexcel\u7f51\u7ad9_\u600e\u4e48\u628a\u56fe\u7247\u8f6c\u5316\u4e3aexcel\u8868\u683c","status":"publish","type":"post","link":"https:\/\/mushiming.com\/6853.html","title":{"rendered":"\u56fe\u7247\u8f6cexcel\u7f51\u7ad9_\u600e\u4e48\u628a\u56fe\u7247\u8f6c\u5316\u4e3aexcel\u8868\u683c"},"content":{"rendered":"
## \u4f7f\u7528\u963f\u91cc\u4e91\u7684\u8bfb\u5149ocr\u6765\u628a\u56fe\u7247\u8868\u683c\u8f6c\u4e3aExcel<\/p>\n
## @suyin<\/p>\n
## 2020-06-19<\/p>\n
## \u8bfb\u5149 \u9ad8\u7cbe\u7248 \u8fd8\u6709\u4e2a\u4e00\u822c\u7cbe\u5ea6\u7684\uff0c\u53ef\u4ee5\u641c\u7d22<\/p>\n
https:\/\/duguang.aliyun.com\/document\/%E9%AB%98%E7%B2%BE%E7%89%88.html<\/p>\n
# {
\n <\/p>\n
# \/\/\u56fe\u50cf\u6570\u636e\uff1abase64\u7f16\u7801\uff0c\u8981\u6c42base64\u7f16\u7801\u540e\u5927\u5c0f\u4e0d\u8d85\u8fc74M\uff0c\u6700\u77ed\u8fb9\u81f3\u5c1115px\uff0c\u6700\u957f\u8fb9\u6700\u59274096px\uff0c\u652f\u6301jpg\/png\/bmp\u683c\u5f0f\uff0c\u548curl\u53c2\u6570\u53ea\u80fd\u540c\u65f6\u5b58\u5728\u4e00\u4e2a<\/p>\n
# \"img\": \"\",<\/p>\n
# \/\/\u56fe\u50cfurl\u5730\u5740\uff1a\u56fe\u7247\u5b8c\u6574URL\uff0cURL\u957f\u5ea6\u4e0d\u8d85\u8fc71024\u5b57\u8282\uff0cURL\u5bf9\u5e94\u7684\u56fe\u7247base64\u7f16\u7801\u540e\u5927\u5c0f\u4e0d\u8d85\u8fc74M\uff0c\u6700\u77ed\u8fb9\u81f3\u5c1115px\uff0c\u6700\u957f\u8fb9\u6700\u59274096px\uff0c\u652f\u6301jpg\/png\/bmp\u683c\u5f0f\uff0c\u548cimg\u53c2\u6570\u53ea\u80fd\u540c\u65f6\u5b58\u5728\u4e00\u4e2a<\/p>\n
# \"url\": \"\",<\/p>\n
# \/\/\u662f\u5426\u9700\u8981\u8bc6\u522b\u7ed3\u679c\u4e2d\u6bcf\u4e00\u884c\u7684\u7f6e\u4fe1\u5ea6\uff0c\u9ed8\u8ba4\u4e0d\u9700\u8981\u3002 true\uff1a\u9700\u8981 false\uff1a\u4e0d\u9700\u8981<\/p>\n
# \"prob\": false,<\/p>\n
# \/\/\u662f\u5426\u9700\u8981\u5355\u5b57\u8bc6\u522b\u529f\u80fd\uff0c\u9ed8\u8ba4\u4e0d\u9700\u8981\u3002 true\uff1a\u9700\u8981 false\uff1a\u4e0d\u9700\u8981<\/p>\n
# \"charInfo\": false,<\/p>\n
# \/\/\u662f\u5426\u9700\u8981\u81ea\u52a8\u65cb\u8f6c\u529f\u80fd\uff0c\u9ed8\u8ba4\u4e0d\u9700\u8981\u3002 true\uff1a\u9700\u8981 false\uff1a\u4e0d\u9700\u8981<\/p>\n
# \"rotate\": false,<\/p>\n
# \/\/\u662f\u5426\u9700\u8981\u8868\u683c\u8bc6\u522b\u529f\u80fd\uff0c\u9ed8\u8ba4\u4e0d\u9700\u8981\u3002 true\uff1a\u9700\u8981 false\uff1a\u4e0d\u9700\u8981<\/p>\n
# \"table\": false,<\/p>\n
# \/\/\u5b57\u5757\u8fd4\u56de\u987a\u5e8f\uff0cfalse\u8868\u793a\u4ece\u5de6\u5f80\u53f3\uff0c\u4ece\u4e0a\u5230\u4e0b\u7684\u987a\u5e8f\uff0ctrue\u8868\u793a\u4ece\u4e0a\u5230\u4e0b\uff0c\u4ece\u5de6\u5f80\u53f3\u7684\u987a\u5e8f\uff0c\u9ed8\u8ba4false<\/p>\n
# \"sortPage\": false<\/p>\n
# }<\/p>\n
import urllib.request<\/p>\n
import urllib.parse<\/p>\n
import json<\/p>\n
import time<\/p>\n
import base64<\/p>\n
jpg_path = 'test_orc__v3'<\/p>\n
with open(jpg_path + '.jpg', 'rb') as f: # \u4ee5\u4e8c\u8fdb\u5236\u8bfb\u53d6\u672c\u5730\u56fe\u7247<\/p>\n
data = f.read()<\/p>\n
encodestr = str(base64.b64encode(data),'utf-8')<\/p>\n
#\u8bf7\u6c42\u5934<\/p>\n
# \u8bf7\u4fee\u6539\u4e3a\u4f60\u81ea\u5df1\u7684appcode\uff0c\u53ef\u4ece\u4e91\u5e02\u573a\u8ba2\u5355\u6216\u8005api\u7f51\u5173\u5904\u83b7\u5f97<\/p>\n
# \u53bb\u6ce8\u518c\u4e86\u5c31\u5728\u540e\u53f0\u53ef\u4ee5\u770b\u5230\uff0c\u628axxxxxx\u6539\u4e3a\u4f60\u7684appcode\u5c31\u597d\u4e86<\/p>\n
AppCode = \"xxxxxx\"<\/p>\n
headers = {
\n <\/p>\n
'Authorization': 'APPCODE ' + AppCode,<\/p>\n
'Content-Type': 'application\/json; charset=UTF-8'<\/p>\n
}<\/p>\n
def posturl(url, data={}):<\/p>\n
try:<\/p>\n
params = json.dumps(dict).encode(encoding='UTF8')<\/p>\n
req = urllib.request.Request(url, params, headers)<\/p>\n
r = urllib.request.urlopen(req)<\/p>\n
html =r.read()<\/p>\n
r.close();<\/p>\n
return html.decode(\"utf8\")<\/p>\n
except urllib.error.HTTPError as e:<\/p>\n
print(e.code)<\/p>\n
print(e.read().decode(\"utf8\"))<\/p>\n
time.sleep(1)<\/p>\n
url_request = \"https:\/\/ocrapi-advanced.taobao.com\/ocrservice\/advanced\"<\/p>\n
dict = {'img': encodestr,<\/p>\n
'rotate': True,<\/p>\n
'table': True}<\/p>\n
html = posturl(url_request, data=dict)<\/p>\n
# print(html)<\/p>\n
text = json.loads(html)<\/p>\n
# text<\/p>\n
# text['prism_tablesInfo']<\/p>\n
# \/\/\u8868\u683c\u4e2d\u5355\u5143\u683cid\uff0c\u548cprism_wordsInfo\u4fe1\u606f\u4e2d\u7684tableCellId\u5bf9\u5e94<\/p>\n
# \"tableCellId\": 0,<\/p>\n
# \/\/\u5355\u5143\u683c\u4e2d\u7684\u6587\u5b57<\/p>\n
# \"word\": \"\uff1a2017\",<\/p>\n
# \/\/xStartCell\u7f29\u5199\uff0c\u8868\u793a\u6a2a\u8f74\u65b9\u5411\u8be5\u5355\u5143\u683c\u8d77\u59cb\u5728\u7b2c\u51e0\u4e2a\u5355\u5143\u683c\uff0c\u7b2c\u4e00\u4e2a\u5355\u5143\u683c\u503c\u4e3a0<\/p>\n
# \"xsc\": 0,<\/p>\n
# \/\/xEndCell\u7f29\u5199\uff0c\u8868\u793a\u6a2a\u8f74\u65b9\u5411\u8be5\u5355\u5143\u683c\u7ed3\u675f\u5728\u7b2c\u51e0\u4e2a\u5355\u5143\u683c\uff0c\u7b2c\u4e00\u4e2a\u5355\u5143\u683c\u503c\u4e3a0\uff0c\u5982\u679cxsc\u548cxec\u90fd\u4e3a0\u8bf4\u660e\u8be5\u6587\u5b57\u5728\u6a2a\u8f74\u65b9\u5411\u5360\u636e\u4e86\u4e00\u4e2a\u5355\u5143\u683c\u5e76\u4e14\u5728\u7b2c\u4e00\u4e2a\u5355\u5143\u683c\u5185<\/p>\n
# \"xec\": 0,<\/p>\n
# \/\/yStartCell\u7f29\u5199\uff0c\u8868\u793a\u7eb5\u8f74\u65b9\u5411\u8be5\u5355\u5143\u683c\u8d77\u59cb\u5728\u7b2c\u51e0\u4e2a\u5355\u5143\u683c\uff0c\u7b2c\u4e00\u4e2a\u5355\u5143\u683c\u503c\u4e3a0<\/p>\n
# \"ysc\": 0,<\/p>\n
# \/\/yEndCell\u7f29\u5199\uff0c\u8868\u793a\u7eb5\u8f74\u65b9\u5411\u8be5\u5355\u5143\u683c\u7ed3\u675f\u5728\u7b2c\u51e0\u4e2a\u5355\u5143\u683c\uff0c\u7b2c\u4e00\u4e2a\u5355\u5143\u683c\u503c\u4e3a0<\/p>\n
# \"yec\": 0,<\/p>\n
# \/\/\u5355\u5143\u683c\u4f4d\u7f6e\uff0c\u6309\u7167\u5355\u5143\u683c\u56db\u4e2a\u89d2\u7684\u5750\u6807\u987a\u65f6\u9488\u6392\u5217\uff0c\u5206\u522b\u4e3a\u5de6\u4e0aXY\u5750\u6807\u3001\u53f3\u4e0aXY\u5750\u6807\u3001\u53f3\u4e0bXY\u5750\u6807\u3001\u5de6\u4e0bXY\u5750\u6807<\/p>\n
# \"pos\": [<\/p>\n
# {
\n <\/p>\n
# \"x\": 107,<\/p>\n
# \"y\": 203<\/p>\n
# },<\/p>\n
# {
\n <\/p>\n
# \"x\": 247,<\/p>\n
# \"y\": 203<\/p>\n
# },<\/p>\n
xCellSize = text['prism_tablesInfo'][0]['xCellSize']<\/p>\n
yCellSize = text['prism_tablesInfo'][0]['yCellSize']<\/p>\n
word = text['prism_tablesInfo'][0]['cellInfos'][0]['word']<\/p>\n
xsc = text['prism_tablesInfo'][0]['cellInfos'][0]['xsc']<\/p>\n
xec = text['prism_tablesInfo'][0]['cellInfos'][0]['xec']<\/p>\n
ysc = text['prism_tablesInfo'][0]['cellInfos'][0]['ysc']<\/p>\n
yec = text['prism_tablesInfo'][0]['cellInfos'][0]['yec']<\/p>\n
print('xCellSize:', xCellSize)<\/p>\n
print('yCellSize:', yCellSize)<\/p>\n
print('word:', word)<\/p>\n
print('xsc:', xsc)<\/p>\n
print('xec:', xec)<\/p>\n
print('ysc:', ysc)<\/p>\n
print('yec:', yec)<\/p>\n
# !pip install xlwt<\/p>\n
# http:\/\/www.ityouknow.com\/python\/2019\/12\/29\/python-excel-103.html<\/p>\n
# workbook = xlwt.Workbook(encoding='utf-8')<\/p>\n
# worksheet = workbook.add_sheet('sheet1')<\/p>\n
# #\u901a\u8fc7worksheet\u8c03\u7528merge()\u521b\u5efa\u5408\u5e76\u5355\u5143\u683c<\/p>\n
# #\u7b2c\u4e00\u4e2a\u548c\u7b2c\u4e8c\u4e2a\u53c2\u6570\u4ee3\u8868\u884c\u5408\u5e76,\u7b2c\u4e09\u4e2a\u548c\u7b2c\u56db\u4e2a\u53c2\u6570\u4ee3\u8868\u5217\u5408\u5e76,<\/p>\n
# #\u5408\u5e76\u7b2c0\u5217\u5230\u7b2c2\u5217\u7684\u5355\u5143\u683c<\/p>\n
# worksheet.write_merge(0, 0, 0, 2, 'first merge')<\/p>\n
# #\u5408\u5e76\u7b2c1\u884c\u7b2c2\u884c\u7b2c\u4e00\u5217\u7684\u5355\u5143\u683c<\/p>\n
# worksheet.write_merge(0, 1, 0, 0, 'first merge')<\/p>\n
# workbook.save('students.xls')<\/p>\n
# \u5bfc\u5165 xlwt \u5e93<\/p>\n
import xlwt<\/p>\n
# \u521b\u5efa xls \u6587\u4ef6\u5bf9\u8c61<\/p>\n
wb = xlwt.Workbook()<\/p>\n
# \u65b0\u589e\u4e00\u4e2a\u8868\u5355\u9875<\/p>\n
sheet_name = 'test'<\/p>\n
sh1 = wb.add_sheet(sheet_name, cell_overwrite_ok=True)<\/p>\n
# \u6570\u636e\u5199\u5165Excel<\/p>\n
# len(text['prism_tablesInfo'])<\/p>\n
# \u6709\u591a\u4e2atable<\/p>\n
prism_tablesInfo = text['prism_tablesInfo']<\/p>\n
for table in prism_tablesInfo:<\/p>\n
cellInfos = table['cellInfos']<\/p>\n
for cell in cellInfos:<\/p>\n
word = cell['word']<\/p>\n
xsc = cell['xsc']<\/p>\n
xec = cell['xec']<\/p>\n
ysc = cell['ysc']<\/p>\n
yec = cell['yec']<\/p>\n
sh1.write(ysc, xsc, word)<\/p>\n
# sh1.write_merge(ysc, yec, xsc, xec, word)<\/p>\n
# print('word:', word, '-xsc:', xsc, 'xec:', xec, 'ysc:', ysc, 'yec:', yec)<\/p>\n
# \u6700\u540e\u4fdd\u5b58\u6587\u4ef6\u5373\u53ef<\/p>\n
save_file = 'ocr\u6d4b\u8bd5__V4'<\/p>\n
wb.save(save_file + '.xls')<\/p>\n<\/div>\n","protected":false},"excerpt":{"rendered":"\u56fe\u7247\u8f6cexcel\u7f51\u7ad9_\u600e\u4e48\u628a\u56fe\u7247\u8f6c\u5316\u4e3aexcel\u8868\u683c##\u4f7f\u7528\u963f\u91cc\u4e91\u7684\u8bfb\u5149ocr\u6765\u628a\u56fe\u7247\u8868\u683c\u8f6c\u4e3aExcel##@suyin##2020-06-19##\u8bfb\u5149\u9ad8\u7cbe\u7248\u8fd8...","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[],"tags":[],"_links":{"self":[{"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/posts\/6853"}],"collection":[{"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/comments?post=6853"}],"version-history":[{"count":0,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/posts\/6853\/revisions"}],"wp:attachment":[{"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/media?parent=6853"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/categories?post=6853"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/tags?post=6853"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}