{"id":6843,"date":"2024-07-26T12:01:03","date_gmt":"2024-07-26T04:01:03","guid":{"rendered":""},"modified":"2024-07-26T12:01:03","modified_gmt":"2024-07-26T04:01:03","slug":"\u81ea\u7136\u8bed\u8a00\u5904\u7406(NLP)\u4e4b\u82f1\u6587\u5355\u8bcd\u8bcd\u6027\u8fd8\u539f","status":"publish","type":"post","link":"https:\/\/mushiming.com\/6843.html","title":{"rendered":"\u81ea\u7136\u8bed\u8a00\u5904\u7406(NLP)\u4e4b\u82f1\u6587\u5355\u8bcd\u8bcd\u6027\u8fd8\u539f"},"content":{"rendered":"
\u8bcd\u5f62\u8fd8\u539f\uff08Lemmatization\uff09\u662f\u6587\u672c\u9884\u5904\u7406\u4e2d\u7684\u91cd\u8981\u90e8\u5206\uff0c\u4e0e\u8bcd\u5e72\u63d0\u53d6\uff08stemming\uff09\u5f88\u76f8\u4f3c\u3002<\/p>\n
\u7b80\u5355\u8bf4\u6765\uff0c\u8bcd\u5f62\u8fd8\u539f\u5c31\u662f\u53bb\u6389\u5355\u8bcd\u7684\u8bcd\u7f00\uff0c\u63d0\u53d6\u5355\u8bcd\u7684\u4e3b\u5e72\u90e8\u5206\uff0c\u901a\u5e38\u63d0\u53d6\u540e\u7684\u5355\u8bcd\u4f1a\u662f\u5b57\u5178\u4e2d\u7684\u5355\u8bcd\uff0c\u4e0d\u540c\u4e8e\u8bcd\u5e72\u63d0\u53d6\uff08stemming\uff09\uff0c\u63d0\u53d6\u540e\u7684\u5355\u8bcd\u4e0d\u4e00\u5b9a\u4f1a\u51fa\u73b0\u5728\u5b57\u5178\u4e2d\u3002\u6bd4\u5982\uff0c\u5355\u8bcd\u201ccars\u201d\u8bcd\u5f62\u8fd8\u539f\u540e\u7684\u5355\u8bcd\u4e3a\u201ccar\u201d\uff0c\u5355\u8bcd\u201cate\u201d\u8bcd\u5f62\u8fd8\u539f\u540e\u7684\u5355\u8bcd\u4e3a\u201ceat\u201d\u3002<\/p>\n
\u5728Python\u7684nltk\u6a21\u5757\u4e2d\uff0c\u4f7f\u7528WordNet\u4e3a\u6211\u4eec\u63d0\u4f9b\u4e86\u7a33\u5065\u7684\u8bcd\u5f62\u8fd8\u539f\u7684\u51fd\u6570\u3002\u5982\u4ee5\u4e0b\u793a\u4f8bPython\u4ee3\u7801\uff1a<\/p>\n
from nltk.stem import WordNetLemmatizer wnl = WordNetLemmatizer() # lemmatize nouns print(wnl.lemmatize('cars', 'n')) print(wnl.lemmatize('men', 'n')) # lemmatize verbs print(wnl.lemmatize('running', 'v')) print(wnl.lemmatize('ate', 'v')) # lemmatize adjectives print(wnl.lemmatize('saddest', 'a')) print(wnl.lemmatize('fancier', 'a')) <\/code><\/pre>\n\u8fd0\u884c\u7ed3\u679c\uff1a<\/p>\n
car men run eat sad fancy<\/code><\/pre>\n \u5728\u4ee5\u4e0a\u4ee3\u7801\u4e2d\uff0cwnl.lemmatize()\u51fd\u6570\u53ef\u4ee5\u8fdb\u884c\u8bcd\u5f62\u8fd8\u539f\uff0c\u7b2c\u4e00\u4e2a\u53c2\u6570\u4e3a\u5355\u8bcd\uff0c\u7b2c\u4e8c\u4e2a\u53c2\u6570\u4e3a\u8be5\u5355\u8bcd\u7684\u8bcd\u6027\uff0c\u5982\u540d\u8bcd\uff0c\u52a8\u8bcd\uff0c\u5f62\u5bb9\u8bcd\u7b49\uff0c\u8fd4\u56de\u7684\u7ed3\u679c\u4e3a\u8f93\u5165\u5355\u8bcd\u7684\u8bcd\u5f62\u8fd8\u539f\u540e\u7684\u7ed3\u679c\u3002<\/p>\n
\u8bcd\u5f62\u8fd8\u539f\u4e00\u822c\u662f\u7b80\u5355\u7684\uff0c\u4f46\u5177\u4f53\u6211\u4eec\u5728\u4f7f\u7528\u65f6\uff0c\u6307\u5b9a\u5355\u8bcd\u7684\u8bcd\u6027\u5f88\u91cd\u8981\uff0c\u4e0d\u7136\u8bcd\u5f62\u8fd8\u539f\u53ef\u80fd\u6548\u679c\u4e0d\u597d\uff0c\u5982\u4ee5\u4e0b\u4ee3\u7801\uff1a<\/p>\n
from nltk.stem import WordNetLemmatizer wnl = WordNetLemmatizer() print(wnl.lemmatize('ate', 'n')) print(wnl.lemmatize('fancier', 'v')) <\/code><\/pre>\n\u8f93\u51fa\u7ed3\u679c\u5982\u4e0b\uff1a<\/p>\n
ate fancier<\/p>\n
\u90a3\u4e48\uff0c\u5982\u4f55\u83b7\u53d6\u5355\u8bcd\u7684\u8bcd\u6027\u5462\uff1f\u5728NLP\u4e2d\uff0c\u4f7f\u7528\u8bcd\u6027\u6807\u6ce8\uff08Part-of-Speech tagging\uff0cPOS tagging\uff09\u6280\u672f\u5b9e\u73b0\u3002\u5728nltk\u4e2d\uff0c\u53ef\u4ee5\u4f7f\u7528nltk.pos_tag()\u83b7\u53d6\u5355\u8bcd\u5728\u53e5\u5b50\u4e2d\u7684\u8bcd\u6027\uff0c\u5982\u4ee5\u4e0bPython\u4ee3\u7801\uff1a<\/p>\n
from nltk import word_tokenize from nltk import pos_tag sentence = 'The brown fox is quick and he is jumping over the lazy dog' tokens = word_tokenize(sentence) tagged_sent = pos_tag(tokens) print(tokens) print(tagged_sent) <\/code><\/pre>\n\u8f93\u51fa\u7ed3\u679c\u5982\u4e0b\uff1a<\/p>\n
['The', 'brown', 'fox', 'is', 'quick', 'and', 'he', 'is', 'jumping', 'over', 'the', 'lazy', 'dog'] [('The', 'DT'), ('brown', 'JJ'), ('fox', 'NN'), ('is', 'VBZ'), ('quick', 'JJ'), ('and', 'CC'), ('he', 'PRP'), ('is', 'VBZ'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')] <\/code><\/pre>\n OK\uff0c\u77e5\u9053\u4e86\u83b7\u53d6\u5355\u8bcd\u5728\u53e5\u5b50\u4e2d\u7684\u8bcd\u6027\uff0c\u518d\u7ed3\u5408\u8bcd\u5f62\u8fd8\u539f\uff0c\u5c31\u80fd\u5f88\u597d\u5730\u5b8c\u6210\u8bcd\u5f62\u8fd8\u539f\u529f\u80fd\u3002\u793a\u4f8b\u7684Python\u4ee3\u7801\u5982\u4e0b\uff1a<\/p>\n
from nltk import word_tokenize, pos_tag from nltk.corpus import wordnet from nltk.stem import WordNetLemmatizer # \u83b7\u53d6\u5355\u8bcd\u7684\u8bcd\u6027 def get_wordnet_pos(tag): if tag.startswith('J'): return wordnet.ADJ elif tag.startswith('V'): return wordnet.VERB elif tag.startswith('N'): return wordnet.NOUN elif tag.startswith('R'): return wordnet.ADV else: return None sentence = 'football is a family of team sports that involve, to varying degrees, kicking a ball to score a goal.' print(sentence) tokens = word_tokenize(sentence) # \u5206\u8bcd tagged_sent = pos_tag(tokens) # \u83b7\u53d6\u5355\u8bcd\u7684\u8bcd\u6027 print(tagged_sent) wnl = WordNetLemmatizer() lemmas_sent = [] for tag in tagged_sent: wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN lemmas_sent.append(wnl.lemmatize(tag[0], pos=wordnet_pos)) # \u8bcd\u6027\u8fd8\u539f print(lemmas_sent) <\/code><\/pre>\n\u8f93\u51fa\u7ed3\u679c\u5982\u4e0b\uff1a<\/p>\n
football is a family of team sports that involve, to varying degrees, kicking a ball to score a goal. [('football', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('family', 'NN'), ('of', 'IN'), ('team', 'NN'), ('sports', 'NNS'), ('that', 'WDT'), ('involve', 'VBP'), (',', ','), ('to', 'TO'), ('varying', 'VBG'), ('degrees', 'NNS'), (',', ','), ('kicking', 'VBG'), ('a', 'DT'), ('ball', 'NN'), ('to', 'TO'), ('score', 'VB'), ('a', 'DT'), ('goal', 'NN'), ('.', '.')] ['football', 'be', 'a', 'family', 'of', 'team', 'sport', 'that', 'involve', ',', 'to', 'vary', 'degree', ',', 'kick', 'a', 'ball', 'to', 'score', 'a', 'goal', '.']<\/code><\/pre>\n\u8f93\u51fa\u7684\u7ed3\u679c\u5c31\u662f\u5bf9\u53e5\u5b50\u4e2d\u7684\u5355\u8bcd\u8fdb\u884c\u8bcd\u5f62\u8fd8\u539f\u540e\u7684\u7ed3\u679c\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"\u81ea\u7136\u8bed\u8a00\u5904\u7406(NLP)\u4e4b\u82f1\u6587\u5355\u8bcd\u8bcd\u6027\u8fd8\u539f\u81ea\u7136\u8bed\u8a00\u5904\u7406(NLP)\u4e4b\u82f1\u6587\u5355\u8bcd\u8bcd\u6027\u8fd8\u539f\u8bcd\u5f62\u8fd8\u539f\uff08Lemmatization\uff09\u662f\u6587\u672c\u9884\u5904\u7406\u4e2d\u7684\u91cd\u8981\u90e8...","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[],"tags":[],"_links":{"self":[{"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/posts\/6843"}],"collection":[{"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/comments?post=6843"}],"version-history":[{"count":0,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/posts\/6843\/revisions"}],"wp:attachment":[{"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/media?parent=6843"}],"wp:term":[{"taxonomy":"category","embedd
able":true,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/categories?post=6843"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/tags?post=6843"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}