# 内涵段子正则爬取 (Neihan Duanzi regex crawler):
"""
内涵段子爬虫
https://www.neihan8.com/article/index.html
"""
import csv
import os
import re
import string
from urllib import error
from urllib import request, parse

import chardet
from lxml import etree
def neihanba(url, beginPage, endPage):
    """Scrape joke titles and contents from neihan8.com list pages via regex.

    Args:
        url: base URL of the article listing,
            e.g. "https://www.neihan8.com/article/".
        beginPage: first page number to fetch (1-based).
        endPage: last page number to fetch (inclusive — the original used an
            exclusive ``range`` and silently skipped the page the user asked for).

    Side effects:
        Appends (title, content) rows to ./data1/neihanba.csv and one row per
        detail-page paragraph to ./data1/neihanba1.csv.  Network errors for a
        page are printed and that page is skipped.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}

    # Compile every pattern once, hoisted out of the page loop.
    title_pattern = re.compile(r'<h3><a .*?>(.*?)</a></h3>', re.I | re.S | re.M)
    # BUG FIX: the original pattern '<div class="desc">.*?(.*?)</div>' put a
    # lazy '.*?' before the capture group, so the group always matched empty.
    content_pattern = re.compile(r'<div class="desc">(.*?)</div>', re.I | re.S | re.M)
    url_pattern = re.compile(r'<h3><a href="(.*?)" .*?>.*?</a></h3>', re.I | re.S | re.M)
    para_pattern = re.compile(r'<p>(.*?)</p>', re.I | re.S | re.M)

    os.makedirs('./data1', exist_ok=True)  # output dir was assumed to exist

    for page in range(beginPage, endPage + 1):
        if page <= 1:
            fullurl = url + "index.html"
        else:
            fullurl = url + "index_%s.html" % page
        req = request.Request(fullurl, headers=headers)
        try:
            resHtml = request.urlopen(req).read().decode("utf-8", 'ignore')

            joketitle = title_pattern.findall(resHtml)
            jokecontent = content_pattern.findall(resHtml)
            # BUG FIX: pair each title with its own content.  The original
            # wrote the entire lists as a single row once per item, and its
            # range started at 1 so the first joke was always skipped.
            # newline='' is required by the csv module on text-mode files.
            with open('./data1/neihanba.csv', 'a', encoding='utf-8', newline='') as f:
                writer = csv.writer(f)
                for t, c in zip(joketitle, jokecontent):
                    writer.writerow([t, c])

            # Follow each joke's detail page and save its paragraphs.
            for href in url_pattern.findall(resHtml):
                detail_req = request.Request("https://www.neihan8.com" + href,
                                             headers=headers)
                detail_html = request.urlopen(detail_req).read().decode("utf-8", 'ignore')
                paras = para_pattern.findall(detail_html)
                # The last two <p> blocks are dropped, matching the original
                # (presumably page footer/navigation — TODO confirm).
                with open('./data1/neihanba1.csv', 'a', encoding='utf-8', newline='') as f:
                    writer = csv.writer(f)
                    for para in paras[:-2]:
                        writer.writerow([para])
        except error.URLError as e:
            # Best-effort: report and continue with the next page.
            print(e)
if __name__ == "__main__":
    # Route all urllib traffic through a fixed HTTP proxy before crawling.
    proxy_handler = request.ProxyHandler({"http": "118.31.220.3:8080"})
    request.install_opener(request.build_opener(proxy_handler))

    # Ask the user for the page span, then kick off the crawl.
    first_page = int(input("请输入起始页:"))
    last_page = int(input("请输入终止页:"))
    neihanba("https://www.neihan8.com/article/", first_page, last_page)