使用正则表达式提取内涵段子

(19) 2024-04-15 19:01:01

使用正则表达式爬取内涵段子:

"""
Scraper for Neihan Duanzi (内涵段子) jokes, using regular expressions.

Listing pages start at:
https://www.neihan8.com/article/index.html

"""
from  urllib import request,parse
from  urllib import error
import chardet
from lxml import etree
import csv,string,re
import csv
# Browser-like User-Agent sent with every request (listing AND article pages).
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
_SITE_ROOT = "https://www.neihan8.com"
_REGEX_FLAGS = re.I | re.S | re.M

# Patterns are compiled once at import time instead of once per page/article.
_TITLE_RE = re.compile(r'<h3><a .*?>(.*?)</a></h3>', _REGEX_FLAGS)
_SUMMARY_RE = re.compile(r'<div class="desc">.*?(.*?)</div>', _REGEX_FLAGS)
_LINK_RE = re.compile(r'<h3><a href="(.*?)" .*?>.*?</a></h3>', _REGEX_FLAGS)
_PARAGRAPH_RE = re.compile(r'<p>(.*?)</p>', _REGEX_FLAGS)


def _fetch_html(target):
    """Download *target* and return its body decoded as UTF-8 (bad bytes dropped)."""
    req = request.Request(target, headers=_HEADERS)
    # The context manager closes the connection even if read() fails.
    with request.urlopen(req) as response:
        return response.read().decode("utf-8", "ignore")


def _append_rows(path, rows):
    """Append *rows* to the CSV file at *path*, creating its directory if needed."""
    import os

    if not rows:
        return
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    # newline='' is required by the csv module; without it every row is
    # followed by a blank line on Windows.
    with open(path, 'a', encoding='utf-8', newline='') as file:
        csv.writer(file).writerows(rows)


def neihanba(url, beginPage, endPage):
    """Scrape joke listing pages and the articles they link to into CSV files.

    For every page number in ``range(beginPage, endPage)`` the listing page is
    downloaded, and one ``(title, summary)`` row per joke is appended to
    ``./data1/neihanba.csv``. Each article linked from the listing is then
    downloaded and its ``<p>`` paragraphs are appended, one per row, to
    ``./data1/neihanba1.csv``.

    Args:
        url: Base URL of the listing directory, ending with ``/``.
        beginPage: First page number to fetch. Page numbers <= 1 map to
            ``index.html``; others map to ``index_<n>.html``.
        endPage: Page number at which to stop. It is NOT fetched (half-open
            range, same as ``range``).

    Network errors (``urllib.error.URLError``) are printed and the affected
    page or article is skipped; scraping continues with the next one.
    """
    for page in range(beginPage, endPage):
        if page <= 1:
            fullurl = url + "index.html"
        else:
            fullurl = url + "index_%s" % page + ".html"

        try:
            listing_html = _fetch_html(fullurl)
        except error.URLError as e:
            print(e)
            continue

        # One CSV row per joke. zip() pairs the n-th title with the n-th
        # summary and stops at the shorter list if the page is irregular.
        titles = _TITLE_RE.findall(listing_html)
        summaries = _SUMMARY_RE.findall(listing_html)
        _append_rows('./data1/neihanba.csv', list(zip(titles, summaries)))

        for href in _LINK_RE.findall(listing_html):
            # urljoin copes with both site-relative and absolute hrefs.
            article_url = parse.urljoin(_SITE_ROOT, href)
            try:
                article_html = _fetch_html(article_url)
            except error.URLError as e:
                # A single broken article must not abort the rest of the page.
                print(e)
                continue

            paragraphs = _PARAGRAPH_RE.findall(article_html)
            # The last two <p> matches are dropped, as in the original code
            # (presumably page footer text — TODO confirm against the site).
            _append_rows('./data1/neihanba1.csv',
                         [[text] for text in paragraphs[:-2]])




if __name__ == "__main__":
    # Install a process-wide urllib opener that goes through a fixed proxy.
    # NOTE(review): the mapping only has an "http" entry while the target URL
    # below is https — confirm the proxy is actually applied to these requests.
    request.install_opener(
        request.build_opener(
            request.ProxyHandler({"http": "118.31.220.3:8080"})
        )
    )
    # The page range is read interactively; both answers must parse as int.
    first_page = int(input("请输入起始页:"))
    last_page = int(input("请输入终止页:"))
    neihanba("https://www.neihan8.com/article/", first_page, last_page)

 

THE END

发表回复