# selenium模拟登录爬取多页面vivo手机信息,xpath进行数据解析,最终追加保存为csv形式
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import pandas as pd
import time
import os
# selenium通过搜索进入手机信息页面
def login(url):
    """Open the vivo homepage, search for '手机' (phones), and return
    the page source of the first result page.

    Relies on the module-level ``driver`` and ``wait`` objects created
    in the ``__main__`` section.
    """
    driver.get(url)
    driver.maximize_window()
    # Expand the search box in the page header.
    wait.until(EC.element_to_be_clickable(
        (By.XPATH, '//span[@class="vp-head-search"]'))).click()
    # Focus the search input field.
    input_box = wait.until(EC.element_to_be_clickable(
        (By.XPATH, '//div[@class="vp-head-search-input"]/input')))
    input_box.click()
    # Type the query, then confirm the search.
    query_field = wait.until(EC.presence_of_element_located(
        (By.XPATH, '//div[@class="vp-head-search-input"]/input')))
    query_field.send_keys('手机')
    confirm = wait.until(EC.element_to_be_clickable(
        (By.XPATH, '//span[@class="vp-head-search-confirm"]')))
    confirm.click()
    # Crude fixed wait for the result list to render before grabbing HTML.
    time.sleep(5)
    return driver.page_source
# 通过点击下一页按钮,获取出第1页外其它页网页源代码
def get_next_page():
    """Click the "next page" control and return the new page source.

    Uses the module-level ``driver``/``wait``; assumes the current page
    has a paging control near the bottom.
    """
    # Scroll to the bottom so the paging control is visible/clickable.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    # NOTE(review): 'pagingR' is used here as a CSS *tag* selector; if the
    # button is actually marked class="pagingR" this should be '.pagingR'
    # — confirm against the live page before changing it.
    next_btn = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, 'pagingR')))
    next_btn.click()
    # Crude fixed wait for the next page of results to render.
    time.sleep(5)
    return driver.page_source
def parse_page(html):
    """Extract product name, price and summary from one result page.

    Parameters
    ----------
    html : str
        Raw page source as returned by selenium.

    Returns
    -------
    pandas.DataFrame
        Columns 商品名称 / 商品价格 / 商品概述, one row per product.
        pandas raises ValueError if the three XPath queries return
        lists of different lengths.
    """
    dom = etree.HTML(html)
    # Product name: first <span> inside each item title.
    name = dom.xpath('//div[@class="itemTitle"]/span[1]/text()')
    print(name)
    print(len(name))
    # Product price: second <span> inside each item title.
    money = dom.xpath('//div[@class="itemTitle"]/span[2]/text()')
    print(money)
    # Bug fix: original printed len(name) again instead of len(money).
    print(len(money))
    # Product summary text.
    info = dom.xpath('//div[@class="itemText"]/text()')
    print(info)
    # Bug fix: original printed len(name) again instead of len(info).
    print(len(info))
    data = pd.DataFrame({
        '商品名称': name,
        '商品价格': money,
        '商品概述': info,
    })
    return data
#每页数据以追加形式保存至csv文件
def save_file(data):
    """Append one page of results (a DataFrame) to the output CSV.

    The header row is written only when the file does not exist yet, so
    repeated calls accumulate rows under a single header.
    """
    columns = ['商品名称', '商品价格', '商品概述']
    filename = 'vivo手机与手机配件信息.csv'
    # Emit the header only on the very first write (file not yet present).
    write_header = not os.path.exists(filename)
    data.to_csv(filename, mode='a', encoding='utf_8_sig',
                columns=columns, index=False, header=write_header)
    print("保存成功!")
if __name__ == '__main__':
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    url = "https://vivo.com.cn/"
    try:
        # Search for phones and land on the first result page.
        html = login(url)
        # Pages 1-7: phone listings.
        for i in range(1, 8):
            if i != 1:  # every page after the first needs a "next" click
                html = get_next_page()
            data = parse_page(html)
            print(data)
            save_file(data)
            print("手机的第{}页爬取完成!".format(i))
        # selenium: click the accessories tab to switch listings.
        search_btn = wait.until(
            EC.element_to_be_clickable((By.ID, 'tab_contentAccessories')))
        search_btn.click()
        # Bug fix: refresh the page source after switching tabs — the
        # original re-parsed the last *phone* page as accessories page 1.
        time.sleep(5)
        html = driver.page_source
        # Pages 1-9: accessory listings.
        for i in range(1, 10):
            if i != 1:
                html = get_next_page()
            data = parse_page(html)
            print(data)
            save_file(data)
            print("配件的第{}页爬取完成!".format(i))
    finally:
        # Bug fix: quit() ends the whole session and the chromedriver
        # process (close() only closes the current window); try/finally
        # guarantees cleanup even when a page fails to parse.
        driver.quit()