# 省略了手工导出数据的麻烦,而且爱站说还要 VIP 会员才能导出。
# 自己能用就行,代码写得很渣。
import urllib.request
from lxml import etree
import time
import os
# Bypass any configured proxy for the target site.  The entry has to be a bare
# host suffix: urllib compares each NO_PROXY entry against the request host,
# so the earlier value 'aizhan.com/' (trailing slash) could never match.
os.environ['NO_PROXY'] = 'aizhan.com'

# Defaults reproducing the original hard-coded run: pages 1..50 of one domain.
DOMAIN = "huangye88.com"
FIRST_PAGE = 1
LAST_PAGE = 50
OUTPUT_PATH = "a.txt"
PAGE_DELAY_SECONDS = 10  # pause between page requests
KEYWORD_XPATH = '//a[@class="gray" and parent::*[@class="title" ]]/text()'


def build_page_url(page, domain=DOMAIN):
    """Return the baidurank.aizhan.com URL for page *page* of *domain*."""
    return ("https://baidurank.aizhan.com/baidu/" + domain
            + "/product/0/" + str(page) + "/exp/-1/")


def fetch_page(url):
    """Download *url* and return its body as text.

    The body is decoded as UTF-8 with undecodable bytes ignored, and the
    copyright sign (U+00A9) is removed, exactly as the script always did.

    Raises:
        urllib.error.URLError: if the request fails.
    """
    request = urllib.request.Request(url)
    # The context manager closes the HTTP response even if read() raises.
    with urllib.request.urlopen(request) as response:
        raw = response.read()
    return raw.decode('utf-8', 'ignore').replace(u'\xa9', u'')


def clean_keywords(text_nodes):
    """Concatenate *text_nodes* and drop every tab character.

    Newlines inside the nodes are kept; nothing else is inserted between
    nodes.
    """
    return ''.join(text_nodes).replace('\t', '')


def extract_keywords(html):
    """Return the cleaned text of every node matched by KEYWORD_XPATH."""
    selector = etree.HTML(html)
    return clean_keywords(selector.xpath(KEYWORD_XPATH))


def main(first_page=FIRST_PAGE, last_page=LAST_PAGE, output_path=OUTPUT_PATH,
         domain=DOMAIN, delay=PAGE_DELAY_SECONDS):
    """Scrape pages *first_page*..*last_page* (inclusive) into *output_path*.

    Each page's keywords are appended to the file as soon as the page has
    been parsed, so pages already fetched survive a later failure.
    """
    for page in range(first_page, last_page + 1):
        keywords = extract_keywords(fetch_page(build_page_url(page, domain)))
        # Explicit UTF-8 so the write does not depend on the platform's
        # default codec, which may be unable to encode some page text.
        with open(output_path, "a", encoding="utf-8") as f:
            f.write(keywords)
        time.sleep(delay)
        print("已经完成第" + str(page) + "页内容的抓取......")


if __name__ == "__main__":
    main()