一、使用
运行后会生成一个txt文件。
二、代码:
#encoding:utf-8
import struct
import binascii
class Baidu(object):
    """Convert a Baidu/Sogou-style dictionary (.scel) file to a plain-text word list.

    Pipeline: be2le() byte-swaps the big-endian source stream into
    ``<originfile>.le``; le2txt() then walks that stream as UTF-16 code
    units and writes one word per line to ``<originfile minus '.scel'>.txt``.
    """

    def __init__(self, originfile):
        """Record the source path and derive the intermediate/output paths.

        originfile -- path to the .scel file (must end in a 5-char suffix,
                      i.e. '.scel', for txtfile to be derived correctly).
        """
        self.originfile = originfile
        self.lefile = originfile + '.le'
        # Strip the 5-character '.scel' suffix and replace it with '.txt'.
        self.txtfile = originfile[0:(len(originfile) - 5)] + '.txt'
        # Retained for backward compatibility with the original implementation
        # (the swap no longer needs a scratch buffer).
        self.buf = [b'0' for _ in range(2)]
        self.listwords = []

    def be2le(self):
        """Swap every big-endian byte pair into little-endian order.

        Writes the swapped stream to self.lefile. Odd-length input is padded
        with a single zero byte (the original appended the whole buffer plus
        b'0000', corrupting the data).
        """
        with open(self.originfile, 'rb') as of:
            contents = of.read()
        if len(contents) % 2:
            # Pad to an even length so the stream splits into 2-byte units.
            contents += b'\x00'
        # Swap adjacent bytes via slice assignment (one C-level pass,
        # no per-byte struct.pack calls).
        swapped = bytearray(len(contents))
        swapped[0::2] = contents[1::2]
        swapped[1::2] = contents[0::2]
        with open(self.lefile, 'wb') as lef:
            lef.write(bytes(swapped))
        print('写入成功转为小端的字节流')

    def le2txt(self):
        """Decode the swapped stream into words, one word per line.

        Consecutive CJK characters (U+4E00..U+9FFF) form a word; any other
        character terminates the current word and flushes it to self.txtfile.
        """
        with open(self.lefile, 'rb') as lef:
            # The dictionary payload starts at hex-string offset 0x350
            # (= byte offset 0x1A8) in the converted stream.
            le_bytes = lef.read().hex()[0x350:]
        # Explicit UTF-8 so the output does not depend on the locale encoding.
        with open(self.txtfile, 'w', encoding='utf-8') as txtf:
            i = 0
            while i < len(le_bytes):
                result = le_bytes[i:i + 4]
                i += 4
                # Each 4-hex-char chunk is one UTF-16-BE code unit.
                content = binascii.a2b_hex(result).decode('utf-16-be')
                if '\u4e00' <= content <= '\u9fff':
                    self.listwords.append(content)
                elif self.listwords:
                    # Non-CJK character ends the current word.
                    txtf.write(''.join(self.listwords) + '\n')
                    self.listwords = []
            # Flush a trailing word at EOF (the original silently dropped it).
            if self.listwords:
                txtf.write(''.join(self.listwords) + '\n')
                self.listwords = []
        print('写入txt成功')
if __name__ == '__main__':
    # Convert the bundled calligraphy-vocabulary dictionary to a txt file.
    scel_path = r'书法词汇大全【官方推荐】.scel'
    converter = Baidu(scel_path)
    converter.be2le()
    converter.le2txt()
三、采集百度搜索结果
import requests
import tldextract
import time
from lxml import etree
def sava_data(filename, content):
    """Append *content* followed by a newline to *filename* (UTF-8)."""
    with open(filename, mode="a", encoding="utf-8") as sink:
        sink.write(content)
        sink.write('\n')
def Redirect(url):
    """Follow redirects for *url* and return the final landing URL.

    On any request failure the original *url* is returned unchanged; the
    1-second sleep throttles the crawler after an error.
    """
    try:
        final = requests.get(url, timeout=10)
    except Exception as exc:
        print("4", exc)
        time.sleep(1)
        return url
    return final.url
def baidu_search(wd, pn_max):
    """Scrape Baidu search results for keyword *wd* over *pn_max* pages.

    Returns a list of result URLs after following each redirect link.
    Pages that fail to load are skipped; a result whose xpath yields no
    href is now skipped individually (the original raised IndexError,
    which the broad except caught — aborting the rest of that page).
    """
    url = "https://www.baidu.com/s"
    # The headers are identical for every page, so build the dict once
    # instead of on every loop iteration.
    headers = {
        'pragma': "no-cache",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.8",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'cache-control': "no-cache",
        'connection': "keep-alive",
    }
    return_url = []
    for page in range(pn_max):
        pn = page * 10  # Baidu paginates with a 10-result offset parameter.
        querystring = {"wd": wd, "pn": pn}
        try:
            response = requests.request("GET", url, headers=headers, params=querystring)
            # Parse the result page; force UTF-8 decoding.
            selector = etree.HTML(response.text, parser=etree.HTMLParser(encoding='utf-8'))
            # Each organic result div carries a sequential numeric id.
            for i in range(1, 11):
                context = selector.xpath('//*[@id="' + str(pn + i) + '"]/h3/a[1]/@href')
                if not context:
                    # Missing/ad slot — skip this result, keep the page.
                    continue
                # Resolve the Baidu redirect link to the real target URL.
                return_url.append(Redirect(context[0]))
        except Exception as e:
            print("页面加载失败", e)
            continue
    return return_url
def get_root_domain(url):
    """Return the registrable root domain ('<domain>.<suffix>') of *url*.

    Yields the string '.' when tldextract finds neither a domain nor a
    known public suffix (e.g. for malformed links).
    """
    parts = tldextract.extract(url)
    return f"{parts.domain}.{parts.suffix}"
# Read keywords (one per line), search each on Baidu, and append the root
# domain of every non-Baidu result to the output file.
with open("keywordstest.txt", "r", encoding='utf-8') as f:
    for line in f:
        # Strip the trailing newline so it is not sent as part of the
        # query (the original passed 'keyword\n' to baidu_search).
        keyword = line.strip()
        if not keyword:
            continue  # skip blank lines
        baidu_urls = baidu_search(keyword, 3)
        for url in baidu_urls:
            if "baidu.com" in url:
                continue  # ignore Baidu's own properties
            root = get_root_domain(url)
            if root == ".":
                continue  # tldextract found no domain/suffix — junk link
            print(root)
            sava_data("rusult7.txt", root)
Updated on May-23-2020
三、强子的方法
1.强子在视频中讲的获取百度搜索结果中的真实网址的方法:
for i in urls:
req = urllib2.Request(i)
req.add_header('user-agent','')
realurl = urllib.urlopen(req).geturl()
url = realurl.split('/')[-2] #获取根域名
参考:https://www.bilibili.com/video/BV1mW411v743?from=search&seid=3477489273158434483
2.还可以使用递归:
def getlist(keyword,domain,pn=0):
if pn < 50:
pn += 10
return getlist(keyword,domain,pn=pn)
这个人的许多python视频不错:
https://space.bilibili.com/266594796