Home > python读取搜狗字库及采集百度

python读取搜狗字库及采集百度

0

一、使用
运行后会生成一个txt文件。

二、代码:

#encoding:utf-8
import struct
import binascii

class Baidu(object):
    """Convert a Sogou .scel lexicon file into a plain-text word list.

    Two-step pipeline:
      1. ``be2le()``  - byte-swap every 16-bit unit of the .scel stream and
         write the result to ``<originfile>.le``.
      2. ``le2txt()`` - scan the swapped stream, decode 16-bit units as
         UTF-16 code points, and write each run of CJK characters as one
         word per line to ``<basename>.txt``.
    """

    def __init__(self, originfile):
        """Derive the intermediate (.le) and output (.txt) paths.

        :param originfile: path to a ``.scel`` file (5-char extension is
            stripped to build the .txt name).
        """
        self.originfile = originfile
        self.lefile = originfile + '.le'
        self.txtfile = originfile[:-5] + '.txt'
        self.listwords = []  # accumulator for the CJK chars of the current word

    # Byte-swap the stream: big-endian 16-bit units -> little-endian.
    def be2le(self):
        """Write a byte-swapped copy of the origin file to ``self.lefile``."""
        with open(self.originfile, 'rb') as src:
            contents = src.read()
        # Pad with one NUL byte so the length is even.  (The original code
        # did `contents += contents + b'0000'`, which doubled the stream and
        # appended four bytes instead of padding one — a corruption bug.)
        if len(contents) % 2:
            contents += b'\x00'
        swapped = bytearray(len(contents))
        swapped[0::2] = contents[1::2]
        swapped[1::2] = contents[0::2]
        with open(self.lefile, 'wb') as dst:
            dst.write(bytes(swapped))
        print('写入成功转为小端的字节流')

    def le2txt(self):
        """Decode the swapped stream and write one word per line to the txt file."""
        with open(self.lefile, 'rb') as lef:
            # NOTE(review): slicing the HEX string at 0x350 corresponds to
            # byte offset 0x1A8, not byte 0x350; preserved as in the original.
            hex_stream = lef.read().hex()[0x350:]
        # Explicit UTF-8 so output does not depend on the platform default.
        with open(self.txtfile, 'w', encoding='utf-8') as txtf:
            # Step by 4 hex digits = one 16-bit code unit; an incomplete
            # trailing chunk is skipped (the original while-loop crashed on it).
            for i in range(0, len(hex_stream) - 3, 4):
                content = binascii.a2b_hex(hex_stream[i:i + 4]).decode('utf-16-be')
                if '\u4e00' <= content <= '\u9fff':
                    # CJK character: extend the current word.
                    self.listwords.append(content)
                    continue
                # Any non-CJK unit (pinyin, digits, NUL) ends the word; flush it.
                if self.listwords:
                    txtf.write(''.join(self.listwords) + '\n')
                self.listwords = []
            # Flush a trailing word that the stream ended without a delimiter.
            if self.listwords:
                txtf.write(''.join(self.listwords) + '\n')
                self.listwords = []
        print('写入txt成功')

if __name__ == '__main__':
    # Convert the bundled Sogou .scel lexicon into a plain-text word list.
    scel_path = r'书法词汇大全【官方推荐】.scel'
    converter = Baidu(scel_path)
    converter.be2le()
    converter.le2txt()

三、采集百度搜索结果

import requests
import tldextract
import time
from lxml import etree


def sava_data(filename, content):
    """Append *content* as a single line to *filename* (UTF-8, append mode)."""
    with open(filename, "a", encoding="utf-8") as sink:
        sink.writelines([content, "\n"])

def Redirect(url):
    """Follow HTTP redirects and return the final URL.

    Best-effort: on any request failure the input *url* is returned
    unchanged after a short back-off.
    """
    try:
        resp = requests.get(url, timeout=10)
    except Exception as e:
        print("4", e)
        time.sleep(1)
        return url
    return resp.url

def baidu_search(wd, pn_max):
    """Scrape Baidu web search for keyword *wd* over *pn_max* result pages.

    :param wd: search keyword.
    :param pn_max: number of pages to fetch (10 results per page).
    :return: list of redirect-resolved result URLs (may contain duplicates).

    A page that fails to load is skipped; a missing result slot on a page
    is skipped individually (the original indexed ``context[0]`` blindly, so
    one missing anchor raised IndexError and aborted the whole page).
    """
    url = "https://www.baidu.com/s"
    return_url = []
    # Request headers are loop-invariant; build them once.
    headers = {
        'pragma': "no-cache",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.8",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'cache-control': "no-cache",
        'connection': "keep-alive",
        }
    for page in range(pn_max):
        pn = page * 10
        querystring = {"wd": wd, "pn": pn}
        try:
            response = requests.request("GET", url, headers=headers, params=querystring)
            selector = etree.HTML(response.text, parser=etree.HTMLParser(encoding='utf-8'))
            # Organic results carry sequential numeric ids pn+1 .. pn+10.
            for i in range(1, 11):
                context = selector.xpath('//*[@id="' + str(pn + i) + '"]/h3/a[1]/@href')
                if not context:
                    # Fewer than 10 results, or the page layout changed:
                    # skip this slot rather than abort the page.
                    continue
                # Resolve the Baidu redirect URL to the real target.
                return_url.append(Redirect(context[0]))
        except Exception as e:
            print ("页面加载失败", e)
            continue
    return return_url

def get_root_domain(url):
    """Return the registrable root domain of *url*, e.g. ``example.com``."""
    parts = tldextract.extract(url)
    return f"{parts.domain}.{parts.suffix}"


with open("keywordstest.txt", "r",encoding='utf-8') as f:
  for line in f.readlines():
    baidu_urls = baidu_search(line, 3)
    for url in baidu_urls:
        if "baidu.com" in url:
            pass
        else:
            # print(url)
            url = get_root_domain(url)
            if url == ".":
                pass
            else:
                print(url)
                sava_data("rusult7.txt",url)

Updated on May-23-2020
四、强子的方法
1.强子在视频中讲的获取百度搜索结果中的真实网址的方法:

# Python 2 snippet (urllib2/urllib) transcribed from the video: follow each
# search-result redirect to recover the real target URL.
# NOTE(review): 'userg-agent' looks like a typo for 'user-agent', and the
# last line presumably should split `realurl`, not `url` — confirm against
# the original video before reusing.
for i in urls:
    req = urllib2.Request(i)
    req.add_header('userg-agent','')
    realurl = urllib.urlopen(req).geturl()
    url = url.split('/')[-2]   # get the root domain

参考:https://www.bilibili.com/video/BV1mW411v743?from=search&seid=3477489273158434483

2.还可以使用递归:

def getlist(keyword, domain, pn=0):
    """Illustrative recursive pagination stub.

    Advances *pn* in steps of 10 and recurses until it reaches 50, then
    returns None (the fetch logic itself is left out in the tutorial).
    """
    if pn >= 50:
        return None
    return getlist(keyword, domain, pn=pn + 10)

这个人的许多python视频不错:
https://space.bilibili.com/266594796

本文暂无标签

发表评论

*

*