Home >  > Requests模板(Get与Post)芝麻代理

Requests模板(Get与Post)芝麻代理

0

一、这是发送手机验证码的模板:

import requests
import json


class YunPian(object):
    """Minimal Yunpian SMS client used to send a verification-code text."""

    def __init__(self, api_key):
        # api_key: Yunpian account key; sent as "apikey" with every request.
        self.api_key = api_key
        # NOTE(review): Yunpian's single-send endpoint is normally
        # "https://sms.yunpian.com/v2/sms/single_send.json" — this URL looks
        # truncated; confirm against the Yunpian API docs.
        self.single_send_url = "https://sms.yunpian.com/v2/"

    def send_sms(self, code, mobile):
        """POST a verification-code SMS to `mobile`.

        Returns the decoded JSON response dict (also printed, as before).
        """
        params = {
            "apikey": self.api_key,
            "mobile": mobile,
            "text": "您的验证码是{code},如非本人操作,请忽略本信息。".format(code=code)
        }

        # POST sends the form via data=; a GET would use params= instead.
        response = requests.post(self.single_send_url, data=params)
        re_dict = response.json()  # idiom: same as json.loads(response.text)
        print(re_dict)
        return re_dict  # fix: the result was computed but never returned

if __name__ == "__main__":
    # Demo run with a placeholder API key and a sample mobile number.
    sms_client = YunPian("xxxx")
    sms_client.send_sms("2007", "18782902356")

这里有一个坑要留意:requests.post(self.single_send_url, data=params)中,post请求用data=传递表单参数;如果用的是get方法,则应改用params=XXX的形式,比如params=payload。

备注:
这里有用到。

二、网页编码
1.编码

import requests

# Fetch the page, force UTF-8 decoding, then dump the body.
page = requests.get('https://www.baidu.com')
page.encoding = 'utf-8'
print(page.text)

2.requests自动识别编码

response = requests.get("http://www.x.com")
# Auto-detect the charset: apparent_encoding guesses it from the body bytes.
response.encoding = response.apparent_encoding

3.requests乱码
遇到乱码时,仅靠"html.text.encode('utf-8')"通常解决不了问题,打印不出正确结果。
乱码的解决方案:
按第1小节的3行写法,在读取text之前先设置html.encoding = 'utf-8'即可。

四、requests的返回

import requests

# fix: dropped the misleading data=None — a GET takes params=, not data=
# (the note in section one makes exactly this point). timeout=10 avoids
# hanging forever on a dead server.
response = requests.get("https://www.baidu.com", timeout=10)

# Response headers sent by the server:
# print("打印出服务器响应的header信息:",response.headers)

# HTTP status code of the response (200 on success):
print("打印出服务器响应的状态码:",response.status_code)

print(response)
print("打印出request",response.request)
print("打印出请求的cookie:",response.cookies)
print("判断请求是否ok:",response.ok)

五、request.get

def get_url_content(self, url, max_try_number=5):
    """Wrapper around requests.get with retries.

    Returns the response on success, or None after max_try_number failures.
    """
    for attempt in range(1, max_try_number + 1):
        try:
            return requests.get(url, timeout=5)
        except Exception as http_err:
            print(url, "抓取报错", http_err)
            if attempt >= max_try_number:
                print("尝试失败次数过多,放弃尝试")
                return None

如何处理request返回的报错信息?

from bs4 import BeautifulSoup

# An inline HTML fragment (despite the original name, this is not a URL).
html_fragment = '<div class="caller_ref">这是div</div> <a href="/tomasi/cardio/vgh/SPsdeGBHH">超链接</a>'

soup = BeautifulSoup(html_fragment, 'html.parser')
print("soup的长度是:{}".format(len(soup)))

# A missing element comes back as None rather than raising.
if soup.title is None:
    print("没有title,title is None")

# find() on a class that does not exist also yields None.
content = soup.find('div', {'class': 'caller_ref1'})
if content is None:
    print("没有找到div的值,div返回None")

soup的长度是:3
没有title,title is None
没有找到div的值,div返回None

所以,可以用len()判断soup是否为空(不能用None判断soup),而且即使soup为空白,程序也不会报错。
可以用is None来判断title、content的返回值是否为空。

六、添加代理
1.需要在http://h.zhimaruanjian.com/getapi/ 的白名单添加本机ip。u:138 Ps:F14

2.相关参数
获取数量选1个。

3.获取代理ip的代码
另外,发现采集国外网站的时候,用自己的ip采集很慢,有时采不到,用代理IP反而快很多。

import requests
import json

# 添加http代理,芝麻代理为例

# Browser-like request headers (Chrome UA, Host pinned to www.baidu.com)
# so the target serves the normal desktop page through the proxy.
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': '',
'DNT': '1',
'Host': 'www.baidu.com',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
}

# 获取代理
def get_http_ip():
    """Fetch one proxy "ip:port" string from the Zhima API.

    Returns the response body with surrounding whitespace stripped.
    Network/timeout errors propagate to the caller unchanged.
    """
    url = "http://webapi.http.zhimacangku.com/getip?num=1&type=1&pro=&city=0&yys=0&port=11&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions="
    html = requests.get(url, timeout=30)
    # idiom: .text already decodes the body — no need for
    # content.strip() followed by bytes.decode().
    return html.text.strip()


# def get_http_ip(num):
#   #使用json的方式取得代理ip
#   url= "http://webapi.http.zhimacangku.com/getip?num={}&type=2&pro=&city=0&yys=0&port=1&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions=".format(num)
#   html = requests.get(url, timeout=30).text
#   json_str = json.loads(html).get('data')
#   print(json_str)


proxy_ip = get_http_ip()
# Route both schemes through the fetched proxy (http:// prefix for both).
proxies = {
  "http": "http://{ip}".format(ip=proxy_ip),
  "https": "http://{ip}".format(ip=proxy_ip),
}

import requests

# Search Baidu for "ip" so the result page echoes the IP it saw,
# which should be the proxy's address.
html = requests.get('https://www.baidu.com/baidu?tn=dealio_dg&wd=ip', headers=headers, proxies=proxies)

html.encoding = 'utf-8'
content = html.text

import re

# fix: re.search may return None (page layout change, block page) and
# .group(1) would then raise AttributeError — guard before using it.
# Also removed the bare no-op statement `ip.group(1)`.
ip = re.search(r'class="c-gap-right">本机IP:&nbsp;(.*?)</span>',content)
if ip is not None:
    print(ip.group(1))

3.效果展示

4.代理池方案
https://blog.csdn.net/qq_37978800/article/details/108182356

七、采集sogou
(一)效果

(二)代码
发现加了cookie之后,不需要换IP,采集100页列表页都没有问题。

# -*- coding: utf-8 -*-
import requests
from lxml import etree
import random
import time

# Browser headers for weixin.sogou.com. The long session Cookie is the key
# piece: with it, many list pages can be crawled without rotating IPs.
headers = {
    "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding":"gzip, deflate, br",
    "Accept-Language":"zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Connection":"keep-alive",
    "Cookie":"IPLOC=CN4401; SUID=3D88E578302EA00A5EE0A5C700091A7A; CXID=2D2E573D28DBF81F0D5EE82AE194B591; wuid=AAFfVMjnMQAAAAqHDEAx3wAAkwA=; ssuid=4566914326; usid=MQ7Cat9I6sYTA4pv; SNUID=BCA801FC898D3212BEC05BC4892BFA83; SUV=00323ADCDF4A72625F8983AF6ABFA771; FREQUENCY=1602847663829_2; front_screen_resolution=1920*1080; front_screen_dpi=1; sw_uuid=1992113152; ad=wkllllllll2k46ovlllllplWvPwlllllWnQyZyllll9lllllxklll5@@@@@@@@@@; ABTEST=0|1613645893|v1; weixinIndexVisited=1; ld=nkllllllll2kaum2lllllplmziwlllllNYkP2yllllwlll ppinfo=fa4414ad0d; passport=5|1613732661|1614942261|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo1OkJvd2VufGNydDoxMDoxNjEzNzMyNjYxfHJlZm5pY2s6NTpCb3dlbnx1c2VyaWQ6NDQ6bzl0Mmx1Smo1YlMtYk9FRVFMblJMbmxiV3JWMEB3ZWl4aW4uc29odS5jb218|f003b0d15e|JeJkSzFeHtPrilwydblkqBT-ATqXzrachzaryN0RBTvCHeRXs03-K4fxZ1wfTRU027D4KSjaaG6Su6s2zp9AAjpjKwloLvQWBr3lUWVleKg8RszyfMm5Zpc6HqBQX7GfNyFl-p3KXcKdzNF3HHLxU2HJWnXed7AWY6i-TtbGvm8; sgid=05-49569309-AWAvmzW7J4aNYZkPO1K8yA8; ppmdig=1613732663000000b8e1c27bc97c631f915fcdca3f8f2c32",
    "Host":"weixin.sogou.com",
    "Referer":"https://weixin.sogou.com",
    "Upgrade-Insecure-Requests":"1",
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0",
} 


def get_proxy():
    """Fetch one proxy "ip:port" from the Zhima API; None on any failure."""
    url = "http://webapi.http.zhimacangku.com/getip?num=1&type=1&pro=&city=0&yys=0&port=1&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions="
    try:
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            # idiom: .text already decodes the body bytes.
            return response.text.strip()
        return None  # non-200: explicit failure instead of silent fall-through
    except requests.exceptions.RequestException:
        # fix: the original caught the *builtin* ConnectionError, which
        # requests' network errors do not derive from — the handler was dead
        # and timeouts/connect failures escaped. RequestException is the
        # common base of all requests errors.
        return None


# NOTE(review): url2 is defined but never used below — kept for reference.
url2 = "https://weixin.sogou.com/weixin?query=%E9%A3%8E%E6%99%AF&type=2&page=%d"+"&ie=utf8" ##generic page-template url
for pg in range(1,4):
    new_url = "https://weixin.sogou.com/weixin?query=%E9%A3%8E%E6%99%AF&type=2&page={}".format(pg)+"&ie=utf8"
    proxy = get_proxy()
    # fix: get_proxy() returns None on failure; 'http://' + None would raise
    # TypeError — fall back to a direct connection in that case.
    proxies = {'http': 'http://' + proxy} if proxy else None
    print("正在使用的代理是:",proxies)
    html2 = requests.get(url=new_url,headers=headers,proxies=proxies)
    print(html2.status_code,"-----"+str(pg)+new_url)
    # Pull the article titles out of each result box.
    tree2 = etree.HTML(html2.text)
    title = tree2.xpath("//div[@class='txt-box']/h3/a//text()")
    print(title)
    print("-"*80)
    time.sleep(1)  # the proxy API hands out only one IP per second


本文暂无标签

发表评论

*

*