Flask框架小程序后端分离开发学习笔记《4》向服务器端发送模拟请求-爬虫

2024-01-20 10:04:05
开发
32

Flask框架小程序后端分离开发学习笔记《4》向服务器端发送模拟请求-爬虫

Flask是一个基于Python的后端Web框架。由于小程序需要后端支持，遂学习一下后端开发。
下面的代码是一个比较老的版本了，仅供借鉴参考。

import socket
import ssl


def parsed_url(url):
    """Split a URL into its ``(protocol, host, port, path)`` parts.

    Only ``http://`` and ``https://`` prefixes are recognised; a URL without
    either prefix is treated as plain http.

    Example: ``https://www.baidu.com:2024/apple`` becomes
    ``('https', 'www.baidu.com', 2024, '/apple')``.

    Args:
        url: The URL string to split.

    Returns:
        A tuple ``(protocol, host, port, path)`` where ``port`` is an int and
        ``path`` always starts with ``/`` (it defaults to ``/``).

    Raises:
        ValueError: If an explicit port is present but is not an integer.
    """
    # Strip the scheme by prefix length instead of splitting on '://', so a
    # second '://' later in the URL (e.g. inside a query string) is preserved.
    protocol = 'http'
    u = url
    if url.startswith('http://'):
        u = url[len('http://'):]
    elif url.startswith('https://'):
        protocol = 'https'
        u = url[len('https://'):]

    # Everything up to the first '/' is host[:port]; the rest is the path.
    i = u.find('/')
    if i == -1:
        host = u
        path = '/'
    else:
        host = u[:i]
        path = u[i:]

    # Default port for each protocol, used when the URL names none.
    port_dict = {
        'http': 80,
        'https': 443,
    }
    port = port_dict[protocol]
    if ':' in host:
        h = host.split(':')
        host = h[0]
        # An empty port ("g.cn:") keeps the protocol default instead of
        # crashing in int('').
        if h[1]:
            port = int(h[1])
    return protocol, host, port, path


#以下test开头的函数是单元测试
def test_parsed_url():
    """Check parsed_url against a table of representative URLs.

    Each entry maps an input URL to the (protocol, host, port, path) tuple
    that parsed_url is expected to return for it.
    """
    http, https = 'http', 'https'
    host, path = 'g.cn', '/'
    cases = {
        # Plain http, with and without an explicit port or trailing slash.
        'http://g.cn': (http, host, 80, path),
        'http://g.cn/': (http, host, 80, path),
        'http://g.cn:90': (http, host, 90, path),
        'http://g.cn:90/': (http, host, 90, path),
        # A couple of typical https cases are enough.
        'https://g.cn': (https, host, 443, path),
        'https://g.cn:233/': (https, host, 233, path),
    }

    for url, expected in cases.items():
        actual = parsed_url(url)
        # A failing assert stops the run and reports the message built here.
        message = "parsed_url ERROR，{}{}{}".format(url, actual, expected)
        assert actual == expected, message


def socket_by_protocol(protocol,host):
    """Return a new, unconnected socket suited to *protocol*.

    For 'https' the socket is wrapped with a default SSL context, using
    *host* as the server hostname; any other protocol gets a plain socket.
    """
    plain = socket.socket()
    if protocol != 'https':
        return plain
    # Wrap the plain socket using a default SSL context.
    tls_context = ssl.create_default_context()
    return tls_context.wrap_socket(plain, server_hostname=host)

def response_by_socket(s):
    """Read from socket *s* until it returns no more data.

    Args:
        s: An object with a ``recv(size)`` method, normally a connected socket.

    Returns:
        All received data concatenated into one ``bytes`` object.
    """
    chunk_size = 1024
    # iter(callable, sentinel) keeps calling recv until it yields b'',
    # which is how recv reports that there is nothing more to read.
    return b''.join(iter(lambda: s.recv(chunk_size), b''))

def parsed_response(r):
    """Parse a raw HTTP response string into status code, headers and body.

    Args:
        r: The decoded response text: a status line, header lines and a body,
            with the header section ending at the first blank line
            (``\\r\\n\\r\\n``).

    Returns:
        A tuple ``(status_code, headers, body)`` where ``status_code`` is an
        int, ``headers`` is a dict of header name to value (a repeated header
        keeps its last value) and ``body`` is a str. ``body`` is ``''`` when
        the response has no blank-line separator.

    Raises:
        IndexError: If the status line has no status-code field.
        ValueError: If the status-code field is not an integer.
    """
    # partition never fails: a response without a body separator yields an
    # empty body instead of an unpacking error.
    header, _, body = r.partition('\r\n\r\n')
    lines = header.split('\r\n')
    # The status line looks like "HTTP/1.1 301 Moved Permanently".
    status_code = int(lines[0].split()[1])

    headers = {}
    for line in lines[1:]:
        # Split on the first ':' only, so values containing ': ' and headers
        # written without a space after the colon both parse correctly.
        name, sep, value = line.partition(':')
        if not sep:
            # Not a "name: value" line; skip it rather than abort the parse.
            continue
        headers[name] = value.strip()
    return status_code, headers, body


def get(url):
    """Send an HTTP GET request for *url* and return the parsed response.

    A 301 or 302 response that carries a Location header is followed by
    requesting the new address.

    Args:
        url: The address to request; see parsed_url for the accepted forms.

    Returns:
        A tuple ``(status_code, headers, body)`` as produced by
        parsed_response.
    """
    protocol, host, port, path = parsed_url(url)
    encoding = 'utf-8'

    # The socket is built differently depending on the protocol.
    s = socket_by_protocol(protocol, host)
    try:
        s.connect((host, port))
        # The request line must carry the path, and "Connection: close" asks
        # the server to close the connection after responding, which is what
        # ends the read loop in response_by_socket.
        request = 'GET {} HTTP/1.1\r\nHost: {}\r\nConnection: close\r\n\r\n'.format(path, host)
        # sendall keeps writing until the whole request has been sent.
        s.sendall(request.encode(encoding))
        response = response_by_socket(s)
    finally:
        # Always release the socket, including when an error occurs.
        s.close()

    r = response.decode(encoding)
    status_code, headers, body = parsed_response(r)
    if status_code in [301, 302]:
        # Header names are case-insensitive, so do not rely on exact casing.
        location = next(
            (v for k, v in headers.items() if k.lower() == 'location'),
            None,
        )
        if location:
            return get(location)

    return status_code, headers, body

def test_get():
    """Smoke-test get() over both HTTP and HTTPS.

    There are no assertions: get() is simply called for each URL and the
    result printed, so any exception it raises fails the run.
    """
    targets = (
        'http://movie.douban.com/top250',
        'https://movie.douban.com/top250',
    )
    for target in targets:
        status_code, headers, body = get(target)
        print(status_code, headers, body)


def test():
    """Entry point that runs every test function in this file, in order."""
    for case in (test_parsed_url, test_get):
        case()

# Run the tests only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    test()
    # main()


# 代码注意模块化，写what不写how，不写具体怎么实现，具体怎么实现就封装起来

最后尝试请求豆瓣的网页，并未得到期望的页面内容。我怀疑是对方有反爬手段：我们的请求中还缺少 User-Agent 等浏览器通常会携带的请求头，所以看起来不像是浏览器发出的请求。后续会继续学习，解决这个问题。

原文地址:https://blog.csdn.net/qq_55473229/article/details/135709124 本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若转载，请注明出处：https://www.suanlizi.com/kf/1748526781784264704.html 如若内容造成侵权/违法违规/事实不符，请联系《酸梨子》网邮箱：1419361763@qq.com进行投诉反馈，一经查实，立即删除！

阅读全部