爬取研招网

项目指路

https://gitee.com/chenjian0502/yanzhaowang

实现功能

用于爬取研招网的信息:输入省份编号以及专业代码,即可抓取相关信息,包括报考人数、专业、学院、学校、考试科目等。
(此处原有两张运行效果截图,转载时图片已丢失,可在上面的项目仓库中查看)

代码

import requests
from bs4 import BeautifulSoup
from pandas.core.frame import DataFrame
import re
import time


class Graduate:
    """Crawler for yz.chsi.com.cn (China postgraduate admissions site).

    Given a province code and a discipline-category code, it walks the
    query results (school pages -> college pages) and collects enrollment
    information per college page: school, college, major, planned intake,
    exam subjects, etc.  Rows accumulate in ``self.data`` and are finally
    written to a CSV file.
    """

    # Seconds to wait on any HTTP request before giving up.
    TIMEOUT = 15

    def __init__(self, province, category):
        """province: numeric province code (e.g. "44" = Guangdong);
        category: discipline-category code (e.g. "0812")."""
        self.head = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKi"
                          "t/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
        }
        self.data = []  # one flat row (list) per crawled college page
        self.province = province
        self.category = category

    def get_list_fun(self, url, name):
        """Download one JSON option list and dump it to <name>.txt, one entry per line."""
        response = requests.get(url, headers=self.head, timeout=self.TIMEOUT)
        options = response.json()
        # Fix: entries contain Chinese text — force utf-8 instead of the
        # platform default encoding, which can fail or garble on Windows.
        with open("{}.txt".format(name), "w", encoding="utf-8") as f:
            for option in options:
                f.write(str(option))
                f.write("\n")

    def get_list(self):
        """Fetch the province / discipline-category / major code lists into txt files."""
        self.get_list_fun("https://yz.chsi.com.cn/zsml/pages/getSs.jsp", "province")
        self.get_list_fun('https://yz.chsi.com.cn/zsml/pages/getMl.jsp', "category")
        self.get_list_fun('https://yz.chsi.com.cn/zsml/pages/getZy.jsp', 'major')

    def get_school_url(self):
        """POST the query form and return the relative URLs of matching schools.

        Province (ssdm) and category (yjxkdm) are required; major is optional.
        'xxfs': "1" restricts results to full-time programmes.
        """
        url = "https://yz.chsi.com.cn/zsml/queryAction.do"
        form = {
            "ssdm": self.province,
            "yjxkdm": self.category,
            'xxfs': "1",  # full-time only; change to crawl other study modes
        }
        response = requests.post(url, data=form, headers=self.head,
                                 timeout=self.TIMEOUT)
        # Grab the result-table rows first, then the school links inside them.
        rows = re.findall(r'(<tr>.*? </tr>)', response.text, re.S)
        return re.findall('<a href="(.*?)" target="_blank">.*?</a>', str(rows))

    def get_college_data(self, url):
        """Return the college detail-page URLs listed on one school's result page."""
        response = requests.get(url, headers=self.head, timeout=self.TIMEOUT)
        return re.findall('<td class="ch-table-center"><a href="(.*?)" '
                          'target="_blank">查看</a>', response.text)

    @staticmethod
    def _extract_count(text):
        """Return the first run of digits in *text* as a string, or None if absent."""
        match = re.search(r'\d+', text)
        return match.group() if match else None

    def get_final_data(self, url):
        """Crawl one college page and append its flattened data row to self.data.

        The row mixes the page's summary cells (school, college, major, ...)
        with the exam-subject cells reduced to their CJK/digit tokens; for
        planned-intake cells an extra label plus the numeric count is added.
        """
        response = requests.get(url, headers=self.head, timeout=self.TIMEOUT)
        soup = BeautifulSoup(response.text, features='lxml')
        summary = soup.find_all('td', {"class": "zsml-summary"})
        subject_rows = soup.find_all('tbody', {"class": "zsml-res-items"})

        # Reduce each exam-subject cell to its Chinese characters and digits.
        subjects = []
        for tbody in subject_rows:
            for td in tbody.find_all('td'):
                subjects.append(re.findall(r'[\u4e00-\u9fa5\d]+', td.text))

        temp = []
        for cell in summary + subjects:
            if isinstance(cell, list):  # already-tokenized subject cell
                temp.append(cell)
                continue
            # Fix: always append the text itself; the old `get_text() or x`
            # pushed a bs4 Tag object into the row when the cell was empty.
            text = cell.get_text()
            temp.append(text)
            # Planned-intake cells look like e.g. "...专业:10(不含推免)...";
            # record a label plus the number.  The nesting mirrors the site's
            # three phrasings and must be checked in this order — do not merge
            # the inner conditions into the elif chain.
            if "专业" in text:
                if ("不含推免" in text and "不区分" not in text
                        and "不分" not in text):
                    temp.append("专业")
                    count = self._extract_count(text)
                    if count is not None:
                        temp.append(count)
            elif "研究方向" in text:
                if ("不含推免" in text and "不区分" not in text
                        and "不分" not in text):
                    temp.append("研究方向")
                    count = self._extract_count(text)
                    if count is not None:
                        temp.append(count)
            elif "一级学科" in text:
                if "不含推免" in text:
                    temp.append("一级学科")
                    count = self._extract_count(text)
                    if count is not None:
                        temp.append(count)
        self.data.append(temp)

    def get_schools_data(self):
        """Crawl every matching school, then every college inside each school."""
        base = "https://yz.chsi.com.cn"
        schools_url = self.get_school_url()
        amount = len(schools_url)
        for i, school_url in enumerate(schools_url, start=1):
            # All college links belonging to this one school.
            colleges_url = self.get_college_data(base + school_url)
            # Fix: the counter counts schools (学校), not colleges (学院).
            print("已完成第" + str(i) + "/" + str(amount) + "学校爬取")
            time.sleep(1)  # throttle so we do not hammer the server
            for college_url in colleges_url:
                self.get_final_data(base + college_url)

    def get_data_frame(self):
        """Write the accumulated rows to <province><category>.csv.

        utf_8_sig prepends a BOM so Excel opens the Chinese text correctly.
        """
        DataFrame(self.data).to_csv(self.province + self.category + ".csv",
                                    encoding="utf_8_sig")


if __name__ == '__main__':
    # Interactive entry point: prompt for the two query codes, crawl, and
    # export the CSV.  Example codes (full lists via Graduate.get_list()):
    # provinces 42=Hubei, 43=Hunan, 44=Guangdong; categories 0812=computer
    # technology, 0835=software engineering, 0854=electronic information,
    # 0501=Chinese language & literature.  To batch-crawl, loop over lists
    # of codes and run one Graduate instance per (province, category) pair.
    province_code = input("请输入查询学校省份编号:")
    category_code = input("请输入查询专业代码:")
    print("开始爬取" + province_code + "的" + category_code)
    crawler = Graduate(province_code, category_code)
    crawler.get_schools_data()
    crawler.get_data_frame()
    print(province_code + category_code + '写入成功')

使用

    self.get_list_fun("https://yz.chsi.com.cn/zsml/pages/getSs.jsp", "province")
    self.get_list_fun('https://yz.chsi.com.cn/zsml/pages/getMl.jsp', "category")
    self.get_list_fun('https://yz.chsi.com.cn/zsml/pages/getZy.jsp', 'major')

上面的三个链接是用来获取省份类别和专业的

有了这些编号之后,将代码中的省份编号和专业代码替换成自己想要查询的就可以了。

可以看这个:https://gitee.com/chenjian0502/yanzhaowang,有什么问题可以问

相关推荐

  1. Python易云平台

    2024-04-08 13:58:03       13 阅读

最近更新

  1. TCP协议是安全的吗?

    2024-04-08 13:58:03       16 阅读
  2. 阿里云服务器执行yum,一直下载docker-ce-stable失败

    2024-04-08 13:58:03       16 阅读
  3. 【Python教程】压缩PDF文件大小

    2024-04-08 13:58:03       15 阅读
  4. 通过文章id递归查询所有评论(xml)

    2024-04-08 13:58:03       18 阅读

热门阅读

  1. 力扣-图论

    2024-04-08 13:58:03       12 阅读
  2. web蓝桥杯真题:图片水印生成

    2024-04-08 13:58:03       12 阅读
  3. 面经 (24-4)

    2024-04-08 13:58:03       14 阅读
  4. 力扣-搜索二维矩阵

    2024-04-08 13:58:03       13 阅读
  5. 3GPP-LTE Band26标准定义频点和信道(V17.3.0 (2022-09)

    2024-04-08 13:58:03       13 阅读
  6. Python基于Tkinter的加法游戏

    2024-04-08 13:58:03       12 阅读