Collecting related articles from CNKI (知网)
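
The short script below pages through CNKI's statistics/query interface, pulls each record's Chinese title, English title, and article URL out of the JSON response, and appends the results to an Excel workbook one batch at a time. Pages that fail to download are collected so a later run can resume from them.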

import os
import requests
import json
import pandas as pd

# Helper: write the collected data into an Excel workbook, merging with any existing sheet
def write_to_excel(result, path, sheet_name):
    if os.path.exists(path):
        # Merge with what is already on disk and drop duplicate rows
        original_df = pd.read_excel(path, sheet_name=sheet_name)
        result = pd.concat([original_df, result], axis=0, join='outer')
        result = result.drop_duplicates()
    # The context manager saves and closes the workbook; ExcelWriter.save() was removed in newer pandas
    with pd.ExcelWriter(path) as writer:
        result.to_excel(writer, sheet_name=sheet_name, index=False)
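
For reference, the helper can be exercised on its own with a small hand-made DataFrame; the file path and values below are placeholders for illustration only. The second call shows that drop_duplicates() keeps repeated rows out of the sheet.

demo = pd.DataFrame({
    'Chinese_title': ['示例标题'],
    'English_title': ['Example title'],
    'url': ['https://example.org/article/1'],
})
write_to_excel(demo, r'demo.xlsx', 'result')   # creates the workbook
write_to_excel(demo, r'demo.xlsx', 'result')   # merges and de-duplicates, so nothing changes
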
def crawl(step, start_point, Save_path):
    """step: pages fetched per batch; start_point: page to resume from; Save_path: output Excel file."""
    df = pd.DataFrame()
    count = 0
    total_pages = 2465                     # total number of result pages to fetch
    group_num = int(total_pages / step)    # number of complete batches of `step` pages
    start_group = int(start_point / step)  # batch that contains the starting page
    failure_get = []                       # page numbers that failed to download
    for group in range(start_group, group_num + 1):
        if group != group_num:
            # A complete batch: iterate over all `step` pages
            max_m = step + 1
        else:
            # The final partial batch: iterate over the remaining pages (the +1 keeps the last page)
            max_m = total_pages - step * group_num + 1
        for m in range(1, max_m):
            j = group * step + m
            try:
                print(f'Fetching page {j}')
                url = f'https://mci.cnki.net//statistics/query?q=&start={j}&size=20'
                User_Agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
                cookies = 'cangjieConfig_CHKD2=%7B%22status%22%3Atrue%2C%22startTime%22%3A%222021-12-27%22%2C%22endTime%22%3A%222022-06-24%22%2C%22type%22%3A%22mix%22%2C%22poolSize%22%3A%2210%22%2C%22intervalTime%22%3A10000%2C%22persist%22%3Afalse%7D; SID=012053; Ecp_ClientId=b230620141100392435; Ecp_IpLoginFail=230620219.137.5.106'
                headers_list = {'User-Agent': User_Agent, 'Cookie': cookies}
                # Disable system proxies to avoid proxy-related connection errors
                proxies = {"http": None, "https": None}
                issue_url_resp = requests.get(url=url, proxies=proxies, headers=headers_list)
                # Parse the JSON once instead of re-parsing the response for every field
                records = json.loads(issue_url_resp.text)['data']['data']
                for record in records:
                    df.loc[count, 'Chinese_title'] = record['title']
                    df.loc[count, 'English_title'] = record['entitle']
                    df.loc[count, 'url'] = record['url']
                    count += 1
            except Exception:
                print(f'Failed to fetch page {j}!')
                failure_get.append(j)
        # Save after every batch so an interrupted run keeps what it has collected so far
        write_to_excel(df, Save_path, 'result')
    print(failure_get)


if __name__ == '__main__':
    Save_path = r'C:\Users\Childers\中国知网关键词.xlsx'
    # Page where the previous run stopped getting responses (used to resume)
    start_point = 1
    crawl(50, start_point, Save_path)
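
crawl() only prints the failure_get list at the end. If it were changed to return that list as well, the failed pages could be re-fetched with a small retry helper; the retry_failed function below is a sketch of that idea (not part of the original script) and assumes the same endpoint and JSON layout used in crawl() above.

def retry_failed(pages, Save_path):
    """Hypothetical helper: re-fetch a list of page numbers that failed earlier."""
    df = pd.DataFrame()
    count = 0
    headers_list = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
    proxies = {"http": None, "https": None}
    still_failing = []
    for j in pages:
        try:
            url = f'https://mci.cnki.net//statistics/query?q=&start={j}&size=20'
            resp = requests.get(url=url, proxies=proxies, headers=headers_list)
            for record in json.loads(resp.text)['data']['data']:
                df.loc[count, 'Chinese_title'] = record['title']
                df.loc[count, 'English_title'] = record['entitle']
                df.loc[count, 'url'] = record['url']
                count += 1
        except Exception:
            still_failing.append(j)
    if not df.empty:
        write_to_excel(df, Save_path, 'result')
    return still_failing   # pages that still could not be fetched

For example, retry_failed([101, 102], Save_path) would re-fetch pages 101 and 102 and report whichever of them still fails.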
