1 简介
BeautifulSoup 是一个用于解析HTML和XML文档的Python库。它提供了一种灵活和便捷的方式来导航、搜索和修改解析树。BeautifulSoup简化了网络爬虫的工作,使得开发者可以轻松地解析网页内容,提取所需的数据。
2 初体验
使用BeautifulSoup的第一步是安装它,可以通过pip进行安装:
pip install beautifulsoup4
然后,你可以导入BeautifulSoup类并创建一个BeautifulSoup对象来解析HTML文档:
from bs4 import BeautifulSoup

# Sample HTML document used as parser input.
html_doc = """
<html><head><title>测试页面</title></head>
<body>
<p class="title">标题</p>
<p class="story">这是一个故事。</p>
</body>
</html>
"""

# Build the parse tree with Python's built-in html.parser backend
# (no extra dependency such as lxml required).
soup = BeautifulSoup(html_doc, 'html.parser')
3 节点选择器
BeautifulSoup提供了多种选择HTML节点的方法。例如,可以使用find()或find_all()方法根据标签名、属性或文本内容来查找节点:
# Grab the first <p> element in the document.
p_tag = soup.find('p')

# Collect every <p> element as a result set.
p_tags = soup.find_all('p')

# Restrict the search to <p> elements with class="title".
# (The trailing underscore in class_ avoids the Python keyword.)
title_p = soup.find('p', class_='title')
4 tag对象嵌套选择
你可以使用点号(.)来访问tag对象的子节点:
# .string yields the text node directly inside the tag.
text = title_p.string

# .children is an iterator over the tag's direct child nodes.
children = title_p.children
5 关联选择
关联选择是指通过父节点、子节点或兄弟节点之间的关系来选择元素。BeautifulSoup提供了.parent、.children、.next_sibling和.previous_sibling等属性来访问这些关系:
# Walk upward in the tree: .parent is the enclosing tag.
parent_tag = title_p.parent

# Walk downward: iterate over each direct child node.
for child in title_p.children:
    print(child)
6 方法选择器
BeautifulSoup还提供了多种方法来过滤和选择节点,如基于文本内容、正则表达式或lambda函数的选择:
import re  # required for the regular-expression filter below (missing in original)

# Find the first <p> whose text is exactly '标题'.
# NOTE: the text= keyword was renamed to string= in BeautifulSoup 4.4.0;
# text= still works but raises a DeprecationWarning on current versions.
p_with_text = soup.find('p', string='标题')

# Find every <p> whose text matches the regex pattern '故事'.
p_with_pattern = soup.find_all('p', string=re.compile('故事'))
7 CSS选择器
类似于CSS选择器,BeautifulSoup也支持通过CSS选择器语法来选取节点:
# select_one() returns the first element matching a CSS selector.
title_p_css = soup.select_one('p.title')

# select() returns a list of every matching element.
p_tags_css = soup.select('p')
8 案例
案例 1:提取网页标题
from bs4 import BeautifulSoup
import requests

url = 'http://example.com'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# <title> lives in <head>; it may be absent, so guard before printing.
title_tag = soup.find('title')
if title_tag:
    print("网页标题:", title_tag.string)
案例 2:提取网页链接
from bs4 import BeautifulSoup
import requests

url = 'http://example.com'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Every <a> element is a hyperlink candidate.
links = soup.find_all('a')
for link in links:
    # .get() returns None instead of raising when href is absent.
    href = link.get('href')
    print("链接:", href)
案例 3:提取表格数据
from bs4 import BeautifulSoup

# Inline table used as test input.
html_doc = """
<table>
<tr>
<th>姓名</th>
<th>年龄</th>
</tr>
<tr>
<td>张三</td>
<td>25</td>
</tr>
<tr>
<td>李四</td>
<td>30</td>
</tr>
</table>
"""

soup = BeautifulSoup(html_doc, 'html.parser')
table = soup.find('table')

# Each <tr> is a row; its cells may be headers (<th>) or data (<td>).
for row in table.find_all('tr'):
    for cell in row.find_all(['th', 'td']):
        print(cell.get_text(), end='\t')  # tab-separated cell values
    print()  # newline at the end of each row
以下是一些使用BeautifulSoup的额外代码案例,这些案例涵盖了从网页中提取信息、修改HTML内容以及使用CSS选择器等不同方面的应用。
案例 4:使用CSS选择器提取数据
from bs4 import BeautifulSoup
import requests

url = 'https://example.com/somepage'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# CSS attribute selector: only <img> tags that actually carry a src.
img_links = soup.select('img[src]')
for img in img_links:
    print(img['src'])

# CSS class selector: every element bearing class="class-name".
elements_with_class = soup.select('.class-name')
for element in elements_with_class:
    print(element.get_text())
案例 5:提取特定表格中的数据
from bs4 import BeautifulSoup

html_doc = """
<table id="data-table">
<tr>
<th>Name</th>
<th>Age</th>
</tr>
<tr>
<td>Alice</td>
<td>28</td>
</tr>
<tr>
<td>Bob</td>
<td>35</td>
</tr>
</table>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

# Locate the table by its id attribute, then walk its rows.
table = soup.find('table', {'id': 'data-table'})
rows = table.find_all('tr')

for row in rows:
    # Collect header and data cells alike, stripped of surrounding whitespace.
    cols = [cell.text.strip() for cell in row.find_all(['th', 'td'])]
    print(cols)
案例 6:修改HTML内容
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>Test Page</title></head>
<body>
<p class="title">Old Title</p>
<p class="story">Old Story</p>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

# Assigning to .string replaces a tag's text content in place.
soup.title.string = 'New Title'
soup.find('p', class_='title').string = 'New Title Text'

# new_tag() creates a detached element; append() attaches it inside <body>.
new_p = soup.new_tag('p')
new_p.string = 'This is a new paragraph.'
soup.body.append(new_p)

# prettify() serializes the modified tree with indentation.
print(soup.prettify())
案例 7:处理嵌套的HTML结构
from bs4 import BeautifulSoup

html_doc = """
<div class="container">
<div class="item">
<h2>Item 1</h2>
<p>Description for item 1</p>
</div>
<div class="item">
<h2>Item 2</h2>
<p>Description for item 2</p>
</div>
</div>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

# Process each "item" card: pull out its heading and its description.
for item in soup.find_all('div', class_='item'):
    header = item.find('h2').text
    description = item.find('p').text
    print(f"Header: {header}")
    print(f"Description: {description}")
    print("-" * 20)
9 爬取某笔趣阁小说
#!/usr/bin/env python
"""Scrape a novel from biqukan8.cc chapter by chapter and save it as text files.

Fixes over the original:
- removed the unused `from urllib import response` import (the name was
  immediately shadowed by the `response` variable below);
- os.mkdir -> os.makedirs so missing parent directories do not crash the run;
- idiomatic truthiness / `is not None` comparisons;
- removed a large commented-out tunnel-proxy (kdltps.com) example (dead code).
"""
from bs4 import BeautifulSoup
import requests
import os
import logging
from fake_useragent import UserAgent

# Directory the downloaded chapters are written into.
local_save_path = 'C:/Users/EA/Desktop/10-爬虫篇-第十次直播/Code/novel/'

# Randomized User-Agent so requests look less like a bot.
ua = UserAgent()
headers = {
    "User-Agent": ua.random
}

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s -%(levelname)s: %(message)s')

url = 'https://www.biqukan8.cc/0_790/'
url_list = []    # chapter page URLs
name_list = []   # chapter titles, parallel to url_list
flag_name = ''


def novel_content(url, name):
    """Download one chapter page and hand its text to file_write().

    url  -- absolute URL of the chapter page
    name -- chapter title, used as the output file name
    """
    txt_response = requests.get(url=url, headers=headers)
    txts_soup = BeautifulSoup(str(txt_response.text), "lxml")
    # The chapter body lives in <div id="content" class="showtxt">.
    txts = txts_soup.find_all(id='content', class_='showtxt')
    # Re-parse the single matched div to obtain its flattened text.
    text_soup = BeautifulSoup(str(txts), 'lxml')
    text = text_soup.div.text
    file_write(name, text)


def file_write(name, text):
    """Append chapter text to <local_save_path>/<novel_name>/<name>.txt."""
    directory_path = local_save_path + novel_name
    if os.path.exists(directory_path):
        print(f"目录'{directory_path}'存在!")
    else:
        # makedirs also creates missing parent directories
        # (os.mkdir would raise FileNotFoundError in that case).
        os.makedirs(directory_path)
        print(f"目录'{directory_path}'已经创建!")
    write_flag = True
    name_path = os.path.join(directory_path, f"{name}.txt")
    with open(name_path, "a+", encoding='utf-8') as file:
        for each in text:
            # Crude filter inherited from the original: the scraped text ends
            # with an "http..." ad link, so stop copying from the first 'h'
            # onward. NOTE(review): this also drops any legitimate 'h'
            # characters later in the chapter -- confirm before reuse.
            if each == 'h':
                write_flag = False
            if write_flag and each != '':
                file.write(each)
        file.write('\n\n')


# --- main script -----------------------------------------------------------

response = requests.get(url, headers=headers)
response.encoding = 'gbk'  # the site serves GBK-encoded pages
soup = BeautifulSoup(response.text, "lxml")

# The chapter list lives inside <div class="listmain">.
chapters = soup.find_all('div', class_='listmain')
download_soup = BeautifulSoup(str(chapters), "lxml")

# Novel title, sliced out of the <dt> header text.
novel_name = str(download_soup.dl.dt).split("》")[0][5:]

# Entries before this marker are promo/preview chapters; skip until seen.
flag_name = "《" + novel_name + "》" + "正文卷"

begin_flag = False
for child in download_soup.dl.children:
    if child != '\n':
        if child.string == u"%s" % flag_name:
            begin_flag = True
        if begin_flag and child.a is not None:
            download_url = "https://www.biqukan8.cc/" + child.a.get("href")
            download_name = child.a.string
            url_list.append(download_url)
            name_list.append(download_name)

# Download every collected chapter in order.
for chapter_url, chapter_name in zip(url_list, name_list):
    novel_content(chapter_url, chapter_name)