Storing Scrapy Data in a Database
Three key steps for storing the data
- the spider fetches the data
- an item defines the data structure
- a pipeline processes and saves the data
Creating the spider and fetching the data
Goal: scrape the movie titles from Douban Top 250 (first page only)
from bs4 import BeautifulSoup
import scrapy

from douban.items import DoubanItem


class Douban250(scrapy.Spider):
    name = "douban250"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/top250"]

    def parse(self, response):
        # Parse the downloaded page with BeautifulSoup
        soup = BeautifulSoup(response.text, 'lxml')
        # Each movie entry sits inside a <div class="hd">
        hd_tags = soup.find_all('div', class_='hd')
        titles = []
        for tag in hd_tags:
            # The first <span class="title"> holds the Chinese title
            titles.append(tag.find_all('span', class_='title')[0].text)
        douban_item = DoubanItem()
        douban_item['title'] = titles
        yield douban_item
Parsing is done with bs4 here, as an example.
The scraped titles are collected into a single list, which is passed to douban_item as one field.
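For comparison, here is a minimal sketch of the same extraction using Scrapy's built-in CSS selectors instead of bs4 (only the parse method changes):

    def parse(self, response):
        douban_item = DoubanItem()
        # Take the first <span class="title"> inside each <div class="hd">,
        # i.e. the Chinese title, mirroring the bs4 version above
        douban_item['title'] = [
            hd.css('span.title::text').get()
            for hd in response.css('div.hd')
        ]
        yield douban_item

Either way, the spider is run from the project root with scrapy crawl douban250.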
Pipelines
The item receives the data
# items.py
import scrapy


class DoubanItem(scrapy.Item):
    # One field holding the list of scraped movie titles
    title = scrapy.Field()
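A scrapy.Item behaves like a dict restricted to its declared fields; a quick sanity check (a standalone sketch, not part of the project files):

item = DoubanItem()
item['title'] = ['movie 1', 'movie 2']  # dict-style assignment to a declared field
print(item['title'])
item['year'] = 2024  # raises KeyError: 'year' was never declared as a Field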
The pipeline stores the data
# pipelines.py
import pymysql
from itemadapter import ItemAdapter


class DoubanPipeline:

    def __init__(self):
        # Open the MySQL connection when the pipeline is created
        self.conn = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='666',
            database='douban',
            charset='utf8'
        )
        # Create a cursor for executing SQL statements
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        # Commit the transaction and close the connection when the spider finishes
        self.conn.commit()
        self.conn.close()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        # Get the 'title' list from the item; default to an empty list
        titles = adapter.get('title', [])
        for t in titles:
            # Insert each title; the parameterized query avoids SQL injection
            self.cursor.execute(
                'insert into douban250 (title) values (%s)',
                (t,)
            )
        return item
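Two things must be in place before this runs: the douban250 table (with a title column) must already exist in the douban database, and the pipeline must be registered in settings.py so Scrapy actually routes items through it. The module path below assumes the project is named douban, as the spider's import suggests:

# settings.py
# The integer (0-1000) sets the order when several pipelines are enabled
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}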