#coding=utf-8
#!/usr/bin/python
# 导入requests库
import requests
# 导入文件操作库
import os
import re
import bs4
from bs4 import BeautifulSoup
import sys
from util.mysql_DBUtils import mysql
# 写入数据库
def write_db(param):
try:
sql = "insert into house (url,housing_estate,position,square_metre,unit_price,total_price,follow,take_look,pub_date) "
sql = sql + "VALUES(%(url)s,%(housing_estate)s, %(position)s,%(square_metre)s,"
sql = sql + "%(unit_price)s,%(total_price)s,%(follow)s,%(take_look)s,%(pub_date)s)"
mysql.insert(sql, param)
except Exception as e:
print(e)
# 主方法
def main():
# 给请求指定一个请求头来模拟chrome浏览器
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
page_max = 100
# 爬图地址
for i in range(1, int(page_max) + 1):
print("第几页:" + str(i))
if i == 1:
house = 'https://qd.lianjia.com/ershoufang/shibei/'
else:
house = 'https://qd.lianjia.com/ershoufang/shibei/pg'+str(i)
res = requests.get(house, headers=headers)
soup = BeautifulSoup(res.text, 'html.parser')
li_max = soup.find('ul', class_='sellListContent').find_all('li')
for li in li_max:
try:
house_param = {}
# 格式 荣馨苑 | 3室2厅 | 115.91平米 | 南 北 | 毛坯 | 无电梯
content = li.find('div', class_='houseInfo').text
content = content.split("|")
house_param['housing_estate'] = content[0]
house_param['square_metre'] = re.findall(r'-?\d+\.?\d*e?-?\d*?', content[2])[0]
# --------------------------------------------------------#
# 位置 水清沟
position = li.find('div', class_='positionInfo').find('a').text
house_param['position'] = position
# --------------------------------------------------------#
totalprice = li.find('div', class_='totalPrice').text
house_param['total_price'] = re.sub("\D", "", totalprice)
unitprice = li.find('div', class_='unitPrice').text
house_param['unit_price'] = re.sub("\D", "", unitprice)
# --------------------------------------------------------#
# 57人关注 / 共13次带看 / 6个月以前发布
follow = li.find('div', class_='followInfo').text
follow = follow.split("/")
house_param['follow'] = re.sub("\D", "", follow[0])
house_param['take_look'] = re.sub("\D", "", follow[1])
# --------------------------------------------------------#
# 二手房地址
title_src = li.find('div', class_='title').find('a').attrs['href']
house_param['url'] = re.sub("\D", "", title_src)
res = requests.get(title_src, headers=headers)
soup = BeautifulSoup(res.text, 'html.parser')
# --------------------------------------------------------#
# 挂牌时间(重要数据)
pub_date = soup.find('div', class_='transaction').find_all('li')[0].find_all('span')[1].text
house_param['pub_date'] = pub_date
write_db(house_param)
except Exception as e:
print(e)
mysql.end("commit")
mysql.dispose()
if __name__ == '__main__':
main()
#!/usr/bin/python3
# -*- coding:utf-8 -*-
import pymysql, os, configparser
from pymysql.cursors import DictCursor
from DBUtils.PooledDB import PooledDB
class Config(object):
"""
# Config().get_content("user_information")
配置文件里面的参数
[dbMysql]
host = 192.168.1.80
port = 3306
user = root
password = 123456
"""
def __init__(self, config_filename="dbMysqlConfig.cnf"):
file_path = os.path.join(os.path.dirname(__file__), config_filename)
self.cf = configparser.ConfigParser()
self.cf.read(file_path)
def get_sections(self):
return self.cf.sections()
def get_options(self, section):
return self.cf.options(section)
def get_content(self, section):
result = {}
for option in self.get_options(section):
value = self.cf.get(section, option)
result[option] = int(value) if value.isdigit() else value
return result
class BasePymysqlPool(object):
def __init__(self, host, port, user, password, db_name):
self.db_host = host
self.db_port = int(port)
self.user = user
self.password = str(password)
self.db = db_name
self.conn = None
self.cursor = None
class MyPymysqlPool(BasePymysqlPool):
"""
MYSQL数据库对象,负责产生数据库连接 , 此类中的连接采用连接池实现
获取连接对象:conn = Mysql.getConn()
释放连接对象;conn.close()或del conn
"""
# 连接池对象
__pool = None
def __init__(self, conf_name=None):
self.conf = Config().get_content(conf_name)
super(MyPymysqlPool, self).__init__(**self.conf)
# 数据库构造函数,从连接池中取出连接,并生成操作游标
self._conn = self.__getConn()
self._cursor = self._conn.cursor()
def __getConn(self):
"""
@summary: 静态方法,从连接池中取出连接
@return MySQLdb.connection
"""
if MyPymysqlPool.__pool is None:
__pool = PooledDB(creator=pymysql,
mincached=1,
maxcached=20,
host=self.db_host,
port=self.db_port,
user=self.user,
passwd=self.password,
db=self.db,
use_unicode=True,
charset="utf8",
cursorclass=DictCursor)
return __pool.connection()
def getAll(self, sql, param=None):
"""
@summary: 执行查询,并取出所有结果集
@param sql:查询SQL,如果有查询条件,请只指定条件列表,并将条件值使用参数[param]传递进来
@param param: 可选参数,条件列表值(元组/列表)
@return: result list(字典对象)/boolean 查询到的结果集
"""
if param is None:
count = self._cursor.execute(sql)
else:
count = self._cursor.execute(sql, param)
if count > 0:
result = self._cursor.fetchall()
else:
result = False
return result
def getOne(self, sql, param=None):
"""
@summary: 执行查询,并取出第一条
@param sql:查询SQL,如果有查询条件,请只指定条件列表,并将条件值使用参数[param]传递进来
@param param: 可选参数,条件列表值(元组/列表)
@return: result list/boolean 查询到的结果集
"""
if param is None:
count = self._cursor.execute(sql)
else:
count = self._cursor.execute(sql, param)
if count > 0:
result = self._cursor.fetchone()
else:
result = False
return result
def getMany(self, sql, num, param=None):
"""
@summary: 执行查询,并取出num条结果
@param sql:查询SQL,如果有查询条件,请只指定条件列表,并将条件值使用参数[param]传递进来
@param num:取得的结果条数
@param param: 可选参数,条件列表值(元组/列表)
@return: result list/boolean 查询到的结果集
"""
if param is None:
count = self._cursor.execute(sql)
else:
count = self._cursor.execute(sql, param)
if count > 0:
result = self._cursor.fetchmany(num)
else:
result = False
return result
def insertMany(self, sql, values):
"""
@summary: 向数据表插入多条记录
@param sql:要插入的SQL格式
@param values:要插入的记录数据tuple(tuple)/list[list]
@return: count 受影响的行数
"""
count = self._cursor.executemany(sql, values)
return count
def __query(self, sql, param=None):
if param is None:
count = self._cursor.execute(sql)
else:
count = self._cursor.execute(sql, param)
return count
def update(self, sql, param=None):
"""
@summary: 更新数据表记录
@param sql: SQL格式及条件,使用(%s,%s)
@param param: 要更新的 值 tuple/list
@return: count 受影响的行数
"""
return self.__query(sql, param)
def insert(self, sql, param=None):
"""
@summary: 更新数据表记录
@param sql: SQL格式及条件,使用(%s,%s)
@param param: 要更新的 值 tuple/list
@return: count 受影响的行数
"""
return self.__query(sql, param)
def delete(self, sql, param=None):
"""
@summary: 删除数据表记录
@param sql: SQL格式及条件,使用(%s,%s)
@param param: 要删除的条件 值 tuple/list
@return: count 受影响的行数
"""
return self.__query(sql, param)
def begin(self):
"""
@summary: 开启事务
"""
self._conn.autocommit(0)
def end(self, option='commit'):
"""
@summary: 结束事务
"""
if option == 'commit':
self._conn.commit()
else:
self._conn.rollback()
def dispose(self, isEnd=1):
"""
@summary: 释放连接池资源
"""
if isEnd == 1:
self.end('commit')
else:
self.end('rollback')
self._cursor.close()
self._conn.close()
mysql = MyPymysqlPool("dbMysql")
if __name__ == '__main__':
sqlAll = "select id, title from novel limit 2;"
result = mysql.getAll(sqlAll)
print(result)
# 释放资源
mysql.dispose()
[dbMysql]
host = localhost
port = 3306
user = root
password = 123456
db_name = house