Scrapy第三章-图片存库Mysql

wylc123 1年前 ⋅ 1669 阅读

草!这么多图,老子咋看,加个数据库记录,方便后面浏览。老铁,瀑布流了解一下。

1. python版Mysql数据库操作工具类

一个Mysql操作工具脚本解决问题

mysqlConfig.cfg 设置数据库连接参数

# Connection settings read by Config.get_content("notdbMysql").
# db_name is optional: when it is omitted the pool is created with db=None
# (presumably so one connection can address several databases - verify).
[notdbMysql]
host = 127.0.0.1
port = 3306
user = root
password = 123456
db_name = pic

MySqlConn.py 数据库操作工具类

#!/usr/bin/env python
#coding=utf-8
#!文件类型: python
#!创建时间: 2018/12/3 16:46
#!作者: SongBin
#!文件名称: MySqlConn.py

import pymysql, os, configparser
from pymysql.cursors import DictCursor
from DBUtils.PooledDB import PooledDB


class Config(object):
    """Reader for an INI-style configuration file stored next to this module.

    Usage::

        Config().get_content("notdbMysql")

    Example section in the configuration file::

        [notdbMysql]
        host = 192.168.1.101
        port = 3306
        user = root
        password = python123
    """

    def __init__(self, config_filename="mysqlConfig.cfg"):
        """Load *config_filename* from the directory containing this file."""
        module_dir = os.path.dirname(__file__)
        self.cf = configparser.ConfigParser()
        # An explicit encoding ("utf-8" or "utf-8-sig") is required when the
        # configuration file contains non-ASCII text.
        self.cf.read(os.path.join(module_dir, config_filename), encoding="utf-8")

    @staticmethod
    def _coerce(raw):
        """Return *raw* as an int when it consists only of digits, else as-is."""
        if raw.isdigit():
            return int(raw)
        return raw

    def get_sections(self):
        """Return the names of all sections in the loaded file."""
        return self.cf.sections()

    def get_options(self, section):
        """Return the option names defined in *section*."""
        return self.cf.options(section)

    def get_content(self, section):
        """Return *section* as a dict, with all-digit values converted to int."""
        return {
            name: self._coerce(self.cf.get(section, name))
            for name in self.get_options(section)
        }


class BasePymysqlPool(object):
    """Holds the connection settings shared by the pool classes."""

    def __init__(self, host, port, user, password, db_name=None):
        """Store the settings, normalising port to int and password to str.

        db_name may be omitted, in which case ``self.db`` is None.
        """
        port_number = int(port)
        password_text = str(password)
        self.db_host, self.db_port = host, port_number
        self.user, self.password = user, password_text
        self.db = db_name
        # Placeholders; this class itself never opens a connection.
        self.conn = self.cursor = None


class MyPymysqlPool(BasePymysqlPool):
    """MySQL helper that works on one connection borrowed from a PooledDB pool.

    Creating an instance reads the named section of the configuration file,
    borrows a connection from the pool for those settings and opens a cursor
    on it. Call ``dispose()`` when done to commit/roll back and hand the
    connection back to the pool.
    """

    # Pools keyed by connection settings. Keeping them on the class lets
    # every instance with the same settings reuse one pool, while instances
    # built from a different configuration section get their own pool.
    __pools = {}

    def __init__(self, conf_name=None):
        """Load section *conf_name* and borrow a connection plus cursor."""
        self.conf = Config().get_content(conf_name)
        super(MyPymysqlPool, self).__init__(**self.conf)
        self._conn = self.__getConn()
        self._cursor = self._conn.cursor()

    def __getConn(self):
        """Return a connection from the pool matching this instance's settings.

        The pool is created on first use and cached on the class; later
        instances with the same settings reuse it instead of building a new
        pool each time.
        """
        key = (self.db_host, self.db_port, self.user, self.password, self.db)
        pool = MyPymysqlPool.__pools.get(key)
        if pool is None:
            pool = PooledDB(creator=pymysql,
                            mincached=1,
                            maxcached=20,
                            host=self.db_host,
                            port=self.db_port,
                            user=self.user,
                            passwd=self.password,
                            db=self.db,
                            use_unicode=True,
                            charset="utf8",
                            cursorclass=DictCursor)
            MyPymysqlPool.__pools[key] = pool
        return pool.connection()

    def __query(self, sql, param=None):
        """Execute *sql* with optional *param* and return the affected row count."""
        if param is None:
            return self._cursor.execute(sql)
        return self._cursor.execute(sql, param)

    def getAll(self, sql, param=None):
        """Run a query and return every row.

        @param sql: query text; put values in *param*, using %s placeholders
        @param param: optional placeholder values (tuple/list)
        @return: all rows (dicts), or False when the query matched nothing
        """
        count = self.__query(sql, param)
        return self._cursor.fetchall() if count > 0 else False

    def getOne(self, sql, param=None):
        """Run a query and return only the first row.

        @param sql: query text; put values in *param*, using %s placeholders
        @param param: optional placeholder values (tuple/list)
        @return: the first row (dict), or False when the query matched nothing
        """
        count = self.__query(sql, param)
        return self._cursor.fetchone() if count > 0 else False

    def getMany(self, sql, num, param=None):
        """Run a query and return at most *num* rows.

        @param sql: query text; put values in *param*, using %s placeholders
        @param num: maximum number of rows to fetch
        @param param: optional placeholder values (tuple/list)
        @return: up to *num* rows, or False when the query matched nothing
        """
        count = self.__query(sql, param)
        return self._cursor.fetchmany(num) if count > 0 else False

    def insertOneGetId(self, sql, param=None):
        """Insert one row and return its auto-increment id.

        @param sql: INSERT statement using (%s,%s) placeholders
        @param param: values for the placeholders (tuple/list)
        @return: the cursor's lastrowid after the insert
        """
        self.__query(sql, param)
        return self._cursor.lastrowid

    def insertMany(self, sql, values):
        """Insert several rows with one statement template.

        @param sql: INSERT statement using %s placeholders
        @param values: one sequence of values per row, tuple(tuple)/list[list]
        @return: number of affected rows
        """
        return self._cursor.executemany(sql, values)

    def update(self, sql, param=None):
        """Run an UPDATE statement.

        @param sql: statement and conditions, using (%s,%s) placeholders
        @param param: values for the placeholders (tuple/list)
        @return: number of affected rows
        """
        return self.__query(sql, param)

    def insert(self, sql, param=None):
        """Run an INSERT statement.

        @param sql: statement using (%s,%s) placeholders
        @param param: values for the placeholders (tuple/list)
        @return: number of affected rows
        """
        return self.__query(sql, param)

    def delete(self, sql, param=None):
        """Run a DELETE statement.

        @param sql: statement and conditions, using (%s,%s) placeholders
        @param param: values for the placeholders (tuple/list)
        @return: number of affected rows
        """
        return self.__query(sql, param)

    def begin(self):
        """Start a transaction by switching autocommit off."""
        self._conn.autocommit(0)

    def end(self, option='commit'):
        """Finish the transaction: commit when option == 'commit', else roll back."""
        if option == 'commit':
            self._conn.commit()
        else:
            self._conn.rollback()

    def dispose(self, isEnd=1):
        """Finish the transaction and release the cursor and connection.

        @param isEnd: 1 commits pending work, any other value rolls it back
        """
        try:
            self.end('commit' if isEnd == 1 else 'rollback')
        finally:
            # Release both resources even if commit/rollback raised, so a
            # failing transaction cannot leak the borrowed connection.
            try:
                self._cursor.close()
            finally:
                self._conn.close()


if __name__ == '__main__':
    # Manual smoke test; needs a reachable MySQL server with the `mgroup`
    # and `article` tables used below.
    import datetime

    def getNowTime():
        """Return the current local time as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    print(getNowTime())

    mysql = MyPymysqlPool("notdbMysql")
    try:
        # Example: parameterised single-row lookup.
        # sql = "select * from article where title = %s ORDER BY CreateTime DESC"
        # result9 = mysql.getOne(sql, '我们')
        # print(result9['id'])

        # Insert one row and print its auto-increment id.
        sqlInsertOne = "insert into mgroup (groupname,Status,CreateUser,Createtime) values (%s,%s,%s,%s)"
        vals = ('美女图片', 1, 'root', getNowTime())
        result4 = mysql.insertOneGetId(sqlInsertOne, vals)
        print(result4)

        # Run a query and fetch every matching row. The LIKE pattern is
        # passed as a parameter so the driver escapes it, instead of being
        # formatted into the SQL text.
        sqlAll = "SELECT * FROM article WHERE info like %s"
        result = mysql.getAll(sqlAll, ('%我%',))
        print(result)

        # Run a query and fetch only the first row.
        sqlOne = "select * from article"
        result2 = mysql.getOne(sqlOne)
        print(result2)

        # Run a query and fetch `num` rows.
        sqlMany = "select * from article"
        result3 = mysql.getMany(sqlMany, 1)
        print(result3)

        # Example: insert several rows at once (rows may be lists or tuples).
        # sqlinMany = "insert into imglist (aid,imgurl,createtime) values (%s,%s,%s)"
        # vals = ((1, 'http://www.songbin.top/4.jpg', getNowTime()), (2, 'http://www.songbin.top/4.jpg', getNowTime()))
        # result4 = mysql.insertMany(sqlinMany, vals)
        # print(result4)
    except Exception:
        # Roll back and release the connection before propagating the error.
        mysql.dispose(isEnd=0)
        raise
    else:
        # Commit and release the connection.
        mysql.dispose()

2. Scrapy爬虫添加Mysql数据固化模块

为爬虫添加数据库操作管道(Item Pipeline)MySQLPipeline.py

# -*- coding: utf-8 -*-
# @Time    : 2018/12/9 22:47
# @Author  : SongBin
# @Email   : 1370811553@qq.com
# @File    : MySQLPipeline.py
# @Software: PyCharm
import pymysql.cursors
import logging

from pic.utils.DateTimeHelper import DateTimeHelper
from pic.utils.MySqlConn import MyPymysqlPool


class MySQLPipeline(object):
    """Item pipeline that records each scraped item in the `pic` table."""

    def __init__(self):
        self.dtHelper = DateTimeHelper()

    def process_item(self, item, spider):
        """Store *item* in MySQL if it is new and attach its row id.

        Looks up `pic` by item['name']; when no row exists a new one is
        inserted. Either way item['id'] is set to the row's id.

        @param item: scraped item providing 'name' and 'addr'
        @param spider: the spider that produced the item (unused)
        @return: the same item, which Scrapy requires so that later
                 pipelines receive it
        """
        mysql = MyPymysqlPool("notdbMysql")
        try:
            sql = "select * from pic where name = %s ORDER BY CreateTime DESC"
            flag = mysql.getOne(sql, item['name'])
            if not flag:
                # getOne returns False when no row matched: insert a new one.
                insql = "insert into pic(name,addr,Status,CreateTime) values (%s, %s, %s, %s)"
                vals = (item['name'], item['addr'], 1, self.dtHelper.getNowTime())
                newID = mysql.insertOneGetId(insql, vals)
            else:
                newID = flag['id']
            item['id'] = newID
        except Exception:
            # Roll back and release the connection before propagating.
            mysql.dispose(isEnd=0)
            raise
        mysql.dispose()
        return item  # must be returned so later pipelines receive the item

在 settings.py 里面设置

# Item pipelines enabled for the project. Scrapy runs them in ascending
# order of the integer value, so PicPipeline (100) handles each item
# before MySQLPipeline (101).
ITEM_PIPELINES = {
    'pic.pipelines.PicPipeline': 100,
    'pic.MySQLPipeline.MySQLPipeline': 101,
}

爬虫爬取到 item 后,会把它传给 MySQL 数据库操作管道 MySQLPipeline.py,由它把数据持久化到数据库。

这样一来,其他展示类的程序就可以很方便地读取这些数据了。比如,我弄了个瀑布流图片展示:

3. 问题

如果报错 ModuleNotFoundError: No module named 'DBUtils',通常是因为安装的是 DBUtils 2.0 及以上版本,模块名已改为小写,导入语句需要改成 from dbutils.pooled_db import PooledDB。也可以参考文章

https://www.daxueyiwu.com/post/72

相关推荐

不要着急,慢慢更新

scrapy爬虫启示录-小伙子老夫看你血气方刚这本《爬虫秘录》就传给你了

Scrapy初章-Scrapy理论简介

Scrapy次章-啥也不干就是爬图

Scrapy第三章-图片存库Mysql

Scrapy第四章-设置代理IP偷偷爬图

Scrapy第五章-多线程加速爬图

Scrapy终章-1024福利

Scrapy最最最终章-搂一把sakimichan

更多内容请访问:IT源点

相关文章推荐

全部评论: 0

    我有话说: