Since the site admin is playing it safe, you will have to find a suitable target site on your own. Pick the right one and this code should work as-is; who knows, give it a try, but crawl gently and don't get greedy.
This post walks through the relevant Scrapy crawling techniques from start to finish.
1. Create the project
scrapy startproject bt1024
2. Create the spider
cd bt1024
scrapy genspider bt www.xxxxx.com
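After these two commands, the project tree typically looks like the standard Scrapy layout sketched below; the utils package and MySQLPipeline.py referenced later in this post are added by hand, so their placement here is an assumption.
bt1024/
    scrapy.cfg
    bt1024/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        MySQLPipeline.py      # added manually (step 7)
        utils/                # added manually: Logger.py, MySqlConn.py, DateTimeHelper.py
        spiders/
            __init__.py
            bt.py             # the generated spider (its name is later changed to 'bt1024')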
3. Analyze the page structure
The structure is fairly simple and follows an article-reading pattern: finish one page, then move on to the next. Each page contains an article title and a list of images, and each image has a title, a file name, and a URL.
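A quick way to check the selectors before writing the spider is the Scrapy shell. The element IDs used here (subject_tpc for the article title, read_tpc for the image block) are the ones the spider below relies on, and the URL is a placeholder, so this only applies if the target page actually uses that markup:
scrapy shell "http://www.xxxxx.com/pw/some-article.html"

# inside the shell:
response.css("#subject_tpc::text").extract_first()        # article title
response.xpath('//*[@id="read_tpc"]/img/@src').extract()  # image URLs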
4. Define the item (data storage) templates
Based on the page analysis, two item templates are needed: one for articles and one for images.
items.py
import scrapy


class ArticleItem(scrapy.Item):
    # define the fields for your item here like:
    id = scrapy.Field()
    gid = scrapy.Field()
    title = scrapy.Field()
    filesize = scrapy.Field()
    info = scrapy.Field()
    magnet = scrapy.Field()
    bturl = scrapy.Field()
    flag = scrapy.Field()


class ImgListItem(scrapy.Item):
    aid = scrapy.Field()
    title = scrapy.Field()
    imgName = scrapy.Field()
    imgUrl = scrapy.Field()
    imgPath = scrapy.Field()
    flag = scrapy.Field()
    imgDisabled = scrapy.Field()
5. Write the spider
# -*- coding: utf-8 -*-
import os
import time
import traceback

import scrapy

from bt1024.items import ImgListItem, ArticleItem
from bt1024.settings import log_file, log_level
from bt1024.utils.Logger import Logger

log = Logger(log_file, level=log_level)


class BtSpider(scrapy.Spider):
    # spider name, must be unique
    name = 'bt1024'
    # domains the spider is allowed to crawl
    allowed_domains = ['xxx.xxx.xxxx', 'zzz.zzz.zzz']

    # read the start URL from url.txt so a new run resumes where the last one stopped
    file = open('url.txt', mode="r", encoding="utf8")
    starturl = file.readline().strip()
    file.close()
    # initial URL
    start_urls = [starturl]

    def parse(self, response):
        # base URL
        baseurl = 'http://xxx.xxx.xxxx/pw/'
        try:
            allPics = response.xpath('//*[@id="read_tpc"]/img')
            title = response.css("#subject_tpc::text").extract_first()

            item2 = ArticleItem()
            item2['title'] = title
            item2['flag'] = 'article'
            item2['filesize'] = 20
            item2['info'] = 'info'
            item2['magnet'] = 'magnet'
            item2['bturl'] = 'bturl'
            # the MySQL pipeline fills item2['id'] while processing this item
            yield item2

            for pic in allPics:
                # handle each image: extract its name and URL
                item = ImgListItem()
                imgurl = pic.xpath('@src').extract()[0]
                log.logger.info("image URL: " + imgurl)
                name = os.path.split(imgurl)[-1]
                item['flag'] = 'img'
                item['aid'] = item2['id']
                item['title'] = title
                item['imgName'] = name
                item['imgUrl'] = imgurl
                # hand the scraped data to the pipelines
                yield item
                # the download pipeline sets imgDisabled when the image link is dead
                if item['imgDisabled']:
                    break

            # next page (earlier attempt, kept for reference)
            # nextpage = response.xpath('//*[@id="main"]/div[3]/span[3]/span/a')
            # url = nextpage.xpath('@href').extract()[0]
            # yield self.make_requests_from_url(baseurl + url)
            # //*[@id="main"]/div[3]/span[3]/span/a  (next page)
            # //*[@id="main"]/div[3]/span[3]/a       (previous page)
            next_page = response.xpath('//*[@id="main"]/div[3]/span[3]/a').xpath('@href').extract_first()
            time.sleep(1)  # crawl delay, adjust as needed
            if next_page is not None:
                log.logger.info("previous page: " + next_page)
                next_page = response.urljoin(next_page)
                log.logger.info("previous page (absolute): " + next_page)
                # remember where we are so the next run can pick up from here
                file = open('url.txt', mode="w", encoding="utf8")
                file.write(str(next_page))
                file.close()
                yield scrapy.Request(next_page, callback=self.parse)
        except Exception:
            log.logger.error(traceback.format_exc())
            print("Exception: reached the end!")
            log.logger.error("Exception: reached the end")
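Before running, make sure url.txt exists in the project root and contains the first article URL on a single line, since the spider reads its start URL from that file. Then start the crawl from the project root:
scrapy crawl bt1024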
6. Configure the settings file
# -*- coding: utf-8 -*-
# Scrapy settings for bt1024 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import datetime
import time
import random
BOT_NAME = 'bt1024'
SPIDER_MODULES = ['bt1024.spiders']
NEWSPIDER_MODULE = 'bt1024.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# user agent list
USER_AGENT_LIST = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
]
# pick a user agent at random (evaluated once at startup, so all requests in a run share it)
USER_AGENT = random.choice(USER_AGENT_LIST)
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'bt1024.middlewares.Bt1024SpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 100,
    'bt1024.middlewares.ProxyMiddleWare': 110,
    # 'bt1024.middlewares.ProxyMiddleware2': 110,
    'bt1024.middlewares.Bt1024DownloaderMiddleware': 534,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'bt1024.pipelines.Bt1024Pipeline': 300,
    'bt1024.MySQLPipeline.MySQLPipeline': 301,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
########################### my settings ##############################
# where downloaded photo-set images are stored
WmImgPath = r'H:\down_pic'
############## logging ##############
to_day = datetime.datetime.now()
LOG_FILE = 'H:/pylogs/scrapy_{}_{}_{}.log'.format(to_day.year, to_day.month, to_day.day)
LOG_LEVEL = 'DEBUG'
LOG_ENABLED = True        # enable logging
LOG_ENCODING = 'utf-8'    # encoding used for log output
log_file = 'H:/pylogs/mylog_{}_{}_{}.log'.format(to_day.year, to_day.month, to_day.day)
log_level = 'debug'
HTTPERROR_ALLOWED_CODES = [403, 404]
############## logging ##############
########################### my settings ##############################
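Because USER_AGENT = random.choice(USER_AGENT_LIST) is evaluated only once when the settings module is loaded, every request in a run ends up with the same header. If you want a different user agent per request, a small downloader middleware is one way to do it. The sketch below is not part of the original project; the RandomUserAgentMiddleware name and its priority are hypothetical.
# bt1024/middlewares.py (additional, hypothetical middleware)
import random

from bt1024.settings import USER_AGENT_LIST


class RandomUserAgentMiddleware(object):
    """Assign a randomly chosen User-Agent to every outgoing request."""

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(USER_AGENT_LIST)


# register it in settings.py, e.g.:
# DOWNLOADER_MIDDLEWARES = {
#     'bt1024.middlewares.RandomUserAgentMiddleware': 400,
#     ...
# }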
7. Write the data-persistence script (MySQL pipeline)
# -*- coding: utf-8 -*-
# @Time    : 2018/12/9 22:47
# @Author  : SongBin
# @Email   : 1370811553@qq.com
# @File    : MySQLPipeline.py
# @Software: PyCharm
import pymysql.cursors
import logging

from bt1024.utils.DateTimeHelper import DateTimeHelper
from bt1024.utils.MySqlConn import MyPymysqlPool


class MySQLPipeline(object):
    def __init__(self):
        self.dtHelper = DateTimeHelper()

    def process_item(self, item, spider):
        mysql = MyPymysqlPool("notdbMysql")
        if item['flag'] == 'article':
            # insert the article only if one with the same title does not exist yet
            sql = "select * from article where title = %s ORDER BY CreateTime DESC"
            flag = mysql.getOne(sql, item['title'])
            if not flag:
                insql = ("insert into article(gid,title,filesize,info,magnet,bturl,Status,CreateUser,CreateTime) "
                         "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
                vals = (1, item['title'], item['filesize'], item['info'], item['magnet'],
                        item['bturl'], 1, 'root', self.dtHelper.getNowTime())
                newID = mysql.insertOneGetId(insql, vals)
            else:
                newID = flag['id']
            item['id'] = newID
        else:
            inmsql = ("insert into imglist(aid,imgname,imgurl,imgpath,Status,CreateUser,CreateTime) "
                      "values (%s, %s, %s, %s, %s, %s, %s)")
            imginflag = mysql.insert(inmsql, (item['aid'], item['imgName'], item['imgUrl'],
                                              item['imgPath'], 1, 'root', self.dtHelper.getNowTime()))
        mysql.dispose()
        return item  # pipelines must return the item
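The pipeline relies on two helper modules, DateTimeHelper and MySqlConn, which this post does not show. Below is a minimal sketch of what MyPymysqlPool might look like on top of pymysql; the method names mirror those used above (getOne, insert, insertOneGetId, dispose), but the connection parameters and the overall shape are assumptions, not the author's actual utility class.
# bt1024/utils/MySqlConn.py (hypothetical sketch)
import pymysql


class MyPymysqlPool(object):
    def __init__(self, config_name):
        # the real project presumably reads host/user/password from a config
        # section named `config_name`; the hard-coded values here are placeholders
        self.conn = pymysql.connect(host='localhost', user='root', password='secret',
                                    database='bt1024', charset='utf8mb4',
                                    cursorclass=pymysql.cursors.DictCursor)

    def getOne(self, sql, params):
        # return the first matching row as a dict, or False if nothing matched
        with self.conn.cursor() as cursor:
            cursor.execute(sql, params)
            row = cursor.fetchone()
        return row if row else False

    def insert(self, sql, params):
        # run an INSERT and return the number of affected rows
        with self.conn.cursor() as cursor:
            affected = cursor.execute(sql, params)
        self.conn.commit()
        return affected

    def insertOneGetId(self, sql, params):
        # run an INSERT and return the auto-increment id of the new row
        with self.conn.cursor() as cursor:
            cursor.execute(sql, params)
            new_id = cursor.lastrowid
        self.conn.commit()
        return new_id

    def dispose(self):
        # release the connection
        self.conn.close()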
8. pipelines.py (image download pipeline)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import socket
import traceback
import urllib.request
import os

from bt1024.settings import WmImgPath
from bt1024.settings import log_file, log_level
from bt1024.utils.Logger import Logger

log = Logger(log_file, level=log_level)
socket.setdefaulttimeout(5)  # socket-level timeout of 5 seconds


class Bt1024Pipeline(object):
    def process_item(self, item, spider):
        if item['flag'] == 'img':
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
            item['imgDisabled'] = False
            try:
                url = item['imgUrl']
                req = urllib.request.Request(url, headers=headers)
                res = urllib.request.urlopen(req, None, 5)
                content = res.read()
            except UnicodeDecodeError as e:
                print('-----UnicodeDecodeError url:', url)
                # mark the item when the image link is dead
                # print(traceback.print_exc())
                item['imgDisabled'] = True
                return item
            except urllib.error.URLError as e:
                print("-----URLError url:", url)
                # mark the item when the image link is dead
                # print(traceback.print_exc())
                item['imgDisabled'] = True
                return item
            except socket.timeout as e:
                print("-----socket timeout:", url)
                # mark the item when the image link is dead
                # print(traceback.print_exc())
                item['imgDisabled'] = True
                return item
            # save the image under <WmImgPath>/<article title>/<image name>
            basepath = os.path.join(WmImgPath, item['title'])
            if not os.path.exists(basepath):
                os.makedirs(basepath)
            file_name = os.path.join(basepath, item['imgName'])
            item['imgPath'] = os.path.join(item['title'], item['imgName'])
            with open(file_name, 'wb') as fp:
                fp.write(content)
            return item
        else:
            return item
9. Set up proxy IPs
middlewares.py
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
import time

from scrapy import signals


class Bt1024SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
class Bt1024DownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        '''set the Referer header on every outgoing request'''
        referer = request.url
        if referer:
            request.headers['Referer'] = referer

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# proxy IP pool
class ProxyMiddleWare(object):
    """docstring for ProxyMiddleWare"""

    def process_request(self, request, spider):
        '''attach a proxy to the request object'''
        proxy = self.get_random_proxy()
        print("this is request ip:" + proxy)
        request.meta['proxy'] = proxy

    def process_response(self, request, response, spider):
        '''post-process the returned response'''
        # if the response status is not 200, retry the current request with a new proxy
        if response.status != 200:
            proxy = self.get_random_proxy()
            print("this is response ip:" + proxy)
            # attach the new proxy to the current request
            request.meta['proxy'] = proxy
            return request
        return response

    def get_random_proxy(self):
        '''read a random proxy from the proxies file'''
        while 1:
            with open('proxies.txt', 'r') as f:
                proxies = f.readlines()
            if proxies:
                break
            else:
                time.sleep(1)
        proxy = random.choice(proxies).strip()
        return proxy
class ProxyMiddleware2(object):
    '''
    set a fixed proxy
    '''

    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://218.22.7.62:53281'
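ProxyMiddleWare expects a proxies.txt file in the working directory with one proxy URL per line, in the same http://host:port form that ProxyMiddleware2 hard-codes. The post does not show how that file is produced; one possible way, sketched below with the requests library, is to test candidate proxies against some page and keep only the ones that respond. The candidate list and test URL are placeholders, not part of the original project.
# check_proxies.py (hypothetical helper, not part of the original project)
import requests

# candidate proxies from wherever you collect them; placeholders here
candidates = [
    'http://1.2.3.4:8080',
    'http://5.6.7.8:3128',
]

test_url = 'http://httpbin.org/ip'  # any page that responds quickly will do

with open('proxies.txt', 'w') as f:
    for proxy in candidates:
        try:
            resp = requests.get(test_url, proxies={'http': proxy}, timeout=5)
            if resp.status_code == 200:
                f.write(proxy + '\n')  # keep only proxies that actually work
        except requests.RequestException:
            pass  # skip proxies that time out or refuse connections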
10. Configure logging output
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#! file type: python
#! created: 2019/1/15 15:00
#! author: SongBin
#! file name: Logger.py
#! description: logging helper; severity order: DEBUG < INFO < WARNING < ERROR < CRITICAL
import logging
from logging import handlers


class Logger(object):
    # mapping from level names to logging levels
    level_relations = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'crit': logging.CRITICAL
    }

    def __init__(self, filename, level='info', when='D', backCount=10,
                 fmt='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'):
        self.logger = logging.getLogger(filename)
        format_str = logging.Formatter(fmt)                    # log record format
        self.logger.setLevel(self.level_relations.get(level))  # log level
        sh = logging.StreamHandler()                           # log to the console
        sh.setFormatter(format_str)
        # TimedRotatingFileHandler rotates the log file automatically:
        # `when` is the rotation unit and `backupCount` is how many old files to
        # keep (older ones are deleted). Possible values for `when`:
        #   S        seconds
        #   M        minutes
        #   H        hours
        #   D        days
        #   W        weeks (interval == 0 means Monday)
        #   midnight every day at midnight
        th = handlers.TimedRotatingFileHandler(filename=filename, when=when,
                                               backupCount=backCount, encoding='utf-8')
        th.setFormatter(format_str)
        self.logger.addHandler(sh)  # attach both handlers to the logger
        self.logger.addHandler(th)


if __name__ == '__main__':
    log = Logger('all.log', level='debug')
    log.logger.debug('debug')
    log.logger.info('info')
    log.logger.warning('warning')
    log.logger.error('error')
    log.logger.critical('critical')

    log2 = Logger('info.log', level='info')
    log2.logger.debug('debug')
    log2.logger.warning('warning')
The logging-related configuration (file paths and level) is done in settings.py.