Since the site admin is playing it safe, you will have to find a suitable target site on your own. Pick the right one and this code should work as-is; who knows, give it a try, but crawl gently and don't get greedy.
This post walks through the relevant Scrapy crawling techniques from start to finish.
1. Create the project
scrapy startproject bt1024
2. Create the spider
cd bt1024
scrapy genspider bt www.xxxxx.com
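After these two commands, the project tree typically looks like the standard Scrapy layout sketched below; the utils package and MySQLPipeline.py referenced later in this post are added by hand, so their placement here is an assumption.
bt1024/
    scrapy.cfg
    bt1024/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        MySQLPipeline.py      # added manually (step 7)
        utils/                # added manually: Logger.py, MySqlConn.py, DateTimeHelper.py
        spiders/
            __init__.py
            bt.py             # the generated spider (its name is later changed to 'bt1024')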
3. Analyze the page structure
The structure is fairly simple and follows an article-reading pattern: finish one page, then move on to the next. Each page contains an article title and a list of images, and each image has a title, a file name, and a URL.
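A quick way to check the selectors before writing the spider is the Scrapy shell. The element IDs used here (subject_tpc for the article title, read_tpc for the image block) are the ones the spider below relies on, and the URL is a placeholder, so this only applies if the target page actually uses that markup:
scrapy shell "http://www.xxxxx.com/pw/some-article.html"

# inside the shell:
response.css("#subject_tpc::text").extract_first()        # article title
response.xpath('//*[@id="read_tpc"]/img/@src').extract()  # image URLs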
4. Define the item (data storage) templates
Based on the page analysis, two item templates are needed: one for articles and one for images.
items.py
import scrapy


class ArticleItem(scrapy.Item):
    # define the fields for your item here like:
    id = scrapy.Field()
    gid = scrapy.Field()
    title = scrapy.Field()
    filesize = scrapy.Field()
    info = scrapy.Field()
    magnet = scrapy.Field()
    bturl = scrapy.Field()
    flag = scrapy.Field()


class ImgListItem(scrapy.Item):
    aid = scrapy.Field()
    title = scrapy.Field()
    imgName = scrapy.Field()
    imgUrl = scrapy.Field()
    imgPath = scrapy.Field()
    flag = scrapy.Field()
    imgDisabled = scrapy.Field()
5. Write the spider
# -*- coding: utf-8 -*-
import os
import time
import traceback

import scrapy

from bt1024.items import ImgListItem, ArticleItem
from bt1024.settings import log_file, log_level
from bt1024.utils.Logger import Logger

log = Logger(log_file, level=log_level)


class BtSpider(scrapy.Spider):
    # spider name, must be unique
    name = 'bt1024'
    # domains the spider is allowed to crawl
    allowed_domains = ['xxx.xxx.xxxx', 'zzz.zzz.zzz']

    # read the start URL from url.txt so a new run resumes where the last one stopped
    file = open('url.txt', mode="r", encoding="utf8")
    starturl = file.readline().strip()
    file.close()
    # initial URL
    start_urls = [starturl]

    def parse(self, response):
        # base URL
        baseurl = 'http://xxx.xxx.xxxx/pw/'
        try:
            allPics = response.xpath('//*[@id="read_tpc"]/img')
            title = response.css("#subject_tpc::text").extract_first()

            item2 = ArticleItem()
            item2['title'] = title
            item2['flag'] = 'article'
            item2['filesize'] = 20
            item2['info'] = 'info'
            item2['magnet'] = 'magnet'
            item2['bturl'] = 'bturl'
            # the MySQL pipeline fills item2['id'] while processing this item
            yield item2

            for pic in allPics:
                # handle each image: extract its name and URL
                item = ImgListItem()
                imgurl = pic.xpath('@src').extract()[0]
                log.logger.info("image URL: " + imgurl)
                name = os.path.split(imgurl)[-1]
                item['flag'] = 'img'
                item['aid'] = item2['id']
                item['title'] = title
                item['imgName'] = name
                item['imgUrl'] = imgurl
                # hand the scraped data to the pipelines
                yield item
                # the download pipeline sets imgDisabled when the image link is dead
                if item['imgDisabled']:
                    break

            # next page (earlier attempt, kept for reference)
            # nextpage = response.xpath('//*[@id="main"]/div[3]/span[3]/span/a')
            # url = nextpage.xpath('@href').extract()[0]
            # yield self.make_requests_from_url(baseurl + url)
            # //*[@id="main"]/div[3]/span[3]/span/a  (next page)
            # //*[@id="main"]/div[3]/span[3]/a       (previous page)
            next_page = response.xpath('//*[@id="main"]/div[3]/span[3]/a').xpath('@href').extract_first()
            time.sleep(1)  # crawl delay, adjust as needed
            if next_page is not None:
                log.logger.info("previous page: " + next_page)
                next_page = response.urljoin(next_page)
                log.logger.info("previous page (absolute): " + next_page)
                # remember where we are so the next run can pick up from here
                file = open('url.txt', mode="w", encoding="utf8")
                file.write(str(next_page))
                file.close()
                yield scrapy.Request(next_page, callback=self.parse)
        except Exception:
            log.logger.error(traceback.format_exc())
            print("Exception: reached the end!")
            log.logger.error("Exception: reached the end")
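Before running, make sure url.txt exists in the project root and contains the first article URL on a single line, since the spider reads its start URL from that file. Then start the crawl from the project root:
scrapy crawl bt1024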
6. Configure the settings file
# -*- coding: utf-8 -*-
# Scrapy settings for bt1024 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import datetime
import time
import random
BOT_NAME = 'bt1024'
SPIDER_MODULES = ['bt1024.spiders']
NEWSPIDER_MODULE = 'bt1024.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# user agent list
USER_AGENT_LIST = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
]
# pick a user agent at random (evaluated once at startup, so all requests in a run share it)
USER_AGENT = random.choice(USER_AGENT_LIST)
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'bt1024.middlewares.Bt1024SpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 100,
    'bt1024.middlewares.ProxyMiddleWare': 110,
    # 'bt1024.middlewares.ProxyMiddleware2': 110,
    'bt1024.middlewares.Bt1024DownloaderMiddleware': 534,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'bt1024.pipelines.Bt1024Pipeline': 300,
    'bt1024.MySQLPipeline.MySQLPipeline': 301,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
########################### my settings ##############################
# where downloaded photo-set images are stored
WmImgPath = r'H:\down_pic'
############## logging ##############
to_day = datetime.datetime.now()
LOG_FILE = 'H:/pylogs/scrapy_{}_{}_{}.log'.format(to_day.year, to_day.month, to_day.day)
LOG_LEVEL = 'DEBUG'
LOG_ENABLED = True        # enable logging
LOG_ENCODING = 'utf-8'    # encoding used for log output
log_file = 'H:/pylogs/mylog_{}_{}_{}.log'.format(to_day.year, to_day.month, to_day.day)
log_level = 'debug'
HTTPERROR_ALLOWED_CODES = [403, 404]
############## logging ##############
########################### my settings ##############################
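Because USER_AGENT = random.choice(USER_AGENT_LIST) is evaluated only once when the settings module is loaded, every request in a run ends up with the same header. If you want a different user agent per request, a small downloader middleware is one way to do it. The sketch below is not part of the original project; the RandomUserAgentMiddleware name and its priority are hypothetical.
# bt1024/middlewares.py (additional, hypothetical middleware)
import random

from bt1024.settings import USER_AGENT_LIST


class RandomUserAgentMiddleware(object):
    """Assign a randomly chosen User-Agent to every outgoing request."""

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(USER_AGENT_LIST)


# register it in settings.py, e.g.:
# DOWNLOADER_MIDDLEWARES = {
#     'bt1024.middlewares.RandomUserAgentMiddleware': 400,
#     ...
# }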
7. Write the data-persistence script (MySQL pipeline)
# -*- coding: utf-8 -*-
# @Time    : 2018/12/9 22:47
# @Author  : SongBin
# @Email   : 1370811553@qq.com
# @File    : MySQLPipeline.py
# @Software: PyCharm
import pymysql.cursors
import logging

from bt1024.utils.DateTimeHelper import DateTimeHelper
from bt1024.utils.MySqlConn import MyPymysqlPool


class MySQLPipeline(object):
    def __init__(self):
        self.dtHelper = DateTimeHelper()

    def process_item(self, item, spider):
        mysql = MyPymysqlPool("notdbMysql")
        if item['flag'] == 'article':
            # insert the article only if one with the same title does not exist yet
            sql = "select * from article where title = %s ORDER BY CreateTime DESC"
            flag = mysql.getOne(sql, item['title'])
            if not flag:
                insql = ("insert into article(gid,title,filesize,info,magnet,bturl,Status,CreateUser,CreateTime) "
                         "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
                vals = (1, item['title'], item['filesize'], item['info'], item['magnet'],
                        item['bturl'], 1, 'root', self.dtHelper.getNowTime())
                newID = mysql.insertOneGetId(insql, vals)
            else:
                newID = flag['id']
            item['id'] = newID
        else:
            inmsql = ("insert into imglist(aid,imgname,imgurl,imgpath,Status,CreateUser,CreateTime) "
                      "values (%s, %s, %s, %s, %s, %s, %s)")
            imginflag = mysql.insert(inmsql, (item['aid'], item['imgName'], item['imgUrl'],
                                              item['imgPath'], 1, 'root', self.dtHelper.getNowTime()))
        mysql.dispose()
        return item  # pipelines must return the item
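The pipeline relies on two helper modules, DateTimeHelper and MySqlConn, which this post does not show. Below is a minimal sketch of what MyPymysqlPool might look like on top of pymysql; the method names mirror those used above (getOne, insert, insertOneGetId, dispose), but the connection parameters and the overall shape are assumptions, not the author's actual utility class.
# bt1024/utils/MySqlConn.py (hypothetical sketch)
import pymysql


class MyPymysqlPool(object):
    def __init__(self, config_name):
        # the real project presumably reads host/user/password from a config
        # section named `config_name`; the hard-coded values here are placeholders
        self.conn = pymysql.connect(host='localhost', user='root', password='secret',
                                    database='bt1024', charset='utf8mb4',
                                    cursorclass=pymysql.cursors.DictCursor)

    def getOne(self, sql, params):
        # return the first matching row as a dict, or False if nothing matched
        with self.conn.cursor() as cursor:
            cursor.execute(sql, params)
            row = cursor.fetchone()
        return row if row else False

    def insert(self, sql, params):
        # run an INSERT and return the number of affected rows
        with self.conn.cursor() as cursor:
            affected = cursor.execute(sql, params)
        self.conn.commit()
        return affected

    def insertOneGetId(self, sql, params):
        # run an INSERT and return the auto-increment id of the new row
        with self.conn.cursor() as cursor:
            cursor.execute(sql, params)
            new_id = cursor.lastrowid
        self.conn.commit()
        return new_id

    def dispose(self):
        # release the connection
        self.conn.close()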
8. pipelines.py (image download pipeline)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import socket
import traceback
import urllib.request
import os

from bt1024.settings import WmImgPath
from bt1024.settings import log_file, log_level
from bt1024.utils.Logger import Logger

log = Logger(log_file, level=log_level)
socket.setdefaulttimeout(5)  # socket-level timeout of 5 seconds


class Bt1024Pipeline(object):
    def process_item(self, item, spider):
        if item['flag'] == 'img':
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
            item['imgDisabled'] = False
            try:
                url = item['imgUrl']
                req = urllib.request.Request(url, headers=headers)
                res = urllib.request.urlopen(req, None, 5)
                content = res.read()
            except UnicodeDecodeError as e:
                print('-----UnicodeDecodeError url:', url)
                # mark the item when the image link is dead
                # print(traceback.print_exc())
                item['imgDisabled'] = True
                return item
            except urllib.error.URLError as e:
                print("-----URLError url:", url)
                # mark the item when the image link is dead
                # print(traceback.print_exc())
                item['imgDisabled'] = True
                return item
            except socket.timeout as e:
                print("-----socket timeout:", url)
                # mark the item when the image link is dead
                # print(traceback.print_exc())
                item['imgDisabled'] = True
                return item
            # save the image under <WmImgPath>/<article title>/<image name>
            basepath = os.path.join(WmImgPath, item['title'])
            if not os.path.exists(basepath):
                os.makedirs(basepath)
            file_name = os.path.join(basepath, item['imgName'])
            item['imgPath'] = os.path.join(item['title'], item['imgName'])
            with open(file_name, 'wb') as fp:
                fp.write(content)
            return item
        else:
            return item
9. Set up proxy IPs
middlewares.py
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
import time

from scrapy import signals


class Bt1024SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
class Bt1024DownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        '''set the Referer header on every outgoing request'''
        referer = request.url
        if referer:
            request.headers['Referer'] = referer

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# proxy IP pool
class ProxyMiddleWare(object):
    """docstring for ProxyMiddleWare"""

    def process_request(self, request, spider):
        '''attach a proxy to the request object'''
        proxy = self.get_random_proxy()
        print("this is request ip:" + proxy)
        request.meta['proxy'] = proxy

    def process_response(self, request, response, spider):
        '''post-process the returned response'''
        # if the response status is not 200, retry the current request with a new proxy
        if response.status != 200:
            proxy = self.get_random_proxy()
            print("this is response ip:" + proxy)
            # attach the new proxy to the current request
            request.meta['proxy'] = proxy
            return request
        return response

    def get_random_proxy(self):
        '''read a random proxy from the proxies file'''
        while 1:
            with open('proxies.txt', 'r') as f:
                proxies = f.readlines()
            if proxies:
                break
            else:
                time.sleep(1)
        proxy = random.choice(proxies).strip()
        return proxy
class ProxyMiddleware2(object):
    '''
    set a fixed proxy
    '''

    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://218.22.7.62:53281'
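ProxyMiddleWare expects a proxies.txt file in the working directory with one proxy URL per line, in the same http://host:port form that ProxyMiddleware2 hard-codes. The post does not show how that file is produced; one possible way, sketched below with the requests library, is to test candidate proxies against some page and keep only the ones that respond. The candidate list and test URL are placeholders, not part of the original project.
# check_proxies.py (hypothetical helper, not part of the original project)
import requests

# candidate proxies from wherever you collect them; placeholders here
candidates = [
    'http://1.2.3.4:8080',
    'http://5.6.7.8:3128',
]

test_url = 'http://httpbin.org/ip'  # any page that responds quickly will do

with open('proxies.txt', 'w') as f:
    for proxy in candidates:
        try:
            resp = requests.get(test_url, proxies={'http': proxy}, timeout=5)
            if resp.status_code == 200:
                f.write(proxy + '\n')  # keep only proxies that actually work
        except requests.RequestException:
            pass  # skip proxies that time out or refuse connections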
10. Configure logging output
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#! file type: python
#! created: 2019/1/15 15:00
#! author: SongBin
#! file name: Logger.py
#! description: logging helper; severity order: DEBUG < INFO < WARNING < ERROR < CRITICAL
import logging
from logging import handlers


class Logger(object):
    # mapping from level names to logging levels
    level_relations = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'crit': logging.CRITICAL
    }

    def __init__(self, filename, level='info', when='D', backCount=10,
                 fmt='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'):
        self.logger = logging.getLogger(filename)
        format_str = logging.Formatter(fmt)                    # log record format
        self.logger.setLevel(self.level_relations.get(level))  # log level
        sh = logging.StreamHandler()                           # log to the console
        sh.setFormatter(format_str)
        # TimedRotatingFileHandler rotates the log file automatically:
        # `when` is the rotation unit and `backupCount` is how many old files to
        # keep (older ones are deleted). Possible values for `when`:
        #   S        seconds
        #   M        minutes
        #   H        hours
        #   D        days
        #   W        weeks (interval == 0 means Monday)
        #   midnight every day at midnight
        th = handlers.TimedRotatingFileHandler(filename=filename, when=when,
                                               backupCount=backCount, encoding='utf-8')
        th.setFormatter(format_str)
        self.logger.addHandler(sh)  # attach both handlers to the logger
        self.logger.addHandler(th)


if __name__ == '__main__':
    log = Logger('all.log', level='debug')
    log.logger.debug('debug')
    log.logger.info('info')
    log.logger.warning('warning')
    log.logger.error('error')
    log.logger.critical('critical')

    log2 = Logger('info.log', level='info')
    log2.logger.debug('debug')
    log2.logger.warning('warning')
The logging-related configuration (file paths and level) is done in settings.py.