#!/usr/bin/env python
# coding=utf-8
import json
import logging.config
import os
import re
import time

import scrapy
from lxml import etree

from spider_project.illegal_tax_spider.spiders import *
class Xiamenspider(scrapy.Spider):
    """Crawl image listing pages on https://smtmm.win/ and save every image.

    Pipeline: start_requests (bootstrap cookie) -> wwquery (walk listing
    pages) -> parse_wwquery (scrape image URL / title pairs) ->
    dowloadfile (write each image to disk as <title>.jpg).
    """

    name = 'tupian'
    # log_conf and log_example are provided by the star import at the top of
    # the file (project-level logging configuration).
    logging.config.fileConfig(log_conf)
    logger = logging.getLogger(log_example)
    start_urls = ['https://smtmm.win/']

    # Headers common to every request this spider makes; per-request
    # differences (Accept, Referer) are applied in each method.
    _COMMON_HEADERS = {
        'Connection': 'Keep-Alive',  # keep the connection alive between requests
        'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'),
        'Accept-Language': 'zh-CN',
        'Host': 'smtmm.win',
        'X-Requested-With': 'XMLHttpRequest',
    }

    # NOTE(review): patterns assume markup like
    #   style="...-image:url(/path.jpg);"></div>   and   <a ... " >title</a></h2>
    # The regex group parentheses are metacharacters, so the literal ( ) in the
    # HTML end up *inside* the captured text and are stripped with [1:-1] below
    # — confirm against live pages.
    _IMG_RE = re.compile(r'-image:url(.*?);"></div>')
    _NAME_RE = re.compile(r'" >(.*?)</a></h2>')

    def start_requests(self):
        """Fetch the site root once to establish the session cookie jar."""
        print('========================init 图片=====================')
        self.logger.info('run xiamen_qsgg.start_requests')
        headers = dict(self._COMMON_HEADERS)
        headers['Accept'] = ('text/html,application/xhtml+xml,application/xml;'
                             'q=0.9,image/webp,image/apng,*/*;q=0.8,'
                             'application/signed-exchange;v=b3')
        # Plain GET: scrapy.Request is the idiomatic choice (the original
        # FormRequest with method='GET' adds nothing for a GET).
        yield scrapy.Request('https://smtmm.win/', headers=headers,
                             meta={'cookiejar': 1}, callback=self.wwquery,
                             dont_filter=True)

    def wwquery(self, response):
        """Schedule every listing page for parsing (currently page 1 only)."""
        self.logger.info('run xiamen_qsgg.wwquery')
        headers = dict(self._COMMON_HEADERS)
        headers['Accept'] = 'text/html, */*; q=0.01'
        headers['Referer'] = 'https://smtmm.win/'
        for page in range(1, 2):  # widen the range to crawl more pages
            print('正在爬取第%s页' % page)
            yield scrapy.Request(
                'https://smtmm.win/?page=' + str(page),
                headers=headers,
                meta={'cookiejar': response.meta['cookiejar']},
                callback=self.parse_wwquery,
                dont_filter=True,
            )

    def parse_wwquery(self, response):
        """Extract image URL / title pairs from a listing page and fetch each image."""
        self.logger.info('run xiamen_qsgg.parse_wwquery')
        html = response.body.decode('utf-8')
        img_urls = self._IMG_RE.findall(html)
        titles = self._NAME_RE.findall(html)
        headers = dict(self._COMMON_HEADERS)
        headers['Accept'] = 'text/html, */*; q=0.01'
        headers['Referer'] = 'https://smtmm.win/'
        # zip() stops at the shorter list: a missing title can no longer raise
        # IndexError the way the original nameorg[count] lookup could.
        for url, title in zip(img_urls, titles):
            # Captured text is "(/relative/path.jpg)" -> strip the parentheses.
            full_url = 'https://smtmm.win' + url[1:-1]
            # BUGFIX: the file name is passed in meta at construction time.
            # The original assigned request.meta['file_name'] *after* yielding
            # the request, i.e. after the engine had already received it.
            yield scrapy.Request(
                full_url,
                headers=headers,
                meta={'cookiejar': response.meta['cookiejar'],
                      'file_name': title},
                callback=self.dowloadfile,
                dont_filter=True,
            )

    def dowloadfile(self, response):
        """Save the downloaded image body to disk as <title>.jpg."""
        self.logger.info('run xiamen_qsgg.dowloadfile')
        file_dir = 'D:\\tupian\\'
        # Create the target folder on first use instead of crashing on open().
        os.makedirs(file_dir, exist_ok=True)
        # Strip characters Windows forbids in file names so a scraped title
        # cannot make open() fail.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', response.meta['file_name'])
        with open(file_dir + safe_name + '.jpg', "wb") as img_file:
            img_file.write(response.body)