Commit 2df624d0 by guof

初始化文件

parents
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class ScwspiderItem(scrapy.Item):
    """Scrapy item describing one vehicle announcement record.

    Every attribute is a plain ``scrapy.Field()``; the trailing comment on
    each line says what the field holds. Fields prefixed with ``ut_`` carry
    crawl metadata rather than vehicle data.
    """

    # --- announcement identity ---
    vehicle_model = scrapy.Field()  # announced vehicle model
    vehicle_batch = scrapy.Field()  # announcement batch
    vehicle_brand = scrapy.Field()  # brand
    vehicle_type = scrapy.Field()  # vehicle type

    # --- masses, fuel, axles ---
    approved_load = scrapy.Field()  # rated load mass
    gross_mass = scrapy.Field()  # gross mass
    unladen_mass = scrapy.Field()  # curb (unladen) mass
    fuel_type = scrapy.Field()  # fuel type
    axle_number = scrapy.Field()  # number of axles
    emission_standard = scrapy.Field()  # emission standard
    wheelbase = scrapy.Field()  # wheelbase
    axle_load = scrapy.Field()  # axle load

    # --- suspension, tires, geometry ---
    spring_nums = scrapy.Field()  # number of spring leaves
    tire_nums = scrapy.Field()  # number of tires
    tire_specification = scrapy.Field()  # tire specification
    approaching_departure_angle = scrapy.Field()  # approach / departure angle
    front_rear_suspension = scrapy.Field()  # front / rear overhang
    front_gauge = scrapy.Field()  # front track width
    back_gauge = scrapy.Field()  # rear track width
    vehicle_id_number = scrapy.Field()  # vehicle identification code

    # --- dimensions ---
    vehicle_length = scrapy.Field()  # overall length
    vehicle_width = scrapy.Field()  # overall width
    vehicle_high = scrapy.Field()  # overall height
    container_length = scrapy.Field()  # cargo box length
    container_width = scrapy.Field()  # cargo box width
    container_high = scrapy.Field()  # cargo box height

    # --- performance, capacity ---
    maximum_speed = scrapy.Field()  # maximum speed
    seat = scrapy.Field()  # rated passenger capacity
    cab_nums = scrapy.Field()  # permitted number of cab occupants
    streering = scrapy.Field()  # steering type
    allow_trailer_weight = scrapy.Field()  # permitted gross mass of towed trailer
    zzllyxs = scrapy.Field()  # load mass utilisation coefficient
    bgcazzdczzl = scrapy.Field()  # maximum load on the semi-trailer fifth wheel

    # --- manufacturer ---
    company_name = scrapy.Field()  # company name
    company_address = scrapy.Field()  # company address
    postcode = scrapy.Field()  # postal code

    # --- chassis and engine ---
    chasis1 = scrapy.Field()  # chassis 1
    chasis2 = scrapy.Field()  # chassis 2
    chasis3 = scrapy.Field()  # chassis 3
    chasis4 = scrapy.Field()  # chassis 4
    # engine info: engine model, engine manufacturer, engine brand,
    # displacement, power
    engine_info = scrapy.Field()

    # --- crawl metadata ---
    ut_url = scrapy.Field()  # crawled URL
    ut_type = scrapy.Field()  # data type, e.g. scw_basic
    ut_source = scrapy.Field()  # data source (商车网)
    ut_uptime = scrapy.Field()  # fetch timestamp
    ut_dotime = scrapy.Field()  # fetch date
    ut_unique_id = scrapy.Field()  # MD5 used for uniqueness
    # NOTE(review): undocumented in the original; the name looks like a typo
    # of "ut_table" -- confirm with the consumers before renaming.
    ut_tale = scrapy.Field()
    style_img_url = scrapy.Field()  # vehicle pictures
\ No newline at end of file
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ScwspiderSpiderMiddleware(object):
    """Spider middleware that passes everything through unchanged.

    Not all hooks need to be defined; when one is missing Scrapy acts as if
    the middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``.

        Scrapy calls this factory to build the middleware instance.
        """
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_spider_input(self, response, spider):
        """Hook for each response on its way into the spider.

        Should return None or raise an exception; this one always returns
        None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Hook for the results the spider produced for ``response``.

        Must yield Request, dict or Item objects; every element of
        ``result`` is re-yielded untouched.
        """
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Hook for exceptions raised by a spider or ``process_spider_input``.

        Should return None or an iterable of results; this one returns None.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Hook for the spider's start requests (no response is involved).

        Must yield only requests; every request is re-yielded untouched.
        """
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class ScwspiderDownloaderMiddleware(object):
    """Downloader middleware that passes everything through unchanged.

    Not all hooks need to be defined; when one is missing Scrapy acts as if
    the middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``.

        Scrapy calls this factory to build the middleware instance.
        """
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_request(self, request, spider):
        """Hook for each request passing through the downloader middleware.

        Allowed outcomes:
        - return None: continue processing this request
        - return a Response object
        - return a Request object
        - raise IgnoreRequest: process_exception() methods of installed
          downloader middleware will be called

        This implementation always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Hook for the response returned from the downloader.

        Must return a Response, return a Request, or raise IgnoreRequest;
        this implementation hands ``response`` back untouched.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Hook for exceptions from a download handler or process_request().

        Allowed outcomes:
        - return None: continue processing this exception
        - return a Response object: stops the process_exception() chain
        - return a Request object: stops the process_exception() chain

        This implementation returns None.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from SCWSpider import settings
class ScwspiderPipeline(object):
    """Item pipeline that upserts every scraped item into MongoDB.

    Connection parameters are read from the project ``settings`` module.
    Items are keyed on ``(vehicle_model, vehicle_batch)``: an existing
    document with the same pair is updated, otherwise a new one is inserted.

    Attributes:
        client: the ``pymongo.MongoClient`` connection.
        post: the target collection inside the configured database.
    """

    def __init__(self):
        # Host, port and database name come from the settings module.
        host = settings.MONGODB_HOST
        port = settings.MONGODB_PORT
        dbname = settings.MONGODB_DB_NAME
        # Open the database connection.
        self.client = pymongo.MongoClient(
            host=host,
            port=port,
            username=settings.MONGODB_DB_USERNAME,
            password=settings.MONGODB_DB_PASSWD,
        )
        # Select the configured database ...
        mdb = self.client[dbname]
        # ... and the collection that stores the scraped data.
        self.post = mdb[settings.MONGODB_COLLECTION_NAME]

    def process_item(self, item, spider):
        """Upsert ``item`` into the collection and pass it on.

        Args:
            item: mapping-like item; must contain ``vehicle_model`` and
                ``vehicle_batch``.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item`` object, so later pipelines can process it.

        Raises:
            KeyError: if ``vehicle_model`` or ``vehicle_batch`` is missing.
        """
        data = dict(item)
        # Update the matching document, or insert one when none matches.
        self.post.update_one(
            {
                "vehicle_model": item["vehicle_model"],
                "vehicle_batch": item["vehicle_batch"],
            },
            {"$set": data},
            upsert=True,
        )
        return item

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider finishes.

        Scrapy calls this hook deterministically, unlike ``__del__`` which
        may run late or not at all.
        """
        self._close_client()

    def __del__(self):
        # Fallback for code that instantiates the pipeline outside Scrapy.
        self._close_client()

    def _close_client(self):
        """Close the client once; safe to call repeatedly or half-initialised."""
        # ``client`` is missing when __init__ failed before connecting.
        client = getattr(self, "client", None)
        if client is None or getattr(self, "_closed", False):
            return
        client.close()
        self._closed = True
        print("连接关闭了")
if __name__ == '__main__':
    # Manual connectivity check: build the pipeline (which connects to
    # MongoDB using the project settings) and list the server's databases.
    pipeline = ScwspiderPipeline()
    database_names = pipeline.client.list_database_names()
    print(database_names)
# -*- coding: utf-8 -*-
# Scrapy settings for SCWSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Name of this Scrapy project / bot.
BOT_NAME = "SCWSpider"

# Package(s) searched for spiders, and the package new spiders are created in.
SPIDER_MODULES = ["SCWSpider.spiders"]
NEWSPIDER_MODULE = "SCWSpider.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'SCWSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'SCWSpider.middlewares.ScwspiderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'SCWSpider.middlewares.ScwspiderDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'SCWSpider.pipelines.ScwspiderPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
import os

# Enable the MongoDB pipeline (the number is its priority among the
# configured pipelines).
ITEM_PIPELINES = {
    'SCWSpider.pipelines.ScwspiderPipeline': 300,
}

# MongoDB connection settings.
#
# Each value can be overridden with an environment variable of the same
# name. The defaults are the values that used to be hard-coded here, so
# existing deployments keep working unchanged.
# NOTE(review): credentials should not live in source control -- supply
# MONGODB_DB_USERNAME / MONGODB_DB_PASSWD via the environment, then remove
# these defaults and rotate the password.
MONGODB_HOST = os.environ.get('MONGODB_HOST', '47.102.110.205')
# Port number (MongoDB's own default is 27017).
MONGODB_PORT = int(os.environ.get('MONGODB_PORT', '27867'))
# Database name.
MONGODB_DB_NAME = os.environ.get('MONGODB_DB_NAME', 'SCW_data')
MONGODB_DB_PASSWD = os.environ.get('MONGODB_DB_PASSWD', 'Passw0rd')
MONGODB_DB_USERNAME = os.environ.get('MONGODB_DB_USERNAME', 'root')
# Collection that stores the scraped data.
MONGODB_COLLECTION_NAME = os.environ.get('MONGODB_COLLECTION_NAME', 'vehicle_info')
# -*- coding: utf-8 -*-
import hashlib
from urllib.parse import urljoin
import scrapy
import time
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TCPTimedOutError
from SCWSpider.items import ScwspiderItem
class ScwvehiclespiderSpider(scrapy.Spider):
    """Spider that collects vehicle announcement data from www.cn357.com.

    Flow:
        1. ``start_requests`` requests the listing pages.
        2. ``parse`` follows every vehicle link on a listing page and the
           "next page" link, if present.
        3. ``parse_item`` turns one detail page into a ``ScwspiderItem``.

    Attributes:
        PAGE_URL: listing-page URL template; ``{0}`` is the page number.
        DETAIL_URL: base URL that detail-page hrefs are resolved against.
        LAST_PAGE: number of the last listing page requested up front.
        item_name_list: maps each item field name to the XPath locating it
            on the detail page.
    """

    name = 'SCWVehicleSpider'
    # allowed_domains = ['https://www.cn357.com/']
    # start_urls = ['https://www.cn357.com/notice61794_BJ5059VBBEAG1']
    PAGE_URL = 'https://www.cn357.com/notice_{0}'
    DETAIL_URL = 'https://www.cn357.com/'
    LAST_PAGE = 352
    item_name_list = {
        'vehicle_model': '//div/table/tr[2]/td[2]',
        'vehicle_batch': '//div/table/tr[2]/td[4]',
        'vehicle_brand': '//div/table/tr[3]/td[2]',
        'vehicle_type': '//div/table/tr[3]/td[4]',
        'approved_load': '//div/table/tr[4]/td[2]',
        'gross_mass': '//div/table/tr[4]/td[4]',
        'unladen_mass': '//div/table/tr[5]/td[2]',
        'fuel_type': '//div/table/tr[5]/td[4]',
        'emission_standard': '//div/table/tr[6]/td[2]',
        'axle_number': '//div/table/tr[6]/td[4]',
        'wheelbase': '//div/table/tr[7]/td[2]',
        'axle_load': '//div/table/tr[7]/td[4]',
        'spring_nums': '//div/table/tr[8]/td[2]',
        'tire_nums': '//div/table/tr[8]/td[4]',
        'tire_specification': '//div/table/tr[9]/td[2]',
        'approaching_departure_angle': '//div/table/tr[9]/td[4]',
        'front_rear_suspension': '//div/table/tr[10]/td[2]',
        'front_gauge': '//div/table/tr[10]/td[4]',
        'back_gauge': '//div/table/tr[11]/td[2]',
        'vehicle_id_number': '//div/table/tr[11]/td[4]',
        'vehicle_length': '//div/table/tr[12]/td[2]',
        'vehicle_width': '//div/table/tr[12]/td[4]',
        'vehicle_high': '//div/table/tr[13]/td[2]',
        'container_length': '//div/table/tr[13]/td[4]',
        'container_width': '//div/table/tr[14]/td[2]',
        'container_high': '//div/table/tr[14]/td[4]',
        'maximum_speed': '//div/table/tr[15]/td[2]',
        'seat': '//div/table/tr[15]/td[4]',
        'cab_nums': '//div/table/tr[16]/td[2]',
        'streering': '//div/table/tr[16]/td[4]',
        'allow_trailer_weight': '//div/table/tr[17]/td[2]',
        'zzllyxs': '//div/table/tr[17]/td[4]',
        'bgcazzdczzl': '//div/table/tr[18]/td[2]',
        'company_name': '//div/table/tr[18]/td[4]',
        'company_address': '//div/table/tr[19]/td[2]',
        'postcode': '//div/table/tr[20]/td[4]',
        'chasis1': '//div/table/tr[21]/td[2]',
        'chasis2': '//div/table/tr[21]/td[4]',
        'chasis3': '//div/table/tr[22]/td[2]',
        'chasis4': '//div/table/tr[22]/td[4]',
        'engine_info': '//div[@class="gMain"]/table/tr',
        'style_img_url': '//*[@id="noticeImage"]/ul/li',
    }

    def start_requests(self):
        """Yield one request per listing page, 1 through ``LAST_PAGE``."""
        for page_number in range(1, self.LAST_PAGE + 1):
            yield scrapy.Request(
                self.PAGE_URL.format(page_number),
                callback=self.parse,
                errback=self.errback_httpbin,
            )

    def parse(self, response):
        """Parse a listing page.

        Yields a request for every vehicle detail link found, plus a request
        for the next listing page when the last pager link is "下一页".
        """
        self.log("===========================| %s |" % response.url)
        detail_links = response.xpath('/html/body/div[6]/div[1]/table/tr/td[1]/a')
        for link in detail_links:
            href = link.xpath('@href').extract_first()
            if not href:
                # An anchor without href cannot be followed; skip it instead
                # of failing the whole page.
                continue
            url = urljoin(self.DETAIL_URL, href)
            self.log('vehilce_detail_url: {0}'.format(url))
            yield scrapy.Request(
                url,
                callback=self.parse_item,
                errback=self.errback_httpbin,
            )

        pager_links = response.xpath('//span[@class="pageList"]/a')
        if not pager_links:
            return
        last_link = pager_links[-1]
        # extract_first() yields None for a link without text, so the
        # comparison is simply False rather than an IndexError.
        if last_link.xpath('text()').extract_first() != '下一页':
            return
        next_href = last_link.xpath('@href').extract_first()
        if not next_href:
            return
        next_page = urljoin(self.PAGE_URL, next_href)
        self.log('page_url: %s' % next_page)
        yield scrapy.Request(
            next_page,
            callback=self.parse,
            errback=self.errback_httpbin,
        )

    def parse_item(self, response):
        """Build one ``ScwspiderItem`` from a vehicle detail page.

        Every field listed in ``item_name_list`` is extracted through
        ``get_vehicle_info``; the ``ut_*`` metadata fields are then filled in.
        """
        item = ScwspiderItem()
        for field_name in self.item_name_list:
            item[field_name] = self.get_vehicle_info(response, field_name)
        self.log("===========================| {0} |".format(
            item['vehicle_model'] + item['vehicle_batch']))
        item['ut_url'] = response.url  # crawled URL
        item['ut_type'] = 'scw_basic'  # data type
        item['ut_source'] = '商车网'  # data source
        item['ut_uptime'] = int(time.time())  # fetch timestamp (epoch seconds)
        item['ut_dotime'] = time.strftime("%Y-%m-%d")  # fetch date
        item['ut_tale'] = 'basic_vehicle'
        # TODO: ut_unique_id (uniqueness MD5) is declared on the item but
        # never populated.
        yield item

    def errback_httpbin(self, failure):
        """Log a failed request according to the kind of failure.

        Args:
            failure: the ``twisted.python.failure.Failure`` passed by Scrapy.
        """
        # Download timeouts surface as Twisted's own TimeoutError, which is
        # unrelated to the builtin of the same name, so it must be checked
        # explicitly. Imported locally to keep the builtin name untouched.
        from twisted.internet.error import TimeoutError as TwistedTimeoutError

        if failure.check(HttpError):
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)
        elif failure.check(DNSLookupError):
            self.logger.error('DNSLookupError on %s', failure.request.url)
        elif failure.check(TimeoutError, TwistedTimeoutError, TCPTimedOutError):
            self.logger.error('TimeoutError on %s', failure.request.url)
        else:
            # Do not let unexpected failures disappear silently.
            self.logger.error('Unhandled request failure: %r', failure)

    def get_engine(self, response, field_name):
        """Extract the engine table from a detail page.

        The engine cells are read from the second-to-last row matched by the
        field's XPath. Each of the first five cells holds one column (model,
        manufacturer, brand, displacement, power) as a list of text nodes.

        Returns:
            A list with one dict per engine model, with keys
            ``engine_model``, ``engine_company``, ``engine_brand``,
            ``engine_cc`` and ``engine_power``. Values missing from a column
            are ``''``. Returns ``[]`` when the table is absent.
        """
        rows = response.xpath(self.item_name_list[field_name])
        if len(rows) < 2:
            return []
        engine_td = rows[-2].xpath('td[1]/table/tr[2]/td')
        if not engine_td:
            return []

        keys = ('engine_model', 'engine_company', 'engine_brand',
                'engine_cc', 'engine_power')
        # A column whose cell is missing is treated as empty rather than
        # raising IndexError.
        columns = [
            engine_td[idx].xpath('text()').extract() if idx < len(engine_td) else []
            for idx in range(len(keys))
        ]

        engine_info = []
        # The model column decides how many engines there are.
        for idx, _model in enumerate(columns[0]):
            engine_info.append({
                key: (column[idx] if idx < len(column) else '')
                for key, column in zip(keys, columns)
            })
        return engine_info

    def get_vehicle_info(self, response, field_name):
        """Extract the value of one item field from a detail page.

        Returns:
            - for ``engine_info``: the list produced by ``get_engine``;
            - for ``style_img_url``: a list with the ``src`` of the ``img``
              inside each matched ``li`` (``None`` where there is none);
            - otherwise: all text nodes under the field's XPath joined into
              one string (``''`` when nothing matches).
        """
        if field_name == 'engine_info':
            return self.get_engine(response, field_name)

        xpath = self.item_name_list[field_name]
        if field_name == 'style_img_url':
            return [
                li.xpath('img').attrib.get('src')
                for li in response.xpath(xpath)
            ]

        return ''.join(response.xpath(xpath + '//text()').extract())
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
from scrapy import cmdline
# Start the crawl with the same arguments as the command line
# ``scrapy crawl SCWVehicleSpider``.
cmdline.execute(['scrapy', 'crawl', 'SCWVehicleSpider'])
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = SCWSpider.settings
[deploy]
#url = http://localhost:6800/
project = SCWSpider
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment