Commit 2df624d0 by 郭峰

Initialize project files

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class ScwspiderItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
vehicle_model = scrapy.Field()  # notice model number
vehicle_batch = scrapy.Field()  # notice batch
vehicle_brand = scrapy.Field()  # brand
vehicle_type = scrapy.Field()  # vehicle type
approved_load = scrapy.Field()  # rated load mass
gross_mass = scrapy.Field()  # gross mass
unladen_mass = scrapy.Field()  # curb (unladen) mass
fuel_type = scrapy.Field()  # fuel type
axle_number = scrapy.Field()  # number of axles
emission_standard = scrapy.Field()  # emission standard
wheelbase = scrapy.Field()  # wheelbase
axle_load = scrapy.Field()  # axle load
spring_nums = scrapy.Field()  # number of leaf springs
tire_nums = scrapy.Field()  # number of tires
tire_specification = scrapy.Field()  # tire specification
approaching_departure_angle = scrapy.Field()  # approach/departure angle
front_rear_suspension = scrapy.Field()  # front/rear overhang
front_gauge = scrapy.Field()  # front track
back_gauge = scrapy.Field()  # rear track
vehicle_id_number = scrapy.Field()  # identification code
vehicle_length = scrapy.Field()  # overall length
vehicle_width = scrapy.Field()  # overall width
vehicle_high = scrapy.Field()  # overall height
container_length = scrapy.Field()  # cargo box length
container_width = scrapy.Field()  # cargo box width
container_high = scrapy.Field()  # cargo box height
maximum_speed = scrapy.Field()  # maximum speed
seat = scrapy.Field()  # rated passenger capacity
cab_nums = scrapy.Field()  # cab seating capacity
streering = scrapy.Field()  # steering type
allow_trailer_weight = scrapy.Field()  # permitted gross mass of towed trailer
zzllyxs = scrapy.Field()  # load mass utilization coefficient
bgcazzdczzl = scrapy.Field()  # maximum load on semi-trailer fifth wheel
company_name = scrapy.Field()  # manufacturer name
company_address = scrapy.Field()  # manufacturer address
postcode = scrapy.Field()  # postal code
chasis1 = scrapy.Field()  # chassis 1
chasis2 = scrapy.Field()  # chassis 2
chasis3 = scrapy.Field()  # chassis 3
chasis4 = scrapy.Field()  # chassis 4
engine_info = scrapy.Field()  # engine info: model, manufacturer, brand, displacement, power
ut_url = scrapy.Field()  # crawled URL
ut_type = scrapy.Field()  # data type: scw_basic
ut_source = scrapy.Field()  # data source: 商车网
ut_uptime = scrapy.Field()  # fetch timestamp
ut_dotime = scrapy.Field()  # fetch date
ut_unique_id = scrapy.Field()  # uniqueness md5
ut_tale = scrapy.Field()  # record tag (set to 'basic_vehicle' by the spider)
style_img_url = scrapy.Field()  # vehicle images
# response.xpath('//div/table/tr[1]/td[1]').extract()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ScwspiderSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class ScwspiderDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from SCWSpider import settings
class ScwspiderPipeline(object):
def __init__(self):
# Read the MongoDB host, port and database name from settings
host = settings.MONGODB_HOST
port = settings.MONGODB_PORT
dbname = settings.MONGODB_DB_NAME
# Create the database connection
self.client = pymongo.MongoClient(host=host, port=port,
username=settings.MONGODB_DB_USERNAME,
password=settings.MONGODB_DB_PASSWD)
# Select the target database
mdb = self.client[dbname]
# Get the collection that stores the scraped data
self.post = mdb[settings.MONGODB_COLLECTION_NAME]
def process_item(self, item, spider):
data = dict(item)
# Upsert the record into the collection, keyed on vehicle_model + vehicle_batch
self.post.update_one({"vehicle_model": item["vehicle_model"], "vehicle_batch": item["vehicle_batch"]},
{"$set": data}, upsert=True)
return item
def __del__(self):
self.client.close()
print("连接关闭了")
if __name__ == '__main__':
con = ScwspiderPipeline()
print(con.client.list_database_names())
# print(type(con.post))
# print(con.post.update_one())
# result = con.post.update_one({"vehicle_model": ["vehicle_model"]},
# {"$set": {"vehicle_model": ["vehicle_model"], "vehicle_batch": "vehicle_batch"}})
# result = con.post.insert_one({"vehicle_model": "vehicle_model", "vehicle_batch": "vehicle_batch"})
# -*- coding: utf-8 -*-
# Scrapy settings for SCWSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'SCWSpider'
SPIDER_MODULES = ['SCWSpider.spiders']
NEWSPIDER_MODULE = 'SCWSpider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'SCWSpider (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'SCWSpider.middlewares.ScwspiderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'SCWSpider.middlewares.ScwspiderDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'SCWSpider.pipelines.ScwspiderPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
ITEM_PIPELINES = {
'SCWSpider.pipelines.ScwspiderPipeline': 300,
}
MONGODB_HOST = '47.102.110.205'
# Port (default is 27017)
MONGODB_PORT = 27867
# Database name
MONGODB_DB_NAME = 'SCW_data'
MONGODB_DB_PASSWD = 'Passw0rd'
MONGODB_DB_USERNAME = 'root'
# Collection that stores this data
MONGODB_COLLECTION_NAME = 'vehicle_info'
# -*- coding: utf-8 -*-
import hashlib
from urllib.parse import urljoin
import scrapy
import time
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError, TCPTimedOutError
from SCWSpider.items import ScwspiderItem
class ScwvehiclespiderSpider(scrapy.Spider):
name = 'SCWVehicleSpider'
# allowed_domains = ['https://www.cn357.com/']
# start_urls = ['https://www.cn357.com/notice61794_BJ5059VBBEAG1']
PAGE_URL = 'https://www.cn357.com/notice_{0}'
DETAIL_URL = 'https://www.cn357.com/'
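# Mapping from item field names to the XPath of the corresponding cell in the
# notice detail table on cn357.com; row/column indices are positional.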
item_name_list = {
'vehicle_model': '//div/table/tr[2]/td[2]',
'vehicle_batch': '//div/table/tr[2]/td[4]',
'vehicle_brand': '//div/table/tr[3]/td[2]',
'vehicle_type': '//div/table/tr[3]/td[4]',
'approved_load': '//div/table/tr[4]/td[2]',
'gross_mass': '//div/table/tr[4]/td[4]',
'unladen_mass': '//div/table/tr[5]/td[2]',
'fuel_type': '//div/table/tr[5]/td[4]',
'emission_standard': '//div/table/tr[6]/td[2]',
'axle_number': '//div/table/tr[6]/td[4]',
'wheelbase': '//div/table/tr[7]/td[2]',
'axle_load': '//div/table/tr[7]/td[4]',
'spring_nums': '//div/table/tr[8]/td[2]',
'tire_nums': '//div/table/tr[8]/td[4]',
'tire_specification': '//div/table/tr[9]/td[2]',
'approaching_departure_angle': '//div/table/tr[9]/td[4]',
'front_rear_suspension': '//div/table/tr[10]/td[2]',
'front_gauge': '//div/table/tr[10]/td[4]',
'back_gauge': '//div/table/tr[11]/td[2]',
'vehicle_id_number': '//div/table/tr[11]/td[4]',
'vehicle_length': '//div/table/tr[12]/td[2]',
'vehicle_width': '//div/table/tr[12]/td[4]',
'vehicle_high': '//div/table/tr[13]/td[2]',
'container_length': '//div/table/tr[13]/td[4]',
'container_width': '//div/table/tr[14]/td[2]',
'container_high': '//div/table/tr[14]/td[4]',
'maximum_speed': '//div/table/tr[15]/td[2]',
'seat': '//div/table/tr[15]/td[4]',
'cab_nums': '//div/table/tr[16]/td[2]',
'streering': '//div/table/tr[16]/td[4]',
'allow_trailer_weight': '//div/table/tr[17]/td[2]',
'zzllyxs': '//div/table/tr[17]/td[4]',
'bgcazzdczzl': '//div/table/tr[18]/td[2]',
'company_name': '//div/table/tr[18]/td[4]',
'company_address': '//div/table/tr[19]/td[2]',
'postcode': '//div/table/tr[20]/td[4]',
'chasis1': '//div/table/tr[21]/td[2]',
'chasis2': '//div/table/tr[21]/td[4]',
'chasis3': '//div/table/tr[22]/td[2]',
'chasis4': '//div/table/tr[22]/td[4]',
'engine_info': '//div[@class="gMain"]/table/tr',
'style_img_url': '//*[@id="noticeImage"]/ul/li',
}
def start_requests(self):
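# Seed one request per listing page; pages are addressed by numeric id via PAGE_URL
# (ids 1..352 at the time this spider was written).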
for i in range(1, 353):
# Request listing page i
yield scrapy.Request(self.PAGE_URL.format(i), callback=self.parse, errback=self.errback_httpbin)
def parse(self, response):
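# Listing page: follow every vehicle-model link to its detail page, then follow
# the "下一页" (next page) link if one is present.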
self.log("===========================| %s |" % response.url)
vehicle_model_list = response.xpath('/html/body/div[6]/div[1]/table/tr/td[1]/a')
for vehicle_model in vehicle_model_list:
url = urljoin(self.DETAIL_URL, vehicle_model.xpath('@href').extract()[0])
self.log('vehicle_detail_url: {0}'.format(url))
yield scrapy.Request(url, callback=self.parse_item)
next_page = response.xpath('//span[@class="pageList"]/a')
if next_page:
next_page = next_page[-1]
if next_page.xpath('text()').extract()[0] == '下一页':  # link text "下一页" means "next page"
next_page = urljoin(self.PAGE_URL, next_page.xpath('@href').extract()[0])
self.log('page_url: %s' % next_page)
yield scrapy.Request(next_page, callback=self.parse)
def parse_item(self, response):
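# Detail page: fill every field declared in item_name_list, then attach the
# crawl metadata (ut_* fields).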
item = ScwspiderItem()
for k in self.item_name_list.keys():
item[k] = self.get_vehicle_info(response, k)
self.log("===========================| {0} |".format(item['vehicle_model'] + item['vehicle_batch']))
item['ut_url'] = response.url  # crawled URL
item['ut_type'] = 'scw_basic'  # data type: scw_basic
item['ut_source'] = '商车网'  # data source: 商车网
item['ut_uptime'] = int(time.time())  # fetch timestamp
item['ut_dotime'] = time.strftime("%Y-%m-%d")  # fetch date
item['ut_tale'] = 'basic_vehicle'
# item['ut_unique_id'] = scrapy.Field()  # uniqueness md5 (left unset here)
# hashlib.md5('你好'.encode(encoding='UTF-8')).hexdigest()
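# A minimal sketch of how ut_unique_id could be filled (assumption: the md5 of
# vehicle_model + vehicle_batch; the original code leaves this field unset):
# item['ut_unique_id'] = hashlib.md5(
#     (item['vehicle_model'] + item['vehicle_batch']).encode('utf-8')).hexdigest()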
yield item
# Error handling for failed requests
def errback_httpbin(self, failure):
if failure.check(HttpError):
response = failure.value.response
print('HttpError on {0}'.format(response.url))
elif failure.check(DNSLookupError):
request = failure.request
print('DNSLookupError on {0}'.format(request.url))
elif failure.check(TimeoutError, TCPTimedOutError):
request = failure.request
print('TimeoutError on {0}'.format(request.url))
def get_engine(self, response, field_name):
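# Parse the nested engine table: each column (model, manufacturer, brand,
# displacement, power) is a list of text nodes, combined into one dict per engine.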
engine_td = response.xpath(self.item_name_list[field_name])[-2].xpath('td[1]/table/tr[2]/td')
if not engine_td:
return []
engine_models = engine_td[0].xpath('text()').extract()
engine_companys = engine_td[1].xpath('text()').extract()
engine_brands = engine_td[2].xpath('text()').extract()
engine_ccs = engine_td[3].xpath('text()').extract()
engine_powers = engine_td[4].xpath('text()').extract()
engine_info = []
for i in range(len(engine_models)):
info = {}
info['engine_model'] = engine_models[i] if len(engine_models) > i else ''
info['engine_company'] = engine_companys[i] if len(engine_companys) > i else ''
info['engine_brand'] = engine_brands[i] if len(engine_brands) > i else ''
info['engine_cc'] = engine_ccs[i] if len(engine_ccs) > i else ''
info['engine_power'] = engine_powers[i] if len(engine_powers) > i else ''
engine_info.append(info)
return engine_info
def get_vehicle_info(self, response, field_name):
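# Dispatch on field name: engine_info and style_img_url need special handling;
# everything else is the concatenated text of a single table cell.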
if field_name == 'engine_info':
return self.get_engine(response, field_name)
if field_name == 'style_img_url':
img_list = response.xpath(self.item_name_list[field_name])
imgs = []
if img_list:
for i in img_list:
imgs.append(i.xpath('img').attrib.get('src'))
return imgs
field_list = response.xpath(self.item_name_list[field_name] + '//text()').extract()
# Concatenate every text node under the matched table cell
return ''.join(field_list)
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
from scrapy import cmdline
cmdline.execute('scrapy crawl SCWVehicleSpider'.split())
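# Equivalent to running `scrapy crawl SCWVehicleSpider` from the project root.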
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = SCWSpider.settings
[deploy]
#url = http://localhost:6800/
project = SCWSpider