scrapy爬取集号吧
爬取目标地址: http://www.jihaoba.com/escrow/
新建一个scrapy项目
scrapy startproject phone
cd 进入 phone 项目目录,新建爬虫:
scrapy genspider ph www.jihaoba.com
先来看看spiders下面的ph.py代码
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request

from phone.items import PhoneItem


class PhSpider(scrapy.Spider):
    """Crawl phone-number listings from www.jihaoba.com/escrow/.

    Yields one PhoneItem per listing row and follows the "next page"
    link until no further page exists.
    """

    name = 'ph'
    allowed_domains = ['www.jihaoba.com']
    start_urls = ['http://www.jihaoba.com/escrow/']

    def parse(self, response):
        # Each <ul> under div.numbershow is one listing row.
        for row in response.xpath("//div[@class='numbershow']/ul"):
            item = PhoneItem()
            # The 11-digit number is embedded in the detail-page href;
            # re_first() returns None instead of raising IndexError
            # when a row has no match.
            item['phone_num'] = row.xpath(
                'li[contains(@class,"number")]/a/@href').re_first(r"\d{11}")
            price = row.xpath('li[@class="price"]/span/text()').extract_first()
            # Strip the leading currency symbol; guard against a missing
            # price node (extract_first() may return None).
            item['price'] = price[1:] if price else None
            item['yys'] = row.xpath('li[@class="brand"]/text()').extract_first()
            yield item

        # BUG FIX: the original concatenated the base URL with
        # extract_first() BEFORE checking for None, so on the last page
        # the "+" raised TypeError and the `if` guard never helped.
        # Check the raw href first, then build the absolute URL.
        # (Also renamed: `next` shadowed the builtin.)
        next_href = response.xpath("//a[@class='m-pages-next']/@href").extract_first()
        if next_href:
            yield Request("http://www.jihaoba.com" + next_href,
                          callback=self.parse)
item.py
class PhoneItem(scrapy.Item):
    """Container for one phone-number listing scraped from jihaoba.com."""

    phone_num = scrapy.Field()  # 11-digit phone number
    price = scrapy.Field()      # listing price (leading symbol stripped by the spider)
    yys = scrapy.Field()        # carrier / operator name
再来看看数据库保存的代码MysqlPipeline 管道文件,settings里面要配置
# -*- coding: utf-8 -*-
import time

import pymysql
from twisted.enterprise import adbapi

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class PhonePipeline(object):
    """Default no-op pipeline: passes every item through unchanged."""

    def process_item(self, item, spider):
        return item


class MysqlPipeline(object):
    """Insert scraped items into MySQL asynchronously via Twisted adbapi."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        # Read connection parameters from the Scrapy settings file.
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        # Twisted's adbapi connection pool runs the blocking MySQL
        # inserts off the reactor thread (i.e. asynchronously).
        dbpool = adbapi.ConnectionPool("pymysql", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # runInteraction passes a cursor as the first argument to
        # do_insert automatically and returns a Deferred.
        query = self.dbpool.runInteraction(self.do_insert, item)
        # On failure, the Failure object is passed to handle_error.
        query.addErrback(self.handle_error, item, spider)
        # BUG FIX: a pipeline's process_item must return the item
        # (or a Deferred); the original returned None, which would
        # hand None to every later pipeline in ITEM_PIPELINES.
        return item

    def handle_error(self, failure, item, spider):
        # Report asynchronous insert failures.
        print(failure)

    def do_insert(self, cursor, item):
        # Perform the actual INSERT inside a pool-managed transaction.
        insert_sql = """
            insert into te(phone, price, yys, create_time)
            values (%s, %s, %s, %s)
        """
        cursor.execute(insert_sql, (item["phone_num"], item["price"],
                                    item["yys"], time.time()))
最后settings的配置 数据库连接配置
# Enabled item pipelines; lower value = runs earlier.
ITEM_PIPELINES = {
    # 'phone.pipelines.PhonePipeline': 300,
    'phone.pipelines.MysqlPipeline': 1,
}

# MySQL connection settings, read by MysqlPipeline.from_settings().
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "test"
MYSQL_USER = "root"
MYSQL_PASSWORD = "root"
- 版权申明:此文如未标注转载均为本站原创,自由转载请表明出处《龙行博客》。
- 本文网址:https://www.liaotaoo.cn/282.html
- 上篇文章:scrapy爬取ygdy8
- 下篇文章:scrapy使用mysql保存数据