scrapy爬取搜狗图片

请注意,本文编写于 247 天前,最后修改于 66 天前,其中某些信息可能已经过时。

获取搜狗图片

# -*- coding: utf-8 -*-
from urllib.parse import urlencode
import json
import scrapy
import os
import re
import urllib.request

class SougouimgSpider(scrapy.Spider):
    """Spider that scrapes image search results from pic.sogou.com.

    Flow: ``parse`` schedules one AJAX search request per page,
    ``sougou`` decodes the JSON response, and ``savve`` downloads each
    picture into a local folder next to this file.
    """

    name = 'sougouimg'
    allowed_domains = ['pic.sogou.com']
    start_urls = ['https://pic.sogou.com/']

    def parse(self, response):
        """Entry point: yield one request per result page for the keyword."""
        endpage = 5       # pages 1..endpage-1 are fetched (range is exclusive)
        keywords = r'哆啦A梦'
        for page in range(1, endpage):
            yield scrapy.Request(self.geturl(keywords, page),
                                 callback=self.sougou)

    def sougou(self, response):
        """Parse the JSON AJAX response and download every picture URL."""
        js = json.loads(response.text)
        # NOTE: renamed loop variable — the original shadowed the builtin `list`
        for item in js['items']:
            self.savve(item['pic_url'])

    def geturl(self, keywords, page):
        """Build the AJAX search URL for *keywords* at *page*.

        :param keywords: query string to search for
        :param page: 1-based page number; the API paginates 48 results/page
        :return: fully encoded search URL
        """
        param = {
            'query': keywords,
            'mode': '1',
            'start': page * 48,   # offset of the first result on this page
            'reqType': 'ajax',
            'reqFrom': 'result',
            'tn': '0'
        }
        return 'https://pic.sogou.com/pics?' + urlencode(param)

    def savve(self, img_url):
        """Download one image into the '搜狗图片' folder beside this file.

        NOTE(review): method name keeps the original typo (``savve``) so
        existing callers are unaffected.
        """
        # os.path.join instead of hard-coded "\\" so the path also works
        # on non-Windows systems
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            "搜狗图片")
        # exist_ok avoids the check-then-create race of exists()+makedirs()
        os.makedirs(path, exist_ok=True)
        # file name = last path segment of the URL
        title = re.findall(r'[^/]+$', img_url)[0]
        dest = os.path.join(path, title)
        try:
            urllib.request.urlretrieve(img_url, dest)
        except Exception:
            print(title + "下载失败")
        else:
            # the original printed 下载完毕 in `finally`, i.e. even after a
            # failed download; `else` reports success only
            print(title + "下载完毕")
by浅枫沐雪