Crawling JD and Tmall to download product images

    I recently started working on crawlers. One business requirement was to download product images from JD and Tmall according to the entries in an Excel file, including both the header (main) images and the detail images.

    For JD, downloading the header images is straightforward, but the detail images are trickier: they are not in the HTML source of the product page. You have to use the Network panel of the browser's developer tools to capture and analyze the ajax request before you can get the actual image URLs.
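    What the capture reveals is a request to cd.jd.com that returns the description fragment. Here is a minimal sketch of that request, with placeholder skuId values (the full script below extracts the real URL from the page source instead):

import re
import requests

# skuId/mainSkuId are placeholders -- take the real values from your own
# Network-panel capture (the full script digs them out of the page source)
js_url = 'https://cd.jd.com/description/channel?skuId=100012345678&mainSkuId=100012345678&cdn=2'
r = requests.get(js_url, headers={'User-Agent': 'Mozilla/5.0'})
# the response body embeds the detail html; the image addresses sit in
# data-lazyload attributes, e.g. //img30.360buyimg.com/...
print(re.findall(r'data-lazyload=[^\s]*([a-zA-Z]*:*//[^\s\\]*)', r.text))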

    For Tmall, grabbing the header images is also easy, but the detail images are harder: the desktop site only shows them after you log in, and I could not work out how to log in with the requests library. Then I discovered that Tmall's mobile site has no such anti-crawling measure and shows the detail images without a login. What a relief; the world suddenly became a happier place.
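    Switching to the mobile site is just a hostname swap, which is exactly what the switch_tm_url method in the script below does. For example, with a made-up item id:

# the item id here is made up, for illustration only
pc_url = 'https://detail.tmall.com/item.htm?id=123456789'
mobile_url = pc_url.replace('detail.tmall.com', 'detail.m.tmall.com')
print(mobile_url)  # -> https://detail.m.tmall.com/item.htm?id=123456789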

    The Excel file provided by the business side looks like this:

    (the screenshot of the sheet is not included; a plausible layout, inferred from the code, is sketched below)
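    Based on how down_all_images reads each row (column 0 is the product title, column 1 the product URL), the sheet presumably looks something like:

product title                        url
<some Tmall product title>           https://detail.tmall.com/item.htm?id=...
<some JD product title>              https://item.jd.com/...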

    My Python code is as follows:

# -*- coding:utf8 -*-
import logging
import os
import re

import pandas as pd
import requests
from lxml import etree

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DownLoadPic(object):
    def __init__(self, excel_path):
        self.excel_path = excel_path
    def get_excel_data(self):
        """
        从excel里面获取数据
        :return:
        """
        df = pd.read_excel(self.excel_path)
        return df.values
    @staticmethod
    def switch_tm_url(url):
        """
        把天猫url转换为手机版的url,因为手机版的比电脑版的反爬限制少
        :return:
        """
        url = url.replace('detail.tmall.com', 'detail.m.tmall.com')
        return url
    @staticmethod
    def session_():
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/77.0.3865.90 Safari/537.36',
                   'Accept-Encoding': 'gzip, deflate, br',
                   'Accept': '*/*',
                   'Connection': 'keep-alive'}
        s = requests.Session()
        s.headers.update(headers)
        return s
    def get_tm_header_image_url_list(self, url):
        """
        获取天猫头图的url
        :return:
        """
        s = self.session_()
        html = s.get(url).text
        html_obj = etree.HTML(html)
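        # absolute XPath against the mobile page layout; the images are
        # lazy-loaded, so the real address sits in data-src rather than src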
        header_image_url_list = html_obj.xpath(
            '/html/body/div[1]/div[2]/div/div[2]/div[1]/div/section/div/a/img/@data-src')
        del html
        del html_obj
        return header_image_url_list
    def get_tm_detail_image_url_list(self, url):
        """
        获取天猫详情图的url
        :return:
        """
        s = self.session_()
        html = s.get(url).text
        html_obj = etree.HTML(html)
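        # the detail module lazy-loads its images too; the address is kept in
        # the data-ks-lazyload attribute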
        detail_image_url_list = html_obj.xpath('//*[@id="modules-desc"]/div/div/div/div/img/@data-ks-lazyload')
        del html
        del html_obj
        return detail_image_url_list
    def get_jd_header_image_url_list(self, url):
        """
        Get the URLs of the JD header images from the thumbnail list
        :return:
        """
        headers = {
            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
        }
        r = requests.get(url=url, headers=headers)
        html = r.text
        html_obj = etree.HTML(html)
        result = []
        for img in html_obj.xpath("//ul/li/img/@src"):
            if img.find('https:') < 0:
                img_url = "https:" + img
            else:
                img_url = img
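            # strip the "s54x54_"/"s75x75_" size markers and move from the
            # "n5/" path to "cv/s1080x1080_" to get the 1080x1080 image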
            header_img_url = img_url.replace("s54x54_", "").replace("s75x75_", "").replace("n5/", "cv/s1080x1080_")
            result.append(header_img_url)
            # print(header_img_url)
        del html
        del html_obj
        return result
    def get_jd_detail_image_url_list(self, url):
        """
            参考文章:https://www.jianshu.com/p/9de3be54abc1
        """
        headers = {
            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
        }
        result = []
        # src attributes
        r = requests.get(url=url, headers=headers)
        html = r.text
        # the detail description is loaded via ajax from cd.jd.com; dig the
        # request URL out of the product page source
        js_url = re.findall(r'cd.jd.com/description/channel\?skuId=[\d]*&mainSkuId=[\d]*&cdn=2', html, re.S)[0]
        js_url = "https://" + js_url
        try:
            r = requests.get(url=js_url, headers=headers)
            # raise an exception if the status code is not 200
            r.raise_for_status()
            # close the connection !!! -- very important
            r.close()
        except Exception as e:
            logger.error(e)
        # the else branch runs only when no exception occurred
        else:
            html = r.text
        # print(html)
        # print(r.status_code)
        imgs = re.findall(r'data-lazyload=[^\s]*([a-zA-Z]*:*//[^\s\\]*)', html, re.S)
        # print("src imgs:", imgs)
        for img in imgs:
            if img.find('https:') < 0:
                img_url = "https:" + img
            else:
                img_url = img
            # swapping "jfs" for "s9080x9080_jfs" requests the large version
            # of the image from the JD CDN
            img_url = img_url.replace('jfs', 's9080x9080_jfs')
            # print(img_url)
            if not (img_url in result):
                result.append(img_url)
        # background images: the same description response also references
        # images through inline "background-image:url(...)" styles, so reuse
        # the html fetched above instead of requesting it all over again
        imgs = re.findall(r'background-image:url\(([a-zA-Z]*:*//[^\s);]*)', html, re.S)
        # print("background imgs:", imgs)
        for img in imgs:
            if img.find('https:') < 0:
                img_url = "https:" + img
            else:
                img_url = img
            img_url = img_url.replace('jfs', 's9080x9080_jfs')
            # print(img_url)
            if not (img_url in result):
                result.append(img_url)
        # print('result:', result)
        return result
    def save_image(self, index, image_title, image_url, method):
        """
        保存图片到当前的data/images目录下
        :param index:
        :param method: 判断是主图还是详情图,可选值为header_image和detail_image
        :param image_title: 图片的标题
        :param image_url: 图片的url
        :return:
        """
        # 获取图片后缀
        file_suffix = os.path.splitext(image_url)[1]
        cwd = os.getcwd()
        image_path_filename = ''
        save_path = os.path.join(cwd, 'data\\images', image_title)
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        if method == 'header_image':
            image_path_filename = os.path.join(save_path, image_title + index + file_suffix)
        elif method == 'detail_image':
            image_path_filename = os.path.join(save_path, 'xq_' + index + file_suffix)
        # 获取图片的二级制内容到指定目录下
        s = self.session_()
        image = s.get(image_url)
        # 保存图片
        with open(image_path_filename, 'wb') as f:
            f.write(image.content)
    def save_tm_image(self, goods_title, url):
        """
        下载天猫的头图和详情图
        :param goods_title:
        :param url:
        :return:
        """
        # 1、如果是天猫的url,下载详情图需要转成手机版的url
        url = self.switch_tm_url(url)
        # 2、下载天猫头图
        tm_header_image_url_list = self.get_tm_header_image_url_list(url)
        for index, img_url in enumerate(tm_header_image_url_list, start=1):
            if not ('https:' in img_url):
                img_url = 'https:' + img_url
            index = str(index)
            self.save_image(index=index,
                            image_title=goods_title,
                            image_url=img_url,
                            method='header_image')
        print('Tmall header images downloaded:', goods_title)
        # 3. download the Tmall detail images
        tm_detail_url_list = self.get_tm_detail_image_url_list(url)
        for index, img_url in enumerate(tm_detail_url_list, start=1):
            if not ('https:' in img_url):
                img_url = 'https:' + img_url
            index = str(index)
            self.save_image(index=index,
                            image_title=goods_title,
                            image_url=img_url,
                            method='detail_image')
        print('Tmall detail images downloaded:', goods_title)
    def save_jd_image(self, goods_title, url):
        """
        保存京东的头图和详情图
        :param goods_title:
        :param url:
        :return:
        """
        # 1、下载京东头图
        jd_header_image_url_list = self.get_jd_header_image_url_list(url)
        for index, img_url in enumerate(jd_header_image_url_list, start=1):
            if not ('https:' in img_url):
                img_url = 'https:' + img_url
            index = str(index)
            self.save_image(index=index,
                            image_title=goods_title,
                            image_url=img_url,
                            method='header_image')
        print('JD header images downloaded:', goods_title)
        # 2. download the JD detail images
        jd_detail_url_list = self.get_jd_detail_image_url_list(url)
        for index, img_url in enumerate(jd_detail_url_list, start=1):
            # if not ('https:' in img_url):
            #     img_url = 'https:' + img_url
            index = str(index)
            self.save_image(index=index,
                            image_title=goods_title,
                            image_url=img_url,
                            method='detail_image')
        print('JD detail images downloaded:', goods_title)
    def down_all_images(self):
        """
        下载天猫和京东图片,包括头图和详情图
        :return:
        """
        excel_data_list = self.get_excel_data()
        for index, row_data in enumerate(excel_data_list, start=1):
            goods_title = row_data[0].strip()
            url = row_data[1]
            # download the Tmall images
            if "detail.tmall.com" in url:
                self.save_tm_image(goods_title, url)
            elif "item.jd.com" in url:
                self.save_jd_image(goods_title, url)
    def main(self):
        self.down_all_images()
if __name__ == "__main__":
    print('Start crawling, please wait...')
    down_pic = DownLoadPic(excel_path=r'D:\RPA\down_pic\data\样例.xlsx')
    down_pic.main()
    print('All done!!!')


Run result:

    (screenshot of the console output is not included)
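    Judging from save_image, every product gets its own folder under data/images in the working directory, roughly like this (the extension follows whatever the image URL ends with):

data/images/<product title>/
    <product title>1.jpg    header images, numbered from 1
    <product title>2.jpg
    xq_1.jpg                detail images, prefixed with xq_
    xq_2.jpg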