I recently started working on web scraping. One business requirement was to download product images from JD and Tmall according to the entries in an Excel file, covering both the header images and the detail images.
For JD, downloading the header images is straightforward, but the detail images are not: they do not appear in the HTML source at all. You have to capture traffic in the Network panel of the browser developer tools and analyze the ajax requests to find the actual image URLs.
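For reference, here is a minimal sketch of that ajax call. The cd.jd.com/description/channel endpoint pattern is the one extracted in the full code further below; the item URL and its skuId are placeholders, and JD may still redirect requests that look too much like a bot.

# Sketch: fetch a JD product page, pull the description module's ajax URL
# out of the page source, and request it directly. The skuId in item_url
# is a placeholder, not a real product.
import re
import requests

item_url = 'https://item.jd.com/100000000000.html'  # placeholder skuId
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}

page = requests.get(item_url, headers=headers).text
# The page embeds the endpoint as cd.jd.com/description/channel?skuId=...&mainSkuId=...&cdn=2
ajax_url = 'https://' + re.findall(
    r'cd.jd.com/description/channel\?skuId=[\d]*&mainSkuId=[\d]*&cdn=2', page)[0]
print(requests.get(ajax_url, headers=headers).text[:200])  # description HTML containing the image URLs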
For Tmall, grabbing the header images is also easy, but the detail images are not, because on the desktop site they are only visible after logging in, and I could not work out how to log in with the requests library. Then I discovered that Tmall's mobile site has no such anti-scraping measure: the detail images are visible there without logging in. What a relief; the world suddenly seemed a much happier place.
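The trick is nothing more than a host swap in the URL, the same one switch_tm_url performs in the full code below. A quick sketch (the item id is a placeholder):

# Rewrite a desktop Tmall item URL to the mobile site, which serves the
# detail images without requiring a login. The item id is a placeholder.
desktop_url = 'https://detail.tmall.com/item.htm?id=600000000000'
mobile_url = desktop_url.replace('detail.tmall.com', 'detail.m.tmall.com')
print(mobile_url)  # https://detail.m.tmall.com/item.htm?id=600000000000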
The Excel sheet provided by the business looks like this:
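The original screenshot of the sheet is not reproduced here. Judging from how the code reads each row (row_data[0] as the product title, row_data[1] as the item URL), it has two columns; here is a minimal sketch that generates an equivalent file, where the column names, product titles, and item ids are all placeholders:

# Sketch of the assumed sheet layout: the first column is the product
# title, the second is a JD or Tmall item URL. The column names, titles,
# and item ids below are placeholders, not values from the original sheet.
import pandas as pd

df = pd.DataFrame(
    [['Sample product A', 'https://item.jd.com/100000000000.html'],
     ['Sample product B', 'https://detail.tmall.com/item.htm?id=600000000000']],
    columns=['title', 'url'])
df.to_excel(r'D:\RPA\down_pic\data\样例.xlsx', index=False)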
My Python code is as follows:
# -*- coding:utf8 -*-
import logging
import os
import re

import pandas as pd
import requests
from lxml import etree

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DownLoadPic(object):
    def __init__(self, excel_path):
        self.excel_path = excel_path

    def get_excel_data(self):
        """Read the rows out of the Excel file."""
        df = pd.read_excel(self.excel_path)
        return df.values

    @staticmethod
    def switch_tm_url(url):
        """Convert a Tmall URL to the mobile version, which has fewer anti-scraping restrictions."""
        return url.replace('detail.tmall.com', 'detail.m.tmall.com')

    @staticmethod
    def session_():
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/77.0.3865.90 Safari/537.36',
                   'Accept-Encoding': 'gzip, deflate, br',
                   'Accept': '*/*',
                   'Connection': 'keep-alive'}
        s = requests.Session()
        s.headers.update(headers)
        return s

    def get_tm_header_image_url_list(self, url):
        """Get the URLs of the Tmall header images."""
        s = self.session_()
        html = s.get(url).text
        html_obj = etree.HTML(html)
        header_image_url_list = html_obj.xpath(
            '/html/body/div[1]/div[2]/div/div[2]/div[1]/div/section/div/a/img/@data-src')
        del html
        del html_obj
        return header_image_url_list

    def get_tm_detail_image_url_list(self, url):
        """Get the URLs of the Tmall detail images (lazy-loaded on the mobile page)."""
        s = self.session_()
        html = s.get(url).text
        html_obj = etree.HTML(html)
        detail_image_url_list = html_obj.xpath('//*[@id="modules-desc"]/div/div/div/div/img/@data-ks-lazyload')
        del html
        del html_obj
        return detail_image_url_list

    def get_jd_header_image_url_list(self, url):
        """Get the URLs of the JD header images from the thumbnail list on the product page."""
        headers = {
            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
        }
        r = requests.get(url=url, headers=headers)
        html = r.text
        html_obj = etree.HTML(html)
        result = []
        for img in html_obj.xpath("//ul/li/img/@src"):
            if img.find('https:') < 0:
                img_url = "https:" + img
            else:
                img_url = img
            # Strip the thumbnail size markers and request the 1080x1080 version
            header_img_url = img_url.replace("s54x54_", "").replace("s75x75_", "").replace("n5/", "cv/s1080x1080_")
            result.append(header_img_url)
        del html
        del html_obj
        return result

    def get_jd_detail_image_url_list(self, url):
        """
        Get the URLs of the JD detail images.
        Reference: https://www.jianshu.com/p/9de3be54abc1
        """
        headers = {
            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
        }
        result = []
        # The detail images are served by an ajax endpoint whose URL
        # is embedded in the product page source
        r = requests.get(url=url, headers=headers)
        html = r.text
        js_url = re.findall(r'cd.jd.com/description/channel\?skuId=[\d]*&mainSkuId=[\d]*&cdn=2', html, re.S)[0]
        js_url = "https://" + js_url
        try:
            r = requests.get(url=js_url, headers=headers)
            # Raise an exception if the status code is not 200
            r.raise_for_status()
            # Close the connection -- very important!
            r.close()
        except Exception as e:
            logger.error(e)
        # The else branch runs only when no exception occurred
        else:
            html = r.text
            # Images referenced through the data-lazyload attribute
            src_imgs = re.findall(r'data-lazyload=[^\s]*([a-zA-Z]*:*//[^\s\\]*)', html, re.S)
            # Images referenced through inline background-image styles
            background_imgs = re.findall(r'background-image:url\(([a-zA-Z]*:*//[^\s);]*)', html, re.S)
            for img in src_imgs + background_imgs:
                if img.find('https:') < 0:
                    img_url = "https:" + img
                else:
                    img_url = img
                # Request the large 9080x9080 version of the image
                img_url = img_url.replace('jfs', 's9080x9080_jfs')
                if img_url not in result:
                    result.append(img_url)
        return result

    def save_image(self, index, image_title, image_url, method):
        """
        Save one image under the data/images directory.
        :param index: sequence number used in the file name
        :param method: 'header_image' or 'detail_image'
        :param image_title: product title, used as the folder name
        :param image_url: URL of the image
        :return:
        """
        # Take the file extension from the URL
        file_suffix = os.path.splitext(image_url)[1]
        cwd = os.getcwd()
        image_path_filename = ''
        save_path = os.path.join(cwd, 'data', 'images', image_title)
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        if method == 'header_image':
            image_path_filename = os.path.join(save_path, image_title + index + file_suffix)
        elif method == 'detail_image':
            image_path_filename = os.path.join(save_path, 'xq_' + index + file_suffix)
        # Fetch the binary content of the image
        s = self.session_()
        image = s.get(image_url)
        # Write the image to disk
        with open(image_path_filename, 'wb') as f:
            f.write(image.content)

    def save_tm_image(self, goods_title, url):
        """
        Download the Tmall header and detail images.
        :param goods_title:
        :param url:
        :return:
        """
        # 1. For a Tmall URL, the detail images require the mobile version
        url = self.switch_tm_url(url)
        # 2. Download the Tmall header images
        tm_header_image_url_list = self.get_tm_header_image_url_list(url)
        for index, img_url in enumerate(tm_header_image_url_list, start=1):
            if not ('https:' in img_url):
                img_url = 'https:' + img_url
            index = str(index)
            self.save_image(index=index, image_title=goods_title, image_url=img_url, method='header_image')
        print('Tmall header images downloaded:', goods_title)
        # 3. Download the Tmall detail images
        tm_detail_url_list = self.get_tm_detail_image_url_list(url)
        for index, img_url in enumerate(tm_detail_url_list, start=1):
            if not ('https:' in img_url):
                img_url = 'https:' + img_url
            index = str(index)
            self.save_image(index=index, image_title=goods_title, image_url=img_url, method='detail_image')
        print('Tmall detail images downloaded:', goods_title)

    def save_jd_image(self, goods_title, url):
        """
        Download the JD header and detail images.
        :param goods_title:
        :param url:
        :return:
        """
        # 1. Download the JD header images
        jd_header_image_url_list = self.get_jd_header_image_url_list(url)
        for index, img_url in enumerate(jd_header_image_url_list, start=1):
            if not ('https:' in img_url):
                img_url = 'https:' + img_url
            index = str(index)
            self.save_image(index=index, image_title=goods_title, image_url=img_url, method='header_image')
        print('JD header images downloaded:', goods_title)
        # 2. Download the JD detail images (their URLs already include the scheme)
        jd_detail_url_list = self.get_jd_detail_image_url_list(url)
        for index, img_url in enumerate(jd_detail_url_list, start=1):
            index = str(index)
            self.save_image(index=index, image_title=goods_title, image_url=img_url, method='detail_image')
        print('JD detail images downloaded:', goods_title)

    def down_all_images(self):
        """Download the Tmall and JD images, both header and detail."""
        excel_data_list = self.get_excel_data()
        for index, row_data in enumerate(excel_data_list, start=1):
            goods_title = row_data[0].strip()
            url = row_data[1]
            if "detail.tmall.com" in url:
                self.save_tm_image(goods_title, url)
            elif "item.jd.com" in url:
                self.save_jd_image(goods_title, url)

    def main(self):
        self.down_all_images()


if __name__ == "__main__":
    print('Starting the crawl, please wait...')
    down_pic = DownLoadPic(excel_path=r'D:\RPA\down_pic\data\样例.xlsx')
    down_pic.main()
    print('All done!!!')
Running result: