2717图片大全抓取【爬虫scrapy】

0.处理执行文件

# main.py
"""Launcher script: starts the ``sexgirl`` spider without typing the CLI command."""
from scrapy import cmdline


def main():
    """Run ``scrapy crawl sexgirl`` through Scrapy's command-line entry point."""
    cmdline.execute('scrapy crawl sexgirl'.split())


# Guard the entry point so importing this module does not start a crawl.
if __name__ == '__main__':
    main()

1.设定抓取的字段名

# item.py
# -*- coding: utf-8 -*-
import scrapy


class SexItem(scrapy.Item):
    """Item produced by the spider; only the image URL list is collected."""

    # Fields that are declared in the original but currently disabled:
    # names = scrapy.Field()
    # category = scrapy.Field()
    # time = scrapy.Field()

    # List of image URLs extracted from a gallery page.
    imgurl = scrapy.Field()

    # number = scrapy.Field()
    # url = scrapy.Field()

2.通过 XPath 获取节点,清洗数据并完成抓取


# -*- coding: utf-8 -*-
import scrapy

from sex.items import SexItem


class SexgirlSpider(scrapy.Spider):
    """Crawl the listing pages of 27270.com and emit the image URLs of each gallery page.

    ``parse`` walks the paginated listing and schedules the gallery detail
    pages; ``parse_item`` extracts the image URL(s) from one detail page.
    """

    name = 'sexgirl'

    # Number of the listing page requested most recently (pagination counter).
    start = 1

    # Let 404 and 500 responses reach the callbacks instead of being filtered
    # out by Scrapy's HttpError middleware, so they can be skipped explicitly.
    handle_httpstatus_list = [404, 500]

    # A listing page URL is ``url + <page number> + end``.
    url = "https://www.27270.com/ent/meinvtupian/list_11_"
    end = ".html"
    start_urls = [url + str(start) + end]

    def parse(self, response):
        """Handle one listing page.

        Yields one request per gallery sub-page (suffixes ``_0`` .. ``_7``)
        for every gallery linked from the listing, followed by the request
        for the next listing page while the page counter is within bounds.
        """
        # One query for the first 29 <li> entries; results come back in
        # document order, i.e. the same order as querying li[1]..li[29].
        hrefs = response.xpath(
            '/html/body/div[2]/div[7]/ul/li[position() < 30]/a/@href'
        ).extract()

        for href in hrefs:
            # NOTE(review): the 0..7 suffix range is hard-coded; sub-pages that
            # do not exist presumably answer 404 and are dropped in parse_item.
            for page in range(0, 8):
                page_href = href.replace('.html', '_' + str(page) + '.html')
                # urljoin copes with absolute as well as site-relative hrefs.
                yield scrapy.Request(response.urljoin(page_href),
                                     callback=self.parse_item)

        # Pagination stop condition: keep following the listing until the
        # counter passes 214 (the last request issued is for page 215).
        if self.start <= 214:
            self.start += 1
            yield scrapy.Request(self.url + str(self.start) + self.end,
                                 callback=self.parse)

    def parse_item(self, response):
        """Extract the image URL(s) from one gallery detail page.

        Error responses (404/500) and pages without a matching image are
        skipped. Listing pagination is driven solely by ``parse``, so nothing
        else needs to be scheduled here.
        """
        if response.status in self.handle_httpstatus_list:
            return

        image_urls = response.xpath(
            '//*[@id="picBody"]/p/a[1]/img/@src'
        ).extract()
        if not image_urls:
            # Nothing to download; do not emit an empty item.
            return

        item = SexItem()
        item['imgurl'] = image_urls
        yield item


3.对数据处理,选择以json文件存储!

# pipelines.py
# -*- coding: utf-8 -*-
import codecs
import json
import os

import requests

from sex import settings


class SexPipeline(object):
    """Download the images referenced by each item and log the item as a JSON line.

    For every URL in ``item['imgurl']`` the image is stored under
    ``<IMAGES_STORE>/<spider name>/`` and the URL list in the item is replaced
    by the list of local file paths. Each processed item is appended to
    ``sex.json`` as one JSON document per line.

    NOTE(review): ``requests`` is a blocking client, so each download stalls
    the crawl while it runs; Scrapy's built-in ImagesPipeline would avoid that.
    """

    # Seconds to wait on the image server before giving up on a download.
    DOWNLOAD_TIMEOUT = 30

    def __init__(self):
        # Despite its name, this attribute holds the open output file handle;
        # the name is kept so existing references keep working.
        self.filename = codecs.open('sex.json', "w", encoding='utf-8')

    def process_item(self, item, spider):
        """Download the item's images, rewrite ``imgurl`` and log the item.

        :param item: scraped item; ``imgurl`` (if present) is a list of image URLs.
        :param spider: the running spider; its name selects the target directory.
        :return: the item, with ``imgurl`` replaced by local file paths of the
                 images that are available on disk.
        """
        if 'imgurl' in item:
            dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)

            images = []
            for image_url in item['imgurl']:
                # File name = URL path segments (everything after scheme and
                # host) joined by underscores.
                image_file_name = '_'.join(image_url.split('/')[3:])
                file_path = '%s/%s' % (dir_path, image_file_name)
                # Record the path only if the file is really there: either it
                # was downloaded earlier or the download succeeds now.
                if os.path.exists(file_path) or self._download(
                        image_url, file_path, spider):
                    images.append(file_path)
            item['imgurl'] = images

        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(content)
        return item

    def _download(self, image_url, file_path, spider):
        """Fetch ``image_url`` into ``file_path``; return True on success.

        The data is streamed into a temporary ``.part`` file and renamed only
        after the transfer completed, so an interrupted or failed download
        never leaves a file that a later run would treat as already present.
        """
        part_path = file_path + '.part'
        try:
            with requests.get(image_url, stream=True,
                              timeout=self.DOWNLOAD_TIMEOUT) as response:
                # Do not store an HTML error page as if it were an image.
                response.raise_for_status()
                with open(part_path, 'wb') as handle:
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        handle.write(block)
            os.rename(part_path, file_path)
        except (requests.RequestException, OSError) as exc:
            spider.logger.warning('Failed to download %s: %s', image_url, exc)
            if os.path.exists(part_path):
                os.remove(part_path)
            return False
        return True

    def close_spider(self, spider):
        """Close the JSON output file; Scrapy calls this hook when the spider closes."""
        self.filename.close()

    # Former method name, kept as an alias for any code that calls it directly.
    spider_closed = close_spider

4.设置user-agent, 延时时间, 以及图片存放位置

setting.py # -*- coding: utf-8 -*- BOT_NAME = 'sex' SPIDER_MODULES = ['sex.spiders'] NEWSPIDER_MODULE = 'sex.spiders' IMAGES_STORE = './images' DOWNLOAD_DELAY = 0.01 DEFAULT_REQUEST_HEADERS = { 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3236.0 Safari/537.36',

5.在项目根目录下执行 python main.py,以下是部分 JSON 数据:

{"imgurl": ["./images/sexgirl/uploads_tu_201901_445_4.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201901_445_5.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201901_445_3.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201901_445_2.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201901_445_2.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201901_281_7.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201901_281_6.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201901_281_5.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_843db25414.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_7a3a7bd7b6.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201809_9999_dc334c8a2c.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_b7b6cf53a1.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201809_9999_4adbf39400.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_73c3042ed2.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_f972282a8c.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201809_9999_e3fd918c1a.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_00fdb5d13b.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_bc2411894f.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_e3fd745280.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_68247376ef.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_b4705fbf19.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201809_9999_0d2b9ab9b3.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_4df232fae5.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_cae9a44d1a.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_56e4b63bb6.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_673815dca9.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_ec347e1cb5.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_5e74779ac1.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_70c7c54fed.jpg"]} {"imgurl": 
["./images/sexgirl/uploads_tu_201810_9999_59a8090ce6.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_d692e22667.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_c694b10121.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201809_9999_b685006e6f.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_199d55987e.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_84f2bbfaf6.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_fef5f2404f.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201809_9999_30faaec658.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_7514568073.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_12c544948a.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_12569ab094.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_360b7f9c72.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_2405bc4933.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_9a88e1c771.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_3d9e2c5d1e.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_013e48041e.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201809_9999_0e0ce76962.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_df5f093639.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_d2c94dddab.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_70b4817d4e.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_65a87b78d8.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_3d5e6cbf62.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_9bc2bcadc9.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_1198a853a8.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_212a2c65e7.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_1bce695f51.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_5263853e0c.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_59637ed86b.jpg"]} {"imgurl": 
["./images/sexgirl/uploads_tu_201810_9999_46ecae635d.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_b9e1685824.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_2fbd72567f.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201810_9999_d86702e048.jpg"]} {"imgurl": ["./images/sexgirl/uploads_tu_201808_9999_dba76320e8.jpg"]} .....

6.抓取效果


本文于 2020-01-11 17:32 由作者进行过修改

本文链接:https://itarvin.com/detail-20.aspx

登录

注册