今天学习了Python简单爬虫,爬了一个表情包网站: [http://www.doutula.com/],以后聊天斗图不怕没表情包了。下面是完整的代码,分享给大家。
import os
import time
from concurrent import futures

import requests
from lxml import html

etree = html.etree

# Headers sent with every request (page fetches and image downloads).
headers = {
    'Referer': 'http://www.doutula.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
                  + ' Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}

# Seconds to wait on a request before giving up, so a stalled connection
# cannot block a worker thread (or the crawl loop) forever.
REQUEST_TIMEOUT = 15

# Characters that are not allowed in file names on Windows and/or POSIX.
_UNSAFE_FILENAME_CHARS = '\\/:*?"<>|'


def _safe_filename(name):
    """Return *name* with unsafe and control characters replaced by '_'."""
    cleaned = ''.join(
        '_' if ch in _UNSAFE_FILENAME_CHARS or ord(ch) < 32 else ch
        for ch in name
    )
    return cleaned.strip()


def download_img(img_url, dir_name, img_name):
    """Download one image and store it under ``imgs/<dir_name>/`` and ``imgs/all/``.

    Args:
        img_url: Image URL; a ``!dta`` marker in it is ignored when deriving
            the file extension.
        dir_name: Sub-directory of ``imgs`` that receives the first copy.
        img_name: Base file name (the image's ``alt`` text). If it is empty
            or ``None``, the base name of the URL is used instead.

    Raises:
        requests.RequestException: The request failed, timed out, or returned
            an HTTP error status.
        OSError: A directory or file could not be written.
    """
    # Take the extension from the last URL path segment only, so a URL
    # without an extension cannot smuggle '/' into the file name.
    url_basename = img_url.replace('!dta', '').split('/')[-1]
    url_stem, extension = os.path.splitext(url_basename)
    base_name = _safe_filename(img_name or '') or _safe_filename(url_stem) or 'image'
    filename = base_name + _safe_filename(extension)
    print(img_url, img_name, filename)

    page_dir = os.path.join('imgs', str(dir_name))
    all_dir = os.path.join('imgs', 'all')
    # exist_ok=True: this function runs in many threads at once, and a
    # check-then-create sequence would race and raise FileExistsError.
    os.makedirs(page_dir, exist_ok=True)
    os.makedirs(all_dir, exist_ok=True)

    img = requests.get(img_url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Do not save an HTTP error page as if it were an image.
    img.raise_for_status()
    for target_dir in (page_dir, all_dir):
        with open(os.path.join(target_dir, filename), 'wb') as file:
            file.write(img.content)
    print(img_url, filename)


def get_page(url):
    """Fetch one list page, download its images, and return the next-page links.

    Every ``<img>`` carrying a ``data-original`` attribute is downloaded. The
    URL and the ``alt`` text are read from the same element, so a name can
    never be paired with the wrong image. The function blocks until all
    downloads of this page have finished; failures are printed, not raised.

    Args:
        url: Page URL. The text after ``page=`` names the output directory.

    Returns:
        The list of ``href`` values of ``<a rel="next">`` elements (empty
        when the page has none).
    """
    time.sleep(0.001)
    resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    dir_name = url.split('page=')[-1]
    print(resp, url)
    page = etree.HTML(resp.text)
    images = page.xpath('.//img[@data-original]')
    print(len(images))

    # The context manager shuts the pool down once the page is done, so
    # worker threads do not accumulate from page to page.
    with futures.ThreadPoolExecutor(max_workers=40) as exe:
        pending = {}
        for image in images:
            img_url = image.get('data-original')
            task = exe.submit(download_img, img_url, dir_name, image.get('alt'))
            pending[task] = img_url
        # Surface per-image errors; otherwise they stay hidden in the futures.
        for task in futures.as_completed(pending):
            error = task.exception()
            if error is not None:
                print('download failed:', pending[task], error)

    next_link = page.xpath('.//a[@rel="next"]/@href')
    return next_link


def main(start_page_num=1, end_page_num=4):
    """Crawl list pages ``start_page_num`` through ``end_page_num`` inclusive.

    Stops early when a page has no ``rel="next"`` link. At least one page
    (``start_page_num``) is always fetched.
    """
    next_link_base = 'http://www.doutula.com/article/list/?page='
    # Any truthy value works here; it only lets the loop start.
    next_link = 'http://www.doutula.com/'
    current_num = start_page_num - 1
    while next_link:
        time.sleep(0.2)
        current_num += 1
        next_link = get_page(next_link_base + str(current_num))
        if current_num >= end_page_num:
            break


if __name__ == "__main__":
    main(start_page_num=101, end_page_num=109)

# Page link format:
#   http://www.doutula.com/article/list/?page=1
#   http://www.doutula.com/article/list/?page=2
#   http://www.doutula.com/article/list/?page=3
#   http://www.doutula.com/article/list/?page=4

# Image link format (URL, then the file name derived from it):
#   http://img.doutula.com/production/uploads/image//2018/11/03/20181103474657_oXHzJM.gif!dta 20181103474657_oXHzJM.gif
#   http://img.doutula.com/production/uploads/image//2018/11/03/20181103474657_tkfgCQ.gif!dta 20181103474657_tkfgCQ.gif
#   http://img.doutula.com/production/uploads/image//2018/11/03/20181103474658_ueRcby.gif!dta 20181103474658_ueRcby.gif
#   https://ws4.sinaimg.cn/bmiddle/6af89bc8gw1f8o03g63ypj207f0ammx3.jpg 6af89bc8gw1f8o03g63ypj207f0ammx3.jpg
#   https://ws2.sinaimg.cn/bmiddle/6af89bc8gw1f8oyzh4zy9j204e04emx1.jpg 6af89bc8gw1f8oyzh4zy9j204e04emx1.jpg
#   https://ws3.sinaimg.cn/bmiddle/6af89bc8gw1f8p6zsv91oj2068068q31.jpg 6af89bc8gw1f8p6zsv91oj2068068q31.jpg
#   https://ws1.sinaimg.cn/bmiddle/6af89bc8gw1f8ppr3fnaag203c037js7.gif 6af89bc8gw1f8ppr3fnaag203c037js7.gif

# Image HTML format:
#   <img src="//static.doutula.com/img/loader_170_160.png" style="margin: 0 auto; min-height: inherit;" data-original="https://ws4.sinaimg.cn/bmiddle/6af89bc8gw1f8rlirv225j20a00a0750.jpg" alt="没有表情包,我怎么装逼" class="img-responsive lazy image_dta" data-backup="http://img.doutula.com/production/uploads/image//2016/05/27/20160527315611_jIzUAx.jpg!dta">
运行效果:
- 本文固定链接: https://www.coordsoft.com/post/10.html
- 转载请注明: admin 于 生活随想 - zwgu 's world 发表
《本文》有 0 条评论