首页 > Python > Python爬 [http://www.doutula.com/] 表情包图片网站
2018
11-08

Python爬 [http://www.doutula.com/] 表情包图片网站

今天学习了 Python 简单爬虫,爬取了一个表情包网站:[http://www.doutula.com/]。以后聊天斗图不怕没表情包了!下面是完整的代码,分享给大家。

import requests
from lxml import html
from concurrent import futures
import time
import os
# Alias the etree module reached through lxml.html so that etree.HTML()
# can be used to parse fetched pages further down in this script.
etree = html.etree

# Headers attached to every requests.get() call in this script: a Referer
# naming the site's home page and a desktop-browser User-Agent string.
headers = {
    'Referer': 'http://www.doutula.com/',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    ),
}


def download_img(img_url, dir_name, img_name):
    """Download one image and store it in ``imgs/<dir_name>/`` and ``imgs/all/``.

    Args:
        img_url: Absolute URL of the image. A ``!dta`` marker in the URL is
            dropped before the file extension is taken from it.
        dir_name: Name of the sub-directory of ``imgs`` for this image.
        img_name: Base file name without extension.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If a directory or file cannot be created or written.
    """
    img_format = img_url.replace('!dta', '').split('.')[-1]
    # The name is scraped from the page, so neutralise path separators to
    # keep the file inside the target directories.
    filename = (img_name + "." + img_format).replace('/', '_').replace('\\', '_')
    print(img_url, img_name, filename)
    dir_name_s = 'imgs/{}'.format(dir_name)
    # This function runs in many worker threads at once; exist_ok avoids the
    # FileExistsError race of a separate "exists?" check followed by makedirs.
    os.makedirs(dir_name_s, exist_ok=True)
    os.makedirs('imgs/all', exist_ok=True)
    # requests waits indefinitely unless a timeout is given.
    img = requests.get(img_url, headers=headers, timeout=10)
    # Do not save an HTTP error page under an image file name.
    img.raise_for_status()
    content = img.content
    for target_dir in (dir_name_s, 'imgs/all'):
        with open('{}/{}'.format(target_dir, filename), 'wb') as file:
            file.write(content)
    print(img_url, filename)


def get_page(url):
    """Fetch one listing page and download every lazy-loaded image on it.

    Args:
        url: Listing-page URL. The text after ``page=`` is used as the name
            of the sub-directory the page's images are stored in.

    Returns:
        The list of ``href`` values of the ``<a rel="next">`` links found on
        the page; empty when the page has no such link.

    Raises:
        requests.RequestException: If the listing page request fails or
            times out.
    """
    time.sleep(0.001)
    # requests waits indefinitely unless a timeout is given.
    resp = requests.get(url, headers=headers, timeout=10)
    dir_name = url.split('page=')[-1]
    print(resp, url)
    html_str = etree.HTML(resp.text)
    # Take URL and name from the same <img> element. Pairing two separate
    # XPath result lists by index breaks when their lengths differ.
    img_nodes = html_str.xpath('.//img[@data-original]')
    print(len(img_nodes))
    # The with-block waits for this page's downloads and then releases the
    # worker threads, instead of leaving a new 40-thread pool behind per page.
    with futures.ThreadPoolExecutor(max_workers=40) as exe:
        jobs = {}
        for node in img_nodes:
            img_url = node.get('data-original')
            # Fall back to the URL's file stem when the image has no alt text.
            img_name = node.get('alt') or (
                img_url.replace('!dta', '').split('/')[-1].rsplit('.', 1)[0]
            )
            jobs[exe.submit(download_img, img_url, dir_name, img_name)] = img_url
        for job in futures.as_completed(jobs):
            # Report a failed download rather than dropping it silently; one
            # bad image must not abort the rest of the page.
            try:
                job.result()
            except Exception as err:
                print('download failed:', jobs[job], err)
    next_link = html_str.xpath('.//a[@rel="next"]/@href')
    return next_link


def main(start_page_num=1, end_page_num=4):
    """Crawl listing pages ``start_page_num`` through ``end_page_num`` inclusive.

    Crawling stops early as soon as a page reports no "next" link. When
    ``start_page_num`` is greater than ``end_page_num`` nothing is fetched.

    Args:
        start_page_num: Number of the first listing page to fetch.
        end_page_num: Number of the last listing page to fetch.
    """
    next_link_base = 'http://www.doutula.com/article/list/?page='
    for current_num in range(start_page_num, end_page_num + 1):
        # Short pause between listing pages.
        time.sleep(0.2)
        next_link = get_page(next_link_base + str(current_num))
        if not next_link:
            break


# Script entry point: when run directly, crawl listing pages 101 through 109.
if __name__ == "__main__":
    main(start_page_num=101, end_page_num=109)


# Reference note (no-op string literal): format of the listing-page URLs.
'''页面链接格式
http://www.doutula.com/article/list/?page=1
http://www.doutula.com/article/list/?page=2
http://www.doutula.com/article/list/?page=3
http://www.doutula.com/article/list/?page=4
'''

# Reference note (no-op string literal): sample image URLs, each followed by
# the file name taken from the URL (with the "!dta" marker removed).
'''图片链接格式
http://img.doutula.com/production/uploads/image//2018/11/03/20181103474657_oXHzJM.gif!dta 20181103474657_oXHzJM.gif
http://img.doutula.com/production/uploads/image//2018/11/03/20181103474657_tkfgCQ.gif!dta 20181103474657_tkfgCQ.gif
http://img.doutula.com/production/uploads/image//2018/11/03/20181103474658_ueRcby.gif!dta 20181103474658_ueRcby.gif
https://ws4.sinaimg.cn/bmiddle/6af89bc8gw1f8o03g63ypj207f0ammx3.jpg 6af89bc8gw1f8o03g63ypj207f0ammx3.jpg
https://ws2.sinaimg.cn/bmiddle/6af89bc8gw1f8oyzh4zy9j204e04emx1.jpg 6af89bc8gw1f8oyzh4zy9j204e04emx1.jpg
https://ws3.sinaimg.cn/bmiddle/6af89bc8gw1f8p6zsv91oj2068068q31.jpg 6af89bc8gw1f8p6zsv91oj2068068q31.jpg
https://ws1.sinaimg.cn/bmiddle/6af89bc8gw1f8ppr3fnaag203c037js7.gif 6af89bc8gw1f8ppr3fnaag203c037js7.gif
'''

# Reference note (no-op string literal): sample <img> markup from the site.
# The real image URL is in data-original and the caption is in alt.
'''图片HTML格式
<img src="//static.doutula.com/img/loader_170_160.png"
 style="margin: 0 auto; min-height: inherit;"
 data-original="https://ws4.sinaimg.cn/bmiddle/6af89bc8gw1f8rlirv225j20a00a0750.jpg"
 alt="没有表情包,我怎么装逼"
 class="img-responsive lazy image_dta"
 data-backup="http://img.doutula.com/production/uploads/image//2016/05/27/20160527315611_jIzUAx.jpg!dta">
'''

运行效果:

image.png


image.png               image.png

《本文》有 0 条评论

留下一个回复