AI摘要

文章介绍Wallhaven壁纸站特色,并给出完整Python爬虫:用tkinter构建GUI,支持关键词、页码、线程数、保存路径配置,多线程下载高清壁纸,可停止,含异常重试。

导读

Wallhaven是一个令人感到惊艳的壁纸网站,汇集海量精美壁纸,还为用户提供了榜单、精选、专题等,更有分类、标签、搜索、收藏、同步等贴心服务,旨在让用户能够更轻松地发现优质壁纸。

代码

    import requests
    from lxml import etree
    import tkinter as tk
    from tkinter import filedialog
    import os
    import threading
    import time
    import tkinter.messagebox as messagebox
     
    def select_save_directory():
        """Ask the user for a folder and put the chosen path into the save-path entry."""
        chosen_path = filedialog.askdirectory()
        if not chosen_path:
            # Nothing was chosen: leave the entry untouched.
            return
        # Replace whatever the entry currently holds with the new path.
        save_entry.delete(0, tk.END)
        save_entry.insert(0, chosen_path)
     
     
    # Shared state used by the download threads and the GUI callbacks.
    stop_event = threading.Event()  # Stop flag; unset by default, set by stop_download()
    max_threads = 5  # Upper limit on concurrent download threads
    semaphore = threading.Semaphore(max_threads)  # Limits how many download threads run at once
     
     
    def _crawl_pages(series_name, save_option, page_start, page_end, save_directory):
        """Crawl the given search pages once; return False if stopped early, else True."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Mobile Safari/537.36'
        }
        search_url = 'https://wallhaven.cc/search'
        # Without a timeout a stalled connection would block this thread (and
        # keep its semaphore slot) indefinitely.
        request_timeout = 30

        for page in range(page_start, page_end + 1):
            if stop_event.is_set():  # Stop requested: give up before the next page
                return False

            print('正在下载第%d页图片' % page)
            # Let requests build the query string so keywords containing
            # '%', '&', '#' or spaces are encoded correctly.
            search_response = requests.get(
                url=search_url,
                params={'q': series_name, 'page': page},
                headers=headers,
                timeout=request_timeout,
            )
            search_response.raise_for_status()

            tree = etree.HTML(search_response.text)
            li_list = tree.xpath('//*[@id="thumbs"]/section/ul/li')
            for li in li_list:
                detail_links = li.xpath('./figure/a/@href')
                if not detail_links:
                    # List entry without a detail link: nothing to download.
                    continue
                detail_url = detail_links[0]

                detail_response = requests.get(url=detail_url, headers=headers, timeout=request_timeout)
                detail_response.raise_for_status()
                tree2 = etree.HTML(detail_response.text)
                img_sources = tree2.xpath('//*[@id="wallpaper"]/@src')
                if not img_sources:
                    # The page loaded but has no wallpaper element; retrying
                    # the whole page would fail the same way, so skip it.
                    print('未找到图片地址,已跳过:', detail_url)
                    continue
                img_src = img_sources[0]

                if save_option == 1:
                    img_name = img_src.split('/')[-1]
                    img_path = os.path.join(save_directory, img_name)
                    img_response = requests.get(url=img_src, headers=headers, timeout=request_timeout)
                    # Never write an HTTP error body to disk as if it were an image.
                    img_response.raise_for_status()
                    with open(img_path, 'wb') as fp:
                        fp.write(img_response.content)
                    print(img_name, '下载成功!')

        return True


    def perform_download(series_name, save_option, page_start, page_end, save_directory):
        """Download the wallpapers listed on a range of Wallhaven search pages.

        Intended to run in a worker thread whose starter has already acquired
        one slot of the module-level ``semaphore``; that slot is released
        exactly once, when this function returns.

        Args:
            series_name: Search keyword, sent as the ``q`` query parameter.
            save_option: Images are written to disk only when this equals 1.
            page_start: First search-result page to crawl (inclusive).
            page_end: Last search-result page to crawl (inclusive).
            save_directory: Directory the image files are written into.

        Any exception raised while crawling is printed; after a 5 second pause
        the crawl restarts from ``page_start``. Setting ``stop_event`` ends the
        work at the next page boundary. A completion dialog is shown when all
        pages were processed without being stopped.
        """
        try:
            # Retry in a loop rather than by recursion: a recursive retry runs
            # the ``finally`` below once per nested call and therefore
            # releases the semaphore more often than it was acquired.
            while True:
                try:
                    finished = _crawl_pages(series_name, save_option, page_start, page_end, save_directory)
                except Exception as e:  # thread boundary: report, wait, retry
                    print('下载过程中出现错误:', str(e))
                    print('等待5秒后继续尝试...')
                    time.sleep(5)  # Wait 5 seconds before trying again
                    continue

                if finished:
                    messagebox.showinfo("提示", "图片下载完成!")
                return

        finally:
            semaphore.release()  # Free the slot so another download thread may start
     
     
    def update_thread_count_label():
        """Refresh the label that shows how many download slots are in use."""
        # Slots in use = configured limit minus the semaphore's remaining count.
        busy_slots = max_threads - semaphore._value
        thread_count_label.config(text=f"当前线程数: {busy_slots}")
     
     
    def download_images(max_threads):
        """Read the form and start one download thread per requested page.

        Args:
            max_threads: Maximum number of download threads allowed to run at
                the same time; a fresh module-level ``semaphore`` is created
                with this many slots.

        The threads are started from a background dispatcher thread, so this
        function returns immediately and the Tk event loop stays responsive
        (the stop button remains clickable) while pages wait for a free slot.
        If a page number is not an integer, an error dialog is shown and
        nothing is started.
        """
        global semaphore
        semaphore = threading.Semaphore(max_threads)

        stop_event.clear()  # Clear the stop flag: a new download is starting

        # Read the keyword, save option, save path and page range from the form.
        series_name = series_entry.get()
        save_option = save_var.get()
        save_directory = save_entry.get()
        try:
            page_start = int(start_entry.get())
            page_end = int(end_entry.get())
        except ValueError:
            # Empty or non-numeric page fields would otherwise only produce a
            # traceback on stderr, invisible to the GUI user.
            messagebox.showerror("错误", "页码必须为整数!")
            return

        def launch_workers():
            """Start one worker per page, waiting for a free slot before each."""
            for page in range(page_start, page_end + 1):
                if stop_event.is_set():  # Stop requested: launch nothing further
                    break

                semaphore.acquire()  # Blocks until a slot is free (limits thread count)
                if stop_event.is_set():
                    # Stop arrived while waiting for the slot: hand it back.
                    semaphore.release()
                    break
                worker = threading.Thread(
                    target=perform_download,
                    args=(series_name, save_option, page, page, save_directory),
                )
                worker.start()

            # Ask the Tk event loop to refresh the label instead of touching
            # the widget directly from this background thread.
            window.after(0, update_thread_count_label)

        # Daemon thread: a dispatcher still waiting for a slot must not keep
        # the process alive after the window has been closed.
        threading.Thread(target=launch_workers, daemon=True).start()
     
    def stop_download():
        """Signal the download code to stop by setting the shared stop event."""
        stop_event.set()  # Set the stop flag checked by the download loops
     
     
    # Create the main window
    window = tk.Tk()
    window.title("图片下载应用程序")
    window.geometry("800x450")  # Window size

    # Layout container
    frame = tk.Frame(window)
    frame.pack(pady=20)

    # Search keyword input
    series_label = tk.Label(frame, text="系列名称:")
    series_label.grid(row=0, column=0, padx=10, pady=5)

    series_entry = tk.Entry(frame)
    series_entry.grid(row=0, column=1, padx=10, pady=5)

    # Save option
    save_option_label = tk.Label(frame, text="保存选项:")
    save_option_label.grid(row=1, column=0, padx=10, pady=5)

    save_var = tk.IntVar()
    save_radio1 = tk.Radiobutton(frame, text="下载照片", variable=save_var, value=1)
    save_radio1.grid(row=1, column=1, padx=10, pady=5)

    # Page range input
    page_range_label = tk.Label(frame, text="页码范围:")
    page_range_label.grid(row=2, column=0, padx=10, pady=5)

    start_label = tk.Label(frame, text="起始页码:")
    start_label.grid(row=2, column=1, padx=10, pady=5)

    end_label = tk.Label(frame, text="结束页码:")
    end_label.grid(row=3, column=1, padx=10, pady=5)

    start_entry = tk.Entry(frame)
    start_entry.grid(row=2, column=2, padx=10, pady=5)

    end_entry = tk.Entry(frame)
    end_entry.grid(row=3, column=2, padx=10, pady=5)

    # Save directory selection
    save_directory_label = tk.Label(frame, text="保存路径:")
    save_directory_label.grid(row=4, column=0, padx=10, pady=5)

    save_entry = tk.Entry(frame)
    save_entry.grid(row=4, column=1, padx=10, pady=5)

    select_button = tk.Button(frame, text="选择路径", command=select_save_directory)
    select_button.grid(row=4, column=2, padx=10, pady=5)

    # Maximum thread count selection. This caption gets its own variable name
    # so it is not confused with the running-count label created further down.
    max_thread_label = tk.Label(frame, text="最大线程数:")
    max_thread_label.grid(row=5, column=0, padx=10, pady=5)

    thread_count_var = tk.StringVar()
    thread_count_dropdown = tk.OptionMenu(frame, thread_count_var, "1", "2", "3", "4", "5")
    thread_count_dropdown.grid(row=5, column=1, padx=10, pady=5)
    thread_count_var.set("5")  # Default selection is 5


    def start_download():
        """Start a download using the thread count selected in the dropdown."""
        global max_threads
        # Keep the module-level limit in sync with the selection, because
        # update_thread_count_label() computes the running count from it.
        max_threads = int(thread_count_var.get())
        download_images(max_threads)


    # Start and stop buttons
    download_button = tk.Button(frame, text="启动下载", width=15, command=start_download)
    stop_button = tk.Button(frame, text="停止下载", width=15, command=stop_download)

    # Place the two buttons side by side
    download_button.grid(row=6, column=1, padx=10, pady=5, sticky="nsew")
    stop_button.grid(row=6, column=2, padx=10, pady=5, sticky="nsew")

    # Label showing the current number of running threads. It sits on its own
    # row so that it does not cover the thread-count dropdown in row 5.
    thread_count_label = tk.Label(frame, text="当前线程数: " + str(max_threads - semaphore._value))
    thread_count_label.grid(row=7, column=0, columnspan=3, padx=10, pady=5)

    # Run the main loop
    window.mainloop()
如果觉得我的文章对你有用,请随意赞赏
END
本文作者:
文章标题:Wallhaven壁纸爬取
本文地址:https://hh2xx.cn/archives/193/
版权说明:若无注明,本文皆HH の Blog's原创,转载请保留文章出处。