Wallhaven wallpaper site crawler
Basic libraries
import os  
import time  
  
import requests  
from bs4 import BeautifulSoup  
import re
# re is part of the Python standard library (no pip install needed);
# install the third-party dependencies with: pip install requests beautifulsoup4
Approach

- Comparing the listing URLs, toplist and hot differ only in that one word; the page parameter selects the page, and each page holds 24 images.
- Inspect the page source and look at how the thumbnails are linked.
- Comparing a thumbnail URL against its full-size URL shows that only a few tokens need to be swapped.
- For example, a thumbnail link: https://th.wallhaven.cc/small/85/856dj2.jpg
- The corresponding full-size link: https://w.wallhaven.cc/full/85/wallhaven-856dj2.jpg
- So, at first glance, a regex substitution is all that is needed (see the sketch after this list).
- However, during crawling some images failed to download. The reason:
- Thumbnails are always .jpg, but the original is not necessarily .jpg; it may be .png, etc.
- How to solve this? Downloading every possible format for every image is clearly not an option.
- The fix: a two-level crawl.
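To make the substitution concrete, here is a minimal sketch of the regex rewrite, assuming the URL shapes shown above; as noted, it only works when the original really is a .jpg:

import re

thumb = "https://th.wallhaven.cc/small/85/856dj2.jpg"
# Swap host and path, and insert the "wallhaven-" prefix; \1 is the folder, \2 the image id
full = re.sub(
    r"https://th\.wallhaven\.cc/small/(\w+)/(\w+)\.jpg",
    r"https://w.wallhaven.cc/full/\1/wallhaven-\2.jpg",
    thumb,
)
print(full)  # -> https://w.wallhaven.cc/full/85/wallhaven-856dj2.jpg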
 
Two-level crawler
import os  
import time  
  
import requests  
from bs4 import BeautifulSoup  
import re  
  
download_dir = r'D:\壁纸wallhaven'  
os.makedirs(download_dir, exist_ok=True)  
  
headers = {  
    # 'referer': 'https://wallhaven.cc/',  
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',  
}  
# url = "https://wallhaven.cc/toplist?page=3"   # single-page test left in for reference
proxies = {'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890'}
# response = requests.get(url, proxies=proxies)
  
# Request each listing page and grab its content
base_url = "https://wallhaven.cc/toplist?page={}"  
  
start_page = 3  
end_page = 10  
  
for page in range(start_page, end_page + 1):  
    url = base_url.format(page)  
    print(f"Crawling page {page}")
    print(url)  
    response = requests.get(url, headers=headers, proxies=proxies)
    html_content = response.text  
  
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    # Find every link with class "preview" (one per thumbnail)
    preview_links = soup.find_all("a", class_="preview")
- The code above collects the links to each wallpaper's preview (detail) page.
- BeautifulSoup parses the listing page to extract those preview links.
- Each preview link is then fetched in turn, i.e. the second level of the crawl (see the illustration after this list).

- As shown, the fetched detail page is then cleaned to pull out the full-size image link.
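As an illustration of what find_all is doing here, a self-contained snippet run against a made-up fragment shaped like the listing page's markup (both hrefs are hypothetical):

from bs4 import BeautifulSoup

# Hypothetical fragment mimicking wallhaven's thumbnail listing
html = (
    '<figure><a class="preview" href="https://wallhaven.cc/w/856dj2"></a></figure>'
    '<figure><a class="preview" href="https://wallhaven.cc/w/857abc"></a></figure>'
)
soup = BeautifulSoup(html, "html.parser")
for link in soup.find_all("a", class_="preview"):
    print(link.get("href"))
# https://wallhaven.cc/w/856dj2
# https://wallhaven.cc/w/857abc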
Getting the full-size image link
for page in range(start_page, end_page + 1):  
    url = base_url.format(page)  
    print(f"Crawling page {page}")
    print(url)
    response = requests.get(url, headers=headers, proxies=proxies)
    html_content = response.text  
  
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    # Find every link with class "preview"
    preview_links = soup.find_all("a", class_="preview")
  
    # Extract each preview href and fetch its detail page
    for link in preview_links:  
        href = link.get("href")  
        print("======================================")  
        print(href)  
        time.sleep(1)  
        # Fetch the wallpaper's detail page
        try:
            response_2 = requests.get(href, headers=headers, proxies=proxies)
  
            # Parse the detail page with BeautifulSoup
            soup = BeautifulSoup(response_2.text, "html.parser")

            # Find the img tag with id "wallpaper" (it holds the full-size URL)
            wallpaper_img = soup.find("img", id="wallpaper")
- Soup cleans this detail page in turn; the src of that img tag is the full-size link.
- All that remains is to download it and write it to a file (the full source uses response.content; a streaming sketch follows).
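One hedged aside before the full source: the script below reads response.content into memory and writes it in one go, which is fine for wallpaper-sized files. For larger files a streaming variant keeps memory flat; a sketch (download_image is a name introduced here, not part of the original script):

import os
import requests

def download_image(src_link, download_dir, proxies=None):
    # Stream the body to disk in chunks instead of buffering the whole file
    file_name = src_link.split("/")[-1]
    resp = requests.get(src_link, proxies=proxies, stream=True, timeout=30)
    if resp.status_code != 200:
        return False
    with open(os.path.join(download_dir, file_name), "wb") as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)
    return True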
Full source
import os  
import time  
  
import requests  
from bs4 import BeautifulSoup  
import re  
  
download_dir = r'D:\壁纸wallhaven'  
os.makedirs(download_dir, exist_ok=True)  
  
headers = {  
    # 'referer': 'https://wallhaven.cc/',  
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',  
}  
# url = "https://wallhaven.cc/toplist?page=3"   # single-page test left in for reference
proxies = {'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890'}
# response = requests.get(url, proxies=proxies)
  
# Request each listing page and grab its content
base_url = "https://wallhaven.cc/toplist?page={}"  
  
start_page = 3  
end_page = 10  
  
for page in range(start_page, end_page + 1):  
    url = base_url.format(page)  
    print(f"Crawling page {page}")
    print(url)
    response = requests.get(url, headers=headers, proxies=proxies)
    html_content = response.text  
  
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    # Find every link with class "preview" (one per thumbnail)
    preview_links = soup.find_all("a", class_="preview")
  
    # Extract each preview href and fetch its detail page
    for link in preview_links:  
        href = link.get("href")  
        print("======================================")  
        print(href)  
        time.sleep(1)  
        # Fetch the wallpaper's detail page
        try:
            response_2 = requests.get(href, headers=headers, proxies=proxies)
  
            # Parse the detail page with BeautifulSoup
            soup = BeautifulSoup(response_2.text, "html.parser")  
  
            # Find the img tag with id "wallpaper" (it holds the full-size URL)
            wallpaper_img = soup.find("img", id="wallpaper")  
  
            # Extract the full-size src link
            src_link = wallpaper_img.get("src")  
            print(src_link)  
  
            file_name = src_link.split("/")[-1]  
            print(file_name)  
  
            # Download the full-size image
            response = requests.get(src_link, headers=headers, proxies=proxies)
  
            # Check whether the request succeeded
            if response.status_code == 200:
                # Save the image locally
                with open(os.path.join(download_dir, file_name), "wb") as f:
                    f.write(response.content)
                print("Downloaded: " + src_link)
            else:
                print("Download failed, status code:", response.status_code, '"', src_link, '"')
        except Exception as e:
            print("Crawl failed:", e)
            continue  # skip this link and move on to the next
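A small refinement worth considering (my suggestion, not part of the original script): skip images that already exist on disk so that re-running the crawler does not re-download everything. It would slot into the per-link loop right after file_name is computed:

            # Assumed refinement: skip images saved by a previous run
            target_path = os.path.join(download_dir, file_name)
            if os.path.exists(target_path):
                print("Already on disk, skipping:", file_name)
                continue  # next preview link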
Notes
- A proxy is required.
- With Clash, the default local port is 7890.
- For any other proxy, change the address/port to match (a toggle sketch follows this list).
- Adjust before running: the wallpaper save directory, the proxy, and the start/end page numbers (each page holds 24 images).
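If your proxy differs, or the site is reachable directly, only the proxies dict needs to change; a minimal toggle sketch (USE_PROXY is a name introduced here):

import requests

USE_PROXY = True  # set to False for a direct connection
proxies = (
    {"http": "http://127.0.0.1:7890", "https": "http://127.0.0.1:7890"}
    if USE_PROXY
    else None  # with None, requests falls back to its defaults (e.g. env proxy settings)
)
resp = requests.get("https://wallhaven.cc/toplist?page=1", proxies=proxies, timeout=30)
print(resp.status_code)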
 