pikapython/document/git_study/曹轶杰-20191218-网络编程实验课.txt

103 lines
3.3 KiB
Plaintext
Raw Normal View History

2022-10-07 12:59:11 +08:00
# Imports: standard-library helpers, selenium, BeautifulSoup and requests
import os.path
import pickle
import re
from time import sleep
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
# Counter originally intended for simple image naming.
# NOTE(review): nothing in the visible code reads or updates it (the only
# `global index` statement is commented out).
index = 0
# Proxy mapping handed to requests.get() when downloading images: HTTPS
# requests are routed through a local proxy listening on port 10900.
proxies = {'https':'http://127.0.0.1:10900'}
# Placeholder for the URL of the next listing page.
# NOTE(review): no visible function declares this name global, so the
# module-level value stays '' for the whole run.
next_url = ''
# Crawler entry point: walks the listing pages and downloads every post
def geturl(url):
    """Crawl listing pages starting at *url* and download every linked post.

    On each listing page every anchor inside the ``post-preview-container``
    div is passed to ``huoqvpicture``.  Afterwards the last anchor of the
    paginator is followed as the next page.  Crawling stops when the
    paginator yields no link or points at a page that was already visited.

    Args:
        url: URL of the first listing page to crawl.
    """
    visited = set()
    page_url = url
    # Iterate instead of recursing once per page: a long crawl would
    # otherwise exceed the interpreter's recursion limit, and a paginator
    # that links back to an earlier page would never terminate.
    while page_url and page_url not in visited:
        visited.add(page_url)
        # Raw string: "\G" and "\g" are not valid escape sequences.
        # NOTE(review): executable_path / find_elements_by_css_selector are
        # the Selenium 3 style API; confirm the pinned selenium version
        # still provides them.
        browser = webdriver.Firefox(executable_path=r"D:\GeckoDriver\geckodriver.exe")
        try:
            browser.get(page_url)
            anchors = browser.find_elements_by_css_selector("div[class='post-preview-container'] a")
            # Collect the hrefs first so the loop below only deals with
            # plain strings; anchors without an href are skipped.
            post_urls = [
                str(href)
                for href in (a.get_attribute("href") for a in anchors)
                if href
            ]
            for post_url in post_urls:
                huoqvpicture(post_url)
                # Throttle between posts to stay polite towards the server.
                sleep(1)
            print("翻页")
            # The last anchor of the paginator is treated as "next page".
            links = browser.find_elements_by_css_selector("div[class='paginator numbered-paginator mt-8 mb-4 space-x-2 flex justify-center items-center'] a")
            next_href = links[-1].get_attribute("href") if links else None
            page_url = str(next_href) if next_href else None
        finally:
            # quit() ends the whole driver session (close() only closes the
            # current window), and runs even when a page fails.
            browser.quit()
def huoqvpicture(url):
    """Open a single post page and download its image.

    The preferred image URL is the ``href`` of the last link inside the
    "post-notice-resized" notice (presumably the original-size file).  The
    ``src`` of the displayed ``<img>`` is used when that notice is absent
    or when downloading the preferred URL fails.  The image is named after
    the ``data-id`` attribute of the first section under ``section#content``.

    Args:
        url: URL of the post page.
    """
    # Raw string: "\G" and "\g" are not valid escape sequences.
    browser = webdriver.Firefox(executable_path=r"D:\GeckoDriver\geckodriver.exe")
    try:
        browser.get(url)
        sections = browser.find_elements_by_css_selector("section[id='content'] section")
        if not sections:
            # Not a regular post page; nothing to name the file after.
            print("未找到图片信息: " + str(url))
            return
        post_id = str(sections[0].get_attribute("data-id"))

        # Build the candidate list explicitly instead of relying on an
        # IndexError caught by a bare ``except`` to select the fallback.
        candidates = []
        originals = browser.find_elements_by_css_selector("div[class='notice notice-small post-notice post-notice-resized'] a")
        if originals:
            candidates.append(originals[-1].get_attribute("href"))
        previews = browser.find_elements_by_css_selector("section[class='image-container note-container blacklisted'] picture img")
        if previews:
            candidates.append(previews[-1].get_attribute("src"))

        for image_url in candidates:
            if not image_url:
                continue
            print(str(image_url))
            try:
                DownLoadPicture(str(image_url), post_id)
                break
            except (requests.RequestException, OSError) as exc:
                # Best effort: report the failure and try the next candidate.
                print("下载失败: " + str(exc))

        sleep(1)
        print(post_id)
    finally:
        # Always end the driver session, even when the page is malformed.
        browser.quit()
def DownLoadPicture(url,name):
    """Download the image at *url* into ``./picture/`` as ``<name><ext>``.

    The file extension is taken from the path component of *url* (falling
    back to ``.jpg`` when the URL has none), so the saved file matches the
    real image format.  Nothing is downloaded when a file for *name* already
    exists, including the ``.jpg``/``.png`` names used by earlier versions of
    this function.  Failed requests and non-success HTTP responses are
    reported and nothing is written to disk.

    Args:
        url: Direct URL of the image file.
        name: Base file name (without extension) to save the image under.
    """
    from urllib.parse import urlparse

    root = "./picture/"
    # Use the URL path only, so query strings do not end up in the extension.
    extension = os.path.splitext(urlparse(url).path)[1].lower() or '.jpg'
    target = root + name + extension
    # Earlier runs always saved as .jpg (or .png on error); treat those as
    # already downloaded to avoid fetching the same post twice.
    known_paths = {target, root + name + '.jpg', root + name + '.png'}

    os.makedirs(root, exist_ok=True)
    if any(os.path.exists(path) for path in known_paths):
        print("文件已存在")
        return

    # Throttle before hitting the server.
    sleep(1)
    try:
        r = requests.get(url, proxies=proxies, timeout=30)
    except requests.RequestException as exc:
        print("下载失败: " + str(exc))
        return
    print(r.status_code)
    if not r.ok:
        # Do not store an error page under an image file name.
        print("下载失败: HTTP " + str(r.status_code))
        return
    with open(target, 'wb') as f:
        f.write(r.content)
    print("文件保存成功")
if __name__ == "__main__":
    # Script entry point: start crawling from the Danbooru front page.
    geturl("https://danbooru.donmai.us/")