mirror of
https://gitee.com/insArvin/nypc_python_advanced.git
synced 2026-04-18 00:02:28 +08:00
删除文件 Spider/neteaseMusicSpider.py
This commit is contained in:
@@ -1,111 +0,0 @@
|
|||||||
# 导入time模块,用于添加延时
|
|
||||||
import time
|
|
||||||
|
|
||||||
# 导入requests模块并重命名为req,用于发送HTTP请求
|
|
||||||
import requests as req
|
|
||||||
|
|
||||||
# 导入BeautifulSoup模块,用于解析HTML文档
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
# 导入selenium的webdriver模块,用于控制浏览器
|
|
||||||
from selenium import webdriver
|
|
||||||
|
|
||||||
# 导入selenium的By模块,用于定位元素
|
|
||||||
from selenium.webdriver.common.by import By
|
|
||||||
|
|
||||||
# 导入selenium的Service模块,用于管理浏览器驱动服务
|
|
||||||
from selenium.webdriver.chrome.service import Service
|
|
||||||
|
|
||||||
# 导入webdriver_manager的ChromeDriverManager模块,用于自动管理Chrome驱动
|
|
||||||
from webdriver_manager.chrome import ChromeDriverManager
|
|
||||||
|
|
||||||
|
|
||||||
# 定义网易音乐爬虫类
|
|
||||||
class neteaseMusicSpider:
    """Scrape a NetEase Cloud Music top-list page and print each song's details.

    The page is rendered with Selenium (the song table lives inside an
    iframe), parsed with BeautifulSoup, and the extracted rows are printed
    and stored in ``self.toplist``.
    """

    def __init__(self):
        """Set the target URL, the request headers and an empty result list."""
        # Top-list page to scrape.
        self.url = 'https://music.163.com/discover/toplist?id=3779629'
        # Browser-like request headers. NOTE(review): nothing in this class
        # reads them; Selenium sends its own headers.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
        }
        # Filled by get_songs() with one dict per successfully parsed row.
        self.toplist = []

    def get_page(self):
        """Render the page in Chrome and return the HTML of its first iframe.

        Returns:
            The ``page_source`` string of the iframe document.

        Raises:
            Whatever Selenium raises when the driver cannot start, the page
            cannot be loaded, or no iframe is found. The browser is always
            closed before the exception propagates.
        """
        # Driver binary is expected at this relative path.
        service = Service(executable_path='./chromedriver_win32/chromedriver.exe')
        driver = webdriver.Chrome(service=service)
        try:
            driver.get(self.url)
            # Give the page's JavaScript time to build the iframe document.
            time.sleep(3)

            iframe_elem = driver.find_element(By.TAG_NAME, "iframe")
            # Debug output: shows which element was picked.
            print(iframe_elem)
            # After switching, page_source refers to the iframe's document.
            driver.switch_to.frame(iframe_elem)
            return driver.page_source
        finally:
            # Always release the browser, even if a step above failed.
            driver.quit()

    def parse_page(self):
        """Fetch the page and return its ``<tr class="even">`` rows.

        Returns:
            The list of matching ``tr`` tags, or ``None`` if fetching or
            parsing raised (the error is printed, not re-raised).
        """
        try:
            soup = BeautifulSoup(self.get_page(), 'html.parser')
            # NOTE(review): only rows carrying the 'even' class are matched,
            # so rows without it are skipped -- confirm against the page.
            return soup.find_all('tr', class_='even')
        except Exception as e:
            # Best-effort boundary: report the failure and signal it with None.
            print('soup转换异常', e)
            return None

    def get_songs(self):
        """Extract and print id, title, singer and duration for every row.

        Rows that lack an expected element are reported and skipped instead
        of aborting the whole run. Successfully parsed rows are stored in
        ``self.toplist`` as dicts with the keys ``id``, ``title``,
        ``singer`` and ``duration``. If the page could not be parsed,
        ``self.toplist`` is left unchanged.

        Returns:
            None.
        """
        songs_html = self.parse_page()
        # Debug output: the raw rows and their container type.
        print(songs_html, '\n', type(songs_html))
        if songs_html is None:
            # parse_page() already printed the reason for the failure.
            return

        parsed = []
        for song in songs_html:
            # Debug output: the raw row being processed.
            print(song, '\n')
            try:
                # Title and singer are read from the 'title' attribute.
                s_title = song.select('span > a > b')[0].get_attribute_list('title')[0]
                s_singer = song.select('td:nth-child(4) > div')[0].get_attribute_list('title')[0]
                # Duration is the text of the matched span.
                s_duration = song.select('td.s-fc3 > span')[0].string
                # Take the text of the first match rather than the raw result
                # list; None when the selector matches nothing.
                id_nodes = song.select('td.rank > div > div > span')
                s_id = id_nodes[0].get_text(strip=True) if id_nodes else None
            except (IndexError, AttributeError) as e:
                # A required element is missing in this row; skip only this row.
                print('异常', e)
                continue

            print(s_id, s_title, s_singer, s_duration, '\n')
            parsed.append({
                'id': s_id,
                'title': s_title,
                'singer': s_singer,
                'duration': s_duration,
            })

        self.toplist = parsed
|
|
||||||
|
|
||||||
|
|
||||||
# 程序入口点
|
|
||||||
if __name__ == '__main__':
    # Script entry point: build the spider, then scrape and print the songs.
    spider = neteaseMusicSpider()
    spider.get_songs()
|
|
||||||
Reference in New Issue
Block a user