mirror of
https://gitee.com/insArvin/nypc_python_advanced.git
synced 2026-04-18 01:12:29 +08:00
update Spider/neteaseMusicSpider/main.py.
修复获取榜单奇数歌曲问题
Signed-off-by: iamzhaohaibo <941604465@qq.com>
This commit is contained in:
@@ -37,13 +37,13 @@ class neteaseMusicSpider:
|
|||||||
def get_page(self):
|
def get_page(self):
|
||||||
# 用Selenium渲染页面,获取iframe
|
# 用Selenium渲染页面,获取iframe
|
||||||
# 创建Service对象,指定ChromeDriver路径
|
# 创建Service对象,指定ChromeDriver路径
|
||||||
service = Service(executable_path='./chromedriver_win32/chromedriver.exe')
|
service = Service(executable_path='/Users/zhaohaibo/Desktop/chromedriver-mac-x64/chromedriver')
|
||||||
# 启动Chrome浏览器
|
# 启动Chrome浏览器
|
||||||
driver = webdriver.Chrome(service=service)
|
driver = webdriver.Chrome(service=service)
|
||||||
# 访问目标网址
|
# 访问目标网址
|
||||||
driver.get(self.url)
|
driver.get(self.url)
|
||||||
# 等待3秒,让JavaScript加载完成iframe和#document
|
# 等待3秒,让JavaScript加载完成iframe和#document
|
||||||
time.sleep(3)
|
# time.sleep(3)
|
||||||
|
|
||||||
# 定位iframe元素
|
# 定位iframe元素
|
||||||
iframe_elem = driver.find_element(By.TAG_NAME, "iframe")
|
iframe_elem = driver.find_element(By.TAG_NAME, "iframe")
|
||||||
@@ -64,8 +64,8 @@ class neteaseMusicSpider:
|
|||||||
try:
|
try:
|
||||||
# 使用BeautifulSoup解析获取到的页面内容
|
# 使用BeautifulSoup解析获取到的页面内容
|
||||||
soup = BeautifulSoup(self.get_page(), 'html.parser')
|
soup = BeautifulSoup(self.get_page(), 'html.parser')
|
||||||
# 查找所有class为'even'的tr标签
|
# 查找table > tbody标签下的所有的tr标签
|
||||||
trs = soup.find_all('tr', class_='even')
|
trs = soup.select('table > tbody')[0]
|
||||||
# 返回找到的tr标签列表
|
# 返回找到的tr标签列表
|
||||||
return trs
|
return trs
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -85,8 +85,8 @@ class neteaseMusicSpider:
|
|||||||
print(songs_html, '\n', type(songs_html))
|
print(songs_html, '\n', type(songs_html))
|
||||||
# 遍历每个歌曲元素
|
# 遍历每个歌曲元素
|
||||||
for song in songs_html:
|
for song in songs_html:
|
||||||
# 打印当前歌曲元素(此处歌曲可能只获取奇数rank,偶数rank的页面结构需要去区分获取)
|
# 提取歌曲排行,通过CSS选择器定位元素并获取排行对应文本内容
|
||||||
print(song, '\n')
|
s_rank = song.select('td:nth-child(1) >div>span')[0].string
|
||||||
# 提取歌曲标题,通过CSS选择器定位元素并获取title属性
|
# 提取歌曲标题,通过CSS选择器定位元素并获取title属性
|
||||||
s_title = song.select('span > a > b')[0].get_attribute_list('title')[0]
|
s_title = song.select('span > a > b')[0].get_attribute_list('title')[0]
|
||||||
# 提取歌手信息,通过CSS选择器定位元素并获取title属性
|
# 提取歌手信息,通过CSS选择器定位元素并获取title属性
|
||||||
@@ -94,9 +94,17 @@ class neteaseMusicSpider:
|
|||||||
# 提取歌曲时长,通过CSS选择器定位元素并获取文本内容
|
# 提取歌曲时长,通过CSS选择器定位元素并获取文本内容
|
||||||
s_duration = song.select('td.s-fc3 > span')[0].string
|
s_duration = song.select('td.s-fc3 > span')[0].string
|
||||||
# 提取歌曲ID,通过CSS选择器定位元素
|
# 提取歌曲ID,通过CSS选择器定位元素
|
||||||
s_id = song.select('td.rank > div > div > span')
|
s_id = song.select('td:nth-child(2) > div > div > span')[0].get_attribute_list('data-res-id')[0]
|
||||||
# 打印提取到的歌曲信息
|
# 打印提取到的歌曲信息
|
||||||
print(s_id, s_title, s_singer, s_duration, '\n')
|
print(s_rank, s_id, s_title, s_singer, s_duration, '\n')
|
||||||
|
# 将歌曲信息添加到字典中、方便后续写入数据库、表格存储
|
||||||
|
songs[s_id] = {
|
||||||
|
'rank': s_rank,
|
||||||
|
'title': s_title,
|
||||||
|
'singer': s_singer,
|
||||||
|
'duration': s_duration
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# 捕获异常并打印错误信息
|
# 捕获异常并打印错误信息
|
||||||
|
|||||||
Reference in New Issue
Block a user