思路
一开始的想法是全都直接根据歌单获取歌曲的ID,然后每首歌都按那个格式001.mp3\001.txt\001.png存下来
后来发现我好像做不到直接下mp3……因为在网页版一点会跳出来请在PC版下载(咦所以结果我也不知道怎么操作……)
然后计划就变成了 - 爬图片+歌词,命名成和音乐同个格式,然后按名字排序之后就可以用改名软件批量直接改成这个样子了(自以为很机智)
虽然本人对python的理解还停留在曾经因作业要求爬过一次的莎士比亚全集……
具体操作
美汤(Beautiful Soup,程序员都是吃货系列)
好像是评论比较难爬
然后没有追求的我就把这一块都去了……
http://www.jianshu.com/p/2b783f7914c6
先对着一个歌单把ID爬下来
然后再对着ID爬各种……(只能想到这样了)
http://www.bkjia.com/Pythonjc/1189598.html
歌曲
https://www.zhihu.com/question/41505181
python 下载mp3(这个例子是百度音乐的)
http://www.th7.cn/Program/Python/201507/499020.shtml
图片
http://www.tuicool.com/articles/6NVjAjr
http://www.th7.cn/Program/Python/201603/775611.shtml
http://www.cnblogs.com/Albert-Lee/p/6276847.html
歌词
https://juejin.im/entry/59185f37da2f60005dee09d3
http://www.cnblogs.com/Beyond-Ricky/p/6757954.html
具体实现
应该可以从如下代码中看到以上一些参考文献的痕迹(捂脸)
虽然拼凑痕迹有点明显orz反正能大概实现功能已经很满意了
读取歌单
# Scrape the IDs of every song in one playlist.
def getSongIdList():
    """Return the de-duplicated song IDs found on the hard-coded playlist page.

    The page is fetched through the module-level ``_session``; each ID is the
    text after the first ``=`` in the ``href`` of a link inside the
    ``<ul class="f-hide">`` element. Every ID found is printed.

    Returns:
        list: ID strings in page order, first occurrence only. Empty when the
        page contains no ``ul.f-hide`` element.
    """
    playListUrl = 'http://music.163.com/playlist?id=918802417'  # playlist "Kong Cheng" (done)
    soup = BeautifulSoup(_session.get(playListUrl).content)
    ul = soup.find('ul', attrs={'class': 'f-hide'})
    if ul is None:
        # No song list in the response; avoid AttributeError on None.
        return []
    songIdList = []
    seen = set()
    for li in ul.findAll('li'):
        link = li.find('a')
        if link is None:
            continue
        songId = link['href'].split('=')[1]
        # Single-argument form prints the same text as the print statement.
        print(songId)
        # Drop repeated IDs but keep the playlist order stable.
        if songId not in seen:
            seen.add(songId)
            songIdList.append(songId)
    return songIdList
歌曲名+歌手名
# Fetch the song page and read the name and singer from the first two
# ' - '-separated fields of its <title>.
url = BASE_URL + 'song?id=' + str(song.id)
soup = BeautifulSoup(_session.get(url).content)
strArr = soup.title.string.split(' - ')
song.singer = strArr[1]
name = strArr[0].encode('utf-8')
歌词
def get_lyric_by_music_id(music_id, songstr, save_dir='E:\\music\\'):
    """Fetch the lyric of one song and save it to ``<save_dir><songstr>.txt``.

    The file always starts with ``songstr`` on its own line. It is followed by
    the lyric with every ``[...]`` tag removed, or by a fixed notice when the
    API response carries no lyric.

    Args:
        music_id: song ID appended to the lyric API URL.
        songstr: title text used both as the file name stem and as the first
            line of the file.
        save_dir: directory prefix (including the trailing separator) that the
            file name is appended to.

    Returns:
        The cleaned lyric text, or None when no lyric is available.
    """
    lrc_url = 'http://music.163.com/api/song/lyric?' + 'id=' + str(music_id) + '&lv=1&kv=1&tv=-1'
    lyric = requests.get(lrc_url)
    j = json.loads(lyric.text)
    # Some songs have no lyric: 'lrc' or 'lyric' may be missing or null.
    lrc = (j.get('lrc') or {}).get('lyric')
    file_atr = save_dir + songstr + '.txt'
    # The with-block closes the file even if encoding or writing fails.
    with open(file_atr, 'wb') as f:
        f.write(songstr.encode('utf8'))
        f.write(b'\n')
        if lrc is None:
            f.write(b'No Available Lyric on CloudMusic')
            return None
        # Match one bracket tag at a time so that text sitting between two
        # tags on the same line is kept.
        pat = re.compile(r'\[[^\]\n]*\]')
        lrc = re.sub(pat, "", lrc).strip()
        f.write(lrc.encode('utf8'))
    return lrc
专辑封面
def get_img(url, songstr, save_dir='E:\\music\\'):
    """Save the first cover image linked from a page as ``<save_dir><songstr>.jpg``.

    The page at ``url`` is downloaded and searched for the first
    ``http://p1.music.126.net/...jpg`` address, which is then downloaded.
    Nothing is saved when the page is empty or holds no such address.

    Args:
        url: address of the page to scan.
        songstr: file name stem for the saved image.
        save_dir: directory prefix (including the trailing separator) that the
            file name is appended to.
    """
    urlStream = urllib.urlopen(url)
    try:
        htmlString = urlStream.read()
    finally:
        # Release the connection whether or not the read succeeded.
        urlStream.close()
    if not htmlString:
        return
    # Dots are escaped so they only match literal dots in the host and suffix.
    searchPattern = re.compile(r'http://p1\.music\.126\.net/.*?\.jpg')
    found = searchPattern.search(htmlString)
    if found is None:
        # No cover address on the page; skip instead of raising IndexError.
        return
    urllib.urlretrieve(found.group(0), save_dir + songstr + '.jpg')
完整代码
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import os, json
import base64
import warnings
import re,urllib,uuid
'''
遇到不懂的问题?Python学习交流群:1136201545满足你的需求,资料都已经上传群文件,可以自行下载!
'''
# Suppress all warnings for the rest of the run.
warnings.filterwarnings("ignore")
# Prefix used to build song-page URLs.
BASE_URL = 'http://music.163.com/'
# HTTP session shared by the playlist and song-page requests.
_session = requests.session()
# Local directory path; not referenced by the functions shown in this file.
localPath='d://pythonPath'
def createFileWithFileName(localPathParam, fileName):
    """Create an empty file named ``fileName`` under ``localPathParam``.

    A path that already exists is left untouched.

    Args:
        localPathParam: directory the file is created in.
        fileName: name of the file to create.
    """
    totalPath = localPathParam + '//' + fileName
    if not os.path.exists(totalPath):
        # Opening in append mode creates the file; nothing is written.
        with open(totalPath, 'a+'):
            pass
class Song(object):
    """Record for one scraped song.

    Instances are created empty and filled in afterwards by the scraping
    functions.
    """

    def __init__(self):
        # Song ID taken from the playlist page.
        self.id = None
        # Song title and singer, read from the song page title.
        self.name = None
        self.singer = None
        # Lyric text and cover-image result stored by the scraper.
        self.lrc = None
        self.img = None
        # Default so that comparing two fresh instances does not raise
        # AttributeError; nothing in this file assigns a real count.
        self.commentCount = 0

    def __lt__(self, other):
        # "Less than" means "has more comments", so sorting puts the larger
        # count first. NOTE(review): presumably intentional - confirm.
        return self.commentCount > other.commentCount
# Scrape the IDs of every song in one playlist.
def getSongIdList():
    """Return the de-duplicated song IDs found on the hard-coded playlist page.

    The page is fetched through the module-level ``_session``; each ID is the
    text after the first ``=`` in the ``href`` of a link inside the
    ``<ul class="f-hide">`` element. Every ID found is printed.

    Returns:
        list: ID strings in page order, first occurrence only. Empty when the
        page contains no ``ul.f-hide`` element.
    """
    playListUrl = 'http://music.163.com/playlist?id=918802417'  # playlist "Kong Cheng" (done)
    soup = BeautifulSoup(_session.get(playListUrl).content)
    ul = soup.find('ul', attrs={'class': 'f-hide'})
    if ul is None:
        # No song list in the response; avoid AttributeError on None.
        return []
    songIdList = []
    seen = set()
    for li in ul.findAll('li'):
        link = li.find('a')
        if link is None:
            continue
        songId = link['href'].split('=')[1]
        # Single-argument form prints the same text as the print statement.
        print(songId)
        # Drop repeated IDs but keep the playlist order stable.
        if songId not in seen:
            seen.add(songId)
            songIdList.append(songId)
    return songIdList
# Build the record for one song.
def matchSong(songId):
    """Return a new ``Song`` whose ``id`` is ``songId``.

    Args:
        songId: song ID to store on the record.

    Returns:
        Song: a fresh instance with only ``id`` assigned here.
    """
    song = Song()
    song.id = songId
    return song
def get_lyric_by_music_id(music_id, songstr, save_dir='E:\\music\\'):
    """Fetch the lyric of one song and save it to ``<save_dir><songstr>.txt``.

    The file always starts with ``songstr`` on its own line. It is followed by
    the lyric with every ``[...]`` tag removed, or by a fixed notice when the
    API response carries no lyric.

    Args:
        music_id: song ID appended to the lyric API URL.
        songstr: title text used both as the file name stem and as the first
            line of the file.
        save_dir: directory prefix (including the trailing separator) that the
            file name is appended to.

    Returns:
        The cleaned lyric text, or None when no lyric is available.
    """
    lrc_url = 'http://music.163.com/api/song/lyric?' + 'id=' + str(music_id) + '&lv=1&kv=1&tv=-1'
    lyric = requests.get(lrc_url)
    j = json.loads(lyric.text)
    # Some songs have no lyric: 'lrc' or 'lyric' may be missing or null.
    lrc = (j.get('lrc') or {}).get('lyric')
    file_atr = save_dir + songstr + '.txt'
    # The with-block closes the file even if encoding or writing fails.
    with open(file_atr, 'wb') as f:
        f.write(songstr.encode('utf8'))
        f.write(b'\n')
        if lrc is None:
            f.write(b'No Available Lyric on CloudMusic')
            return None
        # Match one bracket tag at a time so that text sitting between two
        # tags on the same line is kept.
        pat = re.compile(r'\[[^\]\n]*\]')
        lrc = re.sub(pat, "", lrc).strip()
        f.write(lrc.encode('utf8'))
    return lrc
def get_img(url, songstr, save_dir='E:\\music\\'):
    """Save the first cover image linked from a page as ``<save_dir><songstr>.jpg``.

    The page at ``url`` is downloaded and searched for the first
    ``http://p1.music.126.net/...jpg`` address, which is then downloaded.
    Nothing is saved when the page is empty or holds no such address.

    Args:
        url: address of the page to scan.
        songstr: file name stem for the saved image.
        save_dir: directory prefix (including the trailing separator) that the
            file name is appended to.
    """
    urlStream = urllib.urlopen(url)
    try:
        htmlString = urlStream.read()
    finally:
        # Release the connection whether or not the read succeeded.
        urlStream.close()
    if not htmlString:
        return
    # Dots are escaped so they only match literal dots in the host and suffix.
    searchPattern = re.compile(r'http://p1\.music\.126\.net/.*?\.jpg')
    found = searchPattern.search(htmlString)
    if found is None:
        # No cover address on the page; skip instead of raising IndexError.
        return
    urllib.urlretrieve(found.group(0), save_dir + songstr + '.jpg')
# Fill in the details of one song.
def setSongInfo(song):
    """Populate ``song`` from its web page and save its lyric and cover.

    The song page title is split on ``' - '``; the first field becomes
    ``song.name`` (UTF-8 encoded) and the second ``song.singer``. The two are
    joined into the file name stem passed to ``get_lyric_by_music_id`` and
    ``get_img``, whose return values are stored in ``song.lrc`` and
    ``song.img``.

    Args:
        song: record whose ``id`` is already set; modified in place.
    """
    url = BASE_URL + 'song?id=' + str(song.id)
    soup = BeautifulSoup(_session.get(url).content)
    strArr = soup.title.string.split(' - ')
    song.singer = strArr[1]
    song.name = strArr[0].encode('utf-8')
    songstr = strArr[0] + ' - ' + strArr[1]
    # Replace every character that is illegal in a Windows file name, not
    # only '/', so titles or singer lists containing them can still be saved.
    songstr = re.sub(r'[\\/:*?"<>|]', ' ', songstr)
    song.lrc = get_lyric_by_music_id(song.id, songstr)
    song.img = get_img(url, songstr)
# 获取符合条件的歌曲列表
def getSongList():
print ' ##正在爬取歌曲编号... ##'
songIdList = getSongIdList()
print ' ##爬取歌曲编号完成,共计爬取到' + str(len(songIdList)) + '首##'
songList=[]
for id in songIdList[0:]:
song = matchSong(id)
if None != song:
setSongInfo(song)
songList.append(song)
print '成功匹配一首{名称:', song.name, ' - ', song.singer, '}'
# print ' ##爬取完成,符合条件的的共计' + str(len(songList)) + '首##'
return songList
def main():
    """Run the playlist scrape; the returned song list is not used further."""
    getSongList()


if __name__ == '__main__':
    main()
运行过程
![运行过程截图](https://upload-images.jianshu.io/upload_images/13406307-a1dec35e4ec7df1b?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
结果
(这里我很怂的把歌手名去掉了因为斜杠的问题解决不掉)
![结果截图](https://upload-images.jianshu.io/upload_images/13406307-35414fe7ff93d3aa?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
迷之操作
格式工厂转图片格式 + 拖把更名器改名……
![操作截图](https://upload-images.jianshu.io/upload_images/13406307-5d1252d2187dcf6d?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
我想知道这个失败是什么鬼……
![操作截图](https://upload-images.jianshu.io/upload_images/13406307-98f58cef10adbe9c?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)