Download all mp3 files from URLs
version 1: construct the URLs directly
```python
import requests


def save_file(url, response, path=r'./'):
    mp3_file_name = url.split('/')[-1]
    assert mp3_file_name.endswith('.mp3')
    with open(path + mp3_file_name, 'wb') as f:
        f.write(response.content)
    print('successfully downloaded ' + mp3_file_name)


if __name__ == '__main__':
    for unit in range(1, 31):  # download mp3 from unit 1 to unit 30
        mp3_url = "http://downloads.bbc.co.uk/learningenglish/lowerintermediate/unit{}/u{}_6min_vocab.mp3".format(
            unit, unit)
        r = requests.get(mp3_url)  # HTTP GET
        try:
            r.raise_for_status()  # raise an HTTPError on a bad response (e.g. 404)
        except requests.exceptions.HTTPError as err:
            print(err)
            continue  # skip this unit and move on to the next
        save_file(mp3_url, r)
```
explanation
Requests: a third-party HTTP library with better support for secure connections. Prefer requests over the standard library's urllib.
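A minimal side-by-side sketch of the same download with both libraries (the URL is the unit-1 file from version 1; both snippets fetch the same bytes):

```python
# fetch one mp3 with urllib vs. requests (illustrative sketch)
from urllib.request import urlopen

import requests

url = ('http://downloads.bbc.co.uk/learningenglish/lowerintermediate/'
       'unit1/u1_6min_vocab.mp3')

# urllib: urlopen itself raises urllib.error.HTTPError on a bad status
with urlopen(url) as resp:
    data = resp.read()

# requests: an explicit, readable status check, then .content for the bytes
r = requests.get(url)
r.raise_for_status()
data = r.content
```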
pip install

```shell
pip3 install requests beautifulsoup4
```
defect
Some of the file URLs do not follow the fixed naming pattern, so they cannot be constructed directly; the next version parses the download page instead.
version 2: parse HTML to get the URLs
```python
import requests
import bs4
import re


def parseHTML(page_url):
    file_url_list = []
    r = requests.get(page_url)
    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError as err:
        raise SystemExit(err)  # abort: the download page itself is unreachable
    bs = bs4.BeautifulSoup(r.text, 'html.parser')
    for tag in bs.find_all(href=re.compile(r'http://downloads.bbc.co.uk/')):
        file_url_list.append(tag.get('href'))
    return file_url_list


def download_from_url(file_url):
    print('start downloading from ' + file_url)
    r = requests.get(file_url)  # HTTP GET
    try:
        r.raise_for_status()  # raise an HTTPError on a bad response (e.g. 404)
    except requests.exceptions.HTTPError as err:
        print(err)
        return  # skip saving when the request failed
    # save mp3 to disk
    file_name = file_url.split('/')[-1]
    save_file(file_name, r)


def save_file(file_name, response, path=r'./'):
    with open(path + file_name, 'wb') as f:
        f.write(response.content)
    print('successfully downloaded ' + file_name)


if __name__ == '__main__':
    for unit in range(1, 3):  # units 1-2 only; use range(1, 31) for all 30 units
        # replace download_page_url with your own course page
        download_page_url = "https://www.bbc.co.uk/learningenglish/english/course/lower-intermediate/unit-{}/downloads".format(
            unit)
        for url in parseHTML(download_page_url):
            download_from_url(url)
```
version 3: class version
```python
import requests
import bs4
import re


class parseAndDownload:
    def __init__(self, page_url) -> None:
        self._page_url = page_url

    def run(self):
        for download_url in self._parseHTML():
            self._download_from_url(download_url)

    def _parseHTML(self):
        file_url_list = []
        r = requests.get(self._page_url)
        try:
            r.raise_for_status()
        except requests.exceptions.HTTPError as err:
            raise SystemExit(err)  # abort: the download page itself is unreachable
        bs = bs4.BeautifulSoup(r.text, 'html.parser')
        # TODO: find a better way to find the download urls?
        for tag in bs.find_all(href=re.compile(r'http://downloads.bbc.co.uk/')):
            file_url_list.append(tag.get('href'))
        return file_url_list

    def _download_from_url(self, file_url):
        print('start downloading from ' + file_url)
        r = requests.get(file_url)  # HTTP GET
        try:
            r.raise_for_status()  # raise an HTTPError on a bad response (e.g. 404)
        except requests.exceptions.HTTPError as err:
            print(err)
            return  # skip saving when the request failed
        # save mp3 to disk
        file_name = file_url.split('/')[-1]
        with open(file_name, 'wb') as f:
            f.write(r.content)
        print('successfully downloaded ' + file_name)


if __name__ == '__main__':
    # download mp3 from unit 1 to unit 30
    for unit in range(1, 31):
        # replace download_page_url with your own course page
        download_page_url = "https://www.bbc.co.uk/learningenglish/english/course/lower-intermediate/unit-{}/downloads".format(
            unit)
        a = parseAndDownload(download_page_url)
        a.run()
```
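One possible answer to the TODO in _parseHTML (a sketch using BeautifulSoup's CSS selector support, not code from the original post): select only anchor tags whose href starts with the downloads host, instead of matching a regex against every href attribute on the page.

```python
# sketch: a CSS attribute selector as a drop-in replacement for _parseHTML
def _parseHTML(self):
    r = requests.get(self._page_url)
    r.raise_for_status()
    bs = bs4.BeautifulSoup(r.text, 'html.parser')
    # <a> tags whose href starts with the downloads host
    return [a['href'] for a in bs.select('a[href^="http://downloads.bbc.co.uk/"]')]
```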
version 4: multi-thread
```python
import requests
import bs4
import re
import threading


class parseAndDownload:
    def __init__(self, page_url) -> None:
        self._page_url = page_url

    def run(self):
        for download_url in self._parseHTML():
            self._download_from_url(download_url)

    def _parseHTML(self):
        file_url_list = []
        r = requests.get(self._page_url)
        try:
            r.raise_for_status()
        except requests.exceptions.HTTPError as err:
            raise SystemExit(err)  # abort: the download page itself is unreachable
        bs = bs4.BeautifulSoup(r.text, 'html.parser')
        # TODO: find a better way to find the download urls?
        for tag in bs.find_all(href=re.compile(r'http://downloads.bbc.co.uk/')):
            file_url_list.append(tag.get('href'))
        return file_url_list

    def _download_from_url(self, file_url):
        print('start downloading from ' + file_url)
        # send a browser-like User-Agent to avoid remote disconnects
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:101.0) Gecko/20100101 Firefox/101.0'}
        r = requests.get(file_url, headers=headers)  # HTTP GET
        try:
            r.raise_for_status()  # raise an HTTPError on a bad response (e.g. 404)
        except requests.exceptions.HTTPError as err:
            print(err)
            return  # skip saving when the request failed
        # save mp3 to disk
        file_name = file_url.split('/')[-1]
        with open(file_name, 'wb') as f:
            f.write(r.content)
        print('successfully downloaded ' + file_name)


if __name__ == '__main__':
    # download mp3 from unit 1 to unit 30
    download_threads = []
    for unit in range(1, 31):
        # replace download_page_url with your own course page
        download_page_url = "https://www.bbc.co.uk/learningenglish/english/course/lower-intermediate/unit-{}/downloads".format(
            unit)
        a = parseAndDownload(download_page_url)
        # create a new thread (note: a.run() is called here -- see the note below)
        thread = threading.Thread(target=a.run())
        download_threads.append(thread)
        thread.start()
    # wait for all threads to finish
    for thread in download_threads:
        thread.join()
    print('\nAll files were downloaded!')
```
no multi-thread at all
Why does threading.Thread() behave just like single-threaded code here? Because target=a.run() calls a.run() immediately in the main thread and passes its return value (None) as the thread target, so every thread that gets started has nothing to do. The fix is to pass the bound method itself, without calling it.
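A minimal sketch of the corrected loop body (not code from the original post):

```python
# pass the callable itself; Thread invokes it in the new thread
thread = threading.Thread(target=a.run)  # no parentheses after run
download_threads.append(thread)
thread.start()
```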
http.client.RemoteDisconnected: Remote end closed connection without response
- While downloading, network problems frequently cause HTTP disconnects like the one above.
- The workaround I came up with: just print the URLs and import them into a download manager.
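Another common mitigation (a sketch with illustrative retry settings, not part of the original workaround) is a requests Session that retries transient failures with backoff:

```python
# sketch: retry transient failures (including dropped connections) with backoff
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retry = Retry(total=3, backoff_factor=1,
              status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))

# use session.get() wherever the versions above call requests.get()
r = session.get('http://downloads.bbc.co.uk/learningenglish/lowerintermediate/'
                'unit1/u1_6min_vocab.mp3', timeout=30)
```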
version 5: Actor model - a clearer multi-threaded programming model
TODO:
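This version is still a TODO. As a starting point, here is a minimal actor-style sketch (my assumption of the intended design, not the author's code): each actor is a thread with a private mailbox and communicates only through messages, which avoids shared mutable state between threads.

```python
# sketch: actor-style downloaders built on threading + queue.Queue
import queue
import threading


class DownloadActor(threading.Thread):
    _STOP = object()  # sentinel message telling the actor to exit

    def __init__(self):
        super().__init__()
        self._mailbox = queue.Queue()

    def send(self, page_url):
        self._mailbox.put(page_url)  # the only way to talk to the actor

    def stop(self):
        self._mailbox.put(self._STOP)

    def run(self):
        while True:
            msg = self._mailbox.get()
            if msg is self._STOP:
                break
            parseAndDownload(msg).run()  # reuse the class from version 3/4


if __name__ == '__main__':
    actors = [DownloadActor() for _ in range(4)]
    for actor in actors:
        actor.start()
    for unit in range(1, 31):
        url = "https://www.bbc.co.uk/learningenglish/english/course/lower-intermediate/unit-{}/downloads".format(
            unit)
        actors[unit % len(actors)].send(url)  # round-robin dispatch
    for actor in actors:
        actor.stop()
        actor.join()
```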
ref
- https://requests.readthedocs.io/en/latest/