You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
getDiscography/utils/processor.py

136 lines
4.7 KiB

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import os
import bs4
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from .browser import new_browser
from .yt_dlp_logger import *
from const import *
import time
import wget
import yt_dlp
def download_file(url, output):
filename = wget.download(url, out=output)
os.rename(filename, output + '/album.jpg')
return filename
def process_download(artist):
artist = artist.title()
# Initialize a new browser object to go collect the data we need
browser = new_browser(headless=True)
url = QUERY_URL + artist
browser.maximize_window()
browser.implicitly_wait(1)
response = browser.get(url)
last_height = browser.execute_script("return document.body.scrollHeight")
browser.execute_script("window.scrollTo(0, 500);")
time.sleep(1)
res_value = ''
try:
# get the financial value when it's populated to the page
value_element = WebDriverWait(browser, 5).until(
EC.presence_of_element_located(locator=(By.XPATH, '//div[@id="shelf-container"]'))
)
element = browser.find_element(By.XPATH, ALBUM_CONTAINER_ITEMS_XPATH)
if element:
time.sleep(1)
res_value += element.get_attribute('outerHTML')
btn_right = browser.find_element(By.XPATH, BTN_RIGHT_FULL_XPATH)
btn_right_displayed = True
safety_index = 0
while btn_right_displayed:
# actions = ActionChains(browser)
# actions.move_to_element(btn_right).perform()
safety_index += 1
time.sleep(1)
browser.execute_script(click_script)
time.sleep(1)
element = browser.find_element(By.XPATH, ALBUM_CONTAINER_ITEMS_XPATH)
res_value += element.get_attribute('outerHTML')
time.sleep(1)
btn_right_displayed = btn_right.is_displayed()
if safety_index > 5:
btn_right_displayed = False
time.sleep(1)
finally:
# after 5 seconds, give up
browser.quit()
# Process the gathered HTML data for list of data
html = bs4.BeautifulSoup(res_value, features="html.parser")
albums = html.find_all('a')
albums_data_list = []
for album in albums:
album_data = {'artist': artist.title()}
if album.has_key('href'):
album_data.update({'link': album['href']})
album_image = album.find('img')
if album_image and album_image.has_key('src'):
album_data.update({'cover': album_image['src']})
album_title = album.find('div', {'id': 'card-title'})
if album_title and hasattr(album_title, 'text'):
album_data.update({'album': album_title.text.replace('\n', '').replace('/', '-')})
albums_data_list.append(album_data)
processed_albums_data_list = []
processed_album_names = []
# Eliminate duplicate entries:
for item in albums_data_list:
if item.get('cover') and item.get('album') not in processed_album_names:
processed_albums_data_list.append(item)
processed_album_names.append(item.get('album'))
#===:= Download the albums ===#
# Create Artist folder/path
artist_path = MEDIA_FOLDER + '/%s' % artist
if not os.path.exists(artist_path):
os.mkdir(artist_path)
for item in processed_albums_data_list:
print('---')
# Create album folder
album = item.get('album')
album_path = artist_path + '/%s' % album
if not os.path.exists(album_path):
os.mkdir(album_path)
# Save album cover
if item.get('cover'):
try:
download_file(item.get('cover'), album_path)
except Exception as e:
print("Warning: %s" % e)
# Download album
print(item)
ydl_opts = {
'logger': MyLogger(),
'progress_hooks': [my_hook],
'format': 'mp3/bestaudio/best',
'outtmpl': album_path + '/%(title)s.%(ext)s',
# See help(yt_dlp.postprocessor) for a list of available Postprocessors and their arguments
'postprocessors': [{ # Extract audio using ffmpeg
'key': 'FFmpegExtractAudio',
'preferredcodec': 'mp3',
}]
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
try:
error_code = ydl.download('https://youtube.com' + item.get('link'))
except Exception as e:
print('!!!!!!!!!')
print(e)
return processed_albums_data_list