You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
81 lines
2.8 KiB
81 lines
2.8 KiB
import bs4
|
|
import time
|
|
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
from selenium.webdriver.common.by import By
|
|
from .browser import new_browser
|
|
from const import *
|
|
|
|
|
|
def process_scraped_data(artist, res_value):
    """Turn scraped HTML into a list of album records.

    Every ``<a>`` element found in ``res_value`` produces one record, whether
    or not it carries album details.

    Args:
        artist: Artist name; stored title-cased under the ``'artist'`` key.
        res_value: HTML markup to parse with the ``html.parser`` backend.

    Returns:
        A list of dicts. Each dict always has ``'artist'``, ``'downloaded'``
        (False) and ``'downloading'`` (False). It additionally has:

        * ``'link'``  - the anchor's ``href``, when the attribute exists;
        * ``'cover'`` - the ``src`` of the first ``<img>`` inside the anchor,
          when present;
        * ``'album'`` - text of the ``<div id="card-title">`` inside the
          anchor, with newlines removed and ``/`` replaced by ``-``.
    """
    html = bs4.BeautifulSoup(res_value, features="html.parser")
    artist_name = artist.title()

    albums_data_list = []
    for album in html.find_all('a'):
        album_data = {
            'artist': artist_name,
            'downloaded': False,
            'downloading': False,
        }

        # has_attr() is the supported replacement for the deprecated
        # Tag.has_key(); both test for the presence of the attribute.
        if album.has_attr('href'):
            album_data['link'] = album['href']

        album_image = album.find('img')
        if album_image is not None and album_image.has_attr('src'):
            album_data['cover'] = album_image['src']

        album_title = album.find('div', {'id': 'card-title'})
        if album_title is not None:
            title_text = album_title.text.replace('\n', '').replace('/', '-')
            album_data['album'] = title_text

        albums_data_list.append(album_data)

    return albums_data_list
|
|
|
|
|
|
def scrape(artist):
    """Collect album-shelf HTML for ``artist`` using a headless browser.

    Opens ``QUERY_URL + artist``, waits up to 10 seconds for the
    ``shelf-container`` div, then snapshots the ``outerHTML`` of the element
    at ``ALBUM_CONTAINER_ITEMS_XPATH``. It then repeatedly runs
    ``click_script`` and takes another snapshot, until the element at
    ``BTN_RIGHT_FULL_XPATH`` is no longer displayed or the iteration cap is
    reached.

    Args:
        artist: Text appended verbatim to ``QUERY_URL`` (no URL-encoding is
            applied here).

    Returns:
        str: All ``outerHTML`` snapshots concatenated in capture order.

    Raises:
        Any exception raised by the browser calls (for example Selenium's
        ``TimeoutException`` when the shelf container does not appear in
        time) propagates to the caller. The browser is quit in every case.
    """
    # Upper bound on click/snapshot rounds so a button that never hides
    # cannot keep the loop running forever.
    max_rounds = 6

    browser = new_browser(headless=True)
    snapshots = []
    try:
        # All browser interaction lives inside the try block so the browser
        # is also shut down when page setup or navigation fails.
        browser.maximize_window()
        browser.implicitly_wait(1)
        browser.get(QUERY_URL + artist)
        browser.execute_script("window.scrollTo(0, 500);")
        time.sleep(1)

        # Block until the shelf container exists in the DOM (10 s timeout).
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@id="shelf-container"]')
            )
        )

        element = browser.find_element(By.XPATH, ALBUM_CONTAINER_ITEMS_XPATH)
        time.sleep(1)
        snapshots.append(element.get_attribute('outerHTML'))

        btn_right = browser.find_element(By.XPATH, BTN_RIGHT_FULL_XPATH)
        for _ in range(max_rounds):
            time.sleep(1)
            # click_script is expected to be provided at module level
            # (presumably via the star import from const - TODO confirm).
            browser.execute_script(click_script)
            time.sleep(1)

            # Re-query each round; the container may have been re-rendered.
            element = browser.find_element(
                By.XPATH, ALBUM_CONTAINER_ITEMS_XPATH
            )
            snapshots.append(element.get_attribute('outerHTML'))
            time.sleep(1)

            still_visible = btn_right.is_displayed()
            # NOTE(review): the nesting of this final pause could not be
            # determined from the original layout; kept per round so the
            # function never waits less than before.
            time.sleep(1)
            if not still_visible:
                break
    finally:
        # Always release the browser, on success and on failure alike.
        browser.quit()

    return ''.join(snapshots)
|