You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
200 lines
8.8 KiB
200 lines
8.8 KiB
<?php
|
|
|
|
namespace App\Models;
|
|
|
|
use App\Utils\ImageUrl;
|
|
use Facebook\WebDriver\Remote\RemoteWebDriver;
|
|
use Facebook\WebDriver\WebDriverBy;
|
|
use Facebook\WebDriver\WebDriverAction;
|
|
use Facebook\WebDriver\WebDriverExpectedCondition;
|
|
|
|
class WebScraper
|
|
{
|
|
/**
|
|
* Fallback scrape option for the youtube music search page; in some cases there are no additional artists available
|
|
*
|
|
* @return RemoteWebDriver
|
|
*/
|
|
public static function scrapeArtist($driver)
|
|
{
|
|
$response = [];
|
|
$artistContainer = $driver->findElement(WebDriverBy::cssSelector('.main-card-content-container'));
|
|
$artistThumbnail = $artistContainer->findElement(WebDriverBy::cssSelector('img'))->getAttribute('src');
|
|
$artistLink = $artistContainer->findElements(WebDriverBy::cssSelector('a'));
|
|
$artistHref = $artistLink[0]->getAttribute('href');
|
|
$artistName = $artistLink[0]->getAttribute('title');
|
|
|
|
// Resize image and save to file, provide path to data
|
|
$imageUrl = ImageUrl::modifyGoogleImageUrl($artistThumbnail);
|
|
$imageFileUrl = ImageUrl::save_img_url($imageUrl, 'artist');
|
|
|
|
$data = [
|
|
'name' => $artistName,
|
|
'thumbnail' => $artistThumbnail,
|
|
'url_remote' => $artistHref,
|
|
'image' => $imageFileUrl,
|
|
];
|
|
$artist_id = Artist::findOrCreateByName($artistName, $data);
|
|
return $artist_id->read();
|
|
}
|
|
|
|
/**
|
|
* The first scrape that is attempted; this will return the artists and similar artists per youtube so we can return
|
|
* the users search with additional suggestions, or a list of suggestions if their exact search isn't found.
|
|
*
|
|
* @return RemoteWebDriver
|
|
*/
|
|
public static function scrapeArtists($driver)
|
|
{
|
|
$response = [];
|
|
// Click the artist button to force a "structure" of results
|
|
$artistBtnXpath = '//a[@title="Show artist results"]';
|
|
$driver->wait(10, 500)->until(
|
|
WebDriverExpectedCondition::visibilityOfElementLocated(WebDriverBy::xpath($artistBtnXpath))
|
|
);
|
|
$driver->findElement(WebDriverBy::xpath($artistBtnXpath))->click();
|
|
// Youtube has multiple elements with the same ID (Naughty!). We will give a reasonable analog time to render.
|
|
sleep(5);
|
|
|
|
$contentDivs = $driver->findElements(WebDriverBy::cssSelector('#contents'));
|
|
$divCount = 0;
|
|
foreach ($contentDivs as $content) {
|
|
$divCount += 1;
|
|
$artists = $content->findElements(WebDriverBy::xpath('//ytmusic-responsive-list-item-renderer'));
|
|
if ($artists) {
|
|
$resultCap = 6;
|
|
$resultIndex = 0;
|
|
foreach ($artists as $artist) {
|
|
// There are a bunch of elements with no text in them; just a quick and dirty filter
|
|
$hasText = $artist->getText();
|
|
if ($hasText) {
|
|
$resultIndex += 1;
|
|
// Artist Details
|
|
$artistThumbnail = $artist->findElement(WebDriverBy::cssSelector('img'))->getAttribute('src');
|
|
$artistLink = $artist->findElements(WebDriverBy::cssSelector('a'));
|
|
$artistHref = $artistLink[0]->getAttribute('href');
|
|
$artistName = $artistLink[0]->getAttribute('aria-label');
|
|
|
|
// Resize image and save to file, provide path to data
|
|
$imageUrl = ImageUrl::modifyGoogleImageUrl($artistThumbnail);
|
|
$imageFileUrl = ImageUrl::save_img_url($imageUrl, 'artist');
|
|
|
|
// Create if we don't have it yet
|
|
$data = [
|
|
'name' => $artistName,
|
|
'thumbnail' => $artistThumbnail,
|
|
'url_remote' => $artistHref,
|
|
'image' => $imageFileUrl,
|
|
];
|
|
$artist_id = Artist::findOrCreateByName($artistName, $data);
|
|
$response[] = $artist_id->read();
|
|
// Limit the results, there are alot of them
|
|
if ($resultCap <= $resultIndex) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
// There are 4 div#contents returned, one empty and 3 with duplicated info
|
|
break;
|
|
}
|
|
}
|
|
|
|
return $response;
|
|
}
|
|
|
|
public static function processAlbums($albumContainer, $artist)
|
|
{
|
|
$albumLink = $albumContainer->findElement(WebDriverBy::cssSelector('a'));
|
|
$albumHref = $albumLink->getAttribute('href');
|
|
$albumTitle = $albumLink->getAttribute('title');
|
|
$albumThumbnail = $albumLink->findElement(WebDriverBy::cssSelector('img'))->getAttribute('src');
|
|
|
|
// Resize image and save to file, provide path to data
|
|
$imageUrl = ImageUrl::modifyGoogleImageUrl($albumThumbnail);
|
|
$imageFileUrl = ImageUrl::save_img_url($imageUrl, 'album');
|
|
|
|
$data = [
|
|
'name' => $albumTitle,
|
|
'artist_id' => $artist->id,
|
|
'thumbnail' => $albumThumbnail,
|
|
'url_remote' => $albumHref,
|
|
'image' => $imageFileUrl,
|
|
];
|
|
$album_id = Album::findOrCreateByName($artist, $albumTitle, $data);
|
|
AlbumQueue::addQueue($album_id);
|
|
}
|
|
|
|
/**
|
|
* Scrape the album data from given artist page, create new album records and queue those records for download
|
|
*
|
|
* @return RemoteWebDriver
|
|
*/
|
|
public static function scrapeAlbums($driver, $artist_id)
|
|
{
|
|
$url = 'https://music.youtube.com/' . $artist_id->url_remote;
|
|
$driver->get($url);
|
|
$response = 0;
|
|
try {
|
|
\Log::info('Looking for Albums button..');
|
|
$albumBtn = $driver->findElements(WebDriverBy::xpath('//a[text()="Albums"]'));
|
|
if ($albumBtn) {
|
|
\Log::info('Clicking on located Albums button..');
|
|
$albumBtn[0]->click();
|
|
sleep(3);
|
|
$itemsContainer = $driver->findElements(WebDriverBy::cssSelector('#items'));
|
|
foreach ($itemsContainer as $item) {
|
|
$albumContainers = $item->findElements(WebDriverBy::cssSelector('.ytmusic-grid-renderer'));
|
|
if ($albumContainers) {
|
|
foreach ($albumContainers as $albumContainer) {
|
|
$response += 1;
|
|
WebScraper::processAlbums($albumContainer, $artist_id);
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
\Log::info('Could not locate Albums button');
|
|
$ytRows = $driver->findElements(WebDriverBy::cssSelector('ytmusic-carousel-shelf-renderer'));
|
|
foreach ($ytRows as $ytRow) {
|
|
$contentGroup = $ytRow->findElements(WebDriverBy::cssSelector('#content-group'));
|
|
foreach ($contentGroup as $group) {
|
|
$groupName = $group->getText();
|
|
if ($groupName == 'Albums') {
|
|
|
|
// Sometimes we don't have the option to click the albums button to filter
|
|
// Yet, the albums are in a carousel and the images won't load unless they are in view
|
|
$caroselNextButton = $driver->findElements(WebDriverBy::cssSelector('#next-items-button'));
|
|
if ($caroselNextButton) {
|
|
// Youtube is smart enough to block this without an action
|
|
for ($i = 0; $i <= 3; $i++) {
|
|
if ($caroselNextButton[0]->isEnabled()) {
|
|
$action = $driver->action();
|
|
$action->moveToElement($caroselNextButton[0])->click()->perform();
|
|
sleep(1);
|
|
}
|
|
}
|
|
}
|
|
|
|
$itemsContainer = $ytRow->findElements(WebDriverBy::cssSelector('#items'));
|
|
foreach ($itemsContainer as $item) {
|
|
$albumContainers = $item->findElements(WebDriverBy::cssSelector('ytmusic-two-row-item-renderer'));
|
|
if ($albumContainers) {
|
|
foreach ($albumContainers as $albumContainer) {
|
|
WebScraper::processAlbums($albumContainer, $artist_id);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
} catch (\Exception $e) {
|
|
\Log::warning('Failed to scrape albums: ---------');
|
|
\Log::warning($e->getMessage());
|
|
}
|
|
|
|
return $response;
|
|
}
|
|
}
|