-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adopt WebDriverAbstract as a solution for active (JavaScript) websites (
#3971) * first working version --------- Co-authored-by: Dag <[email protected]>
- Loading branch information
Showing
7 changed files
with
473 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
<?php | ||
|
||
use Facebook\WebDriver\Exception\NoSuchElementException; | ||
use Facebook\WebDriver\Remote\RemoteWebElement; | ||
use Facebook\WebDriver\WebDriverBy; | ||
use Facebook\WebDriver\WebDriverExpectedCondition; | ||
|
||
/**
 * Feed of the project search results on gulp.de.
 *
 * The result list is rendered by JavaScript, so the bridge drives a real
 * browser through WebDriverAbstract, clicks through the result pages and
 * converts every <app-project-view> element into a feed item.
 */
class GULPProjekteBridge extends WebDriverAbstract
{
    const NAME = 'GULP Projekte';
    const URI = 'https://www.gulp.de/gulp2/g/projekte';
    const DESCRIPTION = 'Projektsuche';
    const MAINTAINER = 'hleskien';

    // Upper bound for the number of feed items collected across result pages.
    const MAXITEMS = 60;

    /**
     * Adds accept language german to the Chrome Options.
     *
     * @return Facebook\WebDriver\Chrome\ChromeOptions
     */
    protected function getBrowserOptions()
    {
        $chromeOptions = parent::getBrowserOptions();
        $chromeOptions->addArguments(['--accept-lang=de']);
        return $chromeOptions;
    }

    /**
     * Waits for the cookie banner, rejects all cookies and waits until the
     * reject button is no longer visible.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException
     * @throws Facebook\WebDriver\Exception\TimeoutException
     */
    protected function clickAwayCookieBanner()
    {
        $rejectButton = WebDriverBy::id('onetrust-reject-all-handler');

        $this->getDriver()->wait()->until(WebDriverExpectedCondition::visibilityOfElementLocated($rejectButton));
        $this->getDriver()->findElement($rejectButton)->click();
        $this->getDriver()->wait()->until(WebDriverExpectedCondition::invisibilityOfElementLocated($rejectButton));
    }

    /**
     * Clicks the "next page" link of the paginator and waits until the link
     * no longer carries the href it had before the click, which is used as
     * the signal that the paginator has been re-rendered.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException
     * @throws Facebook\WebDriver\Exception\TimeoutException
     */
    protected function clickNextPage()
    {
        $nextPage = $this->getDriver()->findElement(WebDriverBy::xpath('//app-linkable-paginator//li[@id="next-page"]/a'));
        $href = $nextPage->getAttribute('href');
        $nextPage->click();
        $this->getDriver()->wait()->until(WebDriverExpectedCondition::not(
            WebDriverExpectedCondition::presenceOfElementLocated(
                WebDriverBy::xpath('//app-linkable-paginator//li[@id="next-page"]/a[@href="' . $href . '"]')
            )
        ));
    }

    /**
     * Returns the uri of the 'Projektanbieter' logo or false if there is
     * no logo present in the item.
     *
     * @return string | false
     */
    protected function getLogo(RemoteWebElement $item)
    {
        try {
            $logo = $item->findElement(WebDriverBy::tagName('img'))->getAttribute('src');
        } catch (NoSuchElementException $e) {
            return false;
        }

        // An <img> without a usable src attribute is treated like a missing logo
        // instead of handing null to str_starts_with().
        if (!is_string($logo) || $logo === '') {
            return false;
        }

        if (str_starts_with($logo, 'http')) {
            // different domain
            return $logo;
        }

        // relative path: prepend self::URI up to and including its last slash
        return substr(self::URI, 0, strrpos(self::URI, '/') + 1) . $logo;
    }

    /**
     * Converts a string like "vor einigen Minuten" into a reasonable timestamp.
     * Long and complicated, but we don't want to be more specific than
     * the information we have available.
     *
     * The second word is taken as the quantity; "einem", "einer" and "einigen"
     * count as 1. Besides the quantity itself, the current value of every
     * smaller unit is subtracted as well, so the result is rounded down to the
     * precision of the unit that was given.
     *
     * @throws UnexpectedValueException If the text contains no known time unit.
     * @throws Exception If the DateInterval can't be parsed.
     */
    protected function getTimestamp(string $timeAgo): int
    {
        $dateTime = new DateTime();
        [$hours, $minutes, $seconds] = explode(' ', $dateTime->format('H i s'));

        // A text without a second word yields an empty quantity instead of an undefined offset.
        $quantityStr = explode(' ', $timeAgo)[1] ?? '';
        // convert possible word into a number
        $quantity = in_array($quantityStr, ['einem', 'einer', 'einigen'], true) ? 1 : intval($quantityStr);

        // subtract time ago + inferior units for lower precision
        $interval = match (true) {
            str_contains($timeAgo, 'Sekunde') => new DateInterval('PT' . $quantity . 'S'),
            str_contains($timeAgo, 'Minute') => new DateInterval('PT' . $quantity . 'M' . $seconds . 'S'),
            str_contains($timeAgo, 'Stunde') => new DateInterval('PT' . $quantity . 'H' . $minutes . 'M' . $seconds . 'S'),
            str_contains($timeAgo, 'Tag') => new DateInterval('P' . $quantity . 'DT' . $hours . 'H' . $minutes . 'M' . $seconds . 'S'),
            default => throw new UnexpectedValueException($timeAgo),
        };

        return $dateTime->sub($interval)->getTimestamp();
    }

    /**
     * Builds one feed item from a single <app-project-view> element.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException
     * @throws UnexpectedValueException If the "time ago" text can't be interpreted.
     */
    private function buildFeedItem(RemoteWebElement $item): FeedItem
    {
        $feedItem = new FeedItem();

        $heading = $item->findElement(WebDriverBy::xpath('.//app-heading-tag/h1/a'));
        $feedItem->setTitle($heading->getText());
        $feedItem->setURI('https://www.gulp.de' . $heading->getAttribute('href'));

        $info = $item->findElement(WebDriverBy::tagName('app-icon-info-list'));
        $logo = $this->getLogo($item);
        if ($logo) {
            $feedItem->setEnclosures([$logo]);
        }
        if (str_contains($info->getText(), 'Projektanbieter:')) {
            $feedItem->setAuthor($info->findElement(WebDriverBy::xpath('.//li/span[2]/span'))->getText());
        } else {
            // mostly "Direkt vom Auftraggeber" or "GULP Agentur"
            $feedItem->setAuthor($item->findElement(WebDriverBy::tagName('b'))->getText());
        }

        $feedItem->setContent($item->findElement(WebDriverBy::xpath('.//p[@class="description"]'))->getText());
        $timeAgo = $item->findElement(WebDriverBy::xpath('.//small[contains(@class, "time-ago")]'))->getText();
        $feedItem->setTimestamp($this->getTimestamp($timeAgo));

        return $feedItem;
    }

    /**
     * The main loop which clicks through search result pages and puts
     * the content into the $items array.
     *
     * Collection stops as soon as MAXITEMS items have been gathered. If
     * moving to the next page fails after at least one item was collected
     * (for example because there is no further page), the items gathered so
     * far are kept; if nothing was collected yet, the failure is rethrown.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException
     * @throws Facebook\WebDriver\Exception\TimeoutException
     */
    public function collectData()
    {
        parent::collectData();

        try {
            $this->clickAwayCookieBanner();
            $this->setIcon($this->getDriver()->findElement(WebDriverBy::xpath('//link[@rel="shortcut icon"]'))->getAttribute('href'));

            while (true) {
                $items = $this->getDriver()->findElements(WebDriverBy::tagName('app-project-view'));
                foreach ($items as $item) {
                    // Enforce MAXITEMS as a real upper bound, also in the middle of a page.
                    if (count($this->items) >= self::MAXITEMS) {
                        break;
                    }
                    $this->items[] = $this->buildFeedItem($item);
                }

                if (count($this->items) >= self::MAXITEMS) {
                    break;
                }

                try {
                    $this->clickNextPage();
                } catch (NoSuchElementException | Facebook\WebDriver\Exception\TimeoutException $e) {
                    if (count($this->items) === 0) {
                        // Nothing collected at all: surface the failure.
                        throw $e;
                    }
                    // No further page could be reached: deliver what we have.
                    break;
                }
            }
        } finally {
            // Always release the browser session, even when scraping failed.
            $this->cleanUp();
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
<?php | ||
|
||
use Facebook\WebDriver\WebDriverBy; | ||
use Facebook\WebDriver\WebDriverExpectedCondition; | ||
|
||
/**
 * Feed of the articles listed on the first page of the Scalable Capital blog.
 *
 * The article list is loaded by JavaScript, so the bridge drives a real
 * browser through WebDriverAbstract and waits for the list to be rendered
 * before reading it.
 */
class ScalableCapitalBlogBridge extends WebDriverAbstract
{
    const NAME = 'Scalable Capital Blog';
    const URI = 'https://de.scalable.capital/blog';
    const DESCRIPTION = 'Alle Artikel';
    const MAINTAINER = 'hleskien';

    /**
     * Adds accept language german to the Chrome Options.
     *
     * @return Facebook\WebDriver\Chrome\ChromeOptions
     */
    protected function getBrowserOptions()
    {
        $chromeOptions = parent::getBrowserOptions();
        $chromeOptions->addArguments(['--accept-lang=de']);
        return $chromeOptions;
    }

    /**
     * Puts the content of the first page into the $items array.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException
     * @throws Facebook\WebDriver\Exception\TimeoutException
     */
    public function collectData()
    {
        parent::collectData();

        // Image and article links are prefixed with this origin.
        $baseUrl = 'https://de.scalable.capital';
        // Locates every article teaser inside the article list.
        $itemXpath = '//div[contains(@class, "articles")]//div[@class="items"]//div[contains(@class, "item")]';

        try {
            // wait until last item is loaded (the 15th teaser is used as the marker)
            $this->getDriver()->wait()->until(WebDriverExpectedCondition::visibilityOfElementLocated(
                WebDriverBy::xpath($itemXpath . '[15]')
            ));
            $this->setIcon($this->getDriver()->findElement(WebDriverBy::xpath('//link[@rel="shortcut icon"]'))->getAttribute('href'));

            $items = $this->getDriver()->findElements(WebDriverBy::xpath($itemXpath));
            foreach ($items as $item) {
                $feedItem = new FeedItem();

                $feedItem->setEnclosures([$baseUrl . $item->findElement(WebDriverBy::tagName('img'))->getAttribute('src')]);
                $heading = $item->findElement(WebDriverBy::tagName('a'));
                $feedItem->setTitle($heading->getText());
                $feedItem->setURI($baseUrl . $heading->getAttribute('href'));
                $feedItem->setContent($item->findElement(WebDriverBy::xpath('.//div[@class="summary"]'))->getText());
                $date = $item->findElement(WebDriverBy::xpath('.//div[@class="published-date"]'))->getText();
                $feedItem->setTimestamp($this->formatItemTimestamp($date));
                $feedItem->setAuthor($item->findElement(WebDriverBy::xpath('.//div[@class="author"]'))->getText());

                $this->items[] = $feedItem;
            }
        } finally {
            // Always release the browser session, even when scraping failed.
            $this->cleanUp();
        }
    }

    /**
     * Converts a German-formatted date string into a timestamp.
     *
     * The long date style of the 'de' locale (e.g. "5. Januar 2024") is tried
     * first; if that fails, the medium, purely numeric style
     * (e.g. "05.01.2024") is tried as a fallback.
     *
     * @param $value string
     * @return int|float|false The parsed timestamp, or false if the value
     *                         matches neither date style.
     */
    protected function formatItemTimestamp($value)
    {
        foreach ([IntlDateFormatter::LONG, IntlDateFormatter::MEDIUM] as $dateStyle) {
            $formatter = new IntlDateFormatter('de', $dateStyle, IntlDateFormatter::NONE);
            $timestamp = $formatter->parse($value);
            if ($timestamp !== false) {
                return $timestamp;
            }
        }

        return false;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
`WebDriverAbstract` extends [`BridgeAbstract`](./02_BridgeAbstract.md) and adds functionality for generating feeds | ||
from active websites that use XMLHttpRequest (XHR) to load content and / or JavaScript to | ||
modify content. | ||
It relies heavily on the php-webdriver library, which provides Selenium WebDriver bindings for PHP. | ||
|
||
- https://github.com/php-webdriver/php-webdriver (Project Repository) | ||
- https://php-webdriver.github.io/php-webdriver/latest/ (API) | ||
|
||
Please note that this class is intended as a solution for websites _that cannot be covered | ||
by the other classes_. The WebDriver starts a browser and is therefore very resource-intensive. | ||
|
||
# Configuration | ||
|
||
You need a running WebDriver to use bridges that depend on `WebDriverAbstract`. | ||
The easiest way is to start the Selenium server from the project of the same name: | ||
``` | ||
docker run -d -p 4444:4444 --shm-size="2g" docker.io/selenium/standalone-chrome:latest | ||
``` | ||
|
||
- https://github.com/SeleniumHQ/docker-selenium | ||
|
||
With these parameters only one browser window can be started at a time. | ||
On a multi-user site, Selenium Grid should be used | ||
and the number of sessions should be adjusted to the number of processor cores. | ||
|
||
Finally, the `config.ini.php` file must be adjusted so that the WebDriver | ||
can find the Selenium server: | ||
``` | ||
[webdriver] | ||
selenium_server_url = "http://localhost:4444" | ||
``` | ||
|
||
# Development | ||
|
||
While you are developing a new bridge, it is easier to start a local WebDriver, because you can then watch what is happening and see where errors occur. Recording the browser session as a screen video has also proven helpful for tracking down timing problems. | ||
|
||
``` | ||
chromedriver --port=4444 | ||
``` | ||
|
||
- https://chromedriver.chromium.org/ | ||
|
||
If you run rss-bridge in a container, ChromeDriver is only reachable from it | ||
if you start ChromeDriver with the `--allowed-ips` option, so that it binds to all network interfaces. | ||
|
||
``` | ||
chromedriver --port=4444 --allowed-ips=192.168.1.42 | ||
``` | ||
|
||
The **most important rule** is that after an event such as loading the web page | ||
or pressing a button, you often have to explicitly wait for the desired elements to appear. | ||
|
||
A simple example is the bridge `ScalableCapitalBlogBridge.php`. | ||
A more complex and relatively complete example is the bridge `GULPProjekteBridge.php`. | ||
|
||
# Template | ||
|
||
Use this template to create your own bridge. | ||
|
||
```PHP | ||
<?php | ||
|
||
// Skeleton for a bridge that scrapes a JavaScript-driven website through WebDriverAbstract. | ||
class MyBridge extends WebDriverAbstract | ||
{ | ||
// Bridge metadata: replace the placeholder values below with those of your bridge. | ||
const NAME = 'My Bridge'; | ||
const URI = 'https://www.example.org'; | ||
const DESCRIPTION = 'Further description'; | ||
const MAINTAINER = 'your name'; | ||
|
||
// Entry point that fills the feed; put the scraping logic where the TODO is. | ||
public function collectData() | ||
{ | ||
// NOTE(review): presumably starts the browser session and opens URI - confirm in WebDriverAbstract. | ||
parent::collectData(); | ||
|
||
try { | ||
// TODO | ||
} finally { | ||
// Runs even if the scraping code above throws. | ||
// NOTE(review): cleanUp() presumably closes the browser session - confirm in WebDriverAbstract. | ||
$this->cleanUp(); | ||
} | ||
} | ||
} | ||
|
||
``` |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.