Skip to content

Commit

Permalink
Adopt WebDriverAbstract as a solution for active (JavaScript) websites (
Browse files Browse the repository at this point in the history
#3971)

* first working version

---------

Co-authored-by: Dag <[email protected]>
  • Loading branch information
hleskien and dvikan authored Feb 10, 2024
1 parent ff7840d commit 8e8028b
Show file tree
Hide file tree
Showing 7 changed files with 473 additions and 1 deletion.
164 changes: 164 additions & 0 deletions bridges/GULPProjekteBridge.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
<?php

use Facebook\WebDriver\Exception\NoSuchElementException;
use Facebook\WebDriver\Remote\RemoteWebElement;
use Facebook\WebDriver\WebDriverBy;
use Facebook\WebDriver\WebDriverExpectedCondition;

/**
 * Bridge for the project search on gulp.de.
 *
 * The result list is rendered client-side, so the bridge drives a real
 * browser through WebDriverAbstract: it dismisses the cookie banner, reads
 * every project card on the current result page and clicks through the
 * paginator until enough items were collected or no further page exists.
 */
class GULPProjekteBridge extends WebDriverAbstract
{
    const NAME = 'GULP Projekte';
    const URI = 'https://www.gulp.de/gulp2/g/projekte';
    const DESCRIPTION = 'Projektsuche';
    const MAINTAINER = 'hleskien';

    // Paging stops once at least this many items were collected. Whole pages
    // are appended, so the final count may exceed this value.
    const MAXITEMS = 60;

    // Locator of the paginator's "next page" link.
    const NEXT_PAGE_XPATH = '//app-linkable-paginator//li[@id="next-page"]/a';

    /**
     * Adds accept language german to the Chrome Options.
     *
     * @return Facebook\WebDriver\Chrome\ChromeOptions
     */
    protected function getBrowserOptions()
    {
        $chromeOptions = parent::getBrowserOptions();
        $chromeOptions->addArguments(['--accept-lang=de']);
        return $chromeOptions;
    }

    /**
     * Waits for the cookie banner's "reject all" button, clicks it and waits
     * until the button is no longer visible.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException
     * @throws Facebook\WebDriver\Exception\TimeoutException
     */
    protected function clickAwayCookieBanner()
    {
        $rejectButton = WebDriverBy::id('onetrust-reject-all-handler');

        $this->getDriver()->wait()->until(WebDriverExpectedCondition::visibilityOfElementLocated($rejectButton));
        $buttonRejectCookies = $this->getDriver()->findElement($rejectButton);
        $buttonRejectCookies->click();
        $this->getDriver()->wait()->until(WebDriverExpectedCondition::invisibilityOfElementLocated($rejectButton));
    }

    /**
     * Tells whether the paginator currently offers a "next page" link.
     *
     * Uses findElements(), which yields an empty list instead of throwing
     * when nothing matches, so the last result page ends paging cleanly.
     *
     * @return bool
     */
    protected function hasNextPage(): bool
    {
        $links = $this->getDriver()->findElements(WebDriverBy::xpath(self::NEXT_PAGE_XPATH));
        return count($links) > 0;
    }

    /**
     * Clicks the paginator's "next page" link and waits until the link no
     * longer carries the href it had before the click, i.e. until the
     * paginator was re-rendered for the following page.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException
     * @throws Facebook\WebDriver\Exception\TimeoutException
     */
    protected function clickNextPage()
    {
        $nextPage = $this->getDriver()->findElement(WebDriverBy::xpath(self::NEXT_PAGE_XPATH));
        $href = $nextPage->getAttribute('href');
        $nextPage->click();
        $this->getDriver()->wait()->until(WebDriverExpectedCondition::not(
            WebDriverExpectedCondition::presenceOfElementLocated(
                WebDriverBy::xpath(self::NEXT_PAGE_XPATH . '[@href="' . $href . '"]')
            )
        ));
    }

    /**
     * Returns the uri of the 'Projektanbieter' logo or false if there is
     * no usable logo in the item (no img element, or an img without src).
     *
     * @return string | false
     */
    protected function getLogo(RemoteWebElement $item)
    {
        try {
            $logo = $item->findElement(WebDriverBy::tagName('img'))->getAttribute('src');
        } catch (NoSuchElementException $e) {
            return false;
        }

        // getAttribute() yields null when the attribute is absent; without this
        // guard the bare base directory would be returned as a "logo".
        if ($logo === null || $logo === '') {
            return false;
        }

        if (str_starts_with($logo, 'http')) {
            // different domain
            return $logo;
        }

        // relative path: resolve against the directory part of URI
        // (everything up to and including the last slash)
        $baseDirectory = substr(self::URI, 0, strrpos(self::URI, '/') + 1);
        return $baseDirectory . $logo;
    }

    /**
     * Converts a string like "vor einigen Minuten" into a reasonable timestamp.
     * Long and complicated, but we don't want to be more specific than
     * the information we have available.
     *
     * The second word is taken as the quantity; "einem", "einer" and "einigen"
     * count as 1. Besides the stated amount, all smaller units of the current
     * time are subtracted as well, so "vor 2 Stunden" yields the full hour.
     *
     * @param string $timeAgo Relative time text as shown in the item.
     * @return int Unix timestamp.
     * @throws UnexpectedValueException If the text names none of the units Sekunde, Minute, Stunde or Tag.
     * @throws Exception If the DateInterval can't be parsed.
     */
    protected function getTimestamp(string $timeAgo): int
    {
        $dateTime = new DateTime();
        $dateArray = explode(' ', $dateTime->format('Y m d H i s'));
        // a text without a second word falls through to the exception below
        $quantityStr = explode(' ', $timeAgo)[1] ?? '';
        // convert possible word into a number
        if (in_array($quantityStr, ['einem', 'einer', 'einigen'], true)) {
            $quantity = 1;
        } else {
            $quantity = intval($quantityStr);
        }
        // subtract time ago + inferior units for lower precision
        if (str_contains($timeAgo, 'Sekunde')) {
            $interval = new DateInterval('PT' . $quantity . 'S');
        } elseif (str_contains($timeAgo, 'Minute')) {
            $interval = new DateInterval('PT' . $quantity . 'M' . $dateArray[5] . 'S');
        } elseif (str_contains($timeAgo, 'Stunde')) {
            $interval = new DateInterval('PT' . $quantity . 'H' . $dateArray[4] . 'M' . $dateArray[5] . 'S');
        } elseif (str_contains($timeAgo, 'Tag')) {
            $interval = new DateInterval('P' . $quantity . 'DT' . $dateArray[3] . 'H' . $dateArray[4] . 'M' . $dateArray[5] . 'S');
        } else {
            throw new UnexpectedValueException($timeAgo);
        }
        $dateTime = $dateTime->sub($interval);
        return $dateTime->getTimestamp();
    }

    /**
     * The main loop which clicks through search result pages and puts
     * the content into the $items array.
     *
     * Paging ends when at least MAXITEMS items were collected or when the
     * paginator offers no further page. cleanUp() runs in every case.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException
     * @throws Facebook\WebDriver\Exception\TimeoutException
     */
    public function collectData()
    {
        parent::collectData();

        try {
            $this->clickAwayCookieBanner();
            $this->setIcon($this->getDriver()->findElement(WebDriverBy::xpath('//link[@rel="shortcut icon"]'))->getAttribute('href'));

            while (true) {
                $items = $this->getDriver()->findElements(WebDriverBy::tagName('app-project-view'));
                foreach ($items as $item) {
                    $feedItem = new FeedItem();

                    $heading = $item->findElement(WebDriverBy::xpath('.//app-heading-tag/h1/a'));
                    $feedItem->setTitle($heading->getText());
                    $feedItem->setURI('https://www.gulp.de' . $heading->getAttribute('href'));
                    $info = $item->findElement(WebDriverBy::tagName('app-icon-info-list'));
                    if ($logo = $this->getLogo($item)) {
                        $feedItem->setEnclosures([$logo]);
                    }
                    if (str_contains($info->getText(), 'Projektanbieter:')) {
                        $feedItem->setAuthor($info->findElement(WebDriverBy::xpath('.//li/span[2]/span'))->getText());
                    } else {
                        // mostly "Direkt vom Auftraggeber" or "GULP Agentur"
                        $feedItem->setAuthor($item->findElement(WebDriverBy::tagName('b'))->getText());
                    }
                    $feedItem->setContent($item->findElement(WebDriverBy::xpath('.//p[@class="description"]'))->getText());
                    $timeAgo = $item->findElement(WebDriverBy::xpath('.//small[contains(@class, "time-ago")]'))->getText();
                    $feedItem->setTimestamp($this->getTimestamp($timeAgo));

                    $this->items[] = $feedItem;
                }

                // Stop on the last result page instead of failing on the missing
                // paginator link and losing everything collected so far.
                if (count($this->items) >= self::MAXITEMS || !$this->hasNextPage()) {
                    break;
                }
                $this->clickNextPage();
            }
        } finally {
            $this->cleanUp();
        }
    }
}
73 changes: 73 additions & 0 deletions bridges/ScalableCapitalBlogBridge.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
<?php

use Facebook\WebDriver\WebDriverBy;
use Facebook\WebDriver\WebDriverExpectedCondition;

/**
 * Bridge for the Scalable Capital blog (German site).
 *
 * Reads the articles listed on the first blog page through a real browser
 * provided by WebDriverAbstract.
 */
class ScalableCapitalBlogBridge extends WebDriverAbstract
{
    const NAME = 'Scalable Capital Blog';
    const URI = 'https://de.scalable.capital/blog';
    const DESCRIPTION = 'Alle Artikel';
    const MAINTAINER = 'hleskien';

    /**
     * Extends the parent's browser options with a German accept-language
     * argument.
     *
     * @return Facebook\WebDriver\Chrome\ChromeOptions
     */
    protected function getBrowserOptions()
    {
        $options = parent::getBrowserOptions();
        $options->addArguments(['--accept-lang=de']);

        return $options;
    }

    /**
     * Collects the articles of the first page into the $items array.
     *
     * Waits until the 15th article element is visible before reading, then
     * turns every article element into a FeedItem. cleanUp() always runs.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException
     * @throws Facebook\WebDriver\Exception\TimeoutException
     */
    public function collectData()
    {
        parent::collectData();

        // prefix for the image and article addresses read from the page
        $origin = 'https://de.scalable.capital';
        $articlesXPath = '//div[contains(@class, "articles")]//div[@class="items"]//div[contains(@class, "item")]';

        try {
            // the 15th article serves as the marker that the list finished loading
            $lastArticle = WebDriverBy::xpath($articlesXPath . '[15]');
            $this->getDriver()->wait()->until(
                WebDriverExpectedCondition::visibilityOfElementLocated($lastArticle)
            );

            $favicon = $this->getDriver()->findElement(WebDriverBy::xpath('//link[@rel="shortcut icon"]'));
            $this->setIcon($favicon->getAttribute('href'));

            $articles = $this->getDriver()->findElements(WebDriverBy::xpath($articlesXPath));
            foreach ($articles as $article) {
                $imageSrc = $article->findElement(WebDriverBy::tagName('img'))->getAttribute('src');
                $link = $article->findElement(WebDriverBy::tagName('a'));
                $title = $link->getText();
                $path = $link->getAttribute('href');
                $summary = $article->findElement(WebDriverBy::xpath('.//div[@class="summary"]'))->getText();
                $published = $article->findElement(WebDriverBy::xpath('.//div[@class="published-date"]'))->getText();
                $author = $article->findElement(WebDriverBy::xpath('.//div[@class="author"]'))->getText();

                $entry = new FeedItem();
                $entry->setEnclosures([$origin . $imageSrc]);
                $entry->setTitle($title);
                $entry->setURI($origin . $path);
                $entry->setContent($summary);
                $entry->setTimestamp($this->formatItemTimestamp($published));
                $entry->setAuthor($author);

                $this->items[] = $entry;
            }
        } finally {
            $this->cleanUp();
        }
    }

    /**
     * Parses a German date text into a timestamp using the 'de' locale's
     * LONG date pattern without a time part.
     *
     * NOTE(review): the previous comment described the input as dd.mm.yyyy,
     * which does not look like a LONG pattern - confirm the format the site
     * actually delivers.
     *
     * @param $value string Date text as shown on the page.
     * @return int|float|false Result of IntlDateFormatter::parse(); false when the text cannot be parsed.
     */
    protected function formatItemTimestamp($value)
    {
        return (new IntlDateFormatter('de', IntlDateFormatter::LONG, IntlDateFormatter::NONE))->parse($value);
    }
}
10 changes: 10 additions & 0 deletions config.default.ini.php
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,16 @@
; false = disabled (default)
by_bridge = false

[webdriver]

; Sets the url of the webdriver or selenium server
selenium_server_url = "http://localhost:4444"

; Sets whether the browser should run in headless mode (no visible ui)
; true = enabled
; false = disabled (default)
headless = false

[authentication]

; HTTP basic authentication
Expand Down
83 changes: 83 additions & 0 deletions docs/05_Bridge_API/04_WebDriverAbstract.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
`WebDriverAbstract` extends [`BridgeAbstract`](./02_BridgeAbstract.md) and adds functionality for generating feeds
from active websites that use XMLHttpRequest (XHR) to load content and / or JavaScript to
modify content.
It highly depends on the php-webdriver library which offers Selenium WebDriver bindings for PHP.

- https://github.com/php-webdriver/php-webdriver (Project Repository)
- https://php-webdriver.github.io/php-webdriver/latest/ (API)

Please note that this class is intended as a solution for websites _that cannot be covered
by the other classes_. The WebDriver starts a browser and is therefore very resource-intensive.

# Configuration

You need a running WebDriver to use bridges that depend on `WebDriverAbstract`.
The easiest way is to start the Selenium server from the project of the same name:
```
docker run -d -p 4444:4444 --shm-size="2g" docker.io/selenium/standalone-chrome:latest
```

- https://github.com/SeleniumHQ/docker-selenium

With these parameters only one browser window can be started at a time.
On a multi-user site, Selenium Grid should be used
and the number of sessions should be adjusted to the number of processor cores.

Finally, the `config.ini.php` file must be adjusted so that the WebDriver
can find the Selenium server:
```
[webdriver]
selenium_server_url = "http://localhost:4444"
```

# Development

While you are developing a new bridge, it is easier to start a local WebDriver, because then you can watch what is happening and see where errors occur. Recording the run as a screen video has also proven helpful for tracking down timing problems.

```
chromedriver --port=4444
```

- https://chromedriver.chromium.org/

If you start rss-bridge from a container, then Chrome driver is only accessible
if you call it with the `--allowed-ips` option so that it binds to all network interfaces.

```
chromedriver --port=4444 --allowed-ips=192.168.1.42
```

The **most important rule** is that after an event such as loading the web page
or pressing a button, you often have to explicitly wait for the desired elements to appear.

A simple example is the bridge `ScalableCapitalBlogBridge.php`.
A more complex and relatively complete example is the bridge `GULPProjekteBridge.php`.

# Template

Use this template to create your own bridge.

```PHP
<?php

/**
 * Skeleton for a bridge that needs a real browser.
 * Replace the constants and fill in the TODO section.
 */
class MyBridge extends WebDriverAbstract
{
const NAME = 'My Bridge';
// Address of the website the bridge reads from.
const URI = 'https://www.example.org';
const DESCRIPTION = 'Further description';
const MAINTAINER = 'your name';

/**
 * Collects the feed items.
 */
public function collectData()
{
// Must be called first. NOTE(review): presumably starts the browser session
// and opens URI - confirm in WebDriverAbstract.
parent::collectData();

try {
// Locate elements via $this->getDriver() and append FeedItem objects to
// $this->items, as the example bridges do. Wait explicitly for elements
// after every page load or click.
// TODO
} finally {
// Runs even when an exception is thrown inside the try block.
$this->cleanUp();
}
}
}

```
File renamed without changes.
3 changes: 2 additions & 1 deletion docs/05_Bridge_API/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Base class | Description
-----------|------------
[`BridgeAbstract`](./02_BridgeAbstract.md) | This class is intended for standard _Bridges_ that need to filter HTML pages for content.
[`FeedExpander`](./03_FeedExpander.md) | Expand/modify existing feed urls
[`XPathAbstract`](./04_XPathAbstract.md) | This class is meant as an alternative base class for bridge implementations. It offers preliminary functionality for generating feeds based on _XPath expressions_.
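[`WebDriverAbstract`](./04_WebDriverAbstract.md) | This class is intended for active websites that load or modify their content with JavaScript / XMLHttpRequest and therefore need a real browser. It is very resource-intensive and should only be used when the other classes cannot cover the website.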
[`XPathAbstract`](./05_XPathAbstract.md) | This class is meant as an alternative base class for bridge implementations. It offers preliminary functionality for generating feeds based on _XPath expressions_.

For more information about how to create a new _Bridge_, read [How to create a new Bridge?](./01_How_to_create_a_new_bridge.md)
Loading

0 comments on commit 8e8028b

Please sign in to comment.